From d4210d04c16861ed6cbccd589e9c19fd4511c97d Mon Sep 17 00:00:00 2001 From: Val Date: Sun, 5 Apr 2020 15:18:51 +0300 Subject: [PATCH 0001/2357] databasereplicated constructor scratch --- src/Databases/DatabaseReplicated.cpp | 215 +++++++++++++++++++++++++++ src/Databases/DatabaseReplicated.h | 61 ++++++++ 2 files changed, 276 insertions(+) create mode 100644 src/Databases/DatabaseReplicated.cpp create mode 100644 src/Databases/DatabaseReplicated.h diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp new file mode 100644 index 00000000000..fd5f53a596c --- /dev/null +++ b/src/Databases/DatabaseReplicated.cpp @@ -0,0 +1,215 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +namespace DB +{ + + +namespace ErrorCodes +{ + extern const int NO_ZOOKEEPER; +} + +void DatabaseReplicated::setZooKeeper(zkutil::ZooKeeperPtr zookeeper) +{ + std::lock_guard lock(current_zookeeper_mutex); + current_zookeeper = zookeeper; +} + +zkutil::ZooKeeperPtr DatabaseReplicated::tryGetZooKeeper() const +{ + std::lock_guard lock(current_zookeeper_mutex); + return current_zookeeper; +} + +zkutil::ZooKeeperPtr DatabaseReplicated::getZooKeeper() const +{ + auto res = tryGetZooKeeper(); + if (!res) + throw Exception("Cannot get ZooKeeper", ErrorCodes::NO_ZOOKEEPER); + return res; +} + + +DatabaseReplicated::DatabaseReplicated( + const String & name_, + const String & metadata_path_, + const String & zookeeper_path_, + const String & replica_name_, + const Context & context_) + : DatabaseOrdinary(name_, metadata_path_, context_) + , zookeeper_path(zookeeper_path_) + , replica_name(replica_name_) +{ + + if (!zookeeper_path.empty() && zookeeper_path.back() == '/') + zookeeper_path.resize(zookeeper_path.size() - 1); + /// If zookeeper chroot prefix is used, path should start with '/', because chroot concatenates without it. + if (!zookeeper_path.empty() && zookeeper_path.front() != '/') + zookeeper_path = "/" + zookeeper_path; + replica_path = zookeeper_path + "/replicas/" + replica_name; + + if (context_.hasZooKeeper()) { + current_zookeeper = context_.getZooKeeper(); + } + + if (!current_zookeeper) + { + // TODO wtf is attach + // if (!attach) + throw Exception("Can't create replicated table without ZooKeeper", ErrorCodes::NO_ZOOKEEPER); + + /// Do not activate the replica. It will be readonly. + // TODO is it relevant for engines? + // LOG_ERROR(log, "No ZooKeeper: database will be in readonly mode."); + // TODO is_readonly = true; + // return; + } + + // can the zk path exist and no metadata on disk be available at the same moment? if so, in such a case, the db instance must be restored. + + current_zookeeper->createIfNotExists(zookeeper_path, String()); + current_zookeeper->createIfNotExists(replica_path, String()); + // TODO what to do? + // TODO createDatabaseIfNotExists ? + // TODO check database structure ? +} + +void DatabaseReplicated::createTable( + const Context & context, + const String & table_name, + const StoragePtr & table, + const ASTPtr & query) +{ + // try + DatabaseOnDisk::createTable(context, table_name, table, query); + + // replicated stuff + String statement = getObjectDefinitionFromCreateQuery(query); + auto zookeeper = getZooKeeper(); + // TODO в чем прикол именно так создавать зиноды? 
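+    // (English translation of the TODO above: "what is the point of creating the znodes exactly this way?")
+    // Assumption: the node set below mirrors the per-table layout that ReplicatedMergeTree creates,
+    // which is why /columns, /blocks and /block_numbers are kept only as commented-out placeholders here.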
+ Coordination::Requests ops; + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path, "", + zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/metadata", metadata, + zkutil::CreateMode::Persistent)); +// ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/columns", getColumns().toString(), +// zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/log", "", + zkutil::CreateMode::Persistent)); +// ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/blocks", "", +// zkutil::CreateMode::Persistent)); +// ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/block_numbers", "", +// zkutil::CreateMode::Persistent)); +// ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/nonincrement_block_numbers", "", +// zkutil::CreateMode::Persistent)); /// /nonincrement_block_numbers dir is unused, but is created nonetheless for backwards compatibility. + // TODO do we need a leader here? (probably yes) what is it gonna do? + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/leader_election", "", + zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/temp", "", + zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/replicas", "", + zkutil::CreateMode::Persistent)); + + Coordination::Responses responses; + auto code = zookeeper->tryMulti(ops, responses); + if (code && code != Coordination::ZNODEEXISTS) + throw Coordination::Exception(code); + + // ... + +} + + +void DatabaseReplicated::renameTable( + const Context & context, + const String & table_name, + IDatabase & to_database, + const String & to_table_name, + TableStructureWriteLockHolder & lock) +{ + // try + DatabaseOnDisk::renameTable(context, table_name, to_database, to_table_name, lock); + // replicated stuff + String statement = getObjectDefinitionFromCreateQuery(query); + // this one is fairly more complex +} + +void DatabaseReplicated::removeTable( + const Context & context, + const String & table_name) +{ + // try + DatabaseOnDisk::removeTable(context, table_name); + // replicated stuff + String statement = getObjectDefinitionFromCreateQuery(query); + // ... +} + +void DatabaseReplicated::drop(const Context & context) +{ + DatabaseOnDisk::drop(context); + // replicated stuff + String statement = getObjectDefinitionFromCreateQuery(query); + // should it be possible to recover after a drop. + // if not, we can just delete all the zookeeper nodes starting from + // zookeeper path. does it work recursively? hope so... +} + +void DatabaseOrdinary::loadStoredObjects( + Context & context, + bool has_force_restore_data_flag) +{ + syncReplicaState(context); + updateMetadata(context); + + DatabaseOrdinary::loadStoredObjects(context, has_force_restore_data_flag); + +} + +// sync replica's zookeeper metadata +void syncReplicaState(Context & context) { + +} + +// get the up to date metadata from zookeeper to local metadata dir +// for replicated (only?) tables +void updateMetadata(Context & context) { + +} + +} diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h new file mode 100644 index 00000000000..51f7763bb5a --- /dev/null +++ b/src/Databases/DatabaseReplicated.h @@ -0,0 +1,61 @@ +#pragma once + +#include +#include +#include + +namespace DB +{ +/** Replicated database engine. 
+ * It stores tables list using list of .sql files, + * that contain declaration of table represented by SQL ATTACH TABLE query + * and operation log in zookeeper + */ +class DatabaseReplicated : public DatabaseOrdinary +{ +public: + DatabaseReplicated(const String & name_, const String & metadata_path_, const String & zookeeper_path_, const String & replica_name_, const Context & context); + + String getEngineName() const override { return "Replicated"; } + + void createTable( + const Context & context, + const String & table_name, + const StoragePtr & table, + const ASTPtr & query) override; + + void removeTable( + const Context & context, + const String & table_name) override; + + void renameTable( + const Context & context, + const String & table_name, + IDatabase & to_database, + const String & to_table_name, + TableStructureWriteLockHolder & lock) override; + + void drop(const Context & context) override; + + void loadStoredObjects( + Context & context, + bool has_force_restore_data_flag) override; + +private: + String zookeeper_path; + String replica_name; + String replica_path; + + zkutil::ZooKeeperPtr current_zookeeper; /// Use only the methods below. + mutable std::mutex current_zookeeper_mutex; /// To recreate the session in the background thread. + + zkutil::ZooKeeperPtr tryGetZooKeeper() const; + zkutil::ZooKeeperPtr getZooKeeper() const; + void setZooKeeper(zkutil::ZooKeeperPtr zookeeper); + + void syncReplicaState(Context & context); + + void updateMetadata(Context & context); +}; + +} From 272e31188d9b76bc4680fccf3502e459c89d5956 Mon Sep 17 00:00:00 2001 From: Val Date: Sun, 5 Apr 2020 16:06:21 +0300 Subject: [PATCH 0002/2357] databasereplicated add table functions prototype --- dbms/src/Databases/DatabaseReplicated.cpp | 156 ++++++++++++++++++++++ 1 file changed, 156 insertions(+) create mode 100644 dbms/src/Databases/DatabaseReplicated.cpp diff --git a/dbms/src/Databases/DatabaseReplicated.cpp b/dbms/src/Databases/DatabaseReplicated.cpp new file mode 100644 index 00000000000..704c678f366 --- /dev/null +++ b/dbms/src/Databases/DatabaseReplicated.cpp @@ -0,0 +1,156 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +namespace DB +{ + + +namespace ErrorCodes +{ + extern const int NO_ZOOKEEPER; +} + +void DatabaseReplicated::setZooKeeper(zkutil::ZooKeeperPtr zookeeper) +{ + std::lock_guard lock(current_zookeeper_mutex); + current_zookeeper = zookeeper; +} + +zkutil::ZooKeeperPtr DatabaseReplicated::tryGetZooKeeper() const +{ + std::lock_guard lock(current_zookeeper_mutex); + return current_zookeeper; +} + +zkutil::ZooKeeperPtr DatabaseReplicated::getZooKeeper() const +{ + auto res = tryGetZooKeeper(); + if (!res) + throw Exception("Cannot get ZooKeeper", ErrorCodes::NO_ZOOKEEPER); + return res; +} + + +DatabaseReplicated::DatabaseReplicated( + const String & name_, + const String & metadata_path_, + const String & zookeeper_path_, + const String & replica_name_, + const Context & context_) + : DatabaseOrdinary(name_, metadata_path_, context_) + , zookeeper_path(zookeeper_path_) + , replica_name(replica_name_) +{ + + if (!zookeeper_path.empty() && zookeeper_path.back() == '/') + zookeeper_path.resize(zookeeper_path.size() - 1); + /// If zookeeper chroot prefix is used, path should start with '/', because 
chroot concatenates without it. + if (!zookeeper_path.empty() && zookeeper_path.front() != '/') + zookeeper_path = "/" + zookeeper_path; + replica_path = zookeeper_path + "/replicas/" + replica_name; + + if (context_.hasZooKeeper()) { + current_zookeeper = context_.getZooKeeper(); + } + + if (!current_zookeeper) + { + // TODO wtf is attach + // if (!attach) + throw Exception("Can't create replicated table without ZooKeeper", ErrorCodes::NO_ZOOKEEPER); + + /// Do not activate the replica. It will be readonly. + // TODO is it relevant for engines? + // LOG_ERROR(log, "No ZooKeeper: database will be in readonly mode."); + // TODO is_readonly = true; + // return; + } + // getObjectDefinitionFromCreateQuery + // TODO what to do? + // TODO createDatabaseIfNotExists ? + // TODO check database structure ? +} + +void DatabaseReplicated::createTable( + const Context & context, + const String & table_name, + const StoragePtr & table, + const ASTPtr & query) +{ + // try + DatabaseOnDisk::createTable(context, table_name, table, query); + // replicated stuff + String statement = getObjectDefinitionFromCreateQuery(query); + // ... + +} + + +void DatabaseReplicated::renameTable( + const Context & context, + const String & table_name, + IDatabase & to_database, + const String & to_table_name, + TableStructureWriteLockHolder & lock) +{ + // try + DatabaseOnDisk::renameTable(context, table_name, to_database, to_table_name, lock); + // replicated stuff + String statement = getObjectDefinitionFromCreateQuery(query); + // ... +} + +void DatabaseReplicated::removeTable( + const Context & context, + const String & table_name) +{ + // try + DatabaseOnDisk::removeTable(context, table_name); + // replicated stuff + String statement = getObjectDefinitionFromCreateQuery(query); + // ... +} + +void DatabaseReplicated::drop(const Context & context) +{ + DatabaseOnDisk::drop(context); + // replicated stuff + String statement = getObjectDefinitionFromCreateQuery(query); + // ... +} + +} From edb871979a66ecd5d07346003360344e5fb51ff0 Mon Sep 17 00:00:00 2001 From: Val Date: Mon, 6 Apr 2020 14:29:45 +0300 Subject: [PATCH 0003/2357] add some zookeeper into the logic --- dbms/src/Databases/DatabaseReplicated.cpp | 40 +++++++++++++++++++++-- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/dbms/src/Databases/DatabaseReplicated.cpp b/dbms/src/Databases/DatabaseReplicated.cpp index 704c678f366..31e28c320cb 100644 --- a/dbms/src/Databases/DatabaseReplicated.cpp +++ b/dbms/src/Databases/DatabaseReplicated.cpp @@ -99,7 +99,9 @@ DatabaseReplicated::DatabaseReplicated( // TODO is_readonly = true; // return; } - // getObjectDefinitionFromCreateQuery + + current_zookeeper->createIfNotExists(zookeeper_path, String()); + current_zookeeper->createIfNotExists(replica_path, String()); // TODO what to do? // TODO createDatabaseIfNotExists ? // TODO check database structure ? @@ -115,6 +117,36 @@ void DatabaseReplicated::createTable( DatabaseOnDisk::createTable(context, table_name, table, query); // replicated stuff String statement = getObjectDefinitionFromCreateQuery(query); + auto zookeeper = getZooKeeper(); + // TODO в чем прикол именно так создавать зиноды? 
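+    // (The TODO above asks, in Russian, why the znodes are created in exactly this way.)
+    // Note that the batched tryMulti below treats ZNODEEXISTS as success, presumably so that
+    // several replicas can initialize the same database path concurrently without failing.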
+ Coordination::Requests ops; + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path, "", + zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/metadata", metadata, + zkutil::CreateMode::Persistent)); +// ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/columns", getColumns().toString(), +// zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/log", "", + zkutil::CreateMode::Persistent)); +// ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/blocks", "", +// zkutil::CreateMode::Persistent)); +// ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/block_numbers", "", +// zkutil::CreateMode::Persistent)); +// ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/nonincrement_block_numbers", "", +// zkutil::CreateMode::Persistent)); /// /nonincrement_block_numbers dir is unused, but is created nonetheless for backwards compatibility. + // TODO do we need a leader here? (probably yes) what is it gonna do? + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/leader_election", "", + zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/temp", "", + zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/replicas", "", + zkutil::CreateMode::Persistent)); + + Coordination::Responses responses; + auto code = zookeeper->tryMulti(ops, responses); + if (code && code != Coordination::ZNODEEXISTS) + throw Coordination::Exception(code); + // ... } @@ -131,7 +163,7 @@ void DatabaseReplicated::renameTable( DatabaseOnDisk::renameTable(context, table_name, to_database, to_table_name, lock); // replicated stuff String statement = getObjectDefinitionFromCreateQuery(query); - // ... + // this one is fairly more complex } void DatabaseReplicated::removeTable( @@ -150,7 +182,9 @@ void DatabaseReplicated::drop(const Context & context) DatabaseOnDisk::drop(context); // replicated stuff String statement = getObjectDefinitionFromCreateQuery(query); - // ... + // should it be possible to recover after a drop. + // if not, we can just delete all the zookeeper nodes starting from + // zookeeper path. does it work recursively? hope so... } } From e0f52965e5ebfbb01e7a502190bea17918e22754 Mon Sep 17 00:00:00 2001 From: Val Date: Fri, 24 Apr 2020 16:49:14 +0300 Subject: [PATCH 0004/2357] Add a comment with some thoughts --- dbms/src/Databases/DatabaseReplicated.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/dbms/src/Databases/DatabaseReplicated.cpp b/dbms/src/Databases/DatabaseReplicated.cpp index 31e28c320cb..e18fc1db5f4 100644 --- a/dbms/src/Databases/DatabaseReplicated.cpp +++ b/dbms/src/Databases/DatabaseReplicated.cpp @@ -100,6 +100,8 @@ DatabaseReplicated::DatabaseReplicated( // return; } + // can the zk path exist and no metadata on disk be available at the same moment? if so, in such a case, the db instance must be restored. + current_zookeeper->createIfNotExists(zookeeper_path, String()); current_zookeeper->createIfNotExists(replica_path, String()); // TODO what to do? 
@@ -115,6 +117,7 @@ void DatabaseReplicated::createTable( { // try DatabaseOnDisk::createTable(context, table_name, table, query); + // replicated stuff String statement = getObjectDefinitionFromCreateQuery(query); auto zookeeper = getZooKeeper(); From c1c132502c64d52e5867e3cc4ed6e3b2523567d8 Mon Sep 17 00:00:00 2001 From: Val Date: Fri, 24 Apr 2020 17:12:54 +0300 Subject: [PATCH 0005/2357] add prototypes of loadStoredObject and some relevant helpers in replicateddb --- dbms/src/Databases/DatabaseReplicated.cpp | 22 ++++++++ dbms/src/Databases/DatabaseReplicated.h | 61 +++++++++++++++++++++++ 2 files changed, 83 insertions(+) create mode 100644 dbms/src/Databases/DatabaseReplicated.h diff --git a/dbms/src/Databases/DatabaseReplicated.cpp b/dbms/src/Databases/DatabaseReplicated.cpp index e18fc1db5f4..fd5f53a596c 100644 --- a/dbms/src/Databases/DatabaseReplicated.cpp +++ b/dbms/src/Databases/DatabaseReplicated.cpp @@ -190,4 +190,26 @@ void DatabaseReplicated::drop(const Context & context) // zookeeper path. does it work recursively? hope so... } +void DatabaseOrdinary::loadStoredObjects( + Context & context, + bool has_force_restore_data_flag) +{ + syncReplicaState(context); + updateMetadata(context); + + DatabaseOrdinary::loadStoredObjects(context, has_force_restore_data_flag); + +} + +// sync replica's zookeeper metadata +void syncReplicaState(Context & context) { + +} + +// get the up to date metadata from zookeeper to local metadata dir +// for replicated (only?) tables +void updateMetadata(Context & context) { + +} + } diff --git a/dbms/src/Databases/DatabaseReplicated.h b/dbms/src/Databases/DatabaseReplicated.h new file mode 100644 index 00000000000..51f7763bb5a --- /dev/null +++ b/dbms/src/Databases/DatabaseReplicated.h @@ -0,0 +1,61 @@ +#pragma once + +#include +#include +#include + +namespace DB +{ +/** Replicated database engine. + * It stores tables list using list of .sql files, + * that contain declaration of table represented by SQL ATTACH TABLE query + * and operation log in zookeeper + */ +class DatabaseReplicated : public DatabaseOrdinary +{ +public: + DatabaseReplicated(const String & name_, const String & metadata_path_, const String & zookeeper_path_, const String & replica_name_, const Context & context); + + String getEngineName() const override { return "Replicated"; } + + void createTable( + const Context & context, + const String & table_name, + const StoragePtr & table, + const ASTPtr & query) override; + + void removeTable( + const Context & context, + const String & table_name) override; + + void renameTable( + const Context & context, + const String & table_name, + IDatabase & to_database, + const String & to_table_name, + TableStructureWriteLockHolder & lock) override; + + void drop(const Context & context) override; + + void loadStoredObjects( + Context & context, + bool has_force_restore_data_flag) override; + +private: + String zookeeper_path; + String replica_name; + String replica_path; + + zkutil::ZooKeeperPtr current_zookeeper; /// Use only the methods below. + mutable std::mutex current_zookeeper_mutex; /// To recreate the session in the background thread. 
+ + zkutil::ZooKeeperPtr tryGetZooKeeper() const; + zkutil::ZooKeeperPtr getZooKeeper() const; + void setZooKeeper(zkutil::ZooKeeperPtr zookeeper); + + void syncReplicaState(Context & context); + + void updateMetadata(Context & context); +}; + +} From 0d392bbb34c142f6871a2bd2ab699f5baa768780 Mon Sep 17 00:00:00 2001 From: Val Date: Wed, 29 Apr 2020 14:19:16 +0300 Subject: [PATCH 0006/2357] fix after rebase --- src/Databases/DatabaseFactory.cpp | 17 +++++++++- src/Databases/DatabaseReplicated.cpp | 49 +++++++++++++++------------- src/Databases/DatabaseReplicated.h | 7 ++-- 3 files changed, 47 insertions(+), 26 deletions(-) diff --git a/src/Databases/DatabaseFactory.cpp b/src/Databases/DatabaseFactory.cpp index f27bc509ebe..0d7a711b530 100644 --- a/src/Databases/DatabaseFactory.cpp +++ b/src/Databases/DatabaseFactory.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -69,7 +70,7 @@ DatabasePtr DatabaseFactory::getImpl( { String engine_name = engine_define->engine->name; - if (engine_name != "MySQL" && engine_name != "Lazy" && engine_define->engine->arguments) + if (engine_name != "MySQL" && engine_name != "Lazy" && engine_name != "Replicated" && engine_define->engine->arguments) throw Exception("Database engine " + engine_name + " cannot have arguments", ErrorCodes::BAD_ARGUMENTS); if (engine_define->engine->parameters || engine_define->partition_by || engine_define->primary_key || engine_define->order_by || @@ -138,6 +139,20 @@ DatabasePtr DatabaseFactory::getImpl( return std::make_shared(database_name, metadata_path, cache_expiration_time_seconds, context); } + else if (engine_name == "Replicated") + { + const ASTFunction * engine = engine_define->engine; + + if (!engine->arguments || engine->arguments->children.size() != 2) + throw Exception("Replicated database requires zoo_path and replica_name arguments", ErrorCodes::BAD_ARGUMENTS); + + const auto & arguments = engine->arguments->children; + + const auto zoo_path = arguments[0]->as()->value.safeGet(); + const auto replica_name = arguments[1]->as()->value.safeGet(); + return std::make_shared(database_name, metadata_path, zoo_path, replica_name, context); + } + throw Exception("Unknown database engine: " + engine_name, ErrorCodes::UNKNOWN_DATABASE_ENGINE); } diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index fd5f53a596c..92af1c890c2 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -125,8 +125,8 @@ void DatabaseReplicated::createTable( Coordination::Requests ops; ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path, "", zkutil::CreateMode::Persistent)); - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/metadata", metadata, - zkutil::CreateMode::Persistent)); + //ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/metadata", metadata, + //zkutil::CreateMode::Persistent)); // ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/columns", getColumns().toString(), // zkutil::CreateMode::Persistent)); ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/log", "", @@ -160,23 +160,24 @@ void DatabaseReplicated::renameTable( const String & table_name, IDatabase & to_database, const String & to_table_name, - TableStructureWriteLockHolder & lock) + bool exchange) { // try - DatabaseOnDisk::renameTable(context, table_name, to_database, to_table_name, lock); - // replicated stuff - String statement = getObjectDefinitionFromCreateQuery(query); + DatabaseOnDisk::renameTable(context, table_name, 
to_database, to_table_name, exchange); + // replicated stuff; what to put to a znode + // String statement = getObjectDefinitionFromCreateQuery(query); // this one is fairly more complex } -void DatabaseReplicated::removeTable( +void DatabaseReplicated::dropTable( const Context & context, - const String & table_name) + const String & table_name, + bool no_delay) { // try - DatabaseOnDisk::removeTable(context, table_name); + DatabaseOnDisk::dropTable(context, table_name, no_delay); // replicated stuff - String statement = getObjectDefinitionFromCreateQuery(query); + //String statement = getObjectDefinitionFromCreateQuery(query); // ... } @@ -184,13 +185,26 @@ void DatabaseReplicated::drop(const Context & context) { DatabaseOnDisk::drop(context); // replicated stuff - String statement = getObjectDefinitionFromCreateQuery(query); + //String statement = getObjectDefinitionFromCreateQuery(query); // should it be possible to recover after a drop. // if not, we can just delete all the zookeeper nodes starting from // zookeeper path. does it work recursively? hope so... } -void DatabaseOrdinary::loadStoredObjects( +// sync replica's zookeeper metadata +void DatabaseReplicated::syncReplicaState(Context & context) { + auto c = context; // fixes unuser parameter error + return; +} + +// get the up to date metadata from zookeeper to local metadata dir +// for replicated (only?) tables +void DatabaseReplicated::updateMetadata(Context & context) { + auto c = context; // fixes unuser parameter error + return; +} + +void DatabaseReplicated::loadStoredObjects( Context & context, bool has_force_restore_data_flag) { @@ -201,15 +215,6 @@ void DatabaseOrdinary::loadStoredObjects( } -// sync replica's zookeeper metadata -void syncReplicaState(Context & context) { - -} - -// get the up to date metadata from zookeeper to local metadata dir -// for replicated (only?) 
tables -void updateMetadata(Context & context) { - -} + } diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index 51f7763bb5a..bc1af923277 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -24,16 +24,17 @@ public: const StoragePtr & table, const ASTPtr & query) override; - void removeTable( + void dropTable( const Context & context, - const String & table_name) override; + const String & table_name, + bool no_delay) override; void renameTable( const Context & context, const String & table_name, IDatabase & to_database, const String & to_table_name, - TableStructureWriteLockHolder & lock) override; + bool exchange) override; void drop(const Context & context) override; From 1cb96bf1762cc8b111f0cb58ed651059156442e2 Mon Sep 17 00:00:00 2001 From: Val Date: Wed, 29 Apr 2020 14:21:12 +0300 Subject: [PATCH 0007/2357] rm old files from nonexistant dir since the rebase --- dbms/src/Databases/DatabaseReplicated.cpp | 215 ---------------------- dbms/src/Databases/DatabaseReplicated.h | 61 ------ 2 files changed, 276 deletions(-) delete mode 100644 dbms/src/Databases/DatabaseReplicated.cpp delete mode 100644 dbms/src/Databases/DatabaseReplicated.h diff --git a/dbms/src/Databases/DatabaseReplicated.cpp b/dbms/src/Databases/DatabaseReplicated.cpp deleted file mode 100644 index fd5f53a596c..00000000000 --- a/dbms/src/Databases/DatabaseReplicated.cpp +++ /dev/null @@ -1,215 +0,0 @@ -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include - -namespace DB -{ - - -namespace ErrorCodes -{ - extern const int NO_ZOOKEEPER; -} - -void DatabaseReplicated::setZooKeeper(zkutil::ZooKeeperPtr zookeeper) -{ - std::lock_guard lock(current_zookeeper_mutex); - current_zookeeper = zookeeper; -} - -zkutil::ZooKeeperPtr DatabaseReplicated::tryGetZooKeeper() const -{ - std::lock_guard lock(current_zookeeper_mutex); - return current_zookeeper; -} - -zkutil::ZooKeeperPtr DatabaseReplicated::getZooKeeper() const -{ - auto res = tryGetZooKeeper(); - if (!res) - throw Exception("Cannot get ZooKeeper", ErrorCodes::NO_ZOOKEEPER); - return res; -} - - -DatabaseReplicated::DatabaseReplicated( - const String & name_, - const String & metadata_path_, - const String & zookeeper_path_, - const String & replica_name_, - const Context & context_) - : DatabaseOrdinary(name_, metadata_path_, context_) - , zookeeper_path(zookeeper_path_) - , replica_name(replica_name_) -{ - - if (!zookeeper_path.empty() && zookeeper_path.back() == '/') - zookeeper_path.resize(zookeeper_path.size() - 1); - /// If zookeeper chroot prefix is used, path should start with '/', because chroot concatenates without it. - if (!zookeeper_path.empty() && zookeeper_path.front() != '/') - zookeeper_path = "/" + zookeeper_path; - replica_path = zookeeper_path + "/replicas/" + replica_name; - - if (context_.hasZooKeeper()) { - current_zookeeper = context_.getZooKeeper(); - } - - if (!current_zookeeper) - { - // TODO wtf is attach - // if (!attach) - throw Exception("Can't create replicated table without ZooKeeper", ErrorCodes::NO_ZOOKEEPER); - - /// Do not activate the replica. It will be readonly. - // TODO is it relevant for engines? 
- // LOG_ERROR(log, "No ZooKeeper: database will be in readonly mode."); - // TODO is_readonly = true; - // return; - } - - // can the zk path exist and no metadata on disk be available at the same moment? if so, in such a case, the db instance must be restored. - - current_zookeeper->createIfNotExists(zookeeper_path, String()); - current_zookeeper->createIfNotExists(replica_path, String()); - // TODO what to do? - // TODO createDatabaseIfNotExists ? - // TODO check database structure ? -} - -void DatabaseReplicated::createTable( - const Context & context, - const String & table_name, - const StoragePtr & table, - const ASTPtr & query) -{ - // try - DatabaseOnDisk::createTable(context, table_name, table, query); - - // replicated stuff - String statement = getObjectDefinitionFromCreateQuery(query); - auto zookeeper = getZooKeeper(); - // TODO в чем прикол именно так создавать зиноды? - Coordination::Requests ops; - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path, "", - zkutil::CreateMode::Persistent)); - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/metadata", metadata, - zkutil::CreateMode::Persistent)); -// ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/columns", getColumns().toString(), -// zkutil::CreateMode::Persistent)); - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/log", "", - zkutil::CreateMode::Persistent)); -// ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/blocks", "", -// zkutil::CreateMode::Persistent)); -// ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/block_numbers", "", -// zkutil::CreateMode::Persistent)); -// ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/nonincrement_block_numbers", "", -// zkutil::CreateMode::Persistent)); /// /nonincrement_block_numbers dir is unused, but is created nonetheless for backwards compatibility. - // TODO do we need a leader here? (probably yes) what is it gonna do? - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/leader_election", "", - zkutil::CreateMode::Persistent)); - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/temp", "", - zkutil::CreateMode::Persistent)); - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/replicas", "", - zkutil::CreateMode::Persistent)); - - Coordination::Responses responses; - auto code = zookeeper->tryMulti(ops, responses); - if (code && code != Coordination::ZNODEEXISTS) - throw Coordination::Exception(code); - - // ... - -} - - -void DatabaseReplicated::renameTable( - const Context & context, - const String & table_name, - IDatabase & to_database, - const String & to_table_name, - TableStructureWriteLockHolder & lock) -{ - // try - DatabaseOnDisk::renameTable(context, table_name, to_database, to_table_name, lock); - // replicated stuff - String statement = getObjectDefinitionFromCreateQuery(query); - // this one is fairly more complex -} - -void DatabaseReplicated::removeTable( - const Context & context, - const String & table_name) -{ - // try - DatabaseOnDisk::removeTable(context, table_name); - // replicated stuff - String statement = getObjectDefinitionFromCreateQuery(query); - // ... -} - -void DatabaseReplicated::drop(const Context & context) -{ - DatabaseOnDisk::drop(context); - // replicated stuff - String statement = getObjectDefinitionFromCreateQuery(query); - // should it be possible to recover after a drop. - // if not, we can just delete all the zookeeper nodes starting from - // zookeeper path. does it work recursively? hope so... 
-} - -void DatabaseOrdinary::loadStoredObjects( - Context & context, - bool has_force_restore_data_flag) -{ - syncReplicaState(context); - updateMetadata(context); - - DatabaseOrdinary::loadStoredObjects(context, has_force_restore_data_flag); - -} - -// sync replica's zookeeper metadata -void syncReplicaState(Context & context) { - -} - -// get the up to date metadata from zookeeper to local metadata dir -// for replicated (only?) tables -void updateMetadata(Context & context) { - -} - -} diff --git a/dbms/src/Databases/DatabaseReplicated.h b/dbms/src/Databases/DatabaseReplicated.h deleted file mode 100644 index 51f7763bb5a..00000000000 --- a/dbms/src/Databases/DatabaseReplicated.h +++ /dev/null @@ -1,61 +0,0 @@ -#pragma once - -#include -#include -#include - -namespace DB -{ -/** Replicated database engine. - * It stores tables list using list of .sql files, - * that contain declaration of table represented by SQL ATTACH TABLE query - * and operation log in zookeeper - */ -class DatabaseReplicated : public DatabaseOrdinary -{ -public: - DatabaseReplicated(const String & name_, const String & metadata_path_, const String & zookeeper_path_, const String & replica_name_, const Context & context); - - String getEngineName() const override { return "Replicated"; } - - void createTable( - const Context & context, - const String & table_name, - const StoragePtr & table, - const ASTPtr & query) override; - - void removeTable( - const Context & context, - const String & table_name) override; - - void renameTable( - const Context & context, - const String & table_name, - IDatabase & to_database, - const String & to_table_name, - TableStructureWriteLockHolder & lock) override; - - void drop(const Context & context) override; - - void loadStoredObjects( - Context & context, - bool has_force_restore_data_flag) override; - -private: - String zookeeper_path; - String replica_name; - String replica_path; - - zkutil::ZooKeeperPtr current_zookeeper; /// Use only the methods below. - mutable std::mutex current_zookeeper_mutex; /// To recreate the session in the background thread. 
- - zkutil::ZooKeeperPtr tryGetZooKeeper() const; - zkutil::ZooKeeperPtr getZooKeeper() const; - void setZooKeeper(zkutil::ZooKeeperPtr zookeeper); - - void syncReplicaState(Context & context); - - void updateMetadata(Context & context); -}; - -} From 8b0366ff4ff08d47b9ca7451ce33ca07683b0012 Mon Sep 17 00:00:00 2001 From: Val Date: Thu, 30 Apr 2020 19:15:27 +0300 Subject: [PATCH 0008/2357] an attempt to make something meaningful --- src/Databases/DatabaseReplicated.cpp | 91 ++++++++++++---------------- 1 file changed, 40 insertions(+), 51 deletions(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 92af1c890c2..d6bbec24791 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -71,7 +71,7 @@ DatabaseReplicated::DatabaseReplicated( const String & zookeeper_path_, const String & replica_name_, const Context & context_) - : DatabaseOrdinary(name_, metadata_path_, context_) + : DatabaseOrdinary(name_, metadata_path_, "data/", "DatabaseReplicated (" + name_ + ")", context_) , zookeeper_path(zookeeper_path_) , replica_name(replica_name_) { @@ -89,24 +89,31 @@ DatabaseReplicated::DatabaseReplicated( if (!current_zookeeper) { - // TODO wtf is attach - // if (!attach) - throw Exception("Can't create replicated table without ZooKeeper", ErrorCodes::NO_ZOOKEEPER); + throw Exception("Can't create replicated database without ZooKeeper", ErrorCodes::NO_ZOOKEEPER); + - /// Do not activate the replica. It will be readonly. - // TODO is it relevant for engines? - // LOG_ERROR(log, "No ZooKeeper: database will be in readonly mode."); - // TODO is_readonly = true; - // return; } - // can the zk path exist and no metadata on disk be available at the same moment? if so, in such a case, the db instance must be restored. + // test without this fancy mess (prob wont work) + current_zookeeper->createAncestors(replica_path); + current_zookeeper->createOrUpdate(replica_path, String(), zkutil::CreateMode::Persistent); - current_zookeeper->createIfNotExists(zookeeper_path, String()); - current_zookeeper->createIfNotExists(replica_path, String()); - // TODO what to do? - // TODO createDatabaseIfNotExists ? - // TODO check database structure ? +// if (!current_zookeeper->exists(zookeeper_path)) { +// +// LOG_DEBUG(log, "Creating database " << zookeeper_path); +// current_zookeeper->createAncestors(zookeeper_path); + + // Coordination::Requests ops; + // ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path, "", + // zkutil::CreateMode::Persistent)); + // ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/replicas", "", + // zkutil::CreateMode::Persistent)); + + // Coordination::Responses responses; + // auto code = current_zookeeper->tryMulti(ops, responses); + // if (code && code != Coordination::ZNODEEXISTS) + // throw Coordination::Exception(code); + // } } void DatabaseReplicated::createTable( @@ -115,43 +122,16 @@ void DatabaseReplicated::createTable( const StoragePtr & table, const ASTPtr & query) { - // try + // try? DatabaseOnDisk::createTable(context, table_name, table, query); - // replicated stuff + // suppose it worked String statement = getObjectDefinitionFromCreateQuery(query); - auto zookeeper = getZooKeeper(); - // TODO в чем прикол именно так создавать зиноды? 
- Coordination::Requests ops; - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path, "", - zkutil::CreateMode::Persistent)); - //ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/metadata", metadata, - //zkutil::CreateMode::Persistent)); -// ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/columns", getColumns().toString(), -// zkutil::CreateMode::Persistent)); - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/log", "", - zkutil::CreateMode::Persistent)); -// ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/blocks", "", -// zkutil::CreateMode::Persistent)); -// ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/block_numbers", "", -// zkutil::CreateMode::Persistent)); -// ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/nonincrement_block_numbers", "", -// zkutil::CreateMode::Persistent)); /// /nonincrement_block_numbers dir is unused, but is created nonetheless for backwards compatibility. - // TODO do we need a leader here? (probably yes) what is it gonna do? - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/leader_election", "", - zkutil::CreateMode::Persistent)); - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/temp", "", - zkutil::CreateMode::Persistent)); - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/replicas", "", - zkutil::CreateMode::Persistent)); - - Coordination::Responses responses; - auto code = zookeeper->tryMulti(ops, responses); - if (code && code != Coordination::ZNODEEXISTS) - throw Coordination::Exception(code); - - // ... + LOG_DEBUG(log, "CREATE TABLE STATEMENT " << statement); + // let's do dumb write to zk at the first iteration + current_zookeeper = getZooKeeper(); + current_zookeeper->createOrUpdate(replica_path + "/" + table_name, statement, zkutil::CreateMode::Persistent); } @@ -167,6 +147,14 @@ void DatabaseReplicated::renameTable( // replicated stuff; what to put to a znode // String statement = getObjectDefinitionFromCreateQuery(query); // this one is fairly more complex + current_zookeeper = getZooKeeper(); + + // no need for now to have stat + Coordination::Stat metadata_stat; + auto statement = current_zookeeper->get(replica_path + "/" + table_name, &metadata_stat); + current_zookeeper->createOrUpdate(replica_path + "/" + to_table_name, statement, zkutil::CreateMode::Persistent); + current_zookeeper->remove(replica_path + "/" + table_name); + // TODO add rename statement to the log } void DatabaseReplicated::dropTable( @@ -176,9 +164,10 @@ void DatabaseReplicated::dropTable( { // try DatabaseOnDisk::dropTable(context, table_name, no_delay); - // replicated stuff - //String statement = getObjectDefinitionFromCreateQuery(query); - // ... 
+ + // let's do dumb remove from zk at the first iteration + current_zookeeper = getZooKeeper(); + current_zookeeper->remove(replica_path + "/" + table_name); } void DatabaseReplicated::drop(const Context & context) From 948bd1c5cc3f069aa621055611b81f484de49dad Mon Sep 17 00:00:00 2001 From: Val Date: Thu, 30 Apr 2020 19:16:53 +0300 Subject: [PATCH 0009/2357] database replicated basic test (create and drop) --- .../01267_replicated_database_engine_zookeeper.sql | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100644 tests/queries/0_stateless/01267_replicated_database_engine_zookeeper.sql diff --git a/tests/queries/0_stateless/01267_replicated_database_engine_zookeeper.sql b/tests/queries/0_stateless/01267_replicated_database_engine_zookeeper.sql new file mode 100644 index 00000000000..94b461e2f93 --- /dev/null +++ b/tests/queries/0_stateless/01267_replicated_database_engine_zookeeper.sql @@ -0,0 +1,12 @@ +DROP DATABASE IF EXISTS test_db1; +DROP DATABASE IF EXISTS test_db2; +DROP TABLE IF EXISTS test_table1; +DROP TABLE IF EXISTS test_table2; + +CREATE DATABASE test_db1 ENGINE = Replicated('/clickhouse/databases/test1', 'id1'); +USE test_db1; +CREATE TABLE test_table1 (d Date, k UInt64, i32 Int32) ENGINE=ReplicatedMergeTree('/clickhouse/tables/test1', 'id1', d, k, 8192); + +CREATE DATABASE test_db2 ENGINE = Replicated('/clickhouse/databases/test1', 'id2'); +USE test_db2; +CREATE TABLE test_table2 (d Date, k UInt64, i32 Int32) ENGINE=ReplicatedMergeTree('/clickhouse/tables/test1', 'id2', d, k, 8192); From 0a4c1783a1ef45edc189e1cf19e2fdef1712e140 Mon Sep 17 00:00:00 2001 From: Val Date: Fri, 1 May 2020 16:16:02 +0300 Subject: [PATCH 0010/2357] Make drop work by fixing namespace bug data dir wasn't set right. now it's fixed. add non-replicated table to test sql --- src/Databases/DatabaseReplicated.cpp | 19 ++++++++++--------- ...7_replicated_database_engine_zookeeper.sql | 10 ++++------ 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index d6bbec24791..61bcfc8d5a9 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -71,22 +71,24 @@ DatabaseReplicated::DatabaseReplicated( const String & zookeeper_path_, const String & replica_name_, const Context & context_) - : DatabaseOrdinary(name_, metadata_path_, "data/", "DatabaseReplicated (" + name_ + ")", context_) + : DatabaseOrdinary(name_, metadata_path_, "data/" + escapeForFileName(name_) + "/", "DatabaseReplicated (" + name_ + ")", context_) , zookeeper_path(zookeeper_path_) , replica_name(replica_name_) { + LOG_DEBUG(log, "METADATA PATH ARGUMENT " << metadata_path_); + LOG_DEBUG(log, "METADATA PATH ACTUAL " << getMetadataPath()); if (!zookeeper_path.empty() && zookeeper_path.back() == '/') zookeeper_path.resize(zookeeper_path.size() - 1); - /// If zookeeper chroot prefix is used, path should start with '/', because chroot concatenates without it. + // If zookeeper chroot prefix is used, path should start with '/', because chroot concatenates without it. 
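+    // e.g. a user-supplied "clickhouse/databases/test1/" is normalized to "/clickhouse/databases/test1"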
if (!zookeeper_path.empty() && zookeeper_path.front() != '/') zookeeper_path = "/" + zookeeper_path; + replica_path = zookeeper_path + "/replicas/" + replica_name; if (context_.hasZooKeeper()) { current_zookeeper = context_.getZooKeeper(); } - if (!current_zookeeper) { throw Exception("Can't create replicated database without ZooKeeper", ErrorCodes::NO_ZOOKEEPER); @@ -95,6 +97,7 @@ DatabaseReplicated::DatabaseReplicated( } // test without this fancy mess (prob wont work) + // it works current_zookeeper->createAncestors(replica_path); current_zookeeper->createOrUpdate(replica_path, String(), zkutil::CreateMode::Persistent); @@ -172,12 +175,10 @@ void DatabaseReplicated::dropTable( void DatabaseReplicated::drop(const Context & context) { - DatabaseOnDisk::drop(context); - // replicated stuff - //String statement = getObjectDefinitionFromCreateQuery(query); - // should it be possible to recover after a drop. - // if not, we can just delete all the zookeeper nodes starting from - // zookeeper path. does it work recursively? hope so... + current_zookeeper = getZooKeeper(); + current_zookeeper->remove(replica_path); + + DatabaseOnDisk::drop(context); // no throw } // sync replica's zookeeper metadata diff --git a/tests/queries/0_stateless/01267_replicated_database_engine_zookeeper.sql b/tests/queries/0_stateless/01267_replicated_database_engine_zookeeper.sql index 94b461e2f93..c70de9a50d2 100644 --- a/tests/queries/0_stateless/01267_replicated_database_engine_zookeeper.sql +++ b/tests/queries/0_stateless/01267_replicated_database_engine_zookeeper.sql @@ -1,12 +1,10 @@ DROP DATABASE IF EXISTS test_db1; DROP DATABASE IF EXISTS test_db2; -DROP TABLE IF EXISTS test_table1; -DROP TABLE IF EXISTS test_table2; CREATE DATABASE test_db1 ENGINE = Replicated('/clickhouse/databases/test1', 'id1'); -USE test_db1; -CREATE TABLE test_table1 (d Date, k UInt64, i32 Int32) ENGINE=ReplicatedMergeTree('/clickhouse/tables/test1', 'id1', d, k, 8192); +CREATE TABLE test_db1.replicated_table (d Date, k UInt64, i32 Int32) ENGINE=ReplicatedMergeTree('/clickhouse/tables/test1', 'id1', d, k, 8192); +CREATE TABLE test_db1.basic_table (EventDate Date, CounterID Int) engine=MergeTree(EventDate, (CounterID, EventDate), 8192); CREATE DATABASE test_db2 ENGINE = Replicated('/clickhouse/databases/test1', 'id2'); -USE test_db2; -CREATE TABLE test_table2 (d Date, k UInt64, i32 Int32) ENGINE=ReplicatedMergeTree('/clickhouse/tables/test1', 'id2', d, k, 8192); +CREATE TABLE test_db2.replicated_table (d Date, k UInt64, i32 Int32) ENGINE=ReplicatedMergeTree('/clickhouse/tables/test1', 'id2', d, k, 8192); +CREATE TABLE test_db2.basic_table (EventDate Date, CounterID Int) engine=MergeTree(EventDate, (CounterID, EventDate), 8192); From 319256ef4f29b0e4d4d0f5034874961fbb64813d Mon Sep 17 00:00:00 2001 From: Val Date: Tue, 5 May 2020 17:16:59 +0300 Subject: [PATCH 0011/2357] an attempt to replicated create query from create query --- src/Databases/DatabaseReplicated.cpp | 198 +++++++++----------- src/Databases/DatabaseReplicated.h | 60 +++--- src/Databases/IDatabase.h | 4 + src/Interpreters/InterpreterCreateQuery.cpp | 15 +- 4 files changed, 143 insertions(+), 134 deletions(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 61bcfc8d5a9..a1eb910dedf 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -70,8 +71,11 @@ DatabaseReplicated::DatabaseReplicated( const String & 
metadata_path_, const String & zookeeper_path_, const String & replica_name_, - const Context & context_) - : DatabaseOrdinary(name_, metadata_path_, "data/" + escapeForFileName(name_) + "/", "DatabaseReplicated (" + name_ + ")", context_) + Context & context_) +// : DatabaseOrdinary(name_, metadata_path_, "data/" + escapeForFileName(name_) + "/", "DatabaseReplicated (" + name_ + ")", context_) + // TODO add constructor to Atomic and call it here with path and logger name specification + // TODO ask why const and & are ommited in Atomic + : DatabaseAtomic(name_, metadata_path_, context_) , zookeeper_path(zookeeper_path_) , replica_name(replica_name_) { @@ -96,115 +100,97 @@ DatabaseReplicated::DatabaseReplicated( } - // test without this fancy mess (prob wont work) - // it works - current_zookeeper->createAncestors(replica_path); - current_zookeeper->createOrUpdate(replica_path, String(), zkutil::CreateMode::Persistent); + current_zookeeper->createAncestors(zookeeper_path); + current_zookeeper->createOrUpdate(zookeeper_path, String(), zkutil::CreateMode::Persistent); -// if (!current_zookeeper->exists(zookeeper_path)) { -// -// LOG_DEBUG(log, "Creating database " << zookeeper_path); -// current_zookeeper->createAncestors(zookeeper_path); - - // Coordination::Requests ops; - // ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path, "", - // zkutil::CreateMode::Persistent)); - // ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/replicas", "", - // zkutil::CreateMode::Persistent)); - - // Coordination::Responses responses; - // auto code = current_zookeeper->tryMulti(ops, responses); - // if (code && code != Coordination::ZNODEEXISTS) - // throw Coordination::Exception(code); - // } -} - -void DatabaseReplicated::createTable( - const Context & context, - const String & table_name, - const StoragePtr & table, - const ASTPtr & query) -{ - // try? 
- DatabaseOnDisk::createTable(context, table_name, table, query); - - // suppose it worked - String statement = getObjectDefinitionFromCreateQuery(query); - LOG_DEBUG(log, "CREATE TABLE STATEMENT " << statement); - - // let's do dumb write to zk at the first iteration - current_zookeeper = getZooKeeper(); - current_zookeeper->createOrUpdate(replica_path + "/" + table_name, statement, zkutil::CreateMode::Persistent); + // TODO launch a worker here } -void DatabaseReplicated::renameTable( - const Context & context, - const String & table_name, - IDatabase & to_database, - const String & to_table_name, - bool exchange) -{ - // try - DatabaseOnDisk::renameTable(context, table_name, to_database, to_table_name, exchange); - // replicated stuff; what to put to a znode - // String statement = getObjectDefinitionFromCreateQuery(query); - // this one is fairly more complex - current_zookeeper = getZooKeeper(); - - // no need for now to have stat - Coordination::Stat metadata_stat; - auto statement = current_zookeeper->get(replica_path + "/" + table_name, &metadata_stat); - current_zookeeper->createOrUpdate(replica_path + "/" + to_table_name, statement, zkutil::CreateMode::Persistent); - current_zookeeper->remove(replica_path + "/" + table_name); - // TODO add rename statement to the log +void DatabaseReplicated::propose(const ASTPtr & query) { + LOG_DEBUG(log, "PROPOSING\n" << queryToString(query)); } -void DatabaseReplicated::dropTable( - const Context & context, - const String & table_name, - bool no_delay) -{ - // try - DatabaseOnDisk::dropTable(context, table_name, no_delay); - - // let's do dumb remove from zk at the first iteration - current_zookeeper = getZooKeeper(); - current_zookeeper->remove(replica_path + "/" + table_name); -} - -void DatabaseReplicated::drop(const Context & context) -{ - current_zookeeper = getZooKeeper(); - current_zookeeper->remove(replica_path); - - DatabaseOnDisk::drop(context); // no throw -} - -// sync replica's zookeeper metadata -void DatabaseReplicated::syncReplicaState(Context & context) { - auto c = context; // fixes unuser parameter error - return; -} - -// get the up to date metadata from zookeeper to local metadata dir -// for replicated (only?) 
tables -void DatabaseReplicated::updateMetadata(Context & context) { - auto c = context; // fixes unuser parameter error - return; -} - -void DatabaseReplicated::loadStoredObjects( - Context & context, - bool has_force_restore_data_flag) -{ - syncReplicaState(context); - updateMetadata(context); - - DatabaseOrdinary::loadStoredObjects(context, has_force_restore_data_flag); - -} - - +// void DatabaseReplicated::createTable( +// const Context & context, +// const String & table_name, +// const StoragePtr & table, +// const ASTPtr & query) +// { +// LOG_DEBUG(log, "CREATE TABLE"); +// +// +// DatabaseOnDisk::createTable(context, table_name, table, query); +// +// // String statement = getObjectDefinitionFromCreateQuery(query); +// +// // current_zookeeper = getZooKeeper(); +// // current_zookeeper->createOrUpdate(replica_path + "/" + table_name + ".sql", statement, zkutil::CreateMode::Persistent); +// return; +// } +// +// +// void DatabaseReplicated::renameTable( +// const Context & context, +// const String & table_name, +// IDatabase & to_database, +// const String & to_table_name, +// bool exchange) +// { +// LOG_DEBUG(log, "RENAME TABLE"); +// DatabaseAtomic::renameTable(context, table_name, to_database, to_table_name, exchange); +// // try +// // DatabaseOnDisk::renameTable(context, table_name, to_database, to_table_name, exchange); +// // replicated stuff; what to put to a znode +// // String statement = getObjectDefinitionFromCreateQuery(query); +// // this one is fairly more complex +// // current_zookeeper = getZooKeeper(); +// +// // no need for now to have stat +// // Coordination::Stat metadata_stat; +// // auto statement = current_zookeeper->get(replica_path + "/" + table_name, &metadata_stat); +// // current_zookeeper->createOrUpdate(replica_path + "/" + to_table_name, statement, zkutil::CreateMode::Persistent); +// // current_zookeeper->remove(replica_path + "/" + table_name); +// // TODO add rename statement to the log +// return; +// } +// +// void DatabaseReplicated::dropTable( +// const Context & context, +// const String & table_name, +// bool no_delay) +// { +// LOG_DEBUG(log, "DROP TABLE"); +// DatabaseAtomic::dropTable(context, table_name, no_delay); +// // try +// // DatabaseOnDisk::dropTable(context, table_name, no_delay); +// +// // let's do dumb remove from zk at the first iteration +// // current_zookeeper = getZooKeeper(); +// // current_zookeeper->remove(replica_path + "/" + table_name); +// return; +// } +// +// void DatabaseReplicated::drop(const Context & context) +// { +// LOG_DEBUG(log, "DROP"); +// DatabaseAtomic::drop(context); +// // current_zookeeper = getZooKeeper(); +// // current_zookeeper->remove(replica_path); +// +// // DatabaseOnDisk::drop(context); // no throw +// return; +// } +// +// void DatabaseReplicated::loadStoredObjects( +// Context & context, +// bool has_force_restore_data_flag) +// { +// DatabaseOrdinary::loadStoredObjects(context, has_force_restore_data_flag); +// // launch a worker maybe. 
i don't know +// // DatabaseAtomic::loadStoredObjects(context, has_force_restore_data_flag); +// +// return; +// } } diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index bc1af923277..df6f86c1491 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include @@ -11,36 +11,47 @@ namespace DB * that contain declaration of table represented by SQL ATTACH TABLE query * and operation log in zookeeper */ -class DatabaseReplicated : public DatabaseOrdinary +class DatabaseReplicated : public DatabaseAtomic { public: - DatabaseReplicated(const String & name_, const String & metadata_path_, const String & zookeeper_path_, const String & replica_name_, const Context & context); + DatabaseReplicated(const String & name_, const String & metadata_path_, const String & zookeeper_path_, const String & replica_name_, Context & context); + +// void drop(const Context & context) override; String getEngineName() const override { return "Replicated"; } - void createTable( - const Context & context, - const String & table_name, - const StoragePtr & table, - const ASTPtr & query) override; + void propose(const ASTPtr & query) override; - void dropTable( - const Context & context, - const String & table_name, - bool no_delay) override; +// void createTable( +// const Context & context, +// const String & table_name, +// const StoragePtr & table, +// const ASTPtr & query) override; +// +// void dropTable( +// const Context & context, +// const String & table_name, +// bool no_delay) override; +// +// void renameTable( +// const Context & context, +// const String & table_name, +// IDatabase & to_database, +// const String & to_table_name, +// bool exchange) override; +// +// void alterTable( +// const Context & context, +// const StorageID & table_id, +// const StorageInMemoryMetadata & metadata) override; - void renameTable( - const Context & context, - const String & table_name, - IDatabase & to_database, - const String & to_table_name, - bool exchange) override; +// void attachTable(const String & name, const StoragePtr & table, const String & relative_table_path) override; +// +// StoragePtr detachTable(const String & name) override; - void drop(const Context & context) override; - - void loadStoredObjects( - Context & context, - bool has_force_restore_data_flag) override; +// void loadStoredObjects( +// Context & context, +// bool has_force_restore_data_flag) override; private: String zookeeper_path; @@ -54,9 +65,6 @@ private: zkutil::ZooKeeperPtr getZooKeeper() const; void setZooKeeper(zkutil::ZooKeeperPtr zookeeper); - void syncReplicaState(Context & context); - - void updateMetadata(Context & context); }; } diff --git a/src/Databases/IDatabase.h b/src/Databases/IDatabase.h index 26b27045be6..18265b153cf 100644 --- a/src/Databases/IDatabase.h +++ b/src/Databases/IDatabase.h @@ -161,6 +161,10 @@ public: /// Is the database empty. virtual bool empty() const = 0; + virtual void propose(const ASTPtr & /*query*/) { + throw Exception("There is no propose query method for Database" + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); + } + /// Add the table to the database. Record its presence in the metadata. 
virtual void createTable( const Context & /*context*/, diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 3e09d728c4c..99c021a72fa 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -622,7 +622,7 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, if (need_add_to_database) { database = DatabaseCatalog::instance().getDatabase(create.database); - if (database->getEngineName() == "Atomic") + if (database->getEngineName() == "Atomic" || database->getEngineName() == "Replicated") { /// TODO implement ATTACH FROM 'path/to/data': generate UUID and move table data to store/ if (create.attach && create.uuid == UUIDHelpers::Nil) @@ -696,7 +696,18 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, false); } - database->createTable(context, table_name, res, query_ptr); + + if (database->getEngineName() == "Replicated") { + // propose + // try to + database->propose(query_ptr); + database->createTable(context, table_name, res, query_ptr); + // catch + // throw and remove proposal + // otherwise + // proceed (commit to zk) + } else + database->createTable(context, table_name, res, query_ptr); /// We must call "startup" and "shutdown" while holding DDLGuard. /// Because otherwise method "shutdown" (from InterpreterDropQuery) can be called before startup From 0a860c0c2ba760bf8c6ea45378acc0f00cb2bcff Mon Sep 17 00:00:00 2001 From: Val Date: Mon, 11 May 2020 15:55:17 +0300 Subject: [PATCH 0012/2357] log based replicated --- src/Databases/DatabaseReplicated.cpp | 177 ++++++++++---------- src/Databases/DatabaseReplicated.h | 57 +++---- src/Interpreters/ClientInfo.h | 1 + src/Interpreters/Context.h | 3 + src/Interpreters/DDLWorker.cpp | 3 +- src/Interpreters/InterpreterAlterQuery.cpp | 9 + src/Interpreters/InterpreterCreateQuery.cpp | 4 +- src/Interpreters/InterpreterDropQuery.cpp | 6 + src/Interpreters/InterpreterRenameQuery.cpp | 6 +- 9 files changed, 142 insertions(+), 124 deletions(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index a1eb910dedf..1bc954bfb76 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -6,11 +6,13 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include #include @@ -24,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -33,8 +36,10 @@ #include #include #include +#include #include +#include namespace DB { @@ -75,13 +80,11 @@ DatabaseReplicated::DatabaseReplicated( // : DatabaseOrdinary(name_, metadata_path_, "data/" + escapeForFileName(name_) + "/", "DatabaseReplicated (" + name_ + ")", context_) // TODO add constructor to Atomic and call it here with path and logger name specification // TODO ask why const and & are ommited in Atomic - : DatabaseAtomic(name_, metadata_path_, context_) + : DatabaseOrdinary(name_, metadata_path_, context_) + , context(context_) , zookeeper_path(zookeeper_path_) , replica_name(replica_name_) { - LOG_DEBUG(log, "METADATA PATH ARGUMENT " << metadata_path_); - LOG_DEBUG(log, "METADATA PATH ACTUAL " << getMetadataPath()); - if (!zookeeper_path.empty() && zookeeper_path.back() == '/') zookeeper_path.resize(zookeeper_path.size() - 1); // If zookeeper chroot prefix is used, path should start with '/', because chroot concatenates without it. 
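For orientation, the hunks below add the log-based replication flow of this patch: propose() takes a ZooKeeper lock, writes the DDL query as the next numbered "log.<n>" znode under the shared zookeeper_path and bumps "last_entry", while a background thread on every replica polls "last_entry" and replays any new entries through executeQuery(). A minimal usage sketch of the engine, reusing the ZooKeeper path, replica names and table definition from the tests introduced later in this series (illustrative only, not part of the diff):

    -- on the first server
    CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'test1');
    -- on a second server: same ZooKeeper path, different replica name
    CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'test2');
    -- DDL issued on either replica is appended to the shared log and replayed on the other
    CREATE TABLE testdb.replicated_table (d Date, k UInt64, i32 Int32) ENGINE = ReplicatedMergeTree(d, k, 8192);
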
@@ -103,94 +106,96 @@ DatabaseReplicated::DatabaseReplicated( current_zookeeper->createAncestors(zookeeper_path); current_zookeeper->createOrUpdate(zookeeper_path, String(), zkutil::CreateMode::Persistent); + // TODO if no last_entry then make it equal to 0 in zk; + // TODO launch a worker here + + main_thread = ThreadFromGlobalPool(&DatabaseReplicated::runMainThread, this); +} + +DatabaseReplicated::~DatabaseReplicated() +{ + stop_flag = true; + main_thread.join(); +} + +void DatabaseReplicated::runMainThread() { + setThreadName("ReplctdWorker"); // ok whatever. 15 bytes // + database_name); + LOG_DEBUG(log, "Started " << database_name << " database worker thread\n Replica: " << replica_name); + + while (!stop_flag) { + attachToThreadGroup(); + + sleepForSeconds(10); + current_zookeeper = getZooKeeper(); + String last_n = current_zookeeper->get(zookeeper_path + "/last_entry", {}, NULL); + size_t last_n_parsed = parse(last_n); + while (current_log_entry_n < last_n_parsed) { + current_log_entry_n++; + executeLog(current_log_entry_n); + } + break; // debug purpose + } +} + +void DatabaseReplicated::executeLog(size_t n) { + + LOG_DEBUG(log, "EXECUTING LOG! DB: " << database_name << "\n Replica: " << replica_name << "LOG N" << n); + current_context = std::make_unique(context); + current_context->from_replicated_log = true; + current_context->setCurrentQueryId(""); // generate random query_id + current_zookeeper = getZooKeeper(); + + String query_to_execute = current_zookeeper->get(zookeeper_path + "/log." + std::to_string(n), {}, NULL); + ReadBufferFromString istr(query_to_execute); + String dummy_string; + WriteBufferFromString ostr(dummy_string); + executeQuery(istr, ostr, false, context, {}); +} + +// TODO we might not need it here at all +void DatabaseReplicated::attachToThreadGroup() { + if (thread_group) + { + /// Put all threads to one thread pool + CurrentThread::attachToIfDetached(thread_group); + } + else + { + CurrentThread::initializeQuery(); + thread_group = CurrentThread::getGroup(); + } +} + +// taken from ddlworker +static std::unique_ptr createSimpleZooKeeperLock( + const std::shared_ptr & zookeeper, const String & lock_prefix, const String & lock_name, const String & lock_message) +{ + auto zookeeper_holder = std::make_shared(); + zookeeper_holder->initFromInstance(zookeeper); + return std::make_unique(std::move(zookeeper_holder), lock_prefix, lock_name, lock_message); } void DatabaseReplicated::propose(const ASTPtr & query) { + // TODO if source is zk then omit propose. Throw? + + // TODO remove that log message i think LOG_DEBUG(log, "PROPOSING\n" << queryToString(query)); + + current_zookeeper = getZooKeeper(); + auto lock = createSimpleZooKeeperLock(current_zookeeper, zookeeper_path, "lock", replica_name); + + // TODO check that last_entry is the same as current_log_entry_n for the replica + + current_log_entry_n++; // starting from 1 + String log_entry = zookeeper_path + "/log." + std::to_string(current_log_entry_n); + current_zookeeper->createOrUpdate(log_entry, queryToString(query), zkutil::CreateMode::Persistent); + + current_zookeeper->createOrUpdate(zookeeper_path + "/last_entry", std::to_string(current_log_entry_n), zkutil::CreateMode::Persistent); + + lock->unlock(); + // write to metastore the last entry? 
} -// void DatabaseReplicated::createTable( -// const Context & context, -// const String & table_name, -// const StoragePtr & table, -// const ASTPtr & query) -// { -// LOG_DEBUG(log, "CREATE TABLE"); -// -// -// DatabaseOnDisk::createTable(context, table_name, table, query); -// -// // String statement = getObjectDefinitionFromCreateQuery(query); -// -// // current_zookeeper = getZooKeeper(); -// // current_zookeeper->createOrUpdate(replica_path + "/" + table_name + ".sql", statement, zkutil::CreateMode::Persistent); -// return; -// } -// -// -// void DatabaseReplicated::renameTable( -// const Context & context, -// const String & table_name, -// IDatabase & to_database, -// const String & to_table_name, -// bool exchange) -// { -// LOG_DEBUG(log, "RENAME TABLE"); -// DatabaseAtomic::renameTable(context, table_name, to_database, to_table_name, exchange); -// // try -// // DatabaseOnDisk::renameTable(context, table_name, to_database, to_table_name, exchange); -// // replicated stuff; what to put to a znode -// // String statement = getObjectDefinitionFromCreateQuery(query); -// // this one is fairly more complex -// // current_zookeeper = getZooKeeper(); -// -// // no need for now to have stat -// // Coordination::Stat metadata_stat; -// // auto statement = current_zookeeper->get(replica_path + "/" + table_name, &metadata_stat); -// // current_zookeeper->createOrUpdate(replica_path + "/" + to_table_name, statement, zkutil::CreateMode::Persistent); -// // current_zookeeper->remove(replica_path + "/" + table_name); -// // TODO add rename statement to the log -// return; -// } -// -// void DatabaseReplicated::dropTable( -// const Context & context, -// const String & table_name, -// bool no_delay) -// { -// LOG_DEBUG(log, "DROP TABLE"); -// DatabaseAtomic::dropTable(context, table_name, no_delay); -// // try -// // DatabaseOnDisk::dropTable(context, table_name, no_delay); -// -// // let's do dumb remove from zk at the first iteration -// // current_zookeeper = getZooKeeper(); -// // current_zookeeper->remove(replica_path + "/" + table_name); -// return; -// } -// -// void DatabaseReplicated::drop(const Context & context) -// { -// LOG_DEBUG(log, "DROP"); -// DatabaseAtomic::drop(context); -// // current_zookeeper = getZooKeeper(); -// // current_zookeeper->remove(replica_path); -// -// // DatabaseOnDisk::drop(context); // no throw -// return; -// } -// -// void DatabaseReplicated::loadStoredObjects( -// Context & context, -// bool has_force_restore_data_flag) -// { -// DatabaseOrdinary::loadStoredObjects(context, has_force_restore_data_flag); -// // launch a worker maybe. i don't know -// // DatabaseAtomic::loadStoredObjects(context, has_force_restore_data_flag); -// -// return; -// } - } diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index df6f86c1491..d61f0a00ef8 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -1,9 +1,12 @@ #pragma once -#include +#include #include #include +#include +#include + namespace DB { /** Replicated database engine. 
@@ -11,49 +14,35 @@ namespace DB * that contain declaration of table represented by SQL ATTACH TABLE query * and operation log in zookeeper */ -class DatabaseReplicated : public DatabaseAtomic +class DatabaseReplicated : public DatabaseOrdinary { public: DatabaseReplicated(const String & name_, const String & metadata_path_, const String & zookeeper_path_, const String & replica_name_, Context & context); -// void drop(const Context & context) override; + ~DatabaseReplicated(); String getEngineName() const override { return "Replicated"; } void propose(const ASTPtr & query) override; -// void createTable( -// const Context & context, -// const String & table_name, -// const StoragePtr & table, -// const ASTPtr & query) override; -// -// void dropTable( -// const Context & context, -// const String & table_name, -// bool no_delay) override; -// -// void renameTable( -// const Context & context, -// const String & table_name, -// IDatabase & to_database, -// const String & to_table_name, -// bool exchange) override; -// -// void alterTable( -// const Context & context, -// const StorageID & table_id, -// const StorageInMemoryMetadata & metadata) override; - -// void attachTable(const String & name, const StoragePtr & table, const String & relative_table_path) override; -// -// StoragePtr detachTable(const String & name) override; - -// void loadStoredObjects( -// Context & context, -// bool has_force_restore_data_flag) override; - private: + + void runMainThread(); + void runCleanupThread(); + + void attachToThreadGroup(); + + void executeLog(size_t n); + + Context & context; // is it overkiill? + std::unique_ptr current_context; // to run executeQuery + + size_t current_log_entry_n = 0; + std::atomic stop_flag{false}; + + ThreadFromGlobalPool main_thread; + ThreadGroupStatusPtr thread_group; + String zookeeper_path; String replica_name; String replica_path; diff --git a/src/Interpreters/ClientInfo.h b/src/Interpreters/ClientInfo.h index 704fba3b3ef..2dff30e40a2 100644 --- a/src/Interpreters/ClientInfo.h +++ b/src/Interpreters/ClientInfo.h @@ -38,6 +38,7 @@ public: NO_QUERY = 0, /// Uninitialized object. INITIAL_QUERY = 1, SECONDARY_QUERY = 2, /// Query that was initiated by another query for distributed or ON CLUSTER query execution. + REPLICATED_LOG_QUERY = 3, /// TODO add comment }; diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index 5a4e959229f..66ea6f6914c 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -214,6 +214,9 @@ private: Context(); public: + ///testing + bool from_replicated_log = false; + /// Create initial Context with ContextShared and etc. 
static Context createGlobal(ContextShared * shared); static SharedContextHolder createShared(); diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index 28436f192b0..65f984924a3 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -585,7 +585,8 @@ bool DDLWorker::tryExecuteQuery(const String & query, const DDLTask & task, Exec try { current_context = std::make_unique(context); - current_context->getClientInfo().query_kind = ClientInfo::QueryKind::SECONDARY_QUERY; + //current_context->getClientInfo().query_kind = ClientInfo::QueryKind::SECONDARY_QUERY; + current_context->from_replicated_log = true; current_context->setCurrentQueryId(""); // generate random query_id executeQuery(istr, ostr, false, *current_context, {}); } diff --git a/src/Interpreters/InterpreterAlterQuery.cpp b/src/Interpreters/InterpreterAlterQuery.cpp index 61277b8160c..ad79bd68fed 100644 --- a/src/Interpreters/InterpreterAlterQuery.cpp +++ b/src/Interpreters/InterpreterAlterQuery.cpp @@ -15,6 +15,8 @@ #include #include #include +#include +#include namespace DB @@ -37,6 +39,7 @@ BlockIO InterpreterAlterQuery::execute() { const auto & alter = query_ptr->as(); + if (!alter.cluster.empty()) return executeDDLQueryOnCluster(query_ptr, context, getRequiredAccess()); @@ -46,6 +49,12 @@ BlockIO InterpreterAlterQuery::execute() auto alter_lock = table->lockForAlter(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); auto metadata_snapshot = table->getInMemoryMetadataPtr(); + // TODO it's dirty. need to add database to parsing stage + DatabasePtr database = DatabaseCatalog::instance().getDatabase(table_id.database_name); + if (database->getEngineName() == "Replicated" && !context.from_replicated_log) { + database->propose(query_ptr); + } + /// Add default database to table identifiers that we can encounter in e.g. default expressions, /// mutation expression, etc. 
AddDefaultDatabaseVisitor visitor(table_id.getDatabaseName()); diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 99c021a72fa..5698c370fa1 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -622,7 +622,7 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, if (need_add_to_database) { database = DatabaseCatalog::instance().getDatabase(create.database); - if (database->getEngineName() == "Atomic" || database->getEngineName() == "Replicated") + if (database->getEngineName() == "Atomic") // || database->getEngineName() == "Replicated") { /// TODO implement ATTACH FROM 'path/to/data': generate UUID and move table data to store/ if (create.attach && create.uuid == UUIDHelpers::Nil) @@ -697,7 +697,7 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, } - if (database->getEngineName() == "Replicated") { + if (database->getEngineName() == "Replicated" && !context.from_replicated_log) { // propose // try to database->propose(query_ptr); diff --git a/src/Interpreters/InterpreterDropQuery.cpp b/src/Interpreters/InterpreterDropQuery.cpp index e6853a8af4c..bae1b796016 100644 --- a/src/Interpreters/InterpreterDropQuery.cpp +++ b/src/Interpreters/InterpreterDropQuery.cpp @@ -97,6 +97,9 @@ BlockIO InterpreterDropQuery::executeToTable( if (database->getEngineName() != "Atomic") table_lock = table->lockExclusively(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); /// Drop table from memory, don't touch data and metadata + if (database->getEngineName() == "Replicated" && !context.from_replicated_log) { + database->propose(query_ptr); + } database->detachTable(table_id.table_name); } else if (query.kind == ASTDropQuery::Kind::Truncate) @@ -120,6 +123,9 @@ BlockIO InterpreterDropQuery::executeToTable( if (database->getEngineName() != "Atomic") table_lock = table->lockExclusively(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); + if (database->getEngineName() == "Replicated" && !context.from_replicated_log) { + database->propose(query_ptr); + } database->dropTable(context, table_id.table_name, query.no_delay); } } diff --git a/src/Interpreters/InterpreterRenameQuery.cpp b/src/Interpreters/InterpreterRenameQuery.cpp index de2b6bb0c1c..d93b14a6bc2 100644 --- a/src/Interpreters/InterpreterRenameQuery.cpp +++ b/src/Interpreters/InterpreterRenameQuery.cpp @@ -80,7 +80,11 @@ BlockIO InterpreterRenameQuery::execute() if (!rename.exchange) database_catalog.assertTableDoesntExist(StorageID(elem.to_database_name, elem.to_table_name), context); - database_catalog.getDatabase(elem.from_database_name)->renameTable( + DatabasePtr database = database_catalog.getDatabase(elem.from_database_name); + if (database->getEngineName() == "Replicated" && !context.from_replicated_log) { + database->propose(query_ptr); + } + database->renameTable( context, elem.from_table_name, *database_catalog.getDatabase(elem.to_database_name), From 5eea58039c6f78a93eabd65792e8ed5c47615127 Mon Sep 17 00:00:00 2001 From: Val Date: Mon, 11 May 2020 16:31:14 +0300 Subject: [PATCH 0013/2357] fix not initialized last entry in zk --- src/Databases/DatabaseReplicated.cpp | 14 ++++++++------ src/Databases/DatabaseReplicated.h | 2 +- src/Interpreters/DDLWorker.cpp | 3 +-- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 1bc954bfb76..36c95f68c2c 100644 --- 
a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -99,8 +99,6 @@ DatabaseReplicated::DatabaseReplicated( if (!current_zookeeper) { throw Exception("Can't create replicated database without ZooKeeper", ErrorCodes::NO_ZOOKEEPER); - - } current_zookeeper->createAncestors(zookeeper_path); @@ -109,7 +107,6 @@ DatabaseReplicated::DatabaseReplicated( // TODO if no last_entry then make it equal to 0 in zk; // TODO launch a worker here - main_thread = ThreadFromGlobalPool(&DatabaseReplicated::runMainThread, this); } @@ -126,15 +123,20 @@ void DatabaseReplicated::runMainThread() { while (!stop_flag) { attachToThreadGroup(); - sleepForSeconds(10); + sleepForSeconds(2); current_zookeeper = getZooKeeper(); - String last_n = current_zookeeper->get(zookeeper_path + "/last_entry", {}, NULL); + String last_n; + if (!current_zookeeper->tryGet(zookeeper_path + "/last_entry", last_n, {}, NULL)) { + continue; + } size_t last_n_parsed = parse(last_n); + LOG_DEBUG(log, "PARSED " << last_n_parsed); + LOG_DEBUG(log, "LOCAL CURRENT " << current_log_entry_n); while (current_log_entry_n < last_n_parsed) { current_log_entry_n++; executeLog(current_log_entry_n); } - break; // debug purpose + // break; // debug purpose } } diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index d61f0a00ef8..7700d17d9e4 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -37,7 +37,7 @@ private: Context & context; // is it overkiill? std::unique_ptr current_context; // to run executeQuery - size_t current_log_entry_n = 0; + std::atomic current_log_entry_n = 0; std::atomic stop_flag{false}; ThreadFromGlobalPool main_thread; diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index 65f984924a3..28436f192b0 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -585,8 +585,7 @@ bool DDLWorker::tryExecuteQuery(const String & query, const DDLTask & task, Exec try { current_context = std::make_unique(context); - //current_context->getClientInfo().query_kind = ClientInfo::QueryKind::SECONDARY_QUERY; - current_context->from_replicated_log = true; + current_context->getClientInfo().query_kind = ClientInfo::QueryKind::SECONDARY_QUERY; current_context->setCurrentQueryId(""); // generate random query_id executeQuery(istr, ostr, false, *current_context, {}); } From d61259cd7b2f9f49c8a1e6da6a431a97d6616f45 Mon Sep 17 00:00:00 2001 From: Val Date: Tue, 12 May 2020 16:35:05 +0300 Subject: [PATCH 0014/2357] ddl replication works --- src/Databases/DatabaseReplicated.cpp | 23 ++++++++++++++++------- src/Databases/DatabaseReplicated.h | 1 - 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 36c95f68c2c..2c7f6facf71 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -81,7 +82,6 @@ DatabaseReplicated::DatabaseReplicated( // TODO add constructor to Atomic and call it here with path and logger name specification // TODO ask why const and & are ommited in Atomic : DatabaseOrdinary(name_, metadata_path_, context_) - , context(context_) , zookeeper_path(zookeeper_path_) , replica_name(replica_name_) { @@ -142,17 +142,26 @@ void DatabaseReplicated::runMainThread() { void DatabaseReplicated::executeLog(size_t n) { - LOG_DEBUG(log, "EXECUTING LOG! 
DB: " << database_name << "\n Replica: " << replica_name << "LOG N" << n); - current_context = std::make_unique(context); - current_context->from_replicated_log = true; - current_context->setCurrentQueryId(""); // generate random query_id current_zookeeper = getZooKeeper(); - String query_to_execute = current_zookeeper->get(zookeeper_path + "/log." + std::to_string(n), {}, NULL); ReadBufferFromString istr(query_to_execute); String dummy_string; WriteBufferFromString ostr(dummy_string); - executeQuery(istr, ostr, false, context, {}); + + try + { + current_context = std::make_unique(global_context); + current_context->from_replicated_log = true; + current_context->setCurrentQueryId(""); // generate random query_id + executeQuery(istr, ostr, false, *current_context, {}); + } + catch (...) + { + tryLogCurrentException(log, "Query " + query_to_execute + " wasn't finished successfully"); + + } + + LOG_DEBUG(log, "Executed query: " << query_to_execute); } // TODO we might not need it here at all diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index 7700d17d9e4..504be5a3ec5 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -34,7 +34,6 @@ private: void executeLog(size_t n); - Context & context; // is it overkiill? std::unique_ptr current_context; // to run executeQuery std::atomic current_log_entry_n = 0; From d7a354b24d20d2b78f91f5f745ded28e873a6b49 Mon Sep 17 00:00:00 2001 From: Val Date: Tue, 12 May 2020 17:25:36 +0300 Subject: [PATCH 0015/2357] create query fix for replicated dbs --- src/Databases/DatabaseReplicated.cpp | 1 + src/Interpreters/InterpreterCreateQuery.cpp | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 2c7f6facf71..e507894bd3e 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -152,6 +152,7 @@ void DatabaseReplicated::executeLog(size_t n) { { current_context = std::make_unique(global_context); current_context->from_replicated_log = true; + current_context->setCurrentDatabase(database_name); current_context->setCurrentQueryId(""); // generate random query_id executeQuery(istr, ostr, false, *current_context, {}); } diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 5698c370fa1..ed4095d63be 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -601,6 +601,11 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create) /// Set and retrieve list of columns, indices and constraints. Set table engine if needed. Rewrite query in canonical way. 
TableProperties properties = setProperties(create); + // testing + if (context.from_replicated_log) { + create.database = current_database; + } + /// Actually creates table bool created = doCreateTable(create, properties); if (!created) /// Table already exists From c0924b5911ce165166a66c8f0055b34ad7dbd2ed Mon Sep 17 00:00:00 2001 From: Val Date: Tue, 12 May 2020 17:55:24 +0300 Subject: [PATCH 0016/2357] create and alter test for replicated db --- ...icated_database_engine_zookeeper.reference | 34 ++++++++++++++++ ...9_replicated_database_engine_zookeeper.sql | 39 +++++++++++++++++++ 2 files changed, 73 insertions(+) create mode 100644 tests/queries/0_stateless/01269_replicated_database_engine_zookeeper.reference create mode 100644 tests/queries/0_stateless/01269_replicated_database_engine_zookeeper.sql diff --git a/tests/queries/0_stateless/01269_replicated_database_engine_zookeeper.reference b/tests/queries/0_stateless/01269_replicated_database_engine_zookeeper.reference new file mode 100644 index 00000000000..58f951b1257 --- /dev/null +++ b/tests/queries/0_stateless/01269_replicated_database_engine_zookeeper.reference @@ -0,0 +1,34 @@ +CounterID UInt32 +StartDate Date +UserID UInt32 +VisitID UInt32 +Added0 String +Added1 UInt32 +Added2 UInt32 +AddedNested1.A Array(UInt32) +AddedNested1.C Array(String) +AddedNested2.A Array(UInt32) +AddedNested2.B Array(UInt64) +CounterID UInt32 +StartDate Date +UserID UInt32 +VisitID UInt32 +Added0 String +Added1 UInt32 +Added2 UInt32 +AddedNested1.A Array(UInt32) +AddedNested1.C Array(String) +AddedNested2.A Array(UInt32) +AddedNested2.B Array(UInt64) +CounterID UInt32 +StartDate Date +UserID UInt32 +VisitID UInt32 +Added0 String +Added1 UInt32 +Added2 UInt32 +AddedNested1.A Array(UInt32) +AddedNested1.C Array(String) +AddedNested2.A Array(UInt32) +AddedNested2.B Array(UInt64) + diff --git a/tests/queries/0_stateless/01269_replicated_database_engine_zookeeper.sql b/tests/queries/0_stateless/01269_replicated_database_engine_zookeeper.sql new file mode 100644 index 00000000000..1acc9022014 --- /dev/null +++ b/tests/queries/0_stateless/01269_replicated_database_engine_zookeeper.sql @@ -0,0 +1,39 @@ +DROP DATABASE IF EXISTS rdbtest; +DROP DATABASE IF EXISTS replicatwo; +DROP DATABASE IF EXISTS replicathree; + +CREATE DATABASE rdbtest ENGINE = Replicated('/clickhouse/db/test1/', 'id1'); +CREATE DATABASE replicatwo ENGINE = Replicated('/clickhouse/db/test1/', 'id2'); +CREATE DATABASE replicathree ENGINE = Replicated('/clickhouse/db/test1/', 'id3'); + +USE rdbtest; + +CREATE TABLE alter_test (CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192); + +ALTER TABLE alter_test ADD COLUMN Added0 UInt32; +ALTER TABLE alter_test ADD COLUMN Added2 UInt32; +ALTER TABLE alter_test ADD COLUMN Added1 UInt32 AFTER Added0; + +ALTER TABLE alter_test ADD COLUMN AddedNested1 Nested(A UInt32, B UInt64) AFTER Added2; +ALTER TABLE alter_test ADD COLUMN AddedNested1.C Array(String) AFTER AddedNested1.B; +ALTER TABLE alter_test ADD COLUMN AddedNested2 Nested(A UInt32, B UInt64) AFTER AddedNested1; + +ALTER TABLE alter_test DROP COLUMN ToDrop; + +ALTER TABLE alter_test MODIFY COLUMN Added0 String; + +ALTER TABLE alter_test DROP COLUMN NestedColumn.A; +ALTER TABLE alter_test DROP COLUMN NestedColumn.S; + +ALTER TABLE alter_test DROP COLUMN AddedNested1.B; + +ALTER TABLE alter_test ADD COLUMN IF NOT EXISTS Added0 
UInt32; +ALTER TABLE alter_test ADD COLUMN IF NOT EXISTS AddedNested1 Nested(A UInt32, B UInt64); +ALTER TABLE alter_test ADD COLUMN IF NOT EXISTS AddedNested1.C Array(String); +ALTER TABLE alter_test MODIFY COLUMN IF EXISTS ToDrop UInt64; +ALTER TABLE alter_test DROP COLUMN IF EXISTS ToDrop; +ALTER TABLE alter_test COMMENT COLUMN IF EXISTS ToDrop 'new comment'; + +DESC TABLE rdbtest.alter_test; +DESC TABLE replicatwo.alter_test; +DESC TABLE replicathree.alter_test; From f103e24a09f475f4d66038b41667b63be01a94be Mon Sep 17 00:00:00 2001 From: Val Date: Wed, 13 May 2020 17:44:01 +0300 Subject: [PATCH 0017/2357] make db replicated inherited from atomic --- src/Databases/DatabaseReplicated.cpp | 6 ++---- src/Databases/DatabaseReplicated.h | 4 ++-- src/Databases/DatabasesCommon.cpp | 2 +- src/Interpreters/InterpreterCreateQuery.cpp | 18 ++++++++---------- src/Interpreters/InterpreterDropQuery.cpp | 9 +++++++-- 5 files changed, 20 insertions(+), 19 deletions(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index e507894bd3e..2b473c25ce2 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -81,7 +81,7 @@ DatabaseReplicated::DatabaseReplicated( // : DatabaseOrdinary(name_, metadata_path_, "data/" + escapeForFileName(name_) + "/", "DatabaseReplicated (" + name_ + ")", context_) // TODO add constructor to Atomic and call it here with path and logger name specification // TODO ask why const and & are ommited in Atomic - : DatabaseOrdinary(name_, metadata_path_, context_) + : DatabaseAtomic(name_, metadata_path_, context_) , zookeeper_path(zookeeper_path_) , replica_name(replica_name_) { @@ -122,8 +122,7 @@ void DatabaseReplicated::runMainThread() { while (!stop_flag) { attachToThreadGroup(); - - sleepForSeconds(2); + sleepForSeconds(1);// BURN CPU current_zookeeper = getZooKeeper(); String last_n; if (!current_zookeeper->tryGet(zookeeper_path + "/last_entry", last_n, {}, NULL)) { @@ -136,7 +135,6 @@ void DatabaseReplicated::runMainThread() { current_log_entry_n++; executeLog(current_log_entry_n); } - // break; // debug purpose } } diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index 504be5a3ec5..0cb0c57c808 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include @@ -14,7 +14,7 @@ namespace DB * that contain declaration of table represented by SQL ATTACH TABLE query * and operation log in zookeeper */ -class DatabaseReplicated : public DatabaseOrdinary +class DatabaseReplicated : public DatabaseAtomic { public: DatabaseReplicated(const String & name_, const String & metadata_path_, const String & zookeeper_path_, const String & replica_name_, Context & context); diff --git a/src/Databases/DatabasesCommon.cpp b/src/Databases/DatabasesCommon.cpp index 47c54fae800..7925d812241 100644 --- a/src/Databases/DatabasesCommon.cpp +++ b/src/Databases/DatabasesCommon.cpp @@ -98,7 +98,7 @@ void DatabaseWithOwnTablesBase::attachTableUnlocked(const String & table_name, c auto table_id = table->getStorageID(); if (table_id.hasUUID()) { - assert(getDatabaseName() == DatabaseCatalog::TEMPORARY_DATABASE || getEngineName() == "Atomic"); + assert(getDatabaseName() == DatabaseCatalog::TEMPORARY_DATABASE || getEngineName() == "Atomic" || getEngineName() == "Replicated"); DatabaseCatalog::instance().addUUIDMapping(table_id.uuid, shared_from_this(), table); } } diff --git 
a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index ed4095d63be..648e41327ba 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -627,7 +627,7 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, if (need_add_to_database) { database = DatabaseCatalog::instance().getDatabase(create.database); - if (database->getEngineName() == "Atomic") // || database->getEngineName() == "Replicated") + if (database->getEngineName() == "Atomic" || (database->getEngineName() == "Replicated" && !context.from_replicated_log)) { /// TODO implement ATTACH FROM 'path/to/data': generate UUID and move table data to store/ if (create.attach && create.uuid == UUIDHelpers::Nil) @@ -635,6 +635,11 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, if (!create.attach && create.uuid == UUIDHelpers::Nil) create.uuid = UUIDHelpers::generateV4(); } + else if (database->getEngineName() == "Replicated" && context.from_replicated_log) { + if (create.uuid == UUIDHelpers::Nil) + // change error to incorrect log or something + throw Exception("Table UUID is not specified in the replicated log", ErrorCodes::INCORRECT_QUERY); + } else { if (create.uuid != UUIDHelpers::Nil) @@ -703,16 +708,9 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, if (database->getEngineName() == "Replicated" && !context.from_replicated_log) { - // propose - // try to database->propose(query_ptr); - database->createTable(context, table_name, res, query_ptr); - // catch - // throw and remove proposal - // otherwise - // proceed (commit to zk) - } else - database->createTable(context, table_name, res, query_ptr); + } + database->createTable(context, table_name, res, query_ptr); /// We must call "startup" and "shutdown" while holding DDLGuard. /// Because otherwise method "shutdown" (from InterpreterDropQuery) can be called before startup diff --git a/src/Interpreters/InterpreterDropQuery.cpp b/src/Interpreters/InterpreterDropQuery.cpp index bae1b796016..e9221fc273c 100644 --- a/src/Interpreters/InterpreterDropQuery.cpp +++ b/src/Interpreters/InterpreterDropQuery.cpp @@ -93,8 +93,8 @@ BlockIO InterpreterDropQuery::executeToTable( { context.checkAccess(table->isView() ? AccessType::DROP_VIEW : AccessType::DROP_TABLE, table_id); table->shutdown(); - TableExclusiveLockHolder table_lock; - if (database->getEngineName() != "Atomic") + TableStructureWriteLockHolder table_lock; + if (database->getEngineName() != "Atomic" && database->getEngineName() != "Replicated") table_lock = table->lockExclusively(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); /// Drop table from memory, don't touch data and metadata if (database->getEngineName() == "Replicated" && !context.from_replicated_log) { @@ -119,8 +119,13 @@ BlockIO InterpreterDropQuery::executeToTable( table->shutdown(); +<<<<<<< HEAD TableExclusiveLockHolder table_lock; if (database->getEngineName() != "Atomic") +======= + TableStructureWriteLockHolder table_lock; + if (database->getEngineName() != "Atomic" && database->getEngineName() != "Replicated") +>>>>>>> 921e85e9c9... 
make db replicated inherited from atomic table_lock = table->lockExclusively(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); if (database->getEngineName() == "Replicated" && !context.from_replicated_log) { From 5e076b464ea79c4d27e38a55cfc141645ddc9884 Mon Sep 17 00:00:00 2001 From: Val Date: Wed, 13 May 2020 20:00:47 +0300 Subject: [PATCH 0018/2357] add replicated db snapshot, integration test, repl alter queries, etc add an option to create replicated tables within replicated db without specifying zk path and replica id add replicated sch pool disable replication of alter queries for replicated tables in replicated dbs snapshot prototype. amend of replicated db workflow add prototype of integration tests for replicated db --- src/Common/CurrentMetrics.cpp | 2 + src/Core/Settings.h | 1 + src/Databases/DatabaseLazy.cpp | 2 +- src/Databases/DatabaseLazy.h | 2 +- src/Databases/DatabaseOnDisk.h | 3 +- src/Databases/DatabaseOrdinary.cpp | 2 +- src/Databases/DatabaseOrdinary.h | 4 +- src/Databases/DatabaseReplicated.cpp | 93 ++++++++++++------- src/Databases/DatabaseReplicated.h | 16 ++-- src/Databases/DatabaseWithDictionaries.cpp | 2 +- src/Databases/DatabaseWithDictionaries.h | 2 +- src/Interpreters/Context.cpp | 18 ++++ src/Interpreters/Context.h | 1 + src/Interpreters/InterpreterAlterQuery.cpp | 2 +- .../MergeTree/registerStorageMergeTree.cpp | 35 ++++++- .../test_replicated_database/test.py | 38 ++++++++ 16 files changed, 166 insertions(+), 57 deletions(-) create mode 100644 tests/integration/test_replicated_database/test.py diff --git a/src/Common/CurrentMetrics.cpp b/src/Common/CurrentMetrics.cpp index 4bab9ef2844..36c65953a6f 100644 --- a/src/Common/CurrentMetrics.cpp +++ b/src/Common/CurrentMetrics.cpp @@ -14,6 +14,7 @@ M(BackgroundSchedulePoolTask, "Number of active tasks in BackgroundSchedulePool. This pool is used for periodic ReplicatedMergeTree tasks, like cleaning old data parts, altering data parts, replica re-initialization, etc.") \ M(BackgroundBufferFlushSchedulePoolTask, "Number of active tasks in BackgroundBufferFlushSchedulePool. This pool is used for periodic Buffer flushes") \ M(BackgroundDistributedSchedulePoolTask, "Number of active tasks in BackgroundDistributedSchedulePool. This pool is used for distributed sends that is done in background.") \ + M(BackgroundReplicatedSchedulePoolTask, "Number of active tasks in BackgroundReplicatedSchedulePoolTask. TODO.") \ M(CacheDictionaryUpdateQueueBatches, "Number of 'batches' (a set of keys) in update queue in CacheDictionaries.") \ M(CacheDictionaryUpdateQueueKeys, "Exact number of keys in update queue in CacheDictionaries.") \ M(DiskSpaceReservedForMerge, "Disk space reserved for currently running background merges. 
It is slightly more than the total size of currently merging parts.") \ @@ -38,6 +39,7 @@ M(MemoryTrackingInBackgroundSchedulePool, "Total amount of memory (bytes) allocated in background schedule pool (that is dedicated for bookkeeping tasks of Replicated tables).") \ M(MemoryTrackingInBackgroundBufferFlushSchedulePool, "Total amount of memory (bytes) allocated in background buffer flushes pool (that is dedicated for background buffer flushes).") \ M(MemoryTrackingInBackgroundDistributedSchedulePool, "Total amount of memory (bytes) allocated in background distributed schedule pool (that is dedicated for distributed sends).") \ + M(MemoryTrackingInBackgroundReplicatedSchedulePool, "Total amount of memory (bytes) allocated in replicated schedule pool (TODO).") \ M(MemoryTrackingForMerges, "Total amount of memory (bytes) allocated for background merges. Included in MemoryTrackingInBackgroundProcessingPool. Note that this value may include a drift when the memory was allocated in a context of background processing pool and freed in other context or vice-versa. This happens naturally due to caches for tables indexes and doesn't indicate memory leaks.") \ M(EphemeralNode, "Number of ephemeral nodes hold in ZooKeeper.") \ M(ZooKeeperSession, "Number of sessions (connections) to ZooKeeper. Should be no more than one, because using more than one connection to ZooKeeper may lead to bugs due to lack of linearizability (stale reads) that ZooKeeper consistency model allows.") \ diff --git a/src/Core/Settings.h b/src/Core/Settings.h index f434132eccd..ea950afa70a 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -87,6 +87,7 @@ struct Settings : public SettingsCollection M(SettingUInt64, background_move_pool_size, 8, "Number of threads performing background moves for tables. Only has meaning at server startup.", 0) \ M(SettingUInt64, background_schedule_pool_size, 16, "Number of threads performing background tasks for replicated tables, kafka streaming, dns cache updates. Only has meaning at server startup.", 0) \ M(SettingUInt64, background_distributed_schedule_pool_size, 16, "Number of threads performing background tasks for distributed sends. Only has meaning at server startup.", 0) \ + M(SettingUInt64, background_replicated_schedule_pool_size, 16, "Number of threads performing background tasks in replicated databases. 
Only has meaning at server startup.", 0) \ \ M(SettingMilliseconds, distributed_directory_monitor_sleep_time_ms, 100, "Sleep time for StorageDistributed DirectoryMonitors, in case of any errors delay grows exponentially.", 0) \ M(SettingMilliseconds, distributed_directory_monitor_max_sleep_time_ms, 30000, "Maximum sleep time for StorageDistributed DirectoryMonitors, it limits exponential growth too.", 0) \ diff --git a/src/Databases/DatabaseLazy.cpp b/src/Databases/DatabaseLazy.cpp index 11e5272110e..d1a6c191bfc 100644 --- a/src/Databases/DatabaseLazy.cpp +++ b/src/Databases/DatabaseLazy.cpp @@ -27,7 +27,7 @@ namespace ErrorCodes } -DatabaseLazy::DatabaseLazy(const String & name_, const String & metadata_path_, time_t expiration_time_, const Context & context_) +DatabaseLazy::DatabaseLazy(const String & name_, const String & metadata_path_, time_t expiration_time_, Context & context_) : DatabaseOnDisk(name_, metadata_path_, "data/" + escapeForFileName(name_) + "/", "DatabaseLazy (" + name_ + ")", context_) , expiration_time(expiration_time_) { diff --git a/src/Databases/DatabaseLazy.h b/src/Databases/DatabaseLazy.h index 2e24b687be5..adda103a21e 100644 --- a/src/Databases/DatabaseLazy.h +++ b/src/Databases/DatabaseLazy.h @@ -18,7 +18,7 @@ class Context; class DatabaseLazy final : public DatabaseOnDisk { public: - DatabaseLazy(const String & name_, const String & metadata_path_, time_t expiration_time_, const Context & context_); + DatabaseLazy(const String & name_, const String & metadata_path_, time_t expiration_time_, Context & context_); String getEngineName() const override { return "Lazy"; } diff --git a/src/Databases/DatabaseOnDisk.h b/src/Databases/DatabaseOnDisk.h index d4fb9b2aa17..dc347c99542 100644 --- a/src/Databases/DatabaseOnDisk.h +++ b/src/Databases/DatabaseOnDisk.h @@ -31,7 +31,7 @@ String getObjectDefinitionFromCreateQuery(const ASTPtr & query); class DatabaseOnDisk : public DatabaseWithOwnTablesBase { public: - DatabaseOnDisk(const String & name, const String & metadata_path_, const String & data_path_, const String & logger, const Context & context); + DatabaseOnDisk(const String & name, const String & metadata_path_, const String & data_path_, const String & logger, Context & context); void createTable( const Context & context, @@ -86,6 +86,7 @@ protected: const String metadata_path; const String data_path; + Context & global_context; }; } diff --git a/src/Databases/DatabaseOrdinary.cpp b/src/Databases/DatabaseOrdinary.cpp index 9194558dffb..2f4f584b091 100644 --- a/src/Databases/DatabaseOrdinary.cpp +++ b/src/Databases/DatabaseOrdinary.cpp @@ -94,7 +94,7 @@ namespace } -DatabaseOrdinary::DatabaseOrdinary(const String & name_, const String & metadata_path_, const Context & context_) +DatabaseOrdinary::DatabaseOrdinary(const String & name_, const String & metadata_path_, Context & context_) : DatabaseOrdinary(name_, metadata_path_, "data/" + escapeForFileName(name_) + "/", "DatabaseOrdinary (" + name_ + ")", context_) { } diff --git a/src/Databases/DatabaseOrdinary.h b/src/Databases/DatabaseOrdinary.h index a9e53edfe28..4767ccdc123 100644 --- a/src/Databases/DatabaseOrdinary.h +++ b/src/Databases/DatabaseOrdinary.h @@ -14,8 +14,8 @@ namespace DB class DatabaseOrdinary : public DatabaseWithDictionaries { public: - DatabaseOrdinary(const String & name_, const String & metadata_path_, const Context & context); - DatabaseOrdinary(const String & name_, const String & metadata_path_, const String & data_path_, const String & logger, const Context & context_); + 
DatabaseOrdinary(const String & name_, const String & metadata_path_, Context & context); + DatabaseOrdinary(const String & name_, const String & metadata_path_, const String & data_path_, const String & logger, Context & context_); String getEngineName() const override { return "Ordinary"; } diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 2b473c25ce2..9dd8530fc46 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -101,43 +101,58 @@ DatabaseReplicated::DatabaseReplicated( throw Exception("Can't create replicated database without ZooKeeper", ErrorCodes::NO_ZOOKEEPER); } - current_zookeeper->createAncestors(zookeeper_path); - current_zookeeper->createOrUpdate(zookeeper_path, String(), zkutil::CreateMode::Persistent); + if (!current_zookeeper->exists(zookeeper_path, {}, NULL)) { + current_zookeeper->createAncestors(zookeeper_path); + current_zookeeper->createOrUpdate(zookeeper_path, String(), zkutil::CreateMode::Persistent); + current_zookeeper->createOrUpdate(zookeeper_path + "/last_entry", "0", zkutil::CreateMode::Persistent); + current_zookeeper->createAncestors(replica_path); + } else { + } + current_zookeeper->createOrUpdate(replica_path, String(), zkutil::CreateMode::Persistent); - // TODO if no last_entry then make it equal to 0 in zk; - - // TODO launch a worker here - main_thread = ThreadFromGlobalPool(&DatabaseReplicated::runMainThread, this); + backgroundLogExecutor = global_context.getReplicatedSchedulePool().createTask(database_name + "(DatabaseReplicated::the_threeeed)", [this]{ runMainThread();} ); + backgroundLogExecutor->schedule(); } DatabaseReplicated::~DatabaseReplicated() { stop_flag = true; - main_thread.join(); } void DatabaseReplicated::runMainThread() { - setThreadName("ReplctdWorker"); // ok whatever. 15 bytes // + database_name); LOG_DEBUG(log, "Started " << database_name << " database worker thread\n Replica: " << replica_name); - - while (!stop_flag) { - attachToThreadGroup(); - sleepForSeconds(1);// BURN CPU + if (!stop_flag) { // TODO is there a need for the flag? 
current_zookeeper = getZooKeeper(); - String last_n; - if (!current_zookeeper->tryGet(zookeeper_path + "/last_entry", last_n, {}, NULL)) { - continue; - } + String last_n = current_zookeeper->get(zookeeper_path + "/last_entry", {}, NULL); size_t last_n_parsed = parse(last_n); LOG_DEBUG(log, "PARSED " << last_n_parsed); LOG_DEBUG(log, "LOCAL CURRENT " << current_log_entry_n); + + bool newEntries = current_log_entry_n < last_n_parsed; while (current_log_entry_n < last_n_parsed) { current_log_entry_n++; executeLog(current_log_entry_n); } + if (newEntries) { + saveState(); + } + backgroundLogExecutor->scheduleAfter(500); } } +void DatabaseReplicated::saveState() { + current_zookeeper->createOrUpdate(replica_path + "/last_entry", std::to_string(current_log_entry_n), zkutil::CreateMode::Persistent); + // TODO rename vars + String statement = std::to_string(current_log_entry_n); + String metadatafile = getMetadataPath() + ".last_entry"; + WriteBufferFromFile out(metadatafile, statement.size(), O_WRONLY | O_CREAT); + writeString(statement, out); + out.next(); + if (global_context.getSettingsRef().fsync_metadata) + out.sync(); + out.close(); +} + void DatabaseReplicated::executeLog(size_t n) { current_zookeeper = getZooKeeper(); @@ -163,21 +178,7 @@ void DatabaseReplicated::executeLog(size_t n) { LOG_DEBUG(log, "Executed query: " << query_to_execute); } -// TODO we might not need it here at all -void DatabaseReplicated::attachToThreadGroup() { - if (thread_group) - { - /// Put all threads to one thread pool - CurrentThread::attachToIfDetached(thread_group); - } - else - { - CurrentThread::initializeQuery(); - thread_group = CurrentThread::getGroup(); - } -} - -// taken from ddlworker +// TODO Move to ZooKeeper/Lock and remove it from here and ddlworker static std::unique_ptr createSimpleZooKeeperLock( const std::shared_ptr & zookeeper, const String & lock_prefix, const String & lock_name, const String & lock_message) { @@ -188,15 +189,24 @@ static std::unique_ptr createSimpleZooKeeperLock( void DatabaseReplicated::propose(const ASTPtr & query) { - // TODO if source is zk then omit propose. Throw? - // TODO remove that log message i think LOG_DEBUG(log, "PROPOSING\n" << queryToString(query)); current_zookeeper = getZooKeeper(); - auto lock = createSimpleZooKeeperLock(current_zookeeper, zookeeper_path, "lock", replica_name); + auto lock = createSimpleZooKeeperLock(current_zookeeper, zookeeper_path, "propose_lock", replica_name); - // TODO check that last_entry is the same as current_log_entry_n for the replica + + // schedule and deactive combo + // ensures that replica is up to date + // and since propose lock is acquired, + // no other propose can happen from + // different replicas during this call + backgroundLogExecutor->schedule(); + backgroundLogExecutor->deactivate(); + + if (current_log_entry_n > 5) { // make a settings variable + createSnapshot(); + } current_log_entry_n++; // starting from 1 String log_entry = zookeeper_path + "/log." + std::to_string(current_log_entry_n); @@ -205,7 +215,18 @@ void DatabaseReplicated::propose(const ASTPtr & query) { current_zookeeper->createOrUpdate(zookeeper_path + "/last_entry", std::to_string(current_log_entry_n), zkutil::CreateMode::Persistent); lock->unlock(); - // write to metastore the last entry? 
+ saveState(); +} + +void DatabaseReplicated::createSnapshot() { + current_zookeeper->createAncestors(zookeeper_path + "/snapshot"); + current_zookeeper->createOrUpdate(zookeeper_path + "/snapshot", std::to_string(current_log_entry_n), zkutil::CreateMode::Persistent); + for (auto iterator = getTablesIterator({}); iterator->isValid(); iterator->next()) { + String table_name = iterator->name(); + auto query = getCreateQueryFromMetadata(getObjectMetadataPath(table_name), true); + String statement = queryToString(query); + current_zookeeper->createOrUpdate(zookeeper_path + "/snapshot/" + table_name, statement, zkutil::CreateMode::Persistent); + } } } diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index 0cb0c57c808..0b2d097caac 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -25,25 +26,26 @@ public: void propose(const ASTPtr & query) override; + String zookeeper_path; + String replica_name; + private: void runMainThread(); - void runCleanupThread(); - void attachToThreadGroup(); - void executeLog(size_t n); + void saveState(); + + void createSnapshot(); + std::unique_ptr current_context; // to run executeQuery std::atomic current_log_entry_n = 0; std::atomic stop_flag{false}; - ThreadFromGlobalPool main_thread; - ThreadGroupStatusPtr thread_group; + BackgroundSchedulePool::TaskHolder backgroundLogExecutor; - String zookeeper_path; - String replica_name; String replica_path; zkutil::ZooKeeperPtr current_zookeeper; /// Use only the methods below. diff --git a/src/Databases/DatabaseWithDictionaries.cpp b/src/Databases/DatabaseWithDictionaries.cpp index e0f2aa9286b..37f5b51f4ed 100644 --- a/src/Databases/DatabaseWithDictionaries.cpp +++ b/src/Databases/DatabaseWithDictionaries.cpp @@ -317,7 +317,7 @@ void DatabaseWithDictionaries::shutdown() DatabaseWithDictionaries::DatabaseWithDictionaries( - const String & name, const String & metadata_path_, const String & data_path_, const String & logger, const Context & context) + const String & name, const String & metadata_path_, const String & data_path_, const String & logger, Context & context) : DatabaseOnDisk(name, metadata_path_, data_path_, logger, context) , external_loader(context.getExternalDictionariesLoader()) { diff --git a/src/Databases/DatabaseWithDictionaries.h b/src/Databases/DatabaseWithDictionaries.h index eb9e105e31d..0e87ae686cf 100644 --- a/src/Databases/DatabaseWithDictionaries.h +++ b/src/Databases/DatabaseWithDictionaries.h @@ -37,7 +37,7 @@ public: ~DatabaseWithDictionaries() override; protected: - DatabaseWithDictionaries(const String & name, const String & metadata_path_, const String & data_path_, const String & logger, const Context & context); + DatabaseWithDictionaries(const String & name, const String & metadata_path_, const String & data_path_, const String & logger, Context & context); ASTPtr getCreateDictionaryQueryImpl(const String & dictionary_name, bool throw_on_error) const override; diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index b691e9aaf60..ccd489f6c45 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -82,6 +82,9 @@ namespace CurrentMetrics extern const Metric BackgroundDistributedSchedulePoolTask; extern const Metric MemoryTrackingInBackgroundDistributedSchedulePool; + + extern const Metric BackgroundReplicatedSchedulePoolTask; + extern const Metric MemoryTrackingInBackgroundReplicatedSchedulePool; } @@ 
-338,6 +341,8 @@ struct ContextShared std::optional background_move_pool; /// The thread pool for the background moves performed by the tables. std::optional schedule_pool; /// A thread pool that can run different jobs in background (used in replicated tables) std::optional distributed_schedule_pool; /// A thread pool that can run different jobs in background (used for distributed sends) + // TODO Rename replicated table pool or even both; adjust comments + std::optional replicated_schedule_pool; /// A thread pool that can run different jobs in background (used in replicated database engine) MultiVersion macros; /// Substitutions extracted from config. std::unique_ptr ddl_worker; /// Process ddl commands from zk. /// Rules for selecting the compression settings, depending on the size of the part. @@ -437,6 +442,7 @@ struct ContextShared background_move_pool.reset(); schedule_pool.reset(); distributed_schedule_pool.reset(); + replicated_schedule_pool.reset(); ddl_worker.reset(); /// Stop trace collector if any @@ -1415,6 +1421,18 @@ BackgroundSchedulePool & Context::getDistributedSchedulePool() return *shared->distributed_schedule_pool; } +BackgroundSchedulePool & Context::getReplicatedSchedulePool() +{ + auto lock = getLock(); + if (!shared->replicated_schedule_pool) + shared->replicated_schedule_pool.emplace( + settings.background_replicated_schedule_pool_size, + CurrentMetrics::BackgroundReplicatedSchedulePoolTask, + CurrentMetrics::MemoryTrackingInBackgroundReplicatedSchedulePool, + "BgRplSchPool"); + return *shared->replicated_schedule_pool; +} + void Context::setDDLWorker(std::unique_ptr ddl_worker) { auto lock = getLock(); diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index 66ea6f6914c..e9c78a175d4 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -502,6 +502,7 @@ public: BackgroundProcessingPool & getBackgroundMovePool(); BackgroundSchedulePool & getSchedulePool(); BackgroundSchedulePool & getDistributedSchedulePool(); + BackgroundSchedulePool & getReplicatedSchedulePool(); void setDDLWorker(std::unique_ptr ddl_worker); DDLWorker & getDDLWorker() const; diff --git a/src/Interpreters/InterpreterAlterQuery.cpp b/src/Interpreters/InterpreterAlterQuery.cpp index ad79bd68fed..cef1ebd7469 100644 --- a/src/Interpreters/InterpreterAlterQuery.cpp +++ b/src/Interpreters/InterpreterAlterQuery.cpp @@ -51,7 +51,7 @@ BlockIO InterpreterAlterQuery::execute() // TODO it's dirty. 
need to add database to parsing stage DatabasePtr database = DatabaseCatalog::instance().getDatabase(table_id.database_name); - if (database->getEngineName() == "Replicated" && !context.from_replicated_log) { + if (database->getEngineName() == "Replicated" && !context.from_replicated_log && !table->supportsReplication()) { database->propose(query_ptr); } diff --git a/src/Storages/MergeTree/registerStorageMergeTree.cpp b/src/Storages/MergeTree/registerStorageMergeTree.cpp index 1ecac8f413d..eb62c80cc49 100644 --- a/src/Storages/MergeTree/registerStorageMergeTree.cpp +++ b/src/Storages/MergeTree/registerStorageMergeTree.cpp @@ -1,3 +1,6 @@ +#include +#include + #include #include #include @@ -277,10 +280,18 @@ static StoragePtr create(const StorageFactory::Arguments & args) String name_part = args.engine_name.substr(0, args.engine_name.size() - strlen("MergeTree")); - bool replicated = startsWith(name_part, "Replicated"); - if (replicated) + bool replicatedStorage = startsWith(name_part, "Replicated"); + if (replicatedStorage) name_part = name_part.substr(strlen("Replicated")); + String database_name = args.query.database; + auto database = DatabaseCatalog::instance().getDatabase(database_name); + bool replicatedDatabase = false; + + if (database->getEngineName() == "Replicated") { + replicatedDatabase = true; + } + MergeTreeData::MergingParams merging_params; merging_params.mode = MergeTreeData::MergingParams::Ordinary; @@ -322,7 +333,7 @@ static StoragePtr create(const StorageFactory::Arguments & args) needed_params += "]"; }; - if (replicated) + if (replicatedStorage && !replicatedDatabase) { add_mandatory_param("path in ZooKeeper"); add_mandatory_param("replica name"); @@ -392,7 +403,7 @@ static StoragePtr create(const StorageFactory::Arguments & args) String zookeeper_path; String replica_name; - if (replicated) + if (replicatedStorage && !replicatedDatabase) { const auto * ast = engine_args[arg_num]->as(); if (ast && ast->value.getType() == Field::Types::String) @@ -418,6 +429,12 @@ static StoragePtr create(const StorageFactory::Arguments & args) ++arg_num; } + if (replicatedStorage && replicatedDatabase) { + auto * database_replicated = typeid_cast(database.get()); + zookeeper_path = database_replicated->zookeeper_path + "/tables/" + toString(args.query.uuid); + replica_name = database_replicated->replica_name; + } + /// This merging param maybe used as part of sorting key std::optional merging_param_key_arg; @@ -617,7 +634,15 @@ static StoragePtr create(const StorageFactory::Arguments & args) throw Exception("You must set the setting `allow_experimental_data_skipping_indices` to 1 " \ "before using data skipping indices.", ErrorCodes::BAD_ARGUMENTS); - if (replicated) + StorageInMemoryMetadata metadata(args.columns, indices_description, args.constraints); + metadata.partition_by_ast = partition_by_ast; + metadata.order_by_ast = order_by_ast; + metadata.primary_key_ast = primary_key_ast; + metadata.ttl_for_table_ast = ttl_table_ast; + metadata.sample_by_ast = sample_by_ast; + metadata.settings_ast = settings_ast; + + if (replicatedStorage) return StorageReplicatedMergeTree::create( zookeeper_path, replica_name, args.attach, args.table_id, args.relative_data_path, metadata, args.context, date_column_name, merging_params, std::move(storage_settings), diff --git a/tests/integration/test_replicated_database/test.py b/tests/integration/test_replicated_database/test.py new file mode 100644 index 00000000000..23268bcdfd8 --- /dev/null +++ b/tests/integration/test_replicated_database/test.py 
@@ -0,0 +1,38 @@ +import time +import logging + +import pytest + +from helpers.cluster import ClickHouseCluster + +logging.getLogger().setLevel(logging.INFO) +logging.getLogger().addHandler(logging.StreamHandler()) + +cluster = ClickHouseCluster(__file__) + +node1 = cluster.add_instance('node1', macros={'replica': 'test1'}, with_zookeeper=True) +node2 = cluster.add_instance('node2', macros={'replica': 'test2'}, with_zookeeper=True) + +all_nodes = [node1, node2] + +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster.start() + + for node in all_nodes: + node.query("DROP DATABASE IF EXISTS testdb") + node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', '{replica}');") + yield cluster + + finally: + cluster.shutdown() + + +def test_db(started_cluster): + DURATION_SECONDS = 5 + node1.query("CREATE TABLE testdb.replicated_table (d Date, k UInt64, i32 Int32) ENGINE=ReplicatedMergeTree(d, k, 8192);") + + time.sleep(DURATION_SECONDS) + logging.info(node2.query("desc table testdb.replicated_table")) + assert node1.query("desc table testdb.replicated_table") == node2.query("desc table testdb.replicated_table") From 34f74ff7851fbb68fb740219f339ced64242636c Mon Sep 17 00:00:00 2001 From: Val Date: Sun, 24 May 2020 20:12:24 +0300 Subject: [PATCH 0019/2357] add test cases for replicated db --- .../test_replicated_database/test.py | 44 ++++++++++++++++--- 1 file changed, 38 insertions(+), 6 deletions(-) diff --git a/tests/integration/test_replicated_database/test.py b/tests/integration/test_replicated_database/test.py index 23268bcdfd8..38977aa0bdb 100644 --- a/tests/integration/test_replicated_database/test.py +++ b/tests/integration/test_replicated_database/test.py @@ -12,15 +12,14 @@ cluster = ClickHouseCluster(__file__) node1 = cluster.add_instance('node1', macros={'replica': 'test1'}, with_zookeeper=True) node2 = cluster.add_instance('node2', macros={'replica': 'test2'}, with_zookeeper=True) - -all_nodes = [node1, node2] +node3 = cluster.add_instance('node3', macros={'replica': 'test3'}, with_zookeeper=True) @pytest.fixture(scope="module") def started_cluster(): try: cluster.start() - for node in all_nodes: + for node in [node1, node2]: node.query("DROP DATABASE IF EXISTS testdb") node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', '{replica}');") yield cluster @@ -29,10 +28,43 @@ def started_cluster(): cluster.shutdown() -def test_db(started_cluster): - DURATION_SECONDS = 5 - node1.query("CREATE TABLE testdb.replicated_table (d Date, k UInt64, i32 Int32) ENGINE=ReplicatedMergeTree(d, k, 8192);") +def test_create_replicated_table(started_cluster): + DURATION_SECONDS = 1 + node1.query("CREATE TABLE testdb.replicated_table (d Date, k UInt64, i32 Int32) ENGINE=ReplicatedMergeTree(d, k, 8192);") time.sleep(DURATION_SECONDS) logging.info(node2.query("desc table testdb.replicated_table")) assert node1.query("desc table testdb.replicated_table") == node2.query("desc table testdb.replicated_table") + +def test_alter_table(started_cluster): + DURATION_SECONDS = 1 + node1.query("CREATE TABLE testdb.alter_test (CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192);\ + ALTER TABLE testdb.alter_test ADD COLUMN Added0 UInt32;\ + ALTER TABLE testdb.alter_test ADD COLUMN Added2 UInt32;\ + ALTER TABLE testdb.alter_test ADD COLUMN Added1 UInt32 AFTER Added0;\ + 
ALTER TABLE testdb.alter_test ADD COLUMN AddedNested1 Nested(A UInt32, B UInt64) AFTER Added2;\ + ALTER TABLE testdb.alter_test ADD COLUMN AddedNested1.C Array(String) AFTER AddedNested1.B;\ + ALTER TABLE testdb.alter_test ADD COLUMN AddedNested2 Nested(A UInt32, B UInt64) AFTER AddedNested1;") + + time.sleep(DURATION_SECONDS) + assert node1.query("desc table testdb.alter_test") == node2.query("desc table testdb.alter_test") + +def test_create_replica_from_snapshot(started_cluster): + DURATION_SECONDS = 3 + """ + right now snapshot's created every 6 proposes. + later on it must be configurable + for now let's check snapshot + by creating a new node just after 10 log entries + """ + node1.query("ALTER TABLE testdb.alter_test ADD COLUMN Added3 UInt32 ;") #9 + node1.query("ALTER TABLE testdb.alter_test ADD COLUMN Added4 UInt32 ;") #10 + node1.query("ALTER TABLE testdb.alter_test ADD COLUMN Added5 UInt32 ;") #1 + # by this moment snapshot must be created + + node3.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', '{replica}');") + + time.sleep(DURATION_SECONDS) + + assert node3.query("desc table testdb.alter_test") == node1.query("desc table testdb.alter_test") + From 1f03839830c1ec92b912bab6cdcfba6908780ccf Mon Sep 17 00:00:00 2001 From: Val Date: Sun, 24 May 2020 20:12:59 +0300 Subject: [PATCH 0020/2357] add zookeeper tryRemoveChildren method --- src/Common/ZooKeeper/ZooKeeper.cpp | 17 +++++++++++++++++ src/Common/ZooKeeper/ZooKeeper.h | 5 +++++ 2 files changed, 22 insertions(+) diff --git a/src/Common/ZooKeeper/ZooKeeper.cpp b/src/Common/ZooKeeper/ZooKeeper.cpp index 476e88d7e72..541625149dd 100644 --- a/src/Common/ZooKeeper/ZooKeeper.cpp +++ b/src/Common/ZooKeeper/ZooKeeper.cpp @@ -579,6 +579,23 @@ void ZooKeeper::removeChildren(const std::string & path) } +void ZooKeeper::tryRemoveChildren(const std::string & path) +{ + Strings children; + if (tryGetChildren(path, children) != Coordination::ZOK) + return; + while (!children.empty()) + { + Coordination::Requests ops; + for (size_t i = 0; i < MULTI_BATCH_SIZE && !children.empty(); ++i) + { + ops.emplace_back(makeRemoveRequest(path + "/" + children.back(), -1)); + children.pop_back(); + } + multi(ops); + } +} + void ZooKeeper::removeChildrenRecursive(const std::string & path) { Strings children = getChildren(path); diff --git a/src/Common/ZooKeeper/ZooKeeper.h b/src/Common/ZooKeeper/ZooKeeper.h index 416e40c2da4..cb28f442392 100644 --- a/src/Common/ZooKeeper/ZooKeeper.h +++ b/src/Common/ZooKeeper/ZooKeeper.h @@ -187,7 +187,12 @@ public: /// Remove all children nodes (non recursive). void removeChildren(const std::string & path); + /// Remove all children nodes (non recursive). + /// If there're no children, this method doesn't throw an exception + void tryRemoveChildren(const std::string & path); + using WaitCondition = std::function; + /// Wait for the node to disappear or return immediately if it doesn't exist. /// If condition is speficied, it is used to return early (when condition returns false) /// The function returns true if waited and false if waiting was interrupted by condition. 
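A minimal usage sketch for the new tryRemoveChildren helper, assuming the single-argument hosts constructor of zkutil::ZooKeeper and a node /clickhouse/databases/test1/snapshot that may or may not exist; unlike removeChildren, the call returns instead of throwing when the node is missing or has no children:

    #include <Common/ZooKeeper/ZooKeeper.h>

    int main()
    {
        // Hypothetical ensemble address; any reachable ZooKeeper works for this sketch.
        zkutil::ZooKeeper zookeeper("localhost:2181");

        // Non-recursively removes the direct children of the node, batched by MULTI_BATCH_SIZE,
        // and returns quietly if the node has no children or does not exist.
        zookeeper.tryRemoveChildren("/clickhouse/databases/test1/snapshot");
        return 0;
    }

The next commit in this series uses this call in updateSnapshot() to clear the snapshot node before rewriting it.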
From 4921dc6dab978d05bf16a5cf6bfd8572a5c0f12b Mon Sep 17 00:00:00 2001 From: Val Date: Sun, 24 May 2020 20:13:53 +0300 Subject: [PATCH 0021/2357] db replicated refactoring --- src/Databases/DatabaseReplicated.cpp | 105 ++++++++++++++++----------- src/Databases/DatabaseReplicated.h | 14 ++-- 2 files changed, 69 insertions(+), 50 deletions(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 9dd8530fc46..ae5a8249202 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -80,7 +80,6 @@ DatabaseReplicated::DatabaseReplicated( Context & context_) // : DatabaseOrdinary(name_, metadata_path_, "data/" + escapeForFileName(name_) + "/", "DatabaseReplicated (" + name_ + ")", context_) // TODO add constructor to Atomic and call it here with path and logger name specification - // TODO ask why const and & are ommited in Atomic : DatabaseAtomic(name_, metadata_path_, context_) , zookeeper_path(zookeeper_path_) , replica_name(replica_name_) @@ -102,42 +101,50 @@ DatabaseReplicated::DatabaseReplicated( } if (!current_zookeeper->exists(zookeeper_path, {}, NULL)) { - current_zookeeper->createAncestors(zookeeper_path); - current_zookeeper->createOrUpdate(zookeeper_path, String(), zkutil::CreateMode::Persistent); - current_zookeeper->createOrUpdate(zookeeper_path + "/last_entry", "0", zkutil::CreateMode::Persistent); + createDatabaseZKNodes(); + } + + // replica + if (!current_zookeeper->exists(replica_path, {}, NULL)) { current_zookeeper->createAncestors(replica_path); - } else { + current_zookeeper->createOrUpdate(replica_path, String(), zkutil::CreateMode::Persistent); } - current_zookeeper->createOrUpdate(replica_path, String(), zkutil::CreateMode::Persistent); - backgroundLogExecutor = global_context.getReplicatedSchedulePool().createTask(database_name + "(DatabaseReplicated::the_threeeed)", [this]{ runMainThread();} ); - backgroundLogExecutor->schedule(); + //loadMetadataFromSnapshot(); + + background_log_executor = global_context.getReplicatedSchedulePool().createTask(database_name + "(DatabaseReplicated::the_threeeed)", [this]{ runBackgroundLogExecutor();} ); + background_log_executor->schedule(); } -DatabaseReplicated::~DatabaseReplicated() -{ - stop_flag = true; +void DatabaseReplicated::createDatabaseZKNodes() { + current_zookeeper = getZooKeeper(); + + if (current_zookeeper->exists(zookeeper_path)) + return; + + current_zookeeper->createAncestors(zookeeper_path); + + current_zookeeper->createIfNotExists(zookeeper_path, String()); + current_zookeeper->createIfNotExists(zookeeper_path + "/last_entry", "0"); + current_zookeeper->createIfNotExists(zookeeper_path + "/log", String()); + current_zookeeper->createIfNotExists(zookeeper_path + "/snapshot", String()); } -void DatabaseReplicated::runMainThread() { - LOG_DEBUG(log, "Started " << database_name << " database worker thread\n Replica: " << replica_name); - if (!stop_flag) { // TODO is there a need for the flag? 
- current_zookeeper = getZooKeeper(); - String last_n = current_zookeeper->get(zookeeper_path + "/last_entry", {}, NULL); - size_t last_n_parsed = parse(last_n); - LOG_DEBUG(log, "PARSED " << last_n_parsed); - LOG_DEBUG(log, "LOCAL CURRENT " << current_log_entry_n); +void DatabaseReplicated::runBackgroundLogExecutor() { + current_zookeeper = getZooKeeper(); + String last_n = current_zookeeper->get(zookeeper_path + "/last_entry", {}, NULL); + size_t last_n_parsed = parse(last_n); - bool newEntries = current_log_entry_n < last_n_parsed; - while (current_log_entry_n < last_n_parsed) { - current_log_entry_n++; - executeLog(current_log_entry_n); - } - if (newEntries) { - saveState(); - } - backgroundLogExecutor->scheduleAfter(500); + bool newEntries = current_log_entry_n < last_n_parsed; + while (current_log_entry_n < last_n_parsed) { + current_log_entry_n++; + String log_path = zookeeper_path + "/log/log." + std::to_string(current_log_entry_n); + executeFromZK(log_path); } + if (newEntries) { + saveState(); + } + background_log_executor->scheduleAfter(500); } void DatabaseReplicated::saveState() { @@ -153,10 +160,9 @@ void DatabaseReplicated::saveState() { out.close(); } -void DatabaseReplicated::executeLog(size_t n) { - +void DatabaseReplicated::executeFromZK(String & path) { current_zookeeper = getZooKeeper(); - String query_to_execute = current_zookeeper->get(zookeeper_path + "/log." + std::to_string(n), {}, NULL); + String query_to_execute = current_zookeeper->get(path, {}, NULL); ReadBufferFromString istr(query_to_execute); String dummy_string; WriteBufferFromString ostr(dummy_string); @@ -171,7 +177,7 @@ void DatabaseReplicated::executeLog(size_t n) { } catch (...) { - tryLogCurrentException(log, "Query " + query_to_execute + " wasn't finished successfully"); + tryLogCurrentException(log, "Query from zookeeper " + query_to_execute + " wasn't finished successfully"); } @@ -195,21 +201,23 @@ void DatabaseReplicated::propose(const ASTPtr & query) { current_zookeeper = getZooKeeper(); auto lock = createSimpleZooKeeperLock(current_zookeeper, zookeeper_path, "propose_lock", replica_name); - // schedule and deactive combo // ensures that replica is up to date // and since propose lock is acquired, // no other propose can happen from // different replicas during this call - backgroundLogExecutor->schedule(); - backgroundLogExecutor->deactivate(); + background_log_executor->schedule(); + background_log_executor->deactivate(); - if (current_log_entry_n > 5) { // make a settings variable - createSnapshot(); - } +// if (current_log_entry_n > 5) { // make a settings variable +// // TODO check that all the replicas are up to date! +// updateSnapshot(); +// current_log_entry_n = 0; +// current_zookeeper->removeChildren(zookeeper_path + "/log"); +// } current_log_entry_n++; // starting from 1 - String log_entry = zookeeper_path + "/log." + std::to_string(current_log_entry_n); + String log_entry = zookeeper_path + "/log/log." 
+ std::to_string(current_log_entry_n); current_zookeeper->createOrUpdate(log_entry, queryToString(query), zkutil::CreateMode::Persistent); current_zookeeper->createOrUpdate(zookeeper_path + "/last_entry", std::to_string(current_log_entry_n), zkutil::CreateMode::Persistent); @@ -218,9 +226,9 @@ void DatabaseReplicated::propose(const ASTPtr & query) { saveState(); } -void DatabaseReplicated::createSnapshot() { - current_zookeeper->createAncestors(zookeeper_path + "/snapshot"); - current_zookeeper->createOrUpdate(zookeeper_path + "/snapshot", std::to_string(current_log_entry_n), zkutil::CreateMode::Persistent); +void DatabaseReplicated::updateSnapshot() { + current_zookeeper = getZooKeeper(); + current_zookeeper->tryRemoveChildren(zookeeper_path + "/snapshot"); for (auto iterator = getTablesIterator({}); iterator->isValid(); iterator->next()) { String table_name = iterator->name(); auto query = getCreateQueryFromMetadata(getObjectMetadataPath(table_name), true); @@ -229,4 +237,17 @@ void DatabaseReplicated::createSnapshot() { } } +void DatabaseReplicated::loadMetadataFromSnapshot() { + current_zookeeper = getZooKeeper(); + + Strings metadatas; + if (current_zookeeper->tryGetChildren(zookeeper_path + "/snapshot", metadatas) != Coordination::ZOK) + return; + + for (auto t = metadatas.begin(); t != metadatas.end(); ++t) { + String path = zookeeper_path + "/snapshot/" + *t; + executeFromZK(path); + } +} + } diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index 0b2d097caac..bd2f11390d2 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -20,8 +20,6 @@ class DatabaseReplicated : public DatabaseAtomic public: DatabaseReplicated(const String & name_, const String & metadata_path_, const String & zookeeper_path_, const String & replica_name_, Context & context); - ~DatabaseReplicated(); - String getEngineName() const override { return "Replicated"; } void propose(const ASTPtr & query) override; @@ -30,21 +28,21 @@ public: String replica_name; private: + void createDatabaseZKNodes(); - void runMainThread(); + void runBackgroundLogExecutor(); - void executeLog(size_t n); + void executeFromZK(String & path); void saveState(); - - void createSnapshot(); + void updateSnapshot(); + void loadMetadataFromSnapshot(); std::unique_ptr current_context; // to run executeQuery std::atomic current_log_entry_n = 0; - std::atomic stop_flag{false}; - BackgroundSchedulePool::TaskHolder backgroundLogExecutor; + BackgroundSchedulePool::TaskHolder background_log_executor; String replica_path; From cbcd1bea0eef7ee647f1cdcca51612cecc4697d1 Mon Sep 17 00:00:00 2001 From: Val Date: Tue, 26 May 2020 16:35:05 +0300 Subject: [PATCH 0022/2357] provide better comments and information --- src/Common/CurrentMetrics.cpp | 4 ++-- src/Common/ZooKeeper/ZooKeeper.h | 3 ++- src/Core/Settings.h | 2 +- src/Databases/IDatabase.h | 22 ++++++++++----------- src/Interpreters/Context.cpp | 1 - src/Interpreters/InterpreterCreateQuery.cpp | 8 +++++--- src/Interpreters/InterpreterDropQuery.cpp | 8 +++----- 7 files changed, 24 insertions(+), 24 deletions(-) diff --git a/src/Common/CurrentMetrics.cpp b/src/Common/CurrentMetrics.cpp index 36c65953a6f..a6a08897505 100644 --- a/src/Common/CurrentMetrics.cpp +++ b/src/Common/CurrentMetrics.cpp @@ -14,7 +14,7 @@ M(BackgroundSchedulePoolTask, "Number of active tasks in BackgroundSchedulePool. 
This pool is used for periodic ReplicatedMergeTree tasks, like cleaning old data parts, altering data parts, replica re-initialization, etc.") \ M(BackgroundBufferFlushSchedulePoolTask, "Number of active tasks in BackgroundBufferFlushSchedulePool. This pool is used for periodic Buffer flushes") \ M(BackgroundDistributedSchedulePoolTask, "Number of active tasks in BackgroundDistributedSchedulePool. This pool is used for distributed sends that is done in background.") \ - M(BackgroundReplicatedSchedulePoolTask, "Number of active tasks in BackgroundReplicatedSchedulePoolTask. TODO.") \ + M(BackgroundReplicatedSchedulePoolTask, "Number of active tasks in BackgroundReplicatedSchedulePoolTask. The pool is used by replicated database for executing DDL log coming from other replicas. One task corresponds to one replicated database") \ M(CacheDictionaryUpdateQueueBatches, "Number of 'batches' (a set of keys) in update queue in CacheDictionaries.") \ M(CacheDictionaryUpdateQueueKeys, "Exact number of keys in update queue in CacheDictionaries.") \ M(DiskSpaceReservedForMerge, "Disk space reserved for currently running background merges. It is slightly more than the total size of currently merging parts.") \ @@ -39,7 +39,7 @@ M(MemoryTrackingInBackgroundSchedulePool, "Total amount of memory (bytes) allocated in background schedule pool (that is dedicated for bookkeeping tasks of Replicated tables).") \ M(MemoryTrackingInBackgroundBufferFlushSchedulePool, "Total amount of memory (bytes) allocated in background buffer flushes pool (that is dedicated for background buffer flushes).") \ M(MemoryTrackingInBackgroundDistributedSchedulePool, "Total amount of memory (bytes) allocated in background distributed schedule pool (that is dedicated for distributed sends).") \ - M(MemoryTrackingInBackgroundReplicatedSchedulePool, "Total amount of memory (bytes) allocated in replicated schedule pool (TODO).") \ + M(MemoryTrackingInBackgroundReplicatedSchedulePool, "Total amount of memory (bytes) allocated in background replicated schedule pool (that is dedicated for ddl log execution by replicated database replicas).") \ M(MemoryTrackingForMerges, "Total amount of memory (bytes) allocated for background merges. Included in MemoryTrackingInBackgroundProcessingPool. Note that this value may include a drift when the memory was allocated in a context of background processing pool and freed in other context or vice-versa. This happens naturally due to caches for tables indexes and doesn't indicate memory leaks.") \ M(EphemeralNode, "Number of ephemeral nodes hold in ZooKeeper.") \ M(ZooKeeperSession, "Number of sessions (connections) to ZooKeeper. Should be no more than one, because using more than one connection to ZooKeeper may lead to bugs due to lack of linearizability (stale reads) that ZooKeeper consistency model allows.") \ diff --git a/src/Common/ZooKeeper/ZooKeeper.h b/src/Common/ZooKeeper/ZooKeeper.h index cb28f442392..47eaefa51fc 100644 --- a/src/Common/ZooKeeper/ZooKeeper.h +++ b/src/Common/ZooKeeper/ZooKeeper.h @@ -188,7 +188,8 @@ public: void removeChildren(const std::string & path); /// Remove all children nodes (non recursive). - /// If there're no children, this method doesn't throw an exception + /// If there're no children for the given path, + /// this method does not throw an exception. 
void tryRemoveChildren(const std::string & path); using WaitCondition = std::function; diff --git a/src/Core/Settings.h b/src/Core/Settings.h index ea950afa70a..1351b752136 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -87,7 +87,7 @@ struct Settings : public SettingsCollection M(SettingUInt64, background_move_pool_size, 8, "Number of threads performing background moves for tables. Only has meaning at server startup.", 0) \ M(SettingUInt64, background_schedule_pool_size, 16, "Number of threads performing background tasks for replicated tables, kafka streaming, dns cache updates. Only has meaning at server startup.", 0) \ M(SettingUInt64, background_distributed_schedule_pool_size, 16, "Number of threads performing background tasks for distributed sends. Only has meaning at server startup.", 0) \ - M(SettingUInt64, background_replicated_schedule_pool_size, 16, "Number of threads performing background tasks in replicated databases. Only has meaning at server startup.", 0) \ + M(SettingUInt64, background_replicated_schedule_pool_size, 4, "Number of threads performing background tasks in replicated databases. One task corresponds to one replicated database replica. Only has meaning at server startup.", 0) \ \ M(SettingMilliseconds, distributed_directory_monitor_sleep_time_ms, 100, "Sleep time for StorageDistributed DirectoryMonitors, in case of any errors delay grows exponentially.", 0) \ M(SettingMilliseconds, distributed_directory_monitor_max_sleep_time_ms, 30000, "Maximum sleep time for StorageDistributed DirectoryMonitors, it limits exponential growth too.", 0) \ diff --git a/src/Databases/IDatabase.h b/src/Databases/IDatabase.h index 18265b153cf..5b3003f36b4 100644 --- a/src/Databases/IDatabase.h +++ b/src/Databases/IDatabase.h @@ -162,7 +162,7 @@ public: virtual bool empty() const = 0; virtual void propose(const ASTPtr & /*query*/) { - throw Exception("There is no propose query method for Database" + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); + throw Exception(getEngineName() + ": propose() is not supported", ErrorCodes::NOT_IMPLEMENTED); } /// Add the table to the database. Record its presence in the metadata. @@ -172,7 +172,7 @@ public: const StoragePtr & /*table*/, const ASTPtr & /*query*/) { - throw Exception("There is no CREATE TABLE query for Database" + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); + throw Exception("There is no CREATE TABLE query for Database " + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); } /// Add the dictionary to the database. Record its presence in the metadata. @@ -181,7 +181,7 @@ public: const String & /*dictionary_name*/, const ASTPtr & /*query*/) { - throw Exception("There is no CREATE DICTIONARY query for Database" + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); + throw Exception("There is no CREATE DICTIONARY query for Database " + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); } /// Delete the table from the database, drop table and delete the metadata. @@ -190,7 +190,7 @@ public: const String & /*name*/, [[maybe_unused]] bool no_delay = false) { - throw Exception("There is no DROP TABLE query for Database" + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); + throw Exception("There is no DROP TABLE query for Database " + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); } /// Delete the dictionary from the database. Delete the metadata. 
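The propose() hook whose message is adjusted just above stays a NOT_IMPLEMENTED stub for every engine except Replicated in this series. A condensed sketch of how the DDL interpreters invoke it (not the exact code of any one interpreter; from_replicated_log is later replaced by ClientInfo::QueryKind::REPLICATED_LOG_QUERY, and executeLocally() is a placeholder for whatever the interpreter did before):

    DatabasePtr database = DatabaseCatalog::instance().getDatabase(table_id.database_name);
    if (database->getEngineName() == "Replicated" && !context.from_replicated_log)
        database->propose(query_ptr);  // append the query to <zookeeper_path>/log; the background executor applies it
    else
        executeLocally(query_ptr);     // placeholder for the interpreter's usual, non-replicated path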
@@ -198,32 +198,32 @@ public: const Context & /*context*/, const String & /*dictionary_name*/) { - throw Exception("There is no DROP DICTIONARY query for Database" + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); + throw Exception("There is no DROP DICTIONARY query for Database " + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); } /// Add a table to the database, but do not add it to the metadata. The database may not support this method. virtual void attachTable(const String & /*name*/, const StoragePtr & /*table*/, [[maybe_unused]] const String & relative_table_path = {}) { - throw Exception("There is no ATTACH TABLE query for Database" + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); + throw Exception("There is no ATTACH TABLE query for Database " + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); } /// Add dictionary to the database, but do not add it to the metadata. The database may not support this method. /// If dictionaries_lazy_load is false it also starts loading the dictionary asynchronously. virtual void attachDictionary(const String & /* dictionary_name */, const DictionaryAttachInfo & /* attach_info */) { - throw Exception("There is no ATTACH DICTIONARY query for Database" + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); + throw Exception("There is no ATTACH DICTIONARY query for Database " + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); } /// Forget about the table without deleting it, and return it. The database may not support this method. virtual StoragePtr detachTable(const String & /*name*/) { - throw Exception("There is no DETACH TABLE query for Database" + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); + throw Exception("There is no DETACH TABLE query for Database " + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); } /// Forget about the dictionary without deleting it. The database may not support this method. virtual void detachDictionary(const String & /*name*/) { - throw Exception("There is no DETACH DICTIONARY query for Database" + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); + throw Exception("There is no DETACH DICTIONARY query for Database " + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); } /// Rename the table and possibly move the table to another database. @@ -314,14 +314,14 @@ protected: virtual ASTPtr getCreateTableQueryImpl(const String & /*name*/, const Context & /*context*/, bool throw_on_error) const { if (throw_on_error) - throw Exception("There is no SHOW CREATE TABLE query for Database" + getEngineName(), ErrorCodes::CANNOT_GET_CREATE_TABLE_QUERY); + throw Exception("There is no SHOW CREATE TABLE query for Database " + getEngineName(), ErrorCodes::CANNOT_GET_CREATE_TABLE_QUERY); return nullptr; } virtual ASTPtr getCreateDictionaryQueryImpl(const String & /*name*/, bool throw_on_error) const { if (throw_on_error) - throw Exception("There is no SHOW CREATE DICTIONARY query for Database" + getEngineName(), ErrorCodes::CANNOT_GET_CREATE_DICTIONARY_QUERY); + throw Exception("There is no SHOW CREATE DICTIONARY query for Database " + getEngineName(), ErrorCodes::CANNOT_GET_CREATE_DICTIONARY_QUERY); return nullptr; } diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index ccd489f6c45..14ee5284bab 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -341,7 +341,6 @@ struct ContextShared std::optional background_move_pool; /// The thread pool for the background moves performed by the tables. 
std::optional schedule_pool; /// A thread pool that can run different jobs in background (used in replicated tables) std::optional distributed_schedule_pool; /// A thread pool that can run different jobs in background (used for distributed sends) - // TODO Rename replicated table pool or even both; adjust comments std::optional replicated_schedule_pool; /// A thread pool that can run different jobs in background (used in replicated database engine) MultiVersion macros; /// Substitutions extracted from config. std::unique_ptr ddl_worker; /// Process ddl commands from zk. diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 648e41327ba..6ff474e096f 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -601,7 +601,10 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create) /// Set and retrieve list of columns, indices and constraints. Set table engine if needed. Rewrite query in canonical way. TableProperties properties = setProperties(create); - // testing + /// DDL log for replicated databases can not + /// contain the right database name for every replica + /// therefore for such queries the AST database + /// field is modified right before an actual execution if (context.from_replicated_log) { create.database = current_database; } @@ -637,8 +640,7 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, } else if (database->getEngineName() == "Replicated" && context.from_replicated_log) { if (create.uuid == UUIDHelpers::Nil) - // change error to incorrect log or something - throw Exception("Table UUID is not specified in the replicated log", ErrorCodes::INCORRECT_QUERY); + throw Exception("Table UUID is not specified in DDL log", ErrorCodes::INCORRECT_QUERY); } else { diff --git a/src/Interpreters/InterpreterDropQuery.cpp b/src/Interpreters/InterpreterDropQuery.cpp index e9221fc273c..fe94a394ba2 100644 --- a/src/Interpreters/InterpreterDropQuery.cpp +++ b/src/Interpreters/InterpreterDropQuery.cpp @@ -110,6 +110,9 @@ BlockIO InterpreterDropQuery::executeToTable( auto table_lock = table->lockExclusively(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); auto metadata_snapshot = table->getInMemoryMetadataPtr(); /// Drop table data, don't touch metadata + if (database->getEngineName() == "Replicated" && !context.from_replicated_log) { + database->propose(query_ptr); + } table->truncate(query_ptr, metadata_snapshot, context, table_lock); } else if (query.kind == ASTDropQuery::Kind::Drop) @@ -119,13 +122,8 @@ BlockIO InterpreterDropQuery::executeToTable( table->shutdown(); -<<<<<<< HEAD TableExclusiveLockHolder table_lock; - if (database->getEngineName() != "Atomic") -======= - TableStructureWriteLockHolder table_lock; if (database->getEngineName() != "Atomic" && database->getEngineName() != "Replicated") ->>>>>>> 921e85e9c9... 
make db replicated inherited from atomic table_lock = table->lockExclusively(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); if (database->getEngineName() == "Replicated" && !context.from_replicated_log) { From 31910e9bf1a526a2bf3e8fdf167ff3447e37747f Mon Sep 17 00:00:00 2001 From: Val Date: Tue, 26 May 2020 18:08:09 +0300 Subject: [PATCH 0023/2357] Use ClientInf::QueryKind to distinguish replicated db log queries --- src/Databases/DatabaseReplicated.cpp | 2 +- src/Interpreters/ClientInfo.h | 2 +- src/Interpreters/Context.h | 3 --- src/Interpreters/InterpreterAlterQuery.cpp | 3 +-- src/Interpreters/InterpreterCreateQuery.cpp | 8 ++++---- src/Interpreters/InterpreterDropQuery.cpp | 7 ++++--- src/Interpreters/InterpreterRenameQuery.cpp | 2 +- 7 files changed, 12 insertions(+), 15 deletions(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index ae5a8249202..c6840ac0d81 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -170,7 +170,7 @@ void DatabaseReplicated::executeFromZK(String & path) { try { current_context = std::make_unique(global_context); - current_context->from_replicated_log = true; + current_context->getClientInfo().query_kind = ClientInfo::QueryKind::REPLICATED_LOG_QUERY; current_context->setCurrentDatabase(database_name); current_context->setCurrentQueryId(""); // generate random query_id executeQuery(istr, ostr, false, *current_context, {}); diff --git a/src/Interpreters/ClientInfo.h b/src/Interpreters/ClientInfo.h index 2dff30e40a2..42b3ab42bc1 100644 --- a/src/Interpreters/ClientInfo.h +++ b/src/Interpreters/ClientInfo.h @@ -38,7 +38,7 @@ public: NO_QUERY = 0, /// Uninitialized object. INITIAL_QUERY = 1, SECONDARY_QUERY = 2, /// Query that was initiated by another query for distributed or ON CLUSTER query execution. - REPLICATED_LOG_QUERY = 3, /// TODO add comment + REPLICATED_LOG_QUERY = 3, /// Query from replicated DDL log. }; diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index e9c78a175d4..5d1fda03221 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -214,9 +214,6 @@ private: Context(); public: - ///testing - bool from_replicated_log = false; - /// Create initial Context with ContextShared and etc. static Context createGlobal(ContextShared * shared); static SharedContextHolder createShared(); diff --git a/src/Interpreters/InterpreterAlterQuery.cpp b/src/Interpreters/InterpreterAlterQuery.cpp index cef1ebd7469..134531d0cf0 100644 --- a/src/Interpreters/InterpreterAlterQuery.cpp +++ b/src/Interpreters/InterpreterAlterQuery.cpp @@ -49,9 +49,8 @@ BlockIO InterpreterAlterQuery::execute() auto alter_lock = table->lockForAlter(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); auto metadata_snapshot = table->getInMemoryMetadataPtr(); - // TODO it's dirty. 
need to add database to parsing stage DatabasePtr database = DatabaseCatalog::instance().getDatabase(table_id.database_name); - if (database->getEngineName() == "Replicated" && !context.from_replicated_log && !table->supportsReplication()) { + if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY && !table->supportsReplication()) { database->propose(query_ptr); } diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 6ff474e096f..0b06fbfd874 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -605,7 +605,7 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create) /// contain the right database name for every replica /// therefore for such queries the AST database /// field is modified right before an actual execution - if (context.from_replicated_log) { + if (context.getClientInfo().query_kind == ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { create.database = current_database; } @@ -630,7 +630,7 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, if (need_add_to_database) { database = DatabaseCatalog::instance().getDatabase(create.database); - if (database->getEngineName() == "Atomic" || (database->getEngineName() == "Replicated" && !context.from_replicated_log)) + if (database->getEngineName() == "Atomic" || (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY)) { /// TODO implement ATTACH FROM 'path/to/data': generate UUID and move table data to store/ if (create.attach && create.uuid == UUIDHelpers::Nil) @@ -638,7 +638,7 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, if (!create.attach && create.uuid == UUIDHelpers::Nil) create.uuid = UUIDHelpers::generateV4(); } - else if (database->getEngineName() == "Replicated" && context.from_replicated_log) { + else if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind == ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { if (create.uuid == UUIDHelpers::Nil) throw Exception("Table UUID is not specified in DDL log", ErrorCodes::INCORRECT_QUERY); } @@ -709,7 +709,7 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, } - if (database->getEngineName() == "Replicated" && !context.from_replicated_log) { + if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { database->propose(query_ptr); } database->createTable(context, table_name, res, query_ptr); diff --git a/src/Interpreters/InterpreterDropQuery.cpp b/src/Interpreters/InterpreterDropQuery.cpp index fe94a394ba2..afbf5d31fbf 100644 --- a/src/Interpreters/InterpreterDropQuery.cpp +++ b/src/Interpreters/InterpreterDropQuery.cpp @@ -97,7 +97,7 @@ BlockIO InterpreterDropQuery::executeToTable( if (database->getEngineName() != "Atomic" && database->getEngineName() != "Replicated") table_lock = table->lockExclusively(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); /// Drop table from memory, don't touch data and metadata - if (database->getEngineName() == "Replicated" && !context.from_replicated_log) { + if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { database->propose(query_ptr); } database->detachTable(table_id.table_name); @@ -110,7 +110,8 @@ BlockIO 
InterpreterDropQuery::executeToTable( auto table_lock = table->lockExclusively(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); auto metadata_snapshot = table->getInMemoryMetadataPtr(); /// Drop table data, don't touch metadata - if (database->getEngineName() == "Replicated" && !context.from_replicated_log) { + auto table_lock = table->lockExclusively(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); + if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { database->propose(query_ptr); } table->truncate(query_ptr, metadata_snapshot, context, table_lock); @@ -126,7 +127,7 @@ BlockIO InterpreterDropQuery::executeToTable( if (database->getEngineName() != "Atomic" && database->getEngineName() != "Replicated") table_lock = table->lockExclusively(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); - if (database->getEngineName() == "Replicated" && !context.from_replicated_log) { + if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { database->propose(query_ptr); } database->dropTable(context, table_id.table_name, query.no_delay); diff --git a/src/Interpreters/InterpreterRenameQuery.cpp b/src/Interpreters/InterpreterRenameQuery.cpp index d93b14a6bc2..45003ab0d14 100644 --- a/src/Interpreters/InterpreterRenameQuery.cpp +++ b/src/Interpreters/InterpreterRenameQuery.cpp @@ -81,7 +81,7 @@ BlockIO InterpreterRenameQuery::execute() database_catalog.assertTableDoesntExist(StorageID(elem.to_database_name, elem.to_table_name), context); DatabasePtr database = database_catalog.getDatabase(elem.from_database_name); - if (database->getEngineName() == "Replicated" && !context.from_replicated_log) { + if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { database->propose(query_ptr); } database->renameTable( From fbbccaf98ae02b5ed463b3c05fc79595743e817a Mon Sep 17 00:00:00 2001 From: Val Date: Tue, 26 May 2020 18:10:15 +0300 Subject: [PATCH 0024/2357] remove stateless tests for replicated db --- ...7_replicated_database_engine_zookeeper.sql | 10 ----- ...icated_database_engine_zookeeper.reference | 34 ---------------- ...9_replicated_database_engine_zookeeper.sql | 39 ------------------- 3 files changed, 83 deletions(-) delete mode 100644 tests/queries/0_stateless/01267_replicated_database_engine_zookeeper.sql delete mode 100644 tests/queries/0_stateless/01269_replicated_database_engine_zookeeper.reference delete mode 100644 tests/queries/0_stateless/01269_replicated_database_engine_zookeeper.sql diff --git a/tests/queries/0_stateless/01267_replicated_database_engine_zookeeper.sql b/tests/queries/0_stateless/01267_replicated_database_engine_zookeeper.sql deleted file mode 100644 index c70de9a50d2..00000000000 --- a/tests/queries/0_stateless/01267_replicated_database_engine_zookeeper.sql +++ /dev/null @@ -1,10 +0,0 @@ -DROP DATABASE IF EXISTS test_db1; -DROP DATABASE IF EXISTS test_db2; - -CREATE DATABASE test_db1 ENGINE = Replicated('/clickhouse/databases/test1', 'id1'); -CREATE TABLE test_db1.replicated_table (d Date, k UInt64, i32 Int32) ENGINE=ReplicatedMergeTree('/clickhouse/tables/test1', 'id1', d, k, 8192); -CREATE TABLE test_db1.basic_table (EventDate Date, CounterID Int) engine=MergeTree(EventDate, (CounterID, EventDate), 8192); - -CREATE DATABASE test_db2 ENGINE = 
Replicated('/clickhouse/databases/test1', 'id2'); -CREATE TABLE test_db2.replicated_table (d Date, k UInt64, i32 Int32) ENGINE=ReplicatedMergeTree('/clickhouse/tables/test1', 'id2', d, k, 8192); -CREATE TABLE test_db2.basic_table (EventDate Date, CounterID Int) engine=MergeTree(EventDate, (CounterID, EventDate), 8192); diff --git a/tests/queries/0_stateless/01269_replicated_database_engine_zookeeper.reference b/tests/queries/0_stateless/01269_replicated_database_engine_zookeeper.reference deleted file mode 100644 index 58f951b1257..00000000000 --- a/tests/queries/0_stateless/01269_replicated_database_engine_zookeeper.reference +++ /dev/null @@ -1,34 +0,0 @@ -CounterID UInt32 -StartDate Date -UserID UInt32 -VisitID UInt32 -Added0 String -Added1 UInt32 -Added2 UInt32 -AddedNested1.A Array(UInt32) -AddedNested1.C Array(String) -AddedNested2.A Array(UInt32) -AddedNested2.B Array(UInt64) -CounterID UInt32 -StartDate Date -UserID UInt32 -VisitID UInt32 -Added0 String -Added1 UInt32 -Added2 UInt32 -AddedNested1.A Array(UInt32) -AddedNested1.C Array(String) -AddedNested2.A Array(UInt32) -AddedNested2.B Array(UInt64) -CounterID UInt32 -StartDate Date -UserID UInt32 -VisitID UInt32 -Added0 String -Added1 UInt32 -Added2 UInt32 -AddedNested1.A Array(UInt32) -AddedNested1.C Array(String) -AddedNested2.A Array(UInt32) -AddedNested2.B Array(UInt64) - diff --git a/tests/queries/0_stateless/01269_replicated_database_engine_zookeeper.sql b/tests/queries/0_stateless/01269_replicated_database_engine_zookeeper.sql deleted file mode 100644 index 1acc9022014..00000000000 --- a/tests/queries/0_stateless/01269_replicated_database_engine_zookeeper.sql +++ /dev/null @@ -1,39 +0,0 @@ -DROP DATABASE IF EXISTS rdbtest; -DROP DATABASE IF EXISTS replicatwo; -DROP DATABASE IF EXISTS replicathree; - -CREATE DATABASE rdbtest ENGINE = Replicated('/clickhouse/db/test1/', 'id1'); -CREATE DATABASE replicatwo ENGINE = Replicated('/clickhouse/db/test1/', 'id2'); -CREATE DATABASE replicathree ENGINE = Replicated('/clickhouse/db/test1/', 'id3'); - -USE rdbtest; - -CREATE TABLE alter_test (CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192); - -ALTER TABLE alter_test ADD COLUMN Added0 UInt32; -ALTER TABLE alter_test ADD COLUMN Added2 UInt32; -ALTER TABLE alter_test ADD COLUMN Added1 UInt32 AFTER Added0; - -ALTER TABLE alter_test ADD COLUMN AddedNested1 Nested(A UInt32, B UInt64) AFTER Added2; -ALTER TABLE alter_test ADD COLUMN AddedNested1.C Array(String) AFTER AddedNested1.B; -ALTER TABLE alter_test ADD COLUMN AddedNested2 Nested(A UInt32, B UInt64) AFTER AddedNested1; - -ALTER TABLE alter_test DROP COLUMN ToDrop; - -ALTER TABLE alter_test MODIFY COLUMN Added0 String; - -ALTER TABLE alter_test DROP COLUMN NestedColumn.A; -ALTER TABLE alter_test DROP COLUMN NestedColumn.S; - -ALTER TABLE alter_test DROP COLUMN AddedNested1.B; - -ALTER TABLE alter_test ADD COLUMN IF NOT EXISTS Added0 UInt32; -ALTER TABLE alter_test ADD COLUMN IF NOT EXISTS AddedNested1 Nested(A UInt32, B UInt64); -ALTER TABLE alter_test ADD COLUMN IF NOT EXISTS AddedNested1.C Array(String); -ALTER TABLE alter_test MODIFY COLUMN IF EXISTS ToDrop UInt64; -ALTER TABLE alter_test DROP COLUMN IF EXISTS ToDrop; -ALTER TABLE alter_test COMMENT COLUMN IF EXISTS ToDrop 'new comment'; - -DESC TABLE rdbtest.alter_test; -DESC TABLE replicatwo.alter_test; -DESC TABLE replicathree.alter_test; From 
0e9f516738adad2a22cf95d92304c6ffe3c6e55a Mon Sep 17 00:00:00 2001 From: Val Date: Wed, 27 May 2020 18:04:10 +0300 Subject: [PATCH 0025/2357] add comment for replicated db class --- src/Databases/DatabaseReplicated.h | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index bd2f11390d2..e81b78386f7 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -10,10 +10,27 @@ namespace DB { -/** Replicated database engine. - * It stores tables list using list of .sql files, - * that contain declaration of table represented by SQL ATTACH TABLE query - * and operation log in zookeeper +/** DatabaseReplicated engine + * supports replication of metadata + * via DDL log being written to ZooKeeper + * and executed on all of the replicas + * for a given database. + * + * One Clickhouse server can have multiple + * replicated databases running and updating + * at the same time. + * + * The engine has two parameters ZooKeeper path and + * replica name. + * The same ZooKeeper path corresponds to the same + * database. Replica names must be different for all replicas + * of the same database. + * + * Using this engine, creation of Replicated tables + * requires no ZooKeeper path and replica name parameters. + * Table's replica name is the same as database replica name. + * Table's ZooKeeper path is a concatenation of database's + * ZooKeeper path, /tables/, and UUID of the table. */ class DatabaseReplicated : public DatabaseAtomic { From a0af67b636d4a2b47d0c0898833e8c1c86731561 Mon Sep 17 00:00:00 2001 From: Val Date: Wed, 27 May 2020 21:33:37 +0300 Subject: [PATCH 0026/2357] Add one more test for db replicated and fix related bug --- src/Databases/DatabaseReplicated.cpp | 8 +++ .../test_replicated_database/test.py | 52 ++++++++++++------- 2 files changed, 40 insertions(+), 20 deletions(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index c6840ac0d81..202e46c3f82 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -201,6 +201,13 @@ void DatabaseReplicated::propose(const ASTPtr & query) { current_zookeeper = getZooKeeper(); auto lock = createSimpleZooKeeperLock(current_zookeeper, zookeeper_path, "propose_lock", replica_name); + while (!lock->tryLock()) { + // TODO it seems that zk lock doesn't work at all + // need to find a different solution for proposal + pcg64 rng(randomSeed()); + std::this_thread::sleep_for(std::chrono::milliseconds(std::uniform_int_distribution(0, 1000)(rng))); + } + // schedule and deactive combo // ensures that replica is up to date // and since propose lock is acquired, @@ -224,6 +231,7 @@ void DatabaseReplicated::propose(const ASTPtr & query) { lock->unlock(); saveState(); + background_log_executor->activateAndSchedule(); } void DatabaseReplicated::updateSnapshot() { diff --git a/tests/integration/test_replicated_database/test.py b/tests/integration/test_replicated_database/test.py index 38977aa0bdb..703690a7218 100644 --- a/tests/integration/test_replicated_database/test.py +++ b/tests/integration/test_replicated_database/test.py @@ -33,38 +33,50 @@ def test_create_replicated_table(started_cluster): node1.query("CREATE TABLE testdb.replicated_table (d Date, k UInt64, i32 Int32) ENGINE=ReplicatedMergeTree(d, k, 8192);") time.sleep(DURATION_SECONDS) - logging.info(node2.query("desc table testdb.replicated_table")) assert node1.query("desc table 
testdb.replicated_table") == node2.query("desc table testdb.replicated_table") -def test_alter_table(started_cluster): +def test_simple_alter_table(started_cluster): DURATION_SECONDS = 1 - node1.query("CREATE TABLE testdb.alter_test (CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192);\ - ALTER TABLE testdb.alter_test ADD COLUMN Added0 UInt32;\ - ALTER TABLE testdb.alter_test ADD COLUMN Added2 UInt32;\ - ALTER TABLE testdb.alter_test ADD COLUMN Added1 UInt32 AFTER Added0;\ - ALTER TABLE testdb.alter_test ADD COLUMN AddedNested1 Nested(A UInt32, B UInt64) AFTER Added2;\ - ALTER TABLE testdb.alter_test ADD COLUMN AddedNested1.C Array(String) AFTER AddedNested1.B;\ - ALTER TABLE testdb.alter_test ADD COLUMN AddedNested2 Nested(A UInt32, B UInt64) AFTER AddedNested1;") + node1.query("CREATE TABLE testdb.alter_test (CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192);") + node1.query("ALTER TABLE testdb.alter_test ADD COLUMN Added0 UInt32;") + node1.query("ALTER TABLE testdb.alter_test ADD COLUMN Added2 UInt32;") + node1.query("ALTER TABLE testdb.alter_test ADD COLUMN Added1 UInt32 AFTER Added0;") + node1.query("ALTER TABLE testdb.alter_test ADD COLUMN AddedNested1 Nested(A UInt32, B UInt64) AFTER Added2;") + node1.query("ALTER TABLE testdb.alter_test ADD COLUMN AddedNested1.C Array(String) AFTER AddedNested1.B;") + node1.query("ALTER TABLE testdb.alter_test ADD COLUMN AddedNested2 Nested(A UInt32, B UInt64) AFTER AddedNested1;") time.sleep(DURATION_SECONDS) assert node1.query("desc table testdb.alter_test") == node2.query("desc table testdb.alter_test") -def test_create_replica_from_snapshot(started_cluster): +def test_create_replica_after_delay(started_cluster): DURATION_SECONDS = 3 - """ - right now snapshot's created every 6 proposes. 
- later on it must be configurable - for now let's check snapshot - by creating a new node just after 10 log entries - """ - node1.query("ALTER TABLE testdb.alter_test ADD COLUMN Added3 UInt32 ;") #9 - node1.query("ALTER TABLE testdb.alter_test ADD COLUMN Added4 UInt32 ;") #10 - node1.query("ALTER TABLE testdb.alter_test ADD COLUMN Added5 UInt32 ;") #1 - # by this moment snapshot must be created node3.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', '{replica}');") + node1.query("ALTER TABLE testdb.alter_test ADD COLUMN Added3 UInt32 ;") + node1.query("ALTER TABLE testdb.alter_test ADD COLUMN Added4 UInt32 ;") + node1.query("ALTER TABLE testdb.alter_test ADD COLUMN Added5 UInt32 ;") + time.sleep(DURATION_SECONDS) assert node3.query("desc table testdb.alter_test") == node1.query("desc table testdb.alter_test") +def test_alters_from_different_replicas(started_cluster): + DURATION_SECONDS = 1 + + node1.query("CREATE TABLE testdb.concurrent_test (CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192);") + time.sleep(DURATION_SECONDS) + + node3.query("ALTER TABLE testdb.concurrent_test ADD COLUMN Added0 UInt32;") + time.sleep(DURATION_SECONDS) + node1.query("ALTER TABLE testdb.concurrent_test ADD COLUMN Added2 UInt32;") + time.sleep(DURATION_SECONDS) + node3.query("ALTER TABLE testdb.concurrent_test ADD COLUMN Added1 UInt32 AFTER Added0;") + time.sleep(DURATION_SECONDS) + node1.query("ALTER TABLE testdb.concurrent_test ADD COLUMN AddedNested1 Nested(A UInt32, B UInt64) AFTER Added2;") + time.sleep(DURATION_SECONDS) + node3.query("ALTER TABLE testdb.concurrent_test ADD COLUMN AddedNested1.C Array(String) AFTER AddedNested1.B;") + time.sleep(DURATION_SECONDS) + node1.query("ALTER TABLE testdb.concurrent_test ADD COLUMN AddedNested2 Nested(A UInt32, B UInt64) AFTER AddedNested1;") + time.sleep(DURATION_SECONDS) + assert node3.query("desc table testdb.concurrent_test") == node1.query("desc table testdb.concurrent_test") From 469f9738dff25544a35c23da2f6e207355b5f16c Mon Sep 17 00:00:00 2001 From: Val Date: Wed, 27 May 2020 21:40:00 +0300 Subject: [PATCH 0027/2357] refactor save state in db replicated --- src/Databases/DatabaseReplicated.cpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 202e46c3f82..3dbacbaf33d 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -148,12 +148,14 @@ void DatabaseReplicated::runBackgroundLogExecutor() { } void DatabaseReplicated::saveState() { - current_zookeeper->createOrUpdate(replica_path + "/last_entry", std::to_string(current_log_entry_n), zkutil::CreateMode::Persistent); - // TODO rename vars - String statement = std::to_string(current_log_entry_n); - String metadatafile = getMetadataPath() + ".last_entry"; - WriteBufferFromFile out(metadatafile, statement.size(), O_WRONLY | O_CREAT); - writeString(statement, out); + String state = std::to_string(current_log_entry_n); + + current_zookeeper = getZooKeeper(); + current_zookeeper->createOrUpdate(replica_path + "/last_entry", state, zkutil::CreateMode::Persistent); + + String metadata_file = getMetadataPath() + ".last_entry"; + WriteBufferFromFile out(metadata_file, state.size(), O_WRONLY | O_CREAT); + writeString(state, out); out.next(); if 
(global_context.getSettingsRef().fsync_metadata) out.sync(); From f928c897cf68b4bf73bf7b6108e469ef87bb385d Mon Sep 17 00:00:00 2001 From: Val Date: Sun, 7 Jun 2020 14:20:05 +0300 Subject: [PATCH 0028/2357] change replication algorithm, remove zk lock In this version of the databaseReplicated sequential persistent zk nodes are used to order DDL queries. Db replicated ddl queries are executed in the backgrould pool no matter whether it's proposed by the same replica or not. --- src/Databases/DatabaseReplicated.cpp | 84 +++++++++------------ src/Databases/DatabaseReplicated.h | 2 + src/Interpreters/InterpreterAlterQuery.cpp | 1 + src/Interpreters/InterpreterCreateQuery.cpp | 10 +-- src/Interpreters/InterpreterDropQuery.cpp | 9 ++- src/Interpreters/InterpreterRenameQuery.cpp | 14 ++-- 6 files changed, 55 insertions(+), 65 deletions(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 3dbacbaf33d..2650bd46a58 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -132,19 +132,34 @@ void DatabaseReplicated::createDatabaseZKNodes() { void DatabaseReplicated::runBackgroundLogExecutor() { current_zookeeper = getZooKeeper(); - String last_n = current_zookeeper->get(zookeeper_path + "/last_entry", {}, NULL); - size_t last_n_parsed = parse(last_n); + Strings log_entry_names = current_zookeeper->getChildren(zookeeper_path + "/log"); - bool newEntries = current_log_entry_n < last_n_parsed; - while (current_log_entry_n < last_n_parsed) { - current_log_entry_n++; - String log_path = zookeeper_path + "/log/log." + std::to_string(current_log_entry_n); - executeFromZK(log_path); - } - if (newEntries) { - saveState(); + std::sort(log_entry_names.begin(), log_entry_names.end()); + auto newest_entry_it = std::upper_bound(log_entry_names.begin(), log_entry_names.end(), last_executed_log_entry); + + log_entry_names.erase(log_entry_names.begin(), newest_entry_it); + + for (const String & log_entry_name : log_entry_names) { + String log_entry_path = zookeeper_path + "/log/" + log_entry_name; + executeFromZK(log_entry_path); + last_executed_log_entry = log_entry_name; } + background_log_executor->scheduleAfter(500); + + // String last_n = current_zookeeper->get(zookeeper_path + "/last_entry", {}, NULL); + // size_t last_n_parsed = parse(last_n); + + // bool newEntries = current_log_entry_n < last_n_parsed; + // while (current_log_entry_n < last_n_parsed) { + // current_log_entry_n++; + // String log_path = zookeeper_path + "/log/log." 
+ std::to_string(current_log_entry_n); + // executeFromZK(log_path); + // } + // if (newEntries) { + // saveState(); + // } + // background_log_executor->scheduleAfter(500); } void DatabaseReplicated::saveState() { @@ -187,53 +202,22 @@ void DatabaseReplicated::executeFromZK(String & path) { } // TODO Move to ZooKeeper/Lock and remove it from here and ddlworker -static std::unique_ptr createSimpleZooKeeperLock( - const std::shared_ptr & zookeeper, const String & lock_prefix, const String & lock_name, const String & lock_message) -{ - auto zookeeper_holder = std::make_shared(); - zookeeper_holder->initFromInstance(zookeeper); - return std::make_unique(std::move(zookeeper_holder), lock_prefix, lock_name, lock_message); -} +// static std::unique_ptr createSimpleZooKeeperLock( +// const std::shared_ptr & zookeeper, const String & lock_prefix, const String & lock_name, const String & lock_message) +// { +// auto zookeeper_holder = std::make_shared(); +// zookeeper_holder->initFromInstance(zookeeper); +// return std::make_unique(std::move(zookeeper_holder), lock_prefix, lock_name, lock_message); +// } void DatabaseReplicated::propose(const ASTPtr & query) { - // TODO remove that log message i think - LOG_DEBUG(log, "PROPOSING\n" << queryToString(query)); - current_zookeeper = getZooKeeper(); - auto lock = createSimpleZooKeeperLock(current_zookeeper, zookeeper_path, "propose_lock", replica_name); - while (!lock->tryLock()) { - // TODO it seems that zk lock doesn't work at all - // need to find a different solution for proposal - pcg64 rng(randomSeed()); - std::this_thread::sleep_for(std::chrono::milliseconds(std::uniform_int_distribution(0, 1000)(rng))); - } + LOG_DEBUG(log, "PROPOSINGGG query: " << queryToString(query)); + current_zookeeper->create(zookeeper_path + "/log/log-", queryToString(query), zkutil::CreateMode::PersistentSequential); - // schedule and deactive combo - // ensures that replica is up to date - // and since propose lock is acquired, - // no other propose can happen from - // different replicas during this call background_log_executor->schedule(); - background_log_executor->deactivate(); - -// if (current_log_entry_n > 5) { // make a settings variable -// // TODO check that all the replicas are up to date! -// updateSnapshot(); -// current_log_entry_n = 0; -// current_zookeeper->removeChildren(zookeeper_path + "/log"); -// } - - current_log_entry_n++; // starting from 1 - String log_entry = zookeeper_path + "/log/log." 
+ std::to_string(current_log_entry_n); - current_zookeeper->createOrUpdate(log_entry, queryToString(query), zkutil::CreateMode::Persistent); - - current_zookeeper->createOrUpdate(zookeeper_path + "/last_entry", std::to_string(current_log_entry_n), zkutil::CreateMode::Persistent); - - lock->unlock(); - saveState(); - background_log_executor->activateAndSchedule(); } void DatabaseReplicated::updateSnapshot() { diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index e81b78386f7..19a0ea09e11 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -59,6 +59,8 @@ private: std::atomic current_log_entry_n = 0; + String last_executed_log_entry = ""; + BackgroundSchedulePool::TaskHolder background_log_executor; String replica_path; diff --git a/src/Interpreters/InterpreterAlterQuery.cpp b/src/Interpreters/InterpreterAlterQuery.cpp index 134531d0cf0..6b4bcdde067 100644 --- a/src/Interpreters/InterpreterAlterQuery.cpp +++ b/src/Interpreters/InterpreterAlterQuery.cpp @@ -52,6 +52,7 @@ BlockIO InterpreterAlterQuery::execute() DatabasePtr database = DatabaseCatalog::instance().getDatabase(table_id.database_name); if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY && !table->supportsReplication()) { database->propose(query_ptr); + return {}; } /// Add default database to table identifiers that we can encounter in e.g. default expressions, diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 0b06fbfd874..6806679cb4d 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -688,6 +688,11 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, return true; } + if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { + database->propose(query_ptr); + return true; + } + StoragePtr res; /// NOTE: CREATE query may be rewritten by Storage creator or table function if (create.as_table_function) @@ -707,11 +712,6 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, properties.constraints, false); } - - - if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { - database->propose(query_ptr); - } database->createTable(context, table_name, res, query_ptr); /// We must call "startup" and "shutdown" while holding DDLGuard. 
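As an aside on the new algorithm: propose() now creates children of <zookeeper_path>/log with CreateMode::PersistentSequential, so ZooKeeper appends a zero-padded numeric suffix and plain lexicographic order matches creation order. That is why runBackgroundLogExecutor can simply sort the child names and take upper_bound over last_executed_log_entry. A self-contained illustration of that selection step (sample names only, not taken from a real server):

    #include <algorithm>
    #include <iostream>
    #include <string>
    #include <vector>

    int main()
    {
        // Children of <zookeeper_path>/log as getChildren() might return them, in arbitrary order.
        std::vector<std::string> log_entry_names = {"log-0000000002", "log-0000000000", "log-0000000003", "log-0000000001"};
        std::string last_executed_log_entry = "log-0000000001";

        std::sort(log_entry_names.begin(), log_entry_names.end());
        auto newest_entry_it = std::upper_bound(log_entry_names.begin(), log_entry_names.end(), last_executed_log_entry);
        log_entry_names.erase(log_entry_names.begin(), newest_entry_it);

        for (const auto & name : log_entry_names)
            std::cout << name << '\n';  // prints log-0000000002 and log-0000000003: the entries still to be executed
        return 0;
    }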
diff --git a/src/Interpreters/InterpreterDropQuery.cpp b/src/Interpreters/InterpreterDropQuery.cpp index afbf5d31fbf..05418f275a2 100644 --- a/src/Interpreters/InterpreterDropQuery.cpp +++ b/src/Interpreters/InterpreterDropQuery.cpp @@ -99,8 +99,9 @@ BlockIO InterpreterDropQuery::executeToTable( /// Drop table from memory, don't touch data and metadata if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { database->propose(query_ptr); + } else { + database->detachTable(table_id.table_name); } - database->detachTable(table_id.table_name); } else if (query.kind == ASTDropQuery::Kind::Truncate) { @@ -113,8 +114,9 @@ BlockIO InterpreterDropQuery::executeToTable( auto table_lock = table->lockExclusively(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { database->propose(query_ptr); + } else { + table->truncate(query_ptr, metadata_snapshot, context, table_lock); } - table->truncate(query_ptr, metadata_snapshot, context, table_lock); } else if (query.kind == ASTDropQuery::Kind::Drop) { @@ -129,8 +131,9 @@ BlockIO InterpreterDropQuery::executeToTable( if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { database->propose(query_ptr); + } else { + database->dropTable(context, table_id.table_name, query.no_delay); } - database->dropTable(context, table_id.table_name, query.no_delay); } } diff --git a/src/Interpreters/InterpreterRenameQuery.cpp b/src/Interpreters/InterpreterRenameQuery.cpp index 45003ab0d14..97206f6b364 100644 --- a/src/Interpreters/InterpreterRenameQuery.cpp +++ b/src/Interpreters/InterpreterRenameQuery.cpp @@ -83,15 +83,15 @@ BlockIO InterpreterRenameQuery::execute() DatabasePtr database = database_catalog.getDatabase(elem.from_database_name); if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { database->propose(query_ptr); + } else { + database->renameTable( + context, + elem.from_table_name, + *database_catalog.getDatabase(elem.to_database_name), + elem.to_table_name, + rename.exchange); } - database->renameTable( - context, - elem.from_table_name, - *database_catalog.getDatabase(elem.to_database_name), - elem.to_table_name, - rename.exchange); } - return {}; } From f6de720f59e8bc8619fbf8684e6d80e8459ba432 Mon Sep 17 00:00:00 2001 From: Val Date: Sun, 7 Jun 2020 14:26:42 +0300 Subject: [PATCH 0029/2357] speed up db replicated test --- tests/integration/test_replicated_database/test.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tests/integration/test_replicated_database/test.py b/tests/integration/test_replicated_database/test.py index 703690a7218..95ca5c1e138 100644 --- a/tests/integration/test_replicated_database/test.py +++ b/tests/integration/test_replicated_database/test.py @@ -49,7 +49,7 @@ def test_simple_alter_table(started_cluster): assert node1.query("desc table testdb.alter_test") == node2.query("desc table testdb.alter_test") def test_create_replica_after_delay(started_cluster): - DURATION_SECONDS = 3 + DURATION_SECONDS = 2 node3.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', '{replica}');") @@ -65,18 +65,20 @@ def test_alters_from_different_replicas(started_cluster): DURATION_SECONDS = 1 
node1.query("CREATE TABLE testdb.concurrent_test (CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192);") + time.sleep(DURATION_SECONDS) node3.query("ALTER TABLE testdb.concurrent_test ADD COLUMN Added0 UInt32;") - time.sleep(DURATION_SECONDS) node1.query("ALTER TABLE testdb.concurrent_test ADD COLUMN Added2 UInt32;") - time.sleep(DURATION_SECONDS) node3.query("ALTER TABLE testdb.concurrent_test ADD COLUMN Added1 UInt32 AFTER Added0;") - time.sleep(DURATION_SECONDS) node1.query("ALTER TABLE testdb.concurrent_test ADD COLUMN AddedNested1 Nested(A UInt32, B UInt64) AFTER Added2;") - time.sleep(DURATION_SECONDS) node3.query("ALTER TABLE testdb.concurrent_test ADD COLUMN AddedNested1.C Array(String) AFTER AddedNested1.B;") - time.sleep(DURATION_SECONDS) node1.query("ALTER TABLE testdb.concurrent_test ADD COLUMN AddedNested2 Nested(A UInt32, B UInt64) AFTER AddedNested1;") + time.sleep(DURATION_SECONDS) + + logging.info("NODE3") + logging.info(node3.query("desc table testdb.concurrent_test")) + logging.info("NODE1") + logging.info(node1.query("desc table testdb.concurrent_test")) assert node3.query("desc table testdb.concurrent_test") == node1.query("desc table testdb.concurrent_test") From e8e4e4d21c559fc3548d791dea65aa7871e8d19f Mon Sep 17 00:00:00 2001 From: Val Date: Sat, 20 Jun 2020 18:38:20 +0300 Subject: [PATCH 0030/2357] add tests for db replicated --- .../configs/disable_snapshots.xml | 3 ++ .../configs/snapshot_each_query.xml | 3 ++ .../test_replicated_database/test.py | 40 ++++++++++++------- 3 files changed, 31 insertions(+), 15 deletions(-) create mode 100644 tests/integration/test_replicated_database/configs/disable_snapshots.xml create mode 100644 tests/integration/test_replicated_database/configs/snapshot_each_query.xml diff --git a/tests/integration/test_replicated_database/configs/disable_snapshots.xml b/tests/integration/test_replicated_database/configs/disable_snapshots.xml new file mode 100644 index 00000000000..9a656bdcea1 --- /dev/null +++ b/tests/integration/test_replicated_database/configs/disable_snapshots.xml @@ -0,0 +1,3 @@ + + 0 + diff --git a/tests/integration/test_replicated_database/configs/snapshot_each_query.xml b/tests/integration/test_replicated_database/configs/snapshot_each_query.xml new file mode 100644 index 00000000000..6eae1d9d992 --- /dev/null +++ b/tests/integration/test_replicated_database/configs/snapshot_each_query.xml @@ -0,0 +1,3 @@ + + 1 + diff --git a/tests/integration/test_replicated_database/test.py b/tests/integration/test_replicated_database/test.py index 95ca5c1e138..b557354b6ba 100644 --- a/tests/integration/test_replicated_database/test.py +++ b/tests/integration/test_replicated_database/test.py @@ -10,18 +10,16 @@ logging.getLogger().addHandler(logging.StreamHandler()) cluster = ClickHouseCluster(__file__) -node1 = cluster.add_instance('node1', macros={'replica': 'test1'}, with_zookeeper=True) -node2 = cluster.add_instance('node2', macros={'replica': 'test2'}, with_zookeeper=True) -node3 = cluster.add_instance('node3', macros={'replica': 'test3'}, with_zookeeper=True) +node1 = cluster.add_instance('node1', main_configs=['configs/disable_snapshots.xml'], with_zookeeper=True, stay_alive=True) +node2 = cluster.add_instance('node2', main_configs=['configs/disable_snapshots.xml'], with_zookeeper=True) +node3 = cluster.add_instance('node3', 
main_configs=['configs/disable_snapshots.xml'], with_zookeeper=True) @pytest.fixture(scope="module") def started_cluster(): try: cluster.start() - - for node in [node1, node2]: - node.query("DROP DATABASE IF EXISTS testdb") - node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', '{replica}');") + node1.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'replica1');") + node2.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'replica2');") yield cluster finally: @@ -49,15 +47,13 @@ def test_simple_alter_table(started_cluster): assert node1.query("desc table testdb.alter_test") == node2.query("desc table testdb.alter_test") def test_create_replica_after_delay(started_cluster): - DURATION_SECONDS = 2 - - node3.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', '{replica}');") + node3.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'replica3');") node1.query("ALTER TABLE testdb.alter_test ADD COLUMN Added3 UInt32 ;") node1.query("ALTER TABLE testdb.alter_test ADD COLUMN Added4 UInt32 ;") node1.query("ALTER TABLE testdb.alter_test ADD COLUMN Added5 UInt32 ;") - time.sleep(DURATION_SECONDS) + time.sleep(6) assert node3.query("desc table testdb.alter_test") == node1.query("desc table testdb.alter_test") @@ -77,8 +73,22 @@ def test_alters_from_different_replicas(started_cluster): time.sleep(DURATION_SECONDS) - logging.info("NODE3") - logging.info(node3.query("desc table testdb.concurrent_test")) - logging.info("NODE1") - logging.info(node1.query("desc table testdb.concurrent_test")) assert node3.query("desc table testdb.concurrent_test") == node1.query("desc table testdb.concurrent_test") + +def test_drop_and_create_table(started_cluster): + node1.query("DROP TABLE testdb.concurrent_test") + node1.query("CREATE TABLE testdb.concurrent_test (CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192);") + time.sleep(5) + assert node3.query("desc table testdb.concurrent_test") == node1.query("desc table testdb.concurrent_test") + +def test_replica_restart(started_cluster): + node1.restart_clickhouse() + time.sleep(5) + assert node3.query("desc table testdb.concurrent_test") == node1.query("desc table testdb.concurrent_test") + +#def test_drop_and_create_replica(started_cluster): +# node1.query("DROP DATABASE testdb") +# node1.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'replica1');") +# time.sleep(6) +# assert node3.query("desc table testdb.concurrent_test") == node1.query("desc table testdb.concurrent_test") + From f57fd52e3b564072d7c2ae61ecaf06138c4201ed Mon Sep 17 00:00:00 2001 From: Val Date: Sat, 20 Jun 2020 18:39:05 +0300 Subject: [PATCH 0031/2357] fix recursive propose for drop database db replicated query --- src/Interpreters/InterpreterDropQuery.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Interpreters/InterpreterDropQuery.cpp b/src/Interpreters/InterpreterDropQuery.cpp index 05418f275a2..368024da043 100644 --- a/src/Interpreters/InterpreterDropQuery.cpp +++ b/src/Interpreters/InterpreterDropQuery.cpp @@ -129,7 +129,8 @@ BlockIO InterpreterDropQuery::executeToTable( if (database->getEngineName() != "Atomic" && database->getEngineName() != "Replicated") table_lock = 
table->lockExclusively(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); - if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { + // Prevents recursive drop from drop database query. The original query must specify a table. + if (!query_ptr->as().table.empty() && database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { database->propose(query_ptr); } else { database->dropTable(context, table_id.table_name, query.no_delay); From 4fc4b1d195bce04dfd08252eb6c0e3f58d0182f9 Mon Sep 17 00:00:00 2001 From: Val Date: Sat, 20 Jun 2020 18:39:58 +0300 Subject: [PATCH 0032/2357] db replicated minor enhancements --- src/Databases/DatabaseAtomic.cpp | 7 ++ src/Databases/DatabaseAtomic.h | 1 + src/Databases/DatabaseReplicated.cpp | 176 +++++++++++++++++++-------- src/Databases/DatabaseReplicated.h | 16 +-- src/Databases/DatabasesCommon.cpp | 4 +- 5 files changed, 142 insertions(+), 62 deletions(-) diff --git a/src/Databases/DatabaseAtomic.cpp b/src/Databases/DatabaseAtomic.cpp index ff30b95d139..85f6c70a07c 100644 --- a/src/Databases/DatabaseAtomic.cpp +++ b/src/Databases/DatabaseAtomic.cpp @@ -40,6 +40,13 @@ DatabaseAtomic::DatabaseAtomic(String name_, String metadata_path_, Context & co Poco::File(path_to_table_symlinks).createDirectories(); } +DatabaseAtomic::DatabaseAtomic(String name_, String metadata_path_, const String & data_path_, const String & logger, Context & context_) + : DatabaseOrdinary(name_, std::move(metadata_path_), data_path_, logger, context_) + , path_to_table_symlinks(context_.getPath() + "data/" + escapeForFileName(name_) + "/") +{ + Poco::File(path_to_table_symlinks).createDirectories(); +} + String DatabaseAtomic::getTableDataPath(const String & table_name) const { std::lock_guard lock(mutex); diff --git a/src/Databases/DatabaseAtomic.h b/src/Databases/DatabaseAtomic.h index 71428fdb420..88a77da53a4 100644 --- a/src/Databases/DatabaseAtomic.h +++ b/src/Databases/DatabaseAtomic.h @@ -22,6 +22,7 @@ class DatabaseAtomic : public DatabaseOrdinary public: DatabaseAtomic(String name_, String metadata_path_, Context & context_); + DatabaseAtomic(String name_, String metadata_path_, const String & data_path_, const String & logger, Context & context_); String getEngineName() const override { return "Atomic"; } diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 2650bd46a58..4d16a5d05c0 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -49,6 +49,7 @@ namespace DB namespace ErrorCodes { extern const int NO_ZOOKEEPER; + extern const int FILE_DOESNT_EXIST; } void DatabaseReplicated::setZooKeeper(zkutil::ZooKeeperPtr zookeeper) @@ -78,9 +79,7 @@ DatabaseReplicated::DatabaseReplicated( const String & zookeeper_path_, const String & replica_name_, Context & context_) -// : DatabaseOrdinary(name_, metadata_path_, "data/" + escapeForFileName(name_) + "/", "DatabaseReplicated (" + name_ + ")", context_) - // TODO add constructor to Atomic and call it here with path and logger name specification - : DatabaseAtomic(name_, metadata_path_, context_) + : DatabaseAtomic(name_, metadata_path_, "store/", "DatabaseReplicated (" + name_ + ")", context_) , zookeeper_path(zookeeper_path_) , replica_name(replica_name_) { @@ -90,8 +89,6 @@ DatabaseReplicated::DatabaseReplicated( if (!zookeeper_path.empty() && zookeeper_path.front() != '/') 
zookeeper_path = "/" + zookeeper_path; - replica_path = zookeeper_path + "/replicas/" + replica_name; - if (context_.hasZooKeeper()) { current_zookeeper = context_.getZooKeeper(); } @@ -100,37 +97,101 @@ DatabaseReplicated::DatabaseReplicated( throw Exception("Can't create replicated database without ZooKeeper", ErrorCodes::NO_ZOOKEEPER); } + // New database if (!current_zookeeper->exists(zookeeper_path, {}, NULL)) { createDatabaseZKNodes(); - } + // Old replica recovery + } else if (current_zookeeper->exists(zookeeper_path + "/replicas/" + replica_name, {}, NULL)) { + String local_last_entry; + try + { + ReadBufferFromFile in(getMetadataPath() + ".last_entry", 16); + readStringUntilEOF(local_last_entry, in); + } + catch (const Exception & e) + { + if (e.code() == ErrorCodes::FILE_DOESNT_EXIST) { + // that is risky cause + // if replica name is the same + // than the last one wins + saveState(); + } else { + throw; + } + } - // replica - if (!current_zookeeper->exists(replica_path, {}, NULL)) { - current_zookeeper->createAncestors(replica_path); - current_zookeeper->createOrUpdate(replica_path, String(), zkutil::CreateMode::Persistent); + String remote_last_entry = current_zookeeper->get(zookeeper_path + "/replicas/" + replica_name, {}, NULL); + if (local_last_entry == remote_last_entry) { + last_executed_log_entry = local_last_entry; + } else { + LOG_DEBUG(log, "LOCAL: " << local_last_entry); + LOG_DEBUG(log, "ZK: " << remote_last_entry); + throw Exception("Can't create replicated database MISCONFIGURATION or something", ErrorCodes::NO_ZOOKEEPER); + } } - //loadMetadataFromSnapshot(); + snapshot_period = context_.getConfigRef().getInt("database_replicated_snapshot_period", 10); + LOG_DEBUG(log, "Snapshot period is set to " << snapshot_period); - background_log_executor = global_context.getReplicatedSchedulePool().createTask(database_name + "(DatabaseReplicated::the_threeeed)", [this]{ runBackgroundLogExecutor();} ); - background_log_executor->schedule(); + background_log_executor = global_context.getReplicatedSchedulePool().createTask(database_name + "(DatabaseReplicated::background_executor)", [this]{ runBackgroundLogExecutor();} ); + + background_log_executor->scheduleAfter(500); } void DatabaseReplicated::createDatabaseZKNodes() { current_zookeeper = getZooKeeper(); - if (current_zookeeper->exists(zookeeper_path)) - return; - current_zookeeper->createAncestors(zookeeper_path); current_zookeeper->createIfNotExists(zookeeper_path, String()); - current_zookeeper->createIfNotExists(zookeeper_path + "/last_entry", "0"); current_zookeeper->createIfNotExists(zookeeper_path + "/log", String()); - current_zookeeper->createIfNotExists(zookeeper_path + "/snapshot", String()); + current_zookeeper->createIfNotExists(zookeeper_path + "/snapshots", String()); + current_zookeeper->createIfNotExists(zookeeper_path + "/replicas", String()); +} + +void DatabaseReplicated::RemoveOutdatedSnapshotsAndLog() { + // This method removes all snapshots and logged queries + // that no longer will be in use by current replicas or + // new coming ones. + // Each registered replica has its state in ZooKeeper. + // Therefore removed snapshots and logged queries are less + // than a least advanced replica. + // It does not interfere with a new coming replica + // metadata loading from snapshot + // because the replica will use the last snapshot available + // and this snapshot will set the last executed log query + // to a greater one than the least advanced current replica. 
+ current_zookeeper = getZooKeeper(); + Strings replica_states = current_zookeeper->getChildren(zookeeper_path + "/replicas"); + auto least_advanced = std::min_element(replica_states.begin(), replica_states.end()); + Strings snapshots = current_zookeeper->getChildren(zookeeper_path + "/snapshots"); + + if (snapshots.size() < 2) { + return; + } + + std::sort(snapshots.begin(), snapshots.end()); + auto still_useful = std::lower_bound(snapshots.begin(), snapshots.end(), *least_advanced); + snapshots.erase(still_useful, snapshots.end()); + for (const String & snapshot : snapshots) { + current_zookeeper->tryRemoveRecursive(zookeeper_path + "/snapshots/" + snapshot); + } + + Strings log_entry_names = current_zookeeper->getChildren(zookeeper_path + "/log"); + std::sort(log_entry_names.begin(), log_entry_names.end()); + auto still_useful_log = std::upper_bound(log_entry_names.begin(), log_entry_names.end(), *still_useful); + log_entry_names.erase(still_useful_log, log_entry_names.end()); + for (const String & log_entry_name : log_entry_names) { + String log_entry_path = zookeeper_path + "/log/" + log_entry_name; + current_zookeeper->tryRemove(log_entry_path); + } } void DatabaseReplicated::runBackgroundLogExecutor() { + if (last_executed_log_entry == "") { + loadMetadataFromSnapshot(); + } + current_zookeeper = getZooKeeper(); Strings log_entry_names = current_zookeeper->getChildren(zookeeper_path + "/log"); @@ -143,34 +204,27 @@ void DatabaseReplicated::runBackgroundLogExecutor() { String log_entry_path = zookeeper_path + "/log/" + log_entry_name; executeFromZK(log_entry_path); last_executed_log_entry = log_entry_name; + saveState(); + + int log_n = parse(log_entry_name.substr(4)); + int last_log_n = parse(log_entry_names.back().substr(4)); + + // The third condition gurantees at most one snapshot per batch + if (log_n > 0 && snapshot_period > 0 && (last_log_n - log_n) / snapshot_period == 0 && log_n % snapshot_period == 0) { + createSnapshot(); + } } background_log_executor->scheduleAfter(500); - - // String last_n = current_zookeeper->get(zookeeper_path + "/last_entry", {}, NULL); - // size_t last_n_parsed = parse(last_n); - - // bool newEntries = current_log_entry_n < last_n_parsed; - // while (current_log_entry_n < last_n_parsed) { - // current_log_entry_n++; - // String log_path = zookeeper_path + "/log/log." 
+ std::to_string(current_log_entry_n); - // executeFromZK(log_path); - // } - // if (newEntries) { - // saveState(); - // } - // background_log_executor->scheduleAfter(500); } void DatabaseReplicated::saveState() { - String state = std::to_string(current_log_entry_n); - current_zookeeper = getZooKeeper(); - current_zookeeper->createOrUpdate(replica_path + "/last_entry", state, zkutil::CreateMode::Persistent); + current_zookeeper->createOrUpdate(zookeeper_path + "/replicas/" + replica_name, last_executed_log_entry, zkutil::CreateMode::Persistent); String metadata_file = getMetadataPath() + ".last_entry"; - WriteBufferFromFile out(metadata_file, state.size(), O_WRONLY | O_CREAT); - writeString(state, out); + WriteBufferFromFile out(metadata_file, last_executed_log_entry.size(), O_WRONLY | O_CREAT); + writeString(last_executed_log_entry, out); out.next(); if (global_context.getSettingsRef().fsync_metadata) out.sync(); @@ -201,47 +255,63 @@ void DatabaseReplicated::executeFromZK(String & path) { LOG_DEBUG(log, "Executed query: " << query_to_execute); } -// TODO Move to ZooKeeper/Lock and remove it from here and ddlworker -// static std::unique_ptr createSimpleZooKeeperLock( -// const std::shared_ptr & zookeeper, const String & lock_prefix, const String & lock_name, const String & lock_message) -// { -// auto zookeeper_holder = std::make_shared(); -// zookeeper_holder->initFromInstance(zookeeper); -// return std::make_unique(std::move(zookeeper_holder), lock_prefix, lock_name, lock_message); -// } - - void DatabaseReplicated::propose(const ASTPtr & query) { current_zookeeper = getZooKeeper(); - LOG_DEBUG(log, "PROPOSINGGG query: " << queryToString(query)); + LOG_DEBUG(log, "Writing the query to log: " << queryToString(query)); current_zookeeper->create(zookeeper_path + "/log/log-", queryToString(query), zkutil::CreateMode::PersistentSequential); background_log_executor->schedule(); } -void DatabaseReplicated::updateSnapshot() { +void DatabaseReplicated::createSnapshot() { current_zookeeper = getZooKeeper(); - current_zookeeper->tryRemoveChildren(zookeeper_path + "/snapshot"); + String snapshot_path = zookeeper_path + "/snapshots/" + last_executed_log_entry; + + if (Coordination::ZNODEEXISTS == current_zookeeper->tryCreate(snapshot_path, String(), zkutil::CreateMode::Persistent)) { + return; + } + for (auto iterator = getTablesIterator({}); iterator->isValid(); iterator->next()) { String table_name = iterator->name(); auto query = getCreateQueryFromMetadata(getObjectMetadataPath(table_name), true); String statement = queryToString(query); - current_zookeeper->createOrUpdate(zookeeper_path + "/snapshot/" + table_name, statement, zkutil::CreateMode::Persistent); + current_zookeeper->createOrUpdate(snapshot_path + "/" + table_name, statement, zkutil::CreateMode::Persistent); } + + RemoveOutdatedSnapshotsAndLog(); } void DatabaseReplicated::loadMetadataFromSnapshot() { current_zookeeper = getZooKeeper(); + Strings snapshots; + if (current_zookeeper->tryGetChildren(zookeeper_path + "/snapshots", snapshots) != Coordination::ZOK) + return; + + if (snapshots.size() < 1) { + return; + } + + auto latest_snapshot = std::max_element(snapshots.begin(), snapshots.end()); Strings metadatas; - if (current_zookeeper->tryGetChildren(zookeeper_path + "/snapshot", metadatas) != Coordination::ZOK) + if (current_zookeeper->tryGetChildren(zookeeper_path + "/snapshots/" + *latest_snapshot, metadatas) != Coordination::ZOK) return; for (auto t = metadatas.begin(); t != metadatas.end(); ++t) { - String path = 
zookeeper_path + "/snapshot/" + *t; + String path = zookeeper_path + "/snapshots/" + *latest_snapshot + "/" + *t; executeFromZK(path); } + + last_executed_log_entry = *latest_snapshot; + saveState(); +} + +void DatabaseReplicated::drop(const Context & context_) +{ + current_zookeeper = getZooKeeper(); + current_zookeeper->tryRemove(zookeeper_path + "/replicas/" + replica_name); + DatabaseAtomic::drop(context_); } } diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index 19a0ea09e11..471365361b7 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -23,13 +23,13 @@ namespace DB * The engine has two parameters ZooKeeper path and * replica name. * The same ZooKeeper path corresponds to the same - * database. Replica names must be different for all replicas + * database. Replica names MUST be different for all replicas * of the same database. * * Using this engine, creation of Replicated tables * requires no ZooKeeper path and replica name parameters. * Table's replica name is the same as database replica name. - * Table's ZooKeeper path is a concatenation of database's + * Table's ZooKeeper path is a concatenation of database * ZooKeeper path, /tables/, and UUID of the table. */ class DatabaseReplicated : public DatabaseAtomic @@ -37,6 +37,8 @@ class DatabaseReplicated : public DatabaseAtomic public: DatabaseReplicated(const String & name_, const String & metadata_path_, const String & zookeeper_path_, const String & replica_name_, Context & context); + void drop(const Context & /*context*/) override; + String getEngineName() const override { return "Replicated"; } void propose(const ASTPtr & query) override; @@ -48,23 +50,23 @@ private: void createDatabaseZKNodes(); void runBackgroundLogExecutor(); - + void executeFromZK(String & path); void saveState(); - void updateSnapshot(); + void loadMetadataFromSnapshot(); + void createSnapshot(); + void RemoveOutdatedSnapshotsAndLog(); std::unique_ptr current_context; // to run executeQuery - std::atomic current_log_entry_n = 0; + int snapshot_period; String last_executed_log_entry = ""; BackgroundSchedulePool::TaskHolder background_log_executor; - String replica_path; - zkutil::ZooKeeperPtr current_zookeeper; /// Use only the methods below. mutable std::mutex current_zookeeper_mutex; /// To recreate the session in the background thread. 
diff --git a/src/Databases/DatabasesCommon.cpp b/src/Databases/DatabasesCommon.cpp index 7925d812241..4575e6da953 100644 --- a/src/Databases/DatabasesCommon.cpp +++ b/src/Databases/DatabasesCommon.cpp @@ -78,7 +78,7 @@ StoragePtr DatabaseWithOwnTablesBase::detachTableUnlocked(const String & table_n auto table_id = res->getStorageID(); if (table_id.hasUUID()) { - assert(getDatabaseName() == DatabaseCatalog::TEMPORARY_DATABASE || getEngineName() == "Atomic"); + assert(getDatabaseName() == DatabaseCatalog::TEMPORARY_DATABASE || getEngineName() == "Atomic" || getEngineName() == "Replicated"); DatabaseCatalog::instance().removeUUIDMapping(table_id.uuid); } @@ -120,7 +120,7 @@ void DatabaseWithOwnTablesBase::shutdown() kv.second->shutdown(); if (table_id.hasUUID()) { - assert(getDatabaseName() == DatabaseCatalog::TEMPORARY_DATABASE || getEngineName() == "Atomic"); + assert(getDatabaseName() == DatabaseCatalog::TEMPORARY_DATABASE || getEngineName() == "Atomic" || getEngineName() == "Replicated"); DatabaseCatalog::instance().removeUUIDMapping(table_id.uuid); } } From 82f5281cfe52ce4643ced3b4ad3f2c229b894014 Mon Sep 17 00:00:00 2001 From: Val Date: Sun, 21 Jun 2020 18:03:04 +0300 Subject: [PATCH 0033/2357] remove redundant includes --- src/Databases/DatabaseReplicated.cpp | 28 ---------------------------- src/Databases/DatabaseReplicated.h | 4 ---- 2 files changed, 32 deletions(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 4d16a5d05c0..5a42edd9f0d 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -1,46 +1,18 @@ -#include - -#include -#include -#include #include -#include #include #include #include #include #include #include -#include #include -#include -#include -#include -#include -#include -#include -#include - #include - -#include -#include -#include -#include -#include -#include -#include -#include -#include #include - #include #include #include #include -#include -#include namespace DB { diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index 471365361b7..ab7b596eb4e 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -1,13 +1,9 @@ #pragma once #include -#include #include #include -#include -#include - namespace DB { /** DatabaseReplicated engine From 67588edcf5c5fea7e29958329b38b6d3db2b9d0f Mon Sep 17 00:00:00 2001 From: Val Date: Mon, 22 Jun 2020 17:19:26 +0300 Subject: [PATCH 0034/2357] clean up db replicated files and add more tests --- src/Databases/DatabaseReplicated.cpp | 39 +++++---- src/Databases/DatabaseReplicated.h | 2 +- .../test_replicated_database/test.py | 81 ++++++++++--------- 3 files changed, 65 insertions(+), 57 deletions(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 5a42edd9f0d..6a137a2af0c 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -21,7 +21,7 @@ namespace DB namespace ErrorCodes { extern const int NO_ZOOKEEPER; - extern const int FILE_DOESNT_EXIST; + extern const int LOGICAL_ERROR; } void DatabaseReplicated::setZooKeeper(zkutil::ZooKeeperPtr zookeeper) @@ -74,6 +74,8 @@ DatabaseReplicated::DatabaseReplicated( createDatabaseZKNodes(); // Old replica recovery } else if (current_zookeeper->exists(zookeeper_path + "/replicas/" + replica_name, {}, NULL)) { + String remote_last_entry = current_zookeeper->get(zookeeper_path + "/replicas/" + replica_name, {}, NULL); + String local_last_entry; try { @@ 
-82,28 +84,21 @@ DatabaseReplicated::DatabaseReplicated( } catch (const Exception & e) { - if (e.code() == ErrorCodes::FILE_DOESNT_EXIST) { - // that is risky cause - // if replica name is the same - // than the last one wins - saveState(); - } else { - throw; - } + // Metadata is corrupted. + // Replica erases the previous zk last executed log entry + // and behaves like a new clean replica. + writeLastExecutedToDiskAndZK(); } - String remote_last_entry = current_zookeeper->get(zookeeper_path + "/replicas/" + replica_name, {}, NULL); - if (local_last_entry == remote_last_entry) { + if (!local_last_entry.empty() && local_last_entry == remote_last_entry) { last_executed_log_entry = local_last_entry; } else { - LOG_DEBUG(log, "LOCAL: " << local_last_entry); - LOG_DEBUG(log, "ZK: " << remote_last_entry); - throw Exception("Can't create replicated database MISCONFIGURATION or something", ErrorCodes::NO_ZOOKEEPER); + throw Exception("Replica name might be in use by a different node. Please check replica_name parameter. Remove .last_entry file from metadata to create a new replica.", ErrorCodes::LOGICAL_ERROR); } } snapshot_period = context_.getConfigRef().getInt("database_replicated_snapshot_period", 10); - LOG_DEBUG(log, "Snapshot period is set to " << snapshot_period); + LOG_DEBUG(log, "Snapshot period is set to " << snapshot_period << " log entries per one snapshot"); background_log_executor = global_context.getReplicatedSchedulePool().createTask(database_name + "(DatabaseReplicated::background_executor)", [this]{ runBackgroundLogExecutor();} ); @@ -176,12 +171,12 @@ void DatabaseReplicated::runBackgroundLogExecutor() { String log_entry_path = zookeeper_path + "/log/" + log_entry_name; executeFromZK(log_entry_path); last_executed_log_entry = log_entry_name; - saveState(); + writeLastExecutedToDiskAndZK(); int log_n = parse(log_entry_name.substr(4)); int last_log_n = parse(log_entry_names.back().substr(4)); - // The third condition gurantees at most one snapshot per batch + // The third condition gurantees at most one snapshot creation per batch if (log_n > 0 && snapshot_period > 0 && (last_log_n - log_n) / snapshot_period == 0 && log_n % snapshot_period == 0) { createSnapshot(); } @@ -190,7 +185,7 @@ void DatabaseReplicated::runBackgroundLogExecutor() { background_log_executor->scheduleAfter(500); } -void DatabaseReplicated::saveState() { +void DatabaseReplicated::writeLastExecutedToDiskAndZK() { current_zookeeper = getZooKeeper(); current_zookeeper->createOrUpdate(zookeeper_path + "/replicas/" + replica_name, last_executed_log_entry, zkutil::CreateMode::Persistent); @@ -230,7 +225,7 @@ void DatabaseReplicated::executeFromZK(String & path) { void DatabaseReplicated::propose(const ASTPtr & query) { current_zookeeper = getZooKeeper(); - LOG_DEBUG(log, "Writing the query to log: " << queryToString(query)); + LOG_DEBUG(log, "Proposing query: " << queryToString(query)); current_zookeeper->create(zookeeper_path + "/log/log-", queryToString(query), zkutil::CreateMode::PersistentSequential); background_log_executor->schedule(); @@ -255,6 +250,8 @@ void DatabaseReplicated::createSnapshot() { } void DatabaseReplicated::loadMetadataFromSnapshot() { + // Executes the latest snapshot. + // Used by new replicas only. 
current_zookeeper = getZooKeeper(); Strings snapshots; @@ -270,13 +267,15 @@ void DatabaseReplicated::loadMetadataFromSnapshot() { if (current_zookeeper->tryGetChildren(zookeeper_path + "/snapshots/" + *latest_snapshot, metadatas) != Coordination::ZOK) return; + LOG_DEBUG(log, "Executing " << *latest_snapshot << " snapshot"); for (auto t = metadatas.begin(); t != metadatas.end(); ++t) { String path = zookeeper_path + "/snapshots/" + *latest_snapshot + "/" + *t; + executeFromZK(path); } last_executed_log_entry = *latest_snapshot; - saveState(); + writeLastExecutedToDiskAndZK(); } void DatabaseReplicated::drop(const Context & context_) diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index ab7b596eb4e..1cdcc3e990c 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -49,7 +49,7 @@ private: void executeFromZK(String & path); - void saveState(); + void writeLastExecutedToDiskAndZK(); void loadMetadataFromSnapshot(); void createSnapshot(); diff --git a/tests/integration/test_replicated_database/test.py b/tests/integration/test_replicated_database/test.py index b557354b6ba..0b7f8aadec2 100644 --- a/tests/integration/test_replicated_database/test.py +++ b/tests/integration/test_replicated_database/test.py @@ -10,16 +10,18 @@ logging.getLogger().addHandler(logging.StreamHandler()) cluster = ClickHouseCluster(__file__) -node1 = cluster.add_instance('node1', main_configs=['configs/disable_snapshots.xml'], with_zookeeper=True, stay_alive=True) -node2 = cluster.add_instance('node2', main_configs=['configs/disable_snapshots.xml'], with_zookeeper=True) -node3 = cluster.add_instance('node3', main_configs=['configs/disable_snapshots.xml'], with_zookeeper=True) +main_node = cluster.add_instance('main_node', main_configs=['configs/disable_snapshots.xml'], with_zookeeper=True, stay_alive=True) +dummy_node = cluster.add_instance('dummy_node', main_configs=['configs/disable_snapshots.xml'], with_zookeeper=True) +competing_node = cluster.add_instance('competing_node', main_configs=['configs/disable_snapshots.xml'], with_zookeeper=True) +snapshotting_node = cluster.add_instance('snapshotting_node', main_configs=['configs/snapshot_each_query.xml'], with_zookeeper=True) +snapshot_recovering_node = cluster.add_instance('snapshot_recovering_node', main_configs=['configs/disable_snapshots.xml'], with_zookeeper=True) @pytest.fixture(scope="module") def started_cluster(): try: cluster.start() - node1.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'replica1');") - node2.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'replica2');") + main_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'replica1');") + dummy_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'replica2');") yield cluster finally: @@ -28,67 +30,74 @@ def started_cluster(): def test_create_replicated_table(started_cluster): DURATION_SECONDS = 1 - node1.query("CREATE TABLE testdb.replicated_table (d Date, k UInt64, i32 Int32) ENGINE=ReplicatedMergeTree(d, k, 8192);") + main_node.query("CREATE TABLE testdb.replicated_table (d Date, k UInt64, i32 Int32) ENGINE=ReplicatedMergeTree(d, k, 8192);") time.sleep(DURATION_SECONDS) - assert node1.query("desc table testdb.replicated_table") == node2.query("desc table testdb.replicated_table") + assert main_node.query("desc table testdb.replicated_table") == dummy_node.query("desc table 
testdb.replicated_table") def test_simple_alter_table(started_cluster): DURATION_SECONDS = 1 - node1.query("CREATE TABLE testdb.alter_test (CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192);") - node1.query("ALTER TABLE testdb.alter_test ADD COLUMN Added0 UInt32;") - node1.query("ALTER TABLE testdb.alter_test ADD COLUMN Added2 UInt32;") - node1.query("ALTER TABLE testdb.alter_test ADD COLUMN Added1 UInt32 AFTER Added0;") - node1.query("ALTER TABLE testdb.alter_test ADD COLUMN AddedNested1 Nested(A UInt32, B UInt64) AFTER Added2;") - node1.query("ALTER TABLE testdb.alter_test ADD COLUMN AddedNested1.C Array(String) AFTER AddedNested1.B;") - node1.query("ALTER TABLE testdb.alter_test ADD COLUMN AddedNested2 Nested(A UInt32, B UInt64) AFTER AddedNested1;") + main_node.query("CREATE TABLE testdb.alter_test (CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192);") + main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN Added0 UInt32;") + main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN Added2 UInt32;") + main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN Added1 UInt32 AFTER Added0;") + main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN AddedNested1 Nested(A UInt32, B UInt64) AFTER Added2;") + main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN AddedNested1.C Array(String) AFTER AddedNested1.B;") + main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN AddedNested2 Nested(A UInt32, B UInt64) AFTER AddedNested1;") time.sleep(DURATION_SECONDS) - assert node1.query("desc table testdb.alter_test") == node2.query("desc table testdb.alter_test") + assert main_node.query("desc table testdb.alter_test") == dummy_node.query("desc table testdb.alter_test") def test_create_replica_after_delay(started_cluster): - node3.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'replica3');") + competing_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'replica3');") - node1.query("ALTER TABLE testdb.alter_test ADD COLUMN Added3 UInt32 ;") - node1.query("ALTER TABLE testdb.alter_test ADD COLUMN Added4 UInt32 ;") - node1.query("ALTER TABLE testdb.alter_test ADD COLUMN Added5 UInt32 ;") + main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN Added3 UInt32 ;") + main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN Added4 UInt32 ;") + main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN Added5 UInt32 ;") time.sleep(6) - assert node3.query("desc table testdb.alter_test") == node1.query("desc table testdb.alter_test") + assert competing_node.query("desc table testdb.alter_test") == main_node.query("desc table testdb.alter_test") def test_alters_from_different_replicas(started_cluster): DURATION_SECONDS = 1 - node1.query("CREATE TABLE testdb.concurrent_test (CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192);") + main_node.query("CREATE TABLE testdb.concurrent_test (CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) ENGINE = 
MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192);") time.sleep(DURATION_SECONDS) - node3.query("ALTER TABLE testdb.concurrent_test ADD COLUMN Added0 UInt32;") - node1.query("ALTER TABLE testdb.concurrent_test ADD COLUMN Added2 UInt32;") - node3.query("ALTER TABLE testdb.concurrent_test ADD COLUMN Added1 UInt32 AFTER Added0;") - node1.query("ALTER TABLE testdb.concurrent_test ADD COLUMN AddedNested1 Nested(A UInt32, B UInt64) AFTER Added2;") - node3.query("ALTER TABLE testdb.concurrent_test ADD COLUMN AddedNested1.C Array(String) AFTER AddedNested1.B;") - node1.query("ALTER TABLE testdb.concurrent_test ADD COLUMN AddedNested2 Nested(A UInt32, B UInt64) AFTER AddedNested1;") + competing_node.query("ALTER TABLE testdb.concurrent_test ADD COLUMN Added0 UInt32;") + main_node.query("ALTER TABLE testdb.concurrent_test ADD COLUMN Added2 UInt32;") + competing_node.query("ALTER TABLE testdb.concurrent_test ADD COLUMN Added1 UInt32 AFTER Added0;") + main_node.query("ALTER TABLE testdb.concurrent_test ADD COLUMN AddedNested1 Nested(A UInt32, B UInt64) AFTER Added2;") + competing_node.query("ALTER TABLE testdb.concurrent_test ADD COLUMN AddedNested1.C Array(String) AFTER AddedNested1.B;") + main_node.query("ALTER TABLE testdb.concurrent_test ADD COLUMN AddedNested2 Nested(A UInt32, B UInt64) AFTER AddedNested1;") time.sleep(DURATION_SECONDS) - assert node3.query("desc table testdb.concurrent_test") == node1.query("desc table testdb.concurrent_test") + assert competing_node.query("desc table testdb.concurrent_test") == main_node.query("desc table testdb.concurrent_test") def test_drop_and_create_table(started_cluster): - node1.query("DROP TABLE testdb.concurrent_test") - node1.query("CREATE TABLE testdb.concurrent_test (CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192);") + main_node.query("DROP TABLE testdb.concurrent_test") + main_node.query("CREATE TABLE testdb.concurrent_test (CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192);") time.sleep(5) - assert node3.query("desc table testdb.concurrent_test") == node1.query("desc table testdb.concurrent_test") + assert competing_node.query("desc table testdb.concurrent_test") == main_node.query("desc table testdb.concurrent_test") def test_replica_restart(started_cluster): - node1.restart_clickhouse() + main_node.restart_clickhouse() time.sleep(5) - assert node3.query("desc table testdb.concurrent_test") == node1.query("desc table testdb.concurrent_test") + assert competing_node.query("desc table testdb.concurrent_test") == main_node.query("desc table testdb.concurrent_test") + +def test_snapshot_and_snapshot_recover(started_cluster): + snapshotting_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'replica4');") + time.sleep(5) + snapshot_recovering_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'replica5');") + time.sleep(5) + assert snapshotting_node.query("desc table testdb.alter_test") == snapshot_recovering_node.query("desc table testdb.alter_test") #def test_drop_and_create_replica(started_cluster): -# node1.query("DROP DATABASE testdb") -# node1.query("CREATE DATABASE testdb ENGINE 
= Replicated('/clickhouse/databases/test1', 'replica1');") +# main_node.query("DROP DATABASE testdb") +# main_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'replica1');") # time.sleep(6) -# assert node3.query("desc table testdb.concurrent_test") == node1.query("desc table testdb.concurrent_test") +# assert competing_node.query("desc table testdb.concurrent_test") == main_node.query("desc table testdb.concurrent_test") From 16e50e33d76f4c4e4ccd167f2354c41782fcf76a Mon Sep 17 00:00:00 2001 From: Val Date: Mon, 22 Jun 2020 17:22:26 +0300 Subject: [PATCH 0035/2357] fix typo --- src/Databases/DatabaseReplicated.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 6a137a2af0c..bf974901e41 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -121,11 +121,11 @@ void DatabaseReplicated::RemoveOutdatedSnapshotsAndLog() { // that no longer will be in use by current replicas or // new coming ones. // Each registered replica has its state in ZooKeeper. - // Therefore removed snapshots and logged queries are less - // than a least advanced replica. + // Therefore, snapshots and logged queries that are less + // than a least advanced replica are removed. // It does not interfere with a new coming replica // metadata loading from snapshot - // because the replica will use the last snapshot available + // because the replica will use the latest snapshot available // and this snapshot will set the last executed log query // to a greater one than the least advanced current replica. current_zookeeper = getZooKeeper(); From d293e002a7251f58eee5601749169435d25136ba Mon Sep 17 00:00:00 2001 From: Val Date: Wed, 24 Jun 2020 15:45:42 +0300 Subject: [PATCH 0036/2357] address pr comments --- src/Databases/DatabaseReplicated.cpp | 24 +++++++++++++++------ src/Interpreters/InterpreterCreateQuery.cpp | 2 +- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index bf974901e41..adfd28f8914 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -22,6 +22,7 @@ namespace ErrorCodes { extern const int NO_ZOOKEEPER; extern const int LOGICAL_ERROR; + extern const int BAD_ARGUMENTS; } void DatabaseReplicated::setZooKeeper(zkutil::ZooKeeperPtr zookeeper) @@ -55,10 +56,14 @@ DatabaseReplicated::DatabaseReplicated( , zookeeper_path(zookeeper_path_) , replica_name(replica_name_) { - if (!zookeeper_path.empty() && zookeeper_path.back() == '/') + if (zookeeper_path.empty() || replica_name.empty()) { + throw Exception("ZooKeeper path and replica name must be non-empty", ErrorCodes::BAD_ARGUMENTS); + } + + if (zookeeper_path.back() == '/') zookeeper_path.resize(zookeeper_path.size() - 1); // If zookeeper chroot prefix is used, path should start with '/', because chroot concatenates without it. 
- if (!zookeeper_path.empty() && zookeeper_path.front() != '/') + if (zookeeper_path.front() != '/') zookeeper_path = "/" + zookeeper_path; if (context_.hasZooKeeper()) { @@ -70,10 +75,10 @@ DatabaseReplicated::DatabaseReplicated( } // New database - if (!current_zookeeper->exists(zookeeper_path, {}, NULL)) { + if (!current_zookeeper->exists(zookeeper_path)) { createDatabaseZKNodes(); // Old replica recovery - } else if (current_zookeeper->exists(zookeeper_path + "/replicas/" + replica_name, {}, NULL)) { + } else if (current_zookeeper->exists(zookeeper_path + "/replicas/" + replica_name)) { String remote_last_entry = current_zookeeper->get(zookeeper_path + "/replicas/" + replica_name, {}, NULL); String local_last_entry; @@ -243,8 +248,9 @@ void DatabaseReplicated::createSnapshot() { String table_name = iterator->name(); auto query = getCreateQueryFromMetadata(getObjectMetadataPath(table_name), true); String statement = queryToString(query); - current_zookeeper->createOrUpdate(snapshot_path + "/" + table_name, statement, zkutil::CreateMode::Persistent); + current_zookeeper->createIfNotExists(snapshot_path + "/" + table_name, statement); } + current_zookeeper->createIfNotExists(snapshot_path + "/.completed", String()); RemoveOutdatedSnapshotsAndLog(); } @@ -258,11 +264,17 @@ void DatabaseReplicated::loadMetadataFromSnapshot() { if (current_zookeeper->tryGetChildren(zookeeper_path + "/snapshots", snapshots) != Coordination::ZOK) return; + auto latest_snapshot = std::max_element(snapshots.begin(), snapshots.end()); + while (snapshots.size() > 0 && !current_zookeeper->exists(zookeeper_path + "/snapshots/" + *latest_snapshot + "/.completed")) { + snapshots.erase(latest_snapshot); + latest_snapshot = std::max_element(snapshots.begin(), snapshots.end()); + } + if (snapshots.size() < 1) { return; } - auto latest_snapshot = std::max_element(snapshots.begin(), snapshots.end()); + Strings metadatas; if (current_zookeeper->tryGetChildren(zookeeper_path + "/snapshots/" + *latest_snapshot, metadatas) != Coordination::ZOK) return; diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 6806679cb4d..9d3abf2c8a6 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -640,7 +640,7 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, } else if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind == ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { if (create.uuid == UUIDHelpers::Nil) - throw Exception("Table UUID is not specified in DDL log", ErrorCodes::INCORRECT_QUERY); + throw Exception("Table UUID is not specified in DDL log", ErrorCodes::LOGICAL_ERROR); } else { From 9635ea64bed93a587a147a21fbeda27cc08cf43d Mon Sep 17 00:00:00 2001 From: Val Date: Wed, 24 Jun 2020 15:50:23 +0300 Subject: [PATCH 0037/2357] Add desc of propose idatabase method --- src/Databases/IDatabase.h | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Databases/IDatabase.h b/src/Databases/IDatabase.h index 5b3003f36b4..b80e73be108 100644 --- a/src/Databases/IDatabase.h +++ b/src/Databases/IDatabase.h @@ -161,6 +161,7 @@ public: /// Is the database empty. virtual bool empty() const = 0; + /// Submit query to log. Currently used by DatabaseReplicated engine only. 
virtual void propose(const ASTPtr & /*query*/) { throw Exception(getEngineName() + ": propose() is not supported", ErrorCodes::NOT_IMPLEMENTED); } From dde293fc3d10470bbe65b5ef4f58a5c2cd2d851e Mon Sep 17 00:00:00 2001 From: Val Date: Wed, 24 Jun 2020 16:37:29 +0300 Subject: [PATCH 0038/2357] check schema after alters in test --- .../test_replicated_database/test.py | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tests/integration/test_replicated_database/test.py b/tests/integration/test_replicated_database/test.py index 0b7f8aadec2..346114cb8c4 100644 --- a/tests/integration/test_replicated_database/test.py +++ b/tests/integration/test_replicated_database/test.py @@ -46,6 +46,28 @@ def test_simple_alter_table(started_cluster): main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN AddedNested2 Nested(A UInt32, B UInt64) AFTER AddedNested1;") time.sleep(DURATION_SECONDS) + + schema = main_node.query("show create table testdb.alter_test") + fields = [ + "`CounterID`", + "`StartDate`", + "`UserID`", + "`VisitID`", + "`NestedColumn.A`", + "`NestedColumn.S`", + "`ToDrop`", + "`Added0`", + "`Added1`", + "`Added2`", + "`AddedNested1.A`", + "`AddedNested1.B`", + "`AddedNested1.C`", + "`AddedNested2.A`", + "`AddedNested2.B`"] + + for field in fields: + assert field in schema + assert main_node.query("desc table testdb.alter_test") == dummy_node.query("desc table testdb.alter_test") def test_create_replica_after_delay(started_cluster): From e23c7a313eaafa174b3e0404469c152c1ff08c00 Mon Sep 17 00:00:00 2001 From: Val Date: Fri, 26 Jun 2020 17:05:27 +0300 Subject: [PATCH 0039/2357] address pr comments --- src/Databases/DatabaseOnDisk.h | 2 +- src/Databases/DatabaseReplicated.cpp | 15 ++++++++------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/Databases/DatabaseOnDisk.h b/src/Databases/DatabaseOnDisk.h index dc347c99542..00689900edf 100644 --- a/src/Databases/DatabaseOnDisk.h +++ b/src/Databases/DatabaseOnDisk.h @@ -86,7 +86,7 @@ protected: const String metadata_path; const String data_path; - Context & global_context; + const Context & global_context; }; } diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index adfd28f8914..0ddc976d8d0 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -105,7 +105,7 @@ DatabaseReplicated::DatabaseReplicated( snapshot_period = context_.getConfigRef().getInt("database_replicated_snapshot_period", 10); LOG_DEBUG(log, "Snapshot period is set to " << snapshot_period << " log entries per one snapshot"); - background_log_executor = global_context.getReplicatedSchedulePool().createTask(database_name + "(DatabaseReplicated::background_executor)", [this]{ runBackgroundLogExecutor();} ); + background_log_executor = context_.getReplicatedSchedulePool().createTask(database_name + "(DatabaseReplicated::background_executor)", [this]{ runBackgroundLogExecutor();} ); background_log_executor->scheduleAfter(500); } @@ -206,9 +206,9 @@ void DatabaseReplicated::writeLastExecutedToDiskAndZK() { void DatabaseReplicated::executeFromZK(String & path) { current_zookeeper = getZooKeeper(); String query_to_execute = current_zookeeper->get(path, {}, NULL); - ReadBufferFromString istr(query_to_execute); - String dummy_string; - WriteBufferFromString ostr(dummy_string); + //ReadBufferFromString istr(query_to_execute); + //String dummy_string; + //WriteBufferFromString ostr(dummy_string); try { @@ -216,7 +216,8 @@ void DatabaseReplicated::executeFromZK(String & path) { 
current_context->getClientInfo().query_kind = ClientInfo::QueryKind::REPLICATED_LOG_QUERY; current_context->setCurrentDatabase(database_name); current_context->setCurrentQueryId(""); // generate random query_id - executeQuery(istr, ostr, false, *current_context, {}); + //executeQuery(istr, ostr, false, *current_context, {}); + executeQuery(query_to_execute, *current_context); } catch (...) { @@ -248,9 +249,9 @@ void DatabaseReplicated::createSnapshot() { String table_name = iterator->name(); auto query = getCreateQueryFromMetadata(getObjectMetadataPath(table_name), true); String statement = queryToString(query); - current_zookeeper->createIfNotExists(snapshot_path + "/" + table_name, statement); + current_zookeeper->create(snapshot_path + "/" + table_name, statement, zkutil::CreateMode::Persistent); } - current_zookeeper->createIfNotExists(snapshot_path + "/.completed", String()); + current_zookeeper->create(snapshot_path + "/.completed", String(), zkutil::CreateMode::Persistent); RemoveOutdatedSnapshotsAndLog(); } From 8273248c4e3cc8431ee30b71729a9da369f54a7a Mon Sep 17 00:00:00 2001 From: Val Date: Sat, 27 Jun 2020 16:39:41 +0300 Subject: [PATCH 0040/2357] add log_name_to_exec to dbreplicated --- src/Databases/DatabaseFactory.cpp | 5 +- src/Databases/DatabaseOnDisk.cpp | 2 +- src/Databases/DatabaseOnDisk.h | 1 - src/Databases/DatabaseOrdinary.cpp | 2 +- src/Databases/DatabaseReplicated.cpp | 47 ++++++++++++------- src/Databases/DatabaseReplicated.h | 9 +++- src/Interpreters/InterpreterDropQuery.cpp | 3 +- .../MergeTree/registerStorageMergeTree.cpp | 8 ---- tests/integration/runner | 4 +- 9 files changed, 46 insertions(+), 35 deletions(-) diff --git a/src/Databases/DatabaseFactory.cpp b/src/Databases/DatabaseFactory.cpp index 0d7a711b530..752eeba4e81 100644 --- a/src/Databases/DatabaseFactory.cpp +++ b/src/Databases/DatabaseFactory.cpp @@ -148,8 +148,9 @@ DatabasePtr DatabaseFactory::getImpl( const auto & arguments = engine->arguments->children; - const auto zoo_path = arguments[0]->as()->value.safeGet(); - const auto replica_name = arguments[1]->as()->value.safeGet(); + const auto & zoo_path = safeGetLiteralValue(arguments[0], "Replicated"); + const auto & replica_name = safeGetLiteralValue(arguments[1], "Replicated"); + return std::make_shared(database_name, metadata_path, zoo_path, replica_name, context); } diff --git a/src/Databases/DatabaseOnDisk.cpp b/src/Databases/DatabaseOnDisk.cpp index 0a16b6eacff..6c72773fb69 100644 --- a/src/Databases/DatabaseOnDisk.cpp +++ b/src/Databases/DatabaseOnDisk.cpp @@ -127,7 +127,7 @@ DatabaseOnDisk::DatabaseOnDisk( const String & metadata_path_, const String & data_path_, const String & logger, - const Context & context) + Context & context) : DatabaseWithOwnTablesBase(name, logger, context) , metadata_path(metadata_path_) , data_path(data_path_) diff --git a/src/Databases/DatabaseOnDisk.h b/src/Databases/DatabaseOnDisk.h index 00689900edf..4e7b2ab1709 100644 --- a/src/Databases/DatabaseOnDisk.h +++ b/src/Databases/DatabaseOnDisk.h @@ -86,7 +86,6 @@ protected: const String metadata_path; const String data_path; - const Context & global_context; }; } diff --git a/src/Databases/DatabaseOrdinary.cpp b/src/Databases/DatabaseOrdinary.cpp index 2f4f584b091..69fbbce8b7d 100644 --- a/src/Databases/DatabaseOrdinary.cpp +++ b/src/Databases/DatabaseOrdinary.cpp @@ -100,7 +100,7 @@ DatabaseOrdinary::DatabaseOrdinary(const String & name_, const String & metadata } DatabaseOrdinary::DatabaseOrdinary( - const String & name_, const String & metadata_path_, const 
String & data_path_, const String & logger, const Context & context_) + const String & name_, const String & metadata_path_, const String & data_path_, const String & logger, Context & context_) : DatabaseWithDictionaries(name_, metadata_path_, data_path_, logger, context_) { } diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 0ddc976d8d0..47298996236 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -13,6 +13,8 @@ #include #include +#include + namespace DB { @@ -103,13 +105,15 @@ DatabaseReplicated::DatabaseReplicated( } snapshot_period = context_.getConfigRef().getInt("database_replicated_snapshot_period", 10); - LOG_DEBUG(log, "Snapshot period is set to " << snapshot_period << " log entries per one snapshot"); + LOG_DEBUG(log, "Snapshot period is set to {} log entries per one snapshot", snapshot_period); background_log_executor = context_.getReplicatedSchedulePool().createTask(database_name + "(DatabaseReplicated::background_executor)", [this]{ runBackgroundLogExecutor();} ); background_log_executor->scheduleAfter(500); } +DatabaseReplicated::~DatabaseReplicated() = default; + void DatabaseReplicated::createDatabaseZKNodes() { current_zookeeper = getZooKeeper(); @@ -174,7 +178,13 @@ void DatabaseReplicated::runBackgroundLogExecutor() { for (const String & log_entry_name : log_entry_names) { String log_entry_path = zookeeper_path + "/log/" + log_entry_name; - executeFromZK(log_entry_path); + bool yield = false; + { + std::lock_guard lock(log_name_mutex); + if (log_name_to_exec_with_result == log_entry_name) + yield = true; + } + executeFromZK(log_entry_path, yield); last_executed_log_entry = log_entry_name; writeLastExecutedToDiskAndZK(); @@ -203,12 +213,9 @@ void DatabaseReplicated::writeLastExecutedToDiskAndZK() { out.close(); } -void DatabaseReplicated::executeFromZK(String & path) { +void DatabaseReplicated::executeFromZK(String & path, bool yield) { current_zookeeper = getZooKeeper(); String query_to_execute = current_zookeeper->get(path, {}, NULL); - //ReadBufferFromString istr(query_to_execute); - //String dummy_string; - //WriteBufferFromString ostr(dummy_string); try { @@ -216,23 +223,29 @@ void DatabaseReplicated::executeFromZK(String & path) { current_context->getClientInfo().query_kind = ClientInfo::QueryKind::REPLICATED_LOG_QUERY; current_context->setCurrentDatabase(database_name); current_context->setCurrentQueryId(""); // generate random query_id - //executeQuery(istr, ostr, false, *current_context, {}); executeQuery(query_to_execute, *current_context); } catch (...) 
{ - tryLogCurrentException(log, "Query from zookeeper " + query_to_execute + " wasn't finished successfully"); + if (yield) + tryLogCurrentException(log, "Query from zookeeper " + query_to_execute + " wasn't finished successfully"); } - LOG_DEBUG(log, "Executed query: " << query_to_execute); + std::lock_guard lock(log_name_mutex); + log_name_to_exec_with_result.clear(); + LOG_DEBUG(log, "Executed query: {}", query_to_execute); } void DatabaseReplicated::propose(const ASTPtr & query) { current_zookeeper = getZooKeeper(); - LOG_DEBUG(log, "Proposing query: " << queryToString(query)); - current_zookeeper->create(zookeeper_path + "/log/log-", queryToString(query), zkutil::CreateMode::PersistentSequential); + LOG_DEBUG(log, "Proposing query: {}", queryToString(query)); + + { + std::lock_guard lock(log_name_mutex); + log_name_to_exec_with_result = current_zookeeper->create(zookeeper_path + "/log/log-", queryToString(query), zkutil::CreateMode::PersistentSequential); + } background_log_executor->schedule(); } @@ -241,11 +254,11 @@ void DatabaseReplicated::createSnapshot() { current_zookeeper = getZooKeeper(); String snapshot_path = zookeeper_path + "/snapshots/" + last_executed_log_entry; - if (Coordination::ZNODEEXISTS == current_zookeeper->tryCreate(snapshot_path, String(), zkutil::CreateMode::Persistent)) { + if (Coordination::Error::ZNODEEXISTS == current_zookeeper->tryCreate(snapshot_path, String(), zkutil::CreateMode::Persistent)) { return; } - for (auto iterator = getTablesIterator({}); iterator->isValid(); iterator->next()) { + for (auto iterator = getTablesIterator(global_context, {}); iterator->isValid(); iterator->next()) { String table_name = iterator->name(); auto query = getCreateQueryFromMetadata(getObjectMetadataPath(table_name), true); String statement = queryToString(query); @@ -262,7 +275,7 @@ void DatabaseReplicated::loadMetadataFromSnapshot() { current_zookeeper = getZooKeeper(); Strings snapshots; - if (current_zookeeper->tryGetChildren(zookeeper_path + "/snapshots", snapshots) != Coordination::ZOK) + if (current_zookeeper->tryGetChildren(zookeeper_path + "/snapshots", snapshots) != Coordination::Error::ZOK) return; auto latest_snapshot = std::max_element(snapshots.begin(), snapshots.end()); @@ -277,14 +290,14 @@ void DatabaseReplicated::loadMetadataFromSnapshot() { Strings metadatas; - if (current_zookeeper->tryGetChildren(zookeeper_path + "/snapshots/" + *latest_snapshot, metadatas) != Coordination::ZOK) + if (current_zookeeper->tryGetChildren(zookeeper_path + "/snapshots/" + *latest_snapshot, metadatas) != Coordination::Error::ZOK) return; - LOG_DEBUG(log, "Executing " << *latest_snapshot << " snapshot"); + LOG_DEBUG(log, "Executing {} snapshot", *latest_snapshot); for (auto t = metadatas.begin(); t != metadatas.end(); ++t) { String path = zookeeper_path + "/snapshots/" + *latest_snapshot + "/" + *t; - executeFromZK(path); + executeFromZK(path, false); } last_executed_log_entry = *latest_snapshot; diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index 1cdcc3e990c..2aa6c0d9a68 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -4,6 +4,7 @@ #include #include + namespace DB { /** DatabaseReplicated engine @@ -33,6 +34,8 @@ class DatabaseReplicated : public DatabaseAtomic public: DatabaseReplicated(const String & name_, const String & metadata_path_, const String & zookeeper_path_, const String & replica_name_, Context & context); + ~DatabaseReplicated(); + void drop(const Context & /*context*/) 
override; String getEngineName() const override { return "Replicated"; } @@ -47,7 +50,7 @@ private: void runBackgroundLogExecutor(); - void executeFromZK(String & path); + void executeFromZK(String & path, bool yield); void writeLastExecutedToDiskAndZK(); @@ -57,6 +60,10 @@ private: std::unique_ptr current_context; // to run executeQuery + //BlockIO execution_result; + std::mutex log_name_mutex; + String log_name_to_exec_with_result; + int snapshot_period; String last_executed_log_entry = ""; diff --git a/src/Interpreters/InterpreterDropQuery.cpp b/src/Interpreters/InterpreterDropQuery.cpp index 368024da043..8eef9059f69 100644 --- a/src/Interpreters/InterpreterDropQuery.cpp +++ b/src/Interpreters/InterpreterDropQuery.cpp @@ -93,7 +93,7 @@ BlockIO InterpreterDropQuery::executeToTable( { context.checkAccess(table->isView() ? AccessType::DROP_VIEW : AccessType::DROP_TABLE, table_id); table->shutdown(); - TableStructureWriteLockHolder table_lock; + TableExclusiveLockHolder table_lock; if (database->getEngineName() != "Atomic" && database->getEngineName() != "Replicated") table_lock = table->lockExclusively(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); /// Drop table from memory, don't touch data and metadata @@ -111,7 +111,6 @@ BlockIO InterpreterDropQuery::executeToTable( auto table_lock = table->lockExclusively(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); auto metadata_snapshot = table->getInMemoryMetadataPtr(); /// Drop table data, don't touch metadata - auto table_lock = table->lockExclusively(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { database->propose(query_ptr); } else { diff --git a/src/Storages/MergeTree/registerStorageMergeTree.cpp b/src/Storages/MergeTree/registerStorageMergeTree.cpp index eb62c80cc49..9836cd2ee23 100644 --- a/src/Storages/MergeTree/registerStorageMergeTree.cpp +++ b/src/Storages/MergeTree/registerStorageMergeTree.cpp @@ -634,14 +634,6 @@ static StoragePtr create(const StorageFactory::Arguments & args) throw Exception("You must set the setting `allow_experimental_data_skipping_indices` to 1 " \ "before using data skipping indices.", ErrorCodes::BAD_ARGUMENTS); - StorageInMemoryMetadata metadata(args.columns, indices_description, args.constraints); - metadata.partition_by_ast = partition_by_ast; - metadata.order_by_ast = order_by_ast; - metadata.primary_key_ast = primary_key_ast; - metadata.ttl_for_table_ast = ttl_table_ast; - metadata.sample_by_ast = sample_by_ast; - metadata.settings_ast = settings_ast; - if (replicatedStorage) return StorageReplicatedMergeTree::create( zookeeper_path, replica_name, args.attach, args.table_id, args.relative_data_path, diff --git a/tests/integration/runner b/tests/integration/runner index 399c87dcf06..058badcee66 100755 --- a/tests/integration/runner +++ b/tests/integration/runner @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python2 #-*- coding: utf-8 -*- import subprocess import os @@ -105,7 +105,7 @@ if __name__ == "__main__": bridge_bin=args.bridge_binary, cfg=args.configs_dir, pth=args.clickhouse_root, - opts=' '.join(args.pytest_args), + opts='-vv ' + ' '.join(args.pytest_args), img=DIND_INTEGRATION_TESTS_IMAGE_NAME, name=CONTAINER_NAME, command=args.command From 147fa9fed92c6b35061091971590e3243522bb84 Mon Sep 17 00:00:00 2001 From: Val Date: Sat, 4 Jul 2020 16:39:17 +0300 Subject: [PATCH 
0041/2357] fix type error in zookeeper --- src/Common/ZooKeeper/ZooKeeper.cpp | 2 +- tests/integration/runner | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Common/ZooKeeper/ZooKeeper.cpp b/src/Common/ZooKeeper/ZooKeeper.cpp index 541625149dd..e09533874e3 100644 --- a/src/Common/ZooKeeper/ZooKeeper.cpp +++ b/src/Common/ZooKeeper/ZooKeeper.cpp @@ -582,7 +582,7 @@ void ZooKeeper::removeChildren(const std::string & path) void ZooKeeper::tryRemoveChildren(const std::string & path) { Strings children; - if (tryGetChildren(path, children) != Coordination::ZOK) + if (tryGetChildren(path, children) != Coordination::Error::ZOK) return; while (!children.empty()) { diff --git a/tests/integration/runner b/tests/integration/runner index 058badcee66..399c87dcf06 100755 --- a/tests/integration/runner +++ b/tests/integration/runner @@ -1,4 +1,4 @@ -#!/usr/bin/env python2 +#!/usr/bin/env python #-*- coding: utf-8 -*- import subprocess import os @@ -105,7 +105,7 @@ if __name__ == "__main__": bridge_bin=args.bridge_binary, cfg=args.configs_dir, pth=args.clickhouse_root, - opts='-vv ' + ' '.join(args.pytest_args), + opts=' '.join(args.pytest_args), img=DIND_INTEGRATION_TESTS_IMAGE_NAME, name=CONTAINER_NAME, command=args.command From e591fe501412cce7bf2c9105ba7b572cc3b89ddb Mon Sep 17 00:00:00 2001 From: Val Date: Sat, 4 Jul 2020 19:32:23 +0300 Subject: [PATCH 0042/2357] database replicated feedback mechanism prototype --- src/Databases/DatabaseReplicated.cpp | 77 ++++++++++++++++----- src/Databases/DatabaseReplicated.h | 10 +-- src/Interpreters/InterpreterAlterQuery.cpp | 4 +- src/Interpreters/InterpreterCreateQuery.cpp | 11 ++- src/Interpreters/InterpreterDropQuery.cpp | 6 ++ src/Interpreters/InterpreterRenameQuery.cpp | 8 +++ 6 files changed, 92 insertions(+), 24 deletions(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 47298996236..fb64a005320 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -7,11 +7,13 @@ #include #include #include +#include #include #include #include #include #include +#include #include @@ -105,6 +107,7 @@ DatabaseReplicated::DatabaseReplicated( } snapshot_period = context_.getConfigRef().getInt("database_replicated_snapshot_period", 10); + feedback_timeout = context_.getConfigRef().getInt("database_replicated_feedback_timeout", 0); LOG_DEBUG(log, "Snapshot period is set to {} log entries per one snapshot", snapshot_period); background_log_executor = context_.getReplicatedSchedulePool().createTask(database_name + "(DatabaseReplicated::background_executor)", [this]{ runBackgroundLogExecutor();} ); @@ -177,14 +180,7 @@ void DatabaseReplicated::runBackgroundLogExecutor() { log_entry_names.erase(log_entry_names.begin(), newest_entry_it); for (const String & log_entry_name : log_entry_names) { - String log_entry_path = zookeeper_path + "/log/" + log_entry_name; - bool yield = false; - { - std::lock_guard lock(log_name_mutex); - if (log_name_to_exec_with_result == log_entry_name) - yield = true; - } - executeFromZK(log_entry_path, yield); + executeLogName(log_entry_name); last_executed_log_entry = log_entry_name; writeLastExecutedToDiskAndZK(); @@ -213,7 +209,8 @@ void DatabaseReplicated::writeLastExecutedToDiskAndZK() { out.close(); } -void DatabaseReplicated::executeFromZK(String & path, bool yield) { +void DatabaseReplicated::executeLogName(const String & log_entry_name) { + String path = zookeeper_path + "/log/" + log_entry_name; current_zookeeper = getZooKeeper(); 
String query_to_execute = current_zookeeper->get(path, {}, NULL); @@ -225,15 +222,12 @@ void DatabaseReplicated::executeFromZK(String & path, bool yield) { current_context->setCurrentQueryId(""); // generate random query_id executeQuery(query_to_execute, *current_context); } - catch (...) + catch (const Exception & e) { - if (yield) - tryLogCurrentException(log, "Query from zookeeper " + query_to_execute + " wasn't finished successfully"); - + tryLogCurrentException(log, "Query from zookeeper " + query_to_execute + " wasn't finished successfully"); + current_zookeeper->create(zookeeper_path + "/replicas/" + replica_name + "/errors/" + log_entry_name, e.what(), zkutil::CreateMode::Persistent); } - std::lock_guard lock(log_name_mutex); - log_name_to_exec_with_result.clear(); LOG_DEBUG(log, "Executed query: {}", query_to_execute); } @@ -250,6 +244,48 @@ void DatabaseReplicated::propose(const ASTPtr & query) { background_log_executor->schedule(); } +BlockIO DatabaseReplicated::getFeedback() { + BlockIO res; + if (feedback_timeout == 0) + return res; + + Stopwatch watch; + + NamesAndTypes block_structure = { + {"replica_name", std::make_shared()}, + {"execution_feedback", std::make_shared()}, + }; + auto replica_name_column = block_structure[0].type->createColumn(); + auto feedback_column = block_structure[1].type->createColumn(); + + current_zookeeper = getZooKeeper(); + Strings replica_states = current_zookeeper->getChildren(zookeeper_path + "/replicas"); + auto replica_iter = replica_states.begin(); + + while (!replica_states.empty() && watch.elapsedSeconds() < feedback_timeout) { + String last_executed = current_zookeeper->get(zookeeper_path + "/replicas/" + *replica_iter); + if (last_executed > log_name_to_exec_with_result) { + replica_name_column->insert(*replica_iter); + String err_path = zookeeper_path + "/replicas/" + *replica_iter + "/errors/" + log_name_to_exec_with_result; + if (!current_zookeeper->exists(err_path)) { + feedback_column->insert("OK"); + } else { + String feedback = current_zookeeper->get(err_path, {}, NULL); + feedback_column->insert(feedback); + } + replica_states.erase(replica_iter); + replica_iter = replica_states.begin(); + } + } + + Block block = Block({ + {std::move(replica_name_column), block_structure[0].type, block_structure[0].name}, + {std::move(feedback_column), block_structure[1].type, block_structure[1].name}}); + + res.in = std::make_shared(block); + return res; +} + void DatabaseReplicated::createSnapshot() { current_zookeeper = getZooKeeper(); String snapshot_path = zookeeper_path + "/snapshots/" + last_executed_log_entry; @@ -288,16 +324,23 @@ void DatabaseReplicated::loadMetadataFromSnapshot() { return; } - Strings metadatas; if (current_zookeeper->tryGetChildren(zookeeper_path + "/snapshots/" + *latest_snapshot, metadatas) != Coordination::Error::ZOK) return; LOG_DEBUG(log, "Executing {} snapshot", *latest_snapshot); + for (auto t = metadatas.begin(); t != metadatas.end(); ++t) { String path = zookeeper_path + "/snapshots/" + *latest_snapshot + "/" + *t; - executeFromZK(path, false); + String query_to_execute = current_zookeeper->get(path, {}, NULL); + + current_context = std::make_unique(global_context); + current_context->getClientInfo().query_kind = ClientInfo::QueryKind::REPLICATED_LOG_QUERY; + current_context->setCurrentDatabase(database_name); + current_context->setCurrentQueryId(""); // generate random query_id + + executeQuery(query_to_execute, *current_context); } last_executed_log_entry = *latest_snapshot; diff --git 
a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index 2aa6c0d9a68..0f448b8061c 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -3,6 +3,8 @@ #include #include #include +#include +#include namespace DB @@ -42,6 +44,8 @@ public: void propose(const ASTPtr & query) override; + BlockIO getFeedback(); + String zookeeper_path; String replica_name; @@ -49,9 +53,7 @@ private: void createDatabaseZKNodes(); void runBackgroundLogExecutor(); - - void executeFromZK(String & path, bool yield); - + void executeLogName(const String &); void writeLastExecutedToDiskAndZK(); void loadMetadataFromSnapshot(); @@ -60,11 +62,11 @@ private: std::unique_ptr current_context; // to run executeQuery - //BlockIO execution_result; std::mutex log_name_mutex; String log_name_to_exec_with_result; int snapshot_period; + int feedback_timeout; String last_executed_log_entry = ""; diff --git a/src/Interpreters/InterpreterAlterQuery.cpp b/src/Interpreters/InterpreterAlterQuery.cpp index 6b4bcdde067..96f3628b637 100644 --- a/src/Interpreters/InterpreterAlterQuery.cpp +++ b/src/Interpreters/InterpreterAlterQuery.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include @@ -52,7 +53,8 @@ BlockIO InterpreterAlterQuery::execute() DatabasePtr database = DatabaseCatalog::instance().getDatabase(table_id.database_name); if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY && !table->supportsReplication()) { database->propose(query_ptr); - return {}; + auto * database_replicated = typeid_cast(database.get()); + return database_replicated->getFeedback(); } /// Add default database to table identifiers that we can encounter in e.g. default expressions, diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 9d3abf2c8a6..0c312cfc863 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -46,6 +46,7 @@ #include #include +#include #include #include @@ -571,12 +572,12 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create) ErrorCodes::BAD_DATABASE_FOR_TEMPORARY_TABLE); String current_database = context.getCurrentDatabase(); + auto database_name = create.database.empty() ? current_database : create.database; + auto database = DatabaseCatalog::instance().getDatabase(database_name); // If this is a stub ATTACH query, read the query definition from the database if (create.attach && !create.storage && !create.columns_list) { - auto database_name = create.database.empty() ? 
current_database : create.database; - auto database = DatabaseCatalog::instance().getDatabase(database_name); bool if_not_exists = create.if_not_exists; // Table SQL definition is available even if the table is detached @@ -611,6 +612,12 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create) /// Actually creates table bool created = doCreateTable(create, properties); + + if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { + auto * database_replicated = typeid_cast(database.get()); + return database_replicated->getFeedback(); + } + if (!created) /// Table already exists return {}; diff --git a/src/Interpreters/InterpreterDropQuery.cpp b/src/Interpreters/InterpreterDropQuery.cpp index 8eef9059f69..d5ac832e46c 100644 --- a/src/Interpreters/InterpreterDropQuery.cpp +++ b/src/Interpreters/InterpreterDropQuery.cpp @@ -12,6 +12,7 @@ #include #include #include +#include namespace DB @@ -137,6 +138,11 @@ BlockIO InterpreterDropQuery::executeToTable( } } + if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { + auto * database_replicated = typeid_cast(database.get()); + return database_replicated->getFeedback(); + } + return {}; } diff --git a/src/Interpreters/InterpreterRenameQuery.cpp b/src/Interpreters/InterpreterRenameQuery.cpp index 97206f6b364..b950edac5bc 100644 --- a/src/Interpreters/InterpreterRenameQuery.cpp +++ b/src/Interpreters/InterpreterRenameQuery.cpp @@ -6,6 +6,7 @@ #include #include #include +#include namespace DB @@ -91,7 +92,14 @@ BlockIO InterpreterRenameQuery::execute() elem.to_table_name, rename.exchange); } + + // TODO it can't work + if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { + auto * database_replicated = typeid_cast(database.get()); + return database_replicated->getFeedback(); + } } + return {}; } From acf86568a7e21176ba2cca15861da231bec6932a Mon Sep 17 00:00:00 2001 From: Anton Ivashkin Date: Thu, 8 Oct 2020 18:45:10 +0300 Subject: [PATCH 0043/2357] S3 zero copy replication proof of concept --- src/Disks/DiskCacheWrapper.cpp | 18 +- src/Disks/DiskCacheWrapper.h | 6 +- src/Disks/DiskDecorator.cpp | 12 +- src/Disks/DiskDecorator.h | 7 +- src/Disks/DiskLocal.cpp | 6 +- src/Disks/DiskLocal.h | 6 +- src/Disks/DiskMemory.cpp | 6 +- src/Disks/DiskMemory.h | 6 +- src/Disks/IDisk.h | 13 +- src/Disks/S3/DiskS3.cpp | 34 ++- src/Disks/S3/DiskS3.h | 8 +- src/Interpreters/InterserverIOHandler.h | 8 + src/Storages/MergeTree/DataPartsExchange.cpp | 238 +++++++++++++++++- src/Storages/MergeTree/DataPartsExchange.h | 39 ++- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 20 +- src/Storages/MergeTree/IMergeTreeDataPart.h | 2 +- src/Storages/StorageReplicatedMergeTree.cpp | 34 ++- 17 files changed, 392 insertions(+), 71 deletions(-) diff --git a/src/Disks/DiskCacheWrapper.cpp b/src/Disks/DiskCacheWrapper.cpp index c60f69920f4..94b15920cee 100644 --- a/src/Disks/DiskCacheWrapper.cpp +++ b/src/Disks/DiskCacheWrapper.cpp @@ -198,11 +198,11 @@ DiskCacheWrapper::writeFile(const String & path, size_t buf_size, WriteMode mode buf_size); } -void DiskCacheWrapper::clearDirectory(const String & path) +void DiskCacheWrapper::clearDirectory(const String & path, bool keep_s3) { if (cache_disk->exists(path)) - cache_disk->clearDirectory(path); - DiskDecorator::clearDirectory(path); + cache_disk->clearDirectory(path, keep_s3); + 
DiskDecorator::clearDirectory(path, keep_s3); } void DiskCacheWrapper::moveDirectory(const String & from_path, const String & to_path) @@ -251,18 +251,18 @@ void DiskCacheWrapper::copyFile(const String & from_path, const String & to_path DiskDecorator::copyFile(from_path, to_path); } -void DiskCacheWrapper::remove(const String & path) +void DiskCacheWrapper::remove(const String & path, bool keep_s3) { if (cache_disk->exists(path)) - cache_disk->remove(path); - DiskDecorator::remove(path); + cache_disk->remove(path, keep_s3); + DiskDecorator::remove(path, keep_s3); } -void DiskCacheWrapper::removeRecursive(const String & path) +void DiskCacheWrapper::removeRecursive(const String & path, bool keep_s3) { if (cache_disk->exists(path)) - cache_disk->removeRecursive(path); - DiskDecorator::removeRecursive(path); + cache_disk->removeRecursive(path, keep_s3); + DiskDecorator::removeRecursive(path, keep_s3); } void DiskCacheWrapper::createHardLink(const String & src_path, const String & dst_path) diff --git a/src/Disks/DiskCacheWrapper.h b/src/Disks/DiskCacheWrapper.h index b0b373d900c..9fca4e02e34 100644 --- a/src/Disks/DiskCacheWrapper.h +++ b/src/Disks/DiskCacheWrapper.h @@ -28,7 +28,7 @@ public: std::function cache_file_predicate_); void createDirectory(const String & path) override; void createDirectories(const String & path) override; - void clearDirectory(const String & path) override; + void clearDirectory(const String & path, bool keep_s3 = false) override; void moveDirectory(const String & from_path, const String & to_path) override; void moveFile(const String & from_path, const String & to_path) override; void replaceFile(const String & from_path, const String & to_path) override; @@ -37,8 +37,8 @@ public: readFile(const String & path, size_t buf_size, size_t estimated_size, size_t aio_threshold, size_t mmap_threshold) const override; std::unique_ptr writeFile(const String & path, size_t buf_size, WriteMode mode, size_t estimated_size, size_t aio_threshold) override; - void remove(const String & path) override; - void removeRecursive(const String & path) override; + void remove(const String & path, bool keep_s3 = false) override; + void removeRecursive(const String & path, bool keep_s3 = false) override; void createHardLink(const String & src_path, const String & dst_path) override; ReservationPtr reserve(UInt64 bytes) override; diff --git a/src/Disks/DiskDecorator.cpp b/src/Disks/DiskDecorator.cpp index 7f2ea58d7cf..9d61141a162 100644 --- a/src/Disks/DiskDecorator.cpp +++ b/src/Disks/DiskDecorator.cpp @@ -73,9 +73,9 @@ void DiskDecorator::createDirectories(const String & path) delegate->createDirectories(path); } -void DiskDecorator::clearDirectory(const String & path) +void DiskDecorator::clearDirectory(const String & path, bool keep_s3) { - delegate->clearDirectory(path); + delegate->clearDirectory(path, keep_s3); } void DiskDecorator::moveDirectory(const String & from_path, const String & to_path) @@ -130,14 +130,14 @@ DiskDecorator::writeFile(const String & path, size_t buf_size, WriteMode mode, s return delegate->writeFile(path, buf_size, mode, estimated_size, aio_threshold); } -void DiskDecorator::remove(const String & path) +void DiskDecorator::remove(const String & path, bool keep_s3) { - delegate->remove(path); + delegate->remove(path, keep_s3); } -void DiskDecorator::removeRecursive(const String & path) +void DiskDecorator::removeRecursive(const String & path, bool keep_s3) { - delegate->removeRecursive(path); + delegate->removeRecursive(path, keep_s3); } void 
DiskDecorator::setLastModified(const String & path, const Poco::Timestamp & timestamp) diff --git a/src/Disks/DiskDecorator.h b/src/Disks/DiskDecorator.h index f1ddfff4952..f1fea043843 100644 --- a/src/Disks/DiskDecorator.h +++ b/src/Disks/DiskDecorator.h @@ -22,7 +22,7 @@ public: size_t getFileSize(const String & path) const override; void createDirectory(const String & path) override; void createDirectories(const String & path) override; - void clearDirectory(const String & path) override; + void clearDirectory(const String & path, bool keep_s3 = false) override; void moveDirectory(const String & from_path, const String & to_path) override; DiskDirectoryIteratorPtr iterateDirectory(const String & path) override; void createFile(const String & path) override; @@ -35,8 +35,8 @@ public: readFile(const String & path, size_t buf_size, size_t estimated_size, size_t aio_threshold, size_t mmap_threshold) const override; std::unique_ptr writeFile(const String & path, size_t buf_size, WriteMode mode, size_t estimated_size, size_t aio_threshold) override; - void remove(const String & path) override; - void removeRecursive(const String & path) override; + void remove(const String & path, bool keep_s3 = false) override; + void removeRecursive(const String & path, bool keep_s3 = false) override; void setLastModified(const String & path, const Poco::Timestamp & timestamp) override; Poco::Timestamp getLastModified(const String & path) override; void setReadOnly(const String & path) override; @@ -46,6 +46,7 @@ public: void close(int fd) const override; void sync(int fd) const override; const String getType() const override { return delegate->getType(); } + const String getUniqueId(const String & path) const override { return delegate->getUniqueId(path); } protected: DiskPtr delegate; diff --git a/src/Disks/DiskLocal.cpp b/src/Disks/DiskLocal.cpp index a09ab7c5ac5..ad85fdf4236 100644 --- a/src/Disks/DiskLocal.cpp +++ b/src/Disks/DiskLocal.cpp @@ -180,7 +180,7 @@ void DiskLocal::createDirectories(const String & path) Poco::File(disk_path + path).createDirectories(); } -void DiskLocal::clearDirectory(const String & path) +void DiskLocal::clearDirectory(const String & path, bool) { std::vector files; Poco::File(disk_path + path).list(files); @@ -236,12 +236,12 @@ DiskLocal::writeFile(const String & path, size_t buf_size, WriteMode mode, size_ return createWriteBufferFromFileBase(disk_path + path, estimated_size, aio_threshold, buf_size, flags); } -void DiskLocal::remove(const String & path) +void DiskLocal::remove(const String & path, bool) { Poco::File(disk_path + path).remove(false); } -void DiskLocal::removeRecursive(const String & path) +void DiskLocal::removeRecursive(const String & path, bool) { Poco::File(disk_path + path).remove(true); } diff --git a/src/Disks/DiskLocal.h b/src/Disks/DiskLocal.h index 762a8502faa..18e6d072874 100644 --- a/src/Disks/DiskLocal.h +++ b/src/Disks/DiskLocal.h @@ -55,7 +55,7 @@ public: void createDirectories(const String & path) override; - void clearDirectory(const String & path) override; + void clearDirectory(const String & path, bool keep_s3 = false) override; void moveDirectory(const String & from_path, const String & to_path) override; @@ -87,9 +87,9 @@ public: size_t estimated_size, size_t aio_threshold) override; - void remove(const String & path) override; + void remove(const String & path, bool keep_s3 = false) override; - void removeRecursive(const String & path) override; + void removeRecursive(const String & path, bool keep_s3 = false) override; void 
setLastModified(const String & path, const Poco::Timestamp & timestamp) override; diff --git a/src/Disks/DiskMemory.cpp b/src/Disks/DiskMemory.cpp index d185263d48c..fc375707feb 100644 --- a/src/Disks/DiskMemory.cpp +++ b/src/Disks/DiskMemory.cpp @@ -233,7 +233,7 @@ void DiskMemory::createDirectoriesImpl(const String & path) files.emplace(path, FileData{FileType::Directory}); } -void DiskMemory::clearDirectory(const String & path) +void DiskMemory::clearDirectory(const String & path, bool) { std::lock_guard lock(mutex); @@ -348,7 +348,7 @@ std::unique_ptr DiskMemory::writeFile(const String & pa return std::make_unique(this, path, mode, buf_size); } -void DiskMemory::remove(const String & path) +void DiskMemory::remove(const String & path, bool) { std::lock_guard lock(mutex); @@ -368,7 +368,7 @@ void DiskMemory::remove(const String & path) } } -void DiskMemory::removeRecursive(const String & path) +void DiskMemory::removeRecursive(const String & path, bool) { std::lock_guard lock(mutex); diff --git a/src/Disks/DiskMemory.h b/src/Disks/DiskMemory.h index 4d4b947098b..e75d9bff100 100644 --- a/src/Disks/DiskMemory.h +++ b/src/Disks/DiskMemory.h @@ -48,7 +48,7 @@ public: void createDirectories(const String & path) override; - void clearDirectory(const String & path) override; + void clearDirectory(const String & path, bool keep_s3 = false) override; void moveDirectory(const String & from_path, const String & to_path) override; @@ -78,9 +78,9 @@ public: size_t estimated_size, size_t aio_threshold) override; - void remove(const String & path) override; + void remove(const String & path, bool keep_s3 = false) override; - void removeRecursive(const String & path) override; + void removeRecursive(const String & path, bool keep_s3 = false) override; void setLastModified(const String &, const Poco::Timestamp &) override {} diff --git a/src/Disks/IDisk.h b/src/Disks/IDisk.h index 688c1dfad42..324384fade6 100644 --- a/src/Disks/IDisk.h +++ b/src/Disks/IDisk.h @@ -105,7 +105,7 @@ public: virtual void createDirectories(const String & path) = 0; /// Remove all files from the directory. Directories are not removed. - virtual void clearDirectory(const String & path) = 0; + virtual void clearDirectory(const String & path, bool keep_s3 = false) = 0; /// Move directory from `from_path` to `to_path`. virtual void moveDirectory(const String & from_path, const String & to_path) = 0; @@ -153,16 +153,16 @@ public: size_t aio_threshold = 0) = 0; /// Remove file or directory. Throws exception if file doesn't exists or if directory is not empty. - virtual void remove(const String & path) = 0; + virtual void remove(const String & path, bool keep_s3 = false) = 0; /// Remove file or directory with all children. Use with extra caution. Throws exception if file doesn't exists. - virtual void removeRecursive(const String & path) = 0; + virtual void removeRecursive(const String & path, bool keep_s3 = false) = 0; /// Remove file or directory if it exists. - void removeIfExists(const String & path) + void removeIfExists(const String & path, bool keep_s3 = false) { if (exists(path)) - remove(path); + remove(path, keep_s3); } /// Set last modified time to file or directory at `path`. @@ -195,6 +195,9 @@ public: /// Invoked when Global Context is shutdown. virtual void shutdown() { } + /// Return some uniq string for file, overrided for S3 + virtual const String getUniqueId(const String & path) const { return path; } + private: /// Returns executor to perform asynchronous operations. 
Executor & getExecutor() { return *executor; } diff --git a/src/Disks/S3/DiskS3.cpp b/src/Disks/S3/DiskS3.cpp index 6abb72efeb0..8b6c3c8465c 100644 --- a/src/Disks/S3/DiskS3.cpp +++ b/src/Disks/S3/DiskS3.cpp @@ -535,16 +535,25 @@ void DiskS3::createDirectories(const String & path) Poco::File(metadata_path + path).createDirectories(); } +const String DiskS3::getUniqueId(const String & path) const +{ + Metadata metadata(s3_root_path, metadata_path, path); + String id; + if (!metadata.s3_objects.empty()) + id = metadata.s3_objects[0].first; + return id; +} + DiskDirectoryIteratorPtr DiskS3::iterateDirectory(const String & path) { return std::make_unique(metadata_path + path, path); } -void DiskS3::clearDirectory(const String & path) +void DiskS3::clearDirectory(const String & path, bool keep_s3) { for (auto it{iterateDirectory(path)}; it->isValid(); it->next()) if (isFile(it->path())) - remove(it->path()); + remove(it->path(), keep_s3); } void DiskS3::moveFile(const String & from_path, const String & to_path) @@ -634,7 +643,7 @@ std::unique_ptr DiskS3::writeFile(const String & path, } } -void DiskS3::remove(const String & path) +void DiskS3::remove(const String & path, bool keep_s3) { LOG_DEBUG(&Poco::Logger::get("DiskS3"), "Remove file by path: {}", backQuote(metadata_path + path)); @@ -647,13 +656,16 @@ void DiskS3::remove(const String & path) if (metadata.ref_count == 0) { file.remove(); - for (const auto & [s3_object_path, _] : metadata.s3_objects) + if (!keep_s3) { - /// TODO: Make operation idempotent. Do not throw exception if key is already deleted. - Aws::S3::Model::DeleteObjectRequest request; - request.SetBucket(bucket); - request.SetKey(s3_root_path + s3_object_path); - throwIfError(client->DeleteObject(request)); + for (const auto & [s3_object_path, _] : metadata.s3_objects) + { + /// TODO: Make operation idempotent. Do not throw exception if key is already deleted. + Aws::S3::Model::DeleteObjectRequest request; + request.SetBucket(bucket); + request.SetKey(s3_root_path + s3_object_path); + throwIfError(client->DeleteObject(request)); + } } } else /// In other case decrement number of references, save metadata and delete file. @@ -667,7 +679,7 @@ void DiskS3::remove(const String & path) file.remove(); } -void DiskS3::removeRecursive(const String & path) +void DiskS3::removeRecursive(const String & path, bool keep_s3) { checkStackSize(); /// This is needed to prevent stack overflow in case of cyclic symlinks. 
@@ -679,7 +691,7 @@ void DiskS3::removeRecursive(const String & path) else { for (auto it{iterateDirectory(path)}; it->isValid(); it->next()) - removeRecursive(it->path()); + removeRecursive(it->path(), keep_s3); file.remove(); } } diff --git a/src/Disks/S3/DiskS3.h b/src/Disks/S3/DiskS3.h index 2d9c7f79865..48644dcccf0 100644 --- a/src/Disks/S3/DiskS3.h +++ b/src/Disks/S3/DiskS3.h @@ -58,7 +58,7 @@ public: void createDirectories(const String & path) override; - void clearDirectory(const String & path) override; + void clearDirectory(const String & path, bool keep_s3 = false) override; void moveDirectory(const String & from_path, const String & to_path) override { moveFile(from_path, to_path); } @@ -86,9 +86,9 @@ public: size_t estimated_size, size_t aio_threshold) override; - void remove(const String & path) override; + void remove(const String & path, bool keep_s3 = false) override; - void removeRecursive(const String & path) override; + void removeRecursive(const String & path, bool keep_s3 = false) override; void createHardLink(const String & src_path, const String & dst_path) override; @@ -108,6 +108,8 @@ public: void shutdown() override; + const String getUniqueId(const String & path) const override; + private: bool tryReserve(UInt64 bytes); diff --git a/src/Interpreters/InterserverIOHandler.h b/src/Interpreters/InterserverIOHandler.h index 6d62c9651ca..bcb0e8736f0 100644 --- a/src/Interpreters/InterserverIOHandler.h +++ b/src/Interpreters/InterserverIOHandler.h @@ -16,6 +16,12 @@ namespace Poco { namespace Net { class HTTPServerResponse; } } +namespace zkutil +{ + class ZooKeeper; + using ZooKeeperPtr = std::shared_ptr; +} + namespace DB { @@ -34,6 +40,8 @@ public: virtual void processQuery(const Poco::Net::HTMLForm & params, ReadBuffer & body, WriteBuffer & out, Poco::Net::HTTPServerResponse & response) = 0; virtual ~InterserverIOEndpoint() = default; + virtual void setZooKeeper(const zkutil::ZooKeeperPtr &zookeeper_, const String & zookeeper_path_, const String & replica_name_) = 0; + /// You need to stop the data transfer if blocker is activated. 
ActionBlocker blocker; std::shared_mutex rwlock; diff --git a/src/Storages/MergeTree/DataPartsExchange.cpp b/src/Storages/MergeTree/DataPartsExchange.cpp index f9fb157942a..d9a37a01585 100644 --- a/src/Storages/MergeTree/DataPartsExchange.cpp +++ b/src/Storages/MergeTree/DataPartsExchange.cpp @@ -3,11 +3,15 @@ #include #include #include +#include #include #include #include +#include #include #include +#include +#include #include #include #include @@ -34,6 +38,7 @@ namespace ErrorCodes extern const int INSECURE_PATH; extern const int CORRUPTED_DATA; extern const int LOGICAL_ERROR; + extern const int S3_ERROR; } namespace DataPartsExchange @@ -45,6 +50,7 @@ constexpr auto REPLICATION_PROTOCOL_VERSION_WITH_PARTS_SIZE = 1; constexpr auto REPLICATION_PROTOCOL_VERSION_WITH_PARTS_SIZE_AND_TTL_INFOS = 2; constexpr auto REPLICATION_PROTOCOL_VERSION_WITH_PARTS_TYPE = 3; constexpr auto REPLICATION_PROTOCOL_VERSION_WITH_PARTS_DEFAULT_COMPRESSION = 4; +constexpr auto REPLICATION_PROTOCOL_VERSION_WITH_PARTS_S3_COPY = 5; std::string getEndpointId(const std::string & node_id) @@ -85,7 +91,7 @@ void Service::processQuery(const Poco::Net::HTMLForm & params, ReadBuffer & /*bo } /// We pretend to work as older server version, to be sure that client will correctly process our version - response.addCookie({"server_protocol_version", toString(std::min(client_protocol_version, REPLICATION_PROTOCOL_VERSION_WITH_PARTS_DEFAULT_COMPRESSION))}); + response.addCookie({"server_protocol_version", toString(std::min(client_protocol_version, REPLICATION_PROTOCOL_VERSION_WITH_PARTS_S3_COPY))}); ++total_sends; SCOPE_EXIT({--total_sends;}); @@ -118,8 +124,30 @@ void Service::processQuery(const Poco::Net::HTMLForm & params, ReadBuffer & /*bo sendPartFromMemory(part, out); else { - bool send_default_compression_file = client_protocol_version >= REPLICATION_PROTOCOL_VERSION_WITH_PARTS_DEFAULT_COMPRESSION; - sendPartFromDisk(part, out, send_default_compression_file); + bool try_use_s3_copy = false; + + if (client_protocol_version >= REPLICATION_PROTOCOL_VERSION_WITH_PARTS_S3_COPY) + { /// if source and destination are in the same S3 storage we try to use S3 CopyObject request first + int send_s3_metadata = parse(params.get("send_s3_metadata", "0")); + if (send_s3_metadata == 1) + { + auto disk = part->volume->getDisk(); + if (disk->getType() == "s3") + { + try_use_s3_copy = true; + } + } + } + if (try_use_s3_copy) + { + response.addCookie({"send_s3_metadata", "1"}); + sendPartS3Metadata(part, out); + } + else + { + bool send_default_compression_file = client_protocol_version >= REPLICATION_PROTOCOL_VERSION_WITH_PARTS_DEFAULT_COMPRESSION; + sendPartFromDisk(part, out, send_default_compression_file); + } } } catch (const NetException &) @@ -199,6 +227,62 @@ void Service::sendPartFromDisk(const MergeTreeData::DataPartPtr & part, WriteBuf part->checksums.checkEqual(data_checksums, false); } +void Service::sendPartS3Metadata(const MergeTreeData::DataPartPtr & part, WriteBuffer & out) +{ + /// We'll take a list of files from the list of checksums. + MergeTreeData::DataPart::Checksums checksums = part->checksums; + /// Add files that are not in the checksum list. 
+ auto file_names_without_checksums = part->getFileNamesWithoutChecksums(); + for (const auto & file_name : file_names_without_checksums) + checksums.files[file_name] = {}; + + auto disk = part->volume->getDisk(); + if (disk->getType() != "s3") + throw Exception("S3 disk is not S3 anymore", ErrorCodes::LOGICAL_ERROR); + + String id = disk->getUniqueId(part->getFullRelativePath() + "checksums.txt"); + + if (id.empty()) + throw Exception("Can't lock part on S3 storage", ErrorCodes::LOGICAL_ERROR); + + String zookeeper_node = zookeeper_path + "/zero_copy_s3/" + id + "/" + replica_name; + + LOG_TRACE(log, "Set zookeeper lock {}", id); + + zookeeper->createAncestors(zookeeper_node); + zookeeper->createIfNotExists(zookeeper_node, "lock"); + + writeBinary(checksums.files.size(), out); + for (const auto & it : checksums.files) + { + String file_name = it.first; + + String metadata_file = disk->getPath() + part->getFullRelativePath() + file_name; + + Poco::File metadata(metadata_file); + + if (!metadata.exists()) + throw Exception("S3 metadata '" + file_name + "' is not exists", ErrorCodes::LOGICAL_ERROR); + if (!metadata.isFile()) + throw Exception("S3 metadata '" + file_name + "' is not a file", ErrorCodes::LOGICAL_ERROR); + UInt64 file_size = metadata.getSize(); + + writeStringBinary(it.first, out); + writeBinary(file_size, out); + + auto file_in = createReadBufferFromFileBase(metadata_file, 0, 0, 0, DBMS_DEFAULT_BUFFER_SIZE); + HashingWriteBuffer hashing_out(out); + copyData(*file_in, hashing_out, blocker.getCounter()); + if (blocker.isCancelled()) + throw Exception("Transferring part to replica was cancelled", ErrorCodes::ABORTED); + + if (hashing_out.count() != file_size) + throw Exception("Unexpected size of file " + metadata_file, ErrorCodes::BAD_SIZE_OF_FILE_IN_DATA_PART); + + writePODBinary(hashing_out.getHash(), out); + } +} + MergeTreeData::DataPartPtr Service::findPart(const String & name) { /// It is important to include PreCommitted and Outdated parts here because remote replicas cannot reliably @@ -222,7 +306,8 @@ MergeTreeData::MutableDataPartPtr Fetcher::fetchPart( const String & password, const String & interserver_scheme, bool to_detached, - const String & tmp_prefix_) + const String & tmp_prefix_, + bool try_use_s3_copy) { if (blocker.isCancelled()) throw Exception("Fetching of part was cancelled", ErrorCodes::ABORTED); @@ -239,10 +324,29 @@ MergeTreeData::MutableDataPartPtr Fetcher::fetchPart( { {"endpoint", getEndpointId(replica_path)}, {"part", part_name}, - {"client_protocol_version", toString(REPLICATION_PROTOCOL_VERSION_WITH_PARTS_DEFAULT_COMPRESSION)}, + {"client_protocol_version", toString(REPLICATION_PROTOCOL_VERSION_WITH_PARTS_S3_COPY)}, {"compress", "false"} }); + ReservationPtr reservationS3; + + if (try_use_s3_copy) + { + /// TODO: Make a normal check for S3 Disk + reservationS3 = data.makeEmptyReservationOnLargestDisk(); + auto disk = reservationS3->getDisk(); + + if (disk->getType() != "s3") + { + try_use_s3_copy = false; + } + } + + if (try_use_s3_copy) + { + uri.addQueryParameter("send_s3_metadata", "1"); + } + Poco::Net::HTTPBasicCredentials creds{}; if (!user.empty()) { @@ -263,6 +367,40 @@ MergeTreeData::MutableDataPartPtr Fetcher::fetchPart( int server_protocol_version = parse(in.getResponseCookie("server_protocol_version", "0")); + int send_s3 = parse(in.getResponseCookie("send_s3_metadata", "0")); + + if (send_s3 == 1) + { + if (server_protocol_version < REPLICATION_PROTOCOL_VERSION_WITH_PARTS_S3_COPY) + throw Exception("Got 'send_s3_metadata' cookie with 
old protocol version", ErrorCodes::LOGICAL_ERROR); + if (!try_use_s3_copy) + throw Exception("Got 'send_s3_metadata' cookie when was not requested", ErrorCodes::LOGICAL_ERROR); + + size_t sum_files_size = 0; + readBinary(sum_files_size, in); + IMergeTreeDataPart::TTLInfos ttl_infos; + /// Skip ttl infos, not required for S3 metadata + String ttl_infos_string; + readBinary(ttl_infos_string, in); + String part_type = "Wide"; + readStringBinary(part_type, in); + if (part_type == "InMemory") + throw Exception("Got 'send_s3_metadata' cookie for in-memory partition", ErrorCodes::LOGICAL_ERROR); + + try + { + return downloadPartToS3(part_name, replica_path, to_detached, tmp_prefix_, sync, std::move(reservationS3), in); + } + catch(const Exception& e) + { + if (e.code() != ErrorCodes::S3_ERROR) + throw; + /// Try again but without S3 copy + return fetchPart(metadata_snapshot, part_name, replica_path, host, port, timeouts, + user, password, interserver_scheme, to_detached, tmp_prefix_, false); + } + } + ReservationPtr reservation; size_t sum_files_size = 0; if (server_protocol_version >= REPLICATION_PROTOCOL_VERSION_WITH_PARTS_SIZE) @@ -418,6 +556,96 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToDisk( return new_data_part; } +MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToS3( + const String & part_name, + const String & replica_path, + bool to_detached, + const String & tmp_prefix_, + bool ,//sync, + const ReservationPtr reservation, + PooledReadWriteBufferFromHTTP & in + ) +{ + auto disk = reservation->getDisk(); + if (disk->getType() != "s3") + throw Exception("S3 disk is not S3 anymore", ErrorCodes::LOGICAL_ERROR); + + static const String TMP_PREFIX = "tmp_fetch_"; + String tmp_prefix = tmp_prefix_.empty() ? TMP_PREFIX : tmp_prefix_; + + String part_relative_path = String(to_detached ? "detached/" : "") + tmp_prefix + part_name; + String part_download_path = data.getRelativeDataPath() + part_relative_path + "/"; + + if (disk->exists(part_download_path)) + throw Exception("Directory " + fullPath(disk, part_download_path) + " already exists.", ErrorCodes::DIRECTORY_ALREADY_EXISTS); + + CurrentMetrics::Increment metric_increment{CurrentMetrics::ReplicatedFetch}; + + disk->createDirectories(part_download_path); + + size_t files; + readBinary(files, in); + + auto volume = std::make_shared("volume_" + part_name, disk); + MergeTreeData::MutableDataPartPtr new_data_part = data.createPart(part_name, volume, part_relative_path); + + for (size_t i = 0; i < files; ++i) + { + String file_name; + UInt64 file_size; + + readStringBinary(file_name, in); + readBinary(file_size, in); + + String metadata_file = disk->getPath() + new_data_part->getFullRelativePath() + file_name; + + auto file_out = createWriteBufferFromFileBase(metadata_file, 0, 0, DBMS_DEFAULT_BUFFER_SIZE, -1); + + HashingWriteBuffer hashing_out(*file_out); + + copyData(in, hashing_out, file_size, blocker.getCounter()); + + if (blocker.isCancelled()) + { + /// NOTE The is_cancelled flag also makes sense to check every time you read over the network, + /// performing a poll with a not very large timeout. + /// And now we check it only between read chunks (in the `copyData` function). 
+ throw Exception("Fetching of part was cancelled", ErrorCodes::ABORTED); + } + + MergeTreeDataPartChecksum::uint128 expected_hash; + readPODBinary(expected_hash, in); + + if (expected_hash != hashing_out.getHash()) + { + throw Exception("Checksum mismatch for file " + metadata_file + " transferred from " + replica_path, + ErrorCodes::CHECKSUM_DOESNT_MATCH); + } + } + + assertEOF(in); + + new_data_part->is_temp = true; + new_data_part->modification_time = time(nullptr); + new_data_part->loadColumnsChecksumsIndexes(true, false); + + + String id = disk->getUniqueId(new_data_part->getFullRelativePath() + "checksums.txt"); + + if (id.empty()) + throw Exception("Can't lock part on S3 storage", ErrorCodes::LOGICAL_ERROR); + + String zookeeper_node = zookeeper_path + "/zero_copy_s3/" + id + "/" + replica_name; + + LOG_TRACE(log, "Set zookeeper lock {}", id); + + zookeeper->createAncestors(zookeeper_node); + zookeeper->createIfNotExists(zookeeper_node, "lock"); + + + return new_data_part; +} + } } diff --git a/src/Storages/MergeTree/DataPartsExchange.h b/src/Storages/MergeTree/DataPartsExchange.h index 52a34a2239a..e2e7b2adf4f 100644 --- a/src/Storages/MergeTree/DataPartsExchange.h +++ b/src/Storages/MergeTree/DataPartsExchange.h @@ -9,6 +9,12 @@ #include +namespace zkutil +{ + class ZooKeeper; + using ZooKeeperPtr = std::shared_ptr; +} + namespace DB { @@ -29,16 +35,27 @@ public: std::string getId(const std::string & node_id) const override; void processQuery(const Poco::Net::HTMLForm & params, ReadBuffer & body, WriteBuffer & out, Poco::Net::HTTPServerResponse & response) override; + void setZooKeeper(const zkutil::ZooKeeperPtr & zookeeper_, const String & zookeeper_path_, const String & replica_name_) override + { + zookeeper = zookeeper_; + zookeeper_path = zookeeper_path_; + replica_name = replica_name_; + } + private: MergeTreeData::DataPartPtr findPart(const String & name); void sendPartFromMemory(const MergeTreeData::DataPartPtr & part, WriteBuffer & out); void sendPartFromDisk(const MergeTreeData::DataPartPtr & part, WriteBuffer & out, bool send_default_compression_file); + void sendPartS3Metadata(const MergeTreeData::DataPartPtr & part, WriteBuffer & out); private: /// StorageReplicatedMergeTree::shutdown() waits for all parts exchange handlers to finish, /// so Service will never access dangling reference to storage MergeTreeData & data; Poco::Logger * log; + zkutil::ZooKeeperPtr zookeeper; + String zookeeper_path; + String replica_name; }; /** Client for getting the parts from the table *MergeTree. @@ -63,11 +80,19 @@ public: const String & password, const String & interserver_scheme, bool to_detached = false, - const String & tmp_prefix_ = ""); + const String & tmp_prefix_ = "", + bool try_use_s3_copy = true); /// You need to stop the data transfer. 
ActionBlocker blocker; + void setZooKeeper(const zkutil::ZooKeeperPtr & zookeeper_, const String & zookeeper_path_, const String & replica_name_) + { + zookeeper = zookeeper_; + zookeeper_path = zookeeper_path_; + replica_name = replica_name_; + } + private: MergeTreeData::MutableDataPartPtr downloadPartToDisk( const String & part_name, @@ -84,8 +109,20 @@ private: ReservationPtr reservation, PooledReadWriteBufferFromHTTP & in); + MergeTreeData::MutableDataPartPtr downloadPartToS3( + const String & part_name, + const String & replica_path, + bool to_detached, + const String & tmp_prefix_, + bool sync, + const ReservationPtr reservation, + PooledReadWriteBufferFromHTTP & in); + MergeTreeData & data; Poco::Logger * log; + zkutil::ZooKeeperPtr zookeeper; + String zookeeper_path; + String replica_name; }; } diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 486e444763d..23fe60b44e5 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -262,7 +262,7 @@ void IMergeTreeDataPart::removeIfNeeded() } } - remove(); + remove(false); if (state == State::DeleteOnDestroy) { @@ -809,7 +809,7 @@ void IMergeTreeDataPart::renameTo(const String & new_relative_path, bool remove_ } -void IMergeTreeDataPart::remove() const +void IMergeTreeDataPart::remove(bool keep_s3) const { if (!isStoredOnDisk()) return; @@ -839,7 +839,7 @@ void IMergeTreeDataPart::remove() const try { - volume->getDisk()->removeRecursive(to + "/"); + volume->getDisk()->removeRecursive(to + "/", keep_s3); } catch (...) { @@ -862,7 +862,7 @@ void IMergeTreeDataPart::remove() const if (checksums.empty()) { /// If the part is not completely written, we cannot use fast path by listing files. - volume->getDisk()->removeRecursive(to + "/"); + volume->getDisk()->removeRecursive(to + "/", keep_s3); } else { @@ -875,18 +875,18 @@ void IMergeTreeDataPart::remove() const # pragma GCC diagnostic ignored "-Wunused-variable" #endif for (const auto & [file, _] : checksums.files) - volume->getDisk()->remove(to + "/" + file); + volume->getDisk()->remove(to + "/" + file, keep_s3); #if !__clang__ # pragma GCC diagnostic pop #endif for (const auto & file : {"checksums.txt", "columns.txt"}) - volume->getDisk()->remove(to + "/" + file); + volume->getDisk()->remove(to + "/" + file, keep_s3); - volume->getDisk()->removeIfExists(to + "/" + DEFAULT_COMPRESSION_CODEC_FILE_NAME); - volume->getDisk()->removeIfExists(to + "/" + DELETE_ON_DESTROY_MARKER_FILE_NAME); + volume->getDisk()->removeIfExists(to + "/" + DEFAULT_COMPRESSION_CODEC_FILE_NAME, keep_s3); + volume->getDisk()->removeIfExists(to + "/" + DELETE_ON_DESTROY_MARKER_FILE_NAME, keep_s3); - volume->getDisk()->remove(to); + volume->getDisk()->remove(to, keep_s3); } catch (...) { @@ -894,7 +894,7 @@ void IMergeTreeDataPart::remove() const LOG_ERROR(storage.log, "Cannot quickly remove directory {} by removing files; fallback to recursive removal. Reason: {}", fullPath(volume->getDisk(), to), getCurrentExceptionMessage(false)); - volume->getDisk()->removeRecursive(to + "/"); + volume->getDisk()->removeRecursive(to + "/", keep_s3); } } } diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index 78daf6c9017..3e7b03b2903 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -124,7 +124,7 @@ public: /// Throws an exception if part is not stored in on-disk format. 
void assertOnDisk() const; - void remove() const; + void remove(bool keep_s3 = false) const; /// Initialize columns (from columns.txt if exists, or create from column files if not). /// Load checksums from checksums.txt if exists. Load index if required. diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 9613bd5111d..dbbf8645d36 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -370,6 +370,8 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree( } createNewZooKeeperNodes(); + + fetcher.setZooKeeper(current_zookeeper, zookeeper_path, replica_name); } @@ -3364,6 +3366,7 @@ void StorageReplicatedMergeTree::startup() queue.initialize(getDataParts()); data_parts_exchange_endpoint = std::make_shared(*this); + data_parts_exchange_endpoint->setZooKeeper(tryGetZooKeeper(), zookeeper_path, replica_name); global_context.getInterserverIOHandler().addEndpoint(data_parts_exchange_endpoint->getId(replica_path), data_parts_exchange_endpoint); /// In this thread replica will be activated. @@ -5010,13 +5013,40 @@ void StorageReplicatedMergeTree::clearOldPartsAndRemoveFromZK() } parts.clear(); - auto remove_parts_from_filesystem = [log=log] (const DataPartsVector & parts_to_remove) + auto remove_parts_from_filesystem = [log=log,&zookeeper=zookeeper,&zookeeper_path=zookeeper_path,&replica_name=replica_name] (const DataPartsVector & parts_to_remove) { for (const auto & part : parts_to_remove) { try { - part->remove(); + bool keep_s3 = false; + + auto disk = part->volume->getDisk(); + + if (disk->getType() == "s3") + { + String id = disk->getUniqueId(part->getFullRelativePath() + "checksums.txt"); + + if (!id.empty()) + { + String zookeeper_part_node = zookeeper_path + "/zero_copy_s3/" + id; + String zookeeper_node = zookeeper_part_node + "/" + replica_name; + + LOG_TRACE(log, "Remove zookeeper lock for {}", id); + + zookeeper->remove(zookeeper_node); + + Strings children; + zookeeper->tryGetChildren(zookeeper_part_node, children); + if (!children.empty()) + { + LOG_TRACE(log, "Found zookeper locks for {}", id); + keep_s3 = true; + } + } + } + + part->remove(keep_s3); } catch (...) { From a4adb39b2576eb26accc8336a27d6ab9eca4e1b4 Mon Sep 17 00:00:00 2001 From: Anton Ivashkin Date: Thu, 8 Oct 2020 19:23:04 +0300 Subject: [PATCH 0044/2357] S3 zero copy replication proof of concept - description --- S3ZeroCopyReplication.md | 47 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) create mode 100644 S3ZeroCopyReplication.md diff --git a/S3ZeroCopyReplication.md b/S3ZeroCopyReplication.md new file mode 100644 index 00000000000..7e7709ff5a7 --- /dev/null +++ b/S3ZeroCopyReplication.md @@ -0,0 +1,47 @@ +# ClickHouse S3 Zero Copy Replication + +Quick-and-dirty code just for testing, not production-ready at all. + +[Commit](https://github.com/ianton-ru/ClickHouse/commit/acf86568a7e21176ba2cca15861da231bec6932a) + +[Branch](https://github.com/ianton-ru/ClickHouse/tree/s3_zero_copy_replication) + +## How it works + +When a part is fetched during replication and the source stores it in S3 while the destination is also going to store it in S3, only the S3 metadata is transferred instead of the data; the destination stores that metadata locally +and uses the same S3 data as the source. To keep such shared data from being deleted, a mark is made in ZooKeeper. + +A new protocol version REPLICATION_PROTOCOL_VERSION_WITH_PARTS_S3_COPY has been introduced.
The request has a new parameter send_s3_metadata: if it is 1, the destination asks the source for metadata instead of data, when that is possible. +In the response the cookie send_s3_metadata=1 is set when metadata is being sent; in all other cases the data is sent as before. + +Before making the request the destination checks whether it is going to store the data in S3. The check is crude for now: a reservation is requested on the disk with the most free space, and then it is checked whether that reservation is on S3. +If it is on S3, the request is sent with send_s3_metadata=1. + +When the source receives such a request, it checks whether the part is stored in S3. If it is, it puts a mark in ZooKeeper at the path `<table data path>/zero_copy_s3/<some part ID>/`, +sets the cookie send_s3_metadata=1 in the response and sends only the metadata files instead of the data files. + +When the destination receives a response with send_s3_metadata=1, it creates only the small metadata files, identical in content and therefore pointing to the same S3 keys, puts a similar mark in ZooKeeper, +only with its own replica ID, and works with that. + +When a node wants to delete a part, it removes its key under `<table data path>/zero_copy_s3/<some part ID>/` in ZooKeeper and then fetches all children of `<table data path>/zero_copy_s3/<some part ID>`. +If that list is not empty, it assumes the data is used by another node and removes only the local metadata; if it is empty, it removes the S3 data as well. A condensed sketch of this locking scheme is given right after the list of shortcomings below. + +## Crutches and shortcomings, of which there are many + +* There is no check yet whether the nodes use the same S3 or different ones; with several different S3 storages this will not work. + +* The name of the first S3 key of the checksums.txt file is used as the part ID. + +* No convenient way was found to pass the ZooKeeper handle through the code, so it is passed in a hardcoded way. + +* On removal the disk class knows nothing about parts, so a flag telling it to keep the data in S3 is passed as a parameter, which came out very ugly. + +* A race is possible if the source sends the part metadata and immediately decides to delete the part before the destination has put its mark in ZooKeeper. + +* Exchanging information in the replication protocol via a request parameter in one direction and a cookie in the other is not nice, although the replication version exchange is done the same way. + +* On error it should fall back to replicating the old way, but it is not clear whether that always works. + +* There is no backward compatibility: once such shared parts appear, it is impossible to roll back to an older ClickHouse version, otherwise a node may delete data still used by another one. + +* And in general...
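
A condensed sketch of the ZooKeeper locking scheme described above, added here for illustration only and not part of the patch itself. The free-function form, the helper names lockShared/unlockShared and the part_id parameter are invented; the zkutil calls mirror the ones used in the patch:

```cpp
#include <Common/ZooKeeper/ZooKeeper.h>

/// One child node per replica under <zookeeper_path>/zero_copy_s3/<part_id>
/// marks the part's S3 objects as shared between replicas.
void lockShared(const zkutil::ZooKeeperPtr & zookeeper, const std::string & zookeeper_path,
                const std::string & part_id, const std::string & replica_name)
{
    std::string node = zookeeper_path + "/zero_copy_s3/" + part_id + "/" + replica_name;
    zookeeper->createAncestors(node);
    zookeeper->createIfNotExists(node, "lock");
}

/// Drops this replica's lock. Returns true when no other replica still holds a lock,
/// i.e. the caller was the last owner and may delete the S3 objects as well.
bool unlockShared(const zkutil::ZooKeeperPtr & zookeeper, const std::string & zookeeper_path,
                  const std::string & part_id, const std::string & replica_name)
{
    std::string part_node = zookeeper_path + "/zero_copy_s3/" + part_id;
    zookeeper->remove(part_node + "/" + replica_name);

    std::vector<std::string> children;
    zookeeper->tryGetChildren(part_node, children);
    return children.empty();
}
```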
+ +* И вообще From 9272ed06b427f017d1b95e0d20ff6132f5cb06a2 Mon Sep 17 00:00:00 2001 From: Anton Ivashkin Date: Fri, 9 Oct 2020 17:24:10 +0300 Subject: [PATCH 0045/2357] Move Zookeeper lock for S3 shared part in IMergeTreeDataPart --- src/Storages/MergeTree/DataPartsExchange.cpp | 27 +--------- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 52 +++++++++++++++++++ src/Storages/MergeTree/IMergeTreeDataPart.h | 14 +++++ src/Storages/StorageReplicatedMergeTree.cpp | 28 +--------- 4 files changed, 69 insertions(+), 52 deletions(-) diff --git a/src/Storages/MergeTree/DataPartsExchange.cpp b/src/Storages/MergeTree/DataPartsExchange.cpp index 2708373d1a4..da5acdbefcd 100644 --- a/src/Storages/MergeTree/DataPartsExchange.cpp +++ b/src/Storages/MergeTree/DataPartsExchange.cpp @@ -7,7 +7,6 @@ #include #include #include -#include #include #include #include @@ -240,17 +239,7 @@ void Service::sendPartS3Metadata(const MergeTreeData::DataPartPtr & part, WriteB if (disk->getType() != "s3") throw Exception("S3 disk is not S3 anymore", ErrorCodes::LOGICAL_ERROR); - String id = disk->getUniqueId(part->getFullRelativePath() + "checksums.txt"); - - if (id.empty()) - throw Exception("Can't lock part on S3 storage", ErrorCodes::LOGICAL_ERROR); - - String zookeeper_node = zookeeper_path + "/zero_copy_s3/" + id + "/" + replica_name; - - LOG_TRACE(log, "Set zookeeper lock {}", id); - - zookeeper->createAncestors(zookeeper_node); - zookeeper->createIfNotExists(zookeeper_node, "lock"); + part->lockSharedData(zookeeper_path, replica_name, zookeeper); writeBinary(checksums.files.size(), out); for (const auto & it : checksums.files) @@ -629,19 +618,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToS3( new_data_part->modification_time = time(nullptr); new_data_part->loadColumnsChecksumsIndexes(true, false); - - String id = disk->getUniqueId(new_data_part->getFullRelativePath() + "checksums.txt"); - - if (id.empty()) - throw Exception("Can't lock part on S3 storage", ErrorCodes::LOGICAL_ERROR); - - String zookeeper_node = zookeeper_path + "/zero_copy_s3/" + id + "/" + replica_name; - - LOG_TRACE(log, "Set zookeeper lock {}", id); - - zookeeper->createAncestors(zookeeper_node); - zookeeper->createIfNotExists(zookeeper_node, "lock"); - + new_data_part->lockSharedData(zookeeper_path, replica_name, zookeeper); return new_data_part; } diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 40a6569cd46..786bc056702 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -1079,6 +1080,56 @@ bool IMergeTreeDataPart::checkAllTTLCalculated(const StorageMetadataPtr & metada return true; } +void IMergeTreeDataPart::lockSharedData(const String & zookeeper_path, const String & replica_name, zkutil::ZooKeeperPtr zookeeper) const +{ + auto disk = volume->getDisk(); + + if (disk->getType() != "s3") + return; + + String id = disk->getUniqueId(getFullRelativePath() + "checksums.txt"); + + if (id.empty()) + throw Exception("Can't lock part on S3 storage", ErrorCodes::LOGICAL_ERROR); + + String zookeeper_node = zookeeper_path + "/zero_copy_s3/" + id + "/" + replica_name; + + LOG_TRACE(storage.log, "Set zookeeper lock {}", id); + + zookeeper->createAncestors(zookeeper_node); + zookeeper->createIfNotExists(zookeeper_node, "lock"); +} + +bool IMergeTreeDataPart::unlockSharedData(const String & zookeeper_path, const String & replica_name, 
zkutil::ZooKeeperPtr zookeeper) const
+{
+    auto disk = volume->getDisk();
+
+    if (disk->getType() != "s3")
+        return true;
+
+    String id = disk->getUniqueId(getFullRelativePath() + "checksums.txt");
+
+    if (id.empty())
+        return true;
+
+    String zookeeper_part_node = zookeeper_path + "/zero_copy_s3/" + id;
+    String zookeeper_node = zookeeper_part_node + "/" + replica_name;
+
+    LOG_TRACE(storage.log, "Remove zookeeper lock for {}", id);
+
+    zookeeper->remove(zookeeper_node);
+
+    Strings children;
+    zookeeper->tryGetChildren(zookeeper_part_node, children);
+
+    if (!children.empty())
+    {
+        LOG_TRACE(storage.log, "Found zookeeper locks for {}", id);
+    }
+
+    return children.empty();
+}
+
 bool isCompactPart(const MergeTreeDataPartPtr & data_part)
 {
     return (data_part && data_part->getType() == MergeTreeDataPartType::COMPACT);
@@ -1095,3 +1146,4 @@ bool isInMemoryPart(const MergeTreeDataPartPtr & data_part)
 }
 
 }
+
diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h
index 3e7b03b2903..d40ff40f157 100644
--- a/src/Storages/MergeTree/IMergeTreeDataPart.h
+++ b/src/Storages/MergeTree/IMergeTreeDataPart.h
@@ -22,6 +22,12 @@
 #include 
 
+namespace zkutil
+{
+    class ZooKeeper;
+    using ZooKeeperPtr = std::shared_ptr<ZooKeeper>;
+}
+
 namespace DB
 {
 
@@ -349,6 +355,14 @@ public:
     /// part creation (using alter query with materialize_ttl setting).
     bool checkAllTTLCalculated(const StorageMetadataPtr & metadata_snapshot) const;
 
+    /// Lock part in zookeeper for use common S3 data in several nodes
+    void lockSharedData(const String & zookeeper_path, const String & replica_name, zkutil::ZooKeeperPtr zookeeper) const;
+
+    /// Unlock common S3 data part in zookeeper
+    /// Return true if data unlocked
+    /// Return false if data is still used by another node
+    bool unlockSharedData(const String & zookeeper_path, const String & replica_name, zkutil::ZooKeeperPtr zookeeper) const;
+
 protected:
 
     /// Total size of all columns, calculated once in calcuateColumnSizesOnDisk
diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp
index b1c7c754637..6355894d59e 100644
--- a/src/Storages/StorageReplicatedMergeTree.cpp
+++ b/src/Storages/StorageReplicatedMergeTree.cpp
@@ -5111,33 +5111,7 @@ void StorageReplicatedMergeTree::clearOldPartsAndRemoveFromZK()
         {
             try
             {
-                bool keep_s3 = false;
-
-                auto disk = part->volume->getDisk();
-
-                if (disk->getType() == "s3")
-                {
-                    String id = disk->getUniqueId(part->getFullRelativePath() + "checksums.txt");
-
-                    if (!id.empty())
-                    {
-                        String zookeeper_part_node = zookeeper_path + "/zero_copy_s3/" + id;
-                        String zookeeper_node = zookeeper_part_node + "/" + replica_name;
-
-                        LOG_TRACE(log, "Remove zookeeper lock for {}", id);
-
-                        zookeeper->remove(zookeeper_node);
-
-                        Strings children;
-                        zookeeper->tryGetChildren(zookeeper_part_node, children);
-                        if (!children.empty())
-                        {
-                            LOG_TRACE(log, "Found zookeeper locks for {}", id);
-                            keep_s3 = true;
-                        }
-                    }
-                }
-
+                bool keep_s3 = !part->unlockSharedData(zookeeper_path, replica_name, zookeeper);
                 part->remove(keep_s3);
             }
             catch (...)
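The lock/unlock pair moved into IMergeTreeDataPart above is meant to bracket the lifetime of a shared part: lockSharedData() as soon as a replica starts referencing shared S3 objects (e.g. right after a metadata-only fetch), unlockSharedData() right before the part is removed. A minimal sketch of that pairing, based only on the call sites in the diffs above; the two helper function names are illustrative and not part of the patch:

/// Illustrative sketch, not part of the patch: intended pairing of
/// lockSharedData() / unlockSharedData(). `zookeeper`, `zookeeper_path` and
/// `replica_name` are assumed to come from the owning StorageReplicatedMergeTree.
using namespace DB;

void afterFetchOfSharedPart(const MergeTreeData::MutableDataPartPtr & part,
                            zkutil::ZooKeeperPtr zookeeper,
                            const String & zookeeper_path,
                            const String & replica_name)
{
    /// Creates <zookeeper_path>/zero_copy_s3/<part id>/<replica_name> so that other
    /// replicas can see this node still references the shared S3 objects.
    part->lockSharedData(zookeeper_path, replica_name, zookeeper);
}

void beforeRemovalOfSharedPart(const MergeTreeData::DataPartPtr & part,
                               zkutil::ZooKeeperPtr zookeeper,
                               const String & zookeeper_path,
                               const String & replica_name)
{
    /// Removes this replica's lock node; unlockSharedData() returns true only when
    /// no other replica holds a lock, i.e. the shared S3 objects may be deleted
    /// together with the local metadata files.
    bool keep_s3 = !part->unlockSharedData(zookeeper_path, replica_name, zookeeper);
    part->remove(keep_s3);
}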
From b877459cf78fbff327c3b75481220e39ea8ee9a6 Mon Sep 17 00:00:00 2001 From: Anton Ivashkin Date: Wed, 14 Oct 2020 18:05:59 +0300 Subject: [PATCH 0046/2357] Zero copy replication over S3: check s3 storage --- src/Disks/DiskDecorator.h | 1 + src/Disks/IDisk.h | 3 + src/Disks/S3/DiskS3.cpp | 38 +++++++ src/Disks/S3/DiskS3.h | 2 + src/Disks/StoragePolicy.cpp | 11 ++ src/Disks/StoragePolicy.h | 3 + src/Storages/MergeTree/DataPartsExchange.cpp | 106 +++++++++++++------ src/Storages/MergeTree/DataPartsExchange.h | 2 +- src/Storages/MergeTree/MergeTreeData.h | 2 + 9 files changed, 136 insertions(+), 32 deletions(-) diff --git a/src/Disks/DiskDecorator.h b/src/Disks/DiskDecorator.h index f1fea043843..86d842ce2cf 100644 --- a/src/Disks/DiskDecorator.h +++ b/src/Disks/DiskDecorator.h @@ -47,6 +47,7 @@ public: void sync(int fd) const override; const String getType() const override { return delegate->getType(); } const String getUniqueId(const String & path) const override { return delegate->getUniqueId(path); } + bool checkFile(const String & path) const override { return delegate->checkFile(path); } protected: DiskPtr delegate; diff --git a/src/Disks/IDisk.h b/src/Disks/IDisk.h index 324384fade6..96a2e5e4669 100644 --- a/src/Disks/IDisk.h +++ b/src/Disks/IDisk.h @@ -198,6 +198,9 @@ public: /// Return some uniq string for file, overrided for S3 virtual const String getUniqueId(const String & path) const { return path; } + /// Check file, overrided for S3 only + virtual bool checkFile(const String & path) const { return exists(path); } + private: /// Returns executor to perform asynchronous operations. Executor & getExecutor() { return *executor; } diff --git a/src/Disks/S3/DiskS3.cpp b/src/Disks/S3/DiskS3.cpp index 9cb3178350c..b563c84094a 100644 --- a/src/Disks/S3/DiskS3.cpp +++ b/src/Disks/S3/DiskS3.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include @@ -83,6 +84,16 @@ namespace } } + template + void throwIfError(const Aws::Utils::Outcome & response) + { + if (!response.IsSuccess()) + { + const auto & err = response.GetError(); + throw Exception(err.GetMessage(), static_cast(err.GetErrorType())); + } + } + /** * S3 metadata file layout: * Number of S3 objects, Total size of all S3 objects. 
@@ -835,4 +846,31 @@ void DiskS3::shutdown() client->DisableRequestProcessing(); } +bool DiskS3::checkFile(const String & path) const +{ + Metadata metadata(s3_root_path, metadata_path, path); + + /// empty s3_objects list for empty file + if (metadata.s3_objects.empty()) + return true; + + String object = metadata.s3_root_path + metadata.s3_objects[0].first; + + Aws::S3::Model::ListObjectsRequest request; + request.SetBucket(bucket); + request.SetPrefix(object); + auto resp = client->ListObjects(request); + throwIfError(resp); + Aws::Vector object_list = resp.GetResult().GetContents(); + + /// Should be only one object with name equal to prefix + if (object_list.size() != 1) + return false; + + if (object_list[0].GetKey() != object) + return false; + return true; +} + + } diff --git a/src/Disks/S3/DiskS3.h b/src/Disks/S3/DiskS3.h index 7808f5a8007..07348c53417 100644 --- a/src/Disks/S3/DiskS3.h +++ b/src/Disks/S3/DiskS3.h @@ -112,6 +112,8 @@ public: const String getUniqueId(const String & path) const override; + bool checkFile(const String & path) const override; + private: bool tryReserve(UInt64 bytes); diff --git a/src/Disks/StoragePolicy.cpp b/src/Disks/StoragePolicy.cpp index 1aa20301bc0..746438bc72c 100644 --- a/src/Disks/StoragePolicy.cpp +++ b/src/Disks/StoragePolicy.cpp @@ -123,6 +123,17 @@ Disks StoragePolicy::getDisks() const } +Disks StoragePolicy::getDisksByType(const String & type) const +{ + Disks res; + for (const auto & volume : volumes) + for (const auto & disk : volume->getDisks()) + if (disk->getType() == type) + res.push_back(disk); + return res; +} + + DiskPtr StoragePolicy::getAnyDisk() const { /// StoragePolicy must contain at least one Volume diff --git a/src/Disks/StoragePolicy.h b/src/Disks/StoragePolicy.h index 0e0795d8bf1..b42886afcb2 100644 --- a/src/Disks/StoragePolicy.h +++ b/src/Disks/StoragePolicy.h @@ -41,6 +41,9 @@ public: /// Returns disks ordered by volumes priority Disks getDisks() const; + /// Returns disks by type ordered by volumes priority + Disks getDisksByType(const String & type) const; + /// Returns any disk /// Used when it's not important, for example for /// mutations files diff --git a/src/Storages/MergeTree/DataPartsExchange.cpp b/src/Storages/MergeTree/DataPartsExchange.cpp index da5acdbefcd..678acc2d848 100644 --- a/src/Storages/MergeTree/DataPartsExchange.cpp +++ b/src/Storages/MergeTree/DataPartsExchange.cpp @@ -317,18 +317,13 @@ MergeTreeData::MutableDataPartPtr Fetcher::fetchPart( {"compress", "false"} }); - ReservationPtr reservationS3; + Disks disksS3; if (try_use_s3_copy) { - /// TODO: Make a normal check for S3 Disk - reservationS3 = data.makeEmptyReservationOnLargestDisk(); - auto disk = reservationS3->getDisk(); - - if (disk->getType() != "s3") - { + disksS3 = data.getDisksByType("s3"); + if (disksS3.empty()) try_use_s3_copy = false; - } } if (try_use_s3_copy) @@ -378,7 +373,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::fetchPart( try { - return downloadPartToS3(part_name, replica_path, to_detached, tmp_prefix_, sync, std::move(reservationS3), in); + return downloadPartToS3(part_name, replica_path, to_detached, tmp_prefix_, sync, std::move(disksS3), in); } catch(const Exception& e) { @@ -551,13 +546,14 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToS3( bool to_detached, const String & tmp_prefix_, bool ,//sync, - const ReservationPtr reservation, + const Disks & disksS3, PooledReadWriteBufferFromHTTP & in ) { - auto disk = reservation->getDisk(); - if (disk->getType() != "s3") - throw Exception("S3 disk is not 
S3 anymore", ErrorCodes::LOGICAL_ERROR); + if (disksS3.empty()) + throw Exception("No S3 disks anymore", ErrorCodes::LOGICAL_ERROR); + + auto disk = disksS3[0]; static const String TMP_PREFIX = "tmp_fetch_"; String tmp_prefix = tmp_prefix_.empty() ? TMP_PREFIX : tmp_prefix_; @@ -586,29 +582,77 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToS3( readStringBinary(file_name, in); readBinary(file_size, in); - String metadata_file = disk->getPath() + new_data_part->getFullRelativePath() + file_name; + String data_path = new_data_part->getFullRelativePath() + file_name; + String metadata_file = fullPath(disk, data_path); - auto file_out = createWriteBufferFromFileBase(metadata_file, 0, 0, DBMS_DEFAULT_BUFFER_SIZE, -1); - - HashingWriteBuffer hashing_out(*file_out); - - copyData(in, hashing_out, file_size, blocker.getCounter()); - - if (blocker.isCancelled()) { - /// NOTE The is_cancelled flag also makes sense to check every time you read over the network, - /// performing a poll with a not very large timeout. - /// And now we check it only between read chunks (in the `copyData` function). - throw Exception("Fetching of part was cancelled", ErrorCodes::ABORTED); + auto file_out = createWriteBufferFromFileBase(metadata_file, 0, 0, DBMS_DEFAULT_BUFFER_SIZE, -1); + + HashingWriteBuffer hashing_out(*file_out); + + copyData(in, hashing_out, file_size, blocker.getCounter()); + + if (blocker.isCancelled()) + { + /// NOTE The is_cancelled flag also makes sense to check every time you read over the network, + /// performing a poll with a not very large timeout. + /// And now we check it only between read chunks (in the `copyData` function). + disk->removeRecursive(part_download_path, true); + throw Exception("Fetching of part was cancelled", ErrorCodes::ABORTED); + } + + MergeTreeDataPartChecksum::uint128 expected_hash; + readPODBinary(expected_hash, in); + + if (expected_hash != hashing_out.getHash()) + { + throw Exception("Checksum mismatch for file " + metadata_file + " transferred from " + replica_path, + ErrorCodes::CHECKSUM_DOESNT_MATCH); + } } - MergeTreeDataPartChecksum::uint128 expected_hash; - readPODBinary(expected_hash, in); + if (!i) + { /// Check access for first s3 object of first file + if (!disk->checkFile(data_path)) + { /// Wrong S3 disk + Poco::File metadata(metadata_file); - if (expected_hash != hashing_out.getHash()) - { - throw Exception("Checksum mismatch for file " + metadata_file + " transferred from " + replica_path, - ErrorCodes::CHECKSUM_DOESNT_MATCH); + size_t disk_id = 1; + while (true) + { + if (disk_id >= disksS3.size()) + { /// No more S3 disks + disk->removeRecursive(part_download_path, true); + /// After catch this exception replication continues with full data copy + throw Exception("Can't find S3 drive for shared data", ErrorCodes::S3_ERROR); + } + + /// Try next S3 disk + auto next_disk = disksS3[disk_id]; + + auto next_volume = std::make_shared("volume_" + part_name, next_disk); + MergeTreeData::MutableDataPartPtr next_new_data_part = data.createPart(part_name, next_volume, part_relative_path); + + next_disk->createDirectories(part_download_path); + + String next_data_path = next_new_data_part->getFullRelativePath() + file_name; + String next_metadata_file = fullPath(next_disk, next_data_path); + metadata.copyTo(next_metadata_file); + if (next_disk->checkFile(next_data_path)) + { /// Right disk found + disk->removeRecursive(part_download_path, true); + disk = next_disk; + volume = next_volume; + data_path = next_data_path; + new_data_part = 
next_new_data_part;
+                    break;
+                }
+
+                /// Wrong disk again
+                next_disk->removeRecursive(part_download_path, true);
+                ++disk_id;
+            }
+        }
     }
 }
diff --git a/src/Storages/MergeTree/DataPartsExchange.h b/src/Storages/MergeTree/DataPartsExchange.h
index e2e7b2adf4f..7e59e81d6dc 100644
--- a/src/Storages/MergeTree/DataPartsExchange.h
+++ b/src/Storages/MergeTree/DataPartsExchange.h
@@ -115,7 +115,7 @@ private:
         bool to_detached,
         const String & tmp_prefix_,
         bool sync,
-        const ReservationPtr reservation,
+        const Disks & disksS3,
         PooledReadWriteBufferFromHTTP & in);
 
     MergeTreeData & data;
diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h
index 5c18661dad1..1b620b3bdae 100644
--- a/src/Storages/MergeTree/MergeTreeData.h
+++ b/src/Storages/MergeTree/MergeTreeData.h
@@ -651,6 +651,8 @@ public:
     /// Reserves 0 bytes
     ReservationPtr makeEmptyReservationOnLargestDisk() { return getStoragePolicy()->makeEmptyReservationOnLargestDisk(); }
 
+    Disks getDisksByType(const String & type) const { return getStoragePolicy()->getDisksByType(type); }
+
     /// Return alter conversions for part which must be applied on fly.
     AlterConversions getAlterConversionsForPart(const MergeTreeDataPartPtr part) const;
     /// Returns destination disk or volume for the TTL rule according to current storage policy

From 14a78f87b03141721ad5978793d22c3d8fc36baa Mon Sep 17 00:00:00 2001
From: Anton Ivashkin
Date: Thu, 15 Oct 2020 18:23:20 +0300
Subject: [PATCH 0047/2357] Zero copy replication over S3: fetch instead of
 merge

---
 S3ZeroCopyReplication.md                      | 33 ++++++++++++-------
 src/Storages/MergeTree/IMergeTreeDataPart.cpp |  4 +--
 src/Storages/StorageReplicatedMergeTree.cpp   | 14 ++++++++
 3 files changed, 38 insertions(+), 13 deletions(-)

diff --git a/S3ZeroCopyReplication.md b/S3ZeroCopyReplication.md
index 7e7709ff5a7..0744460012a 100644
--- a/S3ZeroCopyReplication.md
+++ b/S3ZeroCopyReplication.md
@@ -2,8 +2,6 @@
 
 Quick-and-dirty code just for testing, not production-ready at all.
 
-[Commit](https://github.com/ianton-ru/ClickHouse/commit/acf86568a7e21176ba2cca15861da231bec6932a)
-
 [Branch](https://github.com/ianton-ru/ClickHouse/tree/s3_zero_copy_replication)
 
 ## How it works
@@ -14,21 +12,24 @@
 A new protocol version, REPLICATION_PROTOCOL_VERSION_WITH_PARTS_S3_COPY, has been introduced.
 The request has a new parameter, send_s3_metadata: if it is 1, the destination asks the source for metadata instead of data when possible.
 The source replies with the cookie send_s3_metadata=1 when it is indeed sending metadata. In all other cases the data is sent as before.
 
-Before the request the destination checks whether it is going to store the data in S3. The check is crude for now - a reservation is requested on the disk with the most free space, and then it is checked whether that disk is S3.
-If it is S3, send_s3_metadata=1 is sent in the request.
+Before the request the destination checks whether it is going to store the data in S3. The check is crude for now - if the storage policy contains an S3 disk, we assume S3 will be used.
+If so, send_s3_metadata=1 is sent in the request.
 
 On receiving such a request the source checks whether the part is stored in S3. If it is, it creates a mark in ZooKeeper at the path `<table data path>/zero_copy_s3/shared/<part ID>/`,
 sets the cookie send_s3_metadata=1 in the response and sends only the metadata files instead of the data files.
 
 On receiving a response with send_s3_metadata=1 the destination creates only small files with identical metadata, which will end up pointing to the same S3 keys, creates a similar mark in ZooKeeper,
-but with its own replica ID, and works with that.
+but with its own replica ID, and works with that. For the first file in the list it checks that the first S3 object exists (just its existence); if an object with that name is found, everything is fine, otherwise it falls back to the old behaviour.
+(There is also code for the case of more than one S3 disk: it then tries them all and uses the one where the file is found, but inside the MDB team we consider such a configuration strange.
+We plan to restrict the functionality to the case of a single S3 disk.)
 
-When a node wants to delete a part, it removes the ZooKeeper key `<table data path>/zero_copy_s3/<part ID>/`, then gets all children of `<table data path>/zero_copy_s3/<part ID>`.
+When a node wants to delete a part, it removes the ZooKeeper key `<table data path>/zero_copy_s3/shared/<part ID>/`, then gets all children of `<table data path>/zero_copy_s3/shared/<part ID>`.
 If the list is not empty, it assumes the data is used by another node and removes only the local metadata; if it is empty, it removes the S3 data as well.
 
+During a merge, if the result is going to be stored in S3, the node creates an ephemeral mark in ZooKeeper at the path `<table data path>/zero_copy_s3/merged/<new part name>`. If such a mark already exists, it assumes that another node
+has already merged this part or is merging it right now, and that it should do a fetch instead of merging itself.
+
 ## Hacks and known deficiencies, of which there are many
 
-* There is currently no check whether the nodes use the same S3 or different ones; with several different S3 endpoints this will not work.
-
 * The name of the first S3 key of the checksums.txt file is used as the part ID.
 
@@ -40,8 +41,18 @@
 * In the replication protocol I do not like exchanging information via a request parameter in one direction and a cookie in the other, although the replication version exchange is done the same way.
 
-* On error it should fall back to the old replication path, but dunno whether that always works.
+* On error it should fall back to the old replication path, but I am not sure it always works.
 
 * There is no backward compatibility: once such shared parts appear, it is not possible to roll back to an older ClickHouse version, because a node could then delete data that another node is still using.
 
+* Duplicate parts are still possible. Example: a node performs a merge and crashes. Another node independently performs the same merge, then the first node comes back up. As a result there are two copies of the merged part.
+
+* ... there are many of them. Honestly.
+
+## TODO, not done at all yet
+
+* A flag in the config to enable the functionality; it will be disabled by default.
+
+* For hybrid storage, add the check and a fetch when a part moves from a local disk to S3.
+
+* Tests.
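The merge deduplication described above boils down to one ephemeral ZooKeeper node per merged part name; the actual change is in the StorageReplicatedMergeTree::tryExecuteMerge() hunk further down in this patch. A condensed sketch of that check, assuming the caller falls back to fetching the merged part when false is returned; the helper name is illustrative and not part of the patch:

/// Illustrative sketch, not part of the patch: claim a merge via an ephemeral
/// node under <zookeeper_path>/zero_copy_s3/merged/, as described above.
bool tryClaimS3Merge(zkutil::ZooKeeperPtr zookeeper,
                     const DB::String & zookeeper_path,
                     const DB::String & new_part_name)
{
    DB::String zookeeper_node = zookeeper_path + "/zero_copy_s3/merged/" + new_part_name;
    zookeeper->createAncestors(zookeeper_node);

    /// Ephemeral node: the claim disappears automatically if the claiming replica
    /// dies, so another replica can redo the merge instead of waiting forever.
    auto code = zookeeper->tryCreate(zookeeper_node, "lock", zkutil::CreateMode::Ephemeral);

    /// ZNODEEXISTS means another replica has already merged (or is merging) this
    /// part, so this node should fetch the result instead of merging it again.
    return code != Coordination::Error::ZNODEEXISTS;
}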
diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 786bc056702..d9098aec1dc 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -1092,7 +1092,7 @@ void IMergeTreeDataPart::lockSharedData(const String & zookeeper_path, const Str if (id.empty()) throw Exception("Can't lock part on S3 storage", ErrorCodes::LOGICAL_ERROR); - String zookeeper_node = zookeeper_path + "/zero_copy_s3/" + id + "/" + replica_name; + String zookeeper_node = zookeeper_path + "/zero_copy_s3/shared/" + id + "/" + replica_name; LOG_TRACE(storage.log, "Set zookeeper lock {}", id); @@ -1112,7 +1112,7 @@ bool IMergeTreeDataPart::unlockSharedData(const String & zookeeper_path, const S if (id.empty()) return true; - String zookeeper_part_node = zookeeper_path + "/zero_copy_s3/" + id; + String zookeeper_part_node = zookeeper_path + "/zero_copy_s3/shared/" + id; String zookeeper_node = zookeeper_part_node + "/" + replica_name; LOG_TRACE(storage.log, "Remove zookeeper lock for {}", id); diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 6355894d59e..c8e8388028b 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -1436,6 +1436,20 @@ bool StorageReplicatedMergeTree::tryExecuteMerge(const LogEntry & entry) future_merged_part.updatePath(*this, reserved_space); future_merged_part.merge_type = entry.merge_type; + { + auto disk = reserved_space->getDisk(); + if (disk->getType() == "s3") + { + auto zookeeper = getZooKeeper(); + String zookeeper_node = zookeeper_path + "/zero_copy_s3/merged/" + entry.new_part_name; + zookeeper->createAncestors(zookeeper_node); + auto code = zookeeper->tryCreate(zookeeper_node, "lock", zkutil::CreateMode::Ephemeral); + /// Someone else created or started create this merge + if (code == Coordination::Error::ZNODEEXISTS) + return false; + } + } + auto table_id = getStorageID(); MergeList::EntryPtr merge_entry = global_context.getMergeList().insert(table_id.database_name, table_id.table_name, future_merged_part); From fb178ef2139d56775f60d5b9d6cd2401aee6dd8c Mon Sep 17 00:00:00 2001 From: Anton Ivashkin Date: Mon, 19 Oct 2020 15:20:45 +0300 Subject: [PATCH 0048/2357] Zero copy replication over S3: base tests --- S3ZeroCopyReplication.md | 2 +- .../test_s3_zero_copy_replication/__init__.py | 0 .../configs/config.d/s3.xml | 49 +++++++++++ .../test_s3_zero_copy_replication/test.py | 84 +++++++++++++++++++ 4 files changed, 134 insertions(+), 1 deletion(-) create mode 100644 tests/integration/test_s3_zero_copy_replication/__init__.py create mode 100644 tests/integration/test_s3_zero_copy_replication/configs/config.d/s3.xml create mode 100644 tests/integration/test_s3_zero_copy_replication/test.py diff --git a/S3ZeroCopyReplication.md b/S3ZeroCopyReplication.md index 0744460012a..1e152977753 100644 --- a/S3ZeroCopyReplication.md +++ b/S3ZeroCopyReplication.md @@ -1,6 +1,6 @@ # ClickHouse S3 Zero Copy Replication -Говнокод просто для теста, не production-ready ни разу. +Код просто для теста, не production-ready ни разу. 
[Ветка](https://github.com/ianton-ru/ClickHouse/tree/s3_zero_copy_replication) diff --git a/tests/integration/test_s3_zero_copy_replication/__init__.py b/tests/integration/test_s3_zero_copy_replication/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_s3_zero_copy_replication/configs/config.d/s3.xml b/tests/integration/test_s3_zero_copy_replication/configs/config.d/s3.xml new file mode 100644 index 00000000000..24a3fb95c53 --- /dev/null +++ b/tests/integration/test_s3_zero_copy_replication/configs/config.d/s3.xml @@ -0,0 +1,49 @@ + + + + + + s3 + http://minio1:9001/root/data/ + minio + minio123 + + + + + +
+ s31 +
+
+
+
+
+ + + 0 + 2 + + + + + + + node1 + 9000 + + + + + node2 + 9000 + + + + + + + test_cluster + + +
diff --git a/tests/integration/test_s3_zero_copy_replication/test.py b/tests/integration/test_s3_zero_copy_replication/test.py new file mode 100644 index 00000000000..278559f73f1 --- /dev/null +++ b/tests/integration/test_s3_zero_copy_replication/test.py @@ -0,0 +1,84 @@ +import logging +import time + +import pytest +from helpers.cluster import ClickHouseCluster + +logging.getLogger().setLevel(logging.INFO) +logging.getLogger().addHandler(logging.StreamHandler()) + + +@pytest.fixture(scope="module") +def cluster(): + try: + cluster = ClickHouseCluster(__file__) + cluster.add_instance("node1", main_configs=["configs/config.d/s3.xml"], macros={'replica': '1'}, + with_minio=True, + with_zookeeper=True) + cluster.add_instance("node2", main_configs=["configs/config.d/s3.xml"], macros={'replica': '2'}, + with_minio=True, + with_zookeeper=True) + logging.info("Starting cluster...") + cluster.start() + logging.info("Cluster started") + + yield cluster + finally: + cluster.shutdown() + + +def get_large_objects_count(cluster, size=100): + minio = cluster.minio_client + counter = 0 + for obj in minio.list_objects(cluster.minio_bucket, 'data/'): + if obj.size >= size: + counter = counter + 1 + return counter + + +@pytest.mark.parametrize( + "policy", ["s3"] +) +def test_s3_zero_copy_replication(cluster, policy): + node1 = cluster.instances["node1"] + node2 = cluster.instances["node2"] + + node1.query( + """ + CREATE TABLE s3_test ON CLUSTER test_cluster (id UInt32, value String) + ENGINE=ReplicatedMergeTree('/clickhouse/tables/s3_test', '{}') + ORDER BY id + SETTINGS storage_policy='{}' + """ + .format('{replica}', policy) + ) + + node1.query("INSERT INTO s3_test VALUES (0,'data'),(1,'data')") + assert node1.query("SELECT * FROM s3_test order by id FORMAT Values") == "(0,'data'),(1,'data')" + assert node2.query("SELECT * FROM s3_test order by id FORMAT Values") == "(0,'data'),(1,'data')" + + # Based on version 20.x - should be only one file with size 100+ (checksums.txt), used by both nodes + assert get_large_objects_count(cluster) == 1 + + node2.query("INSERT INTO s3_test VALUES (2,'data'),(3,'data')") + assert node2.query("SELECT * FROM s3_test order by id FORMAT Values") == "(0,'data'),(1,'data'),(2,'data'),(3,'data')" + assert node1.query("SELECT * FROM s3_test order by id FORMAT Values") == "(0,'data'),(1,'data'),(2,'data'),(3,'data')" + + # Based on version 20.x - two parts + assert get_large_objects_count(cluster) == 2 + + node1.query("OPTIMIZE TABLE s3_test") + + time.sleep(1) + + # Based on version 20.x - after merge, two old parts and one merged + assert get_large_objects_count(cluster) == 3 + + time.sleep(60) + + # Based on version 20.x - after cleanup - only one merged part + assert get_large_objects_count(cluster) == 1 + + node1.query("DROP TABLE IF EXISTS s3_test NO DELAY") + node2.query("DROP TABLE IF EXISTS s3_test NO DELAY") + From 652c56e74e7fcb560c535f6695845c6b16ab32a4 Mon Sep 17 00:00:00 2001 From: Anton Ivashkin Date: Thu, 22 Oct 2020 12:32:05 +0300 Subject: [PATCH 0049/2357] Fix style, fix build --- src/Disks/DiskDecorator.h | 2 +- src/Disks/IDisk.h | 2 +- src/Disks/S3/DiskS3.cpp | 2 +- src/Disks/S3/DiskS3.h | 2 +- src/Storages/MergeTree/DataPartsExchange.cpp | 12 +++++------- src/Storages/MergeTree/DataPartsExchange.h | 7 +++---- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 2 +- src/Storages/MergeTree/IMergeTreeDataPart.h | 2 +- 8 files changed, 14 insertions(+), 17 deletions(-) diff --git a/src/Disks/DiskDecorator.h b/src/Disks/DiskDecorator.h index d230d49b400..8dcdb64ead5 
100644 --- a/src/Disks/DiskDecorator.h +++ b/src/Disks/DiskDecorator.h @@ -50,7 +50,7 @@ public: void close(int fd) const override; void sync(int fd) const override; const String getType() const override { return delegate->getType(); } - const String getUniqueId(const String & path) const override { return delegate->getUniqueId(path); } + String getUniqueId(const String & path) const override { return delegate->getUniqueId(path); } bool checkFile(const String & path) const override { return delegate->checkFile(path); } Executor & getExecutor() override; diff --git a/src/Disks/IDisk.h b/src/Disks/IDisk.h index 143b094fb38..63432bc226a 100644 --- a/src/Disks/IDisk.h +++ b/src/Disks/IDisk.h @@ -196,7 +196,7 @@ public: virtual void shutdown() { } /// Return some uniq string for file, overrided for S3 - virtual const String getUniqueId(const String & path) const { return path; } + virtual String getUniqueId(const String & path) const { return path; } /// Check file, overrided for S3 only virtual bool checkFile(const String & path) const { return exists(path); } diff --git a/src/Disks/S3/DiskS3.cpp b/src/Disks/S3/DiskS3.cpp index b563c84094a..8e5e230d9db 100644 --- a/src/Disks/S3/DiskS3.cpp +++ b/src/Disks/S3/DiskS3.cpp @@ -572,7 +572,7 @@ void DiskS3::createDirectories(const String & path) Poco::File(metadata_path + path).createDirectories(); } -const String DiskS3::getUniqueId(const String & path) const +String DiskS3::getUniqueId(const String & path) const { Metadata metadata(s3_root_path, metadata_path, path); String id; diff --git a/src/Disks/S3/DiskS3.h b/src/Disks/S3/DiskS3.h index 07348c53417..cc52722f973 100644 --- a/src/Disks/S3/DiskS3.h +++ b/src/Disks/S3/DiskS3.h @@ -110,7 +110,7 @@ public: void shutdown() override; - const String getUniqueId(const String & path) const override; + String getUniqueId(const String & path) const override; bool checkFile(const String & path) const override; diff --git a/src/Storages/MergeTree/DataPartsExchange.cpp b/src/Storages/MergeTree/DataPartsExchange.cpp index c6568340620..265d855ba31 100644 --- a/src/Storages/MergeTree/DataPartsExchange.cpp +++ b/src/Storages/MergeTree/DataPartsExchange.cpp @@ -3,7 +3,6 @@ #include #include #include -#include #include #include #include @@ -269,7 +268,7 @@ void Service::sendPartS3Metadata(const MergeTreeData::DataPartPtr & part, WriteB throw Exception("Unexpected size of file " + metadata_file, ErrorCodes::BAD_SIZE_OF_FILE_IN_DATA_PART); writePODBinary(hashing_out.getHash(), out); - } + } } MergeTreeData::DataPartPtr Service::findPart(const String & name) @@ -359,7 +358,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::fetchPart( throw Exception("Got 'send_s3_metadata' cookie with old protocol version", ErrorCodes::LOGICAL_ERROR); if (!try_use_s3_copy) throw Exception("Got 'send_s3_metadata' cookie when was not requested", ErrorCodes::LOGICAL_ERROR); - + size_t sum_files_size = 0; readBinary(sum_files_size, in); IMergeTreeDataPart::TTLInfos ttl_infos; @@ -373,14 +372,14 @@ MergeTreeData::MutableDataPartPtr Fetcher::fetchPart( try { - return downloadPartToS3(part_name, replica_path, to_detached, tmp_prefix_, sync, std::move(disksS3), in); + return downloadPartToS3(part_name, replica_path, to_detached, tmp_prefix_, std::move(disksS3), in); } - catch(const Exception& e) + catch (const Exception & e) { if (e.code() != ErrorCodes::S3_ERROR) throw; /// Try again but without S3 copy - return fetchPart(metadata_snapshot, part_name, replica_path, host, port, timeouts, + return fetchPart(metadata_snapshot, part_name, 
replica_path, host, port, timeouts, user, password, interserver_scheme, to_detached, tmp_prefix_, false); } } @@ -545,7 +544,6 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToS3( const String & replica_path, bool to_detached, const String & tmp_prefix_, - bool ,//sync, const Disks & disksS3, PooledReadWriteBufferFromHTTP & in ) diff --git a/src/Storages/MergeTree/DataPartsExchange.h b/src/Storages/MergeTree/DataPartsExchange.h index 7e59e81d6dc..ac591c2046a 100644 --- a/src/Storages/MergeTree/DataPartsExchange.h +++ b/src/Storages/MergeTree/DataPartsExchange.h @@ -35,8 +35,8 @@ public: std::string getId(const std::string & node_id) const override; void processQuery(const Poco::Net::HTMLForm & params, ReadBuffer & body, WriteBuffer & out, Poco::Net::HTTPServerResponse & response) override; - void setZooKeeper(const zkutil::ZooKeeperPtr & zookeeper_, const String & zookeeper_path_, const String & replica_name_) override - { + void setZooKeeper(const zkutil::ZooKeeperPtr & zookeeper_, const String & zookeeper_path_, const String & replica_name_) override + { zookeeper = zookeeper_; zookeeper_path = zookeeper_path_; replica_name = replica_name_; @@ -87,7 +87,7 @@ public: ActionBlocker blocker; void setZooKeeper(const zkutil::ZooKeeperPtr & zookeeper_, const String & zookeeper_path_, const String & replica_name_) - { + { zookeeper = zookeeper_; zookeeper_path = zookeeper_path_; replica_name = replica_name_; @@ -114,7 +114,6 @@ private: const String & replica_path, bool to_detached, const String & tmp_prefix_, - bool sync, const Disks & disksS3, PooledReadWriteBufferFromHTTP & in); diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 93f424cf0d1..badfb32cf58 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -1106,7 +1106,7 @@ void IMergeTreeDataPart::lockSharedData(const String & zookeeper_path, const Str if (id.empty()) throw Exception("Can't lock part on S3 storage", ErrorCodes::LOGICAL_ERROR); - + String zookeeper_node = zookeeper_path + "/zero_copy_s3/shared/" + id + "/" + replica_name; LOG_TRACE(storage.log, "Set zookeeper lock {}", id); diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index 9213578c831..8d21f5856fc 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -361,7 +361,7 @@ public: /// Lock part in zookeeper for use common S3 data in several nodes void lockSharedData(const String & zookeeper_path, const String & replica_name, zkutil::ZooKeeperPtr zookeeper) const; - + /// Unlock common S3 data part in zookeeper /// Return true if data unlocked /// Return false if data is still used by another node From 478eb0b8a5df5f602651268cc396178b6adcf17e Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Thu, 22 Oct 2020 18:08:00 +0300 Subject: [PATCH 0050/2357] fix --- src/Databases/DatabaseReplicated.cpp | 206 ++++++++++++-------- src/Databases/IDatabase.h | 3 +- src/Databases/ya.make | 1 + src/Interpreters/InterpreterAlterQuery.cpp | 3 +- src/Interpreters/InterpreterCreateQuery.cpp | 10 +- src/Interpreters/InterpreterDropQuery.cpp | 19 +- src/Interpreters/InterpreterRenameQuery.cpp | 10 +- 7 files changed, 149 insertions(+), 103 deletions(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 42662d836d4..328f5476064 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ 
b/src/Databases/DatabaseReplicated.cpp @@ -1,3 +1,4 @@ +#include #include #include #include @@ -7,20 +8,15 @@ #include #include #include -#include #include +#include #include #include #include -#include - -#include namespace DB { - - namespace ErrorCodes { extern const int NO_ZOOKEEPER; @@ -60,29 +56,34 @@ DatabaseReplicated::DatabaseReplicated( , zookeeper_path(zookeeper_path_) , replica_name(replica_name_) { - if (zookeeper_path.empty() || replica_name.empty()) { + if (zookeeper_path.empty() || replica_name.empty()) + { throw Exception("ZooKeeper path and replica name must be non-empty", ErrorCodes::BAD_ARGUMENTS); } if (zookeeper_path.back() == '/') zookeeper_path.resize(zookeeper_path.size() - 1); - // If zookeeper chroot prefix is used, path should start with '/', because chroot concatenates without it. + /// If zookeeper chroot prefix is used, path should start with '/', because chroot concatenates without it. if (zookeeper_path.front() != '/') zookeeper_path = "/" + zookeeper_path; - if (context_.hasZooKeeper()) { + if (context_.hasZooKeeper()) + { current_zookeeper = context_.getZooKeeper(); } if (!current_zookeeper) { - throw Exception("Can't create replicated database without ZooKeeper", ErrorCodes::NO_ZOOKEEPER); + throw Exception("Can't create replicated database without ZooKeeper", ErrorCodes::NO_ZOOKEEPER); } - // New database - if (!current_zookeeper->exists(zookeeper_path)) { + /// New database + if (!current_zookeeper->exists(zookeeper_path)) + { createDatabaseZKNodes(); - // Old replica recovery - } else if (current_zookeeper->exists(zookeeper_path + "/replicas/" + replica_name)) { + /// Old replica recovery + } + else if (current_zookeeper->exists(zookeeper_path + "/replicas/" + replica_name)) + { String remote_last_entry = current_zookeeper->get(zookeeper_path + "/replicas/" + replica_name, {}, nullptr); String local_last_entry; @@ -93,16 +94,22 @@ DatabaseReplicated::DatabaseReplicated( } catch (const Exception &) { - // Metadata is corrupted. - // Replica erases the previous zk last executed log entry - // and behaves like a new clean replica. - writeLastExecutedToDiskAndZK(); + /// Metadata is corrupted. + /// Replica erases the previous zk last executed log entry + /// and behaves like a new clean replica. + writeLastExecutedToDiskAndZK(); } - if (!local_last_entry.empty() && local_last_entry == remote_last_entry) { + if (!local_last_entry.empty() && local_last_entry == remote_last_entry) + { last_executed_log_entry = local_last_entry; - } else { - throw Exception("Replica name might be in use by a different node. Please check replica_name parameter. Remove .last_entry file from metadata to create a new replica.", ErrorCodes::LOGICAL_ERROR); + } + else + { + throw Exception( + "Replica name might be in use by a different node. Please check replica_name parameter. 
Remove .last_entry file from " + "metadata to create a new replica.", + ErrorCodes::LOGICAL_ERROR); } } @@ -110,12 +117,15 @@ DatabaseReplicated::DatabaseReplicated( feedback_timeout = context_.getConfigRef().getInt("database_replicated_feedback_timeout", 0); LOG_DEBUG(log, "Snapshot period is set to {} log entries per one snapshot", snapshot_period); - background_log_executor = context_.getReplicatedSchedulePool().createTask(database_name + "(DatabaseReplicated::background_executor)", [this]{ runBackgroundLogExecutor();} ); + background_log_executor = context_.getReplicatedSchedulePool().createTask( + database_name + "(DatabaseReplicated::background_executor)", [this] { runBackgroundLogExecutor(); } + ); background_log_executor->scheduleAfter(500); } -void DatabaseReplicated::createDatabaseZKNodes() { +void DatabaseReplicated::createDatabaseZKNodes() +{ current_zookeeper = getZooKeeper(); current_zookeeper->createAncestors(zookeeper_path); @@ -126,31 +136,34 @@ void DatabaseReplicated::createDatabaseZKNodes() { current_zookeeper->createIfNotExists(zookeeper_path + "/replicas", String()); } -void DatabaseReplicated::RemoveOutdatedSnapshotsAndLog() { - // This method removes all snapshots and logged queries - // that no longer will be in use by current replicas or - // new coming ones. - // Each registered replica has its state in ZooKeeper. - // Therefore, snapshots and logged queries that are less - // than a least advanced replica are removed. - // It does not interfere with a new coming replica - // metadata loading from snapshot - // because the replica will use the latest snapshot available - // and this snapshot will set the last executed log query - // to a greater one than the least advanced current replica. +void DatabaseReplicated::RemoveOutdatedSnapshotsAndLog() +{ + /// This method removes all snapshots and logged queries + /// that no longer will be in use by current replicas or + /// new coming ones. + /// Each registered replica has its state in ZooKeeper. + /// Therefore, snapshots and logged queries that are less + /// than a least advanced replica are removed. + /// It does not interfere with a new coming replica + /// metadata loading from snapshot + /// because the replica will use the latest snapshot available + /// and this snapshot will set the last executed log query + /// to a greater one than the least advanced current replica. 
current_zookeeper = getZooKeeper(); Strings replica_states = current_zookeeper->getChildren(zookeeper_path + "/replicas"); auto least_advanced = std::min_element(replica_states.begin(), replica_states.end()); Strings snapshots = current_zookeeper->getChildren(zookeeper_path + "/snapshots"); - - if (snapshots.size() < 2) { + + if (snapshots.size() < 2) + { return; } std::sort(snapshots.begin(), snapshots.end()); auto still_useful = std::lower_bound(snapshots.begin(), snapshots.end(), *least_advanced); snapshots.erase(still_useful, snapshots.end()); - for (const String & snapshot : snapshots) { + for (const String & snapshot : snapshots) + { current_zookeeper->tryRemoveRecursive(zookeeper_path + "/snapshots/" + snapshot); } @@ -158,14 +171,17 @@ void DatabaseReplicated::RemoveOutdatedSnapshotsAndLog() { std::sort(log_entry_names.begin(), log_entry_names.end()); auto still_useful_log = std::upper_bound(log_entry_names.begin(), log_entry_names.end(), *still_useful); log_entry_names.erase(still_useful_log, log_entry_names.end()); - for (const String & log_entry_name : log_entry_names) { + for (const String & log_entry_name : log_entry_names) + { String log_entry_path = zookeeper_path + "/log/" + log_entry_name; current_zookeeper->tryRemove(log_entry_path); } } -void DatabaseReplicated::runBackgroundLogExecutor() { - if (last_executed_log_entry == "") { +void DatabaseReplicated::runBackgroundLogExecutor() +{ + if (last_executed_log_entry == "") + { loadMetadataFromSnapshot(); } @@ -177,7 +193,8 @@ void DatabaseReplicated::runBackgroundLogExecutor() { log_entry_names.erase(log_entry_names.begin(), newest_entry_it); - for (const String & log_entry_name : log_entry_names) { + for (const String & log_entry_name : log_entry_names) + { executeLogName(log_entry_name); last_executed_log_entry = log_entry_name; writeLastExecutedToDiskAndZK(); @@ -185,8 +202,9 @@ void DatabaseReplicated::runBackgroundLogExecutor() { int log_n = parse(log_entry_name.substr(4)); int last_log_n = parse(log_entry_names.back().substr(4)); - // The third condition gurantees at most one snapshot creation per batch - if (log_n > 0 && snapshot_period > 0 && (last_log_n - log_n) / snapshot_period == 0 && log_n % snapshot_period == 0) { + /// The third condition gurantees at most one snapshot creation per batch + if (log_n > 0 && snapshot_period > 0 && (last_log_n - log_n) / snapshot_period == 0 && log_n % snapshot_period == 0) + { createSnapshot(); } } @@ -194,9 +212,11 @@ void DatabaseReplicated::runBackgroundLogExecutor() { background_log_executor->scheduleAfter(500); } -void DatabaseReplicated::writeLastExecutedToDiskAndZK() { +void DatabaseReplicated::writeLastExecutedToDiskAndZK() +{ current_zookeeper = getZooKeeper(); - current_zookeeper->createOrUpdate(zookeeper_path + "/replicas/" + replica_name, last_executed_log_entry, zkutil::CreateMode::Persistent); + current_zookeeper->createOrUpdate( + zookeeper_path + "/replicas/" + replica_name, last_executed_log_entry, zkutil::CreateMode::Persistent); String metadata_file = getMetadataPath() + ".last_entry"; WriteBufferFromFile out(metadata_file, last_executed_log_entry.size(), O_WRONLY | O_CREAT); @@ -207,42 +227,47 @@ void DatabaseReplicated::writeLastExecutedToDiskAndZK() { out.close(); } -void DatabaseReplicated::executeLogName(const String & log_entry_name) { - String path = zookeeper_path + "/log/" + log_entry_name; - current_zookeeper = getZooKeeper(); - String query_to_execute = current_zookeeper->get(path, {}, nullptr); +void DatabaseReplicated::executeLogName(const String 
& log_entry_name) +{ + String path = zookeeper_path + "/log/" + log_entry_name; + current_zookeeper = getZooKeeper(); + String query_to_execute = current_zookeeper->get(path, {}, nullptr); - try - { - current_context = std::make_unique(global_context); - current_context->getClientInfo().query_kind = ClientInfo::QueryKind::REPLICATED_LOG_QUERY; - current_context->setCurrentDatabase(database_name); - current_context->setCurrentQueryId(""); // generate random query_id - executeQuery(query_to_execute, *current_context); - } - catch (const Exception & e) - { - tryLogCurrentException(log, "Query from zookeeper " + query_to_execute + " wasn't finished successfully"); - current_zookeeper->create(zookeeper_path + "/replicas/" + replica_name + "/errors/" + log_entry_name, e.what(), zkutil::CreateMode::Persistent); - } + try + { + current_context = std::make_unique(global_context); + current_context->getClientInfo().query_kind = ClientInfo::QueryKind::REPLICATED_LOG_QUERY; + current_context->setCurrentDatabase(database_name); + current_context->setCurrentQueryId(""); // generate random query_id + executeQuery(query_to_execute, *current_context); + } + catch (const Exception & e) + { + tryLogCurrentException(log, "Query from zookeeper " + query_to_execute + " wasn't finished successfully"); + current_zookeeper->create( + zookeeper_path + "/replicas/" + replica_name + "/errors/" + log_entry_name, e.what(), zkutil::CreateMode::Persistent); + } - LOG_DEBUG(log, "Executed query: {}", query_to_execute); + LOG_DEBUG(log, "Executed query: {}", query_to_execute); } -void DatabaseReplicated::propose(const ASTPtr & query) { +void DatabaseReplicated::propose(const ASTPtr & query) +{ current_zookeeper = getZooKeeper(); LOG_DEBUG(log, "Proposing query: {}", queryToString(query)); { std::lock_guard lock(log_name_mutex); - log_name_to_exec_with_result = current_zookeeper->create(zookeeper_path + "/log/log-", queryToString(query), zkutil::CreateMode::PersistentSequential); + log_name_to_exec_with_result + = current_zookeeper->create(zookeeper_path + "/log/log-", queryToString(query), zkutil::CreateMode::PersistentSequential); } background_log_executor->schedule(); } -BlockIO DatabaseReplicated::getFeedback() { +BlockIO DatabaseReplicated::getFeedback() +{ BlockIO res; if (feedback_timeout == 0) return res; @@ -260,39 +285,48 @@ BlockIO DatabaseReplicated::getFeedback() { Strings replica_states = current_zookeeper->getChildren(zookeeper_path + "/replicas"); auto replica_iter = replica_states.begin(); - while (!replica_states.empty() && watch.elapsedSeconds() < feedback_timeout) { + while (!replica_states.empty() && watch.elapsedSeconds() < feedback_timeout) + { String last_executed = current_zookeeper->get(zookeeper_path + "/replicas/" + *replica_iter); - if (last_executed > log_name_to_exec_with_result) { + if (last_executed > log_name_to_exec_with_result) + { replica_name_column->insert(*replica_iter); String err_path = zookeeper_path + "/replicas/" + *replica_iter + "/errors/" + log_name_to_exec_with_result; - if (!current_zookeeper->exists(err_path)) { + if (!current_zookeeper->exists(err_path)) + { feedback_column->insert("OK"); - } else { + } + else + { String feedback = current_zookeeper->get(err_path, {}, nullptr); feedback_column->insert(feedback); } - replica_states.erase(replica_iter); - replica_iter = replica_states.begin(); + replica_states.erase(replica_iter); + replica_iter = replica_states.begin(); } } Block block = Block({ {std::move(replica_name_column), block_structure[0].type, 
block_structure[0].name}, - {std::move(feedback_column), block_structure[1].type, block_structure[1].name}}); + {std::move(feedback_column), block_structure[1].type, block_structure[1].name} + }); res.in = std::make_shared(block); return res; } -void DatabaseReplicated::createSnapshot() { +void DatabaseReplicated::createSnapshot() +{ current_zookeeper = getZooKeeper(); String snapshot_path = zookeeper_path + "/snapshots/" + last_executed_log_entry; - if (Coordination::Error::ZNODEEXISTS == current_zookeeper->tryCreate(snapshot_path, String(), zkutil::CreateMode::Persistent)) { + if (Coordination::Error::ZNODEEXISTS == current_zookeeper->tryCreate(snapshot_path, String(), zkutil::CreateMode::Persistent)) + { return; } - - for (auto iterator = getTablesIterator(global_context, {}); iterator->isValid(); iterator->next()) { + + for (auto iterator = getTablesIterator(global_context, {}); iterator->isValid(); iterator->next()) + { String table_name = iterator->name(); auto query = getCreateQueryFromMetadata(getObjectMetadataPath(table_name), true); String statement = queryToString(query); @@ -303,9 +337,10 @@ void DatabaseReplicated::createSnapshot() { RemoveOutdatedSnapshotsAndLog(); } -void DatabaseReplicated::loadMetadataFromSnapshot() { - // Executes the latest snapshot. - // Used by new replicas only. +void DatabaseReplicated::loadMetadataFromSnapshot() +{ + /// Executes the latest snapshot. + /// Used by new replicas only. current_zookeeper = getZooKeeper(); Strings snapshots; @@ -313,12 +348,14 @@ void DatabaseReplicated::loadMetadataFromSnapshot() { return; auto latest_snapshot = std::max_element(snapshots.begin(), snapshots.end()); - while (snapshots.size() > 0 && !current_zookeeper->exists(zookeeper_path + "/snapshots/" + *latest_snapshot + "/.completed")) { + while (snapshots.size() > 0 && !current_zookeeper->exists(zookeeper_path + "/snapshots/" + *latest_snapshot + "/.completed")) + { snapshots.erase(latest_snapshot); latest_snapshot = std::max_element(snapshots.begin(), snapshots.end()); } - if (snapshots.size() < 1) { + if (snapshots.size() < 1) + { return; } @@ -328,7 +365,8 @@ void DatabaseReplicated::loadMetadataFromSnapshot() { LOG_DEBUG(log, "Executing {} snapshot", *latest_snapshot); - for (auto t = metadatas.begin(); t != metadatas.end(); ++t) { + for (auto t = metadatas.begin(); t != metadatas.end(); ++t) + { String path = zookeeper_path + "/snapshots/" + *latest_snapshot + "/" + *t; String query_to_execute = current_zookeeper->get(path, {}, nullptr); diff --git a/src/Databases/IDatabase.h b/src/Databases/IDatabase.h index 2fd0c62b72e..9bec6394be7 100644 --- a/src/Databases/IDatabase.h +++ b/src/Databases/IDatabase.h @@ -181,7 +181,8 @@ public: virtual bool empty() const = 0; /// Submit query to log. Currently used by DatabaseReplicated engine only. 
- virtual void propose(const ASTPtr & /*query*/) { + virtual void propose(const ASTPtr & /*query*/) + { throw Exception(getEngineName() + ": propose() is not supported", ErrorCodes::NOT_IMPLEMENTED); } diff --git a/src/Databases/ya.make b/src/Databases/ya.make index b4173057e03..4ce56859d66 100644 --- a/src/Databases/ya.make +++ b/src/Databases/ya.make @@ -15,6 +15,7 @@ SRCS( DatabaseMemory.cpp DatabaseOnDisk.cpp DatabaseOrdinary.cpp + DatabaseReplicated.cpp DatabasesCommon.cpp DatabaseWithDictionaries.cpp MySQL/ConnectionMySQLSettings.cpp diff --git a/src/Interpreters/InterpreterAlterQuery.cpp b/src/Interpreters/InterpreterAlterQuery.cpp index 0b53e84564f..e229cb120e5 100644 --- a/src/Interpreters/InterpreterAlterQuery.cpp +++ b/src/Interpreters/InterpreterAlterQuery.cpp @@ -51,7 +51,8 @@ BlockIO InterpreterAlterQuery::execute() auto metadata_snapshot = table->getInMemoryMetadataPtr(); DatabasePtr database = DatabaseCatalog::instance().getDatabase(table_id.database_name); - if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY && !table->supportsReplication()) { + if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY && !table->supportsReplication()) + { database->propose(query_ptr); auto * database_replicated = typeid_cast(database.get()); return database_replicated->getFeedback(); diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 7c809e65639..5210230859c 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -75,6 +75,7 @@ namespace ErrorCodes extern const int DICTIONARY_ALREADY_EXISTS; extern const int ILLEGAL_SYNTAX_FOR_DATA_TYPE; extern const int ILLEGAL_COLUMN; + extern const int LOGICAL_ERROR; } namespace fs = std::filesystem; @@ -713,14 +714,16 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create) /// contain the right database name for every replica /// therefore for such queries the AST database /// field is modified right before an actual execution - if (context.getClientInfo().query_kind == ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { + if (context.getClientInfo().query_kind == ClientInfo::QueryKind::REPLICATED_LOG_QUERY) + { create.database = current_database; } /// Actually creates table bool created = doCreateTable(create, properties); - if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { + if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) + { auto * database_replicated = typeid_cast(database.get()); return database_replicated->getFeedback(); } @@ -786,7 +789,8 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, return true; } - if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { + if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) + { database->propose(query_ptr); return true; } diff --git a/src/Interpreters/InterpreterDropQuery.cpp b/src/Interpreters/InterpreterDropQuery.cpp index 455b40c30e3..393f4ef3dc9 100644 --- a/src/Interpreters/InterpreterDropQuery.cpp +++ b/src/Interpreters/InterpreterDropQuery.cpp @@ -101,11 +101,10 @@ BlockIO 
InterpreterDropQuery::executeToTable( if (database->getEngineName() != "Atomic" && database->getEngineName() != "Replicated") table_lock = table->lockExclusively(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); /// Drop table from memory, don't touch data and metadata - if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { + if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) database->propose(query_ptr); - } else { + else database->detachTable(table_id.table_name); - } } else if (query.kind == ASTDropQuery::Kind::Truncate) { @@ -115,11 +114,10 @@ BlockIO InterpreterDropQuery::executeToTable( auto table_lock = table->lockExclusively(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); auto metadata_snapshot = table->getInMemoryMetadataPtr(); /// Drop table data, don't touch metadata - if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { + if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) database->propose(query_ptr); - } else { + else table->truncate(query_ptr, metadata_snapshot, context, table_lock); - } } else if (query.kind == ASTDropQuery::Kind::Drop) { @@ -132,12 +130,11 @@ BlockIO InterpreterDropQuery::executeToTable( if (database->getEngineName() != "Atomic" && database->getEngineName() != "Replicated") table_lock = table->lockExclusively(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); - // Prevents recursive drop from drop database query. The original query must specify a table. - if (!query_ptr->as().table.empty() && database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { + /// Prevents recursive drop from drop database query. The original query must specify a table. 
+ if (!query_ptr->as().table.empty() && database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) database->propose(query_ptr); - } else { + else database->dropTable(context, table_id.table_name, query.no_delay); - } } } @@ -154,7 +151,7 @@ BlockIO InterpreterDropQuery::executeToTable( } } - if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) + if (database && database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { auto * database_replicated = typeid_cast(database.get()); return database_replicated->getFeedback(); diff --git a/src/Interpreters/InterpreterRenameQuery.cpp b/src/Interpreters/InterpreterRenameQuery.cpp index 3d8855b6458..65ed33bd9db 100644 --- a/src/Interpreters/InterpreterRenameQuery.cpp +++ b/src/Interpreters/InterpreterRenameQuery.cpp @@ -75,9 +75,12 @@ BlockIO InterpreterRenameQuery::executeToTables(const ASTRenameQuery & rename, c database_catalog.assertTableDoesntExist(StorageID(elem.to_database_name, elem.to_table_name), context); DatabasePtr database = database_catalog.getDatabase(elem.from_database_name); - if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { + if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) + { database->propose(query_ptr); - } else { + } + else + { database->renameTable( context, elem.from_table_name, @@ -88,7 +91,8 @@ BlockIO InterpreterRenameQuery::executeToTables(const ASTRenameQuery & rename, c } // TODO it can't work - if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { + if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) + { auto * database_replicated = typeid_cast(database.get()); return database_replicated->getFeedback(); } From 1ffe0b1d03db9fedafe4918489b4ca5598553480 Mon Sep 17 00:00:00 2001 From: Anton Ivashkin Date: Fri, 23 Oct 2020 13:01:40 +0300 Subject: [PATCH 0051/2357] S3 zero copy replication: fix tests --- .../configs/config.d/storage_conf.xml | 1 + .../__init__.py | 0 .../configs/config.d/storage_conf.xml | 50 +++++++++ .../test.py | 105 ++++++++++++++++++ .../configs/config.d/s3.xml | 1 + .../test_s3_zero_copy_replication/test.py | 2 + 6 files changed, 159 insertions(+) create mode 100644 tests/integration/test_replicated_merge_tree_s3_zero_copy/__init__.py create mode 100644 tests/integration/test_replicated_merge_tree_s3_zero_copy/configs/config.d/storage_conf.xml create mode 100644 tests/integration/test_replicated_merge_tree_s3_zero_copy/test.py diff --git a/tests/integration/test_replicated_merge_tree_s3/configs/config.d/storage_conf.xml b/tests/integration/test_replicated_merge_tree_s3/configs/config.d/storage_conf.xml index 20b750ffff3..1f75a4efeae 100644 --- a/tests/integration/test_replicated_merge_tree_s3/configs/config.d/storage_conf.xml +++ b/tests/integration/test_replicated_merge_tree_s3/configs/config.d/storage_conf.xml @@ -21,6 +21,7 @@ 0 + 0 diff --git a/tests/integration/test_replicated_merge_tree_s3_zero_copy/__init__.py b/tests/integration/test_replicated_merge_tree_s3_zero_copy/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git 
a/tests/integration/test_replicated_merge_tree_s3_zero_copy/configs/config.d/storage_conf.xml b/tests/integration/test_replicated_merge_tree_s3_zero_copy/configs/config.d/storage_conf.xml new file mode 100644 index 00000000000..d8c7f49fc49 --- /dev/null +++ b/tests/integration/test_replicated_merge_tree_s3_zero_copy/configs/config.d/storage_conf.xml @@ -0,0 +1,50 @@ + + + + + s3 + http://minio1:9001/root/data/ + minio + minio123 + + + + + +
+ s3 +
+
+
+
+
+ + + 0 + 1 + + + + + + + node1 + 9000 + + + node2 + 9000 + + + node3 + 9000 + + + + + + + 0 + + +
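[Editor's note] The angle-bracket markup of the storage_conf.xml above was stripped when this patch was captured, so only the values survive: the minio endpoint and credentials, a cluster of node1/node2/node3 on port 9000, and a few numeric settings. For orientation only, a minimal ClickHouse S3 storage configuration carrying those values would look roughly like the sketch below; the tag layout is an assumption rather than the committed file, and the numeric merge_tree settings are omitted because their tag names cannot be recovered from the garbled text.

```xml
<yandex>
    <storage_configuration>
        <disks>
            <s3>
                <type>s3</type>
                <endpoint>http://minio1:9001/root/data/</endpoint>
                <access_key_id>minio</access_key_id>
                <secret_access_key>minio123</secret_access_key>
            </s3>
        </disks>
        <policies>
            <s3>
                <volumes>
                    <main>
                        <disk>s3</disk>
                    </main>
                </volumes>
            </s3>
        </policies>
    </storage_configuration>

    <remote_servers>
        <cluster>
            <shard>
                <replica><host>node1</host><port>9000</port></replica>
                <replica><host>node2</host><port>9000</port></replica>
                <replica><host>node3</host><port>9000</port></replica>
            </shard>
        </cluster>
    </remote_servers>
</yandex>
```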
diff --git a/tests/integration/test_replicated_merge_tree_s3_zero_copy/test.py b/tests/integration/test_replicated_merge_tree_s3_zero_copy/test.py new file mode 100644 index 00000000000..793abc53566 --- /dev/null +++ b/tests/integration/test_replicated_merge_tree_s3_zero_copy/test.py @@ -0,0 +1,105 @@ +import logging +import random +import string + +import pytest +from helpers.cluster import ClickHouseCluster + +logging.getLogger().setLevel(logging.INFO) +logging.getLogger().addHandler(logging.StreamHandler()) + + +@pytest.fixture(scope="module") +def cluster(): + try: + cluster = ClickHouseCluster(__file__) + + cluster.add_instance("node1", main_configs=["configs/config.d/storage_conf.xml"], macros={'replica': '1'}, + with_minio=True, with_zookeeper=True) + cluster.add_instance("node2", main_configs=["configs/config.d/storage_conf.xml"], macros={'replica': '2'}, + with_zookeeper=True) + cluster.add_instance("node3", main_configs=["configs/config.d/storage_conf.xml"], macros={'replica': '3'}, + with_zookeeper=True) + + logging.info("Starting cluster...") + cluster.start() + logging.info("Cluster started") + + yield cluster + finally: + cluster.shutdown() + + +FILES_OVERHEAD = 1 +FILES_OVERHEAD_PER_COLUMN = 2 # Data and mark files +FILES_OVERHEAD_PER_PART_WIDE = FILES_OVERHEAD_PER_COLUMN * 3 + 2 + 6 + 1 +FILES_OVERHEAD_PER_PART_COMPACT = 10 + 1 + + +def random_string(length): + letters = string.ascii_letters + return ''.join(random.choice(letters) for i in range(length)) + + +def generate_values(date_str, count, sign=1): + data = [[date_str, sign * (i + 1), random_string(10)] for i in range(count)] + data.sort(key=lambda tup: tup[1]) + return ",".join(["('{}',{},'{}')".format(x, y, z) for x, y, z in data]) + + +def create_table(cluster, additional_settings=None): + create_table_statement = """ + CREATE TABLE s3_test ON CLUSTER cluster( + dt Date, + id Int64, + data String, + INDEX min_max (id) TYPE minmax GRANULARITY 3 + ) ENGINE=ReplicatedMergeTree() + PARTITION BY dt + ORDER BY (dt, id) + SETTINGS storage_policy='s3' + """ + if additional_settings: + create_table_statement += "," + create_table_statement += additional_settings + + list(cluster.instances.values())[0].query(create_table_statement) + + +@pytest.fixture(autouse=True) +def drop_table(cluster): + yield + for node in list(cluster.instances.values()): + node.query("DROP TABLE IF EXISTS s3_test") + + minio = cluster.minio_client + # Remove extra objects to prevent tests cascade failing + for obj in list(minio.list_objects(cluster.minio_bucket, 'data/')): + minio.remove_object(cluster.minio_bucket, obj.object_name) + +@pytest.mark.parametrize( + "min_rows_for_wide_part,files_per_part", + [ + (0, FILES_OVERHEAD_PER_PART_WIDE), + (8192, FILES_OVERHEAD_PER_PART_COMPACT) + ] +) +def test_insert_select_replicated(cluster, min_rows_for_wide_part, files_per_part): + create_table(cluster, additional_settings="min_rows_for_wide_part={}".format(min_rows_for_wide_part)) + + all_values = "" + for node_idx in range(1, 4): + node = cluster.instances["node" + str(node_idx)] + values = generate_values("2020-01-0" + str(node_idx), 4096) + node.query("INSERT INTO s3_test VALUES {}".format(values), settings={"insert_quorum": 3}) + if node_idx != 1: + all_values += "," + all_values += values + + for node_idx in range(1, 4): + node = cluster.instances["node" + str(node_idx)] + assert node.query("SELECT * FROM s3_test order by dt, id FORMAT Values", + settings={"select_sequential_consistency": 1}) == all_values + + minio = cluster.minio_client + assert 
len(list(minio.list_objects(cluster.minio_bucket, 'data/'))) == (3 * FILES_OVERHEAD) + (files_per_part * 3) diff --git a/tests/integration/test_s3_zero_copy_replication/configs/config.d/s3.xml b/tests/integration/test_s3_zero_copy_replication/configs/config.d/s3.xml index 24a3fb95c53..285ade3f727 100644 --- a/tests/integration/test_s3_zero_copy_replication/configs/config.d/s3.xml +++ b/tests/integration/test_s3_zero_copy_replication/configs/config.d/s3.xml @@ -23,6 +23,7 @@ 0 2 + 1 diff --git a/tests/integration/test_s3_zero_copy_replication/test.py b/tests/integration/test_s3_zero_copy_replication/test.py index 278559f73f1..88d038e357b 100644 --- a/tests/integration/test_s3_zero_copy_replication/test.py +++ b/tests/integration/test_s3_zero_copy_replication/test.py @@ -54,6 +54,7 @@ def test_s3_zero_copy_replication(cluster, policy): ) node1.query("INSERT INTO s3_test VALUES (0,'data'),(1,'data')") + time.sleep(1) assert node1.query("SELECT * FROM s3_test order by id FORMAT Values") == "(0,'data'),(1,'data')" assert node2.query("SELECT * FROM s3_test order by id FORMAT Values") == "(0,'data'),(1,'data')" @@ -61,6 +62,7 @@ def test_s3_zero_copy_replication(cluster, policy): assert get_large_objects_count(cluster) == 1 node2.query("INSERT INTO s3_test VALUES (2,'data'),(3,'data')") + time.sleep(1) assert node2.query("SELECT * FROM s3_test order by id FORMAT Values") == "(0,'data'),(1,'data'),(2,'data'),(3,'data')" assert node1.query("SELECT * FROM s3_test order by id FORMAT Values") == "(0,'data'),(1,'data'),(2,'data'),(3,'data')" From e3879afa69672d28686b591dc3b088d1bf451b7a Mon Sep 17 00:00:00 2001 From: Anton Ivashkin Date: Fri, 23 Oct 2020 15:01:50 +0300 Subject: [PATCH 0052/2357] S3 zero copy replication: fix virtual method default parameter --- src/Disks/DiskCacheWrapper.cpp | 32 +++++++++++++------ src/Disks/DiskCacheWrapper.h | 8 +++-- src/Disks/DiskDecorator.cpp | 22 +++++++++---- src/Disks/DiskDecorator.h | 8 +++-- src/Disks/DiskLocal.cpp | 6 ++-- src/Disks/DiskLocal.h | 6 ++-- src/Disks/DiskMemory.cpp | 6 ++-- src/Disks/DiskMemory.h | 6 ++-- src/Disks/IDisk.h | 19 ++++++++--- src/Disks/S3/DiskS3.cpp | 8 ++--- src/Disks/S3/DiskS3.h | 10 ++++-- src/Storages/MergeTree/DataPartsExchange.cpp | 32 +++++++++++-------- src/Storages/MergeTree/DataPartsExchange.h | 2 +- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 16 +++++----- src/Storages/MergeTree/MergeTreeSettings.h | 1 + 15 files changed, 114 insertions(+), 68 deletions(-) diff --git a/src/Disks/DiskCacheWrapper.cpp b/src/Disks/DiskCacheWrapper.cpp index 79e615d3609..8e0f77eed6d 100644 --- a/src/Disks/DiskCacheWrapper.cpp +++ b/src/Disks/DiskCacheWrapper.cpp @@ -199,11 +199,11 @@ DiskCacheWrapper::writeFile(const String & path, size_t buf_size, WriteMode mode buf_size); } -void DiskCacheWrapper::clearDirectory(const String & path, bool keep_s3) +void DiskCacheWrapper::clearDirectory(const String & path) { if (cache_disk->exists(path)) - cache_disk->clearDirectory(path, keep_s3); - DiskDecorator::clearDirectory(path, keep_s3); + cache_disk->clearDirectory(path); + DiskDecorator::clearDirectory(path); } void DiskCacheWrapper::moveDirectory(const String & from_path, const String & to_path) @@ -252,18 +252,32 @@ void DiskCacheWrapper::copyFile(const String & from_path, const String & to_path DiskDecorator::copyFile(from_path, to_path); } -void DiskCacheWrapper::remove(const String & path, bool keep_s3) +void DiskCacheWrapper::remove(const String & path) { if (cache_disk->exists(path)) - cache_disk->remove(path, keep_s3); - 
DiskDecorator::remove(path, keep_s3); + cache_disk->remove(path); + DiskDecorator::remove(path); } -void DiskCacheWrapper::removeRecursive(const String & path, bool keep_s3) +void DiskCacheWrapper::removeRecursive(const String & path) { if (cache_disk->exists(path)) - cache_disk->removeRecursive(path, keep_s3); - DiskDecorator::removeRecursive(path, keep_s3); + cache_disk->removeRecursive(path); + DiskDecorator::removeRecursive(path); +} + +void DiskCacheWrapper::removeShared(const String & path, bool keep_s3) +{ + if (cache_disk->exists(path)) + cache_disk->removeShared(path, keep_s3); + DiskDecorator::removeShared(path, keep_s3); +} + +void DiskCacheWrapper::removeSharedRecursive(const String & path, bool keep_s3) +{ + if (cache_disk->exists(path)) + cache_disk->removeSharedRecursive(path, keep_s3); + DiskDecorator::removeSharedRecursive(path, keep_s3); } void DiskCacheWrapper::createHardLink(const String & src_path, const String & dst_path) diff --git a/src/Disks/DiskCacheWrapper.h b/src/Disks/DiskCacheWrapper.h index 9fca4e02e34..6722d5bd1a5 100644 --- a/src/Disks/DiskCacheWrapper.h +++ b/src/Disks/DiskCacheWrapper.h @@ -28,7 +28,7 @@ public: std::function cache_file_predicate_); void createDirectory(const String & path) override; void createDirectories(const String & path) override; - void clearDirectory(const String & path, bool keep_s3 = false) override; + void clearDirectory(const String & path) override; void moveDirectory(const String & from_path, const String & to_path) override; void moveFile(const String & from_path, const String & to_path) override; void replaceFile(const String & from_path, const String & to_path) override; @@ -37,8 +37,10 @@ public: readFile(const String & path, size_t buf_size, size_t estimated_size, size_t aio_threshold, size_t mmap_threshold) const override; std::unique_ptr writeFile(const String & path, size_t buf_size, WriteMode mode, size_t estimated_size, size_t aio_threshold) override; - void remove(const String & path, bool keep_s3 = false) override; - void removeRecursive(const String & path, bool keep_s3 = false) override; + void remove(const String & path) override; + void removeRecursive(const String & path) override; + void removeShared(const String & path, bool keep_s3) override; + void removeSharedRecursive(const String & path, bool keep_s3) override; void createHardLink(const String & src_path, const String & dst_path) override; ReservationPtr reserve(UInt64 bytes) override; diff --git a/src/Disks/DiskDecorator.cpp b/src/Disks/DiskDecorator.cpp index 4ad71a67f95..e7a5beeaff1 100644 --- a/src/Disks/DiskDecorator.cpp +++ b/src/Disks/DiskDecorator.cpp @@ -73,9 +73,9 @@ void DiskDecorator::createDirectories(const String & path) delegate->createDirectories(path); } -void DiskDecorator::clearDirectory(const String & path, bool keep_s3) +void DiskDecorator::clearDirectory(const String & path) { - delegate->clearDirectory(path, keep_s3); + delegate->clearDirectory(path); } void DiskDecorator::moveDirectory(const String & from_path, const String & to_path) @@ -130,14 +130,24 @@ DiskDecorator::writeFile(const String & path, size_t buf_size, WriteMode mode, s return delegate->writeFile(path, buf_size, mode, estimated_size, aio_threshold); } -void DiskDecorator::remove(const String & path, bool keep_s3) +void DiskDecorator::remove(const String & path) { - delegate->remove(path, keep_s3); + delegate->remove(path); } -void DiskDecorator::removeRecursive(const String & path, bool keep_s3) +void DiskDecorator::removeRecursive(const String & path) { - 
delegate->removeRecursive(path, keep_s3); + delegate->removeRecursive(path); +} + +void DiskDecorator::removeShared(const String & path, bool keep_s3) +{ + delegate->removeShared(path, keep_s3); +} + +void DiskDecorator::removeSharedRecursive(const String & path, bool keep_s3) +{ + delegate->removeSharedRecursive(path, keep_s3); } void DiskDecorator::setLastModified(const String & path, const Poco::Timestamp & timestamp) diff --git a/src/Disks/DiskDecorator.h b/src/Disks/DiskDecorator.h index 8dcdb64ead5..4bc7879ffd3 100644 --- a/src/Disks/DiskDecorator.h +++ b/src/Disks/DiskDecorator.h @@ -26,7 +26,7 @@ public: size_t getFileSize(const String & path) const override; void createDirectory(const String & path) override; void createDirectories(const String & path) override; - void clearDirectory(const String & path, bool keep_s3 = false) override; + void clearDirectory(const String & path) override; void moveDirectory(const String & from_path, const String & to_path) override; DiskDirectoryIteratorPtr iterateDirectory(const String & path) override; void createFile(const String & path) override; @@ -39,8 +39,10 @@ public: readFile(const String & path, size_t buf_size, size_t estimated_size, size_t aio_threshold, size_t mmap_threshold) const override; std::unique_ptr writeFile(const String & path, size_t buf_size, WriteMode mode, size_t estimated_size, size_t aio_threshold) override; - void remove(const String & path, bool keep_s3 = false) override; - void removeRecursive(const String & path, bool keep_s3 = false) override; + void remove(const String & path) override; + void removeRecursive(const String & path) override; + void removeShared(const String & path, bool keep_s3) override; + void removeSharedRecursive(const String & path, bool keep_s3) override; void setLastModified(const String & path, const Poco::Timestamp & timestamp) override; Poco::Timestamp getLastModified(const String & path) override; void setReadOnly(const String & path) override; diff --git a/src/Disks/DiskLocal.cpp b/src/Disks/DiskLocal.cpp index ad85fdf4236..a09ab7c5ac5 100644 --- a/src/Disks/DiskLocal.cpp +++ b/src/Disks/DiskLocal.cpp @@ -180,7 +180,7 @@ void DiskLocal::createDirectories(const String & path) Poco::File(disk_path + path).createDirectories(); } -void DiskLocal::clearDirectory(const String & path, bool) +void DiskLocal::clearDirectory(const String & path) { std::vector files; Poco::File(disk_path + path).list(files); @@ -236,12 +236,12 @@ DiskLocal::writeFile(const String & path, size_t buf_size, WriteMode mode, size_ return createWriteBufferFromFileBase(disk_path + path, estimated_size, aio_threshold, buf_size, flags); } -void DiskLocal::remove(const String & path, bool) +void DiskLocal::remove(const String & path) { Poco::File(disk_path + path).remove(false); } -void DiskLocal::removeRecursive(const String & path, bool) +void DiskLocal::removeRecursive(const String & path) { Poco::File(disk_path + path).remove(true); } diff --git a/src/Disks/DiskLocal.h b/src/Disks/DiskLocal.h index 18e6d072874..762a8502faa 100644 --- a/src/Disks/DiskLocal.h +++ b/src/Disks/DiskLocal.h @@ -55,7 +55,7 @@ public: void createDirectories(const String & path) override; - void clearDirectory(const String & path, bool keep_s3 = false) override; + void clearDirectory(const String & path) override; void moveDirectory(const String & from_path, const String & to_path) override; @@ -87,9 +87,9 @@ public: size_t estimated_size, size_t aio_threshold) override; - void remove(const String & path, bool keep_s3 = false) override; + void 
remove(const String & path) override; - void removeRecursive(const String & path, bool keep_s3 = false) override; + void removeRecursive(const String & path) override; void setLastModified(const String & path, const Poco::Timestamp & timestamp) override; diff --git a/src/Disks/DiskMemory.cpp b/src/Disks/DiskMemory.cpp index fc375707feb..d185263d48c 100644 --- a/src/Disks/DiskMemory.cpp +++ b/src/Disks/DiskMemory.cpp @@ -233,7 +233,7 @@ void DiskMemory::createDirectoriesImpl(const String & path) files.emplace(path, FileData{FileType::Directory}); } -void DiskMemory::clearDirectory(const String & path, bool) +void DiskMemory::clearDirectory(const String & path) { std::lock_guard lock(mutex); @@ -348,7 +348,7 @@ std::unique_ptr DiskMemory::writeFile(const String & pa return std::make_unique(this, path, mode, buf_size); } -void DiskMemory::remove(const String & path, bool) +void DiskMemory::remove(const String & path) { std::lock_guard lock(mutex); @@ -368,7 +368,7 @@ void DiskMemory::remove(const String & path, bool) } } -void DiskMemory::removeRecursive(const String & path, bool) +void DiskMemory::removeRecursive(const String & path) { std::lock_guard lock(mutex); diff --git a/src/Disks/DiskMemory.h b/src/Disks/DiskMemory.h index e75d9bff100..4d4b947098b 100644 --- a/src/Disks/DiskMemory.h +++ b/src/Disks/DiskMemory.h @@ -48,7 +48,7 @@ public: void createDirectories(const String & path) override; - void clearDirectory(const String & path, bool keep_s3 = false) override; + void clearDirectory(const String & path) override; void moveDirectory(const String & from_path, const String & to_path) override; @@ -78,9 +78,9 @@ public: size_t estimated_size, size_t aio_threshold) override; - void remove(const String & path, bool keep_s3 = false) override; + void remove(const String & path) override; - void removeRecursive(const String & path, bool keep_s3 = false) override; + void removeRecursive(const String & path) override; void setLastModified(const String &, const Poco::Timestamp &) override {} diff --git a/src/Disks/IDisk.h b/src/Disks/IDisk.h index 63432bc226a..915c6da5a21 100644 --- a/src/Disks/IDisk.h +++ b/src/Disks/IDisk.h @@ -105,7 +105,7 @@ public: virtual void createDirectories(const String & path) = 0; /// Remove all files from the directory. Directories are not removed. - virtual void clearDirectory(const String & path, bool keep_s3 = false) = 0; + virtual void clearDirectory(const String & path) = 0; /// Move directory from `from_path` to `to_path`. virtual void moveDirectory(const String & from_path, const String & to_path) = 0; @@ -153,18 +153,27 @@ public: size_t aio_threshold = 0) = 0; /// Remove file or directory. Throws exception if file doesn't exists or if directory is not empty. - virtual void remove(const String & path, bool keep_s3 = false) = 0; + virtual void remove(const String & path) = 0; /// Remove file or directory with all children. Use with extra caution. Throws exception if file doesn't exists. - virtual void removeRecursive(const String & path, bool keep_s3 = false) = 0; + virtual void removeRecursive(const String & path) = 0; /// Remove file or directory if it exists. - void removeIfExists(const String & path, bool keep_s3 = false) + void removeIfExists(const String & path) { if (exists(path)) - remove(path, keep_s3); + remove(path); } + /// Remove file or directory. Throws exception if file doesn't exists or if directory is not empty. + virtual void removeShared(const String & path, bool) { remove(path); } + + /// Remove file or directory with all children. 
Use with extra caution. Throws exception if file doesn't exists. + virtual void removeSharedRecursive(const String & path, bool) { removeRecursive(path); } + + /// Remove file or directory if it exists. + void removeSharedIfExists(const String & path, bool) { removeIfExists(path); } + /// Set last modified time to file or directory at `path`. virtual void setLastModified(const String & path, const Poco::Timestamp & timestamp) = 0; diff --git a/src/Disks/S3/DiskS3.cpp b/src/Disks/S3/DiskS3.cpp index 8e5e230d9db..7334a5b8a9b 100644 --- a/src/Disks/S3/DiskS3.cpp +++ b/src/Disks/S3/DiskS3.cpp @@ -586,11 +586,11 @@ DiskDirectoryIteratorPtr DiskS3::iterateDirectory(const String & path) return std::make_unique(metadata_path + path, path); } -void DiskS3::clearDirectory(const String & path, bool keep_s3) +void DiskS3::clearDirectory(const String & path) { for (auto it{iterateDirectory(path)}; it->isValid(); it->next()) if (isFile(it->path())) - remove(it->path(), keep_s3); + remove(it->path()); } void DiskS3::moveFile(const String & from_path, const String & to_path) @@ -744,7 +744,7 @@ void DiskS3::removeAws(const AwsS3KeyKeeper & keys) } } -void DiskS3::remove(const String & path, bool keep_s3) +void DiskS3::removeShared(const String & path, bool keep_s3) { AwsS3KeyKeeper keys; removeMeta(path, keys); @@ -752,7 +752,7 @@ void DiskS3::remove(const String & path, bool keep_s3) removeAws(keys); } -void DiskS3::removeRecursive(const String & path, bool keep_s3) +void DiskS3::removeSharedRecursive(const String & path, bool keep_s3) { AwsS3KeyKeeper keys; removeMetaRecursive(path, keys); diff --git a/src/Disks/S3/DiskS3.h b/src/Disks/S3/DiskS3.h index cc52722f973..80752fa8253 100644 --- a/src/Disks/S3/DiskS3.h +++ b/src/Disks/S3/DiskS3.h @@ -60,7 +60,7 @@ public: void createDirectories(const String & path) override; - void clearDirectory(const String & path, bool keep_s3 = false) override; + void clearDirectory(const String & path) override; void moveDirectory(const String & from_path, const String & to_path) override { moveFile(from_path, to_path); } @@ -88,9 +88,13 @@ public: size_t estimated_size, size_t aio_threshold) override; - void remove(const String & path, bool keep_s3 = false) override; + void remove(const String & path) override { removeShared(path, false); } - void removeRecursive(const String & path, bool keep_s3 = false) override; + void removeRecursive(const String & path) override { removeSharedRecursive(path, false); } + + void removeShared(const String & path, bool keep_s3) override; + + void removeSharedRecursive(const String & path, bool keep_s3) override; void createHardLink(const String & src_path, const String & dst_path) override; diff --git a/src/Storages/MergeTree/DataPartsExchange.cpp b/src/Storages/MergeTree/DataPartsExchange.cpp index 265d855ba31..d2bd3c21173 100644 --- a/src/Storages/MergeTree/DataPartsExchange.cpp +++ b/src/Storages/MergeTree/DataPartsExchange.cpp @@ -124,7 +124,8 @@ void Service::processQuery(const Poco::Net::HTMLForm & params, ReadBuffer & /*bo { bool try_use_s3_copy = false; - if (client_protocol_version >= REPLICATION_PROTOCOL_VERSION_WITH_PARTS_S3_COPY) + if (data_settings->allow_s3_zero_copy_replication + && client_protocol_version >= REPLICATION_PROTOCOL_VERSION_WITH_PARTS_S3_COPY) { /// if source and destination are in the same S3 storage we try to use S3 CopyObject request first int send_s3_metadata = parse(params.get("send_s3_metadata", "0")); if (send_s3_metadata == 1) @@ -316,12 +317,15 @@ MergeTreeData::MutableDataPartPtr 
Fetcher::fetchPart( {"compress", "false"} }); - Disks disksS3; + Disks disks_s3; + + if (!data_settings->allow_s3_zero_copy_replication) + try_use_s3_copy = false; if (try_use_s3_copy) { - disksS3 = data.getDisksByType("s3"); - if (disksS3.empty()) + disks_s3 = data.getDisksByType("s3"); + if (disks_s3.empty()) try_use_s3_copy = false; } @@ -372,7 +376,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::fetchPart( try { - return downloadPartToS3(part_name, replica_path, to_detached, tmp_prefix_, std::move(disksS3), in); + return downloadPartToS3(part_name, replica_path, to_detached, tmp_prefix_, std::move(disks_s3), in); } catch (const Exception & e) { @@ -544,14 +548,14 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToS3( const String & replica_path, bool to_detached, const String & tmp_prefix_, - const Disks & disksS3, + const Disks & disks_s3, PooledReadWriteBufferFromHTTP & in ) { - if (disksS3.empty()) + if (disks_s3.empty()) throw Exception("No S3 disks anymore", ErrorCodes::LOGICAL_ERROR); - auto disk = disksS3[0]; + auto disk = disks_s3[0]; static const String TMP_PREFIX = "tmp_fetch_"; String tmp_prefix = tmp_prefix_.empty() ? TMP_PREFIX : tmp_prefix_; @@ -595,7 +599,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToS3( /// NOTE The is_cancelled flag also makes sense to check every time you read over the network, /// performing a poll with a not very large timeout. /// And now we check it only between read chunks (in the `copyData` function). - disk->removeRecursive(part_download_path, true); + disk->removeSharedRecursive(part_download_path, true); throw Exception("Fetching of part was cancelled", ErrorCodes::ABORTED); } @@ -618,15 +622,15 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToS3( size_t disk_id = 1; while (true) { - if (disk_id >= disksS3.size()) + if (disk_id >= disks_s3.size()) { /// No more S3 disks - disk->removeRecursive(part_download_path, true); + disk->removeSharedRecursive(part_download_path, true); /// After catch this exception replication continues with full data copy throw Exception("Can't find S3 drive for shared data", ErrorCodes::S3_ERROR); } /// Try next S3 disk - auto next_disk = disksS3[disk_id]; + auto next_disk = disks_s3[disk_id]; auto next_volume = std::make_shared("volume_" + part_name, next_disk); MergeTreeData::MutableDataPartPtr next_new_data_part = data.createPart(part_name, next_volume, part_relative_path); @@ -638,7 +642,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToS3( metadata.copyTo(next_metadata_file); if (next_disk->checkFile(next_data_path)) { /// Right disk found - disk->removeRecursive(part_download_path, true); + disk->removeSharedRecursive(part_download_path, true); disk = next_disk; volume = next_volume; data_path = next_data_path; @@ -647,7 +651,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToS3( } /// Wrong disk again - next_disk->removeRecursive(part_download_path, true); + next_disk->removeSharedRecursive(part_download_path, true); ++disk_id; } } diff --git a/src/Storages/MergeTree/DataPartsExchange.h b/src/Storages/MergeTree/DataPartsExchange.h index ac591c2046a..91edc3ba6d4 100644 --- a/src/Storages/MergeTree/DataPartsExchange.h +++ b/src/Storages/MergeTree/DataPartsExchange.h @@ -114,7 +114,7 @@ private: const String & replica_path, bool to_detached, const String & tmp_prefix_, - const Disks & disksS3, + const Disks & disks_s3, PooledReadWriteBufferFromHTTP & in); MergeTreeData & data; diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp 
b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index badfb32cf58..be2f88e74e5 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -851,7 +851,7 @@ void IMergeTreeDataPart::remove(bool keep_s3) const try { - volume->getDisk()->removeRecursive(to + "/", keep_s3); + volume->getDisk()->removeSharedRecursive(to + "/", keep_s3); } catch (...) { @@ -874,7 +874,7 @@ void IMergeTreeDataPart::remove(bool keep_s3) const if (checksums.empty()) { /// If the part is not completely written, we cannot use fast path by listing files. - volume->getDisk()->removeRecursive(to + "/", keep_s3); + volume->getDisk()->removeSharedRecursive(to + "/", keep_s3); } else { @@ -887,18 +887,18 @@ void IMergeTreeDataPart::remove(bool keep_s3) const # pragma GCC diagnostic ignored "-Wunused-variable" #endif for (const auto & [file, _] : checksums.files) - volume->getDisk()->remove(to + "/" + file, keep_s3); + volume->getDisk()->removeShared(to + "/" + file, keep_s3); #if !__clang__ # pragma GCC diagnostic pop #endif for (const auto & file : {"checksums.txt", "columns.txt"}) - volume->getDisk()->remove(to + "/" + file, keep_s3); + volume->getDisk()->removeShared(to + "/" + file, keep_s3); - volume->getDisk()->removeIfExists(to + "/" + DEFAULT_COMPRESSION_CODEC_FILE_NAME, keep_s3); - volume->getDisk()->removeIfExists(to + "/" + DELETE_ON_DESTROY_MARKER_FILE_NAME, keep_s3); + volume->getDisk()->removeSharedIfExists(to + "/" + DEFAULT_COMPRESSION_CODEC_FILE_NAME, keep_s3); + volume->getDisk()->removeSharedIfExists(to + "/" + DELETE_ON_DESTROY_MARKER_FILE_NAME, keep_s3); - volume->getDisk()->remove(to, keep_s3); + volume->getDisk()->removeShared(to, keep_s3); } catch (...) { @@ -906,7 +906,7 @@ void IMergeTreeDataPart::remove(bool keep_s3) const LOG_ERROR(storage.log, "Cannot quickly remove directory {} by removing files; fallback to recursive removal. Reason: {}", fullPath(volume->getDisk(), to), getCurrentExceptionMessage(false)); - volume->getDisk()->removeRecursive(to + "/", keep_s3); + volume->getDisk()->removeSharedRecursive(to + "/", keep_s3); } } } diff --git a/src/Storages/MergeTree/MergeTreeSettings.h b/src/Storages/MergeTree/MergeTreeSettings.h index 97bc73caf5b..d39d212c5fc 100644 --- a/src/Storages/MergeTree/MergeTreeSettings.h +++ b/src/Storages/MergeTree/MergeTreeSettings.h @@ -105,6 +105,7 @@ struct Settings; M(UInt64, concurrent_part_removal_threshold, 100, "Activate concurrent part removal (see 'max_part_removal_threads') only if the number of inactive data parts is at least this.", 0) \ M(String, storage_policy, "default", "Name of storage disk policy", 0) \ M(Bool, allow_nullable_key, false, "Allow Nullable types as primary keys.", 0) \ + M(Bool, allow_s3_zero_copy_replication, true, "Allow Zero-copy replication over S3", 0) \ \ /** Settings for testing purposes */ \ M(Bool, randomize_part_type, false, "For testing purposes only. 
Randomizes part type between wide and compact", 0) \ From cd14f095abe7f355353054172533d1f097d6105e Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Mon, 26 Oct 2020 18:12:16 +0300 Subject: [PATCH 0053/2357] fix tests --- src/Databases/DatabaseReplicated.cpp | 9 +- src/Databases/DatabaseReplicated.h | 2 +- src/Interpreters/InterpreterCreateQuery.cpp | 2 +- tests/integration/helpers/test_tools.py | 10 +- .../test_replicated_database/__init__.py | 0 .../test_replicated_database/test.py | 143 ++++++++++-------- 6 files changed, 95 insertions(+), 71 deletions(-) create mode 100644 tests/integration/test_replicated_database/__init__.py diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 328f5476064..7fb7be61d35 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -136,7 +136,7 @@ void DatabaseReplicated::createDatabaseZKNodes() current_zookeeper->createIfNotExists(zookeeper_path + "/replicas", String()); } -void DatabaseReplicated::RemoveOutdatedSnapshotsAndLog() +void DatabaseReplicated::removeOutdatedSnapshotsAndLog() { /// This method removes all snapshots and logged queries /// that no longer will be in use by current replicas or @@ -180,7 +180,7 @@ void DatabaseReplicated::RemoveOutdatedSnapshotsAndLog() void DatabaseReplicated::runBackgroundLogExecutor() { - if (last_executed_log_entry == "") + if (last_executed_log_entry.empty()) { loadMetadataFromSnapshot(); } @@ -274,7 +274,8 @@ BlockIO DatabaseReplicated::getFeedback() Stopwatch watch; - NamesAndTypes block_structure = { + NamesAndTypes block_structure = + { {"replica_name", std::make_shared()}, {"execution_feedback", std::make_shared()}, }; @@ -334,7 +335,7 @@ void DatabaseReplicated::createSnapshot() } current_zookeeper->create(snapshot_path + "/.completed", String(), zkutil::CreateMode::Persistent); - RemoveOutdatedSnapshotsAndLog(); + removeOutdatedSnapshotsAndLog(); } void DatabaseReplicated::loadMetadataFromSnapshot() diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index 4b647915079..62997e953ac 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -57,7 +57,7 @@ private: void loadMetadataFromSnapshot(); void createSnapshot(); - void RemoveOutdatedSnapshotsAndLog(); + void removeOutdatedSnapshotsAndLog(); std::unique_ptr current_context; // to run executeQuery diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 5210230859c..0f7d441c0d6 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -141,7 +141,7 @@ BlockIO InterpreterCreateQuery::createDatabase(ASTCreateQuery & create) throw Exception("Unknown database engine: " + ostr.str(), ErrorCodes::UNKNOWN_DATABASE_ENGINE); } - if (create.storage->engine->name == "Atomic") + if (create.storage->engine->name == "Atomic" || create.storage->engine->name == "Replicated") { if (create.attach && create.uuid == UUIDHelpers::Nil) throw Exception("UUID must be specified for ATTACH", ErrorCodes::INCORRECT_QUERY); diff --git a/tests/integration/helpers/test_tools.py b/tests/integration/helpers/test_tools.py index 75ae8f67f7a..639b47a7179 100644 --- a/tests/integration/helpers/test_tools.py +++ b/tests/integration/helpers/test_tools.py @@ -44,20 +44,20 @@ class TSV: def assert_eq_with_retry(instance, query, expectation, retry_count=20, sleep_time=0.5, stdin=None, timeout=None, - settings=None, user=None, 
ignore_error=False): + settings=None, user=None, ignore_error=False, get_result=lambda x: x): expectation_tsv = TSV(expectation) for i in range(retry_count): try: - if TSV(instance.query(query, user=user, stdin=stdin, timeout=timeout, settings=settings, - ignore_error=ignore_error)) == expectation_tsv: + if TSV(get_result(instance.query(query, user=user, stdin=stdin, timeout=timeout, settings=settings, + ignore_error=ignore_error))) == expectation_tsv: break time.sleep(sleep_time) except Exception as ex: print(("assert_eq_with_retry retry {} exception {}".format(i + 1, ex))) time.sleep(sleep_time) else: - val = TSV(instance.query(query, user=user, stdin=stdin, timeout=timeout, settings=settings, - ignore_error=ignore_error)) + val = TSV(get_result(instance.query(query, user=user, stdin=stdin, timeout=timeout, settings=settings, + ignore_error=ignore_error))) if expectation_tsv != val: raise AssertionError("'{}' != '{}'\n{}".format(expectation_tsv, val, '\n'.join( expectation_tsv.diff(val, n1="expectation", n2="query")))) diff --git a/tests/integration/test_replicated_database/__init__.py b/tests/integration/test_replicated_database/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_replicated_database/test.py b/tests/integration/test_replicated_database/test.py index 346114cb8c4..372ac7a7c3e 100644 --- a/tests/integration/test_replicated_database/test.py +++ b/tests/integration/test_replicated_database/test.py @@ -1,20 +1,24 @@ import time -import logging - +import re import pytest from helpers.cluster import ClickHouseCluster - -logging.getLogger().setLevel(logging.INFO) -logging.getLogger().addHandler(logging.StreamHandler()) +from helpers.test_tools import assert_eq_with_retry cluster = ClickHouseCluster(__file__) -main_node = cluster.add_instance('main_node', main_configs=['configs/disable_snapshots.xml'], with_zookeeper=True, stay_alive=True) -dummy_node = cluster.add_instance('dummy_node', main_configs=['configs/disable_snapshots.xml'], with_zookeeper=True) -competing_node = cluster.add_instance('competing_node', main_configs=['configs/disable_snapshots.xml'], with_zookeeper=True) -snapshotting_node = cluster.add_instance('snapshotting_node', main_configs=['configs/snapshot_each_query.xml'], with_zookeeper=True) -snapshot_recovering_node = cluster.add_instance('snapshot_recovering_node', main_configs=['configs/disable_snapshots.xml'], with_zookeeper=True) +main_node = cluster.add_instance('main_node', main_configs=['configs/disable_snapshots.xml'], with_zookeeper=True, stay_alive=True, macros={"shard": 1, "replica": 1}) +dummy_node = cluster.add_instance('dummy_node', main_configs=['configs/disable_snapshots.xml'], with_zookeeper=True, macros={"shard": 1, "replica": 2}) +competing_node = cluster.add_instance('competing_node', main_configs=['configs/disable_snapshots.xml'], with_zookeeper=True, macros={"shard": 1, "replica": 3}) +snapshotting_node = cluster.add_instance('snapshotting_node', main_configs=['configs/snapshot_each_query.xml'], with_zookeeper=True, macros={"shard": 2, "replica": 1}) +snapshot_recovering_node = cluster.add_instance('snapshot_recovering_node', main_configs=['configs/disable_snapshots.xml'], with_zookeeper=True, macros={"shard": 2, "replica": 2}) + +uuid_regex = re.compile("[0-9a-f]{8}\-[0-9a-f]{4}\-[0-9a-f]{4}\-[0-9a-f]{4}\-[0-9a-f]{12}") +def assert_create_query(nodes, table_name, expected): + replace_uuid = lambda x: re.sub(uuid_regex, "uuid", x) + query = "show create table testdb.{}".format(table_name) + 
for node in nodes: + assert_eq_with_retry(node, query, expected, get_result=replace_uuid) @pytest.fixture(scope="module") def started_cluster(): @@ -27,17 +31,25 @@ def started_cluster(): finally: cluster.shutdown() +#TODO better tests def test_create_replicated_table(started_cluster): - DURATION_SECONDS = 1 - main_node.query("CREATE TABLE testdb.replicated_table (d Date, k UInt64, i32 Int32) ENGINE=ReplicatedMergeTree(d, k, 8192);") + #FIXME should fail (replicated with old syntax) + #main_node.query("CREATE TABLE testdb.replicated_table (d Date, k UInt64, i32 Int32) ENGINE=ReplicatedMergeTree(d, k, 8192);") + main_node.query("CREATE TABLE testdb.replicated_table (d Date, k UInt64, i32 Int32) ENGINE=ReplicatedMergeTree ORDER BY k PARTITION BY toYYYYMM(d);") - time.sleep(DURATION_SECONDS) - assert main_node.query("desc table testdb.replicated_table") == dummy_node.query("desc table testdb.replicated_table") + expected = "CREATE TABLE testdb.replicated_table\\n(\\n `d` Date,\\n `k` UInt64,\\n `i32` Int32\\n)\\n" \ + "ENGINE = ReplicatedMergeTree(\\'/clickhouse/tables/uuid/{shard}\\', \\'{replica}\\')\\n" \ + "PARTITION BY toYYYYMM(d)\\nORDER BY k\\nSETTINGS index_granularity = 8192" + assert_create_query([main_node, dummy_node], "replicated_table", expected) + # assert without replacing uuid + assert main_node.query("show create testdb.replicated_table") == dummy_node.query("show create testdb.replicated_table") def test_simple_alter_table(started_cluster): - DURATION_SECONDS = 1 - main_node.query("CREATE TABLE testdb.alter_test (CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192);") + #TODO add test with ReplicatedMergeTree + main_node.query("CREATE TABLE testdb.alter_test " + "(CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) " + "ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192);") main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN Added0 UInt32;") main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN Added2 UInt32;") main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN Added1 UInt32 AFTER Added0;") @@ -45,48 +57,37 @@ def test_simple_alter_table(started_cluster): main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN AddedNested1.C Array(String) AFTER AddedNested1.B;") main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN AddedNested2 Nested(A UInt32, B UInt64) AFTER AddedNested1;") - time.sleep(DURATION_SECONDS) + expected = "CREATE TABLE testdb.alter_test\\n(\\n `CounterID` UInt32,\\n `StartDate` Date,\\n `UserID` UInt32,\\n" \ + " `VisitID` UInt32,\\n `NestedColumn.A` Array(UInt8),\\n `NestedColumn.S` Array(String),\\n" \ + " `ToDrop` UInt32,\\n `Added0` UInt32,\\n `Added1` UInt32,\\n `Added2` UInt32,\\n" \ + " `AddedNested1.A` Array(UInt32),\\n `AddedNested1.B` Array(UInt64),\\n `AddedNested1.C` Array(String),\\n" \ + " `AddedNested2.A` Array(UInt32),\\n `AddedNested2.B` Array(UInt64)\\n)\\n" \ + "ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192)" - schema = main_node.query("show create table testdb.alter_test") - fields = [ - "`CounterID`", - "`StartDate`", - "`UserID`", - "`VisitID`", - "`NestedColumn.A`", - "`NestedColumn.S`", - "`ToDrop`", - "`Added0`", - "`Added1`", - "`Added2`", - "`AddedNested1.A`", 
- "`AddedNested1.B`", - "`AddedNested1.C`", - "`AddedNested2.A`", - "`AddedNested2.B`"] - - for field in fields: - assert field in schema - - assert main_node.query("desc table testdb.alter_test") == dummy_node.query("desc table testdb.alter_test") + assert_create_query([main_node, dummy_node], "alter_test", expected) def test_create_replica_after_delay(started_cluster): competing_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'replica3');") - main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN Added3 UInt32 ;") - main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN Added4 UInt32 ;") - main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN Added5 UInt32 ;") + main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN Added3 UInt32;") + main_node.query("ALTER TABLE testdb.alter_test DROP COLUMN AddedNested1;") + main_node.query("ALTER TABLE testdb.alter_test RENAME COLUMN Added1 TO AddedNested1;") - time.sleep(6) + expected = "CREATE TABLE testdb.alter_test\\n(\\n `CounterID` UInt32,\\n `StartDate` Date,\\n `UserID` UInt32,\\n" \ + " `VisitID` UInt32,\\n `NestedColumn.A` Array(UInt8),\\n `NestedColumn.S` Array(String),\\n" \ + " `ToDrop` UInt32,\\n `Added0` UInt32,\\n `AddedNested1` UInt32,\\n `Added2` UInt32,\\n" \ + " `AddedNested2.A` Array(UInt32),\\n `AddedNested2.B` Array(UInt64),\\n `Added3` UInt32\\n)\\n" \ + "ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192)" - assert competing_node.query("desc table testdb.alter_test") == main_node.query("desc table testdb.alter_test") + assert_create_query([main_node, dummy_node, competing_node], "alter_test", expected) def test_alters_from_different_replicas(started_cluster): - DURATION_SECONDS = 1 + main_node.query("CREATE TABLE testdb.concurrent_test " + "(CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) " + "ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192);") - main_node.query("CREATE TABLE testdb.concurrent_test (CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192);") - - time.sleep(DURATION_SECONDS) + time.sleep(1) #FIXME + dummy_node.kill_clickhouse(stop_start_wait_sec=0) competing_node.query("ALTER TABLE testdb.concurrent_test ADD COLUMN Added0 UInt32;") main_node.query("ALTER TABLE testdb.concurrent_test ADD COLUMN Added2 UInt32;") @@ -95,31 +96,53 @@ def test_alters_from_different_replicas(started_cluster): competing_node.query("ALTER TABLE testdb.concurrent_test ADD COLUMN AddedNested1.C Array(String) AFTER AddedNested1.B;") main_node.query("ALTER TABLE testdb.concurrent_test ADD COLUMN AddedNested2 Nested(A UInt32, B UInt64) AFTER AddedNested1;") - time.sleep(DURATION_SECONDS) + expected = "CREATE TABLE testdb.concurrent_test\\n(\\n `CounterID` UInt32,\\n `StartDate` Date,\\n `UserID` UInt32,\\n" \ + " `VisitID` UInt32,\\n `NestedColumn.A` Array(UInt8),\\n `NestedColumn.S` Array(String),\\n `ToDrop` UInt32,\\n" \ + " `Added0` UInt32,\\n `Added1` UInt32,\\n `Added2` UInt32,\\n `AddedNested1.A` Array(UInt32),\\n" \ + " `AddedNested1.B` Array(UInt64),\\n `AddedNested1.C` Array(String),\\n `AddedNested2.A` Array(UInt32),\\n" \ + " `AddedNested2.B` Array(UInt64)\\n)\\n" \ + "ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, 
StartDate, intHash32(UserID), VisitID), 8192)" - assert competing_node.query("desc table testdb.concurrent_test") == main_node.query("desc table testdb.concurrent_test") + assert_create_query([main_node, competing_node], "concurrent_test", expected) def test_drop_and_create_table(started_cluster): main_node.query("DROP TABLE testdb.concurrent_test") - main_node.query("CREATE TABLE testdb.concurrent_test (CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192);") - time.sleep(5) - assert competing_node.query("desc table testdb.concurrent_test") == main_node.query("desc table testdb.concurrent_test") + main_node.query("CREATE TABLE testdb.concurrent_test " + "(CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) " + "ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192);") + + expected = "CREATE TABLE testdb.concurrent_test\\n(\\n `CounterID` UInt32,\\n `StartDate` Date,\\n `UserID` UInt32,\\n" \ + " `VisitID` UInt32,\\n `NestedColumn.A` Array(UInt8),\\n `NestedColumn.S` Array(String),\\n `ToDrop` UInt32\\n)\\n" \ + "ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192)" + + assert_create_query([main_node, competing_node], "concurrent_test", expected) def test_replica_restart(started_cluster): main_node.restart_clickhouse() - time.sleep(5) - assert competing_node.query("desc table testdb.concurrent_test") == main_node.query("desc table testdb.concurrent_test") + + expected = "CREATE TABLE testdb.concurrent_test\\n(\\n `CounterID` UInt32,\\n `StartDate` Date,\\n `UserID` UInt32,\\n" \ + " `VisitID` UInt32,\\n `NestedColumn.A` Array(UInt8),\\n `NestedColumn.S` Array(String),\\n `ToDrop` UInt32\\n)\\n" \ + "ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192)" + + assert_create_query([main_node, competing_node], "concurrent_test", expected) def test_snapshot_and_snapshot_recover(started_cluster): + #FIXME bad test snapshotting_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'replica4');") time.sleep(5) snapshot_recovering_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'replica5');") time.sleep(5) assert snapshotting_node.query("desc table testdb.alter_test") == snapshot_recovering_node.query("desc table testdb.alter_test") -#def test_drop_and_create_replica(started_cluster): -# main_node.query("DROP DATABASE testdb") -# main_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'replica1');") -# time.sleep(6) -# assert competing_node.query("desc table testdb.concurrent_test") == main_node.query("desc table testdb.concurrent_test") +def test_drop_and_create_replica(started_cluster): + main_node.query("DROP DATABASE testdb") + main_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'replica1');") + + expected = "CREATE TABLE testdb.concurrent_test\\n(\\n `CounterID` UInt32,\\n `StartDate` Date,\\n `UserID` UInt32,\\n" \ + " `VisitID` UInt32,\\n `NestedColumn.A` Array(UInt8),\\n `NestedColumn.S` Array(String),\\n `ToDrop` UInt32\\n)\\n" \ + "ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192)" + + 
assert_create_query([main_node, competing_node], "concurrent_test", expected) + +#TODO tests with Distributed From 10a7a61da9c7554d13cf3bd381f5f7b3dfa96e35 Mon Sep 17 00:00:00 2001 From: tavplubix Date: Mon, 2 Nov 2020 16:50:59 +0300 Subject: [PATCH 0054/2357] Update cluster.py --- tests/integration/helpers/cluster.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/integration/helpers/cluster.py b/tests/integration/helpers/cluster.py index 7c44065320b..90f59db05af 100644 --- a/tests/integration/helpers/cluster.py +++ b/tests/integration/helpers/cluster.py @@ -908,12 +908,12 @@ class ClickHouseInstance: return "-fsanitize=thread" in build_opts # Connects to the instance via clickhouse-client, sends a query (1st argument) and returns the answer - def query(self, sql, stdin=None, timeout=None, settings=None, user=None, password=None, database=None, + def query(self, sql, stdin=None, timeout=60, settings=None, user=None, password=None, database=None, ignore_error=False): return self.client.query(sql, stdin=stdin, timeout=timeout, settings=settings, user=user, password=password, database=database, ignore_error=ignore_error) - def query_with_retry(self, sql, stdin=None, timeout=None, settings=None, user=None, password=None, database=None, + def query_with_retry(self, sql, stdin=None, timeout=10, settings=None, user=None, password=None, database=None, ignore_error=False, retry_count=20, sleep_time=0.5, check_callback=lambda x: True): result = None @@ -937,13 +937,13 @@ class ClickHouseInstance: return self.client.get_query_request(*args, **kwargs) # Connects to the instance via clickhouse-client, sends a query (1st argument), expects an error and return its code - def query_and_get_error(self, sql, stdin=None, timeout=None, settings=None, user=None, password=None, + def query_and_get_error(self, sql, stdin=None, timeout=60, settings=None, user=None, password=None, database=None): return self.client.query_and_get_error(sql, stdin=stdin, timeout=timeout, settings=settings, user=user, password=password, database=database) # The same as query_and_get_error but ignores successful query. 
- def query_and_get_answer_with_error(self, sql, stdin=None, timeout=None, settings=None, user=None, password=None, + def query_and_get_answer_with_error(self, sql, stdin=None, timeout=60, settings=None, user=None, password=None, database=None): return self.client.query_and_get_answer_with_error(sql, stdin=stdin, timeout=timeout, settings=settings, user=user, password=password, database=database)
From 2e5125739e68cb5046d693e3c5d0350cb58ee63d Mon Sep 17 00:00:00 2001 From: tavplubix Date: Mon, 2 Nov 2020 17:58:06 +0300 Subject: [PATCH 0055/2357] Update cluster.py --- tests/integration/helpers/cluster.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/helpers/cluster.py b/tests/integration/helpers/cluster.py index 90f59db05af..17a6944cc12 100644 --- a/tests/integration/helpers/cluster.py +++ b/tests/integration/helpers/cluster.py @@ -907,6 +907,7 @@ class ClickHouseInstance: build_opts = self.query("SELECT value FROM system.build_options WHERE name = 'CXX_FLAGS'") return "-fsanitize=thread" in build_opts + # Connects to the instance via clickhouse-client, sends a query (1st argument) and returns the answer def query(self, sql, stdin=None, timeout=60, settings=None, user=None, password=None, database=None, ignore_error=False):
From 78021714f1cbbf54246d09383bdf2a4d06389fa3 Mon Sep 17 00:00:00 2001 From: Anton Ivashkin Date: Tue, 3 Nov 2020 11:58:26 +0300 Subject: [PATCH 0056/2357] S3 zero copy replication: more simple s3 check --- S3ZeroCopyReplication.md | 22 ++++--- src/Disks/DiskDecorator.h | 2 +- src/Disks/IDisk.h | 2 +- src/Disks/S3/DiskS3.cpp | 27 +++----- src/Disks/S3/DiskS3.h | 2 +- src/Storages/MergeTree/DataPartsExchange.cpp | 64 ++++++------------- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 29 ++++----- src/Storages/MergeTree/IMergeTreeDataPart.h | 2 + 8 files changed, 60 insertions(+), 90 deletions(-) diff --git a/S3ZeroCopyReplication.md b/S3ZeroCopyReplication.md index 1e152977753..22c01caa90c 100644 --- a/S3ZeroCopyReplication.md +++ b/S3ZeroCopyReplication.md @@ -12,23 +12,26 @@
 A new protocol version, REPLICATION_PROTOCOL_VERSION_WITH_PARTS_S3_COPY, is introduced. The request gets a new parameter, send_s3_metadata: if it is 1, the receiver asks the source for metadata instead of data when that is possible. The response carries the cookie send_s3_metadata=1 when metadata is being sent; in all other cases data is sent as before.
+In the new protocol version one more string is added before the field with the number of files. Abstractly it is an ID by which the nodes can tell whether they are working with the same S3 or with different ones.
+In practice it is currently the name of the first object of the checksums.txt file. The same string is used as the part ID in ZooKeeper.
+
 Before making the request, the receiver checks whether it is going to store the data in S3. The check is crude for now: if the storage has an S3 disk, we assume it will be S3. If it is S3, the receiver sends send_s3_metadata=1 in the request.
-On receiving such a request, the source checks whether the part is stored on S3. If it is, it places a mark in ZooKeeper at the path `<table data path>/zero_copy_s3/shared/<some part ID>/`,
+On receiving such a request, the source checks whether the part is stored on S3. If it is, it places a mark in ZooKeeper at the path `<table data path>/zero_copy_s3/shared/<part ID>/`,
 sets the cookie send_s3_metadata=1 in the response, and instead of the data files sends only the metadata files.
-On receiving a response with send_s3_metadata=1, the receiver creates only small files with identical metadata, which will in the end point to the same S3 keys, places a similar mark in ZooKeeper,
-only with its own replica ID, and works with that. For the first file in the list it checks that the first S3 object exists (existence only); if an object with that name is found, everything is fine, otherwise it falls back to the old protocol version.
-(There is also code for the case of more than one S3 disk: it then iterates over all of them and uses the one on which the file is found, but within the MDB team we regard such a configuration as strange.
-We plan to restrict the feature to the single S3 disk case.)
+On receiving a response with send_s3_metadata=1, the receiver checks accessibility via the transmitted key (the first object of checksums.txt), creates only small files with identical metadata, which will in the end point to the same S3 keys, places a similar mark in ZooKeeper,
+only with its own replica ID, and works with that.
-When it wants to delete a part, the node removes the ZooKeeper key `<table data path>/zero_copy_s3/shared/<some part ID>/`, then fetches all child keys of `<table data path>/zero_copy_s3/shared/<some part ID>`.
+When it wants to delete a part, the node removes the ZooKeeper key `<table data path>/zero_copy_s3/shared/<part ID>/`, then fetches all child keys of `<table data path>/zero_copy_s3/shared/<part ID>`.
 If the list is not empty, it assumes the data is used by another node and deletes only the local metadata; if the list is empty, it deletes the S3 data as well.
-During a merge, if the result will live on S3, the node places an ephemeral mark in ZooKeeper at the path `<table data path>/zero_copy_s3/merged/<new part name>`. If such a mark already exists, it assumes that another node
+During a merge, if the result will live on S3, the node places an ephemeral (!! NOT !!) mark in ZooKeeper at the path `<table data path>/zero_copy_s3/merged/<new part name>`. If such a mark already exists, it assumes that another node
 has already merged it or is merging it right now, and that it should fetch the result instead of merging it itself.
+A flag that enables the new replication protocol has been added to the config: merge_tree->allow_s3_zero_copy_replication. It is currently set to true; this is temporary, so that all tests run with the flag enabled, and it must not be forgotten to switch it back to false before the final merge.
+
## Hacks and shortcomings, of which there are many
* The name of the first S3 key of the checksums.txt file is taken as the part ID.
@@ -47,12 +50,11 @@
* Duplicated parts are still possible. Example: a node performs a merge and crashes. Another node independently performs the merge, and then the first node comes back up. As a result there are two copies of the merged part.
+* Only the most basic tests exist so far.
+
* ... there are many of them. Honestly.
## TODO, what has not been done at all yet
-* A config flag to enable the feature, off by default.
-
 * For hybrid storage, add a check and a fetch when a part moves from a local disk to S3.
-* Tests.
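[Editor's note] The design notes above reduce to two pieces of ZooKeeper bookkeeping: per-part reference counting under `zero_copy_s3/shared/<part ID>` and merge deduplication under `zero_copy_s3/merged/<new part name>`. The sketch below only illustrates that bookkeeping under the stated path layout, written against the kazoo Python client; it is not the ClickHouse implementation (the real C++ code is in IMergeTreeDataPart::lockSharedData and DataPartsExchange in the diffs that follow), and the function names are invented for the example.

```python
# Illustrative sketch only: ZooKeeper bookkeeping for zero-copy S3 replication,
# assuming the path layout described in S3ZeroCopyReplication.md.
from kazoo.client import KazooClient
from kazoo.exceptions import NodeExistsError


def lock_shared_part(zk: KazooClient, table_path: str, part_id: str, replica: str) -> None:
    """Register this replica as a user of the part's S3 objects."""
    zk.ensure_path(f"{table_path}/zero_copy_s3/shared/{part_id}")
    zk.create(f"{table_path}/zero_copy_s3/shared/{part_id}/{replica}", b"")


def unlock_shared_part(zk: KazooClient, table_path: str, part_id: str, replica: str) -> bool:
    """Drop this replica's mark; return True if the S3 objects may be deleted as well."""
    node = f"{table_path}/zero_copy_s3/shared/{part_id}"
    zk.delete(f"{node}/{replica}")
    remaining = zk.get_children(node) if zk.exists(node) else []
    return len(remaining) == 0  # empty list: no other replica references the data


def try_claim_merge(zk: KazooClient, table_path: str, new_part_name: str) -> bool:
    """Whoever creates the node first performs the merge; the others fetch the result."""
    try:
        zk.create(f"{table_path}/zero_copy_s3/merged/{new_part_name}", b"", makepath=True)
        return True
    except NodeExistsError:
        return False
```

Whether the merge mark should really be ephemeral is left open in the notes above (the "(!! NOT !!)" remark), so the sketch creates a persistent node.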
diff --git a/src/Disks/DiskDecorator.h b/src/Disks/DiskDecorator.h index 4bc7879ffd3..71d75b92ab6 100644 --- a/src/Disks/DiskDecorator.h +++ b/src/Disks/DiskDecorator.h @@ -53,7 +53,7 @@ public: void sync(int fd) const override; const String getType() const override { return delegate->getType(); } String getUniqueId(const String & path) const override { return delegate->getUniqueId(path); } - bool checkFile(const String & path) const override { return delegate->checkFile(path); } + bool checkUniqueId(const String & id) const override { return delegate->checkUniqueId(id); } Executor & getExecutor() override; protected: diff --git a/src/Disks/IDisk.h b/src/Disks/IDisk.h index 915c6da5a21..e05b52c4a78 100644 --- a/src/Disks/IDisk.h +++ b/src/Disks/IDisk.h @@ -208,7 +208,7 @@ public: virtual String getUniqueId(const String & path) const { return path; } /// Check file, overrided for S3 only - virtual bool checkFile(const String & path) const { return exists(path); } + virtual bool checkUniqueId(const String & id) const { return exists(id); } /// Returns executor to perform asynchronous operations. virtual Executor & getExecutor() { return *executor; } diff --git a/src/Disks/S3/DiskS3.cpp b/src/Disks/S3/DiskS3.cpp index 7334a5b8a9b..01221d7c1a2 100644 --- a/src/Disks/S3/DiskS3.cpp +++ b/src/Disks/S3/DiskS3.cpp @@ -577,7 +577,7 @@ String DiskS3::getUniqueId(const String & path) const Metadata metadata(s3_root_path, metadata_path, path); String id; if (!metadata.s3_objects.empty()) - id = metadata.s3_objects[0].first; + id = metadata.s3_root_path + metadata.s3_objects[0].first; return id; } @@ -846,30 +846,23 @@ void DiskS3::shutdown() client->DisableRequestProcessing(); } -bool DiskS3::checkFile(const String & path) const +bool DiskS3::checkUniqueId(const String & id) const { - Metadata metadata(s3_root_path, metadata_path, path); - - /// empty s3_objects list for empty file - if (metadata.s3_objects.empty()) - return true; - - String object = metadata.s3_root_path + metadata.s3_objects[0].first; - + /// Check that we have right s3 and have access rights + /// Actually interprets id as s3 object name and checks if it exists Aws::S3::Model::ListObjectsRequest request; request.SetBucket(bucket); - request.SetPrefix(object); + request.SetPrefix(id); auto resp = client->ListObjects(request); throwIfError(resp); Aws::Vector object_list = resp.GetResult().GetContents(); - /// Should be only one object with name equal to prefix - if (object_list.size() != 1) + if (object_list.size() < 1) return false; - - if (object_list[0].GetKey() != object) - return false; - return true; + for (const auto & object : object_list) + if (object.GetKey() == id) + return true; + return false; } diff --git a/src/Disks/S3/DiskS3.h b/src/Disks/S3/DiskS3.h index 80752fa8253..43cec7838eb 100644 --- a/src/Disks/S3/DiskS3.h +++ b/src/Disks/S3/DiskS3.h @@ -116,7 +116,7 @@ public: String getUniqueId(const String & path) const override; - bool checkFile(const String & path) const override; + bool checkUniqueId(const String & path) const override; private: bool tryReserve(UInt64 bytes); diff --git a/src/Storages/MergeTree/DataPartsExchange.cpp b/src/Storages/MergeTree/DataPartsExchange.cpp index d2bd3c21173..2a1da0e0eaf 100644 --- a/src/Storages/MergeTree/DataPartsExchange.cpp +++ b/src/Storages/MergeTree/DataPartsExchange.cpp @@ -241,6 +241,9 @@ void Service::sendPartS3Metadata(const MergeTreeData::DataPartPtr & part, WriteB part->lockSharedData(zookeeper_path, replica_name, zookeeper); + String part_id = part->getUniqueId(); + 
writeStringBinary(part_id, out); + writeBinary(checksums.files.size(), out); for (const auto & it : checksums.files) { @@ -555,7 +558,22 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToS3( if (disks_s3.empty()) throw Exception("No S3 disks anymore", ErrorCodes::LOGICAL_ERROR); - auto disk = disks_s3[0]; + String part_id; + readStringBinary(part_id, in); + + DiskPtr disk = disks_s3[0]; + + for (const auto & disk_ : disks_s3) + { + if (disk_->checkUniqueId(part_id)) + { + disk = disk_; + break; + } + } + + if (!disk) + throw Exception("Can't find S3 disk", ErrorCodes::S3_ERROR); static const String TMP_PREFIX = "tmp_fetch_"; String tmp_prefix = tmp_prefix_.empty() ? TMP_PREFIX : tmp_prefix_; @@ -612,50 +630,6 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToS3( ErrorCodes::CHECKSUM_DOESNT_MATCH); } } - - if (!i) - { /// Check access for first s3 object of first file - if (!disk->checkFile(data_path)) - { /// Wrong S3 disk - Poco::File metadata(metadata_file); - - size_t disk_id = 1; - while (true) - { - if (disk_id >= disks_s3.size()) - { /// No more S3 disks - disk->removeSharedRecursive(part_download_path, true); - /// After catch this exception replication continues with full data copy - throw Exception("Can't find S3 drive for shared data", ErrorCodes::S3_ERROR); - } - - /// Try next S3 disk - auto next_disk = disks_s3[disk_id]; - - auto next_volume = std::make_shared("volume_" + part_name, next_disk); - MergeTreeData::MutableDataPartPtr next_new_data_part = data.createPart(part_name, next_volume, part_relative_path); - - next_disk->createDirectories(part_download_path); - - String next_data_path = next_new_data_part->getFullRelativePath() + file_name; - String next_metadata_file = fullPath(next_disk, next_data_path); - metadata.copyTo(next_metadata_file); - if (next_disk->checkFile(next_data_path)) - { /// Right disk found - disk->removeSharedRecursive(part_download_path, true); - disk = next_disk; - volume = next_volume; - data_path = next_data_path; - new_data_part = next_new_data_part; - break; - } - - /// Wrong disk again - next_disk->removeSharedRecursive(part_download_path, true); - ++disk_id; - } - } - } } assertEOF(in); diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index be2f88e74e5..85c2d5e4ab4 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -1095,17 +1095,24 @@ bool IMergeTreeDataPart::checkAllTTLCalculated(const StorageMetadataPtr & metada return true; } -void IMergeTreeDataPart::lockSharedData(const String & zookeeper_path, const String & replica_name, zkutil::ZooKeeperPtr zookeeper) const +String IMergeTreeDataPart::getUniqueId() const { + String id; + auto disk = volume->getDisk(); - if (disk->getType() != "s3") - return; - - String id = disk->getUniqueId(getFullRelativePath() + "checksums.txt"); + if (disk->getType() == "s3") + id = disk->getUniqueId(getFullRelativePath() + "checksums.txt"); if (id.empty()) - throw Exception("Can't lock part on S3 storage", ErrorCodes::LOGICAL_ERROR); + throw Exception("Can't get unique S3 object", ErrorCodes::LOGICAL_ERROR); + + return id; +} + +void IMergeTreeDataPart::lockSharedData(const String & zookeeper_path, const String & replica_name, zkutil::ZooKeeperPtr zookeeper) const +{ + String id = getUniqueId(); String zookeeper_node = zookeeper_path + "/zero_copy_s3/shared/" + id + "/" + replica_name; @@ -1117,15 +1124,7 @@ void IMergeTreeDataPart::lockSharedData(const String & 
zookeeper_path, const Str bool IMergeTreeDataPart::unlockSharedData(const String & zookeeper_path, const String & replica_name, zkutil::ZooKeeperPtr zookeeper) const { - auto disk = volume->getDisk(); - - if (disk->getType() != "s3") - return true; - - String id = disk->getUniqueId(getFullRelativePath() + "checksums.txt"); - - if (id.empty()) - return true; + String id = getUniqueId(); String zookeeper_part_node = zookeeper_path + "/zero_copy_s3/shared/" + id; String zookeeper_node = zookeeper_part_node + "/" + replica_name; diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index 8d21f5856fc..f948cbaa18c 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -359,6 +359,8 @@ public: /// part creation (using alter query with materialize_ttl setting). bool checkAllTTLCalculated(const StorageMetadataPtr & metadata_snapshot) const; + String getUniqueId() const; + /// Lock part in zookeeper for use common S3 data in several nodes void lockSharedData(const String & zookeeper_path, const String & replica_name, zkutil::ZooKeeperPtr zookeeper) const; From d8ae9fcdb4aea22a83d6fc917ec9d070d2780470 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Tue, 27 Oct 2020 12:19:45 +0300 Subject: [PATCH 0057/2357] fixes, add shard name --- src/Common/ZooKeeper/ZooKeeper.cpp | 17 -------------- src/Common/ZooKeeper/ZooKeeper.h | 5 ----- src/Databases/DatabaseFactory.cpp | 12 +++++----- src/Databases/DatabaseReplicated.cpp | 33 +++++++++++++++++++++------- src/Databases/DatabaseReplicated.h | 14 +++++++----- src/Databases/IDatabase.h | 20 ++++++++--------- src/Interpreters/DDLWorker.cpp | 1 + 7 files changed, 52 insertions(+), 50 deletions(-) diff --git a/src/Common/ZooKeeper/ZooKeeper.cpp b/src/Common/ZooKeeper/ZooKeeper.cpp index f4174faf057..bee875d1c74 100644 --- a/src/Common/ZooKeeper/ZooKeeper.cpp +++ b/src/Common/ZooKeeper/ZooKeeper.cpp @@ -588,23 +588,6 @@ void ZooKeeper::removeChildren(const std::string & path) } -void ZooKeeper::tryRemoveChildren(const std::string & path) -{ - Strings children; - if (tryGetChildren(path, children) != Coordination::Error::ZOK) - return; - while (!children.empty()) - { - Coordination::Requests ops; - for (size_t i = 0; i < MULTI_BATCH_SIZE && !children.empty(); ++i) - { - ops.emplace_back(makeRemoveRequest(path + "/" + children.back(), -1)); - children.pop_back(); - } - multi(ops); - } -} - void ZooKeeper::removeChildrenRecursive(const std::string & path) { Strings children = getChildren(path); diff --git a/src/Common/ZooKeeper/ZooKeeper.h b/src/Common/ZooKeeper/ZooKeeper.h index bbe3787197a..1ad744102c6 100644 --- a/src/Common/ZooKeeper/ZooKeeper.h +++ b/src/Common/ZooKeeper/ZooKeeper.h @@ -189,11 +189,6 @@ public: /// Remove all children nodes (non recursive). void removeChildren(const std::string & path); - /// Remove all children nodes (non recursive). - /// If there're no children for the given path, - /// this method does not throw an exception. - void tryRemoveChildren(const std::string & path); - using WaitCondition = std::function; /// Wait for the node to disappear or return immediately if it doesn't exist. 
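Taken together, the DataPartsExchange and IMergeTreeDataPart hunks of patch 0056 above form one handshake. The following condensed sketch (simplified, not the literal diff code) shows how the part ID written by the sender drives S3 disk selection on the fetching side; unlike the hunk above, it starts with an empty disk pointer so the final error check can actually fire.

    /// Sender side (Service::sendPartS3Metadata): the S3 object name of checksums.txt
    /// is written right before the file count, as described in S3ZeroCopyReplication.md.
    writeStringBinary(part->getUniqueId(), out);
    writeBinary(checksums.files.size(), out);

    /// Receiver side (Fetcher::downloadPartToS3): pick the S3 disk that can see that object.
    String part_id;
    readStringBinary(part_id, in);

    DiskPtr disk;
    for (const auto & candidate : disks_s3)
    {
        if (candidate->checkUniqueId(part_id))
        {
            disk = candidate;
            break;
        }
    }

    if (!disk)
        throw Exception("Can't find S3 disk", ErrorCodes::S3_ERROR);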
diff --git a/src/Databases/DatabaseFactory.cpp b/src/Databases/DatabaseFactory.cpp index 5afa0b216ac..7758fe0bddc 100644 --- a/src/Databases/DatabaseFactory.cpp +++ b/src/Databases/DatabaseFactory.cpp @@ -169,15 +169,17 @@ DatabasePtr DatabaseFactory::getImpl(const ASTCreateQuery & create, const String { const ASTFunction * engine = engine_define->engine; - if (!engine->arguments || engine->arguments->children.size() != 2) - throw Exception("Replicated database requires zoo_path and replica_name arguments", ErrorCodes::BAD_ARGUMENTS); + if (!engine->arguments || engine->arguments->children.size() != 3) + throw Exception("Replicated database requires 3 arguments: zookeeper path, shard name and replica name", ErrorCodes::BAD_ARGUMENTS); const auto & arguments = engine->arguments->children; - const auto & zoo_path = safeGetLiteralValue(arguments[0], "Replicated"); - const auto & replica_name = safeGetLiteralValue(arguments[1], "Replicated"); + //TODO allow macros in arguments + const auto & zookeeper_path = safeGetLiteralValue(arguments[0], "Replicated"); + const auto & shard_name = safeGetLiteralValue(arguments[1], "Replicated"); + const auto & replica_name = safeGetLiteralValue(arguments[2], "Replicated"); - return std::make_shared(database_name, metadata_path, uuid, zoo_path, replica_name, context); + return std::make_shared(database_name, metadata_path, uuid, zookeeper_path, shard_name, replica_name, context); } throw Exception("Unknown database engine: " + engine_name, ErrorCodes::UNKNOWN_DATABASE_ENGINE); diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 7fb7be61d35..145b3abba00 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -24,6 +24,7 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; } +//FIXME never used void DatabaseReplicated::setZooKeeper(zkutil::ZooKeeperPtr zookeeper) { std::lock_guard lock(current_zookeeper_mutex); @@ -50,16 +51,16 @@ DatabaseReplicated::DatabaseReplicated( const String & metadata_path_, UUID uuid, const String & zookeeper_path_, + const String & shard_name_, const String & replica_name_, Context & context_) : DatabaseAtomic(name_, metadata_path_, uuid, "DatabaseReplicated (" + name_ + ")", context_) , zookeeper_path(zookeeper_path_) + , shard_name(shard_name_) , replica_name(replica_name_) { - if (zookeeper_path.empty() || replica_name.empty()) - { - throw Exception("ZooKeeper path and replica name must be non-empty", ErrorCodes::BAD_ARGUMENTS); - } + if (zookeeper_path.empty() || shard_name.empty() || replica_name.empty()) + throw Exception("ZooKeeper path and shard and replica names must be non-empty", ErrorCodes::BAD_ARGUMENTS); if (zookeeper_path.back() == '/') zookeeper_path.resize(zookeeper_path.size() - 1); @@ -79,10 +80,12 @@ DatabaseReplicated::DatabaseReplicated( /// New database if (!current_zookeeper->exists(zookeeper_path)) { - createDatabaseZKNodes(); - /// Old replica recovery + createDatabaseZooKeeperNodes(); } - else if (current_zookeeper->exists(zookeeper_path + "/replicas/" + replica_name)) + + /// Attach existing replica + //TODO better protection from wrong replica names + if (current_zookeeper->exists(zookeeper_path + "/replicas/" + replica_name)) { String remote_last_entry = current_zookeeper->get(zookeeper_path + "/replicas/" + replica_name, {}, nullptr); @@ -106,17 +109,23 @@ DatabaseReplicated::DatabaseReplicated( } else { + //FIXME throw Exception( "Replica name might be in use by a different node. Please check replica_name parameter. 
Remove .last_entry file from " "metadata to create a new replica.", ErrorCodes::LOGICAL_ERROR); } } + else + { + createReplicaZooKeeperNodes(); + } snapshot_period = context_.getConfigRef().getInt("database_replicated_snapshot_period", 10); feedback_timeout = context_.getConfigRef().getInt("database_replicated_feedback_timeout", 0); LOG_DEBUG(log, "Snapshot period is set to {} log entries per one snapshot", snapshot_period); + //TODO do we need separate pool? background_log_executor = context_.getReplicatedSchedulePool().createTask( database_name + "(DatabaseReplicated::background_executor)", [this] { runBackgroundLogExecutor(); } ); @@ -124,7 +133,7 @@ DatabaseReplicated::DatabaseReplicated( background_log_executor->scheduleAfter(500); } -void DatabaseReplicated::createDatabaseZKNodes() +void DatabaseReplicated::createDatabaseZooKeeperNodes() { current_zookeeper = getZooKeeper(); @@ -136,6 +145,11 @@ void DatabaseReplicated::createDatabaseZKNodes() current_zookeeper->createIfNotExists(zookeeper_path + "/replicas", String()); } +void DatabaseReplicated::createReplicaZooKeeperNodes() +{ + current_zookeeper->create(zookeeper_path + "/replicas/" + replica_name, "", zkutil::CreateMode::Persistent); +} + void DatabaseReplicated::removeOutdatedSnapshotsAndLog() { /// This method removes all snapshots and logged queries @@ -151,6 +165,9 @@ void DatabaseReplicated::removeOutdatedSnapshotsAndLog() /// to a greater one than the least advanced current replica. current_zookeeper = getZooKeeper(); Strings replica_states = current_zookeeper->getChildren(zookeeper_path + "/replicas"); + //TODO do not use log pointers to determine which entries to remove if there are staled pointers. + // We can just remove all entries older than previous snapshot version. + // Possible invariant: store all entries since last snapshot, replica becomes lost when it cannot get log entry. 
auto least_advanced = std::min_element(replica_states.begin(), replica_states.end()); Strings snapshots = current_zookeeper->getChildren(zookeeper_path + "/snapshots"); diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index 62997e953ac..375118e7356 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -35,7 +35,9 @@ namespace DB class DatabaseReplicated : public DatabaseAtomic { public: - DatabaseReplicated(const String & name_, const String & metadata_path_, UUID uuid, const String & zookeeper_path_, const String & replica_name_, Context & context); + DatabaseReplicated(const String & name_, const String & metadata_path_, UUID uuid, + const String & zookeeper_path_, const String & shard_name_, const String & replica_name_, + Context & context); void drop(const Context & /*context*/) override; @@ -45,11 +47,9 @@ public: BlockIO getFeedback(); - String zookeeper_path; - String replica_name; - private: - void createDatabaseZKNodes(); + void createDatabaseZooKeeperNodes(); + void createReplicaZooKeeperNodes(); void runBackgroundLogExecutor(); void executeLogName(const String &); @@ -59,6 +59,10 @@ private: void createSnapshot(); void removeOutdatedSnapshotsAndLog(); + String zookeeper_path; + String shard_name; + String replica_name; + std::unique_ptr current_context; // to run executeQuery std::mutex log_name_mutex; diff --git a/src/Databases/IDatabase.h b/src/Databases/IDatabase.h index eeb69a97092..393e8f2d10c 100644 --- a/src/Databases/IDatabase.h +++ b/src/Databases/IDatabase.h @@ -197,7 +197,7 @@ public: const StoragePtr & /*table*/, const ASTPtr & /*query*/) { - throw Exception("There is no CREATE TABLE query for Database " + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); + throw Exception("There is no CREATE TABLE query for Database" + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); } /// Add the dictionary to the database. Record its presence in the metadata. @@ -206,7 +206,7 @@ public: const String & /*dictionary_name*/, const ASTPtr & /*query*/) { - throw Exception("There is no CREATE DICTIONARY query for Database " + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); + throw Exception("There is no CREATE DICTIONARY query for Database" + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); } /// Delete the table from the database, drop table and delete the metadata. @@ -215,7 +215,7 @@ public: const String & /*name*/, [[maybe_unused]] bool no_delay = false) { - throw Exception("There is no DROP TABLE query for Database " + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); + throw Exception("There is no DROP TABLE query for Database" + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); } /// Delete the dictionary from the database. Delete the metadata. @@ -223,32 +223,32 @@ public: const Context & /*context*/, const String & /*dictionary_name*/) { - throw Exception("There is no DROP DICTIONARY query for Database " + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); + throw Exception("There is no DROP DICTIONARY query for Database" + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); } /// Add a table to the database, but do not add it to the metadata. The database may not support this method. 
virtual void attachTable(const String & /*name*/, const StoragePtr & /*table*/, [[maybe_unused]] const String & relative_table_path = {}) { - throw Exception("There is no ATTACH TABLE query for Database " + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); + throw Exception("There is no ATTACH TABLE query for Database" + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); } /// Add dictionary to the database, but do not add it to the metadata. The database may not support this method. /// If dictionaries_lazy_load is false it also starts loading the dictionary asynchronously. virtual void attachDictionary(const String & /* dictionary_name */, const DictionaryAttachInfo & /* attach_info */) { - throw Exception("There is no ATTACH DICTIONARY query for Database " + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); + throw Exception("There is no ATTACH DICTIONARY query for Database" + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); } /// Forget about the table without deleting it, and return it. The database may not support this method. virtual StoragePtr detachTable(const String & /*name*/) { - throw Exception("There is no DETACH TABLE query for Database " + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); + throw Exception("There is no DETACH TABLE query for Database" + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); } /// Forget about the dictionary without deleting it. The database may not support this method. virtual void detachDictionary(const String & /*name*/) { - throw Exception("There is no DETACH DICTIONARY query for Database " + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); + throw Exception("There is no DETACH DICTIONARY query for Database" + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); } /// Rename the table and possibly move the table to another database. @@ -352,14 +352,14 @@ protected: virtual ASTPtr getCreateTableQueryImpl(const String & /*name*/, const Context & /*context*/, bool throw_on_error) const { if (throw_on_error) - throw Exception("There is no SHOW CREATE TABLE query for Database " + getEngineName(), ErrorCodes::CANNOT_GET_CREATE_TABLE_QUERY); + throw Exception("There is no SHOW CREATE TABLE query for Database" + getEngineName(), ErrorCodes::CANNOT_GET_CREATE_TABLE_QUERY); return nullptr; } virtual ASTPtr getCreateDictionaryQueryImpl(const String & /*name*/, bool throw_on_error) const { if (throw_on_error) - throw Exception("There is no SHOW CREATE DICTIONARY query for Database " + getEngineName(), ErrorCodes::CANNOT_GET_CREATE_DICTIONARY_QUERY); + throw Exception("There is no SHOW CREATE DICTIONARY query for Database" + getEngineName(), ErrorCodes::CANNOT_GET_CREATE_DICTIONARY_QUERY); return nullptr; } diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index 32d0e25bde5..4e2dcc98767 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -759,6 +759,7 @@ void DDLWorker::processTask(DDLTask & task) else if (code == Coordination::Error::ZNONODE) { /// There is no parent + //TODO why not to create parent before active_node? 
createStatusDirs(task.entry_path, zookeeper); if (Coordination::Error::ZOK != zookeeper->tryCreate(active_node_path, "", zkutil::CreateMode::Ephemeral, dummy)) throw Coordination::Exception(code, active_node_path); From cbcdee0cf9f735e9c8545f32fe73579d01bbb9a5 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Tue, 3 Nov 2020 16:47:26 +0300 Subject: [PATCH 0058/2357] split DDLWorker.cpp --- src/Interpreters/DDLTask.cpp | 81 +++ src/Interpreters/DDLTask.h | 88 ++++ src/Interpreters/DDLWorker.cpp | 479 +----------------- src/Interpreters/DDLWorker.h | 22 +- src/Interpreters/InterpreterAlterQuery.cpp | 2 +- src/Interpreters/InterpreterCreateQuery.cpp | 3 +- .../InterpreterCreateQuotaQuery.cpp | 2 +- .../InterpreterCreateRoleQuery.cpp | 2 +- .../InterpreterCreateRowPolicyQuery.cpp | 2 +- .../InterpreterCreateSettingsProfileQuery.cpp | 2 +- .../InterpreterCreateUserQuery.cpp | 2 +- .../InterpreterDropAccessEntityQuery.cpp | 2 +- src/Interpreters/InterpreterDropQuery.cpp | 2 +- src/Interpreters/InterpreterGrantQuery.cpp | 2 +- .../InterpreterKillQueryQuery.cpp | 2 +- src/Interpreters/InterpreterOptimizeQuery.cpp | 2 +- src/Interpreters/InterpreterRenameQuery.cpp | 2 +- src/Interpreters/InterpreterSystemQuery.cpp | 2 +- src/Interpreters/executeDDLQueryOnCluster.cpp | 317 ++++++++++++ src/Interpreters/executeDDLQueryOnCluster.h | 63 +++ src/Interpreters/ya.make | 2 + 21 files changed, 576 insertions(+), 505 deletions(-) create mode 100644 src/Interpreters/DDLTask.cpp create mode 100644 src/Interpreters/DDLTask.h create mode 100644 src/Interpreters/executeDDLQueryOnCluster.cpp create mode 100644 src/Interpreters/executeDDLQueryOnCluster.h diff --git a/src/Interpreters/DDLTask.cpp b/src/Interpreters/DDLTask.cpp new file mode 100644 index 00000000000..dfb8f5ff746 --- /dev/null +++ b/src/Interpreters/DDLTask.cpp @@ -0,0 +1,81 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int UNKNOWN_FORMAT_VERSION; +} + +HostID HostID::fromString(const String & host_port_str) +{ + HostID res; + std::tie(res.host_name, res.port) = Cluster::Address::fromString(host_port_str); + return res; +} + +bool HostID::isLocalAddress(UInt16 clickhouse_port) const +{ + try + { + return DB::isLocalAddress(DNSResolver::instance().resolveAddress(host_name, port), clickhouse_port); + } + catch (const Poco::Net::NetException &) + { + /// Avoid "Host not found" exceptions + return false; + } +} + + +String DDLLogEntry::toString() const +{ + WriteBufferFromOwnString wb; + + Strings host_id_strings(hosts.size()); + std::transform(hosts.begin(), hosts.end(), host_id_strings.begin(), HostID::applyToString); + + auto version = CURRENT_VERSION; + wb << "version: " << version << "\n"; + wb << "query: " << escape << query << "\n"; + wb << "hosts: " << host_id_strings << "\n"; + wb << "initiator: " << initiator << "\n"; + + return wb.str(); +} + +void DDLLogEntry::parse(const String & data) +{ + ReadBufferFromString rb(data); + + int version; + rb >> "version: " >> version >> "\n"; + + if (version != CURRENT_VERSION) + throw Exception(ErrorCodes::UNKNOWN_FORMAT_VERSION, "Unknown DDLLogEntry format version: {}", version); + + Strings host_id_strings; + rb >> "query: " >> escape >> query >> "\n"; + rb >> "hosts: " >> host_id_strings >> "\n"; + + if (!rb.eof()) + rb >> "initiator: " >> initiator >> "\n"; + else + initiator.clear(); + + assertEOF(rb); + + hosts.resize(host_id_strings.size()); + std::transform(host_id_strings.begin(), 
host_id_strings.end(), hosts.begin(), HostID::fromString); +} + + +} diff --git a/src/Interpreters/DDLTask.h b/src/Interpreters/DDLTask.h new file mode 100644 index 00000000000..51f09efd0bd --- /dev/null +++ b/src/Interpreters/DDLTask.h @@ -0,0 +1,88 @@ +#pragma once +#include +#include + + +namespace DB +{ + +class ASTQueryWithOnCluster; + +struct HostID +{ + String host_name; + UInt16 port; + + HostID() = default; + + explicit HostID(const Cluster::Address & address) + : host_name(address.host_name), port(address.port) {} + + static HostID fromString(const String & host_port_str); + + String toString() const + { + return Cluster::Address::toString(host_name, port); + } + + String readableString() const + { + return host_name + ":" + DB::toString(port); + } + + bool isLocalAddress(UInt16 clickhouse_port) const; + + static String applyToString(const HostID & host_id) + { + return host_id.toString(); + } +}; + + +struct DDLLogEntry +{ + String query; + std::vector hosts; + String initiator; // optional + + static constexpr int CURRENT_VERSION = 1; + + String toString() const; + + void parse(const String & data); +}; + + +struct DDLTask +{ + /// Stages of task lifetime correspond ordering of these data fields: + + /// Stage 1: parse entry + String entry_name; + String entry_path; + DDLLogEntry entry; + + /// Stage 2: resolve host_id and check that + HostID host_id; + String host_id_str; + + /// Stage 3.1: parse query + ASTPtr query; + ASTQueryWithOnCluster * query_on_cluster = nullptr; + + /// Stage 3.2: check cluster and find the host in cluster + String cluster_name; + ClusterPtr cluster; + Cluster::Address address_in_cluster; + size_t host_shard_num; + size_t host_replica_num; + + /// Stage 3.3: execute query + ExecutionStatus execution_status; + bool was_executed = false; + + /// Stage 4: commit results to ZooKeeper +}; + + +} diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index 4e2dcc98767..2c454db4787 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -9,37 +10,21 @@ #include #include #include -#include #include #include -#include -#include #include #include -#include #include -#include -#include -#include -#include #include -#include #include #include #include #include -#include -#include -#include -#include -#include -#include -#include #include #include -#include #include #include +#include #include #include @@ -51,7 +36,6 @@ namespace ErrorCodes { extern const int NOT_IMPLEMENTED; extern const int LOGICAL_ERROR; - extern const int UNKNOWN_FORMAT_VERSION; extern const int INCONSISTENT_CLUSTER_DEFINITION; extern const int TIMEOUT_EXCEEDED; extern const int UNKNOWN_TYPE_OF_QUERY; @@ -60,141 +44,6 @@ namespace ErrorCodes } -namespace -{ - -struct HostID -{ - String host_name; - UInt16 port; - - HostID() = default; - - explicit HostID(const Cluster::Address & address) - : host_name(address.host_name), port(address.port) {} - - static HostID fromString(const String & host_port_str) - { - HostID res; - std::tie(res.host_name, res.port) = Cluster::Address::fromString(host_port_str); - return res; - } - - String toString() const - { - return Cluster::Address::toString(host_name, port); - } - - String readableString() const - { - return host_name + ":" + DB::toString(port); - } - - bool isLocalAddress(UInt16 clickhouse_port) const - { - try - { - return DB::isLocalAddress(DNSResolver::instance().resolveAddress(host_name, port), clickhouse_port); - } - catch (const 
Poco::Net::NetException &) - { - /// Avoid "Host not found" exceptions - return false; - } - } - - static String applyToString(const HostID & host_id) - { - return host_id.toString(); - } -}; - -} - - -struct DDLLogEntry -{ - String query; - std::vector hosts; - String initiator; // optional - - static constexpr int CURRENT_VERSION = 1; - - String toString() - { - WriteBufferFromOwnString wb; - - Strings host_id_strings(hosts.size()); - std::transform(hosts.begin(), hosts.end(), host_id_strings.begin(), HostID::applyToString); - - auto version = CURRENT_VERSION; - wb << "version: " << version << "\n"; - wb << "query: " << escape << query << "\n"; - wb << "hosts: " << host_id_strings << "\n"; - wb << "initiator: " << initiator << "\n"; - - return wb.str(); - } - - void parse(const String & data) - { - ReadBufferFromString rb(data); - - int version; - rb >> "version: " >> version >> "\n"; - - if (version != CURRENT_VERSION) - throw Exception(ErrorCodes::UNKNOWN_FORMAT_VERSION, "Unknown DDLLogEntry format version: {}", version); - - Strings host_id_strings; - rb >> "query: " >> escape >> query >> "\n"; - rb >> "hosts: " >> host_id_strings >> "\n"; - - if (!rb.eof()) - rb >> "initiator: " >> initiator >> "\n"; - else - initiator.clear(); - - assertEOF(rb); - - hosts.resize(host_id_strings.size()); - std::transform(host_id_strings.begin(), host_id_strings.end(), hosts.begin(), HostID::fromString); - } -}; - - -struct DDLTask -{ - /// Stages of task lifetime correspond ordering of these data fields: - - /// Stage 1: parse entry - String entry_name; - String entry_path; - DDLLogEntry entry; - - /// Stage 2: resolve host_id and check that - HostID host_id; - String host_id_str; - - /// Stage 3.1: parse query - ASTPtr query; - ASTQueryWithOnCluster * query_on_cluster = nullptr; - - /// Stage 3.2: check cluster and find the host in cluster - String cluster_name; - ClusterPtr cluster; - Cluster::Address address_in_cluster; - size_t host_shard_num; - size_t host_replica_num; - - /// Stage 3.3: execute query - ExecutionStatus execution_status; - bool was_executed = false; - - /// Stage 4: commit results to ZooKeeper -}; - - namespace { @@ -293,21 +142,6 @@ std::unique_ptr createSimpleZooKeeperLock( } -static bool isSupportedAlterType(int type) -{ - static const std::unordered_set unsupported_alter_types{ - ASTAlterCommand::ATTACH_PARTITION, - ASTAlterCommand::REPLACE_PARTITION, - ASTAlterCommand::FETCH_PARTITION, - ASTAlterCommand::FREEZE_PARTITION, - ASTAlterCommand::FREEZE_ALL, - ASTAlterCommand::NO_TYPE, - }; - - return unsupported_alter_types.count(type) == 0; -} - - DDLWorker::DDLWorker(int pool_size_, const std::string & zk_root_dir, Context & context_, const Poco::Util::AbstractConfiguration * config, const String & prefix) : context(context_) , log(&Poco::Logger::get("DDLWorker")) @@ -1187,313 +1021,4 @@ void DDLWorker::runCleanupThread() } -class DDLQueryStatusInputStream : public IBlockInputStream -{ -public: - - DDLQueryStatusInputStream(const String & zk_node_path, const DDLLogEntry & entry, const Context & context_) - : node_path(zk_node_path), context(context_), watch(CLOCK_MONOTONIC_COARSE), log(&Poco::Logger::get("DDLQueryStatusInputStream")) - { - sample = Block{ - {std::make_shared(), "host"}, - {std::make_shared(), "port"}, - {std::make_shared(), "status"}, - {std::make_shared(), "error"}, - {std::make_shared(), "num_hosts_remaining"}, - {std::make_shared(), "num_hosts_active"}, - }; - - for (const HostID & host: entry.hosts) - waiting_hosts.emplace(host.toString()); - - 
addTotalRowsApprox(entry.hosts.size()); - - timeout_seconds = context.getSettingsRef().distributed_ddl_task_timeout; - } - - String getName() const override - { - return "DDLQueryStatusInputStream"; - } - - Block getHeader() const override { return sample; } - - Block readImpl() override - { - Block res; - if (num_hosts_finished >= waiting_hosts.size()) - { - if (first_exception) - throw Exception(*first_exception); - - return res; - } - - auto zookeeper = context.getZooKeeper(); - size_t try_number = 0; - - while (res.rows() == 0) - { - if (isCancelled()) - { - if (first_exception) - throw Exception(*first_exception); - - return res; - } - - if (timeout_seconds >= 0 && watch.elapsedSeconds() > timeout_seconds) - { - size_t num_unfinished_hosts = waiting_hosts.size() - num_hosts_finished; - size_t num_active_hosts = current_active_hosts.size(); - - - throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, - "Watching task {} is executing longer than distributed_ddl_task_timeout (={}) seconds. " - "There are {} unfinished hosts ({} of them are currently active), they are going to execute the query in background", - node_path, timeout_seconds, num_unfinished_hosts, num_active_hosts); - } - - if (num_hosts_finished != 0 || try_number != 0) - { - sleepForMilliseconds(std::min(1000, 50 * (try_number + 1))); - } - - /// TODO: add shared lock - if (!zookeeper->exists(node_path)) - { - throw Exception(ErrorCodes::UNFINISHED, - "Cannot provide query execution status. The query's node {} has been deleted by the cleaner since it was finished (or its lifetime is expired)", - node_path); - } - - Strings new_hosts = getNewAndUpdate(getChildrenAllowNoNode(zookeeper, node_path + "/finished")); - ++try_number; - if (new_hosts.empty()) - continue; - - current_active_hosts = getChildrenAllowNoNode(zookeeper, node_path + "/active"); - - MutableColumns columns = sample.cloneEmptyColumns(); - for (const String & host_id : new_hosts) - { - ExecutionStatus status(-1, "Cannot obtain error message"); - { - String status_data; - if (zookeeper->tryGet(node_path + "/finished/" + host_id, status_data)) - status.tryDeserializeText(status_data); - } - - auto [host, port] = Cluster::Address::fromString(host_id); - - if (status.code != 0 && first_exception == nullptr) - first_exception = std::make_unique(status.code, "There was an error on [{}:{}]: {}", host, port, status.message); - - ++num_hosts_finished; - - columns[0]->insert(host); - columns[1]->insert(port); - columns[2]->insert(status.code); - columns[3]->insert(status.message); - columns[4]->insert(waiting_hosts.size() - num_hosts_finished); - columns[5]->insert(current_active_hosts.size()); - } - res = sample.cloneWithColumns(std::move(columns)); - } - - return res; - } - - Block getSampleBlock() const - { - return sample.cloneEmpty(); - } - - ~DDLQueryStatusInputStream() override = default; - -private: - - static Strings getChildrenAllowNoNode(const std::shared_ptr & zookeeper, const String & node_path) - { - Strings res; - Coordination::Error code = zookeeper->tryGetChildren(node_path, res); - if (code != Coordination::Error::ZOK && code != Coordination::Error::ZNONODE) - throw Coordination::Exception(code, node_path); - return res; - } - - Strings getNewAndUpdate(const Strings & current_list_of_finished_hosts) - { - Strings diff; - for (const String & host : current_list_of_finished_hosts) - { - if (!waiting_hosts.count(host)) - { - if (!ignoring_hosts.count(host)) - { - ignoring_hosts.emplace(host); - LOG_INFO(log, "Unexpected host {} appeared in task {}", host, 
node_path); - } - continue; - } - - if (!finished_hosts.count(host)) - { - diff.emplace_back(host); - finished_hosts.emplace(host); - } - } - - return diff; - } - - String node_path; - const Context & context; - Stopwatch watch; - Poco::Logger * log; - - Block sample; - - NameSet waiting_hosts; /// hosts from task host list - NameSet finished_hosts; /// finished hosts from host list - NameSet ignoring_hosts; /// appeared hosts that are not in hosts list - Strings current_active_hosts; /// Hosts that were in active state at the last check - size_t num_hosts_finished = 0; - - /// Save the first detected error and throw it at the end of execution - std::unique_ptr first_exception; - - Int64 timeout_seconds = 120; -}; - - -BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr_, const Context & context, AccessRightsElements && query_requires_access, bool query_requires_grant_option) -{ - /// Remove FORMAT and INTO OUTFILE if exists - ASTPtr query_ptr = query_ptr_->clone(); - ASTQueryWithOutput::resetOutputASTIfExist(*query_ptr); - - // XXX: serious design flaw since `ASTQueryWithOnCluster` is not inherited from `IAST`! - auto * query = dynamic_cast(query_ptr.get()); - if (!query) - { - throw Exception("Distributed execution is not supported for such DDL queries", ErrorCodes::NOT_IMPLEMENTED); - } - - if (!context.getSettingsRef().allow_distributed_ddl) - throw Exception("Distributed DDL queries are prohibited for the user", ErrorCodes::QUERY_IS_PROHIBITED); - - if (const auto * query_alter = query_ptr->as()) - { - for (const auto & command : query_alter->command_list->commands) - { - if (!isSupportedAlterType(command->type)) - throw Exception("Unsupported type of ALTER query", ErrorCodes::NOT_IMPLEMENTED); - } - } - - query->cluster = context.getMacros()->expand(query->cluster); - ClusterPtr cluster = context.getCluster(query->cluster); - DDLWorker & ddl_worker = context.getDDLWorker(); - - /// Enumerate hosts which will be used to send query. - Cluster::AddressesWithFailover shards = cluster->getShardsAddresses(); - std::vector hosts; - for (const auto & shard : shards) - { - for (const auto & addr : shard) - hosts.emplace_back(addr); - } - - if (hosts.empty()) - throw Exception("No hosts defined to execute distributed DDL query", ErrorCodes::LOGICAL_ERROR); - - /// The current database in a distributed query need to be replaced with either - /// the local current database or a shard's default database. 
- bool need_replace_current_database - = (std::find_if( - query_requires_access.begin(), - query_requires_access.end(), - [](const AccessRightsElement & elem) { return elem.isEmptyDatabase(); }) - != query_requires_access.end()); - - bool use_local_default_database = false; - const String & current_database = context.getCurrentDatabase(); - - if (need_replace_current_database) - { - Strings shard_default_databases; - for (const auto & shard : shards) - { - for (const auto & addr : shard) - { - if (!addr.default_database.empty()) - shard_default_databases.push_back(addr.default_database); - else - use_local_default_database = true; - } - } - std::sort(shard_default_databases.begin(), shard_default_databases.end()); - shard_default_databases.erase(std::unique(shard_default_databases.begin(), shard_default_databases.end()), shard_default_databases.end()); - assert(use_local_default_database || !shard_default_databases.empty()); - - if (use_local_default_database && !shard_default_databases.empty()) - throw Exception("Mixed local default DB and shard default DB in DDL query", ErrorCodes::NOT_IMPLEMENTED); - - if (use_local_default_database) - { - query_requires_access.replaceEmptyDatabase(current_database); - } - else - { - for (size_t i = 0; i != query_requires_access.size();) - { - auto & element = query_requires_access[i]; - if (element.isEmptyDatabase()) - { - query_requires_access.insert(query_requires_access.begin() + i + 1, shard_default_databases.size() - 1, element); - for (size_t j = 0; j != shard_default_databases.size(); ++j) - query_requires_access[i + j].replaceEmptyDatabase(shard_default_databases[j]); - i += shard_default_databases.size(); - } - else - ++i; - } - } - } - - AddDefaultDatabaseVisitor visitor(current_database, !use_local_default_database); - visitor.visitDDL(query_ptr); - - /// Check access rights, assume that all servers have the same users config - if (query_requires_grant_option) - context.getAccess()->checkGrantOption(query_requires_access); - else - context.checkAccess(query_requires_access); - - DDLLogEntry entry; - entry.hosts = std::move(hosts); - entry.query = queryToString(query_ptr); - entry.initiator = ddl_worker.getCommonHostID(); - String node_path = ddl_worker.enqueueQuery(entry); - - BlockIO io; - if (context.getSettingsRef().distributed_ddl_task_timeout == 0) - return io; - - auto stream = std::make_shared(node_path, entry, context); - io.in = std::move(stream); - return io; -} - -BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr, const Context & context, const AccessRightsElements & query_requires_access, bool query_requires_grant_option) -{ - return executeDDLQueryOnCluster(query_ptr, context, AccessRightsElements{query_requires_access}, query_requires_grant_option); -} - -BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr_, const Context & context) -{ - return executeDDLQueryOnCluster(query_ptr_, context, {}); -} - } diff --git a/src/Interpreters/DDLWorker.h b/src/Interpreters/DDLWorker.h index 39cdcab709e..caa2242caf8 100644 --- a/src/Interpreters/DDLWorker.h +++ b/src/Interpreters/DDLWorker.h @@ -1,11 +1,9 @@ #pragma once -#include -#include #include #include -#include -#include +#include +#include #include #include @@ -18,23 +16,22 @@ namespace zkutil class ZooKeeper; } +namespace Poco +{ + class Logger; + namespace Util { class AbstractConfiguration; } +} + namespace DB { class Context; class ASTAlterQuery; -class AccessRightsElements; struct DDLLogEntry; struct DDLTask; using DDLTaskPtr = std::unique_ptr; -/// Pushes 
distributed DDL query to the queue -BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr, const Context & context); -BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr, const Context & context, const AccessRightsElements & query_requires_access, bool query_requires_grant_option = false); -BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr, const Context & context, AccessRightsElements && query_requires_access, bool query_requires_grant_option = false); - - class DDLWorker { public: @@ -137,9 +134,6 @@ private: size_t max_tasks_in_queue = 1000; ThreadGroupStatusPtr thread_group; - - friend class DDLQueryStatusInputStream; - friend struct DDLTask; }; diff --git a/src/Interpreters/InterpreterAlterQuery.cpp b/src/Interpreters/InterpreterAlterQuery.cpp index e229cb120e5..013e30a3ed5 100644 --- a/src/Interpreters/InterpreterAlterQuery.cpp +++ b/src/Interpreters/InterpreterAlterQuery.cpp @@ -1,5 +1,5 @@ #include -#include +#include #include #include #include diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 0f7d441c0d6..04c5efce3e2 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -28,7 +28,8 @@ #include #include -#include +#include +#include #include #include #include diff --git a/src/Interpreters/InterpreterCreateQuotaQuery.cpp b/src/Interpreters/InterpreterCreateQuotaQuery.cpp index f45c2c9709d..ff30a2fff47 100644 --- a/src/Interpreters/InterpreterCreateQuotaQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuotaQuery.cpp @@ -2,7 +2,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Interpreters/InterpreterCreateRoleQuery.cpp b/src/Interpreters/InterpreterCreateRoleQuery.cpp index 2fa04eebae1..72ad3234b95 100644 --- a/src/Interpreters/InterpreterCreateRoleQuery.cpp +++ b/src/Interpreters/InterpreterCreateRoleQuery.cpp @@ -1,7 +1,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/Interpreters/InterpreterCreateRowPolicyQuery.cpp b/src/Interpreters/InterpreterCreateRowPolicyQuery.cpp index 9dacc9d1bf4..8f1c5b061e0 100644 --- a/src/Interpreters/InterpreterCreateRowPolicyQuery.cpp +++ b/src/Interpreters/InterpreterCreateRowPolicyQuery.cpp @@ -4,7 +4,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Interpreters/InterpreterCreateSettingsProfileQuery.cpp b/src/Interpreters/InterpreterCreateSettingsProfileQuery.cpp index 2d5f4d499b7..b65225db16c 100644 --- a/src/Interpreters/InterpreterCreateSettingsProfileQuery.cpp +++ b/src/Interpreters/InterpreterCreateSettingsProfileQuery.cpp @@ -2,7 +2,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Interpreters/InterpreterCreateUserQuery.cpp b/src/Interpreters/InterpreterCreateUserQuery.cpp index 111f698beb9..c9b087de5b4 100644 --- a/src/Interpreters/InterpreterCreateUserQuery.cpp +++ b/src/Interpreters/InterpreterCreateUserQuery.cpp @@ -1,7 +1,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Interpreters/InterpreterDropAccessEntityQuery.cpp b/src/Interpreters/InterpreterDropAccessEntityQuery.cpp index d79d239ee12..e86f8361100 100644 --- a/src/Interpreters/InterpreterDropAccessEntityQuery.cpp +++ b/src/Interpreters/InterpreterDropAccessEntityQuery.cpp @@ -2,7 +2,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Interpreters/InterpreterDropQuery.cpp 
b/src/Interpreters/InterpreterDropQuery.cpp index 48eb20485be..0f03525f237 100644 --- a/src/Interpreters/InterpreterDropQuery.cpp +++ b/src/Interpreters/InterpreterDropQuery.cpp @@ -2,7 +2,7 @@ #include #include -#include +#include #include #include #include diff --git a/src/Interpreters/InterpreterGrantQuery.cpp b/src/Interpreters/InterpreterGrantQuery.cpp index 6f45687a4e1..dafe4d2e18c 100644 --- a/src/Interpreters/InterpreterGrantQuery.cpp +++ b/src/Interpreters/InterpreterGrantQuery.cpp @@ -2,7 +2,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Interpreters/InterpreterKillQueryQuery.cpp b/src/Interpreters/InterpreterKillQueryQuery.cpp index 0f7da8f1f58..c50659c6c45 100644 --- a/src/Interpreters/InterpreterKillQueryQuery.cpp +++ b/src/Interpreters/InterpreterKillQueryQuery.cpp @@ -2,7 +2,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Interpreters/InterpreterOptimizeQuery.cpp b/src/Interpreters/InterpreterOptimizeQuery.cpp index 680dd9b803b..431d5074cde 100644 --- a/src/Interpreters/InterpreterOptimizeQuery.cpp +++ b/src/Interpreters/InterpreterOptimizeQuery.cpp @@ -1,7 +1,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Interpreters/InterpreterRenameQuery.cpp b/src/Interpreters/InterpreterRenameQuery.cpp index 65ed33bd9db..3a375e2ba60 100644 --- a/src/Interpreters/InterpreterRenameQuery.cpp +++ b/src/Interpreters/InterpreterRenameQuery.cpp @@ -3,7 +3,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Interpreters/InterpreterSystemQuery.cpp b/src/Interpreters/InterpreterSystemQuery.cpp index f0a8ce9064d..1b8c3ae79f2 100644 --- a/src/Interpreters/InterpreterSystemQuery.cpp +++ b/src/Interpreters/InterpreterSystemQuery.cpp @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Interpreters/executeDDLQueryOnCluster.cpp b/src/Interpreters/executeDDLQueryOnCluster.cpp new file mode 100644 index 00000000000..6da1704ce55 --- /dev/null +++ b/src/Interpreters/executeDDLQueryOnCluster.cpp @@ -0,0 +1,317 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; + extern const int TIMEOUT_EXCEEDED; + extern const int UNFINISHED; + extern const int QUERY_IS_PROHIBITED; +} + +static bool isSupportedAlterType(int type) +{ + static const std::unordered_set unsupported_alter_types{ + ASTAlterCommand::ATTACH_PARTITION, + ASTAlterCommand::REPLACE_PARTITION, + ASTAlterCommand::FETCH_PARTITION, + ASTAlterCommand::FREEZE_PARTITION, + ASTAlterCommand::FREEZE_ALL, + ASTAlterCommand::NO_TYPE, + }; + + return unsupported_alter_types.count(type) == 0; +} + + +BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr_, const Context & context) +{ + return executeDDLQueryOnCluster(query_ptr_, context, {}); +} + +BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr, const Context & context, const AccessRightsElements & query_requires_access, bool query_requires_grant_option) +{ + return executeDDLQueryOnCluster(query_ptr, context, AccessRightsElements{query_requires_access}, query_requires_grant_option); +} + +BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr_, const Context & context, AccessRightsElements && query_requires_access, bool query_requires_grant_option) +{ + /// Remove FORMAT and INTO 
OUTFILE if exists + ASTPtr query_ptr = query_ptr_->clone(); + ASTQueryWithOutput::resetOutputASTIfExist(*query_ptr); + + // XXX: serious design flaw since `ASTQueryWithOnCluster` is not inherited from `IAST`! + auto * query = dynamic_cast(query_ptr.get()); + if (!query) + { + throw Exception("Distributed execution is not supported for such DDL queries", ErrorCodes::NOT_IMPLEMENTED); + } + + if (!context.getSettingsRef().allow_distributed_ddl) + throw Exception("Distributed DDL queries are prohibited for the user", ErrorCodes::QUERY_IS_PROHIBITED); + + if (const auto * query_alter = query_ptr->as()) + { + for (const auto & command : query_alter->command_list->commands) + { + if (!isSupportedAlterType(command->type)) + throw Exception("Unsupported type of ALTER query", ErrorCodes::NOT_IMPLEMENTED); + } + } + + query->cluster = context.getMacros()->expand(query->cluster); + ClusterPtr cluster = context.getCluster(query->cluster); + DDLWorker & ddl_worker = context.getDDLWorker(); + + /// Enumerate hosts which will be used to send query. + Cluster::AddressesWithFailover shards = cluster->getShardsAddresses(); + std::vector hosts; + for (const auto & shard : shards) + { + for (const auto & addr : shard) + hosts.emplace_back(addr); + } + + if (hosts.empty()) + throw Exception("No hosts defined to execute distributed DDL query", ErrorCodes::LOGICAL_ERROR); + + /// The current database in a distributed query need to be replaced with either + /// the local current database or a shard's default database. + bool need_replace_current_database + = (std::find_if( + query_requires_access.begin(), + query_requires_access.end(), + [](const AccessRightsElement & elem) { return elem.isEmptyDatabase(); }) + != query_requires_access.end()); + + bool use_local_default_database = false; + const String & current_database = context.getCurrentDatabase(); + + if (need_replace_current_database) + { + Strings shard_default_databases; + for (const auto & shard : shards) + { + for (const auto & addr : shard) + { + if (!addr.default_database.empty()) + shard_default_databases.push_back(addr.default_database); + else + use_local_default_database = true; + } + } + std::sort(shard_default_databases.begin(), shard_default_databases.end()); + shard_default_databases.erase(std::unique(shard_default_databases.begin(), shard_default_databases.end()), shard_default_databases.end()); + assert(use_local_default_database || !shard_default_databases.empty()); + + if (use_local_default_database && !shard_default_databases.empty()) + throw Exception("Mixed local default DB and shard default DB in DDL query", ErrorCodes::NOT_IMPLEMENTED); + + if (use_local_default_database) + { + query_requires_access.replaceEmptyDatabase(current_database); + } + else + { + for (size_t i = 0; i != query_requires_access.size();) + { + auto & element = query_requires_access[i]; + if (element.isEmptyDatabase()) + { + query_requires_access.insert(query_requires_access.begin() + i + 1, shard_default_databases.size() - 1, element); + for (size_t j = 0; j != shard_default_databases.size(); ++j) + query_requires_access[i + j].replaceEmptyDatabase(shard_default_databases[j]); + i += shard_default_databases.size(); + } + else + ++i; + } + } + } + + AddDefaultDatabaseVisitor visitor(current_database, !use_local_default_database); + visitor.visitDDL(query_ptr); + + /// Check access rights, assume that all servers have the same users config + if (query_requires_grant_option) + context.getAccess()->checkGrantOption(query_requires_access); + else + 
context.checkAccess(query_requires_access); + + DDLLogEntry entry; + entry.hosts = std::move(hosts); + entry.query = queryToString(query_ptr); + entry.initiator = ddl_worker.getCommonHostID(); + String node_path = ddl_worker.enqueueQuery(entry); + + BlockIO io; + if (context.getSettingsRef().distributed_ddl_task_timeout == 0) + return io; + + auto stream = std::make_shared(node_path, entry, context); + io.in = std::move(stream); + return io; +} + + +DDLQueryStatusInputStream::DDLQueryStatusInputStream(const String & zk_node_path, const DDLLogEntry & entry, const Context & context_) + : node_path(zk_node_path) + , context(context_) + , watch(CLOCK_MONOTONIC_COARSE) + , log(&Poco::Logger::get("DDLQueryStatusInputStream")) +{ + sample = Block{ + {std::make_shared(), "host"}, + {std::make_shared(), "port"}, + {std::make_shared(), "status"}, + {std::make_shared(), "error"}, + {std::make_shared(), "num_hosts_remaining"}, + {std::make_shared(), "num_hosts_active"}, + }; + + for (const HostID & host: entry.hosts) + waiting_hosts.emplace(host.toString()); + + addTotalRowsApprox(entry.hosts.size()); + + timeout_seconds = context.getSettingsRef().distributed_ddl_task_timeout; +} + +Block DDLQueryStatusInputStream::readImpl() +{ + Block res; + if (num_hosts_finished >= waiting_hosts.size()) + { + if (first_exception) + throw Exception(*first_exception); + + return res; + } + + auto zookeeper = context.getZooKeeper(); + size_t try_number = 0; + + while (res.rows() == 0) + { + if (isCancelled()) + { + if (first_exception) + throw Exception(*first_exception); + + return res; + } + + if (timeout_seconds >= 0 && watch.elapsedSeconds() > timeout_seconds) + { + size_t num_unfinished_hosts = waiting_hosts.size() - num_hosts_finished; + size_t num_active_hosts = current_active_hosts.size(); + + + throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, + "Watching task {} is executing longer than distributed_ddl_task_timeout (={}) seconds. " + "There are {} unfinished hosts ({} of them are currently active), they are going to execute the query in background", + node_path, timeout_seconds, num_unfinished_hosts, num_active_hosts); + } + + if (num_hosts_finished != 0 || try_number != 0) + { + sleepForMilliseconds(std::min(1000, 50 * (try_number + 1))); + } + + /// TODO: add shared lock + if (!zookeeper->exists(node_path)) + { + throw Exception(ErrorCodes::UNFINISHED, + "Cannot provide query execution status. 
The query's node {} has been deleted by the cleaner since it was finished (or its lifetime is expired)", + node_path); + } + + Strings new_hosts = getNewAndUpdate(getChildrenAllowNoNode(zookeeper, node_path + "/finished")); + ++try_number; + if (new_hosts.empty()) + continue; + + current_active_hosts = getChildrenAllowNoNode(zookeeper, node_path + "/active"); + + MutableColumns columns = sample.cloneEmptyColumns(); + for (const String & host_id : new_hosts) + { + ExecutionStatus status(-1, "Cannot obtain error message"); + { + String status_data; + if (zookeeper->tryGet(node_path + "/finished/" + host_id, status_data)) + status.tryDeserializeText(status_data); + } + + auto [host, port] = Cluster::Address::fromString(host_id); + + if (status.code != 0 && first_exception == nullptr) + first_exception = std::make_unique(status.code, "There was an error on [{}:{}]: {}", host, port, status.message); + + ++num_hosts_finished; + + columns[0]->insert(host); + columns[1]->insert(port); + columns[2]->insert(status.code); + columns[3]->insert(status.message); + columns[4]->insert(waiting_hosts.size() - num_hosts_finished); + columns[5]->insert(current_active_hosts.size()); + } + res = sample.cloneWithColumns(std::move(columns)); + } + + return res; +} + +Strings DDLQueryStatusInputStream::getChildrenAllowNoNode(const std::shared_ptr & zookeeper, const String & node_path) +{ + Strings res; + Coordination::Error code = zookeeper->tryGetChildren(node_path, res); + if (code != Coordination::Error::ZOK && code != Coordination::Error::ZNONODE) + throw Coordination::Exception(code, node_path); + return res; +} + +Strings DDLQueryStatusInputStream::getNewAndUpdate(const Strings & current_list_of_finished_hosts) +{ + Strings diff; + for (const String & host : current_list_of_finished_hosts) + { + if (!waiting_hosts.count(host)) + { + if (!ignoring_hosts.count(host)) + { + ignoring_hosts.emplace(host); + LOG_INFO(log, "Unexpected host {} appeared in task {}", host, node_path); + } + continue; + } + + if (!finished_hosts.count(host)) + { + diff.emplace_back(host); + finished_hosts.emplace(host); + } + } + + return diff; +} + + +} diff --git a/src/Interpreters/executeDDLQueryOnCluster.h b/src/Interpreters/executeDDLQueryOnCluster.h new file mode 100644 index 00000000000..83880cc94c1 --- /dev/null +++ b/src/Interpreters/executeDDLQueryOnCluster.h @@ -0,0 +1,63 @@ +#pragma once +#include +#include + +namespace zkutil +{ + class ZooKeeper; +} + +namespace DB +{ + +class Context; +class AccessRightsElements; +struct DDLLogEntry; + + +/// Pushes distributed DDL query to the queue. +/// Returns DDLQueryStatusInputStream, which reads results of query execution on each host in the cluster. 
+BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr, const Context & context); +BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr, const Context & context, const AccessRightsElements & query_requires_access, bool query_requires_grant_option = false); +BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr, const Context & context, AccessRightsElements && query_requires_access, bool query_requires_grant_option = false); + + +class DDLQueryStatusInputStream : public IBlockInputStream +{ +public: + DDLQueryStatusInputStream(const String & zk_node_path, const DDLLogEntry & entry, const Context & context_); + + String getName() const override { return "DDLQueryStatusInputStream"; } + + Block getHeader() const override { return sample; } + + Block getSampleBlock() const { return sample.cloneEmpty(); } + + Block readImpl() override; + +private: + + static Strings getChildrenAllowNoNode(const std::shared_ptr & zookeeper, const String & node_path); + + Strings getNewAndUpdate(const Strings & current_list_of_finished_hosts); + + String node_path; + const Context & context; + Stopwatch watch; + Poco::Logger * log; + + Block sample; + + NameSet waiting_hosts; /// hosts from task host list + NameSet finished_hosts; /// finished hosts from host list + NameSet ignoring_hosts; /// appeared hosts that are not in hosts list + Strings current_active_hosts; /// Hosts that were in active state at the last check + size_t num_hosts_finished = 0; + + /// Save the first detected error and throw it at the end of execution + std::unique_ptr first_exception; + + Int64 timeout_seconds = 120; +}; + +} diff --git a/src/Interpreters/ya.make b/src/Interpreters/ya.make index 4c0b64934c7..11a09c40d6a 100644 --- a/src/Interpreters/ya.make +++ b/src/Interpreters/ya.make @@ -45,11 +45,13 @@ SRCS( CrossToInnerJoinVisitor.cpp DatabaseAndTableWithAlias.cpp DatabaseCatalog.cpp + DDLTask.cpp DDLWorker.cpp DictionaryReader.cpp DNSCacheUpdater.cpp EmbeddedDictionaries.cpp evaluateConstantExpression.cpp + executeDDLQueryOnCluster.cpp executeQuery.cpp ExecuteScalarSubqueriesVisitor.cpp ExpressionActions.cpp From 2a6c0b91802de8279a0928e853a3840d94a1413a Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Thu, 5 Nov 2020 12:52:23 +0300 Subject: [PATCH 0059/2357] try reuse DDLWorker in DatabaseReplicated --- src/Databases/DatabaseReplicated.cpp | 206 +++++++++++------- src/Databases/DatabaseReplicated.h | 16 +- src/Databases/IDatabase.h | 6 - src/Interpreters/DDLWorker.cpp | 36 ++- src/Interpreters/DDLWorker.h | 10 +- src/Interpreters/InterpreterAlterQuery.cpp | 8 +- src/Interpreters/InterpreterCreateQuery.cpp | 29 ++- src/Interpreters/InterpreterDropQuery.cpp | 16 +- src/Interpreters/InterpreterRenameQuery.cpp | 11 +- src/Interpreters/executeDDLQueryOnCluster.cpp | 18 +- src/Interpreters/executeDDLQueryOnCluster.h | 5 +- .../test_replicated_database/test.py | 12 +- 12 files changed, 224 insertions(+), 149 deletions(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 145b3abba00..1213b5bc075 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -13,7 +13,10 @@ #include #include #include - +#include +#include +#include +#include namespace DB { @@ -45,6 +48,7 @@ zkutil::ZooKeeperPtr DatabaseReplicated::getZooKeeper() const return res; } +DatabaseReplicated::~DatabaseReplicated() = default; DatabaseReplicated::DatabaseReplicated( const String & name_, @@ -125,12 +129,15 @@ DatabaseReplicated::DatabaseReplicated( feedback_timeout = 
context_.getConfigRef().getInt("database_replicated_feedback_timeout", 0); LOG_DEBUG(log, "Snapshot period is set to {} log entries per one snapshot", snapshot_period); - //TODO do we need separate pool? - background_log_executor = context_.getReplicatedSchedulePool().createTask( - database_name + "(DatabaseReplicated::background_executor)", [this] { runBackgroundLogExecutor(); } - ); + //FIXME use database UUID + ddl_worker = std::make_unique(1, zookeeper_path + "/log", context_, nullptr, String{}, true, database_name, replica_name, shard_name); - background_log_executor->scheduleAfter(500); + //TODO do we need separate pool? + //background_log_executor = context_.getReplicatedSchedulePool().createTask( + // database_name + "(DatabaseReplicated::background_executor)", [this] { runBackgroundLogExecutor(); } + //); + + //background_log_executor->scheduleAfter(500); } void DatabaseReplicated::createDatabaseZooKeeperNodes() @@ -226,7 +233,7 @@ void DatabaseReplicated::runBackgroundLogExecutor() } } - background_log_executor->scheduleAfter(500); + //background_log_executor->scheduleAfter(500); } void DatabaseReplicated::writeLastExecutedToDiskAndZK() @@ -244,95 +251,128 @@ void DatabaseReplicated::writeLastExecutedToDiskAndZK() out.close(); } -void DatabaseReplicated::executeLogName(const String & log_entry_name) +void DatabaseReplicated::executeLogName(const String & /*log_entry_name*/) { - String path = zookeeper_path + "/log/" + log_entry_name; - current_zookeeper = getZooKeeper(); - String query_to_execute = current_zookeeper->get(path, {}, nullptr); - - try - { - current_context = std::make_unique(global_context); - current_context->getClientInfo().query_kind = ClientInfo::QueryKind::REPLICATED_LOG_QUERY; - current_context->setCurrentDatabase(database_name); - current_context->setCurrentQueryId(""); // generate random query_id - executeQuery(query_to_execute, *current_context); - } - catch (const Exception & e) - { - tryLogCurrentException(log, "Query from zookeeper " + query_to_execute + " wasn't finished successfully"); - current_zookeeper->create( - zookeeper_path + "/replicas/" + replica_name + "/errors/" + log_entry_name, e.what(), zkutil::CreateMode::Persistent); - } - - LOG_DEBUG(log, "Executed query: {}", query_to_execute); +// String path = zookeeper_path + "/log/" + log_entry_name; +// current_zookeeper = getZooKeeper(); +// String query_to_execute = current_zookeeper->get(path, {}, nullptr); +// +// try +// { +// current_context = std::make_unique(global_context); +// current_context->getClientInfo().query_kind = ClientInfo::QueryKind::REPLICATED_LOG_QUERY; +// current_context->setCurrentDatabase(database_name); +// current_context->setCurrentQueryId(""); // generate random query_id +// executeQuery(query_to_execute, *current_context); +// } +// catch (const Exception & e) +// { +// tryLogCurrentException(log, "Query from zookeeper " + query_to_execute + " wasn't finished successfully"); +// current_zookeeper->create( +// zookeeper_path + "/replicas/" + replica_name + "/errors/" + log_entry_name, e.what(), zkutil::CreateMode::Persistent); +// } +// +// LOG_DEBUG(log, "Executed query: {}", query_to_execute); } -void DatabaseReplicated::propose(const ASTPtr & query) +BlockIO DatabaseReplicated::propose(const ASTPtr & query) { - current_zookeeper = getZooKeeper(); + //current_zookeeper = getZooKeeper(); - LOG_DEBUG(log, "Proposing query: {}", queryToString(query)); + if (const auto * query_alter = query->as()) { - std::lock_guard lock(log_name_mutex); - 
log_name_to_exec_with_result - = current_zookeeper->create(zookeeper_path + "/log/log-", queryToString(query), zkutil::CreateMode::PersistentSequential); - } - - background_log_executor->schedule(); -} - -BlockIO DatabaseReplicated::getFeedback() -{ - BlockIO res; - if (feedback_timeout == 0) - return res; - - Stopwatch watch; - - NamesAndTypes block_structure = - { - {"replica_name", std::make_shared()}, - {"execution_feedback", std::make_shared()}, - }; - auto replica_name_column = block_structure[0].type->createColumn(); - auto feedback_column = block_structure[1].type->createColumn(); - - current_zookeeper = getZooKeeper(); - Strings replica_states = current_zookeeper->getChildren(zookeeper_path + "/replicas"); - auto replica_iter = replica_states.begin(); - - while (!replica_states.empty() && watch.elapsedSeconds() < feedback_timeout) - { - String last_executed = current_zookeeper->get(zookeeper_path + "/replicas/" + *replica_iter); - if (last_executed > log_name_to_exec_with_result) + for (const auto & command : query_alter->command_list->commands) { - replica_name_column->insert(*replica_iter); - String err_path = zookeeper_path + "/replicas/" + *replica_iter + "/errors/" + log_name_to_exec_with_result; - if (!current_zookeeper->exists(err_path)) - { - feedback_column->insert("OK"); - } - else - { - String feedback = current_zookeeper->get(err_path, {}, nullptr); - feedback_column->insert(feedback); - } - replica_states.erase(replica_iter); - replica_iter = replica_states.begin(); + //FIXME allow all types of queries (maybe we should execute ATTACH an similar queries on leader) + if (!isSupportedAlterType(command->type)) + throw Exception("Unsupported type of ALTER query", ErrorCodes::NOT_IMPLEMENTED); } } - Block block = Block({ - {std::move(replica_name_column), block_structure[0].type, block_structure[0].name}, - {std::move(feedback_column), block_structure[1].type, block_structure[1].name} - }); + LOG_DEBUG(log, "Proposing query: {}", queryToString(query)); - res.in = std::make_shared(block); - return res; + DDLLogEntry entry; + entry.hosts = {}; + entry.query = queryToString(query); + entry.initiator = ddl_worker->getCommonHostID(); + String node_path = ddl_worker->enqueueQuery(entry); + + BlockIO io; + //FIXME use query context + if (global_context.getSettingsRef().distributed_ddl_task_timeout == 0) + return io; + + //FIXME need list of all replicas + Strings hosts_to_wait; + //TODO maybe it's better to use (shard_name + sep + replica_name) as host ID to allow use {replica} macro (may may have the same values across shards) + hosts_to_wait.emplace_back(replica_name); + auto stream = std::make_shared(node_path, entry, global_context); + io.in = std::move(stream); + return io; + + //executeDDLQueryOnCluster(query, global_context); + + + //{ + // std::lock_guard lock(log_name_mutex); + // log_name_to_exec_with_result + // = current_zookeeper->create(zookeeper_path + "/log/log-", queryToString(query), zkutil::CreateMode::PersistentSequential); + //} + + //background_log_executor->schedule(); } +//BlockIO DatabaseReplicated::getFeedback() +//{ +// BlockIO res; +// if (feedback_timeout == 0) +// return res; +// +// Stopwatch watch; +// +// NamesAndTypes block_structure = +// { +// {"replica_name", std::make_shared()}, +// {"execution_feedback", std::make_shared()}, +// }; +// auto replica_name_column = block_structure[0].type->createColumn(); +// auto feedback_column = block_structure[1].type->createColumn(); +// +// current_zookeeper = getZooKeeper(); +// Strings replica_states = 
current_zookeeper->getChildren(zookeeper_path + "/replicas"); +// auto replica_iter = replica_states.begin(); +// +// while (!replica_states.empty() && watch.elapsedSeconds() < feedback_timeout) +// { +// String last_executed = current_zookeeper->get(zookeeper_path + "/replicas/" + *replica_iter); +// if (last_executed > log_name_to_exec_with_result) +// { +// replica_name_column->insert(*replica_iter); +// String err_path = zookeeper_path + "/replicas/" + *replica_iter + "/errors/" + log_name_to_exec_with_result; +// if (!current_zookeeper->exists(err_path)) +// { +// feedback_column->insert("OK"); +// } +// else +// { +// String feedback = current_zookeeper->get(err_path, {}, nullptr); +// feedback_column->insert(feedback); +// } +// replica_states.erase(replica_iter); +// replica_iter = replica_states.begin(); +// } +// } +// +// Block block = Block({ +// {std::move(replica_name_column), block_structure[0].type, block_structure[0].name}, +// {std::move(feedback_column), block_structure[1].type, block_structure[1].name} +// }); +// +// res.in = std::make_shared(block); +// return res; +//} + void DatabaseReplicated::createSnapshot() { current_zookeeper = getZooKeeper(); @@ -389,7 +429,7 @@ void DatabaseReplicated::loadMetadataFromSnapshot() String query_to_execute = current_zookeeper->get(path, {}, nullptr); - current_context = std::make_unique(global_context); + auto current_context = std::make_unique(global_context); current_context->getClientInfo().query_kind = ClientInfo::QueryKind::REPLICATED_LOG_QUERY; current_context->setCurrentDatabase(database_name); current_context->setCurrentQueryId(""); // generate random query_id diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index 375118e7356..537eaad893f 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -6,10 +6,14 @@ #include #include #include +#include namespace DB { + +class DDLWorker; + /** DatabaseReplicated engine * supports replication of metadata * via DDL log being written to ZooKeeper @@ -39,13 +43,15 @@ public: const String & zookeeper_path_, const String & shard_name_, const String & replica_name_, Context & context); + ~DatabaseReplicated() override; + void drop(const Context & /*context*/) override; String getEngineName() const override { return "Replicated"; } - void propose(const ASTPtr & query) override; + BlockIO propose(const ASTPtr & query); - BlockIO getFeedback(); + //BlockIO getFeedback(); private: void createDatabaseZooKeeperNodes(); @@ -63,7 +69,7 @@ private: String shard_name; String replica_name; - std::unique_ptr current_context; // to run executeQuery + //std::unique_ptr current_context; // to run executeQuery std::mutex log_name_mutex; String log_name_to_exec_with_result; @@ -73,7 +79,7 @@ private: String last_executed_log_entry = ""; - BackgroundSchedulePool::TaskHolder background_log_executor; + //BackgroundSchedulePool::TaskHolder background_log_executor; zkutil::ZooKeeperPtr current_zookeeper; /// Use only the methods below. mutable std::mutex current_zookeeper_mutex; /// To recreate the session in the background thread. @@ -82,6 +88,8 @@ private: zkutil::ZooKeeperPtr getZooKeeper() const; void setZooKeeper(zkutil::ZooKeeperPtr zookeeper); + std::unique_ptr ddl_worker; + }; } diff --git a/src/Databases/IDatabase.h b/src/Databases/IDatabase.h index 393e8f2d10c..9b744259406 100644 --- a/src/Databases/IDatabase.h +++ b/src/Databases/IDatabase.h @@ -184,12 +184,6 @@ public: /// Is the database empty. 
virtual bool empty() const = 0; - /// Submit query to log. Currently used by DatabaseReplicated engine only. - virtual void propose(const ASTPtr & /*query*/) - { - throw Exception(getEngineName() + ": propose() is not supported", ErrorCodes::NOT_IMPLEMENTED); - } - /// Add the table to the database. Record its presence in the metadata. virtual void createTable( const Context & /*context*/, diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index 2c454db4787..b607bd084ea 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -142,12 +142,17 @@ std::unique_ptr createSimpleZooKeeperLock( } -DDLWorker::DDLWorker(int pool_size_, const std::string & zk_root_dir, Context & context_, const Poco::Util::AbstractConfiguration * config, const String & prefix) +DDLWorker::DDLWorker(int pool_size_, const std::string & zk_root_dir, Context & context_, const Poco::Util::AbstractConfiguration * config, const String & prefix, + bool is_replicated_db_, const std::optional & db_name_, const std::optional & db_replica_name_, const std::optional & db_shard_name_) : context(context_) , log(&Poco::Logger::get("DDLWorker")) , pool_size(pool_size_) , worker_pool(pool_size_) { + is_replicated_db = is_replicated_db_; + db_name = db_name_; + db_replica_name = db_replica_name_; + db_shard_name = db_shard_name_; last_tasks.reserve(pool_size); queue_dir = zk_root_dir; @@ -267,6 +272,15 @@ DDLTaskPtr DDLWorker::initAndCheckTask(const String & entry_name, String & out_r return {}; } + if (is_replicated_db) + { + // + task->host_id.host_name = host_fqdn; + task->host_id.port = context.getTCPPort(); + task->host_id_str = *db_replica_name; + return task; + } + bool host_in_hostlist = false; for (const HostID & host : task->entry.hosts) { @@ -390,6 +404,9 @@ void DDLWorker::parseQueryAndResolveHost(DDLTask & task) if (!task.query || !(task.query_on_cluster = dynamic_cast(task.query.get()))) throw Exception("Received unknown DDL query", ErrorCodes::UNKNOWN_TYPE_OF_QUERY); + if (is_replicated_db) + return; + task.cluster_name = task.query_on_cluster->cluster; task.cluster = context.tryGetCluster(task.cluster_name); if (!task.cluster) @@ -507,7 +524,14 @@ bool DDLWorker::tryExecuteQuery(const String & query, const DDLTask & task, Exec try { auto current_context = std::make_unique(context); - current_context->getClientInfo().query_kind = ClientInfo::QueryKind::SECONDARY_QUERY; + if (is_replicated_db) + { + current_context->getClientInfo().query_kind + = ClientInfo::QueryKind::REPLICATED_LOG_QUERY; //FIXME why do we need separate query kind? 
+ current_context->setCurrentDatabase(*db_name); + } + else + current_context->getClientInfo().query_kind = ClientInfo::QueryKind::SECONDARY_QUERY; current_context->setCurrentQueryId(""); // generate random query_id executeQuery(istr, ostr, false, *current_context, {}); } @@ -696,7 +720,11 @@ bool DDLWorker::tryExecuteQueryOnLeaderReplica( return res; }; - String shard_node_name = get_shard_name(task.cluster->getShardsAddresses().at(task.host_shard_num)); + String shard_node_name; + if (is_replicated_db) + shard_node_name = *db_shard_name; + else + shard_node_name = get_shard_name(task.cluster->getShardsAddresses().at(task.host_shard_num)); String shard_path = node_path + "/shards/" + shard_node_name; String is_executed_path = shard_path + "/executed"; String tries_to_execute_path = shard_path + "/tries_to_execute"; @@ -892,7 +920,7 @@ void DDLWorker::createStatusDirs(const std::string & node_path, const ZooKeeperP String DDLWorker::enqueueQuery(DDLLogEntry & entry) { - if (entry.hosts.empty()) + if (entry.hosts.empty() && !is_replicated_db) throw Exception("Empty host list in a distributed DDL task", ErrorCodes::LOGICAL_ERROR); auto zookeeper = getAndSetZooKeeper(); diff --git a/src/Interpreters/DDLWorker.h b/src/Interpreters/DDLWorker.h index caa2242caf8..1c28100f933 100644 --- a/src/Interpreters/DDLWorker.h +++ b/src/Interpreters/DDLWorker.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -35,7 +36,8 @@ using DDLTaskPtr = std::unique_ptr; class DDLWorker { public: - DDLWorker(int pool_size_, const std::string & zk_root_dir, Context & context_, const Poco::Util::AbstractConfiguration * config, const String & prefix); + DDLWorker(int pool_size_, const std::string & zk_root_dir, Context & context_, const Poco::Util::AbstractConfiguration * config, const String & prefix, + bool is_replicated_db_ = false, const std::optional & db_name_ = std::nullopt, const std::optional & db_replica_name_ = std::nullopt, const std::optional & db_shard_name_ = std::nullopt); ~DDLWorker(); /// Pushes query into DDL queue, returns path to created node @@ -101,8 +103,12 @@ private: void attachToThreadGroup(); private: + bool is_replicated_db; + std::optional db_name; + std::optional db_replica_name; + std::optional db_shard_name; std::atomic is_circular_replicated = false; - Context & context; + Context context; Poco::Logger * log; std::string host_fqdn; /// current host domain name diff --git a/src/Interpreters/InterpreterAlterQuery.cpp b/src/Interpreters/InterpreterAlterQuery.cpp index 013e30a3ed5..38d00c089ab 100644 --- a/src/Interpreters/InterpreterAlterQuery.cpp +++ b/src/Interpreters/InterpreterAlterQuery.cpp @@ -51,12 +51,8 @@ BlockIO InterpreterAlterQuery::execute() auto metadata_snapshot = table->getInMemoryMetadataPtr(); DatabasePtr database = DatabaseCatalog::instance().getDatabase(table_id.database_name); - if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY && !table->supportsReplication()) - { - database->propose(query_ptr); - auto * database_replicated = typeid_cast(database.get()); - return database_replicated->getFeedback(); - } + if (typeid_cast(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY && !table->supportsReplication()) + return typeid_cast(database.get())->propose(query_ptr); /// Add default database to table identifiers that we can encounter in e.g. default expressions, /// mutation expression, etc. 
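
Note on the interpreter changes: the ALTER change above, and the analogous ones for CREATE, DROP/TRUNCATE and RENAME in the following files, converge on a single dispatch rule — if the target database is Replicated and the query was issued by a client rather than replayed from the replication log (query kind REPLICATED_LOG_QUERY), the statement is proposed to the database-wide DDL log and the returned BlockIO streams per-replica execution status back to the client; otherwise the query is executed locally as before. A minimal sketch of that rule as it appears in each interpreter (an illustration of this patch, not literal code from it):

    /// Inside an interpreter's execute(), after the target database has been resolved:
    bool replayed_from_log = context.getClientInfo().query_kind == ClientInfo::QueryKind::REPLICATED_LOG_QUERY;
    if (auto * replicated = typeid_cast<DatabaseReplicated *>(database.get()); replicated && !replayed_from_log)
        return replicated->propose(query_ptr);  /// enqueue into <zookeeper_path>/log and wait for replica feedback
    /// ... otherwise fall through to the usual local execution path ...
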
diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 04c5efce3e2..b36fe32b26d 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -77,6 +77,7 @@ namespace ErrorCodes extern const int ILLEGAL_SYNTAX_FOR_DATA_TYPE; extern const int ILLEGAL_COLUMN; extern const int LOGICAL_ERROR; + extern const int UNKNOWN_DATABASE; } namespace fs = std::filesystem; @@ -720,15 +721,22 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create) create.database = current_database; } + //TODO make code better if possible + bool need_add_to_database = !create.temporary; + if(need_add_to_database && database->getEngineName() == "Replicated") + { + auto guard = DatabaseCatalog::instance().getDDLGuard(create.database, create.table); + database = DatabaseCatalog::instance().getDatabase(create.database); + if (typeid_cast(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) + { + assertOrSetUUID(create, database); + return typeid_cast(database.get())->propose(query_ptr); + } + } + /// Actually creates table bool created = doCreateTable(create, properties); - if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) - { - auto * database_replicated = typeid_cast(database.get()); - return database_replicated->getFeedback(); - } - if (!created) /// Table already exists return {}; @@ -753,6 +761,9 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, guard = DatabaseCatalog::instance().getDDLGuard(create.database, table_name); database = DatabaseCatalog::instance().getDatabase(create.database); + //TODO do we need it? + if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) + throw Exception(ErrorCodes::UNKNOWN_DATABASE, "Database was renamed"); assertOrSetUUID(create, database); /// Table can be created before or it can be created concurrently in another thread, while we were waiting in DDLGuard. 
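
For CREATE TABLE the same dispatch happens under the DDLGuard: the database is resolved again after the guard is taken (it may have been dropped, renamed or recreated while we were waiting, which is also why doCreateTable above now throws UNKNOWN_DATABASE when a client query still reaches it for a Replicated database), and assertOrSetUUID fixes the table UUID in the CREATE AST before the statement is written to the log, presumably so that every replica attaches the table with the same UUID. A condensed sketch of that path (simplified; temporary tables and further checks omitted):

    if (!create.temporary && database->getEngineName() == "Replicated")
    {
        /// Serialize with concurrent DDL on the same table, then re-resolve the database under the guard.
        auto guard = DatabaseCatalog::instance().getDDLGuard(create.database, create.table);
        database = DatabaseCatalog::instance().getDatabase(create.database);
        if (auto * replicated = typeid_cast<DatabaseReplicated *>(database.get());
            replicated && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY)
        {
            assertOrSetUUID(create, database);        /// pin the UUID into the CREATE AST before logging it
            return replicated->propose(query_ptr);    /// replicas replay exactly this statement from the log
        }
    }
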
@@ -790,12 +801,6 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, return true; } - if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) - { - database->propose(query_ptr); - return true; - } - StoragePtr res; /// NOTE: CREATE query may be rewritten by Storage creator or table function if (create.as_table_function) diff --git a/src/Interpreters/InterpreterDropQuery.cpp b/src/Interpreters/InterpreterDropQuery.cpp index 0f03525f237..c93f8098713 100644 --- a/src/Interpreters/InterpreterDropQuery.cpp +++ b/src/Interpreters/InterpreterDropQuery.cpp @@ -101,8 +101,8 @@ BlockIO InterpreterDropQuery::executeToTable(const ASTDropQuery & query) if (database->getEngineName() != "Atomic" && database->getEngineName() != "Replicated") table_lock = table->lockExclusively(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); /// Drop table from memory, don't touch data and metadata - if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) - database->propose(query_ptr); + if (typeid_cast(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) + return typeid_cast(database.get())->propose(query_ptr); else database->detachTable(table_id.table_name); } @@ -115,7 +115,7 @@ BlockIO InterpreterDropQuery::executeToTable(const ASTDropQuery & query) auto metadata_snapshot = table->getInMemoryMetadataPtr(); /// Drop table data, don't touch metadata if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) - database->propose(query_ptr); + return typeid_cast(database.get())->propose(query_ptr); else table->truncate(query_ptr, metadata_snapshot, context, table_lock); } @@ -131,8 +131,8 @@ BlockIO InterpreterDropQuery::executeToTable(const ASTDropQuery & query) table_lock = table->lockExclusively(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); /// Prevents recursive drop from drop database query. The original query must specify a table. 
- if (!query_ptr->as().table.empty() && database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) - database->propose(query_ptr); + if (typeid_cast(database.get()) && !query_ptr->as().table.empty() && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) + return typeid_cast(database.get())->propose(query_ptr); else database->dropTable(context, table_id.table_name, query.no_delay); } @@ -151,12 +151,6 @@ BlockIO InterpreterDropQuery::executeToTable(const ASTDropQuery & query) } } - if (database && database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) - { - auto * database_replicated = typeid_cast(database.get()); - return database_replicated->getFeedback(); - } - return {}; } diff --git a/src/Interpreters/InterpreterRenameQuery.cpp b/src/Interpreters/InterpreterRenameQuery.cpp index 3a375e2ba60..4eee34a683e 100644 --- a/src/Interpreters/InterpreterRenameQuery.cpp +++ b/src/Interpreters/InterpreterRenameQuery.cpp @@ -75,9 +75,9 @@ BlockIO InterpreterRenameQuery::executeToTables(const ASTRenameQuery & rename, c database_catalog.assertTableDoesntExist(StorageID(elem.to_database_name, elem.to_table_name), context); DatabasePtr database = database_catalog.getDatabase(elem.from_database_name); - if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) + if (typeid_cast(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { - database->propose(query_ptr); + return typeid_cast(database.get())->propose(query_ptr); } else { @@ -89,13 +89,6 @@ BlockIO InterpreterRenameQuery::executeToTables(const ASTRenameQuery & rename, c rename.exchange, rename.dictionary); } - - // TODO it can't work - if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) - { - auto * database_replicated = typeid_cast(database.get()); - return database_replicated->getFeedback(); - } } return {}; diff --git a/src/Interpreters/executeDDLQueryOnCluster.cpp b/src/Interpreters/executeDDLQueryOnCluster.cpp index 6da1704ce55..03065245766 100644 --- a/src/Interpreters/executeDDLQueryOnCluster.cpp +++ b/src/Interpreters/executeDDLQueryOnCluster.cpp @@ -25,7 +25,7 @@ namespace ErrorCodes extern const int QUERY_IS_PROHIBITED; } -static bool isSupportedAlterType(int type) +bool isSupportedAlterType(int type) { static const std::unordered_set unsupported_alter_types{ ASTAlterCommand::ATTACH_PARTITION, @@ -170,7 +170,8 @@ BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr_, const Context & cont } -DDLQueryStatusInputStream::DDLQueryStatusInputStream(const String & zk_node_path, const DDLLogEntry & entry, const Context & context_) +DDLQueryStatusInputStream::DDLQueryStatusInputStream(const String & zk_node_path, const DDLLogEntry & entry, const Context & context_, + const std::optional & hosts_to_wait) : node_path(zk_node_path) , context(context_) , watch(CLOCK_MONOTONIC_COARSE) @@ -185,10 +186,17 @@ DDLQueryStatusInputStream::DDLQueryStatusInputStream(const String & zk_node_path {std::make_shared(), "num_hosts_active"}, }; - for (const HostID & host: entry.hosts) - waiting_hosts.emplace(host.toString()); + if (hosts_to_wait) + { + waiting_hosts = NameSet(hosts_to_wait->begin(), hosts_to_wait->end()); + } + else + { + for (const HostID & host : entry.hosts) + 
waiting_hosts.emplace(host.toString()); + } - addTotalRowsApprox(entry.hosts.size()); + addTotalRowsApprox(waiting_hosts.size()); timeout_seconds = context.getSettingsRef().distributed_ddl_task_timeout; } diff --git a/src/Interpreters/executeDDLQueryOnCluster.h b/src/Interpreters/executeDDLQueryOnCluster.h index 83880cc94c1..0f7a411ed92 100644 --- a/src/Interpreters/executeDDLQueryOnCluster.h +++ b/src/Interpreters/executeDDLQueryOnCluster.h @@ -15,6 +15,9 @@ class AccessRightsElements; struct DDLLogEntry; +/// Returns true if provided ALTER type can be executed ON CLUSTER +bool isSupportedAlterType(int type); + /// Pushes distributed DDL query to the queue. /// Returns DDLQueryStatusInputStream, which reads results of query execution on each host in the cluster. BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr, const Context & context); @@ -25,7 +28,7 @@ BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr, const Context & conte class DDLQueryStatusInputStream : public IBlockInputStream { public: - DDLQueryStatusInputStream(const String & zk_node_path, const DDLLogEntry & entry, const Context & context_); + DDLQueryStatusInputStream(const String & zk_node_path, const DDLLogEntry & entry, const Context & context_, const std::optional & hosts_to_wait = {}); String getName() const override { return "DDLQueryStatusInputStream"; } diff --git a/tests/integration/test_replicated_database/test.py b/tests/integration/test_replicated_database/test.py index 372ac7a7c3e..06d8aa9467a 100644 --- a/tests/integration/test_replicated_database/test.py +++ b/tests/integration/test_replicated_database/test.py @@ -24,8 +24,8 @@ def assert_create_query(nodes, table_name, expected): def started_cluster(): try: cluster.start() - main_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'replica1');") - dummy_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'replica2');") + main_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'shard1', 'replica1');") + dummy_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'shard1', 'replica2');") yield cluster finally: @@ -67,7 +67,7 @@ def test_simple_alter_table(started_cluster): assert_create_query([main_node, dummy_node], "alter_test", expected) def test_create_replica_after_delay(started_cluster): - competing_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'replica3');") + competing_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'shard1', 'replica3');") main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN Added3 UInt32;") main_node.query("ALTER TABLE testdb.alter_test DROP COLUMN AddedNested1;") @@ -128,15 +128,15 @@ def test_replica_restart(started_cluster): def test_snapshot_and_snapshot_recover(started_cluster): #FIXME bad test - snapshotting_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'replica4');") + snapshotting_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'shard1', 'replica4');") time.sleep(5) - snapshot_recovering_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'replica5');") + snapshot_recovering_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'shard1', 'replica5');") time.sleep(5) assert snapshotting_node.query("desc table testdb.alter_test") == 
snapshot_recovering_node.query("desc table testdb.alter_test") def test_drop_and_create_replica(started_cluster): main_node.query("DROP DATABASE testdb") - main_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'replica1');") + main_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'shard1', 'replica1');") expected = "CREATE TABLE testdb.concurrent_test\\n(\\n `CounterID` UInt32,\\n `StartDate` Date,\\n `UserID` UInt32,\\n" \ " `VisitID` UInt32,\\n `NestedColumn.A` Array(UInt8),\\n `NestedColumn.S` Array(String),\\n `ToDrop` UInt32\\n)\\n" \ From b0262b3d06130854ae96a10b1d2854ad9c7b92bb Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Fri, 13 Nov 2020 21:35:45 +0300 Subject: [PATCH 0060/2357] better replica creation --- src/Databases/DatabaseReplicated.cpp | 280 +++++++++++---------------- src/Databases/DatabaseReplicated.h | 20 +- src/Interpreters/DDLWorker.cpp | 41 ++-- src/Interpreters/DDLWorker.h | 29 ++- 4 files changed, 159 insertions(+), 211 deletions(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 1213b5bc075..c4bffd8fd5d 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -16,6 +16,8 @@ #include #include #include +#include +#include #include namespace DB @@ -25,29 +27,22 @@ namespace ErrorCodes extern const int NO_ZOOKEEPER; extern const int LOGICAL_ERROR; extern const int BAD_ARGUMENTS; + extern const int REPLICA_IS_ALREADY_EXIST; } -//FIXME never used -void DatabaseReplicated::setZooKeeper(zkutil::ZooKeeperPtr zookeeper) -{ - std::lock_guard lock(current_zookeeper_mutex); - current_zookeeper = zookeeper; -} - -zkutil::ZooKeeperPtr DatabaseReplicated::tryGetZooKeeper() const -{ - std::lock_guard lock(current_zookeeper_mutex); - return current_zookeeper; -} +constexpr const char * first_entry_name = "query-0000000000"; zkutil::ZooKeeperPtr DatabaseReplicated::getZooKeeper() const { - auto res = tryGetZooKeeper(); - if (!res) - throw Exception("Cannot get ZooKeeper", ErrorCodes::NO_ZOOKEEPER); - return res; + return global_context.getZooKeeper(); } +static inline String getHostID(const Context & global_context) +{ + return Cluster::Address::toString(getFQDNOrHostName(), global_context.getTCPPort()); +} + + DatabaseReplicated::~DatabaseReplicated() = default; DatabaseReplicated::DatabaseReplicated( @@ -64,99 +59,119 @@ DatabaseReplicated::DatabaseReplicated( , replica_name(replica_name_) { if (zookeeper_path.empty() || shard_name.empty() || replica_name.empty()) - throw Exception("ZooKeeper path and shard and replica names must be non-empty", ErrorCodes::BAD_ARGUMENTS); + throw Exception("ZooKeeper path, shard and replica names must be non-empty", ErrorCodes::BAD_ARGUMENTS); + if (shard_name.find('/') != std::string::npos || replica_name.find('/') != std::string::npos) + throw Exception("Shard and replica names should not contain '/'", ErrorCodes::BAD_ARGUMENTS); if (zookeeper_path.back() == '/') zookeeper_path.resize(zookeeper_path.size() - 1); + /// If zookeeper chroot prefix is used, path should start with '/', because chroot concatenates without it. 
if (zookeeper_path.front() != '/') zookeeper_path = "/" + zookeeper_path; - if (context_.hasZooKeeper()) - { - current_zookeeper = context_.getZooKeeper(); - } - if (!current_zookeeper) + if (!context_.hasZooKeeper()) { throw Exception("Can't create replicated database without ZooKeeper", ErrorCodes::NO_ZOOKEEPER); } + //FIXME it will fail on startup if zk is not available + + auto current_zookeeper = global_context.getZooKeeper(); - /// New database if (!current_zookeeper->exists(zookeeper_path)) { - createDatabaseZooKeeperNodes(); + /// Create new database, multiple nodes can execute it concurrently + createDatabaseNodesInZooKeeper(current_zookeeper); } - /// Attach existing replica - //TODO better protection from wrong replica names - if (current_zookeeper->exists(zookeeper_path + "/replicas/" + replica_name)) + replica_path = zookeeper_path + "/replicas/" + shard_name + "|" + replica_name; + + String replica_host_id; + if (current_zookeeper->tryGet(replica_path, replica_host_id)) { - String remote_last_entry = current_zookeeper->get(zookeeper_path + "/replicas/" + replica_name, {}, nullptr); + String host_id = getHostID(global_context); + if (replica_host_id != host_id) + throw Exception(ErrorCodes::REPLICA_IS_ALREADY_EXIST, + "Replica {} of shard {} of replicated database at {} already exists. Replica host ID: '{}', current host ID: '{}'", + replica_name, shard_name, zookeeper_path, replica_host_id, host_id); - String local_last_entry; - try - { - ReadBufferFromFile in(getMetadataPath() + ".last_entry", 16); - readStringUntilEOF(local_last_entry, in); - } - catch (const Exception &) - { - /// Metadata is corrupted. - /// Replica erases the previous zk last executed log entry - /// and behaves like a new clean replica. - writeLastExecutedToDiskAndZK(); - } - - if (!local_last_entry.empty() && local_last_entry == remote_last_entry) - { - last_executed_log_entry = local_last_entry; - } - else - { - //FIXME - throw Exception( - "Replica name might be in use by a different node. Please check replica_name parameter. Remove .last_entry file from " - "metadata to create a new replica.", - ErrorCodes::LOGICAL_ERROR); - } + log_entry_to_execute = current_zookeeper->get(replica_path + "/log_ptr"); } else { - createReplicaZooKeeperNodes(); + /// Throws if replica with the same name was created concurrently + createReplicaNodesInZooKeeper(current_zookeeper); } + assert(log_entry_to_execute.starts_with("query-")); + + snapshot_period = context_.getConfigRef().getInt("database_replicated_snapshot_period", 10); - feedback_timeout = context_.getConfigRef().getInt("database_replicated_feedback_timeout", 0); LOG_DEBUG(log, "Snapshot period is set to {} log entries per one snapshot", snapshot_period); - - //FIXME use database UUID - ddl_worker = std::make_unique(1, zookeeper_path + "/log", context_, nullptr, String{}, true, database_name, replica_name, shard_name); - - //TODO do we need separate pool? 
- //background_log_executor = context_.getReplicatedSchedulePool().createTask( - // database_name + "(DatabaseReplicated::background_executor)", [this] { runBackgroundLogExecutor(); } - //); - - //background_log_executor->scheduleAfter(500); } -void DatabaseReplicated::createDatabaseZooKeeperNodes() +bool DatabaseReplicated::createDatabaseNodesInZooKeeper(const zkutil::ZooKeeperPtr & current_zookeeper) { - current_zookeeper = getZooKeeper(); - current_zookeeper->createAncestors(zookeeper_path); - current_zookeeper->createIfNotExists(zookeeper_path, String()); - current_zookeeper->createIfNotExists(zookeeper_path + "/log", String()); - current_zookeeper->createIfNotExists(zookeeper_path + "/snapshots", String()); - current_zookeeper->createIfNotExists(zookeeper_path + "/replicas", String()); + Coordination::Requests ops; + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path, "", zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/log", "", zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/snapshots", "", zkutil::CreateMode::Persistent)); + /// Create empty snapshot (with no tables) + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/snapshots/" + first_entry_name, "", zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/replicas", "", zkutil::CreateMode::Persistent)); + + Coordination::Responses responses; + auto res = current_zookeeper->tryMulti(ops, responses); + if (res == Coordination::Error::ZOK) + return true; + if (res == Coordination::Error::ZNODEEXISTS) + return false; + + zkutil::KeeperMultiException::check(res, ops, responses); + assert(false); } -void DatabaseReplicated::createReplicaZooKeeperNodes() +void DatabaseReplicated::createReplicaNodesInZooKeeper(const zkutil::ZooKeeperPtr & current_zookeeper) { - current_zookeeper->create(zookeeper_path + "/replicas/" + replica_name, "", zkutil::CreateMode::Persistent); + current_zookeeper->createAncestors(replica_path); + + Strings snapshots = current_zookeeper->getChildren(zookeeper_path + "/snapshots"); + std::sort(snapshots.begin(), snapshots.end()); + if (snapshots.empty()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "No snapshots found"); + + /// When creating new replica, use latest snapshot version as initial value of log_pointer + log_entry_to_execute = snapshots.back(); + + /// Write host name to replica_path, it will protect from multiple replicas with the same name + auto host_id = getHostID(global_context); + + Coordination::Requests ops; + ops.emplace_back(zkutil::makeCreateRequest(replica_path, host_id, zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(replica_path + "/log_ptr", log_entry_to_execute , zkutil::CreateMode::Persistent)); + current_zookeeper->multi(ops); } +void DatabaseReplicated::loadStoredObjects(Context & context, bool has_force_restore_data_flag, bool force_attach) +{ + DatabaseAtomic::loadStoredObjects(context, has_force_restore_data_flag, force_attach); + + DatabaseReplicatedExtensions ext; + ext.database_uuid = getUUID(); + ext.database_name = getDatabaseName(); + ext.shard_name = shard_name; + ext.replica_name = replica_name; + ext.first_not_executed = log_entry_to_execute; + + /// Pool size must be 1 (to avoid reordering of log entries) + constexpr size_t pool_size = 1; + ddl_worker = std::make_unique(pool_size, zookeeper_path + "/log", global_context, nullptr, "", + std::make_optional(std::move(ext))); +} + + void 
DatabaseReplicated::removeOutdatedSnapshotsAndLog() { /// This method removes all snapshots and logged queries @@ -170,7 +185,7 @@ void DatabaseReplicated::removeOutdatedSnapshotsAndLog() /// because the replica will use the latest snapshot available /// and this snapshot will set the last executed log query /// to a greater one than the least advanced current replica. - current_zookeeper = getZooKeeper(); + auto current_zookeeper = getZooKeeper(); Strings replica_states = current_zookeeper->getChildren(zookeeper_path + "/replicas"); //TODO do not use log pointers to determine which entries to remove if there are staled pointers. // We can just remove all entries older than previous snapshot version. @@ -209,7 +224,7 @@ void DatabaseReplicated::runBackgroundLogExecutor() loadMetadataFromSnapshot(); } - current_zookeeper = getZooKeeper(); + auto current_zookeeper = getZooKeeper(); Strings log_entry_names = current_zookeeper->getChildren(zookeeper_path + "/log"); std::sort(log_entry_names.begin(), log_entry_names.end()); @@ -219,7 +234,7 @@ void DatabaseReplicated::runBackgroundLogExecutor() for (const String & log_entry_name : log_entry_names) { - executeLogName(log_entry_name); + //executeLogName(log_entry_name); last_executed_log_entry = log_entry_name; writeLastExecutedToDiskAndZK(); @@ -238,7 +253,7 @@ void DatabaseReplicated::runBackgroundLogExecutor() void DatabaseReplicated::writeLastExecutedToDiskAndZK() { - current_zookeeper = getZooKeeper(); + auto current_zookeeper = getZooKeeper(); current_zookeeper->createOrUpdate( zookeeper_path + "/replicas/" + replica_name, last_executed_log_entry, zkutil::CreateMode::Persistent); @@ -251,35 +266,9 @@ void DatabaseReplicated::writeLastExecutedToDiskAndZK() out.close(); } -void DatabaseReplicated::executeLogName(const String & /*log_entry_name*/) -{ -// String path = zookeeper_path + "/log/" + log_entry_name; -// current_zookeeper = getZooKeeper(); -// String query_to_execute = current_zookeeper->get(path, {}, nullptr); -// -// try -// { -// current_context = std::make_unique(global_context); -// current_context->getClientInfo().query_kind = ClientInfo::QueryKind::REPLICATED_LOG_QUERY; -// current_context->setCurrentDatabase(database_name); -// current_context->setCurrentQueryId(""); // generate random query_id -// executeQuery(query_to_execute, *current_context); -// } -// catch (const Exception & e) -// { -// tryLogCurrentException(log, "Query from zookeeper " + query_to_execute + " wasn't finished successfully"); -// current_zookeeper->create( -// zookeeper_path + "/replicas/" + replica_name + "/errors/" + log_entry_name, e.what(), zkutil::CreateMode::Persistent); -// } -// -// LOG_DEBUG(log, "Executed query: {}", query_to_execute); -} BlockIO DatabaseReplicated::propose(const ASTPtr & query) { - //current_zookeeper = getZooKeeper(); - - if (const auto * query_alter = query->as()) { for (const auto & command : query_alter->command_list->commands) @@ -303,79 +292,18 @@ BlockIO DatabaseReplicated::propose(const ASTPtr & query) if (global_context.getSettingsRef().distributed_ddl_task_timeout == 0) return io; - //FIXME need list of all replicas + //FIXME need list of all replicas, we can obtain it from zk Strings hosts_to_wait; - //TODO maybe it's better to use (shard_name + sep + replica_name) as host ID to allow use {replica} macro (may may have the same values across shards) - hosts_to_wait.emplace_back(replica_name); + hosts_to_wait.emplace_back(shard_name + '/' +replica_name); auto stream = std::make_shared(node_path, entry, 
global_context); io.in = std::move(stream); return io; - - //executeDDLQueryOnCluster(query, global_context); - - - //{ - // std::lock_guard lock(log_name_mutex); - // log_name_to_exec_with_result - // = current_zookeeper->create(zookeeper_path + "/log/log-", queryToString(query), zkutil::CreateMode::PersistentSequential); - //} - - //background_log_executor->schedule(); } -//BlockIO DatabaseReplicated::getFeedback() -//{ -// BlockIO res; -// if (feedback_timeout == 0) -// return res; -// -// Stopwatch watch; -// -// NamesAndTypes block_structure = -// { -// {"replica_name", std::make_shared()}, -// {"execution_feedback", std::make_shared()}, -// }; -// auto replica_name_column = block_structure[0].type->createColumn(); -// auto feedback_column = block_structure[1].type->createColumn(); -// -// current_zookeeper = getZooKeeper(); -// Strings replica_states = current_zookeeper->getChildren(zookeeper_path + "/replicas"); -// auto replica_iter = replica_states.begin(); -// -// while (!replica_states.empty() && watch.elapsedSeconds() < feedback_timeout) -// { -// String last_executed = current_zookeeper->get(zookeeper_path + "/replicas/" + *replica_iter); -// if (last_executed > log_name_to_exec_with_result) -// { -// replica_name_column->insert(*replica_iter); -// String err_path = zookeeper_path + "/replicas/" + *replica_iter + "/errors/" + log_name_to_exec_with_result; -// if (!current_zookeeper->exists(err_path)) -// { -// feedback_column->insert("OK"); -// } -// else -// { -// String feedback = current_zookeeper->get(err_path, {}, nullptr); -// feedback_column->insert(feedback); -// } -// replica_states.erase(replica_iter); -// replica_iter = replica_states.begin(); -// } -// } -// -// Block block = Block({ -// {std::move(replica_name_column), block_structure[0].type, block_structure[0].name}, -// {std::move(feedback_column), block_structure[1].type, block_structure[1].name} -// }); -// -// res.in = std::make_shared(block); -// return res; -//} void DatabaseReplicated::createSnapshot() { - current_zookeeper = getZooKeeper(); + auto current_zookeeper = getZooKeeper(); String snapshot_path = zookeeper_path + "/snapshots/" + last_executed_log_entry; if (Coordination::Error::ZNODEEXISTS == current_zookeeper->tryCreate(snapshot_path, String(), zkutil::CreateMode::Persistent)) @@ -399,7 +327,7 @@ void DatabaseReplicated::loadMetadataFromSnapshot() { /// Executes the latest snapshot. /// Used by new replicas only. 
- current_zookeeper = getZooKeeper(); + auto current_zookeeper = getZooKeeper(); Strings snapshots; if (current_zookeeper->tryGetChildren(zookeeper_path + "/snapshots", snapshots) != Coordination::Error::ZOK) @@ -443,9 +371,19 @@ void DatabaseReplicated::loadMetadataFromSnapshot() void DatabaseReplicated::drop(const Context & context_) { - current_zookeeper = getZooKeeper(); + auto current_zookeeper = getZooKeeper(); current_zookeeper->tryRemove(zookeeper_path + "/replicas/" + replica_name); DatabaseAtomic::drop(context_); } +void DatabaseReplicated::shutdown() +{ + if (ddl_worker) + { + ddl_worker->shutdown(); + ddl_worker = nullptr; + } + DatabaseAtomic::shutdown(); +} + } diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index 537eaad893f..219779d602d 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -51,14 +51,15 @@ public: BlockIO propose(const ASTPtr & query); - //BlockIO getFeedback(); + void shutdown() override; + + void loadStoredObjects(Context & context, bool has_force_restore_data_flag, bool force_attach = false) override; private: - void createDatabaseZooKeeperNodes(); - void createReplicaZooKeeperNodes(); + bool createDatabaseNodesInZooKeeper(const zkutil::ZooKeeperPtr & current_zookeeper); + void createReplicaNodesInZooKeeper(const zkutil::ZooKeeperPtr & current_zookeeper); void runBackgroundLogExecutor(); - void executeLogName(const String &); void writeLastExecutedToDiskAndZK(); void loadMetadataFromSnapshot(); @@ -68,25 +69,18 @@ private: String zookeeper_path; String shard_name; String replica_name; + String replica_path; - //std::unique_ptr current_context; // to run executeQuery + String log_entry_to_execute; std::mutex log_name_mutex; String log_name_to_exec_with_result; int snapshot_period; - int feedback_timeout; String last_executed_log_entry = ""; - //BackgroundSchedulePool::TaskHolder background_log_executor; - - zkutil::ZooKeeperPtr current_zookeeper; /// Use only the methods below. - mutable std::mutex current_zookeeper_mutex; /// To recreate the session in the background thread. - - zkutil::ZooKeeperPtr tryGetZooKeeper() const; zkutil::ZooKeeperPtr getZooKeeper() const; - void setZooKeeper(zkutil::ZooKeeperPtr zookeeper); std::unique_ptr ddl_worker; diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index 83e7029ec31..7d947a264a6 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -142,17 +142,15 @@ std::unique_ptr createSimpleZooKeeperLock( } -DDLWorker::DDLWorker(int pool_size_, const std::string & zk_root_dir, Context & context_, const Poco::Util::AbstractConfiguration * config, const String & prefix, - bool is_replicated_db_, const std::optional & db_name_, const std::optional & db_replica_name_, const std::optional & db_shard_name_) +DDLWorker::DDLWorker(int pool_size_, const std::string & zk_root_dir, const Context & context_, const Poco::Util::AbstractConfiguration * config, const String & prefix, + std::optional database_replicated_ext_) : context(context_) - , log(&Poco::Logger::get("DDLWorker")) + , log(&Poco::Logger::get(database_replicated_ext_ ? 
fmt::format("DDLWorker ({})", database_replicated_ext_->database_name) : "DDLWorker")) + , database_replicated_ext(std::move(database_replicated_ext_)) , pool_size(pool_size_) , worker_pool(pool_size_) { - is_replicated_db = is_replicated_db_; - db_name = db_name_; - db_replica_name = db_replica_name_; - db_shard_name = db_shard_name_; + assert(!database_replicated_ext || pool_size == 1); last_tasks.reserve(pool_size); queue_dir = zk_root_dir; @@ -181,25 +179,29 @@ DDLWorker::DDLWorker(int pool_size_, const std::string & zk_root_dir, Context & cleanup_thread = ThreadFromGlobalPool(&DDLWorker::runCleanupThread, this); } - -DDLWorker::~DDLWorker() +void DDLWorker::shutdown() { stop_flag = true; queue_updated_event->set(); cleanup_event->set(); +} + +DDLWorker::~DDLWorker() +{ + shutdown(); worker_pool.wait(); main_thread.join(); cleanup_thread.join(); } -DDLWorker::ZooKeeperPtr DDLWorker::tryGetZooKeeper() const +ZooKeeperPtr DDLWorker::tryGetZooKeeper() const { std::lock_guard lock(zookeeper_mutex); return current_zookeeper; } -DDLWorker::ZooKeeperPtr DDLWorker::getAndSetZooKeeper() +ZooKeeperPtr DDLWorker::getAndSetZooKeeper() { std::lock_guard lock(zookeeper_mutex); @@ -272,12 +274,11 @@ DDLTaskPtr DDLWorker::initAndCheckTask(const String & entry_name, String & out_r return {}; } - if (is_replicated_db) + if (database_replicated_ext) { - // task->host_id.host_name = host_fqdn; task->host_id.port = context.getTCPPort(); - task->host_id_str = *db_replica_name; + task->host_id_str = database_replicated_ext->shard_name + '|' + database_replicated_ext->replica_name; return task; } @@ -404,7 +405,7 @@ void DDLWorker::parseQueryAndResolveHost(DDLTask & task) if (!task.query || !(task.query_on_cluster = dynamic_cast(task.query.get()))) throw Exception("Received unknown DDL query", ErrorCodes::UNKNOWN_TYPE_OF_QUERY); - if (is_replicated_db) + if (database_replicated_ext) return; task.cluster_name = task.query_on_cluster->cluster; @@ -524,11 +525,11 @@ bool DDLWorker::tryExecuteQuery(const String & query, const DDLTask & task, Exec try { auto current_context = std::make_unique(context); - if (is_replicated_db) + if (database_replicated_ext) { current_context->getClientInfo().query_kind = ClientInfo::QueryKind::REPLICATED_LOG_QUERY; //FIXME why do we need separate query kind? 
- current_context->setCurrentDatabase(*db_name); + current_context->setCurrentDatabase(database_replicated_ext->database_name); } else current_context->getClientInfo().query_kind = ClientInfo::QueryKind::SECONDARY_QUERY; @@ -721,8 +722,8 @@ bool DDLWorker::tryExecuteQueryOnLeaderReplica( }; String shard_node_name; - if (is_replicated_db) - shard_node_name = *db_shard_name; + if (database_replicated_ext) + shard_node_name = database_replicated_ext->shard_name; else shard_node_name = get_shard_name(task.cluster->getShardsAddresses().at(task.host_shard_num)); String shard_path = node_path + "/shards/" + shard_node_name; @@ -920,7 +921,7 @@ void DDLWorker::createStatusDirs(const std::string & node_path, const ZooKeeperP String DDLWorker::enqueueQuery(DDLLogEntry & entry) { - if (entry.hosts.empty() && !is_replicated_db) + if (entry.hosts.empty() && !database_replicated_ext) throw Exception("Empty host list in a distributed DDL task", ErrorCodes::LOGICAL_ERROR); auto zookeeper = getAndSetZooKeeper(); diff --git a/src/Interpreters/DDLWorker.h b/src/Interpreters/DDLWorker.h index 1c28100f933..f38d41df503 100644 --- a/src/Interpreters/DDLWorker.h +++ b/src/Interpreters/DDLWorker.h @@ -31,13 +31,30 @@ class ASTAlterQuery; struct DDLLogEntry; struct DDLTask; using DDLTaskPtr = std::unique_ptr; +using ZooKeeperPtr = std::shared_ptr; + + +struct DatabaseReplicatedExtensions +{ + UUID database_uuid; + String database_name; + String shard_name; + String replica_name; + String first_not_executed; + using NewEntryCallback = std::function; + using EntryExecutedCallback = std::function; + using EntryErrorCallback = std::function; + NewEntryCallback before_execution_callback; + EntryExecutedCallback executed_callback; + EntryErrorCallback error_callback; +}; class DDLWorker { public: - DDLWorker(int pool_size_, const std::string & zk_root_dir, Context & context_, const Poco::Util::AbstractConfiguration * config, const String & prefix, - bool is_replicated_db_ = false, const std::optional & db_name_ = std::nullopt, const std::optional & db_replica_name_ = std::nullopt, const std::optional & db_shard_name_ = std::nullopt); + DDLWorker(int pool_size_, const std::string & zk_root_dir, const Context & context_, const Poco::Util::AbstractConfiguration * config, const String & prefix, + std::optional database_replicated_ext_ = std::nullopt); ~DDLWorker(); /// Pushes query into DDL queue, returns path to created node @@ -50,8 +67,9 @@ public: return host_fqdn_id; } + void shutdown(); + private: - using ZooKeeperPtr = std::shared_ptr; /// Returns cached ZooKeeper session (possibly expired). 
ZooKeeperPtr tryGetZooKeeper() const; @@ -103,13 +121,10 @@ private: void attachToThreadGroup(); private: - bool is_replicated_db; - std::optional db_name; - std::optional db_replica_name; - std::optional db_shard_name; std::atomic is_circular_replicated = false; Context context; Poco::Logger * log; + std::optional database_replicated_ext; std::string host_fqdn; /// current host domain name std::string host_fqdn_id; /// host_name:port From 2283906a1118d0836fc6cb813557e8a3d8f21383 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Thu, 19 Nov 2020 13:34:45 +0300 Subject: [PATCH 0061/2357] try support replica recovery --- src/Common/ErrorCodes.cpp | 1 + src/Databases/DatabaseReplicated.cpp | 259 +++++++++++++++++---------- src/Databases/DatabaseReplicated.h | 22 ++- src/Interpreters/DDLWorker.cpp | 65 ++++++- src/Interpreters/DDLWorker.h | 18 +- 5 files changed, 253 insertions(+), 112 deletions(-) diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp index 405b8c60af8..1981dea5cb9 100644 --- a/src/Common/ErrorCodes.cpp +++ b/src/Common/ErrorCodes.cpp @@ -522,6 +522,7 @@ M(553, ROCKSDB_ERROR) \ M(553, LZMA_STREAM_ENCODER_FAILED) \ M(554, LZMA_STREAM_DECODER_FAILED) \ + M(554, DATABASE_REPLICATION_FAILED) \ M(999, KEEPER_EXCEPTION) \ M(1000, POCO_EXCEPTION) \ M(1001, STD_EXCEPTION) \ diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index c4bffd8fd5d..7b6d98f992a 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -28,9 +28,10 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; extern const int BAD_ARGUMENTS; extern const int REPLICA_IS_ALREADY_EXIST; + extern const int DATABASE_REPLICATION_FAILED; } -constexpr const char * first_entry_name = "query-0000000000"; +static constexpr size_t METADATA_FILE_BUFFER_SIZE = 32768; zkutil::ZooKeeperPtr DatabaseReplicated::getZooKeeper() const { @@ -42,6 +43,15 @@ static inline String getHostID(const Context & global_context) return Cluster::Address::toString(getFQDNOrHostName(), global_context.getTCPPort()); } +Strings DatabaseReplicated::getSnapshots(const ZooKeeperPtr & zookeeper) const +{ + Strings snapshots = zookeeper->getChildren(zookeeper_path + "/snapshots"); + std::sort(snapshots.begin(), snapshots.end()); + if (snapshots.empty()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "No snapshots found"); + return snapshots; +} + DatabaseReplicated::~DatabaseReplicated() = default; @@ -84,7 +94,7 @@ DatabaseReplicated::DatabaseReplicated( createDatabaseNodesInZooKeeper(current_zookeeper); } - replica_path = zookeeper_path + "/replicas/" + shard_name + "|" + replica_name; + replica_path = zookeeper_path + "/replicas/" + shard_name + "/" + replica_name; String replica_host_id; if (current_zookeeper->tryGet(replica_path, replica_host_id)) @@ -95,7 +105,7 @@ DatabaseReplicated::DatabaseReplicated( "Replica {} of shard {} of replicated database at {} already exists. 
Replica host ID: '{}', current host ID: '{}'", replica_name, shard_name, zookeeper_path, replica_host_id, host_id); - log_entry_to_execute = current_zookeeper->get(replica_path + "/log_ptr"); + log_entry_to_execute = parse(current_zookeeper->get(replica_path + "/log_ptr")); } else { @@ -103,10 +113,7 @@ DatabaseReplicated::DatabaseReplicated( createReplicaNodesInZooKeeper(current_zookeeper); } - assert(log_entry_to_execute.starts_with("query-")); - - - snapshot_period = context_.getConfigRef().getInt("database_replicated_snapshot_period", 10); + snapshot_period = 1; //context_.getConfigRef().getInt("database_replicated_snapshot_period", 10); LOG_DEBUG(log, "Snapshot period is set to {} log entries per one snapshot", snapshot_period); } @@ -117,10 +124,12 @@ bool DatabaseReplicated::createDatabaseNodesInZooKeeper(const zkutil::ZooKeeperP Coordination::Requests ops; ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path, "", zkutil::CreateMode::Persistent)); ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/log", "", zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/replicas", "", zkutil::CreateMode::Persistent)); ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/snapshots", "", zkutil::CreateMode::Persistent)); /// Create empty snapshot (with no tables) - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/snapshots/" + first_entry_name, "", zkutil::CreateMode::Persistent)); - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/replicas", "", zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/snapshots/0", "", zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/metadata", "", zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/metadata/0", "", zkutil::CreateMode::Persistent)); Coordination::Responses responses; auto res = current_zookeeper->tryMulti(ops, responses); @@ -137,20 +146,24 @@ void DatabaseReplicated::createReplicaNodesInZooKeeper(const zkutil::ZooKeeperPt { current_zookeeper->createAncestors(replica_path); - Strings snapshots = current_zookeeper->getChildren(zookeeper_path + "/snapshots"); - std::sort(snapshots.begin(), snapshots.end()); - if (snapshots.empty()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "No snapshots found"); - /// When creating new replica, use latest snapshot version as initial value of log_pointer - log_entry_to_execute = snapshots.back(); + log_entry_to_execute = parse(getSnapshots(current_zookeeper).back()); /// Write host name to replica_path, it will protect from multiple replicas with the same name auto host_id = getHostID(global_context); + /// On replica creation add empty entry to log. Can be used to trigger some actions on other replicas (e.g. update cluster info). 
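The host ID written into the replica znode is what distinguishes "the same server re-attaching its database" from "a different server trying to register under an already taken shard/replica name". A toy illustration of that check, using a plain std::map in place of ZooKeeper (function name and store are hypothetical):

    #include <iostream>
    #include <map>
    #include <stdexcept>
    #include <string>

    std::map<std::string, std::string> zk; // toy key-value store standing in for ZooKeeper

    // The replica znode stores the "host:port" of its owner, so re-attaching the same database
    // on the same server is a no-op while a different server gets a clear error.
    void registerReplica(const std::string & replica_path, const std::string & host_id)
    {
        auto it = zk.find(replica_path);
        if (it != zk.end())
        {
            if (it->second != host_id)
                throw std::runtime_error("Replica already exists, owned by " + it->second);
            return; // same host: nothing to do
        }
        zk[replica_path] = host_id;
        zk[replica_path + "/log_ptr"] = "0"; // new replica starts from the beginning (or from a snapshot)
    }

    int main()
    {
        registerReplica("/db/replicas/shard1/replica1", "host-a:9000");
        registerReplica("/db/replicas/shard1/replica1", "host-a:9000"); // ok, same owner
        try { registerReplica("/db/replicas/shard1/replica1", "host-b:9000"); }
        catch (const std::exception & e) { std::cout << e.what() << "\n"; }
    }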
+ DDLLogEntry entry; + entry.hosts = {}; + entry.query = {}; + entry.initiator = {}; + + recoverLostReplica(current_zookeeper, log_entry_to_execute, true); + Coordination::Requests ops; ops.emplace_back(zkutil::makeCreateRequest(replica_path, host_id, zkutil::CreateMode::Persistent)); - ops.emplace_back(zkutil::makeCreateRequest(replica_path + "/log_ptr", log_entry_to_execute , zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(replica_path + "/log_ptr", toString(log_entry_to_execute), zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/log/query-", entry.toString(), zkutil::CreateMode::PersistentSequential)); current_zookeeper->multi(ops); } @@ -160,10 +173,13 @@ void DatabaseReplicated::loadStoredObjects(Context & context, bool has_force_res DatabaseReplicatedExtensions ext; ext.database_uuid = getUUID(); + ext.zookeeper_path = zookeeper_path; ext.database_name = getDatabaseName(); ext.shard_name = shard_name; ext.replica_name = replica_name; ext.first_not_executed = log_entry_to_execute; + ext.lost_callback = [this] (const String & entry_name, const ZooKeeperPtr & zookeeper) { onUnexpectedLogEntry(entry_name, zookeeper); }; + ext.executed_callback = [this] (const String & entry_name, const ZooKeeperPtr & zookeeper) { onExecutedLogEntry(entry_name, zookeeper); }; /// Pool size must be 1 (to avoid reordering of log entries) constexpr size_t pool_size = 1; @@ -171,6 +187,41 @@ void DatabaseReplicated::loadStoredObjects(Context & context, bool has_force_res std::make_optional(std::move(ext))); } +void DatabaseReplicated::onUnexpectedLogEntry(const String & entry_name, const ZooKeeperPtr & zookeeper) +{ + /// We cannot execute next entry of replication log. Possible reasons: + /// 1. Replica is staled, some entries were removed by log cleanup process. + /// In this case we should recover replica from the last snapshot. + /// 2. Replication log is broken due to manual operations with ZooKeeper or logical error. + /// In this case we just stop replication without any attempts to recover it automatically, + /// because such attempts may lead to unexpected data removal. + + constexpr const char * name = "query-"; + if (!startsWith(entry_name, name)) + throw Exception(ErrorCodes::DATABASE_REPLICATION_FAILED, "Unexpected entry in replication log: {}", entry_name); + + UInt32 entry_number; + if (!tryParse(entry_number, entry_name.substr(strlen(name)))) + throw Exception(ErrorCodes::DATABASE_REPLICATION_FAILED, "Cannot parse number of replication log entry {}", entry_name); + + if (entry_number < log_entry_to_execute) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Entry {} already executed, current pointer is {}", entry_number, log_entry_to_execute); + + /// Entry name is valid. Let's get min snapshot version to check if replica is staled. + Strings snapshots = getSnapshots(zookeeper); + UInt32 min_snapshot = parse(snapshots.front()); + + if (log_entry_to_execute < min_snapshot) + { + recoverLostReplica(zookeeper, parse(snapshots.back())); + return; + } + + throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot recover replica, probably it's a bug. 
" + "Got log entry '{}' when expected entry number {}, " + "available snapshots: ", + entry_name, log_entry_to_execute, boost::algorithm::join(snapshots, ", ")); +} void DatabaseReplicated::removeOutdatedSnapshotsAndLog() { @@ -217,40 +268,51 @@ void DatabaseReplicated::removeOutdatedSnapshotsAndLog() } } -void DatabaseReplicated::runBackgroundLogExecutor() +void DatabaseReplicated::onExecutedLogEntry(const String & entry_name, const ZooKeeperPtr & zookeeper) { - if (last_executed_log_entry.empty()) + assert(entry_name == DatabaseReplicatedExtensions::getLogEntryName(log_entry_to_execute)); + ++log_entry_to_execute; + + if (snapshot_period > 0 && log_entry_to_execute % snapshot_period == 0) { - loadMetadataFromSnapshot(); + createSnapshot(zookeeper); } - - auto current_zookeeper = getZooKeeper(); - Strings log_entry_names = current_zookeeper->getChildren(zookeeper_path + "/log"); - - std::sort(log_entry_names.begin(), log_entry_names.end()); - auto newest_entry_it = std::upper_bound(log_entry_names.begin(), log_entry_names.end(), last_executed_log_entry); - - log_entry_names.erase(log_entry_names.begin(), newest_entry_it); - - for (const String & log_entry_name : log_entry_names) - { - //executeLogName(log_entry_name); - last_executed_log_entry = log_entry_name; - writeLastExecutedToDiskAndZK(); - - int log_n = parse(log_entry_name.substr(4)); - int last_log_n = parse(log_entry_names.back().substr(4)); - - /// The third condition gurantees at most one snapshot creation per batch - if (log_n > 0 && snapshot_period > 0 && (last_log_n - log_n) / snapshot_period == 0 && log_n % snapshot_period == 0) - { - createSnapshot(); - } - } - - //background_log_executor->scheduleAfter(500); } +//void DatabaseReplicated::runBackgroundLogExecutor() +//{ +// if (last_executed_log_entry.empty()) +// { +// loadMetadataFromSnapshot(); +// } +// +// auto current_zookeeper = getZooKeeper(); +// Strings log_entry_names = current_zookeeper->getChildren(zookeeper_path + "/log"); +// +// std::sort(log_entry_names.begin(), log_entry_names.end()); +// auto newest_entry_it = std::upper_bound(log_entry_names.begin(), log_entry_names.end(), last_executed_log_entry); +// +// log_entry_names.erase(log_entry_names.begin(), newest_entry_it); +// +// for (const String & log_entry_name : log_entry_names) +// { +// //executeLogName(log_entry_name); +// last_executed_log_entry = log_entry_name; +// writeLastExecutedToDiskAndZK(); +// +// int log_n = parse(log_entry_name.substr(4)); +// int last_log_n = parse(log_entry_names.back().substr(4)); +// +// /// The third condition gurantees at most one snapshot creation per batch +// if (log_n > 0 && snapshot_period > 0 && (last_log_n - log_n) / snapshot_period == 0 && log_n % snapshot_period == 0) +// { +// createSnapshot(); +// } +// } +// +// //background_log_executor->scheduleAfter(500); +//} + void DatabaseReplicated::writeLastExecutedToDiskAndZK() { auto current_zookeeper = getZooKeeper(); @@ -294,79 +356,88 @@ BlockIO DatabaseReplicated::propose(const ASTPtr & query) //FIXME need list of all replicas, we can obtain it from zk Strings hosts_to_wait; - hosts_to_wait.emplace_back(shard_name + '/' +replica_name); + hosts_to_wait.emplace_back(shard_name + '|' +replica_name); auto stream = std::make_shared(node_path, entry, global_context); io.in = std::move(stream); return io; } -void DatabaseReplicated::createSnapshot() +void DatabaseReplicated::createSnapshot(const ZooKeeperPtr & zookeeper) { - auto current_zookeeper = getZooKeeper(); - String snapshot_path = zookeeper_path + 
"/snapshots/" + last_executed_log_entry; + String snapshot_path = zookeeper_path + "/snapshot/" + toString(log_entry_to_execute); - if (Coordination::Error::ZNODEEXISTS == current_zookeeper->tryCreate(snapshot_path, String(), zkutil::CreateMode::Persistent)) - { + if (zookeeper->exists(snapshot_path)) return; - } - for (auto iterator = getTablesIterator(global_context, {}); iterator->isValid(); iterator->next()) + std::vector> create_queries; { - String table_name = iterator->name(); - auto query = getCreateQueryFromMetadata(getObjectMetadataPath(table_name), true); - String statement = queryToString(query); - current_zookeeper->create(snapshot_path + "/" + table_name, statement, zkutil::CreateMode::Persistent); + std::lock_guard lock{mutex}; + create_queries.reserve(tables.size()); + for (const auto & table : tables) + { + const String & name = table.first; + ReadBufferFromFile in(getObjectMetadataPath(name), METADATA_FILE_BUFFER_SIZE); + String attach_query; + readStringUntilEOF(attach_query, in); + create_queries.emplace_back(escapeForFileName(name), std::move(attach_query)); + } } - current_zookeeper->create(snapshot_path + "/.completed", String(), zkutil::CreateMode::Persistent); - removeOutdatedSnapshotsAndLog(); + if (zookeeper->exists(snapshot_path)) + return; + + String queries_path = zookeeper_path + "/metadata/" + toString(log_entry_to_execute); + zookeeper->tryCreate(queries_path, "", zkutil::CreateMode::Persistent); + queries_path += '/'; + + //FIXME use tryMulti with MULTI_BATCH_SIZE + + for (const auto & table : create_queries) + zookeeper->tryCreate(queries_path + table.first, table.second, zkutil::CreateMode::Persistent); + + if (create_queries.size() != zookeeper->getChildren(zookeeper_path + "/metadata/" + toString(log_entry_to_execute)).size()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Created invalid snapshot"); + + zookeeper->tryCreate(snapshot_path, String(), zkutil::CreateMode::Persistent); } -void DatabaseReplicated::loadMetadataFromSnapshot() +void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeeper, UInt32 from_snapshot, bool create) { - /// Executes the latest snapshot. - /// Used by new replicas only. 
- auto current_zookeeper = getZooKeeper(); + LOG_WARNING(log, "Will recover replica from snapshot", from_snapshot); - Strings snapshots; - if (current_zookeeper->tryGetChildren(zookeeper_path + "/snapshots", snapshots) != Coordination::Error::ZOK) - return; + //FIXME drop old tables - auto latest_snapshot = std::max_element(snapshots.begin(), snapshots.end()); - while (snapshots.size() > 0 && !current_zookeeper->exists(zookeeper_path + "/snapshots/" + *latest_snapshot + "/.completed")) + String snapshot_metadata_path = zookeeper_path + "/metadata/" + toString(from_snapshot); + Strings tables_in_snapshot = current_zookeeper->getChildren(snapshot_metadata_path); + current_zookeeper->get(zookeeper_path + "/snapshots/" + toString(from_snapshot)); /// Assert node exists + snapshot_metadata_path += '/'; + + for (const auto & table_name : tables_in_snapshot) { - snapshots.erase(latest_snapshot); - latest_snapshot = std::max_element(snapshots.begin(), snapshots.end()); + String query_to_execute = current_zookeeper->get(snapshot_metadata_path + table_name); + + + if (!startsWith(query_to_execute, "ATTACH ")) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected query: {}", query_to_execute); + query_to_execute = "CREATE " + query_to_execute.substr(strlen("ATTACH ")); + + Context current_context = global_context; + current_context.getClientInfo().query_kind = ClientInfo::QueryKind::REPLICATED_LOG_QUERY; + current_context.setCurrentDatabase(database_name); + current_context.setCurrentQueryId(""); // generate random query_id + + executeQuery(query_to_execute, current_context); } - if (snapshots.size() < 1) - { - return; - } - - Strings metadatas; - if (current_zookeeper->tryGetChildren(zookeeper_path + "/snapshots/" + *latest_snapshot, metadatas) != Coordination::Error::ZOK) + if (create) return; - LOG_DEBUG(log, "Executing {} snapshot", *latest_snapshot); + current_zookeeper->set(replica_path + "/log-ptr", toString(from_snapshot)); + last_executed_log_entry = from_snapshot; + ddl_worker->setLogPointer(from_snapshot); //FIXME - for (auto t = metadatas.begin(); t != metadatas.end(); ++t) - { - String path = zookeeper_path + "/snapshots/" + *latest_snapshot + "/" + *t; - - String query_to_execute = current_zookeeper->get(path, {}, nullptr); - - auto current_context = std::make_unique(global_context); - current_context->getClientInfo().query_kind = ClientInfo::QueryKind::REPLICATED_LOG_QUERY; - current_context->setCurrentDatabase(database_name); - current_context->setCurrentQueryId(""); // generate random query_id - - executeQuery(query_to_execute, *current_context); - } - - last_executed_log_entry = *latest_snapshot; - writeLastExecutedToDiskAndZK(); + //writeLastExecutedToDiskAndZK(); } void DatabaseReplicated::drop(const Context & context_) diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index 219779d602d..3f5bd4608f1 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -13,6 +13,7 @@ namespace DB { class DDLWorker; +using ZooKeeperPtr = std::shared_ptr; /** DatabaseReplicated engine * supports replication of metadata @@ -56,22 +57,29 @@ public: void loadStoredObjects(Context & context, bool has_force_restore_data_flag, bool force_attach = false) override; private: - bool createDatabaseNodesInZooKeeper(const zkutil::ZooKeeperPtr & current_zookeeper); - void createReplicaNodesInZooKeeper(const zkutil::ZooKeeperPtr & current_zookeeper); + bool createDatabaseNodesInZooKeeper(const ZooKeeperPtr & current_zookeeper); + void 
createReplicaNodesInZooKeeper(const ZooKeeperPtr & current_zookeeper); - void runBackgroundLogExecutor(); + //void runBackgroundLogExecutor(); void writeLastExecutedToDiskAndZK(); - void loadMetadataFromSnapshot(); - void createSnapshot(); + //void loadMetadataFromSnapshot(); + void createSnapshot(const ZooKeeperPtr & zookeeper); void removeOutdatedSnapshotsAndLog(); + Strings getSnapshots(const ZooKeeperPtr & zookeeper) const; + + void onUnexpectedLogEntry(const String & entry_name, const ZooKeeperPtr & zookeeper); + void recoverLostReplica(const ZooKeeperPtr & current_zookeeper, UInt32 from_snapshot, bool create = false); + + void onExecutedLogEntry(const String & entry_name, const ZooKeeperPtr & zookeeper); + String zookeeper_path; String shard_name; String replica_name; String replica_path; - String log_entry_to_execute; + UInt32 log_entry_to_execute; std::mutex log_name_mutex; String log_name_to_exec_with_result; @@ -84,6 +92,8 @@ private: std::unique_ptr ddl_worker; + + }; } diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index 7d947a264a6..51f0e1b45a9 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -142,6 +142,22 @@ std::unique_ptr createSimpleZooKeeperLock( } +String DatabaseReplicatedExtensions::getLogEntryName(UInt32 log_entry_number) +{ + constexpr size_t seq_node_digits = 10; + String number = toString(log_entry_number); + String name = "query-" + String(seq_node_digits - number.size(), '0') + number; + return name; +} + +UInt32 DatabaseReplicatedExtensions::getLogEntryNumber(const String & log_entry_name) +{ + constexpr const char * name = "query-"; + assert(startsWith(log_entry_name, name)); + return parse(log_entry_name.substr(strlen(name))); +} + + DDLWorker::DDLWorker(int pool_size_, const std::string & zk_root_dir, const Context & context_, const Poco::Util::AbstractConfiguration * config, const String & prefix, std::optional database_replicated_ext_) : context(context_) @@ -236,8 +252,21 @@ DDLTaskPtr DDLWorker::initAndCheckTask(const String & entry_name, String & out_r String node_data; String entry_path = queue_dir + "/" + entry_name; + if (database_replicated_ext) + { + auto expected_log_entry = DatabaseReplicatedExtensions::getLogEntryName(database_replicated_ext->first_not_executed); + if (entry_name != expected_log_entry) + { + database_replicated_ext->lost_callback(entry_name, zookeeper); + out_reason = "DatabaseReplicated: expected " + expected_log_entry + " got " + entry_name; + return {}; + } + } + if (!zookeeper->tryGet(entry_path, node_data)) { + if (database_replicated_ext) + database_replicated_ext->lost_callback(entry_name, zookeeper); /// It is Ok that node could be deleted just now. It means that there are no current host in node's host list. out_reason = "The task was deleted"; return {}; @@ -339,7 +368,7 @@ void DDLWorker::scheduleTasks() ? 
queue_nodes.begin() : std::upper_bound(queue_nodes.begin(), queue_nodes.end(), last_tasks.back()); - for (auto it = begin_node; it != queue_nodes.end(); ++it) + for (auto it = begin_node; it != queue_nodes.end() && !stop_flag; ++it) { String entry_name = *it; @@ -362,11 +391,17 @@ void DDLWorker::scheduleTasks() if (!already_processed) { - worker_pool.scheduleOrThrowOnError([this, task_ptr = task.release()]() + if (database_replicated_ext) { - setThreadName("DDLWorkerExec"); - enqueueTask(DDLTaskPtr(task_ptr)); - }); + enqueueTask(DDLTaskPtr(task.release())); + } + else + { + worker_pool.scheduleOrThrowOnError([this, task_ptr = task.release()]() { + setThreadName("DDLWorkerExec"); + enqueueTask(DDLTaskPtr(task_ptr)); + }); + } } else { @@ -374,9 +409,6 @@ void DDLWorker::scheduleTasks() } saveTask(entry_name); - - if (stop_flag) - break; } } @@ -599,6 +631,7 @@ void DDLWorker::enqueueTask(DDLTaskPtr task_ptr) } } } + void DDLWorker::processTask(DDLTask & task) { auto zookeeper = tryGetZooKeeper(); @@ -626,7 +659,9 @@ void DDLWorker::processTask(DDLTask & task) else throw Coordination::Exception(code, active_node_path); - if (!task.was_executed) + //FIXME + bool is_dummy_query = database_replicated_ext && task.entry.query.empty(); + if (!task.was_executed && !is_dummy_query) { try { @@ -675,7 +710,19 @@ void DDLWorker::processTask(DDLTask & task) Coordination::Requests ops; ops.emplace_back(zkutil::makeRemoveRequest(active_node_path, -1)); ops.emplace_back(zkutil::makeCreateRequest(finished_node_path, task.execution_status.serializeText(), zkutil::CreateMode::Persistent)); + if (database_replicated_ext) + { + assert(DatabaseReplicatedExtensions::getLogEntryName(database_replicated_ext->first_not_executed) == task.entry_name); + ops.emplace_back(zkutil::makeSetRequest(database_replicated_ext->getReplicaPath() + "/log_ptr", toString(database_replicated_ext->first_not_executed), -1)); + } + zookeeper->multi(ops); + + if (database_replicated_ext) + { + database_replicated_ext->executed_callback(task.entry_name, zookeeper); + ++(database_replicated_ext->first_not_executed); + } } diff --git a/src/Interpreters/DDLWorker.h b/src/Interpreters/DDLWorker.h index f38d41df503..08bf641264e 100644 --- a/src/Interpreters/DDLWorker.h +++ b/src/Interpreters/DDLWorker.h @@ -37,16 +37,25 @@ using ZooKeeperPtr = std::shared_ptr; struct DatabaseReplicatedExtensions { UUID database_uuid; + String zookeeper_path; String database_name; String shard_name; String replica_name; - String first_not_executed; - using NewEntryCallback = std::function; + UInt32 first_not_executed; + using EntryLostCallback = std::function; using EntryExecutedCallback = std::function; using EntryErrorCallback = std::function; - NewEntryCallback before_execution_callback; + EntryLostCallback lost_callback; EntryExecutedCallback executed_callback; EntryErrorCallback error_callback; + + String getReplicaPath() const + { + return zookeeper_path + "/replicas/" + shard_name + "/" + replica_name; + } + + static String getLogEntryName(UInt32 log_entry_number); + static UInt32 getLogEntryNumber(const String & log_entry_name); }; @@ -69,6 +78,9 @@ public: void shutdown(); + //FIXME get rid of this method + void setLogPointer(UInt32 log_pointer) { database_replicated_ext->first_not_executed = log_pointer; } + private: /// Returns cached ZooKeeper session (possibly expired). 
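The getLogEntryName/getLogEntryNumber helpers above make the entry's position part of the znode name: "query-" plus a zero-padded 10-digit counter, the same shape ZooKeeper gives PersistentSequential children, so lexicographic order of names matches numeric order of entries. A self-contained round trip using only the standard library (numbers wider than 10 digits are out of scope here, as in the original):

    #include <cassert>
    #include <cstddef>
    #include <iostream>
    #include <string>

    std::string getLogEntryName(unsigned number)
    {
        constexpr std::size_t seq_node_digits = 10;
        std::string digits = std::to_string(number);
        return "query-" + std::string(seq_node_digits - digits.size(), '0') + digits;
    }

    unsigned getLogEntryNumber(const std::string & name)
    {
        const std::string prefix = "query-";
        assert(name.compare(0, prefix.size(), prefix) == 0);
        return static_cast<unsigned>(std::stoul(name.substr(prefix.size())));
    }

    int main()
    {
        std::string name = getLogEntryName(42);
        std::cout << name << " -> " << getLogEntryNumber(name) << "\n"; // query-0000000042 -> 42
        assert(getLogEntryName(getLogEntryNumber("query-0000000007")) == "query-0000000007");
    }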
From 7ab4445e993333f15cea8d69e0de9a909c7d6495 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Fri, 20 Nov 2020 19:06:27 +0300 Subject: [PATCH 0062/2357] try another approach --- src/Databases/DatabaseAtomic.cpp | 18 ++- src/Databases/DatabaseAtomic.h | 4 +- src/Databases/DatabaseOnDisk.cpp | 5 +- src/Databases/DatabaseOnDisk.h | 2 +- src/Databases/DatabaseReplicated.cpp | 124 +++----------------- src/Databases/DatabaseReplicated.h | 2 - src/Interpreters/Context.cpp | 13 ++ src/Interpreters/Context.h | 11 ++ src/Interpreters/DDLTask.h | 22 ++++ src/Interpreters/DDLWorker.cpp | 96 ++++++++++++--- src/Interpreters/DDLWorker.h | 5 + src/Interpreters/SystemLog.h | 9 +- src/Storages/StorageReplicatedMergeTree.cpp | 7 ++ 13 files changed, 186 insertions(+), 132 deletions(-) diff --git a/src/Databases/DatabaseAtomic.cpp b/src/Databases/DatabaseAtomic.cpp index 15a55da89b2..78400368924 100644 --- a/src/Databases/DatabaseAtomic.cpp +++ b/src/Databases/DatabaseAtomic.cpp @@ -11,6 +11,9 @@ #include #include +//FIXME it shouldn't be here +#include +#include namespace DB { @@ -263,7 +266,8 @@ void DatabaseAtomic::renameTable(const Context & context, const String & table_n } void DatabaseAtomic::commitCreateTable(const ASTCreateQuery & query, const StoragePtr & table, - const String & table_metadata_tmp_path, const String & table_metadata_path) + const String & table_metadata_tmp_path, const String & table_metadata_path, + const Context & query_context) { DetachedTables not_in_use; auto table_data_path = getTableDataPath(query); @@ -280,6 +284,18 @@ void DatabaseAtomic::commitCreateTable(const ASTCreateQuery & query, const Stora /// We will get en exception if some table with the same UUID exists (even if it's detached table or table from another database) DatabaseCatalog::instance().addUUIDMapping(query.uuid); locked_uuid = true; + + if (auto txn = query_context.getMetadataTransaction()) + { + String metadata_zk_path = txn->zookeeper_path + "/metadata/" + escapeForFileName(query.table); + String statement = getObjectDefinitionFromCreateQuery(query.clone()); + /// zk::multi(...) will throw if `metadata_zk_path` exists + txn->ops.emplace_back(zkutil::makeCreateRequest(metadata_zk_path, statement, zkutil::CreateMode::Persistent)); + txn->current_zookeeper->multi(txn->ops); /// Commit point (a sort of) for Replicated database + /// NOTE: replica will be lost if server crashes before the following renameNoReplace(...) 
+ /// TODO better detection and recovery + } + /// It throws if `table_metadata_path` already exists (it's possible if table was detached) renameNoReplace(table_metadata_tmp_path, table_metadata_path); /// Commit point (a sort of) attachTableUnlocked(query.table, table, lock); /// Should never throw diff --git a/src/Databases/DatabaseAtomic.h b/src/Databases/DatabaseAtomic.h index 97e6e1173d1..61ce2721701 100644 --- a/src/Databases/DatabaseAtomic.h +++ b/src/Databases/DatabaseAtomic.h @@ -60,10 +60,10 @@ public: void waitDetachedTableNotInUse(const UUID & uuid); -private: +protected: void commitAlterTable(const StorageID & table_id, const String & table_metadata_tmp_path, const String & table_metadata_path) override; void commitCreateTable(const ASTCreateQuery & query, const StoragePtr & table, - const String & table_metadata_tmp_path, const String & table_metadata_path) override; + const String & table_metadata_tmp_path, const String & table_metadata_path, const Context & query_context) override; void assertDetachedTableNotInUse(const UUID & uuid); typedef std::unordered_map DetachedTables; diff --git a/src/Databases/DatabaseOnDisk.cpp b/src/Databases/DatabaseOnDisk.cpp index 8fa136f4969..8f24f53fc3f 100644 --- a/src/Databases/DatabaseOnDisk.cpp +++ b/src/Databases/DatabaseOnDisk.cpp @@ -193,11 +193,12 @@ void DatabaseOnDisk::createTable( out.close(); } - commitCreateTable(create, table, table_metadata_tmp_path, table_metadata_path); + commitCreateTable(create, table, table_metadata_tmp_path, table_metadata_path, context); } void DatabaseOnDisk::commitCreateTable(const ASTCreateQuery & query, const StoragePtr & table, - const String & table_metadata_tmp_path, const String & table_metadata_path) + const String & table_metadata_tmp_path, const String & table_metadata_path, + const Context & /*query_context*/) { try { diff --git a/src/Databases/DatabaseOnDisk.h b/src/Databases/DatabaseOnDisk.h index 23c1584ff9c..a5510ef4810 100644 --- a/src/Databases/DatabaseOnDisk.h +++ b/src/Databases/DatabaseOnDisk.h @@ -83,7 +83,7 @@ protected: ASTPtr getCreateQueryFromMetadata(const String & metadata_path, bool throw_on_error) const; virtual void commitCreateTable(const ASTCreateQuery & query, const StoragePtr & table, - const String & table_metadata_tmp_path, const String & table_metadata_path); + const String & table_metadata_tmp_path, const String & table_metadata_path, const Context & query_context); const String metadata_path; const String data_path; diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 7b6d98f992a..608d03c339b 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -29,10 +29,9 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; extern const int REPLICA_IS_ALREADY_EXIST; extern const int DATABASE_REPLICATION_FAILED; + extern const int UNKNOWN_DATABASE; } -static constexpr size_t METADATA_FILE_BUFFER_SIZE = 32768; - zkutil::ZooKeeperPtr DatabaseReplicated::getZooKeeper() const { return global_context.getZooKeeper(); @@ -43,15 +42,6 @@ static inline String getHostID(const Context & global_context) return Cluster::Address::toString(getFQDNOrHostName(), global_context.getTCPPort()); } -Strings DatabaseReplicated::getSnapshots(const ZooKeeperPtr & zookeeper) const -{ - Strings snapshots = zookeeper->getChildren(zookeeper_path + "/snapshots"); - std::sort(snapshots.begin(), snapshots.end()); - if (snapshots.empty()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "No snapshots found"); - return snapshots; 
-} - DatabaseReplicated::~DatabaseReplicated() = default; @@ -125,11 +115,9 @@ bool DatabaseReplicated::createDatabaseNodesInZooKeeper(const zkutil::ZooKeeperP ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path, "", zkutil::CreateMode::Persistent)); ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/log", "", zkutil::CreateMode::Persistent)); ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/replicas", "", zkutil::CreateMode::Persistent)); - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/snapshots", "", zkutil::CreateMode::Persistent)); - /// Create empty snapshot (with no tables) - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/snapshots/0", "", zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/counter", "", zkutil::CreateMode::Persistent)); ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/metadata", "", zkutil::CreateMode::Persistent)); - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/metadata/0", "", zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/min_log_ptr", "0", zkutil::CreateMode::Persistent)); Coordination::Responses responses; auto res = current_zookeeper->tryMulti(ops, responses); @@ -147,7 +135,7 @@ void DatabaseReplicated::createReplicaNodesInZooKeeper(const zkutil::ZooKeeperPt current_zookeeper->createAncestors(replica_path); /// When creating new replica, use latest snapshot version as initial value of log_pointer - log_entry_to_execute = parse(getSnapshots(current_zookeeper).back()); + log_entry_to_execute = 0; //FIXME /// Write host name to replica_path, it will protect from multiple replicas with the same name auto host_id = getHostID(global_context); @@ -160,10 +148,16 @@ void DatabaseReplicated::createReplicaNodesInZooKeeper(const zkutil::ZooKeeperPt recoverLostReplica(current_zookeeper, log_entry_to_execute, true); + String query_path_prefix = zookeeper_path + "/log/query-"; + String counter_prefix = zookeeper_path + "/counter/cnt-"; + String counter_path = current_zookeeper->create(counter_prefix, "", zkutil::CreateMode::EphemeralSequential); + String query_path = query_path_prefix + counter_path.substr(counter_prefix.size()); + Coordination::Requests ops; ops.emplace_back(zkutil::makeCreateRequest(replica_path, host_id, zkutil::CreateMode::Persistent)); ops.emplace_back(zkutil::makeCreateRequest(replica_path + "/log_ptr", toString(log_entry_to_execute), zkutil::CreateMode::Persistent)); - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/log/query-", entry.toString(), zkutil::CreateMode::PersistentSequential)); + ops.emplace_back(zkutil::makeCreateRequest(query_path, entry.toString(), zkutil::CreateMode::PersistentSequential)); + ops.emplace_back(zkutil::makeRemoveRequest(counter_path, -1)); current_zookeeper->multi(ops); } @@ -207,20 +201,17 @@ void DatabaseReplicated::onUnexpectedLogEntry(const String & entry_name, const Z if (entry_number < log_entry_to_execute) throw Exception(ErrorCodes::LOGICAL_ERROR, "Entry {} already executed, current pointer is {}", entry_number, log_entry_to_execute); - /// Entry name is valid. Let's get min snapshot version to check if replica is staled. - Strings snapshots = getSnapshots(zookeeper); - UInt32 min_snapshot = parse(snapshots.front()); + /// Entry name is valid. Let's get min log pointer to check if replica is staled. 
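The two recovery cases described in the comment above reduce to a comparison of three numbers: the unexpected entry's number, this replica's log pointer, and the oldest entry still kept in the log. A condensed, hypothetical decision function (the real code throws DATABASE_REPLICATION_FAILED and calls recoverLostReplica instead of returning an enum):

    #include <iostream>
    #include <stdexcept>

    enum class Action { Recover, Fail };

    // 'expected' is this replica's log pointer, 'min_log_ptr' is the oldest entry still kept in the log.
    Action onUnexpectedEntry(unsigned entry_number, unsigned expected, unsigned min_log_ptr)
    {
        if (entry_number < expected)
            throw std::logic_error("entry already executed"); // pointer moved backwards: a bug, not staleness
        if (expected < min_log_ptr)
            return Action::Recover; // log truncated past this replica: rebuild from the latest snapshot
        return Action::Fail;        // log looks intact but does not match: stop instead of guessing
    }

    int main()
    {
        std::cout << (onUnexpectedEntry(15, 3, 10) == Action::Recover) << "\n"; // stale replica -> 1
        std::cout << (onUnexpectedEntry(15, 12, 10) == Action::Fail) << "\n";   // broken log    -> 1
    }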
+ UInt32 min_snapshot = parse(zookeeper->get(zookeeper_path + "/min_log_ptr")); if (log_entry_to_execute < min_snapshot) { - recoverLostReplica(zookeeper, parse(snapshots.back())); + recoverLostReplica(zookeeper, 0); //FIXME log_pointer return; } throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot recover replica, probably it's a bug. " - "Got log entry '{}' when expected entry number {}, " - "available snapshots: ", - entry_name, log_entry_to_execute, boost::algorithm::join(snapshots, ", ")); + "Got log entry '{}' when expected entry number {}"); } void DatabaseReplicated::removeOutdatedSnapshotsAndLog() @@ -268,51 +259,11 @@ void DatabaseReplicated::removeOutdatedSnapshotsAndLog() } } -void DatabaseReplicated::onExecutedLogEntry(const String & entry_name, const ZooKeeperPtr & zookeeper) +void DatabaseReplicated::onExecutedLogEntry(const String & /*entry_name*/, const ZooKeeperPtr & /*zookeeper*/) { - assert(entry_name == DatabaseReplicatedExtensions::getLogEntryName(log_entry_to_execute)); - ++log_entry_to_execute; - if (snapshot_period > 0 && log_entry_to_execute % snapshot_period == 0) - { - createSnapshot(zookeeper); - } } -//void DatabaseReplicated::runBackgroundLogExecutor() -//{ -// if (last_executed_log_entry.empty()) -// { -// loadMetadataFromSnapshot(); -// } -// -// auto current_zookeeper = getZooKeeper(); -// Strings log_entry_names = current_zookeeper->getChildren(zookeeper_path + "/log"); -// -// std::sort(log_entry_names.begin(), log_entry_names.end()); -// auto newest_entry_it = std::upper_bound(log_entry_names.begin(), log_entry_names.end(), last_executed_log_entry); -// -// log_entry_names.erase(log_entry_names.begin(), newest_entry_it); -// -// for (const String & log_entry_name : log_entry_names) -// { -// //executeLogName(log_entry_name); -// last_executed_log_entry = log_entry_name; -// writeLastExecutedToDiskAndZK(); -// -// int log_n = parse(log_entry_name.substr(4)); -// int last_log_n = parse(log_entry_names.back().substr(4)); -// -// /// The third condition gurantees at most one snapshot creation per batch -// if (log_n > 0 && snapshot_period > 0 && (last_log_n - log_n) / snapshot_period == 0 && log_n % snapshot_period == 0) -// { -// createSnapshot(); -// } -// } -// -// //background_log_executor->scheduleAfter(500); -//} - void DatabaseReplicated::writeLastExecutedToDiskAndZK() { auto current_zookeeper = getZooKeeper(); @@ -363,58 +314,19 @@ BlockIO DatabaseReplicated::propose(const ASTPtr & query) } -void DatabaseReplicated::createSnapshot(const ZooKeeperPtr & zookeeper) -{ - String snapshot_path = zookeeper_path + "/snapshot/" + toString(log_entry_to_execute); - - if (zookeeper->exists(snapshot_path)) - return; - - std::vector> create_queries; - { - std::lock_guard lock{mutex}; - create_queries.reserve(tables.size()); - for (const auto & table : tables) - { - const String & name = table.first; - ReadBufferFromFile in(getObjectMetadataPath(name), METADATA_FILE_BUFFER_SIZE); - String attach_query; - readStringUntilEOF(attach_query, in); - create_queries.emplace_back(escapeForFileName(name), std::move(attach_query)); - } - } - - if (zookeeper->exists(snapshot_path)) - return; - - String queries_path = zookeeper_path + "/metadata/" + toString(log_entry_to_execute); - zookeeper->tryCreate(queries_path, "", zkutil::CreateMode::Persistent); - queries_path += '/'; - - //FIXME use tryMulti with MULTI_BATCH_SIZE - - for (const auto & table : create_queries) - zookeeper->tryCreate(queries_path + table.first, table.second, zkutil::CreateMode::Persistent); - - if 
(create_queries.size() != zookeeper->getChildren(zookeeper_path + "/metadata/" + toString(log_entry_to_execute)).size()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Created invalid snapshot"); - - zookeeper->tryCreate(snapshot_path, String(), zkutil::CreateMode::Persistent); -} - void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeeper, UInt32 from_snapshot, bool create) { LOG_WARNING(log, "Will recover replica from snapshot", from_snapshot); //FIXME drop old tables - String snapshot_metadata_path = zookeeper_path + "/metadata/" + toString(from_snapshot); + String snapshot_metadata_path = zookeeper_path + "/metadata"; Strings tables_in_snapshot = current_zookeeper->getChildren(snapshot_metadata_path); - current_zookeeper->get(zookeeper_path + "/snapshots/" + toString(from_snapshot)); /// Assert node exists snapshot_metadata_path += '/'; for (const auto & table_name : tables_in_snapshot) { + //FIXME It's not atomic. We need multiget here (available since ZooKeeper 3.6.0). String query_to_execute = current_zookeeper->get(snapshot_metadata_path + table_name); diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index 3f5bd4608f1..663df59ac63 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -64,10 +64,8 @@ private: void writeLastExecutedToDiskAndZK(); //void loadMetadataFromSnapshot(); - void createSnapshot(const ZooKeeperPtr & zookeeper); void removeOutdatedSnapshotsAndLog(); - Strings getSnapshots(const ZooKeeperPtr & zookeeper) const; void onUnexpectedLogEntry(const String & entry_name, const ZooKeeperPtr & zookeeper); void recoverLostReplica(const ZooKeeperPtr & current_zookeeper, UInt32 from_snapshot, bool create = false); diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index 1b9391b8725..a7309e9ae47 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -2415,4 +2415,17 @@ StorageID Context::resolveStorageIDImpl(StorageID storage_id, StorageNamespace w return StorageID::createEmpty(); } +void Context::initMetadataTransaction(MetadataTransactionPtr txn) +{ + assert(!metadata_transaction); + assert(query_context == this); + metadata_transaction = std::move(txn); +} + +MetadataTransactionPtr Context::getMetadataTransaction() const +{ + assert(query_context == this); + return metadata_transaction; +} + } diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index c55d8e6d604..ed11fab7599 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -114,6 +114,8 @@ using VolumePtr = std::shared_ptr; struct NamedSession; struct BackgroundTaskSchedulingSettings; +struct MetadataTransaction; +using MetadataTransactionPtr = std::shared_ptr; #if USE_EMBEDDED_COMPILER class CompiledExpressionCache; @@ -212,6 +214,12 @@ private: /// to be customized in HTTP and TCP servers by overloading the customizeContext(DB::Context&) /// methods. + MetadataTransactionPtr metadata_transaction; /// Distributed DDL context. I'm not sure if it's a suitable place for this, + /// but it's the easiest way to pass this through the whole stack from executeQuery(...) + /// to DatabaseOnDisk::commitCreateTable(...) or IStorage::alter(...) without changing + /// thousands of signatures. + /// And I hope it will be replaced with more common Transaction sometime. 
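MetadataTransaction is the vehicle introduced here: ZooKeeper ops are accumulated while a replicated DDL query executes and are later either committed with a single multi(...) or spliced into a larger request list via addOps(). A minimal compilable sketch of that shape, with plain strings standing in for Coordination::Requests:

    #include <algorithm>
    #include <iostream>
    #include <iterator>
    #include <memory>
    #include <string>
    #include <vector>

    using Request = std::string;            // stand-in for a ZooKeeper request, illustration only
    using Requests = std::vector<Request>;

    struct MetadataTransaction
    {
        std::string zookeeper_path;
        Requests ops;                       // accumulated while the DDL query runs

        void addOps(Requests & other_ops)   // splice our ops into somebody else's multi-request
        {
            std::move(ops.begin(), ops.end(), std::back_inserter(other_ops));
            ops.clear();                    // cleared here for clarity; moved-from strings are unspecified anyway
        }
    };

    int main()
    {
        auto txn = std::make_shared<MetadataTransaction>();
        txn->zookeeper_path = "/db";
        txn->ops.push_back("create /db/metadata/t1");

        // e.g. a table's ALTER merges the database-level ops into its own multi-op,
        // so database metadata and the table's log entry are committed together.
        Requests alter_ops = {"set /tables/t1/metadata"};
        txn->addOps(alter_ops);
        for (const auto & op : alter_ops)
            std::cout << op << "\n";
    }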
+ /// Use copy constructor or createGlobal() instead Context(); @@ -634,6 +642,9 @@ public: IHostContextPtr & getHostContext(); const IHostContextPtr & getHostContext() const; + void initMetadataTransaction(MetadataTransactionPtr txn); + MetadataTransactionPtr getMetadataTransaction() const; + struct MySQLWireContext { uint8_t sequence_id = 0; diff --git a/src/Interpreters/DDLTask.h b/src/Interpreters/DDLTask.h index 51f09efd0bd..ba58fe3f42e 100644 --- a/src/Interpreters/DDLTask.h +++ b/src/Interpreters/DDLTask.h @@ -1,12 +1,14 @@ #pragma once #include #include +#include namespace DB { class ASTQueryWithOnCluster; +using ZooKeeperPtr = std::shared_ptr; struct HostID { @@ -62,6 +64,8 @@ struct DDLTask String entry_path; DDLLogEntry entry; + bool we_are_initiator = false; + /// Stage 2: resolve host_id and check that HostID host_id; String host_id_str; @@ -82,7 +86,25 @@ struct DDLTask bool was_executed = false; /// Stage 4: commit results to ZooKeeper + + String active_path; + String finished_path; + String shard_path; }; +struct MetadataTransaction +{ + ZooKeeperPtr current_zookeeper; + String zookeeper_path; + Coordination::Requests ops; + + + + void addOps(Coordination::Requests & other_ops) + { + std::move(ops.begin(), ops.end(), std::back_inserter(other_ops)); + } +}; + } diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index 51f0e1b45a9..5e4d79c32ab 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -252,13 +252,35 @@ DDLTaskPtr DDLWorker::initAndCheckTask(const String & entry_name, String & out_r String node_data; String entry_path = queue_dir + "/" + entry_name; + auto task = std::make_unique(); + task->entry_name = entry_name; + task->entry_path = entry_path; + if (database_replicated_ext) { - auto expected_log_entry = DatabaseReplicatedExtensions::getLogEntryName(database_replicated_ext->first_not_executed); - if (entry_name != expected_log_entry) + //auto expected_log_entry = DatabaseReplicatedExtensions::getLogEntryName(database_replicated_ext->first_not_executed); + //if (entry_name != expected_log_entry) + //{ + // database_replicated_ext->lost_callback(entry_name, zookeeper); + // out_reason = "DatabaseReplicated: expected " + expected_log_entry + " got " + entry_name; + // return {}; + //} + + String initiator_name; + zkutil::EventPtr wait_committed_or_failed; + + if (zookeeper->tryGet(entry_path + "/try", initiator_name, nullptr, wait_committed_or_failed)) { - database_replicated_ext->lost_callback(entry_name, zookeeper); - out_reason = "DatabaseReplicated: expected " + expected_log_entry + " got " + entry_name; + task->we_are_initiator = initiator_name == database_replicated_ext->getFullReplicaName(); + /// Query is not committed yet. We cannot just skip it and execute next one, because reordering may break replication. 
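After the optional wait on <entry>/try, the outcome of initAndCheckTask reduces to a small decision: the initiator proceeds unconditionally, any other replica executes the entry only once /committed exists, and an entry whose /try vanished without /committed is treated as rolled back and skipped. A hypothetical condensation of that rule:

    #include <iostream>

    enum class Decision { Execute, Skip };

    // Called after the follower has finished waiting on <entry>/try; names are illustrative.
    Decision decide(bool we_are_initiator, bool committed_node_exists)
    {
        // The initiator always proceeds: it is the one who will create /committed (or remove /try on failure).
        // Everyone else executes only committed entries; an entry whose /try vanished without /committed
        // was rolled back and must be skipped, not executed.
        if (we_are_initiator || committed_node_exists)
            return Decision::Execute;
        return Decision::Skip;
    }

    int main()
    {
        std::cout << (decide(true, false) == Decision::Execute) << "\n";  // initiator          -> 1
        std::cout << (decide(false, true) == Decision::Execute) << "\n";  // committed entry    -> 1
        std::cout << (decide(false, false) == Decision::Skip) << "\n";    // rolled-back entry  -> 1
    }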
+ //FIXME add some timeouts + if (!task->we_are_initiator) + wait_committed_or_failed->wait(); + } + + if (!task->we_are_initiator && !zookeeper->exists(entry_path + "/committed")) + { + out_reason = "Entry " + entry_name + " hasn't been committed"; return {}; } } @@ -272,10 +294,6 @@ DDLTaskPtr DDLWorker::initAndCheckTask(const String & entry_name, String & out_r return {}; } - auto task = std::make_unique(); - task->entry_name = entry_name; - task->entry_path = entry_path; - try { task->entry.parse(node_data); @@ -557,15 +575,34 @@ bool DDLWorker::tryExecuteQuery(const String & query, const DDLTask & task, Exec try { auto current_context = std::make_unique(context); + current_context->makeQueryContext(); + current_context->setCurrentQueryId(""); // generate random query_id + if (database_replicated_ext) { current_context->getClientInfo().query_kind = ClientInfo::QueryKind::REPLICATED_LOG_QUERY; //FIXME why do we need separate query kind? current_context->setCurrentDatabase(database_replicated_ext->database_name); + + if (task.we_are_initiator) + { + auto txn = std::make_shared(); + current_context->initMetadataTransaction(txn); + txn->current_zookeeper = current_zookeeper; + txn->zookeeper_path = database_replicated_ext->zookeeper_path; + txn->ops.emplace_back(zkutil::makeRemoveRequest(task.entry_path + "/try", -1)); + txn->ops.emplace_back(zkutil::makeCreateRequest(task.entry_path + "/committed", + database_replicated_ext->getFullReplicaName(), zkutil::CreateMode::Persistent)); + txn->ops.emplace_back(zkutil::makeRemoveRequest(task.active_path, -1)); + if (!task.shard_path.empty()) + txn->ops.emplace_back(zkutil::makeCreateRequest(task.shard_path, task.host_id_str, zkutil::CreateMode::Persistent)); + txn->ops.emplace_back(zkutil::makeCreateRequest(task.finished_path, task.execution_status.serializeText(), zkutil::CreateMode::Persistent)); + //txn->ops.emplace_back(zkutil::makeSetRequest(database_replicated_ext->getReplicaPath() + "/log_ptr", toString(database_replicated_ext->first_not_executed), -1)); + } } else current_context->getClientInfo().query_kind = ClientInfo::QueryKind::SECONDARY_QUERY; - current_context->setCurrentQueryId(""); // generate random query_id + executeQuery(istr, ostr, false, *current_context, {}); } catch (...) 
@@ -639,8 +676,9 @@ void DDLWorker::processTask(DDLTask & task) LOG_DEBUG(log, "Processing task {} ({})", task.entry_name, task.entry.query); String dummy; - String active_node_path = task.entry_path + "/active/" + task.host_id_str; - String finished_node_path = task.entry_path + "/finished/" + task.host_id_str; + //FIXME duplicate + String active_node_path = task.active_path = task.entry_path + "/active/" + task.host_id_str; + String finished_node_path = task.finished_path = task.entry_path + "/finished/" + task.host_id_str; auto code = zookeeper->tryCreate(active_node_path, "", zkutil::CreateMode::Ephemeral, dummy); @@ -712,11 +750,15 @@ void DDLWorker::processTask(DDLTask & task) ops.emplace_back(zkutil::makeCreateRequest(finished_node_path, task.execution_status.serializeText(), zkutil::CreateMode::Persistent)); if (database_replicated_ext) { - assert(DatabaseReplicatedExtensions::getLogEntryName(database_replicated_ext->first_not_executed) == task.entry_name); - ops.emplace_back(zkutil::makeSetRequest(database_replicated_ext->getReplicaPath() + "/log_ptr", toString(database_replicated_ext->first_not_executed), -1)); + //assert(DatabaseReplicatedExtensions::getLogEntryName(database_replicated_ext->first_not_executed) == task.entry_name); + //ops.emplace_back(zkutil::makeSetRequest(database_replicated_ext->getReplicaPath() + "/log_ptr", toString(database_replicated_ext->first_not_executed), -1)); } - zookeeper->multi(ops); + //FIXME replace with multi(...) or use MetadataTransaction + Coordination::Responses responses; + auto res = zookeeper->tryMulti(ops, responses); + if (res != Coordination::Error::ZNODEEXISTS && res != Coordination::Error::ZNONODE) + zkutil::KeeperMultiException::check(res, ops, responses); if (database_replicated_ext) { @@ -774,6 +816,7 @@ bool DDLWorker::tryExecuteQueryOnLeaderReplica( else shard_node_name = get_shard_name(task.cluster->getShardsAddresses().at(task.host_shard_num)); String shard_path = node_path + "/shards/" + shard_node_name; + task.shard_path = shard_path; //FIXME duplicate String is_executed_path = shard_path + "/executed"; String tries_to_execute_path = shard_path + "/tries_to_execute"; zookeeper->createAncestors(shard_path + "/"); @@ -826,7 +869,8 @@ bool DDLWorker::tryExecuteQueryOnLeaderReplica( /// and on the next iteration new leader will take lock if (tryExecuteQuery(rewritten_query, task, task.execution_status)) { - zookeeper->create(is_executed_path, task.host_id_str, zkutil::CreateMode::Persistent); + //FIXME replace with create(...) 
or remove and use MetadataTransaction + zookeeper->createIfNotExists(is_executed_path, task.host_id_str); executed_by_leader = true; break; } @@ -976,7 +1020,27 @@ String DDLWorker::enqueueQuery(DDLLogEntry & entry) String query_path_prefix = queue_dir + "/query-"; zookeeper->createAncestors(query_path_prefix); - String node_path = zookeeper->create(query_path_prefix, entry.toString(), zkutil::CreateMode::PersistentSequential); + String node_path; + if (database_replicated_ext) + { + /// We cannot create sequential node and it's ephemeral child in a single transaction, so allocate sequential number another way + String counter_prefix = database_replicated_ext->zookeeper_path + "/counter/cnt-"; + String counter_path = zookeeper->create(counter_prefix, "", zkutil::CreateMode::EphemeralSequential); + node_path = query_path_prefix + counter_path.substr(counter_prefix.size()); + + Coordination::Requests ops; + /// Query is not committed yet, but we have to write it into log to avoid reordering + ops.emplace_back(zkutil::makeCreateRequest(node_path, entry.toString(), zkutil::CreateMode::Persistent)); + /// '/try' will be replaced with '/committed' or will be removed due to expired session or other error + ops.emplace_back(zkutil::makeCreateRequest(node_path + "/try", database_replicated_ext->getFullReplicaName(), zkutil::CreateMode::Ephemeral)); + /// We don't need it anymore + ops.emplace_back(zkutil::makeRemoveRequest(counter_path, -1)); + zookeeper->multi(ops); + } + else + { + node_path = zookeeper->create(query_path_prefix, entry.toString(), zkutil::CreateMode::PersistentSequential); + } /// Optional step try diff --git a/src/Interpreters/DDLWorker.h b/src/Interpreters/DDLWorker.h index 08bf641264e..86677bfbb19 100644 --- a/src/Interpreters/DDLWorker.h +++ b/src/Interpreters/DDLWorker.h @@ -54,6 +54,11 @@ struct DatabaseReplicatedExtensions return zookeeper_path + "/replicas/" + shard_name + "/" + replica_name; } + String getFullReplicaName() const + { + return shard_name + '|' + replica_name; + } + static String getLogEntryName(UInt32 log_entry_number); static UInt32 getLogEntryNumber(const String & log_entry_name); }; diff --git a/src/Interpreters/SystemLog.h b/src/Interpreters/SystemLog.h index 6c56565a152..20980a186cb 100644 --- a/src/Interpreters/SystemLog.h +++ b/src/Interpreters/SystemLog.h @@ -505,7 +505,9 @@ void SystemLog::prepareTable() LOG_DEBUG(log, "Existing table {} for system log has obsolete or different structure. Renaming it to {}", description, backQuoteIfNeed(to.table)); - InterpreterRenameQuery(rename, context).execute(); + Context query_context = context; + query_context.makeQueryContext(); + InterpreterRenameQuery(rename, query_context).execute(); /// The required table will be created. 
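enqueueQuery works around the fact that a sequential node and an ephemeral child of it cannot be created in one ZooKeeper transaction: a throwaway EphemeralSequential node under /counter only supplies the next number, and its numeric suffix then names the real /log/query-N entry created together with its /try child. The suffix extraction is a plain substr, as in counter_path.substr(counter_prefix.size()); a standalone illustration (paths below are examples):

    #include <iostream>
    #include <string>

    // counter_path is the EphemeralSequential node just created under /counter; only its numeric
    // suffix is reused to name the persistent log entry.
    std::string logEntryPathFromCounter(const std::string & queue_prefix,   // ".../log/query-"
                                        const std::string & counter_prefix, // ".../counter/cnt-"
                                        const std::string & counter_path)
    {
        return queue_prefix + counter_path.substr(counter_prefix.size());
    }

    int main()
    {
        std::cout << logEntryPathFromCounter("/db/log/query-", "/db/counter/cnt-", "/db/counter/cnt-0000000005")
                  << "\n"; // /db/log/query-0000000005
    }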
table = nullptr; @@ -521,7 +523,10 @@ void SystemLog::prepareTable() auto create = getCreateTableQuery(); - InterpreterCreateQuery interpreter(create, context); + + Context query_context = context; + query_context.makeQueryContext(); + InterpreterCreateQuery interpreter(create, query_context); interpreter.setInternal(true); interpreter.execute(); diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index b93500000b5..5c176de1395 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -4104,6 +4105,12 @@ void StorageReplicatedMergeTree::alter( zkutil::makeCreateRequest(mutations_path + "/", mutation_entry.toString(), zkutil::CreateMode::PersistentSequential)); } + if (auto txn = query_context.getMetadataTransaction()) + { + txn->addOps(ops); + //TODO maybe also change here table metadata in replicated database? + } + Coordination::Responses results; Coordination::Error rc = zookeeper->tryMulti(ops, results); From dad21ee684c5869d1c83b572cdec5c6f3bcb9130 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Tue, 24 Nov 2020 13:24:39 +0300 Subject: [PATCH 0063/2357] maintain metadata in zk --- src/Common/ZooKeeper/ZooKeeper.cpp | 8 +++ src/Databases/DatabaseAtomic.cpp | 56 ++++++++++++++++- src/Databases/DatabaseAtomic.h | 2 +- src/Databases/DatabaseOrdinary.cpp | 4 +- src/Databases/DatabaseOrdinary.h | 2 +- src/Databases/DatabaseReplicated.cpp | 4 +- src/Interpreters/DDLWorker.cpp | 24 +++----- src/Interpreters/InterpreterAlterQuery.cpp | 4 +- src/Storages/StorageReplicatedMergeTree.cpp | 30 ++++++++-- .../test_replicated_database/test.py | 60 +++++++++++-------- 10 files changed, 140 insertions(+), 54 deletions(-) diff --git a/src/Common/ZooKeeper/ZooKeeper.cpp b/src/Common/ZooKeeper/ZooKeeper.cpp index bee875d1c74..09703e523bb 100644 --- a/src/Common/ZooKeeper/ZooKeeper.cpp +++ b/src/Common/ZooKeeper/ZooKeeper.cpp @@ -537,6 +537,14 @@ Coordination::Error ZooKeeper::trySet(const std::string & path, const std::strin Coordination::Error ZooKeeper::multiImpl(const Coordination::Requests & requests, Coordination::Responses & responses) { + String desc; + for (const auto & r : requests) + { + auto & r_ref = *r; + desc += String(typeid(r_ref).name()) + "\t" + r->getPath() + "\n"; + } + LOG_TRACE(&Poco::Logger::get("ZKTX"), "zk multi {}", desc); + if (requests.empty()) return Coordination::Error::ZOK; diff --git a/src/Databases/DatabaseAtomic.cpp b/src/Databases/DatabaseAtomic.cpp index 78400368924..ca39cefc5c8 100644 --- a/src/Databases/DatabaseAtomic.cpp +++ b/src/Databases/DatabaseAtomic.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -108,7 +109,7 @@ StoragePtr DatabaseAtomic::detachTable(const String & name) return table; } -void DatabaseAtomic::dropTable(const Context &, const String & table_name, bool no_delay) +void DatabaseAtomic::dropTable(const Context & context, const String & table_name, bool no_delay) { String table_metadata_path = getObjectMetadataPath(table_name); String table_metadata_path_drop; @@ -117,6 +118,16 @@ void DatabaseAtomic::dropTable(const Context &, const String & table_name, bool std::unique_lock lock(mutex); table = getTableUnlocked(table_name, lock); table_metadata_path_drop = DatabaseCatalog::instance().getPathForDroppedMetadata(table->getStorageID()); + + if (auto txn = context.getMetadataTransaction()) + { + String metadata_zk_path = 
txn->zookeeper_path + "/metadata/" + escapeForFileName(table_name); + txn->ops.emplace_back(zkutil::makeRemoveRequest(metadata_zk_path, -1)); + txn->current_zookeeper->multi(txn->ops); /// Commit point (a sort of) for Replicated database + /// NOTE: replica will be lost if server crashes before the following rename + /// TODO better detection and recovery + } + Poco::File(table_metadata_path).renameTo(table_metadata_path_drop); /// Mark table as dropped DatabaseWithDictionaries::detachTableUnlocked(table_name, lock); /// Should never throw table_name_to_path.erase(table_name); @@ -146,6 +157,8 @@ void DatabaseAtomic::renameTable(const Context & context, const String & table_n if (exchange && dictionary) throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Cannot exchange dictionaries"); + if (exchange && !supportsRenameat2()) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "RENAME EXCHANGE is not supported"); auto & other_db = dynamic_cast(to_database); bool inside_database = this == &other_db; @@ -231,6 +244,33 @@ void DatabaseAtomic::renameTable(const Context & context, const String & table_n } /// Table renaming actually begins here + if (auto txn = context.getMetadataTransaction()) + { + String statement; + String statement_to; + { + ReadBufferFromFile in(old_metadata_path, 4096); + readStringUntilEOF(statement, in); + if (exchange) + { + ReadBufferFromFile in_to(new_metadata_path, 4096); + readStringUntilEOF(statement_to, in_to); + } + } + String metadata_zk_path = txn->zookeeper_path + "/metadata/" + escapeForFileName(table_name); + String metadata_zk_path_to = txn->zookeeper_path + "/metadata/" + escapeForFileName(to_table_name); + txn->ops.emplace_back(zkutil::makeRemoveRequest(metadata_zk_path, -1)); + if (exchange) + { + txn->ops.emplace_back(zkutil::makeRemoveRequest(metadata_zk_path_to, -1)); + txn->ops.emplace_back(zkutil::makeCreateRequest(metadata_zk_path, statement_to, zkutil::CreateMode::Persistent)); + } + txn->ops.emplace_back(zkutil::makeCreateRequest(metadata_zk_path_to, statement, zkutil::CreateMode::Persistent)); + txn->current_zookeeper->multi(txn->ops); /// Commit point (a sort of) for Replicated database + /// NOTE: replica will be lost if server crashes before the following rename + /// TODO better detection and recovery + } + if (exchange) renameExchange(old_metadata_path, new_metadata_path); else @@ -312,7 +352,7 @@ void DatabaseAtomic::commitCreateTable(const ASTCreateQuery & query, const Stora tryCreateSymlink(query.table, table_data_path); } -void DatabaseAtomic::commitAlterTable(const StorageID & table_id, const String & table_metadata_tmp_path, const String & table_metadata_path) +void DatabaseAtomic::commitAlterTable(const StorageID & table_id, const String & table_metadata_tmp_path, const String & table_metadata_path, const String & statement, const Context & query_context) { bool check_file_exists = true; SCOPE_EXIT({ std::error_code code; if (check_file_exists) std::filesystem::remove(table_metadata_tmp_path, code); }); @@ -323,6 +363,18 @@ void DatabaseAtomic::commitAlterTable(const StorageID & table_id, const String & if (table_id.uuid != actual_table_id.uuid) throw Exception("Cannot alter table because it was renamed", ErrorCodes::CANNOT_ASSIGN_ALTER); + if (&query_context != &query_context.getGlobalContext()) // FIXME + { + if (auto txn = query_context.getMetadataTransaction()) + { + String metadata_zk_path = txn->zookeeper_path + "/metadata/" + escapeForFileName(table_id.table_name); + txn->ops.emplace_back(zkutil::makeSetRequest(metadata_zk_path, 
statement, -1)); + txn->current_zookeeper->multi(txn->ops); /// Commit point (a sort of) for Replicated database + /// NOTE: replica will be lost if server crashes before the following rename + /// TODO better detection and recovery + } + } + check_file_exists = renameExchangeIfSupported(table_metadata_tmp_path, table_metadata_path); if (!check_file_exists) std::filesystem::rename(table_metadata_tmp_path, table_metadata_path); diff --git a/src/Databases/DatabaseAtomic.h b/src/Databases/DatabaseAtomic.h index 61ce2721701..9cc6a429656 100644 --- a/src/Databases/DatabaseAtomic.h +++ b/src/Databases/DatabaseAtomic.h @@ -61,7 +61,7 @@ public: void waitDetachedTableNotInUse(const UUID & uuid); protected: - void commitAlterTable(const StorageID & table_id, const String & table_metadata_tmp_path, const String & table_metadata_path) override; + void commitAlterTable(const StorageID & table_id, const String & table_metadata_tmp_path, const String & table_metadata_path, const String & statement, const Context & query_context) override; void commitCreateTable(const ASTCreateQuery & query, const StoragePtr & table, const String & table_metadata_tmp_path, const String & table_metadata_path, const Context & query_context) override; diff --git a/src/Databases/DatabaseOrdinary.cpp b/src/Databases/DatabaseOrdinary.cpp index b363058c0c6..3df0d8fe907 100644 --- a/src/Databases/DatabaseOrdinary.cpp +++ b/src/Databases/DatabaseOrdinary.cpp @@ -312,10 +312,10 @@ void DatabaseOrdinary::alterTable(const Context & context, const StorageID & tab out.close(); } - commitAlterTable(table_id, table_metadata_tmp_path, table_metadata_path); + commitAlterTable(table_id, table_metadata_tmp_path, table_metadata_path, statement, context); } -void DatabaseOrdinary::commitAlterTable(const StorageID &, const String & table_metadata_tmp_path, const String & table_metadata_path) +void DatabaseOrdinary::commitAlterTable(const StorageID &, const String & table_metadata_tmp_path, const String & table_metadata_path, const String & /*statement*/, const Context & /*query_context*/) { try { diff --git a/src/Databases/DatabaseOrdinary.h b/src/Databases/DatabaseOrdinary.h index b5ea286ef15..6a21e19d5e2 100644 --- a/src/Databases/DatabaseOrdinary.h +++ b/src/Databases/DatabaseOrdinary.h @@ -30,7 +30,7 @@ public: const StorageInMemoryMetadata & metadata) override; protected: - virtual void commitAlterTable(const StorageID & table_id, const String & table_metadata_tmp_path, const String & table_metadata_path); + virtual void commitAlterTable(const StorageID & table_id, const String & table_metadata_tmp_path, const String & table_metadata_path, const String & statement, const Context & query_context); void startupTables(ThreadPool & thread_pool); }; diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 608d03c339b..25fb95ba0de 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -146,8 +146,6 @@ void DatabaseReplicated::createReplicaNodesInZooKeeper(const zkutil::ZooKeeperPt entry.query = {}; entry.initiator = {}; - recoverLostReplica(current_zookeeper, log_entry_to_execute, true); - String query_path_prefix = zookeeper_path + "/log/query-"; String counter_prefix = zookeeper_path + "/counter/cnt-"; String counter_path = current_zookeeper->create(counter_prefix, "", zkutil::CreateMode::EphemeralSequential); @@ -165,6 +163,8 @@ void DatabaseReplicated::loadStoredObjects(Context & context, bool has_force_res { DatabaseAtomic::loadStoredObjects(context, 
has_force_restore_data_flag, force_attach); + recoverLostReplica(global_context.getZooKeeper(), 0, true); //FIXME + DatabaseReplicatedExtensions ext; ext.database_uuid = getUUID(); ext.zookeeper_path = zookeeper_path; diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index 5e4d79c32ab..099b968d895 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -258,16 +258,8 @@ DDLTaskPtr DDLWorker::initAndCheckTask(const String & entry_name, String & out_r if (database_replicated_ext) { - //auto expected_log_entry = DatabaseReplicatedExtensions::getLogEntryName(database_replicated_ext->first_not_executed); - //if (entry_name != expected_log_entry) - //{ - // database_replicated_ext->lost_callback(entry_name, zookeeper); - // out_reason = "DatabaseReplicated: expected " + expected_log_entry + " got " + entry_name; - // return {}; - //} - String initiator_name; - zkutil::EventPtr wait_committed_or_failed; + zkutil::EventPtr wait_committed_or_failed = std::make_shared(); if (zookeeper->tryGet(entry_path + "/try", initiator_name, nullptr, wait_committed_or_failed)) { @@ -275,7 +267,10 @@ DDLTaskPtr DDLWorker::initAndCheckTask(const String & entry_name, String & out_r /// Query is not committed yet. We cannot just skip it and execute next one, because reordering may break replication. //FIXME add some timeouts if (!task->we_are_initiator) + { + LOG_TRACE(log, "Waiting for initiator {} to commit or rollback entry {}", initiator_name, entry_path); wait_committed_or_failed->wait(); + } } if (!task->we_are_initiator && !zookeeper->exists(entry_path + "/committed")) @@ -378,7 +373,10 @@ void DDLWorker::scheduleTasks() Strings queue_nodes = zookeeper->getChildren(queue_dir, nullptr, queue_updated_event); filterAndSortQueueNodes(queue_nodes); if (queue_nodes.empty()) + { + LOG_TRACE(log, "No tasks to schedule"); return; + } bool server_startup = last_tasks.empty(); @@ -389,6 +387,7 @@ void DDLWorker::scheduleTasks() for (auto it = begin_node; it != queue_nodes.end() && !stop_flag; ++it) { String entry_name = *it; + LOG_TRACE(log, "Checking task {}", entry_name); String reason; auto task = initAndCheckTask(entry_name, reason, zookeeper); @@ -748,11 +747,6 @@ void DDLWorker::processTask(DDLTask & task) Coordination::Requests ops; ops.emplace_back(zkutil::makeRemoveRequest(active_node_path, -1)); ops.emplace_back(zkutil::makeCreateRequest(finished_node_path, task.execution_status.serializeText(), zkutil::CreateMode::Persistent)); - if (database_replicated_ext) - { - //assert(DatabaseReplicatedExtensions::getLogEntryName(database_replicated_ext->first_not_executed) == task.entry_name); - //ops.emplace_back(zkutil::makeSetRequest(database_replicated_ext->getReplicaPath() + "/log_ptr", toString(database_replicated_ext->first_not_executed), -1)); - } //FIXME replace with multi(...) 
or use MetadataTransaction Coordination::Responses responses; @@ -816,8 +810,8 @@ bool DDLWorker::tryExecuteQueryOnLeaderReplica( else shard_node_name = get_shard_name(task.cluster->getShardsAddresses().at(task.host_shard_num)); String shard_path = node_path + "/shards/" + shard_node_name; - task.shard_path = shard_path; //FIXME duplicate String is_executed_path = shard_path + "/executed"; + task.shard_path = is_executed_path; //FIXME duplicate String tries_to_execute_path = shard_path + "/tries_to_execute"; zookeeper->createAncestors(shard_path + "/"); diff --git a/src/Interpreters/InterpreterAlterQuery.cpp b/src/Interpreters/InterpreterAlterQuery.cpp index c094bb8377c..5f6058b48c0 100644 --- a/src/Interpreters/InterpreterAlterQuery.cpp +++ b/src/Interpreters/InterpreterAlterQuery.cpp @@ -51,9 +51,11 @@ BlockIO InterpreterAlterQuery::execute() auto metadata_snapshot = table->getInMemoryMetadataPtr(); DatabasePtr database = DatabaseCatalog::instance().getDatabase(table_id.database_name); - if (typeid_cast(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY && !table->supportsReplication()) + if (typeid_cast(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) return typeid_cast(database.get())->propose(query_ptr); + //FIXME commit MetadataTransaction for all ALTER kinds. Now its' implemented only for metadata alter. + /// Add default database to table identifiers that we can encounter in e.g. default expressions, /// mutation expression, etc. AddDefaultDatabaseVisitor visitor(table_id.getDatabaseName()); diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 5c176de1395..9db2821502d 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -30,6 +30,7 @@ #include #include +#include #include #include @@ -4047,6 +4048,8 @@ void StorageReplicatedMergeTree::alter( future_metadata_in_zk.constraints = new_constraints_str; Coordination::Requests ops; + size_t alter_path_idx = std::numeric_limits::max(); + size_t mutation_path_idx = std::numeric_limits::max(); String new_metadata_str = future_metadata_in_zk.toString(); ops.emplace_back(zkutil::makeSetRequest(zookeeper_path + "/metadata", new_metadata_str, metadata_version)); @@ -4078,6 +4081,7 @@ void StorageReplicatedMergeTree::alter( *current_metadata, query_context.getSettingsRef().materialize_ttl_after_modify, query_context); alter_entry->have_mutation = !maybe_mutation_commands.empty(); + alter_path_idx = ops.size(); ops.emplace_back(zkutil::makeCreateRequest( zookeeper_path + "/log/log-", alter_entry->toString(), zkutil::CreateMode::PersistentSequential)); @@ -4101,6 +4105,7 @@ void StorageReplicatedMergeTree::alter( mutation_entry.create_time = time(nullptr); ops.emplace_back(zkutil::makeSetRequest(mutations_path, String(), mutations_stat.version)); + mutation_path_idx = ops.size(); ops.emplace_back( zkutil::makeCreateRequest(mutations_path + "/", mutation_entry.toString(), zkutil::CreateMode::PersistentSequential)); } @@ -4108,7 +4113,24 @@ void StorageReplicatedMergeTree::alter( if (auto txn = query_context.getMetadataTransaction()) { txn->addOps(ops); - //TODO maybe also change here table metadata in replicated database? + /// NOTE: IDatabase::alterTable(...) is called when executing ALTER_METADATA queue entry without query context, + /// so we have to update metadata of DatabaseReplicated here. 
+ /// It also may cause "Table columns structure in ZooKeeper is different" error on server startup + /// even for Ordinary and Atomic databases. + String metadata_zk_path = txn->zookeeper_path + "/metadata/" + escapeForFileName(table_id.table_name); + auto ast = DatabaseCatalog::instance().getDatabase(table_id.database_name)->getCreateTableQuery(table_id.table_name, query_context); + auto & ast_create_query = ast->as(); + + //FIXME copy-paste + ASTPtr new_columns = InterpreterCreateQuery::formatColumns(future_metadata.columns); + ASTPtr new_indices = InterpreterCreateQuery::formatIndices(future_metadata.secondary_indices); + ASTPtr new_constraints = InterpreterCreateQuery::formatConstraints(future_metadata.constraints); + + ast_create_query.columns_list->replace(ast_create_query.columns_list->columns, new_columns); + ast_create_query.columns_list->setOrReplace(ast_create_query.columns_list->indices, new_indices); + ast_create_query.columns_list->setOrReplace(ast_create_query.columns_list->constraints, new_constraints); + + ops.emplace_back(zkutil::makeSetRequest(metadata_zk_path, getObjectDefinitionFromCreateQuery(ast), -1)); } Coordination::Responses results; @@ -4124,17 +4146,17 @@ void StorageReplicatedMergeTree::alter( if (alter_entry->have_mutation) { /// ALTER_METADATA record in replication /log - String alter_path = dynamic_cast(*results[2]).path_created; + String alter_path = dynamic_cast(*results[alter_path_idx]).path_created; alter_entry->znode_name = alter_path.substr(alter_path.find_last_of('/') + 1); /// ReplicatedMergeTreeMutationEntry record in /mutations - String mutation_path = dynamic_cast(*results.back()).path_created; + String mutation_path = dynamic_cast(*results[mutation_path_idx]).path_created; mutation_znode = mutation_path.substr(mutation_path.find_last_of('/') + 1); } else { /// ALTER_METADATA record in replication /log - String alter_path = dynamic_cast(*results.back()).path_created; + String alter_path = dynamic_cast(*results[alter_path_idx]).path_created; alter_entry->znode_name = alter_path.substr(alter_path.find_last_of('/') + 1); } break; diff --git a/tests/integration/test_replicated_database/test.py b/tests/integration/test_replicated_database/test.py index 06d8aa9467a..11bfbad393b 100644 --- a/tests/integration/test_replicated_database/test.py +++ b/tests/integration/test_replicated_database/test.py @@ -16,7 +16,7 @@ snapshot_recovering_node = cluster.add_instance('snapshot_recovering_node', main uuid_regex = re.compile("[0-9a-f]{8}\-[0-9a-f]{4}\-[0-9a-f]{4}\-[0-9a-f]{4}\-[0-9a-f]{12}") def assert_create_query(nodes, table_name, expected): replace_uuid = lambda x: re.sub(uuid_regex, "uuid", x) - query = "show create table testdb.{}".format(table_name) + query = "show create table {}".format(table_name) for node in nodes: assert_eq_with_retry(node, query, expected, get_result=replace_uuid) @@ -41,45 +41,53 @@ def test_create_replicated_table(started_cluster): expected = "CREATE TABLE testdb.replicated_table\\n(\\n `d` Date,\\n `k` UInt64,\\n `i32` Int32\\n)\\n" \ "ENGINE = ReplicatedMergeTree(\\'/clickhouse/tables/uuid/{shard}\\', \\'{replica}\\')\\n" \ "PARTITION BY toYYYYMM(d)\\nORDER BY k\\nSETTINGS index_granularity = 8192" - assert_create_query([main_node, dummy_node], "replicated_table", expected) + assert_create_query([main_node, dummy_node], "testdb.replicated_table", expected) # assert without replacing uuid assert main_node.query("show create testdb.replicated_table") == dummy_node.query("show create testdb.replicated_table") -def 
test_simple_alter_table(started_cluster): - #TODO add test with ReplicatedMergeTree - main_node.query("CREATE TABLE testdb.alter_test " +@pytest.mark.parametrize("engine", ['MergeTree', 'ReplicatedMergeTree']) +def test_simple_alter_table(started_cluster, engine): + name = "testdb.alter_test_{}".format(engine) + main_node.query("CREATE TABLE {} " "(CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) " - "ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192);") - main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN Added0 UInt32;") - main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN Added2 UInt32;") - main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN Added1 UInt32 AFTER Added0;") - main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN AddedNested1 Nested(A UInt32, B UInt64) AFTER Added2;") - main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN AddedNested1.C Array(String) AFTER AddedNested1.B;") - main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN AddedNested2 Nested(A UInt32, B UInt64) AFTER AddedNested1;") + "ENGINE = {} PARTITION BY StartDate ORDER BY (CounterID, StartDate, intHash32(UserID), VisitID);".format(name, engine)) + main_node.query("ALTER TABLE {} ADD COLUMN Added0 UInt32;".format(name)) + main_node.query("ALTER TABLE {} ADD COLUMN Added2 UInt32;".format(name)) + main_node.query("ALTER TABLE {} ADD COLUMN Added1 UInt32 AFTER Added0;".format(name)) + main_node.query("ALTER TABLE {} ADD COLUMN AddedNested1 Nested(A UInt32, B UInt64) AFTER Added2;".format(name)) + main_node.query("ALTER TABLE {} ADD COLUMN AddedNested1.C Array(String) AFTER AddedNested1.B;".format(name)) + main_node.query("ALTER TABLE {} ADD COLUMN AddedNested2 Nested(A UInt32, B UInt64) AFTER AddedNested1;".format(name)) - expected = "CREATE TABLE testdb.alter_test\\n(\\n `CounterID` UInt32,\\n `StartDate` Date,\\n `UserID` UInt32,\\n" \ + full_engine = engine if not "Replicated" in engine else engine + "(\\'/clickhouse/tables/uuid/{shard}\\', \\'{replica}\\')" + expected = "CREATE TABLE {}\\n(\\n `CounterID` UInt32,\\n `StartDate` Date,\\n `UserID` UInt32,\\n" \ " `VisitID` UInt32,\\n `NestedColumn.A` Array(UInt8),\\n `NestedColumn.S` Array(String),\\n" \ " `ToDrop` UInt32,\\n `Added0` UInt32,\\n `Added1` UInt32,\\n `Added2` UInt32,\\n" \ " `AddedNested1.A` Array(UInt32),\\n `AddedNested1.B` Array(UInt64),\\n `AddedNested1.C` Array(String),\\n" \ " `AddedNested2.A` Array(UInt32),\\n `AddedNested2.B` Array(UInt64)\\n)\\n" \ - "ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192)" + "ENGINE = {}\\nPARTITION BY StartDate\\nORDER BY (CounterID, StartDate, intHash32(UserID), VisitID)\\n" \ + "SETTINGS index_granularity = 8192".format(name, full_engine) - assert_create_query([main_node, dummy_node], "alter_test", expected) + assert_create_query([main_node, dummy_node], name, expected) -def test_create_replica_after_delay(started_cluster): + +@pytest.mark.parametrize("engine", ['MergeTree', 'ReplicatedMergeTree']) +def test_create_replica_after_delay(started_cluster, engine): competing_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'shard1', 'replica3');") - main_node.query("ALTER TABLE testdb.alter_test ADD COLUMN Added3 UInt32;") - main_node.query("ALTER TABLE testdb.alter_test DROP COLUMN AddedNested1;") - main_node.query("ALTER TABLE testdb.alter_test RENAME COLUMN 
Added1 TO AddedNested1;") + name = "testdb.alter_test_{}".format(engine) + main_node.query("ALTER TABLE {} ADD COLUMN Added3 UInt32;".format(name)) + main_node.query("ALTER TABLE {} DROP COLUMN AddedNested1;".format(name)) + main_node.query("ALTER TABLE {} RENAME COLUMN Added1 TO AddedNested1;".format(name)) - expected = "CREATE TABLE testdb.alter_test\\n(\\n `CounterID` UInt32,\\n `StartDate` Date,\\n `UserID` UInt32,\\n" \ + full_engine = engine if not "Replicated" in engine else engine + "(\\'/clickhouse/tables/uuid/{shard}\\', \\'{replica}\\')" + expected = "CREATE TABLE {}\\n(\\n `CounterID` UInt32,\\n `StartDate` Date,\\n `UserID` UInt32,\\n" \ " `VisitID` UInt32,\\n `NestedColumn.A` Array(UInt8),\\n `NestedColumn.S` Array(String),\\n" \ " `ToDrop` UInt32,\\n `Added0` UInt32,\\n `AddedNested1` UInt32,\\n `Added2` UInt32,\\n" \ " `AddedNested2.A` Array(UInt32),\\n `AddedNested2.B` Array(UInt64),\\n `Added3` UInt32\\n)\\n" \ - "ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192)" + "ENGINE = {}\\nPARTITION BY StartDate\\nORDER BY (CounterID, StartDate, intHash32(UserID), VisitID)\\n" \ + "SETTINGS index_granularity = 8192".format(name, full_engine) - assert_create_query([main_node, dummy_node, competing_node], "alter_test", expected) + assert_create_query([main_node, dummy_node, competing_node], name, expected) def test_alters_from_different_replicas(started_cluster): main_node.query("CREATE TABLE testdb.concurrent_test " @@ -103,7 +111,7 @@ def test_alters_from_different_replicas(started_cluster): " `AddedNested2.B` Array(UInt64)\\n)\\n" \ "ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192)" - assert_create_query([main_node, competing_node], "concurrent_test", expected) + assert_create_query([main_node, competing_node], "testdb.concurrent_test", expected) def test_drop_and_create_table(started_cluster): main_node.query("DROP TABLE testdb.concurrent_test") @@ -115,7 +123,7 @@ def test_drop_and_create_table(started_cluster): " `VisitID` UInt32,\\n `NestedColumn.A` Array(UInt8),\\n `NestedColumn.S` Array(String),\\n `ToDrop` UInt32\\n)\\n" \ "ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192)" - assert_create_query([main_node, competing_node], "concurrent_test", expected) + assert_create_query([main_node, competing_node], "testdb.concurrent_test", expected) def test_replica_restart(started_cluster): main_node.restart_clickhouse() @@ -124,7 +132,7 @@ def test_replica_restart(started_cluster): " `VisitID` UInt32,\\n `NestedColumn.A` Array(UInt8),\\n `NestedColumn.S` Array(String),\\n `ToDrop` UInt32\\n)\\n" \ "ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192)" - assert_create_query([main_node, competing_node], "concurrent_test", expected) + assert_create_query([main_node, competing_node], "testdb.concurrent_test", expected) def test_snapshot_and_snapshot_recover(started_cluster): #FIXME bad test @@ -142,7 +150,7 @@ def test_drop_and_create_replica(started_cluster): " `VisitID` UInt32,\\n `NestedColumn.A` Array(UInt8),\\n `NestedColumn.S` Array(String),\\n `ToDrop` UInt32\\n)\\n" \ "ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192)" - assert_create_query([main_node, competing_node], "concurrent_test", expected) + assert_create_query([main_node, competing_node], "testdb.concurrent_test", expected) #TODO tests with 
Distributed From f1a52a609bd6ced447fbb2cb4102675c798e32c0 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Fri, 27 Nov 2020 17:04:03 +0300 Subject: [PATCH 0064/2357] separate DatabaseReplicatedDDLWorker --- src/Databases/DatabaseAtomic.cpp | 4 +- src/Databases/DatabaseAtomic.h | 4 +- src/Databases/DatabaseLazy.cpp | 2 +- src/Databases/DatabaseLazy.h | 2 +- src/Databases/DatabaseOnDisk.cpp | 2 +- src/Databases/DatabaseOnDisk.h | 2 +- src/Databases/DatabaseOrdinary.cpp | 4 +- src/Databases/DatabaseOrdinary.h | 4 +- src/Databases/DatabaseReplicated.cpp | 91 +++-- src/Databases/DatabaseReplicated.h | 13 +- src/Databases/DatabaseReplicatedWorker.cpp | 114 ++++++ src/Databases/DatabaseReplicatedWorker.h | 26 ++ src/Databases/DatabaseWithDictionaries.cpp | 2 +- src/Databases/DatabaseWithDictionaries.h | 2 +- src/Interpreters/Context.cpp | 3 +- src/Interpreters/DDLTask.cpp | 280 +++++++++++++ src/Interpreters/DDLTask.h | 85 +++- src/Interpreters/DDLWorker.cpp | 371 ++---------------- src/Interpreters/DDLWorker.h | 64 +-- .../configs/config.xml | 3 + .../configs/disable_snapshots.xml | 3 - .../configs/snapshot_each_query.xml | 3 - .../test_replicated_database/test.py | 21 +- 23 files changed, 639 insertions(+), 466 deletions(-) create mode 100644 src/Databases/DatabaseReplicatedWorker.cpp create mode 100644 src/Databases/DatabaseReplicatedWorker.h create mode 100644 tests/integration/test_replicated_database/configs/config.xml delete mode 100644 tests/integration/test_replicated_database/configs/disable_snapshots.xml delete mode 100644 tests/integration/test_replicated_database/configs/snapshot_each_query.xml diff --git a/src/Databases/DatabaseAtomic.cpp b/src/Databases/DatabaseAtomic.cpp index ca39cefc5c8..a444d9cc200 100644 --- a/src/Databases/DatabaseAtomic.cpp +++ b/src/Databases/DatabaseAtomic.cpp @@ -38,12 +38,12 @@ public: UUID uuid() const override { return table()->getStorageID().uuid; } }; -DatabaseAtomic::DatabaseAtomic(String name_, String metadata_path_, UUID uuid, Context & context_) +DatabaseAtomic::DatabaseAtomic(String name_, String metadata_path_, UUID uuid, const Context & context_) : DatabaseAtomic(name_, metadata_path_, uuid, "DatabaseAtomic (" + name_ + ")", context_) { } -DatabaseAtomic::DatabaseAtomic(String name_, String metadata_path_, UUID uuid, const String & logger, Context & context_) +DatabaseAtomic::DatabaseAtomic(String name_, String metadata_path_, UUID uuid, const String & logger, const Context & context_) : DatabaseOrdinary(name_, std::move(metadata_path_), "store/", logger, context_) , path_to_table_symlinks(global_context.getPath() + "data/" + escapeForFileName(name_) + "/") , path_to_metadata_symlink(global_context.getPath() + "metadata/" + escapeForFileName(name_)) diff --git a/src/Databases/DatabaseAtomic.h b/src/Databases/DatabaseAtomic.h index 9cc6a429656..e9cb418c787 100644 --- a/src/Databases/DatabaseAtomic.h +++ b/src/Databases/DatabaseAtomic.h @@ -20,8 +20,8 @@ namespace DB class DatabaseAtomic : public DatabaseOrdinary { public: - DatabaseAtomic(String name_, String metadata_path_, UUID uuid, Context & context_); - DatabaseAtomic(String name_, String metadata_path_, UUID uuid, const String & logger, Context & context_); + DatabaseAtomic(String name_, String metadata_path_, UUID uuid, const Context & context_); + DatabaseAtomic(String name_, String metadata_path_, UUID uuid, const String & logger, const Context & context_); String getEngineName() const override { return "Atomic"; } UUID getUUID() const override { return db_uuid; } diff --git 
a/src/Databases/DatabaseLazy.cpp b/src/Databases/DatabaseLazy.cpp index a4ace4bde9b..0119f17f843 100644 --- a/src/Databases/DatabaseLazy.cpp +++ b/src/Databases/DatabaseLazy.cpp @@ -27,7 +27,7 @@ namespace ErrorCodes } -DatabaseLazy::DatabaseLazy(const String & name_, const String & metadata_path_, time_t expiration_time_, Context & context_) +DatabaseLazy::DatabaseLazy(const String & name_, const String & metadata_path_, time_t expiration_time_, const Context & context_) : DatabaseOnDisk(name_, metadata_path_, "data/" + escapeForFileName(name_) + "/", "DatabaseLazy (" + name_ + ")", context_) , expiration_time(expiration_time_) { diff --git a/src/Databases/DatabaseLazy.h b/src/Databases/DatabaseLazy.h index 0893b085fae..2d091297c91 100644 --- a/src/Databases/DatabaseLazy.h +++ b/src/Databases/DatabaseLazy.h @@ -18,7 +18,7 @@ class Context; class DatabaseLazy final : public DatabaseOnDisk { public: - DatabaseLazy(const String & name_, const String & metadata_path_, time_t expiration_time_, Context & context_); + DatabaseLazy(const String & name_, const String & metadata_path_, time_t expiration_time_, const Context & context_); String getEngineName() const override { return "Lazy"; } diff --git a/src/Databases/DatabaseOnDisk.cpp b/src/Databases/DatabaseOnDisk.cpp index 8f24f53fc3f..18941ba7c04 100644 --- a/src/Databases/DatabaseOnDisk.cpp +++ b/src/Databases/DatabaseOnDisk.cpp @@ -131,7 +131,7 @@ DatabaseOnDisk::DatabaseOnDisk( const String & metadata_path_, const String & data_path_, const String & logger, - Context & context) + const Context & context) : DatabaseWithOwnTablesBase(name, logger, context) , metadata_path(metadata_path_) , data_path(data_path_) diff --git a/src/Databases/DatabaseOnDisk.h b/src/Databases/DatabaseOnDisk.h index a5510ef4810..f5b9ea0c0d5 100644 --- a/src/Databases/DatabaseOnDisk.h +++ b/src/Databases/DatabaseOnDisk.h @@ -31,7 +31,7 @@ String getObjectDefinitionFromCreateQuery(const ASTPtr & query); class DatabaseOnDisk : public DatabaseWithOwnTablesBase { public: - DatabaseOnDisk(const String & name, const String & metadata_path_, const String & data_path_, const String & logger, Context & context); + DatabaseOnDisk(const String & name, const String & metadata_path_, const String & data_path_, const String & logger, const Context & context); void createTable( const Context & context, diff --git a/src/Databases/DatabaseOrdinary.cpp b/src/Databases/DatabaseOrdinary.cpp index aaceb640213..470c9e7db29 100644 --- a/src/Databases/DatabaseOrdinary.cpp +++ b/src/Databases/DatabaseOrdinary.cpp @@ -99,13 +99,13 @@ namespace } -DatabaseOrdinary::DatabaseOrdinary(const String & name_, const String & metadata_path_, Context & context_) +DatabaseOrdinary::DatabaseOrdinary(const String & name_, const String & metadata_path_, const Context & context_) : DatabaseOrdinary(name_, metadata_path_, "data/" + escapeForFileName(name_) + "/", "DatabaseOrdinary (" + name_ + ")", context_) { } DatabaseOrdinary::DatabaseOrdinary( - const String & name_, const String & metadata_path_, const String & data_path_, const String & logger, Context & context_) + const String & name_, const String & metadata_path_, const String & data_path_, const String & logger, const Context & context_) : DatabaseWithDictionaries(name_, metadata_path_, data_path_, logger, context_) { } diff --git a/src/Databases/DatabaseOrdinary.h b/src/Databases/DatabaseOrdinary.h index 6a21e19d5e2..c1ad32345f6 100644 --- a/src/Databases/DatabaseOrdinary.h +++ b/src/Databases/DatabaseOrdinary.h @@ -14,8 +14,8 @@ namespace DB 
class DatabaseOrdinary : public DatabaseWithDictionaries { public: - DatabaseOrdinary(const String & name_, const String & metadata_path_, Context & context); - DatabaseOrdinary(const String & name_, const String & metadata_path_, const String & data_path_, const String & logger, Context & context_); + DatabaseOrdinary(const String & name_, const String & metadata_path_, const Context & context); + DatabaseOrdinary(const String & name_, const String & metadata_path_, const String & data_path_, const String & logger, const Context & context_); String getEngineName() const override { return "Ordinary"; } diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 25fb95ba0de..eef1b98afe2 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -13,12 +13,16 @@ #include #include #include -#include +#include #include #include #include #include #include +#include +#include +#include +#include namespace DB { @@ -52,7 +56,7 @@ DatabaseReplicated::DatabaseReplicated( const String & zookeeper_path_, const String & shard_name_, const String & replica_name_, - Context & context_) + const Context & context_) : DatabaseAtomic(name_, metadata_path_, uuid, "DatabaseReplicated (" + name_ + ")", context_) , zookeeper_path(zookeeper_path_) , shard_name(shard_name_) @@ -116,8 +120,11 @@ bool DatabaseReplicated::createDatabaseNodesInZooKeeper(const zkutil::ZooKeeperP ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/log", "", zkutil::CreateMode::Persistent)); ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/replicas", "", zkutil::CreateMode::Persistent)); ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/counter", "", zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/counter/cnt-", "", zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_path + "/counter/cnt-", -1)); ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/metadata", "", zkutil::CreateMode::Persistent)); - ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/min_log_ptr", "0", zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/min_log_ptr", "1", zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/max_log_ptr", "1", zkutil::CreateMode::Persistent)); Coordination::Responses responses; auto res = current_zookeeper->tryMulti(ops, responses); @@ -128,6 +135,7 @@ bool DatabaseReplicated::createDatabaseNodesInZooKeeper(const zkutil::ZooKeeperP zkutil::KeeperMultiException::check(res, ops, responses); assert(false); + __builtin_unreachable(); } void DatabaseReplicated::createReplicaNodesInZooKeeper(const zkutil::ZooKeeperPtr & current_zookeeper) @@ -135,7 +143,7 @@ void DatabaseReplicated::createReplicaNodesInZooKeeper(const zkutil::ZooKeeperPt current_zookeeper->createAncestors(replica_path); /// When creating new replica, use latest snapshot version as initial value of log_pointer - log_entry_to_execute = 0; //FIXME + //log_entry_to_execute = 0; //FIXME /// Write host name to replica_path, it will protect from multiple replicas with the same name auto host_id = getHostID(global_context); @@ -153,8 +161,8 @@ void DatabaseReplicated::createReplicaNodesInZooKeeper(const zkutil::ZooKeeperPt Coordination::Requests ops; ops.emplace_back(zkutil::makeCreateRequest(replica_path, host_id, zkutil::CreateMode::Persistent)); - 
ops.emplace_back(zkutil::makeCreateRequest(replica_path + "/log_ptr", toString(log_entry_to_execute), zkutil::CreateMode::Persistent)); - ops.emplace_back(zkutil::makeCreateRequest(query_path, entry.toString(), zkutil::CreateMode::PersistentSequential)); + ops.emplace_back(zkutil::makeCreateRequest(replica_path + "/log_ptr", "0", zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(query_path, entry.toString(), zkutil::CreateMode::Persistent)); ops.emplace_back(zkutil::makeRemoveRequest(counter_path, -1)); current_zookeeper->multi(ops); } @@ -163,22 +171,9 @@ void DatabaseReplicated::loadStoredObjects(Context & context, bool has_force_res { DatabaseAtomic::loadStoredObjects(context, has_force_restore_data_flag, force_attach); - recoverLostReplica(global_context.getZooKeeper(), 0, true); //FIXME + //recoverLostReplica(global_context.getZooKeeper(), 0, true); //FIXME - DatabaseReplicatedExtensions ext; - ext.database_uuid = getUUID(); - ext.zookeeper_path = zookeeper_path; - ext.database_name = getDatabaseName(); - ext.shard_name = shard_name; - ext.replica_name = replica_name; - ext.first_not_executed = log_entry_to_execute; - ext.lost_callback = [this] (const String & entry_name, const ZooKeeperPtr & zookeeper) { onUnexpectedLogEntry(entry_name, zookeeper); }; - ext.executed_callback = [this] (const String & entry_name, const ZooKeeperPtr & zookeeper) { onExecutedLogEntry(entry_name, zookeeper); }; - - /// Pool size must be 1 (to avoid reordering of log entries) - constexpr size_t pool_size = 1; - ddl_worker = std::make_unique(pool_size, zookeeper_path + "/log", global_context, nullptr, "", - std::make_optional(std::move(ext))); + ddl_worker = std::make_unique(this, global_context); } void DatabaseReplicated::onUnexpectedLogEntry(const String & entry_name, const ZooKeeperPtr & zookeeper) @@ -314,48 +309,68 @@ BlockIO DatabaseReplicated::propose(const ASTPtr & query) } -void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeeper, UInt32 from_snapshot, bool create) +void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeeper, UInt32 from_snapshot, bool /*create*/) { - LOG_WARNING(log, "Will recover replica from snapshot", from_snapshot); + LOG_WARNING(log, "Will recover replica"); //FIXME drop old tables String snapshot_metadata_path = zookeeper_path + "/metadata"; Strings tables_in_snapshot = current_zookeeper->getChildren(snapshot_metadata_path); snapshot_metadata_path += '/'; + from_snapshot = parse(current_zookeeper->get(zookeeper_path + "/max_log_ptr")); for (const auto & table_name : tables_in_snapshot) { //FIXME It's not atomic. We need multiget here (available since ZooKeeper 3.6.0). 
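        // To make that race concrete: the table list comes from a single getChildren() call, but each
        // CREATE statement below is fetched with its own get(), so a CREATE or DROP that lands in between
        // can leave the rebuilt replica with a mix of two database states. The multiget mentioned above
        // (a ZooKeeper 3.6.0+ multi-read) would take all of these reads from one consistent snapshot.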
- String query_to_execute = current_zookeeper->get(snapshot_metadata_path + table_name); + String query_text = current_zookeeper->get(snapshot_metadata_path + table_name); + auto query_ast = parseQueryFromMetadataInZooKeeper(table_name, query_text); + Context query_context = global_context; + query_context.makeQueryContext(); + query_context.getClientInfo().query_kind = ClientInfo::QueryKind::REPLICATED_LOG_QUERY; + query_context.setCurrentDatabase(database_name); + query_context.setCurrentQueryId(""); // generate random query_id - if (!startsWith(query_to_execute, "ATTACH ")) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected query: {}", query_to_execute); - query_to_execute = "CREATE " + query_to_execute.substr(strlen("ATTACH ")); + //FIXME + DatabaseCatalog::instance().waitTableFinallyDropped(query_ast->as()->uuid); - Context current_context = global_context; - current_context.getClientInfo().query_kind = ClientInfo::QueryKind::REPLICATED_LOG_QUERY; - current_context.setCurrentDatabase(database_name); - current_context.setCurrentQueryId(""); // generate random query_id - - executeQuery(query_to_execute, current_context); + LOG_INFO(log, "Executing {}", serializeAST(*query_ast)); + InterpreterCreateQuery(query_ast, query_context).execute(); } - if (create) - return; + //if (create) + // return; - current_zookeeper->set(replica_path + "/log-ptr", toString(from_snapshot)); + current_zookeeper->set(replica_path + "/log_ptr", toString(from_snapshot)); last_executed_log_entry = from_snapshot; - ddl_worker->setLogPointer(from_snapshot); //FIXME + //ddl_worker->setLogPointer(from_snapshot); //FIXME //writeLastExecutedToDiskAndZK(); } +ASTPtr DatabaseReplicated::parseQueryFromMetadataInZooKeeper(const String & node_name, const String & query) +{ + ParserCreateQuery parser; + String description = "in ZooKeeper " + zookeeper_path + "/metadata/" + node_name; + auto ast = parseQuery(parser, query, description, 0, global_context.getSettingsRef().max_parser_depth); + + auto & create = ast->as(); + if (create.uuid == UUIDHelpers::Nil || create.table != TABLE_WITH_UUID_NAME_PLACEHOLDER || ! 
create.database.empty()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Got unexpected query from {}: {}", node_name, query); + + create.database = getDatabaseName(); + create.table = unescapeForFileName(node_name); + create.attach = false; + + return ast; +} + void DatabaseReplicated::drop(const Context & context_) { auto current_zookeeper = getZooKeeper(); - current_zookeeper->tryRemove(zookeeper_path + "/replicas/" + replica_name); + current_zookeeper->set(replica_path, "DROPPED"); + current_zookeeper->tryRemoveRecursive(replica_path); DatabaseAtomic::drop(context_); } diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index 663df59ac63..d6cd93773cf 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -12,7 +12,7 @@ namespace DB { -class DDLWorker; +class DatabaseReplicatedDDLWorker; using ZooKeeperPtr = std::shared_ptr; /** DatabaseReplicated engine @@ -42,7 +42,7 @@ class DatabaseReplicated : public DatabaseAtomic public: DatabaseReplicated(const String & name_, const String & metadata_path_, UUID uuid, const String & zookeeper_path_, const String & shard_name_, const String & replica_name_, - Context & context); + const Context & context); ~DatabaseReplicated() override; @@ -56,6 +56,11 @@ public: void loadStoredObjects(Context & context, bool has_force_restore_data_flag, bool force_attach = false) override; + String getFullReplicaName() const { return shard_name + '|' + replica_name; } + + //FIXME + friend struct DatabaseReplicatedTask; + friend class DatabaseReplicatedDDLWorker; private: bool createDatabaseNodesInZooKeeper(const ZooKeeperPtr & current_zookeeper); void createReplicaNodesInZooKeeper(const ZooKeeperPtr & current_zookeeper); @@ -72,6 +77,8 @@ private: void onExecutedLogEntry(const String & entry_name, const ZooKeeperPtr & zookeeper); + ASTPtr parseQueryFromMetadataInZooKeeper(const String & node_name, const String & query); + String zookeeper_path; String shard_name; String replica_name; @@ -88,7 +95,7 @@ private: zkutil::ZooKeeperPtr getZooKeeper() const; - std::unique_ptr ddl_worker; + std::unique_ptr ddl_worker; diff --git a/src/Databases/DatabaseReplicatedWorker.cpp b/src/Databases/DatabaseReplicatedWorker.cpp new file mode 100644 index 00000000000..869b888d3ad --- /dev/null +++ b/src/Databases/DatabaseReplicatedWorker.cpp @@ -0,0 +1,114 @@ +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + +DatabaseReplicatedDDLWorker::DatabaseReplicatedDDLWorker(DatabaseReplicated * db, const Context & context_) + : DDLWorker(/* pool_size */ 1, db->zookeeper_path + "/log", context_, nullptr, {}, fmt::format("DDLWorker({})", db->getDatabaseName())) + , database(db) +{ + /// Pool size must be 1 (to avoid reordering of log entries) +} + +void DatabaseReplicatedDDLWorker::initialize() +{ + /// Check if we need to recover replica. + /// Invariant: replica is lost if it's log_ptr value is less then min_log_ptr value. 
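    /// In other words: log_ptr is the number of the last log entry this replica has applied, and
    /// min_log_ptr is the smallest entry number a replica may still replay from (older entries may
    /// already be gone from the log). For example, a replica stuck at log_ptr = 5 cannot catch up
    /// from a log whose min_log_ptr is 8, so it has to be rebuilt from the metadata snapshot;
    /// recoverLostReplica() then sets its log_ptr to the current max_log_ptr.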
+ + UInt32 our_log_ptr = parse(current_zookeeper->get(database->replica_path + "/log_ptr")); + UInt32 min_log_ptr = parse(current_zookeeper->get(database->zookeeper_path + "/min_log_ptr")); + if (our_log_ptr < min_log_ptr) + database->recoverLostReplica(current_zookeeper, 0); +} + +String DatabaseReplicatedDDLWorker::enqueueQuery(DDLLogEntry & entry) +{ + auto zookeeper = getAndSetZooKeeper(); + const String query_path_prefix = queue_dir + "/query-"; + + /// We cannot create sequential node and it's ephemeral child in a single transaction, so allocate sequential number another way + String counter_prefix = database->zookeeper_path + "/counter/cnt-"; + String counter_path = zookeeper->create(counter_prefix, "", zkutil::CreateMode::EphemeralSequential); + String node_path = query_path_prefix + counter_path.substr(counter_prefix.size()); + + Coordination::Requests ops; + /// Query is not committed yet, but we have to write it into log to avoid reordering + ops.emplace_back(zkutil::makeCreateRequest(node_path, entry.toString(), zkutil::CreateMode::Persistent)); + /// '/try' will be replaced with '/committed' or will be removed due to expired session or other error + ops.emplace_back(zkutil::makeCreateRequest(node_path + "/try", database->getFullReplicaName(), zkutil::CreateMode::Ephemeral)); + /// We don't need it anymore + ops.emplace_back(zkutil::makeRemoveRequest(counter_path, -1)); + /// Create status dirs + ops.emplace_back(zkutil::makeCreateRequest(node_path + "/active", "", zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(node_path + "/finished", "", zkutil::CreateMode::Persistent)); + zookeeper->multi(ops); + + return node_path; +} + +DDLTaskPtr DatabaseReplicatedDDLWorker::initAndCheckTask(const String & entry_name, String & out_reason, const ZooKeeperPtr & zookeeper) +{ + UInt32 our_log_ptr = parse(current_zookeeper->get(database->replica_path + "/log_ptr")); + UInt32 entry_num = DatabaseReplicatedTask::getLogEntryNumber(entry_name); + + if (entry_num <= our_log_ptr) + { + out_reason = fmt::format("Task {} already executed according to log pointer {}", entry_name, our_log_ptr); + return {}; + } + + String entry_path = queue_dir + "/" + entry_name; + auto task = std::make_unique(entry_name, entry_path, database); + + String initiator_name; + zkutil::EventPtr wait_committed_or_failed = std::make_shared(); + + if (zookeeper->tryGet(entry_path + "/try", initiator_name, nullptr, wait_committed_or_failed)) + { + task->we_are_initiator = initiator_name == task->host_id_str; + /// Query is not committed yet. We cannot just skip it and execute next one, because reordering may break replication. 
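        /// How the handshake works, as wired up in enqueueQuery() above and DatabaseReplicatedTask::makeQueryContext():
        /// the initiator creates <entry>/try (an ephemeral node holding its replica name) when it enqueues
        /// the entry, and on success removes /try and creates <entry>/committed in the same multi-request
        /// that commits the metadata changes. So we wait here for /try to disappear and then re-check
        /// /committed below; if the initiator's session expired instead, /committed never appears and
        /// the entry is skipped rather than executed out of order.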
+ //FIXME add some timeouts + if (!task->we_are_initiator) + { + LOG_TRACE(log, "Waiting for initiator {} to commit or rollback entry {}", initiator_name, entry_path); + wait_committed_or_failed->wait(); + } + } + + if (!task->we_are_initiator && !zookeeper->exists(entry_path + "/committed")) + { + out_reason = "Entry " + entry_name + " hasn't been committed"; + return {}; + } + + String node_data; + if (!zookeeper->tryGet(entry_path, node_data)) + { + LOG_ERROR(log, "Cannot get log entry {}", entry_path); + database->onUnexpectedLogEntry(entry_name, zookeeper); + throw Exception(ErrorCodes::LOGICAL_ERROR, "should be unreachable"); + } + + auto error = task->tryParseEntry(node_data); + if (error) + { + LOG_ERROR(log, "Cannot parse query from '{}': {}", node_data, *error); + database->onUnexpectedLogEntry(entry_name, zookeeper); + throw Exception(ErrorCodes::LOGICAL_ERROR, "should be unreachable"); + } + + task->parseQueryFromEntry(context); + + return task; +} + + + +} diff --git a/src/Databases/DatabaseReplicatedWorker.h b/src/Databases/DatabaseReplicatedWorker.h new file mode 100644 index 00000000000..d190bd1795d --- /dev/null +++ b/src/Databases/DatabaseReplicatedWorker.h @@ -0,0 +1,26 @@ +#pragma once +#include + + +namespace DB +{ + +class DatabaseReplicated; + +class DatabaseReplicatedDDLWorker : public DDLWorker +{ +public: + DatabaseReplicatedDDLWorker(DatabaseReplicated * db, const Context & context_); + + String enqueueQuery(DDLLogEntry & entry) override; + +private: + void initialize() override; + + DDLTaskPtr initAndCheckTask(const String & entry_name, String & out_reason, const ZooKeeperPtr & zookeeper) override; + + DatabaseReplicated * database; + +}; + +} diff --git a/src/Databases/DatabaseWithDictionaries.cpp b/src/Databases/DatabaseWithDictionaries.cpp index da7f7f9b83e..ee16f4ae15e 100644 --- a/src/Databases/DatabaseWithDictionaries.cpp +++ b/src/Databases/DatabaseWithDictionaries.cpp @@ -349,7 +349,7 @@ void DatabaseWithDictionaries::shutdown() DatabaseWithDictionaries::DatabaseWithDictionaries( - const String & name, const String & metadata_path_, const String & data_path_, const String & logger, Context & context) + const String & name, const String & metadata_path_, const String & data_path_, const String & logger, const Context & context) : DatabaseOnDisk(name, metadata_path_, data_path_, logger, context) , external_loader(context.getExternalDictionariesLoader()) { diff --git a/src/Databases/DatabaseWithDictionaries.h b/src/Databases/DatabaseWithDictionaries.h index 36cee18e4db..d69289d7456 100644 --- a/src/Databases/DatabaseWithDictionaries.h +++ b/src/Databases/DatabaseWithDictionaries.h @@ -38,7 +38,7 @@ public: ~DatabaseWithDictionaries() override; protected: - DatabaseWithDictionaries(const String & name, const String & metadata_path_, const String & data_path_, const String & logger, Context & context); + DatabaseWithDictionaries(const String & name, const String & metadata_path_, const String & data_path_, const String & logger, const Context & context); ASTPtr getCreateDictionaryQueryImpl(const String & dictionary_name, bool throw_on_error) const override; diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index 04bd6b37280..b9283935ec9 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -2437,7 +2437,8 @@ void Context::initMetadataTransaction(MetadataTransactionPtr txn) MetadataTransactionPtr Context::getMetadataTransaction() const { - assert(query_context == this); + //FIXME + //assert(query_context == 
this); return metadata_transaction; } diff --git a/src/Interpreters/DDLTask.cpp b/src/Interpreters/DDLTask.cpp index dfb8f5ff746..0bc98dfd0dd 100644 --- a/src/Interpreters/DDLTask.cpp +++ b/src/Interpreters/DDLTask.cpp @@ -6,6 +6,12 @@ #include #include #include +#include +#include +#include +#include +#include +#include namespace DB { @@ -13,6 +19,8 @@ namespace DB namespace ErrorCodes { extern const int UNKNOWN_FORMAT_VERSION; + extern const int UNKNOWN_TYPE_OF_QUERY; + extern const int INCONSISTENT_CLUSTER_DEFINITION; } HostID HostID::fromString(const String & host_port_str) @@ -78,4 +86,276 @@ void DDLLogEntry::parse(const String & data) } +std::optional DDLTaskBase::tryParseEntry(const String & data) +{ + std::optional error; + try + { + entry.parse(data); + } + catch (...) + { + error = ExecutionStatus::fromCurrentException().serializeText(); + } + return error; +} + +void DDLTaskBase::parseQueryFromEntry(const Context & context) +{ + const char * begin = entry.query.data(); + const char * end = begin + entry.query.size(); + + ParserQuery parser_query(end); + String description; + query = parseQuery(parser_query, begin, end, description, 0, context.getSettingsRef().max_parser_depth); +} + +std::unique_ptr DDLTaskBase::makeQueryContext(Context & from_context) const +{ + auto query_context = std::make_unique(from_context); + query_context->makeQueryContext(); + query_context->setCurrentQueryId(""); // generate random query_id + query_context->getClientInfo().query_kind = ClientInfo::QueryKind::SECONDARY_QUERY; + return query_context; +} + + +bool DDLTask::findCurrentHostID(const Context & global_context, Poco::Logger * log) +{ + bool host_in_hostlist = false; + + for (const HostID & host : entry.hosts) + { + auto maybe_secure_port = global_context.getTCPPortSecure(); + + /// The port is considered local if it matches TCP or TCP secure port that the server is listening. + bool is_local_port = (maybe_secure_port && host.isLocalAddress(*maybe_secure_port)) + || host.isLocalAddress(global_context.getTCPPort()); + + if (!is_local_port) + continue; + + if (host_in_hostlist) + { + /// This check could be slow a little bit + LOG_WARNING(log, "There are two the same ClickHouse instances in task {}: {} and {}. Will use the first one only.", + entry_name, host_id.readableString(), host.readableString()); + } + else + { + host_in_hostlist = true; + host_id = host; + host_id_str = host.toString(); + } + } + + return host_in_hostlist; +} + +void DDLTask::setClusterInfo(const Context & context, Poco::Logger * log) +{ + auto query_on_cluster = dynamic_cast(query.get()); + if (!query_on_cluster) + throw Exception("Received unknown DDL query", ErrorCodes::UNKNOWN_TYPE_OF_QUERY); + + cluster_name = query_on_cluster->cluster; + cluster = context.tryGetCluster(cluster_name); + + if (!cluster) + throw Exception(ErrorCodes::INCONSISTENT_CLUSTER_DEFINITION, + "DDL task {} contains current host {} in cluster {}, but there are no such cluster here.", + entry_name, host_id.readableString(), cluster_name); + + /// Try to find host from task host list in cluster + /// At the first, try find exact match (host name and ports should be literally equal) + /// If the attempt fails, try find it resolving host name of each instance + + if (!tryFindHostInCluster()) + { + LOG_WARNING(log, "Not found the exact match of host {} from task {} in cluster {} definition. 
Will try to find it using host name resolving.", + host_id.readableString(), entry_name, cluster_name); + + if (!tryFindHostInClusterViaResolving(context)) + throw Exception(ErrorCodes::INCONSISTENT_CLUSTER_DEFINITION, "Not found host {} in definition of cluster {}", + host_id.readableString(), cluster_name); + + LOG_INFO(log, "Resolved host {} from task {} as host {} in definition of cluster {}", + host_id.readableString(), entry_name, address_in_cluster.readableString(), cluster_name); + } + + query = query_on_cluster->getRewrittenASTWithoutOnCluster(address_in_cluster.default_database); + query_on_cluster = nullptr; +} + +bool DDLTask::tryFindHostInCluster() +{ + const auto & shards = cluster->getShardsAddresses(); + bool found_exact_match = false; + String default_database; + + for (size_t shard_num = 0; shard_num < shards.size(); ++shard_num) + { + for (size_t replica_num = 0; replica_num < shards[shard_num].size(); ++replica_num) + { + const Cluster::Address & address = shards[shard_num][replica_num]; + + if (address.host_name == host_id.host_name && address.port == host_id.port) + { + if (found_exact_match) + { + if (default_database == address.default_database) + { + throw Exception(ErrorCodes::INCONSISTENT_CLUSTER_DEFINITION, + "There are two exactly the same ClickHouse instances {} in cluster {}", + address.readableString(), cluster_name); + } + else + { + /* Circular replication is used. + * It is when every physical node contains + * replicas of different shards of the same table. + * To distinguish one replica from another on the same node, + * every shard is placed into separate database. + * */ + is_circular_replicated = true; + auto * query_with_table = dynamic_cast(query.get()); + if (!query_with_table || query_with_table->database.empty()) + { + throw Exception(ErrorCodes::INCONSISTENT_CLUSTER_DEFINITION, + "For a distributed DDL on circular replicated cluster its table name must be qualified by database name."); + } + if (default_database == query_with_table->database) + return true; + } + } + found_exact_match = true; + host_shard_num = shard_num; + host_replica_num = replica_num; + address_in_cluster = address; + default_database = address.default_database; + } + } + } + + return found_exact_match; +} + +bool DDLTask::tryFindHostInClusterViaResolving(const Context & context) +{ + const auto & shards = cluster->getShardsAddresses(); + bool found_via_resolving = false; + + for (size_t shard_num = 0; shard_num < shards.size(); ++shard_num) + { + for (size_t replica_num = 0; replica_num < shards[shard_num].size(); ++replica_num) + { + const Cluster::Address & address = shards[shard_num][replica_num]; + + if (auto resolved = address.getResolvedAddress(); + resolved && (isLocalAddress(*resolved, context.getTCPPort()) + || (context.getTCPPortSecure() && isLocalAddress(*resolved, *context.getTCPPortSecure())))) + { + if (found_via_resolving) + { + throw Exception(ErrorCodes::INCONSISTENT_CLUSTER_DEFINITION, + "There are two the same ClickHouse instances in cluster {} : {} and {}", + cluster_name, address_in_cluster.readableString(), address.readableString()); + } + else + { + found_via_resolving = true; + host_shard_num = shard_num; + host_replica_num = replica_num; + address_in_cluster = address; + } + } + } + } + + return found_via_resolving; +} + +String DDLTask::getShardID() const +{ + /// Generate unique name for shard node, it will be used to execute the query by only single host + /// Shard node name has format 'replica_name1,replica_name2,...,replica_nameN' + /// Where 
replica_name is 'replica_config_host_name:replica_port' + + auto shard_addresses = cluster->getShardsAddresses().at(host_shard_num); + + Strings replica_names; + for (const Cluster::Address & address : shard_addresses) + replica_names.emplace_back(address.readableString()); + std::sort(replica_names.begin(), replica_names.end()); + + String res; + for (auto it = replica_names.begin(); it != replica_names.end(); ++it) + res += *it + (std::next(it) != replica_names.end() ? "," : ""); + + return res; +} + +DatabaseReplicatedTask::DatabaseReplicatedTask(const String & name, const String & path, DatabaseReplicated * database_) + : DDLTaskBase(name, path) + , database(database_) +{ + host_id_str = database->getFullReplicaName(); +} + +String DatabaseReplicatedTask::getShardID() const +{ + return database->shard_name; +} + +std::unique_ptr DatabaseReplicatedTask::makeQueryContext(Context & from_context) const +{ + auto query_context = DDLTaskBase::makeQueryContext(from_context); + query_context->getClientInfo().query_kind = ClientInfo::QueryKind::REPLICATED_LOG_QUERY; //FIXME why do we need separate query kind? + query_context->setCurrentDatabase(database->getDatabaseName()); + + if (we_are_initiator) + { + auto txn = std::make_shared(); + query_context->initMetadataTransaction(txn); + txn->current_zookeeper = from_context.getZooKeeper(); + txn->zookeeper_path = database->zookeeper_path; + txn->ops.emplace_back(zkutil::makeRemoveRequest(entry_path + "/try", -1)); + txn->ops.emplace_back(zkutil::makeCreateRequest(entry_path + "/committed", host_id_str, zkutil::CreateMode::Persistent)); + txn->ops.emplace_back(zkutil::makeRemoveRequest(getActiveNodePath(), -1)); + if (execute_on_leader) + txn->ops.emplace_back(zkutil::makeCreateRequest(getShardNodePath() + "/executed", host_id_str, zkutil::CreateMode::Persistent)); + txn->ops.emplace_back(zkutil::makeCreateRequest(getFinishedNodePath(), execution_status.serializeText(), zkutil::CreateMode::Persistent)); + txn->ops.emplace_back(zkutil::makeSetRequest(database->replica_path + "/log_ptr", toString(getLogEntryNumber(entry_name)), -1)); + txn->ops.emplace_back(zkutil::makeSetRequest(database->zookeeper_path + "/max_log_ptr", toString(getLogEntryNumber(entry_name)), -1)); + } + + return query_context; +} + +String DatabaseReplicatedTask::getLogEntryName(UInt32 log_entry_number) +{ + constexpr size_t seq_node_digits = 10; + String number = toString(log_entry_number); + String name = "query-" + String(seq_node_digits - number.size(), '0') + number; + return name; +} + +UInt32 DatabaseReplicatedTask::getLogEntryNumber(const String & log_entry_name) +{ + constexpr const char * name = "query-"; + assert(startsWith(log_entry_name, name)); + return parse(log_entry_name.substr(strlen(name))); +} + +void DatabaseReplicatedTask::parseQueryFromEntry(const Context & context) +{ + if (entry.query.empty()) + { + was_executed = true; + return; + } + + DDLTaskBase::parseQueryFromEntry(context); +} + } diff --git a/src/Interpreters/DDLTask.h b/src/Interpreters/DDLTask.h index ba58fe3f42e..19d92a1bc78 100644 --- a/src/Interpreters/DDLTask.h +++ b/src/Interpreters/DDLTask.h @@ -3,12 +3,17 @@ #include #include +namespace Poco +{ +class Logger; +} namespace DB { class ASTQueryWithOnCluster; using ZooKeeperPtr = std::shared_ptr; +class DatabaseReplicated; struct HostID { @@ -54,42 +59,88 @@ struct DDLLogEntry void parse(const String & data); }; +struct DDLTaskBase +{ + const String entry_name; + const String entry_path; -struct DDLTask + DDLTaskBase(const String & name, const 
String & path) : entry_name(name), entry_path(path) {} + virtual ~DDLTaskBase() = default; + + std::optional tryParseEntry(const String & data); + virtual void parseQueryFromEntry(const Context & context); + + DDLLogEntry entry; + + String host_id_str; + ASTPtr query; + + bool is_circular_replicated = false; + bool execute_on_leader = false; + + ExecutionStatus execution_status; + bool was_executed = false; + + virtual String getShardID() const = 0; + + virtual std::unique_ptr makeQueryContext(Context & from_context) const; + + inline String getActiveNodePath() const { return entry_path + "/active/" + host_id_str; } + inline String getFinishedNodePath() const { return entry_path + "/finished/" + host_id_str; } + inline String getShardNodePath() const { return entry_path + "/shards/" + getShardID(); } + +}; + +struct DDLTask : public DDLTaskBase { /// Stages of task lifetime correspond ordering of these data fields: - /// Stage 1: parse entry - String entry_name; - String entry_path; - DDLLogEntry entry; + DDLTask(const String & name, const String & path) : DDLTaskBase(name, path) {} + + bool findCurrentHostID(const Context & global_context, Poco::Logger * log); + + void setClusterInfo(const Context & context, Poco::Logger * log); - bool we_are_initiator = false; /// Stage 2: resolve host_id and check that - HostID host_id; - String host_id_str; + /// Stage 3.1: parse query - ASTPtr query; - ASTQueryWithOnCluster * query_on_cluster = nullptr; /// Stage 3.2: check cluster and find the host in cluster + + /// Stage 3.3: execute query + + /// Stage 4: commit results to ZooKeeper + + String getShardID() const override; + +private: + bool tryFindHostInCluster(); + bool tryFindHostInClusterViaResolving(const Context & context); + + HostID host_id; String cluster_name; ClusterPtr cluster; Cluster::Address address_in_cluster; size_t host_shard_num; size_t host_replica_num; +}; - /// Stage 3.3: execute query - ExecutionStatus execution_status; - bool was_executed = false; +struct DatabaseReplicatedTask : public DDLTaskBase +{ + DatabaseReplicatedTask(const String & name, const String & path, DatabaseReplicated * database_); - /// Stage 4: commit results to ZooKeeper + void parseQueryFromEntry(const Context & context) override; - String active_path; - String finished_path; - String shard_path; + String getShardID() const override; + std::unique_ptr makeQueryContext(Context & from_context) const override; + + static String getLogEntryName(UInt32 log_entry_number); + static UInt32 getLogEntryNumber(const String & log_entry_name); + + DatabaseReplicated * database; + bool we_are_initiator = false; }; diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index fc9039be576..0399687a4d8 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -142,33 +142,13 @@ std::unique_ptr createSimpleZooKeeperLock( } -String DatabaseReplicatedExtensions::getLogEntryName(UInt32 log_entry_number) -{ - constexpr size_t seq_node_digits = 10; - String number = toString(log_entry_number); - String name = "query-" + String(seq_node_digits - number.size(), '0') + number; - return name; -} - -UInt32 DatabaseReplicatedExtensions::getLogEntryNumber(const String & log_entry_name) -{ - constexpr const char * name = "query-"; - assert(startsWith(log_entry_name, name)); - return parse(log_entry_name.substr(strlen(name))); -} - - DDLWorker::DDLWorker(int pool_size_, const std::string & zk_root_dir, const Context & context_, const Poco::Util::AbstractConfiguration * config, const 
String & prefix, - std::optional database_replicated_ext_) + const String & logger_name) : context(context_) - , log(&Poco::Logger::get(database_replicated_ext_ ? fmt::format("DDLWorker ({})", database_replicated_ext_->database_name) : "DDLWorker")) - , database_replicated_ext(std::move(database_replicated_ext_)) - , pool_size(pool_size_) + , log(&Poco::Logger::get(logger_name)) + , pool_size(pool_size_) //FIXME make it optional , worker_pool(pool_size_) { - assert(!database_replicated_ext || pool_size == 1); - last_tasks.reserve(pool_size); - queue_dir = zk_root_dir; if (queue_dir.back() == '/') queue_dir.resize(queue_dir.size() - 1); @@ -252,60 +232,26 @@ DDLTaskPtr DDLWorker::initAndCheckTask(const String & entry_name, String & out_r String node_data; String entry_path = queue_dir + "/" + entry_name; - auto task = std::make_unique(); - task->entry_name = entry_name; - task->entry_path = entry_path; - - if (database_replicated_ext) - { - String initiator_name; - zkutil::EventPtr wait_committed_or_failed = std::make_shared(); - - if (zookeeper->tryGet(entry_path + "/try", initiator_name, nullptr, wait_committed_or_failed)) - { - task->we_are_initiator = initiator_name == database_replicated_ext->getFullReplicaName(); - /// Query is not committed yet. We cannot just skip it and execute next one, because reordering may break replication. - //FIXME add some timeouts - if (!task->we_are_initiator) - { - LOG_TRACE(log, "Waiting for initiator {} to commit or rollback entry {}", initiator_name, entry_path); - wait_committed_or_failed->wait(); - } - } - - if (!task->we_are_initiator && !zookeeper->exists(entry_path + "/committed")) - { - out_reason = "Entry " + entry_name + " hasn't been committed"; - return {}; - } - } + auto task = std::make_unique(entry_name, entry_path); if (!zookeeper->tryGet(entry_path, node_data)) { - if (database_replicated_ext) - database_replicated_ext->lost_callback(entry_name, zookeeper); /// It is Ok that node could be deleted just now. It means that there are no current host in node's host list. out_reason = "The task was deleted"; return {}; } - try - { - task->entry.parse(node_data); - } - catch (...) + auto error = task->tryParseEntry(node_data); + if (error) { /// What should we do if we even cannot parse host name and therefore cannot properly submit execution status? /// We can try to create fail node using FQDN if it equal to host name in cluster config attempt will be successful. /// Otherwise, that node will be ignored by DDLQueryStatusInputStream. - - tryLogCurrentException(log, "Cannot parse DDL task " + entry_name + ", will try to send error status"); - - String status = ExecutionStatus::fromCurrentException().serializeText(); + LOG_ERROR(log, "Cannot parse DDL task {}, will try to send error status: {}", entry_name, *error); try { createStatusDirs(entry_path, zookeeper); - zookeeper->tryCreate(entry_path + "/finished/" + host_fqdn_id, status, zkutil::CreateMode::Persistent); + zookeeper->tryCreate(entry_path + "/finished/" + host_fqdn_id, *error, zkutil::CreateMode::Persistent); } catch (...) 
{ @@ -316,45 +262,15 @@ DDLTaskPtr DDLWorker::initAndCheckTask(const String & entry_name, String & out_r return {}; } - if (database_replicated_ext) - { - task->host_id.host_name = host_fqdn; - task->host_id.port = context.getTCPPort(); - task->host_id_str = database_replicated_ext->shard_name + '|' + database_replicated_ext->replica_name; - return task; - } - - bool host_in_hostlist = false; - for (const HostID & host : task->entry.hosts) - { - auto maybe_secure_port = context.getTCPPortSecure(); - - /// The port is considered local if it matches TCP or TCP secure port that the server is listening. - bool is_local_port = (maybe_secure_port && host.isLocalAddress(*maybe_secure_port)) - || host.isLocalAddress(context.getTCPPort()); - - if (!is_local_port) - continue; - - if (host_in_hostlist) - { - /// This check could be slow a little bit - LOG_WARNING(log, "There are two the same ClickHouse instances in task {}: {} and {}. Will use the first one only.", entry_name, task->host_id.readableString(), host.readableString()); - } - else - { - host_in_hostlist = true; - task->host_id = host; - task->host_id_str = host.toString(); - } - } - - if (!host_in_hostlist) + if (!task->findCurrentHostID(context, log)) { out_reason = "There is no a local address in host list"; return {}; } + task->parseQueryFromEntry(context); + task->setClusterInfo(context, log); + return task; } @@ -378,11 +294,11 @@ void DDLWorker::scheduleTasks() return; } - bool server_startup = last_tasks.empty(); + bool server_startup = !last_entry_name.has_value(); auto begin_node = server_startup ? queue_nodes.begin() - : std::upper_bound(queue_nodes.begin(), queue_nodes.end(), last_tasks.back()); + : std::upper_bound(queue_nodes.begin(), queue_nodes.end(), *last_entry_name); for (auto it = begin_node; it != queue_nodes.end() && !stop_flag; ++it) { @@ -394,7 +310,7 @@ void DDLWorker::scheduleTasks() if (!task) { LOG_DEBUG(log, "Will not execute task {}: {}", entry_name, reason); - saveTask(entry_name); + last_entry_name = entry_name; continue; } @@ -408,7 +324,7 @@ void DDLWorker::scheduleTasks() if (!already_processed) { - if (database_replicated_ext) + if (pool_size == 1) { enqueueTask(DDLTaskPtr(task.release())); } @@ -425,143 +341,18 @@ void DDLWorker::scheduleTasks() LOG_DEBUG(log, "Task {} ({}) has been already processed", entry_name, task->entry.query); } - saveTask(entry_name); + last_entry_name = entry_name; } } -void DDLWorker::saveTask(const String & entry_name) -{ - if (last_tasks.size() == pool_size) - { - last_tasks.erase(last_tasks.begin()); - } - last_tasks.emplace_back(entry_name); -} - /// Parses query and resolves cluster and host in cluster -void DDLWorker::parseQueryAndResolveHost(DDLTask & task) +void DDLWorker::parseQueryAndResolveHost(DDLTaskBase & /*task*/) { - { - const char * begin = task.entry.query.data(); - const char * end = begin + task.entry.query.size(); - ParserQuery parser_query(end); - String description; - task.query = parseQuery(parser_query, begin, end, description, 0, context.getSettingsRef().max_parser_depth); - } - - // XXX: serious design flaw since `ASTQueryWithOnCluster` is not inherited from `IAST`! 
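(Aside, not part of the patch: the removed DDLWorker::parseQueryAndResolveHost() block and its replacement DDLTaskBase::parseQueryFromEntry() follow the same parse-then-downcast pattern. A minimal sketch of that pattern, reusing only names that appear in this diff; `entry` and `context` are assumed to be in scope as in the surrounding code:)

    // Sketch only; mirrors the removed code, not a new implementation.
    const char * begin = entry.query.data();
    const char * end = begin + entry.query.size();
    ParserQuery parser_query(end);
    String description;
    ASTPtr query = parseQuery(parser_query, begin, end, description, 0, context.getSettingsRef().max_parser_depth);
    /// ASTQueryWithOnCluster is not part of the IAST hierarchy, so a dynamic_cast
    /// cross-cast is used to detect ON CLUSTER queries.
    auto * query_on_cluster = dynamic_cast<ASTQueryWithOnCluster *>(query.get());
    if (!query || !query_on_cluster)
        throw Exception("Received unknown DDL query", ErrorCodes::UNKNOWN_TYPE_OF_QUERY);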
- if (!task.query || !(task.query_on_cluster = dynamic_cast(task.query.get()))) - throw Exception("Received unknown DDL query", ErrorCodes::UNKNOWN_TYPE_OF_QUERY); - - if (database_replicated_ext) - return; - - task.cluster_name = task.query_on_cluster->cluster; - task.cluster = context.tryGetCluster(task.cluster_name); - if (!task.cluster) - throw Exception(ErrorCodes::INCONSISTENT_CLUSTER_DEFINITION, - "DDL task {} contains current host {} in cluster {}, but there are no such cluster here.", - task.entry_name, task.host_id.readableString(), task.cluster_name); - - /// Try to find host from task host list in cluster - /// At the first, try find exact match (host name and ports should be literally equal) - /// If the attempt fails, try find it resolving host name of each instance - const auto & shards = task.cluster->getShardsAddresses(); - - bool found_exact_match = false; - String default_database; - for (size_t shard_num = 0; shard_num < shards.size(); ++shard_num) - { - for (size_t replica_num = 0; replica_num < shards[shard_num].size(); ++replica_num) - { - const Cluster::Address & address = shards[shard_num][replica_num]; - - if (address.host_name == task.host_id.host_name && address.port == task.host_id.port) - { - if (found_exact_match) - { - if (default_database == address.default_database) - { - throw Exception(ErrorCodes::INCONSISTENT_CLUSTER_DEFINITION, - "There are two exactly the same ClickHouse instances {} in cluster {}", - address.readableString(), task.cluster_name); - } - else - { - /* Circular replication is used. - * It is when every physical node contains - * replicas of different shards of the same table. - * To distinguish one replica from another on the same node, - * every shard is placed into separate database. - * */ - is_circular_replicated = true; - auto * query_with_table = dynamic_cast(task.query.get()); - if (!query_with_table || query_with_table->database.empty()) - { - throw Exception(ErrorCodes::INCONSISTENT_CLUSTER_DEFINITION, - "For a distributed DDL on circular replicated cluster its table name must be qualified by database name."); - } - if (default_database == query_with_table->database) - return; - } - } - found_exact_match = true; - task.host_shard_num = shard_num; - task.host_replica_num = replica_num; - task.address_in_cluster = address; - default_database = address.default_database; - } - } - } - - if (found_exact_match) - return; - - LOG_WARNING(log, "Not found the exact match of host {} from task {} in cluster {} definition. 
Will try to find it using host name resolving.", task.host_id.readableString(), task.entry_name, task.cluster_name); - - bool found_via_resolving = false; - for (size_t shard_num = 0; shard_num < shards.size(); ++shard_num) - { - for (size_t replica_num = 0; replica_num < shards[shard_num].size(); ++replica_num) - { - const Cluster::Address & address = shards[shard_num][replica_num]; - - if (auto resolved = address.getResolvedAddress(); - resolved && (isLocalAddress(*resolved, context.getTCPPort()) - || (context.getTCPPortSecure() && isLocalAddress(*resolved, *context.getTCPPortSecure())))) - { - if (found_via_resolving) - { - throw Exception(ErrorCodes::INCONSISTENT_CLUSTER_DEFINITION, - "There are two the same ClickHouse instances in cluster {} : {} and {}", - task.cluster_name, task.address_in_cluster.readableString(), address.readableString()); - } - else - { - found_via_resolving = true; - task.host_shard_num = shard_num; - task.host_replica_num = replica_num; - task.address_in_cluster = address; - } - } - } - } - - if (!found_via_resolving) - { - throw Exception(ErrorCodes::INCONSISTENT_CLUSTER_DEFINITION, - "Not found host {} in definition of cluster {}", - task.host_id.readableString(), task.cluster_name); - } - else - { - LOG_INFO(log, "Resolved host {} from task {} as host {} in definition of cluster {}", task.host_id.readableString(), task.entry_name, task.address_in_cluster.readableString(), task.cluster_name); - } } -bool DDLWorker::tryExecuteQuery(const String & query, const DDLTask & task, ExecutionStatus & status) +bool DDLWorker::tryExecuteQuery(const String & query, const DDLTaskBase & task, ExecutionStatus & status) { /// Add special comment at the start of query to easily identify DDL-produced queries in query_log String query_prefix = "/* ddl_entry=" + task.entry_name + " */ "; @@ -573,36 +364,8 @@ bool DDLWorker::tryExecuteQuery(const String & query, const DDLTask & task, Exec try { - auto current_context = std::make_unique(context); - current_context->makeQueryContext(); - current_context->setCurrentQueryId(""); // generate random query_id - - if (database_replicated_ext) - { - current_context->getClientInfo().query_kind - = ClientInfo::QueryKind::REPLICATED_LOG_QUERY; //FIXME why do we need separate query kind? 
- current_context->setCurrentDatabase(database_replicated_ext->database_name); - - if (task.we_are_initiator) - { - auto txn = std::make_shared(); - current_context->initMetadataTransaction(txn); - txn->current_zookeeper = current_zookeeper; - txn->zookeeper_path = database_replicated_ext->zookeeper_path; - txn->ops.emplace_back(zkutil::makeRemoveRequest(task.entry_path + "/try", -1)); - txn->ops.emplace_back(zkutil::makeCreateRequest(task.entry_path + "/committed", - database_replicated_ext->getFullReplicaName(), zkutil::CreateMode::Persistent)); - txn->ops.emplace_back(zkutil::makeRemoveRequest(task.active_path, -1)); - if (!task.shard_path.empty()) - txn->ops.emplace_back(zkutil::makeCreateRequest(task.shard_path, task.host_id_str, zkutil::CreateMode::Persistent)); - txn->ops.emplace_back(zkutil::makeCreateRequest(task.finished_path, task.execution_status.serializeText(), zkutil::CreateMode::Persistent)); - //txn->ops.emplace_back(zkutil::makeSetRequest(database_replicated_ext->getReplicaPath() + "/log_ptr", toString(database_replicated_ext->first_not_executed), -1)); - } - } - else - current_context->getClientInfo().query_kind = ClientInfo::QueryKind::SECONDARY_QUERY; - - executeQuery(istr, ostr, false, *current_context, {}); + auto query_context = task.makeQueryContext(context); + executeQuery(istr, ostr, false, *query_context, {}); } catch (...) { @@ -644,6 +407,7 @@ void DDLWorker::enqueueTask(DDLTaskPtr task_ptr) processTask(task); return; } + /// TODO recover zk in runMainThread(...) and retry task (why do we need another place where session is recovered?) catch (const Coordination::Exception & e) { if (Coordination::isHardwareError(e.code)) @@ -668,17 +432,16 @@ void DDLWorker::enqueueTask(DDLTaskPtr task_ptr) } } -void DDLWorker::processTask(DDLTask & task) +void DDLWorker::processTask(DDLTaskBase & task) { auto zookeeper = tryGetZooKeeper(); LOG_DEBUG(log, "Processing task {} ({})", task.entry_name, task.entry.query); - String dummy; - //FIXME duplicate - String active_node_path = task.active_path = task.entry_path + "/active/" + task.host_id_str; - String finished_node_path = task.finished_path = task.entry_path + "/finished/" + task.host_id_str; + String active_node_path = task.getActiveNodePath(); + String finished_node_path = task.getFinishedNodePath(); + String dummy; auto code = zookeeper->tryCreate(active_node_path, "", zkutil::CreateMode::Ephemeral, dummy); if (code == Coordination::Error::ZOK || code == Coordination::Error::ZNODEEXISTS) @@ -696,22 +459,16 @@ void DDLWorker::processTask(DDLTask & task) else throw Coordination::Exception(code, active_node_path); - //FIXME - bool is_dummy_query = database_replicated_ext && task.entry.query.empty(); - if (!task.was_executed && !is_dummy_query) + if (!task.was_executed) { try { - is_circular_replicated = false; - parseQueryAndResolveHost(task); - - ASTPtr rewritten_ast = task.query_on_cluster->getRewrittenASTWithoutOnCluster(task.address_in_cluster.default_database); - String rewritten_query = queryToString(rewritten_ast); + String rewritten_query = queryToString(task.query); LOG_DEBUG(log, "Executing query: {}", rewritten_query); - if (auto * query_with_table = dynamic_cast(rewritten_ast.get()); query_with_table) + StoragePtr storage; + if (auto * query_with_table = dynamic_cast(task.query.get()); query_with_table) { - StoragePtr storage; if (!query_with_table->table.empty()) { /// It's not CREATE DATABASE @@ -719,11 +476,11 @@ void DDLWorker::processTask(DDLTask & task) storage = 
DatabaseCatalog::instance().tryGetTable(table_id, context); } - if (storage && taskShouldBeExecutedOnLeader(rewritten_ast, storage) && !is_circular_replicated) - tryExecuteQueryOnLeaderReplica(task, storage, rewritten_query, task.entry_path, zookeeper); - else - tryExecuteQuery(rewritten_query, task, task.execution_status); + task.execute_on_leader = storage && taskShouldBeExecutedOnLeader(task.query, storage) && !task.is_circular_replicated; } + + if (task.execute_on_leader) + tryExecuteQueryOnLeaderReplica(task, storage, rewritten_query, task.entry_path, zookeeper); else tryExecuteQuery(rewritten_query, task, task.execution_status); } @@ -753,12 +510,6 @@ void DDLWorker::processTask(DDLTask & task) auto res = zookeeper->tryMulti(ops, responses); if (res != Coordination::Error::ZNODEEXISTS && res != Coordination::Error::ZNONODE) zkutil::KeeperMultiException::check(res, ops, responses); - - if (database_replicated_ext) - { - database_replicated_ext->executed_callback(task.entry_name, zookeeper); - ++(database_replicated_ext->first_not_executed); - } } @@ -775,10 +526,10 @@ bool DDLWorker::taskShouldBeExecutedOnLeader(const ASTPtr ast_ddl, const Storage } bool DDLWorker::tryExecuteQueryOnLeaderReplica( - DDLTask & task, + DDLTaskBase & task, StoragePtr storage, const String & rewritten_query, - const String & node_path, + const String & /*node_path*/, const ZooKeeperPtr & zookeeper) { StorageReplicatedMergeTree * replicated_storage = dynamic_cast(storage.get()); @@ -787,31 +538,8 @@ bool DDLWorker::tryExecuteQueryOnLeaderReplica( if (!replicated_storage) throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Storage type '{}' is not supported by distributed DDL", storage->getName()); - /// Generate unique name for shard node, it will be used to execute the query by only single host - /// Shard node name has format 'replica_name1,replica_name2,...,replica_nameN' - /// Where replica_name is 'replica_config_host_name:replica_port' - auto get_shard_name = [] (const Cluster::Addresses & shard_addresses) - { - Strings replica_names; - for (const Cluster::Address & address : shard_addresses) - replica_names.emplace_back(address.readableString()); - std::sort(replica_names.begin(), replica_names.end()); - - String res; - for (auto it = replica_names.begin(); it != replica_names.end(); ++it) - res += *it + (std::next(it) != replica_names.end() ? 
"," : ""); - - return res; - }; - - String shard_node_name; - if (database_replicated_ext) - shard_node_name = database_replicated_ext->shard_name; - else - shard_node_name = get_shard_name(task.cluster->getShardsAddresses().at(task.host_shard_num)); - String shard_path = node_path + "/shards/" + shard_node_name; + String shard_path = task.getShardNodePath(); String is_executed_path = shard_path + "/executed"; - task.shard_path = is_executed_path; //FIXME duplicate String tries_to_execute_path = shard_path + "/tries_to_execute"; zookeeper->createAncestors(shard_path + "/"); @@ -1035,7 +763,7 @@ void DDLWorker::createStatusDirs(const std::string & node_path, const ZooKeeperP String DDLWorker::enqueueQuery(DDLLogEntry & entry) { - if (entry.hosts.empty() && !database_replicated_ext) + if (entry.hosts.empty()) throw Exception("Empty host list in a distributed DDL task", ErrorCodes::LOGICAL_ERROR); auto zookeeper = getAndSetZooKeeper(); @@ -1043,27 +771,7 @@ String DDLWorker::enqueueQuery(DDLLogEntry & entry) String query_path_prefix = queue_dir + "/query-"; zookeeper->createAncestors(query_path_prefix); - String node_path; - if (database_replicated_ext) - { - /// We cannot create sequential node and it's ephemeral child in a single transaction, so allocate sequential number another way - String counter_prefix = database_replicated_ext->zookeeper_path + "/counter/cnt-"; - String counter_path = zookeeper->create(counter_prefix, "", zkutil::CreateMode::EphemeralSequential); - node_path = query_path_prefix + counter_path.substr(counter_prefix.size()); - - Coordination::Requests ops; - /// Query is not committed yet, but we have to write it into log to avoid reordering - ops.emplace_back(zkutil::makeCreateRequest(node_path, entry.toString(), zkutil::CreateMode::Persistent)); - /// '/try' will be replaced with '/committed' or will be removed due to expired session or other error - ops.emplace_back(zkutil::makeCreateRequest(node_path + "/try", database_replicated_ext->getFullReplicaName(), zkutil::CreateMode::Ephemeral)); - /// We don't need it anymore - ops.emplace_back(zkutil::makeRemoveRequest(counter_path, -1)); - zookeeper->multi(ops); - } - else - { - node_path = zookeeper->create(query_path_prefix, entry.toString(), zkutil::CreateMode::PersistentSequential); - } + String node_path = zookeeper->create(query_path_prefix, entry.toString(), zkutil::CreateMode::PersistentSequential); /// Optional step try @@ -1091,6 +799,7 @@ void DDLWorker::runMainThread() { auto zookeeper = getAndSetZooKeeper(); zookeeper->createAncestors(queue_dir + "/"); + initialize(); initialized = true; } catch (const Coordination::Exception & e) diff --git a/src/Interpreters/DDLWorker.h b/src/Interpreters/DDLWorker.h index 86677bfbb19..39087d05fbb 100644 --- a/src/Interpreters/DDLWorker.h +++ b/src/Interpreters/DDLWorker.h @@ -29,50 +29,20 @@ namespace DB class Context; class ASTAlterQuery; struct DDLLogEntry; -struct DDLTask; -using DDLTaskPtr = std::unique_ptr; +struct DDLTaskBase; +using DDLTaskPtr = std::unique_ptr; using ZooKeeperPtr = std::shared_ptr; -struct DatabaseReplicatedExtensions -{ - UUID database_uuid; - String zookeeper_path; - String database_name; - String shard_name; - String replica_name; - UInt32 first_not_executed; - using EntryLostCallback = std::function; - using EntryExecutedCallback = std::function; - using EntryErrorCallback = std::function; - EntryLostCallback lost_callback; - EntryExecutedCallback executed_callback; - EntryErrorCallback error_callback; - - String getReplicaPath() const - { - 
return zookeeper_path + "/replicas/" + shard_name + "/" + replica_name; - } - - String getFullReplicaName() const - { - return shard_name + '|' + replica_name; - } - - static String getLogEntryName(UInt32 log_entry_number); - static UInt32 getLogEntryNumber(const String & log_entry_name); -}; - - class DDLWorker { public: DDLWorker(int pool_size_, const std::string & zk_root_dir, const Context & context_, const Poco::Util::AbstractConfiguration * config, const String & prefix, - std::optional database_replicated_ext_ = std::nullopt); - ~DDLWorker(); + const String & logger_name = "DDLWorker"); + virtual ~DDLWorker(); /// Pushes query into DDL queue, returns path to created node - String enqueueQuery(DDLLogEntry & entry); + virtual String enqueueQuery(DDLLogEntry & entry); /// Host ID (name:port) for logging purposes /// Note that in each task hosts are identified individually by name:port from initiator server cluster config @@ -83,10 +53,7 @@ public: void shutdown(); - //FIXME get rid of this method - void setLogPointer(UInt32 log_pointer) { database_replicated_ext->first_not_executed = log_pointer; } - -private: +protected: /// Returns cached ZooKeeper session (possibly expired). ZooKeeperPtr tryGetZooKeeper() const; @@ -97,14 +64,13 @@ private: void checkCurrentTasks(); void scheduleTasks(); - void saveTask(const String & entry_name); /// Reads entry and check that the host belongs to host list of the task /// Returns non-empty DDLTaskPtr if entry parsed and the check is passed - DDLTaskPtr initAndCheckTask(const String & entry_name, String & out_reason, const ZooKeeperPtr & zookeeper); + virtual DDLTaskPtr initAndCheckTask(const String & entry_name, String & out_reason, const ZooKeeperPtr & zookeeper); void enqueueTask(DDLTaskPtr task); - void processTask(DDLTask & task); + void processTask(DDLTaskBase & task); /// Check that query should be executed on leader replica only static bool taskShouldBeExecutedOnLeader(const ASTPtr ast_ddl, StoragePtr storage); @@ -115,15 +81,15 @@ private: /// query via RemoteBlockOutputStream to leader, so to avoid such "2-phase" query execution we /// execute query directly on leader. 
bool tryExecuteQueryOnLeaderReplica( - DDLTask & task, + DDLTaskBase & task, StoragePtr storage, const String & rewritten_query, const String & node_path, const ZooKeeperPtr & zookeeper); - void parseQueryAndResolveHost(DDLTask & task); + void parseQueryAndResolveHost(DDLTaskBase & task); - bool tryExecuteQuery(const String & query, const DDLTask & task, ExecutionStatus & status); + bool tryExecuteQuery(const String & query, const DDLTaskBase & task, ExecutionStatus & status); /// Checks and cleanups queue's nodes void cleanupQueue(Int64 current_time_seconds, const ZooKeeperPtr & zookeeper); @@ -131,17 +97,16 @@ private: /// Init task node static void createStatusDirs(const std::string & node_path, const ZooKeeperPtr & zookeeper); + virtual void initialize() {} void runMainThread(); void runCleanupThread(); void attachToThreadGroup(); -private: - std::atomic is_circular_replicated = false; +protected: Context context; Poco::Logger * log; - std::optional database_replicated_ext; std::string host_fqdn; /// current host domain name std::string host_fqdn_id; /// host_name:port @@ -151,7 +116,8 @@ private: ZooKeeperPtr current_zookeeper; /// Save state of executed task to avoid duplicate execution on ZK error - std::vector last_tasks; + //std::vector last_tasks; + std::optional last_entry_name; std::shared_ptr queue_updated_event = std::make_shared(); std::shared_ptr cleanup_event = std::make_shared(); diff --git a/tests/integration/test_replicated_database/configs/config.xml b/tests/integration/test_replicated_database/configs/config.xml new file mode 100644 index 00000000000..d751454437c --- /dev/null +++ b/tests/integration/test_replicated_database/configs/config.xml @@ -0,0 +1,3 @@ + + 10 + diff --git a/tests/integration/test_replicated_database/configs/disable_snapshots.xml b/tests/integration/test_replicated_database/configs/disable_snapshots.xml deleted file mode 100644 index 9a656bdcea1..00000000000 --- a/tests/integration/test_replicated_database/configs/disable_snapshots.xml +++ /dev/null @@ -1,3 +0,0 @@ - - 0 - diff --git a/tests/integration/test_replicated_database/configs/snapshot_each_query.xml b/tests/integration/test_replicated_database/configs/snapshot_each_query.xml deleted file mode 100644 index 6eae1d9d992..00000000000 --- a/tests/integration/test_replicated_database/configs/snapshot_each_query.xml +++ /dev/null @@ -1,3 +0,0 @@ - - 1 - diff --git a/tests/integration/test_replicated_database/test.py b/tests/integration/test_replicated_database/test.py index 11bfbad393b..8c5a25b3fe7 100644 --- a/tests/integration/test_replicated_database/test.py +++ b/tests/integration/test_replicated_database/test.py @@ -7,11 +7,11 @@ from helpers.test_tools import assert_eq_with_retry cluster = ClickHouseCluster(__file__) -main_node = cluster.add_instance('main_node', main_configs=['configs/disable_snapshots.xml'], with_zookeeper=True, stay_alive=True, macros={"shard": 1, "replica": 1}) -dummy_node = cluster.add_instance('dummy_node', main_configs=['configs/disable_snapshots.xml'], with_zookeeper=True, macros={"shard": 1, "replica": 2}) -competing_node = cluster.add_instance('competing_node', main_configs=['configs/disable_snapshots.xml'], with_zookeeper=True, macros={"shard": 1, "replica": 3}) -snapshotting_node = cluster.add_instance('snapshotting_node', main_configs=['configs/snapshot_each_query.xml'], with_zookeeper=True, macros={"shard": 2, "replica": 1}) -snapshot_recovering_node = cluster.add_instance('snapshot_recovering_node', main_configs=['configs/disable_snapshots.xml'], 
with_zookeeper=True, macros={"shard": 2, "replica": 2}) +main_node = cluster.add_instance('main_node', main_configs=['configs/config.xml'], with_zookeeper=True, stay_alive=True, macros={"shard": 1, "replica": 1}) +dummy_node = cluster.add_instance('dummy_node', main_configs=['configs/config.xml'], with_zookeeper=True, macros={"shard": 1, "replica": 2}) +competing_node = cluster.add_instance('competing_node', main_configs=['configs/config.xml'], with_zookeeper=True, macros={"shard": 1, "replica": 3}) +snapshotting_node = cluster.add_instance('snapshotting_node', main_configs=['configs/config.xml'], with_zookeeper=True, macros={"shard": 2, "replica": 1}) +snapshot_recovering_node = cluster.add_instance('snapshot_recovering_node', main_configs=['configs/config.xml'], with_zookeeper=True, macros={"shard": 2, "replica": 2}) uuid_regex = re.compile("[0-9a-f]{8}\-[0-9a-f]{4}\-[0-9a-f]{4}\-[0-9a-f]{4}\-[0-9a-f]{12}") def assert_create_query(nodes, table_name, expected): @@ -70,9 +70,10 @@ def test_simple_alter_table(started_cluster, engine): assert_create_query([main_node, dummy_node], name, expected) +@pytest.mark.dependency(depends=['test_simple_alter_table']) @pytest.mark.parametrize("engine", ['MergeTree', 'ReplicatedMergeTree']) def test_create_replica_after_delay(started_cluster, engine): - competing_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'shard1', 'replica3');") + competing_node.query("CREATE DATABASE IF NOT EXISTS testdb ENGINE = Replicated('/clickhouse/databases/test1', 'shard1', 'replica3');") name = "testdb.alter_test_{}".format(engine) main_node.query("ALTER TABLE {} ADD COLUMN Added3 UInt32;".format(name)) @@ -113,6 +114,7 @@ def test_alters_from_different_replicas(started_cluster): assert_create_query([main_node, competing_node], "testdb.concurrent_test", expected) +@pytest.mark.dependency(depends=['test_alters_from_different_replicas']) def test_drop_and_create_table(started_cluster): main_node.query("DROP TABLE testdb.concurrent_test") main_node.query("CREATE TABLE testdb.concurrent_test " @@ -125,6 +127,7 @@ def test_drop_and_create_table(started_cluster): assert_create_query([main_node, competing_node], "testdb.concurrent_test", expected) +@pytest.mark.dependency(depends=['test_drop_and_create_table']) def test_replica_restart(started_cluster): main_node.restart_clickhouse() @@ -134,14 +137,18 @@ def test_replica_restart(started_cluster): assert_create_query([main_node, competing_node], "testdb.concurrent_test", expected) + +@pytest.mark.dependency(depends=['test_create_replica_after_delay']) def test_snapshot_and_snapshot_recover(started_cluster): #FIXME bad test snapshotting_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'shard1', 'replica4');") time.sleep(5) snapshot_recovering_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'shard1', 'replica5');") time.sleep(5) - assert snapshotting_node.query("desc table testdb.alter_test") == snapshot_recovering_node.query("desc table testdb.alter_test") + assert snapshotting_node.query("desc table testdb.alter_test_MergeTree") == snapshot_recovering_node.query("desc table testdb.alter_test_MergeTree") + assert snapshotting_node.query("desc table testdb.alter_test_ReplicatedMergeTree") == snapshot_recovering_node.query("desc table testdb.alter_test_ReplicatedMergeTree") +@pytest.mark.dependency(depends=['test_replica_restart']) def test_drop_and_create_replica(started_cluster): main_node.query("DROP DATABASE testdb") 
main_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'shard1', 'replica1');") From ab197a49c82db8c9e4aae3984a8da91a0e120728 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Sun, 29 Nov 2020 14:45:32 +0300 Subject: [PATCH 0065/2357] better code, fixes --- src/Databases/DatabaseAtomic.cpp | 72 +++----- src/Databases/DatabaseReplicated.cpp | 160 +++++++++--------- src/Databases/DatabaseReplicated.h | 31 ++-- src/Databases/DatabaseReplicatedWorker.cpp | 20 +-- src/Databases/ya.make | 1 + src/Interpreters/DDLTask.cpp | 43 ++--- src/Interpreters/DDLTask.h | 32 +--- src/Interpreters/DDLWorker.cpp | 59 ++++--- src/Interpreters/DDLWorker.h | 5 +- src/Interpreters/InterpreterCreateQuery.cpp | 2 +- src/Interpreters/executeDDLQueryOnCluster.cpp | 12 +- src/Interpreters/executeDDLQueryOnCluster.h | 1 + .../test_replicated_database/test.py | 9 +- 13 files changed, 194 insertions(+), 253 deletions(-) diff --git a/src/Databases/DatabaseAtomic.cpp b/src/Databases/DatabaseAtomic.cpp index a444d9cc200..b60adf44e51 100644 --- a/src/Databases/DatabaseAtomic.cpp +++ b/src/Databases/DatabaseAtomic.cpp @@ -120,13 +120,10 @@ void DatabaseAtomic::dropTable(const Context & context, const String & table_nam table_metadata_path_drop = DatabaseCatalog::instance().getPathForDroppedMetadata(table->getStorageID()); if (auto txn = context.getMetadataTransaction()) - { - String metadata_zk_path = txn->zookeeper_path + "/metadata/" + escapeForFileName(table_name); - txn->ops.emplace_back(zkutil::makeRemoveRequest(metadata_zk_path, -1)); - txn->current_zookeeper->multi(txn->ops); /// Commit point (a sort of) for Replicated database - /// NOTE: replica will be lost if server crashes before the following rename - /// TODO better detection and recovery - } + txn->commit(); /// Commit point (a sort of) for Replicated database + + /// NOTE: replica will be lost if server crashes before the following rename + /// TODO better detection and recovery Poco::File(table_metadata_path).renameTo(table_metadata_path_drop); /// Mark table as dropped DatabaseWithDictionaries::detachTableUnlocked(table_name, lock); /// Should never throw @@ -245,31 +242,10 @@ void DatabaseAtomic::renameTable(const Context & context, const String & table_n /// Table renaming actually begins here if (auto txn = context.getMetadataTransaction()) - { - String statement; - String statement_to; - { - ReadBufferFromFile in(old_metadata_path, 4096); - readStringUntilEOF(statement, in); - if (exchange) - { - ReadBufferFromFile in_to(new_metadata_path, 4096); - readStringUntilEOF(statement_to, in_to); - } - } - String metadata_zk_path = txn->zookeeper_path + "/metadata/" + escapeForFileName(table_name); - String metadata_zk_path_to = txn->zookeeper_path + "/metadata/" + escapeForFileName(to_table_name); - txn->ops.emplace_back(zkutil::makeRemoveRequest(metadata_zk_path, -1)); - if (exchange) - { - txn->ops.emplace_back(zkutil::makeRemoveRequest(metadata_zk_path_to, -1)); - txn->ops.emplace_back(zkutil::makeCreateRequest(metadata_zk_path, statement_to, zkutil::CreateMode::Persistent)); - } - txn->ops.emplace_back(zkutil::makeCreateRequest(metadata_zk_path_to, statement, zkutil::CreateMode::Persistent)); - txn->current_zookeeper->multi(txn->ops); /// Commit point (a sort of) for Replicated database - /// NOTE: replica will be lost if server crashes before the following rename - /// TODO better detection and recovery - } + txn->commit(); /// Commit point (a sort of) for Replicated database + + /// NOTE: replica will be lost if 
server crashes before the following rename + /// TODO better detection and recovery if (exchange) renameExchange(old_metadata_path, new_metadata_path); @@ -326,15 +302,10 @@ void DatabaseAtomic::commitCreateTable(const ASTCreateQuery & query, const Stora locked_uuid = true; if (auto txn = query_context.getMetadataTransaction()) - { - String metadata_zk_path = txn->zookeeper_path + "/metadata/" + escapeForFileName(query.table); - String statement = getObjectDefinitionFromCreateQuery(query.clone()); - /// zk::multi(...) will throw if `metadata_zk_path` exists - txn->ops.emplace_back(zkutil::makeCreateRequest(metadata_zk_path, statement, zkutil::CreateMode::Persistent)); - txn->current_zookeeper->multi(txn->ops); /// Commit point (a sort of) for Replicated database - /// NOTE: replica will be lost if server crashes before the following renameNoReplace(...) - /// TODO better detection and recovery - } + txn->commit(); /// Commit point (a sort of) for Replicated database + + /// NOTE: replica will be lost if server crashes before the following renameNoReplace(...) + /// TODO better detection and recovery /// It throws if `table_metadata_path` already exists (it's possible if table was detached) renameNoReplace(table_metadata_tmp_path, table_metadata_path); /// Commit point (a sort of) @@ -352,7 +323,8 @@ void DatabaseAtomic::commitCreateTable(const ASTCreateQuery & query, const Stora tryCreateSymlink(query.table, table_data_path); } -void DatabaseAtomic::commitAlterTable(const StorageID & table_id, const String & table_metadata_tmp_path, const String & table_metadata_path, const String & statement, const Context & query_context) +void DatabaseAtomic::commitAlterTable(const StorageID & table_id, const String & table_metadata_tmp_path, const String & table_metadata_path, + const String & /*statement*/, const Context & query_context) { bool check_file_exists = true; SCOPE_EXIT({ std::error_code code; if (check_file_exists) std::filesystem::remove(table_metadata_tmp_path, code); }); @@ -363,17 +335,11 @@ void DatabaseAtomic::commitAlterTable(const StorageID & table_id, const String & if (table_id.uuid != actual_table_id.uuid) throw Exception("Cannot alter table because it was renamed", ErrorCodes::CANNOT_ASSIGN_ALTER); - if (&query_context != &query_context.getGlobalContext()) // FIXME - { - if (auto txn = query_context.getMetadataTransaction()) - { - String metadata_zk_path = txn->zookeeper_path + "/metadata/" + escapeForFileName(table_id.table_name); - txn->ops.emplace_back(zkutil::makeSetRequest(metadata_zk_path, statement, -1)); - txn->current_zookeeper->multi(txn->ops); /// Commit point (a sort of) for Replicated database - /// NOTE: replica will be lost if server crashes before the following rename - /// TODO better detection and recovery - } - } + if (auto txn = query_context.getMetadataTransaction()) + txn->commit(); /// Commit point (a sort of) for Replicated database + + /// NOTE: replica will be lost if server crashes before the following rename + /// TODO better detection and recovery check_file_exists = renameExchangeIfSupported(table_metadata_tmp_path, table_metadata_path); if (!check_file_exists) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index eef1b98afe2..418eaf567a4 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -34,6 +34,7 @@ namespace ErrorCodes extern const int REPLICA_IS_ALREADY_EXIST; extern const int DATABASE_REPLICATION_FAILED; extern const int UNKNOWN_DATABASE; + extern const int 
NOT_IMPLEMENTED; } zkutil::ZooKeeperPtr DatabaseReplicated::getZooKeeper() const @@ -106,9 +107,6 @@ DatabaseReplicated::DatabaseReplicated( /// Throws if replica with the same name was created concurrently createReplicaNodesInZooKeeper(current_zookeeper); } - - snapshot_period = 1; //context_.getConfigRef().getInt("database_replicated_snapshot_period", 10); - LOG_DEBUG(log, "Snapshot period is set to {} log entries per one snapshot", snapshot_period); } bool DatabaseReplicated::createDatabaseNodesInZooKeeper(const zkutil::ZooKeeperPtr & current_zookeeper) @@ -171,8 +169,6 @@ void DatabaseReplicated::loadStoredObjects(Context & context, bool has_force_res { DatabaseAtomic::loadStoredObjects(context, has_force_restore_data_flag, force_attach); - //recoverLostReplica(global_context.getZooKeeper(), 0, true); //FIXME - ddl_worker = std::make_unique(this, global_context); } @@ -209,71 +205,6 @@ void DatabaseReplicated::onUnexpectedLogEntry(const String & entry_name, const Z "Got log entry '{}' when expected entry number {}"); } -void DatabaseReplicated::removeOutdatedSnapshotsAndLog() -{ - /// This method removes all snapshots and logged queries - /// that no longer will be in use by current replicas or - /// new coming ones. - /// Each registered replica has its state in ZooKeeper. - /// Therefore, snapshots and logged queries that are less - /// than a least advanced replica are removed. - /// It does not interfere with a new coming replica - /// metadata loading from snapshot - /// because the replica will use the latest snapshot available - /// and this snapshot will set the last executed log query - /// to a greater one than the least advanced current replica. - auto current_zookeeper = getZooKeeper(); - Strings replica_states = current_zookeeper->getChildren(zookeeper_path + "/replicas"); - //TODO do not use log pointers to determine which entries to remove if there are staled pointers. - // We can just remove all entries older than previous snapshot version. - // Possible invariant: store all entries since last snapshot, replica becomes lost when it cannot get log entry. 
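(Aside, not part of the patch: the pruning in the removed removeOutdatedSnapshotsAndLog() compares log entry names as plain strings (see the std::min_element / lower_bound calls that follow). That works because the getLogEntryName() helper shown elsewhere in this patch set zero-pads the sequence number to 10 digits. A small worked example:)

    // Sketch only: zero-padded names keep lexicographic and numeric order in sync,
    // so string comparisons over entry names pick the correct oldest/newest entries.
    //   getLogEntryName(42)   -> "query-0000000042"
    //   getLogEntryName(1000) -> "query-0000001000"
    // and getLogEntryNumber() parses the number back:
    assert(getLogEntryNumber(getLogEntryName(42)) == 42);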
- auto least_advanced = std::min_element(replica_states.begin(), replica_states.end()); - Strings snapshots = current_zookeeper->getChildren(zookeeper_path + "/snapshots"); - - if (snapshots.size() < 2) - { - return; - } - - std::sort(snapshots.begin(), snapshots.end()); - auto still_useful = std::lower_bound(snapshots.begin(), snapshots.end(), *least_advanced); - snapshots.erase(still_useful, snapshots.end()); - for (const String & snapshot : snapshots) - { - current_zookeeper->tryRemoveRecursive(zookeeper_path + "/snapshots/" + snapshot); - } - - Strings log_entry_names = current_zookeeper->getChildren(zookeeper_path + "/log"); - std::sort(log_entry_names.begin(), log_entry_names.end()); - auto still_useful_log = std::upper_bound(log_entry_names.begin(), log_entry_names.end(), *still_useful); - log_entry_names.erase(still_useful_log, log_entry_names.end()); - for (const String & log_entry_name : log_entry_names) - { - String log_entry_path = zookeeper_path + "/log/" + log_entry_name; - current_zookeeper->tryRemove(log_entry_path); - } -} - -void DatabaseReplicated::onExecutedLogEntry(const String & /*entry_name*/, const ZooKeeperPtr & /*zookeeper*/) -{ - -} - -void DatabaseReplicated::writeLastExecutedToDiskAndZK() -{ - auto current_zookeeper = getZooKeeper(); - current_zookeeper->createOrUpdate( - zookeeper_path + "/replicas/" + replica_name, last_executed_log_entry, zkutil::CreateMode::Persistent); - - String metadata_file = getMetadataPath() + ".last_entry"; - WriteBufferFromFile out(metadata_file, last_executed_log_entry.size(), O_WRONLY | O_CREAT); - writeString(last_executed_log_entry, out); - out.next(); - if (global_context.getSettingsRef().fsync_metadata) - out.sync(); - out.close(); -} - BlockIO DatabaseReplicated::propose(const ASTPtr & query) { @@ -302,14 +233,14 @@ BlockIO DatabaseReplicated::propose(const ASTPtr & query) //FIXME need list of all replicas, we can obtain it from zk Strings hosts_to_wait; - hosts_to_wait.emplace_back(shard_name + '|' +replica_name); - auto stream = std::make_shared(node_path, entry, global_context); + hosts_to_wait.emplace_back(getFullReplicaName()); + auto stream = std::make_shared(node_path, entry, global_context, hosts_to_wait); io.in = std::move(stream); return io; } -void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeeper, UInt32 from_snapshot, bool /*create*/) +void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeeper, UInt32 from_snapshot) { LOG_WARNING(log, "Will recover replica"); @@ -339,14 +270,7 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep InterpreterCreateQuery(query_ast, query_context).execute(); } - //if (create) - // return; - current_zookeeper->set(replica_path + "/log_ptr", toString(from_snapshot)); - last_executed_log_entry = from_snapshot; - //ddl_worker->setLogPointer(from_snapshot); //FIXME - - //writeLastExecutedToDiskAndZK(); } ASTPtr DatabaseReplicated::parseQueryFromMetadataInZooKeeper(const String & node_name, const String & query) @@ -384,4 +308,80 @@ void DatabaseReplicated::shutdown() DatabaseAtomic::shutdown(); } + +void DatabaseReplicated::dropTable(const Context & context, const String & table_name, bool no_delay) +{ + auto txn = context.getMetadataTransaction(); + //assert(!ddl_worker->isCurrentlyActive() || txn /*|| called from DROP DATABASE */); + if (txn && txn->is_initial_query) + { + String metadata_zk_path = zookeeper_path + "/metadata/" + escapeForFileName(table_name); + 
txn->ops.emplace_back(zkutil::makeRemoveRequest(metadata_zk_path, -1)); + } + DatabaseAtomic::dropTable(context, table_name, no_delay); +} + +void DatabaseReplicated::renameTable(const Context & context, const String & table_name, IDatabase & to_database, + const String & to_table_name, bool exchange, bool dictionary) +{ + auto txn = context.getMetadataTransaction(); + assert(txn); + + if (txn->is_initial_query) + { + String statement; + String statement_to; + { + //FIXME It's not atomic (however we have only one thread) + ReadBufferFromFile in(getObjectMetadataPath(table_name), 4096); + readStringUntilEOF(statement, in); + if (exchange) + { + ReadBufferFromFile in_to(to_database.getObjectMetadataPath(to_table_name), 4096); + readStringUntilEOF(statement_to, in_to); + } + } + String metadata_zk_path = txn->zookeeper_path + "/metadata/" + escapeForFileName(table_name); + String metadata_zk_path_to = txn->zookeeper_path + "/metadata/" + escapeForFileName(to_table_name); + txn->ops.emplace_back(zkutil::makeRemoveRequest(metadata_zk_path, -1)); + if (exchange) + { + txn->ops.emplace_back(zkutil::makeRemoveRequest(metadata_zk_path_to, -1)); + txn->ops.emplace_back(zkutil::makeCreateRequest(metadata_zk_path, statement_to, zkutil::CreateMode::Persistent)); + } + txn->ops.emplace_back(zkutil::makeCreateRequest(metadata_zk_path_to, statement, zkutil::CreateMode::Persistent)); + } + + DatabaseAtomic::renameTable(context, table_name, to_database, to_table_name, exchange, dictionary); +} + +void DatabaseReplicated::commitCreateTable(const ASTCreateQuery & query, const StoragePtr & table, + const String & table_metadata_tmp_path, const String & table_metadata_path, + const Context & query_context) +{ + auto txn = query_context.getMetadataTransaction(); + assert(!ddl_worker->isCurrentlyActive() || txn); + if (txn && txn->is_initial_query) + { + String metadata_zk_path = txn->zookeeper_path + "/metadata/" + escapeForFileName(query.table); + String statement = getObjectDefinitionFromCreateQuery(query.clone()); + /// zk::multi(...) 
will throw if `metadata_zk_path` exists + txn->ops.emplace_back(zkutil::makeCreateRequest(metadata_zk_path, statement, zkutil::CreateMode::Persistent)); + } + DatabaseAtomic::commitCreateTable(query, table, table_metadata_tmp_path, table_metadata_path, query_context); +} + +void DatabaseReplicated::commitAlterTable(const StorageID & table_id, + const String & table_metadata_tmp_path, const String & table_metadata_path, + const String & statement, const Context & query_context) +{ + auto txn = query_context.getMetadataTransaction(); + if (txn && txn->is_initial_query) + { + String metadata_zk_path = txn->zookeeper_path + "/metadata/" + escapeForFileName(table_id.table_name); + txn->ops.emplace_back(zkutil::makeSetRequest(metadata_zk_path, statement, -1)); + } + DatabaseAtomic::commitAlterTable(table_id, table_metadata_tmp_path, table_metadata_path, statement, query_context); +} + } diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index d6cd93773cf..8085c234af4 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -46,6 +46,16 @@ public: ~DatabaseReplicated() override; + void dropTable(const Context &, const String & table_name, bool no_delay) override; + void renameTable(const Context & context, const String & table_name, IDatabase & to_database, + const String & to_table_name, bool exchange, bool dictionary) override; + void commitCreateTable(const ASTCreateQuery & query, const StoragePtr & table, + const String & table_metadata_tmp_path, const String & table_metadata_path, + const Context & query_context) override; + void commitAlterTable(const StorageID & table_id, + const String & table_metadata_tmp_path, const String & table_metadata_path, + const String & statement, const Context & query_context) override; + void drop(const Context & /*context*/) override; String getEngineName() const override { return "Replicated"; } @@ -65,17 +75,8 @@ private: bool createDatabaseNodesInZooKeeper(const ZooKeeperPtr & current_zookeeper); void createReplicaNodesInZooKeeper(const ZooKeeperPtr & current_zookeeper); - //void runBackgroundLogExecutor(); - void writeLastExecutedToDiskAndZK(); - - //void loadMetadataFromSnapshot(); - void removeOutdatedSnapshotsAndLog(); - - void onUnexpectedLogEntry(const String & entry_name, const ZooKeeperPtr & zookeeper); - void recoverLostReplica(const ZooKeeperPtr & current_zookeeper, UInt32 from_snapshot, bool create = false); - - void onExecutedLogEntry(const String & entry_name, const ZooKeeperPtr & zookeeper); + void recoverLostReplica(const ZooKeeperPtr & current_zookeeper, UInt32 from_snapshot); ASTPtr parseQueryFromMetadataInZooKeeper(const String & node_name, const String & query); @@ -86,19 +87,9 @@ private: UInt32 log_entry_to_execute; - std::mutex log_name_mutex; - String log_name_to_exec_with_result; - - int snapshot_period; - - String last_executed_log_entry = ""; - zkutil::ZooKeeperPtr getZooKeeper() const; std::unique_ptr ddl_worker; - - - }; } diff --git a/src/Databases/DatabaseReplicatedWorker.cpp b/src/Databases/DatabaseReplicatedWorker.cpp index 869b888d3ad..29599d4d66d 100644 --- a/src/Databases/DatabaseReplicatedWorker.cpp +++ b/src/Databases/DatabaseReplicatedWorker.cpp @@ -96,19 +96,19 @@ DDLTaskPtr DatabaseReplicatedDDLWorker::initAndCheckTask(const String & entry_na throw Exception(ErrorCodes::LOGICAL_ERROR, "should be unreachable"); } - auto error = task->tryParseEntry(node_data); - if (error) - { - LOG_ERROR(log, "Cannot parse query from '{}': {}", node_data, *error); - 
database->onUnexpectedLogEntry(entry_name, zookeeper); - throw Exception(ErrorCodes::LOGICAL_ERROR, "should be unreachable"); - } + task->entry.parse(node_data); - task->parseQueryFromEntry(context); + if (task->entry.query.empty()) + { + //TODO better way to determine special entries + task->was_executed = true; + } + else + { + task->parseQueryFromEntry(context); + } return task; } - - } diff --git a/src/Databases/ya.make b/src/Databases/ya.make index 09d3dc38cb2..38f79532080 100644 --- a/src/Databases/ya.make +++ b/src/Databases/ya.make @@ -17,6 +17,7 @@ SRCS( DatabaseOnDisk.cpp DatabaseOrdinary.cpp DatabaseReplicated.cpp + DatabaseReplicatedWorker.cpp DatabaseWithDictionaries.cpp DatabasesCommon.cpp MySQL/ConnectionMySQLSettings.cpp diff --git a/src/Interpreters/DDLTask.cpp b/src/Interpreters/DDLTask.cpp index 0bc98dfd0dd..9ef7352ceb4 100644 --- a/src/Interpreters/DDLTask.cpp +++ b/src/Interpreters/DDLTask.cpp @@ -86,20 +86,6 @@ void DDLLogEntry::parse(const String & data) } -std::optional DDLTaskBase::tryParseEntry(const String & data) -{ - std::optional error; - try - { - entry.parse(data); - } - catch (...) - { - error = ExecutionStatus::fromCurrentException().serializeText(); - } - return error; -} - void DDLTaskBase::parseQueryFromEntry(const Context & context) { const char * begin = entry.query.data(); @@ -313,22 +299,25 @@ std::unique_ptr DatabaseReplicatedTask::makeQueryContext(Context & from query_context->getClientInfo().query_kind = ClientInfo::QueryKind::REPLICATED_LOG_QUERY; //FIXME why do we need separate query kind? query_context->setCurrentDatabase(database->getDatabaseName()); + auto txn = std::make_shared(); + query_context->initMetadataTransaction(txn); + txn->current_zookeeper = from_context.getZooKeeper(); + txn->zookeeper_path = database->zookeeper_path; + txn->is_initial_query = we_are_initiator; + if (we_are_initiator) { - auto txn = std::make_shared(); - query_context->initMetadataTransaction(txn); - txn->current_zookeeper = from_context.getZooKeeper(); - txn->zookeeper_path = database->zookeeper_path; txn->ops.emplace_back(zkutil::makeRemoveRequest(entry_path + "/try", -1)); txn->ops.emplace_back(zkutil::makeCreateRequest(entry_path + "/committed", host_id_str, zkutil::CreateMode::Persistent)); txn->ops.emplace_back(zkutil::makeRemoveRequest(getActiveNodePath(), -1)); - if (execute_on_leader) - txn->ops.emplace_back(zkutil::makeCreateRequest(getShardNodePath() + "/executed", host_id_str, zkutil::CreateMode::Persistent)); - txn->ops.emplace_back(zkutil::makeCreateRequest(getFinishedNodePath(), execution_status.serializeText(), zkutil::CreateMode::Persistent)); - txn->ops.emplace_back(zkutil::makeSetRequest(database->replica_path + "/log_ptr", toString(getLogEntryNumber(entry_name)), -1)); txn->ops.emplace_back(zkutil::makeSetRequest(database->zookeeper_path + "/max_log_ptr", toString(getLogEntryNumber(entry_name)), -1)); } + if (execute_on_leader) + txn->ops.emplace_back(zkutil::makeCreateRequest(getShardNodePath() + "/executed", host_id_str, zkutil::CreateMode::Persistent)); + txn->ops.emplace_back(zkutil::makeCreateRequest(getFinishedNodePath(), execution_status.serializeText(), zkutil::CreateMode::Persistent)); + txn->ops.emplace_back(zkutil::makeSetRequest(database->replica_path + "/log_ptr", toString(getLogEntryNumber(entry_name)), -1)); + return query_context; } @@ -347,15 +336,9 @@ UInt32 DatabaseReplicatedTask::getLogEntryNumber(const String & log_entry_name) return parse(log_entry_name.substr(strlen(name))); } -void 
DatabaseReplicatedTask::parseQueryFromEntry(const Context & context) +void MetadataTransaction::commit() { - if (entry.query.empty()) - { - was_executed = true; - return; - } - - DDLTaskBase::parseQueryFromEntry(context); + current_zookeeper->multi(ops); } } diff --git a/src/Interpreters/DDLTask.h b/src/Interpreters/DDLTask.h index 19d92a1bc78..2db1a696384 100644 --- a/src/Interpreters/DDLTask.h +++ b/src/Interpreters/DDLTask.h @@ -64,12 +64,6 @@ struct DDLTaskBase const String entry_name; const String entry_path; - DDLTaskBase(const String & name, const String & path) : entry_name(name), entry_path(path) {} - virtual ~DDLTaskBase() = default; - - std::optional tryParseEntry(const String & data); - virtual void parseQueryFromEntry(const Context & context); - DDLLogEntry entry; String host_id_str; @@ -81,6 +75,11 @@ struct DDLTaskBase ExecutionStatus execution_status; bool was_executed = false; + DDLTaskBase(const String & name, const String & path) : entry_name(name), entry_path(path) {} + virtual ~DDLTaskBase() = default; + + void parseQueryFromEntry(const Context & context); + virtual String getShardID() const = 0; virtual std::unique_ptr makeQueryContext(Context & from_context) const; @@ -93,26 +92,12 @@ struct DDLTaskBase struct DDLTask : public DDLTaskBase { - /// Stages of task lifetime correspond ordering of these data fields: - DDLTask(const String & name, const String & path) : DDLTaskBase(name, path) {} bool findCurrentHostID(const Context & global_context, Poco::Logger * log); void setClusterInfo(const Context & context, Poco::Logger * log); - - /// Stage 2: resolve host_id and check that - - - /// Stage 3.1: parse query - - /// Stage 3.2: check cluster and find the host in cluster - - /// Stage 3.3: execute query - - /// Stage 4: commit results to ZooKeeper - String getShardID() const override; private: @@ -131,8 +116,6 @@ struct DatabaseReplicatedTask : public DDLTaskBase { DatabaseReplicatedTask(const String & name, const String & path, DatabaseReplicated * database_); - void parseQueryFromEntry(const Context & context) override; - String getShardID() const override; std::unique_ptr makeQueryContext(Context & from_context) const override; @@ -148,14 +131,15 @@ struct MetadataTransaction { ZooKeeperPtr current_zookeeper; String zookeeper_path; + bool is_initial_query; Coordination::Requests ops; - - void addOps(Coordination::Requests & other_ops) { std::move(ops.begin(), ops.end(), std::back_inserter(other_ops)); } + + void commit(); }; } diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index 0399687a4d8..12f4c42b467 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -36,11 +36,8 @@ namespace ErrorCodes { extern const int NOT_IMPLEMENTED; extern const int LOGICAL_ERROR; - extern const int INCONSISTENT_CLUSTER_DEFINITION; extern const int TIMEOUT_EXCEEDED; - extern const int UNKNOWN_TYPE_OF_QUERY; extern const int UNFINISHED; - extern const int QUERY_IS_PROHIBITED; } @@ -226,7 +223,6 @@ void DDLWorker::recoverZooKeeper() } } - DDLTaskPtr DDLWorker::initAndCheckTask(const String & entry_name, String & out_reason, const ZooKeeperPtr & zookeeper) { String node_data; @@ -241,36 +237,50 @@ DDLTaskPtr DDLWorker::initAndCheckTask(const String & entry_name, String & out_r return {}; } - auto error = task->tryParseEntry(node_data); - if (error) + auto write_error_status = [&](const String & host_id, const String & error_message, const String & reason) + { + LOG_ERROR(log, "Cannot parse DDL task {}: {}. 
Will try to send error status: {}", entry_name, reason, error_message); + createStatusDirs(entry_path, zookeeper); + zookeeper->tryCreate(entry_path + "/finished/" + host_id, error_message, zkutil::CreateMode::Persistent); + }; + + try + { + /// Stage 1: parse entry + task->entry.parse(node_data); + } + catch (...) { /// What should we do if we even cannot parse host name and therefore cannot properly submit execution status? /// We can try to create fail node using FQDN if it equal to host name in cluster config attempt will be successful. /// Otherwise, that node will be ignored by DDLQueryStatusInputStream. - LOG_ERROR(log, "Cannot parse DDL task {}, will try to send error status: {}", entry_name, *error); - try - { - createStatusDirs(entry_path, zookeeper); - zookeeper->tryCreate(entry_path + "/finished/" + host_fqdn_id, *error, zkutil::CreateMode::Persistent); - } - catch (...) - { - tryLogCurrentException(log, "Can't report the task has invalid format"); - } - out_reason = "Incorrect task format"; + write_error_status(host_fqdn_id, ExecutionStatus::fromCurrentException().serializeText(), out_reason); return {}; } + /// Stage 2: resolve host_id and check if we should execute query or not if (!task->findCurrentHostID(context, log)) { out_reason = "There is no a local address in host list"; return {}; } - task->parseQueryFromEntry(context); - task->setClusterInfo(context, log); + try + { + /// Stage 3.1: parse query + task->parseQueryFromEntry(context); + /// Stage 3.2: check cluster and find the host in cluster + task->setClusterInfo(context, log); + } + catch (...) + { + out_reason = "Cannot parse query or obtain cluster info"; + write_error_status(task->host_id_str, ExecutionStatus::fromCurrentException().serializeText(), out_reason); + return {}; + } + /// Now task is ready for execution return task; } @@ -330,7 +340,8 @@ void DDLWorker::scheduleTasks() } else { - worker_pool.scheduleOrThrowOnError([this, task_ptr = task.release()]() { + worker_pool.scheduleOrThrowOnError([this, task_ptr = task.release()]() + { setThreadName("DDLWorkerExec"); enqueueTask(DDLTaskPtr(task_ptr)); }); @@ -345,13 +356,6 @@ void DDLWorker::scheduleTasks() } } -/// Parses query and resolves cluster and host in cluster -void DDLWorker::parseQueryAndResolveHost(DDLTaskBase & /*task*/) -{ - -} - - bool DDLWorker::tryExecuteQuery(const String & query, const DDLTaskBase & task, ExecutionStatus & status) { /// Add special comment at the start of query to easily identify DDL-produced queries in query_log @@ -792,7 +796,6 @@ void DDLWorker::runMainThread() setThreadName("DDLWorker"); LOG_DEBUG(log, "Started DDLWorker thread"); - bool initialized = false; do { try diff --git a/src/Interpreters/DDLWorker.h b/src/Interpreters/DDLWorker.h index 39087d05fbb..02076ae1df1 100644 --- a/src/Interpreters/DDLWorker.h +++ b/src/Interpreters/DDLWorker.h @@ -53,6 +53,8 @@ public: void shutdown(); + bool isCurrentlyActive() const { return initialized && !stop_flag; } + protected: /// Returns cached ZooKeeper session (possibly expired). 
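(Aside, not part of the patch: with the `last_tasks` vector replaced by a single `last_entry_name`, scheduleTasks() resumes from the first queue node after the last processed one, as shown earlier in this diff. A minimal standalone sketch of that resume step; the values are made up for illustration and the usual <algorithm>/<optional> headers are assumed:)

    // Sketch only; names mirror the diff, values are placeholders.
    std::optional<String> last_entry_name = "query-0000000005";
    Strings queue_nodes = {"query-0000000006", "query-0000000004", "query-0000000005"};
    std::sort(queue_nodes.begin(), queue_nodes.end());
    auto begin_node = !last_entry_name.has_value()
        ? queue_nodes.begin()   // server startup: start from the beginning of the queue
        : std::upper_bound(queue_nodes.begin(), queue_nodes.end(), *last_entry_name);
    // begin_node now points at "query-0000000006", so only newer entries are scheduled.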
@@ -87,8 +89,6 @@ protected: const String & node_path, const ZooKeeperPtr & zookeeper); - void parseQueryAndResolveHost(DDLTaskBase & task); - bool tryExecuteQuery(const String & query, const DDLTaskBase & task, ExecutionStatus & status); /// Checks and cleanups queue's nodes @@ -121,6 +121,7 @@ protected: std::shared_ptr queue_updated_event = std::make_shared(); std::shared_ptr cleanup_event = std::make_shared(); + std::atomic initialized = false; std::atomic stop_flag = false; ThreadFromGlobalPool main_thread; diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 8d695b29793..f79eb800b66 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -731,7 +731,7 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create) //TODO make code better if possible bool need_add_to_database = !create.temporary; - if(need_add_to_database && database->getEngineName() == "Replicated") + if (need_add_to_database && database->getEngineName() == "Replicated") { auto guard = DatabaseCatalog::instance().getDDLGuard(create.database, create.table); database = DatabaseCatalog::instance().getDatabase(create.database); diff --git a/src/Interpreters/executeDDLQueryOnCluster.cpp b/src/Interpreters/executeDDLQueryOnCluster.cpp index 03065245766..24405a5be27 100644 --- a/src/Interpreters/executeDDLQueryOnCluster.cpp +++ b/src/Interpreters/executeDDLQueryOnCluster.cpp @@ -23,6 +23,7 @@ namespace ErrorCodes extern const int TIMEOUT_EXCEEDED; extern const int UNFINISHED; extern const int QUERY_IS_PROHIBITED; + extern const int LOGICAL_ERROR; } bool isSupportedAlterType(int type) @@ -189,6 +190,7 @@ DDLQueryStatusInputStream::DDLQueryStatusInputStream(const String & zk_node_path if (hosts_to_wait) { waiting_hosts = NameSet(hosts_to_wait->begin(), hosts_to_wait->end()); + by_hostname = false; } else { @@ -267,7 +269,15 @@ Block DDLQueryStatusInputStream::readImpl() status.tryDeserializeText(status_data); } - auto [host, port] = Cluster::Address::fromString(host_id); + //FIXME + String host = host_id; + UInt16 port = 0; + if (by_hostname) + { + auto host_and_port = Cluster::Address::fromString(host_id); + host = host_and_port.first; + port = host_and_port.second; + } if (status.code != 0 && first_exception == nullptr) first_exception = std::make_unique(status.code, "There was an error on [{}:{}]: {}", host, port, status.message); diff --git a/src/Interpreters/executeDDLQueryOnCluster.h b/src/Interpreters/executeDDLQueryOnCluster.h index 0f7a411ed92..f65abf33c4f 100644 --- a/src/Interpreters/executeDDLQueryOnCluster.h +++ b/src/Interpreters/executeDDLQueryOnCluster.h @@ -61,6 +61,7 @@ private: std::unique_ptr first_exception; Int64 timeout_seconds = 120; + bool by_hostname = true; }; } diff --git a/tests/integration/test_replicated_database/test.py b/tests/integration/test_replicated_database/test.py index 8c5a25b3fe7..f99f4517e5a 100644 --- a/tests/integration/test_replicated_database/test.py +++ b/tests/integration/test_replicated_database/test.py @@ -90,6 +90,7 @@ def test_create_replica_after_delay(started_cluster, engine): assert_create_query([main_node, dummy_node, competing_node], name, expected) +@pytest.mark.dependency(depends=['test_create_replica_after_delay']) def test_alters_from_different_replicas(started_cluster): main_node.query("CREATE TABLE testdb.concurrent_test " "(CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) " @@ 
-138,13 +139,13 @@ def test_replica_restart(started_cluster): assert_create_query([main_node, competing_node], "testdb.concurrent_test", expected) -@pytest.mark.dependency(depends=['test_create_replica_after_delay']) +@pytest.mark.dependency(depends=['test_replica_restart']) def test_snapshot_and_snapshot_recover(started_cluster): - #FIXME bad test snapshotting_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'shard1', 'replica4');") - time.sleep(5) snapshot_recovering_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'shard1', 'replica5');") - time.sleep(5) + + assert_eq_with_retry(snapshotting_node, "select count() from system.tables where name like 'alter_test_%'", "2\n") + assert_eq_with_retry(snapshot_recovering_node, "select count() from system.tables where name like 'alter_test_%'", "2\n") assert snapshotting_node.query("desc table testdb.alter_test_MergeTree") == snapshot_recovering_node.query("desc table testdb.alter_test_MergeTree") assert snapshotting_node.query("desc table testdb.alter_test_ReplicatedMergeTree") == snapshot_recovering_node.query("desc table testdb.alter_test_ReplicatedMergeTree") From c955542dce00478321a424e05f0ef777dfcc00e2 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Mon, 30 Nov 2020 23:22:25 +0300 Subject: [PATCH 0066/2357] run functional tests with Replicated engine --- src/Interpreters/InterpreterCreateQuery.cpp | 10 +++++++++- src/Interpreters/executeDDLQueryOnCluster.cpp | 7 ++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index f79eb800b66..0b7fb3e5431 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -132,7 +132,15 @@ BlockIO InterpreterCreateQuery::createDatabase(ASTCreateQuery & create) bool old_style_database = context.getSettingsRef().default_database_engine.value == DefaultDatabaseEngine::Ordinary; auto engine = std::make_shared(); auto storage = std::make_shared(); - engine->name = old_style_database ? 
"Ordinary" : "Atomic"; + //FIXME revert it before merge + engine->name = "Atomic"; + if (old_style_database) + { + engine = makeASTFunction("Replicated", + std::make_shared(fmt::format("/clickhouse/db/{}/", create.database)), + std::make_shared("s1"), + std::make_shared("r1")); + } storage->set(storage->engine, engine); create.set(create.storage, storage); } diff --git a/src/Interpreters/executeDDLQueryOnCluster.cpp b/src/Interpreters/executeDDLQueryOnCluster.cpp index 24405a5be27..0b44206a2b2 100644 --- a/src/Interpreters/executeDDLQueryOnCluster.cpp +++ b/src/Interpreters/executeDDLQueryOnCluster.cpp @@ -294,7 +294,12 @@ Block DDLQueryStatusInputStream::readImpl() res = sample.cloneWithColumns(std::move(columns)); } - return res; + //FIXME revert it before merge + bool is_functional_tests = !by_hostname && context.getSettingsRef().default_database_engine.value == DefaultDatabaseEngine::Ordinary; + if (is_functional_tests) + return {}; + else + return res; } Strings DDLQueryStatusInputStream::getChildrenAllowNoNode(const std::shared_ptr & zookeeper, const String & node_path) From 1a4bd67736df1fdaec41df52bb4ca9d6ea5c4f81 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Tue, 1 Dec 2020 20:20:42 +0300 Subject: [PATCH 0067/2357] fixes --- src/Common/ZooKeeper/TestKeeper.cpp | 8 ++++---- src/Databases/DatabaseReplicated.cpp | 1 + src/Interpreters/Context.cpp | 1 + src/Interpreters/DDLWorker.cpp | 16 +++++++++++++--- src/Interpreters/DDLWorker.h | 1 + src/Interpreters/InterpreterCreateQuery.cpp | 5 ++++- src/Interpreters/executeDDLQueryOnCluster.cpp | 4 ++++ 7 files changed, 28 insertions(+), 8 deletions(-) diff --git a/src/Common/ZooKeeper/TestKeeper.cpp b/src/Common/ZooKeeper/TestKeeper.cpp index 5f34a60c34e..2d89228c7ae 100644 --- a/src/Common/ZooKeeper/TestKeeper.cpp +++ b/src/Common/ZooKeeper/TestKeeper.cpp @@ -213,10 +213,11 @@ std::pair TestKeeperCreateRequest::process(TestKeeper::Contai created_node.is_sequental = is_sequential; std::string path_created = path; + ++it->second.seq_num; + if (is_sequential) { auto seq_num = it->second.seq_num; - ++it->second.seq_num; std::stringstream seq_num_str; // STYLE_CHECK_ALLOW_STD_STRING_STREAM seq_num_str.exceptions(std::ios::failbit); @@ -228,15 +229,14 @@ std::pair TestKeeperCreateRequest::process(TestKeeper::Contai response.path_created = path_created; container.emplace(path_created, std::move(created_node)); - undo = [&container, path_created, is_sequential = is_sequential, parent_path = it->first] + undo = [&container, path_created, parent_path = it->first] { container.erase(path_created); auto & undo_parent = container.at(parent_path); --undo_parent.stat.cversion; --undo_parent.stat.numChildren; - if (is_sequential) - --undo_parent.seq_num; + --undo_parent.seq_num; }; ++it->second.stat.cversion; diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 418eaf567a4..a7e6c11ca4c 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -170,6 +170,7 @@ void DatabaseReplicated::loadStoredObjects(Context & context, bool has_force_res DatabaseAtomic::loadStoredObjects(context, has_force_restore_data_flag, force_attach); ddl_worker = std::make_unique(this, global_context); + ddl_worker->startup(); } void DatabaseReplicated::onUnexpectedLogEntry(const String & entry_name, const ZooKeeperPtr & zookeeper) diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index 27deb07d296..ef19c134854 100644 --- a/src/Interpreters/Context.cpp +++ 
b/src/Interpreters/Context.cpp @@ -1487,6 +1487,7 @@ void Context::setDDLWorker(std::unique_ptr ddl_worker) auto lock = getLock(); if (shared->ddl_worker) throw Exception("DDL background thread has already been initialized", ErrorCodes::LOGICAL_ERROR); + ddl_worker->startup(); shared->ddl_worker = std::move(ddl_worker); } diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index 12f4c42b467..188d38b8647 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -167,7 +167,10 @@ DDLWorker::DDLWorker(int pool_size_, const std::string & zk_root_dir, const Cont host_fqdn = getFQDNOrHostName(); host_fqdn_id = Cluster::Address::toString(host_fqdn, context.getTCPPort()); +} +void DDLWorker::startup() +{ main_thread = ThreadFromGlobalPool(&DDLWorker::runMainThread, this); cleanup_thread = ThreadFromGlobalPool(&DDLWorker::runCleanupThread, this); } @@ -183,8 +186,10 @@ DDLWorker::~DDLWorker() { shutdown(); worker_pool.wait(); - main_thread.join(); - cleanup_thread.join(); + if (main_thread.joinable()) + main_thread.join(); + if (cleanup_thread.joinable()) + cleanup_thread.join(); } @@ -421,7 +426,12 @@ void DDLWorker::enqueueTask(DDLTaskPtr task_ptr) else if (e.code == Coordination::Error::ZNONODE) { LOG_ERROR(log, "ZooKeeper error: {}", getCurrentExceptionMessage(true)); - // TODO: retry? + if (!current_zookeeper->exists(task_ptr->entry_path)) + { + //FIXME race condition with cleanup thread + LOG_ERROR(log, "Task {} is lost. It probably was removed by other server.", task_ptr->entry_path); + return; + } } else { diff --git a/src/Interpreters/DDLWorker.h b/src/Interpreters/DDLWorker.h index 02076ae1df1..f41ca0fce8f 100644 --- a/src/Interpreters/DDLWorker.h +++ b/src/Interpreters/DDLWorker.h @@ -51,6 +51,7 @@ public: return host_fqdn_id; } + void startup(); void shutdown(); bool isCurrentlyActive() const { return initialized && !stop_flag; } diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 0b7fb3e5431..f201e38be2e 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -136,7 +136,10 @@ BlockIO InterpreterCreateQuery::createDatabase(ASTCreateQuery & create) engine->name = "Atomic"; if (old_style_database) { - engine = makeASTFunction("Replicated", + if (database_name == "test") + engine->name = "Ordinary"; // for stateful tests + else + engine = makeASTFunction("Replicated", std::make_shared(fmt::format("/clickhouse/db/{}/", create.database)), std::make_shared("s1"), std::make_shared("r1")); diff --git a/src/Interpreters/executeDDLQueryOnCluster.cpp b/src/Interpreters/executeDDLQueryOnCluster.cpp index 0b44206a2b2..2ca07349cbc 100644 --- a/src/Interpreters/executeDDLQueryOnCluster.cpp +++ b/src/Interpreters/executeDDLQueryOnCluster.cpp @@ -201,6 +201,10 @@ DDLQueryStatusInputStream::DDLQueryStatusInputStream(const String & zk_node_path addTotalRowsApprox(waiting_hosts.size()); timeout_seconds = context.getSettingsRef().distributed_ddl_task_timeout; + + //FIXME revert it before merge + if (context.getSettingsRef().default_database_engine.value == DefaultDatabaseEngine::Ordinary) + timeout_seconds = 10; } Block DDLQueryStatusInputStream::readImpl() From 39532f7d9e47204a499ffa9200b91eaae9763aae Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Thu, 3 Dec 2020 21:14:27 +0300 Subject: [PATCH 0068/2357] slightly better DDLWorker initialization and restarting --- src/Common/ZooKeeper/TestKeeper.cpp | 4 +- src/Databases/DatabaseAtomic.cpp 
| 3 - src/Databases/DatabaseReplicatedWorker.cpp | 32 +++- src/Databases/DatabaseReplicatedWorker.h | 3 +- src/Interpreters/DDLTask.h | 2 + src/Interpreters/DDLWorker.cpp | 187 ++++++++------------- src/Interpreters/DDLWorker.h | 15 +- 7 files changed, 114 insertions(+), 132 deletions(-) diff --git a/src/Common/ZooKeeper/TestKeeper.cpp b/src/Common/ZooKeeper/TestKeeper.cpp index 2d89228c7ae..86387417a3c 100644 --- a/src/Common/ZooKeeper/TestKeeper.cpp +++ b/src/Common/ZooKeeper/TestKeeper.cpp @@ -213,8 +213,6 @@ std::pair TestKeeperCreateRequest::process(TestKeeper::Contai created_node.is_sequental = is_sequential; std::string path_created = path; - ++it->second.seq_num; - if (is_sequential) { auto seq_num = it->second.seq_num; @@ -226,6 +224,8 @@ std::pair TestKeeperCreateRequest::process(TestKeeper::Contai path_created += seq_num_str.str(); } + ++it->second.seq_num; + response.path_created = path_created; container.emplace(path_created, std::move(created_node)); diff --git a/src/Databases/DatabaseAtomic.cpp b/src/Databases/DatabaseAtomic.cpp index b60adf44e51..438fa2d97bd 100644 --- a/src/Databases/DatabaseAtomic.cpp +++ b/src/Databases/DatabaseAtomic.cpp @@ -11,10 +11,7 @@ #include #include #include - -//FIXME it shouldn't be here #include -#include namespace DB { diff --git a/src/Databases/DatabaseReplicatedWorker.cpp b/src/Databases/DatabaseReplicatedWorker.cpp index 29599d4d66d..0c2368cdcf6 100644 --- a/src/Databases/DatabaseReplicatedWorker.cpp +++ b/src/Databases/DatabaseReplicatedWorker.cpp @@ -17,7 +17,26 @@ DatabaseReplicatedDDLWorker::DatabaseReplicatedDDLWorker(DatabaseReplicated * db /// Pool size must be 1 (to avoid reordering of log entries) } -void DatabaseReplicatedDDLWorker::initialize() +void DatabaseReplicatedDDLWorker::initializeMainThread() +{ + do + { + try + { + auto zookeeper = getAndSetZooKeeper(); + initializeReplication(); + initialized = true; + } + catch (...) + { + tryLogCurrentException(log, fmt::format("Error on initialization of {}", database->getDatabaseName())); + sleepForSeconds(5); + } + } + while (!initialized && !stop_flag); +} + +void DatabaseReplicatedDDLWorker::initializeReplication() { /// Check if we need to recover replica. /// Invariant: replica is lost if it's log_ptr value is less then min_log_ptr value. 
@@ -101,11 +120,16 @@ DDLTaskPtr DatabaseReplicatedDDLWorker::initAndCheckTask(const String & entry_na if (task->entry.query.empty()) { //TODO better way to determine special entries - task->was_executed = true; + out_reason = "It's dummy task"; + return {}; } - else + + task->parseQueryFromEntry(context); + + if (zookeeper->exists(task->getFinishedNodePath())) { - task->parseQueryFromEntry(context); + out_reason = "Task has been already processed"; + return {}; } return task; diff --git a/src/Databases/DatabaseReplicatedWorker.h b/src/Databases/DatabaseReplicatedWorker.h index d190bd1795d..7994104331e 100644 --- a/src/Databases/DatabaseReplicatedWorker.h +++ b/src/Databases/DatabaseReplicatedWorker.h @@ -15,7 +15,8 @@ public: String enqueueQuery(DDLLogEntry & entry) override; private: - void initialize() override; + void initializeMainThread() override; + void initializeReplication(); DDLTaskPtr initAndCheckTask(const String & entry_name, String & out_reason, const ZooKeeperPtr & zookeeper) override; diff --git a/src/Interpreters/DDLTask.h b/src/Interpreters/DDLTask.h index 2db1a696384..94127b39b84 100644 --- a/src/Interpreters/DDLTask.h +++ b/src/Interpreters/DDLTask.h @@ -76,6 +76,8 @@ struct DDLTaskBase bool was_executed = false; DDLTaskBase(const String & name, const String & path) : entry_name(name), entry_path(path) {} + DDLTaskBase(const DDLTaskBase &) = delete; + DDLTaskBase(DDLTaskBase &&) = default; virtual ~DDLTaskBase() = default; void parseQueryFromEntry(const Context & context); diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index 188d38b8647..e4ea5f8db17 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -143,9 +143,14 @@ DDLWorker::DDLWorker(int pool_size_, const std::string & zk_root_dir, const Cont const String & logger_name) : context(context_) , log(&Poco::Logger::get(logger_name)) - , pool_size(pool_size_) //FIXME make it optional - , worker_pool(pool_size_) + , pool_size(pool_size_) { + if (1 < pool_size) + { + LOG_WARNING(log, "DDLWorker is configured to use multiple threads. " + "It's not recommended because queries can be reordered. Also it may cause some unknown issues to appear."); + worker_pool.emplace(pool_size); + } queue_dir = zk_root_dir; if (queue_dir.back() == '/') queue_dir.resize(queue_dir.size() - 1); @@ -185,7 +190,8 @@ void DDLWorker::shutdown() DDLWorker::~DDLWorker() { shutdown(); - worker_pool.wait(); + if (worker_pool) + worker_pool->wait(); if (main_thread.joinable()) main_thread.join(); if (cleanup_thread.joinable()) @@ -209,24 +215,6 @@ ZooKeeperPtr DDLWorker::getAndSetZooKeeper() return current_zookeeper; } -void DDLWorker::recoverZooKeeper() -{ - LOG_DEBUG(log, "Recovering ZooKeeper session after: {}", getCurrentExceptionMessage(false)); - - while (!stop_flag) - { - try - { - getAndSetZooKeeper(); - break; - } - catch (...) 
- { - tryLogCurrentException(__PRETTY_FUNCTION__); - sleepForSeconds(5); - } - } -} DDLTaskPtr DDLWorker::initAndCheckTask(const String & entry_name, String & out_reason, const ZooKeeperPtr & zookeeper) { @@ -285,6 +273,12 @@ DDLTaskPtr DDLWorker::initAndCheckTask(const String & entry_name, String & out_r return {}; } + if (zookeeper->exists(task->getFinishedNodePath())) + { + out_reason = "Task has been already processed"; + return {}; + } + /// Now task is ready for execution return task; } @@ -309,11 +303,11 @@ void DDLWorker::scheduleTasks() return; } - bool server_startup = !last_entry_name.has_value(); + bool server_startup = current_tasks.empty(); auto begin_node = server_startup ? queue_nodes.begin() - : std::upper_bound(queue_nodes.begin(), queue_nodes.end(), *last_entry_name); + : std::upper_bound(queue_nodes.begin(), queue_nodes.end(), current_tasks.back()->entry_name); for (auto it = begin_node; it != queue_nodes.end() && !stop_flag; ++it) { @@ -325,42 +319,39 @@ void DDLWorker::scheduleTasks() if (!task) { LOG_DEBUG(log, "Will not execute task {}: {}", entry_name, reason); - last_entry_name = entry_name; + task->was_executed = true; + saveTask(std::move(task)); //FIXME questionable continue; } - bool already_processed = zookeeper->exists(task->entry_path + "/finished/" + task->host_id_str); - if (!server_startup && !task->was_executed && already_processed) - { - throw Exception(ErrorCodes::LOGICAL_ERROR, - "Server expects that DDL task {} should be processed, but it was already processed according to ZK", - entry_name); - } + auto & saved_task = saveTask(std::move(task)); - if (!already_processed) + if (worker_pool) { - if (pool_size == 1) + worker_pool->scheduleOrThrowOnError([this, &saved_task]() { - enqueueTask(DDLTaskPtr(task.release())); - } - else - { - worker_pool.scheduleOrThrowOnError([this, task_ptr = task.release()]() - { - setThreadName("DDLWorkerExec"); - enqueueTask(DDLTaskPtr(task_ptr)); - }); - } + setThreadName("DDLWorkerExec"); + processTask(saved_task); + }); } else { - LOG_DEBUG(log, "Task {} ({}) has been already processed", entry_name, task->entry.query); + processTask(saved_task); } - - last_entry_name = entry_name; } } +DDLTaskBase & DDLWorker::saveTask(DDLTaskPtr && task) +{ + if (current_tasks.size() == pool_size) + { + assert(current_tasks.front()->was_executed); + current_tasks.pop_front(); + } + current_tasks.emplace_back(std::move(task)); + return *current_tasks.back(); +} + bool DDLWorker::tryExecuteQuery(const String & query, const DDLTaskBase & task, ExecutionStatus & status) { /// Add special comment at the start of query to easily identify DDL-produced queries in query_log @@ -404,48 +395,6 @@ void DDLWorker::attachToThreadGroup() } } - -void DDLWorker::enqueueTask(DDLTaskPtr task_ptr) -{ - auto & task = *task_ptr; - - while (!stop_flag) - { - try - { - processTask(task); - return; - } - /// TODO recover zk in runMainThread(...) and retry task (why do we need another place where session is recovered?) - catch (const Coordination::Exception & e) - { - if (Coordination::isHardwareError(e.code)) - { - recoverZooKeeper(); - } - else if (e.code == Coordination::Error::ZNONODE) - { - LOG_ERROR(log, "ZooKeeper error: {}", getCurrentExceptionMessage(true)); - if (!current_zookeeper->exists(task_ptr->entry_path)) - { - //FIXME race condition with cleanup thread - LOG_ERROR(log, "Task {} is lost. 
It probably was removed by other server.", task_ptr->entry_path); - return; - } - } - else - { - LOG_ERROR(log, "Unexpected ZooKeeper error: {}.", getCurrentExceptionMessage(true)); - return; - } - } - catch (...) - { - LOG_WARNING(log, "An error occurred while processing task {} ({}) : {}", task.entry_name, task.entry.query, getCurrentExceptionMessage(true)); - } - } -} - void DDLWorker::processTask(DDLTaskBase & task) { auto zookeeper = tryGetZooKeeper(); @@ -458,22 +407,16 @@ void DDLWorker::processTask(DDLTaskBase & task) String dummy; auto code = zookeeper->tryCreate(active_node_path, "", zkutil::CreateMode::Ephemeral, dummy); - if (code == Coordination::Error::ZOK || code == Coordination::Error::ZNODEEXISTS) - { - // Ok - } - else if (code == Coordination::Error::ZNONODE) + if (code == Coordination::Error::ZNONODE) { /// There is no parent - //TODO why not to create parent before active_node? createStatusDirs(task.entry_path, zookeeper); - if (Coordination::Error::ZOK != zookeeper->tryCreate(active_node_path, "", zkutil::CreateMode::Ephemeral, dummy)) - throw Coordination::Exception(code, active_node_path); + zookeeper->create(active_node_path, "", zkutil::CreateMode::Ephemeral); } else throw Coordination::Exception(code, active_node_path); - if (!task.was_executed) + if (!task.was_executed) // FIXME always true { try { @@ -513,6 +456,9 @@ void DDLWorker::processTask(DDLTaskBase & task) } /// FIXME: if server fails right here, the task will be executed twice. We need WAL here. + /// Another possible issue: if ZooKeeper session is lost here, we will recover connection and execute the task second time. + + /// Delete active flag and create finish flag Coordination::Requests ops; @@ -787,7 +733,9 @@ String DDLWorker::enqueueQuery(DDLLogEntry & entry) String node_path = zookeeper->create(query_path_prefix, entry.toString(), zkutil::CreateMode::PersistentSequential); - /// Optional step + /// We cannot create status dirs in a single transaction with previous request, + /// because we don't know node_path until previous request is executed. + /// Se we try to create status dirs here or later when we will execute entry. try { createStatusDirs(node_path, zookeeper); @@ -801,70 +749,80 @@ String DDLWorker::enqueueQuery(DDLLogEntry & entry) } -void DDLWorker::runMainThread() +void DDLWorker::initializeMainThread() { - setThreadName("DDLWorker"); - LOG_DEBUG(log, "Started DDLWorker thread"); - do { try { auto zookeeper = getAndSetZooKeeper(); zookeeper->createAncestors(queue_dir + "/"); - initialize(); initialized = true; } catch (const Coordination::Exception & e) { if (!Coordination::isHardwareError(e.code)) - throw; /// A logical error. + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected ZooKeeper error: {}", e.message()); tryLogCurrentException(__PRETTY_FUNCTION__); /// Avoid busy loop when ZooKeeper is not available. - sleepForSeconds(1); + sleepForSeconds(5); } catch (...) { - tryLogCurrentException(log, "Terminating. 
Cannot initialize DDL queue."); - return; + tryLogCurrentException(log, "Cannot initialize main thread of DDLWorker, will try again"); + sleepForSeconds(5); } } while (!initialized && !stop_flag); +} + +void DDLWorker::runMainThread() +{ + setThreadName("DDLWorker"); + attachToThreadGroup(); + LOG_DEBUG(log, "Starting DDLWorker thread"); while (!stop_flag) { try { - attachToThreadGroup(); + /// Reinitialize DDLWorker state (including ZooKeeper connection) if required + if (!initialized) + { + initializeMainThread(); + LOG_DEBUG(log, "Initialized DDLWorker thread"); + } cleanup_event->set(); scheduleTasks(); - LOG_DEBUG(log, "Waiting a watch"); + LOG_DEBUG(log, "Waiting for queue updates"); queue_updated_event->wait(); } catch (const Coordination::Exception & e) { if (Coordination::isHardwareError(e.code)) { - recoverZooKeeper(); + initialized = false; } else if (e.code == Coordination::Error::ZNONODE) { + // TODO add comment: when it happens and why it's expected? + // maybe because cleanup thread may remove nodes inside queue entry which are currently processed LOG_ERROR(log, "ZooKeeper error: {}", getCurrentExceptionMessage(true)); } else { - LOG_ERROR(log, "Unexpected ZooKeeper error: {}. Terminating.", getCurrentExceptionMessage(true)); - return; + LOG_ERROR(log, "Unexpected ZooKeeper error: {}.", getCurrentExceptionMessage(true)); + assert(false); } } catch (...) { - tryLogCurrentException(log, "Unexpected error, will terminate:"); - return; + tryLogCurrentException(log, "Unexpected error, will try to restart main thread:"); + initialized = false; } } } @@ -891,6 +849,7 @@ void DDLWorker::runCleanupThread() continue; } + /// ZooKeeper connection is recovered by main thread. We will wait for it on cleanup_event. auto zookeeper = tryGetZooKeeper(); if (zookeeper->expired()) continue; diff --git a/src/Interpreters/DDLWorker.h b/src/Interpreters/DDLWorker.h index f41ca0fce8f..78921fa60e3 100644 --- a/src/Interpreters/DDLWorker.h +++ b/src/Interpreters/DDLWorker.h @@ -62,17 +62,16 @@ protected: ZooKeeperPtr tryGetZooKeeper() const; /// If necessary, creates a new session and caches it. ZooKeeperPtr getAndSetZooKeeper(); - /// ZooKeeper recover loop (while not stopped). - void recoverZooKeeper(); - void checkCurrentTasks(); + /// Iterates through queue tasks in ZooKeeper, runs execution of new tasks void scheduleTasks(); + DDLTaskBase & saveTask(DDLTaskPtr && task); + /// Reads entry and check that the host belongs to host list of the task /// Returns non-empty DDLTaskPtr if entry parsed and the check is passed virtual DDLTaskPtr initAndCheckTask(const String & entry_name, String & out_reason, const ZooKeeperPtr & zookeeper); - void enqueueTask(DDLTaskPtr task); void processTask(DDLTaskBase & task); /// Check that query should be executed on leader replica only @@ -98,7 +97,7 @@ protected: /// Init task node static void createStatusDirs(const std::string & node_path, const ZooKeeperPtr & zookeeper); - virtual void initialize() {} + virtual void initializeMainThread(); void runMainThread(); void runCleanupThread(); @@ -117,8 +116,8 @@ protected: ZooKeeperPtr current_zookeeper; /// Save state of executed task to avoid duplicate execution on ZK error - //std::vector last_tasks; - std::optional last_entry_name; + //std::optional last_entry_name; + std::list current_tasks; std::shared_ptr queue_updated_event = std::make_shared(); std::shared_ptr cleanup_event = std::make_shared(); @@ -130,7 +129,7 @@ protected: /// Size of the pool for query execution. 
size_t pool_size = 1; - ThreadPool worker_pool; + std::optional worker_pool; /// Cleaning starts after new node event is received if the last cleaning wasn't made sooner than N seconds ago Int64 cleanup_delay_period = 60; // minute (in seconds) From 9f3c77f62e281fbb6c14e23ec81bde5e7000f416 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Fri, 4 Dec 2020 23:12:32 +0300 Subject: [PATCH 0069/2357] add zk ops into task --- src/Common/ZooKeeper/ZooKeeper.h | 8 ++ src/Interpreters/DDLTask.cpp | 18 ++-- src/Interpreters/DDLTask.h | 18 +++- src/Interpreters/DDLWorker.cpp | 172 ++++++++++++++++++++++--------- src/Interpreters/DDLWorker.h | 2 +- 5 files changed, 160 insertions(+), 58 deletions(-) diff --git a/src/Common/ZooKeeper/ZooKeeper.h b/src/Common/ZooKeeper/ZooKeeper.h index 1ad744102c6..e79553ed4d9 100644 --- a/src/Common/ZooKeeper/ZooKeeper.h +++ b/src/Common/ZooKeeper/ZooKeeper.h @@ -314,8 +314,15 @@ public: return std::make_shared(path, zookeeper, false, false, ""); } + void reset() + { + need_remove = false; + } + ~EphemeralNodeHolder() { + if (!need_remove) + return; try { zookeeper.tryRemove(path); @@ -331,6 +338,7 @@ private: std::string path; ZooKeeper & zookeeper; CurrentMetrics::Increment metric_increment{CurrentMetrics::EphemeralNode}; + bool need_remove = true; }; using EphemeralNodeHolderPtr = EphemeralNodeHolder::Ptr; diff --git a/src/Interpreters/DDLTask.cpp b/src/Interpreters/DDLTask.cpp index 9ef7352ceb4..3d9297880c1 100644 --- a/src/Interpreters/DDLTask.cpp +++ b/src/Interpreters/DDLTask.cpp @@ -96,7 +96,7 @@ void DDLTaskBase::parseQueryFromEntry(const Context & context) query = parseQuery(parser_query, begin, end, description, 0, context.getSettingsRef().max_parser_depth); } -std::unique_ptr DDLTaskBase::makeQueryContext(Context & from_context) const +std::unique_ptr DDLTaskBase::makeQueryContext(Context & from_context) { auto query_context = std::make_unique(from_context); query_context->makeQueryContext(); @@ -293,7 +293,7 @@ String DatabaseReplicatedTask::getShardID() const return database->shard_name; } -std::unique_ptr DatabaseReplicatedTask::makeQueryContext(Context & from_context) const +std::unique_ptr DatabaseReplicatedTask::makeQueryContext(Context & from_context) { auto query_context = DDLTaskBase::makeQueryContext(from_context); query_context->getClientInfo().query_kind = ClientInfo::QueryKind::REPLICATED_LOG_QUERY; //FIXME why do we need separate query kind? 
@@ -309,15 +309,18 @@ std::unique_ptr DatabaseReplicatedTask::makeQueryContext(Context & from { txn->ops.emplace_back(zkutil::makeRemoveRequest(entry_path + "/try", -1)); txn->ops.emplace_back(zkutil::makeCreateRequest(entry_path + "/committed", host_id_str, zkutil::CreateMode::Persistent)); - txn->ops.emplace_back(zkutil::makeRemoveRequest(getActiveNodePath(), -1)); + //txn->ops.emplace_back(zkutil::makeRemoveRequest(getActiveNodePath(), -1)); txn->ops.emplace_back(zkutil::makeSetRequest(database->zookeeper_path + "/max_log_ptr", toString(getLogEntryNumber(entry_name)), -1)); } - if (execute_on_leader) - txn->ops.emplace_back(zkutil::makeCreateRequest(getShardNodePath() + "/executed", host_id_str, zkutil::CreateMode::Persistent)); - txn->ops.emplace_back(zkutil::makeCreateRequest(getFinishedNodePath(), execution_status.serializeText(), zkutil::CreateMode::Persistent)); + //if (execute_on_leader) + // txn->ops.emplace_back(zkutil::makeCreateRequest(getShardNodePath() + "/executed", host_id_str, zkutil::CreateMode::Persistent)); + //txn->ops.emplace_back(zkutil::makeCreateRequest(getFinishedNodePath(), execution_status.serializeText(), zkutil::CreateMode::Persistent)); txn->ops.emplace_back(zkutil::makeSetRequest(database->replica_path + "/log_ptr", toString(getLogEntryNumber(entry_name)), -1)); + std::move(ops.begin(), ops.end(), std::back_inserter(txn->ops)); + ops.clear(); + return query_context; } @@ -338,7 +341,10 @@ UInt32 DatabaseReplicatedTask::getLogEntryNumber(const String & log_entry_name) void MetadataTransaction::commit() { + assert(state == CREATED); + state = FAILED; current_zookeeper->multi(ops); + state = COMMITED; } } diff --git a/src/Interpreters/DDLTask.h b/src/Interpreters/DDLTask.h index 94127b39b84..aa234d1bfdd 100644 --- a/src/Interpreters/DDLTask.h +++ b/src/Interpreters/DDLTask.h @@ -15,6 +15,9 @@ class ASTQueryWithOnCluster; using ZooKeeperPtr = std::shared_ptr; class DatabaseReplicated; +struct MetadataTransaction; +using MetadataTransactionPtr = std::shared_ptr; + struct HostID { String host_name; @@ -72,6 +75,8 @@ struct DDLTaskBase bool is_circular_replicated = false; bool execute_on_leader = false; + //MetadataTransactionPtr txn; + Coordination::Requests ops; ExecutionStatus execution_status; bool was_executed = false; @@ -84,7 +89,7 @@ struct DDLTaskBase virtual String getShardID() const = 0; - virtual std::unique_ptr makeQueryContext(Context & from_context) const; + virtual std::unique_ptr makeQueryContext(Context & from_context); inline String getActiveNodePath() const { return entry_path + "/active/" + host_id_str; } inline String getFinishedNodePath() const { return entry_path + "/finished/" + host_id_str; } @@ -119,7 +124,7 @@ struct DatabaseReplicatedTask : public DDLTaskBase DatabaseReplicatedTask(const String & name, const String & path, DatabaseReplicated * database_); String getShardID() const override; - std::unique_ptr makeQueryContext(Context & from_context) const override; + std::unique_ptr makeQueryContext(Context & from_context) override; static String getLogEntryName(UInt32 log_entry_number); static UInt32 getLogEntryNumber(const String & log_entry_name); @@ -131,6 +136,14 @@ struct DatabaseReplicatedTask : public DDLTaskBase struct MetadataTransaction { + enum State + { + CREATED, + COMMITED, + FAILED + }; + + State state = CREATED; ZooKeeperPtr current_zookeeper; String zookeeper_path; bool is_initial_query; @@ -142,6 +155,7 @@ struct MetadataTransaction } void commit(); + }; } diff --git a/src/Interpreters/DDLWorker.cpp 
b/src/Interpreters/DDLWorker.cpp index e4ea5f8db17..a3262c238fc 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -38,6 +38,11 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; extern const int TIMEOUT_EXCEEDED; extern const int UNFINISHED; + extern const int NOT_A_LEADER; + extern const int KEEPER_EXCEPTION; + extern const int CANNOT_ASSIGN_ALTER; + extern const int CANNOT_ALLOCATE_MEMORY; + extern const int MEMORY_LIMIT_EXCEEDED; } @@ -295,6 +300,19 @@ void DDLWorker::scheduleTasks() LOG_DEBUG(log, "Scheduling tasks"); auto zookeeper = tryGetZooKeeper(); + for (auto & task : current_tasks) + { + /// Main thread of DDLWorker was restarted, probably due to lost connection with ZooKeeper. + /// We have some unfinished tasks. To avoid duplication of some queries, try to write execution status. + bool status_written = task->ops.empty(); + bool task_still_exists = zookeeper->exists(task->entry_path); + if (task->was_executed && !status_written && task_still_exists) + { + assert(!zookeeper->exists(task->getFinishedNodePath())); + processTask(*task); + } + } + Strings queue_nodes = zookeeper->getChildren(queue_dir, nullptr, queue_updated_event); filterAndSortQueueNodes(queue_nodes); if (queue_nodes.empty()) @@ -304,10 +322,16 @@ void DDLWorker::scheduleTasks() } bool server_startup = current_tasks.empty(); + auto begin_node = queue_nodes.begin(); - auto begin_node = server_startup - ? queue_nodes.begin() - : std::upper_bound(queue_nodes.begin(), queue_nodes.end(), current_tasks.back()->entry_name); + if (!server_startup) + { + /// We will recheck status of last executed tasks. It's useful if main thread was just restarted. + auto & min_task = *std::min_element(current_tasks.begin(), current_tasks.end()); + begin_node = std::upper_bound(queue_nodes.begin(), queue_nodes.end(), min_task->entry_name); + current_tasks.clear(); + //FIXME better way of maintaning current tasks list and min_task name; + } for (auto it = begin_node; it != queue_nodes.end() && !stop_flag; ++it) { @@ -319,8 +343,8 @@ void DDLWorker::scheduleTasks() if (!task) { LOG_DEBUG(log, "Will not execute task {}: {}", entry_name, reason); - task->was_executed = true; - saveTask(std::move(task)); //FIXME questionable + //task->was_executed = true; + //saveTask(std::move(task)); continue; } @@ -343,16 +367,17 @@ void DDLWorker::scheduleTasks() DDLTaskBase & DDLWorker::saveTask(DDLTaskPtr && task) { - if (current_tasks.size() == pool_size) - { - assert(current_tasks.front()->was_executed); - current_tasks.pop_front(); - } + //assert(current_tasks.size() <= pool_size + 1); + //if (current_tasks.size() == pool_size) + //{ + // assert(current_tasks.front()->ops.empty()); //FIXME + // current_tasks.pop_front(); + //} current_tasks.emplace_back(std::move(task)); return *current_tasks.back(); } -bool DDLWorker::tryExecuteQuery(const String & query, const DDLTaskBase & task, ExecutionStatus & status) +bool DDLWorker::tryExecuteQuery(const String & query, DDLTaskBase & task) { /// Add special comment at the start of query to easily identify DDL-produced queries in query_log String query_prefix = "/* ddl_entry=" + task.entry_name + " */ "; @@ -367,15 +392,34 @@ bool DDLWorker::tryExecuteQuery(const String & query, const DDLTaskBase & task, auto query_context = task.makeQueryContext(context); executeQuery(istr, ostr, false, *query_context, {}); } - catch (...) 
+ catch (const DB::Exception & e) { - status = ExecutionStatus::fromCurrentException(); + task.execution_status = ExecutionStatus::fromCurrentException(); tryLogCurrentException(log, "Query " + query + " wasn't finished successfully"); + /// We use return value of tryExecuteQuery(...) in tryExecuteQueryOnLeaderReplica(...) to determine + /// if replica has stopped being leader and we should retry query. + /// However, for the majority of exceptions there is no sense to retry, because most likely we will just + /// get the same exception again. So we return false only for several special exception codes, + /// and consider query as executed with status "failed" and return true in other cases. + bool no_sense_to_retry = e.code() != ErrorCodes::KEEPER_EXCEPTION && + e.code() != ErrorCodes::NOT_A_LEADER && + e.code() != ErrorCodes::CANNOT_ASSIGN_ALTER && + e.code() != ErrorCodes::CANNOT_ALLOCATE_MEMORY && + e.code() != ErrorCodes::MEMORY_LIMIT_EXCEEDED; + return no_sense_to_retry; + } + catch (...) + { + task.execution_status = ExecutionStatus::fromCurrentException(); + tryLogCurrentException(log, "Query " + query + " wasn't finished successfully"); + + /// We don't know what exactly happened, but maybe it's Poco::NetException or std::bad_alloc, + /// so we consider unknown exception as retryable error. return false; } - status = ExecutionStatus(0); + task.execution_status = ExecutionStatus(0); LOG_DEBUG(log, "Executed query: {}", query); return true; @@ -405,19 +449,18 @@ void DDLWorker::processTask(DDLTaskBase & task) String finished_node_path = task.getFinishedNodePath(); String dummy; - auto code = zookeeper->tryCreate(active_node_path, "", zkutil::CreateMode::Ephemeral, dummy); + zookeeper->createAncestors(active_node_path); + auto active_node = zkutil::EphemeralNodeHolder::create(active_node_path, *zookeeper, ""); - if (code == Coordination::Error::ZNONODE) + if (!task.was_executed) { - /// There is no parent - createStatusDirs(task.entry_path, zookeeper); - zookeeper->create(active_node_path, "", zkutil::CreateMode::Ephemeral); - } - else - throw Coordination::Exception(code, active_node_path); + /// If table and database engine supports it, they will execute task.ops by their own in a single transaction + /// with other zk operations (such as appending something to ReplicatedMergeTree log, or + /// updating metadata in Replicated database), so we make create request for finished_node_path with status "0", + /// which means that query executed successfully. 
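+ /// If the engine does commit task.ops itself (DatabaseReplicated moves them into its MetadataTransaction and
+ /// applies them together with its own requests in one multi call), task.ops is left empty afterwards,
+ /// so the code at the end of processTask() sees status_written == true and does not write the status again.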
+ task.ops.emplace_back(zkutil::makeRemoveRequest(active_node_path, -1)); + task.ops.emplace_back(zkutil::makeCreateRequest(finished_node_path, "0", zkutil::CreateMode::Persistent)); - if (!task.was_executed) // FIXME always true - { try { String rewritten_query = queryToString(task.query); @@ -439,7 +482,7 @@ void DDLWorker::processTask(DDLTaskBase & task) if (task.execute_on_leader) tryExecuteQueryOnLeaderReplica(task, storage, rewritten_query, task.entry_path, zookeeper); else - tryExecuteQuery(rewritten_query, task, task.execution_status); + tryExecuteQuery(rewritten_query, task); } catch (const Coordination::Exception &) { @@ -451,25 +494,35 @@ void DDLWorker::processTask(DDLTaskBase & task) task.execution_status = ExecutionStatus::fromCurrentException("An error occurred before execution"); } + if (task.execution_status.code != 0) + { + bool status_written_by_table_or_db = task.ops.empty(); + if (status_written_by_table_or_db) + { + throw Exception(ErrorCodes::UNFINISHED, "Unexpected error: {}", task.execution_status.serializeText()); + } + else + { + /// task.ops where not executed by table or database engine, se DDLWorker is responsible for + /// writing query execution status into ZooKeeper. + task.ops.emplace_back(zkutil::makeSetRequest(finished_node_path, task.execution_status.serializeText(), -1)); + } + } + /// We need to distinguish ZK errors occurred before and after query executing task.was_executed = true; } /// FIXME: if server fails right here, the task will be executed twice. We need WAL here. - /// Another possible issue: if ZooKeeper session is lost here, we will recover connection and execute the task second time. + /// If ZooKeeper connection is lost here, we will try again to write query status. - - - /// Delete active flag and create finish flag - Coordination::Requests ops; - ops.emplace_back(zkutil::makeRemoveRequest(active_node_path, -1)); - ops.emplace_back(zkutil::makeCreateRequest(finished_node_path, task.execution_status.serializeText(), zkutil::CreateMode::Persistent)); - - //FIXME replace with multi(...) or use MetadataTransaction - Coordination::Responses responses; - auto res = zookeeper->tryMulti(ops, responses); - if (res != Coordination::Error::ZNODEEXISTS && res != Coordination::Error::ZNONODE) - zkutil::KeeperMultiException::check(res, ops, responses); + bool status_written = task.ops.empty(); + if (!status_written) + { + zookeeper->multi(task.ops); + active_node->reset(); + task.ops.clear(); + } } @@ -496,13 +549,17 @@ bool DDLWorker::tryExecuteQueryOnLeaderReplica( /// If we will develop new replicated storage if (!replicated_storage) - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Storage type '{}' is not supported by distributed DDL", storage->getName()); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Storage type '{}' is not supported by distributed DDL", storage->getName()); String shard_path = task.getShardNodePath(); String is_executed_path = shard_path + "/executed"; String tries_to_execute_path = shard_path + "/tries_to_execute"; zookeeper->createAncestors(shard_path + "/"); + /// Leader replica creates is_executed_path node on successful query execution. + /// We will remove create_shard_flag from zk operations list, if current replica is just waiting for leader to execute the query. 
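+ /// (The flag is pushed into task.ops just before the execution attempt below, and the SCOPE_EXIT handler pops it
+ /// back off unless this replica executed the query itself, so only the actual executor creates is_executed_path.)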
+ auto create_shard_flag = zkutil::makeCreateRequest(is_executed_path, task.host_id_str, zkutil::CreateMode::Persistent); + /// Node exists, or we will create or we will get an exception zookeeper->tryCreate(tries_to_execute_path, "0", zkutil::CreateMode::Persistent); @@ -526,7 +583,9 @@ bool DDLWorker::tryExecuteQueryOnLeaderReplica( Stopwatch stopwatch; - bool executed_by_leader = false; + bool executed_by_us = false; + bool executed_by_other_leader = false; + /// Defensive programming. One hour is more than enough to execute almost all DDL queries. /// If it will be very long query like ALTER DELETE for a huge table it's still will be executed, /// but DDL worker can continue processing other queries. @@ -544,7 +603,7 @@ bool DDLWorker::tryExecuteQueryOnLeaderReplica( if (zookeeper->tryGet(is_executed_path, executed_by)) { LOG_DEBUG(log, "Task {} has already been executed by replica ({}) of the same shard.", task.entry_name, executed_by); - executed_by_leader = true; + executed_by_other_leader = true; break; } @@ -555,13 +614,14 @@ bool DDLWorker::tryExecuteQueryOnLeaderReplica( zookeeper->set(tries_to_execute_path, toString(counter + 1)); + task.ops.push_back(create_shard_flag); + SCOPE_EXIT({ if (!executed_by_us && !task.ops.empty()) task.ops.pop_back(); }); + /// If the leader will unexpectedly changed this method will return false /// and on the next iteration new leader will take lock - if (tryExecuteQuery(rewritten_query, task, task.execution_status)) + if (tryExecuteQuery(rewritten_query, task)) { - //FIXME replace with create(...) or remove and use MetadataTransaction - zookeeper->createIfNotExists(is_executed_path, task.host_id_str); - executed_by_leader = true; + executed_by_us = true; break; } @@ -572,7 +632,7 @@ bool DDLWorker::tryExecuteQueryOnLeaderReplica( if (event->tryWait(std::uniform_int_distribution(0, 1000)(rng))) { LOG_DEBUG(log, "Task {} has already been executed by replica ({}) of the same shard.", task.entry_name, zookeeper->get(is_executed_path)); - executed_by_leader = true; + executed_by_other_leader = true; break; } else @@ -593,8 +653,10 @@ bool DDLWorker::tryExecuteQueryOnLeaderReplica( } } + assert(!(executed_by_us && executed_by_other_leader)); + /// Not executed by leader so was not executed at all - if (!executed_by_leader) + if (!executed_by_us && !executed_by_other_leader) { /// If we failed with timeout if (stopwatch.elapsedSeconds() >= MAX_EXECUTION_TIMEOUT_SEC) @@ -610,7 +672,11 @@ bool DDLWorker::tryExecuteQueryOnLeaderReplica( return false; } - LOG_DEBUG(log, "Task {} has already been executed by replica ({}) of the same shard.", task.entry_name, zookeeper->get(is_executed_path)); + if (executed_by_us) + LOG_DEBUG(log, "Task {} executed by current replica", task.entry_name); + else // if (executed_by_other_leader) + LOG_DEBUG(log, "Task {} has already been executed by replica ({}) of the same shard.", task.entry_name, zookeeper->get(is_executed_path)); + return true; } @@ -816,9 +882,17 @@ void DDLWorker::runMainThread() else { LOG_ERROR(log, "Unexpected ZooKeeper error: {}.", getCurrentExceptionMessage(true)); - assert(false); + //assert(false); } } + catch (const Exception & e) + { + if (e.code() == ErrorCodes::LOGICAL_ERROR) + throw; /// Something terrible happened. Will terminate DDLWorker. + + tryLogCurrentException(log, "Unexpected error, will try to restart main thread:"); + initialized = false; + } catch (...) 
{ tryLogCurrentException(log, "Unexpected error, will try to restart main thread:"); diff --git a/src/Interpreters/DDLWorker.h b/src/Interpreters/DDLWorker.h index 78921fa60e3..4145e0754e8 100644 --- a/src/Interpreters/DDLWorker.h +++ b/src/Interpreters/DDLWorker.h @@ -89,7 +89,7 @@ protected: const String & node_path, const ZooKeeperPtr & zookeeper); - bool tryExecuteQuery(const String & query, const DDLTaskBase & task, ExecutionStatus & status); + bool tryExecuteQuery(const String & query, DDLTaskBase & task); /// Checks and cleanups queue's nodes void cleanupQueue(Int64 current_time_seconds, const ZooKeeperPtr & zookeeper); From 1a855845a890e19c5fe6dbb1c414fb1da761e6c5 Mon Sep 17 00:00:00 2001 From: MyroTk Date: Wed, 13 Jan 2021 15:22:42 +0100 Subject: [PATCH 0070/2357] Bringing up to date --- contrib/AMQP-CPP | 2 +- contrib/arrow | 2 +- contrib/aws | 2 +- contrib/boost | 2 +- contrib/cassandra | 2 +- contrib/cctz | 2 +- contrib/croaring | 2 +- contrib/grpc | 2 +- contrib/jemalloc | 2 +- contrib/krb5 | 2 +- contrib/libc-headers | 2 +- contrib/libcxx | 2 +- contrib/libcxxabi | 2 +- contrib/libgsasl | 2 +- contrib/libhdfs3 | 2 +- contrib/librdkafka | 2 +- contrib/libunwind | 2 +- contrib/mariadb-connector-c | 2 +- contrib/openldap | 2 +- contrib/poco | 2 +- contrib/protobuf | 2 +- contrib/replxx | 2 +- 22 files changed, 22 insertions(+), 22 deletions(-) diff --git a/contrib/AMQP-CPP b/contrib/AMQP-CPP index 03781aaff0f..d63e1f01658 160000 --- a/contrib/AMQP-CPP +++ b/contrib/AMQP-CPP @@ -1 +1 @@ -Subproject commit 03781aaff0f10ef41f902b8cf865fe0067180c10 +Subproject commit d63e1f016582e9faaaf279aa24513087a07bc6e7 diff --git a/contrib/arrow b/contrib/arrow index 744bdfe188f..3cbcb7b62c2 160000 --- a/contrib/arrow +++ b/contrib/arrow @@ -1 +1 @@ -Subproject commit 744bdfe188f018e5e05f5deebd4e9ee0a7706cf4 +Subproject commit 3cbcb7b62c2f2d02851bff837758637eb592a64b diff --git a/contrib/aws b/contrib/aws index a220591e335..17e10c0fc77 160000 --- a/contrib/aws +++ b/contrib/aws @@ -1 +1 @@ -Subproject commit a220591e335923ce1c19bbf9eb925787f7ab6c13 +Subproject commit 17e10c0fc77f22afe890fa6d1b283760e5edaa56 diff --git a/contrib/boost b/contrib/boost index 8e259cd2a6b..a04e72c0464 160000 --- a/contrib/boost +++ b/contrib/boost @@ -1 +1 @@ -Subproject commit 8e259cd2a6b60d75dd17e73432f11bb7b9351bb1 +Subproject commit a04e72c0464f0c31d3384f18f0c0db36a05538e0 diff --git a/contrib/cassandra b/contrib/cassandra index d10187efb25..a49b4e0e269 160000 --- a/contrib/cassandra +++ b/contrib/cassandra @@ -1 +1 @@ -Subproject commit d10187efb25b26da391def077edf3c6f2f3a23dd +Subproject commit a49b4e0e2696a4b8ef286a5b9538d1cbe8490509 diff --git a/contrib/cctz b/contrib/cctz index c0f1bcb97fd..7a2db4ece6e 160000 --- a/contrib/cctz +++ b/contrib/cctz @@ -1 +1 @@ -Subproject commit c0f1bcb97fd2782f7c3f972fadd5aad5affac4b8 +Subproject commit 7a2db4ece6e0f1b246173cbdb62711ae258ee841 diff --git a/contrib/croaring b/contrib/croaring index d8402939b5c..5f20740ec0d 160000 --- a/contrib/croaring +++ b/contrib/croaring @@ -1 +1 @@ -Subproject commit d8402939b5c9fc134fd4fcf058fe0f7006d2b129 +Subproject commit 5f20740ec0de5e153e8f4cb2ab91814e8b291a14 diff --git a/contrib/grpc b/contrib/grpc index 7436366ceb3..a6570b863cf 160000 --- a/contrib/grpc +++ b/contrib/grpc @@ -1 +1 @@ -Subproject commit 7436366ceb341ba5c00ea29f1645e02a2b70bf93 +Subproject commit a6570b863cf76c9699580ba51c7827d5bffaac43 diff --git a/contrib/jemalloc b/contrib/jemalloc index e6891d97461..93e27e435ca 160000 --- a/contrib/jemalloc +++ 
b/contrib/jemalloc @@ -1 +1 @@ -Subproject commit e6891d9746143bf2cf617493d880ba5a0b9a3efd +Subproject commit 93e27e435cac846028da20cd9b0841fbc9110bd2 diff --git a/contrib/krb5 b/contrib/krb5 index 90ff6f4f8c6..99f7ad2831a 160000 --- a/contrib/krb5 +++ b/contrib/krb5 @@ -1 +1 @@ -Subproject commit 90ff6f4f8c695d6bf1aaba78a9b8942be92141c2 +Subproject commit 99f7ad2831a01f264c07eed42a0a3a9336b86184 diff --git a/contrib/libc-headers b/contrib/libc-headers index a720b7105a6..92c74f938cf 160000 --- a/contrib/libc-headers +++ b/contrib/libc-headers @@ -1 +1 @@ -Subproject commit a720b7105a610acbd7427eea475a5b6810c151eb +Subproject commit 92c74f938cf2c4dd529cae4f3d2923d153b029a7 diff --git a/contrib/libcxx b/contrib/libcxx index 8b80a151d12..9f71e122533 160000 --- a/contrib/libcxx +++ b/contrib/libcxx @@ -1 +1 @@ -Subproject commit 8b80a151d12b98ffe2d0c22f7cec12c3b9ff88d7 +Subproject commit 9f71e122533c43298c2892108904bb942b0d840f diff --git a/contrib/libcxxabi b/contrib/libcxxabi index df8f1e727db..1ebc83af4c0 160000 --- a/contrib/libcxxabi +++ b/contrib/libcxxabi @@ -1 +1 @@ -Subproject commit df8f1e727dbc9e2bedf2282096fa189dc3fe0076 +Subproject commit 1ebc83af4c06dbcd56b4d166c1314a7d4c1173f9 diff --git a/contrib/libgsasl b/contrib/libgsasl index 383ee28e82f..140fb582505 160000 --- a/contrib/libgsasl +++ b/contrib/libgsasl @@ -1 +1 @@ -Subproject commit 383ee28e82f69fa16ed43b48bd9c8ee5b313ab84 +Subproject commit 140fb58250588c8323285b75fcf127c4adc33dfa diff --git a/contrib/libhdfs3 b/contrib/libhdfs3 index 095b9d48b40..30552ac527f 160000 --- a/contrib/libhdfs3 +++ b/contrib/libhdfs3 @@ -1 +1 @@ -Subproject commit 095b9d48b400abb72d967cb0539af13b1e3d90cf +Subproject commit 30552ac527f2c14070d834e171493b2e7f662375 diff --git a/contrib/librdkafka b/contrib/librdkafka index f2f6616419d..2090cbf56b7 160000 --- a/contrib/librdkafka +++ b/contrib/librdkafka @@ -1 +1 @@ -Subproject commit f2f6616419d567c9198aef0d1133a2e9b4f02276 +Subproject commit 2090cbf56b715247ec2be7f768707a7ab1bf7ede diff --git a/contrib/libunwind b/contrib/libunwind index 8fe25d7dc70..27026ef4a9c 160000 --- a/contrib/libunwind +++ b/contrib/libunwind @@ -1 +1 @@ -Subproject commit 8fe25d7dc70f2a4ea38c3e5a33fa9d4199b67a5a +Subproject commit 27026ef4a9c6c8cc956d1d131c4d794e24096981 diff --git a/contrib/mariadb-connector-c b/contrib/mariadb-connector-c index 21f451d4d31..1485b0de3ea 160000 --- a/contrib/mariadb-connector-c +++ b/contrib/mariadb-connector-c @@ -1 +1 @@ -Subproject commit 21f451d4d3157ffed31ec60a8b76c407190e66bd +Subproject commit 1485b0de3eaa1508dfe49a5ba1e4aa2a71fd8335 diff --git a/contrib/openldap b/contrib/openldap index 0208811b604..34b9ba94b30 160000 --- a/contrib/openldap +++ b/contrib/openldap @@ -1 +1 @@ -Subproject commit 0208811b6043ca06fda8631a5e473df1ec515ccb +Subproject commit 34b9ba94b30319ed6389a4e001d057f7983fe363 diff --git a/contrib/poco b/contrib/poco index 2c32e17c7df..757d947235b 160000 --- a/contrib/poco +++ b/contrib/poco @@ -1 +1 @@ -Subproject commit 2c32e17c7dfee1f8bf24227b697cdef5fddf0823 +Subproject commit 757d947235b307675cff964f29b19d388140a9eb diff --git a/contrib/protobuf b/contrib/protobuf index 73b12814204..445d1ae73a4 160000 --- a/contrib/protobuf +++ b/contrib/protobuf @@ -1 +1 @@ -Subproject commit 73b12814204ad9068ba352914d0dc244648b48ee +Subproject commit 445d1ae73a450b1e94622e7040989aa2048402e3 diff --git a/contrib/replxx b/contrib/replxx index cdb6e3f2ce4..8cf626c04e9 160000 --- a/contrib/replxx +++ b/contrib/replxx @@ -1 +1 @@ -Subproject commit 
cdb6e3f2ce4464225daf9c8beeae7db98d590bdc +Subproject commit 8cf626c04e9a74313fb0b474cdbe2297c0f3cdc8 From c0c78316a33c6639766dbf9530724b67a6bfe4c9 Mon Sep 17 00:00:00 2001 From: MyroTk Date: Wed, 13 Jan 2021 15:31:03 +0100 Subject: [PATCH 0071/2357] Revert "Bringing up to date" This reverts commit 1a855845a890e19c5fe6dbb1c414fb1da761e6c5. --- contrib/AMQP-CPP | 2 +- contrib/arrow | 2 +- contrib/aws | 2 +- contrib/boost | 2 +- contrib/cassandra | 2 +- contrib/cctz | 2 +- contrib/croaring | 2 +- contrib/grpc | 2 +- contrib/jemalloc | 2 +- contrib/krb5 | 2 +- contrib/libc-headers | 2 +- contrib/libcxx | 2 +- contrib/libcxxabi | 2 +- contrib/libgsasl | 2 +- contrib/libhdfs3 | 2 +- contrib/librdkafka | 2 +- contrib/libunwind | 2 +- contrib/mariadb-connector-c | 2 +- contrib/openldap | 2 +- contrib/poco | 2 +- contrib/protobuf | 2 +- contrib/replxx | 2 +- 22 files changed, 22 insertions(+), 22 deletions(-) diff --git a/contrib/AMQP-CPP b/contrib/AMQP-CPP index d63e1f01658..03781aaff0f 160000 --- a/contrib/AMQP-CPP +++ b/contrib/AMQP-CPP @@ -1 +1 @@ -Subproject commit d63e1f016582e9faaaf279aa24513087a07bc6e7 +Subproject commit 03781aaff0f10ef41f902b8cf865fe0067180c10 diff --git a/contrib/arrow b/contrib/arrow index 3cbcb7b62c2..744bdfe188f 160000 --- a/contrib/arrow +++ b/contrib/arrow @@ -1 +1 @@ -Subproject commit 3cbcb7b62c2f2d02851bff837758637eb592a64b +Subproject commit 744bdfe188f018e5e05f5deebd4e9ee0a7706cf4 diff --git a/contrib/aws b/contrib/aws index 17e10c0fc77..a220591e335 160000 --- a/contrib/aws +++ b/contrib/aws @@ -1 +1 @@ -Subproject commit 17e10c0fc77f22afe890fa6d1b283760e5edaa56 +Subproject commit a220591e335923ce1c19bbf9eb925787f7ab6c13 diff --git a/contrib/boost b/contrib/boost index a04e72c0464..8e259cd2a6b 160000 --- a/contrib/boost +++ b/contrib/boost @@ -1 +1 @@ -Subproject commit a04e72c0464f0c31d3384f18f0c0db36a05538e0 +Subproject commit 8e259cd2a6b60d75dd17e73432f11bb7b9351bb1 diff --git a/contrib/cassandra b/contrib/cassandra index a49b4e0e269..d10187efb25 160000 --- a/contrib/cassandra +++ b/contrib/cassandra @@ -1 +1 @@ -Subproject commit a49b4e0e2696a4b8ef286a5b9538d1cbe8490509 +Subproject commit d10187efb25b26da391def077edf3c6f2f3a23dd diff --git a/contrib/cctz b/contrib/cctz index 7a2db4ece6e..c0f1bcb97fd 160000 --- a/contrib/cctz +++ b/contrib/cctz @@ -1 +1 @@ -Subproject commit 7a2db4ece6e0f1b246173cbdb62711ae258ee841 +Subproject commit c0f1bcb97fd2782f7c3f972fadd5aad5affac4b8 diff --git a/contrib/croaring b/contrib/croaring index 5f20740ec0d..d8402939b5c 160000 --- a/contrib/croaring +++ b/contrib/croaring @@ -1 +1 @@ -Subproject commit 5f20740ec0de5e153e8f4cb2ab91814e8b291a14 +Subproject commit d8402939b5c9fc134fd4fcf058fe0f7006d2b129 diff --git a/contrib/grpc b/contrib/grpc index a6570b863cf..7436366ceb3 160000 --- a/contrib/grpc +++ b/contrib/grpc @@ -1 +1 @@ -Subproject commit a6570b863cf76c9699580ba51c7827d5bffaac43 +Subproject commit 7436366ceb341ba5c00ea29f1645e02a2b70bf93 diff --git a/contrib/jemalloc b/contrib/jemalloc index 93e27e435ca..e6891d97461 160000 --- a/contrib/jemalloc +++ b/contrib/jemalloc @@ -1 +1 @@ -Subproject commit 93e27e435cac846028da20cd9b0841fbc9110bd2 +Subproject commit e6891d9746143bf2cf617493d880ba5a0b9a3efd diff --git a/contrib/krb5 b/contrib/krb5 index 99f7ad2831a..90ff6f4f8c6 160000 --- a/contrib/krb5 +++ b/contrib/krb5 @@ -1 +1 @@ -Subproject commit 99f7ad2831a01f264c07eed42a0a3a9336b86184 +Subproject commit 90ff6f4f8c695d6bf1aaba78a9b8942be92141c2 diff --git a/contrib/libc-headers b/contrib/libc-headers index 
92c74f938cf..a720b7105a6 160000 --- a/contrib/libc-headers +++ b/contrib/libc-headers @@ -1 +1 @@ -Subproject commit 92c74f938cf2c4dd529cae4f3d2923d153b029a7 +Subproject commit a720b7105a610acbd7427eea475a5b6810c151eb diff --git a/contrib/libcxx b/contrib/libcxx index 9f71e122533..8b80a151d12 160000 --- a/contrib/libcxx +++ b/contrib/libcxx @@ -1 +1 @@ -Subproject commit 9f71e122533c43298c2892108904bb942b0d840f +Subproject commit 8b80a151d12b98ffe2d0c22f7cec12c3b9ff88d7 diff --git a/contrib/libcxxabi b/contrib/libcxxabi index 1ebc83af4c0..df8f1e727db 160000 --- a/contrib/libcxxabi +++ b/contrib/libcxxabi @@ -1 +1 @@ -Subproject commit 1ebc83af4c06dbcd56b4d166c1314a7d4c1173f9 +Subproject commit df8f1e727dbc9e2bedf2282096fa189dc3fe0076 diff --git a/contrib/libgsasl b/contrib/libgsasl index 140fb582505..383ee28e82f 160000 --- a/contrib/libgsasl +++ b/contrib/libgsasl @@ -1 +1 @@ -Subproject commit 140fb58250588c8323285b75fcf127c4adc33dfa +Subproject commit 383ee28e82f69fa16ed43b48bd9c8ee5b313ab84 diff --git a/contrib/libhdfs3 b/contrib/libhdfs3 index 30552ac527f..095b9d48b40 160000 --- a/contrib/libhdfs3 +++ b/contrib/libhdfs3 @@ -1 +1 @@ -Subproject commit 30552ac527f2c14070d834e171493b2e7f662375 +Subproject commit 095b9d48b400abb72d967cb0539af13b1e3d90cf diff --git a/contrib/librdkafka b/contrib/librdkafka index 2090cbf56b7..f2f6616419d 160000 --- a/contrib/librdkafka +++ b/contrib/librdkafka @@ -1 +1 @@ -Subproject commit 2090cbf56b715247ec2be7f768707a7ab1bf7ede +Subproject commit f2f6616419d567c9198aef0d1133a2e9b4f02276 diff --git a/contrib/libunwind b/contrib/libunwind index 27026ef4a9c..8fe25d7dc70 160000 --- a/contrib/libunwind +++ b/contrib/libunwind @@ -1 +1 @@ -Subproject commit 27026ef4a9c6c8cc956d1d131c4d794e24096981 +Subproject commit 8fe25d7dc70f2a4ea38c3e5a33fa9d4199b67a5a diff --git a/contrib/mariadb-connector-c b/contrib/mariadb-connector-c index 1485b0de3ea..21f451d4d31 160000 --- a/contrib/mariadb-connector-c +++ b/contrib/mariadb-connector-c @@ -1 +1 @@ -Subproject commit 1485b0de3eaa1508dfe49a5ba1e4aa2a71fd8335 +Subproject commit 21f451d4d3157ffed31ec60a8b76c407190e66bd diff --git a/contrib/openldap b/contrib/openldap index 34b9ba94b30..0208811b604 160000 --- a/contrib/openldap +++ b/contrib/openldap @@ -1 +1 @@ -Subproject commit 34b9ba94b30319ed6389a4e001d057f7983fe363 +Subproject commit 0208811b6043ca06fda8631a5e473df1ec515ccb diff --git a/contrib/poco b/contrib/poco index 757d947235b..2c32e17c7df 160000 --- a/contrib/poco +++ b/contrib/poco @@ -1 +1 @@ -Subproject commit 757d947235b307675cff964f29b19d388140a9eb +Subproject commit 2c32e17c7dfee1f8bf24227b697cdef5fddf0823 diff --git a/contrib/protobuf b/contrib/protobuf index 445d1ae73a4..73b12814204 160000 --- a/contrib/protobuf +++ b/contrib/protobuf @@ -1 +1 @@ -Subproject commit 445d1ae73a450b1e94622e7040989aa2048402e3 +Subproject commit 73b12814204ad9068ba352914d0dc244648b48ee diff --git a/contrib/replxx b/contrib/replxx index 8cf626c04e9..cdb6e3f2ce4 160000 --- a/contrib/replxx +++ b/contrib/replxx @@ -1 +1 @@ -Subproject commit 8cf626c04e9a74313fb0b474cdbe2297c0f3cdc8 +Subproject commit cdb6e3f2ce4464225daf9c8beeae7db98d590bdc From eba98b04b0322f02139f7553c2fab61b84a514e8 Mon Sep 17 00:00:00 2001 From: Anton Ivashkin Date: Thu, 14 Jan 2021 19:26:56 +0300 Subject: [PATCH 0072/2357] Zero copy replication over S3: Hybrid storage support --- S3ZeroCopyReplication.md | 17 +- src/Storages/MergeTree/DataPartsExchange.cpp | 26 ++- src/Storages/MergeTree/DataPartsExchange.h | 3 +- 
src/Storages/MergeTree/IMergeTreeDataPart.cpp | 218 ++++++++++++++++-- src/Storages/MergeTree/IMergeTreeDataPart.h | 11 +- src/Storages/MergeTree/MergeTreeData.cpp | 5 +- src/Storages/MergeTree/MergeTreeData.h | 3 +- .../MergeTree/MergeTreeDataMergerMutator.cpp | 1 + .../MergeTree/MergedBlockOutputStream.cpp | 1 + .../MergeTree/ReplicatedMergeTreeLogEntry.h | 40 ++-- src/Storages/StorageReplicatedMergeTree.cpp | 176 ++++++++++---- src/Storages/StorageReplicatedMergeTree.h | 24 +- 12 files changed, 431 insertions(+), 94 deletions(-) diff --git a/S3ZeroCopyReplication.md b/S3ZeroCopyReplication.md index 22c01caa90c..bfb39addcd2 100644 --- a/S3ZeroCopyReplication.md +++ b/S3ZeroCopyReplication.md @@ -18,9 +18,14 @@ Before the request, the fetcher checks whether it is going to store the data on S3. The check is crude for now - if the storage policy has an S3 disk, we assume S3 will be used. If so, it sends send_s3_metadata=1 in the request. -When the source receives such a request, it checks whether the part is on S3. If so, it puts a mark in ZooKeeper at the path `<table data path>/zero_copy_s3/shared/<part ID>/<replica name>`, +When the source receives such a request, it checks whether the part is on S3. If so, it puts a mark in ZooKeeper at the path `<table data path>/zero_copy_s3/shared/<part name>/<part ID>/<part path>/<replica name>`, sets the cookie send_s3_metadata=1 in the response and, instead of the data files, sends only the metadata files. +The path ended up complex because we need +* to find out, by part name, on which replicas it already exists on S3 (needed for hybrid storage) +* to tell, by the unique path, whether this copy of the part is used by other replicas +* for a correct lock lifetime, to distinguish the lock of the main variant (all_0_0_0) from a temporary one (tmp_fetch_all_0_0_0) + When the fetcher receives a response with send_s3_metadata=1, it checks accessibility via the key it was given (the first object of checksums.txt), creates only small files with identical metadata that will end up referring to the same S3 keys, puts a similar mark in ZooKeeper, only with its own replica ID, and works with that. @@ -30,14 +35,14 @@ During a merge, if the result will be on S3, the node puts an ephemeral mark in ZooKeeper at the path `<table data path>/zero_copy_s3/merged/<new part name>` (!! NOT !!). If such a mark already exists, it assumes that another node has already merged or is merging right now, and a fetch must be done instead of merging locally. +In hybrid storage, when a part is moved to S3, the node checks through ZK whether the part has already been moved by another node; if it has, it does a fetch (modified compared to the regular fetch). + A flag that enables the new replication protocol has been added to the config - merge_tree->allow_s3_zero_copy_replication. It is currently set to true - this is temporary, so that all tests run with the flag enabled; before the final merge, do not forget to change it back to false. ## Hacks and rough edges, of which there are many * The name of the first S3 key of the checksums.txt file is used as the part ID. -* I did not find a convenient way to pass ZooKeeper around in the code, so I passed it in hardcoded. - * On removal, the disk class knows nothing about parts; I passed the "keep the data on S3" flag as a parameter, which came out very clumsy. * A race is possible if the source sends the part metadata and then immediately decides to delete the part before the fetcher has put its mark in ZooKeeper. @@ -52,9 +57,5 @@ * Tests are only the most basic ones so far. -* ... there are many of them. Honestly. - -## TODO, things not done at all yet - -* For hybrid storage, add the check and fetch when a part moves from the local disk to S3.
+* For hybrid storage: if two nodes decide to move a part to S3 at the same time, both check that it is not there yet, and both move it. diff --git a/src/Storages/MergeTree/DataPartsExchange.cpp b/src/Storages/MergeTree/DataPartsExchange.cpp index f8f5bfb5a3b..884dd22c295 100644 --- a/src/Storages/MergeTree/DataPartsExchange.cpp +++ b/src/Storages/MergeTree/DataPartsExchange.cpp @@ -267,7 +267,7 @@ void Service::sendPartS3Metadata(const MergeTreeData::DataPartPtr & part, WriteB if (disk->getType() != "s3") throw Exception("S3 disk is not S3 anymore", ErrorCodes::LOGICAL_ERROR); - part->lockSharedData(zookeeper_path, replica_name, zookeeper); + part->lockSharedData(); String part_id = part->getUniqueId(); writeStringBinary(part_id, out); @@ -327,7 +327,8 @@ MergeTreeData::MutableDataPartPtr Fetcher::fetchPart( const String & interserver_scheme, bool to_detached, const String & tmp_prefix_, - bool try_use_s3_copy) + bool try_use_s3_copy, + const DiskPtr disk_s3) { if (blocker.isCancelled()) throw Exception("Fetching of part was cancelled", ErrorCodes::ABORTED); @@ -348,6 +349,9 @@ MergeTreeData::MutableDataPartPtr Fetcher::fetchPart( {"compress", "false"} }); + if (try_use_s3_copy && disk_s3 && disk_s3->getType() != "s3") + throw Exception("Try to fetch shared s3 part on non-s3 disk", ErrorCodes::LOGICAL_ERROR); + Disks disks_s3; if (!data_settings->allow_s3_zero_copy_replication) @@ -355,9 +359,15 @@ MergeTreeData::MutableDataPartPtr Fetcher::fetchPart( if (try_use_s3_copy) { - disks_s3 = data.getDisksByType("s3"); - if (disks_s3.empty()) - try_use_s3_copy = false; + if (disk_s3) + disks_s3.push_back(disk_s3); + else + { + disks_s3 = data.getDisksByType("s3"); + + if (disks_s3.empty()) + try_use_s3_copy = false; + } } if (try_use_s3_copy) @@ -405,6 +415,10 @@ MergeTreeData::MutableDataPartPtr Fetcher::fetchPart( if (part_type == "InMemory") throw Exception("Got 'send_s3_metadata' cookie for in-memory partition", ErrorCodes::LOGICAL_ERROR); + UUID part_uuid = UUIDHelpers::Nil; + if (server_protocol_version >= REPLICATION_PROTOCOL_VERSION_WITH_PARTS_UUID) + readUUIDText(part_uuid, in); + try { return downloadPartToS3(part_name, replica_path, to_detached, tmp_prefix_, std::move(disks_s3), in); @@ -680,7 +694,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToS3( new_data_part->modification_time = time(nullptr); new_data_part->loadColumnsChecksumsIndexes(true, false); - new_data_part->lockSharedData(zookeeper_path, replica_name, zookeeper); + new_data_part->lockSharedData(); return new_data_part; } diff --git a/src/Storages/MergeTree/DataPartsExchange.h b/src/Storages/MergeTree/DataPartsExchange.h index c5bc891b550..f0297aa1d28 100644 --- a/src/Storages/MergeTree/DataPartsExchange.h +++ b/src/Storages/MergeTree/DataPartsExchange.h @@ -81,7 +81,8 @@ public: const String & interserver_scheme, bool to_detached = false, const String & tmp_prefix_ = "", - bool try_use_s3_copy = true); + bool try_use_s3_copy = true, + const DiskPtr disk_s3 = nullptr); /// You need to stop the data transfer.
ActionBlocker blocker; diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 5f017972a47..f4635208cda 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -20,6 +21,7 @@ namespace DB { + namespace ErrorCodes { extern const int DIRECTORY_ALREADY_EXISTS; @@ -773,7 +775,8 @@ void IMergeTreeDataPart::loadColumns(bool require) { /// We can get list of columns only from columns.txt in compact parts. if (require || part_type == Type::COMPACT) - throw Exception("No columns.txt in part " + name, ErrorCodes::NO_FILE_IN_DATA_PART); + throw Exception("No columns.txt in part " + name + ", expected path " + path + " on drive " + volume->getDisk()->getName(), + ErrorCodes::NO_FILE_IN_DATA_PART); /// If there is no file with a list of columns, write it down. for (const NameAndTypePair & column : metadata_snapshot->getColumns().getAllPhysical()) @@ -855,7 +858,10 @@ void IMergeTreeDataPart::renameTo(const String & new_relative_path, bool remove_ volume->getDisk()->setLastModified(from, Poco::Timestamp::fromEpochTime(time(nullptr))); volume->getDisk()->moveFile(from, to); + String old_relative_path = relative_path; relative_path = new_relative_path; + lockSharedData(); + unlockSharedData(old_relative_path); } @@ -1010,7 +1016,15 @@ void IMergeTreeDataPart::makeCloneOnDisk(const DiskPtr & disk, const String & di } disk->createDirectories(path_to_clone); - volume->getDisk()->copy(getFullRelativePath(), disk, path_to_clone); + bool is_fetched = false; + + if (disk->getType() == "s3") + { + is_fetched = tryToFetchIfShared(disk, path_to_clone + "/" + name); + } + + if (!is_fetched) + volume->getDisk()->copy(getFullRelativePath(), disk, path_to_clone); volume->getDisk()->removeIfExists(path_to_clone + '/' + DELETE_ON_DESTROY_MARKER_FILE_NAME); } @@ -1148,38 +1162,212 @@ String IMergeTreeDataPart::getUniqueId() const return id; } -void IMergeTreeDataPart::lockSharedData(const String & zookeeper_path, const String & replica_name, zkutil::ZooKeeperPtr zookeeper) const +void IMergeTreeDataPart::lockSharedData() const { + if (!volume) + return; + DiskPtr disk = volume->getDisk(); + if (!disk) + return; + if (disk->getType() != "s3") + return; + + const StorageReplicatedMergeTree *replicated_storage = dynamic_cast(&storage); + if (!replicated_storage) + return; + + StorageReplicatedMergeTree::ZooKeeperAccessData zk = replicated_storage->getZooKeeperAccessData(); + if (!zk.zookeeper) + return; + String id = getUniqueId(); + boost::replace_all(id, "/", "_"); + String norm_path = relative_path; + boost::replace_all(norm_path, "/", "_"); - String zookeeper_node = zookeeper_path + "/zero_copy_s3/shared/" + id + "/" + replica_name; + String zookeeper_node = zk.zookeeper_path + "/zero_copy_s3/shared/" + name + "/" + id + "/" + norm_path + "/" + zk.replica_name; - LOG_TRACE(storage.log, "Set zookeeper lock {}", id); + LOG_TRACE(storage.log, "Set zookeeper lock {}", zookeeper_node); - zookeeper->createAncestors(zookeeper_node); - zookeeper->createIfNotExists(zookeeper_node, "lock"); + zk.zookeeper->createAncestors(zookeeper_node); + zk.zookeeper->createIfNotExists(zookeeper_node, "lock"); } -bool IMergeTreeDataPart::unlockSharedData(const String & zookeeper_path, const String & replica_name, zkutil::ZooKeeperPtr zookeeper) const +bool IMergeTreeDataPart::unlockSharedData() const { + return unlockSharedData(relative_path); +} + 
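The lock node created by lockSharedData() above is deliberately layered, matching the path described in S3ZeroCopyReplication.md. A minimal sketch of how that path is composed; the helper and the sample table path and replica name below are illustrative assumptions, not part of the patch:

```cpp
#include <iostream>
#include <string>
#include <boost/algorithm/string/replace.hpp>

/// Illustrative helper (not from the patch): mirrors the string concatenation in
/// lockSharedData(). The layering part name -> unique id -> normalized part path -> replica
/// is what lets unlockSharedData() walk back up level by level and decide when the S3
/// objects themselves may finally be deleted.
std::string zeroCopyLockNode(
    const std::string & zookeeper_path,   /// table path in ZooKeeper (value below is assumed)
    const std::string & part_name,        /// e.g. "all_0_0_0"
    std::string unique_id,                /// first S3 key of checksums.txt
    std::string part_relative_path,       /// e.g. "all_0_0_0" or "tmp_fetch_all_0_0_0"
    const std::string & replica_name)     /// e.g. "replica1" (assumed)
{
    boost::replace_all(unique_id, "/", "_");
    boost::replace_all(part_relative_path, "/", "_");
    return zookeeper_path + "/zero_copy_s3/shared/" + part_name
        + "/" + unique_id + "/" + part_relative_path + "/" + replica_name;
}

int main()
{
    std::cout << zeroCopyLockNode("/clickhouse/tables/0/test", "all_0_0_0",
                                  "data/abc/xyz", "tmp_fetch_all_0_0_0", "replica1") << '\n';
    /// prints: /clickhouse/tables/0/test/zero_copy_s3/shared/all_0_0_0/data_abc_xyz/tmp_fetch_all_0_0_0/replica1
}
```

unlockSharedData() removes its own replica node and then tries to remove the part-path, unique-id and part-name levels only while each of them is left without children; the caller may delete the S3 objects only when it gets true back, i.e. when it was the last lock holder. renameTo() relies on the same layout: it takes a lock under the new relative path and releases the one taken under the old path.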
+bool IMergeTreeDataPart::unlockSharedData(const String & path) const +{ + if (!volume) + return true; + DiskPtr disk = volume->getDisk(); + if (!disk) + return true; + if (disk->getType() != "s3") + return true; + + const StorageReplicatedMergeTree *replicated_storage = dynamic_cast(&storage); + if (!replicated_storage) + return true; + + StorageReplicatedMergeTree::ZooKeeperAccessData zk = replicated_storage->getZooKeeperAccessData(); + if (!zk.zookeeper) + return true; + String id = getUniqueId(); + boost::replace_all(id, "/", "_"); + String norm_path = path; + boost::replace_all(norm_path, "/", "_"); - String zookeeper_part_node = zookeeper_path + "/zero_copy_s3/shared/" + id; - String zookeeper_node = zookeeper_part_node + "/" + replica_name; + String zookeeper_part_node = zk.zookeeper_path + "/zero_copy_s3/shared/" + name; + String zookeeper_part_uniq_node = zookeeper_part_node + "/" + id; + String zookeeper_part_path_node = zookeeper_part_uniq_node + "/" + norm_path; + String zookeeper_node = zookeeper_part_path_node + "/" + zk.replica_name; - LOG_TRACE(storage.log, "Remove zookeeper lock for {}", id); + LOG_TRACE(storage.log, "Remove zookeeper lock {}", zookeeper_node); - zookeeper->remove(zookeeper_node); + zk.zookeeper->tryRemove(zookeeper_node); Strings children; - zookeeper->tryGetChildren(zookeeper_part_node, children); + zk.zookeeper->tryGetChildren(zookeeper_part_path_node, children); + if (!children.empty()) + { + LOG_TRACE(storage.log, "Found zookeper locks for {}", zookeeper_part_path_node); + return false; + } + + zk.zookeeper->tryRemove(zookeeper_part_path_node); + + children.clear(); + zk.zookeeper->tryGetChildren(zookeeper_part_uniq_node, children); if (!children.empty()) { - LOG_TRACE(storage.log, "Found zookeper locks for {}", id); + LOG_TRACE(storage.log, "Found zookeper locks for {}", zookeeper_part_uniq_node); + return false; } - return children.empty(); + zk.zookeeper->tryRemove(zookeeper_part_uniq_node); + + /// Even when we have lock with same part name, but with different uniq, we can remove files on S3 + children.clear(); + zk.zookeeper->tryGetChildren(zookeeper_part_node, children); + if (children.empty()) + /// Cleanup after last uniq removing + zk.zookeeper->tryRemove(zookeeper_part_node); + + return true; +} + +String IMergeTreeDataPart::getSharedDataReplica( + const String & zookeeper_path, + zkutil::ZooKeeperPtr zookeeper, + const String & replica_name) const +{ + String norm_path = relative_path; + boost::replace_all(norm_path, "/", "_"); + String zookeeper_part_node = zookeeper_path + "/zero_copy_s3/shared/" + name; + + Strings ids; + zookeeper->tryGetChildren(zookeeper_part_node, ids); + + Strings replicas; + for (const auto & id : ids) + { + String zookeeper_part_uniq_node = zookeeper_part_node + "/" + id; + Strings paths; + zookeeper->tryGetChildren(zookeeper_part_uniq_node, paths); + for (const auto &path : paths) + { + String zookeeper_node = zookeeper_part_uniq_node + "/" + path; + Strings id_replicas; + zookeeper->tryGetChildren(zookeeper_node, id_replicas); + LOG_TRACE(storage.log, "Found zookeper replicas for {}: {}", zookeeper_node, id_replicas.size()); + replicas.insert(replicas.end(), id_replicas.begin(), id_replicas.end()); + } + } + + LOG_TRACE(storage.log, "Found zookeper replicas for part {}: {}", name, replicas.size()); + + String best_replica; + Strings active_replicas; + + /// TODO: Move best replica choose in common method (here is the same code as in StorageReplicatedMergeTree::fetchPartition) + + /// Leave only active replicas. 
+ active_replicas.reserve(replicas.size()); + + for (const String & replica : replicas) + if ((replica != replica_name) && (zookeeper->exists(zookeeper_path + "/replicas/" + replica + "/is_active"))) + active_replicas.push_back(replica); + + LOG_TRACE(storage.log, "Found zookeper active replicas for part {}: {}", name, active_replicas.size()); + + if (active_replicas.empty()) + return best_replica; + + /** You must select the best (most relevant) replica. + * This is a replica with the maximum `log_pointer`, then with the minimum `queue` size. + * NOTE This is not exactly the best criteria. It does not make sense to download old partitions, + * and it would be nice to be able to choose the replica closest by network. + * NOTE Of course, there are data races here. You can solve it by retrying. + */ + Int64 max_log_pointer = -1; + UInt64 min_queue_size = std::numeric_limits::max(); + + for (const String & replica : active_replicas) + { + String current_replica_path = zookeeper_path + "/replicas/" + replica; + + String log_pointer_str = zookeeper->get(current_replica_path + "/log_pointer"); + Int64 log_pointer = log_pointer_str.empty() ? 0 : parse(log_pointer_str); + + Coordination::Stat stat; + zookeeper->get(current_replica_path + "/queue", &stat); + size_t queue_size = stat.numChildren; + + if (log_pointer > max_log_pointer + || (log_pointer == max_log_pointer && queue_size < min_queue_size)) + { + max_log_pointer = log_pointer; + min_queue_size = queue_size; + best_replica = replica; + } + } + + return best_replica; +} + +bool IMergeTreeDataPart::tryToFetchIfShared(const DiskPtr & disk, const String & path) const +{ + const StorageReplicatedMergeTree *replicated_storage = dynamic_cast(&storage); + if (!replicated_storage) + return false; + + StorageReplicatedMergeTree::ZooKeeperAccessData zk = replicated_storage->getZooKeeperAccessData(); + if (!zk.zookeeper) + return false; + + String replica = getSharedDataReplica(zk.zookeeper_path, zk.zookeeper, zk.replica_name); + + /// We can't fetch part when none replicas have this part on S3 + if (replica.empty()) + return false; + + ReplicatedMergeTreeLogEntry log_entry; + log_entry.type = ReplicatedMergeTreeLogEntry::FETCH_SHARED_PART; + log_entry.source_replica = replica; + log_entry.new_part_name = name;//part_name; + log_entry.create_time = 0;//part_create_time; + log_entry.disk = disk; + log_entry.path = path; + + /// TODO: !!! Fix const usage !!! 
+ StorageReplicatedMergeTree *replicated_storage_nc = const_cast(replicated_storage); + + return replicated_storage_nc->executeFetchShared(log_entry); } bool isCompactPart(const MergeTreeDataPartPtr & data_part) diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index 746d95fe78e..cfe3d7da263 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -369,12 +369,13 @@ public: String getUniqueId() const; /// Lock part in zookeeper for use common S3 data in several nodes - void lockSharedData(const String & zookeeper_path, const String & replica_name, zkutil::ZooKeeperPtr zookeeper) const; + void lockSharedData() const; /// Unlock common S3 data part in zookeeper /// Return true if data unlocked /// Return false if data is still used by another node - bool unlockSharedData(const String & zookeeper_path, const String & replica_name, zkutil::ZooKeeperPtr zookeeper) const; + bool unlockSharedData() const; + bool unlockSharedData(const String & path) const; protected: @@ -439,6 +440,12 @@ private: /// Found column without specific compression and return codec /// for this column with default parameters. CompressionCodecPtr detectDefaultCompressionCodec() const; + + /// Fetch part only if some replica has it on shared storage like S3 + bool tryToFetchIfShared(const DiskPtr & disk, const String & path) const; + + /// Get best replica having this partition on S3 + String getSharedDataReplica(const String & zookeeper_path, zkutil::ZooKeeperPtr zookeeper, const String & replica_name) const; }; using MergeTreeDataPartState = IMergeTreeDataPart::State; diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index c3a599665bb..37f7187585c 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -1833,7 +1833,8 @@ MergeTreeData::DataPartsVector MergeTreeData::getActivePartsToReplace( const MergeTreePartInfo & new_part_info, const String & new_part_name, DataPartPtr & out_covering_part, - DataPartsLock & /* data_parts_lock */) const + DataPartsLock & /* data_parts_lock */, + bool allow_duplicate) const { /// Parts contained in the part are consecutive in data_parts, intersecting the insertion place for the part itself. auto it_middle = data_parts_by_state_and_info.lower_bound(DataPartStateAndInfo{DataPartState::Committed, new_part_info}); @@ -1867,7 +1868,7 @@ MergeTreeData::DataPartsVector MergeTreeData::getActivePartsToReplace( DataPartIteratorByStateAndInfo end = it_middle; while (end != committed_parts_range.end()) { - if ((*end)->info == new_part_info) + if ((*end)->info == new_part_info && !allow_duplicate) throw Exception("Unexpected duplicate part " + (*end)->getNameWithState() + ". It is a bug.", ErrorCodes::LOGICAL_ERROR); if (!new_part_info.contains((*end)->info)) diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index e65d486d46f..53902688f1f 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -870,7 +870,8 @@ protected: const MergeTreePartInfo & new_part_info, const String & new_part_name, DataPartPtr & out_covering_part, - DataPartsLock & data_parts_lock) const; + DataPartsLock & data_parts_lock, + bool allow_duplicate = false) const; /// Checks whether the column is in the primary key, possibly wrapped in a chain of functions with single argument. 
bool isPrimaryOrMinMaxKeyColumnPossiblyWrappedInFunctions(const ASTPtr & node, const StorageMetadataPtr & metadata_snapshot) const; diff --git a/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp b/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp index 1065b992396..807d1e9eed2 100644 --- a/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp +++ b/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp @@ -1876,6 +1876,7 @@ void MergeTreeDataMergerMutator::finalizeMutatedPart( MergeTreeData::DataPart::calculateTotalSizeOnDisk(new_data_part->volume->getDisk(), new_data_part->getFullRelativePath())); new_data_part->default_codec = codec; new_data_part->calculateColumnsSizesOnDisk(); + new_data_part->lockSharedData(); } diff --git a/src/Storages/MergeTree/MergedBlockOutputStream.cpp b/src/Storages/MergeTree/MergedBlockOutputStream.cpp index 00a4c37c60d..255526eca11 100644 --- a/src/Storages/MergeTree/MergedBlockOutputStream.cpp +++ b/src/Storages/MergeTree/MergedBlockOutputStream.cpp @@ -126,6 +126,7 @@ void MergedBlockOutputStream::writeSuffixAndFinalizePart( new_part->calculateColumnsSizesOnDisk(); if (default_codec != nullptr) new_part->default_codec = default_codec; + new_part->lockSharedData(); } void MergedBlockOutputStream::finalizePartOnDisk( diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeLogEntry.h b/src/Storages/MergeTree/ReplicatedMergeTreeLogEntry.h index 4b384171dde..e9e3d15c5ff 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeLogEntry.h +++ b/src/Storages/MergeTree/ReplicatedMergeTreeLogEntry.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -29,29 +30,31 @@ struct ReplicatedMergeTreeLogEntryData { enum Type { - EMPTY, /// Not used. - GET_PART, /// Get the part from another replica. - MERGE_PARTS, /// Merge the parts. - DROP_RANGE, /// Delete the parts in the specified partition in the specified number range. - CLEAR_COLUMN, /// NOTE: Deprecated. Drop specific column from specified partition. - CLEAR_INDEX, /// NOTE: Deprecated. Drop specific index from specified partition. - REPLACE_RANGE, /// Drop certain range of partitions and replace them by new ones - MUTATE_PART, /// Apply one or several mutations to the part. - ALTER_METADATA, /// Apply alter modification according to global /metadata and /columns paths + EMPTY, /// Not used. + GET_PART, /// Get the part from another replica. + MERGE_PARTS, /// Merge the parts. + DROP_RANGE, /// Delete the parts in the specified partition in the specified number range. + CLEAR_COLUMN, /// NOTE: Deprecated. Drop specific column from specified partition. + CLEAR_INDEX, /// NOTE: Deprecated. Drop specific index from specified partition. + REPLACE_RANGE, /// Drop certain range of partitions and replace them by new ones + MUTATE_PART, /// Apply one or several mutations to the part. 
+ ALTER_METADATA, /// Apply alter modification according to global /metadata and /columns paths + FETCH_SHARED_PART, /// Get the part from other replica only if it on shared S3 storade }; static String typeToString(Type type) { switch (type) { - case ReplicatedMergeTreeLogEntryData::GET_PART: return "GET_PART"; - case ReplicatedMergeTreeLogEntryData::MERGE_PARTS: return "MERGE_PARTS"; - case ReplicatedMergeTreeLogEntryData::DROP_RANGE: return "DROP_RANGE"; - case ReplicatedMergeTreeLogEntryData::CLEAR_COLUMN: return "CLEAR_COLUMN"; - case ReplicatedMergeTreeLogEntryData::CLEAR_INDEX: return "CLEAR_INDEX"; - case ReplicatedMergeTreeLogEntryData::REPLACE_RANGE: return "REPLACE_RANGE"; - case ReplicatedMergeTreeLogEntryData::MUTATE_PART: return "MUTATE_PART"; - case ReplicatedMergeTreeLogEntryData::ALTER_METADATA: return "ALTER_METADATA"; + case ReplicatedMergeTreeLogEntryData::GET_PART: return "GET_PART"; + case ReplicatedMergeTreeLogEntryData::MERGE_PARTS: return "MERGE_PARTS"; + case ReplicatedMergeTreeLogEntryData::DROP_RANGE: return "DROP_RANGE"; + case ReplicatedMergeTreeLogEntryData::CLEAR_COLUMN: return "CLEAR_COLUMN"; + case ReplicatedMergeTreeLogEntryData::CLEAR_INDEX: return "CLEAR_INDEX"; + case ReplicatedMergeTreeLogEntryData::REPLACE_RANGE: return "REPLACE_RANGE"; + case ReplicatedMergeTreeLogEntryData::MUTATE_PART: return "MUTATE_PART"; + case ReplicatedMergeTreeLogEntryData::ALTER_METADATA: return "ALTER_METADATA"; + case ReplicatedMergeTreeLogEntryData::FETCH_SHARED_PART: return "FETCH_SHARED_PART"; default: throw Exception("Unknown log entry type: " + DB::toString(type), ErrorCodes::LOGICAL_ERROR); } @@ -191,6 +194,9 @@ struct ReplicatedMergeTreeLogEntry : public ReplicatedMergeTreeLogEntryData, std std::condition_variable execution_complete; /// Awake when currently_executing becomes false. 
static Ptr parse(const String & s, const Coordination::Stat & stat); + + DiskPtr disk; + String path; }; using ReplicatedMergeTreeLogEntryPtr = std::shared_ptr; diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 977a485f758..59312737a39 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -1891,6 +1891,60 @@ bool StorageReplicatedMergeTree::executeFetch(LogEntry & entry) } +bool StorageReplicatedMergeTree::executeFetchShared(ReplicatedMergeTreeLogEntry & entry) +{ + if (entry.type != LogEntry::FETCH_SHARED_PART) + { + throw Exception("Wrong entry.type in executeFetchShared", ErrorCodes::LOGICAL_ERROR); + } + + if (entry.source_replica.empty()) + { + LOG_INFO(log, "No active replica has part {} on S3.", entry.new_part_name); + return false; + } + + const auto storage_settings_ptr = getSettings(); + auto metadata_snapshot = getInMemoryMetadataPtr(); + + static std::atomic_uint total_fetches {0}; + if (storage_settings_ptr->replicated_max_parallel_fetches && total_fetches >= storage_settings_ptr->replicated_max_parallel_fetches) + { + throw Exception("Too many total fetches from replicas, maximum: " + storage_settings_ptr->replicated_max_parallel_fetches.toString(), + ErrorCodes::TOO_MANY_FETCHES); + } + + ++total_fetches; + SCOPE_EXIT({--total_fetches;}); + + if (storage_settings_ptr->replicated_max_parallel_fetches_for_table + && current_table_fetches >= storage_settings_ptr->replicated_max_parallel_fetches_for_table) + { + throw Exception("Too many fetches from replicas for table, maximum: " + storage_settings_ptr->replicated_max_parallel_fetches_for_table.toString(), + ErrorCodes::TOO_MANY_FETCHES); + } + + ++current_table_fetches; + SCOPE_EXIT({--current_table_fetches;}); + + try + { + if (!fetchPart(entry.new_part_name, metadata_snapshot, zookeeper_path + "/replicas/" + entry.source_replica, false, entry.quorum, + nullptr, true, entry.disk, entry.path)) + return false; + } + catch (Exception & e) + { + if (e.code() == ErrorCodes::RECEIVED_ERROR_TOO_MANY_REQUESTS) + e.addMessage("Too busy replica. Will try later."); + tryLogCurrentException(log, __PRETTY_FUNCTION__); + throw; + } + + return true; +} + + void StorageReplicatedMergeTree::executeDropRange(const LogEntry & entry) { auto drop_range_info = MergeTreePartInfo::fromPartName(entry.new_part_name, format_version); @@ -3133,6 +3187,29 @@ String StorageReplicatedMergeTree::findReplicaHavingPart(const String & part_nam return {}; } +String StorageReplicatedMergeTree::findReplicaHavingSharedPart(const String & part_name, bool active) +{ + auto zookeeper = getZooKeeper(); + Strings replicas = zookeeper->getChildren(zookeeper_path + "/replicas"); + + /// Select replicas in uniformly random order. + std::shuffle(replicas.begin(), replicas.end(), thread_local_rng); + + for (const String & replica : replicas) + { + /// We don't interested in ourself. + if (replica == replica_name) + continue; + + if (checkReplicaHavePart(replica, part_name) && + (!active || zookeeper->exists(zookeeper_path + "/replicas/" + replica + "/is_active"))) + return replica; + + /// Obviously, replica could become inactive or even vanish after return from this method. 
+ } + + return {}; +} String StorageReplicatedMergeTree::findReplicaHavingCoveringPart(LogEntry & entry, bool active) { @@ -3330,7 +3407,6 @@ void StorageReplicatedMergeTree::updateQuorum(const String & part_name, bool is_ } } - void StorageReplicatedMergeTree::cleanLastPartNode(const String & partition_id) { auto zookeeper = getZooKeeper(); @@ -3382,7 +3458,6 @@ void StorageReplicatedMergeTree::cleanLastPartNode(const String & partition_id) } } - bool StorageReplicatedMergeTree::partIsInsertingWithParallelQuorum(const MergeTreePartInfo & part_info) const { auto zookeeper = getZooKeeper(); @@ -3411,7 +3486,8 @@ bool StorageReplicatedMergeTree::partIsLastQuorumPart(const MergeTreePartInfo & } bool StorageReplicatedMergeTree::fetchPart(const String & part_name, const StorageMetadataPtr & metadata_snapshot, - const String & source_replica_path, bool to_detached, size_t quorum, zkutil::ZooKeeper::Ptr zookeeper_) + const String & source_replica_path, bool to_detached, size_t quorum, zkutil::ZooKeeper::Ptr zookeeper_, bool replace_exists, + DiskPtr replaced_disk, String replaced_part_path) { auto zookeeper = zookeeper_ ? zookeeper_ : getZooKeeper(); const auto part_info = MergeTreePartInfo::fromPartName(part_name, format_version); @@ -3461,6 +3537,8 @@ bool StorageReplicatedMergeTree::fetchPart(const String & part_name, const Stora }; DataPartPtr part_to_clone; + + if (!replace_exists) { /// If the desired part is a result of a part mutation, try to find the source part and compare /// its checksums to the checksums of the desired part. If they match, we can just clone the local part. @@ -3520,7 +3598,8 @@ bool StorageReplicatedMergeTree::fetchPart(const String & part_name, const Stora return fetcher.fetchPart( metadata_snapshot, part_name, source_replica_path, address.host, address.replication_port, - timeouts, user_password.first, user_password.second, interserver_scheme, to_detached); + timeouts, user_password.first, user_password.second, interserver_scheme, to_detached, "", true, + replace_exists ? replaced_disk : nullptr); }; } @@ -3530,46 +3609,56 @@ bool StorageReplicatedMergeTree::fetchPart(const String & part_name, const Stora if (!to_detached) { - Transaction transaction(*this); - renameTempPartAndReplace(part, nullptr, &transaction); - - /** NOTE - * Here, an error occurs if ALTER occurred with a change in the column type or column deletion, - * and the part on remote server has not yet been modified. - * After a while, one of the following attempts to make `fetchPart` succeed. - */ - replaced_parts = checkPartChecksumsAndCommit(transaction, part); - - /** If a quorum is tracked for this part, you must update it. - * If you do not have time, in case of losing the session, when you restart the server - see the `ReplicatedMergeTreeRestartingThread::updateQuorumIfWeHavePart` method. - */ - if (quorum) + if (replace_exists) { - /// Check if this quorum insert is parallel or not - if (zookeeper->exists(zookeeper_path + "/quorum/parallel/" + part_name)) - updateQuorum(part_name, true); - else if (zookeeper->exists(zookeeper_path + "/quorum/status")) - updateQuorum(part_name, false); + if (part->volume->getDisk()->getName() != replaced_disk->getName()) + throw Exception("Part " + part->name + " fetched on wrong disk " + part->volume->getDisk()->getName(), ErrorCodes::LOGICAL_ERROR); + replaced_disk->removeIfExists(replaced_part_path); + replaced_disk->moveDirectory(part->getFullRelativePath(), replaced_part_path); } - - /// merged parts that are still inserted with quorum. 
if it only contains one block, it hasn't been merged before - if (part_info.level != 0 || part_info.mutation != 0) + else { - Strings quorum_parts = zookeeper->getChildren(zookeeper_path + "/quorum/parallel"); - for (const String & quorum_part : quorum_parts) + Transaction transaction(*this); + renameTempPartAndReplace(part, nullptr, &transaction); + + /** NOTE + * Here, an error occurs if ALTER occurred with a change in the column type or column deletion, + * and the part on remote server has not yet been modified. + * After a while, one of the following attempts to make `fetchPart` succeed. + */ + replaced_parts = checkPartChecksumsAndCommit(transaction, part); + + /** If a quorum is tracked for this part, you must update it. + * If you do not have time, in case of losing the session, when you restart the server - see the `ReplicatedMergeTreeRestartingThread::updateQuorumIfWeHavePart` method. + */ + if (quorum) { - auto quorum_part_info = MergeTreePartInfo::fromPartName(quorum_part, format_version); - if (part_info.contains(quorum_part_info)) - updateQuorum(quorum_part, true); + /// Check if this quorum insert is parallel or not + if (zookeeper->exists(zookeeper_path + "/quorum/parallel/" + part_name)) + updateQuorum(part_name, true); + else if (zookeeper->exists(zookeeper_path + "/quorum/status")) + updateQuorum(part_name, false); } - } - merge_selecting_task->schedule(); + /// merged parts that are still inserted with quorum. if it only contains one block, it hasn't been merged before + if (part_info.level != 0 || part_info.mutation != 0) + { + Strings quorum_parts = zookeeper->getChildren(zookeeper_path + "/quorum/parallel"); + for (const String & quorum_part : quorum_parts) + { + auto quorum_part_info = MergeTreePartInfo::fromPartName(quorum_part, format_version); + if (part_info.contains(quorum_part_info)) + updateQuorum(quorum_part, true); + } + } - for (const auto & replaced_part : replaced_parts) - { - LOG_DEBUG(log, "Part {} is rendered obsolete by fetching part {}", replaced_part->name, part_name); - ProfileEvents::increment(ProfileEvents::ObsoleteReplicatedParts); + merge_selecting_task->schedule(); + + for (const auto & replaced_part : replaced_parts) + { + LOG_DEBUG(log, "Part {} is rendered obsolete by fetching part {}", replaced_part->name, part_name); + ProfileEvents::increment(ProfileEvents::ObsoleteReplicatedParts); + } } write_part_log({}); @@ -5315,13 +5404,13 @@ void StorageReplicatedMergeTree::clearOldPartsAndRemoveFromZK() } parts.clear(); - auto remove_parts_from_filesystem = [log=log,&zookeeper=zookeeper,&zookeeper_path=zookeeper_path,&replica_name=replica_name] (const DataPartsVector & parts_to_remove) + auto remove_parts_from_filesystem = [log=log] (const DataPartsVector & parts_to_remove) { for (const auto & part : parts_to_remove) { try { - bool keep_s3 = !part->unlockSharedData(zookeeper_path, replica_name, zookeeper); + bool keep_s3 = !part->unlockSharedData(); part->remove(keep_s3); } catch (...) 
@@ -6271,4 +6360,13 @@ void StorageReplicatedMergeTree::startBackgroundMovesIfNeeded() background_moves_executor.start(); } +StorageReplicatedMergeTree::ZooKeeperAccessData StorageReplicatedMergeTree::getZooKeeperAccessData() const +{ + ZooKeeperAccessData res; + res.zookeeper = tryGetZooKeeper(); + res.zookeeper_path = zookeeper_path; + res.replica_name = replica_name; + return res; +} + } diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index d396f32dcca..11dc475257e 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -211,6 +211,18 @@ public: /// is not overloaded bool canExecuteFetch(const ReplicatedMergeTreeLogEntry & entry, String & disable_reason) const; + struct ZooKeeperAccessData + { + zkutil::ZooKeeperPtr zookeeper; + String zookeeper_path; + String replica_name; + }; + + ZooKeeperAccessData getZooKeeperAccessData() const; + + /// Fetch part only when it stored on shared storage like S3 + bool executeFetchShared(ReplicatedMergeTreeLogEntry & entry); + private: /// Get a sequential consistent view of current parts. ReplicatedMergeTreeQuorumAddedParts::PartitionIdToMaxBlock getMaxAddedBlocks() const; @@ -369,8 +381,7 @@ private: String getChecksumsForZooKeeper(const MergeTreeDataPartChecksums & checksums) const; /// Accepts a PreComitted part, atomically checks its checksums with ones on other replicas and commit the part - DataPartsVector checkPartChecksumsAndCommit(Transaction & transaction, - const DataPartPtr & part); + DataPartsVector checkPartChecksumsAndCommit(Transaction & transaction, const DataPartPtr & part); bool partIsAssignedToBackgroundOperation(const DataPartPtr & part) const override; @@ -487,6 +498,10 @@ private: */ String findReplicaHavingPart(const String & part_name, bool active); + /** Returns a replica with part on shared storage like S3. + */ + String findReplicaHavingSharedPart(const String & part_name, bool active); + bool checkReplicaHavePart(const String & replica, const String & part_name); /** Find replica having specified part or any part that covers it. 
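findReplicaHavingSharedPart() and getSharedDataReplica() both have to pick one of possibly several replicas that hold the shared part; getSharedDataReplica() reuses the ranking from fetchPartition(): prefer the replica with the largest log_pointer and, on ties, the smallest replication queue (the patch itself notes this should move into a common method). A self-contained sketch of that rule; the struct and the numbers are made up for illustration:

```cpp
#include <cstdint>
#include <iostream>
#include <limits>
#include <string>
#include <vector>

/// Illustrative replica descriptor: the fields mirror the ZooKeeper values the selection
/// reads (log_pointer and the size of the replication queue).
struct ReplicaCandidate
{
    std::string name;
    int64_t log_pointer = 0;   /// how far the replica has processed the shared log
    uint64_t queue_size = 0;   /// entries still pending in its replication queue
};

int main()
{
    std::vector<ReplicaCandidate> candidates = {{"r1", 100, 8}, {"r2", 120, 3}, {"r3", 120, 1}};

    std::string best_replica;
    int64_t max_log_pointer = -1;
    uint64_t min_queue_size = std::numeric_limits<uint64_t>::max();

    for (const auto & replica : candidates)
    {
        if (replica.log_pointer > max_log_pointer
            || (replica.log_pointer == max_log_pointer && replica.queue_size < min_queue_size))
        {
            max_log_pointer = replica.log_pointer;
            min_queue_size = replica.queue_size;
            best_replica = replica.name;
        }
    }

    std::cout << best_replica << '\n';   /// prints "r3": most advanced log, least loaded queue
}
```

Note that log_pointer is compared first, so a stale but idle replica never wins over a more up-to-date one; queue size only breaks ties.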
@@ -508,7 +523,10 @@ private: const String & replica_path, bool to_detached, size_t quorum, - zkutil::ZooKeeper::Ptr zookeeper_ = nullptr); + zkutil::ZooKeeper::Ptr zookeeper_ = nullptr, + bool replace_exists = false, + DiskPtr replaced_disk = nullptr, + String replaced_part_path = ""); /// Required only to avoid races between executeLogEntry and fetchPartition std::unordered_set currently_fetching_parts; From 97b5179e55496b42f93db35cc6b8957be5a6b0bf Mon Sep 17 00:00:00 2001 From: Pavel Kruglov Date: Tue, 19 Jan 2021 22:21:06 +0300 Subject: [PATCH 0073/2357] Implement HedgedRequests --- src/Client/Connection.cpp | 115 ++-- src/Client/Connection.h | 17 +- src/Client/ConnectionPoolWithFailover.cpp | 235 +++++++- src/Client/ConnectionPoolWithFailover.h | 61 +++ src/Client/GetHedgedConnections.cpp | 512 ++++++++++++++++++ src/Client/GetHedgedConnections.h | 154 ++++++ src/Client/HedgedConnections.cpp | 389 +++++++++++++ src/Client/HedgedConnections.h | 93 ++++ src/Client/IConnections.h | 57 ++ src/Client/MultiplexedConnections.cpp | 2 +- src/Client/MultiplexedConnections.h | 39 +- src/Client/ya.make | 2 + src/Common/Epoll.cpp | 82 +++ src/Common/Epoll.h | 44 ++ src/Common/ErrorCodes.cpp | 1 + src/Common/PoolWithFailoverBase.h | 94 ++-- src/Common/TimerDescriptor.cpp | 2 +- src/Common/TimerDescriptor.h | 16 +- src/Common/ya.make | 1 + src/Core/Defines.h | 4 + src/Core/Settings.h | 9 + src/DataStreams/RemoteQueryExecutor.cpp | 77 +-- src/DataStreams/RemoteQueryExecutor.h | 9 +- .../RemoteQueryExecutorReadContext.cpp | 71 +-- .../RemoteQueryExecutorReadContext.h | 19 +- src/IO/ConnectionTimeouts.h | 45 +- src/IO/ConnectionTimeoutsContext.h | 11 +- src/IO/ReadBufferFromPocoSocket.cpp | 2 +- src/IO/ReadBufferFromPocoSocket.h | 6 +- src/Processors/Executors/PollingQueue.cpp | 35 +- src/Processors/Executors/PollingQueue.h | 3 +- src/Server/TCPHandler.cpp | 25 + .../configs/remote_servers.xml | 18 + .../test_hedged_requests/configs/users.xml | 10 + .../test_hedged_requests/configs/users1.xml | 7 + .../integration/test_hedged_requests/test.py | 76 +++ 36 files changed, 2054 insertions(+), 289 deletions(-) create mode 100644 src/Client/GetHedgedConnections.cpp create mode 100644 src/Client/GetHedgedConnections.h create mode 100644 src/Client/HedgedConnections.cpp create mode 100644 src/Client/HedgedConnections.h create mode 100644 src/Client/IConnections.h create mode 100644 src/Common/Epoll.cpp create mode 100644 src/Common/Epoll.h create mode 100644 tests/integration/test_hedged_requests/configs/remote_servers.xml create mode 100644 tests/integration/test_hedged_requests/configs/users.xml create mode 100644 tests/integration/test_hedged_requests/configs/users1.xml create mode 100644 tests/integration/test_hedged_requests/test.py diff --git a/src/Client/Connection.cpp b/src/Client/Connection.cpp index ef114490c51..15f530f4085 100644 --- a/src/Client/Connection.cpp +++ b/src/Client/Connection.cpp @@ -64,53 +64,7 @@ void Connection::connect(const ConnectionTimeouts & timeouts) if (connected) disconnect(); - LOG_TRACE(log_wrapper.get(), "Connecting. Database: {}. User: {}{}{}", - default_database.empty() ? "(not specified)" : default_database, - user, - static_cast(secure) ? ". Secure" : "", - static_cast(compression) ? "" : ". Uncompressed"); - - if (static_cast(secure)) - { -#if USE_SSL - socket = std::make_unique(); - - /// we resolve the ip when we open SecureStreamSocket, so to make Server Name Indication (SNI) - /// work we need to pass host name separately. 
It will be send into TLS Hello packet to let - /// the server know which host we want to talk with (single IP can process requests for multiple hosts using SNI). - static_cast(socket.get())->setPeerHostName(host); -#else - throw Exception{"tcp_secure protocol is disabled because poco library was built without NetSSL support.", ErrorCodes::SUPPORT_IS_DISABLED}; -#endif - } - else - { - socket = std::make_unique(); - } - - current_resolved_address = DNSResolver::instance().resolveAddress(host, port); - - const auto & connection_timeout = static_cast(secure) ? timeouts.secure_connection_timeout : timeouts.connection_timeout; - socket->connect(*current_resolved_address, connection_timeout); - socket->setReceiveTimeout(timeouts.receive_timeout); - socket->setSendTimeout(timeouts.send_timeout); - socket->setNoDelay(true); - if (timeouts.tcp_keep_alive_timeout.totalSeconds()) - { - socket->setKeepAlive(true); - socket->setOption(IPPROTO_TCP, -#if defined(TCP_KEEPALIVE) - TCP_KEEPALIVE -#else - TCP_KEEPIDLE // __APPLE__ -#endif - , timeouts.tcp_keep_alive_timeout); - } - - in = std::make_shared(*socket); - out = std::make_shared(*socket); - - connected = true; + prepare(timeouts); sendHello(); receiveHello(); @@ -146,6 +100,57 @@ void Connection::disconnect() connected = false; } +void Connection::prepare(const ConnectionTimeouts & timeouts) +{ + LOG_TRACE(log_wrapper.get(), "Connecting. Database: {}. User: {}{}{}", + default_database.empty() ? "(not specified)" : default_database, + user, + static_cast(secure) ? ". Secure" : "", + static_cast(compression) ? "" : ". Uncompressed"); + + if (static_cast(secure)) + { +#if USE_SSL + socket = std::make_unique(); + + /// we resolve the ip when we open SecureStreamSocket, so to make Server Name Indication (SNI) + /// work we need to pass host name separately. It will be send into TLS Hello packet to let + /// the server know which host we want to talk with (single IP can process requests for multiple hosts using SNI). + static_cast(socket.get())->setPeerHostName(host); +#else + throw Exception{"tcp_secure protocol is disabled because poco library was built without NetSSL support.", ErrorCodes::SUPPORT_IS_DISABLED}; +#endif + } + else + { + socket = std::make_unique(); + } + + current_resolved_address = DNSResolver::instance().resolveAddress(host, port); + + const auto & connection_timeout = static_cast(secure) ? 
timeouts.secure_connection_timeout : timeouts.connection_timeout; + socket->connect(*current_resolved_address, connection_timeout); + socket->setReceiveTimeout(timeouts.receive_timeout); + socket->setSendTimeout(timeouts.send_timeout); + socket->setNoDelay(true); + if (timeouts.tcp_keep_alive_timeout.totalSeconds()) + { + socket->setKeepAlive(true); + socket->setOption(IPPROTO_TCP, +#if defined(TCP_KEEPALIVE) + TCP_KEEPALIVE +#else + TCP_KEEPIDLE // __APPLE__ +#endif + , timeouts.tcp_keep_alive_timeout); + } + + in = std::make_shared(*socket); + out = std::make_shared(*socket); + + connected = true; +} + void Connection::sendHello() { @@ -334,8 +339,6 @@ void Connection::sendClusterNameAndSalt() bool Connection::ping() { - // LOG_TRACE(log_wrapper.get(), "Ping"); - TimeoutSetter timeout_setter(*socket, sync_request_timeout, true); try { @@ -379,10 +382,21 @@ TablesStatusResponse Connection::getTablesStatus(const ConnectionTimeouts & time TimeoutSetter timeout_setter(*socket, sync_request_timeout, true); + sendTablesStatusRequest(request); + TablesStatusResponse response = receiveTablesStatusResponse(); + + return response; +} + +void Connection::sendTablesStatusRequest(const TablesStatusRequest & request) +{ writeVarUInt(Protocol::Client::TablesStatusRequest, *out); request.write(*out, server_revision); out->next(); +} +TablesStatusResponse Connection::receiveTablesStatusResponse() +{ UInt64 response_type = 0; readVarUInt(response_type, *in); @@ -396,7 +410,6 @@ TablesStatusResponse Connection::getTablesStatus(const ConnectionTimeouts & time return response; } - void Connection::sendQuery( const ConnectionTimeouts & timeouts, const String & query, @@ -742,7 +755,7 @@ std::optional Connection::checkPacket(size_t timeout_microseconds) } -Packet Connection::receivePacket(std::function async_callback) +Packet Connection::receivePacket(AsyncCallback async_callback) { in->setAsyncCallback(std::move(async_callback)); SCOPE_EXIT(in->setAsyncCallback({})); diff --git a/src/Client/Connection.h b/src/Client/Connection.h index 83e8f3ba206..7c96634c21a 100644 --- a/src/Client/Connection.h +++ b/src/Client/Connection.h @@ -173,7 +173,7 @@ public: /// Receive packet from server. /// Each time read blocks and async_callback is set, it will be called. You can poll socket inside it. - Packet receivePacket(std::function async_callback = {}); + Packet receivePacket(AsyncCallback async_callback = {}); /// If not connected yet, or if connection is broken - then connect. If cannot connect - throw an exception. void forceConnected(const ConnectionTimeouts & timeouts); @@ -192,6 +192,19 @@ public: size_t outBytesCount() const { return out ? out->count() : 0; } size_t inBytesCount() const { return in ? 
in->count() : 0; } + /// Make preparation before sending Hello in connect + void prepare(const ConnectionTimeouts & timeouts); + + void sendHello(); + + void receiveHello(); + + void sendTablesStatusRequest(const TablesStatusRequest & request); + + TablesStatusResponse receiveTablesStatusResponse(); + + Poco::Net::Socket * getSocket() { return socket.get(); } + private: String host; UInt16 port; @@ -280,8 +293,6 @@ private: LoggerWrapper log_wrapper; void connect(const ConnectionTimeouts & timeouts); - void sendHello(); - void receiveHello(); #if USE_SSL void sendClusterNameAndSalt(); diff --git a/src/Client/ConnectionPoolWithFailover.cpp b/src/Client/ConnectionPoolWithFailover.cpp index 1ca61dc8059..df4541ecf7e 100644 --- a/src/Client/ConnectionPoolWithFailover.cpp +++ b/src/Client/ConnectionPoolWithFailover.cpp @@ -132,6 +132,8 @@ std::vector ConnectionPoolWithFailover::getMany(const Co const Settings * settings, PoolMode pool_mode) { + LOG_DEBUG(log, "ConnectionPoolWithFailover getMany"); + TryGetEntryFunc try_get_entry = [&](NestedPool & pool, std::string & fail_message) { return tryGetEntry(pool, timeouts, fail_message, settings); @@ -164,6 +166,9 @@ std::vector ConnectionPoolWithFailover::g const Settings * settings, PoolMode pool_mode, const QualifiedTableName & table_to_check) { + + LOG_DEBUG(log, "ConnectionPoolWithFailover getManyChecked"); + TryGetEntryFunc try_get_entry = [&](NestedPool & pool, std::string & fail_message) { return tryGetEntry(pool, timeouts, fail_message, settings, &table_to_check); @@ -172,11 +177,49 @@ std::vector ConnectionPoolWithFailover::g return getManyImpl(settings, pool_mode, try_get_entry); } +ConnectionPoolWithFailover::Base::GetPriorityFunc ConnectionPoolWithFailover::makeGetPriorityFunc(const Settings * settings) +{ + size_t offset = 0; + if (settings) + offset = settings->load_balancing_first_offset % nested_pools.size(); + + GetPriorityFunc get_priority; + switch (settings ? LoadBalancing(settings->load_balancing) : default_load_balancing) + { + case LoadBalancing::NEAREST_HOSTNAME: + get_priority = [&](size_t i) { return hostname_differences[i]; }; + break; + case LoadBalancing::IN_ORDER: + get_priority = [](size_t i) { return i; }; + break; + case LoadBalancing::RANDOM: + break; + case LoadBalancing::FIRST_OR_RANDOM: + get_priority = [offset](size_t i) -> size_t { return i != offset; }; + break; + case LoadBalancing::ROUND_ROBIN: + if (last_used >= nested_pools.size()) + last_used = 0; + ++last_used; + /* Consider nested_pools.size() equals to 5 + * last_used = 1 -> get_priority: 0 1 2 3 4 + * last_used = 2 -> get_priority: 5 0 1 2 3 + * last_used = 3 -> get_priority: 5 4 0 1 2 + * ... + * */ + get_priority = [&](size_t i) { ++i; return i < last_used ? nested_pools.size() - i : i - last_used; }; + break; + } + + return get_priority; +} + std::vector ConnectionPoolWithFailover::getManyImpl( const Settings * settings, PoolMode pool_mode, const TryGetEntryFunc & try_get_entry) { + LOG_DEBUG(log, "ConnectionPoolWithFailover getManyImpl"); size_t min_entries = (settings && settings->skip_unavailable_shards) ? 0 : 1; size_t max_tries = (settings ? size_t{settings->connections_with_failover_max_tries} : @@ -194,36 +237,7 @@ std::vector ConnectionPoolWithFailover::g else throw DB::Exception("Unknown pool allocation mode", DB::ErrorCodes::LOGICAL_ERROR); - size_t offset = 0; - if (settings) - offset = settings->load_balancing_first_offset % nested_pools.size(); - GetPriorityFunc get_priority; - switch (settings ? 
LoadBalancing(settings->load_balancing) : default_load_balancing) - { - case LoadBalancing::NEAREST_HOSTNAME: - get_priority = [&](size_t i) { return hostname_differences[i]; }; - break; - case LoadBalancing::IN_ORDER: - get_priority = [](size_t i) { return i; }; - break; - case LoadBalancing::RANDOM: - break; - case LoadBalancing::FIRST_OR_RANDOM: - get_priority = [offset](size_t i) -> size_t { return i != offset; }; - break; - case LoadBalancing::ROUND_ROBIN: - if (last_used >= nested_pools.size()) - last_used = 0; - ++last_used; - /* Consider nested_pools.size() equals to 5 - * last_used = 1 -> get_priority: 0 1 2 3 4 - * last_used = 2 -> get_priority: 5 0 1 2 3 - * last_used = 3 -> get_priority: 5 4 0 1 2 - * ... - * */ - get_priority = [&](size_t i) { ++i; return i < last_used ? nested_pools.size() - i : i - last_used; }; - break; - } + GetPriorityFunc get_priority = makeGetPriorityFunc(settings); UInt64 max_ignored_errors = settings ? settings->distributed_replica_max_ignored_errors.value : 0; bool fallback_to_stale_replicas = settings ? settings->fallback_to_stale_replicas_for_distributed_queries.value : true; @@ -244,8 +258,11 @@ ConnectionPoolWithFailover::tryGetEntry( TryResult result; try { + LOG_DEBUG(log, "ConnectionPoolWithFailover tryGetEntry"); result.entry = pool.get(timeouts, settings, /* force_connected = */ false); + LOG_DEBUG(log, "ConnectionPoolWithFailover isConnected {}", result.entry->isConnected()); + UInt64 server_revision = 0; if (table_to_check) server_revision = result.entry->getServerRevision(timeouts); @@ -314,4 +331,162 @@ ConnectionPoolWithFailover::tryGetEntry( return result; } +std::vector ConnectionPoolWithFailover::getShuffledPools(const Settings * settings) +{ + GetPriorityFunc get_priority = makeGetPriorityFunc(settings); + UInt64 max_ignored_errors = settings ? settings->distributed_replica_max_ignored_errors.value : 0; + return Base::getShuffledPools(max_ignored_errors, get_priority); +} + +TryGetConnection::TryGetConnection( + IConnectionPool * pool_, + const ConnectionTimeouts * timeouts_, + const Settings * settings_, + std::shared_ptr table_to_check_, + Poco::Logger * log_) : + pool(pool_), timeouts(timeouts_), settings(settings_), + table_to_check(table_to_check_), log(log_), stage(Stage::CONNECT), socket_fd(-1) +{ +} + +void TryGetConnection::reset() +{ + resetResult(); + stage = Stage::CONNECT; + epoll = nullptr; + socket_fd = -1; + fail_message.clear(); +} + +void TryGetConnection::resetResult() +{ + if (!result.entry.isNull()) + { + result.entry->disconnect(); + result.reset(); + } +} + +void TryGetConnection::processFail(bool add_description) +{ + if (epoll) + epoll->remove(socket_fd); + + fail_message = getCurrentExceptionMessage(/* with_stacktrace = */ false); + if (add_description) + fail_message += " (" + result.entry->getDescription() + ")"; + resetResult(); + socket_fd = -1; + stage = Stage::FAILED; +} + +void TryGetConnection::run() +{ + try + { + if (stage == Stage::CONNECT) + { + result.entry = pool->get(*timeouts, settings, /* force_connected = */ false); + + if (!result.entry->isConnected()) + { + result.entry->prepare(*timeouts); + socket_fd = result.entry->getSocket()->impl()->sockfd(); + result.entry->sendHello(); + stage = Stage::RECEIVE_HELLO; + /// We are waiting for hello from replica. 
+ return; + } + + socket_fd = result.entry->getSocket()->impl()->sockfd(); + stage = Stage::START_CHECK_TABLE; + } + + if (stage == Stage::RECEIVE_HELLO) + { + result.entry->receiveHello(); + stage = Stage::START_CHECK_TABLE; + } + + if (stage == Stage::START_CHECK_TABLE) + { + UInt64 server_revision = 0; + if (table_to_check) + server_revision = result.entry->getServerRevision(*timeouts); + + if (!table_to_check || server_revision < DBMS_MIN_REVISION_WITH_TABLES_STATUS) + { + result.entry->forceConnected(*timeouts); + result.is_usable = true; + result.is_up_to_date = true; + stage = FINISHED; + return; + } + + TablesStatusRequest status_request; + status_request.tables.emplace(*table_to_check); + + result.entry->sendTablesStatusRequest(status_request); + stage = Stage::RECEIVE_TABLES_STATUS; + /// We are waiting for tables status response. + return; + } + + if (stage == Stage::RECEIVE_TABLES_STATUS) + { + TablesStatusResponse status_response = result.entry->receiveTablesStatusResponse(); + auto table_status_it = status_response.table_states_by_id.find(*table_to_check); + if (table_status_it == status_response.table_states_by_id.end()) + { + const char * message_pattern = "There is no table {}.{} on server: {}"; + fail_message = fmt::format(message_pattern, backQuote(table_to_check->database), backQuote(table_to_check->table), result.entry->getDescription()); + LOG_WARNING(log, fail_message); + ProfileEvents::increment(ProfileEvents::DistributedConnectionMissingTable); + stage = Stage::FINISHED; + return; + } + + result.is_usable = true; + + UInt64 max_allowed_delay = settings ? UInt64(settings->max_replica_delay_for_distributed_queries) : 0; + if (!max_allowed_delay) + { + result.is_up_to_date = true; + stage = Stage::FINISHED; + return; + } + + UInt32 delay = table_status_it->second.absolute_delay; + + if (delay < max_allowed_delay) + result.is_up_to_date = true; + else + { + result.is_up_to_date = false; + result.staleness = delay; + + LOG_TRACE(log, "Server {} has unacceptable replica delay for table {}.{}: {}", result.entry->getDescription(), table_to_check->database, table_to_check->table, delay); + ProfileEvents::increment(ProfileEvents::DistributedConnectionStaleReplica); + } + } + + stage = Stage::FINISHED; + } + catch (Poco::Net::NetException & e) + { + processFail(true); + } + catch (Poco::TimeoutException & e) + { + processFail(true); + } + catch (const Exception & e) + { + if (e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF) + throw; + + processFail(false); + } +} + } diff --git a/src/Client/ConnectionPoolWithFailover.h b/src/Client/ConnectionPoolWithFailover.h index 7d5f713f6a9..c57a7bb984a 100644 --- a/src/Client/ConnectionPoolWithFailover.h +++ b/src/Client/ConnectionPoolWithFailover.h @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -31,6 +32,55 @@ enum class PoolMode GET_ALL }; +/// Class for establishing connection with replica without blocking. +class TryGetConnection +{ +public: + enum Stage + { + CONNECT = 0, + RECEIVE_HELLO = 1, + START_CHECK_TABLE = 2, + RECEIVE_TABLES_STATUS = 3, + FINISHED = 4, + FAILED = 5, + }; + + using TryResult = PoolWithFailoverBase::TryResult; + + TryGetConnection(IConnectionPool * pool_, + const ConnectionTimeouts * timeouts_, + const Settings * settings_, + std::shared_ptr table_to_check = nullptr, + Poco::Logger * log_ = nullptr); + + /// Continue connecting to replica from previous stage. Initial stage is CONNECT. + void run(); + + void resetResult(); + + /// Reset class to initial stage. 
+ void reset(); + + /// If connection is failed and epoll is set, before disconnecting + /// socket will be removed from epoll. + void setEpoll(Epoll * epoll_) { epoll = epoll_; } + + /// Process fail connection. + void processFail(bool add_description = false); + + IConnectionPool * pool; + const ConnectionTimeouts * timeouts; + std::string fail_message; + const Settings * settings; + std::shared_ptr table_to_check; + Poco::Logger * log; + TryResult result; + Stage stage; + int socket_fd; + Epoll * epoll = nullptr; +}; + class ConnectionPoolWithFailover : public IConnectionPool, private PoolWithFailoverBase { public: @@ -80,6 +130,15 @@ public: using Status = std::vector; Status getStatus() const; + std::vector getShuffledPools(const Settings * settings); + + size_t getMaxErrorCup() const { return Base::max_error_cap; } + + void updateSharedError(std::vector & shuffled_pools) + { + Base::updateSharedErrorCounts(shuffled_pools); + } + private: /// Get the values of relevant settings and call Base::getMany() std::vector getManyImpl( @@ -97,6 +156,8 @@ private: const Settings * settings, const QualifiedTableName * table_to_check = nullptr); + GetPriorityFunc makeGetPriorityFunc(const Settings * settings); + private: std::vector hostname_differences; /// Distances from name of this host to the names of hosts of pools. size_t last_used = 0; /// Last used for round_robin policy. diff --git a/src/Client/GetHedgedConnections.cpp b/src/Client/GetHedgedConnections.cpp new file mode 100644 index 00000000000..3b30650e6e5 --- /dev/null +++ b/src/Client/GetHedgedConnections.cpp @@ -0,0 +1,512 @@ +#include +#include + + +namespace DB +{ +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; + extern const int ALL_CONNECTION_TRIES_FAILED; +} + +GetHedgedConnections::GetHedgedConnections( + const ConnectionPoolWithFailoverPtr & pool_, + const Settings * settings_, + const ConnectionTimeouts & timeouts_, + std::shared_ptr table_to_check_) + : pool(pool_), settings(settings_), timeouts(timeouts_), table_to_check(table_to_check_) +{ + log = &Poco::Logger::get("GetHedgedConnections"); + shuffled_pools = pool->getShuffledPools(settings); + for (size_t i = 0; i != shuffled_pools.size(); ++i) + try_get_connections.emplace_back(shuffled_pools[i].pool, &timeouts, settings, table_to_check, log); + + max_tries + = (settings ? size_t{settings->connections_with_failover_max_tries} : size_t{DBMS_CONNECTION_POOL_WITH_FAILOVER_DEFAULT_MAX_TRIES}); + + fallback_to_stale_replicas = settings ? 
settings->fallback_to_stale_replicas_for_distributed_queries : false; +} + +GetHedgedConnections::~GetHedgedConnections() +{ + pool->updateSharedError(shuffled_pools); +} + +GetHedgedConnections::Replicas GetHedgedConnections::getConnections() +{ + entries_count = 0; + usable_count = 0; + failed_pools_count = 0; + + ReplicaStatePtr replica = &first_replica; + int index = 0; + + while (index != -1 || epoll.size() != 0) + { + if (index != -1) + { + Action action = startTryGetConnection(index, replica); + if (action == Action::TRY_NEXT_REPLICA) + { + index = getNextIndex(index); + continue; + } + + if (action == Action::FINISH) + { + swapReplicasIfNeeded(); + return {&first_replica, &second_replica}; + } + } + + /// Process epoll events + replica = processEpollEvents(); + if (replica->isReady()) + { + swapReplicasIfNeeded(); + return {&first_replica, &second_replica}; + } + + index = getNextIndex(index); + } + + /// We reach this point only if there was no up to date replica + + if (usable_count == 0) + { + if (settings && settings->skip_unavailable_shards) + { + first_replica.state = State::CANNOT_CHOOSE; + second_replica.state = State::CANNOT_CHOOSE; + return {&first_replica, &second_replica}; + } + + throw NetException("All connection tries failed. Log: \n\n" + fail_messages + "\n", ErrorCodes::ALL_CONNECTION_TRIES_FAILED); + } + if (!fallback_to_stale_replicas) + throw DB::Exception("Could not find connection to up-to-date replica.", DB::ErrorCodes::ALL_REPLICAS_ARE_STALE); + + setBestUsableReplica(first_replica); + return {&first_replica, &second_replica}; +} + +void GetHedgedConnections::chooseSecondReplica() +{ + LOG_DEBUG(log, "choose second replica"); + + if (second_replica.isCannotChoose() || second_replica.isReady()) + return; + + int index; + if (second_replica.isNotReady()) + index = second_replica.index; + else + index = first_replica.index; + + while (true) + { + if (second_replica.isEmpty()) + { + + index = getNextIndex(index); + if (index == -1) + break; + + Action action = startTryGetConnection(index, &second_replica); + + if (action == Action::TRY_NEXT_REPLICA) + continue; + + /// Second replica is ready or we are waiting for response from it + return; + } + + if (!second_replica.isNotReady()) + throw Exception("Second replica state must be 'NOT_READY' before process epoll events", ErrorCodes::LOGICAL_ERROR); + + ReplicaStatePtr replica = processEpollEvents( true); + + if (replica != &second_replica) + throw Exception("Epoll could return only second replica here", ErrorCodes::LOGICAL_ERROR); + + /// If replica is not empty than it is ready or we are waiting for a response from it + if (!second_replica.isEmpty()) + return; + } + + /// There is no up to date replica + + LOG_DEBUG(log, "there is no up to date replica for second replica"); + + if (!fallback_to_stale_replicas || usable_count <= 1) + second_replica.state = State::CANNOT_CHOOSE; + else + setBestUsableReplica(second_replica, first_replica.index); +} + +void GetHedgedConnections::stopChoosingSecondReplica() +{ + LOG_DEBUG(log, "stop choosing second replica"); + + if (!second_replica.isNotReady()) + throw Exception("Can't stop choosing second replica, because it's not in process of choosing", ErrorCodes::LOGICAL_ERROR); + + removeTimeoutsFromReplica(&second_replica, epoll); + epoll.remove(second_replica.fd); + + try_get_connections[second_replica.index].reset(); + second_replica.reset(); +} + +int GetHedgedConnections::getNextIndex(int cur_index) +{ + /// Check if there is no more available replicas + if (cur_index 
== -1 || entries_count + failed_pools_count >= shuffled_pools.size()) + return -1; + + /// We can work with two replicas simultaneously and they must have different indexes + int skip_index = -1; + if (!first_replica.isEmpty()) + skip_index = first_replica.index; + else if (!second_replica.isEmpty()) + skip_index = second_replica.index; + + bool finish = false; + int next_index = cur_index; + while (!finish) + { + next_index = (next_index + 1) % shuffled_pools.size(); + + /// Check if we can try this replica + if (next_index != skip_index && (max_tries == 0 || shuffled_pools[next_index].error_count < max_tries) + && try_get_connections[next_index].stage != TryGetConnection::Stage::FINISHED) + finish = true; + + /// If we made a complete round, there is no replica to connect + else if (next_index == cur_index) + return -1; + } + + LOG_DEBUG(log, "get next index: {}", next_index); + + return next_index; +} + +GetHedgedConnections::Action GetHedgedConnections::startTryGetConnection(int index, ReplicaStatePtr replica) +{ + LOG_DEBUG(log, "start try get connection with {} replica", index); + TryGetConnection & try_get_connection = try_get_connections[index]; + + replica->state = State::NOT_READY; + replica->index = index; + + try_get_connection.reset(); + try_get_connection.run(); + + if (try_get_connection.stage != TryGetConnection::Stage::FAILED) + { + replica->fd = try_get_connection.socket_fd; + replica->connection = &*try_get_connection.result.entry; + } + + Action action = processTryGetConnectionStage(replica); + + if (action == Action::PROCESS_EPOLL_EVENTS) + { + epoll.add(try_get_connection.socket_fd); + try_get_connection.setEpoll(&epoll); + addTimeouts(replica); + } + + return action; +} + +GetHedgedConnections::Action +GetHedgedConnections::processTryGetConnectionStage(ReplicaStatePtr replica, bool remove_from_epoll) +{ + LOG_DEBUG(log, "process get connection stage for {} replica", replica->index); + TryGetConnection & try_get_connection = try_get_connections[replica->index]; + + if (try_get_connection.stage == TryGetConnection::Stage::FINISHED) + { + LOG_DEBUG(log, "stage: FINISHED"); + ++entries_count; + + if (remove_from_epoll) + epoll.remove(try_get_connection.socket_fd); + + if (try_get_connection.result.is_usable) + { + LOG_DEBUG(log, "replica is usable"); + ++usable_count; + if (try_get_connection.result.is_up_to_date) + { + LOG_DEBUG(log, "replica is up to date, finish get hedged connections"); + replica->state = State::READY; + return Action::FINISH; + } + + /// This replica is not up to date, we will try to find up to date + replica->reset(); + return Action::TRY_NEXT_REPLICA; + } + } + else if (try_get_connection.stage == TryGetConnection::Stage::FAILED) + { + LOG_DEBUG(log, "stage: FAILED"); + processFailedConnection(replica); + return Action::TRY_NEXT_REPLICA; + } + + LOG_DEBUG(log, "middle stage, process epoll events"); + + /// Get connection process is not finished + return Action::PROCESS_EPOLL_EVENTS; +} + +void GetHedgedConnections::processFailedConnection(ReplicaStatePtr replica) +{ + LOG_DEBUG(log, "failed connection with {} replica", replica->index); + + ShuffledPool & shuffled_pool = shuffled_pools[replica->index]; + LOG_WARNING( + log, "Connection failed at try №{}, reason: {}", (shuffled_pool.error_count + 1), try_get_connections[replica->index].fail_message); + ProfileEvents::increment(ProfileEvents::DistributedConnectionFailTry); + + shuffled_pool.error_count = std::min(pool->getMaxErrorCup(), shuffled_pool.error_count + 1); + + if 
(shuffled_pool.error_count >= max_tries) + { + ++failed_pools_count; + ProfileEvents::increment(ProfileEvents::DistributedConnectionFailAtAll); + } + + std::string & fail_message = try_get_connections[replica->index].fail_message; + if (!fail_message.empty()) + fail_messages += fail_message + "\n"; + + replica->reset(); +} + +void GetHedgedConnections::addTimeouts(ReplicaState * replica) +{ + LOG_DEBUG(log, "add timeouts for {} replica", replica->index); + + addTimeoutToReplica(TimerTypes::RECEIVE_TIMEOUT, replica, epoll, timeouts); + + /// If we haven't connected to second replica yet, set special timeout for it + if (second_replica.isEmpty()) + { + auto stage = try_get_connections[replica->index].stage; + if (stage == TryGetConnection::Stage::RECEIVE_HELLO) + addTimeoutToReplica(TimerTypes::RECEIVE_HELLO_TIMEOUT, replica, epoll, timeouts); + else if (stage == TryGetConnection::Stage::RECEIVE_TABLES_STATUS) + addTimeoutToReplica(TimerTypes::RECEIVE_TABLES_STATUS_TIMEOUT, replica, epoll, timeouts); + } +} + +void GetHedgedConnections::swapReplicasIfNeeded() +{ + if ((!first_replica.isReady() && second_replica.isReady())) + { + LOG_DEBUG(log, "swap replicas"); + swapReplicas(); + } +} + +GetHedgedConnections::ReplicaStatePtr GetHedgedConnections::processEpollEvents(bool non_blocking) +{ + LOG_DEBUG(log, "process epoll events"); + int event_fd; + ReplicaStatePtr replica; + bool finish = false; + while (!finish) + { + event_fd = getReadyFileDescriptor(epoll); + + if ((replica = isEventReplica(event_fd))) + finish = processReplicaEvent(replica, non_blocking); + + else if (auto * timeout_descriptor = isEventTimeout(event_fd, replica)) + { + processTimeoutEvent(replica, timeout_descriptor); + finish = true; + } + else + throw Exception("Unknown event from epoll", ErrorCodes::LOGICAL_ERROR); + } + + LOG_DEBUG(log, "cancel process epoll events"); + + return replica; +} + +GetHedgedConnections::ReplicaStatePtr GetHedgedConnections::isEventReplica(int event_fd) +{ + if (event_fd == first_replica.fd) + return &first_replica; + + if (event_fd == second_replica.fd) + return &second_replica; + + return nullptr; +} + +TimerDescriptorPtr GetHedgedConnections::isEventTimeout(int event_fd, ReplicaStatePtr & replica_out) +{ + if (first_replica.active_timeouts.find(event_fd) != first_replica.active_timeouts.end()) + { + replica_out = &first_replica; + return first_replica.active_timeouts[event_fd].get(); + } + + if (second_replica.active_timeouts.find(event_fd) != second_replica.active_timeouts.end()) + { + replica_out = &second_replica; + return second_replica.active_timeouts[event_fd].get(); + } + + return nullptr; +} + +int GetHedgedConnections::getReadyFileDescriptor(Epoll & epoll_, AsyncCallback async_callback) +{ + if (first_replica.connection && first_replica.connection->hasReadPendingData()) + return first_replica.fd; + + if (second_replica.connection && second_replica.connection->hasReadPendingData()) + return second_replica.fd; + + return epoll_.getReady(std::move(async_callback)).data.fd; +} + +bool GetHedgedConnections::processReplicaEvent(ReplicaStatePtr replica, bool non_blocking) +{ + LOG_DEBUG(log, "epoll event is {} replica", replica->index); + removeTimeoutsFromReplica(replica, epoll); + try_get_connections[replica->index].run(); + Action action = processTryGetConnectionStage(replica, true); + if (action == Action::PROCESS_EPOLL_EVENTS) + { + addTimeouts(replica); + return non_blocking; + } + + return true; +} + +void GetHedgedConnections::processTimeoutEvent(ReplicaStatePtr & replica, 
TimerDescriptorPtr timeout_descriptor) +{ + LOG_DEBUG(log, "epoll event is timeout for {} replica", replica->index); + + epoll.remove(timeout_descriptor->getDescriptor()); + replica->active_timeouts.erase(timeout_descriptor->getDescriptor()); + + if (timeout_descriptor->getType() == TimerTypes::RECEIVE_TIMEOUT) + { + LOG_DEBUG(log, "process receive timeout for {} replica", replica->index); + removeTimeoutsFromReplica(replica, epoll); + epoll.remove(replica->fd); + + TryGetConnection & try_get_connection = try_get_connections[replica->index]; + try_get_connection.fail_message = "Receive timeout expired (" + try_get_connection.result.entry->getDescription() + ")"; + try_get_connection.resetResult(); + try_get_connection.stage = TryGetConnection::Stage::FAILED; + processFailedConnection(replica); + } + + else if (timeout_descriptor->getType() == TimerTypes::RECEIVE_HELLO_TIMEOUT + || timeout_descriptor->getType() == TimerTypes::RECEIVE_TABLES_STATUS_TIMEOUT) + { + if (replica->index == second_replica.index || !second_replica.isEmpty()) + throw Exception( + "Received timeout to connect with second replica, but current replica is second or second replica is not empty", + ErrorCodes::LOGICAL_ERROR); + replica = &second_replica; + } +} + +void GetHedgedConnections::setBestUsableReplica(ReplicaState & replica, int skip_index) +{ + LOG_DEBUG(log, "set best usable replica"); + + std::vector indexes(try_get_connections.size()); + for (size_t i = 0; i != indexes.size(); ++i) + indexes[i] = i; + + /// Remove unusable and failed replicas, skip the replica with skip_index index + indexes.erase( + std::remove_if( + indexes.begin(), + indexes.end(), + [&](int i) { + return try_get_connections[i].result.entry.isNull() || !try_get_connections[i].result.is_usable || i == skip_index; + }), + indexes.end()); + + if (indexes.empty()) + throw Exception("There is no usable replica to choose", ErrorCodes::LOGICAL_ERROR); + + /// Sort replicas by staleness + std::stable_sort(indexes.begin(), indexes.end(), [&](size_t lhs, size_t rhs) { + return try_get_connections[lhs].result.staleness < try_get_connections[rhs].result.staleness; + }); + + replica.index = indexes[0]; + replica.connection = &*try_get_connections[indexes[0]].result.entry; + replica.state = State::READY; + replica.fd = replica.connection->getSocket()->impl()->sockfd(); +} + +void addTimeoutToReplica(int type, GetHedgedConnections::ReplicaStatePtr replica, Epoll & epoll, const ConnectionTimeouts & timeouts) +{ + Poco::Timespan timeout; + switch (type) + { + case TimerTypes::RECEIVE_HELLO_TIMEOUT: + timeout = timeouts.receive_hello_timeout; + break; + case TimerTypes::RECEIVE_TABLES_STATUS_TIMEOUT: + timeout = timeouts.receive_tables_status_timeout; + break; + case TimerTypes::RECEIVE_DATA_TIMEOUT: + timeout = timeouts.receive_data_timeout; + break; + case TimerTypes::RECEIVE_TIMEOUT: + timeout = timeouts.receive_timeout; + break; + default: + throw Exception("Unknown timeout type", ErrorCodes::BAD_ARGUMENTS); + } + + std::unique_ptr timeout_descriptor = std::make_unique(); + timeout_descriptor->setType(type); + timeout_descriptor->setRelative(timeout); + epoll.add(timeout_descriptor->getDescriptor()); + replica->active_timeouts[timeout_descriptor->getDescriptor()] = std::move(timeout_descriptor); +} + +void removeTimeoutsFromReplica(GetHedgedConnections::ReplicaStatePtr replica, Epoll & epoll) +{ + for (auto & [fd, _] : replica->active_timeouts) + epoll.remove(fd); + replica->active_timeouts.clear(); +} + +void removeTimeoutFromReplica(int type, 
 GetHedgedConnections::ReplicaStatePtr replica, Epoll & epoll)
+{
+    auto it = std::find_if(
+        replica->active_timeouts.begin(),
+        replica->active_timeouts.end(),
+        [type](auto & value){ return value.second->getType() == type; }
+    );
+
+    if (it != replica->active_timeouts.end())
+    {
+        epoll.remove(it->first);
+        replica->active_timeouts.erase(it);
+    }
+}
+
+}
diff --git a/src/Client/GetHedgedConnections.h b/src/Client/GetHedgedConnections.h
new file mode 100644
index 00000000000..c42dc24ddc7
--- /dev/null
+++ b/src/Client/GetHedgedConnections.h
@@ -0,0 +1,154 @@
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+
+namespace DB
+{
+
+/// Class for establishing hedged connections with replicas.
+/// It works with multiple replicas simultaneously without blocking
+/// (the current implementation supports only 2 replicas) by using epoll.
+class GetHedgedConnections
+{
+public:
+    using ShuffledPool = ConnectionPoolWithFailover::Base::ShuffledPool;
+
+    enum State
+    {
+        EMPTY = 0,
+        READY = 1,
+        NOT_READY = 2,
+        CANNOT_CHOOSE = 3,
+    };
+
+    struct ReplicaState
+    {
+        Connection * connection = nullptr;
+        State state = State::EMPTY;
+        int index = -1;
+        int fd = -1;
+        std::unordered_map> active_timeouts;
+
+        void reset()
+        {
+            connection = nullptr;
+            state = State::EMPTY;
+            index = -1;
+            fd = -1;
+            active_timeouts.clear();
+        }
+
+        bool isReady() const { return state == State::READY; };
+        bool isNotReady() const { return state == State::NOT_READY; };
+        bool isEmpty() const { return state == State::EMPTY; };
+        bool isCannotChoose() const { return state == State::CANNOT_CHOOSE; };
+    };
+
+    using ReplicaStatePtr = ReplicaState *;
+
+    struct Replicas
+    {
+        ReplicaStatePtr first_replica;
+        ReplicaStatePtr second_replica;
+    };
+
+    GetHedgedConnections(const ConnectionPoolWithFailoverPtr & pool_,
+                         const Settings * settings_,
+                         const ConnectionTimeouts & timeouts_,
+                         std::shared_ptr table_to_check_ = nullptr);
+
+    /// Establish connections with replicas. Return replicas as soon as the connection with one of them is finished.
+    /// The first replica is always in state READY and ready for sending the query; the second replica
+    /// may be in any state. To continue working with the second replica, call chooseSecondReplica().
+    Replicas getConnections();
+
+    /// Continue choosing the second replica; this function is not blocking. The second replica will be ready
+    /// for sending the query once it reaches state READY.
+    void chooseSecondReplica();
+
+    void stopChoosingSecondReplica();
+
+    void swapReplicas() { std::swap(first_replica, second_replica); }
+
+    /// Move the ready replica to the first place.
+    void swapReplicasIfNeeded();
+
+    /// Check if the file descriptor belongs to one of the replicas. If so, return this replica, otherwise return nullptr.
+    ReplicaStatePtr isEventReplica(int event_fd);
+
+    /// Check if the file descriptor belongs to a timeout of any replica.
+    /// If so, return the corresponding TimerDescriptor and set the timeout owner replica,
+    /// otherwise return nullptr.
+    TimerDescriptorPtr isEventTimeout(int event_fd, ReplicaStatePtr & replica);
+
+    /// Get a file descriptor that is ready for reading.
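+
+    /// Rough overall flow, mirroring how HedgedConnections drives this class (sketch only;
+    /// `pool`, `settings`, `timeouts` and `table_to_check` are assumed to come from the caller):
+    ///
+    ///     GetHedgedConnections getter(pool, &settings, timeouts, table_to_check);
+    ///     Replicas replicas = getter.getConnections();    /// returns once one replica is usable
+    ///     /// ... send the query through replicas.first_replica->connection ...
+    ///     /// if the first replica is slow to send data (RECEIVE_DATA_TIMEOUT expires),
+    ///     /// start the second one:
+    ///     getter.chooseSecondReplica();
+    ///     if (replicas.second_replica->isReady())
+    ///         sendQueryTo(replicas.second_replica->connection);   /// illustrative caller code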
+ int getReadyFileDescriptor(Epoll & epoll_, AsyncCallback async_callback = {}); + + int getFileDescriptor() const { return epoll.getFileDescriptor(); } + + const ConnectionTimeouts & getConnectionTimeouts() const { return timeouts; } + + ~GetHedgedConnections(); + +private: + + enum Action + { + FINISH = 0, + PROCESS_EPOLL_EVENTS = 1, + TRY_NEXT_REPLICA = 2, + }; + + Action startTryGetConnection(int index, ReplicaStatePtr replica); + + Action processTryGetConnectionStage(ReplicaStatePtr replica, bool remove_from_epoll = false); + + int getNextIndex(int cur_index = -1); + + void addTimeouts(ReplicaStatePtr replica); + + void processFailedConnection(ReplicaStatePtr replica); + + void processReceiveTimeout(ReplicaStatePtr replica); + + bool processReplicaEvent(ReplicaStatePtr replica, bool non_blocking); + + void processTimeoutEvent(ReplicaStatePtr & replica, TimerDescriptorPtr timeout_descriptor); + + ReplicaStatePtr processEpollEvents(bool non_blocking = false); + + void setBestUsableReplica(ReplicaState & replica, int skip_index = -1); + + const ConnectionPoolWithFailoverPtr pool; + const Settings * settings; + const ConnectionTimeouts timeouts; + std::shared_ptr table_to_check; + std::vector try_get_connections; + std::vector shuffled_pools; + ReplicaState first_replica; + ReplicaState second_replica; + bool fallback_to_stale_replicas; + Epoll epoll; + Poco::Logger * log; + std::string fail_messages; + size_t entries_count; + size_t usable_count; + size_t failed_pools_count; + size_t max_tries; + +}; + +/// Add timeout with particular type to replica and add it to epoll. +void addTimeoutToReplica(int type, GetHedgedConnections::ReplicaStatePtr replica, Epoll & epoll, const ConnectionTimeouts & timeouts); + +/// Remove timeout with particular type from replica and epoll. +void removeTimeoutFromReplica(int type, GetHedgedConnections::ReplicaStatePtr replica, Epoll & epoll); + +/// Remove all timeouts from replica and epoll. 
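+
+/// A short sketch of how these helpers are paired in practice (this mirrors HedgedConnections:
+/// a data-receive timeout is armed after sending the query and cleared when the first data
+/// packet arrives; `replica`, `epoll` and `timeouts` are assumed to exist in the caller):
+///
+///     addTimeoutToReplica(TimerTypes::RECEIVE_DATA_TIMEOUT, replica, epoll, timeouts);
+///     /// ... later, when epoll reports the replica's socket and a Data packet is read ...
+///     removeTimeoutsFromReplica(replica, epoll);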
+void removeTimeoutsFromReplica(GetHedgedConnections::ReplicaStatePtr replica, Epoll & epoll); + +} diff --git a/src/Client/HedgedConnections.cpp b/src/Client/HedgedConnections.cpp new file mode 100644 index 00000000000..57315bcd6fe --- /dev/null +++ b/src/Client/HedgedConnections.cpp @@ -0,0 +1,389 @@ +#include +#include + +namespace DB +{ +namespace ErrorCodes +{ + extern const int MISMATCH_REPLICAS_DATA_SOURCES; + extern const int LOGICAL_ERROR; + extern const int SOCKET_TIMEOUT; +} + +HedgedConnections::HedgedConnections( + const ConnectionPoolWithFailoverPtr & pool_, + const Settings & settings_, + const ConnectionTimeouts & timeouts_, + const ThrottlerPtr & throttler_, + std::shared_ptr table_to_check_) + : get_hedged_connections(pool_, &settings_, timeouts_, table_to_check_), settings(settings_), throttler(throttler_), log(&Poco::Logger::get("HedgedConnections")) +{ + replicas = get_hedged_connections.getConnections(); + + /// First replica may have state CANNOT_CHOOSE if setting skip_unavailable_shards is enabled + if (replicas.first_replica->isReady()) + replicas.first_replica->connection->setThrottler(throttler); + + if (!replicas.second_replica->isCannotChoose()) + { + if (replicas.second_replica->isNotReady()) + epoll.add(get_hedged_connections.getFileDescriptor()); + + auto set_throttler = [this, throttler_](ReplicaStatePtr replica) + { + replica->connection->setThrottler(throttler_); + }; + second_replica_pipeline.add(std::function(set_throttler)); + } +} + +void HedgedConnections::Pipeline::add(std::function send_function) +{ + pipeline.push_back(send_function); +} + +void HedgedConnections::Pipeline::run(ReplicaStatePtr replica) +{ + for (auto & send_func : pipeline) + send_func(replica); + + pipeline.clear(); +} + +size_t HedgedConnections::size() const +{ + if (replicas.first_replica->isReady() || replicas.second_replica->isReady()) + return 1; + + return 0; +} + +bool HedgedConnections::hasActiveConnections() const +{ + return replicas.first_replica->isReady() || replicas.second_replica->isReady(); +} + +void HedgedConnections::sendScalarsData(Scalars & data) +{ + std::lock_guard lock(cancel_mutex); + + if (!sent_query) + throw Exception("Cannot send scalars data: query not yet sent.", ErrorCodes::LOGICAL_ERROR); + + auto send_scalars_data = [&data](ReplicaStatePtr replica) { replica->connection->sendScalarsData(data); }; + + if (replicas.first_replica->isReady()) + send_scalars_data(replicas.first_replica); + + if (replicas.second_replica->isReady()) + send_scalars_data(replicas.second_replica); + else if (!replicas.second_replica->isCannotChoose()) + second_replica_pipeline.add(std::function(send_scalars_data)); +} + +void HedgedConnections::sendExternalTablesData(std::vector & data) +{ + std::lock_guard lock(cancel_mutex); + + if (!sent_query) + throw Exception("Cannot send external tables data: query not yet sent.", ErrorCodes::LOGICAL_ERROR); + + if (data.size() != size()) + throw Exception("Mismatch between replicas and data sources", ErrorCodes::MISMATCH_REPLICAS_DATA_SOURCES); + + auto send_external_tables_data = [&data](ReplicaStatePtr replica) { replica->connection->sendExternalTablesData(data[0]); }; + + if (replicas.first_replica->isReady()) + send_external_tables_data(replicas.first_replica); + + if (replicas.second_replica->isReady()) + send_external_tables_data(replicas.second_replica); + else if (!replicas.second_replica->isCannotChoose()) + second_replica_pipeline.add(send_external_tables_data); +} + +void HedgedConnections::sendQuery( + const 
ConnectionTimeouts & timeouts, + const String & query, + const String & query_id, + UInt64 stage, + const ClientInfo & client_info, + bool with_pending_data) +{ + std::lock_guard lock(cancel_mutex); + + if (sent_query) + throw Exception("Query already sent.", ErrorCodes::LOGICAL_ERROR); + + auto send_query = [this, timeouts, query, query_id, stage, client_info, with_pending_data](ReplicaStatePtr replica) + { + Settings modified_settings = settings; + if (replica->connection->getServerRevision(timeouts) < DBMS_MIN_REVISION_WITH_CURRENT_AGGREGATION_VARIANT_SELECTION_METHOD) + { + modified_settings.group_by_two_level_threshold = 0; + modified_settings.group_by_two_level_threshold_bytes = 0; + } + + replica->connection->sendQuery(timeouts, query, query_id, stage, &modified_settings, &client_info, with_pending_data); + this->epoll.add(replica->fd); + addTimeoutToReplica(TimerTypes::RECEIVE_TIMEOUT, replica, this->epoll, timeouts); + }; + + if (replicas.first_replica->isReady()) + { + send_query(replicas.first_replica); + if (replicas.second_replica->isEmpty()) + addTimeoutToReplica(TimerTypes::RECEIVE_DATA_TIMEOUT, replicas.first_replica, epoll, timeouts); + } + + if (replicas.second_replica->isReady()) + send_query(replicas.second_replica); + else if (!replicas.second_replica->isCannotChoose()) + second_replica_pipeline.add(send_query); + + sent_query = true; +} + +void HedgedConnections::disconnect() +{ + std::lock_guard lock(cancel_mutex); + + if (replicas.first_replica->isReady()) + { + replicas.first_replica->connection->disconnect(); + replicas.first_replica->reset(); + } + + if (replicas.second_replica->isReady()) + { + replicas.second_replica->connection->disconnect(); + replicas.second_replica->reset(); + } + else if (replicas.second_replica->isNotReady()) + get_hedged_connections.stopChoosingSecondReplica(); +} + +std::string HedgedConnections::dumpAddresses() const +{ + std::lock_guard lock(cancel_mutex); + + std::string addresses = ""; + + if (replicas.first_replica->isReady()) + addresses += replicas.first_replica->connection->getDescription(); + + if (replicas.second_replica->isReady()) + addresses += "; " + replicas.second_replica->connection->getDescription(); + + return addresses; +} + +void HedgedConnections::sendCancel() +{ + std::lock_guard lock(cancel_mutex); + + if (!sent_query || cancelled) + throw Exception("Cannot cancel. Either no query sent or already cancelled.", ErrorCodes::LOGICAL_ERROR); + + if (replicas.first_replica->isReady()) + replicas.first_replica->connection->sendCancel(); + + if (replicas.second_replica->isReady()) + replicas.second_replica->connection->sendCancel(); + + cancelled = true; +} + + +Packet HedgedConnections::drain() +{ + std::lock_guard lock(cancel_mutex); + + if (!cancelled) + throw Exception("Cannot drain connections: cancel first.", ErrorCodes::LOGICAL_ERROR); + + Packet res; + res.type = Protocol::Server::EndOfStream; + + while (epoll.size() != 0) + { + Packet packet = receivePacketImpl(); + switch (packet.type) + { + case Protocol::Server::Data: + case Protocol::Server::Progress: + case Protocol::Server::ProfileInfo: + case Protocol::Server::Totals: + case Protocol::Server::Extremes: + case Protocol::Server::EndOfStream: + break; + + case Protocol::Server::Exception: + default: + /// If we receive an exception or an unknown packet, we save it. 
+ res = std::move(packet); + break; + } + } + + return res; +} + +Packet HedgedConnections::receivePacket() +{ + std::lock_guard lock(cancel_mutex); + return receivePacketUnlocked(); +} + +Packet HedgedConnections::receivePacketUnlocked(AsyncCallback async_callback) +{ + if (!sent_query) + throw Exception("Cannot receive packets: no query sent.", ErrorCodes::LOGICAL_ERROR); + if (!hasActiveConnections()) + throw Exception("No more packets are available.", ErrorCodes::LOGICAL_ERROR); + + if (epoll.size() == 0) + throw Exception("No pending events in epoll.", ErrorCodes::LOGICAL_ERROR); + + return receivePacketImpl(std::move(async_callback)); +} + +Packet HedgedConnections::receivePacketImpl(AsyncCallback async_callback) +{ + int event_fd; + ReplicaStatePtr replica; + Packet packet; + bool finish = false; + while (!finish) + { + event_fd = get_hedged_connections.getReadyFileDescriptor(epoll, async_callback); + + if (auto timeout_descriptor = get_hedged_connections.isEventTimeout(event_fd, replica)) + processTimeoutEvent(replica, timeout_descriptor); + else if ((replica = get_hedged_connections.isEventReplica(event_fd))) + { + packet = receivePacketFromReplica(replica, async_callback); + finish = true; + } + else if (event_fd == get_hedged_connections.getFileDescriptor()) + processGetHedgedConnectionsEvent(); + else + throw Exception("Unknown event from epoll", ErrorCodes::LOGICAL_ERROR); + } + + return packet; +}; + +Packet HedgedConnections::receivePacketFromReplica(ReplicaStatePtr replica, AsyncCallback async_callback) +{ + Packet packet = replica->connection->receivePacket(std::move(async_callback)); + switch (packet.type) + { + case Protocol::Server::Data: + removeTimeoutsFromReplica(replica, epoll); + processReceiveData(replica); + addTimeoutToReplica(TimerTypes::RECEIVE_TIMEOUT, replica, epoll, get_hedged_connections.getConnectionTimeouts()); + break; + case Protocol::Server::Progress: + case Protocol::Server::ProfileInfo: + case Protocol::Server::Totals: + case Protocol::Server::Extremes: + case Protocol::Server::Log: + removeTimeoutFromReplica(TimerTypes::RECEIVE_TIMEOUT, replica, epoll); + addTimeoutToReplica(TimerTypes::RECEIVE_TIMEOUT, replica, epoll, get_hedged_connections.getConnectionTimeouts()); + break; + + case Protocol::Server::EndOfStream: + finishProcessReplica(replica, false); + break; + + case Protocol::Server::Exception: + default: + finishProcessReplica(replica, true); + break; + } + + return packet; +} + +void HedgedConnections::processReceiveData(ReplicaStatePtr replica) +{ + /// When we receive first packet of data from any replica, we continue working with this replica + /// and stop working with another replica (if there is another replica). If current replica is + /// second, move it to the first place. 
+ if (replica == replicas.second_replica) + get_hedged_connections.swapReplicas(); + + if (replicas.second_replica->isCannotChoose() || replicas.second_replica->isEmpty()) + return; + + if (replicas.second_replica->isNotReady()) + { + get_hedged_connections.stopChoosingSecondReplica(); + epoll.remove(get_hedged_connections.getFileDescriptor()); + } + else if (replicas.second_replica->isReady()) + { + replicas.second_replica->connection->sendCancel(); + finishProcessReplica(replicas.second_replica, true); + } +} + +void HedgedConnections::processTimeoutEvent(ReplicaStatePtr & replica, TimerDescriptorPtr timeout_descriptor) +{ + epoll.remove(timeout_descriptor->getDescriptor()); + replica->active_timeouts.erase(timeout_descriptor->getDescriptor()); + + if (timeout_descriptor->getType() == TimerTypes::RECEIVE_TIMEOUT) + { + finishProcessReplica(replica, true); + + if (!replicas.first_replica->isReady() && !replicas.second_replica->isNotReady()) + throw NetException("Receive timeout expired", ErrorCodes::SOCKET_TIMEOUT); + } + else if (timeout_descriptor->getType() == TimerTypes::RECEIVE_DATA_TIMEOUT) + { + if (!replicas.second_replica->isEmpty()) + throw Exception("Cannot start choosing second replica, it's not empty", ErrorCodes::LOGICAL_ERROR); + + get_hedged_connections.chooseSecondReplica(); + + if (replicas.second_replica->isReady()) + processChosenSecondReplica(); + else if (replicas.second_replica->isNotReady()) + epoll.add(get_hedged_connections.getFileDescriptor()); + } +} + +void HedgedConnections::processGetHedgedConnectionsEvent() +{ + get_hedged_connections.chooseSecondReplica(); + if (replicas.second_replica->isReady()) + processChosenSecondReplica(); + + if (!replicas.second_replica->isNotReady()) + epoll.remove(get_hedged_connections.getFileDescriptor()); +} + +void HedgedConnections::processChosenSecondReplica() +{ + second_replica_pipeline.run(replicas.second_replica); + + /// In case when the first replica get receive timeout before the second is chosen, + /// we need to move the second replica to the first place + get_hedged_connections.swapReplicasIfNeeded(); +} + +void HedgedConnections::finishProcessReplica(ReplicaStatePtr replica, bool disconnect) +{ + removeTimeoutsFromReplica(replica, epoll); + epoll.remove(replica->fd); + if (disconnect) + replica->connection->disconnect(); + replica->reset(); + + /// Move active connection to the first replica if it exists + get_hedged_connections.swapReplicasIfNeeded(); +} + +} diff --git a/src/Client/HedgedConnections.h b/src/Client/HedgedConnections.h new file mode 100644 index 00000000000..b84ad89658f --- /dev/null +++ b/src/Client/HedgedConnections.h @@ -0,0 +1,93 @@ +#pragma once + +#include +#include +#include + +namespace DB +{ + +class HedgedConnections : public IConnections +{ +public: + using ReplicaStatePtr = GetHedgedConnections::ReplicaStatePtr; + using Replicas = GetHedgedConnections::Replicas; + + HedgedConnections(const ConnectionPoolWithFailoverPtr & pool_, + const Settings & settings_, + const ConnectionTimeouts & timeouts_, + const ThrottlerPtr & throttler, + std::shared_ptr table_to_check_ = nullptr); + + void sendScalarsData(Scalars & data) override; + + void sendExternalTablesData(std::vector & data) override; + + void sendQuery( + const ConnectionTimeouts & timeouts, + const String & query, + const String & query_id, + UInt64 stage, + const ClientInfo & client_info, + bool with_pending_data) override; + + Packet receivePacket() override; + + Packet receivePacketUnlocked(AsyncCallback async_callback = {}) 
override; + + void disconnect() override; + + void sendCancel() override; + + Packet drain() override; + + std::string dumpAddresses() const override; + + size_t size() const override; + + bool hasActiveConnections() const override; + +private: + class Pipeline + { + public: + void add(std::function send_function); + + void run(ReplicaStatePtr replica); + + bool empty() const { return pipeline.empty(); } + + private: + std::vector> pipeline; + }; + + void processChosenSecondReplica(); + + Packet receivePacketFromReplica(ReplicaStatePtr replica, AsyncCallback async_callback = {}); + + Packet receivePacketImpl(AsyncCallback async_callback = {}); + + void processReceiveData(ReplicaStatePtr replica); + + void processTimeoutEvent(ReplicaStatePtr & replica, TimerDescriptorPtr timeout_descriptor); + + void processGetHedgedConnectionsEvent(); + + void removeReceiveTimeout(ReplicaStatePtr replica); + + void finishProcessReplica(ReplicaStatePtr replica, bool disconnect); + + GetHedgedConnections get_hedged_connections; + Replicas replicas; + Epoll epoll; + const Settings & settings; + ThrottlerPtr throttler; + Poco::Logger * log; + Pipeline second_replica_pipeline; + bool sent_query = false; + bool cancelled = false; + + mutable std::mutex cancel_mutex; +}; + +} diff --git a/src/Client/IConnections.h b/src/Client/IConnections.h new file mode 100644 index 00000000000..85d1e29c243 --- /dev/null +++ b/src/Client/IConnections.h @@ -0,0 +1,57 @@ +#pragma once + +#include + +namespace DB +{ + +/// Base class for working with multiple replicas (connections) +/// from one shard within a single thread +class IConnections : boost::noncopyable +{ +public: + /// Send all scalars to replicas. + virtual void sendScalarsData(Scalars & data) = 0; + /// Send all content of external tables to replicas. + virtual void sendExternalTablesData(std::vector & data) = 0; + + /// Send request to replicas. + virtual void sendQuery( + const ConnectionTimeouts & timeouts, + const String & query, + const String & query_id, + UInt64 stage, + const ClientInfo & client_info, + bool with_pending_data) = 0; + + /// Get packet from any replica. + virtual Packet receivePacket() = 0; + + /// Version of `receivePacket` function without locking. + virtual Packet receivePacketUnlocked(AsyncCallback async_callback) = 0; + + /// Break all active connections. + virtual void disconnect() = 0; + + /// Send a request to replicas to cancel the request + virtual void sendCancel() = 0; + + /** On each replica, read and skip all packets to EndOfStream or Exception. + * Returns EndOfStream if no exception has been received. Otherwise + * returns the last received packet of type Exception. + */ + virtual Packet drain() = 0; + + /// Get the replica addresses as a string. + virtual std::string dumpAddresses() const = 0; + + /// Returns the number of replicas. + virtual size_t size() const = 0; + + /// Check if there are any valid replicas. 
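+
+    /// Typical lifecycle of an implementation, as driven by RemoteQueryExecutor (sketch;
+    /// `process` stands for arbitrary caller code and is not part of this interface):
+    ///
+    ///     connections->sendQuery(timeouts, query, query_id, stage, client_info, true);
+    ///     while (connections->hasActiveConnections())
+    ///     {
+    ///         Packet packet = connections->receivePacket();
+    ///         if (packet.type == Protocol::Server::Data)
+    ///             process(packet.block);
+    ///         /// Progress, Totals, Exception etc. are handled here as well; EndOfStream
+    ///         /// marks a replica as finished inside the implementation.
+    ///     }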
+ virtual bool hasActiveConnections() const = 0; + + virtual ~IConnections() = default; +}; + +} diff --git a/src/Client/MultiplexedConnections.cpp b/src/Client/MultiplexedConnections.cpp index ed7aad0a515..fbf8c9aa172 100644 --- a/src/Client/MultiplexedConnections.cpp +++ b/src/Client/MultiplexedConnections.cpp @@ -237,7 +237,7 @@ std::string MultiplexedConnections::dumpAddressesUnlocked() const return buf.str(); } -Packet MultiplexedConnections::receivePacketUnlocked(std::function async_callback) +Packet MultiplexedConnections::receivePacketUnlocked(AsyncCallback async_callback) { if (!sent_query) throw Exception("Cannot receive packets: no query sent.", ErrorCodes::LOGICAL_ERROR); diff --git a/src/Client/MultiplexedConnections.h b/src/Client/MultiplexedConnections.h index 2ab2b60570e..720add1ba81 100644 --- a/src/Client/MultiplexedConnections.h +++ b/src/Client/MultiplexedConnections.h @@ -5,6 +5,7 @@ #include #include #include +#include namespace DB { @@ -16,7 +17,7 @@ namespace DB * * The interface is almost the same as Connection. */ -class MultiplexedConnections final : private boost::noncopyable +class MultiplexedConnections final : public IConnections { public: /// Accepts ready connection. @@ -27,49 +28,35 @@ public: std::vector && connections, const Settings & settings_, const ThrottlerPtr & throttler_); - /// Send all scalars to replicas. - void sendScalarsData(Scalars & data); - /// Send all content of external tables to replicas. - void sendExternalTablesData(std::vector & data); + void sendScalarsData(Scalars & data) override; + void sendExternalTablesData(std::vector & data) override; - /// Send request to replicas. void sendQuery( const ConnectionTimeouts & timeouts, const String & query, const String & query_id, UInt64 stage, const ClientInfo & client_info, - bool with_pending_data); + bool with_pending_data) override; - /// Get packet from any replica. - Packet receivePacket(); + Packet receivePacket() override; - /// Break all active connections. - void disconnect(); + void disconnect() override; - /// Send a request to the replica to cancel the request - void sendCancel(); + void sendCancel() override; - /** On each replica, read and skip all packets to EndOfStream or Exception. - * Returns EndOfStream if no exception has been received. Otherwise - * returns the last received packet of type Exception. - */ - Packet drain(); + Packet drain() override; - /// Get the replica addresses as a string. - std::string dumpAddresses() const; + std::string dumpAddresses() const override; - /// Returns the number of replicas. /// Without locking, because sendCancel() does not change this number. - size_t size() const { return replica_states.size(); } + size_t size() const override { return replica_states.size(); } - /// Check if there are any valid replicas. /// Without locking, because sendCancel() does not change the state of the replicas. - bool hasActiveConnections() const { return active_connection_count > 0; } + bool hasActiveConnections() const override { return active_connection_count > 0; } private: - /// Internal version of `receivePacket` function without locking. - Packet receivePacketUnlocked(std::function async_callback = {}); + Packet receivePacketUnlocked(AsyncCallback async_callback = {}) override; /// Internal version of `dumpAddresses` function without locking. 
std::string dumpAddressesUnlocked() const; diff --git a/src/Client/ya.make b/src/Client/ya.make index 87a0cea102a..603e8290350 100644 --- a/src/Client/ya.make +++ b/src/Client/ya.make @@ -12,6 +12,8 @@ PEERDIR( SRCS( Connection.cpp ConnectionPoolWithFailover.cpp + GetHedgedConnections.cpp + HedgedConnections.cpp MultiplexedConnections.cpp TimeoutSetter.cpp diff --git a/src/Common/Epoll.cpp b/src/Common/Epoll.cpp new file mode 100644 index 00000000000..8ce100c7834 --- /dev/null +++ b/src/Common/Epoll.cpp @@ -0,0 +1,82 @@ +#include "Epoll.h" +#include +#include +#include + +namespace DB +{ + + +/// TODO: add appropriate error codes +namespace ErrorCodes +{ + extern const int EPOLL_ERROR; + extern const int LOGICAL_ERROR; +} + +Epoll::Epoll() : events_count(0) +{ + epoll_fd = epoll_create1(0); + if (epoll_fd == -1) + throwFromErrno("Cannot open epoll descriptor", DB::ErrorCodes::EPOLL_ERROR); +} + +void Epoll::add(int fd, void * ptr) +{ + epoll_event event; + event.events = EPOLLIN | EPOLLPRI; + if (ptr) + event.data.ptr = ptr; + else + event.data.fd = fd; + + if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, fd, &event) == -1) + throwFromErrno("Cannot add new descriptor to epoll", DB::ErrorCodes::EPOLL_ERROR); + + ++events_count; +} + +void Epoll::remove(int fd) +{ + if (epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, nullptr) == -1) + throwFromErrno("Cannot remove descriptor from epoll", DB::ErrorCodes::EPOLL_ERROR); + + --events_count; +} + +epoll_event Epoll::getReady(AsyncCallback async_callback) const +{ + std::vector events = getManyReady(1, true, std::move(async_callback)); + if (events.empty()) + throw Exception("Vector of ready events is empty", ErrorCodes::LOGICAL_ERROR); + + return events[0]; +} + +std::vector Epoll::getManyReady(int max_events, bool blocking, AsyncCallback async_callback) const +{ + std::vector events(max_events); + + int ready_size = 0; + int timeout = blocking && !async_callback ? -1 : 0; + while (ready_size <= 0 && (ready_size != 0 || blocking)) + { + ready_size = epoll_wait(epoll_fd, events.data(), max_events, timeout); + + if (ready_size == -1 && errno != EINTR) + throwFromErrno("Error in epoll_wait", DB::ErrorCodes::EPOLL_ERROR); + + if (ready_size == 0 && blocking && async_callback) + async_callback(epoll_fd, 0, "epoll"); + } + + events.resize(ready_size); + return events; +} + +Epoll::~Epoll() +{ + close(epoll_fd); +} + +} diff --git a/src/Common/Epoll.h b/src/Common/Epoll.h new file mode 100644 index 00000000000..0e04d666af0 --- /dev/null +++ b/src/Common/Epoll.h @@ -0,0 +1,44 @@ +#pragma once + +#include +#include +#include +#include + +namespace DB +{ + +using AsyncCallback = std::function; + +class Epoll : boost::noncopyable +{ +public: + Epoll(); + + /// Add new file descriptor to epoll. + void add(int fd, void * ptr = nullptr); + + /// Remove file descriptor to epoll. + void remove(int fd); + + /// Get events from epoll. If blocking is false and there are no ready events, + /// return empty vector, otherwise wait for ready events. If blocking is true, + /// async_callback is given and there is no ready events, async_callback is called + /// with epoll file descriptor. + std::vector getManyReady(int max_events, bool blocking, AsyncCallback async_callback = {}) const; + + /// Get only one ready event, this function is always blocking. 
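+
+    /// Usage sketch (`socket_fd` and `timer` are illustrative; when add() is called without
+    /// a ptr, the file descriptor itself is reported back in event.data.fd):
+    ///
+    ///     Epoll epoll;
+    ///     epoll.add(socket_fd);
+    ///     epoll.add(timer.getDescriptor());
+    ///     epoll_event event = epoll.getReady();          /// blocks until something is ready
+    ///     if (event.data.fd == timer.getDescriptor())
+    ///         onTimeout();                               /// illustrative caller code
+    ///     else
+    ///         onSocketReady(socket_fd);                  /// illustrative caller code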
+ epoll_event getReady(AsyncCallback async_callback = {}) const; + + int getFileDescriptor() const { return epoll_fd; } + + int size() const { return events_count; } + + ~Epoll(); + +private: + int epoll_fd; + int events_count; +}; + +} diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp index a2cd65137c0..fe0c0533adc 100644 --- a/src/Common/ErrorCodes.cpp +++ b/src/Common/ErrorCodes.cpp @@ -533,6 +533,7 @@ M(564, INTERSERVER_SCHEME_DOESNT_MATCH) \ M(565, TOO_MANY_PARTITIONS) \ M(566, CANNOT_RMDIR) \ + M(567, EPOLL_ERROR) \ \ M(999, KEEPER_EXCEPTION) \ M(1000, POCO_EXCEPTION) \ diff --git a/src/Common/PoolWithFailoverBase.h b/src/Common/PoolWithFailoverBase.h index 7779d18d969..6bb6f4a94dd 100644 --- a/src/Common/PoolWithFailoverBase.h +++ b/src/Common/PoolWithFailoverBase.h @@ -93,6 +93,18 @@ public: double staleness = 0.0; /// Helps choosing the "least stale" option when all replicas are stale. }; + struct PoolState; + + using PoolStates = std::vector; + + struct ShuffledPool + { + NestedPool * pool{}; + const PoolState * state{}; + size_t index = 0; + size_t error_count = 0; + }; + /// This functor must be provided by a client. It must perform a single try that takes a connection /// from the provided pool and checks that it is good. using TryGetEntryFunc = std::function; @@ -113,9 +125,6 @@ public: const GetPriorityFunc & get_priority = GetPriorityFunc()); protected: - struct PoolState; - - using PoolStates = std::vector; /// Returns a single connection. Entry get(size_t max_ignored_errors, bool fallback_to_stale_replicas, @@ -124,6 +133,10 @@ protected: /// This function returns a copy of pool states to avoid race conditions when modifying shared pool states. PoolStates updatePoolStates(size_t max_ignored_errors); + std::vector getShuffledPools(size_t max_ignored_errors, const GetPriorityFunc & get_priority); + + inline void updateSharedErrorCounts(std::vector & shuffled_pools); + auto getPoolExtendedStates() const { std::lock_guard lock(pool_states_mutex); @@ -143,6 +156,46 @@ protected: Poco::Logger * log; }; + +template +std::vector::ShuffledPool> +PoolWithFailoverBase::getShuffledPools( + size_t max_ignored_errors, const PoolWithFailoverBase::GetPriorityFunc & get_priority) +{ + /// Update random numbers and error counts. + PoolStates pool_states = updatePoolStates(max_ignored_errors); + if (get_priority) + { + for (size_t i = 0; i < pool_states.size(); ++i) + pool_states[i].priority = get_priority(i); + } + + /// Sort the pools into order in which they will be tried (based on respective PoolStates). 
+ std::vector shuffled_pools; + shuffled_pools.reserve(nested_pools.size()); + for (size_t i = 0; i < nested_pools.size(); ++i) + shuffled_pools.push_back(ShuffledPool{nested_pools[i].get(), &pool_states[i], i, 0}); + std::sort( + shuffled_pools.begin(), shuffled_pools.end(), + [](const ShuffledPool & lhs, const ShuffledPool & rhs) + { + return PoolState::compare(*lhs.state, *rhs.state); + }); + + return shuffled_pools; +} + +template +inline void PoolWithFailoverBase::updateSharedErrorCounts(std::vector & shuffled_pools) +{ + std::lock_guard lock(pool_states_mutex); + for (const ShuffledPool & pool: shuffled_pools) + { + auto & pool_state = shared_pool_states[pool.index]; + pool_state.error_count = std::min(max_error_cap, pool_state.error_count + pool.error_count); + } +} + template typename TNestedPool::Entry PoolWithFailoverBase::get(size_t max_ignored_errors, bool fallback_to_stale_replicas, @@ -168,33 +221,7 @@ PoolWithFailoverBase::getMany( const TryGetEntryFunc & try_get_entry, const GetPriorityFunc & get_priority) { - /// Update random numbers and error counts. - PoolStates pool_states = updatePoolStates(max_ignored_errors); - if (get_priority) - { - for (size_t i = 0; i < pool_states.size(); ++i) - pool_states[i].priority = get_priority(i); - } - - struct ShuffledPool - { - NestedPool * pool{}; - const PoolState * state{}; - size_t index = 0; - size_t error_count = 0; - }; - - /// Sort the pools into order in which they will be tried (based on respective PoolStates). - std::vector shuffled_pools; - shuffled_pools.reserve(nested_pools.size()); - for (size_t i = 0; i < nested_pools.size(); ++i) - shuffled_pools.push_back(ShuffledPool{nested_pools[i].get(), &pool_states[i], i, 0}); - std::sort( - shuffled_pools.begin(), shuffled_pools.end(), - [](const ShuffledPool & lhs, const ShuffledPool & rhs) - { - return PoolState::compare(*lhs.state, *rhs.state); - }); + std::vector shuffled_pools = getShuffledPools(max_ignored_errors, get_priority); /// We will try to get a connection from each pool until a connection is produced or max_tries is reached. std::vector try_results(shuffled_pools.size()); @@ -206,12 +233,7 @@ PoolWithFailoverBase::getMany( /// At exit update shared error counts with error counts occurred during this call. 
SCOPE_EXIT( { - std::lock_guard lock(pool_states_mutex); - for (const ShuffledPool & pool: shuffled_pools) - { - auto & pool_state = shared_pool_states[pool.index]; - pool_state.error_count = std::min(max_error_cap, pool_state.error_count + pool.error_count); - } + updateSharedErrorCounts(shuffled_pools); }); std::string fail_messages; diff --git a/src/Common/TimerDescriptor.cpp b/src/Common/TimerDescriptor.cpp index f4c3ec35588..e2b8a0ec928 100644 --- a/src/Common/TimerDescriptor.cpp +++ b/src/Common/TimerDescriptor.cpp @@ -74,7 +74,7 @@ void TimerDescriptor::setRelative(const Poco::Timespan & timespan) const spec.it_interval.tv_nsec = 0; spec.it_interval.tv_sec = 0; spec.it_value.tv_sec = timespan.totalSeconds(); - spec.it_value.tv_nsec = timespan.useconds(); + spec.it_value.tv_nsec = timespan.useconds() * 1000; if (-1 == timerfd_settime(timer_fd, 0 /*relative timer */, &spec, nullptr)) throwFromErrno("Cannot set time for timer_fd", ErrorCodes::CANNOT_SET_TIMER_PERIOD); diff --git a/src/Common/TimerDescriptor.h b/src/Common/TimerDescriptor.h index ddb8f2a1367..fa49189abfc 100644 --- a/src/Common/TimerDescriptor.h +++ b/src/Common/TimerDescriptor.h @@ -5,14 +5,24 @@ namespace DB { +enum TimerTypes +{ + DEFAULT, + RECEIVE_HELLO_TIMEOUT, + RECEIVE_TABLES_STATUS_TIMEOUT, + RECEIVE_DATA_TIMEOUT, + RECEIVE_TIMEOUT, +}; + /// Wrapper over timerfd. class TimerDescriptor { private: int timer_fd; + int type = TimerTypes::DEFAULT; public: - explicit TimerDescriptor(int clockid, int flags); + explicit TimerDescriptor(int clockid = CLOCK_MONOTONIC, int flags = 0); ~TimerDescriptor(); TimerDescriptor(const TimerDescriptor &) = delete; @@ -21,11 +31,15 @@ public: TimerDescriptor & operator=(TimerDescriptor &&) = default; int getDescriptor() const { return timer_fd; } + int getType() const { return type; } void reset() const; void drain() const; void setRelative(const Poco::Timespan & timespan) const; + void setType(int type_) { type = type_; } }; +using TimerDescriptorPtr = TimerDescriptor *; + } #endif diff --git a/src/Common/ya.make b/src/Common/ya.make index 5b5da618bbe..9097736c32c 100644 --- a/src/Common/ya.make +++ b/src/Common/ya.make @@ -40,6 +40,7 @@ SRCS( DirectorySyncGuard.cpp Dwarf.cpp Elf.cpp + Epoll.cpp ErrorCodes.cpp Exception.cpp ExternalLoaderStatus.cpp diff --git a/src/Core/Defines.h b/src/Core/Defines.h index f1cd1a8a708..d8e8b526600 100644 --- a/src/Core/Defines.h +++ b/src/Core/Defines.h @@ -11,6 +11,10 @@ #define DBMS_DEFAULT_CONNECT_TIMEOUT_WITH_FAILOVER_SECURE_MS 100 #define DBMS_DEFAULT_SEND_TIMEOUT_SEC 300 #define DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC 300 +/// Timeouts for hedged requests +#define DBMS_DEFAULT_RECEIVE_HELLO_TIMEOUT_SEC 1 +#define DBMS_DEFAULT_RECEIVE_TABLES_STATUS_TIMEOUT_SEC 1 +#define DBMS_DEFAULT_RECEIVE_DATA_TIMEOUT_SEC 1 /// Timeout for synchronous request-result protocol call (like Ping or TablesStatus). 
#define DBMS_DEFAULT_SYNC_REQUEST_TIMEOUT_SEC 5 #define DBMS_DEFAULT_POLL_INTERVAL 10 diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 1e4b07997ab..46da24aca80 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -55,6 +55,10 @@ class IColumn; M(Seconds, receive_timeout, DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC, "", 0) \ M(Seconds, send_timeout, DBMS_DEFAULT_SEND_TIMEOUT_SEC, "", 0) \ M(Seconds, tcp_keep_alive_timeout, 0, "The time in seconds the connection needs to remain idle before TCP starts sending keepalive probes", 0) \ + M(Seconds, receive_hello_timeout, DBMS_DEFAULT_RECEIVE_HELLO_TIMEOUT_SEC, "Connection timeout for receiving hello from replica", 0) \ + M(Seconds, receive_tables_status_timeout, DBMS_DEFAULT_RECEIVE_TABLES_STATUS_TIMEOUT_SEC, "Connection timeout for receiving tables status from replica", 0) \ + M(Seconds, receive_data_timeout, DBMS_DEFAULT_RECEIVE_DATA_TIMEOUT_SEC, "Connection timeout for receiving first packet of data from replica", 0) \ + M(Bool, use_hedged_requests, true, "Use hedged requests for distributed queries", 0) \ M(Milliseconds, queue_max_wait_ms, 0, "The wait time in the request queue, if the number of concurrent requests exceeds the maximum.", 0) \ M(Milliseconds, connection_pool_max_wait_ms, 0, "The wait time when the connection pool is full.", 0) \ M(Milliseconds, replace_running_query_max_wait_ms, 5000, "The wait time for running query with the same query_id to finish when setting 'replace_running_query' is active.", 0) \ @@ -215,6 +219,11 @@ class IColumn; M(Milliseconds, stream_flush_interval_ms, 7500, "Timeout for flushing data from streaming storages.", 0) \ M(Milliseconds, stream_poll_timeout_ms, 500, "Timeout for polling data from/to streaming storages.", 0) \ \ + /** Settings for testing hedged requests */ \ + M(Int64, sleep_before_send_hello, 0, "Time to sleep before sending hello in TCPHandler", 0) \ + M(Int64, sleep_before_send_tables_status, 0, "Time to sleep before sending tables status response in TCPHandler", 0) \ + M(Int64, sleep_before_send_data, 0, "Time to sleep before sending data in TCPHandler", 0) \ + \ M(Bool, insert_allow_materialized_columns, 0, "If setting is enabled, Allow materialized columns in INSERT.", 0) \ M(Seconds, http_connection_timeout, DEFAULT_HTTP_READ_BUFFER_CONNECTION_TIMEOUT, "HTTP connection timeout.", 0) \ M(Seconds, http_send_timeout, DEFAULT_HTTP_READ_BUFFER_TIMEOUT, "HTTP send timeout", 0) \ diff --git a/src/DataStreams/RemoteQueryExecutor.cpp b/src/DataStreams/RemoteQueryExecutor.cpp index 14e51ffefdf..49654b51199 100644 --- a/src/DataStreams/RemoteQueryExecutor.cpp +++ b/src/DataStreams/RemoteQueryExecutor.cpp @@ -13,6 +13,8 @@ #include #include #include +#include +#include namespace DB { @@ -29,23 +31,23 @@ RemoteQueryExecutor::RemoteQueryExecutor( : header(header_), query(query_), context(context_) , scalars(scalars_), external_tables(external_tables_), stage(stage_) { - create_multiplexed_connections = [this, &connection, throttler]() + create_connections = [this, &connection, throttler]() { return std::make_unique(connection, context.getSettingsRef(), throttler); }; } RemoteQueryExecutor::RemoteQueryExecutor( - std::vector && connections, + std::vector && connections_, const String & query_, const Block & header_, const Context & context_, const ThrottlerPtr & throttler, const Scalars & scalars_, const Tables & external_tables_, QueryProcessingStage::Enum stage_) : header(header_), query(query_), context(context_) , scalars(scalars_), external_tables(external_tables_), stage(stage_) 
{ - create_multiplexed_connections = [this, connections, throttler]() mutable + create_connections = [this, connections_, throttler]() mutable { return std::make_unique( - std::move(connections), context.getSettingsRef(), throttler); + std::move(connections_), context.getSettingsRef(), throttler); }; } @@ -56,23 +58,34 @@ RemoteQueryExecutor::RemoteQueryExecutor( : header(header_), query(query_), context(context_) , scalars(scalars_), external_tables(external_tables_), stage(stage_) { - create_multiplexed_connections = [this, pool, throttler]() + create_connections = [this, pool, throttler]()->std::unique_ptr { const Settings & current_settings = context.getSettingsRef(); auto timeouts = ConnectionTimeouts::getTCPTimeoutsWithFailover(current_settings); - std::vector connections; - if (main_table) + + if (current_settings.use_hedged_requests && current_settings.max_parallel_replicas <= 1) { - auto try_results = pool->getManyChecked(timeouts, ¤t_settings, pool_mode, main_table.getQualifiedName()); - connections.reserve(try_results.size()); - for (auto & try_result : try_results) - connections.emplace_back(std::move(try_result.entry)); + std::shared_ptr table_to_check = nullptr; + if (main_table) + table_to_check = std::make_shared(main_table.getQualifiedName()); + + return std::make_unique(pool, current_settings, timeouts, throttler, table_to_check); } else - connections = pool->getMany(timeouts, ¤t_settings, pool_mode); + { + std::vector connection_entries; + if (main_table) + { + auto try_results = pool->getManyChecked(timeouts, ¤t_settings, pool_mode, main_table.getQualifiedName()); + connection_entries.reserve(try_results.size()); + for (auto & try_result : try_results) + connection_entries.emplace_back(std::move(try_result.entry)); + } + else + connection_entries = pool->getMany(timeouts, ¤t_settings, pool_mode); - return std::make_unique( - std::move(connections), current_settings, throttler); + return std::make_unique(std::move(connection_entries), current_settings, throttler); + } }; } @@ -83,7 +96,7 @@ RemoteQueryExecutor::~RemoteQueryExecutor() * these connections did not remain hanging in the out-of-sync state. 
*/ if (established || isQueryPending()) - multiplexed_connections->disconnect(); + connections->disconnect(); } /** If we receive a block with slightly different column types, or with excessive columns, @@ -140,10 +153,10 @@ void RemoteQueryExecutor::sendQuery() if (sent_query) return; - multiplexed_connections = create_multiplexed_connections(); + connections = create_connections(); const auto & settings = context.getSettingsRef(); - if (settings.skip_unavailable_shards && 0 == multiplexed_connections->size()) + if (settings.skip_unavailable_shards && 0 == connections->size()) return; /// Query cannot be canceled in the middle of the send query, @@ -167,7 +180,7 @@ void RemoteQueryExecutor::sendQuery() modified_client_info.client_trace_context = CurrentThread::get().thread_trace_context; } - multiplexed_connections->sendQuery(timeouts, query, query_id, stage, modified_client_info, true); + connections->sendQuery(timeouts, query, query_id, stage, modified_client_info, true); established = false; sent_query = true; @@ -183,7 +196,7 @@ Block RemoteQueryExecutor::read() { sendQuery(); - if (context.getSettingsRef().skip_unavailable_shards && (0 == multiplexed_connections->size())) + if (context.getSettingsRef().skip_unavailable_shards && (0 == connections->size())) return {}; } @@ -192,7 +205,7 @@ Block RemoteQueryExecutor::read() if (was_cancelled) return Block(); - Packet packet = multiplexed_connections->receivePacket(); + Packet packet = connections->receivePacket(); if (auto block = processPacket(std::move(packet))) return *block; @@ -207,7 +220,7 @@ std::variant RemoteQueryExecutor::read(std::unique_ptr { sendQuery(); - if (context.getSettingsRef().skip_unavailable_shards && (0 == multiplexed_connections->size())) + if (context.getSettingsRef().skip_unavailable_shards && (0 == connections->size())) return Block(); } @@ -217,7 +230,7 @@ std::variant RemoteQueryExecutor::read(std::unique_ptr if (was_cancelled) return Block(); - read_context = std::make_unique(*multiplexed_connections); + read_context = std::make_unique(*connections); } do @@ -228,7 +241,7 @@ std::variant RemoteQueryExecutor::read(std::unique_ptr if (read_context->is_read_in_progress.load(std::memory_order_relaxed)) { read_context->setTimer(); - return read_context->epoll_fd; + return read_context->epoll.getFileDescriptor(); } else { @@ -258,7 +271,7 @@ std::optional RemoteQueryExecutor::processPacket(Packet packet) break; case Protocol::Server::EndOfStream: - if (!multiplexed_connections->hasActiveConnections()) + if (!connections->hasActiveConnections()) { finished = true; return Block(); @@ -300,7 +313,7 @@ std::optional RemoteQueryExecutor::processPacket(Packet packet) got_unknown_packet_from_replica = true; throw Exception(ErrorCodes::UNKNOWN_PACKET_FROM_SERVER, "Unknown packet {} from one of the following replicas: {}", toString(packet.type), - multiplexed_connections->dumpAddresses()); + connections->dumpAddresses()); } return {}; @@ -326,7 +339,7 @@ void RemoteQueryExecutor::finish(std::unique_ptr * read_context) tryCancel("Cancelling query because enough data has been read", read_context); /// Get the remaining packets so that there is no out of sync in the connections to the replicas. 
- Packet packet = multiplexed_connections->drain(); + Packet packet = connections->drain(); switch (packet.type) { case Protocol::Server::EndOfStream: @@ -348,7 +361,7 @@ void RemoteQueryExecutor::finish(std::unique_ptr * read_context) got_unknown_packet_from_replica = true; throw Exception(ErrorCodes::UNKNOWN_PACKET_FROM_SERVER, "Unknown packet {} from one of the following replicas: {}", toString(packet.type), - multiplexed_connections->dumpAddresses()); + connections->dumpAddresses()); } } @@ -371,14 +384,14 @@ void RemoteQueryExecutor::cancel(std::unique_ptr * read_context) void RemoteQueryExecutor::sendScalars() { - multiplexed_connections->sendScalarsData(scalars); + connections->sendScalarsData(scalars); } void RemoteQueryExecutor::sendExternalTables() { SelectQueryInfo query_info; - size_t count = multiplexed_connections->size(); + size_t count = connections->size(); { std::lock_guard lock(external_tables_mutex); @@ -415,7 +428,7 @@ void RemoteQueryExecutor::sendExternalTables() } } - multiplexed_connections->sendExternalTablesData(external_tables_data); + connections->sendExternalTablesData(external_tables_data); } void RemoteQueryExecutor::tryCancel(const char * reason, std::unique_ptr * read_context) @@ -432,11 +445,11 @@ void RemoteQueryExecutor::tryCancel(const char * reason, std::unique_ptrcancel(); - multiplexed_connections->sendCancel(); + connections->sendCancel(); } if (log) - LOG_TRACE(log, "({}) {}", multiplexed_connections->dumpAddresses(), reason); + LOG_TRACE(log, "({}) {}", connections->dumpAddresses(), reason); } bool RemoteQueryExecutor::isQueryPending() const diff --git a/src/DataStreams/RemoteQueryExecutor.h b/src/DataStreams/RemoteQueryExecutor.h index 46d9d067563..2a1f2687e59 100644 --- a/src/DataStreams/RemoteQueryExecutor.h +++ b/src/DataStreams/RemoteQueryExecutor.h @@ -1,7 +1,8 @@ #pragma once #include -#include +#include +#include #include #include #include @@ -40,7 +41,7 @@ public: /// Accepts several connections already taken from pool. 
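The executor no longer owns a MultiplexedConnections object directly; it keeps a create_connections callback and only invokes it inside sendQuery(), so the choice between hedged and multiplexed connections can look at the settings in effect when the query is actually sent (see the constructor changes above and the header change that follows). A stripped-down model of this deferred-factory pattern; the interface and class names here are placeholders, only the shape mirrors the real code:

#include <functional>
#include <iostream>
#include <memory>
#include <string>

struct IConnectionsModel
{
    virtual ~IConnectionsModel() = default;
    virtual void sendQuery(const std::string & query) = 0;
};

struct MultiplexedModel : IConnectionsModel
{
    void sendQuery(const std::string & query) override { std::cout << "multiplexed: " << query << '\n'; }
};

struct HedgedModel : IConnectionsModel
{
    void sendQuery(const std::string & query) override { std::cout << "hedged: " << query << '\n'; }
};

int main()
{
    bool use_hedged_requests = true; /// stands in for the setting read at sendQuery() time

    /// The factory is stored up front but evaluated lazily, like create_connections.
    std::function<std::unique_ptr<IConnectionsModel>()> create_connections = [&]() -> std::unique_ptr<IConnectionsModel>
    {
        if (use_hedged_requests)
            return std::make_unique<HedgedModel>();
        return std::make_unique<MultiplexedModel>();
    };

    auto connections = create_connections(); /// deferred until the query is actually sent
    connections->sendQuery("SELECT 1");
}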
RemoteQueryExecutor( - std::vector && connections, + std::vector && connections_, const String & query_, const Block & header_, const Context & context_, const ThrottlerPtr & throttler = nullptr, const Scalars & scalars_ = Scalars(), const Tables & external_tables_ = Tables(), QueryProcessingStage::Enum stage_ = QueryProcessingStage::Complete); @@ -100,8 +101,8 @@ private: Block totals; Block extremes; - std::function()> create_multiplexed_connections; - std::unique_ptr multiplexed_connections; + std::function()> create_connections; + std::unique_ptr connections; const String query; String query_id = ""; diff --git a/src/DataStreams/RemoteQueryExecutorReadContext.cpp b/src/DataStreams/RemoteQueryExecutorReadContext.cpp index bc47b049407..c854794cd27 100644 --- a/src/DataStreams/RemoteQueryExecutorReadContext.cpp +++ b/src/DataStreams/RemoteQueryExecutorReadContext.cpp @@ -3,7 +3,7 @@ #include #include #include -#include +#include #include namespace DB @@ -11,7 +11,7 @@ namespace DB struct RemoteQueryExecutorRoutine { - MultiplexedConnections & connections; + IConnections & connections; RemoteQueryExecutorReadContext & read_context; struct ReadCallback @@ -19,15 +19,15 @@ struct RemoteQueryExecutorRoutine RemoteQueryExecutorReadContext & read_context; Fiber & fiber; - void operator()(Poco::Net::Socket & socket) + void operator()(int fd, const Poco::Timespan & timeout = 0, const std::string fd_description = "") { try { - read_context.setSocket(socket); + read_context.setConnectionFD(fd, timeout, fd_description); } catch (DB::Exception & e) { - e.addMessage(" while reading from socket ({})", socket.peerAddress().toString()); + e.addMessage(" while reading from {}", fd_description); throw; } @@ -70,60 +70,38 @@ namespace ErrorCodes extern const int SOCKET_TIMEOUT; } -RemoteQueryExecutorReadContext::RemoteQueryExecutorReadContext(MultiplexedConnections & connections_) +RemoteQueryExecutorReadContext::RemoteQueryExecutorReadContext(IConnections & connections_) : connections(connections_) { - epoll_fd = epoll_create(2); - if (-1 == epoll_fd) - throwFromErrno("Cannot create epoll descriptor", ErrorCodes::CANNOT_OPEN_FILE); if (-1 == pipe2(pipe_fd, O_NONBLOCK)) throwFromErrno("Cannot create pipe", ErrorCodes::CANNOT_OPEN_FILE); { - epoll_event socket_event; - socket_event.events = EPOLLIN | EPOLLPRI; - socket_event.data.fd = pipe_fd[0]; - - if (-1 == epoll_ctl(epoll_fd, EPOLL_CTL_ADD, pipe_fd[0], &socket_event)) - throwFromErrno("Cannot add pipe descriptor to epoll", ErrorCodes::CANNOT_OPEN_FILE); + epoll.add(pipe_fd[0]); } { - epoll_event timer_event; - timer_event.events = EPOLLIN | EPOLLPRI; - timer_event.data.fd = timer.getDescriptor(); - - if (-1 == epoll_ctl(epoll_fd, EPOLL_CTL_ADD, timer_event.data.fd, &timer_event)) - throwFromErrno("Cannot add timer descriptor to epoll", ErrorCodes::CANNOT_OPEN_FILE); + epoll.add(timer.getDescriptor()); } auto routine = RemoteQueryExecutorRoutine{connections, *this}; fiber = boost::context::fiber(std::allocator_arg_t(), stack, std::move(routine)); } -void RemoteQueryExecutorReadContext::setSocket(Poco::Net::Socket & socket) +void RemoteQueryExecutorReadContext::setConnectionFD(int fd, const Poco::Timespan & timeout, const std::string & fd_description) { - int fd = socket.impl()->sockfd(); - if (fd == socket_fd) + if (fd == connection_fd) return; - epoll_event socket_event; - socket_event.events = EPOLLIN | EPOLLPRI; - socket_event.data.fd = fd; + if (connection_fd != -1) + epoll.remove(connection_fd); - if (socket_fd != -1) - { - if (-1 == 
epoll_ctl(epoll_fd, EPOLL_CTL_DEL, socket_fd, &socket_event)) - throwFromErrno("Cannot remove socket descriptor to epoll", ErrorCodes::CANNOT_OPEN_FILE); - } + connection_fd = fd; + epoll.add(connection_fd); - socket_fd = fd; - - if (-1 == epoll_ctl(epoll_fd, EPOLL_CTL_ADD, socket_fd, &socket_event)) - throwFromErrno("Cannot add socket descriptor to epoll", ErrorCodes::CANNOT_OPEN_FILE); - - receive_timeout = socket.impl()->getReceiveTimeout(); + receive_timeout = timeout; + connection_fd_description = fd_description; } bool RemoteQueryExecutorReadContext::checkTimeout() const @@ -142,21 +120,16 @@ bool RemoteQueryExecutorReadContext::checkTimeout() const bool RemoteQueryExecutorReadContext::checkTimeoutImpl() const { - epoll_event events[3]; - events[0].data.fd = events[1].data.fd = events[2].data.fd = -1; - - /// Wait for epoll_fd will not block if it was polled externally. - int num_events = epoll_wait(epoll_fd, events, 3, 0); - if (num_events == -1) - throwFromErrno("Failed to epoll_wait", ErrorCodes::CANNOT_READ_FROM_SOCKET); + /// Wait for epoll will not block if it was polled externally. + std::vector events = epoll.getManyReady(epoll.size(), /* blocking = */ false); bool is_socket_ready = false; bool is_pipe_alarmed = false; bool has_timer_alarm = false; - for (int i = 0; i < num_events; ++i) + for (size_t i = 0; i < events.size(); ++i) { - if (events[i].data.fd == socket_fd) + if (events[i].data.fd == connection_fd) is_socket_ready = true; if (events[i].data.fd == timer.getDescriptor()) has_timer_alarm = true; @@ -225,9 +198,7 @@ void RemoteQueryExecutorReadContext::cancel() RemoteQueryExecutorReadContext::~RemoteQueryExecutorReadContext() { - /// socket_fd is closed by Poco::Net::Socket - if (epoll_fd != -1) - close(epoll_fd); + /// connection_fd is closed by Poco::Net::Socket if (pipe_fd[0] != -1) close(pipe_fd[0]); if (pipe_fd[1] != -1) diff --git a/src/DataStreams/RemoteQueryExecutorReadContext.h b/src/DataStreams/RemoteQueryExecutorReadContext.h index 89dc2813a9a..cb6421f78d0 100644 --- a/src/DataStreams/RemoteQueryExecutorReadContext.h +++ b/src/DataStreams/RemoteQueryExecutorReadContext.h @@ -7,7 +7,9 @@ #include #include #include +#include #include +#include #include namespace Poco::Net @@ -33,26 +35,29 @@ public: std::mutex fiber_lock; Poco::Timespan receive_timeout; - MultiplexedConnections & connections; + IConnections & connections; Poco::Net::Socket * last_used_socket = nullptr; /// Here we have three descriptors we are going to wait: - /// * socket_fd is a descriptor of connection. It may be changed in case of reading from several replicas. + /// * connection_fd is a descriptor of connection. It may be changed in case of reading from several replicas. /// * timer is a timerfd descriptor to manually check socket timeout /// * pipe_fd is a pipe we use to cancel query and socket polling by executor. - /// We put those descriptors into our own epoll_fd which is used by external executor. + /// We put those descriptors into our own epoll which is used by external executor. 
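The raw epoll_create / epoll_ctl / epoll_wait calls are replaced here by a small Epoll wrapper (from Common/Epoll.h, not included in this hunk). The sketch below is a stand-in that matches the call sites used above (add, remove, getManyReady, getFileDescriptor, size); everything beyond those names is an assumption about the real class:

#include <sys/epoll.h>
#include <unistd.h>
#include <cstddef>
#include <stdexcept>
#include <vector>

/// Minimal RAII model of the epoll wrapper the new code relies on.
class EpollSketch
{
public:
    EpollSketch() : fd(epoll_create1(0))
    {
        if (fd == -1)
            throw std::runtime_error("epoll_create1 failed");
    }

    ~EpollSketch() { close(fd); }

    void add(int watched_fd, void * data = nullptr)
    {
        epoll_event event{};
        event.events = EPOLLIN | EPOLLPRI;
        if (data)
            event.data.ptr = data;
        else
            event.data.fd = watched_fd;
        if (epoll_ctl(fd, EPOLL_CTL_ADD, watched_fd, &event) == -1)
            throw std::runtime_error("epoll_ctl(ADD) failed");
        ++count;
    }

    void remove(int watched_fd)
    {
        /// A null event pointer is accepted by EPOLL_CTL_DEL on kernels >= 2.6.9.
        if (epoll_ctl(fd, EPOLL_CTL_DEL, watched_fd, nullptr) == -1)
            throw std::runtime_error("epoll_ctl(DEL) failed");
        --count;
    }

    /// Return up to max_events ready events; with blocking == false this is a pure poll.
    std::vector<epoll_event> getManyReady(std::size_t max_events, bool blocking) const
    {
        std::vector<epoll_event> events(max_events);
        int n = epoll_wait(fd, events.data(), static_cast<int>(max_events), blocking ? -1 : 0);
        if (n == -1)
            throw std::runtime_error("epoll_wait failed");
        events.resize(static_cast<std::size_t>(n));
        return events;
    }

    int getFileDescriptor() const { return fd; }
    std::size_t size() const { return count; }

private:
    int fd;
    std::size_t count = 0;
};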
TimerDescriptor timer{CLOCK_MONOTONIC, 0}; - int socket_fd = -1; - int epoll_fd = -1; + int connection_fd = -1; int pipe_fd[2] = { -1, -1 }; - explicit RemoteQueryExecutorReadContext(MultiplexedConnections & connections_); + Epoll epoll; + + std::string connection_fd_description; + + explicit RemoteQueryExecutorReadContext(IConnections & connections_); ~RemoteQueryExecutorReadContext(); bool checkTimeout() const; bool checkTimeoutImpl() const; - void setSocket(Poco::Net::Socket & socket); + void setConnectionFD(int fd, const Poco::Timespan & timeout = 0, const std::string & fd_description = ""); void setTimer() const; bool resumeRoutine(); diff --git a/src/IO/ConnectionTimeouts.h b/src/IO/ConnectionTimeouts.h index e5efabee6e2..01f31d6efa8 100644 --- a/src/IO/ConnectionTimeouts.h +++ b/src/IO/ConnectionTimeouts.h @@ -17,6 +17,11 @@ struct ConnectionTimeouts Poco::Timespan http_keep_alive_timeout; Poco::Timespan secure_connection_timeout; + /// Timeouts for HedgedConnections + Poco::Timespan receive_hello_timeout; + Poco::Timespan receive_tables_status_timeout; + Poco::Timespan receive_data_timeout; + ConnectionTimeouts() = default; ConnectionTimeouts(const Poco::Timespan & connection_timeout_, @@ -27,7 +32,10 @@ struct ConnectionTimeouts receive_timeout(receive_timeout_), tcp_keep_alive_timeout(0), http_keep_alive_timeout(0), - secure_connection_timeout(connection_timeout) + secure_connection_timeout(connection_timeout), + receive_hello_timeout(0), + receive_tables_status_timeout(0), + receive_data_timeout(0) { } @@ -40,7 +48,10 @@ struct ConnectionTimeouts receive_timeout(receive_timeout_), tcp_keep_alive_timeout(tcp_keep_alive_timeout_), http_keep_alive_timeout(0), - secure_connection_timeout(connection_timeout) + secure_connection_timeout(connection_timeout), + receive_hello_timeout(0), + receive_tables_status_timeout(0), + receive_data_timeout(0) { } ConnectionTimeouts(const Poco::Timespan & connection_timeout_, @@ -53,7 +64,10 @@ struct ConnectionTimeouts receive_timeout(receive_timeout_), tcp_keep_alive_timeout(tcp_keep_alive_timeout_), http_keep_alive_timeout(http_keep_alive_timeout_), - secure_connection_timeout(connection_timeout) + secure_connection_timeout(connection_timeout), + receive_hello_timeout(0), + receive_tables_status_timeout(0), + receive_data_timeout(0) { } @@ -62,13 +76,19 @@ struct ConnectionTimeouts const Poco::Timespan & receive_timeout_, const Poco::Timespan & tcp_keep_alive_timeout_, const Poco::Timespan & http_keep_alive_timeout_, - const Poco::Timespan & secure_connection_timeout_) - : connection_timeout(connection_timeout_), - send_timeout(send_timeout_), - receive_timeout(receive_timeout_), - tcp_keep_alive_timeout(tcp_keep_alive_timeout_), - http_keep_alive_timeout(http_keep_alive_timeout_), - secure_connection_timeout(secure_connection_timeout_) + const Poco::Timespan & secure_connection_timeout_, + const Poco::Timespan & receive_hello_timeout_, + const Poco::Timespan & receive_tables_status_timeout_, + const Poco::Timespan & receive_data_timeout_) + : connection_timeout(connection_timeout_), + send_timeout(send_timeout_), + receive_timeout(receive_timeout_), + tcp_keep_alive_timeout(tcp_keep_alive_timeout_), + http_keep_alive_timeout(http_keep_alive_timeout_), + secure_connection_timeout(secure_connection_timeout_), + receive_hello_timeout(receive_hello_timeout_), + receive_tables_status_timeout(receive_tables_status_timeout_), + receive_data_timeout(receive_data_timeout_) { } @@ -87,7 +107,10 @@ struct ConnectionTimeouts saturate(receive_timeout, 
limit), saturate(tcp_keep_alive_timeout, limit), saturate(http_keep_alive_timeout, limit), - saturate(secure_connection_timeout, limit)); + saturate(secure_connection_timeout, limit), + saturate(receive_hello_timeout, limit), + saturate(receive_tables_status_timeout, limit), + saturate(receive_data_timeout, limit)); } /// Timeouts for the case when we have just single attempt to connect. diff --git a/src/IO/ConnectionTimeoutsContext.h b/src/IO/ConnectionTimeoutsContext.h index ce19738f507..c6daae39e7a 100644 --- a/src/IO/ConnectionTimeoutsContext.h +++ b/src/IO/ConnectionTimeoutsContext.h @@ -16,7 +16,16 @@ inline ConnectionTimeouts ConnectionTimeouts::getTCPTimeoutsWithoutFailover(cons /// Timeouts for the case when we will try many addresses in a loop. inline ConnectionTimeouts ConnectionTimeouts::getTCPTimeoutsWithFailover(const Settings & settings) { - return ConnectionTimeouts(settings.connect_timeout_with_failover_ms, settings.send_timeout, settings.receive_timeout, settings.tcp_keep_alive_timeout, 0, settings.connect_timeout_with_failover_secure_ms); + return ConnectionTimeouts( + settings.connect_timeout_with_failover_ms, + settings.send_timeout, + settings.receive_timeout, + settings.tcp_keep_alive_timeout, + 0, + settings.connect_timeout_with_failover_secure_ms, + settings.receive_hello_timeout, + settings.receive_tables_status_timeout, + settings.receive_data_timeout); } inline ConnectionTimeouts ConnectionTimeouts::getHTTPTimeouts(const Context & context) diff --git a/src/IO/ReadBufferFromPocoSocket.cpp b/src/IO/ReadBufferFromPocoSocket.cpp index 2c13446e693..e08b9e7c8fb 100644 --- a/src/IO/ReadBufferFromPocoSocket.cpp +++ b/src/IO/ReadBufferFromPocoSocket.cpp @@ -42,7 +42,7 @@ bool ReadBufferFromPocoSocket::nextImpl() /// Note that receive timeout is not checked here. External code should check it while polling. while (bytes_read < 0 && async_callback && errno == EAGAIN) { - async_callback(socket); + async_callback(socket.impl()->sockfd(), socket.getReceiveTimeout(), "socket (" + socket.peerAddress().toString() + ")"); bytes_read = socket.impl()->receiveBytes(internal_buffer.begin(), internal_buffer.size(), flags); } } diff --git a/src/IO/ReadBufferFromPocoSocket.h b/src/IO/ReadBufferFromPocoSocket.h index 8064cd39246..7fd1b646846 100644 --- a/src/IO/ReadBufferFromPocoSocket.h +++ b/src/IO/ReadBufferFromPocoSocket.h @@ -8,6 +8,8 @@ namespace DB { +using AsyncCallback = std::function; + /** Works with the ready Poco::Net::Socket. Blocking operations. 
*/ class ReadBufferFromPocoSocket : public BufferWithOwnMemory @@ -28,10 +30,10 @@ public: bool poll(size_t timeout_microseconds); - void setAsyncCallback(std::function async_callback_) { async_callback = std::move(async_callback_); } + void setAsyncCallback(AsyncCallback async_callback_) { async_callback = std::move(async_callback_); } private: - std::function async_callback; + AsyncCallback async_callback; }; } diff --git a/src/Processors/Executors/PollingQueue.cpp b/src/Processors/Executors/PollingQueue.cpp index 93edfe53987..44941ae788a 100644 --- a/src/Processors/Executors/PollingQueue.cpp +++ b/src/Processors/Executors/PollingQueue.cpp @@ -23,24 +23,14 @@ namespace ErrorCodes PollingQueue::PollingQueue() { - epoll_fd = epoll_create(1); - if (-1 == epoll_fd) - throwFromErrno("Cannot create epoll descriptor", ErrorCodes::CANNOT_OPEN_FILE); - if (-1 == pipe2(pipe_fd, O_NONBLOCK)) throwFromErrno("Cannot create pipe", ErrorCodes::CANNOT_OPEN_FILE); - epoll_event socket_event; - socket_event.events = EPOLLIN | EPOLLPRI; - socket_event.data.ptr = pipe_fd; - - if (-1 == epoll_ctl(epoll_fd, EPOLL_CTL_ADD, pipe_fd[0], &socket_event)) - throwFromErrno("Cannot add pipe descriptor to epoll", ErrorCodes::CANNOT_OPEN_FILE); + epoll.add(pipe_fd[0], pipe_fd); } PollingQueue::~PollingQueue() { - close(epoll_fd); close(pipe_fd[0]); close(pipe_fd[1]); } @@ -52,13 +42,7 @@ void PollingQueue::addTask(size_t thread_number, void * data, int fd) throw Exception(ErrorCodes::LOGICAL_ERROR, "Task {} was already added to task queue", key); tasks[key] = TaskData{thread_number, data, fd}; - - epoll_event socket_event; - socket_event.events = EPOLLIN | EPOLLPRI; - socket_event.data.ptr = data; - - if (-1 == epoll_ctl(epoll_fd, EPOLL_CTL_ADD, fd, &socket_event)) - throwFromErrno("Cannot add socket descriptor to epoll", ErrorCodes::CANNOT_OPEN_FILE); + epoll.add(fd, data); } static std::string dumpTasks(const std::unordered_map & tasks) @@ -84,16 +68,7 @@ PollingQueue::TaskData PollingQueue::wait(std::unique_lock & lock) lock.unlock(); - epoll_event event; - event.data.ptr = nullptr; - int num_events = 0; - - while (num_events == 0) - { - num_events = epoll_wait(epoll_fd, &event, 1, 0); - if (num_events == -1) - throwFromErrno("Failed to epoll_wait", ErrorCodes::CANNOT_READ_FROM_SOCKET); - } + epoll_event event = epoll.getReady(); lock.lock(); @@ -111,9 +86,7 @@ PollingQueue::TaskData PollingQueue::wait(std::unique_lock & lock) auto res = it->second; tasks.erase(it); - - if (-1 == epoll_ctl(epoll_fd, EPOLL_CTL_DEL, res.fd, &event)) - throwFromErrno("Cannot remove socket descriptor to epoll", ErrorCodes::CANNOT_OPEN_FILE); + epoll.remove(res.fd); return res; } diff --git a/src/Processors/Executors/PollingQueue.h b/src/Processors/Executors/PollingQueue.h index 9d37bf0a2cc..0d306ddf2f7 100644 --- a/src/Processors/Executors/PollingQueue.h +++ b/src/Processors/Executors/PollingQueue.h @@ -4,6 +4,7 @@ #include #include #include +#include namespace DB { @@ -25,7 +26,7 @@ public: }; private: - int epoll_fd; + Epoll epoll; int pipe_fd[2]; std::atomic_bool is_finished = false; std::unordered_map tasks; diff --git a/src/Server/TCPHandler.cpp b/src/Server/TCPHandler.cpp index 12d1a0249b7..b6298f46dc7 100644 --- a/src/Server/TCPHandler.cpp +++ b/src/Server/TCPHandler.cpp @@ -681,6 +681,15 @@ void TCPHandler::processTablesStatusRequest() response.table_states_by_id.emplace(table_name, std::move(status)); } + /// For testing hedged requests + const Settings & settings = query_context->getSettingsRef(); + if 
(settings.sleep_before_send_tables_status) + { + std::chrono::seconds sec(settings.sleep_before_send_tables_status); + std::this_thread::sleep_for(sec); + } + + writeVarUInt(Protocol::Server::TablesStatusResponse, *out); response.write(*out, client_tcp_protocol_version); } @@ -881,6 +890,14 @@ void TCPHandler::receiveUnexpectedHello() void TCPHandler::sendHello() { + /// For testing hedged requests + const Settings & settings = query_context->getSettingsRef(); + if (settings.sleep_before_send_hello) + { + std::chrono::seconds sec(settings.sleep_before_send_hello); + std::this_thread::sleep_for(sec); + } + writeVarUInt(Protocol::Server::Hello, *out); writeStringBinary(DBMS_NAME, *out); writeVarUInt(DBMS_VERSION_MAJOR, *out); @@ -1313,6 +1330,14 @@ bool TCPHandler::isQueryCancelled() void TCPHandler::sendData(const Block & block) { + /// For testing hedged requests + const Settings & settings = query_context->getSettingsRef(); + if (settings.sleep_before_send_data) + { + std::chrono::seconds sec(settings.sleep_before_send_data); + std::this_thread::sleep_for(sec); + } + initBlockOutput(block); writeVarUInt(Protocol::Server::Data, *out); diff --git a/tests/integration/test_hedged_requests/configs/remote_servers.xml b/tests/integration/test_hedged_requests/configs/remote_servers.xml new file mode 100644 index 00000000000..60d2f5891d7 --- /dev/null +++ b/tests/integration/test_hedged_requests/configs/remote_servers.xml @@ -0,0 +1,18 @@ + + + + + true + + node_1 + 9000 + + + node_2 + 9000 + + + + + + diff --git a/tests/integration/test_hedged_requests/configs/users.xml b/tests/integration/test_hedged_requests/configs/users.xml new file mode 100644 index 00000000000..0cf32bf9e1a --- /dev/null +++ b/tests/integration/test_hedged_requests/configs/users.xml @@ -0,0 +1,10 @@ + + + + + in_order + 1 + 1 + + + diff --git a/tests/integration/test_hedged_requests/configs/users1.xml b/tests/integration/test_hedged_requests/configs/users1.xml new file mode 100644 index 00000000000..2a54396feca --- /dev/null +++ b/tests/integration/test_hedged_requests/configs/users1.xml @@ -0,0 +1,7 @@ + + + + + + + diff --git a/tests/integration/test_hedged_requests/test.py b/tests/integration/test_hedged_requests/test.py new file mode 100644 index 00000000000..719477d9c7f --- /dev/null +++ b/tests/integration/test_hedged_requests/test.py @@ -0,0 +1,76 @@ +import os +import sys +import time + +import pytest + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from helpers.cluster import ClickHouseCluster +from helpers.network import PartitionManager + +cluster = ClickHouseCluster(__file__) + +# Cluster with 1 shard of 2 replicas. node is the instance with Distributed table. 
+node = cluster.add_instance( + 'node', with_zookeeper=True, main_configs=['configs/remote_servers.xml'], user_configs=['configs/users.xml']) +node_1 = cluster.add_instance('node_1', with_zookeeper=True, stay_alive=True, user_configs=['configs/users1.xml']) +node_2 = cluster.add_instance('node_2', with_zookeeper=True) + +sleep_timeout = 5 +receive_timeout = 1 + +config = ''' + + + <{setting}>{sleep} + + +''' + + +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster.start() + + node_1.query('''CREATE TABLE replicated (id UInt32, date Date) ENGINE = + ReplicatedMergeTree('/clickhouse/tables/replicated', 'node_1') ORDER BY id PARTITION BY toYYYYMM(date)''') + + node_2.query('''CREATE TABLE replicated (id UInt32, date Date) ENGINE = + ReplicatedMergeTree('/clickhouse/tables/replicated', 'node_2') ORDER BY id PARTITION BY toYYYYMM(date)''') + + node.query('''CREATE TABLE distributed (id UInt32, date Date) ENGINE = + Distributed('test_cluster', 'default', 'replicated')''') + + yield cluster + + finally: + cluster.shutdown() + +def process_test(sleep_setting_name, receive_timeout_name): + node_1.replace_config('/etc/clickhouse-server/users.d/users1.xml', config.format(setting=sleep_setting_name, sleep=sleep_timeout)) + + # Restart node to make new config relevant + node_1.restart_clickhouse(sleep_timeout + 1) + + # Without hedged requests select query will last more than sleep_timeout seconds, + # with hedged requests it will last just over receive_timeout seconds + + node.query("SET {setting}={value}".format(setting=receive_timeout_name, value=receive_timeout)) + + start = time.time() + node.query("SELECT * FROM distributed"); + query_time = time.time() - start + + # Check that query time is not long + assert query_time < sleep_timeout + + +def test_change_replica_on_receive_hello(started_cluster): + node.query("INSERT INTO distributed VALUES (1, '2020-01-01')") + + process_test("sleep_before_send_hello", "receive_hello_timeout") + process_test("sleep_before_send_tables_status", "receive_tables_status_timeout") + process_test("sleep_before_send_data", "receive_data_timeout") + From 507695cbcbeaf08af3c3d240b62a2a73000001c6 Mon Sep 17 00:00:00 2001 From: Pavel Kruglov Date: Wed, 20 Jan 2021 02:15:13 +0300 Subject: [PATCH 0074/2357] Fix build --- src/Client/HedgedConnections.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Client/HedgedConnections.cpp b/src/Client/HedgedConnections.cpp index 57315bcd6fe..4963c74c327 100644 --- a/src/Client/HedgedConnections.cpp +++ b/src/Client/HedgedConnections.cpp @@ -29,7 +29,7 @@ HedgedConnections::HedgedConnections( if (replicas.second_replica->isNotReady()) epoll.add(get_hedged_connections.getFileDescriptor()); - auto set_throttler = [this, throttler_](ReplicaStatePtr replica) + auto set_throttler = [throttler_](ReplicaStatePtr replica) { replica->connection->setThrottler(throttler_); }; From 2aa29e1bc7f9fe1a9ccfa96ed96db87ac8a6cd95 Mon Sep 17 00:00:00 2001 From: Pavel Kruglov Date: Wed, 20 Jan 2021 13:29:31 +0300 Subject: [PATCH 0075/2357] Fix build 2 --- src/Client/ConnectionPoolWithFailover.cpp | 4 ++-- src/Client/GetHedgedConnections.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Client/ConnectionPoolWithFailover.cpp b/src/Client/ConnectionPoolWithFailover.cpp index df4541ecf7e..00ec1e30f10 100644 --- a/src/Client/ConnectionPoolWithFailover.cpp +++ b/src/Client/ConnectionPoolWithFailover.cpp @@ -472,11 +472,11 @@ void TryGetConnection::run() stage = Stage::FINISHED; } - catch 
(Poco::Net::NetException & e) + catch (Poco::Net::NetException &) { processFail(true); } - catch (Poco::TimeoutException & e) + catch (Poco::TimeoutException &) { processFail(true); } diff --git a/src/Client/GetHedgedConnections.cpp b/src/Client/GetHedgedConnections.cpp index 3b30650e6e5..839d6bf37c2 100644 --- a/src/Client/GetHedgedConnections.cpp +++ b/src/Client/GetHedgedConnections.cpp @@ -319,7 +319,7 @@ GetHedgedConnections::ReplicaStatePtr GetHedgedConnections::processEpollEvents(b { LOG_DEBUG(log, "process epoll events"); int event_fd; - ReplicaStatePtr replica; + ReplicaStatePtr replica = nullptr; bool finish = false; while (!finish) { From 573edbcd11411017157fbfa6a6e89757212d248a Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Fri, 22 Jan 2021 05:34:08 +0300 Subject: [PATCH 0076/2357] improve performance of aggregation in order of sorting key --- src/Core/SortCursor.h | 16 +++ .../FinishAggregatingInOrderAlgorithm.cpp | 134 ++++++++++++++++++ .../FinishAggregatingInOrderAlgorithm.h | 53 +++++++ src/Processors/Merges/Algorithms/MergedData.h | 3 +- .../FinishAggregatingInOrderTransform.h | 34 +++++ src/Processors/QueryPlan/AggregatingStep.cpp | 4 +- .../AggregatingInOrderTransform.cpp | 18 +-- .../Transforms/AggregatingInOrderTransform.h | 1 - .../Transforms/FinishSortingTransform.cpp | 14 -- .../01291_aggregation_in_order.sql | 16 +-- 10 files changed, 253 insertions(+), 40 deletions(-) create mode 100644 src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.cpp create mode 100644 src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.h create mode 100644 src/Processors/Merges/FinishAggregatingInOrderTransform.h diff --git a/src/Core/SortCursor.h b/src/Core/SortCursor.h index f383c3ded8e..79730e9697e 100644 --- a/src/Core/SortCursor.h +++ b/src/Core/SortCursor.h @@ -366,4 +366,20 @@ private: } }; +template +bool less(const TLeftColumns & lhs, const TRightColumns & rhs, size_t i, size_t j, const SortDescription & descr) +{ + for (const auto & elem : descr) + { + size_t ind = elem.column_number; + int res = elem.direction * lhs[ind]->compareAt(i, j, *rhs[ind], elem.nulls_direction); + if (res < 0) + return true; + else if (res > 0) + return false; + } + + return false; +} + } diff --git a/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.cpp b/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.cpp new file mode 100644 index 00000000000..e20f8416851 --- /dev/null +++ b/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.cpp @@ -0,0 +1,134 @@ +#include +#include +#include + +#include + +namespace DB +{ + +FinishAggregatingInOrderAlgorithm::State::State( + const Chunk & chunk, const SortDescription & desc) + : num_rows(chunk.getNumRows()) + , all_columns(chunk.getColumns()) +{ + sorting_columns.reserve(desc.size()); + for (const auto & column_desc : desc) + sorting_columns.emplace_back(all_columns[column_desc.column_number].get()); +} + +FinishAggregatingInOrderAlgorithm::FinishAggregatingInOrderAlgorithm( + const Block & header_, + size_t num_inputs_, + AggregatingTransformParamsPtr params_, + SortDescription description_, + size_t max_block_size_) + : merged_data(header_.cloneEmptyColumns(), false, max_block_size_) + , header(header_) + , num_inputs(num_inputs_) + , params(params_) + , description(description_) +{ + /// Replace column names in description to positions. 
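The idea behind the new in-order finishing step, implemented in merge() just below: among the current chunks pick the one whose last sorting key is the smallest, then from every other chunk take only the prefix of rows whose keys do not exceed that bound (found with upper_bound); those prefixes can be aggregated immediately because no input can still produce rows with smaller keys. A simplified, self-contained model over plain sorted vectors; all names here are illustrative, not ClickHouse types:

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <optional>
#include <vector>

/// For each sorted input, compute how many leading rows can be aggregated now:
/// everything up to (and including) the smallest "last row" among all inputs.
static std::vector<std::size_t> splitPoints(const std::vector<std::vector<int>> & inputs)
{
    std::optional<std::size_t> best;
    for (std::size_t i = 0; i < inputs.size(); ++i)
        if (!inputs[i].empty() && (!best || inputs[i].back() < inputs[*best].back()))
            best = i;

    std::vector<std::size_t> to_row(inputs.size(), 0);
    if (!best)
        return to_row;

    int bound = inputs[*best].back();
    for (std::size_t i = 0; i < inputs.size(); ++i)
        to_row[i] = static_cast<std::size_t>(std::upper_bound(inputs[i].begin(), inputs[i].end(), bound) - inputs[i].begin());
    return to_row;
}

int main()
{
    std::vector<std::vector<int>> inputs = {{1, 2, 5, 9}, {1, 3, 4}, {2, 6, 7}};
    auto to_row = splitPoints(inputs);
    for (std::size_t i = 0; i < to_row.size(); ++i)
        std::cout << "input " << i << ": aggregate first " << to_row[i] << " rows\n";
    /// Prints 2, 3 and 1: all rows with keys <= 4 can be merged without waiting for more data.
}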
+ for (auto & column_description : description) + { + if (!column_description.column_name.empty()) + { + column_description.column_number = header_.getPositionByName(column_description.column_name); + column_description.column_name.clear(); + } + } +} + +void FinishAggregatingInOrderAlgorithm::initialize(Inputs inputs) +{ + current_inputs = std::move(inputs); + states.reserve(num_inputs); + for (size_t i = 0; i < num_inputs; ++i) + states.emplace_back(current_inputs[i].chunk, description); +} + +void FinishAggregatingInOrderAlgorithm::consume(Input & input, size_t source_num) +{ + states[source_num] = State{input.chunk, description}; +} + +IMergingAlgorithm::Status FinishAggregatingInOrderAlgorithm::merge() +{ + std::optional best_input; + for (size_t i = 0; i < num_inputs; ++i) + { + if (!states[i].isValid()) + continue; + + if (!best_input + || less(states[i].sorting_columns, states[*best_input].sorting_columns, + states[i].num_rows - 1, states[*best_input].num_rows - 1, description)) + { + best_input = i; + } + } + + if (!best_input) + return Status{merged_data.pull(), true}; + + auto & best_state = states[*best_input]; + best_state.to_row = states[*best_input].num_rows; + + for (size_t i = 0; i < num_inputs; ++i) + { + if (!states[i].isValid() || i == *best_input) + continue; + + auto indices = ext::range(states[i].current_row, states[i].num_rows); + auto it = std::upper_bound(indices.begin(), indices.end(), best_state.num_rows - 1, + [&](size_t lhs_pos, size_t rhs_pos) + { + return less(best_state.sorting_columns, states[i].sorting_columns, lhs_pos, rhs_pos, description); + }); + + states[i].to_row = (it == indices.end() ? states[i].num_rows : *it); + } + + auto aggregated = aggregate(); + for (size_t i = 0; i < aggregated.rows(); ++i) + merged_data.insertRow(aggregated.getColumns(), i, aggregated.rows()); + + Status status(*best_input); + if (merged_data.hasEnoughRows()) + status.chunk = merged_data.pull(); + + return status; +} + +Block FinishAggregatingInOrderAlgorithm::aggregate() +{ + BlocksList blocks; + + for (size_t i = 0; i < num_inputs; ++i) + { + const auto & state = states[i]; + if (!state.isValid()) + continue; + + if (state.current_row == 0 && state.to_row == state.num_rows) + { + blocks.emplace_back(header.cloneWithColumns(states[i].all_columns)); + } + else + { + Columns new_columns; + new_columns.reserve(state.all_columns.size()); + for (const auto & column : state.all_columns) + new_columns.emplace_back(column->cut(state.current_row, state.to_row - state.current_row)); + + blocks.emplace_back(header.cloneWithColumns(new_columns)); + } + + states[i].current_row = states[i].to_row; + } + + return params->aggregator.mergeBlocks(blocks, false); +} + +} diff --git a/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.h b/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.h new file mode 100644 index 00000000000..57a5671bf82 --- /dev/null +++ b/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.h @@ -0,0 +1,53 @@ +#pragma once + +#include +#include +#include +#include + +namespace DB +{ + +struct AggregatingTransformParams; +using AggregatingTransformParamsPtr = std::shared_ptr; + +class FinishAggregatingInOrderAlgorithm final : public IMergingAlgorithm +{ +public: + FinishAggregatingInOrderAlgorithm( + const Block & header_, + size_t num_inputs_, + AggregatingTransformParamsPtr params_, + SortDescription description_, + size_t max_block_size_); + + void initialize(Inputs inputs) override; + void consume(Input & input, size_t 
source_num) override; + Status merge() override; + + struct State + { + size_t num_rows; + Columns all_columns; + ColumnRawPtrs sorting_columns; + + size_t current_row = 0; + size_t to_row = 0; + + State(const Chunk & chunk, const SortDescription & description); + bool isValid() const { return current_row < num_rows; } + }; + +private: + Block aggregate(); + + MergedData merged_data; + Block header; + size_t num_inputs; + AggregatingTransformParamsPtr params; + SortDescription description; + Inputs current_inputs; + std::vector states; +}; + +} diff --git a/src/Processors/Merges/Algorithms/MergedData.h b/src/Processors/Merges/Algorithms/MergedData.h index 9bf33d72f31..fa703e185a3 100644 --- a/src/Processors/Merges/Algorithms/MergedData.h +++ b/src/Processors/Merges/Algorithms/MergedData.h @@ -21,7 +21,8 @@ public: /// Pull will be called at next prepare call. void flush() { need_flush = true; } - void insertRow(const ColumnRawPtrs & raw_columns, size_t row, size_t block_size) + template + void insertRow(const TColumns & raw_columns, size_t row, size_t block_size) { size_t num_columns = raw_columns.size(); for (size_t i = 0; i < num_columns; ++i) diff --git a/src/Processors/Merges/FinishAggregatingInOrderTransform.h b/src/Processors/Merges/FinishAggregatingInOrderTransform.h new file mode 100644 index 00000000000..27e37355910 --- /dev/null +++ b/src/Processors/Merges/FinishAggregatingInOrderTransform.h @@ -0,0 +1,34 @@ +#pragma once + +#include +#include + +namespace DB +{ + +class ColumnAggregateFunction; + +/// Implementation of IMergingTransform via FinishAggregatingInOrderAlgorithm. +class FinishAggregatingInOrderTransform final : public IMergingTransform +{ +public: + FinishAggregatingInOrderTransform( + const Block & header, + size_t num_inputs, + AggregatingTransformParamsPtr params, + SortDescription description, + size_t max_block_size) + : IMergingTransform( + num_inputs, header, header, true, + header, + num_inputs, + params, + std::move(description), + max_block_size) + { + } + + String getName() const override { return "AggregatingSortedTransform"; } +}; + +} diff --git a/src/Processors/QueryPlan/AggregatingStep.cpp b/src/Processors/QueryPlan/AggregatingStep.cpp index e8d4a262366..0474a15961e 100644 --- a/src/Processors/QueryPlan/AggregatingStep.cpp +++ b/src/Processors/QueryPlan/AggregatingStep.cpp @@ -3,6 +3,7 @@ #include #include #include +#include namespace DB { @@ -95,9 +96,10 @@ void AggregatingStep::transformPipeline(QueryPipeline & pipeline) } } - auto transform = std::make_shared( + auto transform = std::make_shared( pipeline.getHeader(), pipeline.getNumStreams(), + transform_params, group_by_sort_description, max_block_size); diff --git a/src/Processors/Transforms/AggregatingInOrderTransform.cpp b/src/Processors/Transforms/AggregatingInOrderTransform.cpp index 0db95bc3b20..a3932a7ab1b 100644 --- a/src/Processors/Transforms/AggregatingInOrderTransform.cpp +++ b/src/Processors/Transforms/AggregatingInOrderTransform.cpp @@ -1,5 +1,8 @@ #include #include +#include + +#include namespace DB { @@ -46,21 +49,6 @@ AggregatingInOrderTransform::AggregatingInOrderTransform( AggregatingInOrderTransform::~AggregatingInOrderTransform() = default; -static bool less(const MutableColumns & lhs, const Columns & rhs, size_t i, size_t j, const SortDescription & descr) -{ - for (const auto & elem : descr) - { - size_t ind = elem.column_number; - int res = elem.direction * lhs[ind]->compareAt(i, j, *rhs[ind], elem.nulls_direction); - if (res < 0) - return true; - else if (res > 0) - return 
false; - } - return false; -} - - void AggregatingInOrderTransform::consume(Chunk chunk) { size_t rows = chunk.getNumRows(); diff --git a/src/Processors/Transforms/AggregatingInOrderTransform.h b/src/Processors/Transforms/AggregatingInOrderTransform.h index 10793e885ce..7b659fc53e2 100644 --- a/src/Processors/Transforms/AggregatingInOrderTransform.h +++ b/src/Processors/Transforms/AggregatingInOrderTransform.h @@ -88,5 +88,4 @@ private: AggregatingTransformParamsPtr params; }; - } diff --git a/src/Processors/Transforms/FinishSortingTransform.cpp b/src/Processors/Transforms/FinishSortingTransform.cpp index 29d0170d907..763ed9ecc49 100644 --- a/src/Processors/Transforms/FinishSortingTransform.cpp +++ b/src/Processors/Transforms/FinishSortingTransform.cpp @@ -37,20 +37,6 @@ FinishSortingTransform::FinishSortingTransform( description_sorted.assign(description.begin(), description.begin() + prefix_size); } -static bool less(const Columns & lhs, const Columns & rhs, size_t i, size_t j, const SortDescription & descr) -{ - for (const auto & elem : descr) - { - size_t ind = elem.column_number; - int res = elem.direction * lhs[ind]->compareAt(i, j, *rhs[ind], elem.nulls_direction); - if (res < 0) - return true; - else if (res > 0) - return false; - } - return false; -} - void FinishSortingTransform::consume(Chunk chunk) { generated_prefix = false; diff --git a/tests/queries/0_stateless/01291_aggregation_in_order.sql b/tests/queries/0_stateless/01291_aggregation_in_order.sql index 753075f2757..c4357811520 100644 --- a/tests/queries/0_stateless/01291_aggregation_in_order.sql +++ b/tests/queries/0_stateless/01291_aggregation_in_order.sql @@ -9,12 +9,12 @@ INSERT INTO pk_order(a, b, c, d) VALUES (2, 2, 107, 2), (2, 3, 108, 2), (2, 4, 1 -- Order after group by in order is determined -SELECT a, b FROM pk_order GROUP BY a, b; -SELECT a FROM pk_order GROUP BY a; +SELECT a, b FROM pk_order GROUP BY a, b ORDER BY a, b; +SELECT a FROM pk_order GROUP BY a ORDER BY a; -SELECT a, b, sum(c), avg(d) FROM pk_order GROUP BY a, b; -SELECT a, sum(c), avg(d) FROM pk_order GROUP BY a; -SELECT a, sum(c), avg(d) FROM pk_order GROUP BY -a; +SELECT a, b, sum(c), avg(d) FROM pk_order GROUP BY a, b ORDER BY a, b; +SELECT a, sum(c), avg(d) FROM pk_order GROUP BY a ORDER BY a; +SELECT a, sum(c), avg(d) FROM pk_order GROUP BY -a ORDER BY a; DROP TABLE IF EXISTS pk_order; @@ -26,8 +26,8 @@ INSERT INTO pk_order set max_block_size = 1; -SELECT d, max(b) FROM pk_order GROUP BY d, a LIMIT 5; -SELECT d, avg(a) FROM pk_order GROUP BY toString(d) LIMIT 5; -SELECT toStartOfHour(d) as d1, min(a), max(b) FROM pk_order GROUP BY d1 LIMIT 5; +SELECT d, max(b) FROM pk_order GROUP BY d, a ORDER BY d, a LIMIT 5; +SELECT d, avg(a) FROM pk_order GROUP BY toString(d) ORDER BY toString(d) LIMIT 5; +SELECT toStartOfHour(d) as d1, min(a), max(b) FROM pk_order GROUP BY d1 ORDER BY d1 LIMIT 5; DROP TABLE pk_order; From 4f6c880232cac8e9d9c6e0c79111de5a9fed8c91 Mon Sep 17 00:00:00 2001 From: Denis Glazachev Date: Mon, 25 Jan 2021 18:31:59 +0400 Subject: [PATCH 0077/2357] Pass and handle a chain of multiple prewhere infos --- src/Interpreters/InterpreterSelectQuery.cpp | 72 ++++++---- .../getHeaderForProcessingStage.cpp | 11 +- .../MergeTreeBaseSelectProcessor.cpp | 41 +++--- .../MergeTree/MergeTreeBaseSelectProcessor.h | 8 +- .../MergeTree/MergeTreeBlockReadUtils.cpp | 19 +-- .../MergeTree/MergeTreeBlockReadUtils.h | 7 +- .../MergeTree/MergeTreeDataSelectExecutor.cpp | 30 ++-- .../MergeTree/MergeTreeRangeReader.cpp | 136 ++++++++++++------ 
src/Storages/MergeTree/MergeTreeRangeReader.h | 13 +- src/Storages/MergeTree/MergeTreeReadPool.cpp | 8 +- src/Storages/MergeTree/MergeTreeReadPool.h | 9 +- .../MergeTreeReverseSelectProcessor.cpp | 11 +- .../MergeTreeReverseSelectProcessor.h | 2 +- .../MergeTree/MergeTreeSelectProcessor.cpp | 11 +- .../MergeTree/MergeTreeSelectProcessor.h | 2 +- ...rgeTreeThreadSelectBlockInputProcessor.cpp | 8 +- ...MergeTreeThreadSelectBlockInputProcessor.h | 2 +- src/Storages/SelectQueryInfo.h | 6 +- src/Storages/StorageBuffer.cpp | 25 ++-- 19 files changed, 247 insertions(+), 174 deletions(-) diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index 4f6b76e9b53..9dd63362dbd 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -1186,36 +1186,40 @@ void InterpreterSelectQuery::addEmptySourceToQueryPlan(QueryPlan & query_plan, c { Pipe pipe(std::make_shared(source_header)); - if (query_info.prewhere_info) + if (query_info.prewhere_info_list) { - if (query_info.prewhere_info->alias_actions) + for (const auto & prewhere_info : *query_info.prewhere_info_list) { + if (prewhere_info.alias_actions) + { + pipe.addSimpleTransform([&](const Block & header) + { + return std::make_shared( + header, prewhere_info.alias_actions); + }); + } + pipe.addSimpleTransform([&](const Block & header) { - return std::make_shared(header, query_info.prewhere_info->alias_actions); + return std::make_shared( + header, + prewhere_info.prewhere_actions, + prewhere_info.prewhere_column_name, + prewhere_info.remove_prewhere_column); }); - } - pipe.addSimpleTransform([&](const Block & header) - { - return std::make_shared( - header, - query_info.prewhere_info->prewhere_actions, - query_info.prewhere_info->prewhere_column_name, - query_info.prewhere_info->remove_prewhere_column); - }); - - // To remove additional columns - // In some cases, we did not read any marks so that the pipeline.streams is empty - // Thus, some columns in prewhere are not removed as expected - // This leads to mismatched header in distributed table - if (query_info.prewhere_info->remove_columns_actions) - { - pipe.addSimpleTransform([&](const Block & header) + // To remove additional columns + // In some cases, we did not read any marks so that the pipeline.streams is empty + // Thus, some columns in prewhere are not removed as expected + // This leads to mismatched header in distributed table + if (prewhere_info.remove_columns_actions) { - return std::make_shared( - header, query_info.prewhere_info->remove_columns_actions); - }); + pipe.addSimpleTransform([&](const Block & header) + { + return std::make_shared( + header, prewhere_info.remove_columns_actions); + }); + } } } @@ -1552,17 +1556,23 @@ void InterpreterSelectQuery::executeFetchColumns( if (prewhere_info) { - query_info.prewhere_info = std::make_shared( - std::make_shared(prewhere_info->prewhere_actions), - prewhere_info->prewhere_column_name); + if (!query_info.prewhere_info_list) + query_info.prewhere_info_list = std::make_shared(); + + query_info.prewhere_info_list->emplace_back( + std::make_shared(prewhere_info->prewhere_actions), + prewhere_info->prewhere_column_name); + + auto & new_prewhere_info = query_info.prewhere_info_list->back(); if (prewhere_info->alias_actions) - query_info.prewhere_info->alias_actions = std::make_shared(prewhere_info->alias_actions); - if (prewhere_info->remove_columns_actions) - query_info.prewhere_info->remove_columns_actions = 
std::make_shared(prewhere_info->remove_columns_actions); + new_prewhere_info.alias_actions = std::make_shared(prewhere_info->alias_actions); - query_info.prewhere_info->remove_prewhere_column = prewhere_info->remove_prewhere_column; - query_info.prewhere_info->need_filter = prewhere_info->need_filter; + if (prewhere_info->remove_columns_actions) + new_prewhere_info.remove_columns_actions = std::make_shared(prewhere_info->remove_columns_actions); + + new_prewhere_info.remove_prewhere_column = prewhere_info->remove_prewhere_column; + new_prewhere_info.need_filter = prewhere_info->need_filter; } /// Create optimizer with prepared actions. diff --git a/src/Interpreters/getHeaderForProcessingStage.cpp b/src/Interpreters/getHeaderForProcessingStage.cpp index e341a5637f4..761f04e81ee 100644 --- a/src/Interpreters/getHeaderForProcessingStage.cpp +++ b/src/Interpreters/getHeaderForProcessingStage.cpp @@ -42,11 +42,14 @@ Block getHeaderForProcessingStage( case QueryProcessingStage::FetchColumns: { Block header = metadata_snapshot->getSampleBlockForColumns(column_names, storage.getVirtuals(), storage.getStorageID()); - if (query_info.prewhere_info) + if (query_info.prewhere_info_list) { - query_info.prewhere_info->prewhere_actions->execute(header); - if (query_info.prewhere_info->remove_prewhere_column) - header.erase(query_info.prewhere_info->prewhere_column_name); + for (const auto & prewhere_info : *query_info.prewhere_info_list) + { + prewhere_info.prewhere_actions->execute(header); + if (prewhere_info.remove_prewhere_column) + header.erase(prewhere_info.prewhere_column_name); + } } return header; } diff --git a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp index c852151f27d..3405a211c98 100644 --- a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp @@ -22,17 +22,17 @@ MergeTreeBaseSelectProcessor::MergeTreeBaseSelectProcessor( Block header, const MergeTreeData & storage_, const StorageMetadataPtr & metadata_snapshot_, - const PrewhereInfoPtr & prewhere_info_, + const PrewhereInfoListPtr & prewhere_info_list_, UInt64 max_block_size_rows_, UInt64 preferred_block_size_bytes_, UInt64 preferred_max_column_in_block_size_bytes_, const MergeTreeReaderSettings & reader_settings_, bool use_uncompressed_cache_, const Names & virt_column_names_) - : SourceWithProgress(getHeader(std::move(header), prewhere_info_, virt_column_names_)) + : SourceWithProgress(getHeader(std::move(header), prewhere_info_list_, virt_column_names_)) , storage(storage_) , metadata_snapshot(metadata_snapshot_) - , prewhere_info(prewhere_info_) + , prewhere_info_list(prewhere_info_list_) , max_block_size_rows(max_block_size_rows_) , preferred_block_size_bytes(preferred_block_size_bytes_) , preferred_max_column_in_block_size_bytes(preferred_max_column_in_block_size_bytes_) @@ -70,18 +70,18 @@ Chunk MergeTreeBaseSelectProcessor::generate() void MergeTreeBaseSelectProcessor::initializeRangeReaders(MergeTreeReadTask & current_task) { - if (prewhere_info) + if (prewhere_info_list) { if (reader->getColumns().empty()) { - current_task.range_reader = MergeTreeRangeReader(pre_reader.get(), nullptr, prewhere_info, true); + current_task.range_reader = MergeTreeRangeReader(pre_reader.get(), nullptr, prewhere_info_list, true); } else { MergeTreeRangeReader * pre_reader_ptr = nullptr; if (pre_reader != nullptr) { - current_task.pre_range_reader = MergeTreeRangeReader(pre_reader.get(), nullptr, prewhere_info, 
false); + current_task.pre_range_reader = MergeTreeRangeReader(pre_reader.get(), nullptr, prewhere_info_list, false); pre_reader_ptr = &current_task.pre_range_reader; } @@ -309,34 +309,37 @@ void MergeTreeBaseSelectProcessor::injectVirtualColumns(Chunk & chunk, MergeTree chunk.setColumns(columns, num_rows); } -void MergeTreeBaseSelectProcessor::executePrewhereActions(Block & block, const PrewhereInfoPtr & prewhere_info) +void MergeTreeBaseSelectProcessor::executePrewhereActions(Block & block, const PrewhereInfoListPtr & prewhere_info_list) { - if (prewhere_info) - { - if (prewhere_info->alias_actions) - prewhere_info->alias_actions->execute(block); + if (!prewhere_info_list) + return; - prewhere_info->prewhere_actions->execute(block); - auto & prewhere_column = block.getByName(prewhere_info->prewhere_column_name); + for (const auto & prewhere_info : *prewhere_info_list) + { + if (prewhere_info.alias_actions) + prewhere_info.alias_actions->execute(block); + + prewhere_info.prewhere_actions->execute(block); + auto & prewhere_column = block.getByName(prewhere_info.prewhere_column_name); if (!prewhere_column.type->canBeUsedInBooleanContext()) throw Exception("Invalid type for filter in PREWHERE: " + prewhere_column.type->getName(), - ErrorCodes::LOGICAL_ERROR); + ErrorCodes::LOGICAL_ERROR); - if (prewhere_info->remove_prewhere_column) - block.erase(prewhere_info->prewhere_column_name); + if (prewhere_info.remove_prewhere_column) + block.erase(prewhere_info.prewhere_column_name); else { - auto & ctn = block.getByName(prewhere_info->prewhere_column_name); + auto & ctn = block.getByName(prewhere_info.prewhere_column_name); ctn.column = ctn.type->createColumnConst(block.rows(), 1u)->convertToFullColumnIfConst(); } } } Block MergeTreeBaseSelectProcessor::getHeader( - Block block, const PrewhereInfoPtr & prewhere_info, const Names & virtual_columns) + Block block, const PrewhereInfoListPtr & prewhere_info_list, const Names & virtual_columns) { - executePrewhereActions(block, prewhere_info); + executePrewhereActions(block, prewhere_info_list); injectVirtualColumns(block, nullptr, virtual_columns); return block; } diff --git a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h index 00ef131ae45..a3d7520b89a 100644 --- a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h +++ b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h @@ -23,7 +23,7 @@ public: Block header, const MergeTreeData & storage_, const StorageMetadataPtr & metadata_snapshot_, - const PrewhereInfoPtr & prewhere_info_, + const PrewhereInfoListPtr & prewhere_info_list_, UInt64 max_block_size_rows_, UInt64 preferred_block_size_bytes_, UInt64 preferred_max_column_in_block_size_bytes_, @@ -33,7 +33,7 @@ public: ~MergeTreeBaseSelectProcessor() override; - static void executePrewhereActions(Block & block, const PrewhereInfoPtr & prewhere_info); + static void executePrewhereActions(Block & block, const PrewhereInfoListPtr & prewhere_info_list); protected: Chunk generate() final; @@ -49,7 +49,7 @@ protected: static void injectVirtualColumns(Block & block, MergeTreeReadTask * task, const Names & virtual_columns); static void injectVirtualColumns(Chunk & chunk, MergeTreeReadTask * task, const Names & virtual_columns); - static Block getHeader(Block block, const PrewhereInfoPtr & prewhere_info, const Names & virtual_columns); + static Block getHeader(Block block, const PrewhereInfoListPtr & prewhere_info_list, const Names & virtual_columns); void initializeRangeReaders(MergeTreeReadTask &
task); @@ -57,7 +57,7 @@ protected: const MergeTreeData & storage; StorageMetadataPtr metadata_snapshot; - PrewhereInfoPtr prewhere_info; + PrewhereInfoListPtr prewhere_info_list; UInt64 max_block_size_rows; UInt64 preferred_block_size_bytes; diff --git a/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp b/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp index f8b5e0a9c0a..f3191a76120 100644 --- a/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp +++ b/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp @@ -118,11 +118,10 @@ NameSet injectRequiredColumns(const MergeTreeData & storage, const StorageMetada MergeTreeReadTask::MergeTreeReadTask( const MergeTreeData::DataPartPtr & data_part_, const MarkRanges & mark_ranges_, const size_t part_index_in_query_, const Names & ordered_names_, const NameSet & column_name_set_, const NamesAndTypesList & columns_, - const NamesAndTypesList & pre_columns_, const bool remove_prewhere_column_, const bool should_reorder_, - MergeTreeBlockSizePredictorPtr && size_predictor_) + const NamesAndTypesList & pre_columns_, const bool should_reorder_, MergeTreeBlockSizePredictorPtr && size_predictor_) : data_part{data_part_}, mark_ranges{mark_ranges_}, part_index_in_query{part_index_in_query_}, ordered_names{ordered_names_}, column_name_set{column_name_set_}, columns{columns_}, pre_columns{pre_columns_}, - remove_prewhere_column{remove_prewhere_column_}, should_reorder{should_reorder_}, size_predictor{std::move(size_predictor_)} + should_reorder{should_reorder_}, size_predictor{std::move(size_predictor_)} { } @@ -258,7 +257,7 @@ MergeTreeReadTaskColumns getReadTaskColumns( const StorageMetadataPtr & metadata_snapshot, const MergeTreeData::DataPartPtr & data_part, const Names & required_columns, - const PrewhereInfoPtr & prewhere_info, + const PrewhereInfoListPtr & prewhere_info_list, bool check_columns) { Names column_names = required_columns; @@ -267,12 +266,14 @@ MergeTreeReadTaskColumns getReadTaskColumns( /// inject columns required for defaults evaluation bool should_reorder = !injectRequiredColumns(storage, metadata_snapshot, data_part, column_names).empty(); - if (prewhere_info) + if (prewhere_info_list) { - if (prewhere_info->alias_actions) - pre_column_names = prewhere_info->alias_actions->getRequiredColumns(); - else - pre_column_names = prewhere_info->prewhere_actions->getRequiredColumns(); + for (const auto & prewhere_info : *prewhere_info_list) + { + const auto required_column_names = (prewhere_info.alias_actions ? + prewhere_info.alias_actions->getRequiredColumns() : prewhere_info.prewhere_actions->getRequiredColumns()); + pre_column_names.insert(pre_column_names.end(), required_column_names.begin(), required_column_names.end()); + } if (pre_column_names.empty()) pre_column_names.push_back(column_names[0]); diff --git a/src/Storages/MergeTree/MergeTreeBlockReadUtils.h b/src/Storages/MergeTree/MergeTreeBlockReadUtils.h index 31d609e4242..f2537c554c3 100644 --- a/src/Storages/MergeTree/MergeTreeBlockReadUtils.h +++ b/src/Storages/MergeTree/MergeTreeBlockReadUtils.h @@ -42,8 +42,6 @@ struct MergeTreeReadTask const NamesAndTypesList & columns; /// column names to read during PREWHERE const NamesAndTypesList & pre_columns; - /// should PREWHERE column be returned to requesting side? 
- const bool remove_prewhere_column; /// resulting block may require reordering in accordance with `ordered_names` const bool should_reorder; /// Used to satistfy preferred_block_size_bytes limitation @@ -57,8 +55,7 @@ struct MergeTreeReadTask MergeTreeReadTask( const MergeTreeData::DataPartPtr & data_part_, const MarkRanges & mark_ranges_, const size_t part_index_in_query_, const Names & ordered_names_, const NameSet & column_name_set_, const NamesAndTypesList & columns_, - const NamesAndTypesList & pre_columns_, const bool remove_prewhere_column_, const bool should_reorder_, - MergeTreeBlockSizePredictorPtr && size_predictor_); + const NamesAndTypesList & pre_columns_, const bool should_reorder_, MergeTreeBlockSizePredictorPtr && size_predictor_); virtual ~MergeTreeReadTask(); }; @@ -78,7 +75,7 @@ MergeTreeReadTaskColumns getReadTaskColumns( const StorageMetadataPtr & metadata_snapshot, const MergeTreeData::DataPartPtr & data_part, const Names & required_columns, - const PrewhereInfoPtr & prewhere_info, + const PrewhereInfoListPtr & prewhere_info_list, bool check_columns); struct MergeTreeBlockSizePredictor diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp index 4e1f307137a..248efc140fd 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp @@ -833,14 +833,20 @@ QueryPlanPtr MergeTreeDataSelectExecutor::readFromParts( plan->addStep(std::move(adding_column)); } - if (query_info.prewhere_info && query_info.prewhere_info->remove_columns_actions) + if (query_info.prewhere_info_list) { - auto expression_step = std::make_unique( - plan->getCurrentDataStream(), - query_info.prewhere_info->remove_columns_actions->getActionsDAG().clone()); + for (const auto & prewhere_info : *query_info.prewhere_info_list) + { + if (prewhere_info.remove_columns_actions) + { + auto expression_step = std::make_unique( + plan->getCurrentDataStream(), + prewhere_info.remove_columns_actions->getActionsDAG().clone()); - expression_step->setStepDescription("Remove unused columns after PREWHERE"); - plan->addStep(std::move(expression_step)); + expression_step->setStepDescription("Remove unused columns after PREWHERE"); + plan->addStep(std::move(expression_step)); + } + } } return plan; @@ -948,7 +954,7 @@ QueryPlanPtr MergeTreeDataSelectExecutor::spreadMarkRangesAmongStreams( std::move(parts), data, metadata_snapshot, - query_info.prewhere_info, + query_info.prewhere_info_list, true, column_names, MergeTreeReadPool::BackoffSettings(settings), @@ -964,7 +970,7 @@ QueryPlanPtr MergeTreeDataSelectExecutor::spreadMarkRangesAmongStreams( i, pool, min_marks_for_concurrent_read, max_block_size, settings.preferred_block_size_bytes, settings.preferred_max_column_in_block_size_bytes, data, metadata_snapshot, use_uncompressed_cache, - query_info.prewhere_info, reader_settings, virt_columns); + query_info.prewhere_info_list, reader_settings, virt_columns); if (i == 0) { @@ -987,7 +993,7 @@ QueryPlanPtr MergeTreeDataSelectExecutor::spreadMarkRangesAmongStreams( auto source = std::make_shared( data, metadata_snapshot, part.data_part, max_block_size, settings.preferred_block_size_bytes, settings.preferred_max_column_in_block_size_bytes, column_names, part.ranges, use_uncompressed_cache, - query_info.prewhere_info, true, reader_settings, virt_columns, part.part_index_in_query); + query_info.prewhere_info_list, true, reader_settings, virt_columns, part.part_index_in_query); 
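With a chain of PREWHERE steps each step produces its own row filter, and the range reader has to combine them: a row survives only if every filter in the chain keeps it. That is what the renamed ReadResult::addFilter in MergeTreeRangeReader.cpp below does by AND-ing the incoming filter into the accumulated one and short-circuiting when the result becomes constant. A self-contained sketch of that combining step; the helper and the FilterKind classification mirror the idea only, not the actual ClickHouse column types:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

enum class FilterKind { AlwaysTrue, AlwaysFalse, Mixed };

/// AND the new filter into the accumulated one and report whether the result
/// degenerates into "keep everything" or "drop everything", so the caller can
/// short-circuit the way ReadResult does (setFilterConstTrue() / clear()).
static FilterKind combineFilters(std::vector<uint8_t> & accumulated, const std::vector<uint8_t> & added)
{
    if (accumulated.empty())
        accumulated.assign(added.size(), 1); /// no filter yet means "all rows pass"

    for (std::size_t i = 0; i < accumulated.size() && i < added.size(); ++i)
        accumulated[i] = accumulated[i] && added[i];

    bool any = std::any_of(accumulated.begin(), accumulated.end(), [](uint8_t x) { return x != 0; });
    bool all = std::all_of(accumulated.begin(), accumulated.end(), [](uint8_t x) { return x != 0; });

    if (!any)
        return FilterKind::AlwaysFalse;
    return all ? FilterKind::AlwaysTrue : FilterKind::Mixed;
}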
res.emplace_back(std::move(source)); } @@ -1187,7 +1193,7 @@ QueryPlanPtr MergeTreeDataSelectExecutor::spreadMarkRangesAmongStreamsWithOrder( column_names, ranges_to_get_from_part, use_uncompressed_cache, - query_info.prewhere_info, + query_info.prewhere_info_list, true, reader_settings, virt_columns, @@ -1205,7 +1211,7 @@ QueryPlanPtr MergeTreeDataSelectExecutor::spreadMarkRangesAmongStreamsWithOrder( column_names, ranges_to_get_from_part, use_uncompressed_cache, - query_info.prewhere_info, + query_info.prewhere_info_list, true, reader_settings, virt_columns, @@ -1359,7 +1365,7 @@ QueryPlanPtr MergeTreeDataSelectExecutor::spreadMarkRangesAmongStreamsFinal( column_names, part_it->ranges, use_uncompressed_cache, - query_info.prewhere_info, + query_info.prewhere_info_list, true, reader_settings, virt_columns, diff --git a/src/Storages/MergeTree/MergeTreeRangeReader.cpp b/src/Storages/MergeTree/MergeTreeRangeReader.cpp index c13146bd35c..2ca2b30a5eb 100644 --- a/src/Storages/MergeTree/MergeTreeRangeReader.cpp +++ b/src/Storages/MergeTree/MergeTreeRangeReader.cpp @@ -443,32 +443,79 @@ size_t MergeTreeRangeReader::ReadResult::numZerosInTail(const UInt8 * begin, con return count; } -void MergeTreeRangeReader::ReadResult::setFilter(const ColumnPtr & new_filter) +void MergeTreeRangeReader::ReadResult::addFilter(const ColumnPtr & new_filter) { - if (!new_filter && filter) - throw Exception("Can't replace existing filter with empty.", ErrorCodes::LOGICAL_ERROR); - if (filter) { - size_t new_size = new_filter->size(); + if (!new_filter) + throw Exception("Can't add an empty filter to the existing one.", ErrorCodes::LOGICAL_ERROR); + const auto new_size = new_filter->size(); if (new_size != total_rows_per_granule) - throw Exception("Can't set filter because it's size is " + toString(new_size) + " but " + throw Exception("Can't add the new filter because it's size is " + toString(new_size) + " but " + toString(total_rows_per_granule) + " rows was read.", ErrorCodes::LOGICAL_ERROR); } ConstantFilterDescription const_description(*new_filter); if (const_description.always_true) - setFilterConstTrue(); + { + if (!filter) + setFilterConstTrue(); + } else if (const_description.always_false) + { clear(); + } else { - FilterDescription filter_description(*new_filter); - filter_holder = filter_description.data_holder ? filter_description.data_holder : new_filter; - filter = typeid_cast(filter_holder.get()); - if (!filter) - throw Exception("setFilter function expected ColumnUInt8.", ErrorCodes::LOGICAL_ERROR); + FilterDescription description(*new_filter); + auto new_holder = (description.data_holder ? 
description.data_holder : new_filter); + auto * new_holder_cast = typeid_cast(new_holder.get()); + + if (!new_holder_cast) + throw Exception("addFilter function expected ColumnUInt8.", ErrorCodes::LOGICAL_ERROR); + + if (filter) + { + MutableColumnPtr new_mutable_holder = IColumn::mutate(std::move(new_holder)); + auto * new_mutable_holder_cast = typeid_cast(new_mutable_holder.get()); + + if (!new_mutable_holder_cast) + throw Exception("addFilter function expected ColumnUInt8.", ErrorCodes::LOGICAL_ERROR); + + const auto & data = filter->getData(); + auto it = data.begin(); + + auto & new_data = new_mutable_holder_cast->getData(); + auto n_it = new_data.begin(); + + while (it != data.end() && n_it != new_data.end()) + { + *n_it = (*n_it && *it); + ++it; + ++n_it; + } + + ConstantFilterDescription new_const_description(*new_mutable_holder); + if (new_const_description.always_true) + { + setFilterConstTrue(); + } + else if (new_const_description.always_false) + { + clear(); + } + else + { + filter_holder = std::move(new_mutable_holder); + filter = new_mutable_holder_cast; + } + } + else + { + filter_holder = std::move(new_holder); + filter = new_holder_cast; + } } } @@ -489,11 +536,14 @@ size_t MergeTreeRangeReader::ReadResult::countBytesInResultFilter(const IColumn: MergeTreeRangeReader::MergeTreeRangeReader( IMergeTreeReader * merge_tree_reader_, MergeTreeRangeReader * prev_reader_, - const PrewhereInfoPtr & prewhere_, + const PrewhereInfoListPtr & prewhere_info_list_, bool last_reader_in_chain_) : merge_tree_reader(merge_tree_reader_) - , index_granularity(&(merge_tree_reader->data_part->index_granularity)), prev_reader(prev_reader_) - , prewhere(prewhere_), last_reader_in_chain(last_reader_in_chain_), is_initialized(true) + , index_granularity(&(merge_tree_reader->data_part->index_granularity)) + , prev_reader(prev_reader_) + , prewhere_info_list(prewhere_info_list_) + , last_reader_in_chain(last_reader_in_chain_) + , is_initialized(true) { if (prev_reader) sample_block = prev_reader->getSampleBlock(); @@ -501,16 +551,19 @@ MergeTreeRangeReader::MergeTreeRangeReader( for (const auto & name_and_type : merge_tree_reader->getColumns()) sample_block.insert({name_and_type.type->createColumn(), name_and_type.type, name_and_type.name}); - if (prewhere) + if (prewhere_info_list) { - if (prewhere->alias_actions) - prewhere->alias_actions->execute(sample_block, true); + for (const auto & prewhere_info : *prewhere_info_list) + { + if (prewhere_info.alias_actions) + prewhere_info.alias_actions->execute(sample_block, true); - if (prewhere->prewhere_actions) - prewhere->prewhere_actions->execute(sample_block, true); + if (prewhere_info.prewhere_actions) + prewhere_info.prewhere_actions->execute(sample_block, true); - if (prewhere->remove_prewhere_column) - sample_block.erase(prewhere->prewhere_column_name); + if (prewhere_info.remove_prewhere_column) + sample_block.erase(prewhere_info.prewhere_column_name); + } } } @@ -701,7 +754,13 @@ MergeTreeRangeReader::ReadResult MergeTreeRangeReader::read(size_t max_rows, Mar if (read_result.num_rows == 0) return read_result; - executePrewhereActionsAndFilterColumns(read_result); + if (prewhere_info_list) + { + for (const auto & prewhere_info : *prewhere_info_list) + { + executePrewhereActionsAndFilterColumns(read_result, prewhere_info); + } + } return read_result; } @@ -798,11 +857,8 @@ Columns MergeTreeRangeReader::continueReadingChain(ReadResult & result, size_t & return columns; } -void MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(ReadResult & 
result) +void MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(ReadResult & result, const PrewhereInfo & prewhere_info) { - if (!prewhere) - return; - const auto & header = merge_tree_reader->getColumns(); size_t num_columns = header.size(); @@ -831,14 +887,14 @@ void MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(ReadResult & r for (auto name_and_type = header.begin(); pos < num_columns; ++pos, ++name_and_type) block.insert({result.columns[pos], name_and_type->type, name_and_type->name}); - if (prewhere->alias_actions) - prewhere->alias_actions->execute(block); + if (prewhere_info.alias_actions) + prewhere_info.alias_actions->execute(block); /// Columns might be projected out. We need to store them here so that default columns can be evaluated later. result.block_before_prewhere = block; - prewhere->prewhere_actions->execute(block); + prewhere_info.prewhere_actions->execute(block); - prewhere_column_pos = block.getPositionByName(prewhere->prewhere_column_name); + prewhere_column_pos = block.getPositionByName(prewhere_info.prewhere_column_name); result.columns.clear(); result.columns.reserve(block.columns()); @@ -848,15 +904,7 @@ void MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(ReadResult & r filter.swap(result.columns[prewhere_column_pos]); } - if (result.getFilter()) - { - /// TODO: implement for prewhere chain. - /// In order to do it we need combine filter and result.filter, where filter filters only '1' in result.filter. - throw Exception("MergeTreeRangeReader chain with several prewhere actions in not implemented.", - ErrorCodes::LOGICAL_ERROR); - } - - result.setFilter(filter); + result.addFilter(filter); /// If there is a WHERE, we filter in there, and only optimize IO and shrink columns here if (!last_reader_in_chain) @@ -866,7 +914,7 @@ void MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(ReadResult & r if (result.totalRowsPerGranule() == 0) result.setFilterConstFalse(); /// If we need to filter in PREWHERE - else if (prewhere->need_filter || result.need_filter) + else if (prewhere_info.need_filter || result.need_filter) { /// If there is a filter and without optimized if (result.getFilter() && last_reader_in_chain) @@ -907,11 +955,11 @@ void MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(ReadResult & r /// Check if the PREWHERE column is needed if (!result.columns.empty()) { - if (prewhere->remove_prewhere_column) + if (prewhere_info.remove_prewhere_column) result.columns.erase(result.columns.begin() + prewhere_column_pos); else result.columns[prewhere_column_pos] = - getSampleBlock().getByName(prewhere->prewhere_column_name).type-> + getSampleBlock().getByName(prewhere_info.prewhere_column_name).type-> createColumnConst(result.num_rows, 1u)->convertToFullColumnIfConst(); } } @@ -919,7 +967,7 @@ void MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(ReadResult & r else { result.columns[prewhere_column_pos] = result.getFilterHolder()->convertToFullColumnIfConst(); - if (getSampleBlock().getByName(prewhere->prewhere_column_name).type->isNullable()) + if (getSampleBlock().getByName(prewhere_info.prewhere_column_name).type->isNullable()) result.columns[prewhere_column_pos] = makeNullable(std::move(result.columns[prewhere_column_pos])); result.clearFilter(); // Acting as a flag to not filter in PREWHERE } diff --git a/src/Storages/MergeTree/MergeTreeRangeReader.h b/src/Storages/MergeTree/MergeTreeRangeReader.h index 381b87ecffd..8f8482d1abf 100644 --- a/src/Storages/MergeTree/MergeTreeRangeReader.h +++ 
b/src/Storages/MergeTree/MergeTreeRangeReader.h @@ -13,7 +13,8 @@ using ColumnUInt8 = ColumnVector; class IMergeTreeReader; class MergeTreeIndexGranularity; struct PrewhereInfo; -using PrewhereInfoPtr = std::shared_ptr; +using PrewhereInfoList = std::vector; +using PrewhereInfoListPtr = std::shared_ptr; /// MergeTreeReader iterator which allows sequential reading for arbitrary number of rows between pairs of marks in the same part. /// Stores reading state, which can be inside granule. Can skip rows in current granule and start reading from next mark. @@ -24,7 +25,7 @@ public: MergeTreeRangeReader( IMergeTreeReader * merge_tree_reader_, MergeTreeRangeReader * prev_reader_, - const PrewhereInfoPtr & prewhere_, + const PrewhereInfoListPtr & prewhere_info_list, bool last_reader_in_chain_); MergeTreeRangeReader() = default; @@ -153,8 +154,8 @@ public: void addRows(size_t rows) { num_read_rows += rows; } void addRange(const MarkRange & range) { started_ranges.push_back({rows_per_granule.size(), range}); } - /// Set filter or replace old one. Filter must have more zeroes than previous. - void setFilter(const ColumnPtr & new_filter); + /// Apply a filter on top of the existing one (AND'ed) or set it if there isn't any. + void addFilter(const ColumnPtr & new_filter); /// For each granule calculate the number of filtered rows at the end. Remove them and update filter. void optimize(bool can_read_incomplete_granules); /// Remove all rows from granules. @@ -212,12 +213,12 @@ private: ReadResult startReadingChain(size_t max_rows, MarkRanges & ranges); Columns continueReadingChain(ReadResult & result, size_t & num_rows); - void executePrewhereActionsAndFilterColumns(ReadResult & result); + void executePrewhereActionsAndFilterColumns(ReadResult & result, const PrewhereInfo & prewhere_info); IMergeTreeReader * merge_tree_reader = nullptr; const MergeTreeIndexGranularity * index_granularity = nullptr; MergeTreeRangeReader * prev_reader = nullptr; /// If not nullptr, read from prev_reader firstly. - PrewhereInfoPtr prewhere; + PrewhereInfoListPtr prewhere_info_list; Stream stream; diff --git a/src/Storages/MergeTree/MergeTreeReadPool.cpp b/src/Storages/MergeTree/MergeTreeReadPool.cpp index d9a250e3f7a..a3a580fa7f2 100644 --- a/src/Storages/MergeTree/MergeTreeReadPool.cpp +++ b/src/Storages/MergeTree/MergeTreeReadPool.cpp @@ -24,7 +24,7 @@ MergeTreeReadPool::MergeTreeReadPool( RangesInDataParts && parts_, const MergeTreeData & data_, const StorageMetadataPtr & metadata_snapshot_, - const PrewhereInfoPtr & prewhere_info_, + const PrewhereInfoListPtr & prewhere_info_list_, const bool check_columns_, const Names & column_names_, const BackoffSettings & backoff_settings_, @@ -37,7 +37,7 @@ MergeTreeReadPool::MergeTreeReadPool( , column_names{column_names_} , do_not_steal_tasks{do_not_steal_tasks_} , predict_block_size_bytes{preferred_block_size_bytes_ > 0} - , prewhere_info{prewhere_info_} + , prewhere_info_list{prewhere_info_list_} , parts_ranges{std::move(parts_)} { /// parts don't contain duplicate MergeTreeDataPart's. 
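The renamed `ReadResult::addFilter` (declared in the header hunk above) no longer rejects a second filter: the new filter is AND-combined with the existing one, so each PREWHERE step in a chain can contribute its own filter. A self-contained model of the merging rule, with invented helper names and `std::vector<uint8_t>` standing in for `ColumnUInt8`:

```cpp
#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <vector>

using Filter = std::vector<uint8_t>;   /// stand-in for the data of a ColumnUInt8 filter

/// Model of the addFilter merging rule. An empty vector plays the role of "no filter yet"
/// (i.e. const-true); otherwise the result is the element-wise AND of both filters.
Filter combineFilters(Filter existing, const Filter & added)
{
    if (!existing.empty() && existing.size() != added.size())
        throw std::logic_error("both filters must cover the same number of rows");

    if (existing.empty())
        return added;                               /// first step of the chain: adopt the filter as-is

    for (size_t i = 0; i < existing.size(); ++i)
        existing[i] = existing[i] && added[i];      /// a later step can only narrow the result

    /// The real code also collapses to const-true / const-false (setFilterConstTrue / clear)
    /// when every bit ends up 1 or 0, so trivial steps cost nothing downstream.
    return existing;
}

int main()
{
    Filter combined = combineFilters({}, {1, 1, 0, 1});              /// e.g. user PREWHERE
    combined = combineFilters(std::move(combined), {1, 0, 1, 1});    /// e.g. row-policy filter
    for (uint8_t bit : combined)
        std::cout << int(bit) << ' ';                                /// prints: 1 0 0 1
    std::cout << '\n';
}
```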
@@ -139,7 +139,7 @@ MergeTreeReadTaskPtr MergeTreeReadPool::getTask(const size_t min_marks_to_read, return std::make_unique( part.data_part, ranges_to_get_from_part, part.part_index_in_query, ordered_names, per_part_column_name_set[part_idx], per_part_columns[part_idx], per_part_pre_columns[part_idx], - prewhere_info && prewhere_info->remove_prewhere_column, per_part_should_reorder[part_idx], std::move(curr_task_size_predictor)); + per_part_should_reorder[part_idx], std::move(curr_task_size_predictor)); } MarkRanges MergeTreeReadPool::getRestMarks(const IMergeTreeDataPart & part, const MarkRange & from) const @@ -229,7 +229,7 @@ std::vector MergeTreeReadPool::fillPerPartInfo( per_part_sum_marks.push_back(sum_marks); auto [required_columns, required_pre_columns, should_reorder] = - getReadTaskColumns(data, metadata_snapshot, part.data_part, column_names, prewhere_info, check_columns); + getReadTaskColumns(data, metadata_snapshot, part.data_part, column_names, prewhere_info_list, check_columns); /// will be used to distinguish between PREWHERE and WHERE columns when applying filter const auto & required_column_names = required_columns.getNames(); diff --git a/src/Storages/MergeTree/MergeTreeReadPool.h b/src/Storages/MergeTree/MergeTreeReadPool.h index aa6811661e6..ec9523ccbe3 100644 --- a/src/Storages/MergeTree/MergeTreeReadPool.h +++ b/src/Storages/MergeTree/MergeTreeReadPool.h @@ -71,10 +71,9 @@ private: public: MergeTreeReadPool( const size_t threads_, const size_t sum_marks_, const size_t min_marks_for_concurrent_read_, - RangesInDataParts && parts_, const MergeTreeData & data_, const StorageMetadataPtr & metadata_snapshot_, const PrewhereInfoPtr & prewhere_info_, - const bool check_columns_, const Names & column_names_, - const BackoffSettings & backoff_settings_, size_t preferred_block_size_bytes_, - const bool do_not_steal_tasks_ = false); + RangesInDataParts && parts_, const MergeTreeData & data_, const StorageMetadataPtr & metadata_snapshot_, + const PrewhereInfoListPtr & prewhere_info_list, const bool check_columns_, const Names & column_names_, + const BackoffSettings & backoff_settings_, size_t preferred_block_size_bytes_, const bool do_not_steal_tasks_ = false); MergeTreeReadTaskPtr getTask(const size_t min_marks_to_read, const size_t thread, const Names & ordered_names); @@ -107,7 +106,7 @@ private: std::vector per_part_pre_columns; std::vector per_part_should_reorder; std::vector per_part_size_predictor; - PrewhereInfoPtr prewhere_info; + PrewhereInfoListPtr prewhere_info_list; struct Part { diff --git a/src/Storages/MergeTree/MergeTreeReverseSelectProcessor.cpp b/src/Storages/MergeTree/MergeTreeReverseSelectProcessor.cpp index ee0a77ba3cf..35df1106339 100644 --- a/src/Storages/MergeTree/MergeTreeReverseSelectProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeReverseSelectProcessor.cpp @@ -22,7 +22,7 @@ MergeTreeReverseSelectProcessor::MergeTreeReverseSelectProcessor( Names required_columns_, MarkRanges mark_ranges_, bool use_uncompressed_cache_, - const PrewhereInfoPtr & prewhere_info_, + const PrewhereInfoListPtr & prewhere_info_list_, bool check_columns, const MergeTreeReaderSettings & reader_settings_, const Names & virt_column_names_, @@ -31,7 +31,7 @@ MergeTreeReverseSelectProcessor::MergeTreeReverseSelectProcessor( : MergeTreeBaseSelectProcessor{ metadata_snapshot_->getSampleBlockForColumns(required_columns_, storage_.getVirtuals(), storage_.getStorageID()), - storage_, metadata_snapshot_, prewhere_info_, max_block_size_rows_, + storage_, metadata_snapshot_, 
prewhere_info_list_, max_block_size_rows_, preferred_block_size_bytes_, preferred_max_column_in_block_size_bytes_, reader_settings_, use_uncompressed_cache_, virt_column_names_}, required_columns{std::move(required_columns_)}, @@ -56,7 +56,7 @@ MergeTreeReverseSelectProcessor::MergeTreeReverseSelectProcessor( ordered_names = header_without_virtual_columns.getNames(); - task_columns = getReadTaskColumns(storage, metadata_snapshot, data_part, required_columns, prewhere_info, check_columns); + task_columns = getReadTaskColumns(storage, metadata_snapshot, data_part, required_columns, prewhere_info_list, check_columns); /// will be used to distinguish between PREWHERE and WHERE columns when applying filter const auto & column_names = task_columns.columns.getNames(); @@ -71,7 +71,7 @@ MergeTreeReverseSelectProcessor::MergeTreeReverseSelectProcessor( all_mark_ranges, owned_uncompressed_cache.get(), owned_mark_cache.get(), reader_settings); - if (prewhere_info) + if (prewhere_info_list) pre_reader = data_part->getReader(task_columns.pre_columns, metadata_snapshot, all_mark_ranges, owned_uncompressed_cache.get(), owned_mark_cache.get(), reader_settings); } @@ -100,8 +100,7 @@ try task = std::make_unique( data_part, mark_ranges_for_task, part_index_in_query, ordered_names, column_name_set, - task_columns.columns, task_columns.pre_columns, prewhere_info && prewhere_info->remove_prewhere_column, - task_columns.should_reorder, std::move(size_predictor)); + task_columns.columns, task_columns.pre_columns, task_columns.should_reorder, std::move(size_predictor)); return true; } diff --git a/src/Storages/MergeTree/MergeTreeReverseSelectProcessor.h b/src/Storages/MergeTree/MergeTreeReverseSelectProcessor.h index c9fd06c5534..b6da7166457 100644 --- a/src/Storages/MergeTree/MergeTreeReverseSelectProcessor.h +++ b/src/Storages/MergeTree/MergeTreeReverseSelectProcessor.h @@ -26,7 +26,7 @@ public: Names required_columns_, MarkRanges mark_ranges, bool use_uncompressed_cache, - const PrewhereInfoPtr & prewhere_info, + const PrewhereInfoListPtr & prewhere_info_list, bool check_columns, const MergeTreeReaderSettings & reader_settings, const Names & virt_column_names = {}, diff --git a/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp b/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp index 65f9b1eba3b..cdb97f47a47 100644 --- a/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp @@ -22,7 +22,7 @@ MergeTreeSelectProcessor::MergeTreeSelectProcessor( Names required_columns_, MarkRanges mark_ranges_, bool use_uncompressed_cache_, - const PrewhereInfoPtr & prewhere_info_, + const PrewhereInfoListPtr & prewhere_info_list_, bool check_columns_, const MergeTreeReaderSettings & reader_settings_, const Names & virt_column_names_, @@ -31,7 +31,7 @@ MergeTreeSelectProcessor::MergeTreeSelectProcessor( : MergeTreeBaseSelectProcessor{ metadata_snapshot_->getSampleBlockForColumns(required_columns_, storage_.getVirtuals(), storage_.getStorageID()), - storage_, metadata_snapshot_, prewhere_info_, max_block_size_rows_, + storage_, metadata_snapshot_, prewhere_info_list_, max_block_size_rows_, preferred_block_size_bytes_, preferred_max_column_in_block_size_bytes_, reader_settings_, use_uncompressed_cache_, virt_column_names_}, required_columns{std::move(required_columns_)}, @@ -69,7 +69,7 @@ try task_columns = getReadTaskColumns( storage, metadata_snapshot, data_part, - required_columns, prewhere_info, check_columns); + required_columns, prewhere_info_list, check_columns); 
auto size_predictor = (preferred_block_size_bytes == 0) ? nullptr @@ -81,8 +81,7 @@ try task = std::make_unique( data_part, all_mark_ranges, part_index_in_query, ordered_names, column_name_set, task_columns.columns, - task_columns.pre_columns, prewhere_info && prewhere_info->remove_prewhere_column, - task_columns.should_reorder, std::move(size_predictor)); + task_columns.pre_columns, task_columns.should_reorder, std::move(size_predictor)); if (!reader) { @@ -94,7 +93,7 @@ try reader = data_part->getReader(task_columns.columns, metadata_snapshot, all_mark_ranges, owned_uncompressed_cache.get(), owned_mark_cache.get(), reader_settings); - if (prewhere_info) + if (prewhere_info_list) pre_reader = data_part->getReader(task_columns.pre_columns, metadata_snapshot, all_mark_ranges, owned_uncompressed_cache.get(), owned_mark_cache.get(), reader_settings); } diff --git a/src/Storages/MergeTree/MergeTreeSelectProcessor.h b/src/Storages/MergeTree/MergeTreeSelectProcessor.h index 925c437f1ce..521bbbfdba4 100644 --- a/src/Storages/MergeTree/MergeTreeSelectProcessor.h +++ b/src/Storages/MergeTree/MergeTreeSelectProcessor.h @@ -26,7 +26,7 @@ public: Names required_columns_, MarkRanges mark_ranges, bool use_uncompressed_cache, - const PrewhereInfoPtr & prewhere_info, + const PrewhereInfoListPtr & prewhere_info_list, bool check_columns, const MergeTreeReaderSettings & reader_settings, const Names & virt_column_names = {}, diff --git a/src/Storages/MergeTree/MergeTreeThreadSelectBlockInputProcessor.cpp b/src/Storages/MergeTree/MergeTreeThreadSelectBlockInputProcessor.cpp index f57247e39ab..eb1a80acb49 100644 --- a/src/Storages/MergeTree/MergeTreeThreadSelectBlockInputProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeThreadSelectBlockInputProcessor.cpp @@ -18,12 +18,12 @@ MergeTreeThreadSelectBlockInputProcessor::MergeTreeThreadSelectBlockInputProcess const MergeTreeData & storage_, const StorageMetadataPtr & metadata_snapshot_, const bool use_uncompressed_cache_, - const PrewhereInfoPtr & prewhere_info_, + const PrewhereInfoListPtr & prewhere_info_list_, const MergeTreeReaderSettings & reader_settings_, const Names & virt_column_names_) : MergeTreeBaseSelectProcessor{ - pool_->getHeader(), storage_, metadata_snapshot_, prewhere_info_, + pool_->getHeader(), storage_, metadata_snapshot_, prewhere_info_list_, max_block_size_rows_, preferred_block_size_bytes_, preferred_max_column_in_block_size_bytes_, reader_settings_, use_uncompressed_cache_, virt_column_names_}, @@ -78,7 +78,7 @@ bool MergeTreeThreadSelectBlockInputProcessor::getNewTask() owned_uncompressed_cache.get(), owned_mark_cache.get(), reader_settings, IMergeTreeReader::ValueSizeMap{}, profile_callback); - if (prewhere_info) + if (prewhere_info_list) pre_reader = task->data_part->getReader(task->pre_columns, metadata_snapshot, rest_mark_ranges, owned_uncompressed_cache.get(), owned_mark_cache.get(), reader_settings, IMergeTreeReader::ValueSizeMap{}, profile_callback); @@ -94,7 +94,7 @@ bool MergeTreeThreadSelectBlockInputProcessor::getNewTask() owned_uncompressed_cache.get(), owned_mark_cache.get(), reader_settings, reader->getAvgValueSizeHints(), profile_callback); - if (prewhere_info) + if (prewhere_info_list) pre_reader = task->data_part->getReader(task->pre_columns, metadata_snapshot, rest_mark_ranges, owned_uncompressed_cache.get(), owned_mark_cache.get(), reader_settings, reader->getAvgValueSizeHints(), profile_callback); diff --git a/src/Storages/MergeTree/MergeTreeThreadSelectBlockInputProcessor.h 
b/src/Storages/MergeTree/MergeTreeThreadSelectBlockInputProcessor.h index 2b2ed36fc18..dd3ba8c973c 100644 --- a/src/Storages/MergeTree/MergeTreeThreadSelectBlockInputProcessor.h +++ b/src/Storages/MergeTree/MergeTreeThreadSelectBlockInputProcessor.h @@ -24,7 +24,7 @@ public: const MergeTreeData & storage_, const StorageMetadataPtr & metadata_snapshot_, const bool use_uncompressed_cache_, - const PrewhereInfoPtr & prewhere_info_, + const PrewhereInfoListPtr & prewhere_info_list_, const MergeTreeReaderSettings & reader_settings_, const Names & virt_column_names_); diff --git a/src/Storages/SelectQueryInfo.h b/src/Storages/SelectQueryInfo.h index 5a3ada6288b..68f2f8f1361 100644 --- a/src/Storages/SelectQueryInfo.h +++ b/src/Storages/SelectQueryInfo.h @@ -32,6 +32,8 @@ struct PrewhereInfo : prewhere_actions(std::move(prewhere_actions_)), prewhere_column_name(std::move(prewhere_column_name_)) {} }; +using PrewhereInfoList = std::vector; + /// Same as PrewhereInfo, but with ActionsDAG struct PrewhereDAGInfo { @@ -75,7 +77,7 @@ struct InputOrderInfo bool operator !=(const InputOrderInfo & other) const { return !(*this == other); } }; -using PrewhereInfoPtr = std::shared_ptr; +using PrewhereInfoListPtr = std::shared_ptr; using PrewhereDAGInfoPtr = std::shared_ptr; using FilterInfoPtr = std::shared_ptr; using InputOrderInfoPtr = std::shared_ptr; @@ -104,7 +106,7 @@ struct SelectQueryInfo TreeRewriterResultPtr syntax_analyzer_result; - PrewhereInfoPtr prewhere_info; + PrewhereInfoListPtr prewhere_info_list; ReadInOrderOptimizerPtr order_optimizer; /// Can be modified while reading from storage diff --git a/src/Storages/StorageBuffer.cpp b/src/Storages/StorageBuffer.cpp index ce74567c62b..53fee054f4b 100644 --- a/src/Storages/StorageBuffer.cpp +++ b/src/Storages/StorageBuffer.cpp @@ -314,21 +314,26 @@ void StorageBuffer::read( } else { - if (query_info.prewhere_info) + if (query_info.prewhere_info_list) { - pipe_from_buffers.addSimpleTransform([&](const Block & header) - { - return std::make_shared( - header, query_info.prewhere_info->prewhere_actions, - query_info.prewhere_info->prewhere_column_name, query_info.prewhere_info->remove_prewhere_column); - }); - - if (query_info.prewhere_info->alias_actions) + for (const auto & prewhere_info : *query_info.prewhere_info_list) { pipe_from_buffers.addSimpleTransform([&](const Block & header) { - return std::make_shared(header, query_info.prewhere_info->alias_actions); + return std::make_shared( + header, prewhere_info.prewhere_actions, + prewhere_info.prewhere_column_name, + prewhere_info.remove_prewhere_column); }); + + if (prewhere_info.alias_actions) + { + pipe_from_buffers.addSimpleTransform([&](const Block & header) + { + return std::make_shared( + header, prewhere_info.alias_actions); + }); + } } } From ff5ce1a5ae9d5912954ed6027fa4436e96387273 Mon Sep 17 00:00:00 2001 From: Denis Glazachev Date: Mon, 25 Jan 2021 22:01:59 +0400 Subject: [PATCH 0078/2357] Fix compilation --- src/Storages/MergeTree/MergeTreeRangeReader.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/MergeTreeRangeReader.cpp b/src/Storages/MergeTree/MergeTreeRangeReader.cpp index 2ca2b30a5eb..361ab2d227a 100644 --- a/src/Storages/MergeTree/MergeTreeRangeReader.cpp +++ b/src/Storages/MergeTree/MergeTreeRangeReader.cpp @@ -470,7 +470,7 @@ void MergeTreeRangeReader::ReadResult::addFilter(const ColumnPtr & new_filter) { FilterDescription description(*new_filter); auto new_holder = (description.data_holder ? 
description.data_holder : new_filter); - auto * new_holder_cast = typeid_cast(new_holder.get()); + const auto * new_holder_cast = typeid_cast(new_holder.get()); if (!new_holder_cast) throw Exception("addFilter function expected ColumnUInt8.", ErrorCodes::LOGICAL_ERROR); From 4870e0af691f62e25efa706c891a118e309796a7 Mon Sep 17 00:00:00 2001 From: Denis Glazachev Date: Mon, 25 Jan 2021 22:09:17 +0400 Subject: [PATCH 0079/2357] Add filter as a (first) prewhere --- src/Interpreters/ExpressionAnalyzer.cpp | 25 ++++++--- src/Interpreters/InterpreterSelectQuery.cpp | 59 ++++++++++++++++----- src/Interpreters/InterpreterSelectQuery.h | 7 +-- 3 files changed, 64 insertions(+), 27 deletions(-) diff --git a/src/Interpreters/ExpressionAnalyzer.cpp b/src/Interpreters/ExpressionAnalyzer.cpp index 13f23643c3a..2055faca820 100644 --- a/src/Interpreters/ExpressionAnalyzer.cpp +++ b/src/Interpreters/ExpressionAnalyzer.cpp @@ -1371,6 +1371,8 @@ ExpressionAnalysisResult::ExpressionAnalysisResult( if (storage && filter_info_) { + // TODO: handle filter exactly like prewhere, store the info in PrewhereDAGInfo, collect unnecessary columns, etc.? + filter_info = filter_info_; query_analyzer.appendPreliminaryFilter(chain, filter_info->actions_dag, filter_info->column_name); } @@ -1539,9 +1541,19 @@ ExpressionAnalysisResult::ExpressionAnalysisResult( void ExpressionAnalysisResult::finalize(const ExpressionActionsChain & chain, size_t where_step_num) { + size_t next_step_i = 0; + + if (hasFilter()) + { + const ExpressionActionsChain::Step & step = *chain.steps.at(next_step_i++); + filter_info->do_remove_column = step.can_remove_required_output.at(0); + + // TODO: handle filter exactly like prewhere, collect columns to remove after filter? + } + if (hasPrewhere()) { - const ExpressionActionsChain::Step & step = *chain.steps.at(0); + const ExpressionActionsChain::Step & step = *chain.steps.at(next_step_i++); prewhere_info->remove_prewhere_column = step.can_remove_required_output.at(0); NameSet columns_to_remove; @@ -1553,13 +1565,12 @@ void ExpressionAnalysisResult::finalize(const ExpressionActionsChain & chain, si columns_to_remove_after_prewhere = std::move(columns_to_remove); } - else if (hasFilter()) - { - /// Can't have prewhere and filter set simultaneously - filter_info->do_remove_column = chain.steps.at(0)->can_remove_required_output.at(0); - } + if (hasWhere()) - remove_where_filter = chain.steps.at(where_step_num)->can_remove_required_output.at(0); + { + const ExpressionActionsChain::Step & step = *chain.steps.at(where_step_num); + remove_where_filter = step.can_remove_required_output.at(0); + } } void ExpressionAnalysisResult::removeExtraColumns() const diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index 9dd63362dbd..57c18f1bb86 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -383,7 +383,7 @@ InterpreterSelectQuery::InterpreterSelectQuery( view = nullptr; } - if (try_move_to_prewhere && storage && !row_policy_filter && query.where() && !query.prewhere() && !query.final()) + if (try_move_to_prewhere && storage && query.where() && !query.prewhere() && !query.final()) { /// PREWHERE optimization: transfer some condition from WHERE to PREWHERE if enabled and viable if (const auto * merge_tree = dynamic_cast(storage.get())) @@ -450,9 +450,6 @@ InterpreterSelectQuery::InterpreterSelectQuery( } } - if (!options.only_analyze && storage && filter_info && query.prewhere()) - throw Exception("PREWHERE 
is not supported if the table is filtered by row-level security expression", ErrorCodes::ILLEGAL_PREWHERE); - /// Calculate structure of the result. result_header = getSampleBlockImpl(); }; @@ -806,12 +803,30 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu bool intermediate_stage = false; bool to_aggregation_stage = false; bool from_aggregation_stage = false; + const bool filter_in_prewhere = ( + (settings.optimize_move_to_prewhere || expressions.prewhere_info) && + !input && !input_pipe && storage && storage->supportsPrewhere() + ); if (options.only_analyze) { auto read_nothing = std::make_unique(source_header); query_plan.addStep(std::move(read_nothing)); + if (expressions.filter_info && filter_in_prewhere) + { + auto row_level_security_step = std::make_unique( + query_plan.getCurrentDataStream(), + expressions.filter_info->actions_dag, + expressions.filter_info->column_name, + expressions.filter_info->do_remove_column); + + row_level_security_step->setStepDescription("Row-level security filter (PREWHERE)"); + query_plan.addStep(std::move(row_level_security_step)); + + // TODO: handle filter like prewhere, remove unnecessary columns after it, etc.? + } + if (expressions.prewhere_info) { auto prewhere_step = std::make_unique( @@ -862,11 +877,8 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu if (options.to_stage == QueryProcessingStage::WithMergeableStateAfterAggregation) to_aggregation_stage = true; - if (storage && expressions.filter_info && expressions.prewhere_info) - throw Exception("PREWHERE is not supported if the table is filtered by row-level security expression", ErrorCodes::ILLEGAL_PREWHERE); - - /** Read the data from Storage. from_stage - to what stage the request was completed in Storage. */ - executeFetchColumns(from_stage, query_plan, expressions.prewhere_info, expressions.columns_to_remove_after_prewhere); + /// Read the data from Storage. from_stage - to what stage the request was completed in Storage. + executeFetchColumns(from_stage, query_plan, filter_in_prewhere); LOG_TRACE(log, "{} -> {}", QueryProcessingStage::toString(from_stage), QueryProcessingStage::toString(options.to_stage)); } @@ -931,7 +943,7 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu if (expressions.first_stage) { - if (expressions.hasFilter()) + if (expressions.filter_info && !filter_in_prewhere) { auto row_level_security_step = std::make_unique( query_plan.getCurrentDataStream(), @@ -941,6 +953,8 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu row_level_security_step->setStepDescription("Row-level security filter"); query_plan.addStep(std::move(row_level_security_step)); + + // TODO: handle filter like prewhere, remove unnecessary columns after it, etc.? 
} if (expressions.before_array_join) @@ -1228,12 +1242,13 @@ void InterpreterSelectQuery::addEmptySourceToQueryPlan(QueryPlan & query_plan, c query_plan.addStep(std::move(read_from_pipe)); } -void InterpreterSelectQuery::executeFetchColumns( - QueryProcessingStage::Enum processing_stage, QueryPlan & query_plan, - const PrewhereDAGInfoPtr & prewhere_info, const NameSet & columns_to_remove_after_prewhere) +void InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum processing_stage, QueryPlan & query_plan, bool filter_in_prewhere) { auto & query = getSelectQuery(); const Settings & settings = context->getSettingsRef(); + auto & expressions = analysis_result; + auto & prewhere_info = expressions.prewhere_info; + auto & columns_to_remove_after_prewhere = expressions.columns_to_remove_after_prewhere; /// Optimization for trivial query like SELECT count() FROM table. bool optimize_trivial_count = @@ -1241,7 +1256,7 @@ void InterpreterSelectQuery::executeFetchColumns( && (settings.max_parallel_replicas <= 1) && storage && storage->getName() != "MaterializeMySQL" - && !filter_info + && !expressions.filter_info && processing_stage == QueryProcessingStage::FetchColumns && query_analyzer->hasAggregation() && (query_analyzer->aggregates().size() == 1) @@ -1554,6 +1569,22 @@ void InterpreterSelectQuery::executeFetchColumns( query_info.syntax_analyzer_result = syntax_analyzer_result; query_info.sets = query_analyzer->getPreparedSets(); + if (expressions.filter_info && filter_in_prewhere) + { + if (!query_info.prewhere_info_list) + query_info.prewhere_info_list = std::make_shared(); + + query_info.prewhere_info_list->emplace( + query_info.prewhere_info_list->begin(), + std::make_shared(expressions.filter_info->actions_dag), + expressions.filter_info->column_name); + + auto & new_filter_info = query_info.prewhere_info_list->front(); + + new_filter_info.remove_prewhere_column = expressions.filter_info->do_remove_column; + new_filter_info.need_filter = true; + } + if (prewhere_info) { if (!query_info.prewhere_info_list) diff --git a/src/Interpreters/InterpreterSelectQuery.h b/src/Interpreters/InterpreterSelectQuery.h index 1fff316e1d4..6fcbf102b05 100644 --- a/src/Interpreters/InterpreterSelectQuery.h +++ b/src/Interpreters/InterpreterSelectQuery.h @@ -108,12 +108,7 @@ private: /// Different stages of query execution. 
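With the row-level security condition now expressible as a PREWHERE step (when `filter_in_prewhere` holds), `executeFetchColumns` above inserts it at the front of `query_info.prewhere_info_list` and marks it `need_filter = true`, so rows rejected by the policy can never leak into later steps or into the result. A rough sketch of the resulting ordering, using illustrative stand-in structs rather than the real `PrewhereInfo`:

```cpp
#include <string>
#include <vector>

/// Illustrative stand-in: only the fields relevant to the ordering are modelled,
/// and the names mirror PrewhereInfo loosely rather than exactly.
struct PrewhereStep
{
    std::string column_name;      /// name of the UInt8 column this step produces
    bool remove_column = false;   /// drop that column from the result afterwards
    bool need_filter = false;     /// rows must be physically filtered by this step
};

using PrewhereList = std::vector<PrewhereStep>;

int main()
{
    PrewhereList prewhere_list;

    /// The user's own PREWHERE (if any) is appended as before.
    prewhere_list.push_back({"greater(value, 5)", /*remove_column=*/ true, /*need_filter=*/ false});

    /// The row-level security predicate is inserted in front of everything else and is always
    /// a hard filter: no row rejected by the policy may reach later steps or the result.
    prewhere_list.insert(prewhere_list.begin(),
                         PrewhereStep{"_row_policy_filter", /*remove_column=*/ true, /*need_filter=*/ true});

    return prewhere_list.front().need_filter ? 0 : 1;   /// the policy step now runs first
}
```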
- void executeFetchColumns( - QueryProcessingStage::Enum processing_stage, - QueryPlan & query_plan, - const PrewhereDAGInfoPtr & prewhere_info, - const NameSet & columns_to_remove_after_prewhere); - + void executeFetchColumns(QueryProcessingStage::Enum processing_stage, QueryPlan & query_plan, bool filter_in_prewhere); void executeWhere(QueryPlan & query_plan, const ActionsDAGPtr & expression, bool remove_filter); void executeAggregation(QueryPlan & query_plan, const ActionsDAGPtr & expression, bool overflow_row, bool final, InputOrderInfoPtr group_by_info); void executeMergeAggregated(QueryPlan & query_plan, bool overflow_row, bool final); From 3146a1a9542b16d3e56730ca6aa289d23fd70689 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Mon, 25 Jan 2021 21:59:23 +0300 Subject: [PATCH 0080/2357] fix --- docker/test/stress/stress | 7 +++++-- src/Interpreters/DDLTask.cpp | 2 +- src/Interpreters/DDLWorker.cpp | 17 +++++++++++++---- src/Interpreters/InterpreterCreateQuery.cpp | 2 +- .../test_materialize_mysql_database/test.py | 2 +- 5 files changed, 21 insertions(+), 9 deletions(-) diff --git a/docker/test/stress/stress b/docker/test/stress/stress index 458f78fcdb4..c530f605da7 100755 --- a/docker/test/stress/stress +++ b/docker/test/stress/stress @@ -22,12 +22,15 @@ def get_options(i): if 0 < i: options += " --order=random" - if i % 2 == 1: + if i % 3 == 1: options += " --db-engine=Ordinary" + if i % 3 == 2: + options += ''' --db-engine="Replicated('/test/db/test_{}', 's1', 'r1')"'''.format(i) + # If database name is not specified, new database is created for each functional test. # Run some threads with one database for all tests. - if i % 3 == 1: + if i % 2 == 1: options += " --database=test_{}".format(i) if i == 13: diff --git a/src/Interpreters/DDLTask.cpp b/src/Interpreters/DDLTask.cpp index 3d9297880c1..fd2de014581 100644 --- a/src/Interpreters/DDLTask.cpp +++ b/src/Interpreters/DDLTask.cpp @@ -140,7 +140,7 @@ bool DDLTask::findCurrentHostID(const Context & global_context, Poco::Logger * l void DDLTask::setClusterInfo(const Context & context, Poco::Logger * log) { - auto query_on_cluster = dynamic_cast(query.get()); + auto * query_on_cluster = dynamic_cast(query.get()); if (!query_on_cluster) throw Exception("Received unknown DDL query", ErrorCodes::UNKNOWN_TYPE_OF_QUERY); diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index 91a5309bb5d..fc72e4d8366 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -201,11 +201,7 @@ void DDLWorker::shutdown() stop_flag = true; queue_updated_event->set(); cleanup_event->set(); -} -DDLWorker::~DDLWorker() -{ - shutdown(); worker_pool.reset(); if (main_thread.joinable()) main_thread.join(); @@ -213,6 +209,11 @@ DDLWorker::~DDLWorker() cleanup_thread.join(); } +DDLWorker::~DDLWorker() +{ + shutdown(); +} + ZooKeeperPtr DDLWorker::tryGetZooKeeper() const { @@ -490,9 +491,14 @@ void DDLWorker::processTask(DDLTaskBase & task) } if (task.execute_on_leader) + { tryExecuteQueryOnLeaderReplica(task, storage, rewritten_query, task.entry_path, zookeeper); + } else + { + storage.reset(); tryExecuteQuery(rewritten_query, task); + } } catch (const Coordination::Exception &) { @@ -892,6 +898,7 @@ void DDLWorker::initializeMainThread() { tryLogCurrentException(log, "Cannot initialize DDL queue."); reset_state(false); + sleepForSeconds(5); } } while (!initialized && !stop_flag); @@ -949,11 +956,13 @@ void DDLWorker::runMainThread() LOG_ERROR(log, "Unexpected ZooKeeper error: {}", 
getCurrentExceptionMessage(true)); reset_state(); } + sleepForSeconds(5); } catch (...) { tryLogCurrentException(log, "Unexpected error, will try to restart main thread:"); reset_state(); + sleepForSeconds(5); } } } diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 40789fc1a8a..b66af77930c 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -718,7 +718,7 @@ void InterpreterCreateQuery::assertOrSetUUID(ASTCreateQuery & create, const Data const auto * kind = create.is_dictionary ? "Dictionary" : "Table"; const auto * kind_upper = create.is_dictionary ? "DICTIONARY" : "TABLE"; - if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind == ClientInfo::QueryKind::REPLICATED_LOG_QUERY) + if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind == ClientInfo::QueryKind::REPLICATED_LOG_QUERY && !internal) { if (create.uuid == UUIDHelpers::Nil) throw Exception("Table UUID is not specified in DDL log", ErrorCodes::LOGICAL_ERROR); diff --git a/tests/integration/test_materialize_mysql_database/test.py b/tests/integration/test_materialize_mysql_database/test.py index dbd6e894987..3cdc527d33d 100644 --- a/tests/integration/test_materialize_mysql_database/test.py +++ b/tests/integration/test_materialize_mysql_database/test.py @@ -14,7 +14,7 @@ DOCKER_COMPOSE_PATH = get_docker_compose_path() cluster = ClickHouseCluster(__file__) -node_db_ordinary = cluster.add_instance('node1', user_configs=["configs/users.xml"], with_mysql=False, stay_alive=True) +node_db_ordinary = cluster.add_instance('node1', user_configs=["configs/users.xml"], with_mysql=False, stay_alive=True, with_zookeeper=True) #FIXME node_db_atomic = cluster.add_instance('node2', user_configs=["configs/users_db_atomic.xml"], with_mysql=False, stay_alive=True) From 11b53d3b9d9a98748e763a1698aa88639c02ebd0 Mon Sep 17 00:00:00 2001 From: Denis Glazachev Date: Tue, 26 Jan 2021 00:17:48 +0400 Subject: [PATCH 0081/2357] Fix compilation/linter --- src/Storages/MergeTree/MergeTreeRangeReader.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeRangeReader.cpp b/src/Storages/MergeTree/MergeTreeRangeReader.cpp index 361ab2d227a..fcac5bc2c59 100644 --- a/src/Storages/MergeTree/MergeTreeRangeReader.cpp +++ b/src/Storages/MergeTree/MergeTreeRangeReader.cpp @@ -484,10 +484,10 @@ void MergeTreeRangeReader::ReadResult::addFilter(const ColumnPtr & new_filter) throw Exception("addFilter function expected ColumnUInt8.", ErrorCodes::LOGICAL_ERROR); const auto & data = filter->getData(); - auto it = data.begin(); + auto * it = data.begin(); auto & new_data = new_mutable_holder_cast->getData(); - auto n_it = new_data.begin(); + auto * n_it = new_data.begin(); while (it != data.end() && n_it != new_data.end()) { From 0f7f8ace7388fd6aa700d21fbc946d48cc8eae43 Mon Sep 17 00:00:00 2001 From: romanzhukov Date: Tue, 26 Jan 2021 01:39:23 +0300 Subject: [PATCH 0082/2357] DOCSUP-5266: Add changes from PR --- .../operations/utilities/clickhouse-local.md | 4 ++ .../functions/date-time-functions.md | 12 ++++-- .../operations/utilities/clickhouse-local.md | 7 +++- .../data-types/simpleaggregatefunction.md | 3 ++ .../functions/date-time-functions.md | 38 +++++++++++++++++++ 5 files changed, 59 insertions(+), 5 deletions(-) diff --git a/docs/en/operations/utilities/clickhouse-local.md b/docs/en/operations/utilities/clickhouse-local.md index 
04f9f3660b5..cfabf42bff1 100644 --- a/docs/en/operations/utilities/clickhouse-local.md +++ b/docs/en/operations/utilities/clickhouse-local.md @@ -91,6 +91,8 @@ $ clickhouse-local --query " Now let’s output memory user for each Unix user: +Query: + ``` bash $ ps aux | tail -n +2 | awk '{ printf("%s\t%s\n", $1, $4) }' \ | clickhouse-local --structure "user String, mem Float64" \ @@ -98,6 +100,8 @@ $ ps aux | tail -n +2 | awk '{ printf("%s\t%s\n", $1, $4) }' \ FROM table GROUP BY user ORDER BY memTotal DESC FORMAT Pretty" ``` +Result: + ``` text Read 186 rows, 4.15 KiB in 0.035 sec., 5302 rows/sec., 118.34 KiB/sec. ┏━━━━━━━━━━┳━━━━━━━━━━┓ diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index 9de780fb596..b73d13c59a4 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -661,8 +661,6 @@ Result: └────────────────────────────────────────────┘ ``` -[Original article](https://clickhouse.tech/docs/en/query_language/functions/date_time_functions/) - ## FROM\_UNIXTIME {#fromunixfime} When there is only single argument of integer type, it act in the same way as `toDateTime` and return [DateTime](../../sql-reference/data-types/datetime.md). @@ -670,10 +668,14 @@ type. For example: +Query: + ```sql -SELECT FROM_UNIXTIME(423543535) +SELECT FROM_UNIXTIME(423543535); ``` +Result: + ```text ┌─FROM_UNIXTIME(423543535)─┐ │ 1983-06-04 10:58:55 │ @@ -685,7 +687,7 @@ When there are two arguments, first is integer or DateTime, second is constant f For example: ```sql -SELECT FROM_UNIXTIME(1234334543, '%Y-%m-%d %R:%S') AS DateTime +SELECT FROM_UNIXTIME(1234334543, '%Y-%m-%d %R:%S') AS DateTime; ``` ```text @@ -837,3 +839,5 @@ Result: │ 2020-01-01 │ └────────────────────────────────────┘ ``` + +[Original article](https://clickhouse.tech/docs/en/query_language/functions/date_time_functions/) \ No newline at end of file diff --git a/docs/ru/operations/utilities/clickhouse-local.md b/docs/ru/operations/utilities/clickhouse-local.md index 2b5c9b119e2..e3c421ac75e 100644 --- a/docs/ru/operations/utilities/clickhouse-local.md +++ b/docs/ru/operations/utilities/clickhouse-local.md @@ -21,7 +21,8 @@ toc_title: clickhouse-local Основной формат вызова: ``` bash -$ clickhouse-local --structure "table_structure" --input-format "format_of_incoming_data" -q "query" +$ clickhouse-local --structure "table_structure" --input-format "format_of_incoming_data" \ + --query "query" ``` Ключи команды: @@ -78,6 +79,8 @@ $ clickhouse-local --query " А теперь давайте выведем на экран объём оперативной памяти, занимаемой пользователями (Unix): +Запрос: + ``` bash $ ps aux | tail -n +2 | awk '{ printf("%s\t%s\n", $1, $4) }' \ | clickhouse-local --structure "user String, mem Float64" \ @@ -85,6 +88,8 @@ $ ps aux | tail -n +2 | awk '{ printf("%s\t%s\n", $1, $4) }' \ FROM table GROUP BY user ORDER BY memTotal DESC FORMAT Pretty" ``` +Ответ: + ``` text Read 186 rows, 4.15 KiB in 0.035 sec., 5302 rows/sec., 118.34 KiB/sec. 
┏━━━━━━━━━━┳━━━━━━━━━━┓ diff --git a/docs/ru/sql-reference/data-types/simpleaggregatefunction.md b/docs/ru/sql-reference/data-types/simpleaggregatefunction.md index 52f0412a177..3ff4e5fd662 100644 --- a/docs/ru/sql-reference/data-types/simpleaggregatefunction.md +++ b/docs/ru/sql-reference/data-types/simpleaggregatefunction.md @@ -15,6 +15,9 @@ The following aggregate functions are supported: - [`groupBitXor`](../../sql-reference/aggregate-functions/reference/groupbitxor.md#groupbitxor) - [`groupArrayArray`](../../sql-reference/aggregate-functions/reference/grouparray.md#agg_function-grouparray) - [`groupUniqArrayArray`](../../sql-reference/aggregate-functions/reference/groupuniqarray.md#groupuniqarray) +- [`sumMap`](../../sql-reference/aggregate-functions/reference/summap.md#agg_functions-summap) +- [`minMap`](../../sql-reference/aggregate-functions/reference/minmap.md#agg_functions-minmap) +- [`maxMap`](../../sql-reference/aggregate-functions/reference/maxmap.md#agg_functions-maxmap) Values of the `SimpleAggregateFunction(func, Type)` look and stored the same way as `Type`, so you do not need to apply functions with `-Merge`/`-State` suffixes. `SimpleAggregateFunction` has better performance than `AggregateFunction` with same aggregation function. diff --git a/docs/ru/sql-reference/functions/date-time-functions.md b/docs/ru/sql-reference/functions/date-time-functions.md index 31482cde77f..e923de8ebd2 100644 --- a/docs/ru/sql-reference/functions/date-time-functions.md +++ b/docs/ru/sql-reference/functions/date-time-functions.md @@ -665,4 +665,42 @@ SELECT formatDateTime(toDate('2010-01-04'), '%g') └────────────────────────────────────────────┘ ``` +## FROM\_UNIXTIME {#fromunixfime} + +Когда есть только один аргумент целочисленного типа, он действует так же, как `toDateTime` и возвращает тип [DateTime](../../sql-reference/data-types/datetime.md). + +**Пример** + +Запрос: + +```sql +SELECT FROM_UNIXTIME(423543535); +``` + +Ответ: + +```text +┌─FROM_UNIXTIME(423543535)─┐ +│ 1983-06-04 10:58:55 │ +└──────────────────────────┘ +``` + +В случае, когда есть два аргумента, первый типа `Integer` или `DateTime`, а второй — является строкой постоянного формата, функция работает таким же образом, как `formatdatetime` и возвращает значение типа `String`. + +**Пример** + +Запрос: + +```sql +SELECT FROM_UNIXTIME(1234334543, '%Y-%m-%d %R:%S') AS DateTime; +``` + +Ответ: + +```text +┌─DateTime────────────┐ +│ 2009-02-11 14:42:23 │ +└─────────────────────┘ +``` + [Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/functions/date_time_functions/) From 097c9362bdad12d3ffbc7a817fc3bfda81a82156 Mon Sep 17 00:00:00 2001 From: Ivan <5627721+abyss7@users.noreply.github.com> Date: Tue, 26 Jan 2021 14:00:52 +0300 Subject: [PATCH 0083/2357] Update date-time-functions.md --- docs/en/sql-reference/functions/date-time-functions.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index b73d13c59a4..856ce830abe 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -602,7 +602,7 @@ This is necessary for searching for pageviews in the corresponding session. ## formatDateTime {#formatdatetime} -Function formats a Time according given Format string. N.B.: Format is a constant expression, e.g. you can not have multiple formats for single result column. 
+Function formats a Time according to the given Format string. N.B.: Format is a constant expression, e.g. you cannot have multiple formats for a single result column. **Syntax** @@ -663,7 +663,7 @@ Result: ## FROM\_UNIXTIME {#fromunixfime} -When there is only single argument of integer type, it act in the same way as `toDateTime` and return [DateTime](../../sql-reference/data-types/datetime.md). +When there is only a single argument of integer type, it acts in the same way as `toDateTime` and return [DateTime](../../sql-reference/data-types/datetime.md). type. For example: @@ -682,7 +682,7 @@ Result: └──────────────────────────┘ ``` -When there are two arguments, first is integer or DateTime, second is constant format string, it act in the same way as `formatDateTime` and return `String` type. +When there are two arguments: first is an integer or DateTime, second is a constant format string - it acts in the same way as `formatDateTime` and return `String` type. For example: @@ -840,4 +840,4 @@ Result: └────────────────────────────────────┘ ``` -[Original article](https://clickhouse.tech/docs/en/query_language/functions/date_time_functions/) \ No newline at end of file +[Original article](https://clickhouse.tech/docs/en/query_language/functions/date_time_functions/) From 1834c5ccae9da4b456544dbfa22d01f16ad0393f Mon Sep 17 00:00:00 2001 From: Ivan <5627721+abyss7@users.noreply.github.com> Date: Tue, 26 Jan 2021 14:04:39 +0300 Subject: [PATCH 0084/2357] Update date-time-functions.md --- docs/ru/sql-reference/functions/date-time-functions.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/ru/sql-reference/functions/date-time-functions.md b/docs/ru/sql-reference/functions/date-time-functions.md index e923de8ebd2..4db244d2388 100644 --- a/docs/ru/sql-reference/functions/date-time-functions.md +++ b/docs/ru/sql-reference/functions/date-time-functions.md @@ -665,9 +665,9 @@ SELECT formatDateTime(toDate('2010-01-04'), '%g') └────────────────────────────────────────────┘ ``` -## FROM\_UNIXTIME {#fromunixfime} +## FROM\_UNIXTIME {#fromunixtime} -Когда есть только один аргумент целочисленного типа, он действует так же, как `toDateTime` и возвращает тип [DateTime](../../sql-reference/data-types/datetime.md). +Когда указан только один аргумент целочисленного типа, то функция действует так же, как `toDateTime`, и возвращает тип [DateTime](../../sql-reference/data-types/datetime.md). **Пример** @@ -685,7 +685,7 @@ SELECT FROM_UNIXTIME(423543535); └──────────────────────────┘ ``` -В случае, когда есть два аргумента, первый типа `Integer` или `DateTime`, а второй — является строкой постоянного формата, функция работает таким же образом, как `formatdatetime` и возвращает значение типа `String`. +В случае, когда есть два аргумента: первый типа `Integer` или `DateTime`, а второй является строкой постоянного формата — функция работает таким же образом, как `formatDateTime`, и возвращает значение типа `String`. 
**Пример** From 04531f14d9fb55c3eca1ac23070262d200828d60 Mon Sep 17 00:00:00 2001 From: Ivan <5627721+abyss7@users.noreply.github.com> Date: Tue, 26 Jan 2021 14:06:08 +0300 Subject: [PATCH 0085/2357] Fix hyphen --- docs/en/sql-reference/functions/date-time-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index 856ce830abe..f11bec55697 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -682,7 +682,7 @@ Result: └──────────────────────────┘ ``` -When there are two arguments: first is an integer or DateTime, second is a constant format string - it acts in the same way as `formatDateTime` and return `String` type. +When there are two arguments: first is an integer or DateTime, second is a constant format string — it acts in the same way as `formatDateTime` and return `String` type. For example: From 0d1c9479f8f904ff5c48b2320959f4dd244c4c0a Mon Sep 17 00:00:00 2001 From: Denis Glazachev Date: Tue, 26 Jan 2021 18:39:12 +0400 Subject: [PATCH 0086/2357] Fix compilation/linter --- src/Storages/MergeTree/MergeTreeRangeReader.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/MergeTreeRangeReader.cpp b/src/Storages/MergeTree/MergeTreeRangeReader.cpp index fcac5bc2c59..0b3765adc6a 100644 --- a/src/Storages/MergeTree/MergeTreeRangeReader.cpp +++ b/src/Storages/MergeTree/MergeTreeRangeReader.cpp @@ -484,7 +484,7 @@ void MergeTreeRangeReader::ReadResult::addFilter(const ColumnPtr & new_filter) throw Exception("addFilter function expected ColumnUInt8.", ErrorCodes::LOGICAL_ERROR); const auto & data = filter->getData(); - auto * it = data.begin(); + const auto * it = data.begin(); auto & new_data = new_mutable_holder_cast->getData(); auto * n_it = new_data.begin(); From f20d5e3b419b1efc77e3a3a1b7aa46f86ac4c201 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Tue, 26 Jan 2021 20:51:25 +0300 Subject: [PATCH 0087/2357] fix --- src/Databases/DatabaseAtomic.cpp | 13 +++-- src/Databases/DatabaseReplicated.h | 2 +- src/Interpreters/Context.cpp | 3 +- src/Interpreters/Context.h | 1 + src/Interpreters/DDLTask.h | 3 +- src/Interpreters/DDLWorker.cpp | 53 ++++++++----------- src/Interpreters/InterpreterRenameQuery.cpp | 7 +++ src/Interpreters/executeDDLQueryOnCluster.cpp | 7 +-- src/Parsers/ASTAlterQuery.cpp | 14 ++++- src/Parsers/ASTAlterQuery.h | 4 ++ src/Storages/StorageMaterializedView.cpp | 6 ++- tests/clickhouse-test | 16 ++++-- 12 files changed, 78 insertions(+), 51 deletions(-) diff --git a/src/Databases/DatabaseAtomic.cpp b/src/Databases/DatabaseAtomic.cpp index 1da23b9beef..8b75f439152 100644 --- a/src/Databases/DatabaseAtomic.cpp +++ b/src/Databases/DatabaseAtomic.cpp @@ -115,8 +115,8 @@ void DatabaseAtomic::dropTable(const Context & context, const String & table_nam std::unique_lock lock(mutex); table = getTableUnlocked(table_name, lock); table_metadata_path_drop = DatabaseCatalog::instance().getPathForDroppedMetadata(table->getStorageID()); - - if (auto txn = context.getMetadataTransaction()) + auto txn = context.getMetadataTransaction(); + if (txn && !context.isInternalSubquery()) txn->commit(); /// Commit point (a sort of) for Replicated database /// NOTE: replica will be lost if server crashes before the following rename @@ -241,7 +241,8 @@ void DatabaseAtomic::renameTable(const Context & context, const String & table_n } /// Table 
renaming actually begins here - if (auto txn = context.getMetadataTransaction()) + auto txn = context.getMetadataTransaction(); + if (txn && !context.isInternalSubquery()) txn->commit(); /// Commit point (a sort of) for Replicated database /// NOTE: replica will be lost if server crashes before the following rename @@ -301,7 +302,8 @@ void DatabaseAtomic::commitCreateTable(const ASTCreateQuery & query, const Stora DatabaseCatalog::instance().addUUIDMapping(query.uuid); locked_uuid = true; - if (auto txn = query_context.getMetadataTransaction()) + auto txn = query_context.getMetadataTransaction(); + if (txn && !query_context.isInternalSubquery()) txn->commit(); /// Commit point (a sort of) for Replicated database /// NOTE: replica will be lost if server crashes before the following renameNoReplace(...) @@ -335,7 +337,8 @@ void DatabaseAtomic::commitAlterTable(const StorageID & table_id, const String & if (table_id.uuid != actual_table_id.uuid) throw Exception("Cannot alter table because it was renamed", ErrorCodes::CANNOT_ASSIGN_ALTER); - if (auto txn = query_context.getMetadataTransaction()) + auto txn = query_context.getMetadataTransaction(); + if (txn && !query_context.isInternalSubquery()) txn->commit(); /// Commit point (a sort of) for Replicated database /// NOTE: replica will be lost if server crashes before the following rename diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index 8085c234af4..586f381c962 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -64,7 +64,7 @@ public: void shutdown() override; - void loadStoredObjects(Context & context, bool has_force_restore_data_flag, bool force_attach = false) override; + void loadStoredObjects(Context & context, bool has_force_restore_data_flag, bool force_attach) override; String getFullReplicaName() const { return shard_name + '|' + replica_name; } diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index 3d102553f5a..6895439b855 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -2522,8 +2522,7 @@ void Context::initMetadataTransaction(MetadataTransactionPtr txn) MetadataTransactionPtr Context::getMetadataTransaction() const { - //FIXME - //assert(query_context == this); + assert(!metadata_transaction || hasQueryContext()); return metadata_transaction; } diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index dcb581b98c6..37ed01d4dbc 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -536,6 +536,7 @@ public: const Context & getQueryContext() const; Context & getQueryContext(); bool hasQueryContext() const { return query_context != nullptr; } + bool isInternalSubquery() const { return hasQueryContext() && query_context != this; } const Context & getSessionContext() const; Context & getSessionContext(); diff --git a/src/Interpreters/DDLTask.h b/src/Interpreters/DDLTask.h index 7501c01aa8f..a12676ab8a3 100644 --- a/src/Interpreters/DDLTask.h +++ b/src/Interpreters/DDLTask.h @@ -85,9 +85,10 @@ struct DDLTaskBase ExecutionStatus execution_status; bool was_executed = false; + std::atomic_bool completely_processed = false; + DDLTaskBase(const String & name, const String & path) : entry_name(name), entry_path(path) {} DDLTaskBase(const DDLTaskBase &) = delete; - DDLTaskBase(DDLTaskBase &&) = default; virtual ~DDLTaskBase() = default; void parseQueryFromEntry(const Context & context); diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index 
fc72e4d8366..cb38c733582 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -341,9 +341,10 @@ void DDLWorker::scheduleTasks() auto & min_task = *std::min_element(current_tasks.begin(), current_tasks.end()); begin_node = std::upper_bound(queue_nodes.begin(), queue_nodes.end(), min_task->entry_name); current_tasks.clear(); - //FIXME better way of maintaning current tasks list and min_task name; } + assert(current_tasks.empty()); + for (auto it = begin_node; it != queue_nodes.end() && !stop_flag; ++it) { String entry_name = *it; @@ -378,12 +379,8 @@ void DDLWorker::scheduleTasks() DDLTaskBase & DDLWorker::saveTask(DDLTaskPtr && task) { - //assert(current_tasks.size() <= pool_size + 1); - //if (current_tasks.size() == pool_size) - //{ - // assert(current_tasks.front()->ops.empty()); //FIXME - // current_tasks.pop_front(); - //} + std::remove_if(current_tasks.begin(), current_tasks.end(), [](const DDLTaskPtr & t) { return t->completely_processed.load(); }); + assert(current_tasks.size() <= pool_size); current_tasks.emplace_back(std::move(task)); return *current_tasks.back(); } @@ -555,6 +552,8 @@ void DDLWorker::processTask(DDLTaskBase & task) active_node->reset(); task.ops.clear(); } + + task.completely_processed = true; } @@ -572,6 +571,9 @@ bool DDLWorker::taskShouldBeExecutedOnLeader(const ASTPtr ast_ddl, const Storage // Setting alters should be executed on all replicas if (alter->isSettingsAlter()) return false; + + if (alter->isFreezeAlter()) + return false; } return storage->supportsReplication(); @@ -856,28 +858,20 @@ String DDLWorker::enqueueQuery(DDLLogEntry & entry) void DDLWorker::initializeMainThread() { - auto reset_state = [&](bool reset_pool = true) - { - initialized = false; - /// It will wait for all threads in pool to finish and will not rethrow exceptions (if any). - /// We create new thread pool to forget previous exceptions. - if (reset_pool) - worker_pool = std::make_unique(pool_size); - /// Clear other in-memory state, like server just started. - current_tasks.clear(); - max_id = 0; - }; - + assert(!initialized); + assert(max_id == 0); + assert(current_tasks.empty()); setThreadName("DDLWorker"); LOG_DEBUG(log, "Started DDLWorker thread"); - do + while (!stop_flag) { try { auto zookeeper = getAndSetZooKeeper(); zookeeper->createAncestors(fs::path(queue_dir) / ""); initialized = true; + return; } catch (const Coordination::Exception & e) { @@ -885,33 +879,29 @@ void DDLWorker::initializeMainThread() { /// A logical error. LOG_ERROR(log, "ZooKeeper error: {}. Failed to start DDLWorker.",getCurrentExceptionMessage(true)); - reset_state(false); assert(false); /// Catch such failures in tests with debug build } tryLogCurrentException(__PRETTY_FUNCTION__); - - /// Avoid busy loop when ZooKeeper is not available. - sleepForSeconds(5); } catch (...) { tryLogCurrentException(log, "Cannot initialize DDL queue."); - reset_state(false); - sleepForSeconds(5); } + + /// Avoid busy loop when ZooKeeper is not available. + sleepForSeconds(5); } - while (!initialized && !stop_flag); } void DDLWorker::runMainThread() { - auto reset_state = [&](bool reset_pool = true) + auto reset_state = [&]() { initialized = false; /// It will wait for all threads in pool to finish and will not rethrow exceptions (if any). /// We create new thread pool to forget previous exceptions. - if (reset_pool) + if (1 < pool_size) worker_pool = std::make_unique(pool_size); /// Clear other in-memory state, like server just started. 
current_tasks.clear(); @@ -944,6 +934,7 @@ void DDLWorker::runMainThread() if (Coordination::isHardwareError(e.code)) { initialized = false; + LOG_INFO(log, "Lost ZooKeeper connection, will try to connect again: {}", getCurrentExceptionMessage(true)); } else if (e.code == Coordination::Error::ZNONODE) { @@ -953,10 +944,10 @@ void DDLWorker::runMainThread() } else { - LOG_ERROR(log, "Unexpected ZooKeeper error: {}", getCurrentExceptionMessage(true)); + LOG_ERROR(log, "Unexpected ZooKeeper error, will try to restart main thread: {}", getCurrentExceptionMessage(true)); reset_state(); } - sleepForSeconds(5); + sleepForSeconds(1); } catch (...) { diff --git a/src/Interpreters/InterpreterRenameQuery.cpp b/src/Interpreters/InterpreterRenameQuery.cpp index 72398103d62..a6075643a96 100644 --- a/src/Interpreters/InterpreterRenameQuery.cpp +++ b/src/Interpreters/InterpreterRenameQuery.cpp @@ -13,6 +13,10 @@ namespace DB { +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; +} InterpreterRenameQuery::InterpreterRenameQuery(const ASTPtr & query_ptr_, Context & context_) : query_ptr(query_ptr_), context(context_) @@ -78,6 +82,9 @@ BlockIO InterpreterRenameQuery::executeToTables(const ASTRenameQuery & rename, c DatabasePtr database = database_catalog.getDatabase(elem.from_database_name); if (typeid_cast(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { + if (1 < descriptions.size()) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Database {} is Replicated, " + "it does not support renaming of multiple tables in single query.", elem.from_database_name); return typeid_cast(database.get())->propose(query_ptr); } else diff --git a/src/Interpreters/executeDDLQueryOnCluster.cpp b/src/Interpreters/executeDDLQueryOnCluster.cpp index cf801caed04..fb155e82926 100644 --- a/src/Interpreters/executeDDLQueryOnCluster.cpp +++ b/src/Interpreters/executeDDLQueryOnCluster.cpp @@ -31,12 +31,13 @@ namespace ErrorCodes bool isSupportedAlterType(int type) { + assert(type != ASTAlterCommand::NO_TYPE); static const std::unordered_set unsupported_alter_types{ + /// It's dangerous, because it may duplicate data if executed on multiple replicas ASTAlterCommand::ATTACH_PARTITION, - ASTAlterCommand::REPLACE_PARTITION, + /// Usually followed by ATTACH PARTITION ASTAlterCommand::FETCH_PARTITION, - ASTAlterCommand::FREEZE_PARTITION, - ASTAlterCommand::FREEZE_ALL, + /// Logical error ASTAlterCommand::NO_TYPE, }; diff --git a/src/Parsers/ASTAlterQuery.cpp b/src/Parsers/ASTAlterQuery.cpp index 8a44dcc7c3b..f24b26d5b54 100644 --- a/src/Parsers/ASTAlterQuery.cpp +++ b/src/Parsers/ASTAlterQuery.cpp @@ -344,7 +344,7 @@ void ASTAlterCommand::formatImpl( throw Exception("Unexpected type of ALTER", ErrorCodes::UNEXPECTED_AST_STRUCTURE); } -bool ASTAlterQuery::isSettingsAlter() const +bool ASTAlterQuery::isOneCommandTypeOnly(const ASTAlterCommand::Type & type) const { if (command_list) { @@ -353,7 +353,7 @@ bool ASTAlterQuery::isSettingsAlter() const for (const auto & child : command_list->children) { const auto & command = child->as(); - if (command.type != ASTAlterCommand::MODIFY_SETTING) + if (command.type != type) return false; } return true; @@ -361,6 +361,16 @@ bool ASTAlterQuery::isSettingsAlter() const return false; } +bool ASTAlterQuery::isSettingsAlter() const +{ + return isOneCommandTypeOnly(ASTAlterCommand::MODIFY_SETTING); +} + +bool ASTAlterQuery::isFreezeAlter() const +{ + return isOneCommandTypeOnly(ASTAlterCommand::FREEZE_PARTITION) || 
isOneCommandTypeOnly(ASTAlterCommand::FREEZE_ALL); +} + /** Get the text that identifies this element. */ String ASTAlterQuery::getID(char delim) const { diff --git a/src/Parsers/ASTAlterQuery.h b/src/Parsers/ASTAlterQuery.h index f53a987905e..4cc01aa889e 100644 --- a/src/Parsers/ASTAlterQuery.h +++ b/src/Parsers/ASTAlterQuery.h @@ -189,6 +189,8 @@ public: bool isSettingsAlter() const; + bool isFreezeAlter() const; + String getID(char) const override; ASTPtr clone() const override; @@ -200,6 +202,8 @@ public: protected: void formatQueryImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const override; + + bool isOneCommandTypeOnly(const ASTAlterCommand::Type & type) const; }; } diff --git a/src/Storages/StorageMaterializedView.cpp b/src/Storages/StorageMaterializedView.cpp index af00b37b1d5..29aea3e6150 100644 --- a/src/Storages/StorageMaterializedView.cpp +++ b/src/Storages/StorageMaterializedView.cpp @@ -89,6 +89,7 @@ StorageMaterializedView::StorageMaterializedView( else { /// We will create a query to create an internal table. + auto create_context = Context(local_context); auto manual_create_query = std::make_shared(); manual_create_query->database = getStorageID().database_name; manual_create_query->table = generateInnerTableName(getStorageID()); @@ -99,7 +100,7 @@ StorageMaterializedView::StorageMaterializedView( manual_create_query->set(manual_create_query->columns_list, new_columns_list); manual_create_query->set(manual_create_query->storage, query.storage->ptr()); - InterpreterCreateQuery create_interpreter(manual_create_query, local_context); + InterpreterCreateQuery create_interpreter(manual_create_query, create_context); create_interpreter.setInternal(true); create_interpreter.execute(); @@ -205,7 +206,8 @@ static void executeDropQuery(ASTDropQuery::Kind kind, Context & global_context, drop_query->no_delay = no_delay; drop_query->if_exists = true; ASTPtr ast_drop_query = drop_query; - InterpreterDropQuery drop_interpreter(ast_drop_query, global_context); + auto drop_context = Context(global_context); + InterpreterDropQuery drop_interpreter(ast_drop_query, drop_context); drop_interpreter.execute(); } } diff --git a/tests/clickhouse-test b/tests/clickhouse-test index d5c6019d28f..13e7b4be001 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -162,7 +162,12 @@ def run_single_test(args, ext, server_logs_level, client_options, case_file, std while (datetime.now() - start_time).total_seconds() < args.timeout and proc.poll() is None: sleep(0.01) - if not args.database: + need_drop_database = not args.database + if need_drop_database and args.no_drop_if_fail: + maybe_passed = (proc.returncode == 0) and (proc.stderr is None) and (proc.stdout is None or 'Exception' not in proc.stdout) + need_drop_database = not maybe_passed + + if need_drop_database: clickhouse_proc_create = Popen(shlex.split(args.client), stdin=PIPE, stdout=PIPE, stderr=PIPE, universal_newlines=True) seconds_left = max(args.timeout - (datetime.now() - start_time).total_seconds(), 10) try: @@ -181,9 +186,10 @@ def run_single_test(args, ext, server_logs_level, client_options, case_file, std total_time = (datetime.now() - start_time).total_seconds() - # Normalize randomized database names in stdout, stderr files. 
- os.system("LC_ALL=C sed -i -e 's/{test_db}/default/g' {file}".format(test_db=database, file=stdout_file)) - os.system("LC_ALL=C sed -i -e 's/{test_db}/default/g' {file}".format(test_db=database, file=stderr_file)) + if not args.show_db_name: + # Normalize randomized database names in stdout, stderr files. + os.system("LC_ALL=C sed -i -e 's/{test_db}/default/g' {file}".format(test_db=database, file=stdout_file)) + os.system("LC_ALL=C sed -i -e 's/{test_db}/default/g' {file}".format(test_db=database, file=stderr_file)) stdout = open(stdout_file, 'rb').read() if os.path.exists(stdout_file) else b'' stdout = str(stdout, errors='replace', encoding='utf-8') @@ -884,6 +890,8 @@ if __name__ == '__main__': parser.add_argument('--hung-check', action='store_true', default=False) parser.add_argument('--force-color', action='store_true', default=False) parser.add_argument('--database', help='Database for tests (random name test_XXXXXX by default)') + parser.add_argument('--no-drop-if-fail', action='store_true', help='Do not drop database for test if test has failed') + parser.add_argument('--show-db-name', action='store_true', help='Do not replace random database name with "default"') parser.add_argument('--parallel', default='1/1', help='One parallel test run number/total') parser.add_argument('-j', '--jobs', default=1, nargs='?', type=int, help='Run all tests in parallel') parser.add_argument('-U', '--unified', default=3, type=int, help='output NUM lines of unified context') From 666aab676e2c298263940760e94523286c81d9e6 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Tue, 26 Jan 2021 21:38:53 +0300 Subject: [PATCH 0088/2357] add comments to algorithm --- .../FinishAggregatingInOrderAlgorithm.cpp | 29 +++++++++---------- .../FinishAggregatingInOrderAlgorithm.h | 24 +++++++++++---- src/Processors/Merges/Algorithms/MergedData.h | 3 +- .../FinishAggregatingInOrderTransform.h | 8 ++--- src/Processors/QueryPlan/AggregatingStep.cpp | 3 +- .../AggregatingInOrderTransform.cpp | 2 -- .../Transforms/AggregatingInOrderTransform.h | 1 + src/Processors/ya.make | 1 + 8 files changed, 38 insertions(+), 33 deletions(-) diff --git a/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.cpp b/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.cpp index e20f8416851..7a127c7193a 100644 --- a/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.cpp +++ b/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.cpp @@ -21,13 +21,11 @@ FinishAggregatingInOrderAlgorithm::FinishAggregatingInOrderAlgorithm( const Block & header_, size_t num_inputs_, AggregatingTransformParamsPtr params_, - SortDescription description_, - size_t max_block_size_) - : merged_data(header_.cloneEmptyColumns(), false, max_block_size_) - , header(header_) + SortDescription description_) + : header(header_) , num_inputs(num_inputs_) , params(params_) - , description(description_) + , description(std::move(description_)) { /// Replace column names in description to positions. for (auto & column_description : description) @@ -55,6 +53,7 @@ void FinishAggregatingInOrderAlgorithm::consume(Input & input, size_t source_num IMergingAlgorithm::Status FinishAggregatingInOrderAlgorithm::merge() { + /// Find the input with smallest last row. 
std::optional best_input; for (size_t i = 0; i < num_inputs; ++i) { @@ -70,11 +69,13 @@ IMergingAlgorithm::Status FinishAggregatingInOrderAlgorithm::merge() } if (!best_input) - return Status{merged_data.pull(), true}; + return Status{aggregate(), true}; + /// Chunk at best_input will be aggregated entirely. auto & best_state = states[*best_input]; best_state.to_row = states[*best_input].num_rows; + /// Find the positions upto which need to aggregate in other chunks. for (size_t i = 0; i < num_inputs; ++i) { if (!states[i].isValid() || i == *best_input) @@ -90,28 +91,23 @@ IMergingAlgorithm::Status FinishAggregatingInOrderAlgorithm::merge() states[i].to_row = (it == indices.end() ? states[i].num_rows : *it); } - auto aggregated = aggregate(); - for (size_t i = 0; i < aggregated.rows(); ++i) - merged_data.insertRow(aggregated.getColumns(), i, aggregated.rows()); - Status status(*best_input); - if (merged_data.hasEnoughRows()) - status.chunk = merged_data.pull(); + status.chunk = aggregate(); return status; } -Block FinishAggregatingInOrderAlgorithm::aggregate() +Chunk FinishAggregatingInOrderAlgorithm::aggregate() { BlocksList blocks; for (size_t i = 0; i < num_inputs; ++i) { const auto & state = states[i]; - if (!state.isValid()) + if (!state.isValid() || state.current_row == state.to_row) continue; - if (state.current_row == 0 && state.to_row == state.num_rows) + if (state.to_row - state.current_row == state.num_rows) { blocks.emplace_back(header.cloneWithColumns(states[i].all_columns)); } @@ -128,7 +124,8 @@ Block FinishAggregatingInOrderAlgorithm::aggregate() states[i].current_row = states[i].to_row; } - return params->aggregator.mergeBlocks(blocks, false); + auto aggregated = params->aggregator.mergeBlocks(blocks, false); + return {aggregated.getColumns(), aggregated.rows()}; } } diff --git a/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.h b/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.h index 57a5671bf82..f724e33b640 100644 --- a/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.h +++ b/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.h @@ -11,6 +11,17 @@ namespace DB struct AggregatingTransformParams; using AggregatingTransformParamsPtr = std::shared_ptr; +/** + * The second step of aggregation in order of sorting key. + * The transform recieves k inputs with partialy aggregated data, + * sorted by group by key (prefix of sorting key). + * Then it merges aggregated data from inputs by the following algorithm: + * - At each step find the smallest value X of the sorting key among last rows of current blocks of inputs. + * Since the data is sorted in order of sorting key and has no duplicates (because of aggregation), + * X will never appear later in any of input streams. + * - Aggregate all rows in current blocks of inputs upto the upper_bound of X using + * regular hash table algorithm (Aggregator::mergeBlock). + */ class FinishAggregatingInOrderAlgorithm final : public IMergingAlgorithm { public: @@ -18,30 +29,31 @@ public: const Block & header_, size_t num_inputs_, AggregatingTransformParamsPtr params_, - SortDescription description_, - size_t max_block_size_); + SortDescription description_); void initialize(Inputs inputs) override; void consume(Input & input, size_t source_num) override; Status merge() override; +private: + Chunk aggregate(); + struct State { size_t num_rows; Columns all_columns; ColumnRawPtrs sorting_columns; + /// Number of row starting from which need to aggregate. 
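The header comment added above describes the merge step in words; a condensed, self-contained sketch of the same idea with a single integer sort column (simplified state, not the actual ClickHouse classes) could look like this: pick the input whose last key is smallest, consume it entirely, and cut every other input at the upper_bound of that key.

#include <algorithm>
#include <cstddef>
#include <optional>
#include <vector>

struct InputState
{
    std::vector<int> keys;     /// sorted group-by keys of the current chunk
    size_t current_row = 0;
    size_t to_row = 0;
    bool isValid() const { return current_row < keys.size(); }
};

/// Returns the input that can be aggregated entirely at this step and sets to_row
/// for every other input to the first row whose key is greater than that input's last key.
std::optional<size_t> planMergeStep(std::vector<InputState> & states)
{
    std::optional<size_t> best;
    for (size_t i = 0; i < states.size(); ++i)
        if (states[i].isValid() && (!best || states[i].keys.back() < states[*best].keys.back()))
            best = i;

    if (!best)
        return std::nullopt;

    int boundary = states[*best].keys.back();
    states[*best].to_row = states[*best].keys.size();

    for (size_t i = 0; i < states.size(); ++i)
    {
        if (!states[i].isValid() || i == *best)
            continue;
        auto it = std::upper_bound(states[i].keys.begin(), states[i].keys.end(), boundary);
        states[i].to_row = static_cast<size_t>(it - states[i].keys.begin());
    }
    return best;
}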
size_t current_row = 0; + + /// Number of row upto which need to aggregate (not included). size_t to_row = 0; State(const Chunk & chunk, const SortDescription & description); bool isValid() const { return current_row < num_rows; } }; -private: - Block aggregate(); - - MergedData merged_data; Block header; size_t num_inputs; AggregatingTransformParamsPtr params; diff --git a/src/Processors/Merges/Algorithms/MergedData.h b/src/Processors/Merges/Algorithms/MergedData.h index fa703e185a3..9bf33d72f31 100644 --- a/src/Processors/Merges/Algorithms/MergedData.h +++ b/src/Processors/Merges/Algorithms/MergedData.h @@ -21,8 +21,7 @@ public: /// Pull will be called at next prepare call. void flush() { need_flush = true; } - template - void insertRow(const TColumns & raw_columns, size_t row, size_t block_size) + void insertRow(const ColumnRawPtrs & raw_columns, size_t row, size_t block_size) { size_t num_columns = raw_columns.size(); for (size_t i = 0; i < num_columns; ++i) diff --git a/src/Processors/Merges/FinishAggregatingInOrderTransform.h b/src/Processors/Merges/FinishAggregatingInOrderTransform.h index 27e37355910..e067b9472d9 100644 --- a/src/Processors/Merges/FinishAggregatingInOrderTransform.h +++ b/src/Processors/Merges/FinishAggregatingInOrderTransform.h @@ -16,19 +16,17 @@ public: const Block & header, size_t num_inputs, AggregatingTransformParamsPtr params, - SortDescription description, - size_t max_block_size) + SortDescription description) : IMergingTransform( num_inputs, header, header, true, header, num_inputs, params, - std::move(description), - max_block_size) + std::move(description)) { } - String getName() const override { return "AggregatingSortedTransform"; } + String getName() const override { return "FinishAggregatingInOrderTransform"; } }; } diff --git a/src/Processors/QueryPlan/AggregatingStep.cpp b/src/Processors/QueryPlan/AggregatingStep.cpp index 0474a15961e..813d86b50c0 100644 --- a/src/Processors/QueryPlan/AggregatingStep.cpp +++ b/src/Processors/QueryPlan/AggregatingStep.cpp @@ -100,8 +100,7 @@ void AggregatingStep::transformPipeline(QueryPipeline & pipeline) pipeline.getHeader(), pipeline.getNumStreams(), transform_params, - group_by_sort_description, - max_block_size); + group_by_sort_description); pipeline.addTransform(std::move(transform)); aggregating_sorted = collector.detachProcessors(1); diff --git a/src/Processors/Transforms/AggregatingInOrderTransform.cpp b/src/Processors/Transforms/AggregatingInOrderTransform.cpp index a3932a7ab1b..d6526f0cdf1 100644 --- a/src/Processors/Transforms/AggregatingInOrderTransform.cpp +++ b/src/Processors/Transforms/AggregatingInOrderTransform.cpp @@ -2,8 +2,6 @@ #include #include -#include - namespace DB { diff --git a/src/Processors/Transforms/AggregatingInOrderTransform.h b/src/Processors/Transforms/AggregatingInOrderTransform.h index 7b659fc53e2..10793e885ce 100644 --- a/src/Processors/Transforms/AggregatingInOrderTransform.h +++ b/src/Processors/Transforms/AggregatingInOrderTransform.h @@ -88,4 +88,5 @@ private: AggregatingTransformParamsPtr params; }; + } diff --git a/src/Processors/ya.make b/src/Processors/ya.make index 2eb27be8899..caa8ffa6146 100644 --- a/src/Processors/ya.make +++ b/src/Processors/ya.make @@ -79,6 +79,7 @@ SRCS( LimitTransform.cpp Merges/Algorithms/AggregatingSortedAlgorithm.cpp Merges/Algorithms/CollapsingSortedAlgorithm.cpp + Merges/Algorithms/FinishAggregatingInOrderAlgorithm.cpp Merges/Algorithms/GraphiteRollupSortedAlgorithm.cpp Merges/Algorithms/IMergingAlgorithmWithDelayedChunk.cpp 
Merges/Algorithms/IMergingAlgorithmWithSharedChunks.cpp From 38e8bab6b186807361819d7ba8fb0373b46390a3 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Wed, 27 Jan 2021 03:44:36 +0300 Subject: [PATCH 0089/2357] fix tests --- .../Merges/Algorithms/FinishAggregatingInOrderAlgorithm.cpp | 2 +- .../Merges/Algorithms/FinishAggregatingInOrderAlgorithm.h | 6 +++--- .../01551_mergetree_read_in_order_spread.reference | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.cpp b/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.cpp index 7a127c7193a..0e4de315aa1 100644 --- a/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.cpp +++ b/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.cpp @@ -75,7 +75,7 @@ IMergingAlgorithm::Status FinishAggregatingInOrderAlgorithm::merge() auto & best_state = states[*best_input]; best_state.to_row = states[*best_input].num_rows; - /// Find the positions upto which need to aggregate in other chunks. + /// Find the positions up to which need to aggregate in other chunks. for (size_t i = 0; i < num_inputs; ++i) { if (!states[i].isValid() || i == *best_input) diff --git a/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.h b/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.h index f724e33b640..c54e847d0a3 100644 --- a/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.h +++ b/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.h @@ -13,13 +13,13 @@ using AggregatingTransformParamsPtr = std::shared_ptr Date: Wed, 27 Jan 2021 12:33:11 +0300 Subject: [PATCH 0090/2357] Work with any number of replicas simultaneously, support max_parallel_replicas --- src/Client/Connection.cpp | 28 ++ src/Client/ConnectionPoolWithFailover.cpp | 6 +- src/Client/ConnectionPoolWithFailover.h | 6 +- src/Client/GetHedgedConnections.cpp | 439 +++++++++++----------- src/Client/GetHedgedConnections.h | 86 +++-- src/Client/HedgedConnections.cpp | 314 +++++++++------- src/Client/HedgedConnections.h | 33 +- src/DataStreams/RemoteQueryExecutor.cpp | 4 +- 8 files changed, 511 insertions(+), 405 deletions(-) diff --git a/src/Client/Connection.cpp b/src/Client/Connection.cpp index 15f530f4085..75586ea8cae 100644 --- a/src/Client/Connection.cpp +++ b/src/Client/Connection.cpp @@ -91,6 +91,7 @@ void Connection::connect(const ConnectionTimeouts & timeouts) void Connection::disconnect() { +// LOG_DEBUG(log_wrapper.get(), "disconnect"); in = nullptr; last_input_packet_type.reset(); out = nullptr; // can write to socket @@ -102,6 +103,8 @@ void Connection::disconnect() void Connection::prepare(const ConnectionTimeouts & timeouts) { +// LOG_DEBUG(log_wrapper.get(), "Connect"); + LOG_TRACE(log_wrapper.get(), "Connecting. Database: {}. User: {}{}{}", default_database.empty() ? "(not specified)" : default_database, user, @@ -154,6 +157,8 @@ void Connection::prepare(const ConnectionTimeouts & timeouts) void Connection::sendHello() { +// LOG_DEBUG(log_wrapper.get(), "sendHello"); + /** Disallow control characters in user controlled parameters * to mitigate the possibility of SSRF. * The user may do server side requests with 'remote' table function. @@ -210,6 +215,8 @@ void Connection::sendHello() void Connection::receiveHello() { +// LOG_DEBUG(log_wrapper.get(), "receiveHello"); + /// Receive hello packet. 
UInt64 packet_type = 0; @@ -313,6 +320,8 @@ const String & Connection::getServerDisplayName(const ConnectionTimeouts & timeo void Connection::forceConnected(const ConnectionTimeouts & timeouts) { +// LOG_DEBUG(log_wrapper.get(), "forceConnected"); + if (!connected) { connect(timeouts); @@ -339,6 +348,8 @@ void Connection::sendClusterNameAndSalt() bool Connection::ping() { +// LOG_DEBUG(log_wrapper.get(), "ping"); + TimeoutSetter timeout_setter(*socket, sync_request_timeout, true); try { @@ -390,6 +401,8 @@ TablesStatusResponse Connection::getTablesStatus(const ConnectionTimeouts & time void Connection::sendTablesStatusRequest(const TablesStatusRequest & request) { +// LOG_DEBUG(log_wrapper.get(), "sendTablesStatusRequest"); + writeVarUInt(Protocol::Client::TablesStatusRequest, *out); request.write(*out, server_revision); out->next(); @@ -397,6 +410,8 @@ void Connection::sendTablesStatusRequest(const TablesStatusRequest & request) TablesStatusResponse Connection::receiveTablesStatusResponse() { +// LOG_DEBUG(log_wrapper.get(), "receiveTablesStatusResponse"); + UInt64 response_type = 0; readVarUInt(response_type, *in); @@ -422,6 +437,8 @@ void Connection::sendQuery( if (!connected) connect(timeouts); +// LOG_DEBUG(log_wrapper.get(), "sendQuery"); + TimeoutSetter timeout_setter(*socket, timeouts.send_timeout, timeouts.receive_timeout, true); if (settings) @@ -520,6 +537,8 @@ void Connection::sendCancel() if (!out) return; +// LOG_DEBUG(log_wrapper.get(), "sendCancel"); + writeVarUInt(Protocol::Client::Cancel, *out); out->next(); } @@ -527,6 +546,8 @@ void Connection::sendCancel() void Connection::sendData(const Block & block, const String & name, bool scalar) { +// LOG_DEBUG(log_wrapper.get(), "sendData"); + if (!block_out) { if (compression == Protocol::Compression::Enable) @@ -557,6 +578,7 @@ void Connection::sendData(const Block & block, const String & name, bool scalar) void Connection::sendPreparedData(ReadBuffer & input, size_t size, const String & name) { /// NOTE 'Throttler' is not used in this method (could use, but it's not important right now). +// LOG_DEBUG(log_wrapper.get(), "sendPreparedData"); writeVarUInt(Protocol::Client::Data, *out); writeStringBinary(name, *out); @@ -574,6 +596,8 @@ void Connection::sendScalarsData(Scalars & data) if (data.empty()) return; +// LOG_DEBUG(log_wrapper.get(), "sendScalarsData"); + Stopwatch watch; size_t out_bytes = out ? out->count() : 0; size_t maybe_compressed_out_bytes = maybe_compressed_out ? maybe_compressed_out->count() : 0; @@ -659,6 +683,8 @@ void Connection::sendExternalTablesData(ExternalTablesData & data) return; } +// LOG_DEBUG(log_wrapper.get(), "sendExternalTablesData"); + Stopwatch watch; size_t out_bytes = out ? out->count() : 0; size_t maybe_compressed_out_bytes = maybe_compressed_out ? 
maybe_compressed_out->count() : 0; @@ -757,6 +783,8 @@ std::optional Connection::checkPacket(size_t timeout_microseconds) Packet Connection::receivePacket(AsyncCallback async_callback) { +// LOG_DEBUG(log_wrapper.get(), "receivePacket"); + in->setAsyncCallback(std::move(async_callback)); SCOPE_EXIT(in->setAsyncCallback({})); diff --git a/src/Client/ConnectionPoolWithFailover.cpp b/src/Client/ConnectionPoolWithFailover.cpp index 00ec1e30f10..af4f8bb2d25 100644 --- a/src/Client/ConnectionPoolWithFailover.cpp +++ b/src/Client/ConnectionPoolWithFailover.cpp @@ -353,7 +353,7 @@ void TryGetConnection::reset() { resetResult(); stage = Stage::CONNECT; - epoll = nullptr; + action_before_disconnect = nullptr; socket_fd = -1; fail_message.clear(); } @@ -369,8 +369,8 @@ void TryGetConnection::resetResult() void TryGetConnection::processFail(bool add_description) { - if (epoll) - epoll->remove(socket_fd); + if (action_before_disconnect) + action_before_disconnect(socket_fd); fail_message = getCurrentExceptionMessage(/* with_stacktrace = */ false); if (add_description) diff --git a/src/Client/ConnectionPoolWithFailover.h b/src/Client/ConnectionPoolWithFailover.h index c57a7bb984a..86f63191608 100644 --- a/src/Client/ConnectionPoolWithFailover.h +++ b/src/Client/ConnectionPoolWithFailover.h @@ -62,9 +62,7 @@ public: /// Reset class to initial stage. void reset(); - /// If connection is failed and epoll is set, before disconnecting - /// socket will be removed from epoll. - void setEpoll(Epoll * epoll_) { epoll = epoll_; } + void setActionBeforeDisconnect(std::function action) { action_before_disconnect = action; } /// Process fail connection. void processFail(bool add_description = false); @@ -78,7 +76,7 @@ public: TryResult result; Stage stage; int socket_fd; - Epoll * epoll = nullptr; + std::function action_before_disconnect; }; class ConnectionPoolWithFailover : public IConnectionPool, private PoolWithFailoverBase diff --git a/src/Client/GetHedgedConnections.cpp b/src/Client/GetHedgedConnections.cpp index 839d6bf37c2..4c729dc0722 100644 --- a/src/Client/GetHedgedConnections.cpp +++ b/src/Client/GetHedgedConnections.cpp @@ -7,6 +7,7 @@ namespace DB namespace ErrorCodes { extern const int BAD_ARGUMENTS; + extern const int LOGICAL_ERROR; extern const int ALL_CONNECTION_TRIES_FAILED; } @@ -26,6 +27,9 @@ GetHedgedConnections::GetHedgedConnections( = (settings ? size_t{settings->connections_with_failover_max_tries} : size_t{DBMS_CONNECTION_POOL_WITH_FAILOVER_DEFAULT_MAX_TRIES}); fallback_to_stale_replicas = settings ? settings->fallback_to_stale_replicas_for_distributed_queries : false; + entries_count = 0; + usable_count = 0; + failed_pools_count = 0; } GetHedgedConnections::~GetHedgedConnections() @@ -33,173 +37,175 @@ GetHedgedConnections::~GetHedgedConnections() pool->updateSharedError(shuffled_pools); } -GetHedgedConnections::Replicas GetHedgedConnections::getConnections() +std::vector GetHedgedConnections::getManyConnections(PoolMode pool_mode) { - entries_count = 0; - usable_count = 0; - failed_pools_count = 0; + size_t min_entries = (settings && settings->skip_unavailable_shards) ? 0 : 1; - ReplicaStatePtr replica = &first_replica; - int index = 0; + size_t max_entries; + if (pool_mode == PoolMode::GET_ALL) + { + min_entries = shuffled_pools.size(); + max_entries = shuffled_pools.size(); + } + else if (pool_mode == PoolMode::GET_ONE) + max_entries = 1; + else if (pool_mode == PoolMode::GET_MANY) + max_entries = settings ? 
size_t(settings->max_parallel_replicas) : 1; + else + throw DB::Exception("Unknown pool allocation mode", DB::ErrorCodes::LOGICAL_ERROR); + + std::vector replicas; + replicas.reserve(max_entries); + for (size_t i = 0; i != max_entries; ++i) + { + auto replica = getNextConnection(false); + if (replica->isCannotChoose()) + { + if (replicas.size() >= min_entries) + break; + + /// Determine the reason of not enough replicas. + if (!fallback_to_stale_replicas && usable_count >= min_entries) + throw DB::Exception( + "Could not find enough connections to up-to-date replicas. Got: " + std::to_string(replicas.size()) + + ", needed: " + std::to_string(min_entries), + DB::ErrorCodes::ALL_REPLICAS_ARE_STALE); + + throw DB::NetException( + "Could not connect to " + std::to_string(min_entries) + " replicas. Log: \n\n" + fail_messages + "\n", + DB::ErrorCodes::ALL_CONNECTION_TRIES_FAILED); + } + replicas.push_back(replica); + } + + return replicas; +} + +GetHedgedConnections::ReplicaStatePtr GetHedgedConnections::getNextConnection(bool non_blocking) +{ +// LOG_DEBUG(log, "getNextConnection"); + ReplicaStatePtr replica = createNewReplica(); + + int index; + + /// Check if it's the first time. + if (epoll.size() == 0 && ready_indexes.size() == 0) + { + index = 0; + last_used_index = 0; + } + else + index = getNextIndex(); + + bool is_first = true; while (index != -1 || epoll.size() != 0) { + if (index == -1 && !is_first && non_blocking) + { + replica->state = State::NOT_READY; + return replica; + } + + if (is_first) + is_first = false; + if (index != -1) { Action action = startTryGetConnection(index, replica); - if (action == Action::TRY_NEXT_REPLICA) - { - index = getNextIndex(index); - continue; - } if (action == Action::FINISH) - { - swapReplicasIfNeeded(); - return {&first_replica, &second_replica}; - } - } - - /// Process epoll events - replica = processEpollEvents(); - if (replica->isReady()) - { - swapReplicasIfNeeded(); - return {&first_replica, &second_replica}; - } - - index = getNextIndex(index); - } - - /// We reach this point only if there was no up to date replica - - if (usable_count == 0) - { - if (settings && settings->skip_unavailable_shards) - { - first_replica.state = State::CANNOT_CHOOSE; - second_replica.state = State::CANNOT_CHOOSE; - return {&first_replica, &second_replica}; - } - - throw NetException("All connection tries failed. 
Log: \n\n" + fail_messages + "\n", ErrorCodes::ALL_CONNECTION_TRIES_FAILED); - } - if (!fallback_to_stale_replicas) - throw DB::Exception("Could not find connection to up-to-date replica.", DB::ErrorCodes::ALL_REPLICAS_ARE_STALE); - - setBestUsableReplica(first_replica); - return {&first_replica, &second_replica}; -} - -void GetHedgedConnections::chooseSecondReplica() -{ - LOG_DEBUG(log, "choose second replica"); - - if (second_replica.isCannotChoose() || second_replica.isReady()) - return; - - int index; - if (second_replica.isNotReady()) - index = second_replica.index; - else - index = first_replica.index; - - while (true) - { - if (second_replica.isEmpty()) - { - - index = getNextIndex(index); - if (index == -1) - break; - - Action action = startTryGetConnection(index, &second_replica); + return replica; if (action == Action::TRY_NEXT_REPLICA) + { + index = getNextIndex(); continue; + } - /// Second replica is ready or we are waiting for response from it - return; + if (action == Action::PROCESS_EPOLL_EVENTS && non_blocking) + return replica; } - if (!second_replica.isNotReady()) - throw Exception("Second replica state must be 'NOT_READY' before process epoll events", ErrorCodes::LOGICAL_ERROR); + replica = processEpollEvents(non_blocking); + if (replica->isReady() || (replica->isNotReady() && non_blocking)) + return replica; - ReplicaStatePtr replica = processEpollEvents( true); + if (replica->isNotReady()) + throw Exception("Not ready replica after processing epoll events.", ErrorCodes::LOGICAL_ERROR); - if (replica != &second_replica) - throw Exception("Epoll could return only second replica here", ErrorCodes::LOGICAL_ERROR); - - /// If replica is not empty than it is ready or we are waiting for a response from it - if (!second_replica.isEmpty()) - return; + index = getNextIndex(); } - /// There is no up to date replica + /// We reach this point only if there was no free up to date replica. 
- LOG_DEBUG(log, "there is no up to date replica for second replica"); + /// Check if there is no even a free usable replica + if (!canGetNewConnection()) + { + replica->state = State::CANNOT_CHOOSE; + return replica; + } - if (!fallback_to_stale_replicas || usable_count <= 1) - second_replica.state = State::CANNOT_CHOOSE; - else - setBestUsableReplica(second_replica, first_replica.index); + if (!fallback_to_stale_replicas) + { + replica->state = State::CANNOT_CHOOSE; + return replica; + } + + setBestUsableReplica(replica); + return replica; } -void GetHedgedConnections::stopChoosingSecondReplica() +void GetHedgedConnections::stopChoosingReplicas() { - LOG_DEBUG(log, "stop choosing second replica"); +// LOG_DEBUG(log, "stopChoosingReplicas"); + for (auto & [fd, replica] : fd_to_replica) + { + removeTimeoutsFromReplica(replica, epoll, timeout_fd_to_replica); + epoll.remove(fd); + try_get_connections[replica->index].reset(); + replica->reset(); + } - if (!second_replica.isNotReady()) - throw Exception("Can't stop choosing second replica, because it's not in process of choosing", ErrorCodes::LOGICAL_ERROR); - - removeTimeoutsFromReplica(&second_replica, epoll); - epoll.remove(second_replica.fd); - - try_get_connections[second_replica.index].reset(); - second_replica.reset(); + fd_to_replica.clear(); } -int GetHedgedConnections::getNextIndex(int cur_index) +int GetHedgedConnections::getNextIndex() { /// Check if there is no more available replicas - if (cur_index == -1 || entries_count + failed_pools_count >= shuffled_pools.size()) + if (entries_count + failed_pools_count >= shuffled_pools.size()) return -1; - /// We can work with two replicas simultaneously and they must have different indexes - int skip_index = -1; - if (!first_replica.isEmpty()) - skip_index = first_replica.index; - else if (!second_replica.isEmpty()) - skip_index = second_replica.index; - bool finish = false; - int next_index = cur_index; + int next_index = last_used_index; while (!finish) { next_index = (next_index + 1) % shuffled_pools.size(); /// Check if we can try this replica - if (next_index != skip_index && (max_tries == 0 || shuffled_pools[next_index].error_count < max_tries) + if (indexes_in_process.find(next_index) == indexes_in_process.end() && (max_tries == 0 || shuffled_pools[next_index].error_count < max_tries) && try_get_connections[next_index].stage != TryGetConnection::Stage::FINISHED) finish = true; /// If we made a complete round, there is no replica to connect - else if (next_index == cur_index) + else if (next_index == last_used_index) return -1; } - LOG_DEBUG(log, "get next index: {}", next_index); +// LOG_DEBUG(log, "get next index: {}", next_index); + last_used_index = next_index; return next_index; } -GetHedgedConnections::Action GetHedgedConnections::startTryGetConnection(int index, ReplicaStatePtr replica) +GetHedgedConnections::Action GetHedgedConnections::startTryGetConnection(int index, ReplicaStatePtr & replica) { - LOG_DEBUG(log, "start try get connection with {} replica", index); +// LOG_DEBUG(log, "start try get connection with {} replica", index); TryGetConnection & try_get_connection = try_get_connections[index]; replica->state = State::NOT_READY; replica->index = index; + indexes_in_process.insert(index); try_get_connection.reset(); try_get_connection.run(); @@ -215,7 +221,13 @@ GetHedgedConnections::Action GetHedgedConnections::startTryGetConnection(int ind if (action == Action::PROCESS_EPOLL_EVENTS) { epoll.add(try_get_connection.socket_fd); - try_get_connection.setEpoll(&epoll); + 
fd_to_replica[try_get_connection.socket_fd] = replica; + try_get_connection.setActionBeforeDisconnect( + [&](int fd) + { + epoll.remove(fd); + fd_to_replica.erase(fd); + }); addTimeouts(replica); } @@ -223,51 +235,58 @@ GetHedgedConnections::Action GetHedgedConnections::startTryGetConnection(int ind } GetHedgedConnections::Action -GetHedgedConnections::processTryGetConnectionStage(ReplicaStatePtr replica, bool remove_from_epoll) +GetHedgedConnections::processTryGetConnectionStage(ReplicaStatePtr & replica, bool remove_from_epoll) { - LOG_DEBUG(log, "process get connection stage for {} replica", replica->index); +// LOG_DEBUG(log, "process get connection stage for {} replica", replica->index); TryGetConnection & try_get_connection = try_get_connections[replica->index]; if (try_get_connection.stage == TryGetConnection::Stage::FINISHED) { - LOG_DEBUG(log, "stage: FINISHED"); + indexes_in_process.erase(replica->index); + +// LOG_DEBUG(log, "stage: FINISHED"); ++entries_count; if (remove_from_epoll) + { epoll.remove(try_get_connection.socket_fd); + fd_to_replica.erase(try_get_connection.socket_fd); + } if (try_get_connection.result.is_usable) { - LOG_DEBUG(log, "replica is usable"); +// LOG_DEBUG(log, "replica is usable"); ++usable_count; if (try_get_connection.result.is_up_to_date) { - LOG_DEBUG(log, "replica is up to date, finish get hedged connections"); +// LOG_DEBUG(log, "replica is up to date, finish get hedged connections"); replica->state = State::READY; + ready_indexes.insert(replica->index); return Action::FINISH; } - - /// This replica is not up to date, we will try to find up to date - replica->reset(); - return Action::TRY_NEXT_REPLICA; } + + /// This replica is not up to date, we will try to find up to date + fd_to_replica.erase(replica->fd); + replica->reset(); + return Action::TRY_NEXT_REPLICA; } else if (try_get_connection.stage == TryGetConnection::Stage::FAILED) { - LOG_DEBUG(log, "stage: FAILED"); +// LOG_DEBUG(log, "stage: FAILED"); processFailedConnection(replica); return Action::TRY_NEXT_REPLICA; } - LOG_DEBUG(log, "middle stage, process epoll events"); +// LOG_DEBUG(log, "middle stage, process epoll events"); /// Get connection process is not finished return Action::PROCESS_EPOLL_EVENTS; } -void GetHedgedConnections::processFailedConnection(ReplicaStatePtr replica) +void GetHedgedConnections::processFailedConnection(ReplicaStatePtr & replica) { - LOG_DEBUG(log, "failed connection with {} replica", replica->index); +// LOG_DEBUG(log, "failed connection with {} replica", replica->index); ShuffledPool & shuffled_pool = shuffled_pools[replica->index]; LOG_WARNING( @@ -286,105 +305,65 @@ void GetHedgedConnections::processFailedConnection(ReplicaStatePtr replica) if (!fail_message.empty()) fail_messages += fail_message + "\n"; + indexes_in_process.erase(replica->index); replica->reset(); } -void GetHedgedConnections::addTimeouts(ReplicaState * replica) +void GetHedgedConnections::addTimeouts(ReplicaStatePtr & replica) { - LOG_DEBUG(log, "add timeouts for {} replica", replica->index); +// LOG_DEBUG(log, "add timeouts for {} replica", replica->index); - addTimeoutToReplica(TimerTypes::RECEIVE_TIMEOUT, replica, epoll, timeouts); + addTimeoutToReplica(TimerTypes::RECEIVE_TIMEOUT, replica, epoll, timeout_fd_to_replica, timeouts); - /// If we haven't connected to second replica yet, set special timeout for it - if (second_replica.isEmpty()) - { - auto stage = try_get_connections[replica->index].stage; - if (stage == TryGetConnection::Stage::RECEIVE_HELLO) - 
addTimeoutToReplica(TimerTypes::RECEIVE_HELLO_TIMEOUT, replica, epoll, timeouts); - else if (stage == TryGetConnection::Stage::RECEIVE_TABLES_STATUS) - addTimeoutToReplica(TimerTypes::RECEIVE_TABLES_STATUS_TIMEOUT, replica, epoll, timeouts); - } -} - -void GetHedgedConnections::swapReplicasIfNeeded() -{ - if ((!first_replica.isReady() && second_replica.isReady())) - { - LOG_DEBUG(log, "swap replicas"); - swapReplicas(); - } + auto stage = try_get_connections[replica->index].stage; + if (stage == TryGetConnection::Stage::RECEIVE_HELLO) + addTimeoutToReplica(TimerTypes::RECEIVE_HELLO_TIMEOUT, replica, epoll, timeout_fd_to_replica, timeouts); + else if (stage == TryGetConnection::Stage::RECEIVE_TABLES_STATUS) + addTimeoutToReplica(TimerTypes::RECEIVE_TABLES_STATUS_TIMEOUT, replica, epoll, timeout_fd_to_replica, timeouts); } GetHedgedConnections::ReplicaStatePtr GetHedgedConnections::processEpollEvents(bool non_blocking) { - LOG_DEBUG(log, "process epoll events"); +// LOG_DEBUG(log, "process epoll events"); int event_fd; ReplicaStatePtr replica = nullptr; bool finish = false; while (!finish) { - event_fd = getReadyFileDescriptor(epoll); + event_fd = getReadyFileDescriptor(); - if ((replica = isEventReplica(event_fd))) - finish = processReplicaEvent(replica, non_blocking); - - else if (auto * timeout_descriptor = isEventTimeout(event_fd, replica)) + if (fd_to_replica.find(event_fd) != fd_to_replica.end()) { - processTimeoutEvent(replica, timeout_descriptor); - finish = true; + replica = fd_to_replica[event_fd]; + finish = processReplicaEvent(replica, non_blocking); + } + else if (timeout_fd_to_replica.find(event_fd) != timeout_fd_to_replica.end()) + { + replica = timeout_fd_to_replica[event_fd]; + finish = processTimeoutEvent(replica, replica->active_timeouts[event_fd].get(), non_blocking); } else throw Exception("Unknown event from epoll", ErrorCodes::LOGICAL_ERROR); } - LOG_DEBUG(log, "cancel process epoll events"); +// LOG_DEBUG(log, "cancel process epoll events"); return replica; } -GetHedgedConnections::ReplicaStatePtr GetHedgedConnections::isEventReplica(int event_fd) +int GetHedgedConnections::getReadyFileDescriptor(AsyncCallback async_callback) { - if (event_fd == first_replica.fd) - return &first_replica; + for (auto & [fd, replica] : fd_to_replica) + if (replica->connection->hasReadPendingData()) + return replica->fd; - if (event_fd == second_replica.fd) - return &second_replica; - - return nullptr; + return epoll.getReady(std::move(async_callback)).data.fd; } -TimerDescriptorPtr GetHedgedConnections::isEventTimeout(int event_fd, ReplicaStatePtr & replica_out) +bool GetHedgedConnections::processReplicaEvent(ReplicaStatePtr & replica, bool non_blocking) { - if (first_replica.active_timeouts.find(event_fd) != first_replica.active_timeouts.end()) - { - replica_out = &first_replica; - return first_replica.active_timeouts[event_fd].get(); - } - - if (second_replica.active_timeouts.find(event_fd) != second_replica.active_timeouts.end()) - { - replica_out = &second_replica; - return second_replica.active_timeouts[event_fd].get(); - } - - return nullptr; -} - -int GetHedgedConnections::getReadyFileDescriptor(Epoll & epoll_, AsyncCallback async_callback) -{ - if (first_replica.connection && first_replica.connection->hasReadPendingData()) - return first_replica.fd; - - if (second_replica.connection && second_replica.connection->hasReadPendingData()) - return second_replica.fd; - - return epoll_.getReady(std::move(async_callback)).data.fd; -} - -bool 
GetHedgedConnections::processReplicaEvent(ReplicaStatePtr replica, bool non_blocking) -{ - LOG_DEBUG(log, "epoll event is {} replica", replica->index); - removeTimeoutsFromReplica(replica, epoll); +// LOG_DEBUG(log, "epoll event is {} replica", replica->index); + removeTimeoutsFromReplica(replica, epoll, timeout_fd_to_replica); try_get_connections[replica->index].run(); Action action = processTryGetConnectionStage(replica, true); if (action == Action::PROCESS_EPOLL_EVENTS) @@ -396,70 +375,84 @@ bool GetHedgedConnections::processReplicaEvent(ReplicaStatePtr replica, bool non return true; } -void GetHedgedConnections::processTimeoutEvent(ReplicaStatePtr & replica, TimerDescriptorPtr timeout_descriptor) +bool GetHedgedConnections::processTimeoutEvent(ReplicaStatePtr & replica, TimerDescriptorPtr timeout_descriptor, bool non_blocking) { - LOG_DEBUG(log, "epoll event is timeout for {} replica", replica->index); +// LOG_DEBUG(log, "epoll event is timeout for {} replica", replica->index); epoll.remove(timeout_descriptor->getDescriptor()); replica->active_timeouts.erase(timeout_descriptor->getDescriptor()); + timeout_fd_to_replica[timeout_descriptor->getDescriptor()]; if (timeout_descriptor->getType() == TimerTypes::RECEIVE_TIMEOUT) { - LOG_DEBUG(log, "process receive timeout for {} replica", replica->index); - removeTimeoutsFromReplica(replica, epoll); +// LOG_DEBUG(log, "process receive timeout for {} replica", replica->index); + removeTimeoutsFromReplica(replica, epoll, timeout_fd_to_replica); epoll.remove(replica->fd); + fd_to_replica.erase(replica->fd); TryGetConnection & try_get_connection = try_get_connections[replica->index]; try_get_connection.fail_message = "Receive timeout expired (" + try_get_connection.result.entry->getDescription() + ")"; try_get_connection.resetResult(); try_get_connection.stage = TryGetConnection::Stage::FAILED; processFailedConnection(replica); + + return true; } - else if (timeout_descriptor->getType() == TimerTypes::RECEIVE_HELLO_TIMEOUT + else if ((timeout_descriptor->getType() == TimerTypes::RECEIVE_HELLO_TIMEOUT || timeout_descriptor->getType() == TimerTypes::RECEIVE_TABLES_STATUS_TIMEOUT) + && entries_count + ready_indexes.size() + failed_pools_count < shuffled_pools.size()) { - if (replica->index == second_replica.index || !second_replica.isEmpty()) - throw Exception( - "Received timeout to connect with second replica, but current replica is second or second replica is not empty", - ErrorCodes::LOGICAL_ERROR); - replica = &second_replica; + replica = createNewReplica(); + return true; } + + return non_blocking; } -void GetHedgedConnections::setBestUsableReplica(ReplicaState & replica, int skip_index) +void GetHedgedConnections::setBestUsableReplica(ReplicaStatePtr & replica) { - LOG_DEBUG(log, "set best usable replica"); +// LOG_DEBUG(log, "set best usable replica"); std::vector indexes(try_get_connections.size()); for (size_t i = 0; i != indexes.size(); ++i) indexes[i] = i; - /// Remove unusable and failed replicas, skip the replica with skip_index index + /// Remove unusable and failed replicas, skip ready replicas indexes.erase( std::remove_if( indexes.begin(), indexes.end(), [&](int i) { - return try_get_connections[i].result.entry.isNull() || !try_get_connections[i].result.is_usable || i == skip_index; + return try_get_connections[i].result.entry.isNull() || !try_get_connections[i].result.is_usable || + indexes_in_process.find(i) != indexes_in_process.end() || ready_indexes.find(i) != ready_indexes.end(); }), indexes.end()); if (indexes.empty()) - 
throw Exception("There is no usable replica to choose", ErrorCodes::LOGICAL_ERROR); + { + replica->state = State::CANNOT_CHOOSE; + return; + } /// Sort replicas by staleness std::stable_sort(indexes.begin(), indexes.end(), [&](size_t lhs, size_t rhs) { return try_get_connections[lhs].result.staleness < try_get_connections[rhs].result.staleness; }); - replica.index = indexes[0]; - replica.connection = &*try_get_connections[indexes[0]].result.entry; - replica.state = State::READY; - replica.fd = replica.connection->getSocket()->impl()->sockfd(); + replica->index = indexes[0]; + replica->connection = &*try_get_connections[indexes[0]].result.entry; + replica->state = State::READY; + replica->fd = replica->connection->getSocket()->impl()->sockfd(); + ready_indexes.insert(replica->index); } -void addTimeoutToReplica(int type, GetHedgedConnections::ReplicaStatePtr replica, Epoll & epoll, const ConnectionTimeouts & timeouts) +void addTimeoutToReplica( + int type, + GetHedgedConnections::ReplicaStatePtr & replica, + Epoll & epoll, + std::unordered_map & timeout_fd_to_replica, + const ConnectionTimeouts & timeouts) { Poco::Timespan timeout; switch (type) @@ -484,17 +477,28 @@ void addTimeoutToReplica(int type, GetHedgedConnections::ReplicaStatePtr replica timeout_descriptor->setType(type); timeout_descriptor->setRelative(timeout); epoll.add(timeout_descriptor->getDescriptor()); + timeout_fd_to_replica[timeout_descriptor->getDescriptor()] = replica; replica->active_timeouts[timeout_descriptor->getDescriptor()] = std::move(timeout_descriptor); } -void removeTimeoutsFromReplica(GetHedgedConnections::ReplicaStatePtr replica, Epoll & epoll) +void removeTimeoutsFromReplica( + GetHedgedConnections::ReplicaStatePtr & replica, + Epoll & epoll, + std::unordered_map & timeout_fd_to_replica) { for (auto & [fd, _] : replica->active_timeouts) + { epoll.remove(fd); + timeout_fd_to_replica.erase(fd); + } replica->active_timeouts.clear(); } -void removeTimeoutFromReplica(int type, GetHedgedConnections::ReplicaStatePtr replica, Epoll & epoll) +void removeTimeoutFromReplica( + int type, + GetHedgedConnections::ReplicaStatePtr & replica, + Epoll & epoll, + std::unordered_map & timeout_fd_to_replica) { auto it = std::find_if( replica->active_timeouts.begin(), @@ -505,6 +509,7 @@ void removeTimeoutFromReplica(int type, GetHedgedConnections::ReplicaStatePtr re if (it != replica->active_timeouts.end()) { epoll.remove(it->first); + timeout_fd_to_replica.erase(it->first); replica->active_timeouts.erase(it); } } diff --git a/src/Client/GetHedgedConnections.h b/src/Client/GetHedgedConnections.h index c42dc24ddc7..df060e9ecd5 100644 --- a/src/Client/GetHedgedConnections.h +++ b/src/Client/GetHedgedConnections.h @@ -31,6 +31,7 @@ public: State state = State::EMPTY; int index = -1; int fd = -1; + size_t parallel_replica_offset = 0; std::unordered_map> active_timeouts; void reset() @@ -39,6 +40,7 @@ public: state = State::EMPTY; index = -1; fd = -1; + parallel_replica_offset = 0; active_timeouts.clear(); } @@ -48,7 +50,8 @@ public: bool isCannotChoose() const { return state == State::CANNOT_CHOOSE; }; }; - using ReplicaStatePtr = ReplicaState *; + using ReplicaStatePtr = std::shared_ptr; + struct Replicas { @@ -61,32 +64,15 @@ public: const ConnectionTimeouts & timeouts_, std::shared_ptr table_to_check_ = nullptr); - /// Establish connection with replicas. Return replicas as soon as connection with one of them is finished. 
- /// The first replica is always has state FINISHED and ready for sending query, the second replica - /// may have any state. To continue working with second replica call chooseSecondReplica(). - Replicas getConnections(); + std::vector getManyConnections(PoolMode pool_mode); - /// Continue choosing second replica, this function is not blocking. Second replica will be ready - /// for sending query when it has state FINISHED. - void chooseSecondReplica(); + ReplicaStatePtr getNextConnection(bool non_blocking); - void stopChoosingSecondReplica(); + bool canGetNewConnection() const { return ready_indexes.size() + failed_pools_count < shuffled_pools.size(); } - void swapReplicas() { std::swap(first_replica, second_replica); } + void stopChoosingReplicas(); - /// Move ready replica to the first place. - void swapReplicasIfNeeded(); - - /// Check if the file descriptor is belong to one of replicas. If yes, return this replica, if no, return nullptr. - ReplicaStatePtr isEventReplica(int event_fd); - - /// Check if the file descriptor is belong to timeout to any replica. - /// If yes, return corresponding TimerDescriptor and set timeout owner to replica, - /// if no, return nullptr. - TimerDescriptorPtr isEventTimeout(int event_fd, ReplicaStatePtr & replica); - - /// Get file rescriptor that ready for reading. - int getReadyFileDescriptor(Epoll & epoll_, AsyncCallback async_callback = {}); + bool hasEventsInProcess() const { return epoll.size() > 0; } int getFileDescriptor() const { return epoll.getFileDescriptor(); } @@ -103,25 +89,29 @@ private: TRY_NEXT_REPLICA = 2, }; - Action startTryGetConnection(int index, ReplicaStatePtr replica); + Action startTryGetConnection(int index, ReplicaStatePtr & replica); - Action processTryGetConnectionStage(ReplicaStatePtr replica, bool remove_from_epoll = false); + Action processTryGetConnectionStage(ReplicaStatePtr & replica, bool remove_from_epoll = false); - int getNextIndex(int cur_index = -1); + int getNextIndex(); - void addTimeouts(ReplicaStatePtr replica); + int getReadyFileDescriptor(AsyncCallback async_callback = {}); - void processFailedConnection(ReplicaStatePtr replica); + void addTimeouts(ReplicaStatePtr & replica); - void processReceiveTimeout(ReplicaStatePtr replica); + void processFailedConnection(ReplicaStatePtr & replica); - bool processReplicaEvent(ReplicaStatePtr replica, bool non_blocking); + void processReceiveTimeout(ReplicaStatePtr & replica); - void processTimeoutEvent(ReplicaStatePtr & replica, TimerDescriptorPtr timeout_descriptor); + bool processReplicaEvent(ReplicaStatePtr & replica, bool non_blocking); + + bool processTimeoutEvent(ReplicaStatePtr & replica, TimerDescriptorPtr timeout_descriptor, bool non_blocking); ReplicaStatePtr processEpollEvents(bool non_blocking = false); - void setBestUsableReplica(ReplicaState & replica, int skip_index = -1); + void setBestUsableReplica(ReplicaStatePtr & replica); + + ReplicaStatePtr createNewReplica() { return std::make_shared(); } const ConnectionPoolWithFailoverPtr pool; const Settings * settings; @@ -129,8 +119,14 @@ private: std::shared_ptr table_to_check; std::vector try_get_connections; std::vector shuffled_pools; - ReplicaState first_replica; - ReplicaState second_replica; + + std::unordered_map fd_to_replica; + std::unordered_map timeout_fd_to_replica; + +// std::vector> replicas; +// std::unordered_map> replicas_store; +// ReplicaState first_replica; +// ReplicaState second_replica; bool fallback_to_stale_replicas; Epoll epoll; Poco::Logger * log; @@ -139,16 +135,30 @@ private: 
size_t usable_count; size_t failed_pools_count; size_t max_tries; + int last_used_index; + std::unordered_set indexes_in_process; + std::unordered_set ready_indexes; }; /// Add timeout with particular type to replica and add it to epoll. -void addTimeoutToReplica(int type, GetHedgedConnections::ReplicaStatePtr replica, Epoll & epoll, const ConnectionTimeouts & timeouts); - +void addTimeoutToReplica( + int type, + GetHedgedConnections::ReplicaStatePtr & replica, + Epoll & epoll, + std::unordered_map & timeout_fd_to_replica, + const ConnectionTimeouts & timeouts); /// Remove timeout with particular type from replica and epoll. -void removeTimeoutFromReplica(int type, GetHedgedConnections::ReplicaStatePtr replica, Epoll & epoll); +void removeTimeoutFromReplica( + int type, + GetHedgedConnections::ReplicaStatePtr & replica, + Epoll & epoll, + std::unordered_map & timeout_fd_to_replica); /// Remove all timeouts from replica and epoll. -void removeTimeoutsFromReplica(GetHedgedConnections::ReplicaStatePtr replica, Epoll & epoll); +void removeTimeoutsFromReplica( + GetHedgedConnections::ReplicaStatePtr & replica, + Epoll & epoll, + std::unordered_map & timeout_fd_to_replica); } diff --git a/src/Client/HedgedConnections.cpp b/src/Client/HedgedConnections.cpp index 4963c74c327..a4231b2c172 100644 --- a/src/Client/HedgedConnections.cpp +++ b/src/Client/HedgedConnections.cpp @@ -15,91 +15,83 @@ HedgedConnections::HedgedConnections( const Settings & settings_, const ConnectionTimeouts & timeouts_, const ThrottlerPtr & throttler_, + PoolMode pool_mode, std::shared_ptr table_to_check_) : get_hedged_connections(pool_, &settings_, timeouts_, table_to_check_), settings(settings_), throttler(throttler_), log(&Poco::Logger::get("HedgedConnections")) { - replicas = get_hedged_connections.getConnections(); + std::vector replicas_states = get_hedged_connections.getManyConnections(pool_mode); - /// First replica may have state CANNOT_CHOOSE if setting skip_unavailable_shards is enabled - if (replicas.first_replica->isReady()) - replicas.first_replica->connection->setThrottler(throttler); - - if (!replicas.second_replica->isCannotChoose()) + for (size_t i = 0; i != replicas_states.size(); ++i) { - if (replicas.second_replica->isNotReady()) - epoll.add(get_hedged_connections.getFileDescriptor()); - - auto set_throttler = [throttler_](ReplicaStatePtr replica) - { - replica->connection->setThrottler(throttler_); - }; - second_replica_pipeline.add(std::function(set_throttler)); + replicas_states[i]->parallel_replica_offset = i; + replicas_states[i]->connection->setThrottler(throttler_); + epoll.add(replicas_states[i]->fd); + fd_to_replica[replicas_states[i]->fd] = replicas_states[i]; + replicas.push_back({std::move(replicas_states[i])}); + active_connections_count_by_offset[i] = 1; } + + pipeline_for_new_replicas.add([throttler_](ReplicaStatePtr & replica_){ replica_->connection->setThrottler(throttler_); }); } -void HedgedConnections::Pipeline::add(std::function send_function) +void HedgedConnections::Pipeline::add(std::function send_function) { pipeline.push_back(send_function); } -void HedgedConnections::Pipeline::run(ReplicaStatePtr replica) +void HedgedConnections::Pipeline::run(ReplicaStatePtr & replica) { for (auto & send_func : pipeline) send_func(replica); - - pipeline.clear(); } size_t HedgedConnections::size() const { - if (replicas.first_replica->isReady() || replicas.second_replica->isReady()) - return 1; + if (replicas.empty()) + return 0; - return 0; -} - -bool HedgedConnections::hasActiveConnections() 
const -{ - return replicas.first_replica->isReady() || replicas.second_replica->isReady(); + return 1; } void HedgedConnections::sendScalarsData(Scalars & data) { std::lock_guard lock(cancel_mutex); +// LOG_DEBUG(log, "sendScalarsData"); + if (!sent_query) throw Exception("Cannot send scalars data: query not yet sent.", ErrorCodes::LOGICAL_ERROR); - auto send_scalars_data = [&data](ReplicaStatePtr replica) { replica->connection->sendScalarsData(data); }; + auto send_scalars_data = [&data](ReplicaStatePtr & replica) { replica->connection->sendScalarsData(data); }; - if (replicas.first_replica->isReady()) - send_scalars_data(replicas.first_replica); + for (auto & replicas_with_same_offset : replicas) + for (auto & replica : replicas_with_same_offset) + if (replica->isReady()) + send_scalars_data(replica); - if (replicas.second_replica->isReady()) - send_scalars_data(replicas.second_replica); - else if (!replicas.second_replica->isCannotChoose()) - second_replica_pipeline.add(std::function(send_scalars_data)); + pipeline_for_new_replicas.add(send_scalars_data); } void HedgedConnections::sendExternalTablesData(std::vector & data) { std::lock_guard lock(cancel_mutex); +// LOG_DEBUG(log, "sendExternalTablesData"); + if (!sent_query) throw Exception("Cannot send external tables data: query not yet sent.", ErrorCodes::LOGICAL_ERROR); if (data.size() != size()) throw Exception("Mismatch between replicas and data sources", ErrorCodes::MISMATCH_REPLICAS_DATA_SOURCES); - auto send_external_tables_data = [&data](ReplicaStatePtr replica) { replica->connection->sendExternalTablesData(data[0]); }; + auto send_external_tables_data = [&data](ReplicaStatePtr & replica) { replica->connection->sendExternalTablesData(data[0]); }; - if (replicas.first_replica->isReady()) - send_external_tables_data(replicas.first_replica); + for (auto & replicas_with_same_offset : replicas) + for (auto & replica : replicas_with_same_offset) + if (replica->isReady()) + send_external_tables_data(replica); - if (replicas.second_replica->isReady()) - send_external_tables_data(replicas.second_replica); - else if (!replicas.second_replica->isCannotChoose()) - second_replica_pipeline.add(send_external_tables_data); + pipeline_for_new_replicas.add(send_external_tables_data); } void HedgedConnections::sendQuery( @@ -112,35 +104,52 @@ void HedgedConnections::sendQuery( { std::lock_guard lock(cancel_mutex); +// LOG_DEBUG(log, "sendQuery"); + if (sent_query) throw Exception("Query already sent.", ErrorCodes::LOGICAL_ERROR); - auto send_query = [this, timeouts, query, query_id, stage, client_info, with_pending_data](ReplicaStatePtr replica) + for (auto & replicas_with_same_offset : replicas) { - Settings modified_settings = settings; - if (replica->connection->getServerRevision(timeouts) < DBMS_MIN_REVISION_WITH_CURRENT_AGGREGATION_VARIANT_SELECTION_METHOD) + for (auto & replica : replicas_with_same_offset) { + if (replica->connection->getServerRevision(timeouts) < DBMS_MIN_REVISION_WITH_CURRENT_AGGREGATION_VARIANT_SELECTION_METHOD) + { + has_two_level_aggregation_incompatibility = true; + break; + } + } + if (has_two_level_aggregation_incompatibility) + break; + } + + auto send_query = [this, timeouts, query, query_id, stage, client_info, with_pending_data](ReplicaStatePtr & replica) + { + Settings modified_settings = this->settings; + + if (this->has_two_level_aggregation_incompatibility) + { + /// Disable two-level aggregation due to version incompatibility. 
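sendScalarsData() and sendExternalTablesData() above, and sendQuery() just below, each record their send step in pipeline_for_new_replicas, so a replica that gets connected later can replay everything the already-working replicas have received. A stripped-down sketch of that replay-pipeline idea (simplified types, not the actual HedgedConnections members):

#include <functional>
#include <vector>

struct Replica { /* connection state */ };

class ReplayPipeline
{
public:
    using Step = std::function<void(Replica &)>;

    /// Remember every send step (query, scalars, external tables, ...) as it happens.
    void add(Step step) { steps.push_back(std::move(step)); }

    /// Replay all recorded steps on a replica that joined after the query was sent,
    /// so it ends up in the same state as the replicas that received everything directly.
    void run(Replica & replica)
    {
        for (auto & step : steps)
            step(replica);
    }

private:
    std::vector<Step> steps;
};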
modified_settings.group_by_two_level_threshold = 0; modified_settings.group_by_two_level_threshold_bytes = 0; } + if (this->replicas.size() > 1) + { + modified_settings.parallel_replicas_count = this->replicas.size(); + modified_settings.parallel_replica_offset = replica->parallel_replica_offset; + } + replica->connection->sendQuery(timeouts, query, query_id, stage, &modified_settings, &client_info, with_pending_data); - this->epoll.add(replica->fd); - addTimeoutToReplica(TimerTypes::RECEIVE_TIMEOUT, replica, this->epoll, timeouts); + addTimeoutToReplica(TimerTypes::RECEIVE_TIMEOUT, replica, this->epoll, this->timeout_fd_to_replica, timeouts); + addTimeoutToReplica(TimerTypes::RECEIVE_DATA_TIMEOUT, replica, this->epoll, this->timeout_fd_to_replica, timeouts); }; - if (replicas.first_replica->isReady()) - { - send_query(replicas.first_replica); - if (replicas.second_replica->isEmpty()) - addTimeoutToReplica(TimerTypes::RECEIVE_DATA_TIMEOUT, replicas.first_replica, epoll, timeouts); - } - - if (replicas.second_replica->isReady()) - send_query(replicas.second_replica); - else if (!replicas.second_replica->isCannotChoose()) - second_replica_pipeline.add(send_query); + for (auto & replicas_with_same_offset : replicas) + for (auto & replica : replicas_with_same_offset) + send_query(replica); + pipeline_for_new_replicas.add(send_query); sent_query = true; } @@ -148,32 +157,41 @@ void HedgedConnections::disconnect() { std::lock_guard lock(cancel_mutex); - if (replicas.first_replica->isReady()) - { - replicas.first_replica->connection->disconnect(); - replicas.first_replica->reset(); - } +// LOG_DEBUG(log, "disconnect"); - if (replicas.second_replica->isReady()) + for (auto & replicas_with_same_offset : replicas) + for (auto & replica : replicas_with_same_offset) + if (replica->isReady()) + finishProcessReplica(replica, true); + + if (get_hedged_connections.hasEventsInProcess()) { - replicas.second_replica->connection->disconnect(); - replicas.second_replica->reset(); + get_hedged_connections.stopChoosingReplicas(); + if (next_replica_in_process) + epoll.remove(get_hedged_connections.getFileDescriptor()); } - else if (replicas.second_replica->isNotReady()) - get_hedged_connections.stopChoosingSecondReplica(); } std::string HedgedConnections::dumpAddresses() const { std::lock_guard lock(cancel_mutex); +// LOG_DEBUG(log, "dumpAddresses"); + std::string addresses = ""; + bool is_first = true; - if (replicas.first_replica->isReady()) - addresses += replicas.first_replica->connection->getDescription(); - - if (replicas.second_replica->isReady()) - addresses += "; " + replicas.second_replica->connection->getDescription(); + for (auto & replicas_with_same_offset : replicas) + { + for (auto & replica : replicas_with_same_offset) + { + if (replica->isReady()) + { + addresses += (is_first ? "" : "; ") + replica->connection->getDescription(); + is_first = false; + } + } + } return addresses; } @@ -182,14 +200,15 @@ void HedgedConnections::sendCancel() { std::lock_guard lock(cancel_mutex); +// LOG_DEBUG(log, "sendCancel"); + if (!sent_query || cancelled) throw Exception("Cannot cancel. 
Either no query sent or already cancelled.", ErrorCodes::LOGICAL_ERROR); - if (replicas.first_replica->isReady()) - replicas.first_replica->connection->sendCancel(); - - if (replicas.second_replica->isReady()) - replicas.second_replica->connection->sendCancel(); + for (auto & replicas_with_same_offset : replicas) + for (auto & replica : replicas_with_same_offset) + if (replica->isReady()) + replica->connection->sendCancel(); cancelled = true; } @@ -202,6 +221,8 @@ Packet HedgedConnections::drain() if (!cancelled) throw Exception("Cannot drain connections: cancel first.", ErrorCodes::LOGICAL_ERROR); +// LOG_DEBUG(log, "drain"); + Packet res; res.type = Protocol::Server::EndOfStream; @@ -250,23 +271,31 @@ Packet HedgedConnections::receivePacketUnlocked(AsyncCallback async_callback) Packet HedgedConnections::receivePacketImpl(AsyncCallback async_callback) { +// LOG_DEBUG(log, "sreceivePacketImpl"); + int event_fd; - ReplicaStatePtr replica; + ReplicaStatePtr replica = nullptr; Packet packet; bool finish = false; while (!finish) { - event_fd = get_hedged_connections.getReadyFileDescriptor(epoll, async_callback); + event_fd = getReadyFileDescriptor(async_callback); - if (auto timeout_descriptor = get_hedged_connections.isEventTimeout(event_fd, replica)) - processTimeoutEvent(replica, timeout_descriptor); - else if ((replica = get_hedged_connections.isEventReplica(event_fd))) + if (fd_to_replica.find(event_fd) != fd_to_replica.end()) { +// LOG_DEBUG(log, "event is replica"); + replica = fd_to_replica[event_fd]; packet = receivePacketFromReplica(replica, async_callback); finish = true; } + else if (timeout_fd_to_replica.find(event_fd) != timeout_fd_to_replica.end()) + { +// LOG_DEBUG(log, "event is timeout"); + replica = timeout_fd_to_replica[event_fd]; + processTimeoutEvent(replica, replica->active_timeouts[event_fd].get()); + } else if (event_fd == get_hedged_connections.getFileDescriptor()) - processGetHedgedConnectionsEvent(); + tryGetNewReplica(); else throw Exception("Unknown event from epoll", ErrorCodes::LOGICAL_ERROR); } @@ -274,23 +303,33 @@ Packet HedgedConnections::receivePacketImpl(AsyncCallback async_callback) return packet; }; -Packet HedgedConnections::receivePacketFromReplica(ReplicaStatePtr replica, AsyncCallback async_callback) +int HedgedConnections::getReadyFileDescriptor(AsyncCallback async_callback) { + for (auto & [fd, replica] : fd_to_replica) + if (replica->connection->hasReadPendingData()) + return replica->fd; + + return epoll.getReady(std::move(async_callback)).data.fd; +} + +Packet HedgedConnections::receivePacketFromReplica(ReplicaStatePtr & replica, AsyncCallback async_callback) +{ +// LOG_DEBUG(log, "sreceivePacketFromReplica"); Packet packet = replica->connection->receivePacket(std::move(async_callback)); switch (packet.type) { case Protocol::Server::Data: - removeTimeoutsFromReplica(replica, epoll); + removeTimeoutsFromReplica(replica, epoll, timeout_fd_to_replica); processReceiveData(replica); - addTimeoutToReplica(TimerTypes::RECEIVE_TIMEOUT, replica, epoll, get_hedged_connections.getConnectionTimeouts()); + addTimeoutToReplica(TimerTypes::RECEIVE_TIMEOUT, replica, epoll, timeout_fd_to_replica, get_hedged_connections.getConnectionTimeouts()); break; case Protocol::Server::Progress: case Protocol::Server::ProfileInfo: case Protocol::Server::Totals: case Protocol::Server::Extremes: case Protocol::Server::Log: - removeTimeoutFromReplica(TimerTypes::RECEIVE_TIMEOUT, replica, epoll); - addTimeoutToReplica(TimerTypes::RECEIVE_TIMEOUT, replica, epoll, 
get_hedged_connections.getConnectionTimeouts()); + removeTimeoutFromReplica(TimerTypes::RECEIVE_TIMEOUT, replica, epoll, timeout_fd_to_replica); + addTimeoutToReplica(TimerTypes::RECEIVE_TIMEOUT, replica, epoll, timeout_fd_to_replica, get_hedged_connections.getConnectionTimeouts()); break; case Protocol::Server::EndOfStream: @@ -306,26 +345,29 @@ Packet HedgedConnections::receivePacketFromReplica(ReplicaStatePtr replica, Asyn return packet; } -void HedgedConnections::processReceiveData(ReplicaStatePtr replica) +void HedgedConnections::processReceiveData(ReplicaStatePtr & replica) { /// When we receive first packet of data from any replica, we continue working with this replica - /// and stop working with another replica (if there is another replica). If current replica is - /// second, move it to the first place. - if (replica == replicas.second_replica) - get_hedged_connections.swapReplicas(); + /// and stop working with other replicas (if there are other replicas). - if (replicas.second_replica->isCannotChoose() || replicas.second_replica->isEmpty()) - return; +// LOG_DEBUG(log, "processReceiveData"); - if (replicas.second_replica->isNotReady()) + offsets_with_received_data.insert(replica->parallel_replica_offset); + + for (auto & other_replica : replicas[replica->parallel_replica_offset]) { - get_hedged_connections.stopChoosingSecondReplica(); - epoll.remove(get_hedged_connections.getFileDescriptor()); + if (other_replica->isReady() && other_replica != replica) + { + other_replica->connection->sendCancel(); + finishProcessReplica(other_replica, true); + } } - else if (replicas.second_replica->isReady()) + + if (get_hedged_connections.hasEventsInProcess() && offsets_with_received_data.size() == replicas.size()) { - replicas.second_replica->connection->sendCancel(); - finishProcessReplica(replicas.second_replica, true); + get_hedged_connections.stopChoosingReplicas(); + if (next_replica_in_process) + epoll.remove(get_hedged_connections.getFileDescriptor()); } } @@ -333,57 +375,73 @@ void HedgedConnections::processTimeoutEvent(ReplicaStatePtr & replica, TimerDesc { epoll.remove(timeout_descriptor->getDescriptor()); replica->active_timeouts.erase(timeout_descriptor->getDescriptor()); + timeout_fd_to_replica.erase(timeout_descriptor->getDescriptor()); if (timeout_descriptor->getType() == TimerTypes::RECEIVE_TIMEOUT) { + size_t offset = replica->parallel_replica_offset; finishProcessReplica(replica, true); - if (!replicas.first_replica->isReady() && !replicas.second_replica->isNotReady()) + /// Check if there is no active connection with same offset. 
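+        /// The timed-out replica has just been finished above, so the query can only
+        /// continue if some other connection is still serving the same offset;
+        /// if this was the last one, there is nothing left to hedge with and we fail.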
+ if (active_connections_count_by_offset[offset] == 0) throw NetException("Receive timeout expired", ErrorCodes::SOCKET_TIMEOUT); } else if (timeout_descriptor->getType() == TimerTypes::RECEIVE_DATA_TIMEOUT) { - if (!replicas.second_replica->isEmpty()) - throw Exception("Cannot start choosing second replica, it's not empty", ErrorCodes::LOGICAL_ERROR); - - get_hedged_connections.chooseSecondReplica(); - - if (replicas.second_replica->isReady()) - processChosenSecondReplica(); - else if (replicas.second_replica->isNotReady()) - epoll.add(get_hedged_connections.getFileDescriptor()); + offsets_queue.push(replica->parallel_replica_offset); + tryGetNewReplica(); } } -void HedgedConnections::processGetHedgedConnectionsEvent() +void HedgedConnections::tryGetNewReplica() { - get_hedged_connections.chooseSecondReplica(); - if (replicas.second_replica->isReady()) - processChosenSecondReplica(); +// LOG_DEBUG(log, "tryGetNewReplica"); - if (!replicas.second_replica->isNotReady()) + ReplicaStatePtr new_replica = get_hedged_connections.getNextConnection(/*non_blocking*/ true); + + /// Skip replicas with old server version if we didn't disable two-level aggregation in sendQuery. + while (new_replica->isReady() && !has_two_level_aggregation_incompatibility + && new_replica->connection->getServerRevision(get_hedged_connections.getConnectionTimeouts()) < DBMS_MIN_REVISION_WITH_CURRENT_AGGREGATION_VARIANT_SELECTION_METHOD) + new_replica = get_hedged_connections.getNextConnection(/*non_blocking*/ true); + + if (new_replica->isReady()) + { +// LOG_DEBUG(log, "processNewReadyReplica"); + new_replica->parallel_replica_offset = offsets_queue.front(); + offsets_queue.pop(); + replicas[new_replica->parallel_replica_offset].push_back(new_replica); + epoll.add(new_replica->fd); + fd_to_replica[new_replica->fd] = new_replica; + ++active_connections_count_by_offset[new_replica->parallel_replica_offset]; + pipeline_for_new_replicas.run(new_replica); + } + else if (new_replica->isNotReady() && !next_replica_in_process) + { + epoll.add(get_hedged_connections.getFileDescriptor()); + next_replica_in_process = true; + } + + if (next_replica_in_process && (new_replica->isCannotChoose() || offsets_queue.empty())) + { epoll.remove(get_hedged_connections.getFileDescriptor()); + next_replica_in_process = false; + } } -void HedgedConnections::processChosenSecondReplica() +void HedgedConnections::finishProcessReplica(ReplicaStatePtr & replica, bool disconnect) { - second_replica_pipeline.run(replicas.second_replica); +// LOG_DEBUG(log, "finishProcessReplica"); - /// In case when the first replica get receive timeout before the second is chosen, - /// we need to move the second replica to the first place - get_hedged_connections.swapReplicasIfNeeded(); -} - -void HedgedConnections::finishProcessReplica(ReplicaStatePtr replica, bool disconnect) -{ - removeTimeoutsFromReplica(replica, epoll); + removeTimeoutsFromReplica(replica, epoll, timeout_fd_to_replica); epoll.remove(replica->fd); + fd_to_replica.erase(replica->fd); + --active_connections_count_by_offset[replica->parallel_replica_offset]; + if (active_connections_count_by_offset[replica->parallel_replica_offset] == 0) + active_connections_count_by_offset.erase(replica->parallel_replica_offset); + if (disconnect) replica->connection->disconnect(); replica->reset(); - - /// Move active connection to the first replica if it exists - get_hedged_connections.swapReplicasIfNeeded(); } } diff --git a/src/Client/HedgedConnections.h b/src/Client/HedgedConnections.h index 
b84ad89658f..b6e64ac45ad 100644 --- a/src/Client/HedgedConnections.h +++ b/src/Client/HedgedConnections.h @@ -3,6 +3,7 @@ #include #include #include +#include namespace DB { @@ -17,6 +18,7 @@ public: const Settings & settings_, const ConnectionTimeouts & timeouts_, const ThrottlerPtr & throttler, + PoolMode pool_mode, std::shared_ptr table_to_check_ = nullptr); void sendScalarsData(Scalars & data) override; @@ -45,47 +47,52 @@ public: size_t size() const override; - bool hasActiveConnections() const override; + bool hasActiveConnections() const override { return !active_connections_count_by_offset.empty(); } private: class Pipeline { public: - void add(std::function send_function); + void add(std::function send_function); - void run(ReplicaStatePtr replica); + void run(ReplicaStatePtr & replica); bool empty() const { return pipeline.empty(); } private: - std::vector> pipeline; + std::vector> pipeline; }; - void processChosenSecondReplica(); - - Packet receivePacketFromReplica(ReplicaStatePtr replica, AsyncCallback async_callback = {}); + Packet receivePacketFromReplica(ReplicaStatePtr & replica, AsyncCallback async_callback = {}); Packet receivePacketImpl(AsyncCallback async_callback = {}); - void processReceiveData(ReplicaStatePtr replica); + void processReceiveData(ReplicaStatePtr & replica); void processTimeoutEvent(ReplicaStatePtr & replica, TimerDescriptorPtr timeout_descriptor); - void processGetHedgedConnectionsEvent(); + void tryGetNewReplica(); - void removeReceiveTimeout(ReplicaStatePtr replica); + void finishProcessReplica(ReplicaStatePtr & replica, bool disconnect); - void finishProcessReplica(ReplicaStatePtr replica, bool disconnect); + int getReadyFileDescriptor(AsyncCallback async_callback = {}); GetHedgedConnections get_hedged_connections; - Replicas replicas; + std::vector> replicas; + std::unordered_map fd_to_replica; + std::unordered_map timeout_fd_to_replica; + std::queue offsets_queue; Epoll epoll; const Settings & settings; ThrottlerPtr throttler; Poco::Logger * log; - Pipeline second_replica_pipeline; + Pipeline pipeline_for_new_replicas; bool sent_query = false; bool cancelled = false; + std::unordered_map active_connections_count_by_offset; + bool next_replica_in_process = false; + bool has_two_level_aggregation_incompatibility = false; + std::unordered_set offsets_with_received_data; mutable std::mutex cancel_mutex; }; diff --git a/src/DataStreams/RemoteQueryExecutor.cpp b/src/DataStreams/RemoteQueryExecutor.cpp index 49654b51199..01c31eac640 100644 --- a/src/DataStreams/RemoteQueryExecutor.cpp +++ b/src/DataStreams/RemoteQueryExecutor.cpp @@ -63,13 +63,13 @@ RemoteQueryExecutor::RemoteQueryExecutor( const Settings & current_settings = context.getSettingsRef(); auto timeouts = ConnectionTimeouts::getTCPTimeoutsWithFailover(current_settings); - if (current_settings.use_hedged_requests && current_settings.max_parallel_replicas <= 1) + if (current_settings.use_hedged_requests) { std::shared_ptr table_to_check = nullptr; if (main_table) table_to_check = std::make_shared(main_table.getQualifiedName()); - return std::make_unique(pool, current_settings, timeouts, throttler, table_to_check); + return std::make_unique(pool, current_settings, timeouts, throttler, pool_mode, table_to_check); } else { From 1f22ba4bbb384c72f6fc57538c7ebb13dacd73ca Mon Sep 17 00:00:00 2001 From: romanzhukov Date: Wed, 27 Jan 2021 12:35:08 +0300 Subject: [PATCH 0091/2357] DOCSUP-5266: fix PR and ticket comments --- .../data-types/simpleaggregatefunction.md | 6 +++++- 
.../data-types/simpleaggregatefunction.md | 21 ++++++++++++------- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/docs/en/sql-reference/data-types/simpleaggregatefunction.md b/docs/en/sql-reference/data-types/simpleaggregatefunction.md index 2d2746f85d3..015972d7dbe 100644 --- a/docs/en/sql-reference/data-types/simpleaggregatefunction.md +++ b/docs/en/sql-reference/data-types/simpleaggregatefunction.md @@ -21,7 +21,11 @@ The following aggregate functions are supported: - [`argMin`](../../sql-reference/aggregate-functions/reference/argmin.md) - [`argMax`](../../sql-reference/aggregate-functions/reference/argmax.md) -Values of the `SimpleAggregateFunction(func, Type)` look and stored the same way as `Type`, so you do not need to apply functions with `-Merge`/`-State` suffixes. `SimpleAggregateFunction` has better performance than `AggregateFunction` with same aggregation function. + +!!! note "Note" + Values of the `SimpleAggregateFunction(func, Type)` look and stored the same way as `Type`, so you do not need to apply functions with `-Merge`/`-State` suffixes. + + `SimpleAggregateFunction` has better performance than `AggregateFunction` with same aggregation function. **Parameters** diff --git a/docs/ru/sql-reference/data-types/simpleaggregatefunction.md b/docs/ru/sql-reference/data-types/simpleaggregatefunction.md index 3ff4e5fd662..84e20877866 100644 --- a/docs/ru/sql-reference/data-types/simpleaggregatefunction.md +++ b/docs/ru/sql-reference/data-types/simpleaggregatefunction.md @@ -1,8 +1,9 @@ # SimpleAggregateFunction {#data-type-simpleaggregatefunction} -`SimpleAggregateFunction(name, types_of_arguments…)` data type stores current value of the aggregate function, and does not store its full state as [`AggregateFunction`](../../sql-reference/data-types/aggregatefunction.md) does. This optimization can be applied to functions for which the following property holds: the result of applying a function `f` to a row set `S1 UNION ALL S2` can be obtained by applying `f` to parts of the row set separately, and then again applying `f` to the results: `f(S1 UNION ALL S2) = f(f(S1) UNION ALL f(S2))`. This property guarantees that partial aggregation results are enough to compute the combined one, so we don’t have to store and process any extra data. +Тип данных `SimpleAggregateFunction(name, types_of_arguments…)` хранит только текущее значение агрегатной функции и не сохраняет ее полное состояние, как это делает [`AggregateFunction`] (../../sql-reference/data-types/aggregatefunction.md). Такая оптимизация может быть применена к функциям, для которых выполняется следующее свойство: результат выполнения функции `f` к набору строк `S1 UNION ALL S2` может быть получен путем выполнения `f` к отдельным частям набора строк, +а затем повторного выполнения `f` к результатам: `f(S1 UNION ALL S2) = f(f(S1) UNION ALL f(S2))`. Это свойство гарантирует, что результатов частичной агрегации достаточно для вычисления комбинированной, поэтому нам не нужно хранить и обрабатывать какие-либо дополнительные данные. 
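+Например, для `max`: `max({1, 5} UNION ALL {3, 7}) = max(max({1, 5}), max({3, 7})) = max(5, 7) = 7`.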
-The following aggregate functions are supported: +Поддерживаются следующие агрегатные функции: - [`any`](../../sql-reference/aggregate-functions/reference/any.md#agg_function-any) - [`anyLast`](../../sql-reference/aggregate-functions/reference/anylast.md#anylastx) @@ -19,14 +20,18 @@ The following aggregate functions are supported: - [`minMap`](../../sql-reference/aggregate-functions/reference/minmap.md#agg_functions-minmap) - [`maxMap`](../../sql-reference/aggregate-functions/reference/maxmap.md#agg_functions-maxmap) -Values of the `SimpleAggregateFunction(func, Type)` look and stored the same way as `Type`, so you do not need to apply functions with `-Merge`/`-State` suffixes. `SimpleAggregateFunction` has better performance than `AggregateFunction` with same aggregation function. +!!! note "Примечание" + Значения `SimpleAggregateFunction(func, Type)` отображаются и хранятся так же, как и `Type`, поэтому вам не требуется применять функции с суффиксами `-Merge`/`-State`. + + `SimpleAggregateFunction` имеет лучшую производительность, чем `AggregateFunction` с той же агрегатной функцией. -**Parameters** -- Name of the aggregate function. -- Types of the aggregate function arguments. +**Параметры** -**Example** +- имя агрегатной функции. +- типы аргументов агрегатной функции. + +**Пример** ``` sql CREATE TABLE t @@ -36,4 +41,4 @@ CREATE TABLE t ) ENGINE = ... ``` -[Original article](https://clickhouse.tech/docs/en/data_types/simpleaggregatefunction/) +[Оригинальная статья](https://clickhouse.tech/docs/en/data_types/simpleaggregatefunction/) From 68119d78680b0e6dc181caf81eb8e7724ce8c535 Mon Sep 17 00:00:00 2001 From: romanzhukov Date: Wed, 27 Jan 2021 12:50:49 +0300 Subject: [PATCH 0092/2357] DOCSUP-5266: fix PR and ticket comments --- docs/ru/sql-reference/data-types/simpleaggregatefunction.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/data-types/simpleaggregatefunction.md b/docs/ru/sql-reference/data-types/simpleaggregatefunction.md index 84e20877866..c1b3ac240f0 100644 --- a/docs/ru/sql-reference/data-types/simpleaggregatefunction.md +++ b/docs/ru/sql-reference/data-types/simpleaggregatefunction.md @@ -1,6 +1,6 @@ # SimpleAggregateFunction {#data-type-simpleaggregatefunction} -Тип данных `SimpleAggregateFunction(name, types_of_arguments…)` хранит только текущее значение агрегатной функции и не сохраняет ее полное состояние, как это делает [`AggregateFunction`] (../../sql-reference/data-types/aggregatefunction.md). Такая оптимизация может быть применена к функциям, для которых выполняется следующее свойство: результат выполнения функции `f` к набору строк `S1 UNION ALL S2` может быть получен путем выполнения `f` к отдельным частям набора строк, +Тип данных `SimpleAggregateFunction(name, types_of_arguments…)` хранит только текущее значение агрегатной функции и не сохраняет ее полное состояние, как это делает [`AggregateFunction`](../../sql-reference/data-types/aggregatefunction.md). Такая оптимизация может быть применена к функциям, для которых выполняется следующее свойство: результат выполнения функции `f` к набору строк `S1 UNION ALL S2` может быть получен путем выполнения `f` к отдельным частям набора строк, а затем повторного выполнения `f` к результатам: `f(S1 UNION ALL S2) = f(f(S1) UNION ALL f(S2))`. Это свойство гарантирует, что результатов частичной агрегации достаточно для вычисления комбинированной, поэтому нам не нужно хранить и обрабатывать какие-либо дополнительные данные. 
Поддерживаются следующие агрегатные функции: From 6a922959826441028885069d707d10a6946ac482 Mon Sep 17 00:00:00 2001 From: Denis Glazachev Date: Wed, 27 Jan 2021 19:16:31 +0400 Subject: [PATCH 0093/2357] Remove TODOs --- src/Interpreters/ExpressionAnalyzer.cpp | 4 ---- src/Interpreters/InterpreterSelectQuery.cpp | 4 ---- 2 files changed, 8 deletions(-) diff --git a/src/Interpreters/ExpressionAnalyzer.cpp b/src/Interpreters/ExpressionAnalyzer.cpp index 2055faca820..8599f5c15f8 100644 --- a/src/Interpreters/ExpressionAnalyzer.cpp +++ b/src/Interpreters/ExpressionAnalyzer.cpp @@ -1371,8 +1371,6 @@ ExpressionAnalysisResult::ExpressionAnalysisResult( if (storage && filter_info_) { - // TODO: handle filter exactly like prewhere, store the info in PrewhereDAGInfo, collect unnecessary columns, etc.? - filter_info = filter_info_; query_analyzer.appendPreliminaryFilter(chain, filter_info->actions_dag, filter_info->column_name); } @@ -1547,8 +1545,6 @@ void ExpressionAnalysisResult::finalize(const ExpressionActionsChain & chain, si { const ExpressionActionsChain::Step & step = *chain.steps.at(next_step_i++); filter_info->do_remove_column = step.can_remove_required_output.at(0); - - // TODO: handle filter exactly like prewhere, collect columns to remove after filter? } if (hasPrewhere()) diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index 57c18f1bb86..946f22198fb 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -823,8 +823,6 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu row_level_security_step->setStepDescription("Row-level security filter (PREWHERE)"); query_plan.addStep(std::move(row_level_security_step)); - - // TODO: handle filter like prewhere, remove unnecessary columns after it, etc.? } if (expressions.prewhere_info) @@ -953,8 +951,6 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu row_level_security_step->setStepDescription("Row-level security filter"); query_plan.addStep(std::move(row_level_security_step)); - - // TODO: handle filter like prewhere, remove unnecessary columns after it, etc.? } if (expressions.before_array_join) From 9c7881f4c9dba5ce9fe241603368228fc87e9420 Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 28 Jan 2021 09:22:01 +0000 Subject: [PATCH 0094/2357] Fix --- .../AggregateFunctionFactory.cpp | 7 +++++- src/DataTypes/DataTypeFactory.cpp | 23 +++++++++++-------- src/Functions/FunctionFactory.cpp | 3 ++- ...56_test_query_log_factories_info.reference | 10 ++++---- .../01656_test_query_log_factories_info.sql | 4 +++- 5 files changed, 29 insertions(+), 18 deletions(-) diff --git a/src/AggregateFunctions/AggregateFunctionFactory.cpp b/src/AggregateFunctions/AggregateFunctionFactory.cpp index 5fc690d59f2..53fc895849b 100644 --- a/src/AggregateFunctions/AggregateFunctionFactory.cpp +++ b/src/AggregateFunctions/AggregateFunctionFactory.cpp @@ -98,6 +98,7 @@ AggregateFunctionPtr AggregateFunctionFactory::getImpl( bool has_null_arguments) const { String name = getAliasToOrName(name_param); + bool is_case_insensitive = false; Value found; /// Find by exact match. 
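    /// Remember whether the lookup goes through the case-insensitive map, so that
    /// query_log can record the canonical lowercase name instead of the user's spelling.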
@@ -107,7 +108,10 @@ AggregateFunctionPtr AggregateFunctionFactory::getImpl( } if (auto jt = case_insensitive_aggregate_functions.find(Poco::toLower(name)); jt != case_insensitive_aggregate_functions.end()) + { found = jt->second; + is_case_insensitive = true; + } const Context * query_context = nullptr; if (CurrentThread::isInitialized()) @@ -118,7 +122,8 @@ AggregateFunctionPtr AggregateFunctionFactory::getImpl( out_properties = found.properties; if (query_context && query_context->getSettingsRef().log_queries) - query_context->addQueryFactoriesInfo(Context::QueryLogFactories::AggregateFunction, name); + query_context->addQueryFactoriesInfo( + Context::QueryLogFactories::AggregateFunction, is_case_insensitive ? Poco::toLower(name) : name); /// The case when aggregate function should return NULL on NULL arguments. This case is handled in "get" method. if (!out_properties.returns_default_when_only_null && has_null_arguments) diff --git a/src/DataTypes/DataTypeFactory.cpp b/src/DataTypes/DataTypeFactory.cpp index 2f100202ee9..1bc2a307915 100644 --- a/src/DataTypes/DataTypeFactory.cpp +++ b/src/DataTypes/DataTypeFactory.cpp @@ -78,16 +78,7 @@ DataTypePtr DataTypeFactory::get(const String & family_name_param, const ASTPtr return get("LowCardinality", low_cardinality_params); } - DataTypePtr res = findCreatorByName(family_name)(parameters); - - if (CurrentThread::isInitialized()) - { - const auto * query_context = CurrentThread::get().getQueryContext(); - if (query_context && query_context->getSettingsRef().log_queries) - query_context->addQueryFactoriesInfo(Context::QueryLogFactories::DataType, family_name); - } - - return res; + return findCreatorByName(family_name)(parameters); } DataTypePtr DataTypeFactory::getCustom(DataTypeCustomDescPtr customization) const @@ -159,10 +150,18 @@ void DataTypeFactory::registerSimpleDataTypeCustom(const String &name, SimpleCre const DataTypeFactory::Value & DataTypeFactory::findCreatorByName(const String & family_name) const { + const Context * query_context = nullptr; + if (CurrentThread::isInitialized()) + query_context = CurrentThread::get().getQueryContext(); + { DataTypesDictionary::const_iterator it = data_types.find(family_name); if (data_types.end() != it) + { + if (query_context && query_context->getSettingsRef().log_queries) + query_context->addQueryFactoriesInfo(Context::QueryLogFactories::DataType, family_name); return it->second; + } } String family_name_lowercase = Poco::toLower(family_name); @@ -170,7 +169,11 @@ const DataTypeFactory::Value & DataTypeFactory::findCreatorByName(const String & { DataTypesDictionary::const_iterator it = case_insensitive_data_types.find(family_name_lowercase); if (case_insensitive_data_types.end() != it) + { + if (query_context && query_context->getSettingsRef().log_queries) + query_context->addQueryFactoriesInfo(Context::QueryLogFactories::DataType, family_name_lowercase); return it->second; + } } auto hints = this->getHints(family_name); diff --git a/src/Functions/FunctionFactory.cpp b/src/Functions/FunctionFactory.cpp index 768f1cfe487..e98cb543df6 100644 --- a/src/Functions/FunctionFactory.cpp +++ b/src/Functions/FunctionFactory.cpp @@ -92,7 +92,8 @@ FunctionOverloadResolverImplPtr FunctionFactory::tryGetImpl( res = it->second(context); else { - it = case_insensitive_functions.find(Poco::toLower(name)); + name = Poco::toLower(name); + it = case_insensitive_functions.find(name); if (case_insensitive_functions.end() != it) res = it->second(context); } diff --git 
a/tests/queries/0_stateless/01656_test_query_log_factories_info.reference b/tests/queries/0_stateless/01656_test_query_log_factories_info.reference index 3c93cd9ec26..77486e99ea5 100644 --- a/tests/queries/0_stateless/01656_test_query_log_factories_info.reference +++ b/tests/queries/0_stateless/01656_test_query_log_factories_info.reference @@ -1,8 +1,8 @@ -2 worl [123,1,1] 49 \N 50 4950 Nullable(UInt64) 50 -2 worl [123,1,1] 49 \N 50 4950 Nullable(UInt64) 50 +2 worl 1 0.7615946626193841 0 4950 99 [123,1,1] 49 \N 50 4950 Nullable(UInt64) 50 +2 worl 1 0.7615946626193841 0 4950 99 [123,1,1] 49 \N 50 4950 Nullable(UInt64) 50 arraySort(used_aggregate_functions) -['avg','count','groupBitAnd','sum','uniq'] +['avg','count','groupBitAnd','max','sum','uniq'] arraySort(used_aggregate_function_combinators) ['Array','If','OrDefault','OrNull'] @@ -11,7 +11,7 @@ arraySort(used_table_functions) ['numbers'] arraySort(used_functions) -['addDays','array','arrayFlatten','cast','modulo','plus','substring','toDate','toDayOfYear','toTypeName','toWeek'] +['addDays','array','arrayFlatten','cast','crc32','modulo','plus','pow','substring','tanh','toDate','toDayOfYear','toTypeName','toWeek'] arraySort(used_data_type_families) ['Array','Int32','Nullable','String'] @@ -20,5 +20,5 @@ used_database_engines ['Atomic'] arraySort(used_data_type_families) used_storages -['DateTime','Int64'] ['Memory'] +['Int64','datetime'] ['Memory'] diff --git a/tests/queries/0_stateless/01656_test_query_log_factories_info.sql b/tests/queries/0_stateless/01656_test_query_log_factories_info.sql index aa9bdd42a71..0856681e9c5 100644 --- a/tests/queries/0_stateless/01656_test_query_log_factories_info.sql +++ b/tests/queries/0_stateless/01656_test_query_log_factories_info.sql @@ -1,5 +1,7 @@ SELECT uniqArray([1, 1, 2]), SUBSTRING('Hello, world', 7, 5), + POW(1, 2), TANh(1), CrC32(''), + SUM(number), MAX(number), flatten([[[BIT_AND(123)]], [[mod(3, 2)], [CAST('1' AS INTEGER)]]]), week(toDate('2000-12-05')), CAST(arrayJoin([NULL, NULL]) AS Nullable(TEXT)), @@ -47,7 +49,7 @@ WHERE current_database = currentDatabase() AND type == 'QueryFinish' AND (query ORDER BY query_start_time DESC LIMIT 1 FORMAT TabSeparatedWithNames; SELECT ''; -CREATE OR REPLACE TABLE test_query_log_factories_info1.memory_table (id BIGINT, date DateTime) ENGINE=Memory(); +CREATE OR REPLACE TABLE test_query_log_factories_info1.memory_table (id BIGINT, date DATETIME) ENGINE=Memory(); SYSTEM FLUSH LOGS; SELECT arraySort(used_data_type_families), used_storages From 65c061de4978f83c048cfd4c0292a81510ae7bfb Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 28 Jan 2021 13:28:11 +0000 Subject: [PATCH 0095/2357] FFix --- .../01656_test_query_log_factories_info.reference | 6 +++--- .../0_stateless/01656_test_query_log_factories_info.sql | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/queries/0_stateless/01656_test_query_log_factories_info.reference b/tests/queries/0_stateless/01656_test_query_log_factories_info.reference index 77486e99ea5..e12ee221a7b 100644 --- a/tests/queries/0_stateless/01656_test_query_log_factories_info.reference +++ b/tests/queries/0_stateless/01656_test_query_log_factories_info.reference @@ -1,5 +1,5 @@ -2 worl 1 0.7615946626193841 0 4950 99 [123,1,1] 49 \N 50 4950 Nullable(UInt64) 50 -2 worl 1 0.7615946626193841 0 4950 99 [123,1,1] 49 \N 50 4950 Nullable(UInt64) 50 +2 worl 1 1 0 4950 99 [123,1,1] 49 \N 50 4950 Nullable(UInt64) 50 +2 worl 1 1 0 4950 99 [123,1,1] 49 \N 50 4950 Nullable(UInt64) 50 arraySort(used_aggregate_functions) 
['avg','count','groupBitAnd','max','sum','uniq'] @@ -11,7 +11,7 @@ arraySort(used_table_functions) ['numbers'] arraySort(used_functions) -['addDays','array','arrayFlatten','cast','crc32','modulo','plus','pow','substring','tanh','toDate','toDayOfYear','toTypeName','toWeek'] +['addDays','array','arrayFlatten','cast','crc32','modulo','plus','pow','round','substring','tanh','toDate','toDayOfYear','toTypeName','toWeek'] arraySort(used_data_type_families) ['Array','Int32','Nullable','String'] diff --git a/tests/queries/0_stateless/01656_test_query_log_factories_info.sql b/tests/queries/0_stateless/01656_test_query_log_factories_info.sql index 0856681e9c5..b584f2c38c8 100644 --- a/tests/queries/0_stateless/01656_test_query_log_factories_info.sql +++ b/tests/queries/0_stateless/01656_test_query_log_factories_info.sql @@ -1,6 +1,6 @@ SELECT uniqArray([1, 1, 2]), SUBSTRING('Hello, world', 7, 5), - POW(1, 2), TANh(1), CrC32(''), + POW(1, 2), ROUND(TANh(1)), CrC32(''), SUM(number), MAX(number), flatten([[[BIT_AND(123)]], [[mod(3, 2)], [CAST('1' AS INTEGER)]]]), week(toDate('2000-12-05')), From 52e5c0aad748b6ee55a97380abddf0ceb12aa864 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Thu, 28 Jan 2021 16:48:17 +0300 Subject: [PATCH 0096/2357] fix thread status --- src/Common/CurrentThread.h | 7 +++--- src/Common/ThreadStatus.cpp | 3 +++ src/Common/ThreadStatus.h | 2 +- src/Interpreters/DDLWorker.cpp | 24 +++++---------------- src/Interpreters/DDLWorker.h | 3 --- src/Interpreters/InterpreterCreateQuery.cpp | 3 ++- src/Interpreters/ThreadStatusExt.cpp | 2 ++ src/Interpreters/executeQuery.cpp | 9 ++------ src/Server/MySQLHandler.cpp | 6 +++++- src/Server/PostgreSQLHandler.cpp | 7 +++++- src/Storages/StorageReplicatedMergeTree.cpp | 2 +- 11 files changed, 31 insertions(+), 37 deletions(-) diff --git a/src/Common/CurrentThread.h b/src/Common/CurrentThread.h index 876cbd8a66b..7ab57ea7fab 100644 --- a/src/Common/CurrentThread.h +++ b/src/Common/CurrentThread.h @@ -63,9 +63,6 @@ public: /// Call from master thread as soon as possible (e.g. when thread accepted connection) static void initializeQuery(); - /// Sets query_context for current thread group - static void attachQueryContext(Context & query_context); - /// You must call one of these methods when create a query child thread: /// Add current thread to a group associated with the thread group static void attachTo(const ThreadGroupStatusPtr & thread_group); @@ -99,6 +96,10 @@ public: private: static void defaultThreadDeleter(); + + /// Sets query_context for current thread group + /// Can by used only through QueryScope + static void attachQueryContext(Context & query_context); }; } diff --git a/src/Common/ThreadStatus.cpp b/src/Common/ThreadStatus.cpp index 5105fff03b2..f2256fbf192 100644 --- a/src/Common/ThreadStatus.cpp +++ b/src/Common/ThreadStatus.cpp @@ -99,6 +99,9 @@ ThreadStatus::~ThreadStatus() /// We've already allocated a little bit more than the limit and cannot track it in the thread memory tracker or its parent. 
} + /// It may cause segfault if query_context was destroyed, but was not detached + assert((!query_context && query_id.empty()) || (query_id == query_context->getCurrentQueryId())); + if (deleter) deleter(); current_thread = nullptr; diff --git a/src/Common/ThreadStatus.h b/src/Common/ThreadStatus.h index 1be1f2cd4df..dc5f09c5f3d 100644 --- a/src/Common/ThreadStatus.h +++ b/src/Common/ThreadStatus.h @@ -201,7 +201,7 @@ public: void setFatalErrorCallback(std::function callback); void onFatalError(); - /// Sets query context for current thread and its thread group + /// Sets query context for current master thread and its thread group /// NOTE: query_context have to be alive until detachQuery() is called void attachQueryContext(Context & query_context); diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index cb38c733582..83412ab8fb7 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -202,11 +202,12 @@ void DDLWorker::shutdown() queue_updated_event->set(); cleanup_event->set(); - worker_pool.reset(); if (main_thread.joinable()) main_thread.join(); if (cleanup_thread.joinable()) cleanup_thread.join(); + + worker_pool.reset(); } DDLWorker::~DDLWorker() @@ -355,8 +356,6 @@ void DDLWorker::scheduleTasks() if (!task) { LOG_DEBUG(log, "Will not execute task {}: {}", entry_name, reason); - //task->was_executed = true; - //saveTask(std::move(task)); continue; } @@ -379,7 +378,7 @@ void DDLWorker::scheduleTasks() DDLTaskBase & DDLWorker::saveTask(DDLTaskPtr && task) { - std::remove_if(current_tasks.begin(), current_tasks.end(), [](const DDLTaskPtr & t) { return t->completely_processed.load(); }); + current_tasks.remove_if([](const DDLTaskPtr & t) { return t->completely_processed.load(); }); assert(current_tasks.size() <= pool_size); current_tasks.emplace_back(std::move(task)); return *current_tasks.back(); @@ -394,10 +393,12 @@ bool DDLWorker::tryExecuteQuery(const String & query, DDLTaskBase & task) ReadBufferFromString istr(query_to_execute); String dummy_string; WriteBufferFromString ostr(dummy_string); + std::optional query_scope; try { auto query_context = task.makeQueryContext(context); + query_scope.emplace(*query_context); executeQuery(istr, ostr, false, *query_context, {}); } catch (const DB::Exception & e) @@ -433,20 +434,6 @@ bool DDLWorker::tryExecuteQuery(const String & query, DDLTaskBase & task) return true; } -void DDLWorker::attachToThreadGroup() -{ - if (thread_group) - { - /// Put all threads to one thread pool - CurrentThread::attachToIfDetached(thread_group); - } - else - { - CurrentThread::initializeQuery(); - thread_group = CurrentThread::getGroup(); - } -} - void DDLWorker::processTask(DDLTaskBase & task) { auto zookeeper = tryGetZooKeeper(); @@ -909,7 +896,6 @@ void DDLWorker::runMainThread() }; setThreadName("DDLWorker"); - attachToThreadGroup(); LOG_DEBUG(log, "Starting DDLWorker thread"); while (!stop_flag) diff --git a/src/Interpreters/DDLWorker.h b/src/Interpreters/DDLWorker.h index c0194c4f252..1b7ebfb5796 100644 --- a/src/Interpreters/DDLWorker.h +++ b/src/Interpreters/DDLWorker.h @@ -102,8 +102,6 @@ protected: void runMainThread(); void runCleanupThread(); - void attachToThreadGroup(); - protected: Context context; Poco::Logger * log; @@ -138,7 +136,6 @@ protected: /// How many tasks could be in the queue size_t max_tasks_in_queue = 1000; - ThreadGroupStatusPtr thread_group; std::atomic max_id = 0; }; diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 
b66af77930c..5292ef57d7a 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -929,7 +929,8 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, drop_ast->table = create.table; drop_ast->no_ddl_lock = true; - InterpreterDropQuery interpreter(drop_ast, context); + Context drop_context = context; + InterpreterDropQuery interpreter(drop_ast, drop_context); interpreter.execute(); } else diff --git a/src/Interpreters/ThreadStatusExt.cpp b/src/Interpreters/ThreadStatusExt.cpp index 61322cabfb3..8a979721290 100644 --- a/src/Interpreters/ThreadStatusExt.cpp +++ b/src/Interpreters/ThreadStatusExt.cpp @@ -500,6 +500,8 @@ CurrentThread::QueryScope::QueryScope(Context & query_context) { CurrentThread::initializeQuery(); CurrentThread::attachQueryContext(query_context); + if (!query_context.hasQueryContext()) + query_context.makeQueryContext(); } void CurrentThread::QueryScope::logPeakMemoryUsage() diff --git a/src/Interpreters/executeQuery.cpp b/src/Interpreters/executeQuery.cpp index 7003e6f5ee9..770e6e65d24 100644 --- a/src/Interpreters/executeQuery.cpp +++ b/src/Interpreters/executeQuery.cpp @@ -326,13 +326,8 @@ static std::tuple executeQueryImpl( { const auto current_time = std::chrono::system_clock::now(); - /// If we already executing query and it requires to execute internal query, than - /// don't replace thread context with given (it can be temporary). Otherwise, attach context to thread. - if (!internal) - { - context.makeQueryContext(); - CurrentThread::attachQueryContext(context); - } + assert(internal || CurrentThread::get().getQueryContext()); + assert(internal || CurrentThread::get().getQueryContext()->getCurrentQueryId() == CurrentThread::getQueryId()); const Settings & settings = context.getSettingsRef(); diff --git a/src/Server/MySQLHandler.cpp b/src/Server/MySQLHandler.cpp index 63a48fde1a7..f660d97cdc6 100644 --- a/src/Server/MySQLHandler.cpp +++ b/src/Server/MySQLHandler.cpp @@ -24,6 +24,7 @@ #include #include #include +#include #if !defined(ARCADIA_BUILD) # include @@ -86,6 +87,8 @@ MySQLHandler::MySQLHandler(IServer & server_, const Poco::Net::StreamSocket & so void MySQLHandler::run() { + setThreadName("MySQLHandler"); + ThreadStatus thread_status; connection_context.makeSessionContext(); connection_context.getClientInfo().interface = ClientInfo::Interface::MYSQL; connection_context.setDefaultFormat("MySQLWire"); @@ -339,8 +342,9 @@ void MySQLHandler::comQuery(ReadBuffer & payload) affected_rows += progress.written_rows; }); + CurrentThread::QueryScope query_scope{query_context}; - executeQuery(should_replace ? replacement : payload, *out, true, query_context, + executeQuery(should_replace ? 
replacement : payload, *out, false, query_context, [&with_output](const String &, const String &, const String &, const String &) { with_output = true; diff --git a/src/Server/PostgreSQLHandler.cpp b/src/Server/PostgreSQLHandler.cpp index 2bce5abcd11..b3a3bbf2aaa 100644 --- a/src/Server/PostgreSQLHandler.cpp +++ b/src/Server/PostgreSQLHandler.cpp @@ -5,6 +5,7 @@ #include #include "PostgreSQLHandler.h" #include +#include #include #if !defined(ARCADIA_BUILD) @@ -49,6 +50,8 @@ void PostgreSQLHandler::changeIO(Poco::Net::StreamSocket & socket) void PostgreSQLHandler::run() { + setThreadName("PostgresHandler"); + ThreadStatus thread_status; connection_context.makeSessionContext(); connection_context.getClientInfo().interface = ClientInfo::Interface::POSTGRESQL; connection_context.setDefaultFormat("PostgreSQLWire"); @@ -273,8 +276,10 @@ void PostgreSQLHandler::processQuery() for (const auto & spl_query : queries) { + /// FIXME why do we execute all queries in a single connection context? + CurrentThread::QueryScope query_scope{connection_context}; ReadBufferFromString read_buf(spl_query); - executeQuery(read_buf, *out, true, connection_context, {}); + executeQuery(read_buf, *out, false, connection_context, {}); PostgreSQLProtocol::Messaging::CommandComplete::Command command = PostgreSQLProtocol::Messaging::CommandComplete::classifyQuery(spl_query); diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 30b08cdea1e..951ce63944b 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -3682,7 +3682,7 @@ void StorageReplicatedMergeTree::shutdown() /// We clear all old parts after stopping all background operations. It's /// important, because background operations can produce temporary parts - /// which will remove themselves in their descrutors. If so, we may have + /// which will remove themselves in their destrutors. If so, we may have /// race condition between our remove call and background process. 
clearOldPartsFromFilesystem(true); } From a57456a3fd21829d22635df01404f7383ece545d Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Thu, 28 Jan 2021 22:02:39 +0300 Subject: [PATCH 0097/2357] fix --- src/Interpreters/DDLTask.h | 1 + src/Interpreters/DDLWorker.cpp | 6 ++++++ src/Interpreters/InterpreterCreateQuery.cpp | 6 +++++- 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/src/Interpreters/DDLTask.h b/src/Interpreters/DDLTask.h index a12676ab8a3..5b50413b975 100644 --- a/src/Interpreters/DDLTask.h +++ b/src/Interpreters/DDLTask.h @@ -158,6 +158,7 @@ struct MetadataTransaction void addOps(Coordination::Requests & other_ops) { std::move(ops.begin(), ops.end(), std::back_inserter(other_ops)); + ops.clear(); } void commit(); diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index 83412ab8fb7..7b9d3ef8f5b 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -400,6 +400,12 @@ bool DDLWorker::tryExecuteQuery(const String & query, DDLTaskBase & task) auto query_context = task.makeQueryContext(context); query_scope.emplace(*query_context); executeQuery(istr, ostr, false, *query_context, {}); + + if (auto txn = query_context->getMetadataTransaction()) + { + if (txn->state == MetadataTransaction::CREATED) + txn->commit(); + } } catch (const DB::Exception & e) { diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 5292ef57d7a..926737ef888 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -800,11 +800,11 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create) String current_database = context.getCurrentDatabase(); auto database_name = create.database.empty() ? 
current_database : create.database; - auto database = DatabaseCatalog::instance().getDatabase(database_name); // If this is a stub ATTACH query, read the query definition from the database if (create.attach && !create.storage && !create.columns_list) { + auto database = DatabaseCatalog::instance().getDatabase(database_name); bool if_not_exists = create.if_not_exists; // Table SQL definition is available even if the table is detached (even permanently) @@ -869,7 +869,11 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create) } //TODO make code better if possible + DatabasePtr database; bool need_add_to_database = !create.temporary; + if (need_add_to_database) + database = DatabaseCatalog::instance().getDatabase(database_name); + if (need_add_to_database && database->getEngineName() == "Replicated") { auto guard = DatabaseCatalog::instance().getDDLGuard(create.database, create.table); From 2d0f742fdab2504402432580fda1b1f182aee4c7 Mon Sep 17 00:00:00 2001 From: Evgeniia Sudarikova Date: Thu, 28 Jan 2021 23:16:29 +0300 Subject: [PATCH 0098/2357] edited EN docs --- .../example-datasets/brown-benchmark.md | 6 +- .../functions/array-functions.md | 105 +++++++++++++++++- .../en/sql-reference/table-functions/mysql.md | 2 +- 3 files changed, 104 insertions(+), 9 deletions(-) diff --git a/docs/en/getting-started/example-datasets/brown-benchmark.md b/docs/en/getting-started/example-datasets/brown-benchmark.md index b5ca23eddb9..effae6d5adb 100644 --- a/docs/en/getting-started/example-datasets/brown-benchmark.md +++ b/docs/en/getting-started/example-datasets/brown-benchmark.md @@ -5,7 +5,7 @@ toc_title: Brown University Benchmark # Brown University Benchmark -MgBench - A new analytical benchmark for machine-generated log data, [Andrew Crotty](http://cs.brown.edu/people/acrotty/). +`MgBench` is a new analytical benchmark for machine-generated log data, [Andrew Crotty](http://cs.brown.edu/people/acrotty/). Download the data: ``` @@ -153,7 +153,7 @@ ORDER BY dt, hr; --- Q1.4: Over a 1-month period, how often was each server blocked on disk I/O? +-- Q1.4: Over 1 month, how often was each server blocked on disk I/O? SELECT machine_name, COUNT(*) AS spikes @@ -301,7 +301,7 @@ WHERE event_type = 'temperature' AND log_time >= '2019-11-29 17:00:00.000'; --- Q3.4: Over the past 6 months, how frequently was each door opened? +-- Q3.4: Over the past 6 months, how frequently were each door opened? SELECT device_name, device_floor, diff --git a/docs/en/sql-reference/functions/array-functions.md b/docs/en/sql-reference/functions/array-functions.md index dc7727bdfd8..48c5176f0e1 100644 --- a/docs/en/sql-reference/functions/array-functions.md +++ b/docs/en/sql-reference/functions/array-functions.md @@ -1294,15 +1294,47 @@ Returns the min of the `func` values. If the function is omitted, it just return Note that the `arrayMin` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You can pass a lambda function to it as the first argument. -Examples: -```sql +**Syntax** + +``` sql +arrayMin(arr) +``` + +**Returned value** + +- A number. + +Type: [Int](../../sql-reference/data-types/int-uint.md) or [Float](../../sql-reference/data-types/float.md). + +**Parameters** + +- `arr` — [Array](../../sql-reference/data-types/array.md). 
+ +**Examples** + +Query: + +``` sql SELECT arrayMin([1, 2, 4]) AS res +``` + +Result: + +``` text ┌─res─┐ │ 1 │ └─────┘ +``` +Query: +``` sql SELECT arrayMin(x -> (-x), [1, 2, 4]) AS res +``` + +Result: + +``` text ┌─res─┐ │ -4 │ └─────┘ @@ -1314,15 +1346,47 @@ Returns the max of the `func` values. If the function is omitted, it just return Note that the `arrayMax` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You can pass a lambda function to it as the first argument. -Examples: +**Syntax** + +``` sql +arrayMax(arr) +``` + +**Returned value** + +- A number. + +Type: [Int](../../sql-reference/data-types/int-uint.md) or [Float](../../sql-reference/data-types/float.md). + +**Parameters** + +- `arr` — [Array](../../sql-reference/data-types/array.md). + +**Examples** + +Query: + ```sql SELECT arrayMax([1, 2, 4]) AS res +``` + +Result: + +``` text ┌─res─┐ │ 4 │ └─────┘ +``` +Query: +``` sql SELECT arrayMax(x -> (-x), [1, 2, 4]) AS res +``` + +Result: + +``` text ┌─res─┐ │ -1 │ └─────┘ @@ -1334,21 +1398,52 @@ Returns the sum of the `func` values. If the function is omitted, it just return Note that the `arraySum` is a [higher-order function](../../sql-reference/functions/index.md#higher-order-functions). You can pass a lambda function to it as the first argument. -Examples: +**Syntax** + +``` sql +arraySum(arr) +``` + +**Returned value** + +- A number. + +Type: [Int](../../sql-reference/data-types/int-uint.md) or [Float](../../sql-reference/data-types/float.md). + +**Parameters** + +- `arr` — [Array](../../sql-reference/data-types/array.md). + +**Examples** + +Query: + ```sql SELECT arraySum([2,3]) AS res +``` + +Result: + +``` text ┌─res─┐ │ 5 │ └─────┘ +``` +Query: +``` sql SELECT arraySum(x -> x*x, [2, 3]) AS res +``` + +Result: + +``` text ┌─res─┐ │ 13 │ └─────┘ ``` - ## arrayAvg(\[func,\] arr1, …) {#array-avg} Returns the average of the `func` values. If the function is omitted, it just returns the average of the array elements. diff --git a/docs/en/sql-reference/table-functions/mysql.md b/docs/en/sql-reference/table-functions/mysql.md index eec4a1d0c46..3126f635817 100644 --- a/docs/en/sql-reference/table-functions/mysql.md +++ b/docs/en/sql-reference/table-functions/mysql.md @@ -44,7 +44,7 @@ The rest of the conditions and the `LIMIT` sampling constraint are executed in C A table object with the same columns as the original MySQL table. !!! info "Note" - In the `INSERT` query to distinguish table function `mysql(...)` from table name with column names list you must use keywords `FUNCTION` or `TABLE FUNCTION`. See examples below. + In the `INSERT` query to distinguish table function `mysql(...)` from table name with column names list, you must use keywords `FUNCTION` or `TABLE FUNCTION`. See examples below. 
**Examples** From 45cb78a67b1ba39fe874817e523a7964751fb7cc Mon Sep 17 00:00:00 2001 From: feng lv Date: Fri, 29 Jan 2021 08:14:34 +0000 Subject: [PATCH 0099/2357] continue of #19487 fix --- src/Interpreters/TreeRewriter.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/Interpreters/TreeRewriter.cpp b/src/Interpreters/TreeRewriter.cpp index ce4103e97ec..a1d1605afd5 100644 --- a/src/Interpreters/TreeRewriter.cpp +++ b/src/Interpreters/TreeRewriter.cpp @@ -693,18 +693,17 @@ void TreeRewriterResult::collectUsedColumns(const ASTPtr & query, bool is_select if (storage) { - String hint_name{}; + std::vector hint_name{}; for (const auto & name : columns_context.requiredColumns()) { auto hints = storage->getHints(name); - if (!hints.empty()) - hint_name = hint_name + " '" + toString(hints) + "'"; + hint_name.insert(hint_name.end(), hints.begin(), hints.end()); } if (!hint_name.empty()) { ss << ", maybe you meant: "; - ss << hint_name; + ss << toString(hint_name); } } else From 4929fe2063f368e48bb53fde011487303426d460 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Fri, 29 Jan 2021 18:13:09 +0300 Subject: [PATCH 0100/2357] Update MergeTreeRangeReader --- .../MergeTree/MergeTreeRangeReader.cpp | 57 +++++++++++-------- src/Storages/MergeTree/MergeTreeRangeReader.h | 2 +- 2 files changed, 35 insertions(+), 24 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeRangeReader.cpp b/src/Storages/MergeTree/MergeTreeRangeReader.cpp index 0b3765adc6a..afbc28e6883 100644 --- a/src/Storages/MergeTree/MergeTreeRangeReader.cpp +++ b/src/Storages/MergeTree/MergeTreeRangeReader.cpp @@ -284,6 +284,13 @@ void MergeTreeRangeReader::ReadResult::shrink(Columns & old_columns) { if (!column) continue; + + if (const auto * column_const = typeid_cast(column.get())) + { + column = column_const->cloneResized(total_rows_per_granule); + continue; + } + auto new_column = column->cloneEmpty(); new_column->reserve(total_rows_per_granule); for (size_t j = 0, pos = 0; j < rows_per_granule_original.size(); pos += rows_per_granule_original[j++]) @@ -754,13 +761,7 @@ MergeTreeRangeReader::ReadResult MergeTreeRangeReader::read(size_t max_rows, Mar if (read_result.num_rows == 0) return read_result; - if (prewhere_info_list) - { - for (const auto & prewhere_info : *prewhere_info_list) - { - executePrewhereActionsAndFilterColumns(read_result, prewhere_info); - } - } + executePrewhereActionsAndFilterColumns(read_result); return read_result; } @@ -857,8 +858,11 @@ Columns MergeTreeRangeReader::continueReadingChain(ReadResult & result, size_t & return columns; } -void MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(ReadResult & result, const PrewhereInfo & prewhere_info) +void MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(ReadResult & result) { + if (prewhere_info_list->empty()) + return; + const auto & header = merge_tree_reader->getColumns(); size_t num_columns = header.size(); @@ -867,7 +871,6 @@ void MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(ReadResult & r "Expected " + toString(num_columns) + ", " "got " + toString(result.columns.size()), ErrorCodes::LOGICAL_ERROR); - ColumnPtr filter; size_t prewhere_column_pos; { @@ -887,25 +890,33 @@ void MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(ReadResult & r for (auto name_and_type = header.begin(); pos < num_columns; ++pos, ++name_and_type) block.insert({result.columns[pos], name_and_type->type, name_and_type->name}); - if (prewhere_info.alias_actions) - 
prewhere_info.alias_actions->execute(block); + for (size_t i = 0; i < prewhere_info_list->size(); ++i) + { + const auto & prewhere_info = (*prewhere_info_list)[i]; - /// Columns might be projected out. We need to store them here so that default columns can be evaluated later. - result.block_before_prewhere = block; - prewhere_info.prewhere_actions->execute(block); + if (prewhere_info.alias_actions) + prewhere_info.alias_actions->execute(block); - prewhere_column_pos = block.getPositionByName(prewhere_info.prewhere_column_name); + /// Columns might be projected out. We need to store them here so that default columns can be evaluated later. + result.block_before_prewhere = block; + prewhere_info.prewhere_actions->execute(block); + prewhere_column_pos = block.getPositionByName(prewhere_info.prewhere_column_name); + result.addFilter(block.getByPosition(prewhere_column_pos).column); + + if (i + 1 != prewhere_info_list->size() && prewhere_info.remove_prewhere_column) + block.erase(prewhere_column_pos); + else + block.getByPosition(prewhere_column_pos).column = block.getByPosition(prewhere_column_pos).type->createColumnConst(result.num_rows, 1); + } + + block.getByPosition(prewhere_column_pos).column = nullptr; result.columns.clear(); result.columns.reserve(block.columns()); for (auto & col : block) result.columns.emplace_back(std::move(col.column)); - - filter.swap(result.columns[prewhere_column_pos]); } - result.addFilter(filter); - /// If there is a WHERE, we filter in there, and only optimize IO and shrink columns here if (!last_reader_in_chain) result.optimize(merge_tree_reader->canReadIncompleteGranules()); @@ -914,7 +925,7 @@ void MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(ReadResult & r if (result.totalRowsPerGranule() == 0) result.setFilterConstFalse(); /// If we need to filter in PREWHERE - else if (prewhere_info.need_filter || result.need_filter) + else if (prewhere_info_list->back().need_filter || result.need_filter) { /// If there is a filter and without optimized if (result.getFilter() && last_reader_in_chain) @@ -955,11 +966,11 @@ void MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(ReadResult & r /// Check if the PREWHERE column is needed if (!result.columns.empty()) { - if (prewhere_info.remove_prewhere_column) + if (prewhere_info_list->back().remove_prewhere_column) result.columns.erase(result.columns.begin() + prewhere_column_pos); else result.columns[prewhere_column_pos] = - getSampleBlock().getByName(prewhere_info.prewhere_column_name).type-> + getSampleBlock().getByName(prewhere_info_list->back().prewhere_column_name).type-> createColumnConst(result.num_rows, 1u)->convertToFullColumnIfConst(); } } @@ -967,7 +978,7 @@ void MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(ReadResult & r else { result.columns[prewhere_column_pos] = result.getFilterHolder()->convertToFullColumnIfConst(); - if (getSampleBlock().getByName(prewhere_info.prewhere_column_name).type->isNullable()) + if (getSampleBlock().getByName(prewhere_info_list->back().prewhere_column_name).type->isNullable()) result.columns[prewhere_column_pos] = makeNullable(std::move(result.columns[prewhere_column_pos])); result.clearFilter(); // Acting as a flag to not filter in PREWHERE } diff --git a/src/Storages/MergeTree/MergeTreeRangeReader.h b/src/Storages/MergeTree/MergeTreeRangeReader.h index 8f8482d1abf..6ee7c9f3e29 100644 --- a/src/Storages/MergeTree/MergeTreeRangeReader.h +++ b/src/Storages/MergeTree/MergeTreeRangeReader.h @@ -213,7 +213,7 @@ private: ReadResult 
startReadingChain(size_t max_rows, MarkRanges & ranges); Columns continueReadingChain(ReadResult & result, size_t & num_rows); - void executePrewhereActionsAndFilterColumns(ReadResult & result, const PrewhereInfo & prewhere_info); + void executePrewhereActionsAndFilterColumns(ReadResult & result); IMergeTreeReader * merge_tree_reader = nullptr; const MergeTreeIndexGranularity * index_granularity = nullptr; From 01a0cb649a3afc28726b36dbe1b10e4243ad34b2 Mon Sep 17 00:00:00 2001 From: Pavel Kruglov Date: Fri, 29 Jan 2021 18:46:28 +0300 Subject: [PATCH 0101/2357] Fix build, style, tests --- src/Client/GetHedgedConnections.cpp | 9 ++++++--- src/Client/GetHedgedConnections.h | 11 +++++++---- src/Client/HedgedConnections.cpp | 17 ++++++++++------- src/Client/HedgedConnections.h | 4 +++- src/Client/MultiplexedConnections.cpp | 4 ++-- src/Client/MultiplexedConnections.h | 2 +- src/Common/Epoll.cpp | 3 +++ src/Common/Epoll.h | 4 ++++ src/DataStreams/RemoteQueryExecutor.cpp | 8 +++++++- 9 files changed, 43 insertions(+), 19 deletions(-) diff --git a/src/Client/GetHedgedConnections.cpp b/src/Client/GetHedgedConnections.cpp index 4c729dc0722..7c1e7e1ced8 100644 --- a/src/Client/GetHedgedConnections.cpp +++ b/src/Client/GetHedgedConnections.cpp @@ -1,3 +1,5 @@ +#if defined(OS_LINUX) + #include #include @@ -72,7 +74,7 @@ std::vector GetHedgedConnections::getMany DB::ErrorCodes::ALL_REPLICAS_ARE_STALE); throw DB::NetException( - "Could not connect to " + std::to_string(min_entries) + " replicas. Log: \n\n" + fail_messages + "\n", + "All connection tries failed. Log: \n\n" + fail_messages + "\n", DB::ErrorCodes::ALL_CONNECTION_TRIES_FAILED); } replicas.push_back(replica); @@ -89,7 +91,7 @@ GetHedgedConnections::ReplicaStatePtr GetHedgedConnections::getNextConnection(bo int index; /// Check if it's the first time. 
- if (epoll.size() == 0 && ready_indexes.size() == 0) + if (epoll.empty() && ready_indexes.empty()) { index = 0; last_used_index = 0; @@ -99,7 +101,7 @@ GetHedgedConnections::ReplicaStatePtr GetHedgedConnections::getNextConnection(bo bool is_first = true; - while (index != -1 || epoll.size() != 0) + while (index != -1 || !epoll.empty()) { if (index == -1 && !is_first && non_blocking) { @@ -515,3 +517,4 @@ void removeTimeoutFromReplica( } } +#endif diff --git a/src/Client/GetHedgedConnections.h b/src/Client/GetHedgedConnections.h index df060e9ecd5..88daad779fe 100644 --- a/src/Client/GetHedgedConnections.h +++ b/src/Client/GetHedgedConnections.h @@ -1,5 +1,7 @@ #pragma once +#if defined(OS_LINUX) + #include #include #include @@ -44,10 +46,10 @@ public: active_timeouts.clear(); } - bool isReady() const { return state == State::READY; }; - bool isNotReady() const { return state == State::NOT_READY; }; - bool isEmpty() const { return state == State::EMPTY; }; - bool isCannotChoose() const { return state == State::CANNOT_CHOOSE; }; + bool isReady() const { return state == State::READY; } + bool isNotReady() const { return state == State::NOT_READY; } + bool isEmpty() const { return state == State::EMPTY; } + bool isCannotChoose() const { return state == State::CANNOT_CHOOSE; } }; using ReplicaStatePtr = std::shared_ptr; @@ -162,3 +164,4 @@ void removeTimeoutsFromReplica( std::unordered_map & timeout_fd_to_replica); } +#endif diff --git a/src/Client/HedgedConnections.cpp b/src/Client/HedgedConnections.cpp index a4231b2c172..6dc746ec7f4 100644 --- a/src/Client/HedgedConnections.cpp +++ b/src/Client/HedgedConnections.cpp @@ -1,3 +1,5 @@ +#if defined(OS_LINUX) + #include #include @@ -142,7 +144,7 @@ void HedgedConnections::sendQuery( replica->connection->sendQuery(timeouts, query, query_id, stage, &modified_settings, &client_info, with_pending_data); addTimeoutToReplica(TimerTypes::RECEIVE_TIMEOUT, replica, this->epoll, this->timeout_fd_to_replica, timeouts); - addTimeoutToReplica(TimerTypes::RECEIVE_DATA_TIMEOUT, replica, this->epoll, this->timeout_fd_to_replica, timeouts); + addTimeoutToReplica(TimerTypes::RECEIVE_DATA_TIMEOUT, replica, this->epoll, this->timeout_fd_to_replica, timeouts); }; for (auto & replicas_with_same_offset : replicas) @@ -178,12 +180,12 @@ std::string HedgedConnections::dumpAddresses() const // LOG_DEBUG(log, "dumpAddresses"); - std::string addresses = ""; + std::string addresses; bool is_first = true; - for (auto & replicas_with_same_offset : replicas) + for (const auto & replicas_with_same_offset : replicas) { - for (auto & replica : replicas_with_same_offset) + for (const auto & replica : replicas_with_same_offset) { if (replica->isReady()) { @@ -226,7 +228,7 @@ Packet HedgedConnections::drain() Packet res; res.type = Protocol::Server::EndOfStream; - while (epoll.size() != 0) + while (!epoll.empty()) { Packet packet = receivePacketImpl(); switch (packet.type) @@ -253,7 +255,7 @@ Packet HedgedConnections::drain() Packet HedgedConnections::receivePacket() { std::lock_guard lock(cancel_mutex); - return receivePacketUnlocked(); + return receivePacketUnlocked({}); } Packet HedgedConnections::receivePacketUnlocked(AsyncCallback async_callback) @@ -263,7 +265,7 @@ Packet HedgedConnections::receivePacketUnlocked(AsyncCallback async_callback) if (!hasActiveConnections()) throw Exception("No more packets are available.", ErrorCodes::LOGICAL_ERROR); - if (epoll.size() == 0) + if (epoll.empty()) throw Exception("No pending events in epoll.", ErrorCodes::LOGICAL_ERROR); return 
receivePacketImpl(std::move(async_callback)); @@ -445,3 +447,4 @@ void HedgedConnections::finishProcessReplica(ReplicaStatePtr & replica, bool dis } } +#endif diff --git a/src/Client/HedgedConnections.h b/src/Client/HedgedConnections.h index b6e64ac45ad..1400ff89de4 100644 --- a/src/Client/HedgedConnections.h +++ b/src/Client/HedgedConnections.h @@ -1,4 +1,5 @@ #pragma once +#if defined(OS_LINUX) #include #include @@ -35,7 +36,7 @@ public: Packet receivePacket() override; - Packet receivePacketUnlocked(AsyncCallback async_callback = {}) override; + Packet receivePacketUnlocked(AsyncCallback async_callback) override; void disconnect() override; @@ -98,3 +99,4 @@ private: }; } +#endif diff --git a/src/Client/MultiplexedConnections.cpp b/src/Client/MultiplexedConnections.cpp index fbf8c9aa172..3e7850e5f85 100644 --- a/src/Client/MultiplexedConnections.cpp +++ b/src/Client/MultiplexedConnections.cpp @@ -143,7 +143,7 @@ void MultiplexedConnections::sendQuery( Packet MultiplexedConnections::receivePacket() { std::lock_guard lock(cancel_mutex); - Packet packet = receivePacketUnlocked(); + Packet packet = receivePacketUnlocked({}); return packet; } @@ -191,7 +191,7 @@ Packet MultiplexedConnections::drain() while (hasActiveConnections()) { - Packet packet = receivePacketUnlocked(); + Packet packet = receivePacketUnlocked({}); switch (packet.type) { diff --git a/src/Client/MultiplexedConnections.h b/src/Client/MultiplexedConnections.h index 720add1ba81..a7c20200fcf 100644 --- a/src/Client/MultiplexedConnections.h +++ b/src/Client/MultiplexedConnections.h @@ -56,7 +56,7 @@ public: bool hasActiveConnections() const override { return active_connection_count > 0; } private: - Packet receivePacketUnlocked(AsyncCallback async_callback = {}) override; + Packet receivePacketUnlocked(AsyncCallback async_callback) override; /// Internal version of `dumpAddresses` function without locking. 
std::string dumpAddressesUnlocked() const; diff --git a/src/Common/Epoll.cpp b/src/Common/Epoll.cpp index 8ce100c7834..cb34f81cf36 100644 --- a/src/Common/Epoll.cpp +++ b/src/Common/Epoll.cpp @@ -1,3 +1,5 @@ +#if defined(OS_LINUX) + #include "Epoll.h" #include #include @@ -80,3 +82,4 @@ Epoll::~Epoll() } } +#endif diff --git a/src/Common/Epoll.h b/src/Common/Epoll.h index 0e04d666af0..1dc65d15d08 100644 --- a/src/Common/Epoll.h +++ b/src/Common/Epoll.h @@ -1,4 +1,5 @@ #pragma once +#if defined(OS_LINUX) #include #include @@ -34,6 +35,8 @@ public: int size() const { return events_count; } + bool empty() const { return events_count == 0; } + ~Epoll(); private: @@ -42,3 +45,4 @@ private: }; } +#endif diff --git a/src/DataStreams/RemoteQueryExecutor.cpp b/src/DataStreams/RemoteQueryExecutor.cpp index 01c31eac640..52a7a3e0a78 100644 --- a/src/DataStreams/RemoteQueryExecutor.cpp +++ b/src/DataStreams/RemoteQueryExecutor.cpp @@ -63,7 +63,13 @@ RemoteQueryExecutor::RemoteQueryExecutor( const Settings & current_settings = context.getSettingsRef(); auto timeouts = ConnectionTimeouts::getTCPTimeoutsWithFailover(current_settings); - if (current_settings.use_hedged_requests) + bool use_hedged_requests = current_settings.use_hedged_requests; + +#if !defined(OS_LINUX) + use_hedged_requests = false; +#endif + + if (use_hedged_requests) { std::shared_ptr table_to_check = nullptr; if (main_table) From afdc9635cb1183c6c37e3ccd804b5a8ca6498311 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Fri, 29 Jan 2021 19:12:53 +0300 Subject: [PATCH 0102/2357] Update MergeTreeRangeReader --- src/Interpreters/InterpreterSelectQuery.cpp | 6 ++++++ src/Storages/MergeTree/MergeTreeRangeReader.cpp | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index db0edefb5e8..6a0e2515801 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -1579,6 +1579,12 @@ void InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum proc std::make_shared(expressions.filter_info->actions_dag), expressions.filter_info->column_name); + if (alias_actions) + { + query_info.prewhere_info_list->back().alias_actions = std::make_shared(alias_actions); + alias_actions = nullptr; + } + auto & new_filter_info = query_info.prewhere_info_list->front(); new_filter_info.remove_prewhere_column = expressions.filter_info->do_remove_column; diff --git a/src/Storages/MergeTree/MergeTreeRangeReader.cpp b/src/Storages/MergeTree/MergeTreeRangeReader.cpp index afbc28e6883..14ab70a992c 100644 --- a/src/Storages/MergeTree/MergeTreeRangeReader.cpp +++ b/src/Storages/MergeTree/MergeTreeRangeReader.cpp @@ -860,7 +860,7 @@ Columns MergeTreeRangeReader::continueReadingChain(ReadResult & result, size_t & void MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(ReadResult & result) { - if (prewhere_info_list->empty()) + if (!prewhere_info_list || prewhere_info_list->empty()) return; const auto & header = merge_tree_reader->getColumns(); From 47a7273fe363f4577e9dbb8e7ca4a95da600ec02 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Fri, 29 Jan 2021 19:21:52 +0300 Subject: [PATCH 0103/2357] Fix build --- src/Storages/MergeTree/MergeTreeRangeReader.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/MergeTreeRangeReader.cpp b/src/Storages/MergeTree/MergeTreeRangeReader.cpp index 14ab70a992c..7c9b1b36b33 100644 --- 
a/src/Storages/MergeTree/MergeTreeRangeReader.cpp +++ b/src/Storages/MergeTree/MergeTreeRangeReader.cpp @@ -871,7 +871,7 @@ void MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(ReadResult & r "Expected " + toString(num_columns) + ", " "got " + toString(result.columns.size()), ErrorCodes::LOGICAL_ERROR); - size_t prewhere_column_pos; + size_t prewhere_column_pos = 0; { /// Restore block from columns list. From 60295978779db573fe80f54c38c47664459532bc Mon Sep 17 00:00:00 2001 From: Pavel Kruglov Date: Fri, 29 Jan 2021 19:56:47 +0300 Subject: [PATCH 0104/2357] Fix style error --- src/Client/GetHedgedConnections.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Client/GetHedgedConnections.cpp b/src/Client/GetHedgedConnections.cpp index 7c1e7e1ced8..269edd252a2 100644 --- a/src/Client/GetHedgedConnections.cpp +++ b/src/Client/GetHedgedConnections.cpp @@ -11,6 +11,7 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; extern const int LOGICAL_ERROR; extern const int ALL_CONNECTION_TRIES_FAILED; + extern const int ALL_REPLICAS_ARE_STALE; } GetHedgedConnections::GetHedgedConnections( From 649386ff0f37ff3c4fd0440efefa90adfd43d9bb Mon Sep 17 00:00:00 2001 From: Dmitriy Date: Fri, 29 Jan 2021 23:15:42 +0300 Subject: [PATCH 0105/2357] Edit and translate to Russian MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Поправил английскую версию и перевел на русский язык. --- docs/en/operations/opentelemetry.md | 41 +++++----------- .../functions/other-functions.md | 6 +-- docs/en/sql-reference/statements/optimize.md | 2 +- docs/ru/operations/opentelemetry.md | 49 +++++++++++++++++++ .../functions/date-time-functions.md | 2 - .../functions/other-functions.md | 17 +++++-- docs/ru/sql-reference/statements/optimize.md | 5 +- 7 files changed, 80 insertions(+), 42 deletions(-) create mode 100644 docs/ru/operations/opentelemetry.md diff --git a/docs/en/operations/opentelemetry.md b/docs/en/operations/opentelemetry.md index 2afeabc7956..db0a0e9779a 100644 --- a/docs/en/operations/opentelemetry.md +++ b/docs/en/operations/opentelemetry.md @@ -5,52 +5,32 @@ toc_title: OpenTelemetry Support # [experimental] OpenTelemetry Support -[OpenTelemetry](https://opentelemetry.io/) is an open standard for collecting -traces and metrics from distributed application. ClickHouse has some support -for OpenTelemetry. +[OpenTelemetry](https://opentelemetry.io/) is an open standard for collecting traces and metrics from the distributed application. ClickHouse has some support for `OpenTelemetry`. !!! warning "Warning" -This is an experimental feature that will change in backwards-incompatible ways in the future releases. - +This is an experimental feature that will change in backward-incompatible ways in future releases. ## Supplying Trace Context to ClickHouse -ClickHouse accepts trace context HTTP headers, as described by -the [W3C recommendation](https://www.w3.org/TR/trace-context/). -It also accepts trace context over native protocol that is used for -communication between ClickHouse servers or between the client and server. -For manual testing, trace context headers conforming to the Trace Context -recommendation can be supplied to `clickhouse-client` using -`--opentelemetry-traceparent` and `--opentelemetry-tracestate` flags. - -If no parent trace context is supplied, ClickHouse can start a new trace, with -probability controlled by the `opentelemetry_start_trace_probability` setting. 
+ClickHouse accepts trace context HTTP headers, as described by the [W3C recommendation](https://www.w3.org/TR/trace-context/). It also accepts trace context over a native protocol that is used for communication between ClickHouse servers or between the client and server. For manual testing, trace context headers conforming to the Trace Context recommendation can be supplied to `clickhouse-client` using `--opentelemetry-traceparent` and `--opentelemetry-tracestate` flags. +If no parent trace context is supplied, ClickHouse can start a new trace, with probability controlled by the `opentelemetry_start_trace_probability` setting. ## Propagating the Trace Context The trace context is propagated to downstream services in the following cases: -* Queries to remote ClickHouse servers, such as when using `Distributed` table - engine. +* Queries to remote ClickHouse servers, such as when using `Distributed` table engine. * `URL` table function. Trace context information is sent in HTTP headers. - ## Tracing the ClickHouse Itself -ClickHouse creates _trace spans_ for each query and some of the query execution -stages, such as query planning or distributed queries. +ClickHouse creates _trace spans_ for each query and some of the query execution stages, such as query planning or distributed queries. -To be useful, the tracing information has to be exported to a monitoring system -that supports OpenTelemetry, such as Jaeger or Prometheus. ClickHouse avoids -a dependency on a particular monitoring system, instead only providing the -tracing data through a system table. OpenTelemetry trace span information -[required by the standard](https://github.com/open-telemetry/opentelemetry-specification/blob/master/specification/overview.md#span) -is stored in the `system.opentelemetry_span_log` table. +To be useful, the tracing information has to be exported to a monitoring system that supports `OpenTelemetry`, such as `Jaeger` or `Prometheus`. ClickHouse avoids a dependency on a particular monitoring system, instead only providing the tracing data through a system table. `OpenTelemetry` trace span information [required by the standard](https://github.com/open-telemetry/opentelemetry-specification/blob/master/specification/overview.md#span) is stored in the `system.opentelemetry_span_log` table. -The table must be enabled in the server configuration, see the `opentelemetry_span_log` -element in the default config file `config.xml`. It is enabled by default. +The table must be enabled in the server configuration, see the `opentelemetry_span_log` element in the default config file `config.xml`. It is enabled by default. The table has the following columns: @@ -64,8 +44,7 @@ The table has the following columns: - `attribute.name` - `attribute.values` -The tags or attributes are saved as two parallel arrays, containing the keys -and values. Use `ARRAY JOIN` to work with them. +The tags or attributes are saved as two parallel arrays, containing the keys and values. Use `ARRAY JOIN` to work with them. ## Integration with monitoring systems @@ -94,3 +73,5 @@ FROM system.opentelemetry_span_log ``` In case of any errors, the part of the log data for which the error has occurred will be silently lost. Check the server log for error messages if the data does not arrive. 
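
The section above notes that span tags/attributes are stored as two parallel arrays and recommends `ARRAY JOIN` to work with them, but gives no query for it. A minimal sketch of such a query follows; it is not part of the patch, and it assumes the key/value columns are the parallel arrays named in the column list above (adjust the nested column names if the actual table definition uses, e.g., `attribute.names`):

```sql
-- Sketch: column names follow the list above; adjust to the real table definition if needed.
SELECT
    trace_id,
    operation_name,
    attr_key,
    attr_value
FROM system.opentelemetry_span_log
ARRAY JOIN
    attribute.name AS attr_key,
    attribute.values AS attr_value
ORDER BY start_time
LIMIT 10
```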
+ +[Original article](https://clickhouse.tech/docs/en/operations/opentelemetry/) diff --git a/docs/en/sql-reference/functions/other-functions.md b/docs/en/sql-reference/functions/other-functions.md index 08d34770f57..86ce53b56f6 100644 --- a/docs/en/sql-reference/functions/other-functions.md +++ b/docs/en/sql-reference/functions/other-functions.md @@ -639,7 +639,7 @@ The result of the function depends on the affected data blocks and the order of It can reach the neighbor rows only inside the currently processed data block. The rows order used during the calculation of `neighbor` can differ from the order of rows returned to the user. -To prevent that you can make a subquery with ORDER BY and call the function from outside the subquery. +To prevent that you can make a subquery with `ORDER BY` and call the function from outside the subquery. **Parameters** @@ -745,12 +745,12 @@ Calculates the difference between successive row values ​​in the data block. Returns 0 for the first row and the difference from the previous row for each subsequent row. !!! warning "Warning" - It can reach the previos row only inside the currently processed data block. + It can reach the previous row only inside the currently processed data block. The result of the function depends on the affected data blocks and the order of data in the block. The rows order used during the calculation of `runningDifference` can differ from the order of rows returned to the user. -To prevent that you can make a subquery with ORDER BY and call the function from outside the subquery. +To prevent that you can make a subquery with `ORDER BY` and call the function from outside the subquery. Example: diff --git a/docs/en/sql-reference/statements/optimize.md b/docs/en/sql-reference/statements/optimize.md index a67f282e793..9b16a12d2e2 100644 --- a/docs/en/sql-reference/statements/optimize.md +++ b/docs/en/sql-reference/statements/optimize.md @@ -17,7 +17,7 @@ When `OPTIMIZE` is used with the [ReplicatedMergeTree](../../engines/table-engin - If `OPTIMIZE` doesn’t perform a merge for any reason, it doesn’t notify the client. To enable notifications, use the [optimize_throw_if_noop](../../operations/settings/settings.md#setting-optimize_throw_if_noop) setting. - If you specify a `PARTITION`, only the specified partition is optimized. [How to set partition expression](../../sql-reference/statements/alter/index.md#alter-how-to-specify-part-expr). -- If you specify `FINAL`, optimization is performed even when all the data is already in one part. +- If you specify `FINAL`, optimization is performed even when all the data is already in one part. Also merge is forced even if concurrent merges are performed. - If you specify `DEDUPLICATE`, then completely identical rows will be deduplicated (all columns are compared), it makes sense only for the MergeTree engine. !!! warning "Warning" diff --git a/docs/ru/operations/opentelemetry.md b/docs/ru/operations/opentelemetry.md new file mode 100644 index 00000000000..2131128e17d --- /dev/null +++ b/docs/ru/operations/opentelemetry.md @@ -0,0 +1,49 @@ +--- +toc_priority: 62 +toc_title: Поддержка OpenTelemetry +--- + +# [экспериментально] Поддержка OpenTelemetry + +[OpenTelemetry](https://opentelemetry.io/) — это открытый стандарт для сбора трассировок и метрик из распределенного приложения. В ClickHouse есть поддержка `OpenTelemetry`. + +!!! warning "Предупреждение" +Этот стандарт экспериментальный и будет изменяться в будущих релизах. 
+ +## Обеспечение поддержки контекста трассировки в ClickHouse + +ClickHouse принимает контекст трассировки HTTP заголовков, как описано в [рекомендации W3C](https://www.w3.org/TR/trace-context/). Также он принимает контекст трассировки через нативный протокол, который используется для связи между серверами ClickHouse или между клиентом и сервером. В ручном тестировании заголовки контекста трассировки, соответствующие рекомендации контекста трассировки, могут быть переданы в `clickhouse-client` через флаги: `--opentelemetry-traceparent` и `--opentelemetry-tracestate`. + +Если доставлен не родительский контекст трассировки, ClickHouse может запустить новую трассировку, которая будет контролироваться настройкой `opentelemetry_start_trace_probability`. + +## Распространение контекста трассировки + +Контекст трассировки распространяется на нижестоящие сервисы в следующих случаях: + +* При использовании запросов к удаленным серверам ClickHouse, например при использовании движка таблиц `Distributed`. + +* При использовании табличной функции `URL`. Информация контекста трассировки передается в HTTP заголовки. + +## Трассировка ClickHouse + +ClickHouse создает _trace spans_ для каждого запроса и некоторых этапов выполнения запроса, таких как планирование запросов или распределенные запросы. + +Чтобы быть полезной, информация трассировки должна быть экспортирована в систему мониторинга, поддерживающую `OpenTelemetry`, такую как `Jaeger` или `Prometheus`. ClickHouse не зависит от конкретной системы мониторинга, вместо этого предоставляя данные трассировки только через системную таблицу. Информация о диапазоне трассировки в `OpenTelemetry`, [требуемая стандартом](https://github.com/open-telemetry/opentelemetry-specification/blob/master/specification/overview.md#span), хранится в системной таблице `system.opentelemetry_span_log`. + +Таблица должна быть включена в конфигурации сервера, смотрите элемент `opentelemetry_span_log` в файле конфигурации `config.xml`. По умолчанию таблица уже включена. + +В таблице имеются следующие столбцы: + +- `trace_id` +- `span_id` +- `parent_span_id` +- `operation_name` +- `start_time` +- `finish_time` +- `finish_date` +- `attribute.name` +- `attribute.values` + +Теги или атрибуты сохраняются в виде двух параллельных массивов, содержащих ключи и значения. Для работы с ними используйте `ARRAY JOIN`. + +[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/opentelemetry/) diff --git a/docs/ru/sql-reference/functions/date-time-functions.md b/docs/ru/sql-reference/functions/date-time-functions.md index 31482cde77f..384b6eb58f7 100644 --- a/docs/ru/sql-reference/functions/date-time-functions.md +++ b/docs/ru/sql-reference/functions/date-time-functions.md @@ -23,8 +23,6 @@ SELECT └─────────────────────┴────────────┴────────────┴─────────────────────┘ ``` -Поддерживаются только часовые пояса, отличающиеся от UTC на целое число часов. - ## toTimeZone {#totimezone} Переводит дату или дату-с-временем в указанный часовой пояс. Часовой пояс (таймзона) это атрибут типов Date/DateTime, внутреннее значение (количество секунд) поля таблицы или колонки результата не изменяется, изменяется тип поля и автоматически его текстовое отображение. 
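
The `toTimeZone` behaviour described in the hunk above (the internal value in seconds is unchanged; only the time-zone attribute of the type, and therefore the textual rendering, changes) can be illustrated with a small query. This is a sketch, not part of the patch, and assumes a server with the standard IANA time-zone database:

```sql
-- Sketch only: 'Asia/Yekaterinburg' is just an example zone.
-- The stored value (seconds since epoch) is identical; only the rendering differs.
SELECT
    toDateTime('2021-01-29 12:00:00', 'UTC') AS utc_time,
    toTimeZone(utc_time, 'Asia/Yekaterinburg') AS yekaterinburg_time,
    toUnixTimestamp(utc_time) = toUnixTimestamp(yekaterinburg_time) AS same_internal_value
```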
diff --git a/docs/ru/sql-reference/functions/other-functions.md b/docs/ru/sql-reference/functions/other-functions.md index 68afb3e24ce..1de34df9126 100644 --- a/docs/ru/sql-reference/functions/other-functions.md +++ b/docs/ru/sql-reference/functions/other-functions.md @@ -562,7 +562,7 @@ SELECT ## neighbor {#neighbor} -Функция позволяет получить доступ к значению в колонке `column`, находящемуся на смещении `offset` относительно текущей строки. Является частичной реализацией [оконных функций](https://en.wikipedia.org/wiki/SQL_window_function) `LEAD()` и `LAG()`. +Функция позволяет получить доступ к значению в столбце `column`, находящемуся на смещении `offset` относительно текущей строки. Является частичной реализацией [оконных функций](https://en.wikipedia.org/wiki/SQL_window_function) `LEAD()` и `LAG()`. **Синтаксис** @@ -570,7 +570,13 @@ SELECT neighbor(column, offset[, default_value]) ``` -Результат функции зависит от затронутых блоков данных и порядка данных в блоке. Если сделать подзапрос с ORDER BY и вызывать функцию извне подзапроса, можно будет получить ожидаемый результат. +Результат функции зависит от затронутых блоков данных и порядка данных в блоке. + +!!! warning "Предупреждение" + Функция может получить доступ к значению в столбце соседней строки только внутри обрабатываемого в данный момент блока данных. + +Порядок строк, используемый при вычислении функции `neighbor`, может отличаться от порядка строк, возвращаемых пользователю. +Чтобы этого не случилось, вы можете сделать подзапрос с `ORDER BY` и вызвать функцию изне подзапроса. **Параметры** @@ -675,8 +681,13 @@ FROM numbers(16) Считает разницу между последовательными значениями строк в блоке данных. Возвращает 0 для первой строки и разницу с предыдущей строкой для каждой последующей строки. +!!! warning "Предупреждение" + Функция может взять значение предыдущей строки только внутри текущего обработанного блока данных. + Результат функции зависит от затронутых блоков данных и порядка данных в блоке. -Если сделать подзапрос с ORDER BY и вызывать функцию извне подзапроса, можно будет получить ожидаемый результат. + +Порядок строк, используемый при вычислении функции `runningDifference`, может отличаться от порядка строк, возвращаемых пользователю. +Чтобы этого не случилось, вы можете сделать подзапрос с `ORDER BY` и вызвать функцию извне подзапроса. Пример: diff --git a/docs/ru/sql-reference/statements/optimize.md b/docs/ru/sql-reference/statements/optimize.md index 9b94c31a8f7..8b1d72fed80 100644 --- a/docs/ru/sql-reference/statements/optimize.md +++ b/docs/ru/sql-reference/statements/optimize.md @@ -15,11 +15,10 @@ OPTIMIZE TABLE [db.]name [ON CLUSTER cluster] [PARTITION partition | PARTITION I - Если `OPTIMIZE` не выполняет мёрж по любой причине, ClickHouse не оповещает об этом клиента. Чтобы включить оповещения, используйте настройку [optimize_throw_if_noop](../../operations/settings/settings.md#setting-optimize_throw_if_noop). - Если указать `PARTITION`, то оптимизация выполняется только для указанной партиции. [Как задавать имя партиции в запросах](alter/index.md#alter-how-to-specify-part-expr). -- Если указать `FINAL`, то оптимизация выполняется даже в том случае, если все данные уже лежат в одном куске. +- Если указать `FINAL`, то оптимизация выполняется даже в том случае, если все данные уже лежат в одном куске. Кроме того, слияние является принудительным, даже если выполняются параллельные слияния. 
- Если указать `DEDUPLICATE`, то произойдет схлопывание полностью одинаковых строк (сравниваются значения во всех колонках), имеет смысл только для движка MergeTree. !!! warning "Внимание" Запрос `OPTIMIZE` не может устранить причину появления ошибки «Too many parts». - -[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/statements/optimize/) \ No newline at end of file +[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/statements/optimize/) From d27f5114c5697c212f211cb389bffa44d0c36b54 Mon Sep 17 00:00:00 2001 From: Pavel Kruglov Date: Fri, 29 Jan 2021 23:21:11 +0300 Subject: [PATCH 0106/2357] Add LOG_DEBUG for tests debug --- src/Client/Connection.cpp | 30 ++++++++++++------------ src/Client/GetHedgedConnections.cpp | 36 ++++++++++++++--------------- src/Client/HedgedConnections.cpp | 30 ++++++++++++------------ 3 files changed, 48 insertions(+), 48 deletions(-) diff --git a/src/Client/Connection.cpp b/src/Client/Connection.cpp index 0bf0456c79c..42d9c86739e 100644 --- a/src/Client/Connection.cpp +++ b/src/Client/Connection.cpp @@ -92,7 +92,7 @@ void Connection::connect(const ConnectionTimeouts & timeouts) void Connection::disconnect() { -// LOG_DEBUG(log_wrapper.get(), "disconnect"); + LOG_DEBUG(log_wrapper.get(), "disconnect"); maybe_compressed_out = nullptr; in = nullptr; @@ -106,7 +106,7 @@ void Connection::disconnect() void Connection::prepare(const ConnectionTimeouts & timeouts) { -// LOG_DEBUG(log_wrapper.get(), "Connect"); + LOG_DEBUG(log_wrapper.get(), "Connect"); LOG_TRACE(log_wrapper.get(), "Connecting. Database: {}. User: {}{}{}", default_database.empty() ? "(not specified)" : default_database, @@ -160,7 +160,7 @@ void Connection::prepare(const ConnectionTimeouts & timeouts) void Connection::sendHello() { -// LOG_DEBUG(log_wrapper.get(), "sendHello"); + LOG_DEBUG(log_wrapper.get(), "sendHello"); /** Disallow control characters in user controlled parameters * to mitigate the possibility of SSRF. @@ -218,7 +218,7 @@ void Connection::sendHello() void Connection::receiveHello() { -// LOG_DEBUG(log_wrapper.get(), "receiveHello"); + LOG_DEBUG(log_wrapper.get(), "receiveHello"); /// Receive hello packet. 
UInt64 packet_type = 0; @@ -323,7 +323,7 @@ const String & Connection::getServerDisplayName(const ConnectionTimeouts & timeo void Connection::forceConnected(const ConnectionTimeouts & timeouts) { -// LOG_DEBUG(log_wrapper.get(), "forceConnected"); + LOG_DEBUG(log_wrapper.get(), "forceConnected"); if (!connected) { @@ -351,7 +351,7 @@ void Connection::sendClusterNameAndSalt() bool Connection::ping() { -// LOG_DEBUG(log_wrapper.get(), "ping"); + LOG_DEBUG(log_wrapper.get(), "ping"); TimeoutSetter timeout_setter(*socket, sync_request_timeout, true); try @@ -404,7 +404,7 @@ TablesStatusResponse Connection::getTablesStatus(const ConnectionTimeouts & time void Connection::sendTablesStatusRequest(const TablesStatusRequest & request) { -// LOG_DEBUG(log_wrapper.get(), "sendTablesStatusRequest"); + LOG_DEBUG(log_wrapper.get(), "sendTablesStatusRequest"); writeVarUInt(Protocol::Client::TablesStatusRequest, *out); request.write(*out, server_revision); @@ -413,7 +413,7 @@ void Connection::sendTablesStatusRequest(const TablesStatusRequest & request) TablesStatusResponse Connection::receiveTablesStatusResponse() { -// LOG_DEBUG(log_wrapper.get(), "receiveTablesStatusResponse"); + LOG_DEBUG(log_wrapper.get(), "receiveTablesStatusResponse"); UInt64 response_type = 0; readVarUInt(response_type, *in); @@ -440,7 +440,7 @@ void Connection::sendQuery( if (!connected) connect(timeouts); -// LOG_DEBUG(log_wrapper.get(), "sendQuery"); + LOG_DEBUG(log_wrapper.get(), "sendQuery"); TimeoutSetter timeout_setter(*socket, timeouts.send_timeout, timeouts.receive_timeout, true); @@ -540,7 +540,7 @@ void Connection::sendCancel() if (!out) return; -// LOG_DEBUG(log_wrapper.get(), "sendCancel"); + LOG_DEBUG(log_wrapper.get(), "sendCancel"); writeVarUInt(Protocol::Client::Cancel, *out); out->next(); @@ -549,7 +549,7 @@ void Connection::sendCancel() void Connection::sendData(const Block & block, const String & name, bool scalar) { -// LOG_DEBUG(log_wrapper.get(), "sendData"); + LOG_DEBUG(log_wrapper.get(), "sendData"); if (!block_out) { @@ -581,7 +581,7 @@ void Connection::sendData(const Block & block, const String & name, bool scalar) void Connection::sendPreparedData(ReadBuffer & input, size_t size, const String & name) { /// NOTE 'Throttler' is not used in this method (could use, but it's not important right now). -// LOG_DEBUG(log_wrapper.get(), "sendPreparedData"); + LOG_DEBUG(log_wrapper.get(), "sendPreparedData"); if (input.eof()) throw Exception("Buffer is empty (some kind of corruption)", ErrorCodes::EMPTY_DATA_PASSED); @@ -602,7 +602,7 @@ void Connection::sendScalarsData(Scalars & data) if (data.empty()) return; -// LOG_DEBUG(log_wrapper.get(), "sendScalarsData"); + LOG_DEBUG(log_wrapper.get(), "sendScalarsData"); Stopwatch watch; size_t out_bytes = out ? out->count() : 0; @@ -689,7 +689,7 @@ void Connection::sendExternalTablesData(ExternalTablesData & data) return; } -// LOG_DEBUG(log_wrapper.get(), "sendExternalTablesData"); + LOG_DEBUG(log_wrapper.get(), "sendExternalTablesData"); Stopwatch watch; size_t out_bytes = out ? 
out->count() : 0; @@ -789,7 +789,7 @@ std::optional Connection::checkPacket(size_t timeout_microseconds) Packet Connection::receivePacket(AsyncCallback async_callback) { -// LOG_DEBUG(log_wrapper.get(), "receivePacket"); + LOG_DEBUG(log_wrapper.get(), "receivePacket"); in->setAsyncCallback(std::move(async_callback)); SCOPE_EXIT(in->setAsyncCallback({})); diff --git a/src/Client/GetHedgedConnections.cpp b/src/Client/GetHedgedConnections.cpp index 269edd252a2..546068ca4ee 100644 --- a/src/Client/GetHedgedConnections.cpp +++ b/src/Client/GetHedgedConnections.cpp @@ -86,7 +86,7 @@ std::vector GetHedgedConnections::getMany GetHedgedConnections::ReplicaStatePtr GetHedgedConnections::getNextConnection(bool non_blocking) { -// LOG_DEBUG(log, "getNextConnection"); + LOG_DEBUG(log, "getNextConnection"); ReplicaStatePtr replica = createNewReplica(); int index; @@ -161,7 +161,7 @@ GetHedgedConnections::ReplicaStatePtr GetHedgedConnections::getNextConnection(bo void GetHedgedConnections::stopChoosingReplicas() { -// LOG_DEBUG(log, "stopChoosingReplicas"); + LOG_DEBUG(log, "stopChoosingReplicas"); for (auto & [fd, replica] : fd_to_replica) { removeTimeoutsFromReplica(replica, epoll, timeout_fd_to_replica); @@ -195,7 +195,7 @@ int GetHedgedConnections::getNextIndex() return -1; } -// LOG_DEBUG(log, "get next index: {}", next_index); + LOG_DEBUG(log, "get next index: {}", next_index); last_used_index = next_index; return next_index; @@ -203,7 +203,7 @@ int GetHedgedConnections::getNextIndex() GetHedgedConnections::Action GetHedgedConnections::startTryGetConnection(int index, ReplicaStatePtr & replica) { -// LOG_DEBUG(log, "start try get connection with {} replica", index); + LOG_DEBUG(log, "start try get connection with {} replica", index); TryGetConnection & try_get_connection = try_get_connections[index]; replica->state = State::NOT_READY; @@ -240,14 +240,14 @@ GetHedgedConnections::Action GetHedgedConnections::startTryGetConnection(int ind GetHedgedConnections::Action GetHedgedConnections::processTryGetConnectionStage(ReplicaStatePtr & replica, bool remove_from_epoll) { -// LOG_DEBUG(log, "process get connection stage for {} replica", replica->index); + LOG_DEBUG(log, "process get connection stage for {} replica", replica->index); TryGetConnection & try_get_connection = try_get_connections[replica->index]; if (try_get_connection.stage == TryGetConnection::Stage::FINISHED) { indexes_in_process.erase(replica->index); -// LOG_DEBUG(log, "stage: FINISHED"); + LOG_DEBUG(log, "stage: FINISHED"); ++entries_count; if (remove_from_epoll) @@ -258,11 +258,11 @@ GetHedgedConnections::processTryGetConnectionStage(ReplicaStatePtr & replica, bo if (try_get_connection.result.is_usable) { -// LOG_DEBUG(log, "replica is usable"); + LOG_DEBUG(log, "replica is usable"); ++usable_count; if (try_get_connection.result.is_up_to_date) { -// LOG_DEBUG(log, "replica is up to date, finish get hedged connections"); + LOG_DEBUG(log, "replica is up to date, finish get hedged connections"); replica->state = State::READY; ready_indexes.insert(replica->index); return Action::FINISH; @@ -276,12 +276,12 @@ GetHedgedConnections::processTryGetConnectionStage(ReplicaStatePtr & replica, bo } else if (try_get_connection.stage == TryGetConnection::Stage::FAILED) { -// LOG_DEBUG(log, "stage: FAILED"); + LOG_DEBUG(log, "stage: FAILED"); processFailedConnection(replica); return Action::TRY_NEXT_REPLICA; } -// LOG_DEBUG(log, "middle stage, process epoll events"); + LOG_DEBUG(log, "middle stage, process epoll events"); /// Get connection process is 
not finished return Action::PROCESS_EPOLL_EVENTS; @@ -289,7 +289,7 @@ GetHedgedConnections::processTryGetConnectionStage(ReplicaStatePtr & replica, bo void GetHedgedConnections::processFailedConnection(ReplicaStatePtr & replica) { -// LOG_DEBUG(log, "failed connection with {} replica", replica->index); + LOG_DEBUG(log, "failed connection with {} replica", replica->index); ShuffledPool & shuffled_pool = shuffled_pools[replica->index]; LOG_WARNING( @@ -314,7 +314,7 @@ void GetHedgedConnections::processFailedConnection(ReplicaStatePtr & replica) void GetHedgedConnections::addTimeouts(ReplicaStatePtr & replica) { -// LOG_DEBUG(log, "add timeouts for {} replica", replica->index); + LOG_DEBUG(log, "add timeouts for {} replica", replica->index); addTimeoutToReplica(TimerTypes::RECEIVE_TIMEOUT, replica, epoll, timeout_fd_to_replica, timeouts); @@ -327,7 +327,7 @@ void GetHedgedConnections::addTimeouts(ReplicaStatePtr & replica) GetHedgedConnections::ReplicaStatePtr GetHedgedConnections::processEpollEvents(bool non_blocking) { -// LOG_DEBUG(log, "process epoll events"); + LOG_DEBUG(log, "process epoll events"); int event_fd; ReplicaStatePtr replica = nullptr; bool finish = false; @@ -349,7 +349,7 @@ GetHedgedConnections::ReplicaStatePtr GetHedgedConnections::processEpollEvents(b throw Exception("Unknown event from epoll", ErrorCodes::LOGICAL_ERROR); } -// LOG_DEBUG(log, "cancel process epoll events"); + LOG_DEBUG(log, "cancel process epoll events"); return replica; } @@ -365,7 +365,7 @@ int GetHedgedConnections::getReadyFileDescriptor(AsyncCallback async_callback) bool GetHedgedConnections::processReplicaEvent(ReplicaStatePtr & replica, bool non_blocking) { -// LOG_DEBUG(log, "epoll event is {} replica", replica->index); + LOG_DEBUG(log, "epoll event is {} replica", replica->index); removeTimeoutsFromReplica(replica, epoll, timeout_fd_to_replica); try_get_connections[replica->index].run(); Action action = processTryGetConnectionStage(replica, true); @@ -380,7 +380,7 @@ bool GetHedgedConnections::processReplicaEvent(ReplicaStatePtr & replica, bool n bool GetHedgedConnections::processTimeoutEvent(ReplicaStatePtr & replica, TimerDescriptorPtr timeout_descriptor, bool non_blocking) { -// LOG_DEBUG(log, "epoll event is timeout for {} replica", replica->index); + LOG_DEBUG(log, "epoll event is timeout for {} replica", replica->index); epoll.remove(timeout_descriptor->getDescriptor()); replica->active_timeouts.erase(timeout_descriptor->getDescriptor()); @@ -388,7 +388,7 @@ bool GetHedgedConnections::processTimeoutEvent(ReplicaStatePtr & replica, TimerD if (timeout_descriptor->getType() == TimerTypes::RECEIVE_TIMEOUT) { -// LOG_DEBUG(log, "process receive timeout for {} replica", replica->index); + LOG_DEBUG(log, "process receive timeout for {} replica", replica->index); removeTimeoutsFromReplica(replica, epoll, timeout_fd_to_replica); epoll.remove(replica->fd); fd_to_replica.erase(replica->fd); @@ -415,7 +415,7 @@ bool GetHedgedConnections::processTimeoutEvent(ReplicaStatePtr & replica, TimerD void GetHedgedConnections::setBestUsableReplica(ReplicaStatePtr & replica) { -// LOG_DEBUG(log, "set best usable replica"); + LOG_DEBUG(log, "set best usable replica"); std::vector indexes(try_get_connections.size()); for (size_t i = 0; i != indexes.size(); ++i) diff --git a/src/Client/HedgedConnections.cpp b/src/Client/HedgedConnections.cpp index 6dc746ec7f4..4282e6b8e21 100644 --- a/src/Client/HedgedConnections.cpp +++ b/src/Client/HedgedConnections.cpp @@ -59,7 +59,7 @@ void 
HedgedConnections::sendScalarsData(Scalars & data) { std::lock_guard lock(cancel_mutex); -// LOG_DEBUG(log, "sendScalarsData"); + LOG_DEBUG(log, "sendScalarsData"); if (!sent_query) throw Exception("Cannot send scalars data: query not yet sent.", ErrorCodes::LOGICAL_ERROR); @@ -78,7 +78,7 @@ void HedgedConnections::sendExternalTablesData(std::vector & { std::lock_guard lock(cancel_mutex); -// LOG_DEBUG(log, "sendExternalTablesData"); + LOG_DEBUG(log, "sendExternalTablesData"); if (!sent_query) throw Exception("Cannot send external tables data: query not yet sent.", ErrorCodes::LOGICAL_ERROR); @@ -106,7 +106,7 @@ void HedgedConnections::sendQuery( { std::lock_guard lock(cancel_mutex); -// LOG_DEBUG(log, "sendQuery"); + LOG_DEBUG(log, "sendQuery"); if (sent_query) throw Exception("Query already sent.", ErrorCodes::LOGICAL_ERROR); @@ -159,7 +159,7 @@ void HedgedConnections::disconnect() { std::lock_guard lock(cancel_mutex); -// LOG_DEBUG(log, "disconnect"); + LOG_DEBUG(log, "disconnect"); for (auto & replicas_with_same_offset : replicas) for (auto & replica : replicas_with_same_offset) @@ -178,7 +178,7 @@ std::string HedgedConnections::dumpAddresses() const { std::lock_guard lock(cancel_mutex); -// LOG_DEBUG(log, "dumpAddresses"); + LOG_DEBUG(log, "dumpAddresses"); std::string addresses; bool is_first = true; @@ -202,7 +202,7 @@ void HedgedConnections::sendCancel() { std::lock_guard lock(cancel_mutex); -// LOG_DEBUG(log, "sendCancel"); + LOG_DEBUG(log, "sendCancel"); if (!sent_query || cancelled) throw Exception("Cannot cancel. Either no query sent or already cancelled.", ErrorCodes::LOGICAL_ERROR); @@ -223,7 +223,7 @@ Packet HedgedConnections::drain() if (!cancelled) throw Exception("Cannot drain connections: cancel first.", ErrorCodes::LOGICAL_ERROR); -// LOG_DEBUG(log, "drain"); + LOG_DEBUG(log, "drain"); Packet res; res.type = Protocol::Server::EndOfStream; @@ -273,7 +273,7 @@ Packet HedgedConnections::receivePacketUnlocked(AsyncCallback async_callback) Packet HedgedConnections::receivePacketImpl(AsyncCallback async_callback) { -// LOG_DEBUG(log, "sreceivePacketImpl"); + LOG_DEBUG(log, "sreceivePacketImpl"); int event_fd; ReplicaStatePtr replica = nullptr; @@ -285,14 +285,14 @@ Packet HedgedConnections::receivePacketImpl(AsyncCallback async_callback) if (fd_to_replica.find(event_fd) != fd_to_replica.end()) { -// LOG_DEBUG(log, "event is replica"); + LOG_DEBUG(log, "event is replica"); replica = fd_to_replica[event_fd]; packet = receivePacketFromReplica(replica, async_callback); finish = true; } else if (timeout_fd_to_replica.find(event_fd) != timeout_fd_to_replica.end()) { -// LOG_DEBUG(log, "event is timeout"); + LOG_DEBUG(log, "event is timeout"); replica = timeout_fd_to_replica[event_fd]; processTimeoutEvent(replica, replica->active_timeouts[event_fd].get()); } @@ -316,7 +316,7 @@ int HedgedConnections::getReadyFileDescriptor(AsyncCallback async_callback) Packet HedgedConnections::receivePacketFromReplica(ReplicaStatePtr & replica, AsyncCallback async_callback) { -// LOG_DEBUG(log, "sreceivePacketFromReplica"); + LOG_DEBUG(log, "sreceivePacketFromReplica"); Packet packet = replica->connection->receivePacket(std::move(async_callback)); switch (packet.type) { @@ -352,7 +352,7 @@ void HedgedConnections::processReceiveData(ReplicaStatePtr & replica) /// When we receive first packet of data from any replica, we continue working with this replica /// and stop working with other replicas (if there are other replicas). 
-// LOG_DEBUG(log, "processReceiveData"); + LOG_DEBUG(log, "processReceiveData"); offsets_with_received_data.insert(replica->parallel_replica_offset); @@ -397,7 +397,7 @@ void HedgedConnections::processTimeoutEvent(ReplicaStatePtr & replica, TimerDesc void HedgedConnections::tryGetNewReplica() { -// LOG_DEBUG(log, "tryGetNewReplica"); + LOG_DEBUG(log, "tryGetNewReplica"); ReplicaStatePtr new_replica = get_hedged_connections.getNextConnection(/*non_blocking*/ true); @@ -408,7 +408,7 @@ void HedgedConnections::tryGetNewReplica() if (new_replica->isReady()) { -// LOG_DEBUG(log, "processNewReadyReplica"); + LOG_DEBUG(log, "processNewReadyReplica"); new_replica->parallel_replica_offset = offsets_queue.front(); offsets_queue.pop(); replicas[new_replica->parallel_replica_offset].push_back(new_replica); @@ -432,7 +432,7 @@ void HedgedConnections::tryGetNewReplica() void HedgedConnections::finishProcessReplica(ReplicaStatePtr & replica, bool disconnect) { -// LOG_DEBUG(log, "finishProcessReplica"); + LOG_DEBUG(log, "finishProcessReplica"); removeTimeoutsFromReplica(replica, epoll, timeout_fd_to_replica); epoll.remove(replica->fd); From ddd828e7847da270d457b0c7e747b96c7a8ad81d Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Sat, 30 Jan 2021 18:46:26 +0300 Subject: [PATCH 0107/2357] Update docs/en/sql-reference/functions/date-time-functions.md Co-authored-by: Anna <42538400+adevyatova@users.noreply.github.com> --- docs/en/sql-reference/functions/date-time-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index f11bec55697..624e04ca21c 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -602,7 +602,7 @@ This is necessary for searching for pageviews in the corresponding session. ## formatDateTime {#formatdatetime} -Function formats a Time according to the given Format string. N.B.: Format is a constant expression, e.g. you cannot have multiple formats for a single result column. +Formats a Time according to the given Format string. Format is a constant expression, so you cannot have multiple formats for a single result column. 
**Syntax** From 7a9863194a9310270c8b6f8ebd1d75195f7bae59 Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Sat, 30 Jan 2021 18:46:36 +0300 Subject: [PATCH 0108/2357] Update docs/ru/operations/utilities/clickhouse-local.md Co-authored-by: Anna <42538400+adevyatova@users.noreply.github.com> --- docs/ru/operations/utilities/clickhouse-local.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/operations/utilities/clickhouse-local.md b/docs/ru/operations/utilities/clickhouse-local.md index e3c421ac75e..f439049401c 100644 --- a/docs/ru/operations/utilities/clickhouse-local.md +++ b/docs/ru/operations/utilities/clickhouse-local.md @@ -77,7 +77,7 @@ $ clickhouse-local --query " 1 2 ``` -А теперь давайте выведем на экран объём оперативной памяти, занимаемой пользователями (Unix): +Объём оперативной памяти, занимаемой пользователями (Unix): Запрос: From 9e0d5c4c9819914d682806f1a7e550bff4125d61 Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Sat, 30 Jan 2021 18:46:53 +0300 Subject: [PATCH 0109/2357] Update docs/ru/sql-reference/data-types/simpleaggregatefunction.md Co-authored-by: Anna <42538400+adevyatova@users.noreply.github.com> --- docs/ru/sql-reference/data-types/simpleaggregatefunction.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/data-types/simpleaggregatefunction.md b/docs/ru/sql-reference/data-types/simpleaggregatefunction.md index c1b3ac240f0..2ca949843b7 100644 --- a/docs/ru/sql-reference/data-types/simpleaggregatefunction.md +++ b/docs/ru/sql-reference/data-types/simpleaggregatefunction.md @@ -1,6 +1,6 @@ # SimpleAggregateFunction {#data-type-simpleaggregatefunction} -Тип данных `SimpleAggregateFunction(name, types_of_arguments…)` хранит только текущее значение агрегатной функции и не сохраняет ее полное состояние, как это делает [`AggregateFunction`](../../sql-reference/data-types/aggregatefunction.md). Такая оптимизация может быть применена к функциям, для которых выполняется следующее свойство: результат выполнения функции `f` к набору строк `S1 UNION ALL S2` может быть получен путем выполнения `f` к отдельным частям набора строк, +Хранит только текущее значение агрегатной функции и не сохраняет ее полное состояние, как это делает [`AggregateFunction`](../../sql-reference/data-types/aggregatefunction.md). Такая оптимизация может быть применена к функциям, которые обладают следующим свойством: результат выполнения функции `f` к набору строк `S1 UNION ALL S2` может быть получен путем выполнения `f` к отдельным частям набора строк, а затем повторного выполнения `f` к результатам: `f(S1 UNION ALL S2) = f(f(S1) UNION ALL f(S2))`. Это свойство гарантирует, что результатов частичной агрегации достаточно для вычисления комбинированной, поэтому нам не нужно хранить и обрабатывать какие-либо дополнительные данные. 
Поддерживаются следующие агрегатные функции: From 320e78dea614311bd8fcd7451906be1c90f71538 Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Sat, 30 Jan 2021 18:47:15 +0300 Subject: [PATCH 0110/2357] Update docs/ru/sql-reference/data-types/simpleaggregatefunction.md Co-authored-by: Anna <42538400+adevyatova@users.noreply.github.com> --- docs/ru/sql-reference/data-types/simpleaggregatefunction.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/data-types/simpleaggregatefunction.md b/docs/ru/sql-reference/data-types/simpleaggregatefunction.md index 2ca949843b7..cb6c4b8208d 100644 --- a/docs/ru/sql-reference/data-types/simpleaggregatefunction.md +++ b/docs/ru/sql-reference/data-types/simpleaggregatefunction.md @@ -21,7 +21,7 @@ - [`maxMap`](../../sql-reference/aggregate-functions/reference/maxmap.md#agg_functions-maxmap) !!! note "Примечание" - Значения `SimpleAggregateFunction(func, Type)` отображаются и хранятся так же, как и `Type`, поэтому вам не требуется применять функции с суффиксами `-Merge`/`-State`. + Значения `SimpleAggregateFunction(func, Type)` отображаются и хранятся так же, как и `Type`, поэтому комбинаторы [-Merge](../../sql-reference/aggregate-functions/combinators.md#aggregate_functions_combinators-merge) и [-State]((../../sql-reference/aggregate-functions/combinators.md#agg-functions-combinator-state) не требуются. `SimpleAggregateFunction` имеет лучшую производительность, чем `AggregateFunction` с той же агрегатной функцией. From 7126ca376995fa58eb3f07a2c55ba4a5cd88a11f Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Sat, 30 Jan 2021 18:47:25 +0300 Subject: [PATCH 0111/2357] Update docs/ru/sql-reference/data-types/simpleaggregatefunction.md Co-authored-by: Anna <42538400+adevyatova@users.noreply.github.com> --- docs/ru/sql-reference/data-types/simpleaggregatefunction.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/data-types/simpleaggregatefunction.md b/docs/ru/sql-reference/data-types/simpleaggregatefunction.md index cb6c4b8208d..b906a56516f 100644 --- a/docs/ru/sql-reference/data-types/simpleaggregatefunction.md +++ b/docs/ru/sql-reference/data-types/simpleaggregatefunction.md @@ -29,7 +29,7 @@ **Параметры** - имя агрегатной функции. -- типы аргументов агрегатной функции. +- `type` — типы аргументов агрегатной функции. **Пример** From ac0ec2753c9a3021b3efaee1b7dbc4898242942f Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Sat, 30 Jan 2021 18:47:33 +0300 Subject: [PATCH 0112/2357] Update docs/ru/sql-reference/data-types/simpleaggregatefunction.md Co-authored-by: Anna <42538400+adevyatova@users.noreply.github.com> --- docs/ru/sql-reference/data-types/simpleaggregatefunction.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/data-types/simpleaggregatefunction.md b/docs/ru/sql-reference/data-types/simpleaggregatefunction.md index b906a56516f..bf866f7bc58 100644 --- a/docs/ru/sql-reference/data-types/simpleaggregatefunction.md +++ b/docs/ru/sql-reference/data-types/simpleaggregatefunction.md @@ -1,4 +1,4 @@ -# SimpleAggregateFunction {#data-type-simpleaggregatefunction} +# SimpleAggregateFunction(func, type) {#data-type-simpleaggregatefunction} Хранит только текущее значение агрегатной функции и не сохраняет ее полное состояние, как это делает [`AggregateFunction`](../../sql-reference/data-types/aggregatefunction.md). 
Такая оптимизация может быть применена к функциям, которые обладают следующим свойством: результат выполнения функции `f` к набору строк `S1 UNION ALL S2` может быть получен путем выполнения `f` к отдельным частям набора строк, а затем повторного выполнения `f` к результатам: `f(S1 UNION ALL S2) = f(f(S1) UNION ALL f(S2))`. Это свойство гарантирует, что результатов частичной агрегации достаточно для вычисления комбинированной, поэтому нам не нужно хранить и обрабатывать какие-либо дополнительные данные. From 004b9dd09823c729a800310c8449f56ad28bb51a Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Sat, 30 Jan 2021 18:47:41 +0300 Subject: [PATCH 0113/2357] Update docs/ru/sql-reference/data-types/simpleaggregatefunction.md Co-authored-by: Anna <42538400+adevyatova@users.noreply.github.com> --- docs/ru/sql-reference/data-types/simpleaggregatefunction.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/data-types/simpleaggregatefunction.md b/docs/ru/sql-reference/data-types/simpleaggregatefunction.md index bf866f7bc58..39f3ef99b1c 100644 --- a/docs/ru/sql-reference/data-types/simpleaggregatefunction.md +++ b/docs/ru/sql-reference/data-types/simpleaggregatefunction.md @@ -28,7 +28,7 @@ **Параметры** -- имя агрегатной функции. +- `func` — имя агрегатной функции. - `type` — типы аргументов агрегатной функции. **Пример** From ebcee0525d24464222534c002632589b9d1ad318 Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Sat, 30 Jan 2021 18:47:50 +0300 Subject: [PATCH 0114/2357] Update docs/ru/sql-reference/data-types/simpleaggregatefunction.md Co-authored-by: Anna <42538400+adevyatova@users.noreply.github.com> --- docs/ru/sql-reference/data-types/simpleaggregatefunction.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/data-types/simpleaggregatefunction.md b/docs/ru/sql-reference/data-types/simpleaggregatefunction.md index 39f3ef99b1c..10daad93cc6 100644 --- a/docs/ru/sql-reference/data-types/simpleaggregatefunction.md +++ b/docs/ru/sql-reference/data-types/simpleaggregatefunction.md @@ -1,7 +1,7 @@ # SimpleAggregateFunction(func, type) {#data-type-simpleaggregatefunction} Хранит только текущее значение агрегатной функции и не сохраняет ее полное состояние, как это делает [`AggregateFunction`](../../sql-reference/data-types/aggregatefunction.md). Такая оптимизация может быть применена к функциям, которые обладают следующим свойством: результат выполнения функции `f` к набору строк `S1 UNION ALL S2` может быть получен путем выполнения `f` к отдельным частям набора строк, -а затем повторного выполнения `f` к результатам: `f(S1 UNION ALL S2) = f(f(S1) UNION ALL f(S2))`. Это свойство гарантирует, что результатов частичной агрегации достаточно для вычисления комбинированной, поэтому нам не нужно хранить и обрабатывать какие-либо дополнительные данные. +а затем повторного выполнения `f` к результатам: `f(S1 UNION ALL S2) = f(f(S1) UNION ALL f(S2))`. Это свойство гарантирует, что результатов частичной агрегации достаточно для вычисления комбинированной, поэтому хранить и обрабатывать какие-либо дополнительные данные не требуется. 
Поддерживаются следующие агрегатные функции: From 6aa86846acc60584c28ffed1fc6260b087693509 Mon Sep 17 00:00:00 2001 From: Denis Glazachev Date: Sun, 31 Jan 2021 05:41:28 +0400 Subject: [PATCH 0115/2357] Removing obsoleted test --- tests/integration/test_row_policy/test.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/tests/integration/test_row_policy/test.py b/tests/integration/test_row_policy/test.py index c3c86f5a9c5..8919aeab0c5 100644 --- a/tests/integration/test_row_policy/test.py +++ b/tests/integration/test_row_policy/test.py @@ -109,17 +109,6 @@ def test_cannot_trick_row_policy_with_keyword_with(): assert node.query("WITH 0 AS a SELECT b FROM mydb.filtered_table1") == TSV([[0], [1]]) -def test_prewhere_not_supported(): - expected_error = "PREWHERE is not supported if the table is filtered by row-level security" - assert expected_error in node.query_and_get_error("SELECT * FROM mydb.filtered_table1 PREWHERE 1") - assert expected_error in node.query_and_get_error("SELECT * FROM mydb.filtered_table2 PREWHERE 1") - assert expected_error in node.query_and_get_error("SELECT * FROM mydb.filtered_table3 PREWHERE 1") - - # However PREWHERE should still work for user without filtering. - assert node.query("SELECT * FROM mydb.filtered_table1 PREWHERE 1", user="another") == TSV( - [[0, 0], [0, 1], [1, 0], [1, 1]]) - - def test_policy_from_users_xml_affects_only_user_assigned(): assert node.query("SELECT * FROM mydb.filtered_table1") == TSV([[1, 0], [1, 1]]) assert node.query("SELECT * FROM mydb.filtered_table1", user="another") == TSV([[0, 0], [0, 1], [1, 0], [1, 1]]) From 2e8b45afc17d4bce41f6d5517a49842a15d69720 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Mon, 1 Feb 2021 16:35:08 +0300 Subject: [PATCH 0116/2357] fix ubsan report --- .../Merges/Algorithms/FinishAggregatingInOrderAlgorithm.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.cpp b/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.cpp index 0e4de315aa1..3ef0caefd8f 100644 --- a/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.cpp +++ b/src/Processors/Merges/Algorithms/FinishAggregatingInOrderAlgorithm.cpp @@ -12,6 +12,9 @@ FinishAggregatingInOrderAlgorithm::State::State( : num_rows(chunk.getNumRows()) , all_columns(chunk.getColumns()) { + if (!chunk) + return; + sorting_columns.reserve(desc.size()); for (const auto & column_desc : desc) sorting_columns.emplace_back(all_columns[column_desc.column_number].get()); From 7d9eb966f0833a9663fa64d4f7545c787ae49a93 Mon Sep 17 00:00:00 2001 From: Pavel Kruglov Date: Mon, 1 Feb 2021 20:09:55 +0300 Subject: [PATCH 0117/2357] Fix --- src/Client/GetHedgedConnections.cpp | 4 ++-- src/Client/GetHedgedConnections.h | 2 +- src/Client/HedgedConnections.cpp | 5 ++++- src/Common/TimerDescriptor.h | 2 +- tests/integration/test_hedged_requests/test.py | 8 +++++--- 5 files changed, 13 insertions(+), 8 deletions(-) diff --git a/src/Client/GetHedgedConnections.cpp b/src/Client/GetHedgedConnections.cpp index 546068ca4ee..e8e087c4b0e 100644 --- a/src/Client/GetHedgedConnections.cpp +++ b/src/Client/GetHedgedConnections.cpp @@ -343,7 +343,7 @@ GetHedgedConnections::ReplicaStatePtr GetHedgedConnections::processEpollEvents(b else if (timeout_fd_to_replica.find(event_fd) != timeout_fd_to_replica.end()) { replica = timeout_fd_to_replica[event_fd]; - finish = processTimeoutEvent(replica, replica->active_timeouts[event_fd].get(), non_blocking); + finish = processTimeoutEvent(replica, 
replica->active_timeouts[event_fd], non_blocking); } else throw Exception("Unknown event from epoll", ErrorCodes::LOGICAL_ERROR); @@ -476,7 +476,7 @@ void addTimeoutToReplica( throw Exception("Unknown timeout type", ErrorCodes::BAD_ARGUMENTS); } - std::unique_ptr timeout_descriptor = std::make_unique(); + TimerDescriptorPtr timeout_descriptor = std::make_shared(); timeout_descriptor->setType(type); timeout_descriptor->setRelative(timeout); epoll.add(timeout_descriptor->getDescriptor()); diff --git a/src/Client/GetHedgedConnections.h b/src/Client/GetHedgedConnections.h index 88daad779fe..3ae9aaf9c72 100644 --- a/src/Client/GetHedgedConnections.h +++ b/src/Client/GetHedgedConnections.h @@ -34,7 +34,7 @@ public: int index = -1; int fd = -1; size_t parallel_replica_offset = 0; - std::unordered_map> active_timeouts; + std::unordered_map> active_timeouts; void reset() { diff --git a/src/Client/HedgedConnections.cpp b/src/Client/HedgedConnections.cpp index 4282e6b8e21..16ba19ebe78 100644 --- a/src/Client/HedgedConnections.cpp +++ b/src/Client/HedgedConnections.cpp @@ -294,7 +294,7 @@ Packet HedgedConnections::receivePacketImpl(AsyncCallback async_callback) { LOG_DEBUG(log, "event is timeout"); replica = timeout_fd_to_replica[event_fd]; - processTimeoutEvent(replica, replica->active_timeouts[event_fd].get()); + processTimeoutEvent(replica, replica->active_timeouts[event_fd]); } else if (event_fd == get_hedged_connections.getFileDescriptor()) tryGetNewReplica(); @@ -375,12 +375,14 @@ void HedgedConnections::processReceiveData(ReplicaStatePtr & replica) void HedgedConnections::processTimeoutEvent(ReplicaStatePtr & replica, TimerDescriptorPtr timeout_descriptor) { + LOG_DEBUG(log, "processTimeoutEvent"); epoll.remove(timeout_descriptor->getDescriptor()); replica->active_timeouts.erase(timeout_descriptor->getDescriptor()); timeout_fd_to_replica.erase(timeout_descriptor->getDescriptor()); if (timeout_descriptor->getType() == TimerTypes::RECEIVE_TIMEOUT) { + LOG_DEBUG(log, "process RECEIVE_TIMEOUT"); size_t offset = replica->parallel_replica_offset; finishProcessReplica(replica, true); @@ -390,6 +392,7 @@ void HedgedConnections::processTimeoutEvent(ReplicaStatePtr & replica, TimerDesc } else if (timeout_descriptor->getType() == TimerTypes::RECEIVE_DATA_TIMEOUT) { + LOG_DEBUG(log, "process RECEIVE_DATA_TIMEOUT"); offsets_queue.push(replica->parallel_replica_offset); tryGetNewReplica(); } diff --git a/src/Common/TimerDescriptor.h b/src/Common/TimerDescriptor.h index fa49189abfc..6138ed8f395 100644 --- a/src/Common/TimerDescriptor.h +++ b/src/Common/TimerDescriptor.h @@ -39,7 +39,7 @@ public: void setType(int type_) { type = type_; } }; -using TimerDescriptorPtr = TimerDescriptor *; +using TimerDescriptorPtr = std::shared_ptr; } #endif diff --git a/tests/integration/test_hedged_requests/test.py b/tests/integration/test_hedged_requests/test.py index 719477d9c7f..00d28ac62eb 100644 --- a/tests/integration/test_hedged_requests/test.py +++ b/tests/integration/test_hedged_requests/test.py @@ -17,7 +17,7 @@ node = cluster.add_instance( node_1 = cluster.add_instance('node_1', with_zookeeper=True, stay_alive=True, user_configs=['configs/users1.xml']) node_2 = cluster.add_instance('node_2', with_zookeeper=True) -sleep_timeout = 5 +sleep_timeout = 30 receive_timeout = 1 config = ''' @@ -62,12 +62,14 @@ def process_test(sleep_setting_name, receive_timeout_name): start = time.time() node.query("SELECT * FROM distributed"); query_time = time.time() - start + + print(query_time) # Check that query time is not long - 
assert query_time < sleep_timeout + # assert query_time < sleep_timeout -def test_change_replica_on_receive_hello(started_cluster): +def test(started_cluster): node.query("INSERT INTO distributed VALUES (1, '2020-01-01')") process_test("sleep_before_send_hello", "receive_hello_timeout") From f5ad1281f75cb7b5ba65a8ad1158ad57c135011c Mon Sep 17 00:00:00 2001 From: Pavel Kruglov Date: Mon, 1 Feb 2021 20:14:53 +0300 Subject: [PATCH 0118/2357] Fix style --- src/Client/Connection.cpp | 4 ++-- src/Client/GetHedgedConnections.cpp | 13 +++++++++---- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/src/Client/Connection.cpp b/src/Client/Connection.cpp index 42d9c86739e..1593933f8f7 100644 --- a/src/Client/Connection.cpp +++ b/src/Client/Connection.cpp @@ -92,8 +92,8 @@ void Connection::connect(const ConnectionTimeouts & timeouts) void Connection::disconnect() { - LOG_DEBUG(log_wrapper.get(), "disconnect"); - + LOG_DEBUG(log_wrapper.get(), "disconnect"); + maybe_compressed_out = nullptr; in = nullptr; last_input_packet_type.reset(); diff --git a/src/Client/GetHedgedConnections.cpp b/src/Client/GetHedgedConnections.cpp index e8e087c4b0e..6b046bfcec0 100644 --- a/src/Client/GetHedgedConnections.cpp +++ b/src/Client/GetHedgedConnections.cpp @@ -426,7 +426,8 @@ void GetHedgedConnections::setBestUsableReplica(ReplicaStatePtr & replica) std::remove_if( indexes.begin(), indexes.end(), - [&](int i) { + [&](int i) + { return try_get_connections[i].result.entry.isNull() || !try_get_connections[i].result.is_usable || indexes_in_process.find(i) != indexes_in_process.end() || ready_indexes.find(i) != ready_indexes.end(); }), @@ -439,9 +440,13 @@ void GetHedgedConnections::setBestUsableReplica(ReplicaStatePtr & replica) } /// Sort replicas by staleness - std::stable_sort(indexes.begin(), indexes.end(), [&](size_t lhs, size_t rhs) { - return try_get_connections[lhs].result.staleness < try_get_connections[rhs].result.staleness; - }); + std::stable_sort( + indexes.begin(), + indexes.end(), + [&](size_t lhs, size_t rhs) + { + return try_get_connections[lhs].result.staleness < try_get_connections[rhs].result.staleness; + }); replica->index = indexes[0]; replica->connection = &*try_get_connections[indexes[0]].result.entry; From 5b16a54233dc51904016131db40f5f316cfc6266 Mon Sep 17 00:00:00 2001 From: Pavel Kruglov Date: Mon, 1 Feb 2021 20:23:46 +0300 Subject: [PATCH 0119/2357] Fix synchronization --- src/DataStreams/RemoteQueryExecutor.cpp | 34 ++++++++++--------------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/src/DataStreams/RemoteQueryExecutor.cpp b/src/DataStreams/RemoteQueryExecutor.cpp index 52a7a3e0a78..ffd532c8baf 100644 --- a/src/DataStreams/RemoteQueryExecutor.cpp +++ b/src/DataStreams/RemoteQueryExecutor.cpp @@ -63,13 +63,8 @@ RemoteQueryExecutor::RemoteQueryExecutor( const Settings & current_settings = context.getSettingsRef(); auto timeouts = ConnectionTimeouts::getTCPTimeoutsWithFailover(current_settings); - bool use_hedged_requests = current_settings.use_hedged_requests; - -#if !defined(OS_LINUX) - use_hedged_requests = false; -#endif - - if (use_hedged_requests) +#if defined(OS_LINUX) + if (current_settings.use_hedged_requests) { std::shared_ptr table_to_check = nullptr; if (main_table) @@ -77,21 +72,20 @@ RemoteQueryExecutor::RemoteQueryExecutor( return std::make_unique(pool, current_settings, timeouts, throttler, pool_mode, table_to_check); } - else - { - std::vector connection_entries; - if (main_table) - { - auto try_results = pool->getManyChecked(timeouts, 
¤t_settings, pool_mode, main_table.getQualifiedName()); - connection_entries.reserve(try_results.size()); - for (auto & try_result : try_results) - connection_entries.emplace_back(std::move(try_result.entry)); - } - else - connection_entries = pool->getMany(timeouts, ¤t_settings, pool_mode); +#endif - return std::make_unique(std::move(connection_entries), current_settings, throttler); + std::vector connection_entries; + if (main_table) + { + auto try_results = pool->getManyChecked(timeouts, ¤t_settings, pool_mode, main_table.getQualifiedName()); + connection_entries.reserve(try_results.size()); + for (auto & try_result : try_results) + connection_entries.emplace_back(std::move(try_result.entry)); } + else + connection_entries = pool->getMany(timeouts, ¤t_settings, pool_mode); + + return std::make_unique(std::move(connection_entries), current_settings, throttler); }; } From a937bf26a137544e8c6bfcbce4077c999af0a0ef Mon Sep 17 00:00:00 2001 From: romanzhukov Date: Mon, 1 Feb 2021 21:11:47 +0300 Subject: [PATCH 0120/2357] DOCSUP-5266: Fix ticket comments. --- .../data-types/simpleaggregatefunction.md | 2 +- .../functions/date-time-functions.md | 7 ++-- .../operations/utilities/clickhouse-local.md | 2 +- .../data-types/simpleaggregatefunction.md | 2 +- .../functions/date-time-functions.md | 35 ++++++++++++++----- 5 files changed, 33 insertions(+), 15 deletions(-) diff --git a/docs/en/sql-reference/data-types/simpleaggregatefunction.md b/docs/en/sql-reference/data-types/simpleaggregatefunction.md index 015972d7dbe..155a7e1f858 100644 --- a/docs/en/sql-reference/data-types/simpleaggregatefunction.md +++ b/docs/en/sql-reference/data-types/simpleaggregatefunction.md @@ -32,7 +32,7 @@ The following aggregate functions are supported: - Name of the aggregate function. - Types of the aggregate function arguments. -**Example** +**Syntax** ``` sql CREATE TABLE t diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index 624e04ca21c..c995ce32cd4 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -663,10 +663,9 @@ Result: ## FROM\_UNIXTIME {#fromunixfime} -When there is only a single argument of integer type, it acts in the same way as `toDateTime` and return [DateTime](../../sql-reference/data-types/datetime.md). -type. +Function converts Unix timestamp to date. When there is only a single argument of integer type, it acts in the same way as [toDateTime](../../sql-reference/functions/type-conversion-functions.md#todatetime) and return [DateTime](../../sql-reference/data-types/datetime.md) type. -For example: +**Example:** Query: @@ -682,7 +681,7 @@ Result: └──────────────────────────┘ ``` -When there are two arguments: first is an integer or DateTime, second is a constant format string — it acts in the same way as `formatDateTime` and return `String` type. +When there are two arguments: first is an [Integer](../../sql-reference/data-types/int-uint.md/#int-ranges) or [DateTime](../../sql-reference/data-types/datetime.md), second is a constant format string — it acts in the same way as [formatDateTime](#formatdatetime) and return [String](../../sql-reference/data-types/string.md#string) type. 
For example: diff --git a/docs/ru/operations/utilities/clickhouse-local.md b/docs/ru/operations/utilities/clickhouse-local.md index f439049401c..8ecbbfcce8c 100644 --- a/docs/ru/operations/utilities/clickhouse-local.md +++ b/docs/ru/operations/utilities/clickhouse-local.md @@ -88,7 +88,7 @@ $ ps aux | tail -n +2 | awk '{ printf("%s\t%s\n", $1, $4) }' \ FROM table GROUP BY user ORDER BY memTotal DESC FORMAT Pretty" ``` -Ответ: +Результат: ``` text Read 186 rows, 4.15 KiB in 0.035 sec., 5302 rows/sec., 118.34 KiB/sec. diff --git a/docs/ru/sql-reference/data-types/simpleaggregatefunction.md b/docs/ru/sql-reference/data-types/simpleaggregatefunction.md index 10daad93cc6..9605706442e 100644 --- a/docs/ru/sql-reference/data-types/simpleaggregatefunction.md +++ b/docs/ru/sql-reference/data-types/simpleaggregatefunction.md @@ -31,7 +31,7 @@ - `func` — имя агрегатной функции. - `type` — типы аргументов агрегатной функции. -**Пример** +**Синтаксис** ``` sql CREATE TABLE t diff --git a/docs/ru/sql-reference/functions/date-time-functions.md b/docs/ru/sql-reference/functions/date-time-functions.md index 4db244d2388..bc35589363f 100644 --- a/docs/ru/sql-reference/functions/date-time-functions.md +++ b/docs/ru/sql-reference/functions/date-time-functions.md @@ -305,7 +305,9 @@ WITH toDateTime64('2020-01-01 10:20:30.999', 3) AS dt64 SELECT toStartOfSecond(d Переводит дату-с-временем или дату в число типа UInt16, содержащее номер ISO года. ISO год отличается от обычного года, потому что в соответствии с [ISO 8601:1988](https://en.wikipedia.org/wiki/ISO_8601) ISO год начинается необязательно первого января. -Пример: +**Пример:** + +Запрос: ```sql SELECT @@ -313,6 +315,9 @@ SELECT toYear(date), toISOYear(date) ``` + +Результат: + ```text ┌───────date─┬─toYear(toDate('2017-01-01'))─┬─toISOYear(toDate('2017-01-01'))─┐ │ 2017-01-01 │ 2017 │ 2016 │ @@ -326,12 +331,18 @@ SELECT 1 Января 2017 г. - воскресение, т.е. первая ISO неделя 2017 года началась в понедельник 2 января, поэтому 1 января 2017 это последняя неделя 2016 года. 
+**Пример** + +Запрос: + ```sql SELECT toISOWeek(toDate('2017-01-01')) AS ISOWeek20170101, toISOWeek(toDate('2017-01-02')) AS ISOWeek20170102 ``` +Результат: + ```text ┌─ISOWeek20170101─┬─ISOWeek20170102─┐ │ 52 │ 1 │ @@ -368,10 +379,14 @@ SELECT **Пример** +Запрос: + ```sql SELECT toDate('2016-12-27') AS date, toWeek(date) AS week0, toWeek(date,1) AS week1, toWeek(date,9) AS week9; ``` +Результат: + ```text ┌───────date─┬─week0─┬─week1─┬─week9─┐ │ 2016-12-27 │ 52 │ 52 │ 1 │ @@ -387,10 +402,14 @@ SELECT toDate('2016-12-27') AS date, toWeek(date) AS week0, toWeek(date,1) AS we **Пример** +Запрос: + ```sql SELECT toDate('2016-12-27') AS date, toYearWeek(date) AS yearWeek0, toYearWeek(date,1) AS yearWeek1, toYearWeek(date,9) AS yearWeek9; ``` +Результат: + ```text ┌───────date─┬─yearWeek0─┬─yearWeek1─┬─yearWeek9─┐ │ 2016-12-27 │ 201652 │ 201652 │ 201701 │ @@ -573,7 +592,7 @@ dateDiff('unit', startdate, enddate, [timezone]) SELECT dateDiff('hour', toDateTime('2018-01-01 22:00:00'), toDateTime('2018-01-02 23:00:00')); ``` -Ответ: +Результат: ``` text ┌─dateDiff('hour', toDateTime('2018-01-01 22:00:00'), toDateTime('2018-01-02 23:00:00'))─┐ @@ -654,10 +673,10 @@ formatDateTime(Time, Format\[, Timezone\]) Запрос: ``` sql -SELECT formatDateTime(toDate('2010-01-04'), '%g') +SELECT formatDateTime(toDate('2010-01-04'), '%g'); ``` -Ответ: +Результат: ``` ┌─formatDateTime(toDate('2010-01-04'), '%g')─┐ @@ -667,7 +686,7 @@ SELECT formatDateTime(toDate('2010-01-04'), '%g') ## FROM\_UNIXTIME {#fromunixtime} -Когда указан только один аргумент целочисленного типа, то функция действует так же, как `toDateTime`, и возвращает тип [DateTime](../../sql-reference/data-types/datetime.md). +Функция преобразует метку времени Unix в дату. Если указан только один аргумент типа [Integer](../../sql-reference/data-types/int-uint.md/#int-ranges), то функция действует так же, как [toDateTime](../../sql-reference/functions/type-conversion-functions.md#todatetime), и возвращает тип [DateTime](../../sql-reference/data-types/datetime.md). **Пример** @@ -677,7 +696,7 @@ SELECT formatDateTime(toDate('2010-01-04'), '%g') SELECT FROM_UNIXTIME(423543535); ``` -Ответ: +Результат: ```text ┌─FROM_UNIXTIME(423543535)─┐ @@ -685,7 +704,7 @@ SELECT FROM_UNIXTIME(423543535); └──────────────────────────┘ ``` -В случае, когда есть два аргумента: первый типа `Integer` или `DateTime`, а второй является строкой постоянного формата — функция работает таким же образом, как `formatDateTime`, и возвращает значение типа `String`. +В случае, когда есть два аргумента: первый типа [Integer](../../sql-reference/data-types/int-uint.md/#int-ranges) или [DateTime](../../sql-reference/data-types/datetime.md), а второй является строкой постоянного формата — функция работает таким же образом, как [formatDateTime](#formatdatetime), и возвращает значение типа [String](../../sql-reference/data-types/string.md#string). **Пример** @@ -695,7 +714,7 @@ SELECT FROM_UNIXTIME(423543535); SELECT FROM_UNIXTIME(1234334543, '%Y-%m-%d %R:%S') AS DateTime; ``` -Ответ: +Результат: ```text ┌─DateTime────────────┐ From f58ae0ffa15f53c0249ba9c349977475d79f8433 Mon Sep 17 00:00:00 2001 From: romanzhukov Date: Mon, 1 Feb 2021 21:27:13 +0300 Subject: [PATCH 0121/2357] DOCSUP-5266: Fix ticket comments. 
--- docs/en/sql-reference/functions/date-time-functions.md | 2 +- docs/ru/sql-reference/functions/date-time-functions.md | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index c995ce32cd4..0ac1d325fbc 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -681,7 +681,7 @@ Result: └──────────────────────────┘ ``` -When there are two arguments: first is an [Integer](../../sql-reference/data-types/int-uint.md/#int-ranges) or [DateTime](../../sql-reference/data-types/datetime.md), second is a constant format string — it acts in the same way as [formatDateTime](#formatdatetime) and return [String](../../sql-reference/data-types/string.md#string) type. +When there are two arguments: first is an [Integer](../../sql-reference/data-types/int-uint.md#int-ranges) or [DateTime](../../sql-reference/data-types/datetime.md), second is a constant format string — it acts in the same way as [formatDateTime](#formatdatetime) and return [String](../../sql-reference/data-types/string.md#string) type. For example: diff --git a/docs/ru/sql-reference/functions/date-time-functions.md b/docs/ru/sql-reference/functions/date-time-functions.md index bc35589363f..a822c4f9778 100644 --- a/docs/ru/sql-reference/functions/date-time-functions.md +++ b/docs/ru/sql-reference/functions/date-time-functions.md @@ -686,7 +686,7 @@ SELECT formatDateTime(toDate('2010-01-04'), '%g'); ## FROM\_UNIXTIME {#fromunixtime} -Функция преобразует метку времени Unix в дату. Если указан только один аргумент типа [Integer](../../sql-reference/data-types/int-uint.md/#int-ranges), то функция действует так же, как [toDateTime](../../sql-reference/functions/type-conversion-functions.md#todatetime), и возвращает тип [DateTime](../../sql-reference/data-types/datetime.md). +Функция преобразует метку времени Unix в дату. Если указан только один аргумент типа [Integer](../../sql-reference/data-types/int-uint.md#int-ranges), то функция действует так же, как [toDateTime](../../sql-reference/functions/type-conversion-functions.md#todatetime), и возвращает тип [DateTime](../../sql-reference/data-types/datetime.md). **Пример** @@ -704,7 +704,7 @@ SELECT FROM_UNIXTIME(423543535); └──────────────────────────┘ ``` -В случае, когда есть два аргумента: первый типа [Integer](../../sql-reference/data-types/int-uint.md/#int-ranges) или [DateTime](../../sql-reference/data-types/datetime.md), а второй является строкой постоянного формата — функция работает таким же образом, как [formatDateTime](#formatdatetime), и возвращает значение типа [String](../../sql-reference/data-types/string.md#string). +В случае, когда есть два аргумента: первый типа [Integer](../../sql-reference/data-types/int-uint.md#int-ranges) или [DateTime](../../sql-reference/data-types/datetime.md), а второй является строкой постоянного формата — функция работает таким же образом, как [formatDateTime](#formatdatetime), и возвращает значение типа [String](../../sql-reference/data-types/string.md#string). **Пример** From 23914860b07ea5d4ebfe7b639fff5999c78afd3c Mon Sep 17 00:00:00 2001 From: romanzhukov Date: Mon, 1 Feb 2021 21:43:38 +0300 Subject: [PATCH 0122/2357] DOCSUP-5266: Fix ticket comments. 
--- docs/en/sql-reference/functions/date-time-functions.md | 4 ++-- docs/ru/sql-reference/functions/date-time-functions.md | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index 0ac1d325fbc..ce2092a7818 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -663,7 +663,7 @@ Result: ## FROM\_UNIXTIME {#fromunixfime} -Function converts Unix timestamp to date. When there is only a single argument of integer type, it acts in the same way as [toDateTime](../../sql-reference/functions/type-conversion-functions.md#todatetime) and return [DateTime](../../sql-reference/data-types/datetime.md) type. +Function converts Unix timestamp to date. When there is only a single argument of [Integer](../../sql-reference/data-types/int-uint.md) type, it acts in the same way as [toDateTime](../../sql-reference/functions/type-conversion-functions.md#todatetime) and return [DateTime](../../sql-reference/data-types/datetime.md) type. **Example:** @@ -681,7 +681,7 @@ Result: └──────────────────────────┘ ``` -When there are two arguments: first is an [Integer](../../sql-reference/data-types/int-uint.md#int-ranges) or [DateTime](../../sql-reference/data-types/datetime.md), second is a constant format string — it acts in the same way as [formatDateTime](#formatdatetime) and return [String](../../sql-reference/data-types/string.md#string) type. +When there are two arguments: first is an [Integer](../../sql-reference/data-types/int-uint.md) or [DateTime](../../sql-reference/data-types/datetime.md), second is a constant format string — it acts in the same way as [formatDateTime](#formatdatetime) and return [String](../../sql-reference/data-types/string.md#string) type. For example: diff --git a/docs/ru/sql-reference/functions/date-time-functions.md b/docs/ru/sql-reference/functions/date-time-functions.md index a822c4f9778..b23862ccce2 100644 --- a/docs/ru/sql-reference/functions/date-time-functions.md +++ b/docs/ru/sql-reference/functions/date-time-functions.md @@ -686,7 +686,7 @@ SELECT formatDateTime(toDate('2010-01-04'), '%g'); ## FROM\_UNIXTIME {#fromunixtime} -Функция преобразует метку времени Unix в дату. Если указан только один аргумент типа [Integer](../../sql-reference/data-types/int-uint.md#int-ranges), то функция действует так же, как [toDateTime](../../sql-reference/functions/type-conversion-functions.md#todatetime), и возвращает тип [DateTime](../../sql-reference/data-types/datetime.md). +Функция преобразует метку времени Unix в дату. Если указан только один аргумент типа [Integer](../../sql-reference/data-types/int-uint.md), то функция действует так же, как [toDateTime](../../sql-reference/functions/type-conversion-functions.md#todatetime), и возвращает тип [DateTime](../../sql-reference/data-types/datetime.md). **Пример** @@ -704,7 +704,7 @@ SELECT FROM_UNIXTIME(423543535); └──────────────────────────┘ ``` -В случае, когда есть два аргумента: первый типа [Integer](../../sql-reference/data-types/int-uint.md#int-ranges) или [DateTime](../../sql-reference/data-types/datetime.md), а второй является строкой постоянного формата — функция работает таким же образом, как [formatDateTime](#formatdatetime), и возвращает значение типа [String](../../sql-reference/data-types/string.md#string). 
+В случае, когда есть два аргумента: первый типа [Integer](../../sql-reference/data-types/int-uint.md) или [DateTime](../../sql-reference/data-types/datetime.md), а второй является строкой постоянного формата — функция работает таким же образом, как [formatDateTime](#formatdatetime), и возвращает значение типа [String](../../sql-reference/data-types/string.md#string). **Пример** From 9da445e740b45481da042d6e0264cdbe70245443 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Mon, 1 Feb 2021 22:29:47 +0300 Subject: [PATCH 0123/2357] execute initial query in the same thread --- src/Databases/DatabaseReplicated.cpp | 12 ++-- src/Databases/DatabaseReplicatedWorker.cpp | 68 ++++++++++++++++++--- src/Databases/DatabaseReplicatedWorker.h | 7 ++- src/Interpreters/DDLTask.cpp | 4 +- src/Interpreters/DDLTask.h | 2 +- src/Interpreters/DDLWorker.cpp | 22 ++++++- src/Interpreters/InterpreterAlterQuery.cpp | 3 + src/Interpreters/InterpreterCreateQuery.cpp | 7 ++- src/Interpreters/InterpreterDropQuery.cpp | 33 ++++++---- src/Interpreters/InterpreterRenameQuery.cpp | 5 +- src/Interpreters/InterpreterRenameQuery.h | 3 + tests/clickhouse-test | 4 +- 12 files changed, 128 insertions(+), 42 deletions(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 6f244ed7ec9..44746cd5716 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -42,9 +42,9 @@ zkutil::ZooKeeperPtr DatabaseReplicated::getZooKeeper() const return global_context.getZooKeeper(); } -static inline String getHostID(const Context & global_context) +static inline String getHostID(const Context & global_context, const UUID & db_uuid) { - return Cluster::Address::toString(getFQDNOrHostName(), global_context.getTCPPort()); + return Cluster::Address::toString(getFQDNOrHostName(), global_context.getTCPPort()) + ':' + toString(db_uuid); } @@ -94,7 +94,7 @@ DatabaseReplicated::DatabaseReplicated( String replica_host_id; if (current_zookeeper->tryGet(replica_path, replica_host_id)) { - String host_id = getHostID(global_context); + String host_id = getHostID(global_context, db_uuid); if (replica_host_id != host_id) throw Exception(ErrorCodes::REPLICA_IS_ALREADY_EXIST, "Replica {} of shard {} of replicated database at {} already exists. Replica host ID: '{}', current host ID: '{}'", @@ -144,7 +144,7 @@ void DatabaseReplicated::createReplicaNodesInZooKeeper(const zkutil::ZooKeeperPt //log_entry_to_execute = 0; //FIXME /// Write host name to replica_path, it will protect from multiple replicas with the same name - auto host_id = getHostID(global_context); + auto host_id = getHostID(global_context, db_uuid); /// On replica creation add empty entry to log. Can be used to trigger some actions on other replicas (e.g. update cluster info). DDLLogEntry entry; @@ -221,11 +221,11 @@ BlockIO DatabaseReplicated::propose(const ASTPtr & query) LOG_DEBUG(log, "Proposing query: {}", queryToString(query)); + /// TODO maybe write current settings to log entry? 
DDLLogEntry entry; - entry.hosts = {}; entry.query = queryToString(query); entry.initiator = ddl_worker->getCommonHostID(); - String node_path = ddl_worker->enqueueQuery(entry); + String node_path = ddl_worker->tryEnqueueAndExecuteEntry(entry); BlockIO io; //FIXME use query context diff --git a/src/Databases/DatabaseReplicatedWorker.cpp b/src/Databases/DatabaseReplicatedWorker.cpp index 0c2368cdcf6..a1cdff204c7 100644 --- a/src/Databases/DatabaseReplicatedWorker.cpp +++ b/src/Databases/DatabaseReplicatedWorker.cpp @@ -8,13 +8,16 @@ namespace DB namespace ErrorCodes { extern const int LOGICAL_ERROR; + extern const int DATABASE_REPLICATION_FAILED; } DatabaseReplicatedDDLWorker::DatabaseReplicatedDDLWorker(DatabaseReplicated * db, const Context & context_) : DDLWorker(/* pool_size */ 1, db->zookeeper_path + "/log", context_, nullptr, {}, fmt::format("DDLWorker({})", db->getDatabaseName())) , database(db) { - /// Pool size must be 1 (to avoid reordering of log entries) + /// Pool size must be 1 to avoid reordering of log entries. + /// TODO Make a dependency graph of DDL queries. It will allow to execute independent entries in parallel. + /// We also need similar graph to load tables on server startup in order of topsort. } void DatabaseReplicatedDDLWorker::initializeMainThread() @@ -72,8 +75,51 @@ String DatabaseReplicatedDDLWorker::enqueueQuery(DDLLogEntry & entry) return node_path; } +String DatabaseReplicatedDDLWorker::tryEnqueueAndExecuteEntry(DDLLogEntry & entry) +{ + auto zookeeper = getAndSetZooKeeper(); + // TODO do not enqueue query if we have big replication lag + + String entry_path = enqueueQuery(entry); + auto try_node = zkutil::EphemeralNodeHolder::existing(entry_path + "/try", *zookeeper); + String entry_name = entry_path.substr(entry_path.rfind('/') + 1); + auto task = std::make_unique(entry_name, entry_path, database); + task->entry = entry; + task->parseQueryFromEntry(context); + assert(!task->entry.query.empty()); + assert(!zookeeper->exists(task->getFinishedNodePath())); + task->is_initial_query = true; + + LOG_DEBUG(log, "Waiting for worker thread to process all entries before {}", entry_name); + { + std::unique_lock lock{mutex}; + wait_current_task_change.wait(lock, [&]() { assert(current_task <= entry_name); return zookeeper->expired() || current_task == entry_name; }); + } + + if (zookeeper->expired()) + throw Exception(ErrorCodes::DATABASE_REPLICATION_FAILED, "ZooKeeper session expired, try again"); + + processTask(*task); + + if (!task->was_executed) + { + throw Exception(ErrorCodes::LOGICAL_ERROR, "Entry {} was executed, but was not committed: code {}: {}", + task->execution_status.code, task->execution_status.message); + } + + try_node->reset(); + + return entry_path; +} + DDLTaskPtr DatabaseReplicatedDDLWorker::initAndCheckTask(const String & entry_name, String & out_reason, const ZooKeeperPtr & zookeeper) { + { + std::lock_guard lock{mutex}; + current_task = entry_name; + wait_current_task_change.notify_all(); + } + UInt32 our_log_ptr = parse(current_zookeeper->get(database->replica_path + "/log_ptr")); UInt32 entry_num = DatabaseReplicatedTask::getLogEntryNumber(entry_name); @@ -91,27 +137,31 @@ DDLTaskPtr DatabaseReplicatedDDLWorker::initAndCheckTask(const String & entry_na if (zookeeper->tryGet(entry_path + "/try", initiator_name, nullptr, wait_committed_or_failed)) { - task->we_are_initiator = initiator_name == task->host_id_str; + task->is_initial_query = initiator_name == task->host_id_str; /// Query is not committed yet. 
We cannot just skip it and execute next one, because reordering may break replication. //FIXME add some timeouts - if (!task->we_are_initiator) - { - LOG_TRACE(log, "Waiting for initiator {} to commit or rollback entry {}", initiator_name, entry_path); - wait_committed_or_failed->wait(); - } + LOG_TRACE(log, "Waiting for initiator {} to commit or rollback entry {}", initiator_name, entry_path); + wait_committed_or_failed->wait(); } - if (!task->we_are_initiator && !zookeeper->exists(entry_path + "/committed")) + if (!zookeeper->exists(entry_path + "/committed")) { out_reason = "Entry " + entry_name + " hasn't been committed"; return {}; } + if (task->is_initial_query) + { + assert(!zookeeper->exists(entry_path + "/try")); + assert(zookeeper->exists(entry_path + "/committed") == (zookeeper->get(task->getFinishedNodePath()) == "0")); + out_reason = "Entry " + entry_name + " has been executed as initial query"; + return {}; + } + String node_data; if (!zookeeper->tryGet(entry_path, node_data)) { LOG_ERROR(log, "Cannot get log entry {}", entry_path); - database->onUnexpectedLogEntry(entry_name, zookeeper); throw Exception(ErrorCodes::LOGICAL_ERROR, "should be unreachable"); } diff --git a/src/Databases/DatabaseReplicatedWorker.h b/src/Databases/DatabaseReplicatedWorker.h index 7994104331e..7e6d64dab0b 100644 --- a/src/Databases/DatabaseReplicatedWorker.h +++ b/src/Databases/DatabaseReplicatedWorker.h @@ -1,7 +1,6 @@ #pragma once #include - namespace DB { @@ -14,6 +13,8 @@ public: String enqueueQuery(DDLLogEntry & entry) override; + String tryEnqueueAndExecuteEntry(DDLLogEntry & entry); + private: void initializeMainThread() override; void initializeReplication(); @@ -21,7 +22,9 @@ private: DDLTaskPtr initAndCheckTask(const String & entry_name, String & out_reason, const ZooKeeperPtr & zookeeper) override; DatabaseReplicated * database; - + mutable std::mutex mutex; + std::condition_variable wait_current_task_change; + String current_task; }; } diff --git a/src/Interpreters/DDLTask.cpp b/src/Interpreters/DDLTask.cpp index fd2de014581..55e613648ae 100644 --- a/src/Interpreters/DDLTask.cpp +++ b/src/Interpreters/DDLTask.cpp @@ -303,9 +303,9 @@ std::unique_ptr DatabaseReplicatedTask::makeQueryContext(Context & from query_context->initMetadataTransaction(txn); txn->current_zookeeper = from_context.getZooKeeper(); txn->zookeeper_path = database->zookeeper_path; - txn->is_initial_query = we_are_initiator; + txn->is_initial_query = is_initial_query; - if (we_are_initiator) + if (is_initial_query) { txn->ops.emplace_back(zkutil::makeRemoveRequest(entry_path + "/try", -1)); txn->ops.emplace_back(zkutil::makeCreateRequest(entry_path + "/committed", host_id_str, zkutil::CreateMode::Persistent)); diff --git a/src/Interpreters/DDLTask.h b/src/Interpreters/DDLTask.h index 5b50413b975..49f6d74a931 100644 --- a/src/Interpreters/DDLTask.h +++ b/src/Interpreters/DDLTask.h @@ -77,6 +77,7 @@ struct DDLTaskBase String host_id_str; ASTPtr query; + bool is_initial_query = false; bool is_circular_replicated = false; bool execute_on_leader = false; @@ -136,7 +137,6 @@ struct DatabaseReplicatedTask : public DDLTaskBase static UInt32 getLogEntryNumber(const String & log_entry_name); DatabaseReplicated * database; - bool we_are_initiator = false; }; diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index 7b9d3ef8f5b..fabb9f9563e 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -51,6 +51,7 @@ namespace ErrorCodes extern const int CANNOT_ASSIGN_ALTER; extern 
const int CANNOT_ALLOCATE_MEMORY; extern const int MEMORY_LIMIT_EXCEEDED; + extern const int INCORRECT_QUERY; } @@ -398,8 +399,9 @@ bool DDLWorker::tryExecuteQuery(const String & query, DDLTaskBase & task) try { auto query_context = task.makeQueryContext(context); - query_scope.emplace(*query_context); - executeQuery(istr, ostr, false, *query_context, {}); + if (!task.is_initial_query) + query_scope.emplace(*query_context); + executeQuery(istr, ostr, !task.is_initial_query, *query_context, {}); if (auto txn = query_context->getMetadataTransaction()) { @@ -409,6 +411,9 @@ bool DDLWorker::tryExecuteQuery(const String & query, DDLTaskBase & task) } catch (const DB::Exception & e) { + if (task.is_initial_query) + throw; + task.execution_status = ExecutionStatus::fromCurrentException(); tryLogCurrentException(log, "Query " + query + " wasn't finished successfully"); @@ -426,6 +431,9 @@ bool DDLWorker::tryExecuteQuery(const String & query, DDLTaskBase & task) } catch (...) { + if (task.is_initial_query) + throw; + task.execution_status = ExecutionStatus::fromCurrentException(); tryLogCurrentException(log, "Query " + query + " wasn't finished successfully"); @@ -474,7 +482,10 @@ void DDLWorker::processTask(DDLTaskBase & task) { /// It's not CREATE DATABASE auto table_id = context.tryResolveStorageID(*query_with_table, Context::ResolveOrdinary); - storage = DatabaseCatalog::instance().tryGetTable(table_id, context); + DatabasePtr database; + std::tie(database, storage) = DatabaseCatalog::instance().tryGetDatabaseAndTable(table_id, context); + if (database && database->getEngineName() == "Replicated") + throw Exception(ErrorCodes::INCORRECT_QUERY, "ON CLUSTER queries are not allowed for Replicated databases"); } task.execute_on_leader = storage && taskShouldBeExecutedOnLeader(task.query, storage) && !task.is_circular_replicated; @@ -496,6 +507,8 @@ void DDLWorker::processTask(DDLTaskBase & task) } catch (...) { + if (task.is_initial_query) + throw; tryLogCurrentException(log, "An error occurred before execution of DDL task: "); task.execution_status = ExecutionStatus::fromCurrentException("An error occurred before execution"); } @@ -628,6 +641,9 @@ bool DDLWorker::tryExecuteQueryOnLeaderReplica( StorageReplicatedMergeTree::Status status; replicated_storage->getStatus(status); + if (task.is_initial_query && !status.is_leader) + throw Exception(ErrorCodes::NOT_A_LEADER, "Cannot execute initial query on non-leader replica"); + /// Any replica which is leader tries to take lock if (status.is_leader && lock->tryLock()) { diff --git a/src/Interpreters/InterpreterAlterQuery.cpp b/src/Interpreters/InterpreterAlterQuery.cpp index db380bca2b1..0edd1a401b3 100644 --- a/src/Interpreters/InterpreterAlterQuery.cpp +++ b/src/Interpreters/InterpreterAlterQuery.cpp @@ -53,7 +53,10 @@ BlockIO InterpreterAlterQuery::execute() DatabasePtr database = DatabaseCatalog::instance().getDatabase(table_id.database_name); if (typeid_cast(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) + { + alter_lock.reset(); return typeid_cast(database.get())->propose(query_ptr); + } //FIXME commit MetadataTransaction for all ALTER kinds. Now its' implemented only for metadata alter. 
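The hunk above is the core of this patch: for a Replicated database the initiator releases its local alter lock, forwards the query through propose(), and then executes the resulting log entry itself in the calling thread, blocking until the DDL worker reaches that entry. A minimal sketch of that wait-for-worker handshake, using a hypothetical helper class (the names below are illustrative, not the actual ClickHouse types), could look like this:

#include <condition_variable>
#include <mutex>
#include <string>

/// Sketch only: the worker publishes the name of the log entry it is
/// currently processing; the initiator enqueues its own entry and blocks
/// until the worker reaches it. The sketch assumes entry names compare
/// lexicographically in log order.
class ReplicatedLogWaiterSketch
{
public:
    /// Called from the worker thread for every entry, in log order.
    void setCurrentEntry(const std::string & entry_name)
    {
        {
            std::lock_guard<std::mutex> lock(mutex);
            current_entry = entry_name;
        }
        entry_changed.notify_all();
    }

    /// Called from the initiator thread right after it enqueued entry_name.
    void waitUntilWorkerReaches(const std::string & entry_name)
    {
        std::unique_lock<std::mutex> lock(mutex);
        entry_changed.wait(lock, [&] { return current_entry >= entry_name; });
    }

private:
    std::mutex mutex;
    std::condition_variable entry_changed;
    std::string current_entry;
};

In the patch itself the same idea appears in DatabaseReplicatedDDLWorker::tryEnqueueAndExecuteEntry as wait_current_task_change.wait(...) on current_task, with an extra check for an expired ZooKeeper session so the initiator does not block forever.
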
diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 926737ef888..d91f3140a96 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -572,6 +572,10 @@ InterpreterCreateQuery::TableProperties InterpreterCreateQuery::setProperties(AS validateTableStructure(create, properties); /// Set the table engine if it was not specified explicitly. setEngine(create); + + create.as_database.clear(); + create.as_table.clear(); + return properties; } @@ -835,7 +839,7 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create) /// Data path must be relative to root_path create.attach_from_path = fs::relative(data_path, root_path) / ""; } - else if (create.attach && !create.attach_short_syntax) + else if (create.attach && !create.attach_short_syntax && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { auto * log = &Poco::Logger::get("InterpreterCreateQuery"); LOG_WARNING(log, "ATTACH TABLE query with full table definition is not recommended: " @@ -881,6 +885,7 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create) if (typeid_cast(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { assertOrSetUUID(create, database); + guard.reset(); return typeid_cast(database.get())->propose(query_ptr); } } diff --git a/src/Interpreters/InterpreterDropQuery.cpp b/src/Interpreters/InterpreterDropQuery.cpp index ff7b6ef8387..eed7337b9ab 100644 --- a/src/Interpreters/InterpreterDropQuery.cpp +++ b/src/Interpreters/InterpreterDropQuery.cpp @@ -33,6 +33,7 @@ namespace ErrorCodes extern const int UNKNOWN_TABLE; extern const int UNKNOWN_DICTIONARY; extern const int NOT_IMPLEMENTED; + extern const int INCORRECT_QUERY; } @@ -119,12 +120,28 @@ BlockIO InterpreterDropQuery::executeToTableImpl(const ASTDropQuery & query, Dat if (database && table) { - if (query_ptr->as().is_view && !table->isView()) + if (query.as().is_view && !table->isView()) throw Exception("Table " + table_id.getNameForLogs() + " is not a View", ErrorCodes::LOGICAL_ERROR); /// Now get UUID, so we can wait for table data to be finally dropped table_id.uuid = database->tryGetTableUUID(table_id.table_name); + /// Prevents recursive drop from drop database query. The original query must specify a table. + bool is_drop_or_detach_database = query.table.empty(); + bool is_replicated_ddl_query = typeid_cast(database.get()) && + context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY && + !is_drop_or_detach_database; + if (is_replicated_ddl_query) + { + if (query.kind == ASTDropQuery::Kind::Detach && !query.permanently) + throw Exception(ErrorCodes::INCORRECT_QUERY, "DETACH TABLE is not allowed for Replicated databases. " + "Use DETACH TABLE PERMANENTLY or SYSTEM RESTART REPLICA"); + + ddl_guard.reset(); + table.reset(); + return typeid_cast(database.get())->propose(query.clone()); + } + if (query.kind == ASTDropQuery::Kind::Detach) { context.checkAccess(table->isView() ? 
AccessType::DROP_VIEW : AccessType::DROP_TABLE, table_id); @@ -135,9 +152,6 @@ BlockIO InterpreterDropQuery::executeToTableImpl(const ASTDropQuery & query, Dat if (database->getUUID() == UUIDHelpers::Nil) table_lock = table->lockExclusively(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); - if (typeid_cast(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) - return typeid_cast(database.get())->propose(query_ptr); - if (query.permanently) { /// Drop table from memory, don't touch data, metadata file renamed and will be skipped during server restart @@ -157,10 +171,7 @@ BlockIO InterpreterDropQuery::executeToTableImpl(const ASTDropQuery & query, Dat auto table_lock = table->lockExclusively(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); auto metadata_snapshot = table->getInMemoryMetadataPtr(); /// Drop table data, don't touch metadata - if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) - return typeid_cast(database.get())->propose(query_ptr); - else - table->truncate(query_ptr, metadata_snapshot, context, table_lock); + table->truncate(query_ptr, metadata_snapshot, context, table_lock); } else if (query.kind == ASTDropQuery::Kind::Drop) { @@ -173,11 +184,7 @@ BlockIO InterpreterDropQuery::executeToTableImpl(const ASTDropQuery & query, Dat if (database->getUUID() == UUIDHelpers::Nil) table_lock = table->lockExclusively(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); - /// Prevents recursive drop from drop database query. The original query must specify a table. - if (typeid_cast(database.get()) && !query_ptr->as().table.empty() && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) - return typeid_cast(database.get())->propose(query_ptr); - else - database->dropTable(context, table_id.table_name, query.no_delay); + database->dropTable(context, table_id.table_name, query.no_delay); } db = database; diff --git a/src/Interpreters/InterpreterRenameQuery.cpp b/src/Interpreters/InterpreterRenameQuery.cpp index a6075643a96..52faa89eff1 100644 --- a/src/Interpreters/InterpreterRenameQuery.cpp +++ b/src/Interpreters/InterpreterRenameQuery.cpp @@ -43,9 +43,6 @@ BlockIO InterpreterRenameQuery::execute() RenameDescriptions descriptions; descriptions.reserve(rename.elements.size()); - /// Don't allow to drop tables (that we are renaming); don't allow to create tables in places where tables will be renamed. - TableGuards table_guards; - for (const auto & elem : rename.elements) { descriptions.emplace_back(elem, current_database); @@ -85,6 +82,8 @@ BlockIO InterpreterRenameQuery::executeToTables(const ASTRenameQuery & rename, c if (1 < descriptions.size()) throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Database {} is Replicated, " "it does not support renaming of multiple tables in single query.", elem.from_database_name); + + table_guards.clear(); return typeid_cast(database.get())->propose(query_ptr); } else diff --git a/src/Interpreters/InterpreterRenameQuery.h b/src/Interpreters/InterpreterRenameQuery.h index 055c15181c1..2bc84514b4c 100644 --- a/src/Interpreters/InterpreterRenameQuery.h +++ b/src/Interpreters/InterpreterRenameQuery.h @@ -64,6 +64,9 @@ private: ASTPtr query_ptr; Context & context; + + /// Don't allow to drop tables (that we are renaming); don't allow to create tables in places where tables will be renamed. 
+ TableGuards table_guards; }; } diff --git a/tests/clickhouse-test b/tests/clickhouse-test index 13e7b4be001..3bfbd5d3e7f 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -186,9 +186,9 @@ def run_single_test(args, ext, server_logs_level, client_options, case_file, std total_time = (datetime.now() - start_time).total_seconds() + # Normalize randomized database names in stdout, stderr files. + os.system("LC_ALL=C sed -i -e 's/{test_db}/default/g' {file}".format(test_db=database, file=stdout_file)) if not args.show_db_name: - # Normalize randomized database names in stdout, stderr files. - os.system("LC_ALL=C sed -i -e 's/{test_db}/default/g' {file}".format(test_db=database, file=stdout_file)) os.system("LC_ALL=C sed -i -e 's/{test_db}/default/g' {file}".format(test_db=database, file=stderr_file)) stdout = open(stdout_file, 'rb').read() if os.path.exists(stdout_file) else b'' From 79f651f2b40379c0d515648b69875054831fe5dc Mon Sep 17 00:00:00 2001 From: romanzhukov Date: Mon, 1 Feb 2021 23:32:45 +0300 Subject: [PATCH 0124/2357] DOCSUP-5822: Add function documentation. --- .../functions/type-conversion-functions.md | 34 +++++++++++++++---- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 6237cd6a976..fdfc3c479ce 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -459,28 +459,48 @@ Code: 70. DB::Exception: Received from localhost:9000. DB::Exception: Value in c ## accurateCastOrNull(x, T) {#type_conversion_function-accurate-cast_or_null} -Converts ‘x’ to the ‘t’ data type. Always returns nullable type and returns NULL +Converts input value to the specified data type. Always returns nullable type and returns NULL if the casted value is not representable in the target type. -Example: +**Syntax** + +```sql +accurateCastOrNull(x, T) + +``` + +**Parameters** + +- `x` — Input value. +- `T` — Defines the data type of returned values. 
+ +**Example** + +Query: ``` sql SELECT - accurateCastOrNull(-1, 'UInt8') as uint8, - accurateCastOrNull(128, 'Int8') as int8, - accurateCastOrNull('Test', 'FixedString(2)') as fixed_string + cast(-1, 'UInt8') as uint8, + cast(128, 'Int8') as int8, + cast('Test', 'FixedString(2)') as fixed_string; ``` +Result: + ``` text ┌─uint8─┬─int8─┬─fixed_string─┐ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ -└───────┴──────┴──────────────┘┘ +└───────┴──────┴──────────────┘ ``` +Query: + ``` sql -SELECT toTypeName(accurateCastOrNull(5, 'UInt8')) +SELECT toTypeName(accurateCastOrNull(5, 'UInt8')); ``` +Result: + ``` text ┌─toTypeName(accurateCastOrNull(5, 'UInt8'))─┐ │ Nullable(UInt8) │ From 3e3ee19818ba6e0a6ab7d697f146a7ec539b9039 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Tue, 2 Feb 2021 00:10:55 +0300 Subject: [PATCH 0125/2357] Restart tests --- src/Client/HedgedConnections.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Client/HedgedConnections.cpp b/src/Client/HedgedConnections.cpp index 16ba19ebe78..52eb79e0372 100644 --- a/src/Client/HedgedConnections.cpp +++ b/src/Client/HedgedConnections.cpp @@ -317,6 +317,7 @@ int HedgedConnections::getReadyFileDescriptor(AsyncCallback async_callback) Packet HedgedConnections::receivePacketFromReplica(ReplicaStatePtr & replica, AsyncCallback async_callback) { LOG_DEBUG(log, "sreceivePacketFromReplica"); + Packet packet = replica->connection->receivePacket(std::move(async_callback)); switch (packet.type) { From f6de1291645909affe5b9b3dbb5e929e95f7c7ea Mon Sep 17 00:00:00 2001 From: romanzhukov Date: Tue, 2 Feb 2021 09:57:41 +0300 Subject: [PATCH 0126/2357] DOCSUP-5822: Add function documentation. --- .../functions/type-conversion-functions.md | 34 +++++++------ .../functions/type-conversion-functions.md | 48 +++++++++++++++++++ 2 files changed, 64 insertions(+), 18 deletions(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index fdfc3c479ce..86217871ca1 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -459,25 +459,37 @@ Code: 70. DB::Exception: Received from localhost:9000. DB::Exception: Value in c ## accurateCastOrNull(x, T) {#type_conversion_function-accurate-cast_or_null} -Converts input value to the specified data type. Always returns nullable type and returns NULL -if the casted value is not representable in the target type. +Converts input value `x` to the specified data type `T`. Always returns [Nullable](../../sql-reference/data-types/nullable.md) type and returns [NULL](../../sql-reference/syntax.md#null-literal) if the casted value is not representable in the target type. **Syntax** ```sql accurateCastOrNull(x, T) - ``` **Parameters** - `x` — Input value. -- `T` — Defines the data type of returned values. +- `T` — The name of the returned data type. 
**Example** Query: +Query: + +``` sql +SELECT toTypeName(accurateCastOrNull(5, 'UInt8')); +``` + +Result: + +``` text +┌─toTypeName(accurateCastOrNull(5, 'UInt8'))─┐ +│ Nullable(UInt8) │ +└────────────────────────────────────────────┘ +``` + ``` sql SELECT cast(-1, 'UInt8') as uint8, @@ -493,20 +505,6 @@ Result: └───────┴──────┴──────────────┘ ``` -Query: - -``` sql -SELECT toTypeName(accurateCastOrNull(5, 'UInt8')); -``` - -Result: - -``` text -┌─toTypeName(accurateCastOrNull(5, 'UInt8'))─┐ -│ Nullable(UInt8) │ -└────────────────────────────────────────────┘ -``` - ## toInterval(Year\|Quarter\|Month\|Week\|Day\|Hour\|Minute\|Second) {#function-tointerval} Converts a Number type argument to an [Interval](../../sql-reference/data-types/special-data-types/interval.md) data type. diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index 4a314bd22d8..40fdbc6f5a0 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -427,6 +427,54 @@ SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null - Настройка [cast_keep_nullable](../../operations/settings/settings.md#cast_keep_nullable) +## accurateCastOrNull(x, T) {#type_conversion_function-accurate-cast_or_null} + +Преобразует входное значение `x` в указанный тип данных `T`. Всегда возвращает тип [Nullable](../../sql-reference/data-types/nullable.md), и возвращает [NULL](../../sql-reference/syntax.md#null-literal), если приведенное значение не может быть представлено в целевом типе. + +**Синтаксис** + +```sql +accurateCastOrNull(x, T) +``` + +**Parameters** + +- `x` — входное значение. +- `T` — имя возвращаемого типа данных. + +**Пример** + +Запрос: + +``` sql +SELECT toTypeName(accurateCastOrNull(5, 'UInt8')); +``` + +Результат: + +``` text +┌─toTypeName(accurateCastOrNull(5, 'UInt8'))─┐ +│ Nullable(UInt8) │ +└────────────────────────────────────────────┘ +``` + +Запрос: + +``` sql +SELECT + cast(-1, 'UInt8') as uint8, + cast(128, 'Int8') as int8, + cast('Test', 'FixedString(2)') as fixed_string; +``` + +Результат: + +``` text +┌─uint8─┬─int8─┬─fixed_string─┐ +│ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ ᴺᵁᴸᴸ │ +└───────┴──────┴──────────────┘ +``` + ## toInterval(Year\|Quarter\|Month\|Week\|Day\|Hour\|Minute\|Second) {#function-tointerval} Приводит аргумент из числового типа данных к типу данных [IntervalType](../../sql-reference/data-types/special-data-types/interval.md). From f3860134ab7b40aafaa585fbc90c6806cac1da4d Mon Sep 17 00:00:00 2001 From: romanzhukov Date: Tue, 2 Feb 2021 10:00:54 +0300 Subject: [PATCH 0127/2357] DOCSUP-5822: Add function documentation. 
--- docs/en/sql-reference/functions/type-conversion-functions.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 86217871ca1..047b3b1cbea 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -476,8 +476,6 @@ accurateCastOrNull(x, T) Query: -Query: - ``` sql SELECT toTypeName(accurateCastOrNull(5, 'UInt8')); ``` From 0073c87d5d2e80a054468255b021acdbe5ceb660 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Tue, 2 Feb 2021 13:32:42 +0300 Subject: [PATCH 0128/2357] fix --- src/Databases/DatabaseAtomic.cpp | 2 +- src/Interpreters/DDLWorker.cpp | 2 +- src/Interpreters/InterpreterDropQuery.cpp | 2 +- src/Storages/StorageMaterializedView.cpp | 12 ++++++------ src/Storages/StorageMaterializedView.h | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/Databases/DatabaseAtomic.cpp b/src/Databases/DatabaseAtomic.cpp index 8b75f439152..e6bc3bfcd44 100644 --- a/src/Databases/DatabaseAtomic.cpp +++ b/src/Databases/DatabaseAtomic.cpp @@ -131,7 +131,7 @@ void DatabaseAtomic::dropTable(const Context & context, const String & table_nam /// Remove the inner table (if any) to avoid deadlock /// (due to attempt to execute DROP from the worker thread) if (auto * mv = dynamic_cast(table.get())) - mv->dropInnerTable(no_delay); + mv->dropInnerTable(no_delay, context); /// Notify DatabaseCatalog that table was dropped. It will remove table data in background. /// Cleanup is performed outside of database to allow easily DROP DATABASE without waiting for cleanup to complete. DatabaseCatalog::instance().enqueueDroppedTableCleanup(table->getStorageID(), table, table_metadata_path_drop, no_delay); diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index fabb9f9563e..dd822e0f237 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -484,7 +484,7 @@ void DDLWorker::processTask(DDLTaskBase & task) auto table_id = context.tryResolveStorageID(*query_with_table, Context::ResolveOrdinary); DatabasePtr database; std::tie(database, storage) = DatabaseCatalog::instance().tryGetDatabaseAndTable(table_id, context); - if (database && database->getEngineName() == "Replicated") + if (database && database->getEngineName() == "Replicated" && !typeid_cast(&task)) throw Exception(ErrorCodes::INCORRECT_QUERY, "ON CLUSTER queries are not allowed for Replicated databases"); } diff --git a/src/Interpreters/InterpreterDropQuery.cpp b/src/Interpreters/InterpreterDropQuery.cpp index eed7337b9ab..68680f27ea4 100644 --- a/src/Interpreters/InterpreterDropQuery.cpp +++ b/src/Interpreters/InterpreterDropQuery.cpp @@ -127,7 +127,7 @@ BlockIO InterpreterDropQuery::executeToTableImpl(const ASTDropQuery & query, Dat table_id.uuid = database->tryGetTableUUID(table_id.table_name); /// Prevents recursive drop from drop database query. The original query must specify a table. 
- bool is_drop_or_detach_database = query.table.empty(); + bool is_drop_or_detach_database = query_ptr->as()->table.empty(); bool is_replicated_ddl_query = typeid_cast(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY && !is_drop_or_detach_database; diff --git a/src/Storages/StorageMaterializedView.cpp b/src/Storages/StorageMaterializedView.cpp index 29aea3e6150..fb75a933910 100644 --- a/src/Storages/StorageMaterializedView.cpp +++ b/src/Storages/StorageMaterializedView.cpp @@ -194,7 +194,7 @@ BlockOutputStreamPtr StorageMaterializedView::write(const ASTPtr & query, const } -static void executeDropQuery(ASTDropQuery::Kind kind, Context & global_context, const StorageID & target_table_id, bool no_delay) +static void executeDropQuery(ASTDropQuery::Kind kind, const Context & global_context, const StorageID & target_table_id, bool no_delay) { if (DatabaseCatalog::instance().tryGetTable(target_table_id, global_context)) { @@ -220,19 +220,19 @@ void StorageMaterializedView::drop() if (!select_query.select_table_id.empty()) DatabaseCatalog::instance().removeDependency(select_query.select_table_id, table_id); - dropInnerTable(true); + dropInnerTable(true, global_context); } -void StorageMaterializedView::dropInnerTable(bool no_delay) +void StorageMaterializedView::dropInnerTable(bool no_delay, const Context & context) { if (has_inner_table && tryGetTargetTable()) - executeDropQuery(ASTDropQuery::Kind::Drop, global_context, target_table_id, no_delay); + executeDropQuery(ASTDropQuery::Kind::Drop, context, target_table_id, no_delay); } -void StorageMaterializedView::truncate(const ASTPtr &, const StorageMetadataPtr &, const Context &, TableExclusiveLockHolder &) +void StorageMaterializedView::truncate(const ASTPtr &, const StorageMetadataPtr &, const Context & context, TableExclusiveLockHolder &) { if (has_inner_table) - executeDropQuery(ASTDropQuery::Kind::Truncate, global_context, target_table_id, true); + executeDropQuery(ASTDropQuery::Kind::Truncate, context, target_table_id, true); } void StorageMaterializedView::checkStatementCanBeForwarded() const diff --git a/src/Storages/StorageMaterializedView.h b/src/Storages/StorageMaterializedView.h index fab9e28afe3..94e4295cd34 100644 --- a/src/Storages/StorageMaterializedView.h +++ b/src/Storages/StorageMaterializedView.h @@ -37,7 +37,7 @@ public: BlockOutputStreamPtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, const Context & context) override; void drop() override; - void dropInnerTable(bool no_delay); + void dropInnerTable(bool no_delay, const Context & context); void truncate(const ASTPtr &, const StorageMetadataPtr &, const Context &, TableExclusiveLockHolder &) override; From 60a92e9a99551aa959ce4924e69f89fe4254b3c3 Mon Sep 17 00:00:00 2001 From: Pavel Kruglov Date: Tue, 2 Feb 2021 15:14:31 +0300 Subject: [PATCH 0129/2357] Fix build, add comments, update tests --- src/Client/Connection.cpp | 29 -------- src/Client/ConnectionPoolWithFailover.h | 3 +- src/Client/GetHedgedConnections.cpp | 67 +++++-------------- src/Client/GetHedgedConnections.h | 40 ++++++----- src/Client/HedgedConnections.cpp | 56 +++------------- src/Client/HedgedConnections.h | 39 ++++++++--- src/Core/Defines.h | 8 +-- src/Core/Settings.h | 4 +- .../RemoteQueryExecutorReadContext.cpp | 8 +-- .../integration/test_hedged_requests/test.py | 16 ++--- .../configs/remote_servers.xml | 22 ++++++ .../configs/users.xml | 11 +++ .../configs/users1.xml | 8 +++ .../test_hedged_requests_parallel/test.py | 
55 +++++++++++++++ 14 files changed, 190 insertions(+), 176 deletions(-) create mode 100644 tests/integration/test_hedged_requests_parallel/configs/remote_servers.xml create mode 100644 tests/integration/test_hedged_requests_parallel/configs/users.xml create mode 100644 tests/integration/test_hedged_requests_parallel/configs/users1.xml create mode 100644 tests/integration/test_hedged_requests_parallel/test.py diff --git a/src/Client/Connection.cpp b/src/Client/Connection.cpp index 1593933f8f7..00ae406651d 100644 --- a/src/Client/Connection.cpp +++ b/src/Client/Connection.cpp @@ -92,8 +92,6 @@ void Connection::connect(const ConnectionTimeouts & timeouts) void Connection::disconnect() { - LOG_DEBUG(log_wrapper.get(), "disconnect"); - maybe_compressed_out = nullptr; in = nullptr; last_input_packet_type.reset(); @@ -106,8 +104,6 @@ void Connection::disconnect() void Connection::prepare(const ConnectionTimeouts & timeouts) { - LOG_DEBUG(log_wrapper.get(), "Connect"); - LOG_TRACE(log_wrapper.get(), "Connecting. Database: {}. User: {}{}{}", default_database.empty() ? "(not specified)" : default_database, user, @@ -160,8 +156,6 @@ void Connection::prepare(const ConnectionTimeouts & timeouts) void Connection::sendHello() { - LOG_DEBUG(log_wrapper.get(), "sendHello"); - /** Disallow control characters in user controlled parameters * to mitigate the possibility of SSRF. * The user may do server side requests with 'remote' table function. @@ -218,8 +212,6 @@ void Connection::sendHello() void Connection::receiveHello() { - LOG_DEBUG(log_wrapper.get(), "receiveHello"); - /// Receive hello packet. UInt64 packet_type = 0; @@ -323,8 +315,6 @@ const String & Connection::getServerDisplayName(const ConnectionTimeouts & timeo void Connection::forceConnected(const ConnectionTimeouts & timeouts) { - LOG_DEBUG(log_wrapper.get(), "forceConnected"); - if (!connected) { connect(timeouts); @@ -351,8 +341,6 @@ void Connection::sendClusterNameAndSalt() bool Connection::ping() { - LOG_DEBUG(log_wrapper.get(), "ping"); - TimeoutSetter timeout_setter(*socket, sync_request_timeout, true); try { @@ -404,8 +392,6 @@ TablesStatusResponse Connection::getTablesStatus(const ConnectionTimeouts & time void Connection::sendTablesStatusRequest(const TablesStatusRequest & request) { - LOG_DEBUG(log_wrapper.get(), "sendTablesStatusRequest"); - writeVarUInt(Protocol::Client::TablesStatusRequest, *out); request.write(*out, server_revision); out->next(); @@ -413,8 +399,6 @@ void Connection::sendTablesStatusRequest(const TablesStatusRequest & request) TablesStatusResponse Connection::receiveTablesStatusResponse() { - LOG_DEBUG(log_wrapper.get(), "receiveTablesStatusResponse"); - UInt64 response_type = 0; readVarUInt(response_type, *in); @@ -440,8 +424,6 @@ void Connection::sendQuery( if (!connected) connect(timeouts); - LOG_DEBUG(log_wrapper.get(), "sendQuery"); - TimeoutSetter timeout_setter(*socket, timeouts.send_timeout, timeouts.receive_timeout, true); if (settings) @@ -540,8 +522,6 @@ void Connection::sendCancel() if (!out) return; - LOG_DEBUG(log_wrapper.get(), "sendCancel"); - writeVarUInt(Protocol::Client::Cancel, *out); out->next(); } @@ -549,8 +529,6 @@ void Connection::sendCancel() void Connection::sendData(const Block & block, const String & name, bool scalar) { - LOG_DEBUG(log_wrapper.get(), "sendData"); - if (!block_out) { if (compression == Protocol::Compression::Enable) @@ -581,7 +559,6 @@ void Connection::sendData(const Block & block, const String & name, bool scalar) void Connection::sendPreparedData(ReadBuffer & input, 
size_t size, const String & name) { /// NOTE 'Throttler' is not used in this method (could use, but it's not important right now). - LOG_DEBUG(log_wrapper.get(), "sendPreparedData"); if (input.eof()) throw Exception("Buffer is empty (some kind of corruption)", ErrorCodes::EMPTY_DATA_PASSED); @@ -602,8 +579,6 @@ void Connection::sendScalarsData(Scalars & data) if (data.empty()) return; - LOG_DEBUG(log_wrapper.get(), "sendScalarsData"); - Stopwatch watch; size_t out_bytes = out ? out->count() : 0; size_t maybe_compressed_out_bytes = maybe_compressed_out ? maybe_compressed_out->count() : 0; @@ -689,8 +664,6 @@ void Connection::sendExternalTablesData(ExternalTablesData & data) return; } - LOG_DEBUG(log_wrapper.get(), "sendExternalTablesData"); - Stopwatch watch; size_t out_bytes = out ? out->count() : 0; size_t maybe_compressed_out_bytes = maybe_compressed_out ? maybe_compressed_out->count() : 0; @@ -789,8 +762,6 @@ std::optional Connection::checkPacket(size_t timeout_microseconds) Packet Connection::receivePacket(AsyncCallback async_callback) { - LOG_DEBUG(log_wrapper.get(), "receivePacket"); - in->setAsyncCallback(std::move(async_callback)); SCOPE_EXIT(in->setAsyncCallback({})); diff --git a/src/Client/ConnectionPoolWithFailover.h b/src/Client/ConnectionPoolWithFailover.h index 86f63191608..f235c5b2e5f 100644 --- a/src/Client/ConnectionPoolWithFailover.h +++ b/src/Client/ConnectionPoolWithFailover.h @@ -2,7 +2,6 @@ #include #include -#include #include #include @@ -62,6 +61,8 @@ public: /// Reset class to initial stage. void reset(); + /// If action_before_disconnect is set, action_before_disconnect(socket_fd) will be called before + /// disconnect. It may be useful for removing file descriptor from epoll. void setActionBeforeDisconnect(std::function action) { action_before_disconnect = action; } /// Process fail connection. diff --git a/src/Client/GetHedgedConnections.cpp b/src/Client/GetHedgedConnections.cpp index 6b046bfcec0..a9283a75105 100644 --- a/src/Client/GetHedgedConnections.cpp +++ b/src/Client/GetHedgedConnections.cpp @@ -19,9 +19,8 @@ GetHedgedConnections::GetHedgedConnections( const Settings * settings_, const ConnectionTimeouts & timeouts_, std::shared_ptr table_to_check_) - : pool(pool_), settings(settings_), timeouts(timeouts_), table_to_check(table_to_check_) + : pool(pool_), settings(settings_), timeouts(timeouts_), table_to_check(table_to_check_), log(&Poco::Logger::get("GetHedgedConnections")) { - log = &Poco::Logger::get("GetHedgedConnections"); shuffled_pools = pool->getShuffledPools(settings); for (size_t i = 0; i != shuffled_pools.size(); ++i) try_get_connections.emplace_back(shuffled_pools[i].pool, &timeouts, settings, table_to_check, log); @@ -78,6 +77,7 @@ std::vector GetHedgedConnections::getMany "All connection tries failed. Log: \n\n" + fail_messages + "\n", DB::ErrorCodes::ALL_CONNECTION_TRIES_FAILED); } + replicas.push_back(replica); } @@ -86,9 +86,7 @@ std::vector GetHedgedConnections::getMany GetHedgedConnections::ReplicaStatePtr GetHedgedConnections::getNextConnection(bool non_blocking) { - LOG_DEBUG(log, "getNextConnection"); ReplicaStatePtr replica = createNewReplica(); - int index; /// Check if it's the first time. @@ -104,6 +102,8 @@ GetHedgedConnections::ReplicaStatePtr GetHedgedConnections::getNextConnection(bo while (index != -1 || !epoll.empty()) { + /// Prevent blocking after receiving timeout when there is no new replica to connect + /// (processEpollEvents can return EMPTY replica after timeout processing to start new connection). 
if (index == -1 && !is_first && non_blocking) { replica->state = State::NOT_READY; @@ -134,22 +134,14 @@ GetHedgedConnections::ReplicaStatePtr GetHedgedConnections::getNextConnection(bo if (replica->isReady() || (replica->isNotReady() && non_blocking)) return replica; - if (replica->isNotReady()) - throw Exception("Not ready replica after processing epoll events.", ErrorCodes::LOGICAL_ERROR); - index = getNextIndex(); } /// We reach this point only if there was no free up to date replica. + /// We will try to use usable replica. - /// Check if there is no even a free usable replica - if (!canGetNewConnection()) - { - replica->state = State::CANNOT_CHOOSE; - return replica; - } - - if (!fallback_to_stale_replicas) + /// Check if we are not allowed to use usable replicas or there is no even a free usable replica. + if (!fallback_to_stale_replicas || !canGetNewConnection()) { replica->state = State::CANNOT_CHOOSE; return replica; @@ -161,7 +153,6 @@ GetHedgedConnections::ReplicaStatePtr GetHedgedConnections::getNextConnection(bo void GetHedgedConnections::stopChoosingReplicas() { - LOG_DEBUG(log, "stopChoosingReplicas"); for (auto & [fd, replica] : fd_to_replica) { removeTimeoutsFromReplica(replica, epoll, timeout_fd_to_replica); @@ -175,8 +166,8 @@ void GetHedgedConnections::stopChoosingReplicas() int GetHedgedConnections::getNextIndex() { - /// Check if there is no more available replicas - if (entries_count + failed_pools_count >= shuffled_pools.size()) + /// Check if there is no free replica. + if (entries_count + indexes_in_process.size() + failed_pools_count >= shuffled_pools.size()) return -1; bool finish = false; @@ -185,25 +176,22 @@ int GetHedgedConnections::getNextIndex() { next_index = (next_index + 1) % shuffled_pools.size(); - /// Check if we can try this replica + /// Check if we can try this replica. if (indexes_in_process.find(next_index) == indexes_in_process.end() && (max_tries == 0 || shuffled_pools[next_index].error_count < max_tries) && try_get_connections[next_index].stage != TryGetConnection::Stage::FINISHED) finish = true; - /// If we made a complete round, there is no replica to connect + /// If we made a complete round, there is no replica to connect. 
else if (next_index == last_used_index) return -1; } - LOG_DEBUG(log, "get next index: {}", next_index); - last_used_index = next_index; return next_index; } GetHedgedConnections::Action GetHedgedConnections::startTryGetConnection(int index, ReplicaStatePtr & replica) { - LOG_DEBUG(log, "start try get connection with {} replica", index); TryGetConnection & try_get_connection = try_get_connections[index]; replica->state = State::NOT_READY; @@ -240,14 +228,11 @@ GetHedgedConnections::Action GetHedgedConnections::startTryGetConnection(int ind GetHedgedConnections::Action GetHedgedConnections::processTryGetConnectionStage(ReplicaStatePtr & replica, bool remove_from_epoll) { - LOG_DEBUG(log, "process get connection stage for {} replica", replica->index); TryGetConnection & try_get_connection = try_get_connections[replica->index]; if (try_get_connection.stage == TryGetConnection::Stage::FINISHED) { indexes_in_process.erase(replica->index); - - LOG_DEBUG(log, "stage: FINISHED"); ++entries_count; if (remove_from_epoll) @@ -258,39 +243,31 @@ GetHedgedConnections::processTryGetConnectionStage(ReplicaStatePtr & replica, bo if (try_get_connection.result.is_usable) { - LOG_DEBUG(log, "replica is usable"); ++usable_count; if (try_get_connection.result.is_up_to_date) { - LOG_DEBUG(log, "replica is up to date, finish get hedged connections"); replica->state = State::READY; ready_indexes.insert(replica->index); return Action::FINISH; } } - /// This replica is not up to date, we will try to find up to date - fd_to_replica.erase(replica->fd); + /// This replica is not up to date, we will try to find up to date. replica->reset(); return Action::TRY_NEXT_REPLICA; } else if (try_get_connection.stage == TryGetConnection::Stage::FAILED) { - LOG_DEBUG(log, "stage: FAILED"); processFailedConnection(replica); return Action::TRY_NEXT_REPLICA; } - LOG_DEBUG(log, "middle stage, process epoll events"); - - /// Get connection process is not finished + /// Get connection process is not finished. 
return Action::PROCESS_EPOLL_EVENTS; } void GetHedgedConnections::processFailedConnection(ReplicaStatePtr & replica) { - LOG_DEBUG(log, "failed connection with {} replica", replica->index); - ShuffledPool & shuffled_pool = shuffled_pools[replica->index]; LOG_WARNING( log, "Connection failed at try №{}, reason: {}", (shuffled_pool.error_count + 1), try_get_connections[replica->index].fail_message); @@ -314,8 +291,6 @@ void GetHedgedConnections::processFailedConnection(ReplicaStatePtr & replica) void GetHedgedConnections::addTimeouts(ReplicaStatePtr & replica) { - LOG_DEBUG(log, "add timeouts for {} replica", replica->index); - addTimeoutToReplica(TimerTypes::RECEIVE_TIMEOUT, replica, epoll, timeout_fd_to_replica, timeouts); auto stage = try_get_connections[replica->index].stage; @@ -327,7 +302,6 @@ void GetHedgedConnections::addTimeouts(ReplicaStatePtr & replica) GetHedgedConnections::ReplicaStatePtr GetHedgedConnections::processEpollEvents(bool non_blocking) { - LOG_DEBUG(log, "process epoll events"); int event_fd; ReplicaStatePtr replica = nullptr; bool finish = false; @@ -349,8 +323,6 @@ GetHedgedConnections::ReplicaStatePtr GetHedgedConnections::processEpollEvents(b throw Exception("Unknown event from epoll", ErrorCodes::LOGICAL_ERROR); } - LOG_DEBUG(log, "cancel process epoll events"); - return replica; } @@ -365,7 +337,6 @@ int GetHedgedConnections::getReadyFileDescriptor(AsyncCallback async_callback) bool GetHedgedConnections::processReplicaEvent(ReplicaStatePtr & replica, bool non_blocking) { - LOG_DEBUG(log, "epoll event is {} replica", replica->index); removeTimeoutsFromReplica(replica, epoll, timeout_fd_to_replica); try_get_connections[replica->index].run(); Action action = processTryGetConnectionStage(replica, true); @@ -380,15 +351,12 @@ bool GetHedgedConnections::processReplicaEvent(ReplicaStatePtr & replica, bool n bool GetHedgedConnections::processTimeoutEvent(ReplicaStatePtr & replica, TimerDescriptorPtr timeout_descriptor, bool non_blocking) { - LOG_DEBUG(log, "epoll event is timeout for {} replica", replica->index); - epoll.remove(timeout_descriptor->getDescriptor()); replica->active_timeouts.erase(timeout_descriptor->getDescriptor()); timeout_fd_to_replica[timeout_descriptor->getDescriptor()]; if (timeout_descriptor->getType() == TimerTypes::RECEIVE_TIMEOUT) { - LOG_DEBUG(log, "process receive timeout for {} replica", replica->index); removeTimeoutsFromReplica(replica, epoll, timeout_fd_to_replica); epoll.remove(replica->fd); fd_to_replica.erase(replica->fd); @@ -401,10 +369,9 @@ bool GetHedgedConnections::processTimeoutEvent(ReplicaStatePtr & replica, TimerD return true; } - else if ((timeout_descriptor->getType() == TimerTypes::RECEIVE_HELLO_TIMEOUT || timeout_descriptor->getType() == TimerTypes::RECEIVE_TABLES_STATUS_TIMEOUT) - && entries_count + ready_indexes.size() + failed_pools_count < shuffled_pools.size()) + && entries_count + indexes_in_process.size() + failed_pools_count < shuffled_pools.size()) { replica = createNewReplica(); return true; @@ -415,13 +382,11 @@ bool GetHedgedConnections::processTimeoutEvent(ReplicaStatePtr & replica, TimerD void GetHedgedConnections::setBestUsableReplica(ReplicaStatePtr & replica) { - LOG_DEBUG(log, "set best usable replica"); - std::vector indexes(try_get_connections.size()); for (size_t i = 0; i != indexes.size(); ++i) indexes[i] = i; - /// Remove unusable and failed replicas, skip ready replicas + /// Remove unusable, failed replicas and replicas that are ready or in process. 
indexes.erase( std::remove_if( indexes.begin(), @@ -439,7 +404,7 @@ void GetHedgedConnections::setBestUsableReplica(ReplicaStatePtr & replica) return; } - /// Sort replicas by staleness + /// Sort replicas by staleness. std::stable_sort( indexes.begin(), indexes.end(), diff --git a/src/Client/GetHedgedConnections.h b/src/Client/GetHedgedConnections.h index 3ae9aaf9c72..8638367e184 100644 --- a/src/Client/GetHedgedConnections.h +++ b/src/Client/GetHedgedConnections.h @@ -7,13 +7,15 @@ #include #include #include +#include namespace DB { +using TimerDescriptorPtr = std::shared_ptr; + /// Class for establishing hedged connections with replicas. -/// It works with multiple replicas simultaneously without blocking -/// (in current implementation only with 2 replicas) by using epoll. +/// It works with multiple replicas simultaneously without blocking by using epoll. class GetHedgedConnections { public: @@ -54,24 +56,24 @@ public: using ReplicaStatePtr = std::shared_ptr; - - struct Replicas - { - ReplicaStatePtr first_replica; - ReplicaStatePtr second_replica; - }; - GetHedgedConnections(const ConnectionPoolWithFailoverPtr & pool_, const Settings * settings_, const ConnectionTimeouts & timeouts_, std::shared_ptr table_to_check_ = nullptr); + /// Create and return connections according to pool_mode. std::vector getManyConnections(PoolMode pool_mode); + /// Try to establish connection to the new replica. If non_blocking is false, this function will block + /// until establishing connection to the new replica (returned replica state might be READY or CANNOT_CHOOSE). + /// If non_blocking is true, this function will try to establish connection to the new replica without blocking + /// (returned replica state might be READY, NOT_READY and CANNOT_CHOOSE). ReplicaStatePtr getNextConnection(bool non_blocking); + /// Check if we can try to produce new READY replica. bool canGetNewConnection() const { return ready_indexes.size() + failed_pools_count < shuffled_pools.size(); } + /// Stop working with all replicas that are not READY. void stopChoosingReplicas(); bool hasEventsInProcess() const { return epoll.size() > 0; } @@ -95,6 +97,8 @@ private: Action processTryGetConnectionStage(ReplicaStatePtr & replica, bool remove_from_epoll = false); + /// Find an index of the next free replica to start connection. + /// Return -1 if there is no free replica. int getNextIndex(); int getReadyFileDescriptor(AsyncCallback async_callback = {}); @@ -119,16 +123,21 @@ private: const Settings * settings; const ConnectionTimeouts timeouts; std::shared_ptr table_to_check; + std::vector try_get_connections; std::vector shuffled_pools; + /// Map socket file descriptor to replica. std::unordered_map fd_to_replica; + /// Map timeout file descriptor to replica. std::unordered_map timeout_fd_to_replica; -// std::vector> replicas; -// std::unordered_map> replicas_store; -// ReplicaState first_replica; -// ReplicaState second_replica; + /// Indexes of replicas, that are in process of connection. + std::unordered_set indexes_in_process; + /// Indexes of ready replicas. + std::unordered_set ready_indexes; + + int last_used_index; bool fallback_to_stale_replicas; Epoll epoll; Poco::Logger * log; @@ -137,10 +146,6 @@ private: size_t usable_count; size_t failed_pools_count; size_t max_tries; - int last_used_index; - std::unordered_set indexes_in_process; - std::unordered_set ready_indexes; - }; /// Add timeout with particular type to replica and add it to epoll. 
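The maps above are the heart of the hedged connection mechanism: every replica socket and every timeout timer is registered in a single epoll instance, and fd_to_replica / timeout_fd_to_replica translate the ready descriptor back into a replica, so one epoll_wait drives both the "data arrived" and the "timeout expired, try the next replica" transitions. Below is a minimal, self-contained sketch of that epoll + timerfd pattern, using only the plain Linux API; the replica index, the pipe standing in for a real replica socket, and all other names are illustrative assumptions, not the ClickHouse API.

#include <sys/epoll.h>
#include <sys/timerfd.h>
#include <unistd.h>
#include <ctime>
#include <cstdio>
#include <unordered_map>

int main()
{
    int epoll_fd = epoll_create1(0);
    if (epoll_fd < 0)
    {
        perror("epoll_create1");
        return 1;
    }

    std::unordered_map<int, int> fd_to_replica;         /// socket fd -> replica index
    std::unordered_map<int, int> timeout_fd_to_replica; /// timer fd  -> replica index

    /// Stand-in for a replica socket: a pipe read end that never receives data.
    int pipe_fds[2];
    if (pipe(pipe_fds) < 0)
    {
        perror("pipe");
        return 1;
    }

    epoll_event ev{};
    ev.events = EPOLLIN;
    ev.data.fd = pipe_fds[0];
    epoll_ctl(epoll_fd, EPOLL_CTL_ADD, pipe_fds[0], &ev);
    fd_to_replica[pipe_fds[0]] = 0;

    /// Arm a 100 ms "receive hello" timeout for the same replica (one-shot timer).
    int timer_fd = timerfd_create(CLOCK_MONOTONIC, 0);
    itimerspec timeout{};
    timeout.it_value.tv_nsec = 100 * 1000 * 1000;
    timerfd_settime(timer_fd, 0, &timeout, nullptr);

    ev.data.fd = timer_fd;
    epoll_ctl(epoll_fd, EPOLL_CTL_ADD, timer_fd, &ev);
    timeout_fd_to_replica[timer_fd] = 0;

    /// Whichever descriptor becomes ready first decides the next step.
    epoll_event event{};
    if (epoll_wait(epoll_fd, &event, 1, -1) == 1)
    {
        if (fd_to_replica.count(event.data.fd))
            std::printf("replica %d answered in time\n", fd_to_replica[event.data.fd]);
        else if (timeout_fd_to_replica.count(event.data.fd))
            std::printf("timeout for replica %d, start connecting to the next replica\n",
                        timeout_fd_to_replica[event.data.fd]);
    }

    close(timer_fd);
    close(pipe_fds[0]);
    close(pipe_fds[1]);
    close(epoll_fd);
}

When the timer fires first, GetHedgedConnections does not fail: processTimeoutEvent above reacts to RECEIVE_HELLO_TIMEOUT / RECEIVE_TABLES_STATUS_TIMEOUT by creating a new replica state and starting one more connection attempt, which is exactly the hedging behaviour this sketch hints at.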
@@ -150,6 +155,7 @@ void addTimeoutToReplica( Epoll & epoll, std::unordered_map & timeout_fd_to_replica, const ConnectionTimeouts & timeouts); + /// Remove timeout with particular type from replica and epoll. void removeTimeoutFromReplica( int type, diff --git a/src/Client/HedgedConnections.cpp b/src/Client/HedgedConnections.cpp index 52eb79e0372..f4810a7d79c 100644 --- a/src/Client/HedgedConnections.cpp +++ b/src/Client/HedgedConnections.cpp @@ -47,20 +47,10 @@ void HedgedConnections::Pipeline::run(ReplicaStatePtr & replica) send_func(replica); } -size_t HedgedConnections::size() const -{ - if (replicas.empty()) - return 0; - - return 1; -} - void HedgedConnections::sendScalarsData(Scalars & data) { std::lock_guard lock(cancel_mutex); - LOG_DEBUG(log, "sendScalarsData"); - if (!sent_query) throw Exception("Cannot send scalars data: query not yet sent.", ErrorCodes::LOGICAL_ERROR); @@ -78,8 +68,6 @@ void HedgedConnections::sendExternalTablesData(std::vector & { std::lock_guard lock(cancel_mutex); - LOG_DEBUG(log, "sendExternalTablesData"); - if (!sent_query) throw Exception("Cannot send external tables data: query not yet sent.", ErrorCodes::LOGICAL_ERROR); @@ -106,8 +94,6 @@ void HedgedConnections::sendQuery( { std::lock_guard lock(cancel_mutex); - LOG_DEBUG(log, "sendQuery"); - if (sent_query) throw Exception("Query already sent.", ErrorCodes::LOGICAL_ERROR); @@ -117,11 +103,11 @@ void HedgedConnections::sendQuery( { if (replica->connection->getServerRevision(timeouts) < DBMS_MIN_REVISION_WITH_CURRENT_AGGREGATION_VARIANT_SELECTION_METHOD) { - has_two_level_aggregation_incompatibility = true; + disable_two_level_aggregation = true; break; } } - if (has_two_level_aggregation_incompatibility) + if (disable_two_level_aggregation) break; } @@ -129,7 +115,7 @@ void HedgedConnections::sendQuery( { Settings modified_settings = this->settings; - if (this->has_two_level_aggregation_incompatibility) + if (this->disable_two_level_aggregation) { /// Disable two-level aggregation due to version incompatibility. modified_settings.group_by_two_level_threshold = 0; @@ -159,8 +145,6 @@ void HedgedConnections::disconnect() { std::lock_guard lock(cancel_mutex); - LOG_DEBUG(log, "disconnect"); - for (auto & replicas_with_same_offset : replicas) for (auto & replica : replicas_with_same_offset) if (replica->isReady()) @@ -178,8 +162,6 @@ std::string HedgedConnections::dumpAddresses() const { std::lock_guard lock(cancel_mutex); - LOG_DEBUG(log, "dumpAddresses"); - std::string addresses; bool is_first = true; @@ -202,8 +184,6 @@ void HedgedConnections::sendCancel() { std::lock_guard lock(cancel_mutex); - LOG_DEBUG(log, "sendCancel"); - if (!sent_query || cancelled) throw Exception("Cannot cancel. 
Either no query sent or already cancelled.", ErrorCodes::LOGICAL_ERROR); @@ -223,8 +203,6 @@ Packet HedgedConnections::drain() if (!cancelled) throw Exception("Cannot drain connections: cancel first.", ErrorCodes::LOGICAL_ERROR); - LOG_DEBUG(log, "drain"); - Packet res; res.type = Protocol::Server::EndOfStream; @@ -273,8 +251,6 @@ Packet HedgedConnections::receivePacketUnlocked(AsyncCallback async_callback) Packet HedgedConnections::receivePacketImpl(AsyncCallback async_callback) { - LOG_DEBUG(log, "sreceivePacketImpl"); - int event_fd; ReplicaStatePtr replica = nullptr; Packet packet; @@ -285,14 +261,12 @@ Packet HedgedConnections::receivePacketImpl(AsyncCallback async_callback) if (fd_to_replica.find(event_fd) != fd_to_replica.end()) { - LOG_DEBUG(log, "event is replica"); replica = fd_to_replica[event_fd]; packet = receivePacketFromReplica(replica, async_callback); finish = true; } else if (timeout_fd_to_replica.find(event_fd) != timeout_fd_to_replica.end()) { - LOG_DEBUG(log, "event is timeout"); replica = timeout_fd_to_replica[event_fd]; processTimeoutEvent(replica, replica->active_timeouts[event_fd]); } @@ -316,8 +290,6 @@ int HedgedConnections::getReadyFileDescriptor(AsyncCallback async_callback) Packet HedgedConnections::receivePacketFromReplica(ReplicaStatePtr & replica, AsyncCallback async_callback) { - LOG_DEBUG(log, "sreceivePacketFromReplica"); - Packet packet = replica->connection->receivePacket(std::move(async_callback)); switch (packet.type) { @@ -350,11 +322,8 @@ Packet HedgedConnections::receivePacketFromReplica(ReplicaStatePtr & replica, As void HedgedConnections::processReceiveData(ReplicaStatePtr & replica) { - /// When we receive first packet of data from any replica, we continue working with this replica - /// and stop working with other replicas (if there are other replicas). - - LOG_DEBUG(log, "processReceiveData"); - + /// When we receive first packet of data from replica, we stop working with replicas, that are + /// responsible for the same offset. offsets_with_received_data.insert(replica->parallel_replica_offset); for (auto & other_replica : replicas[replica->parallel_replica_offset]) @@ -366,6 +335,7 @@ void HedgedConnections::processReceiveData(ReplicaStatePtr & replica) } } + /// If we received data from replicas with all offsets, we need to stop choosing new replicas. if (get_hedged_connections.hasEventsInProcess() && offsets_with_received_data.size() == replicas.size()) { get_hedged_connections.stopChoosingReplicas(); @@ -376,24 +346,21 @@ void HedgedConnections::processReceiveData(ReplicaStatePtr & replica) void HedgedConnections::processTimeoutEvent(ReplicaStatePtr & replica, TimerDescriptorPtr timeout_descriptor) { - LOG_DEBUG(log, "processTimeoutEvent"); epoll.remove(timeout_descriptor->getDescriptor()); replica->active_timeouts.erase(timeout_descriptor->getDescriptor()); timeout_fd_to_replica.erase(timeout_descriptor->getDescriptor()); if (timeout_descriptor->getType() == TimerTypes::RECEIVE_TIMEOUT) { - LOG_DEBUG(log, "process RECEIVE_TIMEOUT"); size_t offset = replica->parallel_replica_offset; finishProcessReplica(replica, true); - /// Check if there is no active connection with same offset. + /// Check if there is no active connections with the same offset. 
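+ /// If the timed out connection was the last one for this offset, there is nothing left to wait
+ /// for, so the query can only be failed with a receive timeout error.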
if (active_connections_count_by_offset[offset] == 0) throw NetException("Receive timeout expired", ErrorCodes::SOCKET_TIMEOUT); } else if (timeout_descriptor->getType() == TimerTypes::RECEIVE_DATA_TIMEOUT) { - LOG_DEBUG(log, "process RECEIVE_DATA_TIMEOUT"); offsets_queue.push(replica->parallel_replica_offset); tryGetNewReplica(); } @@ -401,18 +368,15 @@ void HedgedConnections::processTimeoutEvent(ReplicaStatePtr & replica, TimerDesc void HedgedConnections::tryGetNewReplica() { - LOG_DEBUG(log, "tryGetNewReplica"); - ReplicaStatePtr new_replica = get_hedged_connections.getNextConnection(/*non_blocking*/ true); - /// Skip replicas with old server version if we didn't disable two-level aggregation in sendQuery. - while (new_replica->isReady() && !has_two_level_aggregation_incompatibility + /// Skip replicas that doesn't support two-level aggregation if we didn't disable it in sendQuery. + while (new_replica->isReady() && !disable_two_level_aggregation && new_replica->connection->getServerRevision(get_hedged_connections.getConnectionTimeouts()) < DBMS_MIN_REVISION_WITH_CURRENT_AGGREGATION_VARIANT_SELECTION_METHOD) new_replica = get_hedged_connections.getNextConnection(/*non_blocking*/ true); if (new_replica->isReady()) { - LOG_DEBUG(log, "processNewReadyReplica"); new_replica->parallel_replica_offset = offsets_queue.front(); offsets_queue.pop(); replicas[new_replica->parallel_replica_offset].push_back(new_replica); @@ -436,8 +400,6 @@ void HedgedConnections::tryGetNewReplica() void HedgedConnections::finishProcessReplica(ReplicaStatePtr & replica, bool disconnect) { - LOG_DEBUG(log, "finishProcessReplica"); - removeTimeoutsFromReplica(replica, epoll, timeout_fd_to_replica); epoll.remove(replica->fd); fd_to_replica.erase(replica->fd); diff --git a/src/Client/HedgedConnections.h b/src/Client/HedgedConnections.h index 1400ff89de4..8081fa6739d 100644 --- a/src/Client/HedgedConnections.h +++ b/src/Client/HedgedConnections.h @@ -13,7 +13,6 @@ class HedgedConnections : public IConnections { public: using ReplicaStatePtr = GetHedgedConnections::ReplicaStatePtr; - using Replicas = GetHedgedConnections::Replicas; HedgedConnections(const ConnectionPoolWithFailoverPtr & pool_, const Settings & settings_, @@ -46,20 +45,18 @@ public: std::string dumpAddresses() const override; - size_t size() const override; + size_t size() const override { return replicas.size(); } bool hasActiveConnections() const override { return !active_connections_count_by_offset.empty(); } private: + /// We will save actions with replicas in pipeline to perform them on the new replicas. class Pipeline { public: void add(std::function send_function); void run(ReplicaStatePtr & replica); - - bool empty() const { return pipeline.empty(); } - private: std::vector> pipeline; }; @@ -79,21 +76,43 @@ private: int getReadyFileDescriptor(AsyncCallback async_callback = {}); GetHedgedConnections get_hedged_connections; + + /// All replicas in replicas[offset] are responsible for process query + /// with setting parallel_replica_offset = offset. In common situations + /// replicas[offset].size() = 1 (like in MultiplexedConnections). std::vector> replicas; + + /// Map socket file descriptor to replica. std::unordered_map fd_to_replica; + /// Map timeout file descriptor to replica. std::unordered_map timeout_fd_to_replica; + + /// A queue of offsets for new replicas. When we get RECEIVE_DATA_TIMEOUT from + /// the replica, we push it's offset to this queue and start trying to get + /// new replica. 
std::queue offsets_queue; + + /// Map offset to amount of active connections, responsible to this offset. + std::unordered_map active_connections_count_by_offset; + + std::unordered_set offsets_with_received_data; + + Pipeline pipeline_for_new_replicas; + + /// New replica may not support two-level aggregation due to version incompatibility. + /// If we didn't disabled it, we need to skip this replica. + bool disable_two_level_aggregation = false; + + /// next_replica_in_process is true when get_hedged_connections.getFileDescriptor() + /// is in epoll now and false otherwise. + bool next_replica_in_process = false; + Epoll epoll; const Settings & settings; ThrottlerPtr throttler; Poco::Logger * log; - Pipeline pipeline_for_new_replicas; bool sent_query = false; bool cancelled = false; - std::unordered_map active_connections_count_by_offset; - bool next_replica_in_process = false; - bool has_two_level_aggregation_incompatibility = false; - std::unordered_set offsets_with_received_data; mutable std::mutex cancel_mutex; }; diff --git a/src/Core/Defines.h b/src/Core/Defines.h index f7b67343f17..89f9925b1f3 100644 --- a/src/Core/Defines.h +++ b/src/Core/Defines.h @@ -11,10 +11,10 @@ #define DBMS_DEFAULT_CONNECT_TIMEOUT_WITH_FAILOVER_SECURE_MS 100 #define DBMS_DEFAULT_SEND_TIMEOUT_SEC 300 #define DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC 300 -/// Timeouts for hedged requests -#define DBMS_DEFAULT_RECEIVE_HELLO_TIMEOUT_SEC 1 -#define DBMS_DEFAULT_RECEIVE_TABLES_STATUS_TIMEOUT_SEC 1 -#define DBMS_DEFAULT_RECEIVE_DATA_TIMEOUT_SEC 1 +/// Timeouts for hedged requests. +#define DBMS_DEFAULT_RECEIVE_HELLO_TIMEOUT_MS 100 +#define DBMS_DEFAULT_RECEIVE_TABLES_STATUS_TIMEOUT_MS 100 +#define DBMS_DEFAULT_RECEIVE_DATA_TIMEOUT_SEC 2 /// Timeout for synchronous request-result protocol call (like Ping or TablesStatus). 
#define DBMS_DEFAULT_SYNC_REQUEST_TIMEOUT_SEC 5 #define DBMS_DEFAULT_POLL_INTERVAL 10 diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 5177f10386e..56c5f7e54ee 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -55,8 +55,8 @@ class IColumn; M(Seconds, receive_timeout, DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC, "", 0) \ M(Seconds, send_timeout, DBMS_DEFAULT_SEND_TIMEOUT_SEC, "", 0) \ M(Seconds, tcp_keep_alive_timeout, 0, "The time in seconds the connection needs to remain idle before TCP starts sending keepalive probes", 0) \ - M(Seconds, receive_hello_timeout, DBMS_DEFAULT_RECEIVE_HELLO_TIMEOUT_SEC, "Connection timeout for receiving hello from replica", 0) \ - M(Seconds, receive_tables_status_timeout, DBMS_DEFAULT_RECEIVE_TABLES_STATUS_TIMEOUT_SEC, "Connection timeout for receiving tables status from replica", 0) \ + M(Milliseconds, receive_hello_timeout, DBMS_DEFAULT_RECEIVE_HELLO_TIMEOUT_MS, "Connection timeout for receiving hello from replica", 0) \ + M(Milliseconds, receive_tables_status_timeout, DBMS_DEFAULT_RECEIVE_TABLES_STATUS_TIMEOUT_MS, "Connection timeout for receiving tables status from replica", 0) \ M(Seconds, receive_data_timeout, DBMS_DEFAULT_RECEIVE_DATA_TIMEOUT_SEC, "Connection timeout for receiving first packet of data from replica", 0) \ M(Bool, use_hedged_requests, true, "Use hedged requests for distributed queries", 0) \ M(Milliseconds, queue_max_wait_ms, 0, "The wait time in the request queue, if the number of concurrent requests exceeds the maximum.", 0) \ diff --git a/src/DataStreams/RemoteQueryExecutorReadContext.cpp b/src/DataStreams/RemoteQueryExecutorReadContext.cpp index c854794cd27..c77b2d48f05 100644 --- a/src/DataStreams/RemoteQueryExecutorReadContext.cpp +++ b/src/DataStreams/RemoteQueryExecutorReadContext.cpp @@ -127,13 +127,13 @@ bool RemoteQueryExecutorReadContext::checkTimeoutImpl() const bool is_pipe_alarmed = false; bool has_timer_alarm = false; - for (size_t i = 0; i < events.size(); ++i) + for (const auto & event : events) { - if (events[i].data.fd == connection_fd) + if (event.data.fd == connection_fd) is_socket_ready = true; - if (events[i].data.fd == timer.getDescriptor()) + if (event.data.fd == timer.getDescriptor()) has_timer_alarm = true; - if (events[i].data.fd == pipe_fd[0]) + if (event.data.fd == pipe_fd[0]) is_pipe_alarmed = true; } diff --git a/tests/integration/test_hedged_requests/test.py b/tests/integration/test_hedged_requests/test.py index 00d28ac62eb..5e63d92b6c5 100644 --- a/tests/integration/test_hedged_requests/test.py +++ b/tests/integration/test_hedged_requests/test.py @@ -17,9 +17,6 @@ node = cluster.add_instance( node_1 = cluster.add_instance('node_1', with_zookeeper=True, stay_alive=True, user_configs=['configs/users1.xml']) node_2 = cluster.add_instance('node_2', with_zookeeper=True) -sleep_timeout = 30 -receive_timeout = 1 - config = ''' @@ -48,14 +45,14 @@ def started_cluster(): finally: cluster.shutdown() -def process_test(sleep_setting_name, receive_timeout_name): +def process_test(sleep_setting_name, receive_timeout_name, receive_timeout, sleep_timeout): node_1.replace_config('/etc/clickhouse-server/users.d/users1.xml', config.format(setting=sleep_setting_name, sleep=sleep_timeout)) # Restart node to make new config relevant node_1.restart_clickhouse(sleep_timeout + 1) # Without hedged requests select query will last more than sleep_timeout seconds, - # with hedged requests it will last just over receive_timeout seconds + # with hedged requests it will last just over receive_timeout node.query("SET 
{setting}={value}".format(setting=receive_timeout_name, value=receive_timeout)) @@ -65,14 +62,11 @@ def process_test(sleep_setting_name, receive_timeout_name): print(query_time) - # Check that query time is not long - # assert query_time < sleep_timeout - def test(started_cluster): node.query("INSERT INTO distributed VALUES (1, '2020-01-01')") - process_test("sleep_before_send_hello", "receive_hello_timeout") - process_test("sleep_before_send_tables_status", "receive_tables_status_timeout") - process_test("sleep_before_send_data", "receive_data_timeout") + process_test("sleep_before_send_hello", "receive_hello_timeout", 1000, 30) + process_test("sleep_before_send_tables_status", "receive_tables_status_timeout", 1000, 30) + process_test("sleep_before_send_data", "receive_data_timeout", 1, 30) diff --git a/tests/integration/test_hedged_requests_parallel/configs/remote_servers.xml b/tests/integration/test_hedged_requests_parallel/configs/remote_servers.xml new file mode 100644 index 00000000000..9d753ca2b6a --- /dev/null +++ b/tests/integration/test_hedged_requests_parallel/configs/remote_servers.xml @@ -0,0 +1,22 @@ + + + + + true + + node_1 + 9000 + + + node_2 + 9000 + + + node_3 + 9000 + + + + + + diff --git a/tests/integration/test_hedged_requests_parallel/configs/users.xml b/tests/integration/test_hedged_requests_parallel/configs/users.xml new file mode 100644 index 00000000000..0007089f326 --- /dev/null +++ b/tests/integration/test_hedged_requests_parallel/configs/users.xml @@ -0,0 +1,11 @@ + + + + + in_order + 1 + 0 + 2 + + + diff --git a/tests/integration/test_hedged_requests_parallel/configs/users1.xml b/tests/integration/test_hedged_requests_parallel/configs/users1.xml new file mode 100644 index 00000000000..5fe444b94ff --- /dev/null +++ b/tests/integration/test_hedged_requests_parallel/configs/users1.xml @@ -0,0 +1,8 @@ + + + + + 30 + + + diff --git a/tests/integration/test_hedged_requests_parallel/test.py b/tests/integration/test_hedged_requests_parallel/test.py new file mode 100644 index 00000000000..65e44095ded --- /dev/null +++ b/tests/integration/test_hedged_requests_parallel/test.py @@ -0,0 +1,55 @@ +import os +import sys +import time + +import pytest + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from helpers.cluster import ClickHouseCluster +from helpers.network import PartitionManager + +cluster = ClickHouseCluster(__file__) + +# Cluster with 1 shard of 2 replicas. node is the instance with Distributed table. 
+node = cluster.add_instance( + 'node', with_zookeeper=True, main_configs=['configs/remote_servers.xml'], user_configs=['configs/users.xml']) +node_1 = cluster.add_instance('node_1', with_zookeeper=True, user_configs=['configs/users1.xml']) +node_2 = cluster.add_instance('node_2', with_zookeeper=True) +node_3 = cluster.add_instance('node_3', with_zookeeper=True) + + +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster.start() + + node_1.query('''CREATE TABLE replicated (id UInt32, date Date) ENGINE = + ReplicatedMergeTree('/clickhouse/tables/replicated', 'node_1') ORDER BY id PARTITION BY toYYYYMM(date)''') + + node_2.query('''CREATE TABLE replicated (id UInt32, date Date) ENGINE = + ReplicatedMergeTree('/clickhouse/tables/replicated', 'node_2') ORDER BY id PARTITION BY toYYYYMM(date)''') + + node_3.query('''CREATE TABLE replicated (id UInt32, date Date) ENGINE = + ReplicatedMergeTree('/clickhouse/tables/replicated', 'node_3') ORDER BY id PARTITION BY toYYYYMM(date)''') + + node.query('''CREATE TABLE distributed (id UInt32, date Date) ENGINE = + Distributed('test_cluster', 'default', 'replicated')''') + + yield cluster + + finally: + cluster.shutdown() + +def test(started_cluster): + node.query("INSERT INTO distributed VALUES (1, '2020-01-01')") + + # Without hedged requests select query will last more 30 seconds, + # with hedged requests it will last just over 2 seconds + + start = time.time() + node.query("SELECT * FROM distributed"); + query_time = time.time() - start + + print(query_time) + From 02cc43502edd12e90577b093d59c9fdbfb9b4c75 Mon Sep 17 00:00:00 2001 From: Pavel Kruglov Date: Tue, 2 Feb 2021 15:17:06 +0300 Subject: [PATCH 0130/2357] Remove LOG_DEBUG --- src/Client/ConnectionPoolWithFailover.cpp | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/Client/ConnectionPoolWithFailover.cpp b/src/Client/ConnectionPoolWithFailover.cpp index af4f8bb2d25..8a67d59925a 100644 --- a/src/Client/ConnectionPoolWithFailover.cpp +++ b/src/Client/ConnectionPoolWithFailover.cpp @@ -132,8 +132,6 @@ std::vector ConnectionPoolWithFailover::getMany(const Co const Settings * settings, PoolMode pool_mode) { - LOG_DEBUG(log, "ConnectionPoolWithFailover getMany"); - TryGetEntryFunc try_get_entry = [&](NestedPool & pool, std::string & fail_message) { return tryGetEntry(pool, timeouts, fail_message, settings); @@ -166,9 +164,6 @@ std::vector ConnectionPoolWithFailover::g const Settings * settings, PoolMode pool_mode, const QualifiedTableName & table_to_check) { - - LOG_DEBUG(log, "ConnectionPoolWithFailover getManyChecked"); - TryGetEntryFunc try_get_entry = [&](NestedPool & pool, std::string & fail_message) { return tryGetEntry(pool, timeouts, fail_message, settings, &table_to_check); @@ -219,7 +214,6 @@ std::vector ConnectionPoolWithFailover::g PoolMode pool_mode, const TryGetEntryFunc & try_get_entry) { - LOG_DEBUG(log, "ConnectionPoolWithFailover getManyImpl"); size_t min_entries = (settings && settings->skip_unavailable_shards) ? 0 : 1; size_t max_tries = (settings ? 
size_t{settings->connections_with_failover_max_tries} : @@ -258,11 +252,8 @@ ConnectionPoolWithFailover::tryGetEntry( TryResult result; try { - LOG_DEBUG(log, "ConnectionPoolWithFailover tryGetEntry"); result.entry = pool.get(timeouts, settings, /* force_connected = */ false); - LOG_DEBUG(log, "ConnectionPoolWithFailover isConnected {}", result.entry->isConnected()); - UInt64 server_revision = 0; if (table_to_check) server_revision = result.entry->getServerRevision(timeouts); From cc14cb11f9edc20ad634c62de5d9ec959ea9fd80 Mon Sep 17 00:00:00 2001 From: Pavel Kruglov Date: Tue, 2 Feb 2021 15:20:13 +0300 Subject: [PATCH 0131/2357] Update test --- tests/integration/test_hedged_requests_parallel/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_hedged_requests_parallel/test.py b/tests/integration/test_hedged_requests_parallel/test.py index 65e44095ded..b662fc9d80c 100644 --- a/tests/integration/test_hedged_requests_parallel/test.py +++ b/tests/integration/test_hedged_requests_parallel/test.py @@ -11,7 +11,7 @@ from helpers.network import PartitionManager cluster = ClickHouseCluster(__file__) -# Cluster with 1 shard of 2 replicas. node is the instance with Distributed table. +# Cluster with 1 shard of 3 replicas. node is the instance with Distributed table. node = cluster.add_instance( 'node', with_zookeeper=True, main_configs=['configs/remote_servers.xml'], user_configs=['configs/users.xml']) node_1 = cluster.add_instance('node_1', with_zookeeper=True, user_configs=['configs/users1.xml']) From ed3de186a4c34fd9c39656b6723f89b3cafc4d40 Mon Sep 17 00:00:00 2001 From: benbiti Date: Tue, 2 Feb 2021 20:26:36 +0800 Subject: [PATCH 0132/2357] [Docs]fix mistype in avg --- docs/en/sql-reference/aggregate-functions/reference/avg.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/aggregate-functions/reference/avg.md b/docs/en/sql-reference/aggregate-functions/reference/avg.md index e2e6aace734..0b80a1be704 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/avg.md +++ b/docs/en/sql-reference/aggregate-functions/reference/avg.md @@ -9,7 +9,7 @@ Calculates the arithmetic mean. 
**Syntax** ``` sql -avgWeighted(x) +avg(x) ``` **Parameter** From 0b4a9ed87a56f8bd83eb4079ab7296784c693942 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Tue, 2 Feb 2021 17:38:42 +0300 Subject: [PATCH 0133/2357] Fix gcc-9 build --- src/Common/TimerDescriptor.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/Common/TimerDescriptor.h b/src/Common/TimerDescriptor.h index 6138ed8f395..debf7cdc899 100644 --- a/src/Common/TimerDescriptor.h +++ b/src/Common/TimerDescriptor.h @@ -39,7 +39,5 @@ public: void setType(int type_) { type = type_; } }; -using TimerDescriptorPtr = std::shared_ptr; - } #endif From dd9af192c56668bb8a323671fce5c11d27ecf254 Mon Sep 17 00:00:00 2001 From: Pavel Kruglov Date: Tue, 2 Feb 2021 18:18:05 +0300 Subject: [PATCH 0134/2357] Update test --- .../test_hedged_requests/configs/users.xml | 2 -- .../integration/test_hedged_requests/test.py | 20 +++++++++---------- .../configs/users.xml | 2 -- 3 files changed, 9 insertions(+), 15 deletions(-) diff --git a/tests/integration/test_hedged_requests/configs/users.xml b/tests/integration/test_hedged_requests/configs/users.xml index 0cf32bf9e1a..c95d73a92ed 100644 --- a/tests/integration/test_hedged_requests/configs/users.xml +++ b/tests/integration/test_hedged_requests/configs/users.xml @@ -3,8 +3,6 @@ in_order - 1 - 1 diff --git a/tests/integration/test_hedged_requests/test.py b/tests/integration/test_hedged_requests/test.py index 5e63d92b6c5..992590b516f 100644 --- a/tests/integration/test_hedged_requests/test.py +++ b/tests/integration/test_hedged_requests/test.py @@ -20,7 +20,7 @@ node_2 = cluster.add_instance('node_2', with_zookeeper=True) config = ''' - <{setting}>{sleep} + <{setting}>30 ''' @@ -45,16 +45,14 @@ def started_cluster(): finally: cluster.shutdown() -def process_test(sleep_setting_name, receive_timeout_name, receive_timeout, sleep_timeout): - node_1.replace_config('/etc/clickhouse-server/users.d/users1.xml', config.format(setting=sleep_setting_name, sleep=sleep_timeout)) +def process_test(sleep_setting_name, receive_timeout_name): + node_1.replace_config('/etc/clickhouse-server/users.d/users1.xml', config.format(setting=sleep_setting_name)) # Restart node to make new config relevant - node_1.restart_clickhouse(sleep_timeout + 1) + node_1.restart_clickhouse(30) - # Without hedged requests select query will last more than sleep_timeout seconds, - # with hedged requests it will last just over receive_timeout - - node.query("SET {setting}={value}".format(setting=receive_timeout_name, value=receive_timeout)) + # Without hedged requests select query will last more than 30 seconds, + # with hedged requests it will last just around 1-2 second start = time.time() node.query("SELECT * FROM distributed"); @@ -66,7 +64,7 @@ def process_test(sleep_setting_name, receive_timeout_name, receive_timeout, slee def test(started_cluster): node.query("INSERT INTO distributed VALUES (1, '2020-01-01')") - process_test("sleep_before_send_hello", "receive_hello_timeout", 1000, 30) - process_test("sleep_before_send_tables_status", "receive_tables_status_timeout", 1000, 30) - process_test("sleep_before_send_data", "receive_data_timeout", 1, 30) + process_test("sleep_before_send_hello", "receive_hello_timeout") + process_test("sleep_before_send_tables_status", "receive_tables_status_timeout") + process_test("sleep_before_send_data", "receive_data_timeout") diff --git a/tests/integration/test_hedged_requests_parallel/configs/users.xml 
b/tests/integration/test_hedged_requests_parallel/configs/users.xml index 0007089f326..c3ba59294a5 100644 --- a/tests/integration/test_hedged_requests_parallel/configs/users.xml +++ b/tests/integration/test_hedged_requests_parallel/configs/users.xml @@ -3,8 +3,6 @@ in_order - 1 - 0 2 From 2c928f11e1c18ee1cb78e33ff08025297256632e Mon Sep 17 00:00:00 2001 From: Pavel Kruglov Date: Tue, 2 Feb 2021 19:39:30 +0300 Subject: [PATCH 0135/2357] Remove code duplication --- src/Client/ConnectionPoolWithFailover.cpp | 91 ++++------------------- src/Client/ConnectionPoolWithFailover.h | 8 +- src/Client/GetHedgedConnections.cpp | 2 +- 3 files changed, 20 insertions(+), 81 deletions(-) diff --git a/src/Client/ConnectionPoolWithFailover.cpp b/src/Client/ConnectionPoolWithFailover.cpp index 8a67d59925a..a7120f16b4d 100644 --- a/src/Client/ConnectionPoolWithFailover.cpp +++ b/src/Client/ConnectionPoolWithFailover.cpp @@ -249,77 +249,10 @@ ConnectionPoolWithFailover::tryGetEntry( const Settings * settings, const QualifiedTableName * table_to_check) { - TryResult result; - try - { - result.entry = pool.get(timeouts, settings, /* force_connected = */ false); - - UInt64 server_revision = 0; - if (table_to_check) - server_revision = result.entry->getServerRevision(timeouts); - - if (!table_to_check || server_revision < DBMS_MIN_REVISION_WITH_TABLES_STATUS) - { - result.entry->forceConnected(timeouts); - result.is_usable = true; - result.is_up_to_date = true; - return result; - } - - /// Only status of the remote table corresponding to the Distributed table is taken into account. - /// TODO: request status for joined tables also. - TablesStatusRequest status_request; - status_request.tables.emplace(*table_to_check); - - TablesStatusResponse status_response = result.entry->getTablesStatus(timeouts, status_request); - auto table_status_it = status_response.table_states_by_id.find(*table_to_check); - if (table_status_it == status_response.table_states_by_id.end()) - { - const char * message_pattern = "There is no table {}.{} on server: {}"; - fail_message = fmt::format(message_pattern, backQuote(table_to_check->database), backQuote(table_to_check->table), result.entry->getDescription()); - LOG_WARNING(log, fail_message); - ProfileEvents::increment(ProfileEvents::DistributedConnectionMissingTable); - - return result; - } - - result.is_usable = true; - - UInt64 max_allowed_delay = settings ? 
UInt64(settings->max_replica_delay_for_distributed_queries) : 0; - if (!max_allowed_delay) - { - result.is_up_to_date = true; - return result; - } - - UInt32 delay = table_status_it->second.absolute_delay; - - if (delay < max_allowed_delay) - result.is_up_to_date = true; - else - { - result.is_up_to_date = false; - result.staleness = delay; - - LOG_TRACE(log, "Server {} has unacceptable replica delay for table {}.{}: {}", result.entry->getDescription(), table_to_check->database, table_to_check->table, delay); - ProfileEvents::increment(ProfileEvents::DistributedConnectionStaleReplica); - } - } - catch (const Exception & e) - { - if (e.code() != ErrorCodes::NETWORK_ERROR && e.code() != ErrorCodes::SOCKET_TIMEOUT - && e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF) - throw; - - fail_message = getCurrentExceptionMessage(/* with_stacktrace = */ false); - - if (!result.entry.isNull()) - { - result.entry->disconnect(); - result.reset(); - } - } - return result; + TryGetConnection try_get_connection(&pool, &timeouts, settings, table_to_check, log, false); + try_get_connection.run(); + fail_message = try_get_connection.fail_message; + return try_get_connection.result; } std::vector ConnectionPoolWithFailover::getShuffledPools(const Settings * settings) @@ -333,10 +266,11 @@ TryGetConnection::TryGetConnection( IConnectionPool * pool_, const ConnectionTimeouts * timeouts_, const Settings * settings_, - std::shared_ptr table_to_check_, - Poco::Logger * log_) : + const QualifiedTableName * table_to_check_, + Poco::Logger * log_, + bool non_blocking_) : pool(pool_), timeouts(timeouts_), settings(settings_), - table_to_check(table_to_check_), log(log_), stage(Stage::CONNECT), socket_fd(-1) + table_to_check(table_to_check_), log(log_), stage(Stage::CONNECT), socket_fd(-1), non_blocking(non_blocking_) { } @@ -386,7 +320,8 @@ void TryGetConnection::run() result.entry->sendHello(); stage = Stage::RECEIVE_HELLO; /// We are waiting for hello from replica. - return; + if (non_blocking) + return; } socket_fd = result.entry->getSocket()->impl()->sockfd(); @@ -411,7 +346,8 @@ void TryGetConnection::run() result.is_usable = true; result.is_up_to_date = true; stage = FINISHED; - return; + if (non_blocking) + return; } TablesStatusRequest status_request; @@ -420,7 +356,8 @@ void TryGetConnection::run() result.entry->sendTablesStatusRequest(status_request); stage = Stage::RECEIVE_TABLES_STATUS; /// We are waiting for tables status response. - return; + if (non_blocking) + return; } if (stage == Stage::RECEIVE_TABLES_STATUS) diff --git a/src/Client/ConnectionPoolWithFailover.h b/src/Client/ConnectionPoolWithFailover.h index f235c5b2e5f..c4248effa81 100644 --- a/src/Client/ConnectionPoolWithFailover.h +++ b/src/Client/ConnectionPoolWithFailover.h @@ -50,8 +50,9 @@ public: TryGetConnection(IConnectionPool * pool_, const ConnectionTimeouts * timeouts_, const Settings * settings_, - std::shared_ptr table_to_check = nullptr, - Poco::Logger * log_ = nullptr); + const QualifiedTableName * table_to_check = nullptr, + Poco::Logger * log_ = nullptr, + bool non_blocking_ = true); /// Continue connecting to replica from previous stage. Initial stage is CONNECT. 
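    /// A call drives the stages CONNECT -> RECEIVE_HELLO -> RECEIVE_TABLES_STATUS towards FINISHED or FAILED.
    /// With non_blocking_ = true it returns right after sending the hello or tables status request, so the
    /// caller can wait for the reply through epoll; with non_blocking_ = false a single run() performs the
    /// whole blocking check (this is how ConnectionPoolWithFailover::tryGetEntry reuses it).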
void run(); @@ -72,11 +73,12 @@ public: const ConnectionTimeouts * timeouts; std::string fail_message; const Settings * settings; - std::shared_ptr table_to_check; + const QualifiedTableName * table_to_check; Poco::Logger * log; TryResult result; Stage stage; int socket_fd; + bool non_blocking; std::function action_before_disconnect; }; diff --git a/src/Client/GetHedgedConnections.cpp b/src/Client/GetHedgedConnections.cpp index a9283a75105..093b4bc930c 100644 --- a/src/Client/GetHedgedConnections.cpp +++ b/src/Client/GetHedgedConnections.cpp @@ -23,7 +23,7 @@ GetHedgedConnections::GetHedgedConnections( { shuffled_pools = pool->getShuffledPools(settings); for (size_t i = 0; i != shuffled_pools.size(); ++i) - try_get_connections.emplace_back(shuffled_pools[i].pool, &timeouts, settings, table_to_check, log); + try_get_connections.emplace_back(shuffled_pools[i].pool, &timeouts, settings, table_to_check.get(), log); max_tries = (settings ? size_t{settings->connections_with_failover_max_tries} : size_t{DBMS_CONNECTION_POOL_WITH_FAILOVER_DEFAULT_MAX_TRIES}); From 6456ccf0da4ae12568c559b40015459da07fb6d6 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Tue, 2 Feb 2021 22:39:04 +0300 Subject: [PATCH 0136/2357] better test --- src/Databases/DatabaseReplicatedWorker.h | 2 +- src/Interpreters/DatabaseCatalog.cpp | 18 +++-- src/Interpreters/DatabaseCatalog.h | 7 +- src/Interpreters/InterpreterAlterQuery.cpp | 10 +-- src/Interpreters/InterpreterCreateQuery.cpp | 2 +- src/Interpreters/InterpreterDropQuery.cpp | 2 +- src/Interpreters/InterpreterRenameQuery.cpp | 12 +++- src/Interpreters/InterpreterRenameQuery.h | 5 +- .../MergeTree/registerStorageMergeTree.cpp | 8 ++- .../configs/config.xml | 31 ++++++++ .../test_replicated_database/test.py | 71 +++++++++++-------- 11 files changed, 112 insertions(+), 56 deletions(-) diff --git a/src/Databases/DatabaseReplicatedWorker.h b/src/Databases/DatabaseReplicatedWorker.h index 7e6d64dab0b..6e29e48469b 100644 --- a/src/Databases/DatabaseReplicatedWorker.h +++ b/src/Databases/DatabaseReplicatedWorker.h @@ -21,7 +21,7 @@ private: DDLTaskPtr initAndCheckTask(const String & entry_name, String & out_reason, const ZooKeeperPtr & zookeeper) override; - DatabaseReplicated * database; + DatabaseReplicated * const database; mutable std::mutex mutex; std::condition_variable wait_current_task_change; String current_task; diff --git a/src/Interpreters/DatabaseCatalog.cpp b/src/Interpreters/DatabaseCatalog.cpp index 18cf69675ba..4ab3fb28785 100644 --- a/src/Interpreters/DatabaseCatalog.cpp +++ b/src/Interpreters/DatabaseCatalog.cpp @@ -956,21 +956,25 @@ DDLGuard::DDLGuard(Map & map_, std::shared_mutex & db_mutex_, std::unique_locksecond.counter; guards_lock.unlock(); table_lock = std::unique_lock(*it->second.mutex); - bool is_database = elem.empty(); - if (!is_database) + is_database_guard = elem.empty(); + if (!is_database_guard) { bool locked_database_for_read = db_mutex.try_lock_shared(); if (!locked_database_for_read) { - removeTableLock(); + releaseTableLock(); throw Exception(ErrorCodes::UNKNOWN_DATABASE, "Database {} is currently dropped or renamed", database_name); } } } -void DDLGuard::removeTableLock() +void DDLGuard::releaseTableLock() noexcept { + if (table_lock_removed) + return; + + table_lock_removed = true; guards_lock.lock(); --it->second.counter; if (!it->second.counter) @@ -978,14 +982,14 @@ void DDLGuard::removeTableLock() table_lock.unlock(); map.erase(it); } + guards_lock.unlock(); } DDLGuard::~DDLGuard() { - bool is_database = it->first.empty(); - 
if (!is_database) + if (!is_database_guard) db_mutex.unlock_shared(); - removeTableLock(); + releaseTableLock(); } } diff --git a/src/Interpreters/DatabaseCatalog.h b/src/Interpreters/DatabaseCatalog.h index 5146c786f64..c9f031ef678 100644 --- a/src/Interpreters/DatabaseCatalog.h +++ b/src/Interpreters/DatabaseCatalog.h @@ -54,14 +54,17 @@ public: DDLGuard(Map & map_, std::shared_mutex & db_mutex_, std::unique_lock guards_lock_, const String & elem, const String & database_name); ~DDLGuard(); + /// Unlocks table name, keeps holding read lock for database name + void releaseTableLock() noexcept; + private: Map & map; std::shared_mutex & db_mutex; Map::iterator it; std::unique_lock guards_lock; std::unique_lock table_lock; - - void removeTableLock(); + bool table_lock_removed = false; + bool is_database_guard = false; }; diff --git a/src/Interpreters/InterpreterAlterQuery.cpp b/src/Interpreters/InterpreterAlterQuery.cpp index 0edd1a401b3..612f9833af5 100644 --- a/src/Interpreters/InterpreterAlterQuery.cpp +++ b/src/Interpreters/InterpreterAlterQuery.cpp @@ -47,17 +47,19 @@ BlockIO InterpreterAlterQuery::execute() context.checkAccess(getRequiredAccess()); auto table_id = context.resolveStorageID(alter, Context::ResolveOrdinary); - StoragePtr table = DatabaseCatalog::instance().getTable(table_id, context); - auto alter_lock = table->lockForAlter(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); - auto metadata_snapshot = table->getInMemoryMetadataPtr(); DatabasePtr database = DatabaseCatalog::instance().getDatabase(table_id.database_name); if (typeid_cast(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { - alter_lock.reset(); + auto guard = DatabaseCatalog::instance().getDDLGuard(table_id.database_name, table_id.table_name); + guard->releaseTableLock(); return typeid_cast(database.get())->propose(query_ptr); } + StoragePtr table = DatabaseCatalog::instance().getTable(table_id, context); + auto alter_lock = table->lockForAlter(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); + auto metadata_snapshot = table->getInMemoryMetadataPtr(); + //FIXME commit MetadataTransaction for all ALTER kinds. Now its' implemented only for metadata alter. /// Add default database to table identifiers that we can encounter in e.g. default expressions, diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index d91f3140a96..8d344545c8a 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -885,7 +885,7 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create) if (typeid_cast(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { assertOrSetUUID(create, database); - guard.reset(); + guard->releaseTableLock(); return typeid_cast(database.get())->propose(query_ptr); } } diff --git a/src/Interpreters/InterpreterDropQuery.cpp b/src/Interpreters/InterpreterDropQuery.cpp index 68680f27ea4..db2f463893e 100644 --- a/src/Interpreters/InterpreterDropQuery.cpp +++ b/src/Interpreters/InterpreterDropQuery.cpp @@ -137,7 +137,7 @@ BlockIO InterpreterDropQuery::executeToTableImpl(const ASTDropQuery & query, Dat throw Exception(ErrorCodes::INCORRECT_QUERY, "DETACH TABLE is not allowed for Replicated databases. 
" "Use DETACH TABLE PERMANENTLY or SYSTEM RESTART REPLICA"); - ddl_guard.reset(); + ddl_guard->releaseTableLock(); table.reset(); return typeid_cast(database.get())->propose(query.clone()); } diff --git a/src/Interpreters/InterpreterRenameQuery.cpp b/src/Interpreters/InterpreterRenameQuery.cpp index 52faa89eff1..d2f79ba071c 100644 --- a/src/Interpreters/InterpreterRenameQuery.cpp +++ b/src/Interpreters/InterpreterRenameQuery.cpp @@ -43,6 +43,9 @@ BlockIO InterpreterRenameQuery::execute() RenameDescriptions descriptions; descriptions.reserve(rename.elements.size()); + /// Don't allow to drop tables (that we are renaming); don't allow to create tables in places where tables will be renamed. + TableGuards table_guards; + for (const auto & elem : rename.elements) { descriptions.emplace_back(elem, current_database); @@ -64,10 +67,10 @@ BlockIO InterpreterRenameQuery::execute() if (rename.database) return executeToDatabase(rename, descriptions); else - return executeToTables(rename, descriptions); + return executeToTables(rename, descriptions, table_guards); } -BlockIO InterpreterRenameQuery::executeToTables(const ASTRenameQuery & rename, const RenameDescriptions & descriptions) +BlockIO InterpreterRenameQuery::executeToTables(const ASTRenameQuery & rename, const RenameDescriptions & descriptions, TableGuards & ddl_guards) { auto & database_catalog = DatabaseCatalog::instance(); @@ -83,7 +86,10 @@ BlockIO InterpreterRenameQuery::executeToTables(const ASTRenameQuery & rename, c throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Database {} is Replicated, " "it does not support renaming of multiple tables in single query.", elem.from_database_name); - table_guards.clear(); + UniqueTableName from(elem.from_database_name, elem.from_table_name); + UniqueTableName to(elem.to_database_name, elem.to_table_name); + ddl_guards[from]->releaseTableLock(); + ddl_guards[to]->releaseTableLock(); return typeid_cast(database.get())->propose(query_ptr); } else diff --git a/src/Interpreters/InterpreterRenameQuery.h b/src/Interpreters/InterpreterRenameQuery.h index 2bc84514b4c..0da25f63e8d 100644 --- a/src/Interpreters/InterpreterRenameQuery.h +++ b/src/Interpreters/InterpreterRenameQuery.h @@ -57,16 +57,13 @@ public: void extendQueryLogElemImpl(QueryLogElement & elem, const ASTPtr & ast, const Context &) const override; private: - BlockIO executeToTables(const ASTRenameQuery & rename, const RenameDescriptions & descriptions); + BlockIO executeToTables(const ASTRenameQuery & rename, const RenameDescriptions & descriptions, TableGuards & ddl_guards); static BlockIO executeToDatabase(const ASTRenameQuery & rename, const RenameDescriptions & descriptions); AccessRightsElements getRequiredAccess() const; ASTPtr query_ptr; Context & context; - - /// Don't allow to drop tables (that we are renaming); don't allow to create tables in places where tables will be renamed. 
- TableGuards table_guards; }; } diff --git a/src/Storages/MergeTree/registerStorageMergeTree.cpp b/src/Storages/MergeTree/registerStorageMergeTree.cpp index 9a881a60a69..1d68f788a42 100644 --- a/src/Storages/MergeTree/registerStorageMergeTree.cpp +++ b/src/Storages/MergeTree/registerStorageMergeTree.cpp @@ -450,17 +450,21 @@ static StoragePtr create(const StorageFactory::Arguments & args) arg_cnt += 2; } else - throw Exception("Expected two string literal arguments: zookeper_path and replica_name", ErrorCodes::BAD_ARGUMENTS); + throw Exception("Expected two string literal arguments: zookeeper_path and replica_name", ErrorCodes::BAD_ARGUMENTS); /// Allow implicit {uuid} macros only for zookeeper_path in ON CLUSTER queries bool is_on_cluster = args.local_context.getClientInfo().query_kind == ClientInfo::QueryKind::SECONDARY_QUERY; - bool is_replicated_database = args.local_context.getClientInfo().query_kind == ClientInfo::QueryKind::REPLICATED_LOG_QUERY; + bool is_replicated_database = args.local_context.getClientInfo().query_kind == ClientInfo::QueryKind::REPLICATED_LOG_QUERY && + DatabaseCatalog::instance().getDatabase(args.table_id.database_name)->getEngineName() == "Replicated"; bool allow_uuid_macro = is_on_cluster || is_replicated_database || args.query.attach; /// Unfold {database} and {table} macro on table creation, so table can be renamed. /// We also unfold {uuid} macro, so path will not be broken after moving table from Atomic to Ordinary database. if (!args.attach) { + if (is_replicated_database && !is_extended_storage_def) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Old syntax is not allowed for ReplicatedMergeTree tables in Replicated databases"); + Macros::MacroExpansionInfo info; /// NOTE: it's not recursive info.expand_special_macros_only = true; diff --git a/tests/integration/test_replicated_database/configs/config.xml b/tests/integration/test_replicated_database/configs/config.xml index d751454437c..ebceee3aa5c 100644 --- a/tests/integration/test_replicated_database/configs/config.xml +++ b/tests/integration/test_replicated_database/configs/config.xml @@ -1,3 +1,34 @@ 10 + + + + + true + + main_node + 9000 + + + dummy_node + 9000 + + + competing_node + 9000 + + + + true + + snapshotting_node + 9000 + + + snapshot_recovering_node + 9000 + + + + diff --git a/tests/integration/test_replicated_database/test.py b/tests/integration/test_replicated_database/test.py index f99f4517e5a..2471228b55e 100644 --- a/tests/integration/test_replicated_database/test.py +++ b/tests/integration/test_replicated_database/test.py @@ -13,6 +13,8 @@ competing_node = cluster.add_instance('competing_node', main_configs=['configs/c snapshotting_node = cluster.add_instance('snapshotting_node', main_configs=['configs/config.xml'], with_zookeeper=True, macros={"shard": 2, "replica": 1}) snapshot_recovering_node = cluster.add_instance('snapshot_recovering_node', main_configs=['configs/config.xml'], with_zookeeper=True, macros={"shard": 2, "replica": 2}) +all_nodes = [main_node, dummy_node, competing_node, snapshotting_node, snapshot_recovering_node] + uuid_regex = re.compile("[0-9a-f]{8}\-[0-9a-f]{4}\-[0-9a-f]{4}\-[0-9a-f]{4}\-[0-9a-f]{12}") def assert_create_query(nodes, table_name, expected): replace_uuid = lambda x: re.sub(uuid_regex, "uuid", x) @@ -31,11 +33,10 @@ def started_cluster(): finally: cluster.shutdown() -#TODO better tests - def test_create_replicated_table(started_cluster): - #FIXME should fail (replicated with old syntax) - #main_node.query("CREATE TABLE testdb.replicated_table (d 
Date, k UInt64, i32 Int32) ENGINE=ReplicatedMergeTree(d, k, 8192);") + assert "Old syntax is not allowed" in \ + main_node.query_and_get_error("CREATE TABLE testdb.replicated_table (d Date, k UInt64, i32 Int32) ENGINE=ReplicatedMergeTree('/test/tmp', 'r', d, k, 8192);") + main_node.query("CREATE TABLE testdb.replicated_table (d Date, k UInt64, i32 Int32) ENGINE=ReplicatedMergeTree ORDER BY k PARTITION BY toYYYYMM(d);") expected = "CREATE TABLE testdb.replicated_table\\n(\\n `d` Date,\\n `k` UInt64,\\n `i32` Int32\\n)\\n" \ @@ -47,6 +48,7 @@ def test_create_replicated_table(started_cluster): @pytest.mark.parametrize("engine", ['MergeTree', 'ReplicatedMergeTree']) def test_simple_alter_table(started_cluster, engine): + # test_simple_alter_table name = "testdb.alter_test_{}".format(engine) main_node.query("CREATE TABLE {} " "(CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) " @@ -69,10 +71,7 @@ def test_simple_alter_table(started_cluster, engine): assert_create_query([main_node, dummy_node], name, expected) - -@pytest.mark.dependency(depends=['test_simple_alter_table']) -@pytest.mark.parametrize("engine", ['MergeTree', 'ReplicatedMergeTree']) -def test_create_replica_after_delay(started_cluster, engine): + # test_create_replica_after_delay competing_node.query("CREATE DATABASE IF NOT EXISTS testdb ENGINE = Replicated('/clickhouse/databases/test1', 'shard1', 'replica3');") name = "testdb.alter_test_{}".format(engine) @@ -90,13 +89,17 @@ def test_create_replica_after_delay(started_cluster, engine): assert_create_query([main_node, dummy_node, competing_node], name, expected) -@pytest.mark.dependency(depends=['test_create_replica_after_delay']) + def test_alters_from_different_replicas(started_cluster): + # test_alters_from_different_replicas + competing_node.query("CREATE DATABASE IF NOT EXISTS testdb ENGINE = Replicated('/clickhouse/databases/test1', 'shard1', 'replica3');") + main_node.query("CREATE TABLE testdb.concurrent_test " "(CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) " "ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192);") - time.sleep(1) #FIXME + main_node.query("CREATE TABLE testdb.dist AS testdb.concurrent_test ENGINE = Distributed(cluster, testdb, concurrent_test, CounterID)") + dummy_node.kill_clickhouse(stop_start_wait_sec=0) competing_node.query("ALTER TABLE testdb.concurrent_test ADD COLUMN Added0 UInt32;") @@ -115,50 +118,56 @@ def test_alters_from_different_replicas(started_cluster): assert_create_query([main_node, competing_node], "testdb.concurrent_test", expected) -@pytest.mark.dependency(depends=['test_alters_from_different_replicas']) -def test_drop_and_create_table(started_cluster): + # test_create_replica_after_delay main_node.query("DROP TABLE testdb.concurrent_test") main_node.query("CREATE TABLE testdb.concurrent_test " "(CounterID UInt32, StartDate Date, UserID UInt32, VisitID UInt32, NestedColumn Nested(A UInt8, S String), ToDrop UInt32) " - "ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192);") + "ENGINE = ReplicatedMergeTree ORDER BY CounterID;") expected = "CREATE TABLE testdb.concurrent_test\\n(\\n `CounterID` UInt32,\\n `StartDate` Date,\\n `UserID` UInt32,\\n" \ " `VisitID` UInt32,\\n `NestedColumn.A` Array(UInt8),\\n `NestedColumn.S` Array(String),\\n `ToDrop` UInt32\\n)\\n" \ - "ENGINE = 
MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192)" + "ENGINE = ReplicatedMergeTree(\\'/clickhouse/tables/uuid/{shard}\\', \\'{replica}\\')\\nORDER BY CounterID\\nSETTINGS index_granularity = 8192" assert_create_query([main_node, competing_node], "testdb.concurrent_test", expected) -@pytest.mark.dependency(depends=['test_drop_and_create_table']) -def test_replica_restart(started_cluster): + main_node.query("INSERT INTO testdb.dist (CounterID, StartDate, UserID) SELECT number, addDays(toDate('2020-02-02'), number), intHash32(number) FROM numbers(10)") + + # test_replica_restart main_node.restart_clickhouse() expected = "CREATE TABLE testdb.concurrent_test\\n(\\n `CounterID` UInt32,\\n `StartDate` Date,\\n `UserID` UInt32,\\n" \ " `VisitID` UInt32,\\n `NestedColumn.A` Array(UInt8),\\n `NestedColumn.S` Array(String),\\n `ToDrop` UInt32\\n)\\n" \ - "ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192)" - - assert_create_query([main_node, competing_node], "testdb.concurrent_test", expected) + "ENGINE = ReplicatedMergeTree(\\'/clickhouse/tables/uuid/{shard}\\', \\'{replica}\\')\\nORDER BY CounterID\\nSETTINGS index_granularity = 8192" -@pytest.mark.dependency(depends=['test_replica_restart']) -def test_snapshot_and_snapshot_recover(started_cluster): - snapshotting_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'shard1', 'replica4');") - snapshot_recovering_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'shard1', 'replica5');") + # test_snapshot_and_snapshot_recover + snapshotting_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'shard2', 'replica1');") + snapshot_recovering_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'shard2', 'replica2');") + assert_create_query(all_nodes, "testdb.concurrent_test", expected) - assert_eq_with_retry(snapshotting_node, "select count() from system.tables where name like 'alter_test_%'", "2\n") - assert_eq_with_retry(snapshot_recovering_node, "select count() from system.tables where name like 'alter_test_%'", "2\n") - assert snapshotting_node.query("desc table testdb.alter_test_MergeTree") == snapshot_recovering_node.query("desc table testdb.alter_test_MergeTree") - assert snapshotting_node.query("desc table testdb.alter_test_ReplicatedMergeTree") == snapshot_recovering_node.query("desc table testdb.alter_test_ReplicatedMergeTree") + main_node.query("SYSTEM FLUSH DISTRIBUTED testdb.dist") + main_node.query("ALTER TABLE testdb.concurrent_test UPDATE StartDate = addYears(StartDate, 1) WHERE 1") + main_node.query("ALTER TABLE testdb.concurrent_test DELETE WHERE UserID % 2") -@pytest.mark.dependency(depends=['test_replica_restart']) -def test_drop_and_create_replica(started_cluster): + # test_drop_and_create_replica main_node.query("DROP DATABASE testdb") main_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'shard1', 'replica1');") expected = "CREATE TABLE testdb.concurrent_test\\n(\\n `CounterID` UInt32,\\n `StartDate` Date,\\n `UserID` UInt32,\\n" \ " `VisitID` UInt32,\\n `NestedColumn.A` Array(UInt8),\\n `NestedColumn.S` Array(String),\\n `ToDrop` UInt32\\n)\\n" \ - "ENGINE = MergeTree(StartDate, intHash32(UserID), (CounterID, StartDate, intHash32(UserID), VisitID), 8192)" + "ENGINE = ReplicatedMergeTree(\\'/clickhouse/tables/uuid/{shard}\\', \\'{replica}\\')\\nORDER BY 
CounterID\\nSETTINGS index_granularity = 8192" assert_create_query([main_node, competing_node], "testdb.concurrent_test", expected) + assert_create_query(all_nodes, "testdb.concurrent_test", expected) -#TODO tests with Distributed + for node in all_nodes: + node.query("SYSTEM SYNC REPLICA testdb.concurrent_test") + + expected = "0\t2021-02-02\t4249604106\n" \ + "1\t2021-02-03\t1343103100\n" \ + "4\t2021-02-06\t3902320246\n" \ + "7\t2021-02-09\t3844986530\n" \ + "9\t2021-02-11\t1241149650\n" + + assert_eq_with_retry(dummy_node, "SELECT CounterID, StartDate, UserID FROM testdb.dist ORDER BY CounterID", expected) From b8ae9caa619a42e65fe39ea24d21ffbbee1859b1 Mon Sep 17 00:00:00 2001 From: Pavel Kruglov Date: Wed, 3 Feb 2021 14:27:26 +0300 Subject: [PATCH 0137/2357] Fix style --- src/Client/ConnectionPoolWithFailover.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/Client/ConnectionPoolWithFailover.cpp b/src/Client/ConnectionPoolWithFailover.cpp index a7120f16b4d..a0dfe1a1a8c 100644 --- a/src/Client/ConnectionPoolWithFailover.cpp +++ b/src/Client/ConnectionPoolWithFailover.cpp @@ -24,8 +24,6 @@ namespace DB namespace ErrorCodes { extern const int ATTEMPT_TO_READ_AFTER_EOF; - extern const int NETWORK_ERROR; - extern const int SOCKET_TIMEOUT; extern const int LOGICAL_ERROR; } From 3fc8b294e8275294fd3aaafaa1d1f22aae4d8a03 Mon Sep 17 00:00:00 2001 From: Pavel Kruglov Date: Wed, 3 Feb 2021 15:56:42 +0300 Subject: [PATCH 0138/2357] Reset changes in tryGetEntry --- src/Client/ConnectionPoolWithFailover.cpp | 91 +++++++++++++++++++---- src/Client/ConnectionPoolWithFailover.h | 4 +- 2 files changed, 79 insertions(+), 16 deletions(-) diff --git a/src/Client/ConnectionPoolWithFailover.cpp b/src/Client/ConnectionPoolWithFailover.cpp index a0dfe1a1a8c..3e41c26fb65 100644 --- a/src/Client/ConnectionPoolWithFailover.cpp +++ b/src/Client/ConnectionPoolWithFailover.cpp @@ -24,6 +24,8 @@ namespace DB namespace ErrorCodes { extern const int ATTEMPT_TO_READ_AFTER_EOF; + extern const int NETWORK_ERROR; + extern const int SOCKET_TIMEOUT; extern const int LOGICAL_ERROR; } @@ -247,10 +249,77 @@ ConnectionPoolWithFailover::tryGetEntry( const Settings * settings, const QualifiedTableName * table_to_check) { - TryGetConnection try_get_connection(&pool, &timeouts, settings, table_to_check, log, false); - try_get_connection.run(); - fail_message = try_get_connection.fail_message; - return try_get_connection.result; + TryResult result; + try + { + result.entry = pool.get(timeouts, settings, /* force_connected = */ false); + + UInt64 server_revision = 0; + if (table_to_check) + server_revision = result.entry->getServerRevision(timeouts); + + if (!table_to_check || server_revision < DBMS_MIN_REVISION_WITH_TABLES_STATUS) + { + result.entry->forceConnected(timeouts); + result.is_usable = true; + result.is_up_to_date = true; + return result; + } + + /// Only status of the remote table corresponding to the Distributed table is taken into account. + /// TODO: request status for joined tables also. 
+ TablesStatusRequest status_request; + status_request.tables.emplace(*table_to_check); + + TablesStatusResponse status_response = result.entry->getTablesStatus(timeouts, status_request); + auto table_status_it = status_response.table_states_by_id.find(*table_to_check); + if (table_status_it == status_response.table_states_by_id.end()) + { + const char * message_pattern = "There is no table {}.{} on server: {}"; + fail_message = fmt::format(message_pattern, backQuote(table_to_check->database), backQuote(table_to_check->table), result.entry->getDescription()); + LOG_WARNING(log, fail_message); + ProfileEvents::increment(ProfileEvents::DistributedConnectionMissingTable); + + return result; + } + + result.is_usable = true; + + UInt64 max_allowed_delay = settings ? UInt64(settings->max_replica_delay_for_distributed_queries) : 0; + if (!max_allowed_delay) + { + result.is_up_to_date = true; + return result; + } + + UInt32 delay = table_status_it->second.absolute_delay; + + if (delay < max_allowed_delay) + result.is_up_to_date = true; + else + { + result.is_up_to_date = false; + result.staleness = delay; + + LOG_TRACE(log, "Server {} has unacceptable replica delay for table {}.{}: {}", result.entry->getDescription(), table_to_check->database, table_to_check->table, delay); + ProfileEvents::increment(ProfileEvents::DistributedConnectionStaleReplica); + } + } + catch (const Exception & e) + { + if (e.code() != ErrorCodes::NETWORK_ERROR && e.code() != ErrorCodes::SOCKET_TIMEOUT + && e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF) + throw; + + fail_message = getCurrentExceptionMessage(/* with_stacktrace = */ false); + + if (!result.entry.isNull()) + { + result.entry->disconnect(); + result.reset(); + } + } + return result; } std::vector ConnectionPoolWithFailover::getShuffledPools(const Settings * settings) @@ -265,10 +334,9 @@ TryGetConnection::TryGetConnection( const ConnectionTimeouts * timeouts_, const Settings * settings_, const QualifiedTableName * table_to_check_, - Poco::Logger * log_, - bool non_blocking_) : + Poco::Logger * log_) : pool(pool_), timeouts(timeouts_), settings(settings_), - table_to_check(table_to_check_), log(log_), stage(Stage::CONNECT), socket_fd(-1), non_blocking(non_blocking_) + table_to_check(table_to_check_), log(log_), stage(Stage::CONNECT), socket_fd(-1) { } @@ -318,8 +386,7 @@ void TryGetConnection::run() result.entry->sendHello(); stage = Stage::RECEIVE_HELLO; /// We are waiting for hello from replica. - if (non_blocking) - return; + return; } socket_fd = result.entry->getSocket()->impl()->sockfd(); @@ -344,8 +411,7 @@ void TryGetConnection::run() result.is_usable = true; result.is_up_to_date = true; stage = FINISHED; - if (non_blocking) - return; + return; } TablesStatusRequest status_request; @@ -354,8 +420,7 @@ void TryGetConnection::run() result.entry->sendTablesStatusRequest(status_request); stage = Stage::RECEIVE_TABLES_STATUS; /// We are waiting for tables status response. 
- if (non_blocking) - return; + return; } if (stage == Stage::RECEIVE_TABLES_STATUS) diff --git a/src/Client/ConnectionPoolWithFailover.h b/src/Client/ConnectionPoolWithFailover.h index c4248effa81..a6c0b9e8070 100644 --- a/src/Client/ConnectionPoolWithFailover.h +++ b/src/Client/ConnectionPoolWithFailover.h @@ -51,8 +51,7 @@ public: const ConnectionTimeouts * timeouts_, const Settings * settings_, const QualifiedTableName * table_to_check = nullptr, - Poco::Logger * log_ = nullptr, - bool non_blocking_ = true); + Poco::Logger * log_ = nullptr); /// Continue connecting to replica from previous stage. Initial stage is CONNECT. void run(); @@ -78,7 +77,6 @@ public: TryResult result; Stage stage; int socket_fd; - bool non_blocking; std::function action_before_disconnect; }; From 066fb4c82bd33744dc8a99d34d88674d83764ba1 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Wed, 3 Feb 2021 23:02:37 +0300 Subject: [PATCH 0139/2357] fix --- src/Databases/DatabaseReplicatedWorker.cpp | 2 +- src/Interpreters/DDLWorker.cpp | 9 +- src/Interpreters/DDLWorker.h | 2 +- src/Interpreters/DatabaseCatalog.cpp | 8 +- src/Interpreters/executeDDLQueryOnCluster.cpp | 2 +- .../0_stateless/01238_http_memory_tracking.sh | 3 + .../01281_group_by_limit_memory_tracking.sh | 3 + .../01541_max_memory_usage_for_user.sh | 3 + tests/queries/skip_list.json | 128 +++++++++++++++++- 9 files changed, 147 insertions(+), 13 deletions(-) diff --git a/src/Databases/DatabaseReplicatedWorker.cpp b/src/Databases/DatabaseReplicatedWorker.cpp index a1cdff204c7..5af216c3d0d 100644 --- a/src/Databases/DatabaseReplicatedWorker.cpp +++ b/src/Databases/DatabaseReplicatedWorker.cpp @@ -93,7 +93,7 @@ String DatabaseReplicatedDDLWorker::tryEnqueueAndExecuteEntry(DDLLogEntry & entr LOG_DEBUG(log, "Waiting for worker thread to process all entries before {}", entry_name); { std::unique_lock lock{mutex}; - wait_current_task_change.wait(lock, [&]() { assert(current_task <= entry_name); return zookeeper->expired() || current_task == entry_name; }); + wait_current_task_change.wait(lock, [&]() { assert(zookeeper->expired() || current_task <= entry_name); return zookeeper->expired() || current_task == entry_name; }); } if (zookeeper->expired()) diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index 4470a3649c5..545e00296e8 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -357,7 +357,7 @@ void DDLWorker::scheduleTasks() if (!task) { LOG_DEBUG(log, "Will not execute task {}: {}", entry_name, reason); - updateMaxDDLEntryID(*task); + updateMaxDDLEntryID(entry_name); continue; } @@ -449,9 +449,9 @@ bool DDLWorker::tryExecuteQuery(const String & query, DDLTaskBase & task) return true; } -void DDLWorker::updateMaxDDLEntryID(const DDLTaskBase & task) +void DDLWorker::updateMaxDDLEntryID(const String & entry_name) { - DB::ReadBufferFromString in(task.entry_name); + DB::ReadBufferFromString in(entry_name); DB::assertString("query-", in); UInt64 id; readText(id, in); @@ -511,6 +511,7 @@ void DDLWorker::processTask(DDLTaskBase & task) if (task.execute_on_leader) { + tryExecuteQueryOnLeaderReplica(task, storage, rewritten_query, task.entry_path, zookeeper); } else { @@ -549,7 +550,7 @@ void DDLWorker::processTask(DDLTaskBase & task) task.was_executed = true; } - updateMaxDDLEntryID(task); + updateMaxDDLEntryID(task.entry_name); /// FIXME: if server fails right here, the task will be executed twice. We need WAL here. /// If ZooKeeper connection is lost here, we will try again to write query status. 
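
The `updateMaxDDLEntryID` hunk above now takes the entry name directly, so the maximum processed entry can also be advanced for entries that are skipped before a full task object is built. As the `assertString("query-", in)` call in that hunk implies, entry names are expected to look like `query-<number>`. A minimal standalone sketch of that parsing step, using only the standard library (the helper name below is hypothetical and not part of the patch):

```cpp
#include <charconv>
#include <cstdint>
#include <stdexcept>
#include <string>

/// Hypothetical standalone equivalent of the parsing done in updateMaxDDLEntryID:
/// strip the "query-" prefix and read the numeric suffix as an unsigned 64-bit id.
uint64_t parseDDLEntryNumber(const std::string & entry_name)
{
    static const std::string prefix = "query-";
    if (entry_name.compare(0, prefix.size(), prefix) != 0)
        throw std::invalid_argument("DDL entry name must start with 'query-': " + entry_name);

    uint64_t id = 0;
    const char * first = entry_name.data() + prefix.size();
    const char * last = entry_name.data() + entry_name.size();
    auto [ptr, ec] = std::from_chars(first, last, id);
    if (ec != std::errc() || ptr != last)
        throw std::invalid_argument("Bad numeric suffix in DDL entry name: " + entry_name);
    return id;
}
```
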
diff --git a/src/Interpreters/DDLWorker.h b/src/Interpreters/DDLWorker.h index 6124e5ee8ec..d9fd4e58cb6 100644 --- a/src/Interpreters/DDLWorker.h +++ b/src/Interpreters/DDLWorker.h @@ -73,7 +73,7 @@ protected: virtual DDLTaskPtr initAndCheckTask(const String & entry_name, String & out_reason, const ZooKeeperPtr & zookeeper); void processTask(DDLTaskBase & task); - void updateMaxDDLEntryID(const DDLTaskBase & task); + void updateMaxDDLEntryID(const String & entry_name); /// Check that query should be executed on leader replica only static bool taskShouldBeExecutedOnLeader(const ASTPtr ast_ddl, StoragePtr storage); diff --git a/src/Interpreters/DatabaseCatalog.cpp b/src/Interpreters/DatabaseCatalog.cpp index 4ab3fb28785..6313da7132d 100644 --- a/src/Interpreters/DatabaseCatalog.cpp +++ b/src/Interpreters/DatabaseCatalog.cpp @@ -976,12 +976,10 @@ void DDLGuard::releaseTableLock() noexcept table_lock_removed = true; guards_lock.lock(); - --it->second.counter; - if (!it->second.counter) - { - table_lock.unlock(); + UInt32 counter = --it->second.counter; + table_lock.unlock(); + if (counter == 0) map.erase(it); - } guards_lock.unlock(); } diff --git a/src/Interpreters/executeDDLQueryOnCluster.cpp b/src/Interpreters/executeDDLQueryOnCluster.cpp index fb155e82926..a0148316610 100644 --- a/src/Interpreters/executeDDLQueryOnCluster.cpp +++ b/src/Interpreters/executeDDLQueryOnCluster.cpp @@ -33,7 +33,7 @@ bool isSupportedAlterType(int type) { assert(type != ASTAlterCommand::NO_TYPE); static const std::unordered_set unsupported_alter_types{ - /// It's dangerous, because it may duplicate data if executed on multiple replicas + /// It's dangerous, because it may duplicate data if executed on multiple replicas. We can allow it after #18978 ASTAlterCommand::ATTACH_PARTITION, /// Usually followed by ATTACH PARTITION ASTAlterCommand::FETCH_PARTITION, diff --git a/tests/queries/0_stateless/01238_http_memory_tracking.sh b/tests/queries/0_stateless/01238_http_memory_tracking.sh index 90a7611c7c7..8c900e4c208 100755 --- a/tests/queries/0_stateless/01238_http_memory_tracking.sh +++ b/tests/queries/0_stateless/01238_http_memory_tracking.sh @@ -18,3 +18,6 @@ yes 'SELECT 1' 2>/dev/null | { } | grep -x -c 1 wait + +# Reset max_memory_usage_for_user, so it will not affect other tests +${CLICKHOUSE_CLIENT} --max_memory_usage_for_user=0 -q "SELECT 1 FORMAT Null" diff --git a/tests/queries/0_stateless/01281_group_by_limit_memory_tracking.sh b/tests/queries/0_stateless/01281_group_by_limit_memory_tracking.sh index 285e2ab8dad..222f7edd787 100755 --- a/tests/queries/0_stateless/01281_group_by_limit_memory_tracking.sh +++ b/tests/queries/0_stateless/01281_group_by_limit_memory_tracking.sh @@ -42,3 +42,6 @@ execute_group_by # if memory accounting will be incorrect, the second query will be failed with MEMORY_LIMIT_EXCEEDED execute_group_by wait + +# Reset max_memory_usage_for_user, so it will not affect other tests +${CLICKHOUSE_CLIENT} --max_memory_usage_for_user=0 -q "SELECT 1 FORMAT Null" diff --git a/tests/queries/0_stateless/01541_max_memory_usage_for_user.sh b/tests/queries/0_stateless/01541_max_memory_usage_for_user.sh index c81bd1a6ce4..32877bfd0fe 100755 --- a/tests/queries/0_stateless/01541_max_memory_usage_for_user.sh +++ b/tests/queries/0_stateless/01541_max_memory_usage_for_user.sh @@ -66,4 +66,7 @@ echo 'OK' ${CLICKHOUSE_CLIENT} --query "DROP USER test_01541"; +# Reset max_memory_usage_for_user, so it will not affect other tests +${CLICKHOUSE_CLIENT} --max_memory_usage_for_user=0 -q "SELECT 1 FORMAT Null" + 
exit 0 diff --git a/tests/queries/skip_list.json b/tests/queries/skip_list.json index 3311eb3882d..273e00c8a23 100644 --- a/tests/queries/skip_list.json +++ b/tests/queries/skip_list.json @@ -100,7 +100,133 @@ "00604_show_create_database", "00609_mv_index_in_in", "00510_materizlized_view_and_deduplication_zookeeper", - "00738_lock_for_inner_table" + "memory_tracking", /// FIXME remove it before merge + "memory_tracking", + "memory_usage", + "00738_lock_for_inner_table", + "01666_blns", + "01652_ignore_and_low_cardinality", + "01651_map_functions", + "01650_fetch_patition_with_macro_in_zk_path", + "01648_mutations_and_escaping", + "01640_marks_corruption_regression", + "01622_byte_size", + "01611_string_to_low_cardinality_key_alter", + "01602_show_create_view", + "01600_log_queries_with_extensive_info", + "01560_ttl_remove_empty_parts", + "01554_bloom_filter_index_big_integer_uuid", + "01550_type_map_formats_input", + "01550_type_map_formats", + "01550_create_map_type", + "01532_primary_key_without_order_by_zookeeper", + "01511_alter_version_versioned_collapsing_merge_tree_zookeeper", + "01509_parallel_quorum_insert_no_replicas", + "01504_compression_multiple_streams", + "01494_storage_join_persistency", + "01493_storage_set_persistency", + "01493_alter_remove_properties_zookeeper", + "01475_read_subcolumns_storages", + "01475_read_subcolumns", + "01463_test_alter_live_view_refresh", + "01451_replicated_detach_drop_part", + "01451_detach_drop_part", + "01440_big_int_exotic_casts", + "01430_modify_sample_by_zookeeper", + "01417_freeze_partition_verbose_zookeeper", + "01417_freeze_partition_verbose", + "01396_inactive_replica_cleanup_nodes_zookeeper", + "01375_compact_parts_codecs", + "01357_version_collapsing_attach_detach_zookeeper", + "01355_alter_column_with_order", + "01291_geo_types", + "01270_optimize_skip_unused_shards_low_cardinality", + "01237_live_view_over_distributed_with_subquery_select_table_alias", + "01236_distributed_over_live_view_over_distributed", + "01235_live_view_over_distributed", + "01182_materialized_view_different_structure", + "01150_ddl_guard_rwr", + "01148_zookeeper_path_macros_unfolding", + "01135_default_and_alter_zookeeper", + "01130_in_memory_parts_partitons", + "01127_month_partitioning_consistency_select", + "01114_database_atomic", + "01083_expressions_in_engine_arguments", + "01073_attach_if_not_exists", + "01072_optimize_skip_unused_shards_const_expr_eval", + "01071_prohibition_secondary_index_with_old_format_merge_tree", + "01071_live_view_detach_dependency", + "01062_alter_on_mutataion_zookeeper", + "01060_shutdown_table_after_detach", + "01056_create_table_as", + "01035_avg", + "01021_only_tuple_columns", + "01019_alter_materialized_view_query", + "01019_alter_materialized_view_consistent", + "01019_alter_materialized_view_atomic", + "01015_attach_part", + "00989_parallel_parts_loading", + "00980_zookeeper_merge_tree_alter_settings", + "00980_merge_alter_settings", + "00980_create_temporary_live_view", + "00978_live_view_watch", + "00977_live_view_watch_events", + "00976_live_view_select_version", + "00975_live_view_create", + "00974_live_view_select_with_aggregation", + "00973_live_view_with_subquery_select_with_aggregation_in_subquery", + "00973_live_view_with_subquery_select_with_aggregation", + "00973_live_view_with_subquery_select_table_alias", + "00973_live_view_with_subquery_select_nested_with_aggregation_table_alias", + "00973_live_view_with_subquery_select_nested_with_aggregation", + "00973_live_view_with_subquery_select_nested", + 
"00973_live_view_with_subquery_select_join_no_alias", + "00973_live_view_with_subquery_select_join", + "00973_live_view_with_subquery_select", + "00973_live_view_select_prewhere", + "00973_live_view_select", + "00972_live_view_select_1", + "00969_live_view_watch_format_jsoneachrowwithprogress", + "00968_live_view_select_format_jsoneachrowwithprogress", + "00961_temporary_live_view_watch", + "00955_test_final_mark", + "00933_reserved_word", + "00926_zookeeper_adaptive_index_granularity_replicated_merge_tree", + "00926_adaptive_index_granularity_replacing_merge_tree", + "00926_adaptive_index_granularity_merge_tree", + "00925_zookeeper_empty_replicated_merge_tree_optimize_final", + "00800_low_cardinality_distinct_numeric", + "00754_alter_modify_order_by_replicated_zookeeper", + "00751_low_cardinality_nullable_group_by", + "00751_default_databasename_for_view", + "00719_parallel_ddl_table", + "00718_low_cardinaliry_alter", + "00717_low_cardinaliry_distributed_group_by", + "00688_low_cardinality_syntax", + "00688_low_cardinality_nullable_cast", + "00688_low_cardinality_in", + "00652_replicated_mutations_zookeeper", + "00634_rename_view", + "00626_replace_partition_from_table", + "00625_arrays_in_nested", + "00623_replicated_truncate_table_zookeeper", + "00619_union_highlite", + "00599_create_view_with_subquery", + "00571_non_exist_database_when_create_materializ_view", + "00553_buff_exists_materlized_column", + "00516_deduplication_after_drop_partition_zookeeper", + "00508_materialized_view_to", + "00446_clear_column_in_partition_concurrent_zookeeper", + "00423_storage_log_single_thread", + "00311_array_primary_key", + "00236_replicated_drop_on_non_leader_zookeeper", + "00226_zookeeper_deduplication_and_unexpected_parts", + "00215_primary_key_order_zookeeper", + "00180_attach_materialized_view", + "00121_drop_column_zookeeper", + "00116_storage_set", + "00083_create_merge_tree_zookeeper", + "00062_replicated_merge_tree_alter_zookeeper" ], "polymorphic-parts": [ "01508_partition_pruning", /// bug, shoud be fixed From a597ed0ff02074198d01068f6cf8e9789005e759 Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Thu, 4 Feb 2021 09:21:05 +0300 Subject: [PATCH 0140/2357] Fixed open behavior of remote host filter in case when there is remote_url_allow_hosts section in configuration but no entries there. --- src/Common/RemoteHostFilter.cpp | 3 ++- src/Common/RemoteHostFilter.h | 1 + .../test_allowed_url_from_config/test.py | 20 +++++++++++++++---- 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/src/Common/RemoteHostFilter.cpp b/src/Common/RemoteHostFilter.cpp index fb6fc4e9bc3..6ea366314e1 100644 --- a/src/Common/RemoteHostFilter.cpp +++ b/src/Common/RemoteHostFilter.cpp @@ -42,6 +42,7 @@ void RemoteHostFilter::setValuesFromConfig(const Poco::Util::AbstractConfigurati else if (startsWith(key, "host")) primary_hosts.insert(config.getString("remote_url_allow_hosts." + key)); } + is_allow_by_default = false; } } @@ -58,6 +59,6 @@ bool RemoteHostFilter::checkForDirectEntry(const std::string & str) const } return true; } - return true; + return is_allow_by_default; } } diff --git a/src/Common/RemoteHostFilter.h b/src/Common/RemoteHostFilter.h index 48d9b2bda7c..a445471a411 100644 --- a/src/Common/RemoteHostFilter.h +++ b/src/Common/RemoteHostFilter.h @@ -24,6 +24,7 @@ public: void checkHostAndPort(const std::string & host, const std::string & port) const; /// Does the same as checkURL, but for host and port. 
private: + bool is_allow_by_default = true; std::unordered_set primary_hosts; /// Allowed primary () URL from config.xml std::vector regexp_hosts; /// Allowed regexp () URL from config.xml diff --git a/tests/integration/test_allowed_url_from_config/test.py b/tests/integration/test_allowed_url_from_config/test.py index 6442937c8f4..59c7c6e37e7 100644 --- a/tests/integration/test_allowed_url_from_config/test.py +++ b/tests/integration/test_allowed_url_from_config/test.py @@ -6,6 +6,7 @@ node1 = cluster.add_instance('node1', main_configs=['configs/config_with_hosts.x node2 = cluster.add_instance('node2', main_configs=['configs/config_with_only_primary_hosts.xml']) node3 = cluster.add_instance('node3', main_configs=['configs/config_with_only_regexp_hosts.xml']) node4 = cluster.add_instance('node4', main_configs=['configs/config_without_allowed_hosts.xml']) +node5 = cluster.add_instance('node5', main_configs=[]) # No `remote_url_allow_hosts` at all. node6 = cluster.add_instance('node6', main_configs=['configs/config_for_remote.xml']) node7 = cluster.add_instance('node7', main_configs=['configs/config_for_redirect.xml'], with_hdfs=True) @@ -51,10 +52,21 @@ def test_config_with_only_regexp_hosts(start_cluster): def test_config_without_allowed_hosts(start_cluster): - assert node4.query("CREATE TABLE table_test_4_1 (word String) Engine=URL('https://host:80', CSV)") == "" - assert node4.query("CREATE TABLE table_test_4_2 (word String) Engine=URL('https://host', HDFS)") == "" - assert node4.query("CREATE TABLE table_test_4_3 (word String) Engine=URL('https://yandex.ru', CSV)") == "" - assert node4.query("CREATE TABLE table_test_4_4 (word String) Engine=URL('ftp://something.com', S3)") == "" + assert "not allowed" in node4.query_and_get_error( + "CREATE TABLE table_test_4_1 (word String) Engine=URL('https://host:80', CSV)") + assert "not allowed" in node4.query_and_get_error( + "CREATE TABLE table_test_4_2 (word String) Engine=URL('https://host', HDFS)") + assert "not allowed" in node4.query_and_get_error( + "CREATE TABLE table_test_4_3 (word String) Engine=URL('https://yandex.ru', CSV)") + assert "not allowed" in node4.query_and_get_error( + "CREATE TABLE table_test_4_4 (word String) Engine=URL('ftp://something.com', S3)") + + +def test_config_without_allowed_hosts_section(start_cluster): + assert node5.query("CREATE TABLE table_test_4_1 (word String) Engine=URL('https://host:80', CSV)") == "" + assert node5.query("CREATE TABLE table_test_4_2 (word String) Engine=URL('https://host', HDFS)") == "" + assert node5.query("CREATE TABLE table_test_4_3 (word String) Engine=URL('https://yandex.ru', CSV)") == "" + assert node5.query("CREATE TABLE table_test_4_4 (word String) Engine=URL('ftp://something.com', S3)") == "" def test_table_function_remote(start_cluster): From f106d58c05d89fcbd73114d62c8993722ea072a2 Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Thu, 4 Feb 2021 10:41:47 +0300 Subject: [PATCH 0141/2357] Minor fix. 
--- programs/server/config.xml | 4 +-- .../test_allowed_url_from_config/test.py | 34 +++++++++---------- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/programs/server/config.xml b/programs/server/config.xml index 849d3dc32ba..650a9a28ff2 100644 --- a/programs/server/config.xml +++ b/programs/server/config.xml @@ -568,7 +568,7 @@ - + - + diff --git a/docs/ru/sql-reference/data-types/simpleaggregatefunction.md b/docs/ru/sql-reference/data-types/simpleaggregatefunction.md index 9605706442e..7441ceae655 100644 --- a/docs/ru/sql-reference/data-types/simpleaggregatefunction.md +++ b/docs/ru/sql-reference/data-types/simpleaggregatefunction.md @@ -25,20 +25,29 @@ `SimpleAggregateFunction` имеет лучшую производительность, чем `AggregateFunction` с той же агрегатной функцией. - **Параметры** - `func` — имя агрегатной функции. - `type` — типы аргументов агрегатной функции. -**Синтаксис** +**Пример** + +Запрос: ``` sql -CREATE TABLE t +CREATE TABLE simple (id UInt64,val SimpleAggregateFunction(sum,Double)) ENGINE=AggregatingMergeTree ORDER BY id; +``` + +Ответ: + +``` text +CREATE TABLE simple ( - column1 SimpleAggregateFunction(sum, UInt64), - column2 SimpleAggregateFunction(any, String) -) ENGINE = ... + `id` UInt64, + `val` SimpleAggregateFunction(sum, Double) +) +ENGINE = AggregatingMergeTree +ORDER BY id ``` [Оригинальная статья](https://clickhouse.tech/docs/en/data_types/simpleaggregatefunction/) From 9a9138d0380ddf67cceda85eb26f8c4d2c978b63 Mon Sep 17 00:00:00 2001 From: romanzhukov Date: Fri, 5 Feb 2021 01:37:59 +0300 Subject: [PATCH 0147/2357] DOCSUP-5266: Fix ticket comments. --- .../functions/type-conversion-functions.md | 119 ++++++++++---- .../functions/type-conversion-functions.md | 149 +++++++++++++----- 2 files changed, 194 insertions(+), 74 deletions(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 047b3b1cbea..b2ede6ba6ec 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -36,10 +36,14 @@ The behavior of functions for the [NaN and Inf](../../sql-reference/data-types/f **Example** +Query: + ``` sql -SELECT toInt64(nan), toInt32(32), toInt16('16'), toInt8(8.8) +SELECT toInt64(nan), toInt32(32), toInt16('16'), toInt8(8.8); ``` +Result: + ``` text ┌─────────toInt64(nan)─┬─toInt32(32)─┬─toInt16('16')─┬─toInt8(8.8)─┐ │ -9223372036854775808 │ 32 │ 16 │ 8 │ @@ -52,10 +56,14 @@ It takes an argument of type String and tries to parse it into Int (8 \| 16 \| 3 **Example** +Query: + ``` sql -select toInt64OrZero('123123'), toInt8OrZero('123qwe123') +select toInt64OrZero('123123'), toInt8OrZero('123qwe123'); ``` +Result: + ``` text ┌─toInt64OrZero('123123')─┬─toInt8OrZero('123qwe123')─┐ │ 123123 │ 0 │ @@ -68,10 +76,14 @@ It takes an argument of type String and tries to parse it into Int (8 \| 16 \| 3 **Example** +Query: + ``` sql -select toInt64OrNull('123123'), toInt8OrNull('123qwe123') +select toInt64OrNull('123123'), toInt8OrNull('123qwe123'); ``` +String: + ``` text ┌─toInt64OrNull('123123')─┬─toInt8OrNull('123qwe123')─┐ │ 123123 │ ᴺᵁᴸᴸ │ @@ -102,10 +114,14 @@ The behavior of functions for negative agruments and for the [NaN and Inf](../.. 
**Example** +Query: + ``` sql -SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8) +SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8); ``` +Result: + ``` text ┌───────toUInt64(nan)─┬─toUInt32(-32)─┬─toUInt16('16')─┬─toUInt8(8.8)─┐ │ 9223372036854775808 │ 4294967264 │ 16 │ 8 │ @@ -168,20 +184,28 @@ A value in the `Nullable(Decimal(P,S))` data type. The value contains: **Examples** +Query: + ``` sql -SELECT toDecimal32OrNull(toString(-1.111), 5) AS val, toTypeName(val) +SELECT toDecimal32OrNull(toString(-1.111), 5) AS val, toTypeName(val); ``` +Result: + ``` text ┌──────val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 5))─┐ │ -1.11100 │ Nullable(Decimal(9, 5)) │ └──────────┴────────────────────────────────────────────────────┘ ``` +Query: + ``` sql -SELECT toDecimal32OrNull(toString(-1.111), 2) AS val, toTypeName(val) +SELECT toDecimal32OrNull(toString(-1.111), 2) AS val, toTypeName(val); ``` +Result: + ``` text ┌──val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 2))─┐ │ ᴺᵁᴸᴸ │ Nullable(Decimal(9, 2)) │ @@ -348,7 +372,7 @@ String to UUID. Query: ``` sql -SELECT reinterpretAsUUID(reverse(unhex('000102030405060708090a0b0c0d0e0f'))) +SELECT reinterpretAsUUID(reverse(unhex('000102030405060708090a0b0c0d0e0f'))); ``` Result: @@ -381,9 +405,11 @@ Result: ## CAST(x, T) {#type_conversion_function-cast} -Converts ‘x’ to the ‘t’ data type. The syntax CAST(x AS t) is also supported. +Converts unput value `x` to the `T` data type. The syntax `CAST(x AS t)` is also supported. -Example: +**Example** + +Query: ``` sql SELECT @@ -394,6 +420,8 @@ SELECT CAST(timestamp, 'FixedString(22)') AS fixed_string ``` +Result: + ``` text ┌─timestamp───────────┬────────────datetime─┬───────date─┬─string──────────────┬─fixed_string──────────────┐ │ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00 │ 2016-06-15 │ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00\0\0\0 │ @@ -402,12 +430,18 @@ SELECT Conversion to FixedString(N) only works for arguments of type String or FixedString(N). -Type conversion to [Nullable](../../sql-reference/data-types/nullable.md) and back is supported. Example: +Type conversion to [Nullable](../../sql-reference/data-types/nullable.md) and back is supported. + +**Examples** + +Query: ``` sql -SELECT toTypeName(x) FROM t_null +SELECT toTypeName(x) FROM t_null; ``` +Result: + ``` text ┌─toTypeName(x)─┐ │ Int8 │ @@ -415,10 +449,14 @@ SELECT toTypeName(x) FROM t_null └───────────────┘ ``` +Query: + ``` sql -SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null +SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null; ``` +Result: + ``` text ┌─toTypeName(CAST(x, 'Nullable(UInt16)'))─┐ │ Nullable(UInt16) │ @@ -432,15 +470,18 @@ SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null ## accurateCast(x, T) {#type_conversion_function-accurate-cast} -Converts ‘x’ to the ‘t’ data type. The differente from cast(x, T) is that accurateCast -does not allow overflow of numeric types during cast if type value x does not fit -bounds of type T. +Converts `x` to the `T` data type. The differente from [cast(x, T)](#type_conversion_function-cast) is that `accurateCast` +does not allow overflow of numeric types during cast if type value `x` does not fit bounds of type `T`. + +**Examples** + +Query: -Example ``` sql -SELECT cast(-1, 'UInt8') as uint8; +SELECT cast(-1, 'UInt8') as uint8; ``` +Result: ``` text ┌─uint8─┐ @@ -448,10 +489,14 @@ SELECT cast(-1, 'UInt8') as uint8; └───────┘ ``` +Query: + ```sql SELECT accurateCast(-1, 'UInt8') as uint8; ``` +Result: + ``` text Code: 70. 
DB::Exception: Received from localhost:9000. DB::Exception: Value in column Int8 cannot be safely converted into type UInt8: While processing accurateCast(-1, 'UInt8') AS uint8. @@ -472,7 +517,7 @@ accurateCastOrNull(x, T) - `x` — Input value. - `T` — The name of the returned data type. -**Example** +**Examples** Query: @@ -488,6 +533,8 @@ Result: └────────────────────────────────────────────┘ ``` +Query: + ``` sql SELECT cast(-1, 'UInt8') as uint8, @@ -530,6 +577,8 @@ toIntervalYear(number) **Example** +Query: + ``` sql WITH toDate('2019-01-01') AS date, @@ -537,9 +586,11 @@ WITH toIntervalWeek(1) AS interval_to_week SELECT date + interval_week, - date + interval_to_week + date + interval_to_week; ``` +Result: + ``` text ┌─plus(date, interval_week)─┬─plus(date, interval_to_week)─┐ │ 2019-01-08 │ 2019-01-08 │ @@ -598,7 +649,7 @@ Query: ``` sql SELECT parseDateTimeBestEffort('Sat, 18 Aug 2018 07:22:16 GMT', 'Europe/Moscow') -AS parseDateTimeBestEffort +AS parseDateTimeBestEffort; ``` Result: @@ -613,7 +664,7 @@ Query: ``` sql SELECT parseDateTimeBestEffort('1284101485') -AS parseDateTimeBestEffort +AS parseDateTimeBestEffort; ``` Result: @@ -628,7 +679,7 @@ Query: ``` sql SELECT parseDateTimeBestEffort('2018-12-12 10:12:12') -AS parseDateTimeBestEffort +AS parseDateTimeBestEffort; ``` Result: @@ -642,7 +693,7 @@ Result: Query: ``` sql -SELECT parseDateTimeBestEffort('10 20:19') +SELECT parseDateTimeBestEffort('10 20:19'); ``` Result: @@ -667,7 +718,7 @@ This function is similar to [‘parseDateTimeBestEffort’](#parsedatetimebestef **Syntax** ``` sql -parseDateTimeBestEffortUS(time_string [, time_zone]); +parseDateTimeBestEffortUS(time_string [, time_zone]) ``` **Parameters** @@ -769,7 +820,7 @@ Type: `LowCardinality(expr_result_type)` Query: ``` sql -SELECT toLowCardinality('1') +SELECT toLowCardinality('1'); ``` Result: @@ -808,7 +859,7 @@ Query: ``` sql WITH toDateTime64('2019-09-16 19:20:12.345678910', 6) AS dt64 -SELECT toUnixTimestamp64Milli(dt64) +SELECT toUnixTimestamp64Milli(dt64); ``` Result: @@ -819,9 +870,11 @@ Result: └──────────────────────────────┘ ``` +Query: + ``` sql WITH toDateTime64('2019-09-16 19:20:12.345678910', 6) AS dt64 -SELECT toUnixTimestamp64Nano(dt64) +SELECT toUnixTimestamp64Nano(dt64); ``` Result: @@ -855,13 +908,17 @@ fromUnixTimestamp64Milli(value [, ti]) - `value` converted to the `DateTime64` data type. 
-**Examples** +**Example** + +Query: ``` sql WITH CAST(1234567891011, 'Int64') AS i64 -SELECT fromUnixTimestamp64Milli(i64, 'UTC') +SELECT fromUnixTimestamp64Milli(i64, 'UTC'); ``` +Result: + ``` text ┌─fromUnixTimestamp64Milli(i64, 'UTC')─┐ │ 2009-02-13 23:31:31.011 │ @@ -893,7 +950,7 @@ Query: ``` sql SELECT formatRow('CSV', number, 'good') -FROM numbers(3) +FROM numbers(3); ``` Result: @@ -934,7 +991,7 @@ Query: ``` sql SELECT formatRowNoNewline('CSV', number, 'good') -FROM numbers(3) +FROM numbers(3); ``` Result: diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index 40fdbc6f5a0..ee3e8583504 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -36,10 +36,14 @@ toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438\u0020\u043f\u0440\u0435\u **Пример** +Запрос: + ``` sql -SELECT toInt64(nan), toInt32(32), toInt16('16'), toInt8(8.8) +SELECT toInt64(nan), toInt32(32), toInt16('16'), toInt8(8.8); ``` +Результат: + ``` text ┌─────────toInt64(nan)─┬─toInt32(32)─┬─toInt16('16')─┬─toInt8(8.8)─┐ │ -9223372036854775808 │ 32 │ 16 │ 8 │ @@ -52,10 +56,14 @@ SELECT toInt64(nan), toInt32(32), toInt16('16'), toInt8(8.8) **Пример** +Запрос: + ``` sql -select toInt64OrZero('123123'), toInt8OrZero('123qwe123') +select toInt64OrZero('123123'), toInt8OrZero('123qwe123'); ``` +Результат: + ``` text ┌─toInt64OrZero('123123')─┬─toInt8OrZero('123qwe123')─┐ │ 123123 │ 0 │ @@ -68,10 +76,14 @@ select toInt64OrZero('123123'), toInt8OrZero('123qwe123') **Пример** +Запрос: + ``` sql -select toInt64OrNull('123123'), toInt8OrNull('123qwe123') +select toInt64OrNull('123123'), toInt8OrNull('123qwe123'); ``` +Результат: + ``` text ┌─toInt64OrNull('123123')─┬─toInt8OrNull('123qwe123')─┐ │ 123123 │ ᴺᵁᴸᴸ │ @@ -102,10 +114,14 @@ select toInt64OrNull('123123'), toInt8OrNull('123qwe123') **Пример** +Запрос: + ``` sql -SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8) +SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8); ``` +Результат: + ``` text ┌───────toUInt64(nan)─┬─toUInt32(-32)─┬─toUInt16('16')─┬─toUInt8(8.8)─┐ │ 9223372036854775808 │ 4294967264 │ 16 │ 8 │ @@ -168,20 +184,28 @@ SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8) **Примеры** +Запрос: + ``` sql -SELECT toDecimal32OrNull(toString(-1.111), 5) AS val, toTypeName(val) +SELECT toDecimal32OrNull(toString(-1.111), 5) AS val, toTypeName(val); ``` +Результат: + ``` text ┌──────val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 5))─┐ │ -1.11100 │ Nullable(Decimal(9, 5)) │ └──────────┴────────────────────────────────────────────────────┘ ``` +Запрос: + ``` sql -SELECT toDecimal32OrNull(toString(-1.111), 2) AS val, toTypeName(val) +SELECT toDecimal32OrNull(toString(-1.111), 2) AS val, toTypeName(val); ``` +Результат: + ``` text ┌──val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 2))─┐ │ ᴺᵁᴸᴸ │ Nullable(Decimal(9, 2)) │ @@ -211,22 +235,30 @@ SELECT toDecimal32OrNull(toString(-1.111), 2) AS val, toTypeName(val) - Число с `S` десятичными знаками, если ClickHouse распознал число во входной строке. - 0 c `S` десятичными знаками, если ClickHouse не смог распознать число во входной строке или входное число содержит больше чем `S` десятичных знаков. 
-**Пример** +**Примеры** + +Запрос: ``` sql -SELECT toDecimal32OrZero(toString(-1.111), 5) AS val, toTypeName(val) +SELECT toDecimal32OrZero(toString(-1.111), 5) AS val, toTypeName(val); ``` +Результат: + ``` text ┌──────val─┬─toTypeName(toDecimal32OrZero(toString(-1.111), 5))─┐ │ -1.11100 │ Decimal(9, 5) │ └──────────┴────────────────────────────────────────────────────┘ ``` +Запрос: + ``` sql -SELECT toDecimal32OrZero(toString(-1.111), 2) AS val, toTypeName(val) +SELECT toDecimal32OrZero(toString(-1.111), 2) AS val, toTypeName(val); ``` +Результат: + ``` text ┌──val─┬─toTypeName(toDecimal32OrZero(toString(-1.111), 2))─┐ │ 0.00 │ Decimal(9, 2) │ @@ -258,12 +290,18 @@ YYYY-MM-DD hh:mm:ss Дополнительно, функция toString от аргумента типа DateTime может принимать второй аргумент String - имя тайм-зоны. Пример: `Asia/Yekaterinburg` В этом случае, форматирование времени производится согласно указанной тайм-зоне. +**Пример** + +Запрос: + ``` sql SELECT now() AS now_local, - toString(now(), 'Asia/Yekaterinburg') AS now_yekat + toString(now(), 'Asia/Yekaterinburg') AS now_yekat; ``` +Результат: + ``` text ┌───────────now_local─┬─now_yekat───────────┐ │ 2016-06-15 00:11:21 │ 2016-06-15 02:11:21 │ @@ -281,22 +319,30 @@ SELECT Принимает аргумент типа String или FixedString. Возвращает String, вырезая содержимое строки до первого найденного нулевого байта. -Пример: +**Примеры** + +Запрос: ``` sql -SELECT toFixedString('foo', 8) AS s, toStringCutToZero(s) AS s_cut +SELECT toFixedString('foo', 8) AS s, toStringCutToZero(s) AS s_cut; ``` +Результат: + ``` text ┌─s─────────────┬─s_cut─┐ │ foo\0\0\0\0\0 │ foo │ └───────────────┴───────┘ ``` +Запрос: + ``` sql -SELECT toFixedString('foo\0bar', 8) AS s, toStringCutToZero(s) AS s_cut +SELECT toFixedString('foo\0bar', 8) AS s, toStringCutToZero(s) AS s_cut; ``` +Результат: + ``` text ┌─s──────────┬─s_cut─┐ │ foo\0bar\0 │ foo │ @@ -344,7 +390,7 @@ reinterpretAsUUID(fixed_string) Запрос: ``` sql -SELECT reinterpretAsUUID(reverse(unhex('000102030405060708090a0b0c0d0e0f'))) +SELECT reinterpretAsUUID(reverse(unhex('000102030405060708090a0b0c0d0e0f'))); ``` Результат: @@ -377,10 +423,11 @@ SELECT uuid = uuid2; ## CAST(x, T) {#type_conversion_function-cast} -Преобразует x в тип данных t. -Поддерживается также синтаксис CAST(x AS t). +Преобразует входное значение `x` в тип данных `T`. Поддерживается также синтаксис `CAST(x AS t)`. -Пример: +**Пример** + +Запрос: ``` sql SELECT @@ -388,9 +435,11 @@ SELECT CAST(timestamp AS DateTime) AS datetime, CAST(timestamp AS Date) AS date, CAST(timestamp, 'String') AS string, - CAST(timestamp, 'FixedString(22)') AS fixed_string + CAST(timestamp, 'FixedString(22)') AS fixed_string; ``` +Результат: + ``` text ┌─timestamp───────────┬────────────datetime─┬───────date─┬─string──────────────┬─fixed_string──────────────┐ │ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00 │ 2016-06-15 │ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00\0\0\0 │ @@ -399,12 +448,18 @@ SELECT Преобразование в FixedString(N) работает только для аргументов типа String или FixedString(N). -Поддержано преобразование к типу [Nullable](../../sql-reference/functions/type-conversion-functions.md) и обратно. Пример: +Поддерживается преобразование к типу [Nullable](../../sql-reference/functions/type-conversion-functions.md) и обратно. 
+ +**Примеры** + +Запрос: ``` sql -SELECT toTypeName(x) FROM t_null +SELECT toTypeName(x) FROM t_null; ``` +Результат: + ``` text ┌─toTypeName(x)─┐ │ Int8 │ @@ -412,10 +467,14 @@ SELECT toTypeName(x) FROM t_null └───────────────┘ ``` +Запрос: + ``` sql -SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null +SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null; ``` +Результат: + ``` text ┌─toTypeName(CAST(x, 'Nullable(UInt16)'))─┐ │ Nullable(UInt16) │ @@ -442,7 +501,7 @@ accurateCastOrNull(x, T) - `x` — входное значение. - `T` — имя возвращаемого типа данных. -**Пример** +**Примеры** Запрос: @@ -502,6 +561,8 @@ toIntervalYear(number) **Пример** +Запрос: + ``` sql WITH toDate('2019-01-01') AS date, @@ -509,9 +570,11 @@ WITH toIntervalWeek(1) AS interval_to_week SELECT date + interval_week, - date + interval_to_week + date + interval_to_week; ``` +Результат: + ``` text ┌─plus(date, interval_week)─┬─plus(date, interval_to_week)─┐ │ 2019-01-08 │ 2019-01-08 │ @@ -527,7 +590,7 @@ SELECT **Синтаксис** ``` sql -parseDateTimeBestEffort(time_string[, time_zone]); +parseDateTimeBestEffort(time_string[, time_zone]) ``` **Параметры** @@ -570,7 +633,7 @@ AS parseDateTimeBestEffort; ``` sql SELECT parseDateTimeBestEffort('Sat, 18 Aug 2018 07:22:16 GMT', 'Europe/Moscow') -AS parseDateTimeBestEffort +AS parseDateTimeBestEffort; ``` Результат: @@ -585,7 +648,7 @@ AS parseDateTimeBestEffort ``` sql SELECT parseDateTimeBestEffort('1284101485') -AS parseDateTimeBestEffort +AS parseDateTimeBestEffort; ``` Результат: @@ -600,7 +663,7 @@ AS parseDateTimeBestEffort ``` sql SELECT parseDateTimeBestEffort('2018-12-12 10:12:12') -AS parseDateTimeBestEffort +AS parseDateTimeBestEffort; ``` Результат: @@ -614,7 +677,7 @@ AS parseDateTimeBestEffort Запрос: ``` sql -SELECT parseDateTimeBestEffort('10 20:19') +SELECT parseDateTimeBestEffort('10 20:19'); ``` Результат: @@ -639,7 +702,7 @@ SELECT parseDateTimeBestEffort('10 20:19') **Синтаксис** ``` sql -parseDateTimeBestEffortUS(time_string [, time_zone]); +parseDateTimeBestEffortUS(time_string [, time_zone]) ``` **Параметры** @@ -668,7 +731,7 @@ SELECT parseDateTimeBestEffortUS('09/12/2020 12:12:57') AS parseDateTimeBestEffortUS; ``` -Ответ: +Результат: ``` text ┌─parseDateTimeBestEffortUS─┐ @@ -683,7 +746,7 @@ SELECT parseDateTimeBestEffortUS('09-12-2020 12:12:57') AS parseDateTimeBestEffortUS; ``` -Ответ: +Результат: ``` text ┌─parseDateTimeBestEffortUS─┐ @@ -698,7 +761,7 @@ SELECT parseDateTimeBestEffortUS('09.12.2020 12:12:57') AS parseDateTimeBestEffortUS; ``` -Ответ: +Результат: ``` text ┌─parseDateTimeBestEffortUS─┐ @@ -733,10 +796,10 @@ toUnixTimestamp64Milli(value) ``` sql WITH toDateTime64('2019-09-16 19:20:12.345678910', 6) AS dt64 -SELECT toUnixTimestamp64Milli(dt64) +SELECT toUnixTimestamp64Milli(dt64); ``` -Ответ: +Результат: ``` text ┌─toUnixTimestamp64Milli(dt64)─┐ @@ -748,10 +811,10 @@ SELECT toUnixTimestamp64Milli(dt64) ``` sql WITH toDateTime64('2019-09-16 19:20:12.345678910', 6) AS dt64 -SELECT toUnixTimestamp64Nano(dt64) +SELECT toUnixTimestamp64Nano(dt64); ``` -Ответ: +Результат: ``` text ┌─toUnixTimestamp64Nano(dt64)─┐ @@ -786,10 +849,10 @@ fromUnixTimestamp64Milli(value [, ti]) ``` sql WITH CAST(1234567891011, 'Int64') AS i64 -SELECT fromUnixTimestamp64Milli(i64, 'UTC') +SELECT fromUnixTimestamp64Milli(i64, 'UTC'); ``` -Ответ: +Результат: ``` text ┌─fromUnixTimestamp64Milli(i64, 'UTC')─┐ @@ -820,7 +883,7 @@ toLowCardinality(expr) Тип: `LowCardinality(expr_result_type)` -**Example** +**Пример** Запрос: @@ -861,10 +924,10 @@ 
formatRow(format, x, y, ...) ``` sql SELECT formatRow('CSV', number, 'good') -FROM numbers(3) +FROM numbers(3); ``` -Ответ: +Результат: ``` text ┌─formatRow('CSV', number, 'good')─┐ @@ -902,10 +965,10 @@ formatRowNoNewline(format, x, y, ...) ``` sql SELECT formatRowNoNewline('CSV', number, 'good') -FROM numbers(3) +FROM numbers(3); ``` -Ответ: +Результат: ``` text ┌─formatRowNoNewline('CSV', number, 'good')─┐ From a3721ef0ac77046bc0db336b0bb71aa274b2fe97 Mon Sep 17 00:00:00 2001 From: romanzhukov Date: Fri, 5 Feb 2021 01:44:18 +0300 Subject: [PATCH 0148/2357] Revert "DOCSUP-5266: Fix ticket comments." This reverts commit 9a9138d0380ddf67cceda85eb26f8c4d2c978b63. --- .../functions/type-conversion-functions.md | 119 ++++---------- .../functions/type-conversion-functions.md | 149 +++++------------- 2 files changed, 74 insertions(+), 194 deletions(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index b2ede6ba6ec..047b3b1cbea 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -36,14 +36,10 @@ The behavior of functions for the [NaN and Inf](../../sql-reference/data-types/f **Example** -Query: - ``` sql -SELECT toInt64(nan), toInt32(32), toInt16('16'), toInt8(8.8); +SELECT toInt64(nan), toInt32(32), toInt16('16'), toInt8(8.8) ``` -Result: - ``` text ┌─────────toInt64(nan)─┬─toInt32(32)─┬─toInt16('16')─┬─toInt8(8.8)─┐ │ -9223372036854775808 │ 32 │ 16 │ 8 │ @@ -56,14 +52,10 @@ It takes an argument of type String and tries to parse it into Int (8 \| 16 \| 3 **Example** -Query: - ``` sql -select toInt64OrZero('123123'), toInt8OrZero('123qwe123'); +select toInt64OrZero('123123'), toInt8OrZero('123qwe123') ``` -Result: - ``` text ┌─toInt64OrZero('123123')─┬─toInt8OrZero('123qwe123')─┐ │ 123123 │ 0 │ @@ -76,14 +68,10 @@ It takes an argument of type String and tries to parse it into Int (8 \| 16 \| 3 **Example** -Query: - ``` sql -select toInt64OrNull('123123'), toInt8OrNull('123qwe123'); +select toInt64OrNull('123123'), toInt8OrNull('123qwe123') ``` -String: - ``` text ┌─toInt64OrNull('123123')─┬─toInt8OrNull('123qwe123')─┐ │ 123123 │ ᴺᵁᴸᴸ │ @@ -114,14 +102,10 @@ The behavior of functions for negative agruments and for the [NaN and Inf](../.. **Example** -Query: - ``` sql -SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8); +SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8) ``` -Result: - ``` text ┌───────toUInt64(nan)─┬─toUInt32(-32)─┬─toUInt16('16')─┬─toUInt8(8.8)─┐ │ 9223372036854775808 │ 4294967264 │ 16 │ 8 │ @@ -184,28 +168,20 @@ A value in the `Nullable(Decimal(P,S))` data type. The value contains: **Examples** -Query: - ``` sql -SELECT toDecimal32OrNull(toString(-1.111), 5) AS val, toTypeName(val); +SELECT toDecimal32OrNull(toString(-1.111), 5) AS val, toTypeName(val) ``` -Result: - ``` text ┌──────val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 5))─┐ │ -1.11100 │ Nullable(Decimal(9, 5)) │ └──────────┴────────────────────────────────────────────────────┘ ``` -Query: - ``` sql -SELECT toDecimal32OrNull(toString(-1.111), 2) AS val, toTypeName(val); +SELECT toDecimal32OrNull(toString(-1.111), 2) AS val, toTypeName(val) ``` -Result: - ``` text ┌──val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 2))─┐ │ ᴺᵁᴸᴸ │ Nullable(Decimal(9, 2)) │ @@ -372,7 +348,7 @@ String to UUID. 
Query: ``` sql -SELECT reinterpretAsUUID(reverse(unhex('000102030405060708090a0b0c0d0e0f'))); +SELECT reinterpretAsUUID(reverse(unhex('000102030405060708090a0b0c0d0e0f'))) ``` Result: @@ -405,11 +381,9 @@ Result: ## CAST(x, T) {#type_conversion_function-cast} -Converts unput value `x` to the `T` data type. The syntax `CAST(x AS t)` is also supported. +Converts ‘x’ to the ‘t’ data type. The syntax CAST(x AS t) is also supported. -**Example** - -Query: +Example: ``` sql SELECT @@ -420,8 +394,6 @@ SELECT CAST(timestamp, 'FixedString(22)') AS fixed_string ``` -Result: - ``` text ┌─timestamp───────────┬────────────datetime─┬───────date─┬─string──────────────┬─fixed_string──────────────┐ │ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00 │ 2016-06-15 │ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00\0\0\0 │ @@ -430,18 +402,12 @@ Result: Conversion to FixedString(N) only works for arguments of type String or FixedString(N). -Type conversion to [Nullable](../../sql-reference/data-types/nullable.md) and back is supported. - -**Examples** - -Query: +Type conversion to [Nullable](../../sql-reference/data-types/nullable.md) and back is supported. Example: ``` sql -SELECT toTypeName(x) FROM t_null; +SELECT toTypeName(x) FROM t_null ``` -Result: - ``` text ┌─toTypeName(x)─┐ │ Int8 │ @@ -449,14 +415,10 @@ Result: └───────────────┘ ``` -Query: - ``` sql -SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null; +SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null ``` -Result: - ``` text ┌─toTypeName(CAST(x, 'Nullable(UInt16)'))─┐ │ Nullable(UInt16) │ @@ -470,18 +432,15 @@ Result: ## accurateCast(x, T) {#type_conversion_function-accurate-cast} -Converts `x` to the `T` data type. The differente from [cast(x, T)](#type_conversion_function-cast) is that `accurateCast` -does not allow overflow of numeric types during cast if type value `x` does not fit bounds of type `T`. - -**Examples** - -Query: +Converts ‘x’ to the ‘t’ data type. The differente from cast(x, T) is that accurateCast +does not allow overflow of numeric types during cast if type value x does not fit +bounds of type T. +Example ``` sql -SELECT cast(-1, 'UInt8') as uint8; +SELECT cast(-1, 'UInt8') as uint8; ``` -Result: ``` text ┌─uint8─┐ @@ -489,14 +448,10 @@ Result: └───────┘ ``` -Query: - ```sql SELECT accurateCast(-1, 'UInt8') as uint8; ``` -Result: - ``` text Code: 70. DB::Exception: Received from localhost:9000. DB::Exception: Value in column Int8 cannot be safely converted into type UInt8: While processing accurateCast(-1, 'UInt8') AS uint8. @@ -517,7 +472,7 @@ accurateCastOrNull(x, T) - `x` — Input value. - `T` — The name of the returned data type. 
-**Examples** +**Example** Query: @@ -533,8 +488,6 @@ Result: └────────────────────────────────────────────┘ ``` -Query: - ``` sql SELECT cast(-1, 'UInt8') as uint8, @@ -577,8 +530,6 @@ toIntervalYear(number) **Example** -Query: - ``` sql WITH toDate('2019-01-01') AS date, @@ -586,11 +537,9 @@ WITH toIntervalWeek(1) AS interval_to_week SELECT date + interval_week, - date + interval_to_week; + date + interval_to_week ``` -Result: - ``` text ┌─plus(date, interval_week)─┬─plus(date, interval_to_week)─┐ │ 2019-01-08 │ 2019-01-08 │ @@ -649,7 +598,7 @@ Query: ``` sql SELECT parseDateTimeBestEffort('Sat, 18 Aug 2018 07:22:16 GMT', 'Europe/Moscow') -AS parseDateTimeBestEffort; +AS parseDateTimeBestEffort ``` Result: @@ -664,7 +613,7 @@ Query: ``` sql SELECT parseDateTimeBestEffort('1284101485') -AS parseDateTimeBestEffort; +AS parseDateTimeBestEffort ``` Result: @@ -679,7 +628,7 @@ Query: ``` sql SELECT parseDateTimeBestEffort('2018-12-12 10:12:12') -AS parseDateTimeBestEffort; +AS parseDateTimeBestEffort ``` Result: @@ -693,7 +642,7 @@ Result: Query: ``` sql -SELECT parseDateTimeBestEffort('10 20:19'); +SELECT parseDateTimeBestEffort('10 20:19') ``` Result: @@ -718,7 +667,7 @@ This function is similar to [‘parseDateTimeBestEffort’](#parsedatetimebestef **Syntax** ``` sql -parseDateTimeBestEffortUS(time_string [, time_zone]) +parseDateTimeBestEffortUS(time_string [, time_zone]); ``` **Parameters** @@ -820,7 +769,7 @@ Type: `LowCardinality(expr_result_type)` Query: ``` sql -SELECT toLowCardinality('1'); +SELECT toLowCardinality('1') ``` Result: @@ -859,7 +808,7 @@ Query: ``` sql WITH toDateTime64('2019-09-16 19:20:12.345678910', 6) AS dt64 -SELECT toUnixTimestamp64Milli(dt64); +SELECT toUnixTimestamp64Milli(dt64) ``` Result: @@ -870,11 +819,9 @@ Result: └──────────────────────────────┘ ``` -Query: - ``` sql WITH toDateTime64('2019-09-16 19:20:12.345678910', 6) AS dt64 -SELECT toUnixTimestamp64Nano(dt64); +SELECT toUnixTimestamp64Nano(dt64) ``` Result: @@ -908,17 +855,13 @@ fromUnixTimestamp64Milli(value [, ti]) - `value` converted to the `DateTime64` data type. 
-**Example** - -Query: +**Examples** ``` sql WITH CAST(1234567891011, 'Int64') AS i64 -SELECT fromUnixTimestamp64Milli(i64, 'UTC'); +SELECT fromUnixTimestamp64Milli(i64, 'UTC') ``` -Result: - ``` text ┌─fromUnixTimestamp64Milli(i64, 'UTC')─┐ │ 2009-02-13 23:31:31.011 │ @@ -950,7 +893,7 @@ Query: ``` sql SELECT formatRow('CSV', number, 'good') -FROM numbers(3); +FROM numbers(3) ``` Result: @@ -991,7 +934,7 @@ Query: ``` sql SELECT formatRowNoNewline('CSV', number, 'good') -FROM numbers(3); +FROM numbers(3) ``` Result: diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index ee3e8583504..40fdbc6f5a0 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -36,14 +36,10 @@ toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438\u0020\u043f\u0440\u0435\u **Пример** -Запрос: - ``` sql -SELECT toInt64(nan), toInt32(32), toInt16('16'), toInt8(8.8); +SELECT toInt64(nan), toInt32(32), toInt16('16'), toInt8(8.8) ``` -Результат: - ``` text ┌─────────toInt64(nan)─┬─toInt32(32)─┬─toInt16('16')─┬─toInt8(8.8)─┐ │ -9223372036854775808 │ 32 │ 16 │ 8 │ @@ -56,14 +52,10 @@ SELECT toInt64(nan), toInt32(32), toInt16('16'), toInt8(8.8); **Пример** -Запрос: - ``` sql -select toInt64OrZero('123123'), toInt8OrZero('123qwe123'); +select toInt64OrZero('123123'), toInt8OrZero('123qwe123') ``` -Результат: - ``` text ┌─toInt64OrZero('123123')─┬─toInt8OrZero('123qwe123')─┐ │ 123123 │ 0 │ @@ -76,14 +68,10 @@ select toInt64OrZero('123123'), toInt8OrZero('123qwe123'); **Пример** -Запрос: - ``` sql -select toInt64OrNull('123123'), toInt8OrNull('123qwe123'); +select toInt64OrNull('123123'), toInt8OrNull('123qwe123') ``` -Результат: - ``` text ┌─toInt64OrNull('123123')─┬─toInt8OrNull('123qwe123')─┐ │ 123123 │ ᴺᵁᴸᴸ │ @@ -114,14 +102,10 @@ select toInt64OrNull('123123'), toInt8OrNull('123qwe123'); **Пример** -Запрос: - ``` sql -SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8); +SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8) ``` -Результат: - ``` text ┌───────toUInt64(nan)─┬─toUInt32(-32)─┬─toUInt16('16')─┬─toUInt8(8.8)─┐ │ 9223372036854775808 │ 4294967264 │ 16 │ 8 │ @@ -184,28 +168,20 @@ SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8); **Примеры** -Запрос: - ``` sql -SELECT toDecimal32OrNull(toString(-1.111), 5) AS val, toTypeName(val); +SELECT toDecimal32OrNull(toString(-1.111), 5) AS val, toTypeName(val) ``` -Результат: - ``` text ┌──────val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 5))─┐ │ -1.11100 │ Nullable(Decimal(9, 5)) │ └──────────┴────────────────────────────────────────────────────┘ ``` -Запрос: - ``` sql -SELECT toDecimal32OrNull(toString(-1.111), 2) AS val, toTypeName(val); +SELECT toDecimal32OrNull(toString(-1.111), 2) AS val, toTypeName(val) ``` -Результат: - ``` text ┌──val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 2))─┐ │ ᴺᵁᴸᴸ │ Nullable(Decimal(9, 2)) │ @@ -235,30 +211,22 @@ SELECT toDecimal32OrNull(toString(-1.111), 2) AS val, toTypeName(val); - Число с `S` десятичными знаками, если ClickHouse распознал число во входной строке. - 0 c `S` десятичными знаками, если ClickHouse не смог распознать число во входной строке или входное число содержит больше чем `S` десятичных знаков. 
-**Примеры** - -Запрос: +**Пример** ``` sql -SELECT toDecimal32OrZero(toString(-1.111), 5) AS val, toTypeName(val); +SELECT toDecimal32OrZero(toString(-1.111), 5) AS val, toTypeName(val) ``` -Результат: - ``` text ┌──────val─┬─toTypeName(toDecimal32OrZero(toString(-1.111), 5))─┐ │ -1.11100 │ Decimal(9, 5) │ └──────────┴────────────────────────────────────────────────────┘ ``` -Запрос: - ``` sql -SELECT toDecimal32OrZero(toString(-1.111), 2) AS val, toTypeName(val); +SELECT toDecimal32OrZero(toString(-1.111), 2) AS val, toTypeName(val) ``` -Результат: - ``` text ┌──val─┬─toTypeName(toDecimal32OrZero(toString(-1.111), 2))─┐ │ 0.00 │ Decimal(9, 2) │ @@ -290,18 +258,12 @@ YYYY-MM-DD hh:mm:ss Дополнительно, функция toString от аргумента типа DateTime может принимать второй аргумент String - имя тайм-зоны. Пример: `Asia/Yekaterinburg` В этом случае, форматирование времени производится согласно указанной тайм-зоне. -**Пример** - -Запрос: - ``` sql SELECT now() AS now_local, - toString(now(), 'Asia/Yekaterinburg') AS now_yekat; + toString(now(), 'Asia/Yekaterinburg') AS now_yekat ``` -Результат: - ``` text ┌───────────now_local─┬─now_yekat───────────┐ │ 2016-06-15 00:11:21 │ 2016-06-15 02:11:21 │ @@ -319,30 +281,22 @@ SELECT Принимает аргумент типа String или FixedString. Возвращает String, вырезая содержимое строки до первого найденного нулевого байта. -**Примеры** - -Запрос: +Пример: ``` sql -SELECT toFixedString('foo', 8) AS s, toStringCutToZero(s) AS s_cut; +SELECT toFixedString('foo', 8) AS s, toStringCutToZero(s) AS s_cut ``` -Результат: - ``` text ┌─s─────────────┬─s_cut─┐ │ foo\0\0\0\0\0 │ foo │ └───────────────┴───────┘ ``` -Запрос: - ``` sql -SELECT toFixedString('foo\0bar', 8) AS s, toStringCutToZero(s) AS s_cut; +SELECT toFixedString('foo\0bar', 8) AS s, toStringCutToZero(s) AS s_cut ``` -Результат: - ``` text ┌─s──────────┬─s_cut─┐ │ foo\0bar\0 │ foo │ @@ -390,7 +344,7 @@ reinterpretAsUUID(fixed_string) Запрос: ``` sql -SELECT reinterpretAsUUID(reverse(unhex('000102030405060708090a0b0c0d0e0f'))); +SELECT reinterpretAsUUID(reverse(unhex('000102030405060708090a0b0c0d0e0f'))) ``` Результат: @@ -423,11 +377,10 @@ SELECT uuid = uuid2; ## CAST(x, T) {#type_conversion_function-cast} -Преобразует входное значение `x` в тип данных `T`. Поддерживается также синтаксис `CAST(x AS t)`. +Преобразует x в тип данных t. +Поддерживается также синтаксис CAST(x AS t). -**Пример** - -Запрос: +Пример: ``` sql SELECT @@ -435,11 +388,9 @@ SELECT CAST(timestamp AS DateTime) AS datetime, CAST(timestamp AS Date) AS date, CAST(timestamp, 'String') AS string, - CAST(timestamp, 'FixedString(22)') AS fixed_string; + CAST(timestamp, 'FixedString(22)') AS fixed_string ``` -Результат: - ``` text ┌─timestamp───────────┬────────────datetime─┬───────date─┬─string──────────────┬─fixed_string──────────────┐ │ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00 │ 2016-06-15 │ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00\0\0\0 │ @@ -448,18 +399,12 @@ SELECT Преобразование в FixedString(N) работает только для аргументов типа String или FixedString(N). -Поддерживается преобразование к типу [Nullable](../../sql-reference/functions/type-conversion-functions.md) и обратно. - -**Примеры** - -Запрос: +Поддержано преобразование к типу [Nullable](../../sql-reference/functions/type-conversion-functions.md) и обратно. 
Пример: ``` sql -SELECT toTypeName(x) FROM t_null; +SELECT toTypeName(x) FROM t_null ``` -Результат: - ``` text ┌─toTypeName(x)─┐ │ Int8 │ @@ -467,14 +412,10 @@ SELECT toTypeName(x) FROM t_null; └───────────────┘ ``` -Запрос: - ``` sql -SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null; +SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null ``` -Результат: - ``` text ┌─toTypeName(CAST(x, 'Nullable(UInt16)'))─┐ │ Nullable(UInt16) │ @@ -501,7 +442,7 @@ accurateCastOrNull(x, T) - `x` — входное значение. - `T` — имя возвращаемого типа данных. -**Примеры** +**Пример** Запрос: @@ -561,8 +502,6 @@ toIntervalYear(number) **Пример** -Запрос: - ``` sql WITH toDate('2019-01-01') AS date, @@ -570,11 +509,9 @@ WITH toIntervalWeek(1) AS interval_to_week SELECT date + interval_week, - date + interval_to_week; + date + interval_to_week ``` -Результат: - ``` text ┌─plus(date, interval_week)─┬─plus(date, interval_to_week)─┐ │ 2019-01-08 │ 2019-01-08 │ @@ -590,7 +527,7 @@ SELECT **Синтаксис** ``` sql -parseDateTimeBestEffort(time_string[, time_zone]) +parseDateTimeBestEffort(time_string[, time_zone]); ``` **Параметры** @@ -633,7 +570,7 @@ AS parseDateTimeBestEffort; ``` sql SELECT parseDateTimeBestEffort('Sat, 18 Aug 2018 07:22:16 GMT', 'Europe/Moscow') -AS parseDateTimeBestEffort; +AS parseDateTimeBestEffort ``` Результат: @@ -648,7 +585,7 @@ AS parseDateTimeBestEffort; ``` sql SELECT parseDateTimeBestEffort('1284101485') -AS parseDateTimeBestEffort; +AS parseDateTimeBestEffort ``` Результат: @@ -663,7 +600,7 @@ AS parseDateTimeBestEffort; ``` sql SELECT parseDateTimeBestEffort('2018-12-12 10:12:12') -AS parseDateTimeBestEffort; +AS parseDateTimeBestEffort ``` Результат: @@ -677,7 +614,7 @@ AS parseDateTimeBestEffort; Запрос: ``` sql -SELECT parseDateTimeBestEffort('10 20:19'); +SELECT parseDateTimeBestEffort('10 20:19') ``` Результат: @@ -702,7 +639,7 @@ SELECT parseDateTimeBestEffort('10 20:19'); **Синтаксис** ``` sql -parseDateTimeBestEffortUS(time_string [, time_zone]) +parseDateTimeBestEffortUS(time_string [, time_zone]); ``` **Параметры** @@ -731,7 +668,7 @@ SELECT parseDateTimeBestEffortUS('09/12/2020 12:12:57') AS parseDateTimeBestEffortUS; ``` -Результат: +Ответ: ``` text ┌─parseDateTimeBestEffortUS─┐ @@ -746,7 +683,7 @@ SELECT parseDateTimeBestEffortUS('09-12-2020 12:12:57') AS parseDateTimeBestEffortUS; ``` -Результат: +Ответ: ``` text ┌─parseDateTimeBestEffortUS─┐ @@ -761,7 +698,7 @@ SELECT parseDateTimeBestEffortUS('09.12.2020 12:12:57') AS parseDateTimeBestEffortUS; ``` -Результат: +Ответ: ``` text ┌─parseDateTimeBestEffortUS─┐ @@ -796,10 +733,10 @@ toUnixTimestamp64Milli(value) ``` sql WITH toDateTime64('2019-09-16 19:20:12.345678910', 6) AS dt64 -SELECT toUnixTimestamp64Milli(dt64); +SELECT toUnixTimestamp64Milli(dt64) ``` -Результат: +Ответ: ``` text ┌─toUnixTimestamp64Milli(dt64)─┐ @@ -811,10 +748,10 @@ SELECT toUnixTimestamp64Milli(dt64); ``` sql WITH toDateTime64('2019-09-16 19:20:12.345678910', 6) AS dt64 -SELECT toUnixTimestamp64Nano(dt64); +SELECT toUnixTimestamp64Nano(dt64) ``` -Результат: +Ответ: ``` text ┌─toUnixTimestamp64Nano(dt64)─┐ @@ -849,10 +786,10 @@ fromUnixTimestamp64Milli(value [, ti]) ``` sql WITH CAST(1234567891011, 'Int64') AS i64 -SELECT fromUnixTimestamp64Milli(i64, 'UTC'); +SELECT fromUnixTimestamp64Milli(i64, 'UTC') ``` -Результат: +Ответ: ``` text ┌─fromUnixTimestamp64Milli(i64, 'UTC')─┐ @@ -883,7 +820,7 @@ toLowCardinality(expr) Тип: `LowCardinality(expr_result_type)` -**Пример** +**Example** Запрос: @@ -924,10 +861,10 @@ formatRow(format, x, y, 
...) ``` sql SELECT formatRow('CSV', number, 'good') -FROM numbers(3); +FROM numbers(3) ``` -Результат: +Ответ: ``` text ┌─formatRow('CSV', number, 'good')─┐ @@ -965,10 +902,10 @@ formatRowNoNewline(format, x, y, ...) ``` sql SELECT formatRowNoNewline('CSV', number, 'good') -FROM numbers(3); +FROM numbers(3) ``` -Результат: +Ответ: ``` text ┌─formatRowNoNewline('CSV', number, 'good')─┐ From cc17edbc99ed060f870331f5eb9da93baf5e1e03 Mon Sep 17 00:00:00 2001 From: romanzhukov Date: Fri, 5 Feb 2021 13:29:31 +0300 Subject: [PATCH 0149/2357] DOCSUP-5822: Add function documentation and fix all file examples. --- .../functions/type-conversion-functions.md | 154 +++++++++++++----- .../functions/type-conversion-functions.md | 146 ++++++++++++----- 2 files changed, 220 insertions(+), 80 deletions(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 047b3b1cbea..1742f6b8888 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -36,10 +36,14 @@ The behavior of functions for the [NaN and Inf](../../sql-reference/data-types/f **Example** +Query: + ``` sql -SELECT toInt64(nan), toInt32(32), toInt16('16'), toInt8(8.8) +SELECT toInt64(nan), toInt32(32), toInt16('16'), toInt8(8.8); ``` +Result: + ``` text ┌─────────toInt64(nan)─┬─toInt32(32)─┬─toInt16('16')─┬─toInt8(8.8)─┐ │ -9223372036854775808 │ 32 │ 16 │ 8 │ @@ -52,10 +56,14 @@ It takes an argument of type String and tries to parse it into Int (8 \| 16 \| 3 **Example** +Query: + ``` sql -select toInt64OrZero('123123'), toInt8OrZero('123qwe123') +SELECT toInt64OrZero('123123'), toInt8OrZero('123qwe123'); ``` +Result: + ``` text ┌─toInt64OrZero('123123')─┬─toInt8OrZero('123qwe123')─┐ │ 123123 │ 0 │ @@ -68,10 +76,14 @@ It takes an argument of type String and tries to parse it into Int (8 \| 16 \| 3 **Example** +Query: + ``` sql -select toInt64OrNull('123123'), toInt8OrNull('123qwe123') +SELECT toInt64OrNull('123123'), toInt8OrNull('123qwe123'); ``` +Result: + ``` text ┌─toInt64OrNull('123123')─┬─toInt8OrNull('123qwe123')─┐ │ 123123 │ ᴺᵁᴸᴸ │ @@ -102,10 +114,14 @@ The behavior of functions for negative agruments and for the [NaN and Inf](../.. **Example** +Query: + ``` sql -SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8) +SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8); ``` +Result: + ``` text ┌───────toUInt64(nan)─┬─toUInt32(-32)─┬─toUInt16('16')─┬─toUInt8(8.8)─┐ │ 9223372036854775808 │ 4294967264 │ 16 │ 8 │ @@ -168,20 +184,28 @@ A value in the `Nullable(Decimal(P,S))` data type. The value contains: **Examples** +Query: + ``` sql -SELECT toDecimal32OrNull(toString(-1.111), 5) AS val, toTypeName(val) +SELECT toDecimal32OrNull(toString(-1.111), 5) AS val, toTypeName(val); ``` +Result: + ``` text ┌──────val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 5))─┐ │ -1.11100 │ Nullable(Decimal(9, 5)) │ └──────────┴────────────────────────────────────────────────────┘ ``` +Query: + ``` sql -SELECT toDecimal32OrNull(toString(-1.111), 2) AS val, toTypeName(val) +SELECT toDecimal32OrNull(toString(-1.111), 2) AS val, toTypeName(val); ``` +Result: + ``` text ┌──val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 2))─┐ │ ᴺᵁᴸᴸ │ Nullable(Decimal(9, 2)) │ @@ -213,20 +237,28 @@ A value in the `Nullable(Decimal(P,S))` data type. 
The value contains: **Example** +Query: + ``` sql -SELECT toDecimal32OrZero(toString(-1.111), 5) AS val, toTypeName(val) +SELECT toDecimal32OrZero(toString(-1.111), 5) AS val, toTypeName(val); ``` +Result: + ``` text ┌──────val─┬─toTypeName(toDecimal32OrZero(toString(-1.111), 5))─┐ │ -1.11100 │ Decimal(9, 5) │ └──────────┴────────────────────────────────────────────────────┘ ``` +Query: + ``` sql -SELECT toDecimal32OrZero(toString(-1.111), 2) AS val, toTypeName(val) +SELECT toDecimal32OrZero(toString(-1.111), 2) AS val, toTypeName(val); ``` +Result: + ``` text ┌──val─┬─toTypeName(toDecimal32OrZero(toString(-1.111), 2))─┐ │ 0.00 │ Decimal(9, 2) │ @@ -258,12 +290,18 @@ Conversion between numeric types uses the same rules as assignments between diff Additionally, the toString function of the DateTime argument can take a second String argument containing the name of the time zone. Example: `Asia/Yekaterinburg` In this case, the time is formatted according to the specified time zone. +**Example** + +Query: + ``` sql SELECT now() AS now_local, - toString(now(), 'Asia/Yekaterinburg') AS now_yekat + toString(now(), 'Asia/Yekaterinburg') AS now_yekat; ``` +Result: + ``` text ┌───────────now_local─┬─now_yekat───────────┐ │ 2016-06-15 00:11:21 │ 2016-06-15 02:11:21 │ @@ -281,22 +319,30 @@ If the string has fewer bytes than N, it is padded with null bytes to the right. Accepts a String or FixedString argument. Returns the String with the content truncated at the first zero byte found. -Example: +**Example** + +Query: ``` sql -SELECT toFixedString('foo', 8) AS s, toStringCutToZero(s) AS s_cut +SELECT toFixedString('foo', 8) AS s, toStringCutToZero(s) AS s_cut; ``` +Result: + ``` text ┌─s─────────────┬─s_cut─┐ │ foo\0\0\0\0\0 │ foo │ └───────────────┴───────┘ ``` +Query: + ``` sql -SELECT toFixedString('foo\0bar', 8) AS s, toStringCutToZero(s) AS s_cut +SELECT toFixedString('foo\0bar', 8) AS s, toStringCutToZero(s) AS s_cut; ``` +Result: + ``` text ┌─s──────────┬─s_cut─┐ │ foo\0bar\0 │ foo │ @@ -348,7 +394,7 @@ String to UUID. Query: ``` sql -SELECT reinterpretAsUUID(reverse(unhex('000102030405060708090a0b0c0d0e0f'))) +SELECT reinterpretAsUUID(reverse(unhex('000102030405060708090a0b0c0d0e0f'))); ``` Result: @@ -381,9 +427,11 @@ Result: ## CAST(x, T) {#type_conversion_function-cast} -Converts ‘x’ to the ‘t’ data type. The syntax CAST(x AS t) is also supported. +Converts input value `x` to the `T` data type. The syntax `CAST(x AS t)` is also supported. -Example: +**Example** + +Query: ``` sql SELECT @@ -391,9 +439,11 @@ SELECT CAST(timestamp AS DateTime) AS datetime, CAST(timestamp AS Date) AS date, CAST(timestamp, 'String') AS string, - CAST(timestamp, 'FixedString(22)') AS fixed_string + CAST(timestamp, 'FixedString(22)') AS fixed_string; ``` +Result: + ``` text ┌─timestamp───────────┬────────────datetime─┬───────date─┬─string──────────────┬─fixed_string──────────────┐ │ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00 │ 2016-06-15 │ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00\0\0\0 │ @@ -402,12 +452,18 @@ SELECT Conversion to FixedString(N) only works for arguments of type String or FixedString(N). -Type conversion to [Nullable](../../sql-reference/data-types/nullable.md) and back is supported. Example: +Type conversion to [Nullable](../../sql-reference/data-types/nullable.md) and back is supported. 
+ +**Example** + +Query: ``` sql -SELECT toTypeName(x) FROM t_null +SELECT toTypeName(x) FROM t_null; ``` +Result: + ``` text ┌─toTypeName(x)─┐ │ Int8 │ @@ -415,10 +471,14 @@ SELECT toTypeName(x) FROM t_null └───────────────┘ ``` +Query: + ``` sql -SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null +SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null; ``` +Result: + ``` text ┌─toTypeName(CAST(x, 'Nullable(UInt16)'))─┐ │ Nullable(UInt16) │ @@ -432,15 +492,18 @@ SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null ## accurateCast(x, T) {#type_conversion_function-accurate-cast} -Converts ‘x’ to the ‘t’ data type. The differente from cast(x, T) is that accurateCast -does not allow overflow of numeric types during cast if type value x does not fit -bounds of type T. +Converts `x` to the `T` data type. The differente from [cast(x, T)](#type_conversion_function-cast) is that `accurateCast` +does not allow overflow of numeric types during cast if type value `x` does not fit bounds of type `T`. + +**Example** + +Query: -Example ``` sql SELECT cast(-1, 'UInt8') as uint8; ``` +Result: ``` text ┌─uint8─┐ @@ -448,13 +511,16 @@ SELECT cast(-1, 'UInt8') as uint8; └───────┘ ``` +Query: + ```sql SELECT accurateCast(-1, 'UInt8') as uint8; ``` +Result: + ``` text Code: 70. DB::Exception: Received from localhost:9000. DB::Exception: Value in column Int8 cannot be safely converted into type UInt8: While processing accurateCast(-1, 'UInt8') AS uint8. - ``` ## accurateCastOrNull(x, T) {#type_conversion_function-accurate-cast_or_null} @@ -488,6 +554,8 @@ Result: └────────────────────────────────────────────┘ ``` +Query: + ``` sql SELECT cast(-1, 'UInt8') as uint8, @@ -530,6 +598,8 @@ toIntervalYear(number) **Example** +Query: + ``` sql WITH toDate('2019-01-01') AS date, @@ -537,9 +607,11 @@ WITH toIntervalWeek(1) AS interval_to_week SELECT date + interval_week, - date + interval_to_week + date + interval_to_week; ``` +Result: + ``` text ┌─plus(date, interval_week)─┬─plus(date, interval_to_week)─┐ │ 2019-01-08 │ 2019-01-08 │ @@ -555,7 +627,7 @@ The function parses [ISO 8601](https://en.wikipedia.org/wiki/ISO_8601), [RFC 112 **Syntax** ``` sql -parseDateTimeBestEffort(time_string [, time_zone]); +parseDateTimeBestEffort(time_string [, time_zone]) ``` **Parameters** @@ -598,7 +670,7 @@ Query: ``` sql SELECT parseDateTimeBestEffort('Sat, 18 Aug 2018 07:22:16 GMT', 'Europe/Moscow') -AS parseDateTimeBestEffort +AS parseDateTimeBestEffort; ``` Result: @@ -613,7 +685,7 @@ Query: ``` sql SELECT parseDateTimeBestEffort('1284101485') -AS parseDateTimeBestEffort +AS parseDateTimeBestEffort; ``` Result: @@ -628,7 +700,7 @@ Query: ``` sql SELECT parseDateTimeBestEffort('2018-12-12 10:12:12') -AS parseDateTimeBestEffort +AS parseDateTimeBestEffort; ``` Result: @@ -642,7 +714,7 @@ Result: Query: ``` sql -SELECT parseDateTimeBestEffort('10 20:19') +SELECT parseDateTimeBestEffort('10 20:19'); ``` Result: @@ -662,12 +734,12 @@ Result: ## parseDateTimeBestEffortUS {#parsedatetimebesteffortUS} -This function is similar to [‘parseDateTimeBestEffort’](#parsedatetimebesteffort), the only difference is that this function prefers US date format (`MM/DD/YYYY` etc.) in case of ambiguity. +This function is similar to [parseDateTimeBestEffort](#parsedatetimebesteffort), the only difference is that this function prefers US date format (`MM/DD/YYYY` etc.) in case of ambiguity. 
**Syntax** ``` sql -parseDateTimeBestEffortUS(time_string [, time_zone]); +parseDateTimeBestEffortUS(time_string [, time_zone]) ``` **Parameters** @@ -769,7 +841,7 @@ Type: `LowCardinality(expr_result_type)` Query: ``` sql -SELECT toLowCardinality('1') +SELECT toLowCardinality('1'); ``` Result: @@ -808,7 +880,7 @@ Query: ``` sql WITH toDateTime64('2019-09-16 19:20:12.345678910', 6) AS dt64 -SELECT toUnixTimestamp64Milli(dt64) +SELECT toUnixTimestamp64Milli(dt64); ``` Result: @@ -821,7 +893,7 @@ Result: ``` sql WITH toDateTime64('2019-09-16 19:20:12.345678910', 6) AS dt64 -SELECT toUnixTimestamp64Nano(dt64) +SELECT toUnixTimestamp64Nano(dt64); ``` Result: @@ -855,13 +927,17 @@ fromUnixTimestamp64Milli(value [, ti]) - `value` converted to the `DateTime64` data type. -**Examples** +**Example** + +Query: ``` sql WITH CAST(1234567891011, 'Int64') AS i64 -SELECT fromUnixTimestamp64Milli(i64, 'UTC') +SELECT fromUnixTimestamp64Milli(i64, 'UTC'); ``` +Result: + ``` text ┌─fromUnixTimestamp64Milli(i64, 'UTC')─┐ │ 2009-02-13 23:31:31.011 │ @@ -893,7 +969,7 @@ Query: ``` sql SELECT formatRow('CSV', number, 'good') -FROM numbers(3) +FROM numbers(3); ``` Result: @@ -934,7 +1010,7 @@ Query: ``` sql SELECT formatRowNoNewline('CSV', number, 'good') -FROM numbers(3) +FROM numbers(3); ``` Result: diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index 40fdbc6f5a0..aa55e015c61 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -36,10 +36,14 @@ toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438\u0020\u043f\u0440\u0435\u **Пример** +Запрос: + ``` sql -SELECT toInt64(nan), toInt32(32), toInt16('16'), toInt8(8.8) +SELECT toInt64(nan), toInt32(32), toInt16('16'), toInt8(8.8); ``` +Результат: + ``` text ┌─────────toInt64(nan)─┬─toInt32(32)─┬─toInt16('16')─┬─toInt8(8.8)─┐ │ -9223372036854775808 │ 32 │ 16 │ 8 │ @@ -52,10 +56,14 @@ SELECT toInt64(nan), toInt32(32), toInt16('16'), toInt8(8.8) **Пример** +Запрос: + ``` sql -select toInt64OrZero('123123'), toInt8OrZero('123qwe123') +SELECT toInt64OrZero('123123'), toInt8OrZero('123qwe123'); ``` +Результат: + ``` text ┌─toInt64OrZero('123123')─┬─toInt8OrZero('123qwe123')─┐ │ 123123 │ 0 │ @@ -68,10 +76,14 @@ select toInt64OrZero('123123'), toInt8OrZero('123qwe123') **Пример** +Запрос: + ``` sql -select toInt64OrNull('123123'), toInt8OrNull('123qwe123') +SELECT toInt64OrNull('123123'), toInt8OrNull('123qwe123'); ``` +Результат: + ``` text ┌─toInt64OrNull('123123')─┬─toInt8OrNull('123qwe123')─┐ │ 123123 │ ᴺᵁᴸᴸ │ @@ -102,10 +114,14 @@ select toInt64OrNull('123123'), toInt8OrNull('123qwe123') **Пример** +Запрос: + ``` sql -SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8) +SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8); ``` +Результат: + ``` text ┌───────toUInt64(nan)─┬─toUInt32(-32)─┬─toUInt16('16')─┬─toUInt8(8.8)─┐ │ 9223372036854775808 │ 4294967264 │ 16 │ 8 │ @@ -168,20 +184,28 @@ SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8) **Примеры** +Запрос: + ``` sql -SELECT toDecimal32OrNull(toString(-1.111), 5) AS val, toTypeName(val) +SELECT toDecimal32OrNull(toString(-1.111), 5) AS val, toTypeName(val); ``` +Результат: + ``` text ┌──────val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 5))─┐ │ -1.11100 │ Nullable(Decimal(9, 5)) │ └──────────┴────────────────────────────────────────────────────┘ ``` +Запрос: + ``` sql -SELECT 
toDecimal32OrNull(toString(-1.111), 2) AS val, toTypeName(val) +SELECT toDecimal32OrNull(toString(-1.111), 2) AS val, toTypeName(val); ``` +Результат: + ``` text ┌──val─┬─toTypeName(toDecimal32OrNull(toString(-1.111), 2))─┐ │ ᴺᵁᴸᴸ │ Nullable(Decimal(9, 2)) │ @@ -213,20 +237,28 @@ SELECT toDecimal32OrNull(toString(-1.111), 2) AS val, toTypeName(val) **Пример** +Запрос: + ``` sql -SELECT toDecimal32OrZero(toString(-1.111), 5) AS val, toTypeName(val) +SELECT toDecimal32OrZero(toString(-1.111), 5) AS val, toTypeName(val); ``` +Результат: + ``` text ┌──────val─┬─toTypeName(toDecimal32OrZero(toString(-1.111), 5))─┐ │ -1.11100 │ Decimal(9, 5) │ └──────────┴────────────────────────────────────────────────────┘ ``` +Запрос: + ``` sql -SELECT toDecimal32OrZero(toString(-1.111), 2) AS val, toTypeName(val) +SELECT toDecimal32OrZero(toString(-1.111), 2) AS val, toTypeName(val); ``` +Результат: + ``` text ┌──val─┬─toTypeName(toDecimal32OrZero(toString(-1.111), 2))─┐ │ 0.00 │ Decimal(9, 2) │ @@ -258,12 +290,18 @@ YYYY-MM-DD hh:mm:ss Дополнительно, функция toString от аргумента типа DateTime может принимать второй аргумент String - имя тайм-зоны. Пример: `Asia/Yekaterinburg` В этом случае, форматирование времени производится согласно указанной тайм-зоне. +**Пример** + +Запрос: + ``` sql SELECT now() AS now_local, - toString(now(), 'Asia/Yekaterinburg') AS now_yekat + toString(now(), 'Asia/Yekaterinburg') AS now_yekat; ``` +Результат: + ``` text ┌───────────now_local─┬─now_yekat───────────┐ │ 2016-06-15 00:11:21 │ 2016-06-15 02:11:21 │ @@ -281,22 +319,30 @@ SELECT Принимает аргумент типа String или FixedString. Возвращает String, вырезая содержимое строки до первого найденного нулевого байта. -Пример: +**Примеры** + +Запрос: ``` sql -SELECT toFixedString('foo', 8) AS s, toStringCutToZero(s) AS s_cut +SELECT toFixedString('foo', 8) AS s, toStringCutToZero(s) AS s_cut; ``` +Результат: + ``` text ┌─s─────────────┬─s_cut─┐ │ foo\0\0\0\0\0 │ foo │ └───────────────┴───────┘ ``` +Запрос: + ``` sql -SELECT toFixedString('foo\0bar', 8) AS s, toStringCutToZero(s) AS s_cut +SELECT toFixedString('foo\0bar', 8) AS s, toStringCutToZero(s) AS s_cut; ``` +Результат: + ``` text ┌─s──────────┬─s_cut─┐ │ foo\0bar\0 │ foo │ @@ -344,7 +390,7 @@ reinterpretAsUUID(fixed_string) Запрос: ``` sql -SELECT reinterpretAsUUID(reverse(unhex('000102030405060708090a0b0c0d0e0f'))) +SELECT reinterpretAsUUID(reverse(unhex('000102030405060708090a0b0c0d0e0f'))); ``` Результат: @@ -380,7 +426,9 @@ SELECT uuid = uuid2; Преобразует x в тип данных t. Поддерживается также синтаксис CAST(x AS t). -Пример: +**Пример** + +Запрос: ``` sql SELECT @@ -388,9 +436,11 @@ SELECT CAST(timestamp AS DateTime) AS datetime, CAST(timestamp AS Date) AS date, CAST(timestamp, 'String') AS string, - CAST(timestamp, 'FixedString(22)') AS fixed_string + CAST(timestamp, 'FixedString(22)') AS fixed_string; ``` +Результат: + ``` text ┌─timestamp───────────┬────────────datetime─┬───────date─┬─string──────────────┬─fixed_string──────────────┐ │ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00 │ 2016-06-15 │ 2016-06-15 23:00:00 │ 2016-06-15 23:00:00\0\0\0 │ @@ -399,12 +449,18 @@ SELECT Преобразование в FixedString(N) работает только для аргументов типа String или FixedString(N). -Поддержано преобразование к типу [Nullable](../../sql-reference/functions/type-conversion-functions.md) и обратно. Пример: +Поддержано преобразование к типу [Nullable](../../sql-reference/functions/type-conversion-functions.md) и обратно. 
+ +**Примеры** + +Запрос: ``` sql -SELECT toTypeName(x) FROM t_null +SELECT toTypeName(x) FROM t_null; ``` +Результат: + ``` text ┌─toTypeName(x)─┐ │ Int8 │ @@ -412,10 +468,14 @@ SELECT toTypeName(x) FROM t_null └───────────────┘ ``` +Запрос: + ``` sql -SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null +SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null; ``` +Результат: + ``` text ┌─toTypeName(CAST(x, 'Nullable(UInt16)'))─┐ │ Nullable(UInt16) │ @@ -442,7 +502,7 @@ accurateCastOrNull(x, T) - `x` — входное значение. - `T` — имя возвращаемого типа данных. -**Пример** +**Примеры** Запрос: @@ -502,6 +562,8 @@ toIntervalYear(number) **Пример** +Запрос: + ``` sql WITH toDate('2019-01-01') AS date, @@ -509,9 +571,11 @@ WITH toIntervalWeek(1) AS interval_to_week SELECT date + interval_week, - date + interval_to_week + date + interval_to_week; ``` +Результат: + ``` text ┌─plus(date, interval_week)─┬─plus(date, interval_to_week)─┐ │ 2019-01-08 │ 2019-01-08 │ @@ -527,7 +591,7 @@ SELECT **Синтаксис** ``` sql -parseDateTimeBestEffort(time_string[, time_zone]); +parseDateTimeBestEffort(time_string[, time_zone]) ``` **Параметры** @@ -570,7 +634,7 @@ AS parseDateTimeBestEffort; ``` sql SELECT parseDateTimeBestEffort('Sat, 18 Aug 2018 07:22:16 GMT', 'Europe/Moscow') -AS parseDateTimeBestEffort +AS parseDateTimeBestEffort; ``` Результат: @@ -585,7 +649,7 @@ AS parseDateTimeBestEffort ``` sql SELECT parseDateTimeBestEffort('1284101485') -AS parseDateTimeBestEffort +AS parseDateTimeBestEffort; ``` Результат: @@ -600,7 +664,7 @@ AS parseDateTimeBestEffort ``` sql SELECT parseDateTimeBestEffort('2018-12-12 10:12:12') -AS parseDateTimeBestEffort +AS parseDateTimeBestEffort; ``` Результат: @@ -614,7 +678,7 @@ AS parseDateTimeBestEffort Запрос: ``` sql -SELECT parseDateTimeBestEffort('10 20:19') +SELECT parseDateTimeBestEffort('10 20:19'); ``` Результат: @@ -639,7 +703,7 @@ SELECT parseDateTimeBestEffort('10 20:19') **Синтаксис** ``` sql -parseDateTimeBestEffortUS(time_string [, time_zone]); +parseDateTimeBestEffortUS(time_string [, time_zone]) ``` **Параметры** @@ -668,7 +732,7 @@ SELECT parseDateTimeBestEffortUS('09/12/2020 12:12:57') AS parseDateTimeBestEffortUS; ``` -Ответ: +Результат: ``` text ┌─parseDateTimeBestEffortUS─┐ @@ -683,7 +747,7 @@ SELECT parseDateTimeBestEffortUS('09-12-2020 12:12:57') AS parseDateTimeBestEffortUS; ``` -Ответ: +Результат: ``` text ┌─parseDateTimeBestEffortUS─┐ @@ -698,7 +762,7 @@ SELECT parseDateTimeBestEffortUS('09.12.2020 12:12:57') AS parseDateTimeBestEffortUS; ``` -Ответ: +Результат: ``` text ┌─parseDateTimeBestEffortUS─┐ @@ -733,10 +797,10 @@ toUnixTimestamp64Milli(value) ``` sql WITH toDateTime64('2019-09-16 19:20:12.345678910', 6) AS dt64 -SELECT toUnixTimestamp64Milli(dt64) +SELECT toUnixTimestamp64Milli(dt64); ``` -Ответ: +Результат: ``` text ┌─toUnixTimestamp64Milli(dt64)─┐ @@ -748,10 +812,10 @@ SELECT toUnixTimestamp64Milli(dt64) ``` sql WITH toDateTime64('2019-09-16 19:20:12.345678910', 6) AS dt64 -SELECT toUnixTimestamp64Nano(dt64) +SELECT toUnixTimestamp64Nano(dt64); ``` -Ответ: +Результат: ``` text ┌─toUnixTimestamp64Nano(dt64)─┐ @@ -786,10 +850,10 @@ fromUnixTimestamp64Milli(value [, ti]) ``` sql WITH CAST(1234567891011, 'Int64') AS i64 -SELECT fromUnixTimestamp64Milli(i64, 'UTC') +SELECT fromUnixTimestamp64Milli(i64, 'UTC'); ``` -Ответ: +Результат: ``` text ┌─fromUnixTimestamp64Milli(i64, 'UTC')─┐ @@ -820,12 +884,12 @@ toLowCardinality(expr) Тип: `LowCardinality(expr_result_type)` -**Example** +**Пример** Запрос: ```sql -SELECT 
toLowCardinality('1') +SELECT toLowCardinality('1'); ``` Результат: @@ -861,10 +925,10 @@ formatRow(format, x, y, ...) ``` sql SELECT formatRow('CSV', number, 'good') -FROM numbers(3) +FROM numbers(3); ``` -Ответ: +Результат: ``` text ┌─formatRow('CSV', number, 'good')─┐ @@ -902,10 +966,10 @@ formatRowNoNewline(format, x, y, ...) ``` sql SELECT formatRowNoNewline('CSV', number, 'good') -FROM numbers(3) +FROM numbers(3); ``` -Ответ: +Результат: ``` text ┌─formatRowNoNewline('CSV', number, 'good')─┐ From 5472eb5bd99aa712821a30b4e6aa1a73dfb6d40b Mon Sep 17 00:00:00 2001 From: Nicolae Vartolomei Date: Fri, 5 Feb 2021 10:39:58 +0000 Subject: [PATCH 0150/2357] Allow to drop readonly tables This check doesn't seem to be necessary. There seem to be a deadlock due to a logical race of drop with restarting thread. Seen in https://clickhouse-test-reports.s3.yandex.net/20088/4ebb44bb9936ed1daa330cb38f343664ca83751c/integration_tests_flaky_check_(asan).html#fail1 --- src/Storages/StorageReplicatedMergeTree.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 69cbe0d7062..cb5f4dd5185 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -751,7 +751,7 @@ void StorageReplicatedMergeTree::drop() auto zookeeper = global_context.getZooKeeper(); /// If probably there is metadata in ZooKeeper, we don't allow to drop the table. - if (is_readonly || !zookeeper) + if (!zookeeper) throw Exception("Can't drop readonly replicated table (need to drop data in ZooKeeper as well)", ErrorCodes::TABLE_IS_READ_ONLY); shutdown(); From df6c882aab57882f78c15baae200d593b3dad7e6 Mon Sep 17 00:00:00 2001 From: Anton Ivashkin Date: Wed, 20 Jan 2021 12:48:22 +0300 Subject: [PATCH 0151/2357] Fix build after merge --- S3ZeroCopyReplication.md | 2 +- src/Disks/DiskCacheWrapper.cpp | 6 ++-- src/Disks/DiskCacheWrapper.h | 2 +- src/Disks/DiskDecorator.cpp | 4 +-- src/Disks/DiskDecorator.h | 2 +- src/Disks/IDisk.h | 4 +-- src/Disks/S3/DiskS3.cpp | 2 -- src/Disks/S3/DiskS3.h | 2 +- src/Storages/MergeTree/DataPartsExchange.cpp | 9 +++--- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 25 ++++++++++++--- src/Storages/MergeTree/MergeTreeSettings.h | 2 +- src/Storages/StorageReplicatedMergeTree.cpp | 31 ++++++++++++++----- 12 files changed, 61 insertions(+), 30 deletions(-) diff --git a/S3ZeroCopyReplication.md b/S3ZeroCopyReplication.md index bfb39addcd2..5230640ebcc 100644 --- a/S3ZeroCopyReplication.md +++ b/S3ZeroCopyReplication.md @@ -37,7 +37,7 @@ В гибридном хранилище если парт переносится на S3, нода через ZK проверяет, нет был ли парт перенесен другой нодой, если был, то делает fetch (модифицированный по сравнению с обычным fetch'ем). -В конфиг добавлен флаг, по которому включается функционал нового протокола репликации - merge_tree->allow_s3_zero_copy_replication. Сейчас стоит в true - это времеменно, чтобы все тесты сейчас проходили с включенным флагом, перед финальным мержем надо не забыть заменить на false. +В конфиг добавлен флаг, по которому включается функционал нового протокола репликации - merge_tree->allow_s3_zero_copy_replication. Сейчас стоит в false. 
## Костыли и недоработки, коих много diff --git a/src/Disks/DiskCacheWrapper.cpp b/src/Disks/DiskCacheWrapper.cpp index c26fa7623a4..df30af769e1 100644 --- a/src/Disks/DiskCacheWrapper.cpp +++ b/src/Disks/DiskCacheWrapper.cpp @@ -278,11 +278,11 @@ void DiskCacheWrapper::removeRecursive(const String & path) DiskDecorator::removeRecursive(path); } -void DiskCacheWrapper::removeShared(const String & path, bool keep_s3) +void DiskCacheWrapper::removeSharedFile(const String & path, bool keep_s3) { if (cache_disk->exists(path)) - cache_disk->removeShared(path, keep_s3); - DiskDecorator::removeShared(path, keep_s3); + cache_disk->removeSharedFile(path, keep_s3); + DiskDecorator::removeSharedFile(path, keep_s3); } void DiskCacheWrapper::removeSharedRecursive(const String & path, bool keep_s3) diff --git a/src/Disks/DiskCacheWrapper.h b/src/Disks/DiskCacheWrapper.h index fc7ccaaa345..8995bf1936d 100644 --- a/src/Disks/DiskCacheWrapper.h +++ b/src/Disks/DiskCacheWrapper.h @@ -41,7 +41,7 @@ public: void removeFileIfExists(const String & path) override; void removeDirectory(const String & path) override; void removeRecursive(const String & path) override; - void removeShared(const String & path, bool keep_s3) override; + void removeSharedFile(const String & path, bool keep_s3) override; void removeSharedRecursive(const String & path, bool keep_s3) override; void createHardLink(const String & src_path, const String & dst_path) override; ReservationPtr reserve(UInt64 bytes) override; diff --git a/src/Disks/DiskDecorator.cpp b/src/Disks/DiskDecorator.cpp index 9c8c7859b8b..96d2e8278e3 100644 --- a/src/Disks/DiskDecorator.cpp +++ b/src/Disks/DiskDecorator.cpp @@ -150,9 +150,9 @@ void DiskDecorator::removeRecursive(const String & path) delegate->removeRecursive(path); } -void DiskDecorator::removeShared(const String & path, bool keep_s3) +void DiskDecorator::removeSharedFile(const String & path, bool keep_s3) { - delegate->removeShared(path, keep_s3); + delegate->removeSharedFile(path, keep_s3); } void DiskDecorator::removeSharedRecursive(const String & path, bool keep_s3) diff --git a/src/Disks/DiskDecorator.h b/src/Disks/DiskDecorator.h index edba993639a..d069f8a84b6 100644 --- a/src/Disks/DiskDecorator.h +++ b/src/Disks/DiskDecorator.h @@ -43,7 +43,7 @@ public: void removeFileIfExists(const String & path) override; void removeDirectory(const String & path) override; void removeRecursive(const String & path) override; - void removeShared(const String & path, bool keep_s3) override; + void removeSharedFile(const String & path, bool keep_s3) override; void removeSharedRecursive(const String & path, bool keep_s3) override; void setLastModified(const String & path, const Poco::Timestamp & timestamp) override; Poco::Timestamp getLastModified(const String & path) override; diff --git a/src/Disks/IDisk.h b/src/Disks/IDisk.h index 612c5ef88ee..a5a886c9c9f 100644 --- a/src/Disks/IDisk.h +++ b/src/Disks/IDisk.h @@ -201,10 +201,10 @@ public: /// Invoked when Global Context is shutdown. virtual void shutdown() { } - /// Return some uniq string for file, overrided for S3 + /// Return some uniq string for file, overrode for S3 virtual String getUniqueId(const String & path) const { return path; } - /// Check file, overrided for S3 only + /// Check file, overrode for S3 only virtual bool checkUniqueId(const String & id) const { return exists(id); } /// Returns executor to perform asynchronous operations. 
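Side note on the two `IDisk` hooks touched in the hunk above: `getUniqueId(path)` returns a disk-specific identifier for a file (the base implementation just returns the path), and `checkUniqueId(id)` reports whether this disk already holds the object behind that identifier; `DiskS3` overrides both, matching the id against object keys in the bucket (see `DiskS3::checkUniqueId` in the next file diff). A minimal illustrative sketch of the intended call pattern, mirroring the disk-selection loop in `Fetcher::downloadPartToS3` later in this patch — `pickDiskHoldingBlob` is a hypothetical helper for illustration only, not part of the change:

``` cpp
// Illustrative sketch only. DiskPtr and IDisk come from src/Disks/IDisk.h;
// pickDiskHoldingBlob is a hypothetical name, not something added by the patch.
#include <Disks/IDisk.h>
#include <vector>

namespace DB
{

/// Given the unique id of a blob (as produced by getUniqueId() on the sender side),
/// return the first disk that already holds that blob, or a fallback disk otherwise.
DiskPtr pickDiskHoldingBlob(const std::vector<DiskPtr> & disks, const String & blob_id)
{
    for (const auto & disk : disks)
        if (disk->checkUniqueId(blob_id))   /// DiskS3 overrides this to look up the object key in the bucket
            return disk;

    return disks.empty() ? nullptr : disks.front();
}

}
```

In the patch itself the same loop runs over the storage policy's S3 disks inside `Fetcher::downloadPartToS3`, defaulting to the first S3 disk when none of them reports a match.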
diff --git a/src/Disks/S3/DiskS3.cpp b/src/Disks/S3/DiskS3.cpp index bbedb2af8f6..aadfcfa82d6 100644 --- a/src/Disks/S3/DiskS3.cpp +++ b/src/Disks/S3/DiskS3.cpp @@ -935,8 +935,6 @@ bool DiskS3::checkUniqueId(const String & id) const throwIfError(resp); Aws::Vector object_list = resp.GetResult().GetContents(); - if (object_list.size() < 1) - return false; for (const auto & object : object_list) if (object.GetKey() == id) return true; diff --git a/src/Disks/S3/DiskS3.h b/src/Disks/S3/DiskS3.h index acfb75f681d..165f09ff1e4 100644 --- a/src/Disks/S3/DiskS3.h +++ b/src/Disks/S3/DiskS3.h @@ -118,7 +118,7 @@ public: String getUniqueId(const String & path) const override; - bool checkUniqueId(const String & path) const override; + bool checkUniqueId(const String & id) const override; private: bool tryReserve(UInt64 bytes); diff --git a/src/Storages/MergeTree/DataPartsExchange.cpp b/src/Storages/MergeTree/DataPartsExchange.cpp index f2ae78c85ce..7041cfd5ad2 100644 --- a/src/Storages/MergeTree/DataPartsExchange.cpp +++ b/src/Storages/MergeTree/DataPartsExchange.cpp @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include @@ -619,11 +618,11 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToS3( DiskPtr disk = disks_s3[0]; - for (const auto & disk_ : disks_s3) + for (const auto & disk_s3 : disks_s3) { - if (disk_->checkUniqueId(part_id)) + if (disk_s3->checkUniqueId(part_id)) { - disk = disk_; + disk = disk_s3; break; } } @@ -662,7 +661,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToS3( String metadata_file = fullPath(disk, data_path); { - auto file_out = createWriteBufferFromFileBase(metadata_file, 0, 0, DBMS_DEFAULT_BUFFER_SIZE, -1); + auto file_out = std::make_unique(metadata_file, DBMS_DEFAULT_BUFFER_SIZE, -1, 0666, nullptr, 0); HashingWriteBuffer hashing_out(*file_out); diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 89ec68e5068..69710311af3 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -1132,7 +1132,9 @@ void IMergeTreeDataPart::makeCloneOnDisk(const DiskPtr & disk, const String & di if (disk->getType() == "s3") { - is_fetched = tryToFetchIfShared(disk, path_to_clone + "/" + name); + auto data_settings = storage.getSettings(); + if (data_settings->allow_s3_zero_copy_replication) + is_fetched = tryToFetchIfShared(disk, path_to_clone + "/" + name); } if (!is_fetched) @@ -1301,8 +1303,23 @@ void IMergeTreeDataPart::lockSharedData() const LOG_TRACE(storage.log, "Set zookeeper lock {}", zookeeper_node); - zk.zookeeper->createAncestors(zookeeper_node); - zk.zookeeper->createIfNotExists(zookeeper_node, "lock"); + /// In rare case other replica can remove path between createAncestors and createIfNotExists + /// So we make up to 5 attempts + for (int attempts = 5; attempts > 0; --attempts) + { + try + { + zk.zookeeper->createAncestors(zookeeper_node); + zk.zookeeper->createIfNotExists(zookeeper_node, "lock"); + break; + } + catch (const zkutil::KeeperException & e) + { + if (e.code == Coordination::Error::ZNONODE) + continue; + throw; + } + } } bool IMergeTreeDataPart::unlockSharedData() const @@ -1476,7 +1493,7 @@ bool IMergeTreeDataPart::tryToFetchIfShared(const DiskPtr & disk, const String & log_entry.disk = disk; log_entry.path = path; - /// TODO: !!! Fix const usage !!! 
+ /// TODO: Fix const usage StorageReplicatedMergeTree *replicated_storage_nc = const_cast(replicated_storage); return replicated_storage_nc->executeFetchShared(log_entry); diff --git a/src/Storages/MergeTree/MergeTreeSettings.h b/src/Storages/MergeTree/MergeTreeSettings.h index 68c69c3687e..ab6e2cc995e 100644 --- a/src/Storages/MergeTree/MergeTreeSettings.h +++ b/src/Storages/MergeTree/MergeTreeSettings.h @@ -108,7 +108,7 @@ struct Settings; M(UInt64, concurrent_part_removal_threshold, 100, "Activate concurrent part removal (see 'max_part_removal_threads') only if the number of inactive data parts is at least this.", 0) \ M(String, storage_policy, "default", "Name of storage disk policy", 0) \ M(Bool, allow_nullable_key, false, "Allow Nullable types as primary keys.", 0) \ - M(Bool, allow_s3_zero_copy_replication, true, "Allow Zero-copy replication over S3", 0) \ + M(Bool, allow_s3_zero_copy_replication, false, "Allow Zero-copy replication over S3", 0) \ M(Bool, remove_empty_parts, true, "Remove empty parts after they were pruned by TTL, mutation, or collapsing merge algorithm", 0) \ M(Bool, assign_part_uuids, false, "Generate UUIDs for parts. Before enabling check that all replicas support new format.", 0) \ M(Int64, max_partitions_to_read, -1, "Limit the max number of partitions that can be accessed in one query. <= 0 means unlimited. This setting is the default that can be overridden by the query-level setting with the same name.", 0) \ diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index c3b8731cbe8..2002c124a66 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -1498,11 +1498,28 @@ bool StorageReplicatedMergeTree::tryExecuteMerge(const LogEntry & entry) { auto zookeeper = getZooKeeper(); String zookeeper_node = zookeeper_path + "/zero_copy_s3/merged/" + entry.new_part_name; - zookeeper->createAncestors(zookeeper_node); - auto code = zookeeper->tryCreate(zookeeper_node, "lock", zkutil::CreateMode::Ephemeral); - /// Someone else created or started create this merge - if (code == Coordination::Error::ZNODEEXISTS) - return false; + + /// In rare case other replica can remove path between createAncestors and tryCreate + /// So we make up to 5 attempts to make a lock + for (int attempts = 5; attempts > 0; --attempts) + { + try + { + zookeeper->createAncestors(zookeeper_node); + auto code = zookeeper->tryCreate(zookeeper_node, "lock", zkutil::CreateMode::Ephemeral); + /// Someone else created or started create this merge + if (code == Coordination::Error::ZNODEEXISTS) + return false; + if (code != Coordination::Error::ZNONODE) + break; + } + catch (const zkutil::KeeperException & e) + { + if (e.code == Coordination::Error::ZNONODE) + continue; + throw; + } + } } } @@ -1930,7 +1947,7 @@ bool StorageReplicatedMergeTree::executeFetchShared(ReplicatedMergeTreeLogEntry try { - if (!fetchPart(entry.new_part_name, metadata_snapshot, zookeeper_path + "/replicas/" + entry.source_replica, false, entry.quorum, + if (!fetchPart(entry.new_part_name, metadata_snapshot, zookeeper_path + "/replicas/" + entry.source_replica, false, entry.quorum, nullptr, true, entry.disk, entry.path)) return false; } @@ -3624,7 +3641,7 @@ bool StorageReplicatedMergeTree::fetchPart(const String & part_name, const Stora { if (part->volume->getDisk()->getName() != replaced_disk->getName()) throw Exception("Part " + part->name + " fetched on wrong disk " + part->volume->getDisk()->getName(), 
ErrorCodes::LOGICAL_ERROR); - replaced_disk->removeIfExists(replaced_part_path); + replaced_disk->removeFileIfExists(replaced_part_path); replaced_disk->moveDirectory(part->getFullRelativePath(), replaced_part_path); } else From e051423584855ef75bbe7d41d1b6db8a649f7bee Mon Sep 17 00:00:00 2001 From: Evgeniia Sudarikova Date: Fri, 5 Feb 2021 22:14:52 +0300 Subject: [PATCH 0152/2357] add RU docs --- .../example-datasets/brown-benchmark.md | 416 ++++++++++++++++++ .../functions/array-functions.md | 150 +++++++ .../ru/sql-reference/table-functions/mysql.md | 63 ++- 3 files changed, 605 insertions(+), 24 deletions(-) create mode 100644 docs/ru/getting-started/example-datasets/brown-benchmark.md diff --git a/docs/ru/getting-started/example-datasets/brown-benchmark.md b/docs/ru/getting-started/example-datasets/brown-benchmark.md new file mode 100644 index 00000000000..b3f2285093a --- /dev/null +++ b/docs/ru/getting-started/example-datasets/brown-benchmark.md @@ -0,0 +1,416 @@ +--- +toc_priority: 20 +toc_title: Brown University Benchmark +--- + +# Brown University Benchmark + +`MgBench` — это новый аналитический бенчмарк для сгенерированного журнала событий, разработанный [Andrew Crotty](http://cs.brown.edu/people/acrotty/). + +Скачать данные: +``` +wget https://datasets.clickhouse.tech/mgbench{1..3}.csv.xz +``` + +Распаковать данные: +``` +xz -v -d mgbench{1..3}.csv.xz +``` + +Создание таблиц: +``` +CREATE DATABASE mgbench; + + +CREATE TABLE mgbench.logs1 ( + log_time DateTime, + machine_name LowCardinality(String), + machine_group LowCardinality(String), + cpu_idle Nullable(Float32), + cpu_nice Nullable(Float32), + cpu_system Nullable(Float32), + cpu_user Nullable(Float32), + cpu_wio Nullable(Float32), + disk_free Nullable(Float32), + disk_total Nullable(Float32), + part_max_used Nullable(Float32), + load_fifteen Nullable(Float32), + load_five Nullable(Float32), + load_one Nullable(Float32), + mem_buffers Nullable(Float32), + mem_cached Nullable(Float32), + mem_free Nullable(Float32), + mem_shared Nullable(Float32), + swap_free Nullable(Float32), + bytes_in Nullable(Float32), + bytes_out Nullable(Float32) +) +ENGINE = MergeTree() +ORDER BY (machine_group, machine_name, log_time); + + +CREATE TABLE mgbench.logs2 ( + log_time DateTime, + client_ip IPv4, + request String, + status_code UInt16, + object_size UInt64 +) +ENGINE = MergeTree() +ORDER BY log_time; + + +CREATE TABLE mgbench.logs3 ( + log_time DateTime64, + device_id FixedString(15), + device_name LowCardinality(String), + device_type LowCardinality(String), + device_floor UInt8, + event_type LowCardinality(String), + event_unit FixedString(1), + event_value Nullable(Float32) +) +ENGINE = MergeTree() +ORDER BY (event_type, log_time); +``` + +Insert data: + +``` +clickhouse-client --query "INSERT INTO mgbench.logs1 FORMAT CSVWithNames" < mgbench1.csv +clickhouse-client --query "INSERT INTO mgbench.logs2 FORMAT CSVWithNames" < mgbench2.csv +clickhouse-client --query "INSERT INTO mgbench.logs3 FORMAT CSVWithNames" < mgbench3.csv +``` + +Run benchmark queries: +``` +-- Q1.1: What is the CPU/network utilization for each web server since midnight? 
+ +SELECT machine_name, + MIN(cpu) AS cpu_min, + MAX(cpu) AS cpu_max, + AVG(cpu) AS cpu_avg, + MIN(net_in) AS net_in_min, + MAX(net_in) AS net_in_max, + AVG(net_in) AS net_in_avg, + MIN(net_out) AS net_out_min, + MAX(net_out) AS net_out_max, + AVG(net_out) AS net_out_avg +FROM ( + SELECT machine_name, + COALESCE(cpu_user, 0.0) AS cpu, + COALESCE(bytes_in, 0.0) AS net_in, + COALESCE(bytes_out, 0.0) AS net_out + FROM logs1 + WHERE machine_name IN ('anansi','aragog','urd') + AND log_time >= TIMESTAMP '2017-01-11 00:00:00' +) AS r +GROUP BY machine_name; + + +-- Q1.2: Which computer lab machines have been offline in the past day? + +SELECT machine_name, + log_time +FROM logs1 +WHERE (machine_name LIKE 'cslab%' OR + machine_name LIKE 'mslab%') + AND load_one IS NULL + AND log_time >= TIMESTAMP '2017-01-10 00:00:00' +ORDER BY machine_name, + log_time; + + +-- Q1.3: What are the hourly average metrics during the past 10 days for a specific workstation? + +SELECT dt, + hr, + AVG(load_fifteen) AS load_fifteen_avg, + AVG(load_five) AS load_five_avg, + AVG(load_one) AS load_one_avg, + AVG(mem_free) AS mem_free_avg, + AVG(swap_free) AS swap_free_avg +FROM ( + SELECT CAST(log_time AS DATE) AS dt, + EXTRACT(HOUR FROM log_time) AS hr, + load_fifteen, + load_five, + load_one, + mem_free, + swap_free + FROM logs1 + WHERE machine_name = 'babbage' + AND load_fifteen IS NOT NULL + AND load_five IS NOT NULL + AND load_one IS NOT NULL + AND mem_free IS NOT NULL + AND swap_free IS NOT NULL + AND log_time >= TIMESTAMP '2017-01-01 00:00:00' +) AS r +GROUP BY dt, + hr +ORDER BY dt, + hr; + + +-- Q1.4: Over 1 month, how often was each server blocked on disk I/O? + +SELECT machine_name, + COUNT(*) AS spikes +FROM logs1 +WHERE machine_group = 'Servers' + AND cpu_wio > 0.99 + AND log_time >= TIMESTAMP '2016-12-01 00:00:00' + AND log_time < TIMESTAMP '2017-01-01 00:00:00' +GROUP BY machine_name +ORDER BY spikes DESC +LIMIT 10; + + +-- Q1.5: Which externally reachable VMs have run low on memory? + +SELECT machine_name, + dt, + MIN(mem_free) AS mem_free_min +FROM ( + SELECT machine_name, + CAST(log_time AS DATE) AS dt, + mem_free + FROM logs1 + WHERE machine_group = 'DMZ' + AND mem_free IS NOT NULL +) AS r +GROUP BY machine_name, + dt +HAVING MIN(mem_free) < 10000 +ORDER BY machine_name, + dt; + + +-- Q1.6: What is the total hourly network traffic across all file servers? + +SELECT dt, + hr, + SUM(net_in) AS net_in_sum, + SUM(net_out) AS net_out_sum, + SUM(net_in) + SUM(net_out) AS both_sum +FROM ( + SELECT CAST(log_time AS DATE) AS dt, + EXTRACT(HOUR FROM log_time) AS hr, + COALESCE(bytes_in, 0.0) / 1000000000.0 AS net_in, + COALESCE(bytes_out, 0.0) / 1000000000.0 AS net_out + FROM logs1 + WHERE machine_name IN ('allsorts','andes','bigred','blackjack','bonbon', + 'cadbury','chiclets','cotton','crows','dove','fireball','hearts','huey', + 'lindt','milkduds','milkyway','mnm','necco','nerds','orbit','peeps', + 'poprocks','razzles','runts','smarties','smuggler','spree','stride', + 'tootsie','trident','wrigley','york') +) AS r +GROUP BY dt, + hr +ORDER BY both_sum DESC +LIMIT 10; + + +-- Q2.1: Which requests have caused server errors within the past 2 weeks? + +SELECT * +FROM logs2 +WHERE status_code >= 500 + AND log_time >= TIMESTAMP '2012-12-18 00:00:00' +ORDER BY log_time; + + +-- Q2.2: During a specific 2-week period, was the user password file leaked? 
+ +SELECT * +FROM logs2 +WHERE status_code >= 200 + AND status_code < 300 + AND request LIKE '%/etc/passwd%' + AND log_time >= TIMESTAMP '2012-05-06 00:00:00' + AND log_time < TIMESTAMP '2012-05-20 00:00:00'; + + +-- Q2.3: What was the average path depth for top-level requests in the past month? + +SELECT top_level, + AVG(LENGTH(request) - LENGTH(REPLACE(request, '/', ''))) AS depth_avg +FROM ( + SELECT SUBSTRING(request FROM 1 FOR len) AS top_level, + request + FROM ( + SELECT POSITION(SUBSTRING(request FROM 2), '/') AS len, + request + FROM logs2 + WHERE status_code >= 200 + AND status_code < 300 + AND log_time >= TIMESTAMP '2012-12-01 00:00:00' + ) AS r + WHERE len > 0 +) AS s +WHERE top_level IN ('/about','/courses','/degrees','/events', + '/grad','/industry','/news','/people', + '/publications','/research','/teaching','/ugrad') +GROUP BY top_level +ORDER BY top_level; + + +-- Q2.4: During the last 3 months, which clients have made an excessive number of requests? + +SELECT client_ip, + COUNT(*) AS num_requests +FROM logs2 +WHERE log_time >= TIMESTAMP '2012-10-01 00:00:00' +GROUP BY client_ip +HAVING COUNT(*) >= 100000 +ORDER BY num_requests DESC; + + +-- Q2.5: What are the daily unique visitors? + +SELECT dt, + COUNT(DISTINCT client_ip) +FROM ( + SELECT CAST(log_time AS DATE) AS dt, + client_ip + FROM logs2 +) AS r +GROUP BY dt +ORDER BY dt; + + +-- Q2.6: What are the average and maximum data transfer rates (Gbps)? + +SELECT AVG(transfer) / 125000000.0 AS transfer_avg, + MAX(transfer) / 125000000.0 AS transfer_max +FROM ( + SELECT log_time, + SUM(object_size) AS transfer + FROM logs2 + GROUP BY log_time +) AS r; + + +-- Q3.1: Did the indoor temperature reach freezing over the weekend? + +SELECT * +FROM logs3 +WHERE event_type = 'temperature' + AND event_value <= 32.0 + AND log_time >= '2019-11-29 17:00:00.000'; + + +-- Q3.4: Over the past 6 months, how frequently were each door opened? + +SELECT device_name, + device_floor, + COUNT(*) AS ct +FROM logs3 +WHERE event_type = 'door_open' + AND log_time >= '2019-06-01 00:00:00.000' +GROUP BY device_name, + device_floor +ORDER BY ct DESC; + + +-- Q3.5: Where in the building do large temperature variations occur in winter and summer? + +WITH temperature AS ( + SELECT dt, + device_name, + device_type, + device_floor + FROM ( + SELECT dt, + hr, + device_name, + device_type, + device_floor, + AVG(event_value) AS temperature_hourly_avg + FROM ( + SELECT CAST(log_time AS DATE) AS dt, + EXTRACT(HOUR FROM log_time) AS hr, + device_name, + device_type, + device_floor, + event_value + FROM logs3 + WHERE event_type = 'temperature' + ) AS r + GROUP BY dt, + hr, + device_name, + device_type, + device_floor + ) AS s + GROUP BY dt, + device_name, + device_type, + device_floor + HAVING MAX(temperature_hourly_avg) - MIN(temperature_hourly_avg) >= 25.0 +) +SELECT DISTINCT device_name, + device_type, + device_floor, + 'WINTER' +FROM temperature +WHERE dt >= DATE '2018-12-01' + AND dt < DATE '2019-03-01' +UNION +SELECT DISTINCT device_name, + device_type, + device_floor, + 'SUMMER' +FROM temperature +WHERE dt >= DATE '2019-06-01' + AND dt < DATE '2019-09-01'; + + +-- Q3.6: For each device category, what are the monthly power consumption metrics? 
+ +SELECT yr, + mo, + SUM(coffee_hourly_avg) AS coffee_monthly_sum, + AVG(coffee_hourly_avg) AS coffee_monthly_avg, + SUM(printer_hourly_avg) AS printer_monthly_sum, + AVG(printer_hourly_avg) AS printer_monthly_avg, + SUM(projector_hourly_avg) AS projector_monthly_sum, + AVG(projector_hourly_avg) AS projector_monthly_avg, + SUM(vending_hourly_avg) AS vending_monthly_sum, + AVG(vending_hourly_avg) AS vending_monthly_avg +FROM ( + SELECT dt, + yr, + mo, + hr, + AVG(coffee) AS coffee_hourly_avg, + AVG(printer) AS printer_hourly_avg, + AVG(projector) AS projector_hourly_avg, + AVG(vending) AS vending_hourly_avg + FROM ( + SELECT CAST(log_time AS DATE) AS dt, + EXTRACT(YEAR FROM log_time) AS yr, + EXTRACT(MONTH FROM log_time) AS mo, + EXTRACT(HOUR FROM log_time) AS hr, + CASE WHEN device_name LIKE 'coffee%' THEN event_value END AS coffee, + CASE WHEN device_name LIKE 'printer%' THEN event_value END AS printer, + CASE WHEN device_name LIKE 'projector%' THEN event_value END AS projector, + CASE WHEN device_name LIKE 'vending%' THEN event_value END AS vending + FROM logs3 + WHERE device_type = 'meter' + ) AS r + GROUP BY dt, + yr, + mo, + hr +) AS s +GROUP BY yr, + mo +ORDER BY yr, + mo; +``` + +Данные также доступны для работы с интерактивными запросами через [Playground](https://gh-api.clickhouse.tech/play?user=play), [пример](https://gh-api.clickhouse.tech/play?user=play#U0VMRUNUIG1hY2hpbmVfbmFtZSwKICAgICAgIE1JTihjcHUpIEFTIGNwdV9taW4sCiAgICAgICBNQVgoY3B1KSBBUyBjcHVfbWF4LAogICAgICAgQVZHKGNwdSkgQVMgY3B1X2F2ZywKICAgICAgIE1JTihuZXRfaW4pIEFTIG5ldF9pbl9taW4sCiAgICAgICBNQVgobmV0X2luKSBBUyBuZXRfaW5fbWF4LAogICAgICAgQVZHKG5ldF9pbikgQVMgbmV0X2luX2F2ZywKICAgICAgIE1JTihuZXRfb3V0KSBBUyBuZXRfb3V0X21pbiwKICAgICAgIE1BWChuZXRfb3V0KSBBUyBuZXRfb3V0X21heCwKICAgICAgIEFWRyhuZXRfb3V0KSBBUyBuZXRfb3V0X2F2ZwpGUk9NICgKICBTRUxFQ1QgbWFjaGluZV9uYW1lLAogICAgICAgICBDT0FMRVNDRShjcHVfdXNlciwgMC4wKSBBUyBjcHUsCiAgICAgICAgIENPQUxFU0NFKGJ5dGVzX2luLCAwLjApIEFTIG5ldF9pbiwKICAgICAgICAgQ09BTEVTQ0UoYnl0ZXNfb3V0LCAwLjApIEFTIG5ldF9vdXQKICBGUk9NIG1nYmVuY2gubG9nczEKICBXSEVSRSBtYWNoaW5lX25hbWUgSU4gKCdhbmFuc2knLCdhcmFnb2cnLCd1cmQnKQogICAgQU5EIGxvZ190aW1lID49IFRJTUVTVEFNUCAnMjAxNy0wMS0xMSAwMDowMDowMCcKKSBBUyByCkdST1VQIEJZIG1hY2hpbmVfbmFtZQ==). + +[Оригинальная статья](https://clickhouse.tech/docs/ru/getting_started/example_datasets/brown-benchmark/) diff --git a/docs/ru/sql-reference/functions/array-functions.md b/docs/ru/sql-reference/functions/array-functions.md index 015d14b9de5..7afd9da471e 100644 --- a/docs/ru/sql-reference/functions/array-functions.md +++ b/docs/ru/sql-reference/functions/array-functions.md @@ -1135,12 +1135,162 @@ SELECT Функция `arrayFirstIndex` является [функцией высшего порядка](../../sql-reference/functions/index.md#higher-order-functions) — в качестве первого аргумента ей нужно передать лямбда-функцию, и этот аргумент не может быть опущен. +## arrayMin(\[func,\] arr1, …) {#array-min} + +Возвращает минимальное значение функции `func`. Если функция не указана, возвращает минимальный из элементов массива. + +Функция `arrayMin` является [функцией высшего порядка](../../sql-reference/functions/index.md#higher-order-functions) — в качестве первого аргумента ей нужно передать лямбда-функцию. + +**Синтаксис** + +``` sql +arrayMin(arr) +``` + +**Возвращаемое значение** + +- Число. + +Тип: [Int](../../sql-reference/data-types/int-uint.md) или [Float](../../sql-reference/data-types/float.md). + +**Параметры** + +- `arr` — [Массив](../../sql-reference/data-types/array.md). 
+ +**Примеры** + +Запрос: + +``` sql +SELECT arrayMin([1, 2, 4]) AS res +``` + +Результат: + +``` text +┌─res─┐ +│ 1 │ +└─────┘ +``` + +Запрос: + +``` sql +SELECT arrayMin(x -> (-x), [1, 2, 4]) AS res +``` + +Результат: + +``` text +┌─res─┐ +│ -4 │ +└─────┘ +``` + +## arrayMax(\[func,\] arr1, …) {#array-max} + +Возвращает максимальное значение функции `func`. Если функция не указана, возвращает максимальный из элементов массива. + +Функция `arrayMax` является [функцией высшего порядка](../../sql-reference/functions/index.md#higher-order-functions) — в качестве первого аргумента ей нужно передать лямбда-функцию. + +**Синтаксис** + +``` sql +arrayMax(arr) +``` + +**Возвращаемое значение** + +- Число. + +Тип: [Int](../../sql-reference/data-types/int-uint.md) или [Float](../../sql-reference/data-types/float.md). + +**Параметры** + +- `arr` — [Массив](../../sql-reference/data-types/array.md). + +**Примеры** + +Запрос: + +```sql +SELECT arrayMax([1, 2, 4]) AS res +``` + +Результат: + +``` text +┌─res─┐ +│ 4 │ +└─────┘ +``` + +Запрос: + +``` sql +SELECT arrayMax(x -> (-x), [1, 2, 4]) AS res +``` + +Результат: + +``` text +┌─res─┐ +│ -1 │ +└─────┘ +``` + ## arraySum(\[func,\] arr1, …) {#array-sum} Возвращает сумму значений функции `func`. Если функция не указана - просто возвращает сумму элементов массива. Функция `arraySum` является [функцией высшего порядка](../../sql-reference/functions/index.md#higher-order-functions) - в качестве первого аргумента ей можно передать лямбда-функцию. +**Синтаксис** + +``` sql +arraySum(arr) +``` + +**Возвращаемое значение** + +- Число. + +Тип: [Int](../../sql-reference/data-types/int-uint.md) или [Float](../../sql-reference/data-types/float.md). + +**Параметры** + +- `arr` — [Массив](../../sql-reference/data-types/array.md). + +**Примеры** + +Запрос: + +```sql +SELECT arraySum([2,3]) AS res +``` + +Результат: + +``` text +┌─res─┐ +│ 5 │ +└─────┘ +``` + +Запрос: + +``` sql +SELECT arraySum(x -> x*x, [2, 3]) AS res +``` + +Результат: + +``` text +┌─res─┐ +│ 13 │ +└─────┘ +``` + ## arrayCumSum(\[func,\] arr1, …) {#arraycumsumfunc-arr1} Возвращает массив из частичных сумм элементов исходного массива (сумма с накоплением). Если указана функция `func`, то значения элементов массива преобразуются этой функцией перед суммированием. diff --git a/docs/ru/sql-reference/table-functions/mysql.md b/docs/ru/sql-reference/table-functions/mysql.md index 21841eee67a..18b34d0bf6c 100644 --- a/docs/ru/sql-reference/table-functions/mysql.md +++ b/docs/ru/sql-reference/table-functions/mysql.md @@ -7,6 +7,8 @@ toc_title: mysql Позволяет выполнять запросы `SELECT` над данными, хранящимися на удалённом MySQL сервере. +**Синтаксис** + ``` sql mysql('host:port', 'database', 'table', 'user', 'password'[, replace_query, 'on_duplicate_clause']); ``` @@ -23,13 +25,13 @@ mysql('host:port', 'database', 'table', 'user', 'password'[, replace_query, 'on_ - `password` — пароль пользователя. -- `replace_query` — флаг, отвечающий за преобразование запросов `INSERT INTO` в `REPLACE INTO`. Если `replace_query=1`, то запрос заменяется. +- `replace_query` — флаг, отвечающий за преобразование запросов `INSERT INTO` в `REPLACE INTO`. Возможные значения: + - `0` - выполняется запрос `INSERT INTO`. + - `1` - выполняется запрос `REPLACE INTO`. -- `on_duplicate_clause` — выражение `ON DUPLICATE KEY on_duplicate_clause`, добавляемое в запрос `INSERT`. +- `on_duplicate_clause` — выражение `ON DUPLICATE KEY on_duplicate_clause`, добавляемое в запрос `INSERT`. 
Может быть передано только с помощью `replace_query = 0` (если вы одновременно передадите `replace_query = 1` и `on_duplicate_clause`, будет сгенерировано исключение). - Пример: `INSERT INTO t (c1,c2) VALUES ('a', 2) ON DUPLICATE KEY UPDATE c2 = c2 + 1`, где `on_duplicate_clause` это `UPDATE c2 = c2 + 1`. Чтобы узнать какие `on_duplicate_clause` можно использовать с секцией `ON DUPLICATE KEY` обратитесь к документации MySQL. - - Чтобы указать `'on_duplicate_clause'` необходимо передать `0` в параметр `replace_query`. Если одновременно передать `replace_query = 1` и `'on_duplicate_clause'`, то ClickHouse сгенерирует исключение. + Пример: `INSERT INTO t (c1,c2) VALUES ('a', 2) ON DUPLICATE KEY UPDATE c2 = c2 + 1`, где `on_duplicate_clause` это `UPDATE c2 = c2 + 1;` Простые условия `WHERE` такие как `=, !=, >, >=, <, =` выполняются на стороне сервера MySQL. @@ -39,46 +41,59 @@ mysql('host:port', 'database', 'table', 'user', 'password'[, replace_query, 'on_ Объект таблицы с теми же столбцами, что и в исходной таблице MySQL. -## Пример использования {#primer-ispolzovaniia} +!!! note "Примечание" + Чтобы отличить табличную функцию `mysql (...)` в запросе `INSERT` от имени таблицы со списком имен столбцов, используйте ключевые слова `FUNCTION` или `TABLE FUNCTION`. См. примеры ниже. + +**Примеры** Таблица в MySQL: ``` text mysql> CREATE TABLE `test`.`test` ( -> `int_id` INT NOT NULL AUTO_INCREMENT, - -> `int_nullable` INT NULL DEFAULT NULL, -> `float` FLOAT NOT NULL, - -> `float_nullable` FLOAT NULL DEFAULT NULL, -> PRIMARY KEY (`int_id`)); -Query OK, 0 rows affected (0,09 sec) -mysql> insert into test (`int_id`, `float`) VALUES (1,2); -Query OK, 1 row affected (0,00 sec) +mysql> INSERT INTO test (`int_id`, `float`) VALUES (1,2); -mysql> select * from test; -+--------+--------------+-------+----------------+ -| int_id | int_nullable | float | float_nullable | -+--------+--------------+-------+----------------+ -| 1 | NULL | 2 | NULL | -+--------+--------------+-------+----------------+ -1 row in set (0,00 sec) +mysql> SELECT * FROM test; ++--------+-------+ +| int_id | float | ++--------+-------+ +| 1 | 2 | ++--------+-------+ ``` Получение данных в ClickHouse: ``` sql -SELECT * FROM mysql('localhost:3306', 'test', 'test', 'bayonet', '123') +SELECT * FROM mysql('localhost:3306', 'test', 'test', 'bayonet', '123'); ``` ``` text -┌─int_id─┬─int_nullable─┬─float─┬─float_nullable─┐ -│ 1 │ ᴺᵁᴸᴸ │ 2 │ ᴺᵁᴸᴸ │ -└────────┴──────────────┴───────┴────────────────┘ +┌─int_id─┬─float─┐ +│ 1 │ 2 │ +└────────┴───────┘ ``` -## Смотрите также {#smotrite-takzhe} +Замена и вставка: + +```sql +INSERT INTO FUNCTION mysql('localhost:3306', 'test', 'test', 'bayonet', '123', 1) (int_id, float) VALUES (1, 3); +INSERT INTO TABLE FUNCTION mysql('localhost:3306', 'test', 'test', 'bayonet', '123', 0, 'UPDATE int_id = int_id + 1') (int_id, float) VALUES (1, 4); +SELECT * FROM mysql('localhost:3306', 'test', 'test', 'bayonet', '123'); +``` + +``` text +┌─int_id─┬─float─┐ +│ 1 │ 3 │ +│ 2 │ 4 │ +└────────┴───────┘ +``` + +**Смотрите также** - [Движок таблиц ‘MySQL’](../../sql-reference/table-functions/mysql.md) - [Использование MySQL как источника данных для внешнего словаря](../../sql-reference/table-functions/mysql.md#dicts-external_dicts_dict_sources-mysql) -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/table_functions/mysql/) +[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/table_functions/mysql/) From c285dafb5d2f6655fdf62febd0c9177f0bee5c1e Mon Sep 17 00:00:00 2001 From: Evgeniia 
Sudarikova Date: Fri, 5 Feb 2021 22:20:07 +0300 Subject: [PATCH 0153/2357] edited brown benchmark --- docs/en/getting-started/example-datasets/brown-benchmark.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/en/getting-started/example-datasets/brown-benchmark.md b/docs/en/getting-started/example-datasets/brown-benchmark.md index effae6d5adb..c9b74a84a54 100644 --- a/docs/en/getting-started/example-datasets/brown-benchmark.md +++ b/docs/en/getting-started/example-datasets/brown-benchmark.md @@ -412,3 +412,5 @@ ORDER BY yr, ``` The data is also available for interactive queries in the [Playground](https://gh-api.clickhouse.tech/play?user=play), [example](https://gh-api.clickhouse.tech/play?user=play#U0VMRUNUIG1hY2hpbmVfbmFtZSwKICAgICAgIE1JTihjcHUpIEFTIGNwdV9taW4sCiAgICAgICBNQVgoY3B1KSBBUyBjcHVfbWF4LAogICAgICAgQVZHKGNwdSkgQVMgY3B1X2F2ZywKICAgICAgIE1JTihuZXRfaW4pIEFTIG5ldF9pbl9taW4sCiAgICAgICBNQVgobmV0X2luKSBBUyBuZXRfaW5fbWF4LAogICAgICAgQVZHKG5ldF9pbikgQVMgbmV0X2luX2F2ZywKICAgICAgIE1JTihuZXRfb3V0KSBBUyBuZXRfb3V0X21pbiwKICAgICAgIE1BWChuZXRfb3V0KSBBUyBuZXRfb3V0X21heCwKICAgICAgIEFWRyhuZXRfb3V0KSBBUyBuZXRfb3V0X2F2ZwpGUk9NICgKICBTRUxFQ1QgbWFjaGluZV9uYW1lLAogICAgICAgICBDT0FMRVNDRShjcHVfdXNlciwgMC4wKSBBUyBjcHUsCiAgICAgICAgIENPQUxFU0NFKGJ5dGVzX2luLCAwLjApIEFTIG5ldF9pbiwKICAgICAgICAgQ09BTEVTQ0UoYnl0ZXNfb3V0LCAwLjApIEFTIG5ldF9vdXQKICBGUk9NIG1nYmVuY2gubG9nczEKICBXSEVSRSBtYWNoaW5lX25hbWUgSU4gKCdhbmFuc2knLCdhcmFnb2cnLCd1cmQnKQogICAgQU5EIGxvZ190aW1lID49IFRJTUVTVEFNUCAnMjAxNy0wMS0xMSAwMDowMDowMCcKKSBBUyByCkdST1VQIEJZIG1hY2hpbmVfbmFtZQ==). + +[Original article](https://clickhouse.tech/docs/en/getting_started/example_datasets/brown-benchmark/) From 44714c3fa895d0b827f771e0e3b9fcd876651d81 Mon Sep 17 00:00:00 2001 From: Evgeniia Sudarikova Date: Fri, 5 Feb 2021 22:34:26 +0300 Subject: [PATCH 0154/2357] edited RU brown benchmark --- docs/ru/getting-started/example-datasets/brown-benchmark.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/getting-started/example-datasets/brown-benchmark.md b/docs/ru/getting-started/example-datasets/brown-benchmark.md index b3f2285093a..e4fe00ace93 100644 --- a/docs/ru/getting-started/example-datasets/brown-benchmark.md +++ b/docs/ru/getting-started/example-datasets/brown-benchmark.md @@ -5,7 +5,7 @@ toc_title: Brown University Benchmark # Brown University Benchmark -`MgBench` — это новый аналитический бенчмарк для сгенерированного журнала событий, разработанный [Andrew Crotty](http://cs.brown.edu/people/acrotty/). +`MgBench` — это новый аналитический бенчмарк для данных журнала событий, сгенерированных машиной. Бенчмарк разработан [Andrew Crotty](http://cs.brown.edu/people/acrotty/). 
Скачать данные: ``` From 0704d3cf27239ec0aa07ee88f256ccc40b891b7e Mon Sep 17 00:00:00 2001 From: Pavel Kruglov Date: Sat, 6 Feb 2021 03:54:27 +0300 Subject: [PATCH 0155/2357] Refactor --- src/Client/ConnectionPoolWithFailover.cpp | 10 +- src/Client/ConnectionPoolWithFailover.h | 6 +- src/Client/GetHedgedConnections.cpp | 491 ------------------ src/Client/GetHedgedConnections.h | 173 ------ src/Client/HedgedConnections.cpp | 300 +++++++---- src/Client/HedgedConnections.h | 85 ++- src/Client/HedgedConnectionsFactory.cpp | 475 +++++++++++++++++ src/Client/HedgedConnectionsFactory.h | 167 ++++++ src/Client/ya.make | 2 +- src/Common/Epoll.cpp | 24 +- src/Common/Epoll.h | 12 +- src/Common/TimerDescriptor.h | 12 - .../RemoteQueryExecutorReadContext.cpp | 15 +- src/IO/ConnectionTimeouts.h | 18 +- src/IO/ReadBufferFromPocoSocket.cpp | 8 +- src/IO/ReadBufferFromPocoSocket.h | 1 + 16 files changed, 938 insertions(+), 861 deletions(-) delete mode 100644 src/Client/GetHedgedConnections.cpp delete mode 100644 src/Client/GetHedgedConnections.h create mode 100644 src/Client/HedgedConnectionsFactory.cpp create mode 100644 src/Client/HedgedConnectionsFactory.h diff --git a/src/Client/ConnectionPoolWithFailover.cpp b/src/Client/ConnectionPoolWithFailover.cpp index 3e41c26fb65..15344b3b18b 100644 --- a/src/Client/ConnectionPoolWithFailover.cpp +++ b/src/Client/ConnectionPoolWithFailover.cpp @@ -329,7 +329,7 @@ std::vector ConnectionPoolWithFa return Base::getShuffledPools(max_ignored_errors, get_priority); } -TryGetConnection::TryGetConnection( +ConnectionEstablisher::ConnectionEstablisher( IConnectionPool * pool_, const ConnectionTimeouts * timeouts_, const Settings * settings_, @@ -340,7 +340,7 @@ TryGetConnection::TryGetConnection( { } -void TryGetConnection::reset() +void ConnectionEstablisher::reset() { resetResult(); stage = Stage::CONNECT; @@ -349,7 +349,7 @@ void TryGetConnection::reset() fail_message.clear(); } -void TryGetConnection::resetResult() +void ConnectionEstablisher::resetResult() { if (!result.entry.isNull()) { @@ -358,7 +358,7 @@ void TryGetConnection::resetResult() } } -void TryGetConnection::processFail(bool add_description) +void ConnectionEstablisher::processFail(bool add_description) { if (action_before_disconnect) action_before_disconnect(socket_fd); @@ -371,7 +371,7 @@ void TryGetConnection::processFail(bool add_description) stage = Stage::FAILED; } -void TryGetConnection::run() +void ConnectionEstablisher::run() { try { diff --git a/src/Client/ConnectionPoolWithFailover.h b/src/Client/ConnectionPoolWithFailover.h index a6c0b9e8070..44b06e871ec 100644 --- a/src/Client/ConnectionPoolWithFailover.h +++ b/src/Client/ConnectionPoolWithFailover.h @@ -31,8 +31,8 @@ enum class PoolMode GET_ALL }; -/// Class for establishing connection with replica without blocking. -class TryGetConnection +/// Class for establishing connection with replica without blocking using different stages. 
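+/// Each call to run() advances the current stage (connect, receive hello, receive tables status);
+/// while the stage is not yet FINISHED or FAILED, the caller registers socket_fd in epoll
+/// and calls run() again once the socket becomes readable.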
+class ConnectionEstablisher { public: enum Stage @@ -47,7 +47,7 @@ public: using TryResult = PoolWithFailoverBase::TryResult; - TryGetConnection(IConnectionPool * pool_, + ConnectionEstablisher(IConnectionPool * pool_, const ConnectionTimeouts * timeouts_, const Settings * settings_, const QualifiedTableName * table_to_check = nullptr, diff --git a/src/Client/GetHedgedConnections.cpp b/src/Client/GetHedgedConnections.cpp deleted file mode 100644 index 093b4bc930c..00000000000 --- a/src/Client/GetHedgedConnections.cpp +++ /dev/null @@ -1,491 +0,0 @@ -#if defined(OS_LINUX) - -#include -#include - - -namespace DB -{ -namespace ErrorCodes -{ - extern const int BAD_ARGUMENTS; - extern const int LOGICAL_ERROR; - extern const int ALL_CONNECTION_TRIES_FAILED; - extern const int ALL_REPLICAS_ARE_STALE; -} - -GetHedgedConnections::GetHedgedConnections( - const ConnectionPoolWithFailoverPtr & pool_, - const Settings * settings_, - const ConnectionTimeouts & timeouts_, - std::shared_ptr table_to_check_) - : pool(pool_), settings(settings_), timeouts(timeouts_), table_to_check(table_to_check_), log(&Poco::Logger::get("GetHedgedConnections")) -{ - shuffled_pools = pool->getShuffledPools(settings); - for (size_t i = 0; i != shuffled_pools.size(); ++i) - try_get_connections.emplace_back(shuffled_pools[i].pool, &timeouts, settings, table_to_check.get(), log); - - max_tries - = (settings ? size_t{settings->connections_with_failover_max_tries} : size_t{DBMS_CONNECTION_POOL_WITH_FAILOVER_DEFAULT_MAX_TRIES}); - - fallback_to_stale_replicas = settings ? settings->fallback_to_stale_replicas_for_distributed_queries : false; - entries_count = 0; - usable_count = 0; - failed_pools_count = 0; -} - -GetHedgedConnections::~GetHedgedConnections() -{ - pool->updateSharedError(shuffled_pools); -} - -std::vector GetHedgedConnections::getManyConnections(PoolMode pool_mode) -{ - size_t min_entries = (settings && settings->skip_unavailable_shards) ? 0 : 1; - - size_t max_entries; - if (pool_mode == PoolMode::GET_ALL) - { - min_entries = shuffled_pools.size(); - max_entries = shuffled_pools.size(); - } - else if (pool_mode == PoolMode::GET_ONE) - max_entries = 1; - else if (pool_mode == PoolMode::GET_MANY) - max_entries = settings ? size_t(settings->max_parallel_replicas) : 1; - else - throw DB::Exception("Unknown pool allocation mode", DB::ErrorCodes::LOGICAL_ERROR); - - std::vector replicas; - replicas.reserve(max_entries); - for (size_t i = 0; i != max_entries; ++i) - { - auto replica = getNextConnection(false); - if (replica->isCannotChoose()) - { - if (replicas.size() >= min_entries) - break; - - /// Determine the reason of not enough replicas. - if (!fallback_to_stale_replicas && usable_count >= min_entries) - throw DB::Exception( - "Could not find enough connections to up-to-date replicas. Got: " + std::to_string(replicas.size()) - + ", needed: " + std::to_string(min_entries), - DB::ErrorCodes::ALL_REPLICAS_ARE_STALE); - - throw DB::NetException( - "All connection tries failed. Log: \n\n" + fail_messages + "\n", - DB::ErrorCodes::ALL_CONNECTION_TRIES_FAILED); - } - - replicas.push_back(replica); - } - - return replicas; -} - -GetHedgedConnections::ReplicaStatePtr GetHedgedConnections::getNextConnection(bool non_blocking) -{ - ReplicaStatePtr replica = createNewReplica(); - int index; - - /// Check if it's the first time. 
- if (epoll.empty() && ready_indexes.empty()) - { - index = 0; - last_used_index = 0; - } - else - index = getNextIndex(); - - bool is_first = true; - - while (index != -1 || !epoll.empty()) - { - /// Prevent blocking after receiving timeout when there is no new replica to connect - /// (processEpollEvents can return EMPTY replica after timeout processing to start new connection). - if (index == -1 && !is_first && non_blocking) - { - replica->state = State::NOT_READY; - return replica; - } - - if (is_first) - is_first = false; - - if (index != -1) - { - Action action = startTryGetConnection(index, replica); - - if (action == Action::FINISH) - return replica; - - if (action == Action::TRY_NEXT_REPLICA) - { - index = getNextIndex(); - continue; - } - - if (action == Action::PROCESS_EPOLL_EVENTS && non_blocking) - return replica; - } - - replica = processEpollEvents(non_blocking); - if (replica->isReady() || (replica->isNotReady() && non_blocking)) - return replica; - - index = getNextIndex(); - } - - /// We reach this point only if there was no free up to date replica. - /// We will try to use usable replica. - - /// Check if we are not allowed to use usable replicas or there is no even a free usable replica. - if (!fallback_to_stale_replicas || !canGetNewConnection()) - { - replica->state = State::CANNOT_CHOOSE; - return replica; - } - - setBestUsableReplica(replica); - return replica; -} - -void GetHedgedConnections::stopChoosingReplicas() -{ - for (auto & [fd, replica] : fd_to_replica) - { - removeTimeoutsFromReplica(replica, epoll, timeout_fd_to_replica); - epoll.remove(fd); - try_get_connections[replica->index].reset(); - replica->reset(); - } - - fd_to_replica.clear(); -} - -int GetHedgedConnections::getNextIndex() -{ - /// Check if there is no free replica. - if (entries_count + indexes_in_process.size() + failed_pools_count >= shuffled_pools.size()) - return -1; - - bool finish = false; - int next_index = last_used_index; - while (!finish) - { - next_index = (next_index + 1) % shuffled_pools.size(); - - /// Check if we can try this replica. - if (indexes_in_process.find(next_index) == indexes_in_process.end() && (max_tries == 0 || shuffled_pools[next_index].error_count < max_tries) - && try_get_connections[next_index].stage != TryGetConnection::Stage::FINISHED) - finish = true; - - /// If we made a complete round, there is no replica to connect. 
- else if (next_index == last_used_index) - return -1; - } - - last_used_index = next_index; - return next_index; -} - -GetHedgedConnections::Action GetHedgedConnections::startTryGetConnection(int index, ReplicaStatePtr & replica) -{ - TryGetConnection & try_get_connection = try_get_connections[index]; - - replica->state = State::NOT_READY; - replica->index = index; - indexes_in_process.insert(index); - - try_get_connection.reset(); - try_get_connection.run(); - - if (try_get_connection.stage != TryGetConnection::Stage::FAILED) - { - replica->fd = try_get_connection.socket_fd; - replica->connection = &*try_get_connection.result.entry; - } - - Action action = processTryGetConnectionStage(replica); - - if (action == Action::PROCESS_EPOLL_EVENTS) - { - epoll.add(try_get_connection.socket_fd); - fd_to_replica[try_get_connection.socket_fd] = replica; - try_get_connection.setActionBeforeDisconnect( - [&](int fd) - { - epoll.remove(fd); - fd_to_replica.erase(fd); - }); - addTimeouts(replica); - } - - return action; -} - -GetHedgedConnections::Action -GetHedgedConnections::processTryGetConnectionStage(ReplicaStatePtr & replica, bool remove_from_epoll) -{ - TryGetConnection & try_get_connection = try_get_connections[replica->index]; - - if (try_get_connection.stage == TryGetConnection::Stage::FINISHED) - { - indexes_in_process.erase(replica->index); - ++entries_count; - - if (remove_from_epoll) - { - epoll.remove(try_get_connection.socket_fd); - fd_to_replica.erase(try_get_connection.socket_fd); - } - - if (try_get_connection.result.is_usable) - { - ++usable_count; - if (try_get_connection.result.is_up_to_date) - { - replica->state = State::READY; - ready_indexes.insert(replica->index); - return Action::FINISH; - } - } - - /// This replica is not up to date, we will try to find up to date. - replica->reset(); - return Action::TRY_NEXT_REPLICA; - } - else if (try_get_connection.stage == TryGetConnection::Stage::FAILED) - { - processFailedConnection(replica); - return Action::TRY_NEXT_REPLICA; - } - - /// Get connection process is not finished. 
- return Action::PROCESS_EPOLL_EVENTS; -} - -void GetHedgedConnections::processFailedConnection(ReplicaStatePtr & replica) -{ - ShuffledPool & shuffled_pool = shuffled_pools[replica->index]; - LOG_WARNING( - log, "Connection failed at try №{}, reason: {}", (shuffled_pool.error_count + 1), try_get_connections[replica->index].fail_message); - ProfileEvents::increment(ProfileEvents::DistributedConnectionFailTry); - - shuffled_pool.error_count = std::min(pool->getMaxErrorCup(), shuffled_pool.error_count + 1); - - if (shuffled_pool.error_count >= max_tries) - { - ++failed_pools_count; - ProfileEvents::increment(ProfileEvents::DistributedConnectionFailAtAll); - } - - std::string & fail_message = try_get_connections[replica->index].fail_message; - if (!fail_message.empty()) - fail_messages += fail_message + "\n"; - - indexes_in_process.erase(replica->index); - replica->reset(); -} - -void GetHedgedConnections::addTimeouts(ReplicaStatePtr & replica) -{ - addTimeoutToReplica(TimerTypes::RECEIVE_TIMEOUT, replica, epoll, timeout_fd_to_replica, timeouts); - - auto stage = try_get_connections[replica->index].stage; - if (stage == TryGetConnection::Stage::RECEIVE_HELLO) - addTimeoutToReplica(TimerTypes::RECEIVE_HELLO_TIMEOUT, replica, epoll, timeout_fd_to_replica, timeouts); - else if (stage == TryGetConnection::Stage::RECEIVE_TABLES_STATUS) - addTimeoutToReplica(TimerTypes::RECEIVE_TABLES_STATUS_TIMEOUT, replica, epoll, timeout_fd_to_replica, timeouts); -} - -GetHedgedConnections::ReplicaStatePtr GetHedgedConnections::processEpollEvents(bool non_blocking) -{ - int event_fd; - ReplicaStatePtr replica = nullptr; - bool finish = false; - while (!finish) - { - event_fd = getReadyFileDescriptor(); - - if (fd_to_replica.find(event_fd) != fd_to_replica.end()) - { - replica = fd_to_replica[event_fd]; - finish = processReplicaEvent(replica, non_blocking); - } - else if (timeout_fd_to_replica.find(event_fd) != timeout_fd_to_replica.end()) - { - replica = timeout_fd_to_replica[event_fd]; - finish = processTimeoutEvent(replica, replica->active_timeouts[event_fd], non_blocking); - } - else - throw Exception("Unknown event from epoll", ErrorCodes::LOGICAL_ERROR); - } - - return replica; -} - -int GetHedgedConnections::getReadyFileDescriptor(AsyncCallback async_callback) -{ - for (auto & [fd, replica] : fd_to_replica) - if (replica->connection->hasReadPendingData()) - return replica->fd; - - return epoll.getReady(std::move(async_callback)).data.fd; -} - -bool GetHedgedConnections::processReplicaEvent(ReplicaStatePtr & replica, bool non_blocking) -{ - removeTimeoutsFromReplica(replica, epoll, timeout_fd_to_replica); - try_get_connections[replica->index].run(); - Action action = processTryGetConnectionStage(replica, true); - if (action == Action::PROCESS_EPOLL_EVENTS) - { - addTimeouts(replica); - return non_blocking; - } - - return true; -} - -bool GetHedgedConnections::processTimeoutEvent(ReplicaStatePtr & replica, TimerDescriptorPtr timeout_descriptor, bool non_blocking) -{ - epoll.remove(timeout_descriptor->getDescriptor()); - replica->active_timeouts.erase(timeout_descriptor->getDescriptor()); - timeout_fd_to_replica[timeout_descriptor->getDescriptor()]; - - if (timeout_descriptor->getType() == TimerTypes::RECEIVE_TIMEOUT) - { - removeTimeoutsFromReplica(replica, epoll, timeout_fd_to_replica); - epoll.remove(replica->fd); - fd_to_replica.erase(replica->fd); - - TryGetConnection & try_get_connection = try_get_connections[replica->index]; - try_get_connection.fail_message = "Receive timeout expired (" + 
try_get_connection.result.entry->getDescription() + ")"; - try_get_connection.resetResult(); - try_get_connection.stage = TryGetConnection::Stage::FAILED; - processFailedConnection(replica); - - return true; - } - else if ((timeout_descriptor->getType() == TimerTypes::RECEIVE_HELLO_TIMEOUT - || timeout_descriptor->getType() == TimerTypes::RECEIVE_TABLES_STATUS_TIMEOUT) - && entries_count + indexes_in_process.size() + failed_pools_count < shuffled_pools.size()) - { - replica = createNewReplica(); - return true; - } - - return non_blocking; -} - -void GetHedgedConnections::setBestUsableReplica(ReplicaStatePtr & replica) -{ - std::vector indexes(try_get_connections.size()); - for (size_t i = 0; i != indexes.size(); ++i) - indexes[i] = i; - - /// Remove unusable, failed replicas and replicas that are ready or in process. - indexes.erase( - std::remove_if( - indexes.begin(), - indexes.end(), - [&](int i) - { - return try_get_connections[i].result.entry.isNull() || !try_get_connections[i].result.is_usable || - indexes_in_process.find(i) != indexes_in_process.end() || ready_indexes.find(i) != ready_indexes.end(); - }), - indexes.end()); - - if (indexes.empty()) - { - replica->state = State::CANNOT_CHOOSE; - return; - } - - /// Sort replicas by staleness. - std::stable_sort( - indexes.begin(), - indexes.end(), - [&](size_t lhs, size_t rhs) - { - return try_get_connections[lhs].result.staleness < try_get_connections[rhs].result.staleness; - }); - - replica->index = indexes[0]; - replica->connection = &*try_get_connections[indexes[0]].result.entry; - replica->state = State::READY; - replica->fd = replica->connection->getSocket()->impl()->sockfd(); - ready_indexes.insert(replica->index); -} - -void addTimeoutToReplica( - int type, - GetHedgedConnections::ReplicaStatePtr & replica, - Epoll & epoll, - std::unordered_map & timeout_fd_to_replica, - const ConnectionTimeouts & timeouts) -{ - Poco::Timespan timeout; - switch (type) - { - case TimerTypes::RECEIVE_HELLO_TIMEOUT: - timeout = timeouts.receive_hello_timeout; - break; - case TimerTypes::RECEIVE_TABLES_STATUS_TIMEOUT: - timeout = timeouts.receive_tables_status_timeout; - break; - case TimerTypes::RECEIVE_DATA_TIMEOUT: - timeout = timeouts.receive_data_timeout; - break; - case TimerTypes::RECEIVE_TIMEOUT: - timeout = timeouts.receive_timeout; - break; - default: - throw Exception("Unknown timeout type", ErrorCodes::BAD_ARGUMENTS); - } - - TimerDescriptorPtr timeout_descriptor = std::make_shared(); - timeout_descriptor->setType(type); - timeout_descriptor->setRelative(timeout); - epoll.add(timeout_descriptor->getDescriptor()); - timeout_fd_to_replica[timeout_descriptor->getDescriptor()] = replica; - replica->active_timeouts[timeout_descriptor->getDescriptor()] = std::move(timeout_descriptor); -} - -void removeTimeoutsFromReplica( - GetHedgedConnections::ReplicaStatePtr & replica, - Epoll & epoll, - std::unordered_map & timeout_fd_to_replica) -{ - for (auto & [fd, _] : replica->active_timeouts) - { - epoll.remove(fd); - timeout_fd_to_replica.erase(fd); - } - replica->active_timeouts.clear(); -} - -void removeTimeoutFromReplica( - int type, - GetHedgedConnections::ReplicaStatePtr & replica, - Epoll & epoll, - std::unordered_map & timeout_fd_to_replica) -{ - auto it = std::find_if( - replica->active_timeouts.begin(), - replica->active_timeouts.end(), - [type](auto & value){ return value.second->getType() == type; } - ); - - if (it != replica->active_timeouts.end()) - { - epoll.remove(it->first); - timeout_fd_to_replica.erase(it->first); - 
replica->active_timeouts.erase(it); - } -} - -} -#endif diff --git a/src/Client/GetHedgedConnections.h b/src/Client/GetHedgedConnections.h deleted file mode 100644 index 8638367e184..00000000000 --- a/src/Client/GetHedgedConnections.h +++ /dev/null @@ -1,173 +0,0 @@ -#pragma once - -#if defined(OS_LINUX) - -#include -#include -#include -#include -#include -#include - -namespace DB -{ - -using TimerDescriptorPtr = std::shared_ptr; - -/// Class for establishing hedged connections with replicas. -/// It works with multiple replicas simultaneously without blocking by using epoll. -class GetHedgedConnections -{ -public: - using ShuffledPool = ConnectionPoolWithFailover::Base::ShuffledPool; - - enum State - { - EMPTY = 0, - READY = 1, - NOT_READY = 2, - CANNOT_CHOOSE = 3, - }; - - struct ReplicaState - { - Connection * connection = nullptr; - State state = State::EMPTY; - int index = -1; - int fd = -1; - size_t parallel_replica_offset = 0; - std::unordered_map> active_timeouts; - - void reset() - { - connection = nullptr; - state = State::EMPTY; - index = -1; - fd = -1; - parallel_replica_offset = 0; - active_timeouts.clear(); - } - - bool isReady() const { return state == State::READY; } - bool isNotReady() const { return state == State::NOT_READY; } - bool isEmpty() const { return state == State::EMPTY; } - bool isCannotChoose() const { return state == State::CANNOT_CHOOSE; } - }; - - using ReplicaStatePtr = std::shared_ptr; - - GetHedgedConnections(const ConnectionPoolWithFailoverPtr & pool_, - const Settings * settings_, - const ConnectionTimeouts & timeouts_, - std::shared_ptr table_to_check_ = nullptr); - - /// Create and return connections according to pool_mode. - std::vector getManyConnections(PoolMode pool_mode); - - /// Try to establish connection to the new replica. If non_blocking is false, this function will block - /// until establishing connection to the new replica (returned replica state might be READY or CANNOT_CHOOSE). - /// If non_blocking is true, this function will try to establish connection to the new replica without blocking - /// (returned replica state might be READY, NOT_READY and CANNOT_CHOOSE). - ReplicaStatePtr getNextConnection(bool non_blocking); - - /// Check if we can try to produce new READY replica. - bool canGetNewConnection() const { return ready_indexes.size() + failed_pools_count < shuffled_pools.size(); } - - /// Stop working with all replicas that are not READY. - void stopChoosingReplicas(); - - bool hasEventsInProcess() const { return epoll.size() > 0; } - - int getFileDescriptor() const { return epoll.getFileDescriptor(); } - - const ConnectionTimeouts & getConnectionTimeouts() const { return timeouts; } - - ~GetHedgedConnections(); - -private: - - enum Action - { - FINISH = 0, - PROCESS_EPOLL_EVENTS = 1, - TRY_NEXT_REPLICA = 2, - }; - - Action startTryGetConnection(int index, ReplicaStatePtr & replica); - - Action processTryGetConnectionStage(ReplicaStatePtr & replica, bool remove_from_epoll = false); - - /// Find an index of the next free replica to start connection. - /// Return -1 if there is no free replica. 
- int getNextIndex(); - - int getReadyFileDescriptor(AsyncCallback async_callback = {}); - - void addTimeouts(ReplicaStatePtr & replica); - - void processFailedConnection(ReplicaStatePtr & replica); - - void processReceiveTimeout(ReplicaStatePtr & replica); - - bool processReplicaEvent(ReplicaStatePtr & replica, bool non_blocking); - - bool processTimeoutEvent(ReplicaStatePtr & replica, TimerDescriptorPtr timeout_descriptor, bool non_blocking); - - ReplicaStatePtr processEpollEvents(bool non_blocking = false); - - void setBestUsableReplica(ReplicaStatePtr & replica); - - ReplicaStatePtr createNewReplica() { return std::make_shared(); } - - const ConnectionPoolWithFailoverPtr pool; - const Settings * settings; - const ConnectionTimeouts timeouts; - std::shared_ptr table_to_check; - - std::vector try_get_connections; - std::vector shuffled_pools; - - /// Map socket file descriptor to replica. - std::unordered_map fd_to_replica; - /// Map timeout file descriptor to replica. - std::unordered_map timeout_fd_to_replica; - - /// Indexes of replicas, that are in process of connection. - std::unordered_set indexes_in_process; - /// Indexes of ready replicas. - std::unordered_set ready_indexes; - - int last_used_index; - bool fallback_to_stale_replicas; - Epoll epoll; - Poco::Logger * log; - std::string fail_messages; - size_t entries_count; - size_t usable_count; - size_t failed_pools_count; - size_t max_tries; -}; - -/// Add timeout with particular type to replica and add it to epoll. -void addTimeoutToReplica( - int type, - GetHedgedConnections::ReplicaStatePtr & replica, - Epoll & epoll, - std::unordered_map & timeout_fd_to_replica, - const ConnectionTimeouts & timeouts); - -/// Remove timeout with particular type from replica and epoll. -void removeTimeoutFromReplica( - int type, - GetHedgedConnections::ReplicaStatePtr & replica, - Epoll & epoll, - std::unordered_map & timeout_fd_to_replica); - -/// Remove all timeouts from replica and epoll. 
-void removeTimeoutsFromReplica( - GetHedgedConnections::ReplicaStatePtr & replica, - Epoll & epoll, - std::unordered_map & timeout_fd_to_replica); - -} -#endif diff --git a/src/Client/HedgedConnections.cpp b/src/Client/HedgedConnections.cpp index f4810a7d79c..a6ffc3cbd1d 100644 --- a/src/Client/HedgedConnections.cpp +++ b/src/Client/HedgedConnections.cpp @@ -10,6 +10,7 @@ namespace ErrorCodes extern const int MISMATCH_REPLICAS_DATA_SOURCES; extern const int LOGICAL_ERROR; extern const int SOCKET_TIMEOUT; + extern const int ALL_CONNECTION_TRIES_FAILED; } HedgedConnections::HedgedConnections( @@ -19,29 +20,35 @@ HedgedConnections::HedgedConnections( const ThrottlerPtr & throttler_, PoolMode pool_mode, std::shared_ptr table_to_check_) - : get_hedged_connections(pool_, &settings_, timeouts_, table_to_check_), settings(settings_), throttler(throttler_), log(&Poco::Logger::get("HedgedConnections")) + : hedged_connections_factory(pool_, &settings_, timeouts_, table_to_check_) + , settings(settings_) + , throttler(throttler_) + , log(&Poco::Logger::get("HedgedConnections")) { - std::vector replicas_states = get_hedged_connections.getManyConnections(pool_mode); + std::vector connections = hedged_connections_factory.getManyConnections(pool_mode); - for (size_t i = 0; i != replicas_states.size(); ++i) + ReplicaState replica; + for (size_t i = 0; i != connections.size(); ++i) { - replicas_states[i]->parallel_replica_offset = i; - replicas_states[i]->connection->setThrottler(throttler_); - epoll.add(replicas_states[i]->fd); - fd_to_replica[replicas_states[i]->fd] = replicas_states[i]; - replicas.push_back({std::move(replicas_states[i])}); - active_connections_count_by_offset[i] = 1; + replica.connection = connections[i]; + replica.connection->setThrottler(throttler_); + int socket_fd = replica.connection->getSocket()->impl()->sockfd(); + epoll.add(socket_fd); + fd_to_replica_location[socket_fd] = ReplicaLocation{i, 0}; + offset_states.push_back(OffsetState{{replica}, 1, false}); } - pipeline_for_new_replicas.add([throttler_](ReplicaStatePtr & replica_){ replica_->connection->setThrottler(throttler_); }); + active_connection_count = connections.size(); + offsets_with_received_first_data_packet = 0; + pipeline_for_new_replicas.add([throttler_](ReplicaState & replica_) { replica_.connection->setThrottler(throttler_); }); } -void HedgedConnections::Pipeline::add(std::function send_function) +void HedgedConnections::Pipeline::add(std::function send_function) { pipeline.push_back(send_function); } -void HedgedConnections::Pipeline::run(ReplicaStatePtr & replica) +void HedgedConnections::Pipeline::run(ReplicaState & replica) { for (auto & send_func : pipeline) send_func(replica); @@ -54,11 +61,11 @@ void HedgedConnections::sendScalarsData(Scalars & data) if (!sent_query) throw Exception("Cannot send scalars data: query not yet sent.", ErrorCodes::LOGICAL_ERROR); - auto send_scalars_data = [&data](ReplicaStatePtr & replica) { replica->connection->sendScalarsData(data); }; + auto send_scalars_data = [&data](ReplicaState & replica) { replica.connection->sendScalarsData(data); }; - for (auto & replicas_with_same_offset : replicas) - for (auto & replica : replicas_with_same_offset) - if (replica->isReady()) + for (auto & offset_state : offset_states) + for (auto & replica : offset_state.replicas) + if (replica.connection) send_scalars_data(replica); pipeline_for_new_replicas.add(send_scalars_data); @@ -74,11 +81,11 @@ void HedgedConnections::sendExternalTablesData(std::vector & if (data.size() != size()) throw 
Exception("Mismatch between replicas and data sources", ErrorCodes::MISMATCH_REPLICAS_DATA_SOURCES); - auto send_external_tables_data = [&data](ReplicaStatePtr & replica) { replica->connection->sendExternalTablesData(data[0]); }; + auto send_external_tables_data = [&data](ReplicaState & replica) { replica.connection->sendExternalTablesData(data[0]); }; - for (auto & replicas_with_same_offset : replicas) - for (auto & replica : replicas_with_same_offset) - if (replica->isReady()) + for (auto & offset_state : offset_states) + for (auto & replica : offset_state.replicas) + if (replica.connection) send_external_tables_data(replica); pipeline_for_new_replicas.add(send_external_tables_data); @@ -97,11 +104,11 @@ void HedgedConnections::sendQuery( if (sent_query) throw Exception("Query already sent.", ErrorCodes::LOGICAL_ERROR); - for (auto & replicas_with_same_offset : replicas) + for (auto & offset_state : offset_states) { - for (auto & replica : replicas_with_same_offset) + for (auto & replica : offset_state.replicas) { - if (replica->connection->getServerRevision(timeouts) < DBMS_MIN_REVISION_WITH_CURRENT_AGGREGATION_VARIANT_SELECTION_METHOD) + if (replica.connection->getServerRevision(timeouts) < DBMS_MIN_REVISION_WITH_CURRENT_AGGREGATION_VARIANT_SELECTION_METHOD) { disable_two_level_aggregation = true; break; @@ -111,30 +118,29 @@ void HedgedConnections::sendQuery( break; } - auto send_query = [this, timeouts, query, query_id, stage, client_info, with_pending_data](ReplicaStatePtr & replica) - { - Settings modified_settings = this->settings; + auto send_query = [this, timeouts, query, query_id, stage, client_info, with_pending_data](ReplicaState & replica) { + Settings modified_settings = settings; - if (this->disable_two_level_aggregation) + if (disable_two_level_aggregation) { /// Disable two-level aggregation due to version incompatibility. 
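             /// Setting both thresholds to 0 means the remote server never switches to two-level aggregation.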
modified_settings.group_by_two_level_threshold = 0; modified_settings.group_by_two_level_threshold_bytes = 0; } - if (this->replicas.size() > 1) + if (offset_states.size() > 1) { - modified_settings.parallel_replicas_count = this->replicas.size(); - modified_settings.parallel_replica_offset = replica->parallel_replica_offset; + modified_settings.parallel_replicas_count = offset_states.size(); + modified_settings.parallel_replica_offset = fd_to_replica_location[replica.connection->getSocket()->impl()->sockfd()].offset; } - replica->connection->sendQuery(timeouts, query, query_id, stage, &modified_settings, &client_info, with_pending_data); - addTimeoutToReplica(TimerTypes::RECEIVE_TIMEOUT, replica, this->epoll, this->timeout_fd_to_replica, timeouts); - addTimeoutToReplica(TimerTypes::RECEIVE_DATA_TIMEOUT, replica, this->epoll, this->timeout_fd_to_replica, timeouts); + replica.connection->sendQuery(timeouts, query, query_id, stage, &modified_settings, &client_info, with_pending_data); + addTimeoutToReplica(ConnectionTimeoutType::RECEIVE_TIMEOUT, replica); + addTimeoutToReplica(ConnectionTimeoutType::RECEIVE_DATA_TIMEOUT, replica); }; - for (auto & replicas_with_same_offset : replicas) - for (auto & replica : replicas_with_same_offset) + for (auto & offset_status : offset_states) + for (auto & replica : offset_status.replicas) send_query(replica); pipeline_for_new_replicas.add(send_query); @@ -145,16 +151,20 @@ void HedgedConnections::disconnect() { std::lock_guard lock(cancel_mutex); - for (auto & replicas_with_same_offset : replicas) - for (auto & replica : replicas_with_same_offset) - if (replica->isReady()) + for (auto & offset_status : offset_states) + for (auto & replica : offset_status.replicas) + if (replica.connection) finishProcessReplica(replica, true); - if (get_hedged_connections.hasEventsInProcess()) + if (hedged_connections_factory.hasEventsInProcess()) { - get_hedged_connections.stopChoosingReplicas(); if (next_replica_in_process) - epoll.remove(get_hedged_connections.getFileDescriptor()); + { + epoll.remove(hedged_connections_factory.getFileDescriptor()); + next_replica_in_process = false; + } + + hedged_connections_factory.stopChoosingReplicas(); } } @@ -165,13 +175,13 @@ std::string HedgedConnections::dumpAddresses() const std::string addresses; bool is_first = true; - for (const auto & replicas_with_same_offset : replicas) + for (const auto & offset_state : offset_states) { - for (const auto & replica : replicas_with_same_offset) + for (const auto & replica : offset_state.replicas) { - if (replica->isReady()) + if (replica.connection) { - addresses += (is_first ? "" : "; ") + replica->connection->getDescription(); + addresses += (is_first ? "" : "; ") + replica.connection->getDescription(); is_first = false; } } @@ -187,15 +197,14 @@ void HedgedConnections::sendCancel() if (!sent_query || cancelled) throw Exception("Cannot cancel. 
Either no query sent or already cancelled.", ErrorCodes::LOGICAL_ERROR); - for (auto & replicas_with_same_offset : replicas) - for (auto & replica : replicas_with_same_offset) - if (replica->isReady()) - replica->connection->sendCancel(); + for (auto & offset_status : offset_states) + for (auto & replica : offset_status.replicas) + if (replica.connection) + replica.connection->sendCancel(); cancelled = true; } - Packet HedgedConnections::drain() { std::lock_guard lock(cancel_mutex); @@ -252,26 +261,24 @@ Packet HedgedConnections::receivePacketUnlocked(AsyncCallback async_callback) Packet HedgedConnections::receivePacketImpl(AsyncCallback async_callback) { int event_fd; - ReplicaStatePtr replica = nullptr; Packet packet; bool finish = false; while (!finish) { event_fd = getReadyFileDescriptor(async_callback); - if (fd_to_replica.find(event_fd) != fd_to_replica.end()) + if (fd_to_replica_location.contains(event_fd)) { - replica = fd_to_replica[event_fd]; - packet = receivePacketFromReplica(replica, async_callback); + packet = receivePacketFromReplica(fd_to_replica_location[event_fd], async_callback); finish = true; } - else if (timeout_fd_to_replica.find(event_fd) != timeout_fd_to_replica.end()) + else if (timeout_fd_to_replica_location.contains(event_fd)) { - replica = timeout_fd_to_replica[event_fd]; - processTimeoutEvent(replica, replica->active_timeouts[event_fd]); + ReplicaLocation location = timeout_fd_to_replica_location[event_fd]; + processTimeoutEvent(location, offset_states[location.offset].replicas[location.index].active_timeouts[event_fd]); } - else if (event_fd == get_hedged_connections.getFileDescriptor()) - tryGetNewReplica(); + else if (event_fd == hedged_connections_factory.getFileDescriptor()) + tryGetNewReplica(false); else throw Exception("Unknown event from epoll", ErrorCodes::LOGICAL_ERROR); } @@ -281,30 +288,34 @@ Packet HedgedConnections::receivePacketImpl(AsyncCallback async_callback) int HedgedConnections::getReadyFileDescriptor(AsyncCallback async_callback) { - for (auto & [fd, replica] : fd_to_replica) - if (replica->connection->hasReadPendingData()) - return replica->fd; + for (auto & [fd, location] : fd_to_replica_location) + { + ReplicaState & replica = offset_states[location.offset].replicas[location.index]; + if (replica.connection->hasReadPendingData()) + return replica.connection->getSocket()->impl()->sockfd(); + } - return epoll.getReady(std::move(async_callback)).data.fd; + return epoll.getReady(true, std::move(async_callback)).data.fd; } -Packet HedgedConnections::receivePacketFromReplica(ReplicaStatePtr & replica, AsyncCallback async_callback) +Packet HedgedConnections::receivePacketFromReplica(ReplicaLocation & replica_location, AsyncCallback async_callback) { - Packet packet = replica->connection->receivePacket(std::move(async_callback)); + ReplicaState & replica = offset_states[replica_location.offset].replicas[replica_location.index]; + removeTimeoutFromReplica(ConnectionTimeoutType::RECEIVE_TIMEOUT, replica); + Packet packet = replica.connection->receivePacket(std::move(async_callback)); switch (packet.type) { case Protocol::Server::Data: - removeTimeoutsFromReplica(replica, epoll, timeout_fd_to_replica); - processReceiveData(replica); - addTimeoutToReplica(TimerTypes::RECEIVE_TIMEOUT, replica, epoll, timeout_fd_to_replica, get_hedged_connections.getConnectionTimeouts()); + if (!offset_states[replica_location.offset].first_packet_of_data_received) + processReceivedFirstDataPacket(replica_location); + 
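+            /// Re-arm the receive timeout after the packet so a replica that stops sending data is still detected.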
addTimeoutToReplica(ConnectionTimeoutType::RECEIVE_TIMEOUT, replica); break; case Protocol::Server::Progress: case Protocol::Server::ProfileInfo: case Protocol::Server::Totals: case Protocol::Server::Extremes: case Protocol::Server::Log: - removeTimeoutFromReplica(TimerTypes::RECEIVE_TIMEOUT, replica, epoll, timeout_fd_to_replica); - addTimeoutToReplica(TimerTypes::RECEIVE_TIMEOUT, replica, epoll, timeout_fd_to_replica, get_hedged_connections.getConnectionTimeouts()); + addTimeoutToReplica(ConnectionTimeoutType::RECEIVE_TIMEOUT, replica); break; case Protocol::Server::EndOfStream: @@ -320,96 +331,155 @@ Packet HedgedConnections::receivePacketFromReplica(ReplicaStatePtr & replica, As return packet; } -void HedgedConnections::processReceiveData(ReplicaStatePtr & replica) +void HedgedConnections::processReceivedFirstDataPacket(ReplicaLocation & replica_location) { /// When we receive first packet of data from replica, we stop working with replicas, that are /// responsible for the same offset. - offsets_with_received_data.insert(replica->parallel_replica_offset); + OffsetState & offset_state = offset_states[replica_location.offset]; + removeTimeoutFromReplica(ConnectionTimeoutType::RECEIVE_DATA_TIMEOUT, offset_state.replicas[replica_location.index]); + ++offsets_with_received_first_data_packet; + offset_state.first_packet_of_data_received = true; - for (auto & other_replica : replicas[replica->parallel_replica_offset]) + for (size_t i = 0; i != offset_state.replicas.size(); ++i) { - if (other_replica->isReady() && other_replica != replica) + if (i != replica_location.index && offset_state.replicas[i].connection) { - other_replica->connection->sendCancel(); - finishProcessReplica(other_replica, true); + offset_state.replicas[i].connection->sendCancel(); + finishProcessReplica(offset_state.replicas[i], true); } } /// If we received data from replicas with all offsets, we need to stop choosing new replicas. - if (get_hedged_connections.hasEventsInProcess() && offsets_with_received_data.size() == replicas.size()) + if (hedged_connections_factory.hasEventsInProcess() && offsets_with_received_first_data_packet == offset_states.size()) { - get_hedged_connections.stopChoosingReplicas(); if (next_replica_in_process) - epoll.remove(get_hedged_connections.getFileDescriptor()); + { + epoll.remove(hedged_connections_factory.getFileDescriptor()); + next_replica_in_process = false; + } + hedged_connections_factory.stopChoosingReplicas(); } } -void HedgedConnections::processTimeoutEvent(ReplicaStatePtr & replica, TimerDescriptorPtr timeout_descriptor) +void HedgedConnections::processTimeoutEvent(ReplicaLocation & replica_location, ConnectionTimeoutDescriptorPtr timeout_descriptor) { - epoll.remove(timeout_descriptor->getDescriptor()); - replica->active_timeouts.erase(timeout_descriptor->getDescriptor()); - timeout_fd_to_replica.erase(timeout_descriptor->getDescriptor()); + ReplicaState & replica = offset_states[replica_location.offset].replicas[replica_location.index]; + epoll.remove(timeout_descriptor->timer.getDescriptor()); + replica.active_timeouts.erase(timeout_descriptor->timer.getDescriptor()); + timeout_fd_to_replica_location.erase(timeout_descriptor->timer.getDescriptor()); - if (timeout_descriptor->getType() == TimerTypes::RECEIVE_TIMEOUT) + if (timeout_descriptor->type == ConnectionTimeoutType::RECEIVE_TIMEOUT) { - size_t offset = replica->parallel_replica_offset; finishProcessReplica(replica, true); - /// Check if there is no active connections with the same offset. 
- if (active_connections_count_by_offset[offset] == 0) + /// Check if there is no active connections with the same offset and there is no new replica in process. + if (offset_states[replica_location.offset].active_connection_count == 0 && !next_replica_in_process) throw NetException("Receive timeout expired", ErrorCodes::SOCKET_TIMEOUT); } - else if (timeout_descriptor->getType() == TimerTypes::RECEIVE_DATA_TIMEOUT) + else if (timeout_descriptor->type == ConnectionTimeoutType::RECEIVE_DATA_TIMEOUT) { - offsets_queue.push(replica->parallel_replica_offset); - tryGetNewReplica(); + offsets_queue.push(replica_location.offset); + tryGetNewReplica(true); } } -void HedgedConnections::tryGetNewReplica() +void HedgedConnections::tryGetNewReplica(bool start_new_connection) { - ReplicaStatePtr new_replica = get_hedged_connections.getNextConnection(/*non_blocking*/ true); + Connection * connection = nullptr; + HedgedConnectionsFactory::State state = hedged_connections_factory.getNextConnection(start_new_connection, connection); /// Skip replicas that doesn't support two-level aggregation if we didn't disable it in sendQuery. - while (new_replica->isReady() && !disable_two_level_aggregation - && new_replica->connection->getServerRevision(get_hedged_connections.getConnectionTimeouts()) < DBMS_MIN_REVISION_WITH_CURRENT_AGGREGATION_VARIANT_SELECTION_METHOD) - new_replica = get_hedged_connections.getNextConnection(/*non_blocking*/ true); + while (state == HedgedConnectionsFactory::State::READY && !disable_two_level_aggregation + && connection->getServerRevision(hedged_connections_factory.getConnectionTimeouts()) + < DBMS_MIN_REVISION_WITH_CURRENT_AGGREGATION_VARIANT_SELECTION_METHOD) + state = hedged_connections_factory.getNextConnection(true, connection); - if (new_replica->isReady()) + if (state == HedgedConnectionsFactory::State::READY) { - new_replica->parallel_replica_offset = offsets_queue.front(); + size_t offset = offsets_queue.front(); offsets_queue.pop(); - replicas[new_replica->parallel_replica_offset].push_back(new_replica); - epoll.add(new_replica->fd); - fd_to_replica[new_replica->fd] = new_replica; - ++active_connections_count_by_offset[new_replica->parallel_replica_offset]; - pipeline_for_new_replicas.run(new_replica); + size_t index = offset_states[offset].replicas.size(); + + ReplicaState replica; + replica.connection = connection; + int socket_fd = replica.connection->getSocket()->impl()->sockfd(); + epoll.add(socket_fd); + fd_to_replica_location[socket_fd] = ReplicaLocation{offset, index}; + offset_states[offset].replicas.push_back(replica); + ++offset_states[offset].active_connection_count; + ++active_connection_count; + pipeline_for_new_replicas.run(replica); } - else if (new_replica->isNotReady() && !next_replica_in_process) + else if (state == HedgedConnectionsFactory::State::NOT_READY && !next_replica_in_process) { - epoll.add(get_hedged_connections.getFileDescriptor()); + epoll.add(hedged_connections_factory.getFileDescriptor()); next_replica_in_process = true; } - if (next_replica_in_process && (new_replica->isCannotChoose() || offsets_queue.empty())) + /// Check if we cannot get new replica and there is no active replica with needed offsets. 
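+    /// In that case the query cannot be completed for those offsets, so we fail right away.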
+ else if (state == HedgedConnectionsFactory::State::CANNOT_CHOOSE) { - epoll.remove(get_hedged_connections.getFileDescriptor()); + while (!offsets_queue.empty()) + { + if (offset_states[offsets_queue.front()].active_connection_count == 0) + throw Exception("Cannot find enough connections to replicas", ErrorCodes::ALL_CONNECTION_TRIES_FAILED); + offsets_queue.pop(); + } + } + + /// Check if we don't need to listen hedged_connections_factory file descriptor in epoll anymore. + if (next_replica_in_process && (state == HedgedConnectionsFactory::State::CANNOT_CHOOSE || offsets_queue.empty())) + { + epoll.remove(hedged_connections_factory.getFileDescriptor()); next_replica_in_process = false; } } -void HedgedConnections::finishProcessReplica(ReplicaStatePtr & replica, bool disconnect) +void HedgedConnections::finishProcessReplica(ReplicaState & replica, bool disconnect) { - removeTimeoutsFromReplica(replica, epoll, timeout_fd_to_replica); - epoll.remove(replica->fd); - fd_to_replica.erase(replica->fd); - --active_connections_count_by_offset[replica->parallel_replica_offset]; - if (active_connections_count_by_offset[replica->parallel_replica_offset] == 0) - active_connections_count_by_offset.erase(replica->parallel_replica_offset); + removeTimeoutsFromReplica(replica); + int socket_fd = replica.connection->getSocket()->impl()->sockfd(); + epoll.remove(socket_fd); + --offset_states[fd_to_replica_location[socket_fd].offset].active_connection_count; + fd_to_replica_location.erase(socket_fd); + --active_connection_count; if (disconnect) - replica->connection->disconnect(); - replica->reset(); + replica.connection->disconnect(); + replica.connection = nullptr; +} + +void HedgedConnections::addTimeoutToReplica(ConnectionTimeoutType type, ReplicaState & replica) +{ + ConnectionTimeoutDescriptorPtr timeout_descriptor + = createConnectionTimeoutDescriptor(type, hedged_connections_factory.getConnectionTimeouts()); + epoll.add(timeout_descriptor->timer.getDescriptor()); + timeout_fd_to_replica_location[timeout_descriptor->timer.getDescriptor()] + = fd_to_replica_location[replica.connection->getSocket()->impl()->sockfd()]; + replica.active_timeouts[timeout_descriptor->timer.getDescriptor()] = std::move(timeout_descriptor); +} + +void HedgedConnections::removeTimeoutsFromReplica(ReplicaState & replica) +{ + for (auto & [fd, _] : replica.active_timeouts) + { + epoll.remove(fd); + timeout_fd_to_replica_location.erase(fd); + } + replica.active_timeouts.clear(); +} + +void HedgedConnections::removeTimeoutFromReplica(ConnectionTimeoutType type, ReplicaState & replica) +{ + auto it = std::find_if( + replica.active_timeouts.begin(), replica.active_timeouts.end(), [type](auto & value) { return value.second->type == type; }); + + if (it != replica.active_timeouts.end()) + { + epoll.remove(it->first); + timeout_fd_to_replica_location.erase(it->first); + replica.active_timeouts.erase(it); + } } } diff --git a/src/Client/HedgedConnections.h b/src/Client/HedgedConnections.h index 8081fa6739d..6931db9ede6 100644 --- a/src/Client/HedgedConnections.h +++ b/src/Client/HedgedConnections.h @@ -1,18 +1,41 @@ #pragma once #if defined(OS_LINUX) -#include -#include #include #include +#include +#include namespace DB { +/** To receive data from multiple replicas (connections) from one shard asynchronously, + * The principe of Hedged Connections is used to reduce tail latency: + * (if we don't receive data from replica for a long time, we try to get new replica + * and send query to it, without cancelling working with previous 
replica). This class + * supports all functionality that MultipleConnections has. + */ class HedgedConnections : public IConnections { public: - using ReplicaStatePtr = GetHedgedConnections::ReplicaStatePtr; + struct ReplicaState + { + Connection * connection = nullptr; + std::unordered_map active_timeouts; + }; + + struct ReplicaLocation + { + size_t offset; + size_t index; + }; + + struct OffsetState + { + std::vector replicas; + size_t active_connection_count; + bool first_packet_of_data_received; + }; HedgedConnections(const ConnectionPoolWithFailoverPtr & pool_, const Settings & settings_, @@ -45,57 +68,67 @@ public: std::string dumpAddresses() const override; - size_t size() const override { return replicas.size(); } + size_t size() const override { return offset_states.size(); } - bool hasActiveConnections() const override { return !active_connections_count_by_offset.empty(); } + bool hasActiveConnections() const override { return active_connection_count > 0; } private: /// We will save actions with replicas in pipeline to perform them on the new replicas. class Pipeline { public: - void add(std::function send_function); + void add(std::function send_function); - void run(ReplicaStatePtr & replica); + void run(ReplicaState & replica); private: - std::vector> pipeline; + std::vector> pipeline; }; - Packet receivePacketFromReplica(ReplicaStatePtr & replica, AsyncCallback async_callback = {}); + Packet receivePacketFromReplica(ReplicaLocation & replica_location, AsyncCallback async_callback = {}); Packet receivePacketImpl(AsyncCallback async_callback = {}); - void processReceiveData(ReplicaStatePtr & replica); + void processReceivedFirstDataPacket(ReplicaLocation & replica_location); - void processTimeoutEvent(ReplicaStatePtr & replica, TimerDescriptorPtr timeout_descriptor); + void processTimeoutEvent(ReplicaLocation & replica_location, ConnectionTimeoutDescriptorPtr timeout_descriptor); - void tryGetNewReplica(); + void tryGetNewReplica(bool start_new_connection); - void finishProcessReplica(ReplicaStatePtr & replica, bool disconnect); + void finishProcessReplica(ReplicaState & replica, bool disconnect); int getReadyFileDescriptor(AsyncCallback async_callback = {}); - GetHedgedConnections get_hedged_connections; + void addTimeoutToReplica(ConnectionTimeoutType type, ReplicaState & replica); - /// All replicas in replicas[offset] are responsible for process query + void removeTimeoutsFromReplica(ReplicaState & replica); + + void removeTimeoutFromReplica(ConnectionTimeoutType type, ReplicaState & replica); + + + HedgedConnectionsFactory hedged_connections_factory; + + /// All replicas in offset_states[offset] is responsible for process query /// with setting parallel_replica_offset = offset. In common situations - /// replicas[offset].size() = 1 (like in MultiplexedConnections). - std::vector> replicas; + /// replica_states[offset].replicas.size() = 1 (like in MultiplexedConnections). + std::vector offset_states; - /// Map socket file descriptor to replica. - std::unordered_map fd_to_replica; - /// Map timeout file descriptor to replica. - std::unordered_map timeout_fd_to_replica; + /// Map socket file descriptor to replica location (it's offset and index in OffsetState.replicas). + std::unordered_map fd_to_replica_location; + /// Map timeout file descriptor to replica location (it's offset and index in OffsetState.replicas). + std::unordered_map timeout_fd_to_replica_location; /// A queue of offsets for new replicas. 
When we get RECEIVE_DATA_TIMEOUT from /// the replica, we push it's offset to this queue and start trying to get /// new replica. std::queue offsets_queue; - /// Map offset to amount of active connections, responsible to this offset. - std::unordered_map active_connections_count_by_offset; + /// The current number of valid connections to the replicas of this shard. + size_t active_connection_count; - std::unordered_set offsets_with_received_data; + /// We count offsets which received first packet of data, + /// it's needed to cancel choosing new replicas when all offsets + /// received their first packet of data. + size_t offsets_with_received_first_data_packet; Pipeline pipeline_for_new_replicas; @@ -103,8 +136,8 @@ private: /// If we didn't disabled it, we need to skip this replica. bool disable_two_level_aggregation = false; - /// next_replica_in_process is true when get_hedged_connections.getFileDescriptor() - /// is in epoll now and false otherwise. + /// This flag means we need to get connection with new replica, but no replica is ready. + /// When it's true, hedged_connections_factory.getFileDescriptor() is in epoll. bool next_replica_in_process = false; Epoll epoll; diff --git a/src/Client/HedgedConnectionsFactory.cpp b/src/Client/HedgedConnectionsFactory.cpp new file mode 100644 index 00000000000..22666642b4e --- /dev/null +++ b/src/Client/HedgedConnectionsFactory.cpp @@ -0,0 +1,475 @@ +#if defined(OS_LINUX) + +#include +#include + + +namespace DB +{ +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; + extern const int LOGICAL_ERROR; + extern const int ALL_CONNECTION_TRIES_FAILED; + extern const int ALL_REPLICAS_ARE_STALE; +} + +HedgedConnectionsFactory::HedgedConnectionsFactory( + const ConnectionPoolWithFailoverPtr & pool_, + const Settings * settings_, + const ConnectionTimeouts & timeouts_, + std::shared_ptr table_to_check_) + : pool(pool_), settings(settings_), timeouts(timeouts_), table_to_check(table_to_check_), log(&Poco::Logger::get("HedgedConnectionsFactory")) +{ + shuffled_pools = pool->getShuffledPools(settings); + for (size_t i = 0; i != shuffled_pools.size(); ++i) + connection_establishers.emplace_back(shuffled_pools[i].pool, &timeouts, settings, table_to_check.get(), log); + + max_tries + = (settings ? size_t{settings->connections_with_failover_max_tries} : size_t{DBMS_CONNECTION_POOL_WITH_FAILOVER_DEFAULT_MAX_TRIES}); + + fallback_to_stale_replicas = settings && settings->fallback_to_stale_replicas_for_distributed_queries; + entries_count = 0; + usable_count = 0; + failed_pools_count = 0; +} + +HedgedConnectionsFactory::~HedgedConnectionsFactory() +{ + pool->updateSharedError(shuffled_pools); +} + +std::vector HedgedConnectionsFactory::getManyConnections(PoolMode pool_mode) +{ + size_t min_entries = (settings && settings->skip_unavailable_shards) ? 0 : 1; + + size_t max_entries; + if (pool_mode == PoolMode::GET_ALL) + { + min_entries = shuffled_pools.size(); + max_entries = shuffled_pools.size(); + } + else if (pool_mode == PoolMode::GET_ONE) + max_entries = 1; + else if (pool_mode == PoolMode::GET_MANY) + max_entries = settings ? size_t(settings->max_parallel_replicas) : 1; + else + throw DB::Exception("Unknown pool allocation mode", DB::ErrorCodes::LOGICAL_ERROR); + + std::vector connections; + connections.reserve(max_entries); + + /// Try to start establishing connections with max_entries replicas. 
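+    /// Replicas that become READY synchronously are collected right away; the rest are
+    /// driven to completion through epoll in the processConnections() loop below.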
+ int index; + for (size_t i = 0; i != max_entries; ++i) + { + index = getNextIndex(); + if (index == -1) + break; + + ReplicaStatePtr replica = startEstablishingConnection(index); + if (replica->state == State::READY) + connections.push_back(replica->connection); + } + + /// Process connections until we get enough READY connections + /// (work asynchronously with all connections we started). + Connection * connection = nullptr; + while (connections.size() < max_entries) + { + auto state = processConnections(true, connection); + if (state == State::READY) + connections.push_back(connection); + else if (state == State::CANNOT_CHOOSE) + { + if (connections.size() >= min_entries) + break; + + /// Determine the reason of not enough replicas. + if (!fallback_to_stale_replicas && usable_count >= min_entries) + throw DB::Exception( + "Could not find enough connections to up-to-date replicas. Got: " + std::to_string(connections.size()) + + ", needed: " + std::to_string(min_entries), + DB::ErrorCodes::ALL_REPLICAS_ARE_STALE); + + throw DB::NetException( + "All connection tries failed. Log: \n\n" + fail_messages + "\n", + DB::ErrorCodes::ALL_CONNECTION_TRIES_FAILED); + } + } + + return connections; +} + +HedgedConnectionsFactory::State HedgedConnectionsFactory::getNextConnection(bool start_new_connection, Connection *& connection_out) +{ + if (start_new_connection) + { + /// Try to start establishing connection to the new replica. + int index = getNextIndex(); + if (index != -1) + { + ReplicaStatePtr replica = startEstablishingConnection(index); + if (replica->state == State::READY) + { + connection_out = replica->connection; + return State::READY; + } + } + } + + return processConnections(false, connection_out); +} + +HedgedConnectionsFactory::State HedgedConnectionsFactory::processConnections(bool blocking, Connection *& connection_out) +{ + ReplicaStatePtr replica = nullptr; + int index = -1; + + while (index != -1 || !epoll.empty()) + { + if (index != -1) + { + replica = startEstablishingConnection(index); + if (replica->state == State::READY) + { + connection_out = replica->connection; + return State::READY; + } + } + + if (!processEpollEvents(replica, blocking)) + return State::NOT_READY; + + if (replica->state == State::READY) + { + connection_out = replica->connection; + return State::READY; + } + + index = getNextIndex(); + } + + /// We reach this point only if there was no free up to date replica. + /// We will try to use usable replica. + + /// Check if we are not allowed to use usable replicas or there is no even a free usable replica. + if (!fallback_to_stale_replicas || !canGetNewConnection()) + return State::CANNOT_CHOOSE; + + setBestUsableReplica(replica); + connection_out = replica->connection; + return replica->state; +} + +void HedgedConnectionsFactory::stopChoosingReplicas() +{ + for (auto & [fd, replica] : fd_to_replica) + { + removeTimeoutsFromReplica(replica); + epoll.remove(fd); + connection_establishers[replica->index].reset(); + replica->reset(); + } + + fd_to_replica.clear(); +} + +int HedgedConnectionsFactory::getNextIndex() +{ + /// Check if there is no free replica. + if (entries_count + indexes_in_process.size() + failed_pools_count >= shuffled_pools.size()) + return -1; + + /// Check if it's the first time. + if (last_used_index == -1) + { + last_used_index = 0; + return 0; + } + + bool finish = false; + int next_index = last_used_index; + while (!finish) + { + next_index = (next_index + 1) % shuffled_pools.size(); + + /// Check if we can try this replica. 
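+        /// A replica can be tried if it is not already in process, its pool's error count is
+        /// below max_tries (a value of 0 disables this limit), and its connection establisher
+        /// has not already finished.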
+ if (indexes_in_process.find(next_index) == indexes_in_process.end() && (max_tries == 0 || shuffled_pools[next_index].error_count < max_tries) + && connection_establishers[next_index].stage != ConnectionEstablisher::Stage::FINISHED) + finish = true; + + /// If we made a complete round, there is no replica to connect. + else if (next_index == last_used_index) + return -1; + } + + last_used_index = next_index; + return next_index; +} + +HedgedConnectionsFactory::ReplicaStatePtr HedgedConnectionsFactory::startEstablishingConnection(int index) +{ + ReplicaStatePtr replica = createNewReplica(); + + do + { + ConnectionEstablisher & connection_establisher = connection_establishers[index]; + + replica->state = State::NOT_READY; + replica->index = index; + indexes_in_process.insert(index); + + connection_establisher.reset(); + connection_establisher.run(); + + if (connection_establisher.stage != ConnectionEstablisher::Stage::FAILED) + replica->connection = &*connection_establisher.result.entry; + + processConnectionEstablisherStage(replica); + + if (replica->state == State::NOT_READY) + { + epoll.add(connection_establisher.socket_fd); + fd_to_replica[connection_establisher.socket_fd] = replica; + connection_establisher.setActionBeforeDisconnect([&](int fd) { + epoll.remove(fd); + fd_to_replica.erase(fd); + }); + addTimeouts(replica); + } + } + while (replica->state == State::EMPTY && (index = getNextIndex()) != -1); + + return replica; +} + +void HedgedConnectionsFactory::processConnectionEstablisherStage(ReplicaStatePtr & replica, bool remove_from_epoll) +{ + ConnectionEstablisher & connection_establisher = connection_establishers[replica->index]; + + if (connection_establisher.stage == ConnectionEstablisher::Stage::FINISHED) + { + indexes_in_process.erase(replica->index); + ++entries_count; + + if (remove_from_epoll) + { + epoll.remove(connection_establisher.socket_fd); + fd_to_replica.erase(connection_establisher.socket_fd); + } + + if (connection_establisher.result.is_usable) + { + ++usable_count; + if (connection_establisher.result.is_up_to_date) + { + replica->state = State::READY; + ready_indexes.insert(replica->index); + return; + } + } + + /// This replica is not up to date, we will try to find up to date. 
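+        /// reset() only frees this slot (its state becomes EMPTY); the stale result is kept
+        /// inside the connection establisher and may still be chosen later as a fallback
+        /// by setBestUsableReplica().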
+ replica->reset(); + } + else if (connection_establisher.stage == ConnectionEstablisher::Stage::FAILED) + processFailedConnection(replica); +} + +void HedgedConnectionsFactory::processFailedConnection(ReplicaStatePtr & replica) +{ + ShuffledPool & shuffled_pool = shuffled_pools[replica->index]; + LOG_WARNING( + log, "Connection failed at try №{}, reason: {}", (shuffled_pool.error_count + 1), connection_establishers[replica->index].fail_message); + ProfileEvents::increment(ProfileEvents::DistributedConnectionFailTry); + + shuffled_pool.error_count = std::min(pool->getMaxErrorCup(), shuffled_pool.error_count + 1); + + if (shuffled_pool.error_count >= max_tries) + { + ++failed_pools_count; + ProfileEvents::increment(ProfileEvents::DistributedConnectionFailAtAll); + } + + std::string & fail_message = connection_establishers[replica->index].fail_message; + if (!fail_message.empty()) + fail_messages += fail_message + "\n"; + + indexes_in_process.erase(replica->index); + replica->reset(); +} + +void HedgedConnectionsFactory::addTimeouts(ReplicaStatePtr & replica) +{ + addTimeoutToReplica(ConnectionTimeoutType::RECEIVE_TIMEOUT, replica); + + auto stage = connection_establishers[replica->index].stage; + if (stage == ConnectionEstablisher::Stage::RECEIVE_HELLO) + addTimeoutToReplica(ConnectionTimeoutType::RECEIVE_HELLO_TIMEOUT, replica); + else if (stage == ConnectionEstablisher::Stage::RECEIVE_TABLES_STATUS) + addTimeoutToReplica(ConnectionTimeoutType::RECEIVE_TABLES_STATUS_TIMEOUT, replica); +} + +void HedgedConnectionsFactory::addTimeoutToReplica(ConnectionTimeoutType type, ReplicaStatePtr & replica) +{ + ConnectionTimeoutDescriptorPtr timeout_descriptor = createConnectionTimeoutDescriptor(type, timeouts); + epoll.add(timeout_descriptor->timer.getDescriptor()); + timeout_fd_to_replica[timeout_descriptor->timer.getDescriptor()] = replica; + replica->active_timeouts[timeout_descriptor->timer.getDescriptor()] = std::move(timeout_descriptor); +} + +void HedgedConnectionsFactory::removeTimeoutsFromReplica(ReplicaStatePtr & replica) +{ + for (auto & [fd, _] : replica->active_timeouts) + { + epoll.remove(fd); + timeout_fd_to_replica.erase(fd); + } + replica->active_timeouts.clear(); +} + +bool HedgedConnectionsFactory::processEpollEvents(ReplicaStatePtr & replica, bool blocking) +{ + int event_fd; + bool finish = false; + while (!finish) + { + event_fd = getReadyFileDescriptor(blocking); + + /// Check if there is no events. + if (event_fd == -1) + return false; + + if (fd_to_replica.find(event_fd) != fd_to_replica.end()) + { + replica = fd_to_replica[event_fd]; + processReplicaEvent(replica); + /// Check if replica is ready or we need to try next replica. + if (replica->state == State::READY || replica->state == State::EMPTY) + finish = true; + } + else if (timeout_fd_to_replica.find(event_fd) != timeout_fd_to_replica.end()) + { + replica = timeout_fd_to_replica[event_fd]; + processTimeoutEvent(replica, replica->active_timeouts[event_fd]); + /// Check if we need to try next replica. 
+ if (replica->state == State::EMPTY) + finish = true; + } + else + throw Exception("Unknown event from epoll", ErrorCodes::LOGICAL_ERROR); + } + + return true; +} + +int HedgedConnectionsFactory::getReadyFileDescriptor(bool blocking) +{ + for (auto & [fd, replica] : fd_to_replica) + if (replica->connection->hasReadPendingData()) + return replica->connection->getSocket()->impl()->sockfd(); + + return epoll.getReady(/* blocking */blocking).data.fd; +} + +void HedgedConnectionsFactory::processReplicaEvent(ReplicaStatePtr & replica) +{ + removeTimeoutsFromReplica(replica); + connection_establishers[replica->index].run(); + processConnectionEstablisherStage(replica, true); + if (replica->state == State::NOT_READY) + addTimeouts(replica); +} + +void HedgedConnectionsFactory::processTimeoutEvent(ReplicaStatePtr & replica, ConnectionTimeoutDescriptorPtr timeout_descriptor) +{ + epoll.remove(timeout_descriptor->timer.getDescriptor()); + replica->active_timeouts.erase(timeout_descriptor->timer.getDescriptor()); + timeout_fd_to_replica[timeout_descriptor->timer.getDescriptor()]; + + if (timeout_descriptor->type == ConnectionTimeoutType::RECEIVE_TIMEOUT) + { + removeTimeoutsFromReplica(replica); + int fd = replica->connection->getSocket()->impl()->sockfd(); + epoll.remove(fd); + fd_to_replica.erase(fd); + + ConnectionEstablisher & connection_establisher = connection_establishers[replica->index]; + connection_establisher.fail_message = "Receive timeout expired (" + connection_establisher.result.entry->getDescription() + ")"; + connection_establisher.resetResult(); + connection_establisher.stage = ConnectionEstablisher::Stage::FAILED; + processFailedConnection(replica); + } + else if ((timeout_descriptor->type == ConnectionTimeoutType::RECEIVE_HELLO_TIMEOUT + || timeout_descriptor->type == ConnectionTimeoutType::RECEIVE_TABLES_STATUS_TIMEOUT) + && entries_count + indexes_in_process.size() + failed_pools_count < shuffled_pools.size()) + replica = createNewReplica(); +} + +void HedgedConnectionsFactory::setBestUsableReplica(ReplicaStatePtr & replica) +{ + std::vector indexes(connection_establishers.size()); + for (size_t i = 0; i != indexes.size(); ++i) + indexes[i] = i; + + /// Remove unusable, failed replicas and replicas that are ready or in process. + indexes.erase( + std::remove_if( + indexes.begin(), + indexes.end(), + [&](int i) + { + return connection_establishers[i].result.entry.isNull() || !connection_establishers[i].result.is_usable || + indexes_in_process.find(i) != indexes_in_process.end() || ready_indexes.find(i) != ready_indexes.end(); + }), + indexes.end()); + + if (indexes.empty()) + { + replica->state = State::CANNOT_CHOOSE; + return; + } + + /// Sort replicas by staleness. 
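+    /// After sorting, indexes[0] refers to the least stale of the usable replicas;
+    /// it is promoted to READY below.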
+ std::stable_sort( + indexes.begin(), + indexes.end(), + [&](size_t lhs, size_t rhs) + { + return connection_establishers[lhs].result.staleness < connection_establishers[rhs].result.staleness; + }); + + replica->index = indexes[0]; + replica->connection = &*connection_establishers[indexes[0]].result.entry; + replica->state = State::READY; + ready_indexes.insert(replica->index); +} + +ConnectionTimeoutDescriptorPtr createConnectionTimeoutDescriptor(ConnectionTimeoutType type, const ConnectionTimeouts & timeouts) +{ + Poco::Timespan timeout; + switch (type) + { + case ConnectionTimeoutType::RECEIVE_HELLO_TIMEOUT: + timeout = timeouts.receive_hello_timeout; + break; + case ConnectionTimeoutType::RECEIVE_TABLES_STATUS_TIMEOUT: + timeout = timeouts.receive_tables_status_timeout; + break; + case ConnectionTimeoutType::RECEIVE_DATA_TIMEOUT: + timeout = timeouts.receive_data_timeout; + break; + case ConnectionTimeoutType::RECEIVE_TIMEOUT: + timeout = timeouts.receive_timeout; + break; + } + + ConnectionTimeoutDescriptorPtr timeout_descriptor = std::make_shared(); + timeout_descriptor->type = type; + timeout_descriptor->timer.setRelative(timeout); + return timeout_descriptor; +} + +} +#endif diff --git a/src/Client/HedgedConnectionsFactory.h b/src/Client/HedgedConnectionsFactory.h new file mode 100644 index 00000000000..d1dc262d39c --- /dev/null +++ b/src/Client/HedgedConnectionsFactory.h @@ -0,0 +1,167 @@ +#pragma once + +#if defined(OS_LINUX) + +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +enum class ConnectionTimeoutType +{ + RECEIVE_HELLO_TIMEOUT, + RECEIVE_TABLES_STATUS_TIMEOUT, + RECEIVE_DATA_TIMEOUT, + RECEIVE_TIMEOUT, +}; + +struct ConnectionTimeoutDescriptor +{ + ConnectionTimeoutType type; + TimerDescriptor timer; +}; + +using ConnectionTimeoutDescriptorPtr = std::shared_ptr; +using TimerDescriptorPtr = std::shared_ptr; + +/** Class for establishing hedged connections with replicas. + * The process of establishing connection is divided on stages, on each stage if + * replica doesn't respond for a long time, we start establishing connection with + * the next replica, without cancelling working with previous one. + * It works with multiple replicas simultaneously without blocking by using epoll. + */ +class HedgedConnectionsFactory +{ +public: + using ShuffledPool = ConnectionPoolWithFailover::Base::ShuffledPool; + + enum class State + { + EMPTY = 0, + READY = 1, + NOT_READY = 2, + CANNOT_CHOOSE = 3, + }; + + struct ReplicaState + { + Connection * connection = nullptr; + size_t index = -1; + State state = State::EMPTY; + std::unordered_map active_timeouts; + + void reset() + { + connection = nullptr; + index = -1; + state = State::EMPTY; + active_timeouts.clear(); + } + }; + + using ReplicaStatePtr = std::shared_ptr; + + HedgedConnectionsFactory(const ConnectionPoolWithFailoverPtr & pool_, + const Settings * settings_, + const ConnectionTimeouts & timeouts_, + std::shared_ptr table_to_check_ = nullptr); + + /// Create and return active connections according to pool_mode. + std::vector getManyConnections(PoolMode pool_mode); + + /// Try to get connection to the new replica without blocking. If start_new_connection is true, we start establishing connection + /// with the new replica and then call processConnections, otherwise just call processConnections. 
+ State getNextConnection(bool start_new_connection, Connection *& connection_out); + + /// Process all current events in epoll (connections, timeouts), if there is no events in epoll and blocking is false, + /// return NOT_READY. Returned state might be READY, NOT_READY and CANNOT_CHOOSE. + /// If state is READY, replica connection will be written in connection_out. + State processConnections(bool blocking, Connection *& connection_out); + + /// Check if we can try to produce new READY replica. + bool canGetNewConnection() const { return ready_indexes.size() + failed_pools_count < shuffled_pools.size(); } + + /// Stop working with all replicas that are not READY. + void stopChoosingReplicas(); + + bool hasEventsInProcess() const { return epoll.size() > 0; } + + int getFileDescriptor() const { return epoll.getFileDescriptor(); } + + const ConnectionTimeouts & getConnectionTimeouts() const { return timeouts; } + + ~HedgedConnectionsFactory(); + +private: + ReplicaStatePtr startEstablishingConnection(int index); + + void processConnectionEstablisherStage(ReplicaStatePtr & replica, bool remove_from_epoll = false); + + /// Find an index of the next free replica to start connection. + /// Return -1 if there is no free replica. + int getNextIndex(); + + int getReadyFileDescriptor(bool blocking); + + void addTimeouts(ReplicaStatePtr & replica); + + void addTimeoutToReplica(ConnectionTimeoutType type, ReplicaStatePtr & replica); + + void removeTimeoutsFromReplica(ReplicaStatePtr & replica); + + void processFailedConnection(ReplicaStatePtr & replica); + + void processReceiveTimeout(ReplicaStatePtr & replica); + + void processReplicaEvent(ReplicaStatePtr & replica); + + void processTimeoutEvent(ReplicaStatePtr & replica, ConnectionTimeoutDescriptorPtr timeout_descriptor); + + /// Return false if there is no ready events, return true if replica is ready + /// or we need to try next replica. + bool processEpollEvents(ReplicaStatePtr & replica, bool blocking); + + void setBestUsableReplica(ReplicaStatePtr & replica); + + ReplicaStatePtr createNewReplica() { return std::make_shared(); } + + const ConnectionPoolWithFailoverPtr pool; + const Settings * settings; + const ConnectionTimeouts timeouts; + std::shared_ptr table_to_check; + + std::vector connection_establishers; + std::vector shuffled_pools; + std::vector replica_states; + + /// Map socket file descriptor to replica. + std::unordered_map fd_to_replica; + /// Map timeout file descriptor to replica. + std::unordered_map timeout_fd_to_replica; + + /// Indexes of replicas, that are in process of connection. + std::unordered_set indexes_in_process; + /// Indexes of ready replicas. + std::unordered_set ready_indexes; + + int last_used_index = -1; + bool fallback_to_stale_replicas; + Epoll epoll; + Poco::Logger * log; + std::string fail_messages; + size_t entries_count; + size_t usable_count; + size_t failed_pools_count; + size_t max_tries; +}; + +/// Create ConnectionTimeoutDescriptor with particular type. 
+ConnectionTimeoutDescriptorPtr createConnectionTimeoutDescriptor(ConnectionTimeoutType type, const ConnectionTimeouts & timeouts); + +} +#endif diff --git a/src/Client/ya.make b/src/Client/ya.make index 603e8290350..7a664f328f7 100644 --- a/src/Client/ya.make +++ b/src/Client/ya.make @@ -12,8 +12,8 @@ PEERDIR( SRCS( Connection.cpp ConnectionPoolWithFailover.cpp - GetHedgedConnections.cpp HedgedConnections.cpp + HedgedConnectionsFactory.cpp MultiplexedConnections.cpp TimeoutSetter.cpp diff --git a/src/Common/Epoll.cpp b/src/Common/Epoll.cpp index cb34f81cf36..bfd323b4f55 100644 --- a/src/Common/Epoll.cpp +++ b/src/Common/Epoll.cpp @@ -46,24 +46,24 @@ void Epoll::remove(int fd) --events_count; } -epoll_event Epoll::getReady(AsyncCallback async_callback) const +epoll_event Epoll::getReady(bool blocking, AsyncCallback async_callback) const { - std::vector events = getManyReady(1, true, std::move(async_callback)); - if (events.empty()) - throw Exception("Vector of ready events is empty", ErrorCodes::LOGICAL_ERROR); + epoll_event event; + event.data.fd = -1; + size_t ready_events_count = getManyReady(1, &event, blocking, std::move(async_callback)); + if (ready_events_count > 1) + throw Exception("Returned amount of events cannot be more than 1.", ErrorCodes::LOGICAL_ERROR); - return events[0]; + return event; } -std::vector Epoll::getManyReady(int max_events, bool blocking, AsyncCallback async_callback) const +size_t Epoll::getManyReady(int max_events, epoll_event * events_out, bool blocking, AsyncCallback async_callback) const { - std::vector events(max_events); - int ready_size = 0; int timeout = blocking && !async_callback ? -1 : 0; - while (ready_size <= 0 && (ready_size != 0 || blocking)) + do { - ready_size = epoll_wait(epoll_fd, events.data(), max_events, timeout); + ready_size = epoll_wait(epoll_fd, events_out, max_events, timeout); if (ready_size == -1 && errno != EINTR) throwFromErrno("Error in epoll_wait", DB::ErrorCodes::EPOLL_ERROR); @@ -71,9 +71,9 @@ std::vector Epoll::getManyReady(int max_events, bool blocking, Asyn if (ready_size == 0 && blocking && async_callback) async_callback(epoll_fd, 0, "epoll"); } + while (ready_size <= 0 && (ready_size != 0 || blocking)); - events.resize(ready_size); - return events; + return ready_size; } Epoll::~Epoll() diff --git a/src/Common/Epoll.h b/src/Common/Epoll.h index 1dc65d15d08..92638715aeb 100644 --- a/src/Common/Epoll.h +++ b/src/Common/Epoll.h @@ -16,20 +16,22 @@ class Epoll : boost::noncopyable public: Epoll(); - /// Add new file descriptor to epoll. + /// Add new file descriptor to epoll. If ptr set to nullptr, epoll_event.data.fd = fd, + /// otherwise epoll_event.data.ptr = ptr. void add(int fd, void * ptr = nullptr); /// Remove file descriptor to epoll. void remove(int fd); - /// Get events from epoll. If blocking is false and there are no ready events, + /// Get events from epoll. Events are written in events_out, this function returns an amount of ready events. + /// If blocking is false and there are no ready events, /// return empty vector, otherwise wait for ready events. If blocking is true, /// async_callback is given and there is no ready events, async_callback is called /// with epoll file descriptor. - std::vector getManyReady(int max_events, bool blocking, AsyncCallback async_callback = {}) const; + size_t getManyReady(int max_events, epoll_event * events_out, bool blocking, AsyncCallback async_callback = {}) const; - /// Get only one ready event, this function is always blocking. 
- epoll_event getReady(AsyncCallback async_callback = {}) const; + /// Get only one ready event, if blocking is false and there is no ready events, epoll_event.data.fd will be set to -1. + epoll_event getReady(bool blocking = true, AsyncCallback async_callback = {}) const; int getFileDescriptor() const { return epoll_fd; } diff --git a/src/Common/TimerDescriptor.h b/src/Common/TimerDescriptor.h index debf7cdc899..6f7003f6980 100644 --- a/src/Common/TimerDescriptor.h +++ b/src/Common/TimerDescriptor.h @@ -5,21 +5,11 @@ namespace DB { -enum TimerTypes -{ - DEFAULT, - RECEIVE_HELLO_TIMEOUT, - RECEIVE_TABLES_STATUS_TIMEOUT, - RECEIVE_DATA_TIMEOUT, - RECEIVE_TIMEOUT, -}; - /// Wrapper over timerfd. class TimerDescriptor { private: int timer_fd; - int type = TimerTypes::DEFAULT; public: explicit TimerDescriptor(int clockid = CLOCK_MONOTONIC, int flags = 0); @@ -31,12 +21,10 @@ public: TimerDescriptor & operator=(TimerDescriptor &&) = default; int getDescriptor() const { return timer_fd; } - int getType() const { return type; } void reset() const; void drain() const; void setRelative(const Poco::Timespan & timespan) const; - void setType(int type_) { type = type_; } }; } diff --git a/src/DataStreams/RemoteQueryExecutorReadContext.cpp b/src/DataStreams/RemoteQueryExecutorReadContext.cpp index c77b2d48f05..e02ac1fc1b3 100644 --- a/src/DataStreams/RemoteQueryExecutorReadContext.cpp +++ b/src/DataStreams/RemoteQueryExecutorReadContext.cpp @@ -121,19 +121,22 @@ bool RemoteQueryExecutorReadContext::checkTimeout() const bool RemoteQueryExecutorReadContext::checkTimeoutImpl() const { /// Wait for epoll will not block if it was polled externally. - std::vector events = epoll.getManyReady(epoll.size(), /* blocking = */ false); + epoll_event events[3]; + events[0].data.fd = events[1].data.fd = events[2].data.fd = -1; + + epoll.getManyReady(3, events,/* blocking = */ false); bool is_socket_ready = false; bool is_pipe_alarmed = false; bool has_timer_alarm = false; - for (const auto & event : events) + for (int i = 0; i < 3; ++i) { - if (event.data.fd == connection_fd) + if (events[i].data.fd == connection_fd) is_socket_ready = true; - if (event.data.fd == timer.getDescriptor()) + if (events[i].data.fd == timer.getDescriptor()) has_timer_alarm = true; - if (event.data.fd == pipe_fd[0]) + if (events[i].data.fd == pipe_fd[0]) is_pipe_alarmed = true; } @@ -198,7 +201,7 @@ void RemoteQueryExecutorReadContext::cancel() RemoteQueryExecutorReadContext::~RemoteQueryExecutorReadContext() { - /// connection_fd is closed by Poco::Net::Socket + /// connection_fd is closed by Poco::Net::Socket or Epoll if (pipe_fd[0] != -1) close(pipe_fd[0]); if (pipe_fd[1] != -1) diff --git a/src/IO/ConnectionTimeouts.h b/src/IO/ConnectionTimeouts.h index 01f31d6efa8..a92f75bf980 100644 --- a/src/IO/ConnectionTimeouts.h +++ b/src/IO/ConnectionTimeouts.h @@ -33,9 +33,9 @@ struct ConnectionTimeouts tcp_keep_alive_timeout(0), http_keep_alive_timeout(0), secure_connection_timeout(connection_timeout), - receive_hello_timeout(0), - receive_tables_status_timeout(0), - receive_data_timeout(0) + receive_hello_timeout(receive_timeout_), + receive_tables_status_timeout(receive_timeout_), + receive_data_timeout(receive_timeout_) { } @@ -49,9 +49,9 @@ struct ConnectionTimeouts tcp_keep_alive_timeout(tcp_keep_alive_timeout_), http_keep_alive_timeout(0), secure_connection_timeout(connection_timeout), - receive_hello_timeout(0), - receive_tables_status_timeout(0), - receive_data_timeout(0) + receive_hello_timeout(receive_timeout_), + 
receive_tables_status_timeout(receive_timeout_), + receive_data_timeout(receive_timeout_) { } ConnectionTimeouts(const Poco::Timespan & connection_timeout_, @@ -65,9 +65,9 @@ struct ConnectionTimeouts tcp_keep_alive_timeout(tcp_keep_alive_timeout_), http_keep_alive_timeout(http_keep_alive_timeout_), secure_connection_timeout(connection_timeout), - receive_hello_timeout(0), - receive_tables_status_timeout(0), - receive_data_timeout(0) + receive_hello_timeout(receive_timeout_), + receive_tables_status_timeout(receive_timeout_), + receive_data_timeout(receive_timeout_) { } diff --git a/src/IO/ReadBufferFromPocoSocket.cpp b/src/IO/ReadBufferFromPocoSocket.cpp index e08b9e7c8fb..1f9c732e644 100644 --- a/src/IO/ReadBufferFromPocoSocket.cpp +++ b/src/IO/ReadBufferFromPocoSocket.cpp @@ -14,7 +14,6 @@ namespace ProfileEvents namespace DB { - namespace ErrorCodes { extern const int NETWORK_ERROR; @@ -42,7 +41,7 @@ bool ReadBufferFromPocoSocket::nextImpl() /// Note that receive timeout is not checked here. External code should check it while polling. while (bytes_read < 0 && async_callback && errno == EAGAIN) { - async_callback(socket.impl()->sockfd(), socket.getReceiveTimeout(), "socket (" + socket.peerAddress().toString() + ")"); + async_callback(socket.impl()->sockfd(), socket.getReceiveTimeout(), socket_description); bytes_read = socket.impl()->receiveBytes(internal_buffer.begin(), internal_buffer.size(), flags); } } @@ -74,7 +73,10 @@ bool ReadBufferFromPocoSocket::nextImpl() } ReadBufferFromPocoSocket::ReadBufferFromPocoSocket(Poco::Net::Socket & socket_, size_t buf_size) - : BufferWithOwnMemory(buf_size), socket(socket_), peer_address(socket.peerAddress()) + : BufferWithOwnMemory(buf_size) + , socket(socket_) + , peer_address(socket.peerAddress()) + , socket_description("socket (" + peer_address.toString() + ")") { } diff --git a/src/IO/ReadBufferFromPocoSocket.h b/src/IO/ReadBufferFromPocoSocket.h index 7fd1b646846..73e83dfb5f9 100644 --- a/src/IO/ReadBufferFromPocoSocket.h +++ b/src/IO/ReadBufferFromPocoSocket.h @@ -34,6 +34,7 @@ public: private: AsyncCallback async_callback; + std::string socket_description; }; } From 610798aa487ee1b2ef6007b9185a1c1b27a11660 Mon Sep 17 00:00:00 2001 From: keenwolf Date: Sat, 6 Feb 2021 15:32:49 +0800 Subject: [PATCH 0156/2357] fix the toMinute bug which will cause toDateTime or toString printing wrong time --- base/common/DateLUTImpl.h | 10 +++- src/Functions/ya.make | 1 + .../0_stateless/01698_fix_toMinute.reference | 24 ++++++++ .../0_stateless/01698_fix_toMinute.sql | 16 +++++ .../01699_timezoneOffset.reference | 58 +++++-------------- .../0_stateless/01699_timezoneOffset.sql | 3 +- 6 files changed, 65 insertions(+), 47 deletions(-) create mode 100644 tests/queries/0_stateless/01698_fix_toMinute.reference create mode 100644 tests/queries/0_stateless/01698_fix_toMinute.sql diff --git a/base/common/DateLUTImpl.h b/base/common/DateLUTImpl.h index 23c78f6e7fc..8991f69d3f3 100644 --- a/base/common/DateLUTImpl.h +++ b/base/common/DateLUTImpl.h @@ -317,8 +317,14 @@ public: if (offset_is_whole_number_of_hours_everytime) return (t / 60) % 60; - UInt32 date = find(t).date; - return (UInt32(t) - date) / 60 % 60; + /// To consider the DST changing situation within this day. 
+ /// also make the special timezones with no whole hour offset such as 'Australia/Lord_Howe' been taken into account + DayNum index = findIndex(t); + time_t res = t - lut[index].date; + if (lut[index].amount_of_offset_change != 0 && t >= lut[index].date + lut[index].time_at_offset_change) + res += lut[index].amount_of_offset_change; + + return res / 60 % 60; } inline time_t toStartOfMinute(time_t t) const { return t / 60 * 60; } diff --git a/src/Functions/ya.make b/src/Functions/ya.make index b97a4a187e9..647bbbb47cb 100644 --- a/src/Functions/ya.make +++ b/src/Functions/ya.make @@ -452,6 +452,7 @@ SRCS( timeSlot.cpp timeSlots.cpp timezone.cpp + timezoneOffset.cpp toColumnTypeName.cpp toCustomWeek.cpp toDayOfMonth.cpp diff --git a/tests/queries/0_stateless/01698_fix_toMinute.reference b/tests/queries/0_stateless/01698_fix_toMinute.reference new file mode 100644 index 00000000000..5df800c9fef --- /dev/null +++ b/tests/queries/0_stateless/01698_fix_toMinute.reference @@ -0,0 +1,24 @@ +Check the bug causing situation: the special Australia/Lord_Howe time zone. tooDateTime and toString functions are all tested at once +1554559200 2019-04-07 01:00:00 2019-04-07 01:00:00 +1554559800 2019-04-07 01:10:00 2019-04-07 01:10:00 +1554560400 2019-04-07 01:20:00 2019-04-07 01:20:00 +1554561000 2019-04-07 01:30:00 2019-04-07 01:30:00 +1554561600 2019-04-07 01:40:00 2019-04-07 01:40:00 +1554562200 2019-04-07 01:50:00 2019-04-07 01:50:00 +1554562800 2019-04-07 01:30:00 2019-04-07 01:30:00 +1554563400 2019-04-07 01:40:00 2019-04-07 01:40:00 +1554564000 2019-04-07 01:50:00 2019-04-07 01:50:00 +1554564600 2019-04-07 02:00:00 2019-04-07 02:00:00 +1554565200 2019-04-07 02:10:00 2019-04-07 02:10:00 +1554565800 2019-04-07 02:20:00 2019-04-07 02:20:00 +1554566400 2019-04-07 02:30:00 2019-04-07 02:30:00 +1554567000 2019-04-07 02:40:00 2019-04-07 02:40:00 +1554567600 2019-04-07 02:50:00 2019-04-07 02:50:00 +1554568200 2019-04-07 03:00:00 2019-04-07 03:00:00 +1554568800 2019-04-07 03:10:00 2019-04-07 03:10:00 +1554569400 2019-04-07 03:20:00 2019-04-07 03:20:00 +1554570000 2019-04-07 03:30:00 2019-04-07 03:30:00 +1554570600 2019-04-07 03:40:00 2019-04-07 03:40:00 +4 days test in batch comparing with manually computation result for Europe/Moscow whose timezone epoc is of whole hour: +4 days test in batch comparing with manually computation result for Asia/Tehran whose timezone epoc is of half hour: +4 days test in batch comparing with manually computation result for Australia/Lord_Howe whose timezone epoc is of half hour and also its DST offset is half hour: diff --git a/tests/queries/0_stateless/01698_fix_toMinute.sql b/tests/queries/0_stateless/01698_fix_toMinute.sql new file mode 100644 index 00000000000..293741b6957 --- /dev/null +++ b/tests/queries/0_stateless/01698_fix_toMinute.sql @@ -0,0 +1,16 @@ +/* toDateTime or toString or other functions which should call the toMinute() function will all meet this bug. tests below will verify the toDateTime and toString. */ +SELECT 'Check the bug causing situation: the special Australia/Lord_Howe time zone. tooDateTime and toString functions are all tested at once'; +SELECT toUnixTimestamp(x) as tt, (toDateTime('2019-04-07 01:00:00', 'Australia/Lord_Howe') + INTERVAL number * 600 SECOND) AS x, toString(x) as xx FROM numbers(20); + +/* The Batch Part. 
Test period is whole 4 days*/ +SELECT '4 days test in batch comparing with manually computation result for Europe/Moscow whose timezone epoc is of whole hour:'; +SELECT toUnixTimestamp(x) as tt, (toDateTime('1981-04-01 00:00:00', 'Europe/Moscow') + INTERVAL number * 600 SECOND) AS x, timezoneOffset(x) as res,(toDateTime(toString(x), 'UTC') - x ) AS calc FROM numbers(576) where res != calc; +SELECT toUnixTimestamp(x) as tt, (toDateTime('1981-09-30 00:00:00', 'Europe/Moscow') + INTERVAL number * 600 SECOND) AS x, timezoneOffset(x) as res,(toDateTime(toString(x), 'UTC') - x ) AS calc FROM numbers(576) where res != calc; + +SELECT '4 days test in batch comparing with manually computation result for Asia/Tehran whose timezone epoc is of half hour:'; +SELECT toUnixTimestamp(x) as tt, (toDateTime('2020-03-21 00:00:00', 'Asia/Tehran') + INTERVAL number * 600 SECOND) AS x, timezoneOffset(x) as res,(toDateTime(toString(x), 'UTC') - x ) AS calc FROM numbers(576) where res != calc; +SELECT toUnixTimestamp(x) as tt, (toDateTime('2020-09-20 00:00:00', 'Asia/Tehran') + INTERVAL number * 600 SECOND) AS x, timezoneOffset(x) as res,(toDateTime(toString(x), 'UTC') - x ) AS calc FROM numbers(576) where res != calc; + +SELECT '4 days test in batch comparing with manually computation result for Australia/Lord_Howe whose timezone epoc is of half hour and also its DST offset is half hour:'; +SELECT toUnixTimestamp(x) as tt, (toDateTime('2020-10-04 01:40:00', 'Australia/Lord_Howe') + INTERVAL number * 600 SECOND) AS x, timezoneOffset(x) as res,(toDateTime(toString(x), 'UTC') - x ) AS calc FROM numbers(576) where res != calc; +SELECT toUnixTimestamp(x) as tt, (toDateTime('2019-04-07 01:00:00', 'Australia/Lord_Howe') + INTERVAL number * 600 SECOND) AS x, timezoneOffset(x) as res,(toDateTime(toString(x), 'UTC') - x ) AS calc FROM numbers(576) where res != calc; diff --git a/tests/queries/0_stateless/01699_timezoneOffset.reference b/tests/queries/0_stateless/01699_timezoneOffset.reference index e70c5fa62ee..45f30314f5a 100644 --- a/tests/queries/0_stateless/01699_timezoneOffset.reference +++ b/tests/queries/0_stateless/01699_timezoneOffset.reference @@ -50,57 +50,29 @@ DST boundary test for Australia/Lord_Howe. 
This is a special timezone with DST o DST boundary test for Australia/Lord_Howe: 0 2020-10-04 01:40:00 37800 1601737800 1 2020-10-04 01:50:00 37800 1601738400 -2 2020-10-04 02:00:00 39600 1601739000 -3 2020-10-04 02:10:00 39600 1601739600 +2 2020-10-04 02:30:00 39600 1601739000 +3 2020-10-04 02:40:00 39600 1601739600 0 2019-04-07 01:00:00 39600 1554559200 1 2019-04-07 01:10:00 39600 1554559800 2 2019-04-07 01:20:00 39600 1554560400 3 2019-04-07 01:30:00 39600 1554561000 4 2019-04-07 01:40:00 39600 1554561600 5 2019-04-07 01:50:00 39600 1554562200 -6 2019-04-07 01:00:00 37800 1554562800 -7 2019-04-07 01:10:00 37800 1554563400 -8 2019-04-07 01:20:00 37800 1554564000 -9 2019-04-07 02:30:00 37800 1554564600 -10 2019-04-07 02:40:00 37800 1554565200 -11 2019-04-07 02:50:00 37800 1554565800 -12 2019-04-07 02:00:00 37800 1554566400 -13 2019-04-07 02:10:00 37800 1554567000 -14 2019-04-07 02:20:00 37800 1554567600 -15 2019-04-07 03:30:00 37800 1554568200 -16 2019-04-07 03:40:00 37800 1554568800 -17 2019-04-07 03:50:00 37800 1554569400 +6 2019-04-07 01:30:00 37800 1554562800 +7 2019-04-07 01:40:00 37800 1554563400 +8 2019-04-07 01:50:00 37800 1554564000 +9 2019-04-07 02:00:00 37800 1554564600 +10 2019-04-07 02:10:00 37800 1554565200 +11 2019-04-07 02:20:00 37800 1554565800 +12 2019-04-07 02:30:00 37800 1554566400 +13 2019-04-07 02:40:00 37800 1554567000 +14 2019-04-07 02:50:00 37800 1554567600 +15 2019-04-07 03:00:00 37800 1554568200 +16 2019-04-07 03:10:00 37800 1554568800 +17 2019-04-07 03:20:00 37800 1554569400 4 days test in batch comparing with manually computation result for Europe/Moscow: 4 days test in batch comparing with manually computation result for Asia/Tehran: -The result maybe wrong for toDateTime processing Australia/Lord_Howe -1601739000 2020-10-04 02:00:00 39600 37800 -1601739600 2020-10-04 02:10:00 39600 37800 -1601740200 2020-10-04 02:20:00 39600 37800 -1601740800 2020-10-04 03:30:00 39600 41400 -1601741400 2020-10-04 03:40:00 39600 41400 -1601742000 2020-10-04 03:50:00 39600 41400 -1601742600 2020-10-04 03:00:00 39600 37800 -1601743200 2020-10-04 03:10:00 39600 37800 -1601743800 2020-10-04 03:20:00 39600 37800 -1601744400 2020-10-04 04:30:00 39600 41400 -1601745000 2020-10-04 04:40:00 39600 41400 -1601745600 2020-10-04 04:50:00 39600 41400 -1601746200 2020-10-04 04:00:00 39600 37800 -1601746800 2020-10-04 04:10:00 39600 37800 -1601747400 2020-10-04 04:20:00 39600 37800 -1601748000 2020-10-04 05:30:00 39600 41400 -1554562800 2019-04-07 01:00:00 37800 36000 -1554563400 2019-04-07 01:10:00 37800 36000 -1554564000 2019-04-07 01:20:00 37800 36000 -1554564600 2019-04-07 02:30:00 37800 39600 -1554565200 2019-04-07 02:40:00 37800 39600 -1554565800 2019-04-07 02:50:00 37800 39600 -1554566400 2019-04-07 02:00:00 37800 36000 -1554567000 2019-04-07 02:10:00 37800 36000 -1554567600 2019-04-07 02:20:00 37800 36000 -1554568200 2019-04-07 03:30:00 37800 39600 -1554568800 2019-04-07 03:40:00 37800 39600 -1554569400 2019-04-07 03:50:00 37800 39600 +4 days test in batch comparing with manually computation result for Australia/Lord_Howe Moscow DST Years: 11 1981-06-01 00:00:00 14400 12 1982-06-01 00:00:00 14400 diff --git a/tests/queries/0_stateless/01699_timezoneOffset.sql b/tests/queries/0_stateless/01699_timezoneOffset.sql index 1b3f05ecdd7..8cabb23c4de 100644 --- a/tests/queries/0_stateless/01699_timezoneOffset.sql +++ b/tests/queries/0_stateless/01699_timezoneOffset.sql @@ -26,8 +26,7 @@ SELECT '4 days test in batch comparing with manually computation result for Asia SELECT toUnixTimestamp(x) 
as tt, (toDateTime('2020-03-21 00:00:00', 'Asia/Tehran') + INTERVAL number * 600 SECOND) AS x, timezoneOffset(x) as res,(toDateTime(toString(x), 'UTC') - x ) AS calc FROM numbers(576) where res != calc; SELECT toUnixTimestamp(x) as tt, (toDateTime('2020-09-20 00:00:00', 'Asia/Tehran') + INTERVAL number * 600 SECOND) AS x, timezoneOffset(x) as res,(toDateTime(toString(x), 'UTC') - x ) AS calc FROM numbers(576) where res != calc; -/* During this test we got unexpected result comes from the toDateTime() function when process the special time zone of 'Australia/Lord_Howe', which may be some kind of bugs. */ -SELECT 'The result maybe wrong for toDateTime processing Australia/Lord_Howe'; +SELECT '4 days test in batch comparing with manually computation result for Australia/Lord_Howe'; SELECT toUnixTimestamp(x) as tt, (toDateTime('2020-10-04 01:40:00', 'Australia/Lord_Howe') + INTERVAL number * 600 SECOND) AS x, timezoneOffset(x) as res,(toDateTime(toString(x), 'UTC') - x ) AS calc FROM numbers(18) where res != calc; SELECT toUnixTimestamp(x) as tt, (toDateTime('2019-04-07 01:00:00', 'Australia/Lord_Howe') + INTERVAL number * 600 SECOND) AS x, timezoneOffset(x) as res,(toDateTime(toString(x), 'UTC') - x ) AS calc FROM numbers(18) where res != calc; From 740c1c72e6eed901e56d7256f1067304e265dcf9 Mon Sep 17 00:00:00 2001 From: keenwolf Date: Sat, 6 Feb 2021 16:55:46 +0800 Subject: [PATCH 0157/2357] little fix --- tests/queries/0_stateless/01698_fix_toMinute.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/01698_fix_toMinute.sql b/tests/queries/0_stateless/01698_fix_toMinute.sql index 293741b6957..f582806719d 100644 --- a/tests/queries/0_stateless/01698_fix_toMinute.sql +++ b/tests/queries/0_stateless/01698_fix_toMinute.sql @@ -1,5 +1,5 @@ /* toDateTime or toString or other functions which should call the toMinute() function will all meet this bug. tests below will verify the toDateTime and toString. */ -SELECT 'Check the bug causing situation: the special Australia/Lord_Howe time zone. tooDateTime and toString functions are all tested at once'; +SELECT 'Check the bug causing situation: the special Australia/Lord_Howe time zone. toDateTime and toString functions are all tested at once'; SELECT toUnixTimestamp(x) as tt, (toDateTime('2019-04-07 01:00:00', 'Australia/Lord_Howe') + INTERVAL number * 600 SECOND) AS x, toString(x) as xx FROM numbers(20); /* The Batch Part. Test period is whole 4 days*/ From 34af94accfc03fb6335aae9b8ca27f6e6992d49d Mon Sep 17 00:00:00 2001 From: keenwolf Date: Sat, 6 Feb 2021 16:59:01 +0800 Subject: [PATCH 0158/2357] little fix --- tests/queries/0_stateless/01698_fix_toMinute.reference | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/01698_fix_toMinute.reference b/tests/queries/0_stateless/01698_fix_toMinute.reference index 5df800c9fef..7675aad3a57 100644 --- a/tests/queries/0_stateless/01698_fix_toMinute.reference +++ b/tests/queries/0_stateless/01698_fix_toMinute.reference @@ -1,4 +1,4 @@ -Check the bug causing situation: the special Australia/Lord_Howe time zone. tooDateTime and toString functions are all tested at once +Check the bug causing situation: the special Australia/Lord_Howe time zone. 
toDateTime and toString functions are all tested at once 1554559200 2019-04-07 01:00:00 2019-04-07 01:00:00 1554559800 2019-04-07 01:10:00 2019-04-07 01:10:00 1554560400 2019-04-07 01:20:00 2019-04-07 01:20:00 From 794f185442ea1da799e8c2348bbaac0b6e310aa7 Mon Sep 17 00:00:00 2001 From: Pavel Kruglov Date: Sat, 6 Feb 2021 17:23:48 +0300 Subject: [PATCH 0159/2357] Fix --- src/Client/HedgedConnections.cpp | 7 +++++-- src/Client/HedgedConnectionsFactory.cpp | 12 +++++++----- src/Client/HedgedConnectionsFactory.h | 2 +- src/Common/Epoll.cpp | 11 ----------- src/Common/Epoll.h | 3 --- src/Processors/Executors/PollingQueue.cpp | 4 +++- 6 files changed, 16 insertions(+), 23 deletions(-) diff --git a/src/Client/HedgedConnections.cpp b/src/Client/HedgedConnections.cpp index a6ffc3cbd1d..8a7c728146f 100644 --- a/src/Client/HedgedConnections.cpp +++ b/src/Client/HedgedConnections.cpp @@ -275,7 +275,7 @@ Packet HedgedConnections::receivePacketImpl(AsyncCallback async_callback) else if (timeout_fd_to_replica_location.contains(event_fd)) { ReplicaLocation location = timeout_fd_to_replica_location[event_fd]; - processTimeoutEvent(location, offset_states[location.offset].replicas[location.index].active_timeouts[event_fd]); + processTimeoutEvent(location, offset_states[location.offset].replicas[location.index].active_timeouts[event_fd]); } else if (event_fd == hedged_connections_factory.getFileDescriptor()) tryGetNewReplica(false); @@ -295,7 +295,10 @@ int HedgedConnections::getReadyFileDescriptor(AsyncCallback async_callback) return replica.connection->getSocket()->impl()->sockfd(); } - return epoll.getReady(true, std::move(async_callback)).data.fd; + epoll_event event; + event.data.fd = -1; + epoll.getManyReady(1, &event, true, std::move(async_callback)); + return event.data.fd; } Packet HedgedConnections::receivePacketFromReplica(ReplicaLocation & replica_location, AsyncCallback async_callback) diff --git a/src/Client/HedgedConnectionsFactory.cpp b/src/Client/HedgedConnectionsFactory.cpp index 22666642b4e..01063faa2fe 100644 --- a/src/Client/HedgedConnectionsFactory.cpp +++ b/src/Client/HedgedConnectionsFactory.cpp @@ -8,7 +8,6 @@ namespace DB { namespace ErrorCodes { - extern const int BAD_ARGUMENTS; extern const int LOGICAL_ERROR; extern const int ALL_CONNECTION_TRIES_FAILED; extern const int ALL_REPLICAS_ARE_STALE; @@ -370,7 +369,10 @@ int HedgedConnectionsFactory::getReadyFileDescriptor(bool blocking) if (replica->connection->hasReadPendingData()) return replica->connection->getSocket()->impl()->sockfd(); - return epoll.getReady(/* blocking */blocking).data.fd; + epoll_event event; + event.data.fd = -1; + epoll.getManyReady(1, &event, blocking); + return event.data.fd; } void HedgedConnectionsFactory::processReplicaEvent(ReplicaStatePtr & replica) @@ -388,7 +390,7 @@ void HedgedConnectionsFactory::processTimeoutEvent(ReplicaStatePtr & replica, Co replica->active_timeouts.erase(timeout_descriptor->timer.getDescriptor()); timeout_fd_to_replica[timeout_descriptor->timer.getDescriptor()]; - if (timeout_descriptor->type == ConnectionTimeoutType::RECEIVE_TIMEOUT) + if (timeout_descriptor->type == ConnectionTimeoutType::RECEIVE_TIMEOUT) { removeTimeoutsFromReplica(replica); int fd = replica->connection->getSocket()->impl()->sockfd(); @@ -401,8 +403,8 @@ void HedgedConnectionsFactory::processTimeoutEvent(ReplicaStatePtr & replica, Co connection_establisher.stage = ConnectionEstablisher::Stage::FAILED; processFailedConnection(replica); } - else if ((timeout_descriptor->type == 
ConnectionTimeoutType::RECEIVE_HELLO_TIMEOUT - || timeout_descriptor->type == ConnectionTimeoutType::RECEIVE_TABLES_STATUS_TIMEOUT) + else if ((timeout_descriptor->type == ConnectionTimeoutType::RECEIVE_HELLO_TIMEOUT + || timeout_descriptor->type == ConnectionTimeoutType::RECEIVE_TABLES_STATUS_TIMEOUT) && entries_count + indexes_in_process.size() + failed_pools_count < shuffled_pools.size()) replica = createNewReplica(); } diff --git a/src/Client/HedgedConnectionsFactory.h b/src/Client/HedgedConnectionsFactory.h index d1dc262d39c..0a3ac1e7b47 100644 --- a/src/Client/HedgedConnectionsFactory.h +++ b/src/Client/HedgedConnectionsFactory.h @@ -100,7 +100,7 @@ public: private: ReplicaStatePtr startEstablishingConnection(int index); - void processConnectionEstablisherStage(ReplicaStatePtr & replica, bool remove_from_epoll = false); + void processConnectionEstablisherStage(ReplicaStatePtr & replica, bool remove_from_epoll = false); /// Find an index of the next free replica to start connection. /// Return -1 if there is no free replica. diff --git a/src/Common/Epoll.cpp b/src/Common/Epoll.cpp index bfd323b4f55..5a0140a06ec 100644 --- a/src/Common/Epoll.cpp +++ b/src/Common/Epoll.cpp @@ -46,17 +46,6 @@ void Epoll::remove(int fd) --events_count; } -epoll_event Epoll::getReady(bool blocking, AsyncCallback async_callback) const -{ - epoll_event event; - event.data.fd = -1; - size_t ready_events_count = getManyReady(1, &event, blocking, std::move(async_callback)); - if (ready_events_count > 1) - throw Exception("Returned amount of events cannot be more than 1.", ErrorCodes::LOGICAL_ERROR); - - return event; -} - size_t Epoll::getManyReady(int max_events, epoll_event * events_out, bool blocking, AsyncCallback async_callback) const { int ready_size = 0; diff --git a/src/Common/Epoll.h b/src/Common/Epoll.h index 92638715aeb..3a91199799b 100644 --- a/src/Common/Epoll.h +++ b/src/Common/Epoll.h @@ -30,9 +30,6 @@ public: /// with epoll file descriptor. size_t getManyReady(int max_events, epoll_event * events_out, bool blocking, AsyncCallback async_callback = {}) const; - /// Get only one ready event, if blocking is false and there is no ready events, epoll_event.data.fd will be set to -1. 
- epoll_event getReady(bool blocking = true, AsyncCallback async_callback = {}) const; - int getFileDescriptor() const { return epoll_fd; } int size() const { return events_count; } diff --git a/src/Processors/Executors/PollingQueue.cpp b/src/Processors/Executors/PollingQueue.cpp index 44941ae788a..a601d426a5d 100644 --- a/src/Processors/Executors/PollingQueue.cpp +++ b/src/Processors/Executors/PollingQueue.cpp @@ -68,7 +68,9 @@ PollingQueue::TaskData PollingQueue::wait(std::unique_lock & lock) lock.unlock(); - epoll_event event = epoll.getReady(); + epoll_event event; + event.data.ptr = nullptr; + epoll.getManyReady(1, &event, true); lock.lock(); From 8ff3dde2903c65d23ea4e26568651b707d83ba20 Mon Sep 17 00:00:00 2001 From: Pavel Kruglov Date: Sat, 6 Feb 2021 18:23:41 +0300 Subject: [PATCH 0160/2357] Add sendIgnoredPartUUIDs to HedgedRequests --- src/Client/HedgedConnections.cpp | 19 +++++++++++++++++++ src/Client/HedgedConnections.h | 2 ++ src/Client/IConnections.h | 3 +++ src/Client/MultiplexedConnections.h | 2 +- src/DataStreams/RemoteQueryExecutor.cpp | 2 +- 5 files changed, 26 insertions(+), 2 deletions(-) diff --git a/src/Client/HedgedConnections.cpp b/src/Client/HedgedConnections.cpp index 8a7c728146f..957e4d09fe5 100644 --- a/src/Client/HedgedConnections.cpp +++ b/src/Client/HedgedConnections.cpp @@ -91,6 +91,23 @@ void HedgedConnections::sendExternalTablesData(std::vector & pipeline_for_new_replicas.add(send_external_tables_data); } +void HedgedConnections::sendIgnoredPartUUIDs(const std::vector & uuids) +{ + std::lock_guard lock(cancel_mutex); + + if (sent_query) + throw Exception("Cannot send uuids after query is sent.", ErrorCodes::LOGICAL_ERROR); + + auto send_ignored_part_uuids = [&uuids](ReplicaState & replica) { replica.connection->sendIgnoredPartUUIDs(uuids); }; + + for (auto & offset_state : offset_states) + for (auto & replica : offset_state.replicas) + if (replica.connection) + send_ignored_part_uuids(replica); + + pipeline_for_new_replicas.add(send_ignored_part_uuids); +} + void HedgedConnections::sendQuery( const ConnectionTimeouts & timeouts, const String & query, @@ -220,6 +237,7 @@ Packet HedgedConnections::drain() Packet packet = receivePacketImpl(); switch (packet.type) { + case Protocol::Server::PartUUIDs: case Protocol::Server::Data: case Protocol::Server::Progress: case Protocol::Server::ProfileInfo: @@ -313,6 +331,7 @@ Packet HedgedConnections::receivePacketFromReplica(ReplicaLocation & replica_loc processReceivedFirstDataPacket(replica_location); addTimeoutToReplica(ConnectionTimeoutType::RECEIVE_TIMEOUT, replica); break; + case Protocol::Server::PartUUIDs: case Protocol::Server::Progress: case Protocol::Server::ProfileInfo: case Protocol::Server::Totals: diff --git a/src/Client/HedgedConnections.h b/src/Client/HedgedConnections.h index 6931db9ede6..eb73f2ded52 100644 --- a/src/Client/HedgedConnections.h +++ b/src/Client/HedgedConnections.h @@ -64,6 +64,8 @@ public: void sendCancel() override; + void sendIgnoredPartUUIDs(const std::vector & uuids) override; + Packet drain() override; std::string dumpAddresses() const override; diff --git a/src/Client/IConnections.h b/src/Client/IConnections.h index 85d1e29c243..38730922456 100644 --- a/src/Client/IConnections.h +++ b/src/Client/IConnections.h @@ -36,6 +36,9 @@ public: /// Send a request to replicas to cancel the request virtual void sendCancel() = 0; + /// Send parts' uuids to replicas to exclude them from query processing + virtual void sendIgnoredPartUUIDs(const std::vector & uuids) = 0; + /** On each 
replica, read and skip all packets to EndOfStream or Exception. * Returns EndOfStream if no exception has been received. Otherwise * returns the last received packet of type Exception. diff --git a/src/Client/MultiplexedConnections.h b/src/Client/MultiplexedConnections.h index a2f7f42e6b6..c04b06e525e 100644 --- a/src/Client/MultiplexedConnections.h +++ b/src/Client/MultiplexedConnections.h @@ -46,7 +46,7 @@ public: void sendCancel() override; /// Send parts' uuids to replicas to exclude them from query processing - void sendIgnoredPartUUIDs(const std::vector & uuids); + void sendIgnoredPartUUIDs(const std::vector & uuids) override; Packet drain() override; diff --git a/src/DataStreams/RemoteQueryExecutor.cpp b/src/DataStreams/RemoteQueryExecutor.cpp index 19996b563a2..9cac638ceb5 100644 --- a/src/DataStreams/RemoteQueryExecutor.cpp +++ b/src/DataStreams/RemoteQueryExecutor.cpp @@ -273,7 +273,7 @@ std::variant RemoteQueryExecutor::restartQueryWithoutDuplicatedUUIDs { /// Cancel previous query and disconnect before retry. cancel(read_context); - multiplexed_connections->disconnect(); + connections->disconnect(); /// Only resend once, otherwise throw an exception if (!resent_query) From 9048dc43d469563cbe71350fe2f51dd84f2fac62 Mon Sep 17 00:00:00 2001 From: Pavel Kruglov Date: Sat, 6 Feb 2021 22:13:50 +0300 Subject: [PATCH 0161/2357] Fix style and build --- src/Common/Epoll.cpp | 3 --- src/DataStreams/RemoteQueryExecutorReadContext.cpp | 4 ++-- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/src/Common/Epoll.cpp b/src/Common/Epoll.cpp index 5a0140a06ec..da3a4c4c04b 100644 --- a/src/Common/Epoll.cpp +++ b/src/Common/Epoll.cpp @@ -8,12 +8,9 @@ namespace DB { - -/// TODO: add appropriate error codes namespace ErrorCodes { extern const int EPOLL_ERROR; - extern const int LOGICAL_ERROR; } Epoll::Epoll() : events_count(0) diff --git a/src/DataStreams/RemoteQueryExecutorReadContext.cpp b/src/DataStreams/RemoteQueryExecutorReadContext.cpp index e02ac1fc1b3..c2a65f02d08 100644 --- a/src/DataStreams/RemoteQueryExecutorReadContext.cpp +++ b/src/DataStreams/RemoteQueryExecutorReadContext.cpp @@ -124,13 +124,13 @@ bool RemoteQueryExecutorReadContext::checkTimeoutImpl() const epoll_event events[3]; events[0].data.fd = events[1].data.fd = events[2].data.fd = -1; - epoll.getManyReady(3, events,/* blocking = */ false); + int num_events = epoll.getManyReady(3, events,/* blocking = */ false); bool is_socket_ready = false; bool is_pipe_alarmed = false; bool has_timer_alarm = false; - for (int i = 0; i < 3; ++i) + for (int i = 0; i < num_events; ++i) { if (events[i].data.fd == connection_fd) is_socket_ready = true; From 3d8e05dc94d0b241dcd0e69c908f9b056fccd3fe Mon Sep 17 00:00:00 2001 From: Pavel Kruglov Date: Sun, 7 Feb 2021 00:54:29 +0300 Subject: [PATCH 0162/2357] Fix style --- src/Client/HedgedConnections.cpp | 3 ++- src/Client/HedgedConnectionsFactory.cpp | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/Client/HedgedConnections.cpp b/src/Client/HedgedConnections.cpp index 957e4d09fe5..65100a7ea41 100644 --- a/src/Client/HedgedConnections.cpp +++ b/src/Client/HedgedConnections.cpp @@ -135,7 +135,8 @@ void HedgedConnections::sendQuery( break; } - auto send_query = [this, timeouts, query, query_id, stage, client_info, with_pending_data](ReplicaState & replica) { + auto send_query = [this, timeouts, query, query_id, stage, client_info, with_pending_data](ReplicaState & replica) + { Settings modified_settings = settings; if (disable_two_level_aggregation) diff --git 
a/src/Client/HedgedConnectionsFactory.cpp b/src/Client/HedgedConnectionsFactory.cpp index 01063faa2fe..84848949fb9 100644 --- a/src/Client/HedgedConnectionsFactory.cpp +++ b/src/Client/HedgedConnectionsFactory.cpp @@ -231,7 +231,8 @@ HedgedConnectionsFactory::ReplicaStatePtr HedgedConnectionsFactory::startEstabli { epoll.add(connection_establisher.socket_fd); fd_to_replica[connection_establisher.socket_fd] = replica; - connection_establisher.setActionBeforeDisconnect([&](int fd) { + connection_establisher.setActionBeforeDisconnect([&](int fd) + { epoll.remove(fd); fd_to_replica.erase(fd); }); From 636ef5394bd5f2977783b46b3d33ca4620220b4f Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Sun, 7 Feb 2021 01:45:11 +0300 Subject: [PATCH 0163/2357] Remove redundant field --- src/Client/HedgedConnectionsFactory.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Client/HedgedConnectionsFactory.h b/src/Client/HedgedConnectionsFactory.h index 0a3ac1e7b47..799e16bb068 100644 --- a/src/Client/HedgedConnectionsFactory.h +++ b/src/Client/HedgedConnectionsFactory.h @@ -137,7 +137,6 @@ private: std::vector connection_establishers; std::vector shuffled_pools; - std::vector replica_states; /// Map socket file descriptor to replica. std::unordered_map fd_to_replica; From 203cafa9b6e421971f3d2d67051929d40d3a21f9 Mon Sep 17 00:00:00 2001 From: Nicolae Vartolomei Date: Sun, 7 Feb 2021 14:32:19 +0000 Subject: [PATCH 0164/2357] Mark functions as pure virtual to fix missing vtable pointer error ``` Undefined symbols for architecture x86_64: "vtable for DB::IModel", referenced from: DB::IModel::IModel() in Obfuscator.cpp.o NOTE: a missing vtable usually means the first non-inline virtual member function has no definition. ld: symbol(s) not found for architecture x86_64 clang-11: error: linker command failed with exit code 1 (use -v to see invocation) ninja: build stopped: subcommand failed. ``` --- programs/obfuscator/Obfuscator.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/programs/obfuscator/Obfuscator.cpp b/programs/obfuscator/Obfuscator.cpp index 950db4e4f05..5eb5467c58b 100644 --- a/programs/obfuscator/Obfuscator.cpp +++ b/programs/obfuscator/Obfuscator.cpp @@ -100,16 +100,16 @@ class IModel { public: /// Call train iteratively for each block to train a model. - virtual void train(const IColumn & column); + virtual void train(const IColumn & column) = 0; /// Call finalize one time after training before generating. - virtual void finalize(); + virtual void finalize() = 0; /// Call generate: pass source data column to obtain a column with anonymized data as a result. - virtual ColumnPtr generate(const IColumn & column); + virtual ColumnPtr generate(const IColumn & column) = 0; /// Deterministically change seed to some other value. This can be used to generate more values than were in source. - virtual void updateSeed(); + virtual void updateSeed() = 0; virtual ~IModel() = default; }; From 479b45d772f18b690f1da7365d9a582ad836b577 Mon Sep 17 00:00:00 2001 From: Nicolae Vartolomei Date: Sun, 7 Feb 2021 14:37:50 +0000 Subject: [PATCH 0165/2357] Fix linker flags for shared linking on macOS This combination now works: `-DUSE_STATIC_LIBRARIES=0 -DSPLIT_SHARED_LIBRARIES=1`. Without `SPLIT_SHARED_LIBRARIES` it is still failing. 
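
For reference, a configure invocation reproducing the working combination could look like
this (the build directory and the Ninja generator are assumptions; only the two -D flags
above come from this change):

    mkdir -p build && cd build
    cmake .. -GNinja -DUSE_STATIC_LIBRARIES=0 -DSPLIT_SHARED_LIBRARIES=1
    ninja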
--- base/common/CMakeLists.txt | 4 ++++ base/daemon/CMakeLists.txt | 5 +++++ src/CMakeLists.txt | 12 ++++++++++-- 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/base/common/CMakeLists.txt b/base/common/CMakeLists.txt index cea52b443dd..217976e2fb0 100644 --- a/base/common/CMakeLists.txt +++ b/base/common/CMakeLists.txt @@ -47,6 +47,10 @@ endif() target_include_directories(common PUBLIC .. ${CMAKE_CURRENT_BINARY_DIR}/..) +if (OS_DARWIN AND NOT MAKE_STATIC_LIBRARIES) + target_link_libraries(common PUBLIC -Wl,-U,_inside_main) +endif() + # Allow explicit fallback to readline if (NOT ENABLE_REPLXX AND ENABLE_READLINE) message (STATUS "Attempt to fallback to readline explicitly") diff --git a/base/daemon/CMakeLists.txt b/base/daemon/CMakeLists.txt index 26d59a57e7f..6ef87db6a61 100644 --- a/base/daemon/CMakeLists.txt +++ b/base/daemon/CMakeLists.txt @@ -5,6 +5,11 @@ add_library (daemon ) target_include_directories (daemon PUBLIC ..) + +if (OS_DARWIN AND NOT MAKE_STATIC_LIBRARIES) + target_link_libraries (daemon PUBLIC -Wl,-undefined,dynamic_lookup) +endif() + target_link_libraries (daemon PUBLIC loggers PRIVATE clickhouse_common_io clickhouse_common_config common ${EXECINFO_LIBRARIES}) if (USE_SENTRY) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index dba9385fe27..8acd8b32c39 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -157,7 +157,11 @@ macro(add_object_library name common_path) list (APPEND all_modules ${name}) add_headers_and_sources(${name} ${common_path}) add_library(${name} SHARED ${${name}_sources} ${${name}_headers}) - target_link_libraries (${name} PRIVATE -Wl,--unresolved-symbols=ignore-all) + if (OS_DARWIN) + target_link_libraries (${name} PRIVATE -Wl,-undefined,dynamic_lookup) + else() + target_link_libraries (${name} PRIVATE -Wl,--unresolved-symbols=ignore-all) + endif() endif () endmacro() @@ -209,7 +213,11 @@ else() target_link_libraries (clickhouse_interpreters PRIVATE clickhouse_parsers_new jemalloc libdivide) list (APPEND all_modules dbms) # force all split libs to be linked - set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,--no-as-needed") + if (OS_DARWIN) + set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-undefined,error") + else() + set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,--no-as-needed") + endif() endif () macro (dbms_target_include_directories) From 098a6d6a051c70937ffe1dba404e680bec5065e5 Mon Sep 17 00:00:00 2001 From: Vladimir Chebotarev Date: Mon, 8 Feb 2021 09:22:00 +0300 Subject: [PATCH 0166/2357] Added test for S3 table function. 
--- .../test_allowed_url_from_config/test.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tests/integration/test_allowed_url_from_config/test.py b/tests/integration/test_allowed_url_from_config/test.py index d920faae96a..8001af35913 100644 --- a/tests/integration/test_allowed_url_from_config/test.py +++ b/tests/integration/test_allowed_url_from_config/test.py @@ -53,20 +53,23 @@ def test_config_with_only_regexp_hosts(start_cluster): def test_config_without_allowed_hosts_section(start_cluster): assert node4.query("CREATE TABLE table_test_4_1 (word String) Engine=URL('https://host:80', CSV)") == "" - assert node4.query("CREATE TABLE table_test_4_2 (word String) Engine=URL('https://host', HDFS)") == "" - assert node4.query("CREATE TABLE table_test_4_3 (word String) Engine=URL('https://yandex.ru', CSV)") == "" - assert node4.query("CREATE TABLE table_test_4_4 (word String) Engine=URL('ftp://something.com', S3)") == "" + assert node4.query("CREATE TABLE table_test_4_2 (word String) Engine=S3('https://host:80/bucket/key', CSV)") == "" + assert node4.query("CREATE TABLE table_test_4_3 (word String) Engine=URL('https://host', HDFS)") == "" + assert node4.query("CREATE TABLE table_test_4_4 (word String) Engine=URL('https://yandex.ru', CSV)") == "" + assert node4.query("CREATE TABLE table_test_4_5 (word String) Engine=URL('ftp://something.com', S3)") == "" def test_config_without_allowed_hosts(start_cluster): assert "not allowed" in node5.query_and_get_error( "CREATE TABLE table_test_5_1 (word String) Engine=URL('https://host:80', CSV)") assert "not allowed" in node5.query_and_get_error( - "CREATE TABLE table_test_5_2 (word String) Engine=URL('https://host', HDFS)") + "CREATE TABLE table_test_5_2 (word String) Engine=S3('https://host:80/bucket/key', CSV)") assert "not allowed" in node5.query_and_get_error( - "CREATE TABLE table_test_5_3 (word String) Engine=URL('https://yandex.ru', CSV)") + "CREATE TABLE table_test_5_3 (word String) Engine=URL('https://host', HDFS)") assert "not allowed" in node5.query_and_get_error( - "CREATE TABLE table_test_5_4 (word String) Engine=URL('ftp://something.com', S3)") + "CREATE TABLE table_test_5_4 (word String) Engine=URL('https://yandex.ru', CSV)") + assert "not allowed" in node5.query_and_get_error( + "CREATE TABLE table_test_5_5 (word String) Engine=URL('ftp://something.com', S3)") def test_table_function_remote(start_cluster): From 7ce0ef2561deda64192a2a0531dcc054b6ea1c60 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Mon, 8 Feb 2021 12:14:17 +0300 Subject: [PATCH 0167/2357] show clusters for replicated db --- src/Databases/DatabaseReplicated.cpp | 108 +++++++++++++++++- src/Databases/DatabaseReplicated.h | 8 +- src/Databases/DatabaseReplicatedWorker.cpp | 2 +- src/Interpreters/DDLWorker.cpp | 2 +- src/Interpreters/InterpreterCreateQuery.cpp | 29 +++-- src/Storages/System/StorageSystemClusters.cpp | 66 ++++++----- src/Storages/System/StorageSystemClusters.h | 3 + tests/queries/skip_list.json | 12 ++ 8 files changed, 186 insertions(+), 44 deletions(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 5a11787331c..43568379632 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -36,8 +36,11 @@ namespace ErrorCodes extern const int UNKNOWN_DATABASE; extern const int NOT_IMPLEMENTED; extern const int INCORRECT_QUERY; + extern const int ALL_CONNECTION_TRIES_FAILED; } +static constexpr const char * DROPPED_MARK = "DROPPED"; + 
zkutil::ZooKeeperPtr DatabaseReplicated::getZooKeeper() const { return global_context.getZooKeeper(); @@ -68,6 +71,8 @@ DatabaseReplicated::DatabaseReplicated( throw Exception("ZooKeeper path, shard and replica names must be non-empty", ErrorCodes::BAD_ARGUMENTS); if (shard_name.find('/') != std::string::npos || replica_name.find('/') != std::string::npos) throw Exception("Shard and replica names should not contain '/'", ErrorCodes::BAD_ARGUMENTS); + if (shard_name.find('|') != std::string::npos || replica_name.find('|') != std::string::npos) + throw Exception("Shard and replica names should not contain '|'", ErrorCodes::BAD_ARGUMENTS); if (zookeeper_path.back() == '/') zookeeper_path.resize(zookeeper_path.size() - 1); @@ -90,7 +95,7 @@ DatabaseReplicated::DatabaseReplicated( createDatabaseNodesInZooKeeper(current_zookeeper); } - replica_path = zookeeper_path + "/replicas/" + shard_name + "/" + replica_name; + replica_path = zookeeper_path + "/replicas/" + getFullReplicaName(); String replica_host_id; if (current_zookeeper->tryGet(replica_path, replica_host_id)) @@ -110,6 +115,93 @@ DatabaseReplicated::DatabaseReplicated( } } +String DatabaseReplicated::getFullReplicaName() const +{ + return shard_name + '|' + replica_name; +} + +std::pair DatabaseReplicated::parseFullReplicaName(const String & name) +{ + String shard; + String replica; + auto pos = name.find('|'); + if (pos == std::string::npos || name.find('|', pos + 1) != std::string::npos) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Incorrect replica identifier: {}", name); + shard = name.substr(0, pos); + replica = name.substr(pos + 1); + return {shard, replica}; +} + +ClusterPtr DatabaseReplicated::getCluster() const +{ + Strings hosts; + Strings host_ids; + + auto zookeeper = global_context.getZooKeeper(); + constexpr int max_retries = 10; + int iteration = 0; + bool success = false; + while (++iteration <= max_retries) + { + host_ids.resize(0); + Coordination::Stat stat; + hosts = zookeeper->getChildren(zookeeper_path + "/replicas", &stat); + if (hosts.empty()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "No hosts found"); + Int32 cver = stat.cversion; + + std::vector futures; + futures.reserve(hosts.size()); + host_ids.reserve(hosts.size()); + for (const auto & host : hosts) + futures.emplace_back(zookeeper->asyncTryGet(zookeeper_path + "/replicas/" + host)); + + success = true; + for (auto & future : futures) + { + auto res = future.get(); + if (res.error != Coordination::Error::ZOK) + success = false; + host_ids.emplace_back(res.data); + } + + zookeeper->get(zookeeper_path + "/replicas", &stat); + if (success && cver == stat.version) + break; + } + if (!success) + throw Exception(ErrorCodes::ALL_CONNECTION_TRIES_FAILED, "Cannot get consistent cluster snapshot"); + + assert(!hosts.empty()); + assert(hosts.size() == host_ids.size()); + std::sort(hosts.begin(), hosts.end()); + String current_shard = parseFullReplicaName(hosts.front()).first; + std::vector shards; + shards.emplace_back(); + for (size_t i = 0; i < hosts.size(); ++i) + { + const auto & id = host_ids[i]; + if (id == DROPPED_MARK) + continue; + auto [shard, replica] = parseFullReplicaName(hosts[i]); + auto pos = id.find(':'); + String host = id.substr(0, pos); + if (shard != current_shard) + { + current_shard = shard; + if (!shards.back().empty()) + shards.emplace_back(); + } + shards.back().emplace_back(unescapeForFileName(host)); + } + + /// TODO make it configurable + String username = "default"; + String password; + + return 
std::make_shared(global_context.getSettingsRef(), shards, username, password, global_context.getTCPPort(), false); +} + bool DatabaseReplicated::createDatabaseNodesInZooKeeper(const zkutil::ZooKeeperPtr & current_zookeeper) { current_zookeeper->createAncestors(zookeeper_path); @@ -139,8 +231,6 @@ bool DatabaseReplicated::createDatabaseNodesInZooKeeper(const zkutil::ZooKeeperP void DatabaseReplicated::createReplicaNodesInZooKeeper(const zkutil::ZooKeeperPtr & current_zookeeper) { - current_zookeeper->createAncestors(replica_path); - /// When creating new replica, use latest snapshot version as initial value of log_pointer //log_entry_to_execute = 0; //FIXME @@ -296,9 +386,15 @@ ASTPtr DatabaseReplicated::parseQueryFromMetadataInZooKeeper(const String & node void DatabaseReplicated::drop(const Context & context_) { auto current_zookeeper = getZooKeeper(); - current_zookeeper->set(replica_path, "DROPPED"); + current_zookeeper->set(replica_path, DROPPED_MARK); DatabaseAtomic::drop(context_); current_zookeeper->tryRemoveRecursive(replica_path); + /// TODO it may leave garbage in ZooKeeper if the last node lost connection here + if (current_zookeeper->tryRemove(zookeeper_path + "/replicas") == Coordination::Error::ZOK) + { + /// It was the last replica, remove all metadata + current_zookeeper->tryRemoveRecursive(zookeeper_path); + } } void DatabaseReplicated::stopReplication() @@ -318,7 +414,7 @@ void DatabaseReplicated::shutdown() void DatabaseReplicated::dropTable(const Context & context, const String & table_name, bool no_delay) { auto txn = context.getMetadataTransaction(); - //assert(!ddl_worker->isCurrentlyActive() || txn /*|| called from DROP DATABASE */); + assert(!ddl_worker->isCurrentlyActive() || txn); if (txn && txn->is_initial_query) { String metadata_zk_path = zookeeper_path + "/metadata/" + escapeForFileName(table_name); @@ -335,6 +431,8 @@ void DatabaseReplicated::renameTable(const Context & context, const String & tab if (txn->is_initial_query) { + if (this != &to_database) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Moving tables between databases is not supported for Replicated engine"); if (!isTableExist(table_name, context)) throw Exception(ErrorCodes::UNKNOWN_TABLE, "Table {} does not exist", table_name); if (exchange && !to_database.isTableExist(to_table_name, context)) diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index a866a61558c..0f500b16470 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -15,6 +15,9 @@ namespace DB class DatabaseReplicatedDDLWorker; using ZooKeeperPtr = std::shared_ptr; +class Cluster; +using ClusterPtr = std::shared_ptr; + /** DatabaseReplicated engine * supports replication of metadata * via DDL log being written to ZooKeeper @@ -67,7 +70,10 @@ public: void loadStoredObjects(Context & context, bool has_force_restore_data_flag, bool force_attach) override; - String getFullReplicaName() const { return shard_name + '|' + replica_name; } + String getFullReplicaName() const; + static std::pair parseFullReplicaName(const String & name); + + ClusterPtr getCluster() const; //FIXME friend struct DatabaseReplicatedTask; diff --git a/src/Databases/DatabaseReplicatedWorker.cpp b/src/Databases/DatabaseReplicatedWorker.cpp index 1c000a8f0a7..748305922b7 100644 --- a/src/Databases/DatabaseReplicatedWorker.cpp +++ b/src/Databases/DatabaseReplicatedWorker.cpp @@ -208,7 +208,7 @@ DDLTaskPtr DatabaseReplicatedDDLWorker::initAndCheckTask(const String & entry_na if 
(task->is_initial_query) { assert(!zookeeper->exists(entry_path + "/try")); - assert(zookeeper->exists(entry_path + "/committed") == (zookeeper->get(task->getFinishedNodePath()) == "0")); + assert(zookeeper->exists(entry_path + "/committed") == (zookeeper->get(task->getFinishedNodePath()) == ExecutionStatus(0).serializeText())); out_reason = fmt::format("Entry {} has been executed as initial query", entry_name); return {}; } diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index da2e878541d..f0cc3370211 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -488,7 +488,7 @@ void DDLWorker::processTask(DDLTaskBase & task) /// updating metadata in Replicated database), so we make create request for finished_node_path with status "0", /// which means that query executed successfully. task.ops.emplace_back(zkutil::makeRemoveRequest(active_node_path, -1)); - task.ops.emplace_back(zkutil::makeCreateRequest(finished_node_path, "0", zkutil::CreateMode::Persistent)); + task.ops.emplace_back(zkutil::makeCreateRequest(finished_node_path, ExecutionStatus(0).serializeText(), zkutil::CreateMode::Persistent)); try { diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 6af212172b2..be241339ef7 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -827,17 +827,28 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create) if (create.attach_from_path) { - fs::path data_path = fs::path(*create.attach_from_path).lexically_normal(); fs::path user_files = fs::path(context.getUserFilesPath()).lexically_normal(); - if (data_path.is_relative()) - data_path = (user_files / data_path).lexically_normal(); - if (!startsWith(data_path, user_files)) - throw Exception(ErrorCodes::PATH_ACCESS_DENIED, - "Data directory {} must be inside {} to attach it", String(data_path), String(user_files)); - fs::path root_path = fs::path(context.getPath()).lexically_normal(); - /// Data path must be relative to root_path - create.attach_from_path = fs::relative(data_path, root_path) / ""; + + if (context.getClientInfo().query_kind == ClientInfo::QueryKind::INITIAL_QUERY) + { + fs::path data_path = fs::path(*create.attach_from_path).lexically_normal(); + if (data_path.is_relative()) + data_path = (user_files / data_path).lexically_normal(); + if (!startsWith(data_path, user_files)) + throw Exception(ErrorCodes::PATH_ACCESS_DENIED, + "Data directory {} must be inside {} to attach it", String(data_path), String(user_files)); + + /// Data path must be relative to root_path + create.attach_from_path = fs::relative(data_path, root_path) / ""; + } + else + { + fs::path data_path = (root_path / *create.attach_from_path).lexically_normal(); + if (!startsWith(data_path, user_files)) + throw Exception(ErrorCodes::PATH_ACCESS_DENIED, + "Data directory {} must be inside {} to attach it", String(data_path), String(user_files)); + } } else if (create.attach && !create.attach_short_syntax && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) { diff --git a/src/Storages/System/StorageSystemClusters.cpp b/src/Storages/System/StorageSystemClusters.cpp index ae8bcca2804..62ad1c5150f 100644 --- a/src/Storages/System/StorageSystemClusters.cpp +++ b/src/Storages/System/StorageSystemClusters.cpp @@ -3,6 +3,7 @@ #include #include #include +#include namespace DB { @@ -26,40 +27,51 @@ NamesAndTypesList StorageSystemClusters::getNamesAndTypes() }; } + 
void StorageSystemClusters::fillData(MutableColumns & res_columns, const Context & context, const SelectQueryInfo &) const { for (const auto & name_and_cluster : context.getClusters().getContainer()) + writeCluster(res_columns, name_and_cluster); + + const auto databases = DatabaseCatalog::instance().getDatabases(); + for (const auto & name_and_database : databases) { - const String & cluster_name = name_and_cluster.first; - const ClusterPtr & cluster = name_and_cluster.second; - const auto & shards_info = cluster->getShardsInfo(); - const auto & addresses_with_failover = cluster->getShardsAddresses(); + if (const auto * replicated = typeid_cast(name_and_database.second.get())) + writeCluster(res_columns, {name_and_database.first, replicated->getCluster()}); + } +} - for (size_t shard_index = 0; shard_index < shards_info.size(); ++shard_index) +void StorageSystemClusters::writeCluster(MutableColumns & res_columns, const NameAndCluster & name_and_cluster) const +{ + const String & cluster_name = name_and_cluster.first; + const ClusterPtr & cluster = name_and_cluster.second; + const auto & shards_info = cluster->getShardsInfo(); + const auto & addresses_with_failover = cluster->getShardsAddresses(); + + for (size_t shard_index = 0; shard_index < shards_info.size(); ++shard_index) + { + const auto & shard_info = shards_info[shard_index]; + const auto & shard_addresses = addresses_with_failover[shard_index]; + const auto pool_status = shard_info.pool->getStatus(); + + for (size_t replica_index = 0; replica_index < shard_addresses.size(); ++replica_index) { - const auto & shard_info = shards_info[shard_index]; - const auto & shard_addresses = addresses_with_failover[shard_index]; - const auto pool_status = shard_info.pool->getStatus(); + size_t i = 0; + const auto & address = shard_addresses[replica_index]; - for (size_t replica_index = 0; replica_index < shard_addresses.size(); ++replica_index) - { - size_t i = 0; - const auto & address = shard_addresses[replica_index]; - - res_columns[i++]->insert(cluster_name); - res_columns[i++]->insert(shard_info.shard_num); - res_columns[i++]->insert(shard_info.weight); - res_columns[i++]->insert(replica_index + 1); - res_columns[i++]->insert(address.host_name); - auto resolved = address.getResolvedAddress(); - res_columns[i++]->insert(resolved ? resolved->host().toString() : String()); - res_columns[i++]->insert(address.port); - res_columns[i++]->insert(address.is_local); - res_columns[i++]->insert(address.user); - res_columns[i++]->insert(address.default_database); - res_columns[i++]->insert(pool_status[replica_index].error_count); - res_columns[i++]->insert(pool_status[replica_index].estimated_recovery_time.count()); - } + res_columns[i++]->insert(cluster_name); + res_columns[i++]->insert(shard_info.shard_num); + res_columns[i++]->insert(shard_info.weight); + res_columns[i++]->insert(replica_index + 1); + res_columns[i++]->insert(address.host_name); + auto resolved = address.getResolvedAddress(); + res_columns[i++]->insert(resolved ? 
resolved->host().toString() : String()); + res_columns[i++]->insert(address.port); + res_columns[i++]->insert(address.is_local); + res_columns[i++]->insert(address.user); + res_columns[i++]->insert(address.default_database); + res_columns[i++]->insert(pool_status[replica_index].error_count); + res_columns[i++]->insert(pool_status[replica_index].estimated_recovery_time.count()); } } } diff --git a/src/Storages/System/StorageSystemClusters.h b/src/Storages/System/StorageSystemClusters.h index 4cda7c372b2..68282f1b1fe 100644 --- a/src/Storages/System/StorageSystemClusters.h +++ b/src/Storages/System/StorageSystemClusters.h @@ -10,6 +10,7 @@ namespace DB { class Context; +class Cluster; /** Implements system table 'clusters' * that allows to obtain information about available clusters @@ -25,8 +26,10 @@ public: protected: using IStorageSystemOneBlock::IStorageSystemOneBlock; + using NameAndCluster = std::pair>; void fillData(MutableColumns & res_columns, const Context & context, const SelectQueryInfo & query_info) const override; + void writeCluster(MutableColumns & res_columns, const NameAndCluster & name_and_cluster) const; }; } diff --git a/tests/queries/skip_list.json b/tests/queries/skip_list.json index adee777f900..4c6927f575a 100644 --- a/tests/queries/skip_list.json +++ b/tests/queries/skip_list.json @@ -103,7 +103,19 @@ "memory_tracking", /// FIXME remove it before merge "memory_tracking", "memory_usage", + "01686_rocksdb", + "01550_mutation_subquery", + "01070_mutations_with_dependencies", + "01070_materialize_ttl", + "01055_compact_parts", + "01017_mutations_with_nondeterministic_functions_zookeeper", + "00926_adaptive_index_granularity_pk", + "00910_zookeeper_test_alter_compression_codecs", + "00908_bloom_filter_index", + "00616_final_single_part", + "00446_clear_column_in_partition_zookeeper", "01533_multiple_nested", + "01213_alter_rename_column_zookeeper", "01575_disable_detach_table_of_dictionary", "01457_create_as_table_function_structure", "01415_inconsistent_merge_tree_settings", From cd94f708a16dc1135637fa7d9bf852317531798d Mon Sep 17 00:00:00 2001 From: Anton Ivashkin Date: Fri, 5 Feb 2021 20:13:44 +0300 Subject: [PATCH 0168/2357] Fix build after merge --- src/Disks/DiskDecorator.h | 7 +++---- src/Disks/IStoragePolicy.h | 2 ++ src/Disks/S3/DiskS3.cpp | 8 ++------ src/Disks/StoragePolicy.h | 2 +- 4 files changed, 8 insertions(+), 11 deletions(-) diff --git a/src/Disks/DiskDecorator.h b/src/Disks/DiskDecorator.h index 18f43e1b9b6..d5ac6f0fda0 100644 --- a/src/Disks/DiskDecorator.h +++ b/src/Disks/DiskDecorator.h @@ -49,10 +49,9 @@ public: void setReadOnly(const String & path) override; void createHardLink(const String & src_path, const String & dst_path) override; void truncateFile(const String & path, size_t size) override; - int open(const String & path, mode_t mode) const override; - void close(int fd) const override; - void sync(int fd) const override; - const String getType() const override { return delegate->getType(); } + int open(const String & path, mode_t mode) const; + void close(int fd) const; + void sync(int fd) const; String getUniqueId(const String & path) const override { return delegate->getUniqueId(path); } bool checkUniqueId(const String & id) const override { return delegate->checkUniqueId(id); } DiskType::Type getType() const override { return delegate->getType(); } diff --git a/src/Disks/IStoragePolicy.h b/src/Disks/IStoragePolicy.h index a41ea87c328..957021441b8 100644 --- a/src/Disks/IStoragePolicy.h +++ b/src/Disks/IStoragePolicy.h @@ -36,6 +36,7 @@ 
public: /// mutations files virtual DiskPtr getAnyDisk() const = 0; virtual DiskPtr getDiskByName(const String & disk_name) const = 0; + virtual Disks getDisksByType(const String & type) const = 0; /// Get free space from most free disk virtual UInt64 getMaxUnreservedFreeSpace() const = 0; /// Reserves space on any volume with index > min_volume_index or returns nullptr @@ -57,6 +58,7 @@ public: /// Check if we have any volume with stopped merges virtual bool hasAnyVolumeWithDisabledMerges() const = 0; virtual bool containsVolume(const String & volume_name) const = 0; + /// Returns disks by type ordered by volumes priority }; } diff --git a/src/Disks/S3/DiskS3.cpp b/src/Disks/S3/DiskS3.cpp index f223e423256..56789cf6327 100644 --- a/src/Disks/S3/DiskS3.cpp +++ b/src/Disks/S3/DiskS3.cpp @@ -23,12 +23,8 @@ #include #include #include -<<<<<<< HEAD -#include -======= #include #include ->>>>>>> master #include @@ -985,10 +981,10 @@ bool DiskS3::checkUniqueId(const String & id) const { /// Check that we have right s3 and have access rights /// Actually interprets id as s3 object name and checks if it exists - Aws::S3::Model::ListObjectsRequest request; + Aws::S3::Model::ListObjectsV2Request request; request.SetBucket(bucket); request.SetPrefix(id); - auto resp = client->ListObjects(request); + auto resp = client->ListObjectsV2(request); throwIfError(resp); Aws::Vector object_list = resp.GetResult().GetContents(); diff --git a/src/Disks/StoragePolicy.h b/src/Disks/StoragePolicy.h index 5cc92e1ede7..7e72fcda8b1 100644 --- a/src/Disks/StoragePolicy.h +++ b/src/Disks/StoragePolicy.h @@ -48,7 +48,7 @@ public: Disks getDisks() const override; /// Returns disks by type ordered by volumes priority - Disks getDisksByType(const String & type) const; + Disks getDisksByType(const String & type) const override; /// Returns any disk /// Used when it's not important, for example for From 91d0924665401514396ed30ef6c01c8212b0b4bb Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Mon, 8 Feb 2021 12:46:30 +0300 Subject: [PATCH 0169/2357] write dictionaries metadata to zk --- src/Databases/DatabaseReplicated.cpp | 30 +++++++++++++++++++++ src/Databases/DatabaseReplicated.h | 4 +++ src/Databases/DatabaseWithDictionaries.cpp | 12 ++++++++- src/Interpreters/InterpreterCreateQuery.cpp | 7 +++++ src/Interpreters/InterpreterDropQuery.cpp | 13 +++++++++ 5 files changed, 65 insertions(+), 1 deletion(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 43568379632..a134ba5dec7 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -303,6 +303,9 @@ BlockIO DatabaseReplicated::propose(const ASTPtr & query, const Context & query_ if (query_context.getClientInfo().query_kind != ClientInfo::QueryKind::INITIAL_QUERY) throw Exception(ErrorCodes::INCORRECT_QUERY, "It's not initial query. 
ON CLUSTER is not allowed for Replicated database."); + if (auto * ddl_query = query->as()) + ddl_query->database.clear(); + if (const auto * query_alter = query->as()) { for (const auto & command : query_alter->command_list->children) @@ -493,4 +496,31 @@ void DatabaseReplicated::commitAlterTable(const StorageID & table_id, DatabaseAtomic::commitAlterTable(table_id, table_metadata_tmp_path, table_metadata_path, statement, query_context); } +void DatabaseReplicated::createDictionary(const Context & context, + const String & dictionary_name, + const ASTPtr & query) +{ + auto txn = context.getMetadataTransaction(); + assert(!ddl_worker->isCurrentlyActive() || txn); + if (txn && txn->is_initial_query) + { + String metadata_zk_path = txn->zookeeper_path + "/metadata/" + escapeForFileName(dictionary_name); + String statement = getObjectDefinitionFromCreateQuery(query->clone()); + txn->ops.emplace_back(zkutil::makeCreateRequest(metadata_zk_path, statement, zkutil::CreateMode::Persistent)); + } + DatabaseAtomic::createDictionary(context, dictionary_name, query); +} + +void DatabaseReplicated::removeDictionary(const Context & context, const String & dictionary_name) +{ + auto txn = context.getMetadataTransaction(); + assert(!ddl_worker->isCurrentlyActive() || txn); + if (txn && txn->is_initial_query) + { + String metadata_zk_path = zookeeper_path + "/metadata/" + escapeForFileName(dictionary_name); + txn->ops.emplace_back(zkutil::makeRemoveRequest(metadata_zk_path, -1)); + } + DatabaseAtomic::removeDictionary(context, dictionary_name); +} + } diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index 0f500b16470..c39321f0caa 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -58,6 +58,10 @@ public: void commitAlterTable(const StorageID & table_id, const String & table_metadata_tmp_path, const String & table_metadata_path, const String & statement, const Context & query_context) override; + void createDictionary(const Context & context, + const String & dictionary_name, + const ASTPtr & query) override; + void removeDictionary(const Context & context, const String & dictionary_name) override; void drop(const Context & /*context*/) override; diff --git a/src/Databases/DatabaseWithDictionaries.cpp b/src/Databases/DatabaseWithDictionaries.cpp index ee16f4ae15e..7ce5de56b64 100644 --- a/src/Databases/DatabaseWithDictionaries.cpp +++ b/src/Databases/DatabaseWithDictionaries.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -193,6 +194,10 @@ void DatabaseWithDictionaries::createDictionary(const Context & context, const S detachDictionary(dictionary_name); }); + auto txn = context.getMetadataTransaction(); + if (txn && !context.isInternalSubquery()) + txn->commit(); /// Commit point (a sort of) for Replicated database + /// If it was ATTACH query and file with dictionary metadata already exist /// (so, ATTACH is done after DETACH), then rename atomically replaces old file with new one. 
Poco::File(dictionary_metadata_tmp_path).renameTo(dictionary_metadata_path); @@ -205,7 +210,7 @@ void DatabaseWithDictionaries::createDictionary(const Context & context, const S succeeded = true; } -void DatabaseWithDictionaries::removeDictionary(const Context &, const String & dictionary_name) +void DatabaseWithDictionaries::removeDictionary(const Context & context, const String & dictionary_name) { DictionaryAttachInfo attach_info; detachDictionaryImpl(dictionary_name, attach_info); @@ -213,6 +218,11 @@ void DatabaseWithDictionaries::removeDictionary(const Context &, const String & try { String dictionary_metadata_path = getObjectMetadataPath(dictionary_name); + + auto txn = context.getMetadataTransaction(); + if (txn && !context.isInternalSubquery()) + txn->commit(); /// Commit point (a sort of) for Replicated database + Poco::File(dictionary_metadata_path).remove(); CurrentStatusInfo::unset(CurrentStatusInfo::DictionaryStatus, StorageID(attach_info.create_query).getInternalDictionaryName()); diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index be241339ef7..376bf8417ff 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -1107,6 +1107,13 @@ BlockIO InterpreterCreateQuery::createDictionary(ASTCreateQuery & create) auto guard = DatabaseCatalog::instance().getDDLGuard(database_name, dictionary_name); DatabasePtr database = DatabaseCatalog::instance().getDatabase(database_name); + if (typeid_cast(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) + { + assertOrSetUUID(create, database); + guard->releaseTableLock(); + return typeid_cast(database.get())->propose(query_ptr, context); + } + if (database->isDictionaryExist(dictionary_name)) { /// TODO Check structure of dictionary diff --git a/src/Interpreters/InterpreterDropQuery.cpp b/src/Interpreters/InterpreterDropQuery.cpp index b22d46358f9..e6943f06e06 100644 --- a/src/Interpreters/InterpreterDropQuery.cpp +++ b/src/Interpreters/InterpreterDropQuery.cpp @@ -212,6 +212,19 @@ BlockIO InterpreterDropQuery::executeToDictionary( DatabasePtr database = tryGetDatabase(database_name, if_exists); + bool is_drop_or_detach_database = query_ptr->as()->table.empty(); + bool is_replicated_ddl_query = typeid_cast(database.get()) && + context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY && + !is_drop_or_detach_database; + if (is_replicated_ddl_query) + { + if (kind == ASTDropQuery::Kind::Detach) + throw Exception(ErrorCodes::INCORRECT_QUERY, "DETACH DICTIONARY is not allowed for Replicated databases."); + + ddl_guard->releaseTableLock(); + return typeid_cast(database.get())->propose(query_ptr, context); + } + if (!database || !database->isDictionaryExist(dictionary_name)) { if (!if_exists) From 12b925dd7a3878b9861c593fda973bd9ee54312a Mon Sep 17 00:00:00 2001 From: Pavel Kruglov Date: Mon, 8 Feb 2021 14:06:45 +0300 Subject: [PATCH 0170/2357] Small update --- src/Client/HedgedConnections.cpp | 4 ++-- src/Client/HedgedConnectionsFactory.cpp | 19 +++++++------------ src/Client/HedgedConnectionsFactory.h | 12 +++++------- .../integration/test_hedged_requests/test.py | 2 +- .../test_hedged_requests_parallel/test.py | 2 +- 5 files changed, 16 insertions(+), 23 deletions(-) diff --git a/src/Client/HedgedConnections.cpp b/src/Client/HedgedConnections.cpp index 65100a7ea41..bba17f9dcad 100644 --- a/src/Client/HedgedConnections.cpp +++ b/src/Client/HedgedConnections.cpp @@ 
-409,13 +409,13 @@ void HedgedConnections::processTimeoutEvent(ReplicaLocation & replica_location, void HedgedConnections::tryGetNewReplica(bool start_new_connection) { Connection * connection = nullptr; - HedgedConnectionsFactory::State state = hedged_connections_factory.getNextConnection(start_new_connection, connection); + HedgedConnectionsFactory::State state = hedged_connections_factory.getNextConnection(start_new_connection, false, connection); /// Skip replicas that doesn't support two-level aggregation if we didn't disable it in sendQuery. while (state == HedgedConnectionsFactory::State::READY && !disable_two_level_aggregation && connection->getServerRevision(hedged_connections_factory.getConnectionTimeouts()) < DBMS_MIN_REVISION_WITH_CURRENT_AGGREGATION_VARIANT_SELECTION_METHOD) - state = hedged_connections_factory.getNextConnection(true, connection); + state = hedged_connections_factory.getNextConnection(true, false, connection); if (state == HedgedConnectionsFactory::State::READY) { diff --git a/src/Client/HedgedConnectionsFactory.cpp b/src/Client/HedgedConnectionsFactory.cpp index 84848949fb9..84f0384f377 100644 --- a/src/Client/HedgedConnectionsFactory.cpp +++ b/src/Client/HedgedConnectionsFactory.cpp @@ -76,7 +76,7 @@ std::vector HedgedConnectionsFactory::getManyConnections(PoolMode Connection * connection = nullptr; while (connections.size() < max_entries) { - auto state = processConnections(true, connection); + auto state = getNextConnection(false, true, connection); if (state == State::READY) connections.push_back(connection); else if (state == State::CANNOT_CHOOSE) @@ -100,15 +100,18 @@ std::vector HedgedConnectionsFactory::getManyConnections(PoolMode return connections; } -HedgedConnectionsFactory::State HedgedConnectionsFactory::getNextConnection(bool start_new_connection, Connection *& connection_out) +HedgedConnectionsFactory::State HedgedConnectionsFactory::getNextConnection(bool start_new_connection, bool blocking, Connection *& connection_out) { + ReplicaStatePtr replica = nullptr; + int index = -1; + if (start_new_connection) { /// Try to start establishing connection to the new replica. - int index = getNextIndex(); + index = getNextIndex(); if (index != -1) { - ReplicaStatePtr replica = startEstablishingConnection(index); + replica = startEstablishingConnection(index); if (replica->state == State::READY) { connection_out = replica->connection; @@ -117,14 +120,6 @@ HedgedConnectionsFactory::State HedgedConnectionsFactory::getNextConnection(bool } } - return processConnections(false, connection_out); -} - -HedgedConnectionsFactory::State HedgedConnectionsFactory::processConnections(bool blocking, Connection *& connection_out) -{ - ReplicaStatePtr replica = nullptr; - int index = -1; - while (index != -1 || !epoll.empty()) { if (index != -1) diff --git a/src/Client/HedgedConnectionsFactory.h b/src/Client/HedgedConnectionsFactory.h index 799e16bb068..dbafed7f3d1 100644 --- a/src/Client/HedgedConnectionsFactory.h +++ b/src/Client/HedgedConnectionsFactory.h @@ -74,14 +74,12 @@ public: /// Create and return active connections according to pool_mode. std::vector getManyConnections(PoolMode pool_mode); - /// Try to get connection to the new replica without blocking. If start_new_connection is true, we start establishing connection - /// with the new replica and then call processConnections, otherwise just call processConnections. 
- State getNextConnection(bool start_new_connection, Connection *& connection_out); - - /// Process all current events in epoll (connections, timeouts), if there is no events in epoll and blocking is false, - /// return NOT_READY. Returned state might be READY, NOT_READY and CANNOT_CHOOSE. + /// Try to get connection to the new replica. If start_new_connection is true, we start establishing connection + /// with the new replica. Process all current events in epoll (connections, timeouts), + /// if there is no events in epoll and blocking is false, return NOT_READY. + /// Returned state might be READY, NOT_READY and CANNOT_CHOOSE. /// If state is READY, replica connection will be written in connection_out. - State processConnections(bool blocking, Connection *& connection_out); + State getNextConnection(bool start_new_connection, bool blocking, Connection *& connection_out); /// Check if we can try to produce new READY replica. bool canGetNewConnection() const { return ready_indexes.size() + failed_pools_count < shuffled_pools.size(); } diff --git a/tests/integration/test_hedged_requests/test.py b/tests/integration/test_hedged_requests/test.py index 992590b516f..20602b1af0a 100644 --- a/tests/integration/test_hedged_requests/test.py +++ b/tests/integration/test_hedged_requests/test.py @@ -61,7 +61,7 @@ def process_test(sleep_setting_name, receive_timeout_name): print(query_time) -def test(started_cluster): +def test_hedged_requests(started_cluster): node.query("INSERT INTO distributed VALUES (1, '2020-01-01')") process_test("sleep_before_send_hello", "receive_hello_timeout") diff --git a/tests/integration/test_hedged_requests_parallel/test.py b/tests/integration/test_hedged_requests_parallel/test.py index b662fc9d80c..08c5c0d3cd1 100644 --- a/tests/integration/test_hedged_requests_parallel/test.py +++ b/tests/integration/test_hedged_requests_parallel/test.py @@ -41,7 +41,7 @@ def started_cluster(): finally: cluster.shutdown() -def test(started_cluster): +def test_hedged_requests_with_max_parallel_replicas(started_cluster): node.query("INSERT INTO distributed VALUES (1, '2020-01-01')") # Without hedged requests select query will last more 30 seconds, From 4d44d75bc74666c11e08ccabfb11b34a1d093558 Mon Sep 17 00:00:00 2001 From: Anton Ivashkin Date: Mon, 8 Feb 2021 14:45:10 +0300 Subject: [PATCH 0171/2357] Fix build after merge one more time --- src/Disks/StoragePolicy.cpp | 2 +- src/Storages/MergeTree/DataPartsExchange.cpp | 6 +++--- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 8 ++++---- src/Storages/StorageReplicatedMergeTree.cpp | 6 ++---- 4 files changed, 10 insertions(+), 12 deletions(-) diff --git a/src/Disks/StoragePolicy.cpp b/src/Disks/StoragePolicy.cpp index 55ccc39c58a..be40a5ae72d 100644 --- a/src/Disks/StoragePolicy.cpp +++ b/src/Disks/StoragePolicy.cpp @@ -164,7 +164,7 @@ Disks StoragePolicy::getDisksByType(const String & type) const Disks res; for (const auto & volume : volumes) for (const auto & disk : volume->getDisks()) - if (disk->getType() == type) + if (DB::DiskType::toString(disk->getType()) == type) res.push_back(disk); return res; } diff --git a/src/Storages/MergeTree/DataPartsExchange.cpp b/src/Storages/MergeTree/DataPartsExchange.cpp index d93e16fe154..d031989bfcd 100644 --- a/src/Storages/MergeTree/DataPartsExchange.cpp +++ b/src/Storages/MergeTree/DataPartsExchange.cpp @@ -157,7 +157,7 @@ void Service::processQuery(const Poco::Net::HTMLForm & params, ReadBuffer & /*bo if (send_s3_metadata == 1) { auto disk = part->volume->getDisk(); - if (disk->getType() == "s3") 
+ if (disk->getType() == DB::DiskType::Type::S3) { try_use_s3_copy = true; } @@ -262,7 +262,7 @@ void Service::sendPartS3Metadata(const MergeTreeData::DataPartPtr & part, WriteB checksums.files[file_name] = {}; auto disk = part->volume->getDisk(); - if (disk->getType() != "s3") + if (disk->getType() != DB::DiskType::Type::S3) throw Exception("S3 disk is not S3 anymore", ErrorCodes::LOGICAL_ERROR); part->lockSharedData(); @@ -347,7 +347,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::fetchPart( {"compress", "false"} }); - if (try_use_s3_copy && disk_s3 && disk_s3->getType() != "s3") + if (try_use_s3_copy && disk_s3 && disk_s3->getType() != DB::DiskType::Type::S3) throw Exception("Try to fetch shared s3 part on non-s3 disk", ErrorCodes::LOGICAL_ERROR); Disks disks_s3; diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index b5ab3c84558..5c35a8d0af3 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -1168,7 +1168,7 @@ void IMergeTreeDataPart::makeCloneOnDisk(const DiskPtr & disk, const String & di bool is_fetched = false; - if (disk->getType() == "s3") + if (disk->getType() == DB::DiskType::Type::S3) { auto data_settings = storage.getSettings(); if (data_settings->allow_s3_zero_copy_replication) @@ -1317,7 +1317,7 @@ String IMergeTreeDataPart::getUniqueId() const auto disk = volume->getDisk(); - if (disk->getType() == "s3") + if (disk->getType() == DB::DiskType::Type::S3) id = disk->getUniqueId(getFullRelativePath() + "checksums.txt"); if (id.empty()) @@ -1333,7 +1333,7 @@ void IMergeTreeDataPart::lockSharedData() const DiskPtr disk = volume->getDisk(); if (!disk) return; - if (disk->getType() != "s3") + if (disk->getType() != DB::DiskType::Type::S3) return; const StorageReplicatedMergeTree *replicated_storage = dynamic_cast(&storage); @@ -1384,7 +1384,7 @@ bool IMergeTreeDataPart::unlockSharedData(const String & path) const DiskPtr disk = volume->getDisk(); if (!disk) return true; - if (disk->getType() != "s3") + if (disk->getType() != DB::DiskType::Type::S3) return true; const StorageReplicatedMergeTree *replicated_storage = dynamic_cast(&storage); diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 6bea0bbd7f3..25d379e2960 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -1491,10 +1491,9 @@ bool StorageReplicatedMergeTree::tryExecuteMerge(const LogEntry & entry) future_merged_part.updatePath(*this, reserved_space); future_merged_part.merge_type = entry.merge_type; -<<<<<<< HEAD { auto disk = reserved_space->getDisk(); - if (disk->getType() == "s3") + if (disk->getType() == DB::DiskType::Type::S3) { auto zookeeper = getZooKeeper(); String zookeeper_node = zookeeper_path + "/zero_copy_s3/merged/" + entry.new_part_name; @@ -1522,11 +1521,10 @@ bool StorageReplicatedMergeTree::tryExecuteMerge(const LogEntry & entry) } } } -======= + /// Account TTL merge if (isTTLMergeType(future_merged_part.merge_type)) global_context.getMergeList().bookMergeWithTTL(); ->>>>>>> master auto table_id = getStorageID(); /// Add merge to list From 8efee9ed9a5db0d4cc773b7bf60760160bb8b79c Mon Sep 17 00:00:00 2001 From: romanzhukov Date: Mon, 8 Feb 2021 15:40:23 +0300 Subject: [PATCH 0172/2357] DOCSUP-5822: IN oper - supports diff types. 
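The build fix in patch 0171 above switches disk-type checks from string comparison (`disk->getType() == "s3"`) to the `DiskType::Type` enum, converting back to a string only in name-based interfaces such as `getDisksByType(const String &)`. A reduced sketch of that pattern, with simplified types that are not the real ClickHouse interfaces, might look like this:

```cpp
#include <memory>
#include <string>
#include <vector>

struct DiskType
{
    enum class Type { Local, S3 };

    static std::string toString(Type type)
    {
        return type == Type::S3 ? "s3" : "local";
    }
};

struct IDisk
{
    /// getType() returns the enum rather than a String, so typed call sites
    /// compare enum values instead of literals.
    virtual DiskType::Type getType() const = 0;
    virtual ~IDisk() = default;
};
using DiskPtr = std::shared_ptr<IDisk>;

struct DiskS3 : IDisk
{
    DiskType::Type getType() const override { return DiskType::Type::S3; }
};

/// Name-based lookup kept for interfaces that still receive the type as a string.
std::vector<DiskPtr> getDisksByType(const std::vector<DiskPtr> & disks, const std::string & type)
{
    std::vector<DiskPtr> res;
    for (const auto & disk : disks)
        if (DiskType::toString(disk->getType()) == type)
            res.push_back(disk);
    return res;
}

/// Typed check as used on the hot paths (e.g. deciding whether zero-copy S3
/// replication applies): no string comparison involved.
bool isS3Disk(const DiskPtr & disk)
{
    return disk->getType() == DiskType::Type::S3;
}
```

The enum form also lets the compiler catch misspelled type names that a literal such as `"s3"` would silently miss.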
--- docs/en/sql-reference/operators/in.md | 20 +++++++++++++++++++- docs/ru/sql-reference/operators/in.md | 18 +++++++++++++++++- 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/docs/en/sql-reference/operators/in.md b/docs/en/sql-reference/operators/in.md index bfa8b3d1003..5f928f12024 100644 --- a/docs/en/sql-reference/operators/in.md +++ b/docs/en/sql-reference/operators/in.md @@ -13,10 +13,28 @@ SELECT (CounterID, UserID) IN ((34, 123), (101500, 456)) FROM ... If the left side is a single column that is in the index, and the right side is a set of constants, the system uses the index for processing the query. -Don’t list too many values explicitly (i.e. millions). If a data set is large, put it in a temporary table (for example, see the section “External data for query processing”), then use a subquery. +Don’t list too many values explicitly (i.e. millions). If a data set is large, put it in a temporary table (for example, see the section [External data for query processing](../../engines/table-engines/special/external-data.md)), then use a subquery. The right side of the operator can be a set of constant expressions, a set of tuples with constant expressions (shown in the examples above), or the name of a database table or SELECT subquery in brackets. +ClickHouse allows different types inside `IN` subquery. For left hand side it applies type conversion to the type of right hand side. + +**Example** + +Query: + +``` sql +SELECT '1' IN (SELECT 1); +``` + +Result: + +``` text +┌─in('1', _subquery49)─┐ +│ 1 │ +└──────────────────────┘ +``` + If the right side of the operator is the name of a table (for example, `UserID IN users`), this is equivalent to the subquery `UserID IN (SELECT * FROM users)`. Use this when working with external data that is sent along with the query. For example, the query can be sent together with a set of user IDs loaded to the ‘users’ temporary table, which should be filtered. If the right side of the operator is a table name that has the Set engine (a prepared data set that is always in RAM), the data set will not be created over again for each query. diff --git a/docs/ru/sql-reference/operators/in.md b/docs/ru/sql-reference/operators/in.md index 4c1290df166..5a4fe95f108 100644 --- a/docs/ru/sql-reference/operators/in.md +++ b/docs/ru/sql-reference/operators/in.md @@ -13,10 +13,26 @@ SELECT (CounterID, UserID) IN ((34, 123), (101500, 456)) FROM ... Если слева стоит один столбец, входящий в индекс, а справа - множество констант, то при выполнении запроса, система воспользуется индексом. -Не перечисляйте слишком большое количество значений (миллионы) явно. Если множество большое - лучше загрузить его во временную таблицу (например, смотрите раздел «Внешние данные для обработки запроса»), и затем воспользоваться подзапросом. +Не перечисляйте слишком большое количество значений (миллионы) явно. Если множество большое - лучше загрузить его во временную таблицу (например, смотрите раздел [Внешние данные для обработки запроса](../../engines/table-engines/special/external-data.md)), и затем воспользоваться подзапросом. В качестве правой части оператора может быть множество константных выражений, множество кортежей с константными выражениями (показано в примерах выше), а также имя таблицы или подзапрос SELECT в скобках. 
+**Пример** + +Запрос: + +``` sql +SELECT '1' IN (SELECT 1); +``` + +Результат: + +``` text +┌─in('1', _subquery49)─┐ +│ 1 │ +└──────────────────────┘ +``` + Если в качестве правой части оператора указано имя таблицы (например, `UserID IN users`), то это эквивалентно подзапросу `UserID IN (SELECT * FROM users)`. Это используется при работе с внешними данными, отправляемым вместе с запросом. Например, вместе с запросом может быть отправлено множество идентификаторов посетителей, загруженное во временную таблицу users, по которому следует выполнить фильтрацию. Если в качестве правой части оператора, указано имя таблицы, имеющий движок Set (подготовленное множество, постоянно находящееся в оперативке), то множество не будет создаваться заново при каждом запросе. From 5647f0eb8c25fc302179661d77e27e8d5e7bf479 Mon Sep 17 00:00:00 2001 From: romanzhukov Date: Mon, 8 Feb 2021 15:51:33 +0300 Subject: [PATCH 0173/2357] DOCSUP-5822: IN oper - supports diff types. --- docs/en/sql-reference/operators/in.md | 2 +- docs/ru/sql-reference/operators/in.md | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/en/sql-reference/operators/in.md b/docs/en/sql-reference/operators/in.md index 5f928f12024..1b6531a57f8 100644 --- a/docs/en/sql-reference/operators/in.md +++ b/docs/en/sql-reference/operators/in.md @@ -17,7 +17,7 @@ Don’t list too many values explicitly (i.e. millions). If a data set is large The right side of the operator can be a set of constant expressions, a set of tuples with constant expressions (shown in the examples above), or the name of a database table or SELECT subquery in brackets. -ClickHouse allows different types inside `IN` subquery. For left hand side it applies type conversion to the type of right hand side. +ClickHouse allows different types inside `IN` subquery. For left hand side it applies type conversion to the type of right hand side with [accurateCastOrNull](../functions/type-conversion-functions.md#type_conversion_function-accurate-cast_or_null). **Example** diff --git a/docs/ru/sql-reference/operators/in.md b/docs/ru/sql-reference/operators/in.md index 5a4fe95f108..d86d6f9ec57 100644 --- a/docs/ru/sql-reference/operators/in.md +++ b/docs/ru/sql-reference/operators/in.md @@ -17,6 +17,8 @@ SELECT (CounterID, UserID) IN ((34, 123), (101500, 456)) FROM ... В качестве правой части оператора может быть множество константных выражений, множество кортежей с константными выражениями (показано в примерах выше), а также имя таблицы или подзапрос SELECT в скобках. +ClickHouse допускает различные типы внутри подзапроса `IN`. Для левой стороны он применяет преобразование к типу правой стороны с помощью [accurateCastOrNull](../functions/type-conversion-functions.md#type_conversion_function-accurate-cast_or_null). + **Пример** Запрос: From 78f5f416171a192c4c6dbad4dd79d069be389a43 Mon Sep 17 00:00:00 2001 From: romanzhukov Date: Mon, 8 Feb 2021 15:55:53 +0300 Subject: [PATCH 0174/2357] DOCSUP-5822: Minor text fix. --- docs/en/sql-reference/operators/in.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/operators/in.md b/docs/en/sql-reference/operators/in.md index 1b6531a57f8..a0dd0455c4d 100644 --- a/docs/en/sql-reference/operators/in.md +++ b/docs/en/sql-reference/operators/in.md @@ -17,7 +17,7 @@ Don’t list too many values explicitly (i.e. millions). 
If a data set is large The right side of the operator can be a set of constant expressions, a set of tuples with constant expressions (shown in the examples above), or the name of a database table or SELECT subquery in brackets. -ClickHouse allows different types inside `IN` subquery. For left hand side it applies type conversion to the type of right hand side with [accurateCastOrNull](../functions/type-conversion-functions.md#type_conversion_function-accurate-cast_or_null). +ClickHouse allows different types inside `IN` subquery. For left hand side it applies conversion to the type of right hand side with [accurateCastOrNull](../functions/type-conversion-functions.md#type_conversion_function-accurate-cast_or_null). **Example** From d3ebf59376f42efef5e5341c04f182cdcd11f51c Mon Sep 17 00:00:00 2001 From: Pavel Kruglov Date: Mon, 8 Feb 2021 16:08:15 +0300 Subject: [PATCH 0175/2357] Fix --- src/Client/HedgedConnections.cpp | 152 ++++++++++++------------ src/Client/HedgedConnections.h | 37 +++--- src/Client/HedgedConnectionsFactory.cpp | 2 +- src/Client/HedgedConnectionsFactory.h | 2 +- 4 files changed, 94 insertions(+), 99 deletions(-) diff --git a/src/Client/HedgedConnections.cpp b/src/Client/HedgedConnections.cpp index bba17f9dcad..8ac79f5cb44 100644 --- a/src/Client/HedgedConnections.cpp +++ b/src/Client/HedgedConnections.cpp @@ -27,28 +27,30 @@ HedgedConnections::HedgedConnections( { std::vector connections = hedged_connections_factory.getManyConnections(pool_mode); - ReplicaState replica; + ReplicaStatePtr replica = nullptr; for (size_t i = 0; i != connections.size(); ++i) { - replica.connection = connections[i]; - replica.connection->setThrottler(throttler_); - int socket_fd = replica.connection->getSocket()->impl()->sockfd(); + replica = std::make_shared(); + replica->connection = connections[i]; + replica->offset = i; + replica->connection->setThrottler(throttler_); + int socket_fd = replica->connection->getSocket()->impl()->sockfd(); epoll.add(socket_fd); - fd_to_replica_location[socket_fd] = ReplicaLocation{i, 0}; - offset_states.push_back(OffsetState{{replica}, 1, false}); + fd_to_replica[socket_fd] = replica; + offset_states.push_back(OffsetState{{std::move(replica)}, 1, false}); } active_connection_count = connections.size(); offsets_with_received_first_data_packet = 0; - pipeline_for_new_replicas.add([throttler_](ReplicaState & replica_) { replica_.connection->setThrottler(throttler_); }); + pipeline_for_new_replicas.add([throttler_](ReplicaStatePtr & replica_) { replica_->connection->setThrottler(throttler_); }); } -void HedgedConnections::Pipeline::add(std::function send_function) +void HedgedConnections::Pipeline::add(std::function send_function) { pipeline.push_back(send_function); } -void HedgedConnections::Pipeline::run(ReplicaState & replica) +void HedgedConnections::Pipeline::run(ReplicaStatePtr & replica) { for (auto & send_func : pipeline) send_func(replica); @@ -61,11 +63,11 @@ void HedgedConnections::sendScalarsData(Scalars & data) if (!sent_query) throw Exception("Cannot send scalars data: query not yet sent.", ErrorCodes::LOGICAL_ERROR); - auto send_scalars_data = [&data](ReplicaState & replica) { replica.connection->sendScalarsData(data); }; + auto send_scalars_data = [&data](ReplicaStatePtr & replica) { replica->connection->sendScalarsData(data); }; for (auto & offset_state : offset_states) for (auto & replica : offset_state.replicas) - if (replica.connection) + if (replica->connection) send_scalars_data(replica); pipeline_for_new_replicas.add(send_scalars_data); 
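Patch 0175 above reworks HedgedConnections so that each replica is held through a `ReplicaStatePtr` (a `shared_ptr`) reachable both from its offset group and from the `fd_to_replica` map keyed by socket descriptor, while every send step is also recorded in a pipeline that is replayed for replicas joining later. A simplified sketch of that bookkeeping, with the connection and epoll details stubbed out (so not the real API), could be:

```cpp
#include <functional>
#include <iostream>
#include <memory>
#include <unordered_map>
#include <vector>

struct ReplicaState
{
    int socket_fd = -1;   /// stands in for connection->getSocket()->impl()->sockfd()
    size_t offset = 0;    /// which parallel-replicas offset this connection serves
};
using ReplicaStatePtr = std::shared_ptr<ReplicaState>;

class Pipeline
{
public:
    void add(std::function<void(ReplicaStatePtr &)> send_function)
    {
        pipeline.push_back(std::move(send_function));
    }

    /// Replay all previously recorded steps for a replica that joined late.
    void run(ReplicaStatePtr & replica)
    {
        for (auto & send_func : pipeline)
            send_func(replica);
    }

private:
    std::vector<std::function<void(ReplicaStatePtr &)>> pipeline;
};

int main()
{
    std::unordered_map<int, ReplicaStatePtr> fd_to_replica;
    Pipeline pipeline_for_new_replicas;

    auto first = std::make_shared<ReplicaState>();
    first->socket_fd = 10;
    fd_to_replica[first->socket_fd] = first;

    /// Recorded once; applied to every replica, including ones added later.
    pipeline_for_new_replicas.add([](ReplicaStatePtr & replica)
    {
        std::cout << "send query to fd " << replica->socket_fd
                  << " (offset " << replica->offset << ")\n";
    });

    /// A hedged connection started later replays the same sequence of sends.
    auto hedged = std::make_shared<ReplicaState>();
    hedged->socket_fd = 11;
    fd_to_replica[hedged->socket_fd] = hedged;
    pipeline_for_new_replicas.run(hedged);
}
```

Holding the state through a shared pointer means the epoll/timeout maps can point at the same object directly, instead of carrying offset/index pairs that must be kept in sync when replicas are added or removed.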
@@ -81,11 +83,11 @@ void HedgedConnections::sendExternalTablesData(std::vector & if (data.size() != size()) throw Exception("Mismatch between replicas and data sources", ErrorCodes::MISMATCH_REPLICAS_DATA_SOURCES); - auto send_external_tables_data = [&data](ReplicaState & replica) { replica.connection->sendExternalTablesData(data[0]); }; + auto send_external_tables_data = [&data](ReplicaStatePtr & replica) { replica->connection->sendExternalTablesData(data[0]); }; for (auto & offset_state : offset_states) for (auto & replica : offset_state.replicas) - if (replica.connection) + if (replica->connection) send_external_tables_data(replica); pipeline_for_new_replicas.add(send_external_tables_data); @@ -98,11 +100,11 @@ void HedgedConnections::sendIgnoredPartUUIDs(const std::vector & uuids) if (sent_query) throw Exception("Cannot send uuids after query is sent.", ErrorCodes::LOGICAL_ERROR); - auto send_ignored_part_uuids = [&uuids](ReplicaState & replica) { replica.connection->sendIgnoredPartUUIDs(uuids); }; + auto send_ignored_part_uuids = [&uuids](ReplicaStatePtr & replica) { replica->connection->sendIgnoredPartUUIDs(uuids); }; for (auto & offset_state : offset_states) for (auto & replica : offset_state.replicas) - if (replica.connection) + if (replica->connection) send_ignored_part_uuids(replica); pipeline_for_new_replicas.add(send_ignored_part_uuids); @@ -125,7 +127,7 @@ void HedgedConnections::sendQuery( { for (auto & replica : offset_state.replicas) { - if (replica.connection->getServerRevision(timeouts) < DBMS_MIN_REVISION_WITH_CURRENT_AGGREGATION_VARIANT_SELECTION_METHOD) + if (replica->connection->getServerRevision(timeouts) < DBMS_MIN_REVISION_WITH_CURRENT_AGGREGATION_VARIANT_SELECTION_METHOD) { disable_two_level_aggregation = true; break; @@ -135,8 +137,7 @@ void HedgedConnections::sendQuery( break; } - auto send_query = [this, timeouts, query, query_id, stage, client_info, with_pending_data](ReplicaState & replica) - { + auto send_query = [this, timeouts, query, query_id, stage, client_info, with_pending_data](ReplicaStatePtr & replica) { Settings modified_settings = settings; if (disable_two_level_aggregation) @@ -149,10 +150,10 @@ void HedgedConnections::sendQuery( if (offset_states.size() > 1) { modified_settings.parallel_replicas_count = offset_states.size(); - modified_settings.parallel_replica_offset = fd_to_replica_location[replica.connection->getSocket()->impl()->sockfd()].offset; + modified_settings.parallel_replica_offset = replica->offset; } - replica.connection->sendQuery(timeouts, query, query_id, stage, &modified_settings, &client_info, with_pending_data); + replica->connection->sendQuery(timeouts, query, query_id, stage, &modified_settings, &client_info, with_pending_data); addTimeoutToReplica(ConnectionTimeoutType::RECEIVE_TIMEOUT, replica); addTimeoutToReplica(ConnectionTimeoutType::RECEIVE_DATA_TIMEOUT, replica); }; @@ -171,7 +172,7 @@ void HedgedConnections::disconnect() for (auto & offset_status : offset_states) for (auto & replica : offset_status.replicas) - if (replica.connection) + if (replica->connection) finishProcessReplica(replica, true); if (hedged_connections_factory.hasEventsInProcess()) @@ -197,9 +198,9 @@ std::string HedgedConnections::dumpAddresses() const { for (const auto & replica : offset_state.replicas) { - if (replica.connection) + if (replica->connection) { - addresses += (is_first ? "" : "; ") + replica.connection->getDescription(); + addresses += (is_first ? 
"" : "; ") + replica->connection->getDescription(); is_first = false; } } @@ -217,8 +218,8 @@ void HedgedConnections::sendCancel() for (auto & offset_status : offset_states) for (auto & replica : offset_status.replicas) - if (replica.connection) - replica.connection->sendCancel(); + if (replica->connection) + replica->connection->sendCancel(); cancelled = true; } @@ -286,15 +287,15 @@ Packet HedgedConnections::receivePacketImpl(AsyncCallback async_callback) { event_fd = getReadyFileDescriptor(async_callback); - if (fd_to_replica_location.contains(event_fd)) + if (fd_to_replica.contains(event_fd)) { - packet = receivePacketFromReplica(fd_to_replica_location[event_fd], async_callback); + packet = receivePacketFromReplica(fd_to_replica[event_fd], async_callback); finish = true; } - else if (timeout_fd_to_replica_location.contains(event_fd)) + else if (timeout_fd_to_replica.contains(event_fd)) { - ReplicaLocation location = timeout_fd_to_replica_location[event_fd]; - processTimeoutEvent(location, offset_states[location.offset].replicas[location.index].active_timeouts[event_fd]); + ReplicaStatePtr & replica = timeout_fd_to_replica[event_fd]; + processTimeoutEvent(replica, replica->active_timeouts[event_fd]); } else if (event_fd == hedged_connections_factory.getFileDescriptor()) tryGetNewReplica(false); @@ -307,11 +308,10 @@ Packet HedgedConnections::receivePacketImpl(AsyncCallback async_callback) int HedgedConnections::getReadyFileDescriptor(AsyncCallback async_callback) { - for (auto & [fd, location] : fd_to_replica_location) + for (auto & [fd, replica] : fd_to_replica) { - ReplicaState & replica = offset_states[location.offset].replicas[location.index]; - if (replica.connection->hasReadPendingData()) - return replica.connection->getSocket()->impl()->sockfd(); + if (replica->connection->hasReadPendingData()) + return replica->connection->getSocket()->impl()->sockfd(); } epoll_event event; @@ -320,16 +320,15 @@ int HedgedConnections::getReadyFileDescriptor(AsyncCallback async_callback) return event.data.fd; } -Packet HedgedConnections::receivePacketFromReplica(ReplicaLocation & replica_location, AsyncCallback async_callback) +Packet HedgedConnections::receivePacketFromReplica(ReplicaStatePtr & replica, AsyncCallback async_callback) { - ReplicaState & replica = offset_states[replica_location.offset].replicas[replica_location.index]; removeTimeoutFromReplica(ConnectionTimeoutType::RECEIVE_TIMEOUT, replica); - Packet packet = replica.connection->receivePacket(std::move(async_callback)); + Packet packet = replica->connection->receivePacket(std::move(async_callback)); switch (packet.type) { case Protocol::Server::Data: - if (!offset_states[replica_location.offset].first_packet_of_data_received) - processReceivedFirstDataPacket(replica_location); + if (!offset_states[replica->offset].first_packet_of_data_received) + processReceivedFirstDataPacket(replica); addTimeoutToReplica(ConnectionTimeoutType::RECEIVE_TIMEOUT, replica); break; case Protocol::Server::PartUUIDs: @@ -354,21 +353,21 @@ Packet HedgedConnections::receivePacketFromReplica(ReplicaLocation & replica_loc return packet; } -void HedgedConnections::processReceivedFirstDataPacket(ReplicaLocation & replica_location) +void HedgedConnections::processReceivedFirstDataPacket(ReplicaStatePtr & replica) { /// When we receive first packet of data from replica, we stop working with replicas, that are /// responsible for the same offset. 
- OffsetState & offset_state = offset_states[replica_location.offset]; - removeTimeoutFromReplica(ConnectionTimeoutType::RECEIVE_DATA_TIMEOUT, offset_state.replicas[replica_location.index]); + OffsetState & offset_state = offset_states[replica->offset]; + removeTimeoutFromReplica(ConnectionTimeoutType::RECEIVE_DATA_TIMEOUT, replica); ++offsets_with_received_first_data_packet; offset_state.first_packet_of_data_received = true; - for (size_t i = 0; i != offset_state.replicas.size(); ++i) + for (auto & other_replica : offset_state.replicas) { - if (i != replica_location.index && offset_state.replicas[i].connection) + if (replica != other_replica && other_replica->connection) { - offset_state.replicas[i].connection->sendCancel(); - finishProcessReplica(offset_state.replicas[i], true); + other_replica->connection->sendCancel(); + finishProcessReplica(other_replica, true); } } @@ -384,24 +383,23 @@ void HedgedConnections::processReceivedFirstDataPacket(ReplicaLocation & replica } } -void HedgedConnections::processTimeoutEvent(ReplicaLocation & replica_location, ConnectionTimeoutDescriptorPtr timeout_descriptor) +void HedgedConnections::processTimeoutEvent(ReplicaStatePtr & replica, ConnectionTimeoutDescriptorPtr timeout_descriptor) { - ReplicaState & replica = offset_states[replica_location.offset].replicas[replica_location.index]; epoll.remove(timeout_descriptor->timer.getDescriptor()); - replica.active_timeouts.erase(timeout_descriptor->timer.getDescriptor()); - timeout_fd_to_replica_location.erase(timeout_descriptor->timer.getDescriptor()); + replica->active_timeouts.erase(timeout_descriptor->timer.getDescriptor()); + timeout_fd_to_replica.erase(timeout_descriptor->timer.getDescriptor()); if (timeout_descriptor->type == ConnectionTimeoutType::RECEIVE_TIMEOUT) { finishProcessReplica(replica, true); /// Check if there is no active connections with the same offset and there is no new replica in process. 
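[Editorial aside] The timeouts handled here are timer file descriptors registered in the same epoll as the sockets (presumably backed by `timerfd` on Linux), which is why an expired timeout arrives in `processTimeoutEvent()` as an ordinary ready descriptor looked up via `timeout_fd_to_replica`. A rough Linux-only sketch of that mechanism, not the ClickHouse wrapper classes:

```cpp
#include <sys/epoll.h>
#include <sys/timerfd.h>
#include <unistd.h>
#include <cstdio>

int main()
{
    int epfd = epoll_create1(0);
    int tfd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK);

    itimerspec spec{};
    spec.it_value.tv_sec = 1;                   /// one-shot "receive timeout" of 1 second
    timerfd_settime(tfd, 0, &spec, nullptr);

    epoll_event ev{};
    ev.events = EPOLLIN;
    ev.data.fd = tfd;
    epoll_ctl(epfd, EPOLL_CTL_ADD, tfd, &ev);   /// the timer sits next to the socket fds

    epoll_event ready{};
    if (epoll_wait(epfd, &ready, 1, 2000) == 1 && ready.data.fd == tfd)
        std::printf("timeout fd %d expired -> would call processTimeoutEvent(replica, ...)\n", tfd);

    close(tfd);
    close(epfd);
    return 0;
}
```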
- if (offset_states[replica_location.offset].active_connection_count == 0 && !next_replica_in_process) + if (offset_states[replica->offset].active_connection_count == 0 && !next_replica_in_process) throw NetException("Receive timeout expired", ErrorCodes::SOCKET_TIMEOUT); } else if (timeout_descriptor->type == ConnectionTimeoutType::RECEIVE_DATA_TIMEOUT) { - offsets_queue.push(replica_location.offset); + offsets_queue.push(replica->offset); tryGetNewReplica(true); } } @@ -421,14 +419,14 @@ void HedgedConnections::tryGetNewReplica(bool start_new_connection) { size_t offset = offsets_queue.front(); offsets_queue.pop(); - size_t index = offset_states[offset].replicas.size(); - ReplicaState replica; - replica.connection = connection; - int socket_fd = replica.connection->getSocket()->impl()->sockfd(); + ReplicaStatePtr replica = std::make_shared(); + replica->connection = connection; + replica->offset = offset; + int socket_fd = replica->connection->getSocket()->impl()->sockfd(); epoll.add(socket_fd); - fd_to_replica_location[socket_fd] = ReplicaLocation{offset, index}; - offset_states[offset].replicas.push_back(replica); + fd_to_replica[socket_fd] = replica; + offset_states[offset].replicas.push_back(std::move(replica)); ++offset_states[offset].active_connection_count; ++active_connection_count; pipeline_for_new_replicas.run(replica); @@ -458,50 +456,50 @@ void HedgedConnections::tryGetNewReplica(bool start_new_connection) } } -void HedgedConnections::finishProcessReplica(ReplicaState & replica, bool disconnect) +void HedgedConnections::finishProcessReplica(ReplicaStatePtr & replica, bool disconnect) { removeTimeoutsFromReplica(replica); - int socket_fd = replica.connection->getSocket()->impl()->sockfd(); + int socket_fd = replica->connection->getSocket()->impl()->sockfd(); epoll.remove(socket_fd); - --offset_states[fd_to_replica_location[socket_fd].offset].active_connection_count; - fd_to_replica_location.erase(socket_fd); + --offset_states[replica->offset].active_connection_count; + fd_to_replica.erase(socket_fd); --active_connection_count; if (disconnect) - replica.connection->disconnect(); - replica.connection = nullptr; + replica->connection->disconnect(); + replica->connection = nullptr; } -void HedgedConnections::addTimeoutToReplica(ConnectionTimeoutType type, ReplicaState & replica) +void HedgedConnections::addTimeoutToReplica(ConnectionTimeoutType type, ReplicaStatePtr & replica) { ConnectionTimeoutDescriptorPtr timeout_descriptor = createConnectionTimeoutDescriptor(type, hedged_connections_factory.getConnectionTimeouts()); epoll.add(timeout_descriptor->timer.getDescriptor()); - timeout_fd_to_replica_location[timeout_descriptor->timer.getDescriptor()] - = fd_to_replica_location[replica.connection->getSocket()->impl()->sockfd()]; - replica.active_timeouts[timeout_descriptor->timer.getDescriptor()] = std::move(timeout_descriptor); + timeout_fd_to_replica[timeout_descriptor->timer.getDescriptor()] + = fd_to_replica[replica->connection->getSocket()->impl()->sockfd()]; + replica->active_timeouts[timeout_descriptor->timer.getDescriptor()] = std::move(timeout_descriptor); } -void HedgedConnections::removeTimeoutsFromReplica(ReplicaState & replica) +void HedgedConnections::removeTimeoutsFromReplica(ReplicaStatePtr & replica) { - for (auto & [fd, _] : replica.active_timeouts) + for (auto & [fd, _] : replica->active_timeouts) { epoll.remove(fd); - timeout_fd_to_replica_location.erase(fd); + timeout_fd_to_replica.erase(fd); } - replica.active_timeouts.clear(); + 
replica->active_timeouts.clear(); } -void HedgedConnections::removeTimeoutFromReplica(ConnectionTimeoutType type, ReplicaState & replica) +void HedgedConnections::removeTimeoutFromReplica(ConnectionTimeoutType type, ReplicaStatePtr & replica) { auto it = std::find_if( - replica.active_timeouts.begin(), replica.active_timeouts.end(), [type](auto & value) { return value.second->type == type; }); + replica->active_timeouts.begin(), replica->active_timeouts.end(), [type](auto & value) { return value.second->type == type; }); - if (it != replica.active_timeouts.end()) + if (it != replica->active_timeouts.end()) { epoll.remove(it->first); - timeout_fd_to_replica_location.erase(it->first); - replica.active_timeouts.erase(it); + timeout_fd_to_replica.erase(it->first); + replica->active_timeouts.erase(it); } } diff --git a/src/Client/HedgedConnections.h b/src/Client/HedgedConnections.h index eb73f2ded52..56eca3ffbe7 100644 --- a/src/Client/HedgedConnections.h +++ b/src/Client/HedgedConnections.h @@ -22,17 +22,14 @@ public: { Connection * connection = nullptr; std::unordered_map active_timeouts; + size_t offset = 0; }; - struct ReplicaLocation - { - size_t offset; - size_t index; - }; + using ReplicaStatePtr = std::shared_ptr; struct OffsetState { - std::vector replicas; + std::vector replicas; size_t active_connection_count; bool first_packet_of_data_received; }; @@ -79,32 +76,32 @@ private: class Pipeline { public: - void add(std::function send_function); + void add(std::function send_function); - void run(ReplicaState & replica); + void run(ReplicaStatePtr & replica); private: - std::vector> pipeline; + std::vector> pipeline; }; - Packet receivePacketFromReplica(ReplicaLocation & replica_location, AsyncCallback async_callback = {}); + Packet receivePacketFromReplica(ReplicaStatePtr & replica, AsyncCallback async_callback = {}); Packet receivePacketImpl(AsyncCallback async_callback = {}); - void processReceivedFirstDataPacket(ReplicaLocation & replica_location); + void processReceivedFirstDataPacket(ReplicaStatePtr & replica); - void processTimeoutEvent(ReplicaLocation & replica_location, ConnectionTimeoutDescriptorPtr timeout_descriptor); + void processTimeoutEvent(ReplicaStatePtr & replica, ConnectionTimeoutDescriptorPtr timeout_descriptor); void tryGetNewReplica(bool start_new_connection); - void finishProcessReplica(ReplicaState & replica, bool disconnect); + void finishProcessReplica(ReplicaStatePtr & replica, bool disconnect); int getReadyFileDescriptor(AsyncCallback async_callback = {}); - void addTimeoutToReplica(ConnectionTimeoutType type, ReplicaState & replica); + void addTimeoutToReplica(ConnectionTimeoutType type, ReplicaStatePtr & replica); - void removeTimeoutsFromReplica(ReplicaState & replica); + void removeTimeoutsFromReplica(ReplicaStatePtr & replica); - void removeTimeoutFromReplica(ConnectionTimeoutType type, ReplicaState & replica); + void removeTimeoutFromReplica(ConnectionTimeoutType type, ReplicaStatePtr & replica); HedgedConnectionsFactory hedged_connections_factory; @@ -114,10 +111,10 @@ private: /// replica_states[offset].replicas.size() = 1 (like in MultiplexedConnections). std::vector offset_states; - /// Map socket file descriptor to replica location (it's offset and index in OffsetState.replicas). - std::unordered_map fd_to_replica_location; - /// Map timeout file descriptor to replica location (it's offset and index in OffsetState.replicas). - std::unordered_map timeout_fd_to_replica_location; + /// Map socket file descriptor to replica. 
+ std::unordered_map fd_to_replica; + /// Map timeout file descriptor to replica. + std::unordered_map timeout_fd_to_replica; /// A queue of offsets for new replicas. When we get RECEIVE_DATA_TIMEOUT from /// the replica, we push it's offset to this queue and start trying to get diff --git a/src/Client/HedgedConnectionsFactory.cpp b/src/Client/HedgedConnectionsFactory.cpp index 84f0384f377..7b49a351ff6 100644 --- a/src/Client/HedgedConnectionsFactory.cpp +++ b/src/Client/HedgedConnectionsFactory.cpp @@ -104,7 +104,7 @@ HedgedConnectionsFactory::State HedgedConnectionsFactory::getNextConnection(bool { ReplicaStatePtr replica = nullptr; int index = -1; - + if (start_new_connection) { /// Try to start establishing connection to the new replica. diff --git a/src/Client/HedgedConnectionsFactory.h b/src/Client/HedgedConnectionsFactory.h index dbafed7f3d1..345a1f2fe3e 100644 --- a/src/Client/HedgedConnectionsFactory.h +++ b/src/Client/HedgedConnectionsFactory.h @@ -75,7 +75,7 @@ public: std::vector getManyConnections(PoolMode pool_mode); /// Try to get connection to the new replica. If start_new_connection is true, we start establishing connection - /// with the new replica. Process all current events in epoll (connections, timeouts), + /// with the new replica. Process all current events in epoll (connections, timeouts), /// if there is no events in epoll and blocking is false, return NOT_READY. /// Returned state might be READY, NOT_READY and CANNOT_CHOOSE. /// If state is READY, replica connection will be written in connection_out. From 134a686fa5f2a26bab4dae159bfd5d72bb4e1874 Mon Sep 17 00:00:00 2001 From: Pavel Kruglov Date: Mon, 8 Feb 2021 16:16:09 +0300 Subject: [PATCH 0176/2357] Fix 2 --- src/Client/HedgedConnections.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Client/HedgedConnections.cpp b/src/Client/HedgedConnections.cpp index 8ac79f5cb44..9fab9e6ec84 100644 --- a/src/Client/HedgedConnections.cpp +++ b/src/Client/HedgedConnections.cpp @@ -426,7 +426,7 @@ void HedgedConnections::tryGetNewReplica(bool start_new_connection) int socket_fd = replica->connection->getSocket()->impl()->sockfd(); epoll.add(socket_fd); fd_to_replica[socket_fd] = replica; - offset_states[offset].replicas.push_back(std::move(replica)); + offset_states[offset].replicas.push_back(replica); ++offset_states[offset].active_connection_count; ++active_connection_count; pipeline_for_new_replicas.run(replica); From bfc703692ad5d90bb1f43836752e4f4668ba1c4b Mon Sep 17 00:00:00 2001 From: Vitaliy Zakaznikov Date: Mon, 8 Feb 2021 08:48:43 -0500 Subject: [PATCH 0177/2357] Starting to add LDAP docs. 
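[Editorial aside] The ldap.md added by this patch documents `bind_dn` as a template in which every `{user_name}` substring is replaced with the name of the authenticating user. A small illustrative sketch of that substitution (a hypothetical helper, not the actual implementation):

```cpp
#include <iostream>
#include <string>

/// Replace every "{user_name}" placeholder in a bind_dn template with the user name.
std::string substituteUserName(std::string templ, const std::string & user_name)
{
    const std::string placeholder = "{user_name}";
    for (size_t pos = templ.find(placeholder); pos != std::string::npos;
         pos = templ.find(placeholder, pos + user_name.size()))
        templ.replace(pos, placeholder.size(), user_name);
    return templ;
}

int main()
{
    std::cout << substituteUserName("uid={user_name},ou=users,dc=example,dc=com", "my_user") << '\n';
    /// prints: uid=my_user,ou=users,dc=example,dc=com
}
```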
--- .../external-authenticators/index.md | 9 ++ .../external-authenticators/ldap.md | 145 ++++++++++++++++++ .../sql-reference/statements/create/user.md | 3 +- 3 files changed, 156 insertions(+), 1 deletion(-) create mode 100644 docs/en/operations/external-authenticators/index.md create mode 100644 docs/en/operations/external-authenticators/ldap.md diff --git a/docs/en/operations/external-authenticators/index.md b/docs/en/operations/external-authenticators/index.md new file mode 100644 index 00000000000..10c2ea91eb9 --- /dev/null +++ b/docs/en/operations/external-authenticators/index.md @@ -0,0 +1,9 @@ +--- +toc_folder_title: External User Authenticators and Directories +toc_priority: 48 +toc_title: Introduction +--- + +# External User Authenticators and Directories {#external-authenticators} + +ClickHouse supports authenticating and managing users using external services such as [LDAP](#external-authenticators-ldap). diff --git a/docs/en/operations/external-authenticators/ldap.md b/docs/en/operations/external-authenticators/ldap.md new file mode 100644 index 00000000000..fd5f2e578ce --- /dev/null +++ b/docs/en/operations/external-authenticators/ldap.md @@ -0,0 +1,145 @@ +# LDAP {#external-authenticators-ldap} + +LDAP server can be used to authenticate ClickHouse users. There are two different approaches for doing this: + +- use LDAP as an external authenticator for existing users, which are defined in `users.xml` or in local access control paths +- use LDAP as an external user directory and allow locally undefined users to be authenticated if they exist on the LDAP server + +For both of these approaches, an internally named LDAP server must be defined in the ClickHouse config +so that other parts of config are able to refer to it. + +## Server Definition {#ldap-server-definition} + +To define LDAP server you must add `ldap_servers` section to the `config.xml`. For example, + +```xml + + + + + localhost + 636 + uid={user_name},ou=users,dc=example,dc=com + 300 + yes + tls1.2 + demand + /path/to/tls_cert_file + /path/to/tls_key_file + /path/to/tls_ca_cert_file + /path/to/tls_ca_cert_dir + ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:AES256-GCM-SHA384 + + + +``` + +Note, that you can define multiple LDAP servers inside `ldap_servers` section using distinct names. + +Parameters: + +- `host` - LDAP server hostname or IP, this parameter is mandatory and cannot be empty. +- `port` - LDAP server port, default is `636` if `enable_tls` is set to `true`, `389` otherwise. +- `bind_dn` - template used to construct the DN to bind to. + - The resulting DN will be constructed by replacing all `{user_name}` substrings of the template with the actual user name during each authentication attempt. +- `verification_cooldown` - a period of time, in seconds, after a successful bind attempt, during which the user will be assumed to be successfully authenticated for all consecutive requests without contacting the LDAP server. + - Specify `0` (the default) to disable caching and force contacting the LDAP server for each authentication request. +- `enable_tls` - flag to trigger use of secure connection to the LDAP server. + - Specify `no` for plain text `ldap://` protocol (not recommended). + - Specify `yes` for LDAP over SSL/TLS `ldaps://` protocol (recommended, the default). + - Specify `starttls` for legacy StartTLS protocol (plain text `ldap://` protocol, upgraded to TLS). +- `tls_minimum_protocol_version` - the minimum protocol version of SSL/TLS. 
+ - Accepted values are: `ssl2`, `ssl3`, `tls1.0`, `tls1.1`, `tls1.2` (the default). +- `tls_require_cert` - SSL/TLS peer certificate verification behavior. + - Accepted values are: `never`, `allow`, `try`, `demand` (the default). +- `tls_cert_file` - path to certificate file. +- `tls_key_file` - path to certificate key file. +- `tls_ca_cert_file` - path to CA certificate file. +- `tls_ca_cert_dir` - path to the directory containing CA certificates. +- `tls_cipher_suite` - allowed cipher suite (in OpenSSL notation). + +## External Authenticator {#ldap-external-authenticator} + +A remote LDAP server can be used as a method for verifying the passwords for locally defined users (users defined in `users.xml` or in local access control paths). In order to achieve this, specify previously defined LDAP server name instead of `password` or similar sections in the user definition. + +At each login attempt, ClickHouse will try to "bind" to the specified DN (see `bind_dn` sections in LDAP server config in `config.xml`) at the LDAP server using the provided credentials, and, if successful, the user will be considered authenticated. This is often called "simple bind" method. + +Example (goes into `users.xml`): + +```xml + + + + + + + + my_ldap_server + + + + +``` + +Note, that now, once user `my_user` refers to `my_ldap_server`, this LDAP server must be configured in the main `config.xml` file as described previously. + +When SQL-driven Access Control and Account Management is enabled in ClickHouse, users that are identified by LDAP servers can also be created using queries. + +Example (execute in ClickHouse client): + +```sql +CREATE USER my_user IDENTIFIED WITH ldap_server BY 'my_ldap_server' +``` + +## Exernal User Directory {#ldap-external-user-directory} + +A remote LDAP server can be used as a source of user definitions, in addition to the locally defined users. In order to achieve this, specify previously defined LDAP server name in `ldap` section inside `users_directories` section in main `config.xml` file. + +At each login attempt, ClickHouse will try to find the user definition locally and authenticate it as usual, but if the user is not defined, ClickHouse will assume it exists in the external LDAP directory, and will try to "bind" to the specified DN (see `bind_dn` sections in LDAP server config in `config.xml`) at the LDAP server using the provided credentials, and, if successful, the user will be considered existing and authenticated. The user will be assigned roles from the list specified in `roles`. Additionally, LDAP "search" can be performed and results can be transformed and treated as role names and then assigned to the user if `role_mapping` section is also configured. All this implies that the SQL-driven Access Control and Account Management is enabled in ClickHouse and roles are created using `CREATE ROLE ...` queries. + +Example (goes into `config.xml`): + +```xml + + + + + + my_ldap_server + + + + + + ou=groups,dc=example,dc=com + subtree + (&(objectClass=groupOfNames)(member={bind_dn})) + cn + clickhouse_ + + + + +``` + +Note, that now, once `my_ldap_server` is referred from `ldap` inside `user_directories` section, this LDAP server must be configured in the main `config.xml` file as described previously. + +Parameters: + +- `server` - one of LDAP server names defined in `ldap_servers` config section above. This parameter is mandatory and cannot be empty. 
+- `roles` - section with a list of locally defined roles that will be assigned to each user retrieved from the LDAP server. + - If no roles are specified here or assigned during role mapping (below), user will not be able to perform any actions after authentication. +- `role_mapping` - section with LDAP search parameters and mapping rules. + - When a user authenticates, while still bound to LDAP, an LDAP search is performed using `search_filter` and the name of the logged in user. For each entry found during that search, the value of the specified attribute is extracted. For each attribute value that has the specified prefix, the prefix is removed, and the rest of the value becomes the name of a local role defined in ClickHouse, which is expected to be created beforehand by `CREATE ROLE ...` command. + + - There can be multiple `role_mapping` sections defined inside the same `ldap` section. All of them will be applied. + - `base_dn` - template used to construct the base DN for the LDAP search. + - The resulting DN will be constructed by replacing all `{user_name}` and `{bind_dn}` substrings of the template with the actual user name and bind DN during each LDAP search. + - `scope` - scope of the LDAP search. + - Accepted values are: `base`, `one_level`, `children`, `subtree` (the default). + - `search_filter` - template used to construct the search filter for the LDAP search. + - The resulting filter will be constructed by replacing all `{user_name}`, `{bind_dn}`, and `{base_dn}` substrings of the template with the actual user name, bind DN, and base DN during each LDAP search. + - Note, that the special characters must be escaped properly in XML. + - `attribute` - attribute name whose values will be returned by the LDAP search. + - `prefix` - prefix, that will be expected to be in front of each string in the original list of strings returned by the LDAP search. Prefix will be removed from the original strings and resulting strings will be treated as local role names. Empty, by default. + diff --git a/docs/en/sql-reference/statements/create/user.md b/docs/en/sql-reference/statements/create/user.md index d5343cce7be..c1a52e3b864 100644 --- a/docs/en/sql-reference/statements/create/user.md +++ b/docs/en/sql-reference/statements/create/user.md @@ -12,7 +12,7 @@ Syntax: ``` sql CREATE USER [IF NOT EXISTS | OR REPLACE] name1 [ON CLUSTER cluster_name1] [, name2 [ON CLUSTER cluster_name2] ...] - [IDENTIFIED [WITH {NO_PASSWORD|PLAINTEXT_PASSWORD|SHA256_PASSWORD|SHA256_HASH|DOUBLE_SHA1_PASSWORD|DOUBLE_SHA1_HASH}] BY {'password'|'hash'}] + [IDENTIFIED [WITH {NO_PASSWORD|PLAINTEXT_PASSWORD|SHA256_PASSWORD|SHA256_HASH|DOUBLE_SHA1_PASSWORD|DOUBLE_SHA1_HASH|LDAP_SERVER}] BY {'password'|'hash'}] [HOST {LOCAL | NAME 'name' | REGEXP 'name_regexp' | IP 'address' | LIKE 'pattern'} [,...] | ANY | NONE] [DEFAULT ROLE role [,...]] [SETTINGS variable [= value] [MIN [=] min_value] [MAX [=] max_value] [READONLY|WRITABLE] | PROFILE 'profile_name'] [,...] 
@@ -30,6 +30,7 @@ There are multiple ways of user identification: - `IDENTIFIED WITH sha256_hash BY 'hash'` - `IDENTIFIED WITH double_sha1_password BY 'qwerty'` - `IDENTIFIED WITH double_sha1_hash BY 'hash'` +- `IDENTIFIED WITH ldap_server BY 'server'` ## User Host {#user-host} From 7b45860b0674ae5a85979f0147de532f4da52f1a Mon Sep 17 00:00:00 2001 From: Pavel Kruglov Date: Mon, 8 Feb 2021 17:02:11 +0300 Subject: [PATCH 0178/2357] Style fix --- src/Client/HedgedConnections.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Client/HedgedConnections.cpp b/src/Client/HedgedConnections.cpp index 9fab9e6ec84..b361f04f0b1 100644 --- a/src/Client/HedgedConnections.cpp +++ b/src/Client/HedgedConnections.cpp @@ -137,7 +137,8 @@ void HedgedConnections::sendQuery( break; } - auto send_query = [this, timeouts, query, query_id, stage, client_info, with_pending_data](ReplicaStatePtr & replica) { + auto send_query = [this, timeouts, query, query_id, stage, client_info, with_pending_data](ReplicaStatePtr & replica) + { Settings modified_settings = settings; if (disable_two_level_aggregation) From a594c738c2f4c539065f36d160b451c7048fb670 Mon Sep 17 00:00:00 2001 From: Pavel Kruglov Date: Mon, 8 Feb 2021 17:14:36 +0300 Subject: [PATCH 0179/2357] Remove code duplication --- src/Client/HedgedConnectionsFactory.cpp | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/src/Client/HedgedConnectionsFactory.cpp b/src/Client/HedgedConnectionsFactory.cpp index 7b49a351ff6..2a5abbbaf57 100644 --- a/src/Client/HedgedConnectionsFactory.cpp +++ b/src/Client/HedgedConnectionsFactory.cpp @@ -106,19 +106,7 @@ HedgedConnectionsFactory::State HedgedConnectionsFactory::getNextConnection(bool int index = -1; if (start_new_connection) - { - /// Try to start establishing connection to the new replica. index = getNextIndex(); - if (index != -1) - { - replica = startEstablishingConnection(index); - if (replica->state == State::READY) - { - connection_out = replica->connection; - return State::READY; - } - } - } while (index != -1 || !epoll.empty()) { From e312ef72281dc5b034343d0ff33035fbf1a7a7ef Mon Sep 17 00:00:00 2001 From: Vitaliy Zakaznikov Date: Mon, 8 Feb 2021 12:29:45 -0500 Subject: [PATCH 0180/2357] Updating LDAP docs. --- .../external-authenticators/index.md | 12 ++++- .../external-authenticators/ldap.md | 53 ++++++++++++------- 2 files changed, 44 insertions(+), 21 deletions(-) diff --git a/docs/en/operations/external-authenticators/index.md b/docs/en/operations/external-authenticators/index.md index 10c2ea91eb9..f06c1de8ec7 100644 --- a/docs/en/operations/external-authenticators/index.md +++ b/docs/en/operations/external-authenticators/index.md @@ -6,4 +6,14 @@ toc_title: Introduction # External User Authenticators and Directories {#external-authenticators} -ClickHouse supports authenticating and managing users using external services such as [LDAP](#external-authenticators-ldap). +ClickHouse supports authenticating and managing users using external services. + +The following external authenticators and directories are supported. 
+ +## External Authenticators + +- [LDAP](#ldap-external-authenticator) + +## External User Directories + +- [LDAP](#ldap-external-user-directory) diff --git a/docs/en/operations/external-authenticators/ldap.md b/docs/en/operations/external-authenticators/ldap.md index fd5f2e578ce..7ad1fd68b74 100644 --- a/docs/en/operations/external-authenticators/ldap.md +++ b/docs/en/operations/external-authenticators/ldap.md @@ -41,8 +41,11 @@ Parameters: - `host` - LDAP server hostname or IP, this parameter is mandatory and cannot be empty. - `port` - LDAP server port, default is `636` if `enable_tls` is set to `true`, `389` otherwise. - `bind_dn` - template used to construct the DN to bind to. - - The resulting DN will be constructed by replacing all `{user_name}` substrings of the template with the actual user name during each authentication attempt. -- `verification_cooldown` - a period of time, in seconds, after a successful bind attempt, during which the user will be assumed to be successfully authenticated for all consecutive requests without contacting the LDAP server. + - The resulting DN will be constructed by replacing all `{user_name}` substrings of the + template with the actual user name during each authentication attempt. +- `verification_cooldown` - a period of time, in seconds, after a successful bind attempt, + during which the user will be assumed to be successfully authenticated for all consecutive + requests without contacting the LDAP server. - Specify `0` (the default) to disable caching and force contacting the LDAP server for each authentication request. - `enable_tls` - flag to trigger use of secure connection to the LDAP server. - Specify `no` for plain text `ldap://` protocol (not recommended). @@ -58,13 +61,14 @@ Parameters: - `tls_ca_cert_dir` - path to the directory containing CA certificates. - `tls_cipher_suite` - allowed cipher suite (in OpenSSL notation). -## External Authenticator {#ldap-external-authenticator} +## Using LDAP As External Authenticator {#ldap-external-authenticator} -A remote LDAP server can be used as a method for verifying the passwords for locally defined users (users defined in `users.xml` or in local access control paths). In order to achieve this, specify previously defined LDAP server name instead of `password` or similar sections in the user definition. +A remote LDAP server can be used as a method for verifying passwords for locally defined users (users defined in `users.xml` or in local access control paths). In order to achieve this, specify previously defined LDAP server name instead of `password` or similar sections in the user definition. -At each login attempt, ClickHouse will try to "bind" to the specified DN (see `bind_dn` sections in LDAP server config in `config.xml`) at the LDAP server using the provided credentials, and, if successful, the user will be considered authenticated. This is often called "simple bind" method. +At each login attempt, ClickHouse will try to "bind" to the specified DN defined by the `bind_dn` parameter +in the [LDAP server definition](#ldap-server-definition) using the provided credentials, and if successful, the user will be considered authenticated. This is often called a "simple bind" method. -Example (goes into `users.xml`): +For example, ```xml @@ -81,21 +85,20 @@ Example (goes into `users.xml`): ``` -Note, that now, once user `my_user` refers to `my_ldap_server`, this LDAP server must be configured in the main `config.xml` file as described previously. 
+Note, that user `my_user` refers to `my_ldap_server`. This LDAP server must be configured in the main `config.xml` file as described previously. -When SQL-driven Access Control and Account Management is enabled in ClickHouse, users that are identified by LDAP servers can also be created using queries. +When SQL-driven [Access Control and Account Management](#access-control) is enabled in ClickHouse, users that are identified by LDAP servers can also be created using the [CRATE USER](#create-user-statement) statement. -Example (execute in ClickHouse client): ```sql CREATE USER my_user IDENTIFIED WITH ldap_server BY 'my_ldap_server' ``` -## Exernal User Directory {#ldap-external-user-directory} +## Using LDAP As Exernal User Directory {#ldap-external-user-directory} -A remote LDAP server can be used as a source of user definitions, in addition to the locally defined users. In order to achieve this, specify previously defined LDAP server name in `ldap` section inside `users_directories` section in main `config.xml` file. +In addition to the locally defined users, a remote LDAP server can be used as a source of user definitions. In order to achieve this, specify previously defined LDAP server name (see [LDAP Server Definition](#ldap-server-definition)) in the `ldap` section inside the `users_directories` section in of the `config.xml` file. -At each login attempt, ClickHouse will try to find the user definition locally and authenticate it as usual, but if the user is not defined, ClickHouse will assume it exists in the external LDAP directory, and will try to "bind" to the specified DN (see `bind_dn` sections in LDAP server config in `config.xml`) at the LDAP server using the provided credentials, and, if successful, the user will be considered existing and authenticated. The user will be assigned roles from the list specified in `roles`. Additionally, LDAP "search" can be performed and results can be transformed and treated as role names and then assigned to the user if `role_mapping` section is also configured. All this implies that the SQL-driven Access Control and Account Management is enabled in ClickHouse and roles are created using `CREATE ROLE ...` queries. +At each login attempt, ClickHouse will try to find the user definition locally and authenticate it as usual, but if the user is not defined, ClickHouse will assume it exists in the external LDAP directory, and will try to "bind" to the specified DN at the LDAP server using the provided credentials, and if successful, the user will be considered existing and authenticated. The user will be assigned roles from the list specified in the `roles` section. Additionally, LDAP "search" can be performed and results can be transformed and treated as role names and then assigned to the user if the `role_mapping` section is also configured. All this implies that the SQL-driven [Access Control and Account Management](#access-control) is enabled in ClickHouse and roles are created using the [CREATE ROLE](#create-role-statement) statement. Example (goes into `config.xml`): @@ -122,24 +125,34 @@ Example (goes into `config.xml`): ``` -Note, that now, once `my_ldap_server` is referred from `ldap` inside `user_directories` section, this LDAP server must be configured in the main `config.xml` file as described previously. 
+Note that `my_ldap_server` referred in the `ldap` section inside the `user_directories` section must be a previously +defined LDAP server that is configured in the `config.xml` (see [LDAP Server Definition](#ldap-server-definition)). Parameters: -- `server` - one of LDAP server names defined in `ldap_servers` config section above. This parameter is mandatory and cannot be empty. +- `server` - one of LDAP server names defined in `ldap_servers` config section above. + This parameter is mandatory and cannot be empty. - `roles` - section with a list of locally defined roles that will be assigned to each user retrieved from the LDAP server. - - If no roles are specified here or assigned during role mapping (below), user will not be able to perform any actions after authentication. + - If no roles are specified here or assigned during role mapping (below), user will not be able + to perform any actions after authentication. - `role_mapping` - section with LDAP search parameters and mapping rules. - - When a user authenticates, while still bound to LDAP, an LDAP search is performed using `search_filter` and the name of the logged in user. For each entry found during that search, the value of the specified attribute is extracted. For each attribute value that has the specified prefix, the prefix is removed, and the rest of the value becomes the name of a local role defined in ClickHouse, which is expected to be created beforehand by `CREATE ROLE ...` command. - + - When a user authenticates, while still bound to LDAP, an LDAP search is performed using `search_filter` + and the name of the logged in user. For each entry found during that search, the value of the specified + attribute is extracted. For each attribute value that has the specified prefix, the prefix is removed, + and the rest of the value becomes the name of a local role defined in ClickHouse, + which is expected to be created beforehand by the [CREATE ROLE](#create-role-statement) statement. - There can be multiple `role_mapping` sections defined inside the same `ldap` section. All of them will be applied. - `base_dn` - template used to construct the base DN for the LDAP search. - - The resulting DN will be constructed by replacing all `{user_name}` and `{bind_dn}` substrings of the template with the actual user name and bind DN during each LDAP search. + - The resulting DN will be constructed by replacing all `{user_name}` and `{bind_dn}` + substrings of the template with the actual user name and bind DN during each LDAP search. - `scope` - scope of the LDAP search. - Accepted values are: `base`, `one_level`, `children`, `subtree` (the default). - `search_filter` - template used to construct the search filter for the LDAP search. - - The resulting filter will be constructed by replacing all `{user_name}`, `{bind_dn}`, and `{base_dn}` substrings of the template with the actual user name, bind DN, and base DN during each LDAP search. + - The resulting filter will be constructed by replacing all `{user_name}`, `{bind_dn}`, and `{base_dn}` + substrings of the template with the actual user name, bind DN, and base DN during each LDAP search. - Note, that the special characters must be escaped properly in XML. - `attribute` - attribute name whose values will be returned by the LDAP search. - - `prefix` - prefix, that will be expected to be in front of each string in the original list of strings returned by the LDAP search. Prefix will be removed from the original strings and resulting strings will be treated as local role names. 
Empty, by default. + - `prefix` - prefix, that will be expected to be in front of each string in the original + list of strings returned by the LDAP search. Prefix will be removed from the original + strings and resulting strings will be treated as local role names. Empty, by default. From 9d9055681c8c5536d3dec4974cf42c90490f1efb Mon Sep 17 00:00:00 2001 From: Vitaliy Zakaznikov Date: Mon, 8 Feb 2021 12:35:18 -0500 Subject: [PATCH 0181/2357] Small changes to LDAP docs. --- docs/en/operations/external-authenticators/index.md | 4 ++-- docs/en/operations/external-authenticators/ldap.md | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/en/operations/external-authenticators/index.md b/docs/en/operations/external-authenticators/index.md index f06c1de8ec7..3387bbbdc05 100644 --- a/docs/en/operations/external-authenticators/index.md +++ b/docs/en/operations/external-authenticators/index.md @@ -10,10 +10,10 @@ ClickHouse supports authenticating and managing users using external services. The following external authenticators and directories are supported. -## External Authenticators +External Authenticators: - [LDAP](#ldap-external-authenticator) -## External User Directories +External User Directories: - [LDAP](#ldap-external-user-directory) diff --git a/docs/en/operations/external-authenticators/ldap.md b/docs/en/operations/external-authenticators/ldap.md index 7ad1fd68b74..03be357a12a 100644 --- a/docs/en/operations/external-authenticators/ldap.md +++ b/docs/en/operations/external-authenticators/ldap.md @@ -8,7 +8,7 @@ LDAP server can be used to authenticate ClickHouse users. There are two differen For both of these approaches, an internally named LDAP server must be defined in the ClickHouse config so that other parts of config are able to refer to it. -## Server Definition {#ldap-server-definition} +## LDAP Server Definition {#ldap-server-definition} To define LDAP server you must add `ldap_servers` section to the `config.xml`. For example, @@ -61,7 +61,7 @@ Parameters: - `tls_ca_cert_dir` - path to the directory containing CA certificates. - `tls_cipher_suite` - allowed cipher suite (in OpenSSL notation). -## Using LDAP As External Authenticator {#ldap-external-authenticator} +## LDAP External Authenticator {#ldap-external-authenticator} A remote LDAP server can be used as a method for verifying passwords for locally defined users (users defined in `users.xml` or in local access control paths). In order to achieve this, specify previously defined LDAP server name instead of `password` or similar sections in the user definition. @@ -94,7 +94,7 @@ When SQL-driven [Access Control and Account Management](#access-control) is enab CREATE USER my_user IDENTIFIED WITH ldap_server BY 'my_ldap_server' ``` -## Using LDAP As Exernal User Directory {#ldap-external-user-directory} +## LDAP Exernal User Directory {#ldap-external-user-directory} In addition to the locally defined users, a remote LDAP server can be used as a source of user definitions. In order to achieve this, specify previously defined LDAP server name (see [LDAP Server Definition](#ldap-server-definition)) in the `ldap` section inside the `users_directories` section in of the `config.xml` file. 
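[Editorial aside] The `role_mapping` rules described above boil down to a simple transformation: take the attribute values returned by the LDAP search, keep those that start with the configured prefix, strip the prefix, and treat the remainder as local role names. A standalone sketch of that rule (illustrative names, not the real code):

```cpp
#include <iostream>
#include <string>
#include <vector>

/// Turn LDAP search results into local role names by stripping the configured prefix.
std::vector<std::string> mapSearchResultsToRoles(
    const std::vector<std::string> & attribute_values, const std::string & prefix)
{
    std::vector<std::string> roles;
    for (const auto & value : attribute_values)
        if (value.size() >= prefix.size() && value.compare(0, prefix.size(), prefix) == 0)
            roles.push_back(value.substr(prefix.size()));   /// "clickhouse_dba" -> "dba"
    return roles;
}

int main()
{
    /// Values as they might come back from the "cn" attribute of matched group entries.
    std::vector<std::string> values{"clickhouse_dba", "clickhouse_analyst", "unix_admins"};
    for (const auto & role : mapSearchResultsToRoles(values, "clickhouse_"))
        std::cout << role << '\n';   /// dba, analyst
}
```

With `prefix` set to `clickhouse_`, an LDAP group `clickhouse_dba` therefore maps to a local role `dba`, which is expected to exist already via `CREATE ROLE dba`.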
From 3c94e4d6f4b5e7c8ee048d6325d6275775d35426 Mon Sep 17 00:00:00 2001 From: Vitaliy Zakaznikov Date: Mon, 8 Feb 2021 14:01:33 -0500 Subject: [PATCH 0182/2357] Changing index.md --- docs/en/operations/external-authenticators/index.md | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/docs/en/operations/external-authenticators/index.md b/docs/en/operations/external-authenticators/index.md index 3387bbbdc05..fb8483fa341 100644 --- a/docs/en/operations/external-authenticators/index.md +++ b/docs/en/operations/external-authenticators/index.md @@ -8,12 +8,6 @@ toc_title: Introduction ClickHouse supports authenticating and managing users using external services. -The following external authenticators and directories are supported. +The following external authenticators and directories are supported: -External Authenticators: - -- [LDAP](#ldap-external-authenticator) - -External User Directories: - -- [LDAP](#ldap-external-user-directory) +- [LDAP](#external-authenticators-ldap) [Authenticator](#ldap-external-authenticator) and [Directory](#ldap-external-user-directory) From 78c1d69b8c55a651f77f630e34e582dabb006f1f Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Mon, 8 Feb 2021 22:36:17 +0300 Subject: [PATCH 0183/2357] better code --- src/Common/CurrentMetrics.cpp | 1 - src/Databases/DatabaseOnDisk.cpp | 54 +++++++++++++++++++ src/Databases/DatabaseOnDisk.h | 2 + src/Databases/DatabaseOrdinary.cpp | 50 +---------------- src/Databases/DatabaseReplicated.cpp | 13 ++--- src/Databases/DatabaseReplicatedWorker.cpp | 2 +- src/Interpreters/ClientInfo.h | 1 - src/Interpreters/Context.cpp | 1 - src/Interpreters/Context.h | 1 - src/Interpreters/DDLTask.cpp | 4 +- src/Interpreters/DDLTask.h | 5 +- src/Interpreters/DDLWorker.cpp | 2 - src/Interpreters/InterpreterAlterQuery.cpp | 13 +++-- src/Interpreters/InterpreterCreateQuery.cpp | 41 +++++--------- src/Interpreters/InterpreterCreateQuery.h | 3 ++ src/Interpreters/InterpreterDropQuery.cpp | 13 ++++- src/Interpreters/InterpreterRenameQuery.cpp | 2 +- .../MergeTree/registerStorageMergeTree.cpp | 2 +- src/Storages/StorageReplicatedMergeTree.cpp | 32 +++++------ src/Storages/StorageReplicatedMergeTree.h | 4 +- src/Storages/System/StorageSystemClusters.cpp | 2 +- src/Storages/System/StorageSystemClusters.h | 2 +- .../test_replicated_database/test.py | 11 +++- ...8_ddl_dictionaries_concurrent_requrests.sh | 4 +- tests/queries/skip_list.json | 6 +++ 25 files changed, 146 insertions(+), 125 deletions(-) diff --git a/src/Common/CurrentMetrics.cpp b/src/Common/CurrentMetrics.cpp index c524467d8ca..4fb2709c8e4 100644 --- a/src/Common/CurrentMetrics.cpp +++ b/src/Common/CurrentMetrics.cpp @@ -15,7 +15,6 @@ M(BackgroundSchedulePoolTask, "Number of active tasks in BackgroundSchedulePool. This pool is used for periodic ReplicatedMergeTree tasks, like cleaning old data parts, altering data parts, replica re-initialization, etc.") \ M(BackgroundBufferFlushSchedulePoolTask, "Number of active tasks in BackgroundBufferFlushSchedulePool. This pool is used for periodic Buffer flushes") \ M(BackgroundDistributedSchedulePoolTask, "Number of active tasks in BackgroundDistributedSchedulePool. This pool is used for distributed sends that is done in background.") \ - M(BackgroundReplicatedSchedulePoolTask, "Number of active tasks in BackgroundReplicatedSchedulePoolTask. The pool is used by replicated database for executing DDL log coming from other replicas. 
One task corresponds to one replicated database") \ M(BackgroundMessageBrokerSchedulePoolTask, "Number of active tasks in BackgroundProcessingPool for message streaming") \ M(CacheDictionaryUpdateQueueBatches, "Number of 'batches' (a set of keys) in update queue in CacheDictionaries.") \ M(CacheDictionaryUpdateQueueKeys, "Exact number of keys in update queue in CacheDictionaries.") \ diff --git a/src/Databases/DatabaseOnDisk.cpp b/src/Databases/DatabaseOnDisk.cpp index 275f5bd3976..a03cb33591c 100644 --- a/src/Databases/DatabaseOnDisk.cpp +++ b/src/Databases/DatabaseOnDisk.cpp @@ -129,6 +129,60 @@ String getObjectDefinitionFromCreateQuery(const ASTPtr & query) return statement_buf.str(); } +void applyMetadataChangesToCreateQuery(const ASTPtr & query, const StorageInMemoryMetadata & metadata) +{ + auto & ast_create_query = query->as(); + + bool has_structure = ast_create_query.columns_list && ast_create_query.columns_list->columns; + if (ast_create_query.as_table_function && !has_structure) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Cannot alter table {} because it was created AS table function" + " and doesn't have structure in metadata", backQuote(ast_create_query.table)); + + assert(has_structure); + ASTPtr new_columns = InterpreterCreateQuery::formatColumns(metadata.columns); + ASTPtr new_indices = InterpreterCreateQuery::formatIndices(metadata.secondary_indices); + ASTPtr new_constraints = InterpreterCreateQuery::formatConstraints(metadata.constraints); + + ast_create_query.columns_list->replace(ast_create_query.columns_list->columns, new_columns); + ast_create_query.columns_list->setOrReplace(ast_create_query.columns_list->indices, new_indices); + ast_create_query.columns_list->setOrReplace(ast_create_query.columns_list->constraints, new_constraints); + + if (metadata.select.select_query) + { + query->replace(ast_create_query.select, metadata.select.select_query); + } + + /// MaterializedView is one type of CREATE query without storage. 
+ if (ast_create_query.storage) + { + ASTStorage & storage_ast = *ast_create_query.storage; + + bool is_extended_storage_def + = storage_ast.partition_by || storage_ast.primary_key || storage_ast.order_by || storage_ast.sample_by || storage_ast.settings; + + if (is_extended_storage_def) + { + if (metadata.sorting_key.definition_ast) + storage_ast.set(storage_ast.order_by, metadata.sorting_key.definition_ast); + + if (metadata.primary_key.definition_ast) + storage_ast.set(storage_ast.primary_key, metadata.primary_key.definition_ast); + + if (metadata.sampling_key.definition_ast) + storage_ast.set(storage_ast.sample_by, metadata.sampling_key.definition_ast); + + if (metadata.table_ttl.definition_ast) + storage_ast.set(storage_ast.ttl_table, metadata.table_ttl.definition_ast); + else if (storage_ast.ttl_table != nullptr) /// TTL was removed + storage_ast.ttl_table = nullptr; + + if (metadata.settings_changes) + storage_ast.set(storage_ast.settings, metadata.settings_changes); + } + } +} + + DatabaseOnDisk::DatabaseOnDisk( const String & name, const String & metadata_path_, diff --git a/src/Databases/DatabaseOnDisk.h b/src/Databases/DatabaseOnDisk.h index b8cc1f60e66..60a50ac4539 100644 --- a/src/Databases/DatabaseOnDisk.h +++ b/src/Databases/DatabaseOnDisk.h @@ -25,6 +25,8 @@ std::pair createTableFromAST( */ String getObjectDefinitionFromCreateQuery(const ASTPtr & query); +void applyMetadataChangesToCreateQuery(const ASTPtr & query, const StorageInMemoryMetadata & metadata); + /* Class to provide basic operations with tables when metadata is stored on disk in .sql files. */ diff --git a/src/Databases/DatabaseOrdinary.cpp b/src/Databases/DatabaseOrdinary.cpp index 49bec28e4a1..d859578eb46 100644 --- a/src/Databases/DatabaseOrdinary.cpp +++ b/src/Databases/DatabaseOrdinary.cpp @@ -272,55 +272,7 @@ void DatabaseOrdinary::alterTable(const Context & context, const StorageID & tab 0, context.getSettingsRef().max_parser_depth); - auto & ast_create_query = ast->as(); - - bool has_structure = ast_create_query.columns_list && ast_create_query.columns_list->columns; - if (ast_create_query.as_table_function && !has_structure) - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Cannot alter table {} because it was created AS table function" - " and doesn't have structure in metadata", backQuote(table_name)); - - assert(has_structure); - ASTPtr new_columns = InterpreterCreateQuery::formatColumns(metadata.columns); - ASTPtr new_indices = InterpreterCreateQuery::formatIndices(metadata.secondary_indices); - ASTPtr new_constraints = InterpreterCreateQuery::formatConstraints(metadata.constraints); - - ast_create_query.columns_list->replace(ast_create_query.columns_list->columns, new_columns); - ast_create_query.columns_list->setOrReplace(ast_create_query.columns_list->indices, new_indices); - ast_create_query.columns_list->setOrReplace(ast_create_query.columns_list->constraints, new_constraints); - - if (metadata.select.select_query) - { - ast->replace(ast_create_query.select, metadata.select.select_query); - } - - /// MaterializedView is one type of CREATE query without storage. 
- if (ast_create_query.storage) - { - ASTStorage & storage_ast = *ast_create_query.storage; - - bool is_extended_storage_def - = storage_ast.partition_by || storage_ast.primary_key || storage_ast.order_by || storage_ast.sample_by || storage_ast.settings; - - if (is_extended_storage_def) - { - if (metadata.sorting_key.definition_ast) - storage_ast.set(storage_ast.order_by, metadata.sorting_key.definition_ast); - - if (metadata.primary_key.definition_ast) - storage_ast.set(storage_ast.primary_key, metadata.primary_key.definition_ast); - - if (metadata.sampling_key.definition_ast) - storage_ast.set(storage_ast.sample_by, metadata.sampling_key.definition_ast); - - if (metadata.table_ttl.definition_ast) - storage_ast.set(storage_ast.ttl_table, metadata.table_ttl.definition_ast); - else if (storage_ast.ttl_table != nullptr) /// TTL was removed - storage_ast.ttl_table = nullptr; - - if (metadata.settings_changes) - storage_ast.set(storage_ast.settings, metadata.settings_changes); - } - } + applyMetadataChangesToCreateQuery(ast, metadata); statement = getObjectDefinitionFromCreateQuery(ast); { diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index a134ba5dec7..4a6058afcd0 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -134,6 +134,7 @@ std::pair DatabaseReplicated::parseFullReplicaName(const String ClusterPtr DatabaseReplicated::getCluster() const { + /// TODO Maintain up-to-date Cluster and allow to use it in Distributed tables Strings hosts; Strings host_ids; @@ -149,6 +150,7 @@ ClusterPtr DatabaseReplicated::getCluster() const if (hosts.empty()) throw Exception(ErrorCodes::LOGICAL_ERROR, "No hosts found"); Int32 cver = stat.cversion; + std::sort(hosts.begin(), hosts.end()); std::vector futures; futures.reserve(hosts.size()); @@ -174,7 +176,6 @@ ClusterPtr DatabaseReplicated::getCluster() const assert(!hosts.empty()); assert(hosts.size() == host_ids.size()); - std::sort(hosts.begin(), hosts.end()); String current_shard = parseFullReplicaName(hosts.front()).first; std::vector shards; shards.emplace_back(); @@ -327,9 +328,7 @@ BlockIO DatabaseReplicated::propose(const ASTPtr & query, const Context & query_ if (query_context.getSettingsRef().distributed_ddl_task_timeout == 0) return io; - //FIXME need list of all replicas, we can obtain it from zk - Strings hosts_to_wait; - hosts_to_wait.emplace_back(getFullReplicaName()); + Strings hosts_to_wait = getZooKeeper()->getChildren(zookeeper_path + "/replicas"); auto stream = std::make_shared(node_path, entry, query_context, hosts_to_wait); io.in = std::move(stream); return io; @@ -338,7 +337,7 @@ BlockIO DatabaseReplicated::propose(const ASTPtr & query, const Context & query_ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeeper, UInt32 from_snapshot) { - LOG_WARNING(log, "Will recover replica"); + //LOG_WARNING(log, "Will recover replica"); //FIXME drop old tables @@ -355,7 +354,7 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep Context query_context = global_context; query_context.makeQueryContext(); - query_context.getClientInfo().query_kind = ClientInfo::QueryKind::REPLICATED_LOG_QUERY; + query_context.getClientInfo().query_kind = ClientInfo::QueryKind::SECONDARY_QUERY; query_context.setCurrentDatabase(database_name); query_context.setCurrentQueryId(""); // generate random query_id @@ -436,6 +435,8 @@ void DatabaseReplicated::renameTable(const Context & context, const String & tab { if (this != 
&to_database) throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Moving tables between databases is not supported for Replicated engine"); + if (table_name == to_table_name) + throw Exception(ErrorCodes::INCORRECT_QUERY, "Cannot rename table to itself"); if (!isTableExist(table_name, context)) throw Exception(ErrorCodes::UNKNOWN_TABLE, "Table {} does not exist", table_name); if (exchange && !to_database.isTableExist(to_table_name, context)) diff --git a/src/Databases/DatabaseReplicatedWorker.cpp b/src/Databases/DatabaseReplicatedWorker.cpp index 748305922b7..dd9dc322f9d 100644 --- a/src/Databases/DatabaseReplicatedWorker.cpp +++ b/src/Databases/DatabaseReplicatedWorker.cpp @@ -48,7 +48,7 @@ void DatabaseReplicatedDDLWorker::initializeReplication() UInt32 our_log_ptr = parse(current_zookeeper->get(database->replica_path + "/log_ptr")); UInt32 max_log_ptr = parse(current_zookeeper->get(database->zookeeper_path + "/max_log_ptr")); UInt32 logs_to_keep = parse(current_zookeeper->get(database->zookeeper_path + "/logs_to_keep")); - if (our_log_ptr + logs_to_keep < max_log_ptr) + if (our_log_ptr == 0 || our_log_ptr + logs_to_keep < max_log_ptr) database->recoverLostReplica(current_zookeeper, 0); } diff --git a/src/Interpreters/ClientInfo.h b/src/Interpreters/ClientInfo.h index cacbed44c42..d2b7beb7d8c 100644 --- a/src/Interpreters/ClientInfo.h +++ b/src/Interpreters/ClientInfo.h @@ -42,7 +42,6 @@ public: NO_QUERY = 0, /// Uninitialized object. INITIAL_QUERY = 1, SECONDARY_QUERY = 2, /// Query that was initiated by another query for distributed or ON CLUSTER query execution. - REPLICATED_LOG_QUERY = 3, /// Query from replicated DDL log. }; diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index 83804125cd4..10619e3ad9a 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -79,7 +79,6 @@ namespace CurrentMetrics extern const Metric BackgroundSchedulePoolTask; extern const Metric BackgroundBufferFlushSchedulePoolTask; extern const Metric BackgroundDistributedSchedulePoolTask; - extern const Metric BackgroundReplicatedSchedulePoolTask; extern const Metric BackgroundMessageBrokerSchedulePoolTask; } diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index 906efcc6dba..636255d6190 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -622,7 +622,6 @@ public: BackgroundSchedulePool & getSchedulePool() const; BackgroundSchedulePool & getMessageBrokerSchedulePool() const; BackgroundSchedulePool & getDistributedSchedulePool() const; - BackgroundSchedulePool & getReplicatedSchedulePool() const; /// Has distributed_ddl configuration or not. bool hasDistributedDDL() const; diff --git a/src/Interpreters/DDLTask.cpp b/src/Interpreters/DDLTask.cpp index 9737167fa4c..9e379443364 100644 --- a/src/Interpreters/DDLTask.cpp +++ b/src/Interpreters/DDLTask.cpp @@ -296,7 +296,7 @@ String DatabaseReplicatedTask::getShardID() const std::unique_ptr DatabaseReplicatedTask::makeQueryContext(Context & from_context) { auto query_context = DDLTaskBase::makeQueryContext(from_context); - query_context->getClientInfo().query_kind = ClientInfo::QueryKind::REPLICATED_LOG_QUERY; //FIXME why do we need separate query kind? 
+ query_context->getClientInfo().query_kind = ClientInfo::QueryKind::SECONDARY_QUERY; query_context->setCurrentDatabase(database->getDatabaseName()); auto txn = std::make_shared(); @@ -340,7 +340,7 @@ void MetadataTransaction::commit() assert(state == CREATED); state = FAILED; current_zookeeper->multi(ops); - state = COMMITED; + state = COMMITTED; } } diff --git a/src/Interpreters/DDLTask.h b/src/Interpreters/DDLTask.h index 552f4919765..43d9fa1c0ae 100644 --- a/src/Interpreters/DDLTask.h +++ b/src/Interpreters/DDLTask.h @@ -144,7 +144,7 @@ struct MetadataTransaction enum State { CREATED, - COMMITED, + COMMITTED, FAILED }; @@ -154,10 +154,11 @@ struct MetadataTransaction bool is_initial_query; Coordination::Requests ops; - void addOps(Coordination::Requests & other_ops) + void moveOpsTo(Coordination::Requests & other_ops) { std::move(ops.begin(), ops.end(), std::back_inserter(other_ops)); ops.clear(); + state = COMMITTED; } void commit(); diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index f0cc3370211..665bacf9d6d 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -42,7 +42,6 @@ namespace DB namespace ErrorCodes { - extern const int NOT_IMPLEMENTED; extern const int LOGICAL_ERROR; extern const int TIMEOUT_EXCEEDED; extern const int UNFINISHED; @@ -51,7 +50,6 @@ namespace ErrorCodes extern const int CANNOT_ASSIGN_ALTER; extern const int CANNOT_ALLOCATE_MEMORY; extern const int MEMORY_LIMIT_EXCEEDED; - extern const int INCORRECT_QUERY; } diff --git a/src/Interpreters/InterpreterAlterQuery.cpp b/src/Interpreters/InterpreterAlterQuery.cpp index cee9b9083ea..402f05895bc 100644 --- a/src/Interpreters/InterpreterAlterQuery.cpp +++ b/src/Interpreters/InterpreterAlterQuery.cpp @@ -28,6 +28,7 @@ namespace ErrorCodes { extern const int LOGICAL_ERROR; extern const int INCORRECT_QUERY; + extern const int NOT_IMPLEMENTED; } @@ -49,7 +50,7 @@ BlockIO InterpreterAlterQuery::execute() auto table_id = context.resolveStorageID(alter, Context::ResolveOrdinary); DatabasePtr database = DatabaseCatalog::instance().getDatabase(table_id.database_name); - if (typeid_cast(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) + if (typeid_cast(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::SECONDARY_QUERY) { auto guard = DatabaseCatalog::instance().getDDLGuard(table_id.database_name, table_id.table_name); guard->releaseTableLock(); @@ -60,8 +61,6 @@ BlockIO InterpreterAlterQuery::execute() auto alter_lock = table->lockForAlter(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); auto metadata_snapshot = table->getInMemoryMetadataPtr(); - //FIXME commit MetadataTransaction for all ALTER kinds. Now its' implemented only for metadata alter. - /// Add default database to table identifiers that we can encounter in e.g. default expressions, /// mutation expression, etc. 
AddDefaultDatabaseVisitor visitor(table_id.getDatabaseName()); @@ -95,6 +94,14 @@ BlockIO InterpreterAlterQuery::execute() throw Exception("Wrong parameter type in ALTER query", ErrorCodes::LOGICAL_ERROR); } + if (typeid_cast(database.get())) + { + int command_types_count = !mutation_commands.empty() + !partition_commands.empty() + !live_view_commands.empty() + !alter_commands.empty(); + if (1 < command_types_count) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "For Replicated databases it's not allowed " + "to execute ALTERs of different types in single query"); + } + if (!mutation_commands.empty()) { MutationsInterpreter(table, metadata_snapshot, mutation_commands, context, false).validate(); diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 376bf8417ff..bbe8526ae5b 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -149,7 +149,7 @@ BlockIO InterpreterCreateQuery::createDatabase(ASTCreateQuery & create) engine = makeASTFunction("Replicated", std::make_shared(fmt::format("/clickhouse/db/{}/", create.database)), std::make_shared("s1"), - std::make_shared("r1")); + std::make_shared("r" + toString(getpid()))); } engine->no_empty_args = true; @@ -573,8 +573,9 @@ InterpreterCreateQuery::TableProperties InterpreterCreateQuery::setProperties(AS /// Set the table engine if it was not specified explicitly. setEngine(create); - create.as_database.clear(); - create.as_table.clear(); + assert(as_database_saved.empty() && as_table_saved.empty()); + std::swap(create.as_database, as_database_saved); + std::swap(create.as_table, as_table_saved); return properties; } @@ -722,7 +723,7 @@ void InterpreterCreateQuery::assertOrSetUUID(ASTCreateQuery & create, const Data const auto * kind = create.is_dictionary ? "Dictionary" : "Table"; const auto * kind_upper = create.is_dictionary ? "DICTIONARY" : "TABLE"; - if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind == ClientInfo::QueryKind::REPLICATED_LOG_QUERY && !internal) + if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind == ClientInfo::QueryKind::SECONDARY_QUERY && !internal) { if (create.uuid == UUIDHelpers::Nil) throw Exception("Table UUID is not specified in DDL log", ErrorCodes::LOGICAL_ERROR); @@ -753,7 +754,6 @@ void InterpreterCreateQuery::assertOrSetUUID(ASTCreateQuery & create, const Data } else { - assert(context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY); bool is_on_cluster = context.getClientInfo().query_kind == ClientInfo::QueryKind::SECONDARY_QUERY; if (create.uuid != UUIDHelpers::Nil && !is_on_cluster) throw Exception(ErrorCodes::INCORRECT_QUERY, @@ -850,7 +850,7 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create) "Data directory {} must be inside {} to attach it", String(data_path), String(user_files)); } } - else if (create.attach && !create.attach_short_syntax && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) + else if (create.attach && !create.attach_short_syntax && context.getClientInfo().query_kind != ClientInfo::QueryKind::SECONDARY_QUERY) { auto * log = &Poco::Logger::get("InterpreterCreateQuery"); LOG_WARNING(log, "ATTACH TABLE query with full table definition is not recommended: " @@ -874,16 +874,6 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create) /// Set and retrieve list of columns, indices and constraints. Set table engine if needed. 
Rewrite query in canonical way. TableProperties properties = setProperties(create); - /// DDL log for replicated databases can not - /// contain the right database name for every replica - /// therefore for such queries the AST database - /// field is modified right before an actual execution - if (context.getClientInfo().query_kind == ClientInfo::QueryKind::REPLICATED_LOG_QUERY) - { - create.database = current_database; - } - - //TODO make code better if possible DatabasePtr database; bool need_add_to_database = !create.temporary; if (need_add_to_database) @@ -893,7 +883,7 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create) { auto guard = DatabaseCatalog::instance().getDDLGuard(create.database, create.table); database = DatabaseCatalog::instance().getDatabase(create.database); - if (typeid_cast(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) + if (typeid_cast(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::SECONDARY_QUERY) { assertOrSetUUID(create, database); guard->releaseTableLock(); @@ -930,9 +920,6 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, guard = DatabaseCatalog::instance().getDDLGuard(create.database, create.table); database = DatabaseCatalog::instance().getDatabase(create.database); - //TODO do we need it? - if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) - throw Exception(ErrorCodes::UNKNOWN_DATABASE, "Database was renamed"); assertOrSetUUID(create, database); /// Table can be created before or it can be created concurrently in another thread, while we were waiting in DDLGuard. @@ -1107,9 +1094,10 @@ BlockIO InterpreterCreateQuery::createDictionary(ASTCreateQuery & create) auto guard = DatabaseCatalog::instance().getDDLGuard(database_name, dictionary_name); DatabasePtr database = DatabaseCatalog::instance().getDatabase(database_name); - if (typeid_cast(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) + if (typeid_cast(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::SECONDARY_QUERY) { - assertOrSetUUID(create, database); + if (!create.attach) + assertOrSetUUID(create, database); guard->releaseTableLock(); return typeid_cast(database.get())->propose(query_ptr, context); } @@ -1266,15 +1254,14 @@ AccessRightsElements InterpreterCreateQuery::getRequiredAccess() const return required_access; } -void InterpreterCreateQuery::extendQueryLogElemImpl(QueryLogElement & elem, const ASTPtr & ast, const Context &) const +void InterpreterCreateQuery::extendQueryLogElemImpl(QueryLogElement & elem, const ASTPtr &, const Context &) const { - const auto & create = ast->as(); elem.query_kind = "Create"; - if (!create.as_table.empty()) + if (!as_table_saved.empty()) { - String database = backQuoteIfNeed(create.as_database.empty() ? context.getCurrentDatabase() : create.as_database); + String database = backQuoteIfNeed(as_database_saved.empty() ? context.getCurrentDatabase() : as_database_saved); elem.query_databases.insert(database); - elem.query_tables.insert(database + "." + backQuoteIfNeed(create.as_table)); + elem.query_tables.insert(database + "." 
+ backQuoteIfNeed(as_table_saved)); } } diff --git a/src/Interpreters/InterpreterCreateQuery.h b/src/Interpreters/InterpreterCreateQuery.h index c109b0b7760..d88357fe412 100644 --- a/src/Interpreters/InterpreterCreateQuery.h +++ b/src/Interpreters/InterpreterCreateQuery.h @@ -95,5 +95,8 @@ private: /// Is this an internal query - not from the user. bool internal = false; bool force_attach = false; + + mutable String as_database_saved; + mutable String as_table_saved; }; } diff --git a/src/Interpreters/InterpreterDropQuery.cpp b/src/Interpreters/InterpreterDropQuery.cpp index e6943f06e06..ae76e8efd46 100644 --- a/src/Interpreters/InterpreterDropQuery.cpp +++ b/src/Interpreters/InterpreterDropQuery.cpp @@ -129,7 +129,7 @@ BlockIO InterpreterDropQuery::executeToTableImpl(const ASTDropQuery & query, Dat /// Prevents recursive drop from drop database query. The original query must specify a table. bool is_drop_or_detach_database = query_ptr->as()->table.empty(); bool is_replicated_ddl_query = typeid_cast(database.get()) && - context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY && + context.getClientInfo().query_kind != ClientInfo::QueryKind::SECONDARY_QUERY && !is_drop_or_detach_database; if (is_replicated_ddl_query) { @@ -137,6 +137,13 @@ BlockIO InterpreterDropQuery::executeToTableImpl(const ASTDropQuery & query, Dat throw Exception(ErrorCodes::INCORRECT_QUERY, "DETACH TABLE is not allowed for Replicated databases. " "Use DETACH TABLE PERMANENTLY or SYSTEM RESTART REPLICA"); + if (query.kind == ASTDropQuery::Kind::Detach) + context.checkAccess(table->isView() ? AccessType::DROP_VIEW : AccessType::DROP_TABLE, table_id); + else if (query.kind == ASTDropQuery::Kind::Truncate) + context.checkAccess(AccessType::TRUNCATE, table_id); + else if (query.kind == ASTDropQuery::Kind::Drop) + context.checkAccess(table->isView() ? 
AccessType::DROP_VIEW : AccessType::DROP_TABLE, table_id); + ddl_guard->releaseTableLock(); table.reset(); return typeid_cast(database.get())->propose(query.clone(), context); @@ -214,13 +221,15 @@ BlockIO InterpreterDropQuery::executeToDictionary( bool is_drop_or_detach_database = query_ptr->as()->table.empty(); bool is_replicated_ddl_query = typeid_cast(database.get()) && - context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY && + context.getClientInfo().query_kind != ClientInfo::QueryKind::SECONDARY_QUERY && !is_drop_or_detach_database; if (is_replicated_ddl_query) { if (kind == ASTDropQuery::Kind::Detach) throw Exception(ErrorCodes::INCORRECT_QUERY, "DETACH DICTIONARY is not allowed for Replicated databases."); + context.checkAccess(AccessType::DROP_DICTIONARY, database_name, dictionary_name); + ddl_guard->releaseTableLock(); return typeid_cast(database.get())->propose(query_ptr, context); } diff --git a/src/Interpreters/InterpreterRenameQuery.cpp b/src/Interpreters/InterpreterRenameQuery.cpp index 5bfc144e014..b9d7faac73c 100644 --- a/src/Interpreters/InterpreterRenameQuery.cpp +++ b/src/Interpreters/InterpreterRenameQuery.cpp @@ -80,7 +80,7 @@ BlockIO InterpreterRenameQuery::executeToTables(const ASTRenameQuery & rename, c database_catalog.assertTableDoesntExist(StorageID(elem.to_database_name, elem.to_table_name), context); DatabasePtr database = database_catalog.getDatabase(elem.from_database_name); - if (typeid_cast(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::REPLICATED_LOG_QUERY) + if (typeid_cast(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::SECONDARY_QUERY) { if (1 < descriptions.size()) throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Database {} is Replicated, " diff --git a/src/Storages/MergeTree/registerStorageMergeTree.cpp b/src/Storages/MergeTree/registerStorageMergeTree.cpp index 1d68f788a42..8377e37b07a 100644 --- a/src/Storages/MergeTree/registerStorageMergeTree.cpp +++ b/src/Storages/MergeTree/registerStorageMergeTree.cpp @@ -454,7 +454,7 @@ static StoragePtr create(const StorageFactory::Arguments & args) /// Allow implicit {uuid} macros only for zookeeper_path in ON CLUSTER queries bool is_on_cluster = args.local_context.getClientInfo().query_kind == ClientInfo::QueryKind::SECONDARY_QUERY; - bool is_replicated_database = args.local_context.getClientInfo().query_kind == ClientInfo::QueryKind::REPLICATED_LOG_QUERY && + bool is_replicated_database = args.local_context.getClientInfo().query_kind == ClientInfo::QueryKind::SECONDARY_QUERY && DatabaseCatalog::instance().getDatabase(args.table_id.database_name)->getEngineName() == "Replicated"; bool allow_uuid_macro = is_on_cluster || is_replicated_database || args.query.attach; diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index a4b83e365d1..3295be311d1 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -4284,24 +4284,12 @@ void StorageReplicatedMergeTree::alter( if (auto txn = query_context.getMetadataTransaction()) { - txn->addOps(ops); + txn->moveOpsTo(ops); /// NOTE: IDatabase::alterTable(...) is called when executing ALTER_METADATA queue entry without query context, /// so we have to update metadata of DatabaseReplicated here. - /// It also may cause "Table columns structure in ZooKeeper is different" error on server startup - /// even for Ordinary and Atomic databases. 
String metadata_zk_path = txn->zookeeper_path + "/metadata/" + escapeForFileName(table_id.table_name); auto ast = DatabaseCatalog::instance().getDatabase(table_id.database_name)->getCreateTableQuery(table_id.table_name, query_context); - auto & ast_create_query = ast->as(); - - //FIXME copy-paste - ASTPtr new_columns = InterpreterCreateQuery::formatColumns(future_metadata.columns); - ASTPtr new_indices = InterpreterCreateQuery::formatIndices(future_metadata.secondary_indices); - ASTPtr new_constraints = InterpreterCreateQuery::formatConstraints(future_metadata.constraints); - - ast_create_query.columns_list->replace(ast_create_query.columns_list->columns, new_columns); - ast_create_query.columns_list->setOrReplace(ast_create_query.columns_list->indices, new_indices); - ast_create_query.columns_list->setOrReplace(ast_create_query.columns_list->constraints, new_constraints); - + applyMetadataChangesToCreateQuery(ast, future_metadata); ops.emplace_back(zkutil::makeSetRequest(metadata_zk_path, getObjectDefinitionFromCreateQuery(ast), -1)); } @@ -4450,7 +4438,7 @@ void StorageReplicatedMergeTree::dropPartition(const ASTPtr & partition, bool de else { String partition_id = getPartitionIDFromQuery(partition, query_context); - did_drop = dropAllPartsInPartition(*zookeeper, partition_id, entry, detach); + did_drop = dropAllPartsInPartition(*zookeeper, partition_id, entry, query_context, detach); } if (did_drop) @@ -4474,7 +4462,7 @@ void StorageReplicatedMergeTree::dropPartition(const ASTPtr & partition, bool de void StorageReplicatedMergeTree::truncate( - const ASTPtr &, const StorageMetadataPtr &, const Context &, TableExclusiveLockHolder & table_lock) + const ASTPtr &, const StorageMetadataPtr &, const Context & query_context, TableExclusiveLockHolder & table_lock) { table_lock.release(); /// Truncate is done asynchronously. 
@@ -4490,7 +4478,7 @@ void StorageReplicatedMergeTree::truncate( { LogEntry entry; - if (dropAllPartsInPartition(*zookeeper, partition_id, entry, false)) + if (dropAllPartsInPartition(*zookeeper, partition_id, entry, query_context, false)) waitForAllReplicasToProcessLogEntry(entry); } } @@ -5274,6 +5262,9 @@ void StorageReplicatedMergeTree::mutate(const MutationCommands & commands, const requests.emplace_back(zkutil::makeCreateRequest( mutations_path + "/", mutation_entry.toString(), zkutil::CreateMode::PersistentSequential)); + if (auto txn = query_context.getMetadataTransaction()) + txn->moveOpsTo(requests); + Coordination::Responses responses; Coordination::Error rc = zookeeper->tryMulti(requests, responses); @@ -5775,6 +5766,9 @@ void StorageReplicatedMergeTree::replacePartitionFrom( } } + if (auto txn = context.getMetadataTransaction()) + txn->moveOpsTo(ops); + ops.emplace_back(zkutil::makeSetRequest(zookeeper_path + "/log", "", -1)); /// Just update version ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/log/log-", entry.toString(), zkutil::CreateMode::PersistentSequential)); @@ -6243,7 +6237,7 @@ bool StorageReplicatedMergeTree::dropPart( } bool StorageReplicatedMergeTree::dropAllPartsInPartition( - zkutil::ZooKeeper & zookeeper, String & partition_id, LogEntry & entry, bool detach) + zkutil::ZooKeeper & zookeeper, String & partition_id, LogEntry & entry, const Context & query_context, bool detach) { MergeTreePartInfo drop_range_info; if (!getFakePartCoveringAllPartsInPartition(partition_id, drop_range_info)) @@ -6275,6 +6269,8 @@ bool StorageReplicatedMergeTree::dropAllPartsInPartition( Coordination::Requests ops; ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/log/log-", entry.toString(), zkutil::CreateMode::PersistentSequential)); ops.emplace_back(zkutil::makeSetRequest(zookeeper_path + "/log", "", -1)); /// Just update version. 
+ if (auto txn = query_context.getMetadataTransaction()) + txn->moveOpsTo(ops); Coordination::Responses responses = zookeeper.multi(ops); String log_znode_path = dynamic_cast(*responses.front()).path_created; diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index 6db05294b63..a1a70ada9b2 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -134,7 +134,7 @@ public: */ void drop() override; - void truncate(const ASTPtr &, const StorageMetadataPtr &, const Context &, TableExclusiveLockHolder &) override; + void truncate(const ASTPtr &, const StorageMetadataPtr &, const Context & query_context, TableExclusiveLockHolder &) override; void checkTableCanBeRenamed() const override; @@ -577,7 +577,7 @@ private: bool dropPart(zkutil::ZooKeeperPtr & zookeeper, String part_name, LogEntry & entry, bool detach, bool throw_if_noop); bool dropAllPartsInPartition( - zkutil::ZooKeeper & zookeeper, String & partition_id, LogEntry & entry, bool detach); + zkutil::ZooKeeper & zookeeper, String & partition_id, LogEntry & entry, const Context & query_context, bool detach); // Partition helpers void dropPartition(const ASTPtr & partition, bool detach, bool drop_part, const Context & query_context, bool throw_if_noop) override; diff --git a/src/Storages/System/StorageSystemClusters.cpp b/src/Storages/System/StorageSystemClusters.cpp index 62ad1c5150f..7e16deb6d22 100644 --- a/src/Storages/System/StorageSystemClusters.cpp +++ b/src/Storages/System/StorageSystemClusters.cpp @@ -41,7 +41,7 @@ void StorageSystemClusters::fillData(MutableColumns & res_columns, const Context } } -void StorageSystemClusters::writeCluster(MutableColumns & res_columns, const NameAndCluster & name_and_cluster) const +void StorageSystemClusters::writeCluster(MutableColumns & res_columns, const NameAndCluster & name_and_cluster) { const String & cluster_name = name_and_cluster.first; const ClusterPtr & cluster = name_and_cluster.second; diff --git a/src/Storages/System/StorageSystemClusters.h b/src/Storages/System/StorageSystemClusters.h index 68282f1b1fe..4f2a843999f 100644 --- a/src/Storages/System/StorageSystemClusters.h +++ b/src/Storages/System/StorageSystemClusters.h @@ -29,7 +29,7 @@ protected: using NameAndCluster = std::pair>; void fillData(MutableColumns & res_columns, const Context & context, const SelectQueryInfo & query_info) const override; - void writeCluster(MutableColumns & res_columns, const NameAndCluster & name_and_cluster) const; + static void writeCluster(MutableColumns & res_columns, const NameAndCluster & name_and_cluster); }; } diff --git a/tests/integration/test_replicated_database/test.py b/tests/integration/test_replicated_database/test.py index 2471228b55e..2a5a7f4716e 100644 --- a/tests/integration/test_replicated_database/test.py +++ b/tests/integration/test_replicated_database/test.py @@ -147,7 +147,16 @@ def test_alters_from_different_replicas(started_cluster): main_node.query("SYSTEM FLUSH DISTRIBUTED testdb.dist") main_node.query("ALTER TABLE testdb.concurrent_test UPDATE StartDate = addYears(StartDate, 1) WHERE 1") - main_node.query("ALTER TABLE testdb.concurrent_test DELETE WHERE UserID % 2") + res = main_node.query("ALTER TABLE testdb.concurrent_test DELETE WHERE UserID % 2") + assert "shard1|replica1" in res and "shard1|replica2" in res and "shard1|replica3" in res + assert "shard2|replica1" in res and "shard2|replica2" in res + + expected = "1\t1\tmain_node\n" \ + "1\t2\tdummy_node\n" \ + 
"1\t3\tcompeting_node\n" \ + "2\t1\tsnapshotting_node\n" \ + "2\t2\tsnapshot_recovering_node\n" + assert main_node.query("SELECT shard_num, replica_num, host_name FROM system.clusters WHERE cluster='testdb'") == expected # test_drop_and_create_replica main_node.query("DROP DATABASE testdb") diff --git a/tests/queries/0_stateless/01018_ddl_dictionaries_concurrent_requrests.sh b/tests/queries/0_stateless/01018_ddl_dictionaries_concurrent_requrests.sh index bc13e44934a..025fe51e2a9 100755 --- a/tests/queries/0_stateless/01018_ddl_dictionaries_concurrent_requrests.sh +++ b/tests/queries/0_stateless/01018_ddl_dictionaries_concurrent_requrests.sh @@ -113,8 +113,8 @@ timeout $TIMEOUT bash -c thread7 2> /dev/null & wait $CLICKHOUSE_CLIENT -q "SELECT 'Still alive'" -$CLICKHOUSE_CLIENT -q "ATTACH DICTIONARY database_for_dict.dict1" -$CLICKHOUSE_CLIENT -q "ATTACH DICTIONARY database_for_dict.dict2" +$CLICKHOUSE_CLIENT -q "ATTACH DICTIONARY IF NOT EXISTS database_for_dict.dict1" +$CLICKHOUSE_CLIENT -q "ATTACH DICTIONARY IF NOT EXISTS database_for_dict.dict2" $CLICKHOUSE_CLIENT -n -q " DROP TABLE table_for_dict1; diff --git a/tests/queries/skip_list.json b/tests/queries/skip_list.json index 4c6927f575a..1c5136b6bde 100644 --- a/tests/queries/skip_list.json +++ b/tests/queries/skip_list.json @@ -103,6 +103,12 @@ "memory_tracking", /// FIXME remove it before merge "memory_tracking", "memory_usage", + "01188_attach_table_from_pat", + "01110_dictionary_layout_without_arguments", + "01018_ddl_dictionaries_create", + "01018_ddl_dictionaries_select", + "01414_freeze_does_not_prevent_alters", + "01018_ddl_dictionaries_bad_queries", "01686_rocksdb", "01550_mutation_subquery", "01070_mutations_with_dependencies", From 813092ff55c9534a0d10be577a3fbb8326810442 Mon Sep 17 00:00:00 2001 From: Nicolae Vartolomei Date: Thu, 4 Feb 2021 12:49:38 +0000 Subject: [PATCH 0184/2357] Test replicated fetches_network partition --- .../__init__.py | 0 .../configs/profiles.xml | 9 ++ .../test.py | 82 +++++++++++++++++++ 3 files changed, 91 insertions(+) create mode 100644 tests/integration/test_replicated_fetches_network_partition/__init__.py create mode 100644 tests/integration/test_replicated_fetches_network_partition/configs/profiles.xml create mode 100644 tests/integration/test_replicated_fetches_network_partition/test.py diff --git a/tests/integration/test_replicated_fetches_network_partition/__init__.py b/tests/integration/test_replicated_fetches_network_partition/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_replicated_fetches_network_partition/configs/profiles.xml b/tests/integration/test_replicated_fetches_network_partition/configs/profiles.xml new file mode 100644 index 00000000000..7abcf0bfde3 --- /dev/null +++ b/tests/integration/test_replicated_fetches_network_partition/configs/profiles.xml @@ -0,0 +1,9 @@ + + + + + 30 + + + + \ No newline at end of file diff --git a/tests/integration/test_replicated_fetches_network_partition/test.py b/tests/integration/test_replicated_fetches_network_partition/test.py new file mode 100644 index 00000000000..618bf94c2bd --- /dev/null +++ b/tests/integration/test_replicated_fetches_network_partition/test.py @@ -0,0 +1,82 @@ +#!/usr/bin/env python3 + +import pytest +import time +from helpers.cluster import ClickHouseCluster +from helpers.network import PartitionManager +import random +import string + +cluster = ClickHouseCluster(__file__) +node1 = cluster.add_instance('node1', with_zookeeper=True) +node2 = 
cluster.add_instance('node2', with_zookeeper=True) +node3 = cluster.add_instance('node3', with_zookeeper=True, user_configs=['configs/profiles.xml']) + +DEFAULT_MAX_THREADS_FOR_FETCH = 3 + +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster.start() + + yield cluster + + finally: + cluster.shutdown() + + +def get_random_string(length): + return ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(length)) + + +def test_no_stall(started_cluster): + node1.query("CREATE TABLE t (key UInt64, data String) ENGINE = ReplicatedMergeTree('/clickhouse/test/t', '1') ORDER BY tuple() PARTITION BY key") + node2.query("CREATE TABLE t (key UInt64, data String) ENGINE = ReplicatedMergeTree('/clickhouse/test/t', '2') ORDER BY tuple() PARTITION BY key") + node3.query("CREATE TABLE t (key UInt64, data String) ENGINE = ReplicatedMergeTree('/clickhouse/test/t', '3') ORDER BY tuple() PARTITION BY key") + + node1.query("SYSTEM STOP MERGES") + node2.query("SYSTEM STOP MERGES") + node3.query("SYSTEM STOP MERGES") + + with PartitionManager() as pm: + node3.query("SYSTEM STOP FETCHES t") + + node1.query("INSERT INTO t SELECT 1, '{}' FROM numbers(5000)".format(get_random_string(104857))) + node1.query("INSERT INTO t SELECT 2, '{}' FROM numbers(5000)".format(get_random_string(104857))) + node1.query("INSERT INTO t SELECT 3, '{}' FROM numbers(5000)".format(get_random_string(104857))) + node1.query("INSERT INTO t SELECT 4, '{}' FROM numbers(5000)".format(get_random_string(104857))) + node1.query("INSERT INTO t SELECT 5, '{}' FROM numbers(5000)".format(get_random_string(104857))) + + # Make sure node2 has all the parts. + node2.query("SYSTEM SYNC REPLICA t") + + # Do not allow sending from replica 2 yet, force node3 to initiate replication from node1. + node2.query("SYSTEM STOP REPLICATED SENDS") + + print("replica 2 fully synced") + + # Make node1 very slow, node3 should replicate from node2 instead. + pm.add_network_delay(node1, 1000) + + # node3 starts to replicate from node 1 + node3.query("SYSTEM START FETCHES t") + + # Wait some time to give a chance for node3 to try replicating without success from node1. + time.sleep(10) + + # Wait for replication... + node2.query("SYSTEM START REPLICATED SENDS") + + for _ in range(1000): + print(node3.query("SELECT result_part_name FROM system.replicated_fetches").strip().split()) + print() + result = node3.query("SELECT count() FROM system.parts WHERE table = 't'").strip() + print(result) + print() + print() + + # Replication done. 
+ if result == "5": + break + + time.sleep(3) From b153e8c190bc6246456cdc401beb8254004897cd Mon Sep 17 00:00:00 2001 From: Nicolae Vartolomei Date: Thu, 4 Feb 2021 17:25:10 +0000 Subject: [PATCH 0185/2357] Add support for custom fetchPart timeouts --- src/Storages/MergeTree/MergeTreeSettings.h | 3 +++ src/Storages/StorageReplicatedMergeTree.cpp | 23 +++++++++++++++++-- src/Storages/StorageReplicatedMergeTree.h | 2 ++ .../test.py | 11 ++++----- 4 files changed, 30 insertions(+), 9 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeSettings.h b/src/Storages/MergeTree/MergeTreeSettings.h index 53388617a07..705222e86f5 100644 --- a/src/Storages/MergeTree/MergeTreeSettings.h +++ b/src/Storages/MergeTree/MergeTreeSettings.h @@ -80,6 +80,9 @@ struct Settings; M(UInt64, replicated_max_parallel_fetches_for_host, DEFAULT_COUNT_OF_HTTP_CONNECTIONS_PER_ENDPOINT, "Limit parallel fetches from endpoint (actually pool size).", 0) \ M(UInt64, replicated_max_parallel_sends, 0, "Limit parallel sends.", 0) \ M(UInt64, replicated_max_parallel_sends_for_table, 0, "Limit parallel sends for one table.", 0) \ + M(Seconds, replicated_fetches_http_connection_timeout, 0, "HTTP connection timeout for part fetch requests. Inherited from default profile `http_connection_timeout` if not set explicitly.", 0) \ + M(Seconds, replicated_fetches_http_send_timeout, 0, "HTTP send timeout for part fetch/send requests. Inherited from default profile `http_send_timeout` if not set explicitly.", 0) \ + M(Seconds, replicated_fetches_http_receive_timeout, 0, "HTTP receive timeout for fetch/send part requests. Inherited from default profile `http_receive_timeout` if not set explicitly.", 0) \ M(Bool, replicated_can_become_leader, true, "If true, Replicated tables replicas on this node will try to acquire leadership.", 0) \ M(Seconds, zookeeper_session_expiration_check_period, 60, "ZooKeeper session expiration check period, in seconds.", 0) \ M(Bool, detach_old_local_parts_when_cloning_replica, 1, "Do not remove old local parts when repairing lost replica.", 0) \ diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 69cbe0d7062..6b168f051ac 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -2178,7 +2178,8 @@ bool StorageReplicatedMergeTree::executeReplaceRange(const LogEntry & entry) { String source_replica_path = zookeeper_path + "/replicas/" + part_desc->replica; ReplicatedMergeTreeAddress address(getZooKeeper()->get(source_replica_path + "/host")); - auto timeouts = ConnectionTimeouts::getHTTPTimeouts(global_context); + auto timeouts = getFetchPartHTTPTimeouts(global_context); + auto [user, password] = global_context.getInterserverCredentials(); String interserver_scheme = global_context.getInterserverScheme(); @@ -3111,6 +3112,23 @@ void StorageReplicatedMergeTree::exitLeaderElection() leader_election = nullptr; } +ConnectionTimeouts StorageReplicatedMergeTree::getFetchPartHTTPTimeouts(const Context & context) +{ + auto timeouts = ConnectionTimeouts::getHTTPTimeouts(context); + auto settings = getSettings(); + + if (settings->replicated_fetches_http_connection_timeout.changed) + timeouts.connection_timeout = settings->replicated_fetches_http_connection_timeout; + + if (settings->replicated_fetches_http_send_timeout.changed) + timeouts.send_timeout = settings->replicated_fetches_http_send_timeout; + + if (settings->replicated_fetches_http_receive_timeout.changed) + timeouts.receive_timeout = 
settings->replicated_fetches_http_receive_timeout; + + return timeouts; +} + bool StorageReplicatedMergeTree::checkReplicaHavePart(const String & replica, const String & part_name) { auto zookeeper = getZooKeeper(); @@ -3520,7 +3538,8 @@ bool StorageReplicatedMergeTree::fetchPart(const String & part_name, const Stora else { ReplicatedMergeTreeAddress address(zookeeper->get(source_replica_path + "/host")); - auto timeouts = ConnectionTimeouts::getHTTPTimeouts(global_context); + auto timeouts = getFetchPartHTTPTimeouts(global_context); + auto user_password = global_context.getInterserverCredentials(); String interserver_scheme = global_context.getInterserverScheme(); diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index 6db05294b63..2acfd7027b9 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -488,6 +488,8 @@ private: /// Exchange parts. + ConnectionTimeouts getFetchPartHTTPTimeouts(const Context & context); + /** Returns an empty string if no one has a part. */ String findReplicaHavingPart(const String & part_name, bool active); diff --git a/tests/integration/test_replicated_fetches_network_partition/test.py b/tests/integration/test_replicated_fetches_network_partition/test.py index 618bf94c2bd..e42395c7af4 100644 --- a/tests/integration/test_replicated_fetches_network_partition/test.py +++ b/tests/integration/test_replicated_fetches_network_partition/test.py @@ -68,15 +68,12 @@ def test_no_stall(started_cluster): node2.query("SYSTEM START REPLICATED SENDS") for _ in range(1000): - print(node3.query("SELECT result_part_name FROM system.replicated_fetches").strip().split()) - print() - result = node3.query("SELECT count() FROM system.parts WHERE table = 't'").strip() - print(result) - print() - print() + print('Currently running fetches', node3.query("SELECT result_part_name FROM system.replicated_fetches").strip().split()) + parts_fetched = node3.query("SELECT count() FROM system.parts WHERE table = 't'").strip() + print('parts_fetched', parts_fetched) # Replication done. 
- if result == "5": + if parts_fetched == "5": break time.sleep(3) From 75fca08b2a15c702884aa760fbca3389c974bea3 Mon Sep 17 00:00:00 2001 From: Nicolae Vartolomei Date: Thu, 4 Feb 2021 17:26:53 +0000 Subject: [PATCH 0186/2357] Update test_replicated_fetches_network_partition with new settings --- .../configs/merge_tree.xml | 6 ++++++ .../configs/profiles.xml | 9 --------- .../test_replicated_fetches_network_partition/test.py | 2 +- 3 files changed, 7 insertions(+), 10 deletions(-) create mode 100644 tests/integration/test_replicated_fetches_network_partition/configs/merge_tree.xml delete mode 100644 tests/integration/test_replicated_fetches_network_partition/configs/profiles.xml diff --git a/tests/integration/test_replicated_fetches_network_partition/configs/merge_tree.xml b/tests/integration/test_replicated_fetches_network_partition/configs/merge_tree.xml new file mode 100644 index 00000000000..eba2c5e8ffc --- /dev/null +++ b/tests/integration/test_replicated_fetches_network_partition/configs/merge_tree.xml @@ -0,0 +1,6 @@ + + + 30 + 1 + + diff --git a/tests/integration/test_replicated_fetches_network_partition/configs/profiles.xml b/tests/integration/test_replicated_fetches_network_partition/configs/profiles.xml deleted file mode 100644 index 7abcf0bfde3..00000000000 --- a/tests/integration/test_replicated_fetches_network_partition/configs/profiles.xml +++ /dev/null @@ -1,9 +0,0 @@ - - - - - 30 - - - - \ No newline at end of file diff --git a/tests/integration/test_replicated_fetches_network_partition/test.py b/tests/integration/test_replicated_fetches_network_partition/test.py index e42395c7af4..e493709dcbe 100644 --- a/tests/integration/test_replicated_fetches_network_partition/test.py +++ b/tests/integration/test_replicated_fetches_network_partition/test.py @@ -10,7 +10,7 @@ import string cluster = ClickHouseCluster(__file__) node1 = cluster.add_instance('node1', with_zookeeper=True) node2 = cluster.add_instance('node2', with_zookeeper=True) -node3 = cluster.add_instance('node3', with_zookeeper=True, user_configs=['configs/profiles.xml']) +node3 = cluster.add_instance('node3', with_zookeeper=True, main_configs=['configs/merge_tree.xml']) DEFAULT_MAX_THREADS_FOR_FETCH = 3 From 165ba59a1535cf97a7bc35a5dfbf37904163dd5b Mon Sep 17 00:00:00 2001 From: Nicolae Vartolomei Date: Thu, 4 Feb 2021 20:05:22 +0000 Subject: [PATCH 0187/2357] Increase network delay and add more info to logs for debugging CI behaviour is different from my test environment --- .../test_replicated_fetches_network_partition/test.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/integration/test_replicated_fetches_network_partition/test.py b/tests/integration/test_replicated_fetches_network_partition/test.py index e493709dcbe..fd4727ba9fb 100644 --- a/tests/integration/test_replicated_fetches_network_partition/test.py +++ b/tests/integration/test_replicated_fetches_network_partition/test.py @@ -12,8 +12,6 @@ node1 = cluster.add_instance('node1', with_zookeeper=True) node2 = cluster.add_instance('node2', with_zookeeper=True) node3 = cluster.add_instance('node3', with_zookeeper=True, main_configs=['configs/merge_tree.xml']) -DEFAULT_MAX_THREADS_FOR_FETCH = 3 - @pytest.fixture(scope="module") def started_cluster(): try: @@ -56,7 +54,7 @@ def test_no_stall(started_cluster): print("replica 2 fully synced") # Make node1 very slow, node3 should replicate from node2 instead. 
- pm.add_network_delay(node1, 1000) + pm.add_network_delay(node1, 2000) # node3 starts to replicate from node 1 node3.query("SYSTEM START FETCHES t") @@ -68,7 +66,7 @@ def test_no_stall(started_cluster): node2.query("SYSTEM START REPLICATED SENDS") for _ in range(1000): - print('Currently running fetches', node3.query("SELECT result_part_name FROM system.replicated_fetches").strip().split()) + print('Currently running fetches', node3.query("SELECT result_part_name, source_replica_hostname, progress FROM system.replicated_fetches").strip().split()) parts_fetched = node3.query("SELECT count() FROM system.parts WHERE table = 't'").strip() print('parts_fetched', parts_fetched) From f39eef92279c53edf00eefad7a328f2d98a8eb73 Mon Sep 17 00:00:00 2001 From: Nicolae Vartolomei Date: Thu, 4 Feb 2021 20:21:20 +0000 Subject: [PATCH 0188/2357] Reformat test and drop tables at the end to reduce the size of test results archive --- .../test.py | 46 +++++++++++-------- 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/tests/integration/test_replicated_fetches_network_partition/test.py b/tests/integration/test_replicated_fetches_network_partition/test.py index fd4727ba9fb..e0c5c4cce8a 100644 --- a/tests/integration/test_replicated_fetches_network_partition/test.py +++ b/tests/integration/test_replicated_fetches_network_partition/test.py @@ -12,6 +12,7 @@ node1 = cluster.add_instance('node1', with_zookeeper=True) node2 = cluster.add_instance('node2', with_zookeeper=True) node3 = cluster.add_instance('node3', with_zookeeper=True, main_configs=['configs/merge_tree.xml']) + @pytest.fixture(scope="module") def started_cluster(): try: @@ -36,23 +37,24 @@ def test_no_stall(started_cluster): node2.query("SYSTEM STOP MERGES") node3.query("SYSTEM STOP MERGES") + # Pause node3 until the test setup is prepared + node3.query("SYSTEM STOP FETCHES t") + + node1.query("INSERT INTO t SELECT 1, '{}' FROM numbers(5000)".format(get_random_string(104857))) + node1.query("INSERT INTO t SELECT 2, '{}' FROM numbers(5000)".format(get_random_string(104857))) + node1.query("INSERT INTO t SELECT 3, '{}' FROM numbers(5000)".format(get_random_string(104857))) + node1.query("INSERT INTO t SELECT 4, '{}' FROM numbers(5000)".format(get_random_string(104857))) + node1.query("INSERT INTO t SELECT 5, '{}' FROM numbers(5000)".format(get_random_string(104857))) + + # Make sure node2 has all the parts. + node2.query("SYSTEM SYNC REPLICA t") + + # Do not allow sending from replica 2 yet, force node3 to initiate replication from node1. + node2.query("SYSTEM STOP REPLICATED SENDS") + + print("replica 2 fully synced") + with PartitionManager() as pm: - node3.query("SYSTEM STOP FETCHES t") - - node1.query("INSERT INTO t SELECT 1, '{}' FROM numbers(5000)".format(get_random_string(104857))) - node1.query("INSERT INTO t SELECT 2, '{}' FROM numbers(5000)".format(get_random_string(104857))) - node1.query("INSERT INTO t SELECT 3, '{}' FROM numbers(5000)".format(get_random_string(104857))) - node1.query("INSERT INTO t SELECT 4, '{}' FROM numbers(5000)".format(get_random_string(104857))) - node1.query("INSERT INTO t SELECT 5, '{}' FROM numbers(5000)".format(get_random_string(104857))) - - # Make sure node2 has all the parts. - node2.query("SYSTEM SYNC REPLICA t") - - # Do not allow sending from replica 2 yet, force node3 to initiate replication from node1. - node2.query("SYSTEM STOP REPLICATED SENDS") - - print("replica 2 fully synced") - # Make node1 very slow, node3 should replicate from node2 instead. 
pm.add_network_delay(node1, 2000) @@ -66,12 +68,20 @@ def test_no_stall(started_cluster): node2.query("SYSTEM START REPLICATED SENDS") for _ in range(1000): - print('Currently running fetches', node3.query("SELECT result_part_name, source_replica_hostname, progress FROM system.replicated_fetches").strip().split()) + print('Currently running fetches:') + print(node3.query("SELECT result_part_name, source_replica_hostname, progress FROM system.replicated_fetches").strip()) + print() + parts_fetched = node3.query("SELECT count() FROM system.parts WHERE table = 't'").strip() - print('parts_fetched', parts_fetched) + print('parts_fetched:', parts_fetched) + print() # Replication done. if parts_fetched == "5": break time.sleep(3) + + node1.query("DROP TABLE t SYNC") + node2.query("DROP TABLE t SYNC") + node3.query("DROP TABLE t SYNC") From 135f82ce94d8cbb548de5a6f0a0712bffa0d4a9c Mon Sep 17 00:00:00 2001 From: Nicolae Vartolomei Date: Thu, 4 Feb 2021 20:33:06 +0000 Subject: [PATCH 0189/2357] Correctly document that settings apply only for fetch part requests --- src/Storages/MergeTree/MergeTreeSettings.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeSettings.h b/src/Storages/MergeTree/MergeTreeSettings.h index 705222e86f5..4d7be425cf7 100644 --- a/src/Storages/MergeTree/MergeTreeSettings.h +++ b/src/Storages/MergeTree/MergeTreeSettings.h @@ -81,8 +81,8 @@ struct Settings; M(UInt64, replicated_max_parallel_sends, 0, "Limit parallel sends.", 0) \ M(UInt64, replicated_max_parallel_sends_for_table, 0, "Limit parallel sends for one table.", 0) \ M(Seconds, replicated_fetches_http_connection_timeout, 0, "HTTP connection timeout for part fetch requests. Inherited from default profile `http_connection_timeout` if not set explicitly.", 0) \ - M(Seconds, replicated_fetches_http_send_timeout, 0, "HTTP send timeout for part fetch/send requests. Inherited from default profile `http_send_timeout` if not set explicitly.", 0) \ - M(Seconds, replicated_fetches_http_receive_timeout, 0, "HTTP receive timeout for fetch/send part requests. Inherited from default profile `http_receive_timeout` if not set explicitly.", 0) \ + M(Seconds, replicated_fetches_http_send_timeout, 0, "HTTP send timeout for part fetch requests. Inherited from default profile `http_send_timeout` if not set explicitly.", 0) \ + M(Seconds, replicated_fetches_http_receive_timeout, 0, "HTTP receive timeout for fetch part requests. 
Inherited from default profile `http_receive_timeout` if not set explicitly.", 0) \ M(Bool, replicated_can_become_leader, true, "If true, Replicated tables replicas on this node will try to acquire leadership.", 0) \ M(Seconds, zookeeper_session_expiration_check_period, 60, "ZooKeeper session expiration check period, in seconds.", 0) \ M(Bool, detach_old_local_parts_when_cloning_replica, 1, "Do not remove old local parts when repairing lost replica.", 0) \ From 6181daf92d9aac636bc2c2df7b7a9f0944ac35bc Mon Sep 17 00:00:00 2001 From: Nicolae Vartolomei Date: Fri, 5 Feb 2021 00:35:54 +0000 Subject: [PATCH 0190/2357] Workaround for drop not finishing if it is started while table is readonly --- .../test_replicated_fetches_network_partition/test.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/integration/test_replicated_fetches_network_partition/test.py b/tests/integration/test_replicated_fetches_network_partition/test.py index e0c5c4cce8a..b0b5534cf1f 100644 --- a/tests/integration/test_replicated_fetches_network_partition/test.py +++ b/tests/integration/test_replicated_fetches_network_partition/test.py @@ -82,6 +82,7 @@ def test_no_stall(started_cluster): time.sleep(3) - node1.query("DROP TABLE t SYNC") - node2.query("DROP TABLE t SYNC") - node3.query("DROP TABLE t SYNC") + for n in [node1, node2, node3]: + # Workaround for drop not finishing if it is started while table is readonly. + n.query("SYSTEM RESTART REPLICA t") + n.query("DROP TABLE t SYNC") From e252b138420cb9621dbc26aff3ef411d43177161 Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Mon, 8 Feb 2021 23:54:28 +0300 Subject: [PATCH 0191/2357] Update simpleaggregatefunction.md Remove output of creating table example. --- .../data-types/simpleaggregatefunction.md | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/docs/ru/sql-reference/data-types/simpleaggregatefunction.md b/docs/ru/sql-reference/data-types/simpleaggregatefunction.md index 7441ceae655..b80826803de 100644 --- a/docs/ru/sql-reference/data-types/simpleaggregatefunction.md +++ b/docs/ru/sql-reference/data-types/simpleaggregatefunction.md @@ -32,22 +32,8 @@ **Пример** -Запрос: - ``` sql CREATE TABLE simple (id UInt64,val SimpleAggregateFunction(sum,Double)) ENGINE=AggregatingMergeTree ORDER BY id; ``` -Ответ: - -``` text -CREATE TABLE simple -( - `id` UInt64, - `val` SimpleAggregateFunction(sum, Double) -) -ENGINE = AggregatingMergeTree -ORDER BY id -``` - [Оригинальная статья](https://clickhouse.tech/docs/en/data_types/simpleaggregatefunction/) From 436954dc26de1263b9071d530101b9468ac8c2eb Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Mon, 8 Feb 2021 23:54:52 +0300 Subject: [PATCH 0192/2357] Update simpleaggregatefunction.md --- .../data-types/simpleaggregatefunction.md | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/docs/en/sql-reference/data-types/simpleaggregatefunction.md b/docs/en/sql-reference/data-types/simpleaggregatefunction.md index 9ea5a586981..e25d4803613 100644 --- a/docs/en/sql-reference/data-types/simpleaggregatefunction.md +++ b/docs/en/sql-reference/data-types/simpleaggregatefunction.md @@ -34,22 +34,8 @@ The following aggregate functions are supported: **Example** -Query: - ``` sql CREATE TABLE simple (id UInt64,val SimpleAggregateFunction(sum,Double)) ENGINE=AggregatingMergeTree ORDER BY id; ``` -Result: - -``` text -CREATE TABLE simple -( - `id` UInt64, - `val` SimpleAggregateFunction(sum, Double) -) -ENGINE = AggregatingMergeTree -ORDER BY id -``` - [Original 
article](https://clickhouse.tech/docs/en/data_types/simpleaggregatefunction/) From df5dc102c47cde5112186a9f7b0d62b0e6b574ee Mon Sep 17 00:00:00 2001 From: Pavel Kruglov Date: Mon, 8 Feb 2021 22:31:21 +0300 Subject: [PATCH 0193/2357] Don't use 'ReplicaStatePtr &' to prevent use-after-free --- src/Client/HedgedConnections.cpp | 30 ++++++++++++------------- src/Client/HedgedConnections.h | 20 ++++++++--------- src/Client/HedgedConnectionsFactory.cpp | 16 ++++++------- src/Client/HedgedConnectionsFactory.h | 18 +++++++-------- 4 files changed, 42 insertions(+), 42 deletions(-) diff --git a/src/Client/HedgedConnections.cpp b/src/Client/HedgedConnections.cpp index b361f04f0b1..8e547169f29 100644 --- a/src/Client/HedgedConnections.cpp +++ b/src/Client/HedgedConnections.cpp @@ -42,15 +42,15 @@ HedgedConnections::HedgedConnections( active_connection_count = connections.size(); offsets_with_received_first_data_packet = 0; - pipeline_for_new_replicas.add([throttler_](ReplicaStatePtr & replica_) { replica_->connection->setThrottler(throttler_); }); + pipeline_for_new_replicas.add([throttler_](ReplicaStatePtr replica_) { replica_->connection->setThrottler(throttler_); }); } -void HedgedConnections::Pipeline::add(std::function send_function) +void HedgedConnections::Pipeline::add(std::function send_function) { pipeline.push_back(send_function); } -void HedgedConnections::Pipeline::run(ReplicaStatePtr & replica) +void HedgedConnections::Pipeline::run(ReplicaStatePtr replica) { for (auto & send_func : pipeline) send_func(replica); @@ -63,7 +63,7 @@ void HedgedConnections::sendScalarsData(Scalars & data) if (!sent_query) throw Exception("Cannot send scalars data: query not yet sent.", ErrorCodes::LOGICAL_ERROR); - auto send_scalars_data = [&data](ReplicaStatePtr & replica) { replica->connection->sendScalarsData(data); }; + auto send_scalars_data = [&data](ReplicaStatePtr replica) { replica->connection->sendScalarsData(data); }; for (auto & offset_state : offset_states) for (auto & replica : offset_state.replicas) @@ -83,7 +83,7 @@ void HedgedConnections::sendExternalTablesData(std::vector & if (data.size() != size()) throw Exception("Mismatch between replicas and data sources", ErrorCodes::MISMATCH_REPLICAS_DATA_SOURCES); - auto send_external_tables_data = [&data](ReplicaStatePtr & replica) { replica->connection->sendExternalTablesData(data[0]); }; + auto send_external_tables_data = [&data](ReplicaStatePtr replica) { replica->connection->sendExternalTablesData(data[0]); }; for (auto & offset_state : offset_states) for (auto & replica : offset_state.replicas) @@ -100,7 +100,7 @@ void HedgedConnections::sendIgnoredPartUUIDs(const std::vector & uuids) if (sent_query) throw Exception("Cannot send uuids after query is sent.", ErrorCodes::LOGICAL_ERROR); - auto send_ignored_part_uuids = [&uuids](ReplicaStatePtr & replica) { replica->connection->sendIgnoredPartUUIDs(uuids); }; + auto send_ignored_part_uuids = [&uuids](ReplicaStatePtr replica) { replica->connection->sendIgnoredPartUUIDs(uuids); }; for (auto & offset_state : offset_states) for (auto & replica : offset_state.replicas) @@ -137,7 +137,7 @@ void HedgedConnections::sendQuery( break; } - auto send_query = [this, timeouts, query, query_id, stage, client_info, with_pending_data](ReplicaStatePtr & replica) + auto send_query = [this, timeouts, query, query_id, stage, client_info, with_pending_data](ReplicaStatePtr replica) { Settings modified_settings = settings; @@ -295,7 +295,7 @@ Packet HedgedConnections::receivePacketImpl(AsyncCallback async_callback) } 
else if (timeout_fd_to_replica.contains(event_fd)) { - ReplicaStatePtr & replica = timeout_fd_to_replica[event_fd]; + ReplicaStatePtr replica = timeout_fd_to_replica[event_fd]; processTimeoutEvent(replica, replica->active_timeouts[event_fd]); } else if (event_fd == hedged_connections_factory.getFileDescriptor()) @@ -321,7 +321,7 @@ int HedgedConnections::getReadyFileDescriptor(AsyncCallback async_callback) return event.data.fd; } -Packet HedgedConnections::receivePacketFromReplica(ReplicaStatePtr & replica, AsyncCallback async_callback) +Packet HedgedConnections::receivePacketFromReplica(ReplicaStatePtr replica, AsyncCallback async_callback) { removeTimeoutFromReplica(ConnectionTimeoutType::RECEIVE_TIMEOUT, replica); Packet packet = replica->connection->receivePacket(std::move(async_callback)); @@ -354,7 +354,7 @@ Packet HedgedConnections::receivePacketFromReplica(ReplicaStatePtr & replica, As return packet; } -void HedgedConnections::processReceivedFirstDataPacket(ReplicaStatePtr & replica) +void HedgedConnections::processReceivedFirstDataPacket(ReplicaStatePtr replica) { /// When we receive first packet of data from replica, we stop working with replicas, that are /// responsible for the same offset. @@ -384,7 +384,7 @@ void HedgedConnections::processReceivedFirstDataPacket(ReplicaStatePtr & replica } } -void HedgedConnections::processTimeoutEvent(ReplicaStatePtr & replica, ConnectionTimeoutDescriptorPtr timeout_descriptor) +void HedgedConnections::processTimeoutEvent(ReplicaStatePtr replica, ConnectionTimeoutDescriptorPtr timeout_descriptor) { epoll.remove(timeout_descriptor->timer.getDescriptor()); replica->active_timeouts.erase(timeout_descriptor->timer.getDescriptor()); @@ -457,7 +457,7 @@ void HedgedConnections::tryGetNewReplica(bool start_new_connection) } } -void HedgedConnections::finishProcessReplica(ReplicaStatePtr & replica, bool disconnect) +void HedgedConnections::finishProcessReplica(ReplicaStatePtr replica, bool disconnect) { removeTimeoutsFromReplica(replica); int socket_fd = replica->connection->getSocket()->impl()->sockfd(); @@ -471,7 +471,7 @@ void HedgedConnections::finishProcessReplica(ReplicaStatePtr & replica, bool dis replica->connection = nullptr; } -void HedgedConnections::addTimeoutToReplica(ConnectionTimeoutType type, ReplicaStatePtr & replica) +void HedgedConnections::addTimeoutToReplica(ConnectionTimeoutType type, ReplicaStatePtr replica) { ConnectionTimeoutDescriptorPtr timeout_descriptor = createConnectionTimeoutDescriptor(type, hedged_connections_factory.getConnectionTimeouts()); @@ -481,7 +481,7 @@ void HedgedConnections::addTimeoutToReplica(ConnectionTimeoutType type, ReplicaS replica->active_timeouts[timeout_descriptor->timer.getDescriptor()] = std::move(timeout_descriptor); } -void HedgedConnections::removeTimeoutsFromReplica(ReplicaStatePtr & replica) +void HedgedConnections::removeTimeoutsFromReplica(ReplicaStatePtr replica) { for (auto & [fd, _] : replica->active_timeouts) { @@ -491,7 +491,7 @@ void HedgedConnections::removeTimeoutsFromReplica(ReplicaStatePtr & replica) replica->active_timeouts.clear(); } -void HedgedConnections::removeTimeoutFromReplica(ConnectionTimeoutType type, ReplicaStatePtr & replica) +void HedgedConnections::removeTimeoutFromReplica(ConnectionTimeoutType type, ReplicaStatePtr replica) { auto it = std::find_if( replica->active_timeouts.begin(), replica->active_timeouts.end(), [type](auto & value) { return value.second->type == type; }); diff --git a/src/Client/HedgedConnections.h b/src/Client/HedgedConnections.h index 
56eca3ffbe7..00145544096 100644 --- a/src/Client/HedgedConnections.h +++ b/src/Client/HedgedConnections.h @@ -76,32 +76,32 @@ private: class Pipeline { public: - void add(std::function send_function); + void add(std::function send_function); - void run(ReplicaStatePtr & replica); + void run(ReplicaStatePtr replica); private: - std::vector> pipeline; + std::vector> pipeline; }; - Packet receivePacketFromReplica(ReplicaStatePtr & replica, AsyncCallback async_callback = {}); + Packet receivePacketFromReplica(ReplicaStatePtr replica, AsyncCallback async_callback = {}); Packet receivePacketImpl(AsyncCallback async_callback = {}); - void processReceivedFirstDataPacket(ReplicaStatePtr & replica); + void processReceivedFirstDataPacket(ReplicaStatePtr replica); - void processTimeoutEvent(ReplicaStatePtr & replica, ConnectionTimeoutDescriptorPtr timeout_descriptor); + void processTimeoutEvent(ReplicaStatePtr replica, ConnectionTimeoutDescriptorPtr timeout_descriptor); void tryGetNewReplica(bool start_new_connection); - void finishProcessReplica(ReplicaStatePtr & replica, bool disconnect); + void finishProcessReplica(ReplicaStatePtr replica, bool disconnect); int getReadyFileDescriptor(AsyncCallback async_callback = {}); - void addTimeoutToReplica(ConnectionTimeoutType type, ReplicaStatePtr & replica); + void addTimeoutToReplica(ConnectionTimeoutType type, ReplicaStatePtr replica); - void removeTimeoutsFromReplica(ReplicaStatePtr & replica); + void removeTimeoutsFromReplica(ReplicaStatePtr replica); - void removeTimeoutFromReplica(ConnectionTimeoutType type, ReplicaStatePtr & replica); + void removeTimeoutFromReplica(ConnectionTimeoutType type, ReplicaStatePtr replica); HedgedConnectionsFactory hedged_connections_factory; diff --git a/src/Client/HedgedConnectionsFactory.cpp b/src/Client/HedgedConnectionsFactory.cpp index 2a5abbbaf57..732e1e4b7d0 100644 --- a/src/Client/HedgedConnectionsFactory.cpp +++ b/src/Client/HedgedConnectionsFactory.cpp @@ -227,7 +227,7 @@ HedgedConnectionsFactory::ReplicaStatePtr HedgedConnectionsFactory::startEstabli return replica; } -void HedgedConnectionsFactory::processConnectionEstablisherStage(ReplicaStatePtr & replica, bool remove_from_epoll) +void HedgedConnectionsFactory::processConnectionEstablisherStage(ReplicaStatePtr replica, bool remove_from_epoll) { ConnectionEstablisher & connection_establisher = connection_establishers[replica->index]; @@ -260,7 +260,7 @@ void HedgedConnectionsFactory::processConnectionEstablisherStage(ReplicaStatePtr processFailedConnection(replica); } -void HedgedConnectionsFactory::processFailedConnection(ReplicaStatePtr & replica) +void HedgedConnectionsFactory::processFailedConnection(ReplicaStatePtr replica) { ShuffledPool & shuffled_pool = shuffled_pools[replica->index]; LOG_WARNING( @@ -283,7 +283,7 @@ void HedgedConnectionsFactory::processFailedConnection(ReplicaStatePtr & replica replica->reset(); } -void HedgedConnectionsFactory::addTimeouts(ReplicaStatePtr & replica) +void HedgedConnectionsFactory::addTimeouts(ReplicaStatePtr replica) { addTimeoutToReplica(ConnectionTimeoutType::RECEIVE_TIMEOUT, replica); @@ -294,7 +294,7 @@ void HedgedConnectionsFactory::addTimeouts(ReplicaStatePtr & replica) addTimeoutToReplica(ConnectionTimeoutType::RECEIVE_TABLES_STATUS_TIMEOUT, replica); } -void HedgedConnectionsFactory::addTimeoutToReplica(ConnectionTimeoutType type, ReplicaStatePtr & replica) +void HedgedConnectionsFactory::addTimeoutToReplica(ConnectionTimeoutType type, ReplicaStatePtr replica) { ConnectionTimeoutDescriptorPtr 
timeout_descriptor = createConnectionTimeoutDescriptor(type, timeouts); epoll.add(timeout_descriptor->timer.getDescriptor()); @@ -302,7 +302,7 @@ void HedgedConnectionsFactory::addTimeoutToReplica(ConnectionTimeoutType type, R replica->active_timeouts[timeout_descriptor->timer.getDescriptor()] = std::move(timeout_descriptor); } -void HedgedConnectionsFactory::removeTimeoutsFromReplica(ReplicaStatePtr & replica) +void HedgedConnectionsFactory::removeTimeoutsFromReplica(ReplicaStatePtr replica) { for (auto & [fd, _] : replica->active_timeouts) { @@ -359,7 +359,7 @@ int HedgedConnectionsFactory::getReadyFileDescriptor(bool blocking) return event.data.fd; } -void HedgedConnectionsFactory::processReplicaEvent(ReplicaStatePtr & replica) +void HedgedConnectionsFactory::processReplicaEvent(ReplicaStatePtr replica) { removeTimeoutsFromReplica(replica); connection_establishers[replica->index].run(); @@ -368,7 +368,7 @@ void HedgedConnectionsFactory::processReplicaEvent(ReplicaStatePtr & replica) addTimeouts(replica); } -void HedgedConnectionsFactory::processTimeoutEvent(ReplicaStatePtr & replica, ConnectionTimeoutDescriptorPtr timeout_descriptor) +void HedgedConnectionsFactory::processTimeoutEvent(ReplicaStatePtr replica, ConnectionTimeoutDescriptorPtr timeout_descriptor) { epoll.remove(timeout_descriptor->timer.getDescriptor()); replica->active_timeouts.erase(timeout_descriptor->timer.getDescriptor()); @@ -393,7 +393,7 @@ void HedgedConnectionsFactory::processTimeoutEvent(ReplicaStatePtr & replica, Co replica = createNewReplica(); } -void HedgedConnectionsFactory::setBestUsableReplica(ReplicaStatePtr & replica) +void HedgedConnectionsFactory::setBestUsableReplica(ReplicaStatePtr replica) { std::vector indexes(connection_establishers.size()); for (size_t i = 0; i != indexes.size(); ++i) diff --git a/src/Client/HedgedConnectionsFactory.h b/src/Client/HedgedConnectionsFactory.h index 345a1f2fe3e..398629cf13c 100644 --- a/src/Client/HedgedConnectionsFactory.h +++ b/src/Client/HedgedConnectionsFactory.h @@ -98,7 +98,7 @@ public: private: ReplicaStatePtr startEstablishingConnection(int index); - void processConnectionEstablisherStage(ReplicaStatePtr & replica, bool remove_from_epoll = false); + void processConnectionEstablisherStage(ReplicaStatePtr replica, bool remove_from_epoll = false); /// Find an index of the next free replica to start connection. /// Return -1 if there is no free replica. @@ -106,25 +106,25 @@ private: int getReadyFileDescriptor(bool blocking); - void addTimeouts(ReplicaStatePtr & replica); + void addTimeouts(ReplicaStatePtr replica); - void addTimeoutToReplica(ConnectionTimeoutType type, ReplicaStatePtr & replica); + void addTimeoutToReplica(ConnectionTimeoutType type, ReplicaStatePtr replica); - void removeTimeoutsFromReplica(ReplicaStatePtr & replica); + void removeTimeoutsFromReplica(ReplicaStatePtr replica); - void processFailedConnection(ReplicaStatePtr & replica); + void processFailedConnection(ReplicaStatePtr replica); - void processReceiveTimeout(ReplicaStatePtr & replica); + void processReceiveTimeout(ReplicaStatePtr replica); - void processReplicaEvent(ReplicaStatePtr & replica); + void processReplicaEvent(ReplicaStatePtr replica); - void processTimeoutEvent(ReplicaStatePtr & replica, ConnectionTimeoutDescriptorPtr timeout_descriptor); + void processTimeoutEvent(ReplicaStatePtr replica, ConnectionTimeoutDescriptorPtr timeout_descriptor); /// Return false if there is no ready events, return true if replica is ready /// or we need to try next replica. 
bool processEpollEvents(ReplicaStatePtr & replica, bool blocking); - void setBestUsableReplica(ReplicaStatePtr & replica); + void setBestUsableReplica(ReplicaStatePtr replica); ReplicaStatePtr createNewReplica() { return std::make_shared(); } From fd396d1d36600acb6efedb8bdb957e3359454ef7 Mon Sep 17 00:00:00 2001 From: Vitaliy Zakaznikov Date: Mon, 8 Feb 2021 16:08:32 -0500 Subject: [PATCH 0194/2357] Starting to add documentation for live views. --- .../sql-reference/statements/create/view.md | 79 +++++++++++++++++++ docs/en/sql-reference/statements/watch.md | 68 ++++++++++++++++ 2 files changed, 147 insertions(+) create mode 100644 docs/en/sql-reference/statements/watch.md diff --git a/docs/en/sql-reference/statements/create/view.md b/docs/en/sql-reference/statements/create/view.md index 4370735b8d9..a9fe48ed6ac 100644 --- a/docs/en/sql-reference/statements/create/view.md +++ b/docs/en/sql-reference/statements/create/view.md @@ -62,3 +62,82 @@ The execution of [ALTER](../../../sql-reference/statements/alter/index.md) queri Views look the same as normal tables. For example, they are listed in the result of the `SHOW TABLES` query. There isn’t a separate query for deleting views. To delete a view, use [DROP TABLE](../../../sql-reference/statements/drop.md). + +## Live View (Experimental) {#live-view) + +!!! important "Important" + This is an experimental feature that may change in backwards-incompatible ways in the future releases. + Enable usage of live views and `WATCH` query using `set allow_experimental_live_view = 1`. + + +```sql +CREATE LIVE VIEW [IF NOT EXISTS] [db.]table_name [WITH [TIMEOUT [value_in_sec] [AND]] [REFRESH [value_in_sec]]] AS SELECT ... +``` + +Live views store result of the corresponding [SELECT](../../../sql-reference/statements/select/index.md) query +and are updated any time the result of the query changes. Query result as well as partial result +needed to combine with new data are stored in memory providing increased performance +for repeated queries. Live views can provide push notifications +when query result changes using the [WATCH](../../../sql-reference/statements/watch.md) query. + +Live views are triggered by insert into the innermost table specified in the query. + +!!! info "Note" + [Table function](../../../sql-reference/table-functions/index.md) is not supported as the innermost table. + +!!! info "Note" + Tables that do not have inserts such as a [dictionary](../../../sql-reference/dictionaries/index.md) + or a [system table](../../../operations/system-tables/index.md) + will not trigger a live view. See [WITH REFRESH](#live-view-with-refresh) to enable periodic + updates of a live view. + +Live views work similarly to how a query in a distributed table works. But instead of combining partial results +from different servers they combine partial result from current data with partial result from the new data. +When a live view query includes a subquery then the cached partial result is only stored for the innermost subquery. + +!!! info "Note" + Only queries where one can combine partial result from the old data plus partial result from the new data will work. + Live view will not work for queries that require the complete data set to compute the final result. + +You can execute [SELECT](../../../sql-reference/statements/select/index.md) query on a live view +in the same way as for any regular view or a table. If the query result is cached +it will return the result immediately without running the stored query on the underlying tables. 
+ +### Force Refresh {#live-view-alter-refresh} + +You can force live view refresh using the `ALTER LIVE VIEW [db.]table_name REFRESH` statement. + +### With Timeout {#live-view-with-timeout} + +When a live view is create with a `WITH TIMEOUT` clause then the live view will be dropped automatically after the specified +number of seconds elapse since the end of the last [WATCH](../../../sql-reference/statements/watch.md) query. + +```sql +CREATE LIVE VIEW [db.]table_name WITH TIMEOUT value_in_sec AS SELECT ... +``` + +### With Refresh {#live-view-with-refresh} + +When a live view is created with a `WITH REFRESH` clause then it will be automatically refreshed +after the specified number of seconds elapse since the last refresh or trigger. + +```sql +CREATE LIVE VIEW [db.]table_name WITH REFRESH value_in_sec AS SELECT ... +``` + +You can combine `WITH TIMEOUT` and `WITH REFRESH` clauses using an `AND`. + +```sql +CREATE LIVE VIEW [db.]table_name WITH TIMEOUT value_in_sec AND REFRESH value_in_sec AS SELECT ... +``` + +### Settings {#live-view-settings} + +You can use the following settings to control the behaviour of live views. + +- `allow_experimental_live_view` - enable live views. Default `0`. +- `live_view_heartbeat_interval` - the heartbeat interval in seconds to indicate live query is alive +- `max_live_view_insert_blocks_before_refresh` - maximum number of inserted blocks after which + mergeable blocks are dropped and query is re-executed. Default `64`. +- `temporary_live_view_timeout` - interval after which live view with timeout is deleted. Default `0`. +- `periodic_live_view_refresh` - interval after which periodically refreshed live view is forced to refresh. Default `0`. diff --git a/docs/en/sql-reference/statements/watch.md b/docs/en/sql-reference/statements/watch.md new file mode 100644 index 00000000000..b09147f15eb --- /dev/null +++ b/docs/en/sql-reference/statements/watch.md @@ -0,0 +1,68 @@ +--- +toc_priority: 53 +toc_title: WATCH +--- + +# WATCH Statement {#watch} + +!!! important "Important" + This is an experimental feature that may change in backwards-incompatible ways in the future releases. + Enable live views and `WATCH` query using `set allow_experimental_live_view = 1`. + + +``` sql +WATCH [db.]live_view +[EVENTS] +[LIMIT n] +[FORMAT format] +``` + +The `WATCH` query performs continuous data retrieval from a [live view](./create/view.md#live-view) table. +Unless the `LIMIT` clause is specified it provides an infinite stream of query results +from a live view. + +```sql +WATCH [db.]live_view +``` + +The virtual `_version` column in the query result indicates the current result version. + +By default, the requested data is returned to the client, while in conjunction with [INSERT INTO](../../../sql-reference/statements/insert-into.md) +it can be forwarded to a different table. + +```sql +INSERT INTO [db.]table WATCH [db.]live_view ... +``` + +## EVENTS Clause + +The `EVENTS` clause can be used to obtain a short form of the `WATCH` query +where instead of the query result, you will just get the latest query +result version. + +```sql +WATCH [db.]live_view EVENTS LIMIT 1 +``` + +## LIMIT Clause {#limit-clause} + +The `LIMIT n` clause species the number of updates the `WATCH` query should wait +for before terminating. The value of `0` +indicates that the `WATCH` query should not wait for any new query results +and therefore will return immediately once query is evaluated. 
+ +```sql +WATCH [db.]live_view LIMIT 1 +``` + +## FORMAT Clause {#format-clause} + +The `FORMAT` clause works the same way as for the [SELECT](./select/index.md#format-clause). + +### JSONEachRowWithProgress + +The `JSONEachRowWithProgress` format should be used when watching [live view](./create/view.md#live-view) +tables over the HTTP interface. The progress messages will be added to the output +to keep the long-lived HTTP connection alive until the query result changes. +The interval between progress messages is controlled using the [live_view_heartbeat_interval](./create/view.md#live-view-settings) setting. + From 84489b82433783f32572f8150dbd8ef4d1959acd Mon Sep 17 00:00:00 2001 From: Nicolae Vartolomei Date: Mon, 8 Feb 2021 21:19:32 +0000 Subject: [PATCH 0195/2357] Improve replicated fetches timeouts test and make it 3x faster --- .../configs/merge_tree.xml | 6 -- .../test.py | 88 ----------------- .../__init__.py | 0 .../configs/server.xml | 3 + .../test_replicated_fetches_timeouts/test.py | 95 +++++++++++++++++++ 5 files changed, 98 insertions(+), 94 deletions(-) delete mode 100644 tests/integration/test_replicated_fetches_network_partition/configs/merge_tree.xml delete mode 100644 tests/integration/test_replicated_fetches_network_partition/test.py rename tests/integration/{test_replicated_fetches_network_partition => test_replicated_fetches_timeouts}/__init__.py (100%) create mode 100644 tests/integration/test_replicated_fetches_timeouts/configs/server.xml create mode 100644 tests/integration/test_replicated_fetches_timeouts/test.py diff --git a/tests/integration/test_replicated_fetches_network_partition/configs/merge_tree.xml b/tests/integration/test_replicated_fetches_network_partition/configs/merge_tree.xml deleted file mode 100644 index eba2c5e8ffc..00000000000 --- a/tests/integration/test_replicated_fetches_network_partition/configs/merge_tree.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - 30 - 1 - - diff --git a/tests/integration/test_replicated_fetches_network_partition/test.py b/tests/integration/test_replicated_fetches_network_partition/test.py deleted file mode 100644 index b0b5534cf1f..00000000000 --- a/tests/integration/test_replicated_fetches_network_partition/test.py +++ /dev/null @@ -1,88 +0,0 @@ -#!/usr/bin/env python3 - -import pytest -import time -from helpers.cluster import ClickHouseCluster -from helpers.network import PartitionManager -import random -import string - -cluster = ClickHouseCluster(__file__) -node1 = cluster.add_instance('node1', with_zookeeper=True) -node2 = cluster.add_instance('node2', with_zookeeper=True) -node3 = cluster.add_instance('node3', with_zookeeper=True, main_configs=['configs/merge_tree.xml']) - - -@pytest.fixture(scope="module") -def started_cluster(): - try: - cluster.start() - - yield cluster - - finally: - cluster.shutdown() - - -def get_random_string(length): - return ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(length)) - - -def test_no_stall(started_cluster): - node1.query("CREATE TABLE t (key UInt64, data String) ENGINE = ReplicatedMergeTree('/clickhouse/test/t', '1') ORDER BY tuple() PARTITION BY key") - node2.query("CREATE TABLE t (key UInt64, data String) ENGINE = ReplicatedMergeTree('/clickhouse/test/t', '2') ORDER BY tuple() PARTITION BY key") - node3.query("CREATE TABLE t (key UInt64, data String) ENGINE = ReplicatedMergeTree('/clickhouse/test/t', '3') ORDER BY tuple() PARTITION BY key") - - node1.query("SYSTEM STOP MERGES") - node2.query("SYSTEM STOP MERGES") - node3.query("SYSTEM STOP 
MERGES") - - # Pause node3 until the test setup is prepared - node3.query("SYSTEM STOP FETCHES t") - - node1.query("INSERT INTO t SELECT 1, '{}' FROM numbers(5000)".format(get_random_string(104857))) - node1.query("INSERT INTO t SELECT 2, '{}' FROM numbers(5000)".format(get_random_string(104857))) - node1.query("INSERT INTO t SELECT 3, '{}' FROM numbers(5000)".format(get_random_string(104857))) - node1.query("INSERT INTO t SELECT 4, '{}' FROM numbers(5000)".format(get_random_string(104857))) - node1.query("INSERT INTO t SELECT 5, '{}' FROM numbers(5000)".format(get_random_string(104857))) - - # Make sure node2 has all the parts. - node2.query("SYSTEM SYNC REPLICA t") - - # Do not allow sending from replica 2 yet, force node3 to initiate replication from node1. - node2.query("SYSTEM STOP REPLICATED SENDS") - - print("replica 2 fully synced") - - with PartitionManager() as pm: - # Make node1 very slow, node3 should replicate from node2 instead. - pm.add_network_delay(node1, 2000) - - # node3 starts to replicate from node 1 - node3.query("SYSTEM START FETCHES t") - - # Wait some time to give a chance for node3 to try replicating without success from node1. - time.sleep(10) - - # Wait for replication... - node2.query("SYSTEM START REPLICATED SENDS") - - for _ in range(1000): - print('Currently running fetches:') - print(node3.query("SELECT result_part_name, source_replica_hostname, progress FROM system.replicated_fetches").strip()) - print() - - parts_fetched = node3.query("SELECT count() FROM system.parts WHERE table = 't'").strip() - print('parts_fetched:', parts_fetched) - print() - - # Replication done. - if parts_fetched == "5": - break - - time.sleep(3) - - for n in [node1, node2, node3]: - # Workaround for drop not finishing if it is started while table is readonly. 
- n.query("SYSTEM RESTART REPLICA t") - n.query("DROP TABLE t SYNC") diff --git a/tests/integration/test_replicated_fetches_network_partition/__init__.py b/tests/integration/test_replicated_fetches_timeouts/__init__.py similarity index 100% rename from tests/integration/test_replicated_fetches_network_partition/__init__.py rename to tests/integration/test_replicated_fetches_timeouts/__init__.py diff --git a/tests/integration/test_replicated_fetches_timeouts/configs/server.xml b/tests/integration/test_replicated_fetches_timeouts/configs/server.xml new file mode 100644 index 00000000000..d4b441b91fb --- /dev/null +++ b/tests/integration/test_replicated_fetches_timeouts/configs/server.xml @@ -0,0 +1,3 @@ + + 0.1 + diff --git a/tests/integration/test_replicated_fetches_timeouts/test.py b/tests/integration/test_replicated_fetches_timeouts/test.py new file mode 100644 index 00000000000..963ec2487fd --- /dev/null +++ b/tests/integration/test_replicated_fetches_timeouts/test.py @@ -0,0 +1,95 @@ +#!/usr/bin/env python3 + +import random +import string +import time + +import pytest +from helpers.cluster import ClickHouseCluster +from helpers.network import PartitionManager + +cluster = ClickHouseCluster(__file__) +node1 = cluster.add_instance( + 'node1', with_zookeeper=True, + main_configs=['configs/server.xml']) + +node2 = cluster.add_instance( + 'node2', with_zookeeper=True, + main_configs=['configs/server.xml']) + + +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster.start() + + yield cluster + + finally: + cluster.shutdown() + + +def get_random_string(length): + return ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(length)) + + +def test_no_stall(started_cluster): + for instance in started_cluster.instances.values(): + instance.query(""" + CREATE TABLE t (key UInt64, data String) + ENGINE = ReplicatedMergeTree('/clickhouse/test/t', '{instance}') + ORDER BY tuple() + PARTITION BY key""") + + # Pause node3 until the test setup is prepared + node2.query("SYSTEM STOP FETCHES t") + + node1.query("INSERT INTO t SELECT 1, '{}' FROM numbers(500)".format(get_random_string(104857))) + node1.query("INSERT INTO t SELECT 2, '{}' FROM numbers(500)".format(get_random_string(104857))) + + with PartitionManager() as pm: + pm.add_network_delay(node1, 2000) + node2.query("SYSTEM START FETCHES t") + + # Wait for timeout exceptions to confirm that timeout is triggered. + while True: + conn_timeout_exceptions = int(node2.query( + """ + SELECT count() + FROM system.replication_queue + WHERE last_exception LIKE '%connect timed out%' + """)) + + if conn_timeout_exceptions >= 2: + break + + time.sleep(0.1) + + print("Connection timeouts tested!") + + # Increase connection timeout and wait for receive timeouts. + node2.query(""" + ALTER TABLE t + MODIFY SETTING replicated_fetches_http_connection_timeout = 30, + replicated_fetches_http_receive_timeout = 1""") + + while True: + timeout_exceptions = int(node2.query( + """ + SELECT count() + FROM system.replication_queue + WHERE last_exception LIKE '%e.displayText() = Timeout%' + AND last_exception NOT LIKE '%connect timed out%' + """).strip()) + + if timeout_exceptions >= 2: + break + + time.sleep(0.1) + + for instance in started_cluster.instances.values(): + # Workaround for DROP TABLE not finishing if it is started while table is readonly. + instance.query("SYSTEM RESTART REPLICA t") + + # Cleanup data directory from test results archive. 
+ instance.query("DROP TABLE t SYNC") From 2e113a0faf9f264853289d9e2ba61ea7913a4d4a Mon Sep 17 00:00:00 2001 From: Vitaliy Zakaznikov Date: Mon, 8 Feb 2021 16:24:05 -0500 Subject: [PATCH 0196/2357] Update to live view docs. --- .../en/sql-reference/statements/create/view.md | 8 ++++---- docs/en/sql-reference/statements/watch.md | 18 +++++++++--------- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/docs/en/sql-reference/statements/create/view.md b/docs/en/sql-reference/statements/create/view.md index a9fe48ed6ac..381dbbfe08a 100644 --- a/docs/en/sql-reference/statements/create/view.md +++ b/docs/en/sql-reference/statements/create/view.md @@ -136,8 +136,8 @@ CREATE LIVE VIEW [db.]table_name WITH TIMEOUT value_in_sec AND REFRESH value_in_ You can use the following settings to control the behaviour of live views. - `allow_experimental_live_view` - enable live views. Default `0`. -- `live_view_heartbeat_interval` - the heartbeat interval in seconds to indicate live query is alive +- `live_view_heartbeat_interval` - the heartbeat interval in seconds to indicate live query is alive. Default `15` seconds. - `max_live_view_insert_blocks_before_refresh` - maximum number of inserted blocks after which - mergeable blocks are dropped and query is re-executed. Default `64`. -- `temporary_live_view_timeout` - interval after which live view with timeout is deleted. Default `0`. -- `periodic_live_view_refresh` - interval after which periodically refreshed live view is forced to refresh. Default `0`. + mergeable blocks are dropped and query is re-executed. Default `64` inserts. +- `temporary_live_view_timeout` - interval after which live view with timeout is deleted. Default `5` seconds. +- `periodic_live_view_refresh` - interval after which periodically refreshed live view is forced to refresh. Default `60` seconds. diff --git a/docs/en/sql-reference/statements/watch.md b/docs/en/sql-reference/statements/watch.md index b09147f15eb..5cf10cdd5a0 100644 --- a/docs/en/sql-reference/statements/watch.md +++ b/docs/en/sql-reference/statements/watch.md @@ -27,7 +27,7 @@ WATCH [db.]live_view The virtual `_version` column in the query result indicates the current result version. -By default, the requested data is returned to the client, while in conjunction with [INSERT INTO](../../../sql-reference/statements/insert-into.md) +By default, the requested data is returned to the client, while in conjunction with [INSERT INTO](../../sql-reference/statements/insert-into.md) it can be forwarded to a different table. ```sql @@ -37,7 +37,7 @@ INSERT INTO [db.]table WATCH [db.]live_view ... ## EVENTS Clause The `EVENTS` clause can be used to obtain a short form of the `WATCH` query -where instead of the query result, you will just get the latest query +where instead of the query result you will just get the latest query result version. ```sql @@ -47,7 +47,8 @@ WATCH [db.]live_view EVENTS LIMIT 1 ## LIMIT Clause {#limit-clause} The `LIMIT n` clause species the number of updates the `WATCH` query should wait -for before terminating. The value of `0` +for before terminating. By default there is no limit on the number of updates and therefore +the query will not terminate. The value of `0` indicates that the `WATCH` query should not wait for any new query results and therefore will return immediately once query is evaluated. @@ -59,10 +60,9 @@ WATCH [db.]live_view LIMIT 1 The `FORMAT` clause works the same way as for the [SELECT](./select/index.md#format-clause). 
-### JSONEachRowWithProgress - -The `JSONEachRowWithProgress` format should be used when watching [live view](./create/view.md#live-view) -tables over the HTTP interface. The progress messages will be added to the output -to keep the long-lived HTTP connection alive until the query result changes. -The interval between progress messages is controlled using the [live_view_heartbeat_interval](./create/view.md#live-view-settings) setting. +!!! info "Note" + The [JSONEachRowWithProgress](../../interfaces/formats/#jsoneachrowwithprogress) format should be used when watching [live view](./create/view.md#live-view) + tables over the HTTP interface. The progress messages will be added to the output + to keep the long-lived HTTP connection alive until the query result changes. + The interval between progress messages is controlled using the [live_view_heartbeat_interval](./create/view.md#live-view-settings) setting. From d7f5ea784096ae0fe0049c9e2dcefff1ca059cfc Mon Sep 17 00:00:00 2001 From: Vitaliy Zakaznikov Date: Mon, 8 Feb 2021 16:25:07 -0500 Subject: [PATCH 0197/2357] Adding experimental note to the watch query. --- docs/en/sql-reference/statements/watch.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/statements/watch.md b/docs/en/sql-reference/statements/watch.md index 5cf10cdd5a0..b89cc63375c 100644 --- a/docs/en/sql-reference/statements/watch.md +++ b/docs/en/sql-reference/statements/watch.md @@ -3,7 +3,7 @@ toc_priority: 53 toc_title: WATCH --- -# WATCH Statement {#watch} +# WATCH Statement (Experimental) {#watch} !!! important "Important" This is an experimental feature that may change in backwards-incompatible ways in the future releases. From cd097e250b1544cceb487f4e950243a1c039269d Mon Sep 17 00:00:00 2001 From: Vitaliy Zakaznikov Date: Mon, 8 Feb 2021 16:29:47 -0500 Subject: [PATCH 0198/2357] Fix type in live view reference. --- docs/en/sql-reference/statements/create/view.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/statements/create/view.md b/docs/en/sql-reference/statements/create/view.md index 381dbbfe08a..0fdb36249ac 100644 --- a/docs/en/sql-reference/statements/create/view.md +++ b/docs/en/sql-reference/statements/create/view.md @@ -63,7 +63,7 @@ Views look the same as normal tables. For example, they are listed in the result There isn’t a separate query for deleting views. To delete a view, use [DROP TABLE](../../../sql-reference/statements/drop.md). -## Live View (Experimental) {#live-view) +## Live View (Experimental) {#live-view} !!! important "Important" This is an experimental feature that may change in backwards-incompatible ways in the future releases. 
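
For context on the live view documentation added in the patches above, here is a minimal usage sketch assembled from the statements those docs describe (`SET allow_experimental_live_view`, `CREATE LIVE VIEW`, `WATCH ... LIMIT`, `WATCH ... EVENTS`). It is illustrative only and not part of the patch series; the table and view names (`mt`, `lv`) are hypothetical.

```sql
-- The docs above state the feature is experimental and must be enabled first.
SET allow_experimental_live_view = 1;

-- Hypothetical source table and a live view storing the result of a SELECT over it.
CREATE TABLE mt (x Int32) ENGINE = MergeTree ORDER BY x;
CREATE LIVE VIEW lv AS SELECT sum(x) FROM mt;

-- Block until the result has changed twice, then return (see the LIMIT clause section).
WATCH lv LIMIT 2;

-- Or receive only the latest result version on each change (see the EVENTS clause section).
WATCH lv EVENTS;
```
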
From 46840b0a4f9211f997fed85330968584502904fc Mon Sep 17 00:00:00 2001 From: Pavel Kruglov Date: Tue, 9 Feb 2021 00:47:13 +0300 Subject: [PATCH 0199/2357] Add __init__.py --- tests/integration/test_hedged_requests/__init__.py | 0 tests/integration/test_hedged_requests_parallel/__init__.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 tests/integration/test_hedged_requests/__init__.py create mode 100644 tests/integration/test_hedged_requests_parallel/__init__.py diff --git a/tests/integration/test_hedged_requests/__init__.py b/tests/integration/test_hedged_requests/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_hedged_requests_parallel/__init__.py b/tests/integration/test_hedged_requests_parallel/__init__.py new file mode 100644 index 00000000000..e69de29bb2d From 52e9b9d73974d3f4b277fb0f37d14b1a0c29e1e9 Mon Sep 17 00:00:00 2001 From: Vitaliy Zakaznikov Date: Mon, 8 Feb 2021 16:56:25 -0500 Subject: [PATCH 0200/2357] Minor updates to the live view docs. --- .../sql-reference/statements/create/view.md | 29 ++++++++++++------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/docs/en/sql-reference/statements/create/view.md b/docs/en/sql-reference/statements/create/view.md index 0fdb36249ac..5a5c77534fb 100644 --- a/docs/en/sql-reference/statements/create/view.md +++ b/docs/en/sql-reference/statements/create/view.md @@ -103,6 +103,10 @@ You can execute [SELECT](../../../sql-reference/statements/select/index.md) quer in the same way as for any regular view or a table. If the query result is cached it will return the result immediately without running the stored query on the underlying tables. +```sql +SELECT * FROM [db.]live_view WHERE ... +``` + ### Force Refresh {#live-view-alter-refresh} You can force live view refresh using the `ALTER LIVE VIEW [db.]table_name REFRESH` statement. @@ -110,34 +114,39 @@ You can force live view refresh using the `ALTER LIVE VIEW [db.]table_name REFRE ### With Timeout {#live-view-with-timeout} When a live view is create with a `WITH TIMEOUT` clause then the live view will be dropped automatically after the specified -number of seconds elapse since the end of the last [WATCH](../../../sql-reference/statements/watch.md) query. +number of seconds elapse since the end of the last [WATCH](../../../sql-reference/statements/watch.md) query +that was watching the live view. ```sql -CREATE LIVE VIEW [db.]table_name WITH TIMEOUT value_in_sec AS SELECT ... +CREATE LIVE VIEW [db.]table_name WITH TIMEOUT [value_in_sec] AS SELECT ... ``` +If the timeout value is not specified then the value specified by the `temporary_live_view_timeout` setting is used. + ### With Refresh {#live-view-with-refresh} When a live view is created with a `WITH REFRESH` clause then it will be automatically refreshed after the specified number of seconds elapse since the last refresh or trigger. ```sql -CREATE LIVE VIEW [db.]table_name WITH REFRESH value_in_sec AS SELECT ... +CREATE LIVE VIEW [db.]table_name WITH REFRESH [value_in_sec] AS SELECT ... ``` -You can combine `WITH TIMEOUT` and `WITH REFRESH` clauses using an `AND`. +If the refresh value is not specified then the value specified by the `periodic_live_view_refresh` setting is used. + +You can combine `WITH TIMEOUT` and `WITH REFRESH` clauses using an `AND` clause. ```sql -CREATE LIVE VIEW [db.]table_name WITH TIMEOUT value_in_sec AND REFRESH value_in_sec AS SELECT ... 
+CREATE LIVE VIEW [db.]table_name WITH TIMEOUT [value_in_sec] AND REFRESH [value_in_sec] AS SELECT ... ``` ### Settings {#live-view-settings} You can use the following settings to control the behaviour of live views. -- `allow_experimental_live_view` - enable live views. Default `0`. -- `live_view_heartbeat_interval` - the heartbeat interval in seconds to indicate live query is alive. Default `15` seconds. +- `allow_experimental_live_view` - enable live views. Default is `0`. +- `live_view_heartbeat_interval` - the heartbeat interval in seconds to indicate live query is alive. Default is `15` seconds. - `max_live_view_insert_blocks_before_refresh` - maximum number of inserted blocks after which - mergeable blocks are dropped and query is re-executed. Default `64` inserts. -- `temporary_live_view_timeout` - interval after which live view with timeout is deleted. Default `5` seconds. -- `periodic_live_view_refresh` - interval after which periodically refreshed live view is forced to refresh. Default `60` seconds. + mergeable blocks are dropped and query is re-executed. Default is `64` inserts. +- `temporary_live_view_timeout` - interval after which live view with timeout is deleted. Default is `5` seconds. +- `periodic_live_view_refresh` - interval after which periodically refreshed live view is forced to refresh. Default is `60` seconds. From d737ffbe0c448d77be6f40fd812fea1bb6c6c55c Mon Sep 17 00:00:00 2001 From: Vitaliy Zakaznikov Date: Mon, 8 Feb 2021 16:59:39 -0500 Subject: [PATCH 0201/2357] Adding event clause reference. --- docs/en/sql-reference/statements/watch.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/statements/watch.md b/docs/en/sql-reference/statements/watch.md index b89cc63375c..480841cf1b9 100644 --- a/docs/en/sql-reference/statements/watch.md +++ b/docs/en/sql-reference/statements/watch.md @@ -34,7 +34,7 @@ it can be forwarded to a different table. INSERT INTO [db.]table WATCH [db.]live_view ... ``` -## EVENTS Clause +## EVENTS Clause {#events-clause} The `EVENTS` clause can be used to obtain a short form of the `WATCH` query where instead of the query result you will just get the latest query From 0270b96ffb48d305ea2125aca995c5046fff842f Mon Sep 17 00:00:00 2001 From: Vitaliy Zakaznikov Date: Mon, 8 Feb 2021 17:18:37 -0500 Subject: [PATCH 0202/2357] Adding example of using WATCH and WATCH ... EVENTS to live view description. --- docs/en/sql-reference/statements/create/view.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docs/en/sql-reference/statements/create/view.md b/docs/en/sql-reference/statements/create/view.md index 5a5c77534fb..3544ad93aa5 100644 --- a/docs/en/sql-reference/statements/create/view.md +++ b/docs/en/sql-reference/statements/create/view.md @@ -99,6 +99,18 @@ When a live view query includes a subquery then the cached partial result is onl Only queries where one can combine partial result from the old data plus partial result from the new data will work. Live view will not work for queries that require the complete data set to compute the final result. +You can watch for changes in the live view query result using the [WATCH](../../../sql-reference/statements/watch.md) query + +```sql +WATCH [db.]live_view +``` + +or add [EVENTS](../../../sql-reference/statements/watch.md#events-clause) clause to just get change events. 
+ +```sql +WATCH [db.]live_view EVENTS +``` + You can execute [SELECT](../../../sql-reference/statements/select/index.md) query on a live view in the same way as for any regular view or a table. If the query result is cached it will return the result immediately without running the stored query on the underlying tables. From 5769822c53aeca7ba772b8966322235a5e5192fe Mon Sep 17 00:00:00 2001 From: Vitaliy Zakaznikov Date: Mon, 8 Feb 2021 17:28:31 -0500 Subject: [PATCH 0203/2357] Fixing rendering. --- .../sql-reference/statements/create/view.md | 36 +++++-------------- docs/en/sql-reference/statements/watch.md | 24 ++++--------- 2 files changed, 15 insertions(+), 45 deletions(-) diff --git a/docs/en/sql-reference/statements/create/view.md b/docs/en/sql-reference/statements/create/view.md index 3544ad93aa5..1d6621ff67d 100644 --- a/docs/en/sql-reference/statements/create/view.md +++ b/docs/en/sql-reference/statements/create/view.md @@ -74,30 +74,17 @@ There isn’t a separate query for deleting views. To delete a view, use [DROP T CREATE LIVE VIEW [IF NOT EXISTS] [db.]table_name [WITH [TIMEOUT [value_in_sec] [AND]] [REFRESH [value_in_sec]]] AS SELECT ... ``` -Live views store result of the corresponding [SELECT](../../../sql-reference/statements/select/index.md) query -and are updated any time the result of the query changes. Query result as well as partial result -needed to combine with new data are stored in memory providing increased performance -for repeated queries. Live views can provide push notifications -when query result changes using the [WATCH](../../../sql-reference/statements/watch.md) query. +Live views store result of the corresponding [SELECT](../../../sql-reference/statements/select/index.md) query and are updated any time the result of the query changes. Query result as well as partial result needed to combine with new data are stored in memory providing increased performance +for repeated queries. Live views can provide push notifications when query result changes using the [WATCH](../../../sql-reference/statements/watch.md) query. Live views are triggered by insert into the innermost table specified in the query. -!!! info "Note" - [Table function](../../../sql-reference/table-functions/index.md) is not supported as the innermost table. +Live views work similarly to how a query in a distributed table works. But instead of combining partial results from different servers they combine partial result from current data with partial result from the new data. When a live view query includes a subquery then the cached partial result is only stored for the innermost subquery. !!! info "Note" - Tables that do not have inserts such as a [dictionary](../../../sql-reference/dictionaries/index.md) - or a [system table](../../../operations/system-tables/index.md) - will not trigger a live view. See [WITH REFRESH](#live-view-with-refresh) to enable periodic - updates of a live view. - -Live views work similarly to how a query in a distributed table works. But instead of combining partial results -from different servers they combine partial result from current data with partial result from the new data. -When a live view query includes a subquery then the cached partial result is only stored for the innermost subquery. - -!!! info "Note" - Only queries where one can combine partial result from the old data plus partial result from the new data will work. - Live view will not work for queries that require the complete data set to compute the final result. 
+ - [Table function](../../../sql-reference/table-functions/index.md) is not supported as the innermost table. + - Tables that do not have inserts such as a [dictionary](../../../sql-reference/dictionaries/index.md) or a [system table](../../../operations/system-tables/index.md) will not trigger a live view. See [WITH REFRESH](#live-view-with-refresh) to enable periodic updates of a live view. + - Only queries where one can combine partial result from the old data plus partial result from the new data will work. Live view will not work for queries that require the complete data set to compute the final result. You can watch for changes in the live view query result using the [WATCH](../../../sql-reference/statements/watch.md) query @@ -111,9 +98,7 @@ or add [EVENTS](../../../sql-reference/statements/watch.md#events-clause) clause WATCH [db.]live_view EVENTS ``` -You can execute [SELECT](../../../sql-reference/statements/select/index.md) query on a live view -in the same way as for any regular view or a table. If the query result is cached -it will return the result immediately without running the stored query on the underlying tables. +You can execute [SELECT](../../../sql-reference/statements/select/index.md) query on a live view in the same way as for any regular view or a table. If the query result is cached it will return the result immediately without running the stored query on the underlying tables. ```sql SELECT * FROM [db.]live_view WHERE ... @@ -125,9 +110,7 @@ You can force live view refresh using the `ALTER LIVE VIEW [db.]table_name REFRE ### With Timeout {#live-view-with-timeout} -When a live view is create with a `WITH TIMEOUT` clause then the live view will be dropped automatically after the specified -number of seconds elapse since the end of the last [WATCH](../../../sql-reference/statements/watch.md) query -that was watching the live view. +When a live view is create with a `WITH TIMEOUT` clause then the live view will be dropped automatically after the specified number of seconds elapse since the end of the last [WATCH](../../../sql-reference/statements/watch.md) query that was watching the live view. ```sql CREATE LIVE VIEW [db.]table_name WITH TIMEOUT [value_in_sec] AS SELECT ... @@ -137,8 +120,7 @@ If the timeout value is not specified then the value specified by the `temporary ### With Refresh {#live-view-with-refresh} -When a live view is created with a `WITH REFRESH` clause then it will be automatically refreshed -after the specified number of seconds elapse since the last refresh or trigger. +When a live view is created with a `WITH REFRESH` clause then it will be automatically refreshed after the specified number of seconds elapse since the last refresh or trigger. ```sql CREATE LIVE VIEW [db.]table_name WITH REFRESH [value_in_sec] AS SELECT ... diff --git a/docs/en/sql-reference/statements/watch.md b/docs/en/sql-reference/statements/watch.md index 480841cf1b9..10d2a2715fb 100644 --- a/docs/en/sql-reference/statements/watch.md +++ b/docs/en/sql-reference/statements/watch.md @@ -17,9 +17,7 @@ WATCH [db.]live_view [FORMAT format] ``` -The `WATCH` query performs continuous data retrieval from a [live view](./create/view.md#live-view) table. -Unless the `LIMIT` clause is specified it provides an infinite stream of query results -from a live view. +The `WATCH` query performs continuous data retrieval from a [live view](./create/view.md#live-view) table. Unless the `LIMIT` clause is specified it provides an infinite stream of query results from a live view. 
```sql WATCH [db.]live_view @@ -27,8 +25,7 @@ WATCH [db.]live_view The virtual `_version` column in the query result indicates the current result version. -By default, the requested data is returned to the client, while in conjunction with [INSERT INTO](../../sql-reference/statements/insert-into.md) -it can be forwarded to a different table. +By default, the requested data is returned to the client, while in conjunction with [INSERT INTO](../../sql-reference/statements/insert-into.md) it can be forwarded to a different table. ```sql INSERT INTO [db.]table WATCH [db.]live_view ... @@ -36,9 +33,7 @@ INSERT INTO [db.]table WATCH [db.]live_view ... ## EVENTS Clause {#events-clause} -The `EVENTS` clause can be used to obtain a short form of the `WATCH` query -where instead of the query result you will just get the latest query -result version. +The `EVENTS` clause can be used to obtain a short form of the `WATCH` query where instead of the query result you will just get the latest query result version. ```sql WATCH [db.]live_view EVENTS LIMIT 1 @@ -46,14 +41,10 @@ WATCH [db.]live_view EVENTS LIMIT 1 ## LIMIT Clause {#limit-clause} -The `LIMIT n` clause species the number of updates the `WATCH` query should wait -for before terminating. By default there is no limit on the number of updates and therefore -the query will not terminate. The value of `0` -indicates that the `WATCH` query should not wait for any new query results -and therefore will return immediately once query is evaluated. +The `LIMIT n` clause species the number of updates the `WATCH` query should wait for before terminating. By default there is no limit on the number of updates and therefore the query will not terminate. The value of `0` indicates that the `WATCH` query should not wait for any new query results and therefore will return immediately once query is evaluated. ```sql -WATCH [db.]live_view LIMIT 1 +WATCH [db.]live_view LIMIT 2 ``` ## FORMAT Clause {#format-clause} @@ -61,8 +52,5 @@ WATCH [db.]live_view LIMIT 1 The `FORMAT` clause works the same way as for the [SELECT](./select/index.md#format-clause). !!! info "Note" - The [JSONEachRowWithProgress](../../interfaces/formats/#jsoneachrowwithprogress) format should be used when watching [live view](./create/view.md#live-view) - tables over the HTTP interface. The progress messages will be added to the output - to keep the long-lived HTTP connection alive until the query result changes. - The interval between progress messages is controlled using the [live_view_heartbeat_interval](./create/view.md#live-view-settings) setting. + The [JSONEachRowWithProgress](../../interfaces/formats/#jsoneachrowwithprogress) format should be used when watching [live view](./create/view.md#live-view) tables over the HTTP interface. The progress messages will be added to the output to keep the long-lived HTTP connection alive until the query result changes. The interval between progress messages is controlled using the [live_view_heartbeat_interval](./create/view.md#live-view-settings) setting. From a56ffcee1830e3452eaf064696cc8b8508b28ac5 Mon Sep 17 00:00:00 2001 From: Vitaliy Zakaznikov Date: Mon, 8 Feb 2021 17:53:15 -0500 Subject: [PATCH 0204/2357] Fixing links in WATCH query docs. 
--- docs/en/sql-reference/statements/watch.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/en/sql-reference/statements/watch.md b/docs/en/sql-reference/statements/watch.md index 10d2a2715fb..71f26d71e85 100644 --- a/docs/en/sql-reference/statements/watch.md +++ b/docs/en/sql-reference/statements/watch.md @@ -17,7 +17,7 @@ WATCH [db.]live_view [FORMAT format] ``` -The `WATCH` query performs continuous data retrieval from a [live view](./create/view.md#live-view) table. Unless the `LIMIT` clause is specified it provides an infinite stream of query results from a live view. +The `WATCH` query performs continuous data retrieval from a [live view](./create/view.md#live-view) table. Unless the `LIMIT` clause is specified it provides an infinite stream of query results from a [live view](./create/view.md#live-view). ```sql WATCH [db.]live_view @@ -49,8 +49,8 @@ WATCH [db.]live_view LIMIT 2 ## FORMAT Clause {#format-clause} -The `FORMAT` clause works the same way as for the [SELECT](./select/index.md#format-clause). +The `FORMAT` clause works the same way as for the [SELECT](./select/format.md). !!! info "Note" - The [JSONEachRowWithProgress](../../interfaces/formats/#jsoneachrowwithprogress) format should be used when watching [live view](./create/view.md#live-view) tables over the HTTP interface. The progress messages will be added to the output to keep the long-lived HTTP connection alive until the query result changes. The interval between progress messages is controlled using the [live_view_heartbeat_interval](./create/view.md#live-view-settings) setting. + The [JSONEachRowWithProgress](../../../interfaces/formats/#jsoneachrowwithprogress) format should be used when watching [live view](./create/view.md#live-view) tables over the HTTP interface. The progress messages will be added to the output to keep the long-lived HTTP connection alive until the query result changes. The interval between progress messages is controlled using the [live_view_heartbeat_interval](./create/view.md#live-view-settings) setting. From 3d2788e1b5b622f96fd15dd4636eba30984d39fb Mon Sep 17 00:00:00 2001 From: Vitaliy Zakaznikov Date: Mon, 8 Feb 2021 19:23:32 -0500 Subject: [PATCH 0205/2357] Fixes and updates to live view docs. --- .../sql-reference/statements/create/view.md | 21 ++++++++++++++----- docs/en/sql-reference/statements/watch.md | 2 +- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/docs/en/sql-reference/statements/create/view.md b/docs/en/sql-reference/statements/create/view.md index 1d6621ff67d..662a4b54754 100644 --- a/docs/en/sql-reference/statements/create/view.md +++ b/docs/en/sql-reference/statements/create/view.md @@ -74,17 +74,20 @@ There isn’t a separate query for deleting views. To delete a view, use [DROP T CREATE LIVE VIEW [IF NOT EXISTS] [db.]table_name [WITH [TIMEOUT [value_in_sec] [AND]] [REFRESH [value_in_sec]]] AS SELECT ... ``` -Live views store result of the corresponding [SELECT](../../../sql-reference/statements/select/index.md) query and are updated any time the result of the query changes. Query result as well as partial result needed to combine with new data are stored in memory providing increased performance -for repeated queries. Live views can provide push notifications when query result changes using the [WATCH](../../../sql-reference/statements/watch.md) query. +Live views store result of the corresponding [SELECT](../../../sql-reference/statements/select/index.md) query and are updated any time the result of the query changes. 
Query result as well as partial result needed to combine with new data are stored in memory providing increased performance for repeated queries. Live views can provide push notifications when query result changes using the [WATCH](../../../sql-reference/statements/watch.md) query. Live views are triggered by insert into the innermost table specified in the query. Live views work similarly to how a query in a distributed table works. But instead of combining partial results from different servers they combine partial result from current data with partial result from the new data. When a live view query includes a subquery then the cached partial result is only stored for the innermost subquery. -!!! info "Note" +!!! info "Limitations" - [Table function](../../../sql-reference/table-functions/index.md) is not supported as the innermost table. - - Tables that do not have inserts such as a [dictionary](../../../sql-reference/dictionaries/index.md) or a [system table](../../../operations/system-tables/index.md) will not trigger a live view. See [WITH REFRESH](#live-view-with-refresh) to enable periodic updates of a live view. - - Only queries where one can combine partial result from the old data plus partial result from the new data will work. Live view will not work for queries that require the complete data set to compute the final result. + - Tables that do not have inserts such as a [dictionary](../../../sql-reference/dictionaries/index.md), [system table](../../../operations/system-tables/index.md), a [normal view](#normal), or a [materialized view](#materialized) will not trigger a live view. + - Only queries where one can combine partial result from the old data plus partial result from the new data will work. Live view will not work for queries that require the complete data set to compute the final result or aggregations where the state of the aggregation must be preserved. + - Does not work with replicated or distributed tables where inserts are performed on different nodes. + - Can't be triggered by multiple tables. + + See [WITH REFRESH](#live-view-with-refresh) to force periodic updates of a live view that in some cases can be used as a workaround. You can watch for changes in the live view query result using the [WATCH](../../../sql-reference/statements/watch.md) query @@ -133,6 +136,14 @@ You can combine `WITH TIMEOUT` and `WITH REFRESH` clauses using an `AND` clause. ```sql CREATE LIVE VIEW [db.]table_name WITH TIMEOUT [value_in_sec] AND REFRESH [value_in_sec] AS SELECT ... ``` +### Usage + +Most common uses of live view tables include: + +- Providing push notifications for query result changes to avoid polling. +- Caching results of most frequent queries to provide immediate query results. +- Watching for table changes and triggering a follow-up select queries. +- Watching metrics from system tables using periodic refresh. ### Settings {#live-view-settings} diff --git a/docs/en/sql-reference/statements/watch.md b/docs/en/sql-reference/statements/watch.md index 71f26d71e85..07b050d4c4e 100644 --- a/docs/en/sql-reference/statements/watch.md +++ b/docs/en/sql-reference/statements/watch.md @@ -49,7 +49,7 @@ WATCH [db.]live_view LIMIT 2 ## FORMAT Clause {#format-clause} -The `FORMAT` clause works the same way as for the [SELECT](./select/format.md). +The `FORMAT` clause works the same way as for the [SELECT](../../sql-reference/statements/select/format.md#format-clause). !!! 
info "Note" The [JSONEachRowWithProgress](../../../interfaces/formats/#jsoneachrowwithprogress) format should be used when watching [live view](./create/view.md#live-view) tables over the HTTP interface. The progress messages will be added to the output to keep the long-lived HTTP connection alive until the query result changes. The interval between progress messages is controlled using the [live_view_heartbeat_interval](./create/view.md#live-view-settings) setting. From 22b8bc6c99fafcd653884cfc9ac471ac1a81d9f4 Mon Sep 17 00:00:00 2001 From: Pavel Kruglov Date: Tue, 9 Feb 2021 05:01:09 +0300 Subject: [PATCH 0206/2357] Refactor 2.0 --- src/Client/HedgedConnections.cpp | 155 ++++++++-------- src/Client/HedgedConnections.h | 38 ++-- src/Client/HedgedConnectionsFactory.cpp | 224 +++++++++++------------- src/Client/HedgedConnectionsFactory.h | 52 ++---- 4 files changed, 219 insertions(+), 250 deletions(-) diff --git a/src/Client/HedgedConnections.cpp b/src/Client/HedgedConnections.cpp index 8e547169f29..32a91af6179 100644 --- a/src/Client/HedgedConnections.cpp +++ b/src/Client/HedgedConnections.cpp @@ -23,34 +23,31 @@ HedgedConnections::HedgedConnections( : hedged_connections_factory(pool_, &settings_, timeouts_, table_to_check_) , settings(settings_) , throttler(throttler_) - , log(&Poco::Logger::get("HedgedConnections")) { std::vector connections = hedged_connections_factory.getManyConnections(pool_mode); - ReplicaStatePtr replica = nullptr; + ReplicaState replica; for (size_t i = 0; i != connections.size(); ++i) { - replica = std::make_shared(); - replica->connection = connections[i]; - replica->offset = i; - replica->connection->setThrottler(throttler_); - int socket_fd = replica->connection->getSocket()->impl()->sockfd(); + replica.connection = connections[i]; + replica.connection->setThrottler(throttler_); + int socket_fd = replica.connection->getSocket()->impl()->sockfd(); epoll.add(socket_fd); - fd_to_replica[socket_fd] = replica; - offset_states.push_back(OffsetState{{std::move(replica)}, 1, false}); + fd_to_replica_location[socket_fd] = ReplicaLocation{i, 0}; + offset_states.push_back(OffsetState{{replica}, 1, false}); } active_connection_count = connections.size(); offsets_with_received_first_data_packet = 0; - pipeline_for_new_replicas.add([throttler_](ReplicaStatePtr replica_) { replica_->connection->setThrottler(throttler_); }); + pipeline_for_new_replicas.add([throttler_](ReplicaState & replica_) { replica_.connection->setThrottler(throttler_); }); } -void HedgedConnections::Pipeline::add(std::function send_function) +void HedgedConnections::Pipeline::add(std::function send_function) { pipeline.push_back(send_function); } -void HedgedConnections::Pipeline::run(ReplicaStatePtr replica) +void HedgedConnections::Pipeline::run(ReplicaState & replica) { for (auto & send_func : pipeline) send_func(replica); @@ -63,11 +60,11 @@ void HedgedConnections::sendScalarsData(Scalars & data) if (!sent_query) throw Exception("Cannot send scalars data: query not yet sent.", ErrorCodes::LOGICAL_ERROR); - auto send_scalars_data = [&data](ReplicaStatePtr replica) { replica->connection->sendScalarsData(data); }; + auto send_scalars_data = [&data](ReplicaState & replica) { replica.connection->sendScalarsData(data); }; for (auto & offset_state : offset_states) for (auto & replica : offset_state.replicas) - if (replica->connection) + if (replica.connection) send_scalars_data(replica); pipeline_for_new_replicas.add(send_scalars_data); @@ -83,11 +80,11 @@ void 
HedgedConnections::sendExternalTablesData(std::vector & if (data.size() != size()) throw Exception("Mismatch between replicas and data sources", ErrorCodes::MISMATCH_REPLICAS_DATA_SOURCES); - auto send_external_tables_data = [&data](ReplicaStatePtr replica) { replica->connection->sendExternalTablesData(data[0]); }; + auto send_external_tables_data = [&data](ReplicaState & replica) { replica.connection->sendExternalTablesData(data[0]); }; for (auto & offset_state : offset_states) for (auto & replica : offset_state.replicas) - if (replica->connection) + if (replica.connection) send_external_tables_data(replica); pipeline_for_new_replicas.add(send_external_tables_data); @@ -100,11 +97,11 @@ void HedgedConnections::sendIgnoredPartUUIDs(const std::vector & uuids) if (sent_query) throw Exception("Cannot send uuids after query is sent.", ErrorCodes::LOGICAL_ERROR); - auto send_ignored_part_uuids = [&uuids](ReplicaStatePtr replica) { replica->connection->sendIgnoredPartUUIDs(uuids); }; + auto send_ignored_part_uuids = [&uuids](ReplicaState & replica) { replica.connection->sendIgnoredPartUUIDs(uuids); }; for (auto & offset_state : offset_states) for (auto & replica : offset_state.replicas) - if (replica->connection) + if (replica.connection) send_ignored_part_uuids(replica); pipeline_for_new_replicas.add(send_ignored_part_uuids); @@ -127,7 +124,7 @@ void HedgedConnections::sendQuery( { for (auto & replica : offset_state.replicas) { - if (replica->connection->getServerRevision(timeouts) < DBMS_MIN_REVISION_WITH_CURRENT_AGGREGATION_VARIANT_SELECTION_METHOD) + if (replica.connection->getServerRevision(timeouts) < DBMS_MIN_REVISION_WITH_CURRENT_AGGREGATION_VARIANT_SELECTION_METHOD) { disable_two_level_aggregation = true; break; @@ -137,7 +134,7 @@ void HedgedConnections::sendQuery( break; } - auto send_query = [this, timeouts, query, query_id, stage, client_info, with_pending_data](ReplicaStatePtr replica) + auto send_query = [this, timeouts, query, query_id, stage, client_info, with_pending_data](ReplicaState & replica) { Settings modified_settings = settings; @@ -151,10 +148,10 @@ void HedgedConnections::sendQuery( if (offset_states.size() > 1) { modified_settings.parallel_replicas_count = offset_states.size(); - modified_settings.parallel_replica_offset = replica->offset; + modified_settings.parallel_replica_offset = fd_to_replica_location[replica.connection->getSocket()->impl()->sockfd()].offset; } - replica->connection->sendQuery(timeouts, query, query_id, stage, &modified_settings, &client_info, with_pending_data); + replica.connection->sendQuery(timeouts, query, query_id, stage, &modified_settings, &client_info, with_pending_data); addTimeoutToReplica(ConnectionTimeoutType::RECEIVE_TIMEOUT, replica); addTimeoutToReplica(ConnectionTimeoutType::RECEIVE_DATA_TIMEOUT, replica); }; @@ -173,7 +170,7 @@ void HedgedConnections::disconnect() for (auto & offset_status : offset_states) for (auto & replica : offset_status.replicas) - if (replica->connection) + if (replica.connection) finishProcessReplica(replica, true); if (hedged_connections_factory.hasEventsInProcess()) @@ -199,9 +196,9 @@ std::string HedgedConnections::dumpAddresses() const { for (const auto & replica : offset_state.replicas) { - if (replica->connection) + if (replica.connection) { - addresses += (is_first ? "" : "; ") + replica->connection->getDescription(); + addresses += (is_first ? 
"" : "; ") + replica.connection->getDescription(); is_first = false; } } @@ -219,8 +216,8 @@ void HedgedConnections::sendCancel() for (auto & offset_status : offset_states) for (auto & replica : offset_status.replicas) - if (replica->connection) - replica->connection->sendCancel(); + if (replica.connection) + replica.connection->sendCancel(); cancelled = true; } @@ -288,15 +285,16 @@ Packet HedgedConnections::receivePacketImpl(AsyncCallback async_callback) { event_fd = getReadyFileDescriptor(async_callback); - if (fd_to_replica.contains(event_fd)) + if (fd_to_replica_location.contains(event_fd)) { - packet = receivePacketFromReplica(fd_to_replica[event_fd], async_callback); + ReplicaLocation location = fd_to_replica_location[event_fd]; + packet = receivePacketFromReplica(location, async_callback); finish = true; } - else if (timeout_fd_to_replica.contains(event_fd)) + else if (timeout_fd_to_replica_location.contains(event_fd)) { - ReplicaStatePtr replica = timeout_fd_to_replica[event_fd]; - processTimeoutEvent(replica, replica->active_timeouts[event_fd]); + ReplicaLocation location = timeout_fd_to_replica_location[event_fd]; + processTimeoutEvent(location, offset_states[location.offset].replicas[location.index].active_timeouts[event_fd]); } else if (event_fd == hedged_connections_factory.getFileDescriptor()) tryGetNewReplica(false); @@ -309,10 +307,11 @@ Packet HedgedConnections::receivePacketImpl(AsyncCallback async_callback) int HedgedConnections::getReadyFileDescriptor(AsyncCallback async_callback) { - for (auto & [fd, replica] : fd_to_replica) + for (auto & [fd, location] : fd_to_replica_location) { - if (replica->connection->hasReadPendingData()) - return replica->connection->getSocket()->impl()->sockfd(); + ReplicaState & replica = offset_states[location.offset].replicas[location.index]; + if (replica.connection->hasReadPendingData()) + return replica.connection->getSocket()->impl()->sockfd(); } epoll_event event; @@ -321,15 +320,16 @@ int HedgedConnections::getReadyFileDescriptor(AsyncCallback async_callback) return event.data.fd; } -Packet HedgedConnections::receivePacketFromReplica(ReplicaStatePtr replica, AsyncCallback async_callback) +Packet HedgedConnections::receivePacketFromReplica(ReplicaLocation & replica_location, AsyncCallback async_callback) { + ReplicaState & replica = offset_states[replica_location.offset].replicas[replica_location.index]; removeTimeoutFromReplica(ConnectionTimeoutType::RECEIVE_TIMEOUT, replica); - Packet packet = replica->connection->receivePacket(std::move(async_callback)); + Packet packet = replica.connection->receivePacket(std::move(async_callback)); switch (packet.type) { case Protocol::Server::Data: - if (!offset_states[replica->offset].first_packet_of_data_received) - processReceivedFirstDataPacket(replica); + if (!offset_states[replica_location.offset].first_packet_of_data_received) + processReceivedFirstDataPacket(replica_location); addTimeoutToReplica(ConnectionTimeoutType::RECEIVE_TIMEOUT, replica); break; case Protocol::Server::PartUUIDs: @@ -354,21 +354,21 @@ Packet HedgedConnections::receivePacketFromReplica(ReplicaStatePtr replica, Asyn return packet; } -void HedgedConnections::processReceivedFirstDataPacket(ReplicaStatePtr replica) +void HedgedConnections::processReceivedFirstDataPacket(ReplicaLocation & replica_location) { /// When we receive first packet of data from replica, we stop working with replicas, that are /// responsible for the same offset. 
- OffsetState & offset_state = offset_states[replica->offset]; - removeTimeoutFromReplica(ConnectionTimeoutType::RECEIVE_DATA_TIMEOUT, replica); + OffsetState & offset_state = offset_states[replica_location.offset]; + removeTimeoutFromReplica(ConnectionTimeoutType::RECEIVE_DATA_TIMEOUT, offset_state.replicas[replica_location.index]); ++offsets_with_received_first_data_packet; offset_state.first_packet_of_data_received = true; - for (auto & other_replica : offset_state.replicas) + for (size_t i = 0; i != offset_state.replicas.size(); ++i) { - if (replica != other_replica && other_replica->connection) + if (i != replica_location.index && offset_state.replicas[i].connection) { - other_replica->connection->sendCancel(); - finishProcessReplica(other_replica, true); + offset_state.replicas[i].connection->sendCancel(); + finishProcessReplica(offset_state.replicas[i], true); } } @@ -384,23 +384,24 @@ void HedgedConnections::processReceivedFirstDataPacket(ReplicaStatePtr replica) } } -void HedgedConnections::processTimeoutEvent(ReplicaStatePtr replica, ConnectionTimeoutDescriptorPtr timeout_descriptor) +void HedgedConnections::processTimeoutEvent(ReplicaLocation & replica_location, ConnectionTimeoutDescriptorPtr timeout_descriptor) { + ReplicaState & replica = offset_states[replica_location.offset].replicas[replica_location.index]; epoll.remove(timeout_descriptor->timer.getDescriptor()); - replica->active_timeouts.erase(timeout_descriptor->timer.getDescriptor()); - timeout_fd_to_replica.erase(timeout_descriptor->timer.getDescriptor()); + replica.active_timeouts.erase(timeout_descriptor->timer.getDescriptor()); + timeout_fd_to_replica_location.erase(timeout_descriptor->timer.getDescriptor()); if (timeout_descriptor->type == ConnectionTimeoutType::RECEIVE_TIMEOUT) { finishProcessReplica(replica, true); /// Check if there is no active connections with the same offset and there is no new replica in process. - if (offset_states[replica->offset].active_connection_count == 0 && !next_replica_in_process) + if (offset_states[replica_location.offset].active_connection_count == 0 && !next_replica_in_process) throw NetException("Receive timeout expired", ErrorCodes::SOCKET_TIMEOUT); } else if (timeout_descriptor->type == ConnectionTimeoutType::RECEIVE_DATA_TIMEOUT) { - offsets_queue.push(replica->offset); + offsets_queue.push(replica_location.offset); tryGetNewReplica(true); } } @@ -413,24 +414,24 @@ void HedgedConnections::tryGetNewReplica(bool start_new_connection) /// Skip replicas that doesn't support two-level aggregation if we didn't disable it in sendQuery. 
while (state == HedgedConnectionsFactory::State::READY && !disable_two_level_aggregation && connection->getServerRevision(hedged_connections_factory.getConnectionTimeouts()) - < DBMS_MIN_REVISION_WITH_CURRENT_AGGREGATION_VARIANT_SELECTION_METHOD) + < DBMS_MIN_REVISION_WITH_CURRENT_AGGREGATION_VARIANT_SELECTION_METHOD) state = hedged_connections_factory.getNextConnection(true, false, connection); if (state == HedgedConnectionsFactory::State::READY) { size_t offset = offsets_queue.front(); offsets_queue.pop(); + size_t index = offset_states[offset].replicas.size(); - ReplicaStatePtr replica = std::make_shared(); - replica->connection = connection; - replica->offset = offset; - int socket_fd = replica->connection->getSocket()->impl()->sockfd(); + ReplicaState replica; + replica.connection = connection; + int socket_fd = replica.connection->getSocket()->impl()->sockfd(); epoll.add(socket_fd); - fd_to_replica[socket_fd] = replica; - offset_states[offset].replicas.push_back(replica); + fd_to_replica_location[socket_fd] = ReplicaLocation{offset, index}; ++offset_states[offset].active_connection_count; ++active_connection_count; pipeline_for_new_replicas.run(replica); + offset_states[offset].replicas.push_back(replica); } else if (state == HedgedConnectionsFactory::State::NOT_READY && !next_replica_in_process) { @@ -457,50 +458,50 @@ void HedgedConnections::tryGetNewReplica(bool start_new_connection) } } -void HedgedConnections::finishProcessReplica(ReplicaStatePtr replica, bool disconnect) +void HedgedConnections::finishProcessReplica(ReplicaState & replica, bool disconnect) { removeTimeoutsFromReplica(replica); - int socket_fd = replica->connection->getSocket()->impl()->sockfd(); + int socket_fd = replica.connection->getSocket()->impl()->sockfd(); epoll.remove(socket_fd); - --offset_states[replica->offset].active_connection_count; - fd_to_replica.erase(socket_fd); + --offset_states[fd_to_replica_location[socket_fd].offset].active_connection_count; + fd_to_replica_location.erase(socket_fd); --active_connection_count; if (disconnect) - replica->connection->disconnect(); - replica->connection = nullptr; + replica.connection->disconnect(); + replica.connection = nullptr; } -void HedgedConnections::addTimeoutToReplica(ConnectionTimeoutType type, ReplicaStatePtr replica) +void HedgedConnections::addTimeoutToReplica(ConnectionTimeoutType type, ReplicaState & replica) { ConnectionTimeoutDescriptorPtr timeout_descriptor = createConnectionTimeoutDescriptor(type, hedged_connections_factory.getConnectionTimeouts()); epoll.add(timeout_descriptor->timer.getDescriptor()); - timeout_fd_to_replica[timeout_descriptor->timer.getDescriptor()] - = fd_to_replica[replica->connection->getSocket()->impl()->sockfd()]; - replica->active_timeouts[timeout_descriptor->timer.getDescriptor()] = std::move(timeout_descriptor); + timeout_fd_to_replica_location[timeout_descriptor->timer.getDescriptor()] + = fd_to_replica_location[replica.connection->getSocket()->impl()->sockfd()]; + replica.active_timeouts[timeout_descriptor->timer.getDescriptor()] = std::move(timeout_descriptor); } -void HedgedConnections::removeTimeoutsFromReplica(ReplicaStatePtr replica) +void HedgedConnections::removeTimeoutsFromReplica(ReplicaState & replica) { - for (auto & [fd, _] : replica->active_timeouts) + for (auto & [fd, _] : replica.active_timeouts) { epoll.remove(fd); - timeout_fd_to_replica.erase(fd); + timeout_fd_to_replica_location.erase(fd); } - replica->active_timeouts.clear(); + replica.active_timeouts.clear(); } -void 
HedgedConnections::removeTimeoutFromReplica(ConnectionTimeoutType type, ReplicaStatePtr replica) +void HedgedConnections::removeTimeoutFromReplica(ConnectionTimeoutType type, ReplicaState & replica) { auto it = std::find_if( - replica->active_timeouts.begin(), replica->active_timeouts.end(), [type](auto & value) { return value.second->type == type; }); + replica.active_timeouts.begin(), replica.active_timeouts.end(), [type](auto & value) { return value.second->type == type; }); - if (it != replica->active_timeouts.end()) + if (it != replica.active_timeouts.end()) { epoll.remove(it->first); - timeout_fd_to_replica.erase(it->first); - replica->active_timeouts.erase(it); + timeout_fd_to_replica_location.erase(it->first); + replica.active_timeouts.erase(it); } } diff --git a/src/Client/HedgedConnections.h b/src/Client/HedgedConnections.h index 00145544096..4e3b6a67169 100644 --- a/src/Client/HedgedConnections.h +++ b/src/Client/HedgedConnections.h @@ -22,14 +22,17 @@ public: { Connection * connection = nullptr; std::unordered_map active_timeouts; - size_t offset = 0; }; - using ReplicaStatePtr = std::shared_ptr; + struct ReplicaLocation + { + size_t offset; + size_t index; + }; struct OffsetState { - std::vector replicas; + std::vector replicas; size_t active_connection_count; bool first_packet_of_data_received; }; @@ -76,32 +79,32 @@ private: class Pipeline { public: - void add(std::function send_function); + void add(std::function send_function); - void run(ReplicaStatePtr replica); + void run(ReplicaState & replica); private: - std::vector> pipeline; + std::vector> pipeline; }; - Packet receivePacketFromReplica(ReplicaStatePtr replica, AsyncCallback async_callback = {}); + Packet receivePacketFromReplica(ReplicaLocation & replica_location, AsyncCallback async_callback = {}); Packet receivePacketImpl(AsyncCallback async_callback = {}); - void processReceivedFirstDataPacket(ReplicaStatePtr replica); + void processReceivedFirstDataPacket(ReplicaLocation & replica_location); - void processTimeoutEvent(ReplicaStatePtr replica, ConnectionTimeoutDescriptorPtr timeout_descriptor); + void processTimeoutEvent(ReplicaLocation & replica_location, ConnectionTimeoutDescriptorPtr timeout_descriptor); void tryGetNewReplica(bool start_new_connection); - void finishProcessReplica(ReplicaStatePtr replica, bool disconnect); + void finishProcessReplica(ReplicaState & replica, bool disconnect); int getReadyFileDescriptor(AsyncCallback async_callback = {}); - void addTimeoutToReplica(ConnectionTimeoutType type, ReplicaStatePtr replica); + void addTimeoutToReplica(ConnectionTimeoutType type, ReplicaState & replica); - void removeTimeoutsFromReplica(ReplicaStatePtr replica); + void removeTimeoutsFromReplica(ReplicaState & replica); - void removeTimeoutFromReplica(ConnectionTimeoutType type, ReplicaStatePtr replica); + void removeTimeoutFromReplica(ConnectionTimeoutType type, ReplicaState & replica); HedgedConnectionsFactory hedged_connections_factory; @@ -111,10 +114,10 @@ private: /// replica_states[offset].replicas.size() = 1 (like in MultiplexedConnections). std::vector offset_states; - /// Map socket file descriptor to replica. - std::unordered_map fd_to_replica; - /// Map timeout file descriptor to replica. - std::unordered_map timeout_fd_to_replica; + /// Map socket file descriptor to replica location (it's offset and index in OffsetState.replicas). + std::unordered_map fd_to_replica_location; + /// Map timeout file descriptor to replica location (it's offset and index in OffsetState.replicas). 
+ std::unordered_map timeout_fd_to_replica_location; /// A queue of offsets for new replicas. When we get RECEIVE_DATA_TIMEOUT from /// the replica, we push it's offset to this queue and start trying to get @@ -142,7 +145,6 @@ private: Epoll epoll; const Settings & settings; ThrottlerPtr throttler; - Poco::Logger * log; bool sent_query = false; bool cancelled = false; diff --git a/src/Client/HedgedConnectionsFactory.cpp b/src/Client/HedgedConnectionsFactory.cpp index 732e1e4b7d0..12362635904 100644 --- a/src/Client/HedgedConnectionsFactory.cpp +++ b/src/Client/HedgedConnectionsFactory.cpp @@ -24,6 +24,8 @@ HedgedConnectionsFactory::HedgedConnectionsFactory( for (size_t i = 0; i != shuffled_pools.size(); ++i) connection_establishers.emplace_back(shuffled_pools[i].pool, &timeouts, settings, table_to_check.get(), log); + replicas_timeouts.resize(shuffled_pools.size()); + max_tries = (settings ? size_t{settings->connections_with_failover_max_tries} : size_t{DBMS_CONNECTION_POOL_WITH_FAILOVER_DEFAULT_MAX_TRIES}); @@ -57,6 +59,7 @@ std::vector HedgedConnectionsFactory::getManyConnections(PoolMode std::vector connections; connections.reserve(max_entries); + Connection * connection = nullptr; /// Try to start establishing connections with max_entries replicas. int index; @@ -66,14 +69,13 @@ std::vector HedgedConnectionsFactory::getManyConnections(PoolMode if (index == -1) break; - ReplicaStatePtr replica = startEstablishingConnection(index); - if (replica->state == State::READY) - connections.push_back(replica->connection); + auto state = startEstablishingConnection(index, connection); + if (state == State::READY) + connections.push_back(connection); } /// Process connections until we get enough READY connections /// (work asynchronously with all connections we started). 
- Connection * connection = nullptr; while (connections.size() < max_entries) { auto state = getNextConnection(false, true, connection); @@ -102,7 +104,6 @@ std::vector HedgedConnectionsFactory::getManyConnections(PoolMode HedgedConnectionsFactory::State HedgedConnectionsFactory::getNextConnection(bool start_new_connection, bool blocking, Connection *& connection_out) { - ReplicaStatePtr replica = nullptr; int index = -1; if (start_new_connection) @@ -112,22 +113,14 @@ HedgedConnectionsFactory::State HedgedConnectionsFactory::getNextConnection(bool { if (index != -1) { - replica = startEstablishingConnection(index); - if (replica->state == State::READY) - { - connection_out = replica->connection; - return State::READY; - } + State state = startEstablishingConnection(index, connection_out); + if (state == State::READY) + return state; } - if (!processEpollEvents(replica, blocking)) - return State::NOT_READY; - - if (replica->state == State::READY) - { - connection_out = replica->connection; - return State::READY; - } + State state = processEpollEvents(blocking, connection_out); + if (state != State::EMPTY) + return state; index = getNextIndex(); } @@ -139,22 +132,19 @@ HedgedConnectionsFactory::State HedgedConnectionsFactory::getNextConnection(bool if (!fallback_to_stale_replicas || !canGetNewConnection()) return State::CANNOT_CHOOSE; - setBestUsableReplica(replica); - connection_out = replica->connection; - return replica->state; + return setBestUsableReplica(connection_out); } void HedgedConnectionsFactory::stopChoosingReplicas() { - for (auto & [fd, replica] : fd_to_replica) + for (auto & [fd, replica_index] : fd_to_replica_index) { - removeTimeoutsFromReplica(replica); + removeTimeoutsFromReplica(replica_index); epoll.remove(fd); - connection_establishers[replica->index].reset(); - replica->reset(); + connection_establishers[replica_index].reset(); } - fd_to_replica.clear(); + fd_to_replica_index.clear(); } int HedgedConnectionsFactory::getNextIndex() @@ -190,56 +180,54 @@ int HedgedConnectionsFactory::getNextIndex() return next_index; } -HedgedConnectionsFactory::ReplicaStatePtr HedgedConnectionsFactory::startEstablishingConnection(int index) +HedgedConnectionsFactory::State HedgedConnectionsFactory::startEstablishingConnection(int replica_index, Connection *& connection_out) { - ReplicaStatePtr replica = createNewReplica(); - + State state; do { - ConnectionEstablisher & connection_establisher = connection_establishers[index]; + ConnectionEstablisher & connection_establisher = connection_establishers[replica_index]; - replica->state = State::NOT_READY; - replica->index = index; - indexes_in_process.insert(index); + state = State::NOT_READY; + indexes_in_process.insert(replica_index); connection_establisher.reset(); connection_establisher.run(); - if (connection_establisher.stage != ConnectionEstablisher::Stage::FAILED) - replica->connection = &*connection_establisher.result.entry; + state = processConnectionEstablisherStage(replica_index); - processConnectionEstablisherStage(replica); - - if (replica->state == State::NOT_READY) + if (state == State::NOT_READY) { epoll.add(connection_establisher.socket_fd); - fd_to_replica[connection_establisher.socket_fd] = replica; + fd_to_replica_index[connection_establisher.socket_fd] = replica_index; connection_establisher.setActionBeforeDisconnect([&](int fd) { epoll.remove(fd); - fd_to_replica.erase(fd); + fd_to_replica_index.erase(fd); }); - addTimeouts(replica); + addTimeouts(replica_index); } } - while (replica->state == State::EMPTY && 
(index = getNextIndex()) != -1); + while (state == State::EMPTY && (replica_index = getNextIndex()) != -1); - return replica; + if (state == State::READY) + connection_out = &*connection_establishers[replica_index].result.entry; + + return state; } -void HedgedConnectionsFactory::processConnectionEstablisherStage(ReplicaStatePtr replica, bool remove_from_epoll) +HedgedConnectionsFactory::State HedgedConnectionsFactory::processConnectionEstablisherStage(int replica_index, bool remove_from_epoll) { - ConnectionEstablisher & connection_establisher = connection_establishers[replica->index]; + ConnectionEstablisher & connection_establisher = connection_establishers[replica_index]; if (connection_establisher.stage == ConnectionEstablisher::Stage::FINISHED) { - indexes_in_process.erase(replica->index); + indexes_in_process.erase(replica_index); ++entries_count; if (remove_from_epoll) { epoll.remove(connection_establisher.socket_fd); - fd_to_replica.erase(connection_establisher.socket_fd); + fd_to_replica_index.erase(connection_establisher.socket_fd); } if (connection_establisher.result.is_usable) @@ -247,24 +235,28 @@ void HedgedConnectionsFactory::processConnectionEstablisherStage(ReplicaStatePtr ++usable_count; if (connection_establisher.result.is_up_to_date) { - replica->state = State::READY; - ready_indexes.insert(replica->index); - return; + ready_indexes.insert(replica_index); + return State::READY; } } /// This replica is not up to date, we will try to find up to date. - replica->reset(); + return State::EMPTY; } else if (connection_establisher.stage == ConnectionEstablisher::Stage::FAILED) - processFailedConnection(replica); + { + processFailedConnection(replica_index); + return State::EMPTY; + } + + return State::NOT_READY; } -void HedgedConnectionsFactory::processFailedConnection(ReplicaStatePtr replica) +void HedgedConnectionsFactory::processFailedConnection(int replica_index) { - ShuffledPool & shuffled_pool = shuffled_pools[replica->index]; + ShuffledPool & shuffled_pool = shuffled_pools[replica_index]; LOG_WARNING( - log, "Connection failed at try №{}, reason: {}", (shuffled_pool.error_count + 1), connection_establishers[replica->index].fail_message); + log, "Connection failed at try №{}, reason: {}", (shuffled_pool.error_count + 1), connection_establishers[replica_index].fail_message); ProfileEvents::increment(ProfileEvents::DistributedConnectionFailTry); shuffled_pool.error_count = std::min(pool->getMaxErrorCup(), shuffled_pool.error_count + 1); @@ -275,83 +267,78 @@ void HedgedConnectionsFactory::processFailedConnection(ReplicaStatePtr replica) ProfileEvents::increment(ProfileEvents::DistributedConnectionFailAtAll); } - std::string & fail_message = connection_establishers[replica->index].fail_message; + std::string & fail_message = connection_establishers[replica_index].fail_message; if (!fail_message.empty()) fail_messages += fail_message + "\n"; - indexes_in_process.erase(replica->index); - replica->reset(); + indexes_in_process.erase(replica_index); } -void HedgedConnectionsFactory::addTimeouts(ReplicaStatePtr replica) +void HedgedConnectionsFactory::addTimeouts(int replica_index) { - addTimeoutToReplica(ConnectionTimeoutType::RECEIVE_TIMEOUT, replica); + addTimeoutToReplica(ConnectionTimeoutType::RECEIVE_TIMEOUT, replica_index); - auto stage = connection_establishers[replica->index].stage; + auto stage = connection_establishers[replica_index].stage; if (stage == ConnectionEstablisher::Stage::RECEIVE_HELLO) - addTimeoutToReplica(ConnectionTimeoutType::RECEIVE_HELLO_TIMEOUT, 
replica); + addTimeoutToReplica(ConnectionTimeoutType::RECEIVE_HELLO_TIMEOUT, replica_index); else if (stage == ConnectionEstablisher::Stage::RECEIVE_TABLES_STATUS) - addTimeoutToReplica(ConnectionTimeoutType::RECEIVE_TABLES_STATUS_TIMEOUT, replica); + addTimeoutToReplica(ConnectionTimeoutType::RECEIVE_TABLES_STATUS_TIMEOUT, replica_index); } -void HedgedConnectionsFactory::addTimeoutToReplica(ConnectionTimeoutType type, ReplicaStatePtr replica) +void HedgedConnectionsFactory::addTimeoutToReplica(ConnectionTimeoutType type, int replica_index) { ConnectionTimeoutDescriptorPtr timeout_descriptor = createConnectionTimeoutDescriptor(type, timeouts); epoll.add(timeout_descriptor->timer.getDescriptor()); - timeout_fd_to_replica[timeout_descriptor->timer.getDescriptor()] = replica; - replica->active_timeouts[timeout_descriptor->timer.getDescriptor()] = std::move(timeout_descriptor); + timeout_fd_to_replica_index[timeout_descriptor->timer.getDescriptor()] = replica_index; + replicas_timeouts[replica_index][timeout_descriptor->timer.getDescriptor()] = std::move(timeout_descriptor); } -void HedgedConnectionsFactory::removeTimeoutsFromReplica(ReplicaStatePtr replica) +void HedgedConnectionsFactory::removeTimeoutsFromReplica(int replica_index) { - for (auto & [fd, _] : replica->active_timeouts) + for (auto & [fd, _] : replicas_timeouts[replica_index]) { epoll.remove(fd); - timeout_fd_to_replica.erase(fd); + timeout_fd_to_replica_index.erase(fd); } - replica->active_timeouts.clear(); + replicas_timeouts[replica_index].clear(); } -bool HedgedConnectionsFactory::processEpollEvents(ReplicaStatePtr & replica, bool blocking) +HedgedConnectionsFactory::State HedgedConnectionsFactory::processEpollEvents(bool blocking, Connection *& connection_out) { int event_fd; - bool finish = false; - while (!finish) + while (true) { event_fd = getReadyFileDescriptor(blocking); /// Check if there is no events. if (event_fd == -1) - return false; + return State::NOT_READY; - if (fd_to_replica.find(event_fd) != fd_to_replica.end()) + if (fd_to_replica_index.find(event_fd) != fd_to_replica_index.end()) { - replica = fd_to_replica[event_fd]; - processReplicaEvent(replica); - /// Check if replica is ready or we need to try next replica. - if (replica->state == State::READY || replica->state == State::EMPTY) - finish = true; + int replica_index = fd_to_replica_index[event_fd]; + State state = processReplicaEvent(replica_index, connection_out); + /// Return only if replica is ready or we need to try next replica. + if (state != State::NOT_READY) + return state; } - else if (timeout_fd_to_replica.find(event_fd) != timeout_fd_to_replica.end()) + else if (timeout_fd_to_replica_index.find(event_fd) != timeout_fd_to_replica_index.end()) { - replica = timeout_fd_to_replica[event_fd]; - processTimeoutEvent(replica, replica->active_timeouts[event_fd]); - /// Check if we need to try next replica. - if (replica->state == State::EMPTY) - finish = true; + int replica_index = timeout_fd_to_replica_index[event_fd]; + /// Process received timeout. If retured values is true, we need to try new replica. 
+ if (processTimeoutEvent(replica_index, replicas_timeouts[replica_index][event_fd])) + return State::EMPTY; } else throw Exception("Unknown event from epoll", ErrorCodes::LOGICAL_ERROR); } - - return true; } int HedgedConnectionsFactory::getReadyFileDescriptor(bool blocking) { - for (auto & [fd, replica] : fd_to_replica) - if (replica->connection->hasReadPendingData()) - return replica->connection->getSocket()->impl()->sockfd(); + for (auto & [fd, replica_index] : fd_to_replica_index) + if (connection_establishers[replica_index].result.entry->hasReadPendingData()) + return connection_establishers[replica_index].socket_fd; epoll_event event; event.data.fd = -1; @@ -359,41 +346,44 @@ int HedgedConnectionsFactory::getReadyFileDescriptor(bool blocking) return event.data.fd; } -void HedgedConnectionsFactory::processReplicaEvent(ReplicaStatePtr replica) +HedgedConnectionsFactory::State HedgedConnectionsFactory::processReplicaEvent(int replica_index, Connection *& connection_out) { - removeTimeoutsFromReplica(replica); - connection_establishers[replica->index].run(); - processConnectionEstablisherStage(replica, true); - if (replica->state == State::NOT_READY) - addTimeouts(replica); + removeTimeoutsFromReplica(replica_index); + connection_establishers[replica_index].run(); + State state = processConnectionEstablisherStage(replica_index, true); + if (state == State::NOT_READY) + addTimeouts(replica_index); + if (state == State::READY) + connection_out = &*connection_establishers[replica_index].result.entry; + return state; } -void HedgedConnectionsFactory::processTimeoutEvent(ReplicaStatePtr replica, ConnectionTimeoutDescriptorPtr timeout_descriptor) +bool HedgedConnectionsFactory::processTimeoutEvent(int replica_index, ConnectionTimeoutDescriptorPtr timeout_descriptor) { epoll.remove(timeout_descriptor->timer.getDescriptor()); - replica->active_timeouts.erase(timeout_descriptor->timer.getDescriptor()); - timeout_fd_to_replica[timeout_descriptor->timer.getDescriptor()]; + replicas_timeouts[replica_index].erase(timeout_descriptor->timer.getDescriptor()); + timeout_fd_to_replica_index[timeout_descriptor->timer.getDescriptor()]; if (timeout_descriptor->type == ConnectionTimeoutType::RECEIVE_TIMEOUT) { - removeTimeoutsFromReplica(replica); - int fd = replica->connection->getSocket()->impl()->sockfd(); + removeTimeoutsFromReplica(replica_index); + int fd = connection_establishers[replica_index].socket_fd; epoll.remove(fd); - fd_to_replica.erase(fd); + fd_to_replica_index.erase(fd); - ConnectionEstablisher & connection_establisher = connection_establishers[replica->index]; + ConnectionEstablisher & connection_establisher = connection_establishers[replica_index]; connection_establisher.fail_message = "Receive timeout expired (" + connection_establisher.result.entry->getDescription() + ")"; connection_establisher.resetResult(); connection_establisher.stage = ConnectionEstablisher::Stage::FAILED; - processFailedConnection(replica); + processFailedConnection(replica_index); + return true; } - else if ((timeout_descriptor->type == ConnectionTimeoutType::RECEIVE_HELLO_TIMEOUT - || timeout_descriptor->type == ConnectionTimeoutType::RECEIVE_TABLES_STATUS_TIMEOUT) - && entries_count + indexes_in_process.size() + failed_pools_count < shuffled_pools.size()) - replica = createNewReplica(); + + /// Return true if we can try to start one more connection. 
+ return entries_count + indexes_in_process.size() + failed_pools_count < shuffled_pools.size(); } -void HedgedConnectionsFactory::setBestUsableReplica(ReplicaStatePtr replica) +HedgedConnectionsFactory::State HedgedConnectionsFactory::setBestUsableReplica(Connection *& connection_out) { std::vector indexes(connection_establishers.size()); for (size_t i = 0; i != indexes.size(); ++i) @@ -412,10 +402,7 @@ void HedgedConnectionsFactory::setBestUsableReplica(ReplicaStatePtr replica) indexes.end()); if (indexes.empty()) - { - replica->state = State::CANNOT_CHOOSE; - return; - } + return State::CANNOT_CHOOSE; /// Sort replicas by staleness. std::stable_sort( @@ -426,10 +413,9 @@ void HedgedConnectionsFactory::setBestUsableReplica(ReplicaStatePtr replica) return connection_establishers[lhs].result.staleness < connection_establishers[rhs].result.staleness; }); - replica->index = indexes[0]; - replica->connection = &*connection_establishers[indexes[0]].result.entry; - replica->state = State::READY; - ready_indexes.insert(replica->index); + ready_indexes.insert(indexes[0]); + connection_out = &*connection_establishers[indexes[0]].result.entry; + return State::READY; } ConnectionTimeoutDescriptorPtr createConnectionTimeoutDescriptor(ConnectionTimeoutType type, const ConnectionTimeouts & timeouts) diff --git a/src/Client/HedgedConnectionsFactory.h b/src/Client/HedgedConnectionsFactory.h index 398629cf13c..048d90e1de6 100644 --- a/src/Client/HedgedConnectionsFactory.h +++ b/src/Client/HedgedConnectionsFactory.h @@ -48,24 +48,6 @@ public: CANNOT_CHOOSE = 3, }; - struct ReplicaState - { - Connection * connection = nullptr; - size_t index = -1; - State state = State::EMPTY; - std::unordered_map active_timeouts; - - void reset() - { - connection = nullptr; - index = -1; - state = State::EMPTY; - active_timeouts.clear(); - } - }; - - using ReplicaStatePtr = std::shared_ptr; - HedgedConnectionsFactory(const ConnectionPoolWithFailoverPtr & pool_, const Settings * settings_, const ConnectionTimeouts & timeouts_, @@ -96,9 +78,9 @@ public: ~HedgedConnectionsFactory(); private: - ReplicaStatePtr startEstablishingConnection(int index); + State startEstablishingConnection(int index, Connection *& connection_out); - void processConnectionEstablisherStage(ReplicaStatePtr replica, bool remove_from_epoll = false); + State processConnectionEstablisherStage(int replica_index, bool remove_from_epoll = false); /// Find an index of the next free replica to start connection. /// Return -1 if there is no free replica. 
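The factory-side changes in this commit follow one pattern: drop the heap-allocated `ReplicaStatePtr` objects that carried their own index and state, keep plain replica indices (plus fd-to-index maps for epoll bookkeeping), and have each step report an explicit `State` while handing a ready connection back through an out-parameter. Below is a minimal, standalone C++ sketch of that shape. The names (`Connection`, `ConnectionFactory`, `State`) are simplified and hypothetical, not the real `HedgedConnectionsFactory` API; it only illustrates the index-based bookkeeping and the State-plus-out-parameter contract assumed by the diff above.

```cpp
#include <iostream>
#include <optional>
#include <unordered_map>
#include <vector>

/// Illustrative stand-in for a server connection; only the socket fd matters here.
struct Connection { int fd = -1; };

/// Explicit result of one step of the connection state machine.
enum class State { EMPTY, NOT_READY, READY };

class ConnectionFactory
{
public:
    explicit ConnectionFactory(std::vector<Connection> replicas_) : replicas(std::move(replicas_)) {}

    /// Instead of returning a shared per-replica state object, report a State and,
    /// when READY, hand the connection back via connection_out (as in the patched header).
    State startEstablishingConnection(size_t replica_index, Connection *& connection_out)
    {
        Connection & replica = replicas[replica_index];
        if (replica.fd < 0)
            return State::EMPTY;   /// nothing usable, caller moves on to the next index

        /// Epoll-style bookkeeping keeps only the index, so later cleanup can erase by fd.
        fd_to_replica_index[replica.fd] = replica_index;
        connection_out = &replica;
        return State::READY;
    }

    /// Reverse lookup used when an epoll event fires on a socket fd.
    std::optional<size_t> replicaIndexForFd(int fd) const
    {
        auto it = fd_to_replica_index.find(fd);
        if (it == fd_to_replica_index.end())
            return std::nullopt;
        return it->second;
    }

private:
    std::vector<Connection> replicas;
    std::unordered_map<int, size_t> fd_to_replica_index;
};

int main()
{
    ConnectionFactory factory({Connection{-1}, Connection{42}});

    Connection * connection = nullptr;
    for (size_t index = 0; index != 2; ++index)
    {
        if (factory.startEstablishingConnection(index, connection) == State::READY)
        {
            std::cout << "replica " << index << " ready on fd " << connection->fd << '\n';
            break;
        }
    }

    if (auto idx = factory.replicaIndexForFd(42))
        std::cout << "fd 42 belongs to replica " << *idx << '\n';
}
```

The value of this shape, as the diff suggests, is that ownership stays with the containers (`replicas`, the fd maps) rather than with shared pointers scattered across callbacks, and callers can branch on the returned `State` without inspecting a replica object's internal fields.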
@@ -106,27 +88,23 @@ private: int getReadyFileDescriptor(bool blocking); - void addTimeouts(ReplicaStatePtr replica); + void addTimeouts(int replica_index); - void addTimeoutToReplica(ConnectionTimeoutType type, ReplicaStatePtr replica); + void addTimeoutToReplica(ConnectionTimeoutType type, int replica_index); - void removeTimeoutsFromReplica(ReplicaStatePtr replica); + void removeTimeoutsFromReplica(int replica_index); - void processFailedConnection(ReplicaStatePtr replica); + void processFailedConnection(int replica_index); - void processReceiveTimeout(ReplicaStatePtr replica); + State processReplicaEvent(int replica_index, Connection *& connection_out); - void processReplicaEvent(ReplicaStatePtr replica); - - void processTimeoutEvent(ReplicaStatePtr replica, ConnectionTimeoutDescriptorPtr timeout_descriptor); + bool processTimeoutEvent(int replica_index, ConnectionTimeoutDescriptorPtr timeout_descriptor); /// Return false if there is no ready events, return true if replica is ready /// or we need to try next replica. - bool processEpollEvents(ReplicaStatePtr & replica, bool blocking); + State processEpollEvents(bool blocking, Connection *& connection_out); - void setBestUsableReplica(ReplicaStatePtr replica); - - ReplicaStatePtr createNewReplica() { return std::make_shared(); } + State setBestUsableReplica(Connection *& connection_out); const ConnectionPoolWithFailoverPtr pool; const Settings * settings; @@ -136,10 +114,12 @@ private: std::vector connection_establishers; std::vector shuffled_pools; - /// Map socket file descriptor to replica. - std::unordered_map fd_to_replica; - /// Map timeout file descriptor to replica. - std::unordered_map timeout_fd_to_replica; + std::vector> replicas_timeouts; + + /// Map socket file descriptor to replica index. + std::unordered_map fd_to_replica_index; + /// Map timeout file descriptor to replica index. + std::unordered_map timeout_fd_to_replica_index; /// Indexes of replicas, that are in process of connection. std::unordered_set indexes_in_process; From 4cc7e2c5c68026af391af1b61ccae93ff87ff291 Mon Sep 17 00:00:00 2001 From: Pavel Kruglov Date: Tue, 9 Feb 2021 05:13:47 +0300 Subject: [PATCH 0207/2357] Update comment --- src/Client/HedgedConnectionsFactory.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Client/HedgedConnectionsFactory.h b/src/Client/HedgedConnectionsFactory.h index 048d90e1de6..45a03e212c0 100644 --- a/src/Client/HedgedConnectionsFactory.h +++ b/src/Client/HedgedConnectionsFactory.h @@ -100,8 +100,8 @@ private: bool processTimeoutEvent(int replica_index, ConnectionTimeoutDescriptorPtr timeout_descriptor); - /// Return false if there is no ready events, return true if replica is ready - /// or we need to try next replica. + /// Return NOT_READY state if there is no ready events, READY if replica is ready + /// and EMPTY if we need to try next replica. 
State processEpollEvents(bool blocking, Connection *& connection_out); State setBestUsableReplica(Connection *& connection_out); From be3be85fa2167beb909ec75a6180ae0a63421186 Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Tue, 9 Feb 2021 13:57:41 +0300 Subject: [PATCH 0208/2357] Update docs/en/sql-reference/functions/type-conversion-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/en/sql-reference/functions/type-conversion-functions.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 1742f6b8888..cab71f46bf5 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -492,8 +492,9 @@ Result: ## accurateCast(x, T) {#type_conversion_function-accurate-cast} -Converts `x` to the `T` data type. The differente from [cast(x, T)](#type_conversion_function-cast) is that `accurateCast` -does not allow overflow of numeric types during cast if type value `x` does not fit bounds of type `T`. +Converts `x` to the `T` data type. + +The difference from [cast(x, T)](#type_conversion_function-cast) is that `accurateCast` does not allow overflow of numeric types during cast if type value `x` does not fit the bounds of type `T`. For example, `accurateCast(-1, 'UInt8')` throws an exception. **Example** From b676f63f1dec7b606f4f5559f910f02098f9c135 Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Tue, 9 Feb 2021 13:58:22 +0300 Subject: [PATCH 0209/2357] Update docs/ru/sql-reference/functions/type-conversion-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/functions/type-conversion-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index aa55e015c61..d95a5279716 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -497,7 +497,7 @@ SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null; accurateCastOrNull(x, T) ``` -**Parameters** +**Параметры** - `x` — входное значение. - `T` — имя возвращаемого типа данных. From c22412b775b36009f3ceba36fb82a595a5d49075 Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Tue, 9 Feb 2021 13:58:47 +0300 Subject: [PATCH 0210/2357] Update docs/en/sql-reference/operators/in.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/en/sql-reference/operators/in.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/operators/in.md b/docs/en/sql-reference/operators/in.md index a0dd0455c4d..4796c0f6bc0 100644 --- a/docs/en/sql-reference/operators/in.md +++ b/docs/en/sql-reference/operators/in.md @@ -17,7 +17,7 @@ Don’t list too many values explicitly (i.e. millions). If a data set is large The right side of the operator can be a set of constant expressions, a set of tuples with constant expressions (shown in the examples above), or the name of a database table or SELECT subquery in brackets. -ClickHouse allows different types inside `IN` subquery. For left hand side it applies conversion to the type of right hand side with [accurateCastOrNull](../functions/type-conversion-functions.md#type_conversion_function-accurate-cast_or_null). 
+ClickHouse allows different types in the left and right parts of `IN` subquery. In this case it converts the left hand side to the type of the right hand side as if the [accurateCastOrNull](../functions/type-conversion-functions.md#type_conversion_function-accurate-cast_or_null) function is applied. **Example** From df123e91e650c9f4dd11d12dff78753df58bbe6d Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Tue, 9 Feb 2021 13:59:58 +0300 Subject: [PATCH 0211/2357] Update docs/en/sql-reference/functions/type-conversion-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- .../en/sql-reference/functions/type-conversion-functions.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index cab71f46bf5..83cbad6f53b 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -559,9 +559,9 @@ Query: ``` sql SELECT - cast(-1, 'UInt8') as uint8, - cast(128, 'Int8') as int8, - cast('Test', 'FixedString(2)') as fixed_string; + accurateCastOrNull(-1, 'UInt8') as uint8, + accurateCastOrNull(128, 'Int8') as int8, + accurateCastOrNull('Test', 'FixedString(2)') as fixed_string; ``` Result: From 94a489ce97eef31f4036759b04d9651f4cd5512e Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Tue, 9 Feb 2021 17:25:04 +0300 Subject: [PATCH 0212/2357] Update docs/ru/sql-reference/functions/date-time-functions.md Co-authored-by: Anna <42538400+adevyatova@users.noreply.github.com> --- docs/ru/sql-reference/functions/date-time-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/functions/date-time-functions.md b/docs/ru/sql-reference/functions/date-time-functions.md index 0acb9e3cd39..d019c18a688 100644 --- a/docs/ru/sql-reference/functions/date-time-functions.md +++ b/docs/ru/sql-reference/functions/date-time-functions.md @@ -686,7 +686,7 @@ SELECT formatDateTime(toDate('2010-01-04'), '%g'); ## FROM\_UNIXTIME {#fromunixtime} -Функция преобразует метку времени Unix в дату. +Функция преобразует Unix timestamp в календарную дату и время. 
**Примеры** From 79a1a5741f723374b41325953c78f927fc4a92a4 Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Tue, 9 Feb 2021 17:25:38 +0300 Subject: [PATCH 0213/2357] Update docs/en/sql-reference/data-types/simpleaggregatefunction.md Co-authored-by: Anna <42538400+adevyatova@users.noreply.github.com> --- docs/en/sql-reference/data-types/simpleaggregatefunction.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/data-types/simpleaggregatefunction.md b/docs/en/sql-reference/data-types/simpleaggregatefunction.md index e25d4803613..244779c5ca8 100644 --- a/docs/en/sql-reference/data-types/simpleaggregatefunction.md +++ b/docs/en/sql-reference/data-types/simpleaggregatefunction.md @@ -35,7 +35,7 @@ The following aggregate functions are supported: **Example** ``` sql -CREATE TABLE simple (id UInt64,val SimpleAggregateFunction(sum,Double)) ENGINE=AggregatingMergeTree ORDER BY id; +CREATE TABLE simple (id UInt64, val SimpleAggregateFunction(sum, Double)) ENGINE=AggregatingMergeTree ORDER BY id; ``` [Original article](https://clickhouse.tech/docs/en/data_types/simpleaggregatefunction/) From 55727f511df2baa19584f32a7289d4e2ae298add Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Tue, 9 Feb 2021 17:27:39 +0300 Subject: [PATCH 0214/2357] Update docs/en/sql-reference/functions/date-time-functions.md Co-authored-by: Anna <42538400+adevyatova@users.noreply.github.com> --- docs/en/sql-reference/functions/date-time-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index ce2092a7818..ca62d2a61e5 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -663,7 +663,7 @@ Result: ## FROM\_UNIXTIME {#fromunixfime} -Function converts Unix timestamp to date. When there is only a single argument of [Integer](../../sql-reference/data-types/int-uint.md) type, it acts in the same way as [toDateTime](../../sql-reference/functions/type-conversion-functions.md#todatetime) and return [DateTime](../../sql-reference/data-types/datetime.md) type. +Function converts Unix timestamp to a calendar date and a time of a day. When there is only a single argument of [Integer](../../sql-reference/data-types/int-uint.md) type, it acts in the same way as [toDateTime](../../sql-reference/functions/type-conversion-functions.md#todatetime) and return [DateTime](../../sql-reference/data-types/datetime.md) type. 
**Example:** From 44e857b5ea3ca2bbf49d3746af1c1941ac3a2f33 Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Tue, 9 Feb 2021 17:30:16 +0300 Subject: [PATCH 0215/2357] Update simpleaggregatefunction.md --- docs/ru/sql-reference/data-types/simpleaggregatefunction.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/data-types/simpleaggregatefunction.md b/docs/ru/sql-reference/data-types/simpleaggregatefunction.md index b80826803de..7677b64e924 100644 --- a/docs/ru/sql-reference/data-types/simpleaggregatefunction.md +++ b/docs/ru/sql-reference/data-types/simpleaggregatefunction.md @@ -33,7 +33,7 @@ **Пример** ``` sql -CREATE TABLE simple (id UInt64,val SimpleAggregateFunction(sum,Double)) ENGINE=AggregatingMergeTree ORDER BY id; +CREATE TABLE simple (id UInt64, val SimpleAggregateFunction(sum, Double)) ENGINE=AggregatingMergeTree ORDER BY id; ``` [Оригинальная статья](https://clickhouse.tech/docs/en/data_types/simpleaggregatefunction/) From 3874effea16b4140227efa6e11fe6dc34024924f Mon Sep 17 00:00:00 2001 From: Vitaliy Zakaznikov Date: Tue, 9 Feb 2021 10:09:38 -0500 Subject: [PATCH 0216/2357] Fixing rendering issues and links. --- .../external-authenticators/index.md | 2 +- .../external-authenticators/ldap.md | 74 +++++++++---------- 2 files changed, 37 insertions(+), 39 deletions(-) diff --git a/docs/en/operations/external-authenticators/index.md b/docs/en/operations/external-authenticators/index.md index fb8483fa341..95f80f192f5 100644 --- a/docs/en/operations/external-authenticators/index.md +++ b/docs/en/operations/external-authenticators/index.md @@ -10,4 +10,4 @@ ClickHouse supports authenticating and managing users using external services. The following external authenticators and directories are supported: -- [LDAP](#external-authenticators-ldap) [Authenticator](#ldap-external-authenticator) and [Directory](#ldap-external-user-directory) +- [LDAP](./ldap.md#external-authenticators-ldap) [Authenticator](./ldap.md#ldap-external-authenticator) and [Directory](./ldap.md#ldap-external-user-directory) diff --git a/docs/en/operations/external-authenticators/ldap.md b/docs/en/operations/external-authenticators/ldap.md index 03be357a12a..36a13227852 100644 --- a/docs/en/operations/external-authenticators/ldap.md +++ b/docs/en/operations/external-authenticators/ldap.md @@ -5,8 +5,7 @@ LDAP server can be used to authenticate ClickHouse users. There are two differen - use LDAP as an external authenticator for existing users, which are defined in `users.xml` or in local access control paths - use LDAP as an external user directory and allow locally undefined users to be authenticated if they exist on the LDAP server -For both of these approaches, an internally named LDAP server must be defined in the ClickHouse config -so that other parts of config are able to refer to it. +For both of these approaches, an internally named LDAP server must be defined in the ClickHouse config so that other parts of config are able to refer to it. ## LDAP Server Definition {#ldap-server-definition} @@ -34,27 +33,27 @@ To define LDAP server you must add `ldap_servers` section to the `config.xml`. F ``` -Note, that you can define multiple LDAP servers inside `ldap_servers` section using distinct names. +Note, that you can define multiple LDAP servers inside the `ldap_servers` section using distinct names. Parameters: - `host` - LDAP server hostname or IP, this parameter is mandatory and cannot be empty. 
- `port` - LDAP server port, default is `636` if `enable_tls` is set to `true`, `389` otherwise. - `bind_dn` - template used to construct the DN to bind to. - - The resulting DN will be constructed by replacing all `{user_name}` substrings of the - template with the actual user name during each authentication attempt. + - The resulting DN will be constructed by replacing all `{user_name}` substrings of the + template with the actual user name during each authentication attempt. - `verification_cooldown` - a period of time, in seconds, after a successful bind attempt, during which the user will be assumed to be successfully authenticated for all consecutive requests without contacting the LDAP server. - - Specify `0` (the default) to disable caching and force contacting the LDAP server for each authentication request. + - Specify `0` (the default) to disable caching and force contacting the LDAP server for each authentication request. - `enable_tls` - flag to trigger use of secure connection to the LDAP server. - - Specify `no` for plain text `ldap://` protocol (not recommended). - - Specify `yes` for LDAP over SSL/TLS `ldaps://` protocol (recommended, the default). - - Specify `starttls` for legacy StartTLS protocol (plain text `ldap://` protocol, upgraded to TLS). + - Specify `no` for plain text `ldap://` protocol (not recommended). + - Specify `yes` for LDAP over SSL/TLS `ldaps://` protocol (recommended, the default). + - Specify `starttls` for legacy StartTLS protocol (plain text `ldap://` protocol, upgraded to TLS). - `tls_minimum_protocol_version` - the minimum protocol version of SSL/TLS. - - Accepted values are: `ssl2`, `ssl3`, `tls1.0`, `tls1.1`, `tls1.2` (the default). + - Accepted values are: `ssl2`, `ssl3`, `tls1.0`, `tls1.1`, `tls1.2` (the default). - `tls_require_cert` - SSL/TLS peer certificate verification behavior. - - Accepted values are: `never`, `allow`, `try`, `demand` (the default). + - Accepted values are: `never`, `allow`, `try`, `demand` (the default). - `tls_cert_file` - path to certificate file. - `tls_key_file` - path to certificate key file. - `tls_ca_cert_file` - path to CA certificate file. @@ -65,8 +64,7 @@ Parameters: A remote LDAP server can be used as a method for verifying passwords for locally defined users (users defined in `users.xml` or in local access control paths). In order to achieve this, specify previously defined LDAP server name instead of `password` or similar sections in the user definition. -At each login attempt, ClickHouse will try to "bind" to the specified DN defined by the `bind_dn` parameter -in the [LDAP server definition](#ldap-server-definition) using the provided credentials, and if successful, the user will be considered authenticated. This is often called a "simple bind" method. +At each login attempt, ClickHouse will try to "bind" to the specified DN defined by the `bind_dn` parameter in the [LDAP server definition](#ldap-server-definition) using the provided credentials, and if successful, the user will be considered authenticated. This is often called a "simple bind" method. For example, @@ -87,7 +85,7 @@ For example, Note, that user `my_user` refers to `my_ldap_server`. This LDAP server must be configured in the main `config.xml` file as described previously. -When SQL-driven [Access Control and Account Management](#access-control) is enabled in ClickHouse, users that are identified by LDAP servers can also be created using the [CRATE USER](#create-user-statement) statement. 
+When SQL-driven [Access Control and Account Management](../access-rights.md#access-control) is enabled in ClickHouse, users that are authenticated by LDAP servers can also be created using the [CRATE USER](../../sql-reference/statements/create/user.md#create-user-statement) statement. ```sql @@ -96,9 +94,9 @@ CREATE USER my_user IDENTIFIED WITH ldap_server BY 'my_ldap_server' ## LDAP Exernal User Directory {#ldap-external-user-directory} -In addition to the locally defined users, a remote LDAP server can be used as a source of user definitions. In order to achieve this, specify previously defined LDAP server name (see [LDAP Server Definition](#ldap-server-definition)) in the `ldap` section inside the `users_directories` section in of the `config.xml` file. +In addition to the locally defined users, a remote LDAP server can be used as a source of user definitions. In order to achieve this, specify previously defined LDAP server name (see [LDAP Server Definition](#ldap-server-definition)) in the `ldap` section inside the `users_directories` section of the `config.xml` file. -At each login attempt, ClickHouse will try to find the user definition locally and authenticate it as usual, but if the user is not defined, ClickHouse will assume it exists in the external LDAP directory, and will try to "bind" to the specified DN at the LDAP server using the provided credentials, and if successful, the user will be considered existing and authenticated. The user will be assigned roles from the list specified in the `roles` section. Additionally, LDAP "search" can be performed and results can be transformed and treated as role names and then assigned to the user if the `role_mapping` section is also configured. All this implies that the SQL-driven [Access Control and Account Management](#access-control) is enabled in ClickHouse and roles are created using the [CREATE ROLE](#create-role-statement) statement. +At each login attempt, ClickHouse will try to find the user definition locally and authenticate it as usual, but if the user is not defined, ClickHouse will assume it exists in the external LDAP directory, and will try to "bind" to the specified DN at the LDAP server using the provided credentials. If successful, the user will be considered existing and authenticated. The user will be assigned roles from the list specified in the `roles` section. Additionally, LDAP "search" can be performed and results can be transformed and treated as role names and then be assigned to the user if the `role_mapping` section is also configured. All this implies that the SQL-driven [Access Control and Account Management](../access-rights.md#access-control) is enabled and roles are created using the [CREATE ROLE](../../sql-reference/statements/create/role.md#create-role-statement) statement. Example (goes into `config.xml`): @@ -130,29 +128,29 @@ defined LDAP server that is configured in the `config.xml` (see [LDAP Server Def Parameters: -- `server` - one of LDAP server names defined in `ldap_servers` config section above. +- `server` - one of LDAP server names defined in the `ldap_servers` config section above. This parameter is mandatory and cannot be empty. - `roles` - section with a list of locally defined roles that will be assigned to each user retrieved from the LDAP server. - - If no roles are specified here or assigned during role mapping (below), user will not be able - to perform any actions after authentication. 
+ - If no roles are specified here or assigned during role mapping (below), user will not be able + to perform any actions after authentication. - `role_mapping` - section with LDAP search parameters and mapping rules. - - When a user authenticates, while still bound to LDAP, an LDAP search is performed using `search_filter` - and the name of the logged in user. For each entry found during that search, the value of the specified - attribute is extracted. For each attribute value that has the specified prefix, the prefix is removed, - and the rest of the value becomes the name of a local role defined in ClickHouse, - which is expected to be created beforehand by the [CREATE ROLE](#create-role-statement) statement. - - There can be multiple `role_mapping` sections defined inside the same `ldap` section. All of them will be applied. - - `base_dn` - template used to construct the base DN for the LDAP search. - - The resulting DN will be constructed by replacing all `{user_name}` and `{bind_dn}` - substrings of the template with the actual user name and bind DN during each LDAP search. - - `scope` - scope of the LDAP search. - - Accepted values are: `base`, `one_level`, `children`, `subtree` (the default). - - `search_filter` - template used to construct the search filter for the LDAP search. - - The resulting filter will be constructed by replacing all `{user_name}`, `{bind_dn}`, and `{base_dn}` - substrings of the template with the actual user name, bind DN, and base DN during each LDAP search. - - Note, that the special characters must be escaped properly in XML. - - `attribute` - attribute name whose values will be returned by the LDAP search. - - `prefix` - prefix, that will be expected to be in front of each string in the original - list of strings returned by the LDAP search. Prefix will be removed from the original - strings and resulting strings will be treated as local role names. Empty, by default. + - When a user authenticates, while still bound to LDAP, an LDAP search is performed using `search_filter` + and the name of the logged in user. For each entry found during that search, the value of the specified + attribute is extracted. For each attribute value that has the specified prefix, the prefix is removed, + and the rest of the value becomes the name of a local role defined in ClickHouse, + which is expected to be created beforehand by the [CREATE ROLE](../../sql-reference/statements/create/role.md#create-role-statement) statement. + - There can be multiple `role_mapping` sections defined inside the same `ldap` section. All of them will be applied. + - `base_dn` - template used to construct the base DN for the LDAP search. + - The resulting DN will be constructed by replacing all `{user_name}` and `{bind_dn}` + substrings of the template with the actual user name and bind DN during each LDAP search. + - `scope` - scope of the LDAP search. + - Accepted values are: `base`, `one_level`, `children`, `subtree` (the default). + - `search_filter` - template used to construct the search filter for the LDAP search. + - The resulting filter will be constructed by replacing all `{user_name}`, `{bind_dn}`, and `{base_dn}` + substrings of the template with the actual user name, bind DN, and base DN during each LDAP search. + - Note, that the special characters must be escaped properly in XML. + - `attribute` - attribute name whose values will be returned by the LDAP search. 
+ - `prefix` - prefix, that will be expected to be in front of each string in the original + list of strings returned by the LDAP search. Prefix will be removed from the original + strings and resulting strings will be treated as local role names. Empty, by default. From 2c6a0e74fb90d2cd5c8b988c4e9f3eebf60366c8 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Tue, 9 Feb 2021 18:14:20 +0300 Subject: [PATCH 0217/2357] better replica creation --- src/Databases/DatabaseReplicated.cpp | 119 ++++++++++-------- src/Databases/DatabaseReplicated.h | 6 +- src/Databases/DatabaseReplicatedWorker.cpp | 16 ++- src/Databases/DatabaseReplicatedWorker.h | 2 + src/Interpreters/DDLTask.cpp | 4 +- src/Interpreters/DDLTask.h | 5 +- src/Interpreters/DDLWorker.cpp | 39 +++--- src/Interpreters/DDLWorker.h | 6 + src/Interpreters/executeDDLQueryOnCluster.cpp | 1 - .../test_replicated_database/test.py | 9 +- 10 files changed, 115 insertions(+), 92 deletions(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 4a6058afcd0..a3da271a597 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -3,7 +3,6 @@ #include #include #include -#include #include #include #include @@ -105,8 +104,6 @@ DatabaseReplicated::DatabaseReplicated( throw Exception(ErrorCodes::REPLICA_IS_ALREADY_EXIST, "Replica {} of shard {} of replicated database at {} already exists. Replica host ID: '{}', current host ID: '{}'", replica_name, shard_name, zookeeper_path, replica_host_id, host_id); - - log_entry_to_execute = parse(current_zookeeper->get(replica_path + "/log_ptr")); } else { @@ -232,9 +229,6 @@ bool DatabaseReplicated::createDatabaseNodesInZooKeeper(const zkutil::ZooKeeperP void DatabaseReplicated::createReplicaNodesInZooKeeper(const zkutil::ZooKeeperPtr & current_zookeeper) { - /// When creating new replica, use latest snapshot version as initial value of log_pointer - //log_entry_to_execute = 0; //FIXME - /// Write host name to replica_path, it will protect from multiple replicas with the same name auto host_id = getHostID(global_context, db_uuid); @@ -265,40 +259,6 @@ void DatabaseReplicated::loadStoredObjects(Context & context, bool has_force_res ddl_worker->startup(); } -void DatabaseReplicated::onUnexpectedLogEntry(const String & entry_name, const ZooKeeperPtr & zookeeper) -{ - /// We cannot execute next entry of replication log. Possible reasons: - /// 1. Replica is staled, some entries were removed by log cleanup process. - /// In this case we should recover replica from the last snapshot. - /// 2. Replication log is broken due to manual operations with ZooKeeper or logical error. - /// In this case we just stop replication without any attempts to recover it automatically, - /// because such attempts may lead to unexpected data removal. - - constexpr const char * name = "query-"; - if (!startsWith(entry_name, name)) - throw Exception(ErrorCodes::DATABASE_REPLICATION_FAILED, "Unexpected entry in replication log: {}", entry_name); - - UInt32 entry_number; - if (!tryParse(entry_number, entry_name.substr(strlen(name)))) - throw Exception(ErrorCodes::DATABASE_REPLICATION_FAILED, "Cannot parse number of replication log entry {}", entry_name); - - if (entry_number < log_entry_to_execute) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Entry {} already executed, current pointer is {}", entry_number, log_entry_to_execute); - - /// Entry name is valid. Let's get min log pointer to check if replica is staled. 
- UInt32 min_snapshot = parse(zookeeper->get(zookeeper_path + "/min_log_ptr")); // FIXME - - if (log_entry_to_execute < min_snapshot) - { - recoverLostReplica(zookeeper, 0); //FIXME log_pointer - return; - } - - throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot recover replica, probably it's a bug. " - "Got log entry '{}' when expected entry number {}"); -} - - BlockIO DatabaseReplicated::propose(const ASTPtr & query, const Context & query_context) { if (query_context.getClientInfo().query_kind != ClientInfo::QueryKind::INITIAL_QUERY) @@ -335,22 +295,25 @@ BlockIO DatabaseReplicated::propose(const ASTPtr & query, const Context & query_ } -void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeeper, UInt32 from_snapshot) +void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeeper, UInt32 our_log_ptr, UInt32 max_log_ptr) { - //LOG_WARNING(log, "Will recover replica"); + bool new_replica = our_log_ptr == 0; + if (new_replica) + LOG_INFO(log, "Will create new replica from log pointer {}", max_log_ptr); + else + LOG_WARNING(log, "Will recover replica with staled log pointer {} from log pointer {}", our_log_ptr, max_log_ptr); - //FIXME drop old tables + if (new_replica && !empty()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "It's new replica, but database is not empty"); - String snapshot_metadata_path = zookeeper_path + "/metadata"; - Strings tables_in_snapshot = current_zookeeper->getChildren(snapshot_metadata_path); - snapshot_metadata_path += '/'; - from_snapshot = parse(current_zookeeper->get(zookeeper_path + "/max_log_ptr")); + if (!new_replica) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Automatic replica recovery is not implemented"); - for (const auto & table_name : tables_in_snapshot) + auto table_name_to_metadata = tryGetConsistentMetadataSnapshot(current_zookeeper, max_log_ptr); + + for (const auto & name_and_meta : table_name_to_metadata) { - //FIXME It's not atomic. We need multiget here (available since ZooKeeper 3.6.0). 
- String query_text = current_zookeeper->get(snapshot_metadata_path + table_name); - auto query_ast = parseQueryFromMetadataInZooKeeper(table_name, query_text); + auto query_ast = parseQueryFromMetadataInZooKeeper(name_and_meta.first, name_and_meta.second); Context query_context = global_context; query_context.makeQueryContext(); @@ -358,14 +321,60 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep query_context.setCurrentDatabase(database_name); query_context.setCurrentQueryId(""); // generate random query_id - //FIXME - DatabaseCatalog::instance().waitTableFinallyDropped(query_ast->as()->uuid); - LOG_INFO(log, "Executing {}", serializeAST(*query_ast)); InterpreterCreateQuery(query_ast, query_context).execute(); } - current_zookeeper->set(replica_path + "/log_ptr", toString(from_snapshot)); + current_zookeeper->set(replica_path + "/log_ptr", toString(max_log_ptr)); +} + +std::map DatabaseReplicated::tryGetConsistentMetadataSnapshot(const ZooKeeperPtr & zookeeper, UInt32 & max_log_ptr) +{ + std::map table_name_to_metadata; + constexpr int max_retries = 10; + int iteration = 0; + while (++iteration <= max_retries) + { + table_name_to_metadata.clear(); + LOG_DEBUG(log, "Trying to get consistent metadata snapshot for log pointer {}", max_log_ptr); + Strings table_names = zookeeper->getChildren(zookeeper_path + "/metadata"); + + std::vector futures; + futures.reserve(table_names.size()); + for (const auto & table : table_names) + futures.emplace_back(zookeeper->asyncTryGet(zookeeper_path + "/metadata/" + table)); + + for (size_t i = 0; i < table_names.size(); ++i) + { + auto res = futures[i].get(); + if (res.error != Coordination::Error::ZOK) + break; + table_name_to_metadata.emplace(table_names[i], res.data); + } + + UInt32 new_max_log_ptr = parse(zookeeper->get(zookeeper_path + "/max_log_ptr")); + if (new_max_log_ptr == max_log_ptr && table_names.size() == table_name_to_metadata.size()) + break; + + if (max_log_ptr < new_max_log_ptr) + { + LOG_DEBUG(log, "Log pointer moved from {} to {}, will retry", max_log_ptr, new_max_log_ptr); + max_log_ptr = new_max_log_ptr; + } + else + { + assert(max_log_ptr == new_max_log_ptr); + assert(table_names.size() != table_name_to_metadata.size()); + LOG_DEBUG(log, "Cannot get metadata of some tables due to ZooKeeper error, will retry"); + } + } + + if (max_retries < iteration) + throw Exception(ErrorCodes::DATABASE_REPLICATION_FAILED, "Cannot get consistent metadata snapshot"); + + LOG_DEBUG(log, "Got consistent metadata snapshot for log pointer {}", max_log_ptr); + + return table_name_to_metadata; } ASTPtr DatabaseReplicated::parseQueryFromMetadataInZooKeeper(const String & node_name, const String & query) diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index c39321f0caa..fffc2b5c98a 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -86,8 +86,8 @@ private: bool createDatabaseNodesInZooKeeper(const ZooKeeperPtr & current_zookeeper); void createReplicaNodesInZooKeeper(const ZooKeeperPtr & current_zookeeper); - void onUnexpectedLogEntry(const String & entry_name, const ZooKeeperPtr & zookeeper); - void recoverLostReplica(const ZooKeeperPtr & current_zookeeper, UInt32 from_snapshot); + void recoverLostReplica(const ZooKeeperPtr & current_zookeeper, UInt32 our_log_ptr, UInt32 max_log_ptr); + std::map tryGetConsistentMetadataSnapshot(const ZooKeeperPtr & zookeeper, UInt32 & max_log_ptr); ASTPtr parseQueryFromMetadataInZooKeeper(const String & node_name, 
const String & query); @@ -96,8 +96,6 @@ private: String replica_name; String replica_path; - UInt32 log_entry_to_execute; - zkutil::ZooKeeperPtr getZooKeeper() const; std::unique_ptr ddl_worker; diff --git a/src/Databases/DatabaseReplicatedWorker.cpp b/src/Databases/DatabaseReplicatedWorker.cpp index dd9dc322f9d..3162979e787 100644 --- a/src/Databases/DatabaseReplicatedWorker.cpp +++ b/src/Databases/DatabaseReplicatedWorker.cpp @@ -45,11 +45,14 @@ void DatabaseReplicatedDDLWorker::initializeReplication() /// Check if we need to recover replica. /// Invariant: replica is lost if it's log_ptr value is less then max_log_ptr - logs_to_keep. - UInt32 our_log_ptr = parse(current_zookeeper->get(database->replica_path + "/log_ptr")); + String log_ptr_str = current_zookeeper->get(database->replica_path + "/log_ptr"); + UInt32 our_log_ptr = parse(log_ptr_str); UInt32 max_log_ptr = parse(current_zookeeper->get(database->zookeeper_path + "/max_log_ptr")); - UInt32 logs_to_keep = parse(current_zookeeper->get(database->zookeeper_path + "/logs_to_keep")); + logs_to_keep = parse(current_zookeeper->get(database->zookeeper_path + "/logs_to_keep")); if (our_log_ptr == 0 || our_log_ptr + logs_to_keep < max_log_ptr) - database->recoverLostReplica(current_zookeeper, 0); + database->recoverLostReplica(current_zookeeper, our_log_ptr, max_log_ptr); + else + last_skipped_entry_name.emplace(log_ptr_str); } String DatabaseReplicatedDDLWorker::enqueueQuery(DDLLogEntry & entry) @@ -239,4 +242,11 @@ DDLTaskPtr DatabaseReplicatedDDLWorker::initAndCheckTask(const String & entry_na return task; } +bool DatabaseReplicatedDDLWorker::canRemoveQueueEntry(const String & entry_name, const Coordination::Stat &) +{ + UInt32 entry_number = DDLTaskBase::getLogEntryNumber(entry_name); + UInt32 max_log_ptr = parse(getAndSetZooKeeper()->get(database->zookeeper_path + "/max_log_ptr")); + return entry_number + logs_to_keep < max_log_ptr; +} + } diff --git a/src/Databases/DatabaseReplicatedWorker.h b/src/Databases/DatabaseReplicatedWorker.h index e3fd58c4305..33806df88ba 100644 --- a/src/Databases/DatabaseReplicatedWorker.h +++ b/src/Databases/DatabaseReplicatedWorker.h @@ -20,11 +20,13 @@ private: void initializeReplication(); DDLTaskPtr initAndCheckTask(const String & entry_name, String & out_reason, const ZooKeeperPtr & zookeeper) override; + bool canRemoveQueueEntry(const String & entry_name, const Coordination::Stat & stat) override; DatabaseReplicated * const database; mutable std::mutex mutex; std::condition_variable wait_current_task_change; String current_task; + UInt32 logs_to_keep = std::numeric_limits::max(); }; } diff --git a/src/Interpreters/DDLTask.cpp b/src/Interpreters/DDLTask.cpp index 9e379443364..7f47f0a6659 100644 --- a/src/Interpreters/DDLTask.cpp +++ b/src/Interpreters/DDLTask.cpp @@ -320,7 +320,7 @@ std::unique_ptr DatabaseReplicatedTask::makeQueryContext(Context & from return query_context; } -String DatabaseReplicatedTask::getLogEntryName(UInt32 log_entry_number) +String DDLTaskBase::getLogEntryName(UInt32 log_entry_number) { constexpr size_t seq_node_digits = 10; String number = toString(log_entry_number); @@ -328,7 +328,7 @@ String DatabaseReplicatedTask::getLogEntryName(UInt32 log_entry_number) return name; } -UInt32 DatabaseReplicatedTask::getLogEntryNumber(const String & log_entry_name) +UInt32 DDLTaskBase::getLogEntryNumber(const String & log_entry_name) { constexpr const char * name = "query-"; assert(startsWith(log_entry_name, name)); diff --git a/src/Interpreters/DDLTask.h b/src/Interpreters/DDLTask.h 
index 43d9fa1c0ae..f02e17103aa 100644 --- a/src/Interpreters/DDLTask.h +++ b/src/Interpreters/DDLTask.h @@ -101,6 +101,8 @@ struct DDLTaskBase inline String getFinishedNodePath() const { return entry_path + "/finished/" + host_id_str; } inline String getShardNodePath() const { return entry_path + "/shards/" + getShardID(); } + static String getLogEntryName(UInt32 log_entry_number); + static UInt32 getLogEntryNumber(const String & log_entry_name); }; struct DDLTask : public DDLTaskBase @@ -132,9 +134,6 @@ struct DatabaseReplicatedTask : public DDLTaskBase String getShardID() const override; std::unique_ptr makeQueryContext(Context & from_context) override; - static String getLogEntryName(UInt32 log_entry_number); - static UInt32 getLogEntryNumber(const String & log_entry_name); - DatabaseReplicated * database; }; diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index 665bacf9d6d..efaacabf4de 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -451,10 +451,7 @@ bool DDLWorker::tryExecuteQuery(const String & query, DDLTaskBase & task) void DDLWorker::updateMaxDDLEntryID(const String & entry_name) { - DB::ReadBufferFromString in(entry_name); - DB::assertString("query-", in); - UInt64 id; - readText(id, in); + UInt64 id = DDLTaskBase::getLogEntryNumber(entry_name); auto prev_id = max_id.load(std::memory_order_relaxed); while (prev_id < id) { @@ -744,16 +741,13 @@ bool DDLWorker::tryExecuteQueryOnLeaderReplica( } -void DDLWorker::cleanupQueue(Int64 current_time_seconds, const ZooKeeperPtr & zookeeper) +void DDLWorker::cleanupQueue(Int64, const ZooKeeperPtr & zookeeper) { LOG_DEBUG(log, "Cleaning queue"); Strings queue_nodes = zookeeper->getChildren(queue_dir); filterAndSortQueueNodes(queue_nodes); - size_t num_outdated_nodes = (queue_nodes.size() > max_tasks_in_queue) ? 
queue_nodes.size() - max_tasks_in_queue : 0; - auto first_non_outdated_node = queue_nodes.begin() + num_outdated_nodes; - for (auto it = queue_nodes.cbegin(); it < queue_nodes.cend(); ++it) { if (stop_flag) @@ -772,15 +766,7 @@ void DDLWorker::cleanupQueue(Int64 current_time_seconds, const ZooKeeperPtr & zo if (!zookeeper->exists(node_path, &stat)) continue; - /// Delete node if its lifetime is expired (according to task_max_lifetime parameter) - constexpr UInt64 zookeeper_time_resolution = 1000; - Int64 zookeeper_time_seconds = stat.ctime / zookeeper_time_resolution; - bool node_lifetime_is_expired = zookeeper_time_seconds + task_max_lifetime < current_time_seconds; - - /// If too many nodes in task queue (> max_tasks_in_queue), delete oldest one - bool node_is_outside_max_window = it < first_non_outdated_node; - - if (!node_lifetime_is_expired && !node_is_outside_max_window) + if (!canRemoveQueueEntry(node_name, stat)) continue; /// Skip if there are active nodes (it is weak guard) @@ -799,10 +785,7 @@ void DDLWorker::cleanupQueue(Int64 current_time_seconds, const ZooKeeperPtr & zo continue; } - if (node_lifetime_is_expired) - LOG_INFO(log, "Lifetime of task {} is expired, deleting it", node_name); - else if (node_is_outside_max_window) - LOG_INFO(log, "Task {} is outdated, deleting it", node_name); + LOG_INFO(log, "Task {} is outdated, deleting it", node_name); /// Deleting { @@ -827,6 +810,19 @@ void DDLWorker::cleanupQueue(Int64 current_time_seconds, const ZooKeeperPtr & zo } } +bool DDLWorker::canRemoveQueueEntry(const String & entry_name, const Coordination::Stat & stat) +{ + /// Delete node if its lifetime is expired (according to task_max_lifetime parameter) + constexpr UInt64 zookeeper_time_resolution = 1000; + Int64 zookeeper_time_seconds = stat.ctime / zookeeper_time_resolution; + bool node_lifetime_is_expired = zookeeper_time_seconds + task_max_lifetime < Poco::Timestamp().epochTime(); + + /// If too many nodes in task queue (> max_tasks_in_queue), delete oldest one + UInt32 entry_number = DDLTaskBase::getLogEntryNumber(entry_name); + bool node_is_outside_max_window = entry_number < max_id.load(std::memory_order_relaxed) - max_tasks_in_queue; + + return node_lifetime_is_expired || node_is_outside_max_window; +} /// Try to create nonexisting "status" dirs for a node void DDLWorker::createStatusDirs(const std::string & node_path, const ZooKeeperPtr & zookeeper) @@ -927,6 +923,7 @@ void DDLWorker::runMainThread() worker_pool = std::make_unique(pool_size); /// Clear other in-memory state, like server just started. 
current_tasks.clear(); + last_skipped_entry_name.reset(); max_id = 0; }; diff --git a/src/Interpreters/DDLWorker.h b/src/Interpreters/DDLWorker.h index 706face3885..1ae4f815b44 100644 --- a/src/Interpreters/DDLWorker.h +++ b/src/Interpreters/DDLWorker.h @@ -24,6 +24,11 @@ namespace Poco namespace Util { class AbstractConfiguration; } } +namespace Coordination +{ + struct Stat; +} + namespace DB { class ASTAlterQuery; @@ -94,6 +99,7 @@ protected: /// Checks and cleanups queue's nodes void cleanupQueue(Int64 current_time_seconds, const ZooKeeperPtr & zookeeper); + virtual bool canRemoveQueueEntry(const String & entry_name, const Coordination::Stat & stat); /// Init task node static void createStatusDirs(const std::string & node_path, const ZooKeeperPtr & zookeeper); diff --git a/src/Interpreters/executeDDLQueryOnCluster.cpp b/src/Interpreters/executeDDLQueryOnCluster.cpp index a0148316610..2774f78663e 100644 --- a/src/Interpreters/executeDDLQueryOnCluster.cpp +++ b/src/Interpreters/executeDDLQueryOnCluster.cpp @@ -277,7 +277,6 @@ Block DDLQueryStatusInputStream::readImpl() status.tryDeserializeText(status_data); } - //FIXME String host = host_id; UInt16 port = 0; if (by_hostname) diff --git a/tests/integration/test_replicated_database/test.py b/tests/integration/test_replicated_database/test.py index 2a5a7f4716e..04646507ed7 100644 --- a/tests/integration/test_replicated_database/test.py +++ b/tests/integration/test_replicated_database/test.py @@ -8,7 +8,7 @@ from helpers.test_tools import assert_eq_with_retry cluster = ClickHouseCluster(__file__) main_node = cluster.add_instance('main_node', main_configs=['configs/config.xml'], with_zookeeper=True, stay_alive=True, macros={"shard": 1, "replica": 1}) -dummy_node = cluster.add_instance('dummy_node', main_configs=['configs/config.xml'], with_zookeeper=True, macros={"shard": 1, "replica": 2}) +dummy_node = cluster.add_instance('dummy_node', main_configs=['configs/config.xml'], with_zookeeper=True, stay_alive=True, macros={"shard": 1, "replica": 2}) competing_node = cluster.add_instance('competing_node', main_configs=['configs/config.xml'], with_zookeeper=True, macros={"shard": 1, "replica": 3}) snapshotting_node = cluster.add_instance('snapshotting_node', main_configs=['configs/config.xml'], with_zookeeper=True, macros={"shard": 2, "replica": 1}) snapshot_recovering_node = cluster.add_instance('snapshot_recovering_node', main_configs=['configs/config.xml'], with_zookeeper=True, macros={"shard": 2, "replica": 2}) @@ -100,9 +100,12 @@ def test_alters_from_different_replicas(started_cluster): main_node.query("CREATE TABLE testdb.dist AS testdb.concurrent_test ENGINE = Distributed(cluster, testdb, concurrent_test, CounterID)") - dummy_node.kill_clickhouse(stop_start_wait_sec=0) + dummy_node.stop_clickhouse(kill=True) - competing_node.query("ALTER TABLE testdb.concurrent_test ADD COLUMN Added0 UInt32;") + settings = {"distributed_ddl_task_timeout": 10} + assert "There are 1 unfinished hosts (0 of them are currently active)" in \ + competing_node.query_and_get_error("ALTER TABLE testdb.concurrent_test ADD COLUMN Added0 UInt32;", settings=settings) + dummy_node.start_clickhouse() main_node.query("ALTER TABLE testdb.concurrent_test ADD COLUMN Added2 UInt32;") competing_node.query("ALTER TABLE testdb.concurrent_test ADD COLUMN Added1 UInt32 AFTER Added0;") main_node.query("ALTER TABLE testdb.concurrent_test ADD COLUMN AddedNested1 Nested(A UInt32, B UInt64) AFTER Added2;") From 178ada23f811354e47683677ab0c787c6170750e Mon Sep 17 00:00:00 2001 From: 
George Date: Wed, 10 Feb 2021 15:55:18 +0300 Subject: [PATCH 0218/2357] early draft --- .../functions/tuple-map-functions.md | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/docs/en/sql-reference/functions/tuple-map-functions.md b/docs/en/sql-reference/functions/tuple-map-functions.md index a46c36395b8..50015cd996e 100644 --- a/docs/en/sql-reference/functions/tuple-map-functions.md +++ b/docs/en/sql-reference/functions/tuple-map-functions.md @@ -112,4 +112,34 @@ Result: └──────────────────────────────┴───────────────────────────────────┘ ``` +## mapContains {#mapcontains} + +Determines whether `map.keys` contains the `key` parameter. + +**Syntax** + +``` sql +mapContains(map, key) +``` + +**Parameters** + +- `map` — Map. [Type name](relative/path/to/type/dscr.md#type). +- `key` — Key. Type matches the type of `map.keys`. + +**Returned value** + +- `1` if `map.keys` contains `key`, `0` if not. + +Type: [UInt8](../../sql-reference/data-types/int-uint.md). + +**Example** + +Query: + + +## mapKeys {#mapKeys} + +## mapValues {#mapvalues} + [Original article](https://clickhouse.tech/docs/en/query_language/functions/tuple-map-functions/) From 48f6f7e490754880ad179c3568d2c118454d0db9 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Wed, 10 Feb 2021 19:26:49 +0300 Subject: [PATCH 0219/2357] Split filter for predicate push down. --- src/Interpreters/ActionsDAG.cpp | 194 +++++++++++++++++++++++++++++++- src/Interpreters/ActionsDAG.h | 9 +- 2 files changed, 201 insertions(+), 2 deletions(-) diff --git a/src/Interpreters/ActionsDAG.cpp b/src/Interpreters/ActionsDAG.cpp index 176745c707d..223b4341f46 100644 --- a/src/Interpreters/ActionsDAG.cpp +++ b/src/Interpreters/ActionsDAG.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -338,7 +339,7 @@ void ActionsDAG::removeUnusedActions(const std::vector & required_nodes) removeUnusedActions(); } -void ActionsDAG::removeUnusedActions() +void ActionsDAG::removeUnusedActions(bool allow_remove_inputs) { std::unordered_set visited_nodes; std::stack stack; @@ -357,6 +358,9 @@ void ActionsDAG::removeUnusedActions() visited_nodes.insert(&node); stack.push(&node); } + + if (node.type == ActionType::INPUT && !allow_remove_inputs) + visited_nodes.insert(&node); } while (!stack.empty()) @@ -1153,4 +1157,192 @@ ActionsDAG::SplitResult ActionsDAG::splitActionsForFilter(const std::string & co return split(split_nodes); } +ActionsDAGPtr ActionsDAG::splitActionsForFilter(const std::string & filter_name, const Names & available_inputs) +{ + std::unordered_map> inputs_map; + for (const auto & input : inputs) + inputs_map[input->result_name].emplace_back(input); + + std::unordered_set allowed_nodes; + for (const auto & name : available_inputs) + { + auto & inputs_list = inputs_map[name]; + if (inputs_list.empty()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot find input {} in ActionsDAG. DAG:\n{}", name, dumpDAG()); + + allowed_nodes.emplace(inputs_list.front()); + inputs_list.pop_front(); + } + + auto it = index.begin(); + for (; it != index.end(); ++it) + if ((*it)->result_name == filter_name) + break; + + if (it == index.end()) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Index for ActionsDAG does not contain filter column name {}. 
DAG:\n{}", + filter_name, dumpDAG()); + + std::unordered_set selected_predicates; + + { + struct Frame + { + const Node * node; + bool is_predicate = false; + size_t next_child_to_visit = 0; + size_t num_allowed_children = 0; + }; + + std::stack stack; + std::unordered_set visited_nodes; + + stack.push(Frame{.node = *it, .is_predicate = true}); + visited_nodes.insert(*it); + while (!stack.empty()) + { + auto & cur = stack.top(); + bool is_conjunction = cur.is_predicate + && cur.node->type == ActionType::FUNCTION + && cur.node->function_base->getName() == "and"; + + /// At first, visit all children. + while (cur.next_child_to_visit < cur.node->children.size()) + { + auto * child = cur.node->children[cur.next_child_to_visit]; + + if (visited_nodes.count(child) == 0) + { + visited_nodes.insert(child); + stack.push({.node = child, .is_predicate = is_conjunction}); + break; + } + + if (allowed_nodes.contains(child)) + ++cur.num_allowed_children; + ++cur.next_child_to_visit; + } + + if (cur.next_child_to_visit == cur.node->children.size()) + { + if (cur.num_allowed_children == cur.node->children.size()) + { + if (cur.node->type != ActionType::ARRAY_JOIN && cur.node->type != ActionType::INPUT) + allowed_nodes.emplace(cur.node); + } + else if (is_conjunction) + { + for (auto * child : cur.node->children) + if (allowed_nodes.count(child)) + selected_predicates.insert(child); + } + + stack.pop(); + } + } + } + + if (selected_predicates.empty()) + { + if (allowed_nodes.count(*it)) + selected_predicates.insert(*it); + else + return nullptr; + } + + auto actions = cloneEmpty(); + actions->settings.project_input = false; + + std::unordered_map nodes_mapping; + + { + struct Frame + { + const Node * node; + size_t next_child_to_visit = 0; + }; + + std::stack stack; + + for (const auto * predicate : selected_predicates) + { + if (nodes_mapping.count(predicate)) + continue; + + stack.push({.node = predicate}); + while (!stack.empty()) + { + auto & cur = stack.top(); + /// At first, visit all children. + while (cur.next_child_to_visit < cur.node->children.size()) + { + auto * child = cur.node->children[cur.next_child_to_visit]; + + if (nodes_mapping.count(child) == 0) + { + stack.push({.node = child}); + break; + } + + ++cur.next_child_to_visit; + } + + if (cur.next_child_to_visit == cur.node->children.size()) + { + auto & node = actions->nodes.emplace_back(*cur.node); + nodes_mapping[cur.node] = &node; + + for (auto & child : node.children) + child = nodes_mapping[child]; + + if (node.type == ActionType::INPUT) + { + actions->inputs.emplace_back(&node); + actions->index.insert(&node); + } + } + } + } + + Node * result_predicate = nodes_mapping[*selected_predicates.begin()]; + + if (selected_predicates.size() > 1) + { + FunctionOverloadResolverPtr func_builder_and = + std::make_shared( + std::make_unique( + std::make_shared())); + + std::vector args; + args.reserve(selected_predicates.size()); + for (const auto * predicate : selected_predicates) + args.emplace_back(nodes_mapping[predicate]); + + result_predicate = &actions->addFunction(func_builder_and, args, {}, true); + } + + actions->index.insert(result_predicate); + } + + + + /// Replace all predicates which are copied to constants. + /// Note: This also keeps valid const propagation. AND is constant only if all elements are. + /// But if all elements are constant, AND should is moved to split actions and replaced itself. 
+ for (const auto & predicate : selected_predicates) + { + Node node; + node.type = ActionType::COLUMN; + node.result_name = std::move(predicate->result_name); + node.result_type = std::move(predicate->result_type); + node.column = node.result_type->createColumnConst(0, 1); + *predicate = std::move(node); + } + + removeUnusedActions(false); + + return actions; +} + } diff --git a/src/Interpreters/ActionsDAG.h b/src/Interpreters/ActionsDAG.h index e13a9bd62b3..6fd4e14568a 100644 --- a/src/Interpreters/ActionsDAG.h +++ b/src/Interpreters/ActionsDAG.h @@ -274,6 +274,13 @@ public: /// Index of initial actions must contain column_name. SplitResult splitActionsForFilter(const std::string & column_name) const; + /// Create actions which may calculate part of filter using only available_inputs. + /// If nothing may be calculated, returns nullptr. + /// Otherwise, return actions which inputs are from available_inputs. + /// Returned actions add single column which may be used for filter. + /// Also, replace some nodes of current inputs to constant 1 in case they are filtered. + ActionsDAGPtr splitActionsForFilter(const std::string & filter_name, const Names & available_inputs); + private: Node & addNode(Node node, bool can_replace = false); Node & getNode(const std::string & name); @@ -297,7 +304,7 @@ private: } void removeUnusedActions(const std::vector & required_nodes); - void removeUnusedActions(); + void removeUnusedActions(bool allow_remove_inputs = true); void addAliases(const NamesWithAliases & aliases, std::vector & result_nodes); void compileFunctions(); From a83885392e8233a9b9faa462eea371c71df2c745 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Wed, 10 Feb 2021 20:47:48 +0300 Subject: [PATCH 0220/2357] Split filter for predicate push down. 
--- src/Interpreters/ActionsDAG.cpp | 117 ++++++++++++++++++++++++++------ src/Interpreters/ActionsDAG.h | 2 +- 2 files changed, 98 insertions(+), 21 deletions(-) diff --git a/src/Interpreters/ActionsDAG.cpp b/src/Interpreters/ActionsDAG.cpp index 223b4341f46..eb1ff9ad998 100644 --- a/src/Interpreters/ActionsDAG.cpp +++ b/src/Interpreters/ActionsDAG.cpp @@ -1157,7 +1157,7 @@ ActionsDAG::SplitResult ActionsDAG::splitActionsForFilter(const std::string & co return split(split_nodes); } -ActionsDAGPtr ActionsDAG::splitActionsForFilter(const std::string & filter_name, const Names & available_inputs) +ActionsDAGPtr ActionsDAG::splitActionsForFilter(const std::string & filter_name, bool can_remove_filter, const Names & available_inputs) { std::unordered_map> inputs_map; for (const auto & input : inputs) @@ -1185,6 +1185,7 @@ ActionsDAGPtr ActionsDAG::splitActionsForFilter(const std::string & filter_name, filter_name, dumpDAG()); std::unordered_set selected_predicates; + std::unordered_set other_predicates; { struct Frame @@ -1234,8 +1235,12 @@ ActionsDAGPtr ActionsDAG::splitActionsForFilter(const std::string & filter_name, else if (is_conjunction) { for (auto * child : cur.node->children) + { if (allowed_nodes.count(child)) selected_predicates.insert(child); + else + other_predicates.insert(child); + } } stack.pop(); @@ -1254,6 +1259,11 @@ ActionsDAGPtr ActionsDAG::splitActionsForFilter(const std::string & filter_name, auto actions = cloneEmpty(); actions->settings.project_input = false; + FunctionOverloadResolverPtr func_builder_and = + std::make_shared( + std::make_unique( + std::make_shared())); + std::unordered_map nodes_mapping; { @@ -1309,11 +1319,6 @@ ActionsDAGPtr ActionsDAG::splitActionsForFilter(const std::string & filter_name, if (selected_predicates.size() > 1) { - FunctionOverloadResolverPtr func_builder_and = - std::make_shared( - std::make_unique( - std::make_shared())); - std::vector args; args.reserve(selected_predicates.size()); for (const auto * predicate : selected_predicates) @@ -1325,22 +1330,94 @@ ActionsDAGPtr ActionsDAG::splitActionsForFilter(const std::string & filter_name, actions->index.insert(result_predicate); } - - - /// Replace all predicates which are copied to constants. - /// Note: This also keeps valid const propagation. AND is constant only if all elements are. - /// But if all elements are constant, AND should is moved to split actions and replaced itself. - for (const auto & predicate : selected_predicates) + if (selected_predicates.count(*it)) { - Node node; - node.type = ActionType::COLUMN; - node.result_name = std::move(predicate->result_name); - node.result_type = std::move(predicate->result_type); - node.column = node.result_type->createColumnConst(0, 1); - *predicate = std::move(node); - } + /// The whole predicate was split. 
+ if (can_remove_filter) + { + for (auto i = index.begin(); i != index.end(); ++i) + { + if (*i == *it) + { + index.remove(i); + break; + } + } + } + else + { + Node node; + node.type = ActionType::COLUMN; + node.result_name = std::move((*it)->result_name); + node.result_type = std::move((*it)->result_type); + node.column = node.result_type->createColumnConst(0, 1); + *(*it) = std::move(node); + } - removeUnusedActions(false); + removeUnusedActions(false); + } + else if ((*it)->type == ActionType::FUNCTION && (*it)->function_base->getName() == "and") + { + std::vector new_children(other_predicates.begin(), other_predicates.end()); + + if (new_children.size() == 1) + { + if (new_children.front()->result_type->equals(*((*it)->result_type))) + { + Node node; + node.type = ActionType::ALIAS; + node.result_name = (*it)->result_name; + node.result_type = (*it)->result_type; + node.children.swap(new_children); + *(*it) = std::move(node); + } + else + { + (*it)->children.swap(new_children); + ColumnsWithTypeAndName arguments; + arguments.reserve((*it)->children.size()); + + for (const auto * child : (*it)->children) + { + ColumnWithTypeAndName argument; + argument.column = child->column; + argument.type = child->result_type; + argument.name = child->result_name; + + arguments.emplace_back(std::move(argument)); + } + + FunctionOverloadResolverPtr func_builder_cast = + std::make_shared( + CastOverloadResolver::createImpl(false)); + + (*it)->function_builder = func_builder_cast; + (*it)->function_base = (*it)->function_builder->build(arguments); + (*it)->function = (*it)->function_base->prepare(arguments); + } + } + else + { + (*it)->children.swap(new_children); + ColumnsWithTypeAndName arguments; + arguments.reserve((*it)->children.size()); + + for (const auto * child : (*it)->children) + { + ColumnWithTypeAndName argument; + argument.column = child->column; + argument.type = child->result_type; + argument.name = child->result_name; + + arguments.emplace_back(std::move(argument)); + } + + (*it)->function_base = (*it)->function_builder->build(arguments); + (*it)->function = (*it)->function_base->prepare(arguments); + } + + removeUnusedActions(false); + } return actions; } diff --git a/src/Interpreters/ActionsDAG.h b/src/Interpreters/ActionsDAG.h index 6fd4e14568a..112c507e79f 100644 --- a/src/Interpreters/ActionsDAG.h +++ b/src/Interpreters/ActionsDAG.h @@ -279,7 +279,7 @@ public: /// Otherwise, return actions which inputs are from available_inputs. /// Returned actions add single column which may be used for filter. /// Also, replace some nodes of current inputs to constant 1 in case they are filtered. - ActionsDAGPtr splitActionsForFilter(const std::string & filter_name, const Names & available_inputs); + ActionsDAGPtr splitActionsForFilter(const std::string & filter_name, bool can_remove_filter, const Names & available_inputs); private: Node & addNode(Node node, bool can_replace = false); From e87e71ee43550f0f3a59abf227d20ce661a3bf4f Mon Sep 17 00:00:00 2001 From: Dmitriy Date: Wed, 10 Feb 2021 21:59:28 +0300 Subject: [PATCH 0221/2357] Document two functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Задокументировал две функции. 
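A single illustrative query (assuming the behavior shown by the examples added in this patch) contrasts the two documented functions on an input that cannot be parsed:

``` sql
SELECT
    parseDateTimeBestEffortUSOrNull('02.2021 21:12:57') AS or_null,
    parseDateTimeBestEffortUSOrZero('02.2021 21:12:57') AS or_zero;
```

Here `or_null` should come back as `NULL` and `or_zero` as `1970-01-01 00:00:00`, since the date component cannot be parsed.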
--- .../functions/type-conversion-functions.md | 180 ++++++++++++++++++ 1 file changed, 180 insertions(+) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 3ca36f41c78..2116e55e3ef 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -689,6 +689,186 @@ Same as for [parseDateTimeBestEffort](#parsedatetimebesteffort) except that it r Same as for [parseDateTimeBestEffort](#parsedatetimebesteffort) except that it returns zero date or zero date time when it encounters a date format that cannot be processed. +## parseDateTimeBestEffortUSOrNull {#parsedatetimebesteffortusornull} + +Same as for [parseDateTimeBestEffortUS](#parsedatetimebesteffortUS) except that it returns null when it encounters a date format that cannot be processed. + +**Syntax** + +``` sql +parseDateTimeBestEffortUSOrNull(time_string [, time_zone]); +``` + +**Parameters** + +- `time_string` — String containing a date and time to convert. [String](../../sql-reference/data-types/string.md). +- `time_zone` — Time zone. The function parses `time_string` according to the time zone. [String](../../sql-reference/data-types/string.md). + +**Supported non-standard formats** + +- A string containing 9..10 digit [unix timestamp](https://en.wikipedia.org/wiki/Unix_time). +- A string with a date and a time component: `YYYYMMDDhhmmss`, `MM/DD/YYYY hh:mm:ss`, `MM-DD-YY hh:mm`, `YYYY-MM-DD hh:mm:ss`, etc. +- A string with a date, but no time component: `YYYY`, `YYYYMM`, `YYYY*MM`, `MM/DD/YYYY`, `MM-DD-YY` etc. +- A string with a day and time: `DD`, `DD hh`, `DD hh:mm`. In this case, `YYYY-MM` are substituted as `2000-01`. +- A string that includes the date and time along with time zone offset information: `YYYY-MM-DD hh:mm:ss ±h:mm`, etc. For example, `2020-12-12 17:36:00 -5:00`. + +**Returned values** + +- `time_string` converted to the `DateTime` data type. +- `NULL`. + +**Examples** + +Query: + +``` sql +SELECT parseDateTimeBestEffortUSOrNull('02/10/2021 21:12:57') +AS parseDateTimeBestEffortUSOrNull; +``` + +Result: + +``` text +┌─parseDateTimeBestEffortUSOrNull─┐ +│ 2021-02-10 21:12:57 │ +└─────────────────────────────────┘ +``` + +Query: + +``` sql +SELECT parseDateTimeBestEffortUSOrNull('02-10-2021 21:12:57') +AS parseDateTimeBestEffortUSOrNull; +``` + +Result: + +``` text +┌─parseDateTimeBestEffortUSOrNull─┐ +│ 2021-02-10 21:12:57 │ +└─────────────────────────────────┘ +``` + +Query: + +``` sql +SELECT parseDateTimeBestEffortUSOrNull('02.10.2021 21:12:57') +AS parseDateTimeBestEffortUSOrNull; +``` + +Result: + +``` text +┌─parseDateTimeBestEffortUSOrNull─┐ +│ 2021-02-10 21:12:57 │ +└─────────────────────────────────┘ +``` + +Query: + +``` sql +SELECT parseDateTimeBestEffortUSOrNull('02.2021 21:12:57') +AS parseDateTimeBestEffortUSOrNull; +``` + +Result: + +``` text +┌─parseDateTimeBestEffortUSOrNull─┐ +│ ᴺᵁᴸᴸ │ +└─────────────────────────────────┘ +``` + +## parseDateTimeBestEffortUSOrZero {#parsedatetimebesteffortusorzero} + +Same as for [parseDateTimeBestEffortUS](#parsedatetimebesteffortUS) except that it returns zero date or zero date time when it encounters a date format that cannot be processed. + +**Syntax** + +``` sql +parseDateTimeBestEffortUSOrZero(time_string [, time_zone]); +``` + +**Parameters** + +- `time_string` — String containing a date and time to convert. [String](../../sql-reference/data-types/string.md). +- `time_zone` — Time zone. 
The function parses `time_string` according to the time zone. [String](../../sql-reference/data-types/string.md). + +**Supported non-standard formats** + +- A string containing 9..10 digit [unix timestamp](https://en.wikipedia.org/wiki/Unix_time). +- A string with a date and a time component: `YYYYMMDDhhmmss`, `MM/DD/YYYY hh:mm:ss`, `MM-DD-YY hh:mm`, `YYYY-MM-DD hh:mm:ss`, etc. +- A string with a date, but no time component: `YYYY`, `YYYYMM`, `YYYY*MM`, `MM/DD/YYYY`, `MM-DD-YY` etc. +- A string with a day and time: `DD`, `DD hh`, `DD hh:mm`. In this case, `YYYY-MM` are substituted as `2000-01`. +- A string that includes the date and time along with time zone offset information: `YYYY-MM-DD hh:mm:ss ±h:mm`, etc. For example, `2020-12-12 17:36:00 -5:00`. + +**Returned value** + +- `time_string` converted to the `DateTime` data type. +- `zero date time`. + +**Examples** + +Query: + +``` sql +SELECT parseDateTimeBestEffortUSOrZero('02/10/2021 21:12:57') +AS parseDateTimeBestEffortUSOrZero; +``` + +Result: + +``` text +┌─parseDateTimeBestEffortUSOrZero─┐ +│ 2021-02-10 21:12:57 │ +└─────────────────────────────────┘ +``` + +Query: + +``` sql +SELECT parseDateTimeBestEffortUSOrZero('02-10-2021 21:12:57') +AS parseDateTimeBestEffortUSOrZero; +``` + +Result: + +``` text +┌─parseDateTimeBestEffortUSOrZero─┐ +│ 2021-02-10 21:12:57 │ +└─────────────────────────────────┘ +``` + +Query: + +``` sql +SELECT parseDateTimeBestEffortUSOrZero('02.10.2021 21:12:57') +AS parseDateTimeBestEffortUS; +``` + +Result: + +``` text +┌─parseDateTimeBestEffortUSOrZero─┐ +│ 2021-02-10 21:12:57 │ +└─────────────────────────────────┘ +``` + +Query: + +``` sql +SELECT parseDateTimeBestEffortUSOrZero('02.2021 21:12:57') +AS parseDateTimeBestEffortUSOrZero; +``` + +Result: + +``` text +┌─parseDateTimeBestEffortUSOrZero─┐ +│ 1970-01-01 00:00:00 │ +└─────────────────────────────────┘ +``` + ## toLowCardinality {#tolowcardinality} Converts input parameter to the [LowCardianlity](../../sql-reference/data-types/lowcardinality.md) version of same data type. 
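A quick illustrative query for `toLowCardinality` described above (hypothetical output, assuming the usual behavior of `toTypeName`):

``` sql
SELECT toTypeName(toLowCardinality('hello')) AS type;
```

The `type` column should show `LowCardinality(String)`.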
From 15256d86e59613d36d13c93bbdec960ededcf81e Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Wed, 10 Feb 2021 23:30:40 +0300 Subject: [PATCH 0222/2357] better replica recovery and queue cleanup --- src/Common/ZooKeeper/IKeeper.cpp | 2 +- src/Common/ZooKeeper/ZooKeeper.cpp | 21 ++-- src/Common/ZooKeeper/ZooKeeper.h | 11 +- src/Databases/DatabaseOnDisk.cpp | 2 +- src/Databases/DatabaseOnDisk.h | 2 +- src/Databases/DatabaseReplicated.cpp | 109 +++++++++++++++--- src/Databases/DatabaseReplicated.h | 2 + src/Databases/DatabaseReplicatedWorker.cpp | 3 +- src/Databases/IDatabase.h | 2 +- .../MySQL/DatabaseConnectionMySQL.cpp | 6 +- src/Databases/MySQL/DatabaseConnectionMySQL.h | 4 +- src/Interpreters/DDLWorker.cpp | 87 ++++++-------- src/Interpreters/InterpreterDropQuery.cpp | 2 +- .../test_distributed_ddl/cluster.py | 4 +- 14 files changed, 165 insertions(+), 92 deletions(-) diff --git a/src/Common/ZooKeeper/IKeeper.cpp b/src/Common/ZooKeeper/IKeeper.cpp index ad18fdd992a..94fd291bd12 100644 --- a/src/Common/ZooKeeper/IKeeper.cpp +++ b/src/Common/ZooKeeper/IKeeper.cpp @@ -59,7 +59,7 @@ static void addRootPath(String & path, const String & root_path) throw Exception("Path cannot be empty", Error::ZBADARGUMENTS); if (path[0] != '/') - throw Exception("Path must begin with /", Error::ZBADARGUMENTS); + throw Exception("Path must begin with /, got " + path, Error::ZBADARGUMENTS); if (root_path.empty()) return; diff --git a/src/Common/ZooKeeper/ZooKeeper.cpp b/src/Common/ZooKeeper/ZooKeeper.cpp index 7a64609dc22..dc6abca6892 100644 --- a/src/Common/ZooKeeper/ZooKeeper.cpp +++ b/src/Common/ZooKeeper/ZooKeeper.cpp @@ -610,7 +610,7 @@ void ZooKeeper::removeChildren(const std::string & path) } -void ZooKeeper::removeChildrenRecursive(const std::string & path) +void ZooKeeper::removeChildrenRecursive(const std::string & path, const String & keep_child_node) { Strings children = getChildren(path); while (!children.empty()) @@ -619,14 +619,15 @@ void ZooKeeper::removeChildrenRecursive(const std::string & path) for (size_t i = 0; i < MULTI_BATCH_SIZE && !children.empty(); ++i) { removeChildrenRecursive(path + "/" + children.back()); - ops.emplace_back(makeRemoveRequest(path + "/" + children.back(), -1)); + if (likely(keep_child_node.empty() || keep_child_node != children.back())) + ops.emplace_back(makeRemoveRequest(path + "/" + children.back(), -1)); children.pop_back(); } multi(ops); } } -void ZooKeeper::tryRemoveChildrenRecursive(const std::string & path) +void ZooKeeper::tryRemoveChildrenRecursive(const std::string & path, const String & keep_child_node) { Strings children; if (tryGetChildren(path, children) != Coordination::Error::ZOK) @@ -637,14 +638,14 @@ void ZooKeeper::tryRemoveChildrenRecursive(const std::string & path) Strings batch; for (size_t i = 0; i < MULTI_BATCH_SIZE && !children.empty(); ++i) { - batch.push_back(path + "/" + children.back()); + String child_path = path + "/" + children.back(); + tryRemoveChildrenRecursive(child_path); + if (likely(keep_child_node.empty() || keep_child_node != children.back())) + { + batch.push_back(child_path); + ops.emplace_back(zkutil::makeRemoveRequest(child_path, -1)); + } children.pop_back(); - tryRemoveChildrenRecursive(batch.back()); - - Coordination::RemoveRequest request; - request.path = batch.back(); - - ops.emplace_back(std::make_shared(std::move(request))); } /// Try to remove the children with a faster method - in bulk. 
If this fails, diff --git a/src/Common/ZooKeeper/ZooKeeper.h b/src/Common/ZooKeeper/ZooKeeper.h index d532da10f2f..fbe1bede91a 100644 --- a/src/Common/ZooKeeper/ZooKeeper.h +++ b/src/Common/ZooKeeper/ZooKeeper.h @@ -184,6 +184,12 @@ public: /// result would be the same as for the single call. void tryRemoveRecursive(const std::string & path); + /// Similar to removeRecursive(...) and tryRemoveRecursive(...), but does not remove path itself. + /// If keep_child_node is not empty, this method will not remove path/keep_child_node (but will remove its subtree). + /// It can be useful to keep some child node as a flag which indicates that path is currently removing. + void removeChildrenRecursive(const std::string & path, const String & keep_child_node = {}); + void tryRemoveChildrenRecursive(const std::string & path, const String & keep_child_node = {}); + /// Remove all children nodes (non recursive). void removeChildren(const std::string & path); @@ -247,9 +253,6 @@ private: void init(const std::string & implementation_, const std::string & hosts_, const std::string & identity_, int32_t session_timeout_ms_, int32_t operation_timeout_ms_, const std::string & chroot_); - void removeChildrenRecursive(const std::string & path); - void tryRemoveChildrenRecursive(const std::string & path); - /// The following methods don't throw exceptions but return error codes. Coordination::Error createImpl(const std::string & path, const std::string & data, int32_t mode, std::string & path_created); Coordination::Error removeImpl(const std::string & path, int32_t version); @@ -328,7 +331,7 @@ public: catch (...) { ProfileEvents::increment(ProfileEvents::CannotRemoveEphemeralNode); - DB::tryLogCurrentException(__PRETTY_FUNCTION__); + DB::tryLogCurrentException(__PRETTY_FUNCTION__, "Cannot remove " + path + ": "); } } diff --git a/src/Databases/DatabaseOnDisk.cpp b/src/Databases/DatabaseOnDisk.cpp index a03cb33591c..195f57d1bda 100644 --- a/src/Databases/DatabaseOnDisk.cpp +++ b/src/Databases/DatabaseOnDisk.cpp @@ -311,7 +311,7 @@ void DatabaseOnDisk::commitCreateTable(const ASTCreateQuery & query, const Stora } } -void DatabaseOnDisk::detachTablePermanently(const String & table_name) +void DatabaseOnDisk::detachTablePermanently(const Context &, const String & table_name) { auto table = detachTable(table_name); diff --git a/src/Databases/DatabaseOnDisk.h b/src/Databases/DatabaseOnDisk.h index 60a50ac4539..fefe6e91606 100644 --- a/src/Databases/DatabaseOnDisk.h +++ b/src/Databases/DatabaseOnDisk.h @@ -41,7 +41,7 @@ public: const StoragePtr & table, const ASTPtr & query) override; - void detachTablePermanently(const String & table_name) override; + void detachTablePermanently(const Context & context, const String & table_name) override; void dropTable( const Context & context, diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index a3da271a597..0ac71793e5d 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -39,6 +39,8 @@ namespace ErrorCodes } static constexpr const char * DROPPED_MARK = "DROPPED"; +static constexpr const char * BROKEN_TABLE_PREFIX = "_broken_"; + zkutil::ZooKeeperPtr DatabaseReplicated::getZooKeeper() const { @@ -306,13 +308,76 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep if (new_replica && !empty()) throw Exception(ErrorCodes::LOGICAL_ERROR, "It's new replica, but database is not empty"); - if (!new_replica) - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Automatic replica 
recovery is not implemented"); - auto table_name_to_metadata = tryGetConsistentMetadataSnapshot(current_zookeeper, max_log_ptr); + Strings tables_to_detach; + size_t total_tables = 0; + auto existing_tables_it = getTablesIterator(global_context, [&](const String & name) { return !startsWith(name, BROKEN_TABLE_PREFIX); }); + while (existing_tables_it->isValid()) + { + String name = existing_tables_it->name(); + auto in_zk = table_name_to_metadata.find(name); + String local_metadata = readMetadataFile(name); + if (in_zk == table_name_to_metadata.end() || in_zk->second != local_metadata) + { + bool should_detach = true; + bool looks_like_replicated = in_zk->second.find("ReplicatedMergeTree") != std::string::npos; + + if (looks_like_replicated) + { + ParserCreateQuery parser; + auto size = global_context.getSettingsRef().max_query_size; + auto depth = global_context.getSettingsRef().max_parser_depth; + ASTPtr local_create = parseQuery(parser, local_metadata, size, depth); + ASTPtr zk_create = parseQuery(parser, in_zk->second, size, depth); + if (local_create->as()->uuid == zk_create->as()->uuid) + { + /// For ReplicatedMergeTree tables we can compare only UUIDs to ensure that it's tha same table. + /// Metadata can be different, it's handled on table replication level. + /// TODO maybe we should also compare MergeTree SETTINGS? + should_detach = false; + } + } + + if (should_detach) + tables_to_detach.emplace_back(std::move(name)); + } + existing_tables_it->next(); + ++total_tables; + } + + if (total_tables < tables_to_detach.size() * 2) + throw Exception(ErrorCodes::DATABASE_REPLICATION_FAILED, "Too many tables to detach: {} of {}", tables_to_detach.size(), total_tables); + else if (!tables_to_detach.empty()) + LOG_WARNING(log, "Will DETACH PERMANENTLY {} broken tables to recover replica", tables_to_detach.size()); + + auto db_guard = DatabaseCatalog::instance().getDDLGuard(getDatabaseName(), ""); + for (const auto & table_name : tables_to_detach) + { + String to_name = fmt::format("{}_{}_{}_{}", BROKEN_TABLE_PREFIX, table_name, max_log_ptr, thread_local_rng() % 1000); + DDLGuardPtr table_guard = DatabaseCatalog::instance().getDDLGuard(getDatabaseName(), std::min(table_name, to_name)); + DDLGuardPtr to_table_guard = DatabaseCatalog::instance().getDDLGuard(getDatabaseName(), std::max(table_name, to_name)); + + if (isDictionaryExist(table_name)) + { + /// TODO implement DETACH DICTIONARY PERMANENTLY + DatabaseAtomic::removeDictionary(global_context, table_name); + } + else + { + DatabaseAtomic::renameTable(global_context, table_name, *this, to_name, false, false); + DatabaseAtomic::detachTablePermanently(global_context, to_name); + } + } + for (const auto & name_and_meta : table_name_to_metadata) { + if (isTableExist(name_and_meta.first, global_context)) + { + assert(name_and_meta.second == readMetadataFile(name_and_meta.first)); + continue; + } + auto query_ast = parseQueryFromMetadataInZooKeeper(name_and_meta.first, name_and_meta.second); Context query_context = global_context; @@ -349,7 +414,7 @@ std::map DatabaseReplicated::tryGetConsistentMetadataSnapshot(co auto res = futures[i].get(); if (res.error != Coordination::Error::ZOK) break; - table_name_to_metadata.emplace(table_names[i], res.data); + table_name_to_metadata.emplace(unescapeForFileName(table_names[i]), res.data); } UInt32 new_max_log_ptr = parse(zookeeper->get(zookeeper_path + "/max_log_ptr")); @@ -451,18 +516,8 @@ void DatabaseReplicated::renameTable(const Context & context, const String & tab if (exchange && 
!to_database.isTableExist(to_table_name, context)) throw Exception(ErrorCodes::UNKNOWN_TABLE, "Table {} does not exist", to_table_name); - String statement; - String statement_to; - { - /// NOTE It's not atomic (however, we have only one thread) - ReadBufferFromFile in(getObjectMetadataPath(table_name), 4096); - readStringUntilEOF(statement, in); - if (exchange) - { - ReadBufferFromFile in_to(to_database.getObjectMetadataPath(to_table_name), 4096); - readStringUntilEOF(statement_to, in_to); - } - } + String statement = readMetadataFile(table_name); + String statement_to = readMetadataFile(to_table_name); String metadata_zk_path = txn->zookeeper_path + "/metadata/" + escapeForFileName(table_name); String metadata_zk_path_to = txn->zookeeper_path + "/metadata/" + escapeForFileName(to_table_name); txn->ops.emplace_back(zkutil::makeRemoveRequest(metadata_zk_path, -1)); @@ -481,6 +536,8 @@ void DatabaseReplicated::commitCreateTable(const ASTCreateQuery & query, const S const String & table_metadata_tmp_path, const String & table_metadata_path, const Context & query_context) { + if (startsWith(query.table, BROKEN_TABLE_PREFIX)) + throw Exception(ErrorCodes::INCORRECT_QUERY, "It's not allowed to attach broken tables"); auto txn = query_context.getMetadataTransaction(); assert(!ddl_worker->isCurrentlyActive() || txn); if (txn && txn->is_initial_query) @@ -533,4 +590,24 @@ void DatabaseReplicated::removeDictionary(const Context & context, const String DatabaseAtomic::removeDictionary(context, dictionary_name); } +void DatabaseReplicated::detachTablePermanently(const Context & context, const String & table_name) +{ + auto txn = context.getMetadataTransaction(); + assert(!ddl_worker->isCurrentlyActive() || txn); + if (txn && txn->is_initial_query) + { + String metadata_zk_path = zookeeper_path + "/metadata/" + escapeForFileName(table_name); + txn->ops.emplace_back(zkutil::makeRemoveRequest(metadata_zk_path, -1)); + } + DatabaseAtomic::detachTablePermanently(context, table_name); +} + +String DatabaseReplicated::readMetadataFile(const String & table_name) const +{ + String statement; + ReadBufferFromFile in(getObjectMetadataPath(table_name), 4096); + readStringUntilEOF(statement, in); + return statement; +} + } diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index fffc2b5c98a..2c998a8bc97 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -62,6 +62,7 @@ public: const String & dictionary_name, const ASTPtr & query) override; void removeDictionary(const Context & context, const String & dictionary_name) override; + void detachTablePermanently(const Context & context, const String & table_name) override; void drop(const Context & /*context*/) override; @@ -90,6 +91,7 @@ private: std::map tryGetConsistentMetadataSnapshot(const ZooKeeperPtr & zookeeper, UInt32 & max_log_ptr); ASTPtr parseQueryFromMetadataInZooKeeper(const String & node_name, const String & query); + String readMetadataFile(const String & table_name) const; String zookeeper_path; String shard_name; diff --git a/src/Databases/DatabaseReplicatedWorker.cpp b/src/Databases/DatabaseReplicatedWorker.cpp index 3162979e787..b29a8822c0c 100644 --- a/src/Databases/DatabaseReplicatedWorker.cpp +++ b/src/Databases/DatabaseReplicatedWorker.cpp @@ -24,13 +24,14 @@ DatabaseReplicatedDDLWorker::DatabaseReplicatedDDLWorker(DatabaseReplicated * db void DatabaseReplicatedDDLWorker::initializeMainThread() { - while (!initialized && !stop_flag) + while (!stop_flag) { try { auto 
zookeeper = getAndSetZooKeeper(); initializeReplication(); initialized = true; + return; } catch (...) { diff --git a/src/Databases/IDatabase.h b/src/Databases/IDatabase.h index fc821fcab30..3a196f827b7 100644 --- a/src/Databases/IDatabase.h +++ b/src/Databases/IDatabase.h @@ -249,7 +249,7 @@ public: /// Forget about the table without deleting it's data, but rename metadata file to prevent reloading it /// with next restart. The database may not support this method. - virtual void detachTablePermanently(const String & /*name*/) + virtual void detachTablePermanently(const Context & /*context*/, const String & /*name*/) { throw Exception("There is no DETACH TABLE PERMANENTLY query for Database" + getEngineName(), ErrorCodes::NOT_IMPLEMENTED); } diff --git a/src/Databases/MySQL/DatabaseConnectionMySQL.cpp b/src/Databases/MySQL/DatabaseConnectionMySQL.cpp index 35b016f255b..eeea12ae8f3 100644 --- a/src/Databases/MySQL/DatabaseConnectionMySQL.cpp +++ b/src/Databases/MySQL/DatabaseConnectionMySQL.cpp @@ -395,7 +395,7 @@ void DatabaseConnectionMySQL::loadStoredObjects(Context &, bool, bool /*force_at } } -void DatabaseConnectionMySQL::detachTablePermanently(const String & table_name) +void DatabaseConnectionMySQL::detachTablePermanently(const Context &, const String & table_name) { std::lock_guard lock{mutex}; @@ -429,9 +429,9 @@ void DatabaseConnectionMySQL::detachTablePermanently(const String & table_name) table_iter->second.second->is_dropped = true; } -void DatabaseConnectionMySQL::dropTable(const Context &, const String & table_name, bool /*no_delay*/) +void DatabaseConnectionMySQL::dropTable(const Context & context, const String & table_name, bool /*no_delay*/) { - detachTablePermanently(table_name); + detachTablePermanently(context, table_name); } DatabaseConnectionMySQL::~DatabaseConnectionMySQL() diff --git a/src/Databases/MySQL/DatabaseConnectionMySQL.h b/src/Databases/MySQL/DatabaseConnectionMySQL.h index 3e305fcb20d..d0a5c041d7b 100644 --- a/src/Databases/MySQL/DatabaseConnectionMySQL.h +++ b/src/Databases/MySQL/DatabaseConnectionMySQL.h @@ -72,9 +72,9 @@ public: StoragePtr detachTable(const String & table_name) override; - void detachTablePermanently(const String & table_name) override; + void detachTablePermanently(const Context & context, const String & table_name) override; - void dropTable(const Context &, const String & table_name, bool no_delay) override; + void dropTable(const Context & context, const String & table_name, bool no_delay) override; void attachTable(const String & table_name, const StoragePtr & storage, const String & relative_table_path) override; diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index efaacabf4de..975eaeaca1b 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -315,11 +315,10 @@ void DDLWorker::scheduleTasks() { /// Main thread of DDLWorker was restarted, probably due to lost connection with ZooKeeper. /// We have some unfinished tasks. To avoid duplication of some queries, try to write execution status. 
- bool status_written = task->ops.empty(); bool task_still_exists = zookeeper->exists(task->entry_path); + bool status_written = zookeeper->exists(task->getFinishedNodePath()); if (task->was_executed && !status_written && task_still_exists) { - assert(!zookeeper->exists(task->getFinishedNodePath())); processTask(*task); } } @@ -472,9 +471,16 @@ void DDLWorker::processTask(DDLTaskBase & task) String active_node_path = task.getActiveNodePath(); String finished_node_path = task.getFinishedNodePath(); - String dummy; - zookeeper->createAncestors(active_node_path); - auto active_node = zkutil::EphemeralNodeHolder::create(active_node_path, *zookeeper, ""); + auto create_active_res = zookeeper->tryCreate(active_node_path, {}, zkutil::CreateMode::Ephemeral); + if (create_active_res != Coordination::Error::ZOK) + { + if (create_active_res == Coordination::Error::ZNONODE) + throw Coordination::Exception(create_active_res, active_node_path); + createStatusDirs(task.entry_path, zookeeper); + zookeeper->create(active_node_path, {}, zkutil::CreateMode::Ephemeral); + + } + auto active_node = zkutil::EphemeralNodeHolder::existing(active_node_path, *zookeeper); if (!task.was_executed) { @@ -755,7 +761,6 @@ void DDLWorker::cleanupQueue(Int64, const ZooKeeperPtr & zookeeper) String node_name = *it; String node_path = fs::path(queue_dir) / node_name; - String lock_path = fs::path(node_path) / "lock"; Coordination::Stat stat; String dummy; @@ -769,39 +774,29 @@ void DDLWorker::cleanupQueue(Int64, const ZooKeeperPtr & zookeeper) if (!canRemoveQueueEntry(node_name, stat)) continue; - /// Skip if there are active nodes (it is weak guard) - if (zookeeper->exists(fs::path(node_path) / "active", &stat) && stat.numChildren > 0) + /// At first we remove entry/active node to prevent staled hosts from executing entry concurrently + auto rm_active_res = zookeeper->tryRemove(fs::path(node_path) / "active"); + if (rm_active_res != Coordination::Error::ZOK && rm_active_res != Coordination::Error::ZNONODE) { - LOG_INFO(log, "Task {} should be deleted, but there are active workers. Skipping it.", node_name); - continue; - } - - /// Usage of the lock is not necessary now (tryRemoveRecursive correctly removes node in a presence of concurrent cleaners) - /// But the lock will be required to implement system.distributed_ddl_queue table - auto lock = createSimpleZooKeeperLock(zookeeper, node_path, "lock", host_fqdn_id); - if (!lock->tryLock()) - { - LOG_INFO(log, "Task {} should be deleted, but it is locked. Skipping it.", node_name); + if (rm_active_res == Coordination::Error::ZNOTEMPTY) + LOG_DEBUG(log, "Task {} should be deleted, but there are active workers. 
Skipping it.", node_name); + else + LOG_WARNING(log, "Unexpected status code {} on attempt to remove {}/active", rm_active_res, node_name); continue; } + /// Now we can safely delete entry LOG_INFO(log, "Task {} is outdated, deleting it", node_name); - /// Deleting - { - Strings children = zookeeper->getChildren(node_path); - for (const String & child : children) - { - if (child != "lock") - zookeeper->tryRemoveRecursive(fs::path(node_path) / child); - } + /// We recursively delete all nodes except entry/finished to prevent staled hosts from + /// creating entry/active node (see createStatusDirs(...)) + zookeeper->tryRemoveChildrenRecursive(node_path, "finished"); - /// Remove the lock node and its parent atomically - Coordination::Requests ops; - ops.emplace_back(zkutil::makeRemoveRequest(lock_path, -1)); - ops.emplace_back(zkutil::makeRemoveRequest(node_path, -1)); - zookeeper->multi(ops); - } + /// And then we remove entry and entry/finished in a single transaction + Coordination::Requests ops; + ops.emplace_back(zkutil::makeRemoveRequest(fs::path(node_path) / "finished", -1)); + ops.emplace_back(zkutil::makeRemoveRequest(node_path, -1)); + zookeeper->multi(ops); } catch (...) { @@ -819,7 +814,7 @@ bool DDLWorker::canRemoveQueueEntry(const String & entry_name, const Coordinatio /// If too many nodes in task queue (> max_tasks_in_queue), delete oldest one UInt32 entry_number = DDLTaskBase::getLogEntryNumber(entry_name); - bool node_is_outside_max_window = entry_number < max_id.load(std::memory_order_relaxed) - max_tasks_in_queue; + bool node_is_outside_max_window = entry_number + max_tasks_in_queue < max_id.load(std::memory_order_relaxed); return node_lifetime_is_expired || node_is_outside_max_window; } @@ -828,21 +823,17 @@ bool DDLWorker::canRemoveQueueEntry(const String & entry_name, const Coordinatio void DDLWorker::createStatusDirs(const std::string & node_path, const ZooKeeperPtr & zookeeper) { Coordination::Requests ops; - { - Coordination::CreateRequest request; - request.path = fs::path(node_path) / "active"; - ops.emplace_back(std::make_shared(std::move(request))); - } - { - Coordination::CreateRequest request; - request.path = fs::path(node_path) / "finished"; - ops.emplace_back(std::make_shared(std::move(request))); - } + ops.emplace_back(zkutil::makeCreateRequest(fs::path(node_path) / "active", {}, zkutil::CreateMode::Persistent)); + ops.emplace_back(zkutil::makeCreateRequest(fs::path(node_path) / "finished", {}, zkutil::CreateMode::Persistent)); + Coordination::Responses responses; Coordination::Error code = zookeeper->tryMulti(ops, responses); - if (code != Coordination::Error::ZOK - && code != Coordination::Error::ZNODEEXISTS) - throw Coordination::Exception(code); + bool both_created = code == Coordination::Error::ZOK; + bool both_already_exists = responses.size() == 2 && responses[0]->error == Coordination::Error::ZNODEEXISTS + && responses[1]->error == Coordination::Error::ZNODEEXISTS; + if (both_created || both_already_exists) + return; + throw Coordination::Exception(code); } @@ -877,8 +868,6 @@ String DDLWorker::enqueueQuery(DDLLogEntry & entry) void DDLWorker::initializeMainThread() { assert(!initialized); - assert(max_id == 0); - assert(current_tasks.empty()); setThreadName("DDLWorker"); LOG_DEBUG(log, "Started DDLWorker thread"); @@ -896,7 +885,7 @@ void DDLWorker::initializeMainThread() if (!Coordination::isHardwareError(e.code)) { /// A logical error. - LOG_ERROR(log, "ZooKeeper error: {}. 
Failed to start DDLWorker.",getCurrentExceptionMessage(true)); + LOG_ERROR(log, "ZooKeeper error: {}. Failed to start DDLWorker.", getCurrentExceptionMessage(true)); assert(false); /// Catch such failures in tests with debug build } diff --git a/src/Interpreters/InterpreterDropQuery.cpp b/src/Interpreters/InterpreterDropQuery.cpp index ae76e8efd46..9e63c647f71 100644 --- a/src/Interpreters/InterpreterDropQuery.cpp +++ b/src/Interpreters/InterpreterDropQuery.cpp @@ -162,7 +162,7 @@ BlockIO InterpreterDropQuery::executeToTableImpl(const ASTDropQuery & query, Dat if (query.permanently) { /// Drop table from memory, don't touch data, metadata file renamed and will be skipped during server restart - database->detachTablePermanently(table_id.table_name); + database->detachTablePermanently(context, table_id.table_name); } else { diff --git a/tests/integration/test_distributed_ddl/cluster.py b/tests/integration/test_distributed_ddl/cluster.py index 811eb94bad4..45a159ed2b9 100644 --- a/tests/integration/test_distributed_ddl/cluster.py +++ b/tests/integration/test_distributed_ddl/cluster.py @@ -104,8 +104,8 @@ class ClickHouseClusterWithDDLHelpers(ClickHouseCluster): def ddl_check_there_are_no_dublicates(instance): query = "SELECT max(c), argMax(q, c) FROM (SELECT lower(query) AS q, count() AS c FROM system.query_log WHERE type=2 AND q LIKE '/* ddl_entry=query-%' GROUP BY query)" rows = instance.query(query) - assert len(rows) > 0 and rows[0][0] == "1", "dublicates on {} {}, query {}".format(instance.name, - instance.ip_address, query) + assert len(rows) > 0 and rows[0][0] == "1", "dublicates on {} {}: {}".format(instance.name, + instance.ip_address, rows) @staticmethod def insert_reliable(instance, query_insert): From 537b372c32732ddecc9a5f7414c23ea1722ec2fc Mon Sep 17 00:00:00 2001 From: Dmitriy Date: Thu, 11 Feb 2021 00:16:23 +0300 Subject: [PATCH 0223/2357] Update type-conversion-functions.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Исправил null на NULL. --- docs/en/sql-reference/functions/type-conversion-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 2116e55e3ef..f752bb9f6cb 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -691,7 +691,7 @@ Same as for [parseDateTimeBestEffort](#parsedatetimebesteffort) except that it r ## parseDateTimeBestEffortUSOrNull {#parsedatetimebesteffortusornull} -Same as for [parseDateTimeBestEffortUS](#parsedatetimebesteffortUS) except that it returns null when it encounters a date format that cannot be processed. +Same as for [parseDateTimeBestEffortUS](#parsedatetimebesteffortUS) except that it returns `NULL` when it encounters a date format that cannot be processed. **Syntax** From 59752cbf27104d76fa7a0c9b669f5dbe3b423c3e Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Thu, 11 Feb 2021 01:50:13 +0300 Subject: [PATCH 0224/2357] Update type-conversion-functions.md Fix changes from EN review. 
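A minimal sketch of the parseDateTimeBestEffortUSOrNull behaviour described in the patch just above (it returns `NULL` rather than throwing on input it cannot parse); the literal inputs are arbitrary examples, not part of the patch:

```sql
SELECT
    parseDateTimeBestEffortUSOrNull('02/10/2021 21:12:57') AS parsed,  -- US-style MM/DD/YYYY input is parsed
    parseDateTimeBestEffortUSOrNull('not a date at all')   AS failed;  -- unparseable input yields NULL, not an exception
```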
--- .../functions/type-conversion-functions.md | 50 ++++++++++++++++--- 1 file changed, 44 insertions(+), 6 deletions(-) diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index d95a5279716..3a6d2bd9ca0 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -423,8 +423,11 @@ SELECT uuid = uuid2; ## CAST(x, T) {#type_conversion_function-cast} -Преобразует x в тип данных t. -Поддерживается также синтаксис CAST(x AS t). +Преобразует входное значение `x` в указанный тип данных `T`. + +Поддерживается также синтаксис `CAST(x AS t)`. + +Обратите внимание, что если значение `x` не соответствует границам типа `T`, происходит переполнение. Например, `CAST(-1, 'UInt8')` возвращает 255. **Пример** @@ -487,9 +490,44 @@ SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null; - Настройка [cast_keep_nullable](../../operations/settings/settings.md#cast_keep_nullable) +## accurateCast(x, T) {#type_conversion_function-accurate-cast} + +Преобразует входное значение `x` в указанный тип данных `T`. + +Отличие от [cast(x, T)](#type_conversion_function-cast) в том, что `accurateCast` не допускает переполнения числовых типов, если значение типа `x` не соответствует границам типа `T`. Например, `accurateCast(-1, 'UInt8')` вернет ошибку. + +**Примеры** + +Запрос: + +``` sql +SELECT cast(-1, 'UInt8') as uint8; +``` + +Результат: + +``` text +┌─uint8─┐ +│ 255 │ +└───────┘ +``` + +Запрос: + +```sql +SELECT accurateCast(-1, 'UInt8') as uint8; +``` + +Результат: + +``` text +Code: 70. DB::Exception: Received from localhost:9000. DB::Exception: Value in column Int8 cannot be safely converted into type UInt8: While processing accurateCast(-1, 'UInt8') AS uint8. +``` + ## accurateCastOrNull(x, T) {#type_conversion_function-accurate-cast_or_null} -Преобразует входное значение `x` в указанный тип данных `T`. Всегда возвращает тип [Nullable](../../sql-reference/data-types/nullable.md), и возвращает [NULL](../../sql-reference/syntax.md#null-literal), если приведенное значение не может быть представлено в целевом типе. +Преобразует входное значение `x` в указанный тип данных `T`. + +Всегда возвращает тип [Nullable](../../sql-reference/data-types/nullable.md), и возвращает [NULL](../../sql-reference/syntax.md#null-literal), если приведенное значение не может быть представлено в целевом типе.
**Синтаксис** @@ -522,9 +560,9 @@ SELECT toTypeName(accurateCastOrNull(5, 'UInt8')); ``` sql SELECT - cast(-1, 'UInt8') as uint8, - cast(128, 'Int8') as int8, - cast('Test', 'FixedString(2)') as fixed_string; + accurateCastOrNull(-1, 'UInt8') as uint8, + accurateCastOrNull(128, 'Int8') as int8, + accurateCastOrNull('Test', 'FixedString(2)') as fixed_string; ``` Результат: From d4580f9fb4b18d4bb9ec1e2870a8d35db06fa6ef Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Thu, 11 Feb 2021 01:51:19 +0300 Subject: [PATCH 0225/2357] Update type-conversion-functions.md --- .../sql-reference/functions/type-conversion-functions.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 83cbad6f53b..b452adbde60 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -427,7 +427,12 @@ Result: ## CAST(x, T) {#type_conversion_function-cast} -Converts input value `x` to the `T` data type. The syntax `CAST(x AS t)` is also supported. +Converts input value `x` to the `T` data type. + +The syntax `CAST(x AS t)` is also supported. + +Note, that if value `x` does not fit the bounds of type T, the function overflows. For example, CAST(-1, 'UInt8') returns 255. + **Example** From d4bd82c6c98eb2c4942ce80a42a8f543fd3865e9 Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Thu, 11 Feb 2021 01:56:12 +0300 Subject: [PATCH 0226/2357] Update in.md Updates in IN from EN comments. --- docs/ru/sql-reference/operators/in.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/operators/in.md b/docs/ru/sql-reference/operators/in.md index d86d6f9ec57..c2d88a729be 100644 --- a/docs/ru/sql-reference/operators/in.md +++ b/docs/ru/sql-reference/operators/in.md @@ -17,7 +17,8 @@ SELECT (CounterID, UserID) IN ((34, 123), (101500, 456)) FROM ... В качестве правой части оператора может быть множество константных выражений, множество кортежей с константными выражениями (показано в примерах выше), а также имя таблицы или подзапрос SELECT в скобках. -ClickHouse допускает различные типы внутри подзапроса `IN`. Для левой стороны он применяет преобразование к типу правой стороны с помощью [accurateCastOrNull](../functions/type-conversion-functions.md#type_conversion_function-accurate-cast_or_null). +ClickHouse допускает различные типы в левой и правой частях подзапроса `IN`. +В этом случае он преобразует левую сторону в тип правой стороны, применяя функцию [accurateCastOrNull](../functions/type-conversion-functions.md#type_conversion_function-accurate-cast_or_null). **Пример** From 60f9f2e913fed325c4747fecbe0e1291265bc666 Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Thu, 11 Feb 2021 02:03:23 +0300 Subject: [PATCH 0227/2357] Update type-conversion-functions.md Add Returned values --- docs/en/sql-reference/functions/type-conversion-functions.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index b452adbde60..268a7565b81 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -544,6 +544,10 @@ accurateCastOrNull(x, T) - `x` — Input value. - `T` — The name of the returned data type. +**Returned value** + +- The value in specified data type `T`. 
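Before the patch's own example section resumes below, a compact side-by-side sketch of how the casts documented above differ on an out-of-range value; the behaviour follows directly from the documentation text (overflow for CAST, NULL for accurateCastOrNull), and accurateCast is kept out of the combined query because it would abort it with an exception:

```sql
SELECT
    CAST(-1, 'UInt8')               AS plain_cast,  -- out-of-range value wraps around to 255
    accurateCastOrNull(-1, 'UInt8') AS safe_cast;   -- returns NULL instead of overflowing

-- accurateCast(-1, 'UInt8') is deliberately not in the same SELECT:
-- it throws "Value ... cannot be safely converted into type UInt8" and would fail the whole query.
```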
+ **Example** Query: From 37979c8b87d4747816446b1939248911a40ea081 Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Thu, 11 Feb 2021 02:03:36 +0300 Subject: [PATCH 0228/2357] Update type-conversion-functions.md Add Returned values --- docs/ru/sql-reference/functions/type-conversion-functions.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index 3a6d2bd9ca0..e16fa438aed 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -540,6 +540,10 @@ accurateCastOrNull(x, T) - `x` — входное значение. - `T` — имя возвращаемого типа данных. +**Возвращаемое значение** + +- Значение, преобразованное в указанный тип `T`. + **Примеры** Запрос: From 3feded8d0cb562b7d0ed7a8c4bd4939f2524301c Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Thu, 11 Feb 2021 02:03:53 +0300 Subject: [PATCH 0229/2357] Create type-conversion-functions.md Add Returned values From e9586cc44e170090b8faf474c5f76465b60daaa5 Mon Sep 17 00:00:00 2001 From: bharatnc Date: Wed, 10 Feb 2021 19:13:19 -0800 Subject: [PATCH 0230/2357] Document ALTER RENAME Column --- .../en/sql-reference/statements/alter/column.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/docs/en/sql-reference/statements/alter/column.md b/docs/en/sql-reference/statements/alter/column.md index 0ea4d4b3dc5..5933cb8bce9 100644 --- a/docs/en/sql-reference/statements/alter/column.md +++ b/docs/en/sql-reference/statements/alter/column.md @@ -24,6 +24,7 @@ The following actions are supported: - [COMMENT COLUMN](#alter_comment-column) — Adds a text comment to the column. - [MODIFY COLUMN](#alter_modify-column) — Changes column’s type, default expression and TTL. - [MODIFY COLUMN REMOVE](#modify-remove) — Removes one of the column properties. +- [RENAME COLUMN](#alter_rename-column) — Renames an existing column. These actions are described in detail below. @@ -166,6 +167,22 @@ ALTER TABLE table_with_ttl MODIFY COLUMN column_ttl REMOVE TTL; - [REMOVE TTL](ttl.md). +## RENAME COLUMN {#alter_rename-column} + +Renames an existing column. + +Syntax: + +```sql +ALTER TABLE table_name RENAME COLUMN column_name TO new_column_name; +``` + +**Example** + +```sql +ALTER TABLE table_with_ttl RENAME COLUMN column_ttl TO column_ttl_new; +``` + ## Limitations {#alter-query-limitations} The `ALTER` query lets you create and delete separate elements (columns) in nested data structures, but not whole nested data structures. To add a nested data structure, you can add columns with a name like `name.nested_name` and the type `Array(T)`. A nested data structure is equivalent to multiple array columns with a name that has the same prefix before the dot. From b574d8331b2cd6c2cd8dfe7d36ad8257b392db83 Mon Sep 17 00:00:00 2001 From: George Date: Thu, 11 Feb 2021 11:46:31 +0300 Subject: [PATCH 0231/2357] Updated description --- .../functions/tuple-map-functions.md | 97 ++++++++++++++++++- 1 file changed, 96 insertions(+), 1 deletion(-) diff --git a/docs/en/sql-reference/functions/tuple-map-functions.md b/docs/en/sql-reference/functions/tuple-map-functions.md index 50015cd996e..d3503937af2 100644 --- a/docs/en/sql-reference/functions/tuple-map-functions.md +++ b/docs/en/sql-reference/functions/tuple-map-functions.md @@ -137,9 +137,104 @@ Type: [UInt8](../../sql-reference/data-types/int-uint.md). 
Query: +```sql +CREATE TABLE test (a Map(String,String)) ENGINE = Memory; -## mapKeys {#mapKeys} +INSERT INTO test VALUES ({'name':'eleven','age':'11'}), ({'number':'twelve','position':'6.0'}); + +SELECT mapContains(a, 'name') FROM test; + +``` + +Result: + +```text +┌─mapContains(a, 'name')─┐ +│ 1 │ +│ 0 │ +└────────────────────────┘ +``` + +## mapKeys {#mapkeys} + +Returns all the keys from `map` parameter. + +**Syntax** + +```sql +mapKeys(map) +``` + +**Parameters** + +- `map`- Map. + +**Returned value** + +- Array containing all the keys from `map`. + +Type: [Array](../../sql-reference/data-types/array.md). + +**Example** + +Query: + +```sql +CREATE TABLE test (a Map(String,String)) ENGINE = Memory; + +INSERT INTO test VALUES ({'name':'eleven','age':'11'}), ({'number':'twelve','position':'6.0'}); + +SELECT mapKeys(a) FROM test; +``` + +Result: + +```text +┌─mapKeys(a)────────────┐ +│ ['name','age'] │ +│ ['number','position'] │ +└───────────────────────┘ +``` ## mapValues {#mapvalues} +Returns all the values from `map` parameter. + +**Syntax** + +```sql +mapKeys(map) +``` + +**Parameters** + +- `map`- Map. + +**Returned value** + +- Array containing all the values from `map`. + +Type: [Array](../../sql-reference/data-types/array.md). + +**Example** + +Query: + +```sql +CREATE TABLE test (a Map(String,String)) ENGINE = Memory; + +INSERT INTO test VALUES ({'name':'eleven','age':'11'}), ({'number':'twelve','position':'6.0'}); + +SELECT mapValues(a) FROM test; +``` + +Result: + +```text +┌─mapValues(a)─────┐ +│ ['eleven','11'] │ +│ ['twelve','6.0'] │ +└──────────────────┘ +``` + [Original article](https://clickhouse.tech/docs/en/query_language/functions/tuple-map-functions/) From 3a020d2dd5c4ffda10fb4dd79509f5e04f45e692 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Thu, 11 Feb 2021 11:49:12 +0300 Subject: [PATCH 0232/2357] filter push down for Aggregating --- src/Processors/QueryPlan/AggregatingStep.h | 2 + .../QueryPlan/Optimizations/Optimizations.h | 7 +- .../Optimizations/filterPushDown.cpp | 77 +++++++++++++++++++ 3 files changed, 85 insertions(+), 1 deletion(-) create mode 100644 src/Processors/QueryPlan/Optimizations/filterPushDown.cpp diff --git a/src/Processors/QueryPlan/AggregatingStep.h b/src/Processors/QueryPlan/AggregatingStep.h index 853173895b3..6be92394fab 100644 --- a/src/Processors/QueryPlan/AggregatingStep.h +++ b/src/Processors/QueryPlan/AggregatingStep.h @@ -32,6 +32,8 @@ public: void describeActions(FormatSettings &) const override; void describePipeline(FormatSettings & settings) const override; + const Aggregator::Params & getParams() const { return params; } + private: Aggregator::Params params; bool final; diff --git a/src/Processors/QueryPlan/Optimizations/Optimizations.h b/src/Processors/QueryPlan/Optimizations/Optimizations.h index 454eab9649a..be7f81e5db0 100644 --- a/src/Processors/QueryPlan/Optimizations/Optimizations.h +++ b/src/Processors/QueryPlan/Optimizations/Optimizations.h @@ -38,14 +38,19 @@ size_t trySplitFilter(QueryPlan::Node * node, QueryPlan::Nodes & nodes); /// Replace chain `FilterStep -> ExpressionStep` to single FilterStep size_t tryMergeExpressions(QueryPlan::Node * parent_node, QueryPlan::Nodes &); +/// Move FilterStep down if possible. +/// May split FilterStep and push down only part of it. 
+size_t tryPushDownLimit(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes); + inline const auto & getOptimizations() { - static const std::array optimizations = + static const std::array optimizations = {{ {tryLiftUpArrayJoin, "liftUpArrayJoin"}, {tryPushDownLimit, "pushDownLimit"}, {trySplitFilter, "splitFilter"}, {tryMergeExpressions, "mergeExpressions"}, + {tryPushDownLimit, "pushDownFilter"}, }}; return optimizations; diff --git a/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp b/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp new file mode 100644 index 00000000000..82704bcbce9 --- /dev/null +++ b/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp @@ -0,0 +1,77 @@ +#include +#include +#include +#include +#include +#include +#include + +namespace DB::ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + +namespace DB::QueryPlanOptimizations +{ + +size_t tryPushDownLimit(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes) +{ + if (parent_node->children.size() != 1) + return 0; + + QueryPlan::Node * child_node = parent_node->children.front(); + + auto & parent = parent_node->step; + auto & child = child_node->step; + auto * filter = typeid_cast(parent.get()); + + if (!filter) + return 0; + + const auto & expression = filter->getExpression(); + const auto & filter_column_name = filter->getFilterColumnName(); + bool removes_filter = filter->removesFilterColumn(); + + if (auto * aggregating = typeid_cast(child.get())) + { + const auto & params = aggregating->getParams(); + + Names keys; + keys.reserve(params.keys.size()); + for (auto pos : params.keys) + keys.push_back(params.src_header.getByPosition(pos).name); + + if (auto split_filter = expression->splitActionsForFilter(filter_column_name, removes_filter, keys)) + { + auto it = expression->getIndex().find(filter_column_name); + if (it == expression->getIndex().end()) + { + if (!removes_filter) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Filter column {} was removed from ActionsDAG but it is needed in result. DAG:\n{}", + filter_column_name, expression->dumpDAG()); + + parent = std::make_unique(child->getOutputStream(), expression); + } + + /// Add new Filter step before Aggregating. + /// Expression/Filter -> Aggregating -> Something + auto & node = nodes.emplace_back(); + node.children.swap(child_node->children); + child_node->children.emplace_back(&node); + /// Expression/Filter -> Aggregating -> Filter -> Something + + /// New filter column is added to the end. + auto split_filter_column_name = (*split_filter->getIndex().rbegin())->result_name; + node.step = std::make_unique( + node.children.at(0)->step->getOutputStream(), + std::move(split_filter), std::move(split_filter_column_name), true); + + return 3; + } + } + + return 0; +} + +} From 8b4d9e421a1037f132f8c6511b92ee1a3a21580b Mon Sep 17 00:00:00 2001 From: George Date: Thu, 11 Feb 2021 12:21:59 +0300 Subject: [PATCH 0233/2357] Added translation --- .../functions/tuple-map-functions.md | 4 +- .../functions/tuple-map-functions.md | 127 +++++++++++++++++- 2 files changed, 128 insertions(+), 3 deletions(-) diff --git a/docs/en/sql-reference/functions/tuple-map-functions.md b/docs/en/sql-reference/functions/tuple-map-functions.md index d3503937af2..a08ca70e851 100644 --- a/docs/en/sql-reference/functions/tuple-map-functions.md +++ b/docs/en/sql-reference/functions/tuple-map-functions.md @@ -124,7 +124,7 @@ mapContains(map, key) **Parameters** -- `map` — Map. [Type name](relative/path/to/type/dscr.md#type). +- `map` — Map. - `key` — Key. 
Type matches the type of `map.keys`. **Returned value** @@ -237,4 +237,4 @@ Result: └──────────────────┘ ``` -[Original article](https://clickhouse.tech/docs/en/query_language/functions/tuple-map-functions/) +[Original article](https://clickhouse.tech/docs/en/sql-reference/functions/tuple-map-functions/) diff --git a/docs/ru/sql-reference/functions/tuple-map-functions.md b/docs/ru/sql-reference/functions/tuple-map-functions.md index a2b25e68fe5..6461412aec5 100644 --- a/docs/ru/sql-reference/functions/tuple-map-functions.md +++ b/docs/ru/sql-reference/functions/tuple-map-functions.md @@ -116,4 +116,129 @@ select mapPopulateSeries([1,2,4], [11,22,44], 5) as res, toTypeName(res) as type └──────────────────────────────┴───────────────────────────────────┘ ``` -[Оригинальная статья](https://clickhouse.tech/docs/en/query_language/functions/tuple-map-functions/) +## mapContains {#mapcontains} + +Определяет, включает ли в себя `map.keys` параметр `key`. + +**Синтаксис** + +``` sql +mapContains(map, key) +``` + +**Параметры** + +- `map` — Map. +- `key` — ключ. Тип соответстует типу `map.keys`. + +**Возвращаемое значение** + +- `1` если `map.keys` включает `key`, иначе `0`. + +Тип: [UInt8](../../sql-reference/data-types/int-uint.md). + +**Пример** + +Запрос: + +```sql +CREATE TABLE test (a Map(String,String)) ENGINE = Memory; + +INSERT INTO test VALUES ({'name':'eleven','age':'11'}), ({'number':'twelve','position':'6.0'}); + +SELECT mapContains(a, 'name') FROM test; + +``` + +Результат: + +```text +┌─mapContains(a, 'name')─┐ +│ 1 │ +│ 0 │ +└────────────────────────┘ +``` + +## mapKeys {#mapkeys} + +Возвращает все ключи контейнера `map`. + +**Синтаксис** + +```sql +mapKeys(map) +``` + +**Параметры** + +- `map`- map. + +**Возвращаемое значение** + +- Массив со всеми ключами контейнера `map`. + +Тип: [Array](../../sql-reference/data-types/array.md). + +**Пример** + +Запрос: + +```sql +CREATE TABLE test (a Map(String,String)) ENGINE = Memory; + +INSERT INTO test VALUES ({'name':'eleven','age':'11'}), ({'number':'twelve','position':'6.0'}); + +SELECT mapKeys(a) FROM test; +``` + +Результат: + +```text +┌─mapKeys(a)────────────┐ +│ ['name','age'] │ +│ ['number','position'] │ +└───────────────────────┘ +``` + +## mapValues {#mapvalues} + +Возвращает все значения контейнера `map`. + +**Синтаксис** + +```sql +mapKeys(map) +``` + +**Параметры** + +- `map`- map. + +**Возвращаемое значение** + +- Массив со всеми значениями `map`. + +Тип: [Array](../../sql-reference/data-types/array.md). 
+ +**Примеры** + +Запрос: + +```sql +CREATE TABLE test (a Map(String,String)) ENGINE = Memory; + +INSERT INTO test VALUES ({'name':'eleven','age':'11'}), ({'number':'twelve','position':'6.0'}); + +SELECT mapValues(a) FROM test; +``` + +Результат: + +```text +┌─mapValues(a)─────┐ +│ ['eleven','11'] │ +│ ['twelve','6.0'] │ +└──────────────────┘ +``` + +[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/functions/tuple-map-functions/) From 48b8685d6ef0e690ee7055f0ba1812fa8dfa50e1 Mon Sep 17 00:00:00 2001 From: George Date: Thu, 11 Feb 2021 12:46:14 +0300 Subject: [PATCH 0234/2357] minor fixes --- docs/en/sql-reference/functions/tuple-map-functions.md | 4 ++-- docs/ru/sql-reference/functions/tuple-map-functions.md | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/en/sql-reference/functions/tuple-map-functions.md b/docs/en/sql-reference/functions/tuple-map-functions.md index a08ca70e851..f8755f1e2a9 100644 --- a/docs/en/sql-reference/functions/tuple-map-functions.md +++ b/docs/en/sql-reference/functions/tuple-map-functions.md @@ -167,7 +167,7 @@ mapKeys(map) **Parameters** -- `map`- Map. +- `map` — Map. **Returned value** @@ -208,7 +208,7 @@ mapKeys(map) **Parameters** -- `map`- Map. +- `map` — Map. **Returned value** diff --git a/docs/ru/sql-reference/functions/tuple-map-functions.md b/docs/ru/sql-reference/functions/tuple-map-functions.md index 6461412aec5..22bf1e98369 100644 --- a/docs/ru/sql-reference/functions/tuple-map-functions.md +++ b/docs/ru/sql-reference/functions/tuple-map-functions.md @@ -128,7 +128,7 @@ mapContains(map, key) **Параметры** -- `map` — Map. +- `map` — контейнер map. - `key` — ключ. Тип соответстует типу `map.keys`. **Возвращаемое значение** @@ -171,7 +171,7 @@ mapKeys(map) **Параметры** -- `map`- map. +- `map` — контейнер map. **Возвращаемое значение** @@ -212,7 +212,7 @@ mapKeys(map) **Параметры** -- `map`- map. +- `map` — контейнер map. 
**Возвращаемое значение** From e24b8e8a13ecea65e9d35e53cbe1a7fa44917680 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Thu, 11 Feb 2021 15:06:28 +0300 Subject: [PATCH 0235/2357] Fix ActionsDAG::splitActionsForFilter --- src/Interpreters/ActionsDAG.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Interpreters/ActionsDAG.cpp b/src/Interpreters/ActionsDAG.cpp index eb1ff9ad998..cd3a2853687 100644 --- a/src/Interpreters/ActionsDAG.cpp +++ b/src/Interpreters/ActionsDAG.cpp @@ -1311,6 +1311,8 @@ ActionsDAGPtr ActionsDAG::splitActionsForFilter(const std::string & filter_name, actions->inputs.emplace_back(&node); actions->index.insert(&node); } + + stack.pop(); } } } From d539948fe72f3ee7c7e90a49cdffbc93d0a3749c Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 7 Feb 2021 04:41:31 +0300 Subject: [PATCH 0236/2357] In memory compression: a prototype --- src/Columns/ColumnVector.cpp | 51 ++++++++++++++++++ src/Columns/ColumnVector.h | 2 + src/Columns/IColumn.h | 11 ++++ src/Storages/MemorySettings.cpp | 36 +++++++++++++ src/Storages/MemorySettings.h | 26 +++++++++ src/Storages/StorageMemory.cpp | 96 +++++++++++++++++++++------------ src/Storages/StorageMemory.h | 16 +++++- src/Storages/StorageSet.cpp | 11 ++-- 8 files changed, 207 insertions(+), 42 deletions(-) create mode 100644 src/Storages/MemorySettings.cpp create mode 100644 src/Storages/MemorySettings.h diff --git a/src/Columns/ColumnVector.cpp b/src/Columns/ColumnVector.cpp index a075c10a8a9..59c8b5cf33b 100644 --- a/src/Columns/ColumnVector.cpp +++ b/src/Columns/ColumnVector.cpp @@ -16,6 +16,9 @@ #include #include #include +#include +#include +#include #include #include @@ -32,6 +35,8 @@ namespace ErrorCodes extern const int PARAMETER_OUT_OF_BOUND; extern const int SIZES_OF_COLUMNS_DOESNT_MATCH; extern const int LOGICAL_ERROR; + extern const int CANNOT_COMPRESS; + extern const int CANNOT_DECOMPRESS; } template @@ -520,6 +525,52 @@ void ColumnVector::getExtremes(Field & min, Field & max) const max = NearestFieldType(cur_max); } + +#pragma GCC diagnostic ignored "-Wold-style-cast" + +template +LazyColumn ColumnVector::compress() const +{ + size_t source_size = data.size() * sizeof(T); + size_t max_dest_size = LZ4_COMPRESSBOUND(source_size); + + if (max_dest_size > std::numeric_limits::max()) + throw Exception(ErrorCodes::CANNOT_COMPRESS, "Cannot compress column of size {}", formatReadableSizeWithBinarySuffix(source_size)); + + auto compressed = std::make_shared>(max_dest_size); + + auto compressed_size = LZ4_compress_default( + reinterpret_cast(data.data()), + compressed->data(), + source_size, + max_dest_size); + + if (compressed_size <= 0) + throw Exception(ErrorCodes::CANNOT_COMPRESS, "Cannot compress column"); + + /// If compression is inefficient. + if (static_cast(compressed_size) * 2 > source_size) + return IColumn::compress(); + + /// Shrink to fit. + auto shrank = std::make_shared>(compressed_size); + memcpy(shrank->data(), compressed->data(), compressed_size); + + return [compressed = std::move(shrank), column_size = data.size()] + { + auto res = ColumnVector::create(column_size); + auto processed_size = LZ4_decompress_fast( + compressed->data(), + reinterpret_cast(res->getData().data()), + column_size * sizeof(T)); + + if (processed_size <= 0) + throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress column"); + + return res; + }; +} + /// Explicit template instantiations - to avoid code bloat in headers. 
template class ColumnVector; template class ColumnVector; diff --git a/src/Columns/ColumnVector.h b/src/Columns/ColumnVector.h index 1b13859bdee..4f1cbcafcbc 100644 --- a/src/Columns/ColumnVector.h +++ b/src/Columns/ColumnVector.h @@ -298,6 +298,8 @@ public: return typeid(rhs) == typeid(ColumnVector); } + LazyColumn compress() const override; + /// Replace elements that match the filter with zeroes. If inverted replaces not matched elements. void applyZeroMap(const IColumn::Filter & filt, bool inverted = false); diff --git a/src/Columns/IColumn.h b/src/Columns/IColumn.h index 824b5411744..d441e9f7c4e 100644 --- a/src/Columns/IColumn.h +++ b/src/Columns/IColumn.h @@ -357,6 +357,14 @@ public: throw Exception("Method structureEquals is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED); } + /// Compress column in memory to some representation that allows to decompress it back. + using Lazy = std::function; + virtual Lazy compress() const + { + /// No compression by default, just wrap the object. + return [column = getPtr()] { return column; }; + } + static MutablePtr mutate(Ptr ptr) { @@ -462,6 +470,9 @@ using MutableColumns = std::vector; using ColumnRawPtrs = std::vector; //using MutableColumnRawPtrs = std::vector; +using LazyColumn = IColumn::Lazy; +using LazyColumns = std::vector; + template struct IsMutableColumns; diff --git a/src/Storages/MemorySettings.cpp b/src/Storages/MemorySettings.cpp new file mode 100644 index 00000000000..f5e182b3484 --- /dev/null +++ b/src/Storages/MemorySettings.cpp @@ -0,0 +1,36 @@ +#include +#include +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int UNKNOWN_SETTING; +} + +IMPLEMENT_SETTINGS_TRAITS(memorySettingsTraits, MEMORY_SETTINGS) + +void MemorySettings::loadFromQuery(ASTStorage & storage_def) +{ + if (storage_def.settings) + { + try + { + applyChanges(storage_def.settings->changes); + } + catch (Exception & e) + { + if (e.code() == ErrorCodes::UNKNOWN_SETTING) + e.addMessage("for storage " + storage_def.engine->name); + throw; + } + } +} + +} + diff --git a/src/Storages/MemorySettings.h b/src/Storages/MemorySettings.h new file mode 100644 index 00000000000..4a1ba57475f --- /dev/null +++ b/src/Storages/MemorySettings.h @@ -0,0 +1,26 @@ +#pragma once + +#include + + +namespace DB +{ +class ASTStorage; + + +#define MEMORY_SETTINGS(M) \ + M(Bool, compress, true, "Compress data in memory", 0) \ + +DECLARE_SETTINGS_TRAITS(memorySettingsTraits, MEMORY_SETTINGS) + + +/** Settings for the Memory engine. + * Could be loaded from a CREATE TABLE query (SETTINGS clause). + */ +struct MemorySettings : public BaseSettings +{ + void loadFromQuery(ASTStorage & storage_def); +}; + +} + diff --git a/src/Storages/StorageMemory.cpp b/src/Storages/StorageMemory.cpp index 4530d93c274..a67eea0f28a 100644 --- a/src/Storages/StorageMemory.cpp +++ b/src/Storages/StorageMemory.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include #include @@ -23,7 +24,7 @@ namespace ErrorCodes class MemorySource : public SourceWithProgress { - using InitializerFunc = std::function &)>; + using InitializerFunc = std::function &)>; public: /// Blocks are stored in std::list which may be appended in another thread. /// We use pointer to the beginning of the list and its current size. 
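The MemorySettings introduced above declare a per-table `compress` flag for the Memory engine, loaded from the CREATE query's SETTINGS clause. Once the engine registration later in this patch wires it up, usage would look roughly like this (the table name and columns are made up for illustration):

```sql
-- Hypothetical table; the SETTINGS clause is the relevant part.
CREATE TABLE t_mem
(
    id UInt64,
    payload String
)
ENGINE = Memory
SETTINGS compress = 1;  -- store blocks LZ4-compressed in RAM, reconstructed lazily on read
```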
@@ -34,7 +35,7 @@ public: Names column_names_, const StorageMemory & storage, const StorageMetadataPtr & metadata_snapshot, - std::shared_ptr data_, + std::shared_ptr data_, std::shared_ptr> parallel_execution_index_, InitializerFunc initializer_func_ = {}) : SourceWithProgress(metadata_snapshot->getSampleBlockForColumns(column_names_, storage.getVirtuals(), storage.getStorageID())) @@ -43,6 +44,8 @@ public: , parallel_execution_index(parallel_execution_index_) , initializer_func(std::move(initializer_func_)) { + for (const auto & elem : column_names_and_types) + column_positions.push_back(metadata_snapshot->getSampleBlock().getPositionByName(elem.getNameInStorage())); } String getName() const override { return "Memory"; } @@ -63,21 +66,25 @@ protected: return {}; } - const Block & src = (*data)[current_index]; + const LazyBlock & src = (*data)[current_index]; Columns columns; columns.reserve(columns.size()); /// Add only required columns to `res`. + size_t i = 0; for (const auto & elem : column_names_and_types) { - auto current_column = src.getByName(elem.getNameInStorage()).column; + auto current_column = src[column_positions[i]](); if (elem.isSubcolumn()) columns.emplace_back(elem.getTypeInStorage()->getSubcolumn(elem.getSubcolumnName(), *current_column)); else columns.emplace_back(std::move(current_column)); + + ++i; } - return Chunk(std::move(columns), src.rows()); + size_t rows = columns.at(0)->size(); + return Chunk(std::move(columns), rows); } private: @@ -95,9 +102,10 @@ private: const NamesAndTypesList column_names_and_types; size_t execution_index = 0; - std::shared_ptr data; + std::shared_ptr data; std::shared_ptr> parallel_execution_index; InitializerFunc initializer_func; + std::vector column_positions; }; @@ -149,8 +157,12 @@ private: }; -StorageMemory::StorageMemory(const StorageID & table_id_, ColumnsDescription columns_description_, ConstraintsDescription constraints_) - : IStorage(table_id_), data(std::make_unique()) +StorageMemory::StorageMemory( + const StorageID & table_id_, + ColumnsDescription columns_description_, + ConstraintsDescription constraints_, + bool compress_) + : IStorage(table_id_), data(std::make_unique()), compress(compress_) { StorageInMemoryMetadata storage_metadata; storage_metadata.setColumns(std::move(columns_description_)); @@ -186,7 +198,7 @@ Pipe StorageMemory::read( metadata_snapshot, nullptr /* data */, nullptr /* parallel execution index */, - [this](std::shared_ptr & data_to_initialize) + [this](std::shared_ptr & data_to_initialize) { data_to_initialize = data.get(); })); @@ -219,18 +231,18 @@ BlockOutputStreamPtr StorageMemory::write(const ASTPtr & /*query*/, const Storag void StorageMemory::drop() { - data.set(std::make_unique()); + data.set(std::make_unique()); total_size_bytes.store(0, std::memory_order_relaxed); total_size_rows.store(0, std::memory_order_relaxed); } -static inline void updateBlockData(Block & old_block, const Block & new_block) +static inline void updateBlockData(LazyBlock & old_block, const LazyBlock & new_block, const Block & old_header, const Block & new_header) { - for (const auto & it : new_block) + size_t i = 0; + for (const auto & it : new_header) { - auto col_name = it.name; - auto & col_with_type_name = old_block.getByName(col_name); - col_with_type_name.column = it.column; + old_block[old_header.getPositionByName(it.name)] = new_block[i]; + ++i; } } @@ -242,36 +254,47 @@ void StorageMemory::mutate(const MutationCommands & commands, const Context & co auto storage_ptr = 
DatabaseCatalog::instance().getTable(storage, context); auto interpreter = std::make_unique(storage_ptr, metadata_snapshot, commands, context, true); auto in = interpreter->execute(); + Block old_header = metadata_snapshot->getSampleBlock(); + Block mutation_header = in->getHeader(); in->readPrefix(); - Blocks out; - Block block; - while ((block = in->read())) + LazyBlocks out; + while (Block block = in->read()) { - out.push_back(block); + LazyColumns lazy_columns; + + for (const auto & elem : block) + { + if (compress) + lazy_columns.emplace_back(elem.column->compress()); + else + lazy_columns.emplace_back([=]{ return elem.column; }); + } + + out.emplace_back(std::move(lazy_columns)); } in->readSuffix(); - std::unique_ptr new_data; + std::unique_ptr new_data; - // all column affected + /// All columns affected. if (interpreter->isAffectingAllColumns()) { - new_data = std::make_unique(out); + new_data = std::make_unique(out); } else { - /// just some of the column affected, we need update it with new column - new_data = std::make_unique(*(data.get())); + /// Just some of the columns affected, we need update it with new column. + new_data = std::make_unique(*(data.get())); auto data_it = new_data->begin(); auto out_it = out.begin(); while (data_it != new_data->end()) { - /// Mutation does not change the number of blocks + /// Mutation does not change the number of blocks. assert(out_it != out.end()); - updateBlockData(*data_it, *out_it); + updateBlockData(*data_it, *out_it, old_header, mutation_header); ++data_it; ++out_it; } @@ -279,7 +302,7 @@ void StorageMemory::mutate(const MutationCommands & commands, const Context & co assert(out_it == out.end()); } - size_t rows = 0; +/* size_t rows = 0; size_t bytes = 0; for (const auto & buffer : *new_data) { @@ -287,7 +310,8 @@ void StorageMemory::mutate(const MutationCommands & commands, const Context & co bytes += buffer.bytes(); } total_size_bytes.store(rows, std::memory_order_relaxed); - total_size_rows.store(bytes, std::memory_order_relaxed); + total_size_rows.store(bytes, std::memory_order_relaxed);*/ + data.set(std::move(new_data)); } @@ -295,7 +319,7 @@ void StorageMemory::mutate(const MutationCommands & commands, const Context & co void StorageMemory::truncate( const ASTPtr &, const StorageMetadataPtr &, const Context &, TableExclusiveLockHolder &) { - data.set(std::make_unique()); + data.set(std::make_unique()); total_size_bytes.store(0, std::memory_order_relaxed); total_size_rows.store(0, std::memory_order_relaxed); } @@ -317,13 +341,19 @@ void registerStorageMemory(StorageFactory & factory) factory.registerStorage("Memory", [](const StorageFactory::Arguments & args) { if (!args.engine_args.empty()) - throw Exception( - "Engine " + args.engine_name + " doesn't support any arguments (" + toString(args.engine_args.size()) + " given)", - ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Engine {} doesn't support any arguments ({} given)", + args.engine_name, args.engine_args.size()); - return StorageMemory::create(args.table_id, args.columns, args.constraints); + bool has_settings = args.storage_def->settings; + MemorySettings settings; + if (has_settings) + settings.loadFromQuery(*args.storage_def); + + return StorageMemory::create(args.table_id, args.columns, args.constraints, settings.compress); }, { + .supports_settings = true, .supports_parallel_insert = true, }); } diff --git a/src/Storages/StorageMemory.h b/src/Storages/StorageMemory.h index dc695427156..97ddfa93d9a 
100644 --- a/src/Storages/StorageMemory.h +++ b/src/Storages/StorageMemory.h @@ -15,6 +15,11 @@ namespace DB { +/// Lazy block contains possibly compressed columns. LazyColumn is std::function that reconstructs Column on call. +using LazyBlock = LazyColumns; +using LazyBlocks = std::vector; + + /** Implements storage in the RAM. * Suitable for temporary data. * It does not support keys. @@ -95,7 +100,8 @@ public: private: /// MultiVersion data storage, so that we can copy the list of blocks to readers. - MultiVersion data; + + MultiVersion data; mutable std::mutex mutex; @@ -104,8 +110,14 @@ private: std::atomic total_size_bytes = 0; std::atomic total_size_rows = 0; + bool compress; + protected: - StorageMemory(const StorageID & table_id_, ColumnsDescription columns_description_, ConstraintsDescription constraints_); + StorageMemory( + const StorageID & table_id_, + ColumnsDescription columns_description_, + ConstraintsDescription constraints_, + bool compress_ = false); }; } diff --git a/src/Storages/StorageSet.cpp b/src/Storages/StorageSet.cpp index e518c7da0e4..d64042f0c1e 100644 --- a/src/Storages/StorageSet.cpp +++ b/src/Storages/StorageSet.cpp @@ -242,15 +242,12 @@ void registerStorageSet(StorageFactory & factory) ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); bool has_settings = args.storage_def->settings; - - auto set_settings = std::make_unique(); + SetSettings set_settings; if (has_settings) - { - set_settings->loadFromQuery(*args.storage_def); - } + set_settings.loadFromQuery(*args.storage_def); - DiskPtr disk = args.context.getDisk(set_settings->disk); - return StorageSet::create(disk, args.relative_data_path, args.table_id, args.columns, args.constraints, set_settings->persistent); + DiskPtr disk = args.context.getDisk(set_settings.disk); + return StorageSet::create(disk, args.relative_data_path, args.table_id, args.columns, args.constraints, set_settings.persistent); }, StorageFactory::StorageFeatures{ .supports_settings = true, }); } From 280f459f71513752696a2fcc9753aae4a7e342b2 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 7 Feb 2021 05:40:06 +0300 Subject: [PATCH 0237/2357] Fix quadratic INSERT --- src/Storages/StorageMemory.cpp | 37 ++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/src/Storages/StorageMemory.cpp b/src/Storages/StorageMemory.cpp index a67eea0f28a..20c8a44efd4 100644 --- a/src/Storages/StorageMemory.cpp +++ b/src/Storages/StorageMemory.cpp @@ -125,23 +125,32 @@ public: void write(const Block & block) override { metadata_snapshot->check(block, true); - new_blocks.emplace_back(block); + + inserted_bytes += block.allocatedBytes(); + inserted_rows += block.rows(); + + Block sample = metadata_snapshot->getSampleBlock(); + + LazyColumns lazy_columns; + lazy_columns.reserve(sample.columns()); + + for (const auto & elem : sample) + { + const ColumnPtr & column = block.getByName(elem.name).column; + + if (storage.compress) + lazy_columns.emplace_back(column->compress()); + else + lazy_columns.emplace_back([=]{ return column; }); + } + + new_blocks.emplace_back(std::move(lazy_columns)); } void writeSuffix() override { - size_t inserted_bytes = 0; - size_t inserted_rows = 0; - - for (const auto & block : new_blocks) - { - inserted_bytes += block.allocatedBytes(); - inserted_rows += block.rows(); - } - std::lock_guard lock(storage.mutex); - - auto new_data = std::make_unique(*(storage.data.get())); + auto new_data = std::make_unique(*(storage.data.get())); new_data->insert(new_data->end(), new_blocks.begin(), 
new_blocks.end()); storage.data.set(std::move(new_data)); @@ -150,7 +159,9 @@ public: } private: - Blocks new_blocks; + LazyBlocks new_blocks; + size_t inserted_bytes = 0; + size_t inserted_rows = 0; StorageMemory & storage; StorageMetadataPtr metadata_snapshot; From 58f1d4d910a2b6d34f484ff742df85e421276391 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 7 Feb 2021 06:00:31 +0300 Subject: [PATCH 0238/2357] Add comment to config --- programs/server/config.xml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/programs/server/config.xml b/programs/server/config.xml index 849d3dc32ba..571a8c6cf75 100644 --- a/programs/server/config.xml +++ b/programs/server/config.xml @@ -284,6 +284,11 @@ In bytes. Cache is single for server. Memory is allocated only on demand. Cache is used when 'use_uncompressed_cache' user setting turned on (off by default). Uncompressed cache is advantageous only for very short queries and in rare cases. + + Note: uncompressed cache is pointless for lz4, because memory bandwidth is slower than multi-core decompression. + Enabling it will only make queries slower. + If number of CPU cores is in order of 100 and memory bandwidth is in range of 100-200 GB/sec, + there is a chance it is also being pointless for zstd. --> 8589934592 From 4d650a2a5621723f4466db263a8602cb04e6d40b Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 7 Feb 2021 06:03:13 +0300 Subject: [PATCH 0239/2357] Adjust config --- programs/server/users.xml | 3 --- src/Core/Settings.h | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/programs/server/users.xml b/programs/server/users.xml index 3223d855651..ef66891a6a0 100644 --- a/programs/server/users.xml +++ b/programs/server/users.xml @@ -7,9 +7,6 @@ 10000000000 - - 0 - From 2a9a6cf4048969d1fa670fb7afac18d57b86649a Mon Sep 17 00:00:00 2001 From: George Date: Thu, 11 Feb 2021 19:46:23 +0300 Subject: [PATCH 0245/2357] Edited and translated parametric-functions --- .../sql-reference/aggregate-functions/parametric-functions.md | 2 +- .../sql-reference/aggregate-functions/parametric-functions.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/sql-reference/aggregate-functions/parametric-functions.md b/docs/en/sql-reference/aggregate-functions/parametric-functions.md index 4b3bf12aa8c..2d2df3bd6cb 100644 --- a/docs/en/sql-reference/aggregate-functions/parametric-functions.md +++ b/docs/en/sql-reference/aggregate-functions/parametric-functions.md @@ -241,7 +241,7 @@ windowFunnel(window, [mode])(timestamp, cond1, cond2, ..., condN) **Parameters** -- `window` — Length of the sliding window. The unit of `window` depends on the timestamp itself and varies. Determined using the expression `timestamp of cond2 <= timestamp of cond1 + window`. +- `window` — Length of the sliding window. The unit of `window` depends on the `timestamp` itself and varies. Determined using the expression `timestamp of cond2 <= timestamp of cond1 + window`. - `mode` - It is an optional argument. - `'strict'` - When the `'strict'` is set, the windowFunnel() applies conditions only for the unique values. - `timestamp` — Name of the column containing the timestamp. Data types supported: [Date](../../sql-reference/data-types/date.md), [DateTime](../../sql-reference/data-types/datetime.md#data_type-datetime) and other unsigned integer types (note that even though timestamp supports the `UInt64` type, it’s value can’t exceed the Int64 maximum, which is 2^63 - 1). 
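The `window`, `mode` and `timestamp` parameters described above are easier to read in a concrete query. A sketch with an assumed `events` table (`user_id`, `event_time` of type DateTime, `event` String); the one-hour window and event names are illustrative only:

```sql
SELECT
    user_id,
    windowFunnel(3600)(event_time, event = 'page_view', event = 'add_to_cart', event = 'purchase') AS level
FROM events
GROUP BY user_id
HAVING level = 3;  -- users who completed the whole chain within 3600 seconds of the first event
```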
diff --git a/docs/ru/sql-reference/aggregate-functions/parametric-functions.md b/docs/ru/sql-reference/aggregate-functions/parametric-functions.md index f20acaa45c3..2c367882714 100644 --- a/docs/ru/sql-reference/aggregate-functions/parametric-functions.md +++ b/docs/ru/sql-reference/aggregate-functions/parametric-functions.md @@ -239,7 +239,7 @@ windowFunnel(window, [mode])(timestamp, cond1, cond2, ..., condN) **Параметры** -- `window` — ширина скользящего окна по времени в секундах. [UInt](../../sql-reference/aggregate-functions/parametric-functions.md). +- `window` — ширина скользящего окна по времени. Единица измерения зависит от `timestamp` и может варьироваться. Определяется выражением `timestamp от cond2 <= timestamp от cond1 + window`. - `mode` - необязательный параметр. Если установлено значение `'strict'`, то функция `windowFunnel()` применяет условия только для уникальных значений. - `timestamp` — имя столбца, содержащего временные отметки. [Date](../../sql-reference/aggregate-functions/parametric-functions.md), [DateTime](../../sql-reference/aggregate-functions/parametric-functions.md#data_type-datetime) и другие параметры с типом `Integer`. В случае хранения меток времени в столбцах с типом `UInt64`, максимально допустимое значение соответствует ограничению для типа `Int64`, т.е. равно `2^63-1`. - `cond` — условия или данные, описывающие цепочку событий. [UInt8](../../sql-reference/aggregate-functions/parametric-functions.md). From cd11212bba784958174fdfbd334622a533686756 Mon Sep 17 00:00:00 2001 From: George Date: Thu, 11 Feb 2021 19:57:41 +0300 Subject: [PATCH 0246/2357] Edited and translated settings --- docs/en/operations/settings/settings.md | 4 ++-- docs/ru/operations/settings/settings.md | 15 +++++++++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index c7ee48c11bf..70809885a99 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -1956,8 +1956,8 @@ Default value: 16. **See Also** -- [Kafka](../../engines/table-engines/integrations/kafka.md#kafka) engine -- [RabbitMQ](../../engines/table-engines/integrations/rabbitmq.md#rabbitmq-engine) engine +- [Kafka](../../engines/table-engines/integrations/kafka.md#kafka) engine. +- [RabbitMQ](../../engines/table-engines/integrations/rabbitmq.md#rabbitmq-engine) engine. ## validate_polygons {#validate_polygons} diff --git a/docs/ru/operations/settings/settings.md b/docs/ru/operations/settings/settings.md index 1352fe850df..fed10d21920 100644 --- a/docs/ru/operations/settings/settings.md +++ b/docs/ru/operations/settings/settings.md @@ -1939,6 +1939,21 @@ SELECT idx, i FROM null_in WHERE i IN (1, NULL) SETTINGS transform_null_in = 1; Значение по умолчанию: 16. +## background_message_broker_schedule_pool_size {#background_message_broker_schedule_pool_size} + +Задает количество потоков для вывода потокового вывода сообщений. Настройка применяется при запуске сервера ClickHouse и не может быть изменена в пользовательском сеансе. + +Допустимые значения: + +- Положительное целое число. + +Значение по умолчанию: 16. + +**Смотрите также** + +- Движок [Kafka](../../engines/table-engines/integrations/kafka.md#kafka). +- Движок [RabbitMQ](../../engines/table-engines/integrations/rabbitmq.md#rabbitmq-engine). 
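To check the pool size described above on a running server, one option (assuming the setting is exposed through `system.settings`, like the other background pool sizes) is:

```sql
SELECT name, value
FROM system.settings
WHERE name = 'background_message_broker_schedule_pool_size';
```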
+ ## format_avro_schema_registry_url {#format_avro_schema_registry_url} Задает URL реестра схем [Confluent](https://docs.confluent.io/current/schema-registry/index.html) для использования с форматом [AvroConfluent](../../interfaces/formats.md#data-format-avro-confluent). From 93ea1e5e82da3a3eb07dbe9daa355d3ab31accf5 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Thu, 11 Feb 2021 20:13:59 +0300 Subject: [PATCH 0247/2357] Comment output --- .../QueryPlan/Optimizations/filterPushDown.cpp | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp b/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp index 2a42b08af73..a5f1d37e2f2 100644 --- a/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp +++ b/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp @@ -42,14 +42,11 @@ size_t tryPushDownLimit(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes) for (auto pos : params.keys) keys.push_back(params.src_header.getByPosition(pos).name); - std::cerr << "Filter: \n" << expression->dumpDAG() << std::endl; + // std::cerr << "Filter: \n" << expression->dumpDAG() << std::endl; if (auto split_filter = expression->splitActionsForFilter(filter_column_name, removes_filter, keys)) { - std::cerr << "===============\n" << expression->dumpDAG() << std::endl; - std::cerr << "---------------\n" << split_filter->dumpDAG() << std::endl; - - //if (split_filter) - // throw Exception("!!!!", 0); + // std::cerr << "===============\n" << expression->dumpDAG() << std::endl; + // std::cerr << "---------------\n" << split_filter->dumpDAG() << std::endl; auto it = expression->getIndex().find(filter_column_name); if (it == expression->getIndex().end()) From 838dab756491d5bdcd6151fb5075756d0807b807 Mon Sep 17 00:00:00 2001 From: George Date: Thu, 11 Feb 2021 21:07:38 +0300 Subject: [PATCH 0248/2357] Edit and translated Kafka --- .../table-engines/integrations/kafka.md | 22 +++++++++---------- .../table-engines/integrations/kafka.md | 19 +++++++++------- 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/docs/en/engines/table-engines/integrations/kafka.md b/docs/en/engines/table-engines/integrations/kafka.md index c519d6bb136..fb1df62bb15 100644 --- a/docs/en/engines/table-engines/integrations/kafka.md +++ b/docs/en/engines/table-engines/integrations/kafka.md @@ -38,20 +38,20 @@ SETTINGS Required parameters: -- `kafka_broker_list` – A comma-separated list of brokers (for example, `localhost:9092`). -- `kafka_topic_list` – A list of Kafka topics. -- `kafka_group_name` – A group of Kafka consumers. Reading margins are tracked for each group separately. If you don’t want messages to be duplicated in the cluster, use the same group name everywhere. -- `kafka_format` – Message format. Uses the same notation as the SQL `FORMAT` function, such as `JSONEachRow`. For more information, see the [Formats](../../../interfaces/formats.md) section. +- `kafka_broker_list` — A comma-separated list of brokers (for example, `localhost:9092`). +- `kafka_topic_list` — A list of Kafka topics. +- `kafka_group_name` — A group of Kafka consumers. Reading margins are tracked for each group separately. If you don’t want messages to be duplicated in the cluster, use the same group name everywhere. +- `kafka_format` — Message format. Uses the same notation as the SQL `FORMAT` function, such as `JSONEachRow`. For more information, see the [Formats](../../../interfaces/formats.md) section. 
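A minimal sketch that uses only the four required settings listed above; the broker address, topic, consumer group, table name and column set are placeholders, and fuller examples follow later in this file:

```sql
CREATE TABLE queue
(
    timestamp UInt64,
    level String,
    message String
)
ENGINE = Kafka
SETTINGS kafka_broker_list = 'localhost:9092',
         kafka_topic_list = 'topic',
         kafka_group_name = 'group1',
         kafka_format = 'JSONEachRow';
```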
Optional parameters: -- `kafka_row_delimiter` – Delimiter character, which ends the message. -- `kafka_schema` – Parameter that must be used if the format requires a schema definition. For example, [Cap’n Proto](https://capnproto.org/) requires the path to the schema file and the name of the root `schema.capnp:Message` object. -- `kafka_num_consumers` – The number of consumers per table. Default: `1`. Specify more consumers if the throughput of one consumer is insufficient. The total number of consumers should not exceed the number of partitions in the topic, since only one consumer can be assigned per partition. -- `kafka_max_block_size` - The maximum batch size (in messages) for poll (default: `max_block_size`). -- `kafka_skip_broken_messages` – Kafka message parser tolerance to schema-incompatible messages per block. Default: `0`. If `kafka_skip_broken_messages = N` then the engine skips *N* Kafka messages that cannot be parsed (a message equals a row of data). -- `kafka_commit_every_batch` - Commit every consumed and handled batch instead of a single commit after writing a whole block (default: `0`). -- `kafka_thread_per_consumer` - Provide independent thread for each consumer (default: `0`). When enabled, every consumer flush the data independently, in parallel (otherwise - rows from several consumers squashed to form one block). +- `kafka_row_delimiter` — Delimiter character, which ends the message. +- `kafka_schema` — Parameter that must be used if the format requires a schema definition. For example, [Cap’n Proto](https://capnproto.org/) requires the path to the schema file and the name of the root `schema.capnp:Message` object. +- `kafka_num_consumers` — The number of consumers per table. Default: `1`. Specify more consumers if the throughput of one consumer is insufficient. The total number of consumers should not exceed the number of partitions in the topic, since only one consumer can be assigned per partition. +- `kafka_max_block_size` — The maximum batch size (in messages) for poll (default: `max_block_size`). +- `kafka_skip_broken_messages` — Kafka message parser tolerance to schema-incompatible messages per block. Default: `0`. If `kafka_skip_broken_messages = N` then the engine skips *N* Kafka messages that cannot be parsed (a message equals a row of data). +- `kafka_commit_every_batch` — Commit every consumed and handled batch instead of a single commit after writing a whole block (default: `0`). +- `kafka_thread_per_consumer` — Provide independent thread for each consumer (default: `0`). When enabled, every consumer flush the data independently, in parallel (otherwise — rows from several consumers squashed to form one block). Examples: diff --git a/docs/ru/engines/table-engines/integrations/kafka.md b/docs/ru/engines/table-engines/integrations/kafka.md index 940fee2452b..2b9dfcd49da 100644 --- a/docs/ru/engines/table-engines/integrations/kafka.md +++ b/docs/ru/engines/table-engines/integrations/kafka.md @@ -31,21 +31,24 @@ SETTINGS [kafka_schema = '',] [kafka_num_consumers = N,] [kafka_skip_broken_messages = N] + [kafka_commit_every_batch = 0,] + [kafka_thread_per_consumer = 0] ``` Обязательные параметры: -- `kafka_broker_list` – перечень брокеров, разделенный запятыми (`localhost:9092`). -- `kafka_topic_list` – перечень необходимых топиков Kafka. -- `kafka_group_name` – группа потребителя Kafka. Отступы для чтения отслеживаются для каждой группы отдельно. Если необходимо, чтобы сообщения не повторялись на кластере, используйте везде одно имя группы. 
-- `kafka_format` – формат сообщений. Названия форматов должны быть теми же, что можно использовать в секции `FORMAT`, например, `JSONEachRow`. Подробнее читайте в разделе [Форматы](../../../interfaces/formats.md). +- `kafka_broker_list` — перечень брокеров, разделенный запятыми (`localhost:9092`). +- `kafka_topic_list` — перечень необходимых топиков Kafka. +- `kafka_group_name` — группа потребителя Kafka. Отступы для чтения отслеживаются для каждой группы отдельно. Если необходимо, чтобы сообщения не повторялись на кластере, используйте везде одно имя группы. +- `kafka_format` — формат сообщений. Названия форматов должны быть теми же, что можно использовать в секции `FORMAT`, например, `JSONEachRow`. Подробнее читайте в разделе [Форматы](../../../interfaces/formats.md). Опциональные параметры: -- `kafka_row_delimiter` – символ-разделитель записей (строк), которым завершается сообщение. -- `kafka_schema` – опциональный параметр, необходимый, если используется формат, требующий определения схемы. Например, [Cap’n Proto](https://capnproto.org/) требует путь к файлу со схемой и название корневого объекта `schema.capnp:Message`. -- `kafka_num_consumers` – количество потребителей (consumer) на таблицу. По умолчанию: `1`. Укажите больше потребителей, если пропускная способность одного потребителя недостаточна. Общее число потребителей не должно превышать количество партиций в топике, так как на одну партицию может быть назначено не более одного потребителя. -- `kafka_skip_broken_messages` – максимальное количество некорректных сообщений в блоке. Если `kafka_skip_broken_messages = N`, то движок отбрасывает `N` сообщений Кафки, которые не получилось обработать. Одно сообщение в точности соответствует одной записи (строке). Значение по умолчанию – 0. +- `kafka_row_delimiter` — символ-разделитель записей (строк), которым завершается сообщение. +- `kafka_schema` — опциональный параметр, необходимый, если используется формат, требующий определения схемы. Например, [Cap’n Proto](https://capnproto.org/) требует путь к файлу со схемой и название корневого объекта `schema.capnp:Message`. +- `kafka_num_consumers` — количество потребителей (consumer) на таблицу. По умолчанию: `1`. Укажите больше потребителей, если пропускная способность одного потребителя недостаточна. Общее число потребителей не должно превышать количество партиций в топике, так как на одну партицию может быть назначено не более одного потребителя. +- `kafka_skip_broken_messages` — максимальное количество некорректных сообщений в блоке. Если `kafka_skip_broken_messages = N`, то движок отбрасывает `N` сообщений Кафки, которые не получилось обработать. Одно сообщение в точности соответствует одной записи (строке). Значение по умолчанию – 0. +- `kafka_thread_per_consumer` — снабжает каждого потребителя независимым потоком (по умолчанию `0`). При включенном состоянии каждый потребитель сбрасывает данные независимо и параллельно (иначе — строки от нескольких потребителей склеиваются в один блок). 
Примеры From 9a9f88c5bb26d330f7f64bc2f7ff8fd89f79641b Mon Sep 17 00:00:00 2001 From: lehasm Date: Thu, 11 Feb 2021 23:16:01 +0300 Subject: [PATCH 0249/2357] test markdown --- .../sql-reference/aggregate-functions/reference/studentttest.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/en/sql-reference/aggregate-functions/reference/studentttest.md b/docs/en/sql-reference/aggregate-functions/reference/studentttest.md index f868e976039..fde6a2ecc01 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/studentttest.md +++ b/docs/en/sql-reference/aggregate-functions/reference/studentttest.md @@ -24,6 +24,7 @@ The null hypothesis is that means of populations are equal. Normal distribution **Returned values** [Tuple](../../../sql-reference/data-types/tuple.md) with two elements: + - calculated t-statistic. [Float64](../../../sql-reference/data-types/float.md). - calculated p-value. [Float64](../../../sql-reference/data-types/float.md). From 4c8632bd9ab32322af29abb04cf70c39c6cd3c79 Mon Sep 17 00:00:00 2001 From: George Date: Fri, 12 Feb 2021 00:22:55 +0300 Subject: [PATCH 0250/2357] Minor fixes --- docs/ru/operations/settings/settings.md | 2 +- docs/ru/sql-reference/functions/encryption-functions.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/ru/operations/settings/settings.md b/docs/ru/operations/settings/settings.md index fed10d21920..a7754cfc421 100644 --- a/docs/ru/operations/settings/settings.md +++ b/docs/ru/operations/settings/settings.md @@ -1941,7 +1941,7 @@ SELECT idx, i FROM null_in WHERE i IN (1, NULL) SETTINGS transform_null_in = 1; ## background_message_broker_schedule_pool_size {#background_message_broker_schedule_pool_size} -Задает количество потоков для вывода потокового вывода сообщений. Настройка применяется при запуске сервера ClickHouse и не может быть изменена в пользовательском сеансе. +Задает количество потоков для фонового потокового вывода сообщений. Настройка применяется при запуске сервера ClickHouse и не может быть изменена в пользовательском сеансе. Допустимые значения: diff --git a/docs/ru/sql-reference/functions/encryption-functions.md b/docs/ru/sql-reference/functions/encryption-functions.md index 14ce97f5513..91b26a2415d 100644 --- a/docs/ru/sql-reference/functions/encryption-functions.md +++ b/docs/ru/sql-reference/functions/encryption-functions.md @@ -293,7 +293,7 @@ SELECT comment, decrypt('aes-256-cfb128', secret, '12345678910121314151617181920 └─────────────────────────────────────┴───────────┘ ``` -Обратите внимание, что только часть данных была расшифрована, а остальное является бессмыслицей, как как `mode`, `key`, или `iv`были другими во время шифрования. +Обратите внимание, что только часть данных была расшифрована, а остальное является бессмыслицей, как как `mode`, `key`, или `iv` были другими во время шифрования. ## aes_decrypt_mysql {#aes_decrypt_mysql} From 5355175e49b425b754785c411c548c058fd9d100 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 12 Feb 2021 00:26:14 +0300 Subject: [PATCH 0251/2357] Development --- src/Columns/IColumn.h | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/Columns/IColumn.h b/src/Columns/IColumn.h index d441e9f7c4e..7697bd116bf 100644 --- a/src/Columns/IColumn.h +++ b/src/Columns/IColumn.h @@ -358,11 +358,18 @@ public: } /// Compress column in memory to some representation that allows to decompress it back. 
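The hunk below swaps the `std::function`-based `Lazy` hook for plain virtual `compress()`/`decompress()` members whose default is the identity. A minimal sketch of why identity defaults keep the calling code uniform (toy types only, not the real `IColumn`):

```cpp
#include <cstdio>
#include <memory>
#include <vector>

// Toy stand-ins; the real columns are COW-managed.
struct Column : std::enable_shared_from_this<Column>
{
    virtual ~Column() = default;
    virtual size_t size() const = 0;
    // Identity by default: a column type that cannot be compressed is returned as-is,
    // so readers may call decompress() unconditionally on anything found in a block.
    virtual std::shared_ptr<const Column> compress() const { return shared_from_this(); }
    virtual std::shared_ptr<const Column> decompress() const { return shared_from_this(); }
};

struct VectorColumn : Column
{
    std::vector<int64_t> data;
    explicit VectorColumn(std::vector<int64_t> d) : data(std::move(d)) {}
    size_t size() const override { return data.size(); }
};

int main()
{
    std::shared_ptr<const Column> col = std::make_shared<VectorColumn>(std::vector<int64_t>{1, 2, 3});
    auto readable = col->decompress();   // no-op for a plain column, reconstruction for a compressed one
    std::printf("%zu rows\n", readable->size());
}
```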
- using Lazy = std::function; - virtual Lazy compress() const + /// Return itself if compression is not applicable for this column type. + virtual ColumnPtr compress() const { - /// No compression by default, just wrap the object. - return [column = getPtr()] { return column; }; + /// No compression by default. + return getPtr(); + } + + /// If it's CompressedColumn, decompress it and return. + /// Otherwise return itself. + virtual ColumnPtr decompress() const + { + return getPtr(); } @@ -468,10 +475,7 @@ using Columns = std::vector; using MutableColumns = std::vector; using ColumnRawPtrs = std::vector; -//using MutableColumnRawPtrs = std::vector; -using LazyColumn = IColumn::Lazy; -using LazyColumns = std::vector; template struct IsMutableColumns; From c9cf63e958f058098e83c8a46391d249229954db Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Fri, 12 Feb 2021 01:23:40 +0300 Subject: [PATCH 0252/2357] fix --- src/Databases/DatabaseAtomic.cpp | 6 ++ src/Databases/DatabaseAtomic.h | 1 + src/Databases/DatabaseOnDisk.cpp | 17 +++-- src/Databases/DatabaseReplicated.cpp | 56 ++++++++++++---- src/Databases/DatabaseReplicatedWorker.cpp | 4 +- src/Interpreters/DDLWorker.cpp | 2 +- .../test_replicated_database/test.py | 66 ++++++++++++++++++- 7 files changed, 130 insertions(+), 22 deletions(-) diff --git a/src/Databases/DatabaseAtomic.cpp b/src/Databases/DatabaseAtomic.cpp index e6bc3bfcd44..2065e036863 100644 --- a/src/Databases/DatabaseAtomic.cpp +++ b/src/Databases/DatabaseAtomic.cpp @@ -362,6 +362,12 @@ void DatabaseAtomic::assertDetachedTableNotInUse(const UUID & uuid) ", because it was detached but still used by some query. Retry later.", ErrorCodes::TABLE_ALREADY_EXISTS); } +void DatabaseAtomic::setDetachedTableNotInUseForce(const UUID & uuid) +{ + std::unique_lock lock{mutex}; + detached_tables.erase(uuid); +} + DatabaseAtomic::DetachedTables DatabaseAtomic::cleanupDetachedTables() { DetachedTables not_in_use; diff --git a/src/Databases/DatabaseAtomic.h b/src/Databases/DatabaseAtomic.h index be7227ed8f9..09cdf269b35 100644 --- a/src/Databases/DatabaseAtomic.h +++ b/src/Databases/DatabaseAtomic.h @@ -58,6 +58,7 @@ public: void tryRemoveSymlink(const String & table_name); void waitDetachedTableNotInUse(const UUID & uuid) override; + void setDetachedTableNotInUseForce(const UUID & uuid); protected: void commitAlterTable(const StorageID & table_id, const String & table_metadata_tmp_path, const String & table_metadata_path, const String & statement, const Context & query_context) override; diff --git a/src/Databases/DatabaseOnDisk.cpp b/src/Databases/DatabaseOnDisk.cpp index 195f57d1bda..24bab42cad2 100644 --- a/src/Databases/DatabaseOnDisk.cpp +++ b/src/Databases/DatabaseOnDisk.cpp @@ -407,6 +407,8 @@ void DatabaseOnDisk::renameTable( from_ordinary_to_atomic = true; else if (typeid_cast(this) && typeid_cast(&to_database)) from_atomic_to_ordinary = true; + else if (dynamic_cast(this) && typeid_cast(&to_database) && getEngineName() == "Replicated") + from_atomic_to_ordinary = true; else throw Exception("Moving tables between databases of different engines is not supported", ErrorCodes::NOT_IMPLEMENTED); } @@ -418,6 +420,7 @@ void DatabaseOnDisk::renameTable( /// DatabaseLazy::detachTable may return nullptr even if table exists, so we need tryGetTable for this case. 
StoragePtr table = tryGetTable(table_name, global_context); detachTable(table_name); + UUID prev_uuid = UUIDHelpers::Nil; try { table_lock = table->lockExclusively(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout); @@ -430,7 +433,7 @@ void DatabaseOnDisk::renameTable( if (from_ordinary_to_atomic) create.uuid = UUIDHelpers::generateV4(); if (from_atomic_to_ordinary) - create.uuid = UUIDHelpers::Nil; + std::swap(create.uuid, prev_uuid); if (auto * target_db = dynamic_cast(&to_database)) target_db->checkMetadataFilenameAvailability(to_table_name); @@ -455,12 +458,16 @@ void DatabaseOnDisk::renameTable( Poco::File(table_metadata_path).remove(); - /// Special case: usually no actions with symlinks are required when detaching/attaching table, - /// but not when moving from Atomic database to Ordinary - if (from_atomic_to_ordinary && table->storesDataOnDisk()) + if (from_atomic_to_ordinary) { auto & atomic_db = assert_cast(*this); - atomic_db.tryRemoveSymlink(table_name); + /// Special case: usually no actions with symlinks are required when detaching/attaching table, + /// but not when moving from Atomic database to Ordinary + if (table->storesDataOnDisk()) + atomic_db.tryRemoveSymlink(table_name); + /// Forget about UUID, now it's possible to reuse it for new table + DatabaseCatalog::instance().removeUUIDMappingFinally(prev_uuid); + atomic_db.setDetachedTableNotInUseForce(prev_uuid); } } diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index b8ce48a4d5c..1756d33958d 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -39,7 +39,7 @@ namespace ErrorCodes } static constexpr const char * DROPPED_MARK = "DROPPED"; -static constexpr const char * BROKEN_TABLE_PREFIX = "_broken_"; +static constexpr const char * BROKEN_TABLES_SUFFIX = "_broken_tables"; zkutil::ZooKeeperPtr DatabaseReplicated::getZooKeeper() const @@ -312,7 +312,7 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep Strings tables_to_detach; size_t total_tables = 0; - auto existing_tables_it = getTablesIterator(global_context, [&](const String & name) { return !startsWith(name, BROKEN_TABLE_PREFIX); }); + auto existing_tables_it = getTablesIterator(global_context, {}); while (existing_tables_it->isValid()) { String name = existing_tables_it->name(); @@ -345,30 +345,64 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep existing_tables_it->next(); ++total_tables; } + existing_tables_it.reset(); + String db_name = getDatabaseName(); + String to_db_name = getDatabaseName() + BROKEN_TABLES_SUFFIX; if (total_tables < tables_to_detach.size() * 2) - throw Exception(ErrorCodes::DATABASE_REPLICATION_FAILED, "Too many tables to detach: {} of {}", tables_to_detach.size(), total_tables); + throw Exception(ErrorCodes::DATABASE_REPLICATION_FAILED, "Too many tables to recreate: {} of {}", tables_to_detach.size(), total_tables); else if (!tables_to_detach.empty()) - LOG_WARNING(log, "Will DETACH PERMANENTLY {} broken tables to recover replica", tables_to_detach.size()); + { + LOG_WARNING(log, "Will recreate {} broken tables to recover replica", tables_to_detach.size()); + /// It's too dangerous to automatically drop tables, so we will move them to special database. + /// We use Ordinary engine for destination database, because it's the only way to discard table UUID + /// and make possible creation of new table with the same UUID. 
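During replica recovery, each outdated local object is handled in one of three ways, as the branches just below show. A condensed sketch of that decision (a paraphrase of `recoverLostReplica`, not the actual implementation):

```cpp
#include <cstdio>

enum class RecoveryAction
{
    DropDictionary,        // DROP DICTIONARY: a fresh one is recreated from the metadata in ZooKeeper
    DropTable,             // the table stores no data on disk, so dropping it loses nothing
    MoveToBrokenTablesDB   // RENAME into <db>_broken_tables (Ordinary engine frees the UUID)
};

RecoveryAction classify(bool is_dictionary, bool stores_data_on_disk)
{
    if (is_dictionary)
        return RecoveryAction::DropDictionary;
    if (!stores_data_on_disk)
        return RecoveryAction::DropTable;
    return RecoveryAction::MoveToBrokenTablesDB;
}

int main()
{
    std::printf("%d %d %d\n",
                static_cast<int>(classify(true, false)),
                static_cast<int>(classify(false, false)),
                static_cast<int>(classify(false, true)));
}
```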
+ String query = fmt::format("CREATE DATABASE IF NOT EXISTS {} ENGINE=Ordinary", backQuoteIfNeed(to_db_name)); + Context query_context = global_context; + executeQuery(query, query_context, true); + } + size_t dropped_dicts = 0; + size_t moved_tables = 0; + std::vector dropped_tables; for (const auto & table_name : tables_to_detach) { - String to_name = fmt::format("{}_{}_{}_{}", BROKEN_TABLE_PREFIX, table_name, max_log_ptr, thread_local_rng() % 1000); - DDLGuardPtr table_guard = DatabaseCatalog::instance().getDDLGuard(getDatabaseName(), std::min(table_name, to_name)); - DDLGuardPtr to_table_guard = DatabaseCatalog::instance().getDDLGuard(getDatabaseName(), std::max(table_name, to_name)); + String to_name = fmt::format("{}_{}_{}", table_name, max_log_ptr, thread_local_rng() % 1000); + assert(db_name < to_db_name); + DDLGuardPtr table_guard = DatabaseCatalog::instance().getDDLGuard(db_name, table_name); + DDLGuardPtr to_table_guard = DatabaseCatalog::instance().getDDLGuard(to_db_name, to_name); + if (getDatabaseName() != db_name) + throw Exception(ErrorCodes::UNKNOWN_DATABASE, "Database was renamed, will retry"); if (isDictionaryExist(table_name)) { - /// TODO implement DETACH DICTIONARY PERMANENTLY + LOG_DEBUG(log, "Will DROP DICTIONARY {}", backQuoteIfNeed(table_name)); DatabaseAtomic::removeDictionary(global_context, table_name); + ++dropped_dicts; + } + else if (!tryGetTable(table_name, global_context)->storesDataOnDisk()) + { + LOG_DEBUG(log, "Will DROP TABLE {}, because it does not store data on disk and can be safely dropped", backQuoteIfNeed(table_name)); + dropped_tables.push_back(tryGetTableUUID(table_name)); + tryGetTable(table_name, global_context)->shutdown(); + DatabaseAtomic::dropTable(global_context, table_name, true); } else { - DatabaseAtomic::renameTable(global_context, table_name, *this, to_name, false, false); - DatabaseAtomic::detachTablePermanently(global_context, to_name); + LOG_DEBUG(log, "Will RENAME TABLE {} TO {}.{}", backQuoteIfNeed(table_name), backQuoteIfNeed(to_db_name), backQuoteIfNeed(to_name)); + auto to_db_ptr = DatabaseCatalog::instance().getDatabase(to_db_name); + DatabaseAtomic::renameTable(global_context, table_name, *to_db_ptr, to_name, false, false); + ++moved_tables; } } + if (!tables_to_detach.empty()) + LOG_WARNING(log, "Cleaned {} outdated objects: dropped {} dictionaries and {} tables, moved {} tables", + tables_to_detach.size(), dropped_dicts, dropped_tables.size(), moved_tables); + + for (const auto & id : dropped_tables) + DatabaseCatalog::instance().waitTableFinallyDropped(id); + for (const auto & name_and_meta : table_name_to_metadata) { if (isTableExist(name_and_meta.first, global_context)) @@ -535,8 +569,6 @@ void DatabaseReplicated::commitCreateTable(const ASTCreateQuery & query, const S const String & table_metadata_tmp_path, const String & table_metadata_path, const Context & query_context) { - if (startsWith(query.table, BROKEN_TABLE_PREFIX)) - throw Exception(ErrorCodes::INCORRECT_QUERY, "It's not allowed to attach broken tables"); auto txn = query_context.getMetadataTransaction(); assert(!ddl_worker->isCurrentlyActive() || txn); if (txn && txn->is_initial_query) diff --git a/src/Databases/DatabaseReplicatedWorker.cpp b/src/Databases/DatabaseReplicatedWorker.cpp index b29a8822c0c..5a350783dcb 100644 --- a/src/Databases/DatabaseReplicatedWorker.cpp +++ b/src/Databases/DatabaseReplicatedWorker.cpp @@ -81,7 +81,7 @@ String DatabaseReplicatedDDLWorker::enqueueQuery(DDLLogEntry & entry) return node_path; } -String 
DatabaseReplicatedDDLWorker::tryEnqueueAndExecuteEntry(DDLLogEntry & entry, const Context & query_context) +String DatabaseReplicatedDDLWorker::tryEnqueueAndExecuteEntry(DDLLogEntry & entry, const Context & /*query_context*/) { /// NOTE Possibly it would be better to execute initial query on the most up-to-date node, /// but it requires more complex logic around /try node. @@ -106,7 +106,7 @@ String DatabaseReplicatedDDLWorker::tryEnqueueAndExecuteEntry(DDLLogEntry & entr task->is_initial_query = true; LOG_DEBUG(log, "Waiting for worker thread to process all entries before {}", entry_name); - UInt64 timeout = query_context.getSettingsRef().distributed_ddl_task_timeout; + UInt64 timeout = 600; { std::unique_lock lock{mutex}; bool processed = wait_current_task_change.wait_for(lock, std::chrono::seconds(timeout), [&]() diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index 9a398df07b5..242ee7ea0e1 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -474,7 +474,7 @@ void DDLWorker::processTask(DDLTaskBase & task) auto create_active_res = zookeeper->tryCreate(active_node_path, {}, zkutil::CreateMode::Ephemeral); if (create_active_res != Coordination::Error::ZOK) { - if (create_active_res == Coordination::Error::ZNONODE) + if (create_active_res != Coordination::Error::ZNONODE) throw Coordination::Exception(create_active_res, active_node_path); createStatusDirs(task.entry_path, zookeeper); zookeeper->create(active_node_path, {}, zkutil::CreateMode::Ephemeral); diff --git a/tests/integration/test_replicated_database/test.py b/tests/integration/test_replicated_database/test.py index 04646507ed7..faeb436f279 100644 --- a/tests/integration/test_replicated_database/test.py +++ b/tests/integration/test_replicated_database/test.py @@ -3,7 +3,8 @@ import re import pytest from helpers.cluster import ClickHouseCluster -from helpers.test_tools import assert_eq_with_retry +from helpers.test_tools import assert_eq_with_retry, assert_logs_contain +from helpers.network import PartitionManager cluster = ClickHouseCluster(__file__) @@ -162,7 +163,7 @@ def test_alters_from_different_replicas(started_cluster): assert main_node.query("SELECT shard_num, replica_num, host_name FROM system.clusters WHERE cluster='testdb'") == expected # test_drop_and_create_replica - main_node.query("DROP DATABASE testdb") + main_node.query("DROP DATABASE testdb SYNC") main_node.query("CREATE DATABASE testdb ENGINE = Replicated('/clickhouse/databases/test1', 'shard1', 'replica1');") expected = "CREATE TABLE testdb.concurrent_test\\n(\\n `CounterID` UInt32,\\n `StartDate` Date,\\n `UserID` UInt32,\\n" \ @@ -183,3 +184,64 @@ def test_alters_from_different_replicas(started_cluster): assert_eq_with_retry(dummy_node, "SELECT CounterID, StartDate, UserID FROM testdb.dist ORDER BY CounterID", expected) +def test_recover_staled_replica(started_cluster): + main_node.query("CREATE DATABASE recover ENGINE = Replicated('/clickhouse/databases/recover', 'shard1', 'replica1');") + started_cluster.get_kazoo_client('zoo1').set('/clickhouse/databases/recover/logs_to_keep', b'10') + dummy_node.query("CREATE DATABASE recover ENGINE = Replicated('/clickhouse/databases/recover', 'shard1', 'replica2');") + + settings = {"distributed_ddl_task_timeout": 0} + main_node.query("CREATE TABLE recover.t1 (n int) ENGINE=Memory", settings=settings) + dummy_node.query("CREATE TABLE recover.t2 (s String) ENGINE=Memory", settings=settings) + main_node.query("CREATE TABLE recover.mt1 (n int) ENGINE=MergeTree 
order by n", settings=settings) + dummy_node.query("CREATE TABLE recover.mt2 (n int) ENGINE=MergeTree order by n", settings=settings) + main_node.query("CREATE TABLE recover.rmt1 (n int) ENGINE=ReplicatedMergeTree order by n", settings=settings) + dummy_node.query("CREATE TABLE recover.rmt2 (n int) ENGINE=ReplicatedMergeTree order by n", settings=settings) + main_node.query("CREATE DICTIONARY recover.d1 (n int DEFAULT 0, m int DEFAULT 1) PRIMARY KEY n SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'default' TABLE 'rmt1' PASSWORD '' DB 'recover')) LIFETIME(MIN 1 MAX 10) LAYOUT(FLAT())") + dummy_node.query("CREATE DICTIONARY recover.d2 (n int DEFAULT 0, m int DEFAULT 1) PRIMARY KEY n SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'default' TABLE 'rmt2' PASSWORD '' DB 'recover')) LIFETIME(MIN 1 MAX 10) LAYOUT(FLAT())") + + for table in ['t1', 't2', 'mt1', 'mt2', 'rmt1', 'rmt2']: + main_node.query("INSERT INTO recover.{} VALUES (42)".format(table)) + for table in ['t1', 't2', 'mt1', 'mt2']: + dummy_node.query("INSERT INTO recover.{} VALUES (42)".format(table)) + for table in ['rmt1', 'rmt2']: + main_node.query("SYSTEM SYNC REPLICA recover.{}".format(table)) + + with PartitionManager() as pm: + pm.drop_instance_zk_connections(dummy_node) + dummy_node.query_and_get_error("RENAME TABLE recover.t1 TO recover.m1") + main_node.query("RENAME TABLE recover.t1 TO recover.m1", settings=settings) + main_node.query("ALTER TABLE recover.mt1 ADD COLUMN m int", settings=settings) + main_node.query("ALTER TABLE recover.rmt1 ADD COLUMN m int", settings=settings) + main_node.query("DROP DICTIONARY recover.d2", settings=settings) + main_node.query("CREATE DICTIONARY recover.d2 (n int DEFAULT 0, m int DEFAULT 1) PRIMARY KEY n SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'default' TABLE 'rmt1' PASSWORD '' DB 'recover')) LIFETIME(MIN 1 MAX 10) LAYOUT(FLAT());", settings=settings) + + main_node.query("CREATE TABLE recover.tmp AS recover.m1", settings=settings) + main_node.query("DROP TABLE recover.tmp", settings=settings) + main_node.query("CREATE TABLE recover.tmp AS recover.m1", settings=settings) + main_node.query("DROP TABLE recover.tmp", settings=settings) + main_node.query("CREATE TABLE recover.tmp AS recover.m1", settings=settings) + main_node.query("DROP TABLE recover.tmp", settings=settings) + main_node.query("CREATE TABLE recover.tmp AS recover.m1", settings=settings) + + assert main_node.query("SELECT name FROM system.tables WHERE database='recover' ORDER BY name") == "d1\nd2\nm1\nmt1\nmt2\nrmt1\nrmt2\nt2\ntmp\n" + query = "SELECT name, uuid, create_table_query FROM system.tables WHERE database='recover' ORDER BY name" + expected = main_node.query(query) + assert_eq_with_retry(dummy_node, query, expected) + + for table in ['m1', 't2', 'mt1', 'mt2', 'rmt1', 'rmt2', 'd1', 'd2']: + assert main_node.query("SELECT (*,).1 FROM recover.{}".format(table)) == "42\n" + for table in ['t2', 'rmt1', 'rmt2', 'd1', 'd2', 'mt2']: + assert dummy_node.query("SELECT (*,).1 FROM recover.{}".format(table)) == "42\n" + for table in ['m1', 'mt1']: + assert dummy_node.query("SELECT count() FROM recover.{}".format(table)) == "0\n" + + assert dummy_node.query("SELECT count() FROM system.tables WHERE database='recover_broken_tables'") == "1\n" + table = dummy_node.query("SHOW TABLES FROM recover_broken_tables").strip() + assert "mt1_22_" in table + assert dummy_node.query("SELECT (*,).1 FROM recover_broken_tables.{}".format(table)) == "42\n" + + expected = "Cleaned 3 outdated objects: dropped 1 dictionaries and 1 tables, 
moved 1 tables" + assert_logs_contain(dummy_node, expected) + + dummy_node.query("DROP TABLE recover.tmp") + From ed7270dd8bf96e2d67a766f0833d275978791838 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 12 Feb 2021 03:25:00 +0300 Subject: [PATCH 0253/2357] Better interface --- src/Columns/ColumnCompressed.cpp | 61 ++++++++++++++++ src/Columns/ColumnCompressed.h | 120 +++++++++++++++++++++++++++++++ src/Columns/ColumnVector.cpp | 54 ++++---------- src/Columns/ColumnVector.h | 2 +- src/Columns/IColumn.h | 4 +- src/Storages/StorageMemory.cpp | 118 ++++++++++++++---------------- src/Storages/StorageMemory.h | 7 +- 7 files changed, 251 insertions(+), 115 deletions(-) create mode 100644 src/Columns/ColumnCompressed.cpp create mode 100644 src/Columns/ColumnCompressed.h diff --git a/src/Columns/ColumnCompressed.cpp b/src/Columns/ColumnCompressed.cpp new file mode 100644 index 00000000000..d7d30745868 --- /dev/null +++ b/src/Columns/ColumnCompressed.cpp @@ -0,0 +1,61 @@ +#include + +#pragma GCC diagnostic ignored "-Wold-style-cast" + +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int CANNOT_COMPRESS; + extern const int CANNOT_DECOMPRESS; +} + + +std::shared_ptr> ColumnCompressed::compressBuffer(const void * data, size_t data_size) +{ + size_t max_dest_size = LZ4_COMPRESSBOUND(data_size); + + if (max_dest_size > std::numeric_limits::max()) + throw Exception(ErrorCodes::CANNOT_COMPRESS, "Cannot compress column of size {}", formatReadableSizeWithBinarySuffix(data_size)); + + Memory<> compressed(max_dest_size); + + auto compressed_size = LZ4_compress_default( + reinterpret_cast(data), + compressed.data(), + data_size, + max_dest_size); + + if (compressed_size <= 0) + throw Exception(ErrorCodes::CANNOT_COMPRESS, "Cannot compress column"); + + /// If compression is inefficient. + if (static_cast(compressed_size) * 2 > data_size) + return {}; + + /// Shrink to fit. + auto shrank = std::make_shared>(compressed_size); + memcpy(shrank->data(), compressed.data(), compressed_size); + + return shrank; +} + + +void ColumnCompressed::decompressBuffer( + const void * compressed_data, void * decompressed_data, size_t compressed_size, size_t decompressed_size) +{ + auto processed_size = LZ4_decompress_safe( + reinterpret_cast(compressed_data), + reinterpret_cast(decompressed_data), + compressed_size, + decompressed_size); + + if (processed_size <= 0) + throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress column"); +} + +} diff --git a/src/Columns/ColumnCompressed.h b/src/Columns/ColumnCompressed.h new file mode 100644 index 00000000000..bd70005ac5d --- /dev/null +++ b/src/Columns/ColumnCompressed.h @@ -0,0 +1,120 @@ +#pragma once + +#include +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + + +/** Wrapper for compressed column data. + * The only supported operations are: + * - decompress (reconstruct the source column) + * - get size in rows or bytes. + * + * It is needed to implement in-memory compression + * - to keep compressed data in Block or pass around. + * + * It's often beneficial to store compressed data in-memory and decompress on the fly + * because it allows to lower memory throughput. More specifically, if: + * + * decompression speed * num CPU cores >= memory read throughput + * + * Also in-memory compression allows to keep more data in RAM. 
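The comment above gives the rule of thumb for when keeping blocks compressed in memory pays off. A quick back-of-envelope check of that inequality with assumed, order-of-magnitude figures (not measurements):

```cpp
#include <cstdio>

int main()
{
    // Illustrative numbers only; real values depend on hardware and data.
    const double decompress_gb_per_s_per_core = 3.0;   // LZ4-class decompression speed
    const double num_cores = 16.0;
    const double memory_read_gb_per_s = 30.0;

    const double aggregate = decompress_gb_per_s_per_core * num_cores;   // 48 GB/s
    std::printf("decompression %.0f GB/s vs memory read %.0f GB/s -> %s\n",
                aggregate, memory_read_gb_per_s,
                aggregate >= memory_read_gb_per_s ? "compression pays off" : "keep data uncompressed");
}
```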
+ */ +class ColumnCompressed : public COWHelper +{ +public: + using Lazy = std::function; + + ColumnCompressed(size_t rows_, size_t bytes_, Lazy lazy_) + : rows(rows_), bytes(bytes_), lazy(lazy_) + { + } + + const char * getFamilyName() const override { return "Compressed"; } + + size_t size() const override { return rows; } + size_t byteSize() const override { return bytes; } + size_t allocatedBytes() const override { return bytes; } + + ColumnPtr decompress() const override + { + return lazy(); + } + + /** Wrap uncompressed column without compression. + * Method can be used when compression is not worth doing. + * But returning CompressedColumn is still needed to keep uniform block structure. + */ + static ColumnPtr wrap(ColumnPtr column) + { + return ColumnCompressed::create( + column->size(), + column->allocatedBytes(), + [column = std::move(column)]{ return column; }); + } + + /// Helper methods for compression. + + /// If data is not worth to be compressed - returns nullptr. Note: shared_ptr is to allow to be captured by std::function. + static std::shared_ptr> compressBuffer(const void * data, size_t data_size); + + static void decompressBuffer( + const void * compressed_data, void * decompressed_data, size_t compressed_size, size_t decompressed_size); + + /// All other methods throw exception. + + TypeIndex getDataType() const override { throwMustBeDecompressed(); } + Field operator[](size_t) const override { throwMustBeDecompressed(); } + void get(size_t, Field &) const override { throwMustBeDecompressed(); } + StringRef getDataAt(size_t) const override { throwMustBeDecompressed(); } + void insert(const Field &) override { throwMustBeDecompressed(); } + void insertRangeFrom(const IColumn &, size_t, size_t) override { throwMustBeDecompressed(); } + void insertData(const char *, size_t) override { throwMustBeDecompressed(); } + void insertDefault() override { throwMustBeDecompressed(); } + void popBack(size_t) override { throwMustBeDecompressed(); } + StringRef serializeValueIntoArena(size_t, Arena &, char const *&) const override { throwMustBeDecompressed(); } + const char * deserializeAndInsertFromArena(const char *) override { throwMustBeDecompressed(); } + void updateHashWithValue(size_t, SipHash &) const override { throwMustBeDecompressed(); } + void updateWeakHash32(WeakHash32 &) const override { throwMustBeDecompressed(); } + void updateHashFast(SipHash &) const override { throwMustBeDecompressed(); } + ColumnPtr filter(const Filter &, ssize_t) const override { throwMustBeDecompressed(); } + ColumnPtr permute(const Permutation &, size_t) const override { throwMustBeDecompressed(); } + ColumnPtr index(const IColumn &, size_t) const override { throwMustBeDecompressed(); } + int compareAt(size_t, size_t, const IColumn &, int) const override { throwMustBeDecompressed(); } + void compareColumn(const IColumn &, size_t, PaddedPODArray *, PaddedPODArray &, int, int) const override + { + throwMustBeDecompressed(); + } + void getPermutation(bool, size_t, int, Permutation &) const override { throwMustBeDecompressed(); } + void updatePermutation(bool, size_t, int, Permutation &, EqualRanges &) const override { throwMustBeDecompressed(); } + ColumnPtr replicate(const Offsets &) const override { throwMustBeDecompressed(); } + MutableColumns scatter(ColumnIndex, const Selector &) const override { throwMustBeDecompressed(); } + void gather(ColumnGathererStream &) override { throwMustBeDecompressed(); } + void getExtremes(Field &, Field &) const override { throwMustBeDecompressed(); } + 
size_t byteSizeAt(size_t) const override { throwMustBeDecompressed(); } + +protected: + size_t rows; + size_t bytes; + + Lazy lazy; + +private: + [[noreturn]] void throwMustBeDecompressed() const + { + throw Exception("ColumnCompressed must be decompressed before use", ErrorCodes::LOGICAL_ERROR); + } +}; + +} + diff --git a/src/Columns/ColumnVector.cpp b/src/Columns/ColumnVector.cpp index 32658eb3e34..324b23eabcc 100644 --- a/src/Columns/ColumnVector.cpp +++ b/src/Columns/ColumnVector.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -16,9 +17,6 @@ #include #include #include -#include -#include -#include #include #include @@ -529,51 +527,27 @@ void ColumnVector::getExtremes(Field & min, Field & max) const #pragma GCC diagnostic ignored "-Wold-style-cast" template -LazyColumn ColumnVector::compress() const +ColumnPtr ColumnVector::compress() const { size_t source_size = data.size() * sizeof(T); /// Don't compress small blocks. if (source_size < 4096) /// A wild guess. - return IColumn::compress(); + return ColumnCompressed::wrap(this->getPtr()); - size_t max_dest_size = LZ4_COMPRESSBOUND(source_size); + auto compressed = ColumnCompressed::compressBuffer(data.data(), source_size); - if (max_dest_size > std::numeric_limits::max()) - throw Exception(ErrorCodes::CANNOT_COMPRESS, "Cannot compress column of size {}", formatReadableSizeWithBinarySuffix(source_size)); + if (!compressed) + return ColumnCompressed::wrap(this->getPtr()); - auto compressed = std::make_shared>(max_dest_size); - - auto compressed_size = LZ4_compress_default( - reinterpret_cast(data.data()), - compressed->data(), - source_size, - max_dest_size); - - if (compressed_size <= 0) - throw Exception(ErrorCodes::CANNOT_COMPRESS, "Cannot compress column"); - - /// If compression is inefficient. - if (static_cast(compressed_size) * 2 > source_size) - return IColumn::compress(); - - /// Shrink to fit. - auto shrank = std::make_shared>(compressed_size); - memcpy(shrank->data(), compressed->data(), compressed_size); - - return [compressed = std::move(shrank), column_size = data.size()] - { - auto res = ColumnVector::create(column_size); - auto processed_size = LZ4_decompress_fast( - compressed->data(), - reinterpret_cast(res->getData().data()), - column_size * sizeof(T)); - - if (processed_size <= 0) - throw Exception(ErrorCodes::CANNOT_DECOMPRESS, "Cannot decompress column"); - - return res; - }; + return ColumnCompressed::create(data.size(), compressed->size(), + [compressed = std::move(compressed), column_size = data.size()] + { + auto res = ColumnVector::create(column_size); + ColumnCompressed::decompressBuffer( + compressed->data(), res->getData().data(), compressed->size(), column_size * sizeof(T)); + return res; + }); } /// Explicit template instantiations - to avoid code bloat in headers. diff --git a/src/Columns/ColumnVector.h b/src/Columns/ColumnVector.h index 4f1cbcafcbc..623a828a110 100644 --- a/src/Columns/ColumnVector.h +++ b/src/Columns/ColumnVector.h @@ -298,7 +298,7 @@ public: return typeid(rhs) == typeid(ColumnVector); } - LazyColumn compress() const override; + ColumnPtr compress() const override; /// Replace elements that match the filter with zeroes. If inverted replaces not matched elements. 
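`ColumnVector::compress` above skips small blocks and falls back to wrapping the column when LZ4 does not shrink it enough. A self-contained sketch of the same heuristic on a raw buffer (assumes liblz4 is available; the 4096-byte and 2x thresholds are copied from the hunk above and are heuristics, not hard rules):

```cpp
#include <lz4.h>

#include <cstdio>
#include <optional>
#include <vector>

// Returns the compressed bytes, or std::nullopt when compression is not worth doing
// (the caller would then keep/wrap the original, uncompressed data).
std::optional<std::vector<char>> compressOrSkip(const char * data, size_t size)
{
    if (size < 4096)                                   // don't bother with small blocks
        return std::nullopt;

    std::vector<char> compressed(LZ4_compressBound(static_cast<int>(size)));
    const int written = LZ4_compress_default(
        data, compressed.data(), static_cast<int>(size), static_cast<int>(compressed.size()));

    if (written <= 0 || static_cast<size_t>(written) * 2 > size)   // compression is inefficient
        return std::nullopt;

    compressed.resize(written);                        // shrink to fit
    return compressed;
}

int main()
{
    const std::vector<char> source(1 << 16, 'x');      // highly compressible sample
    if (auto packed = compressOrSkip(source.data(), source.size()))
    {
        std::vector<char> restored(source.size());
        const int n = LZ4_decompress_safe(
            packed->data(), restored.data(),
            static_cast<int>(packed->size()), static_cast<int>(restored.size()));
        std::printf("compressed %zu -> %zu bytes, round-trip ok: %d\n",
                    source.size(), packed->size(), n == static_cast<int>(source.size()));
    }
}
```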
void applyZeroMap(const IColumn::Filter & filt, bool inverted = false); diff --git a/src/Columns/IColumn.h b/src/Columns/IColumn.h index 7697bd116bf..2b4b633f9a5 100644 --- a/src/Columns/IColumn.h +++ b/src/Columns/IColumn.h @@ -359,7 +359,7 @@ public: /// Compress column in memory to some representation that allows to decompress it back. /// Return itself if compression is not applicable for this column type. - virtual ColumnPtr compress() const + virtual Ptr compress() const { /// No compression by default. return getPtr(); @@ -367,7 +367,7 @@ public: /// If it's CompressedColumn, decompress it and return. /// Otherwise return itself. - virtual ColumnPtr decompress() const + virtual Ptr decompress() const { return getPtr(); } diff --git a/src/Storages/StorageMemory.cpp b/src/Storages/StorageMemory.cpp index 20c8a44efd4..01f70db5edd 100644 --- a/src/Storages/StorageMemory.cpp +++ b/src/Storages/StorageMemory.cpp @@ -24,7 +24,7 @@ namespace ErrorCodes class MemorySource : public SourceWithProgress { - using InitializerFunc = std::function &)>; + using InitializerFunc = std::function &)>; public: /// Blocks are stored in std::list which may be appended in another thread. /// We use pointer to the beginning of the list and its current size. @@ -35,7 +35,7 @@ public: Names column_names_, const StorageMemory & storage, const StorageMetadataPtr & metadata_snapshot, - std::shared_ptr data_, + std::shared_ptr data_, std::shared_ptr> parallel_execution_index_, InitializerFunc initializer_func_ = {}) : SourceWithProgress(metadata_snapshot->getSampleBlockForColumns(column_names_, storage.getVirtuals(), storage.getStorageID())) @@ -44,8 +44,6 @@ public: , parallel_execution_index(parallel_execution_index_) , initializer_func(std::move(initializer_func_)) { - for (const auto & elem : column_names_and_types) - column_positions.push_back(metadata_snapshot->getSampleBlock().getPositionByName(elem.getNameInStorage())); } String getName() const override { return "Memory"; } @@ -66,25 +64,23 @@ protected: return {}; } - const LazyBlock & src = (*data)[current_index]; + const Block & src = (*data)[current_index]; Columns columns; columns.reserve(columns.size()); /// Add only required columns to `res`. 
- size_t i = 0; for (const auto & elem : column_names_and_types) { - auto current_column = src[column_positions[i]](); + auto current_column = src.getByName(elem.getNameInStorage()).column; + current_column = current_column->decompress(); + if (elem.isSubcolumn()) columns.emplace_back(elem.getTypeInStorage()->getSubcolumn(elem.getSubcolumnName(), *current_column)); else columns.emplace_back(std::move(current_column)); - - ++i; } - size_t rows = columns.at(0)->size(); - return Chunk(std::move(columns), rows); + return Chunk(std::move(columns), src.rows()); } private: @@ -102,10 +98,9 @@ private: const NamesAndTypesList column_names_and_types; size_t execution_index = 0; - std::shared_ptr data; + std::shared_ptr data; std::shared_ptr> parallel_execution_index; InitializerFunc initializer_func; - std::vector column_positions; }; @@ -126,31 +121,34 @@ public: { metadata_snapshot->check(block, true); - inserted_bytes += block.allocatedBytes(); - inserted_rows += block.rows(); - - Block sample = metadata_snapshot->getSampleBlock(); - - LazyColumns lazy_columns; - lazy_columns.reserve(sample.columns()); - - for (const auto & elem : sample) + if (storage.compress) { - const ColumnPtr & column = block.getByName(elem.name).column; + Block compressed_block; + for (auto & elem : block) + compressed_block.insert({ elem.column->compress(), elem.type, elem.name }); - if (storage.compress) - lazy_columns.emplace_back(column->compress()); - else - lazy_columns.emplace_back([=]{ return column; }); + new_blocks.emplace_back(compressed_block); + } + else + { + new_blocks.emplace_back(block); } - - new_blocks.emplace_back(std::move(lazy_columns)); } void writeSuffix() override { + size_t inserted_bytes = 0; + size_t inserted_rows = 0; + + for (const auto & block : new_blocks) + { + inserted_bytes += block.allocatedBytes(); + inserted_rows += block.rows(); + } + std::lock_guard lock(storage.mutex); - auto new_data = std::make_unique(*(storage.data.get())); + + auto new_data = std::make_unique(*(storage.data.get())); new_data->insert(new_data->end(), new_blocks.begin(), new_blocks.end()); storage.data.set(std::move(new_data)); @@ -159,9 +157,7 @@ public: } private: - LazyBlocks new_blocks; - size_t inserted_bytes = 0; - size_t inserted_rows = 0; + Blocks new_blocks; StorageMemory & storage; StorageMetadataPtr metadata_snapshot; @@ -173,7 +169,7 @@ StorageMemory::StorageMemory( ColumnsDescription columns_description_, ConstraintsDescription constraints_, bool compress_) - : IStorage(table_id_), data(std::make_unique()), compress(compress_) + : IStorage(table_id_), data(std::make_unique()), compress(compress_) { StorageInMemoryMetadata storage_metadata; storage_metadata.setColumns(std::move(columns_description_)); @@ -209,7 +205,7 @@ Pipe StorageMemory::read( metadata_snapshot, nullptr /* data */, nullptr /* parallel execution index */, - [this](std::shared_ptr & data_to_initialize) + [this](std::shared_ptr & data_to_initialize) { data_to_initialize = data.get(); })); @@ -242,18 +238,18 @@ BlockOutputStreamPtr StorageMemory::write(const ASTPtr & /*query*/, const Storag void StorageMemory::drop() { - data.set(std::make_unique()); + data.set(std::make_unique()); total_size_bytes.store(0, std::memory_order_relaxed); total_size_rows.store(0, std::memory_order_relaxed); } -static inline void updateBlockData(LazyBlock & old_block, const LazyBlock & new_block, const Block & old_header, const Block & new_header) +static inline void updateBlockData(Block & old_block, const Block & new_block) { - size_t i = 0; - for (const 
auto & it : new_header) + for (const auto & it : new_block) { - old_block[old_header.getPositionByName(it.name)] = new_block[i]; - ++i; + auto col_name = it.name; + auto & col_with_type_name = old_block.getByName(col_name); + col_with_type_name.column = it.column; } } @@ -265,47 +261,39 @@ void StorageMemory::mutate(const MutationCommands & commands, const Context & co auto storage_ptr = DatabaseCatalog::instance().getTable(storage, context); auto interpreter = std::make_unique(storage_ptr, metadata_snapshot, commands, context, true); auto in = interpreter->execute(); - Block old_header = metadata_snapshot->getSampleBlock(); - Block mutation_header = in->getHeader(); in->readPrefix(); - LazyBlocks out; + Blocks out; while (Block block = in->read()) { - LazyColumns lazy_columns; + if (compress) + for (auto & elem : block) + elem.column = elem.column->compress(); - for (const auto & elem : block) - { - if (compress) - lazy_columns.emplace_back(elem.column->compress()); - else - lazy_columns.emplace_back([=]{ return elem.column; }); - } - - out.emplace_back(std::move(lazy_columns)); + out.push_back(block); } in->readSuffix(); - std::unique_ptr new_data; + std::unique_ptr new_data; - /// All columns affected. + // all column affected if (interpreter->isAffectingAllColumns()) { - new_data = std::make_unique(out); + new_data = std::make_unique(out); } else { - /// Just some of the columns affected, we need update it with new column. - new_data = std::make_unique(*(data.get())); + /// just some of the column affected, we need update it with new column + new_data = std::make_unique(*(data.get())); auto data_it = new_data->begin(); auto out_it = out.begin(); while (data_it != new_data->end()) { - /// Mutation does not change the number of blocks. + /// Mutation does not change the number of blocks assert(out_it != out.end()); - updateBlockData(*data_it, *out_it, old_header, mutation_header); + updateBlockData(*data_it, *out_it); ++data_it; ++out_it; } @@ -313,7 +301,7 @@ void StorageMemory::mutate(const MutationCommands & commands, const Context & co assert(out_it == out.end()); } -/* size_t rows = 0; + size_t rows = 0; size_t bytes = 0; for (const auto & buffer : *new_data) { @@ -321,8 +309,7 @@ void StorageMemory::mutate(const MutationCommands & commands, const Context & co bytes += buffer.bytes(); } total_size_bytes.store(rows, std::memory_order_relaxed); - total_size_rows.store(bytes, std::memory_order_relaxed);*/ - + total_size_rows.store(bytes, std::memory_order_relaxed); data.set(std::move(new_data)); } @@ -330,7 +317,7 @@ void StorageMemory::mutate(const MutationCommands & commands, const Context & co void StorageMemory::truncate( const ASTPtr &, const StorageMetadataPtr &, const Context &, TableExclusiveLockHolder &) { - data.set(std::make_unique()); + data.set(std::make_unique()); total_size_bytes.store(0, std::memory_order_relaxed); total_size_rows.store(0, std::memory_order_relaxed); } @@ -364,7 +351,6 @@ void registerStorageMemory(StorageFactory & factory) return StorageMemory::create(args.table_id, args.columns, args.constraints, settings.compress); }, { - .supports_settings = true, .supports_parallel_insert = true, }); } diff --git a/src/Storages/StorageMemory.h b/src/Storages/StorageMemory.h index 97ddfa93d9a..91cf616c57d 100644 --- a/src/Storages/StorageMemory.h +++ b/src/Storages/StorageMemory.h @@ -15,11 +15,6 @@ namespace DB { -/// Lazy block contains possibly compressed columns. LazyColumn is std::function that reconstructs Column on call. 
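`StorageMemory` keeps its list of blocks behind `MultiVersion`, so readers take an immutable snapshot while `writeSuffix` copies the list, appends, and publishes the new version. A minimal sketch of that copy-on-write pattern with standard-library types (an imitation of the idea, not of the `MultiVersion` class):

```cpp
#include <cstdio>
#include <memory>
#include <mutex>
#include <vector>

using Block = std::vector<int>;    // stand-in for DB::Block
using Blocks = std::vector<Block>;

class CopyOnWriteBlocks
{
public:
    // Readers take a snapshot once and then work on it without any locking.
    std::shared_ptr<const Blocks> snapshot() const
    {
        std::lock_guard<std::mutex> lock(mutex);
        return current;
    }

    void append(const Blocks & new_blocks)
    {
        std::lock_guard<std::mutex> lock(mutex);
        auto next = std::make_shared<Blocks>(*current);                     // copy the previous version
        next->insert(next->end(), new_blocks.begin(), new_blocks.end());
        current = std::move(next);                                          // publish; old snapshots stay valid
    }

private:
    mutable std::mutex mutex;
    std::shared_ptr<const Blocks> current = std::make_shared<Blocks>();
};

int main()
{
    CopyOnWriteBlocks storage;
    const auto before = storage.snapshot();
    storage.append({{1, 2, 3}});
    std::printf("blocks seen by old reader: %zu, by new reader: %zu\n",
                before->size(), storage.snapshot()->size());
}
```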
-using LazyBlock = LazyColumns; -using LazyBlocks = std::vector; - - /** Implements storage in the RAM. * Suitable for temporary data. * It does not support keys. @@ -101,7 +96,7 @@ public: private: /// MultiVersion data storage, so that we can copy the list of blocks to readers. - MultiVersion data; + MultiVersion data; mutable std::mutex mutex; From 71d84b9f67381f3456609caf548a928c5c88cbda Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 12 Feb 2021 03:52:53 +0300 Subject: [PATCH 0254/2357] Fix style --- src/Columns/ColumnVector.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/Columns/ColumnVector.cpp b/src/Columns/ColumnVector.cpp index 324b23eabcc..1374b049ccf 100644 --- a/src/Columns/ColumnVector.cpp +++ b/src/Columns/ColumnVector.cpp @@ -33,8 +33,6 @@ namespace ErrorCodes extern const int PARAMETER_OUT_OF_BOUND; extern const int SIZES_OF_COLUMNS_DOESNT_MATCH; extern const int LOGICAL_ERROR; - extern const int CANNOT_COMPRESS; - extern const int CANNOT_DECOMPRESS; } template From 170daa5d6514a2a8c78f408ae40c62edc08a15c8 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 12 Feb 2021 05:33:39 +0300 Subject: [PATCH 0255/2357] Generate ya.make --- src/Columns/ya.make | 1 + src/Storages/ya.make | 1 + 2 files changed, 2 insertions(+) diff --git a/src/Columns/ya.make b/src/Columns/ya.make index 2affaeb0fc6..def9dfd4cb7 100644 --- a/src/Columns/ya.make +++ b/src/Columns/ya.make @@ -19,6 +19,7 @@ SRCS( Collator.cpp ColumnAggregateFunction.cpp ColumnArray.cpp + ColumnCompressed.cpp ColumnConst.cpp ColumnDecimal.cpp ColumnFixedString.cpp diff --git a/src/Storages/ya.make b/src/Storages/ya.make index dbf37e58695..e3e1807c566 100644 --- a/src/Storages/ya.make +++ b/src/Storages/ya.make @@ -24,6 +24,7 @@ SRCS( KeyDescription.cpp LiveView/StorageLiveView.cpp LiveView/TemporaryLiveViewCleaner.cpp + MemorySettings.cpp MergeTree/ActiveDataPartSet.cpp MergeTree/AllMergeSelector.cpp MergeTree/BackgroundJobsExecutor.cpp From 7e75965af887d7a7d68699b7bac5e0401cbf02c7 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Fri, 12 Feb 2021 12:35:26 +0300 Subject: [PATCH 0256/2357] Fix ActionsDAG::splitActionsForFilter --- src/Interpreters/ActionsDAG.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Interpreters/ActionsDAG.cpp b/src/Interpreters/ActionsDAG.cpp index 78254e5139a..6a7dbc47230 100644 --- a/src/Interpreters/ActionsDAG.cpp +++ b/src/Interpreters/ActionsDAG.cpp @@ -1168,7 +1168,7 @@ ActionsDAGPtr ActionsDAG::splitActionsForFilter(const std::string & filter_name, { auto & inputs_list = inputs_map[name]; if (inputs_list.empty()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot find input {} in ActionsDAG. DAG:\n{}", name, dumpDAG()); + continue; allowed_nodes.emplace(inputs_list.front()); inputs_list.pop_front(); From 443a3e7e6fd2452bf3efa8e4ab2a349feaf3b29f Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Fri, 12 Feb 2021 13:12:31 +0300 Subject: [PATCH 0257/2357] Fix limit push down. 
--- src/Processors/QueryPlan/Optimizations/Optimizations.h | 4 ++-- src/Processors/QueryPlan/Optimizations/filterPushDown.cpp | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Processors/QueryPlan/Optimizations/Optimizations.h b/src/Processors/QueryPlan/Optimizations/Optimizations.h index be7f81e5db0..a5c3af488a9 100644 --- a/src/Processors/QueryPlan/Optimizations/Optimizations.h +++ b/src/Processors/QueryPlan/Optimizations/Optimizations.h @@ -40,7 +40,7 @@ size_t tryMergeExpressions(QueryPlan::Node * parent_node, QueryPlan::Nodes &); /// Move FilterStep down if possible. /// May split FilterStep and push down only part of it. -size_t tryPushDownLimit(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes); +size_t tryPushDownFilter(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes); inline const auto & getOptimizations() { @@ -50,7 +50,7 @@ inline const auto & getOptimizations() {tryPushDownLimit, "pushDownLimit"}, {trySplitFilter, "splitFilter"}, {tryMergeExpressions, "mergeExpressions"}, - {tryPushDownLimit, "pushDownFilter"}, + {tryPushDownFilter, "pushDownFilter"}, }}; return optimizations; diff --git a/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp b/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp index a5f1d37e2f2..ac95d69d237 100644 --- a/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp +++ b/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp @@ -42,11 +42,11 @@ size_t tryPushDownLimit(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes) for (auto pos : params.keys) keys.push_back(params.src_header.getByPosition(pos).name); - // std::cerr << "Filter: \n" << expression->dumpDAG() << std::endl; + std::cerr << "Filter: \n" << expression->dumpDAG() << std::endl; if (auto split_filter = expression->splitActionsForFilter(filter_column_name, removes_filter, keys)) { - // std::cerr << "===============\n" << expression->dumpDAG() << std::endl; - // std::cerr << "---------------\n" << split_filter->dumpDAG() << std::endl; + std::cerr << "===============\n" << expression->dumpDAG() << std::endl; + std::cerr << "---------------\n" << split_filter->dumpDAG() << std::endl; auto it = expression->getIndex().find(filter_column_name); if (it == expression->getIndex().end()) From 93e1428f2119ecc5b3979ff5bff0d0304327579c Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Fri, 12 Feb 2021 13:51:16 +0300 Subject: [PATCH 0258/2357] Fix limit push down. 
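The `pushDownFilter` pass exercised below splits a WHERE conjunction so that the part referring only to GROUP BY keys is evaluated before the aggregation. A toy model of that split over named predicate terms (a deliberately simplified stand-in for `ActionsDAG::splitActionsForFilter`):

```cpp
#include <algorithm>
#include <cstdio>
#include <set>
#include <string>
#include <utility>
#include <vector>

struct Term
{
    std::string text;                       // e.g. "y != 0"
    std::set<std::string> columns;          // columns the term reads
};

// Partition a conjunction: terms that only read aggregation keys can be
// evaluated before GROUP BY, the rest must stay above it.
std::pair<std::vector<Term>, std::vector<Term>>
splitForPushDown(const std::vector<Term> & conjunction, const std::set<std::string> & keys)
{
    std::vector<Term> below, above;
    for (const auto & term : conjunction)
    {
        const bool only_keys = std::includes(keys.begin(), keys.end(),
                                             term.columns.begin(), term.columns.end());
        (only_keys ? below : above).push_back(term);
    }
    return {below, above};
}

int main()
{
    // Mirrors the tests: ... GROUP BY y ... WHERE y != 0 AND s != 4
    auto [below, above] = splitForPushDown(
        {{"y != 0", {"y"}}, {"s != 4", {"s"}}}, /* keys = */ {"y"});
    std::printf("pushed below aggregation: %s; kept above: %s\n",
                below.at(0).text.c_str(), above.at(0).text.c_str());
}
```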
--- src/Processors/QueryPlan/Optimizations/filterPushDown.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp b/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp index ac95d69d237..ec005e59729 100644 --- a/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp +++ b/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp @@ -15,7 +15,7 @@ namespace DB::ErrorCodes namespace DB::QueryPlanOptimizations { -size_t tryPushDownLimit(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes) +size_t tryPushDownFilter(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes) { if (parent_node->children.size() != 1) return 0; @@ -42,11 +42,11 @@ size_t tryPushDownLimit(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes) for (auto pos : params.keys) keys.push_back(params.src_header.getByPosition(pos).name); - std::cerr << "Filter: \n" << expression->dumpDAG() << std::endl; + // std::cerr << "Filter: \n" << expression->dumpDAG() << std::endl; if (auto split_filter = expression->splitActionsForFilter(filter_column_name, removes_filter, keys)) { - std::cerr << "===============\n" << expression->dumpDAG() << std::endl; - std::cerr << "---------------\n" << split_filter->dumpDAG() << std::endl; + // std::cerr << "===============\n" << expression->dumpDAG() << std::endl; + // std::cerr << "---------------\n" << split_filter->dumpDAG() << std::endl; auto it = expression->getIndex().find(filter_column_name); if (it == expression->getIndex().end()) From 683d793cc289ec12b8885efe1405b79a22350a36 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Fri, 12 Feb 2021 14:31:14 +0300 Subject: [PATCH 0259/2357] Update test. --- .../01655_plan_optimizations.reference | 33 +++++++++++- .../0_stateless/01655_plan_optimizations.sh | 51 ++++++++++++++++++- 2 files changed, 80 insertions(+), 4 deletions(-) diff --git a/tests/queries/0_stateless/01655_plan_optimizations.reference b/tests/queries/0_stateless/01655_plan_optimizations.reference index fda40305f9d..510224146ed 100644 --- a/tests/queries/0_stateless/01655_plan_optimizations.reference +++ b/tests/queries/0_stateless/01655_plan_optimizations.reference @@ -1,7 +1,7 @@ -sipHash should be calculated after filtration +> sipHash should be calculated after filtration FUNCTION sipHash64 Filter column: equals -sorting steps should know about limit +> sorting steps should know about limit Limit 10 MergingSorted Limit 10 @@ -9,3 +9,32 @@ MergeSorting Limit 10 PartialSorting Limit 10 +-- filter push down -- +> filter should be pushed down after aggregating +Aggregating +Filter +> filter should be pushed down after aggregating, column after aggregation is const +COLUMN Const(UInt8) -> notEquals(y, 0) +Aggregating +Filter +Filter +> one condition of filter should be pushed down after aggregating, other condition is aliased +Filter column +ALIAS notEquals(s, 4) :: 1 -> and(notEquals(y, 0), notEquals(s, 4)) +Aggregating +Filter column: notEquals(y, 0) +> one condition of filter should be pushed down after aggregating, other condition is casted +Filter column +FUNCTION CAST(minus(s, 4) :: 1, UInt8 :: 3) -> and(notEquals(y, 0), minus(s, 4)) +Aggregating +Filter column: notEquals(y, 0) +> one condition of filter should be pushed down after aggregating, other two conditions are ANDed +Filter column +FUNCTION and(minus(s, 4) :: 2, minus(s, 8) :: 1) -> and(notEquals(y, 0), minus(s, 8), minus(s, 4)) +Aggregating +Filter column: notEquals(y, 0) +> two conditions of filter should be pushed 
down after aggregating and ANDed, one condition is aliased +Filter column +ALIAS notEquals(s, 8) :: 1 -> and(notEquals(y, 0), notEquals(s, 8), minus(y, 4)) +Aggregating +Filter column: and(minus(y, 4), notEquals(y, 0)) diff --git a/tests/queries/0_stateless/01655_plan_optimizations.sh b/tests/queries/0_stateless/01655_plan_optimizations.sh index 4f3541f9dde..ea76d15c648 100755 --- a/tests/queries/0_stateless/01655_plan_optimizations.sh +++ b/tests/queries/0_stateless/01655_plan_optimizations.sh @@ -4,7 +4,54 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . "$CURDIR"/../shell_config.sh -echo "sipHash should be calculated after filtration" +echo "> sipHash should be calculated after filtration" $CLICKHOUSE_CLIENT -q "explain actions = 1 select sum(x), sum(y) from (select sipHash64(number) as x, bitAnd(number, 1024) as y from numbers_mt(1000000000) limit 1000000000) where y = 0" | grep -o "FUNCTION sipHash64\|Filter column: equals" -echo "sorting steps should know about limit" +echo "> sorting steps should know about limit" $CLICKHOUSE_CLIENT -q "explain actions = 1 select number from (select number from numbers(500000000) order by -number) limit 10" | grep -o "MergingSorted\|MergeSorting\|PartialSorting\|Limit 10" + +echo "-- filter push down --" +echo "> filter should be pushed down after aggregating" +$CLICKHOUSE_CLIENT -q " + explain select * from (select sum(x), y from ( + select number as x, number + 1 as y from numbers(10)) group by y + ) where y != 0 + settings enable_optimize_predicate_expression=0" | grep -o "Aggregating\|Filter" + +echo "> filter should be pushed down after aggregating, column after aggregation is const" +$CLICKHOUSE_CLIENT -q " + explain actions = 1 select *, y != 0 from (select sum(x), y from ( + select number as x, number + 1 as y from numbers(10)) group by y + ) where y != 0 + settings enable_optimize_predicate_expression=0" | grep -o "Aggregating\|Filter\|COLUMN Const(UInt8) -> notEquals(y, 0)" + +echo "> one condition of filter should be pushed down after aggregating, other condition is aliased" +$CLICKHOUSE_CLIENT -q " + explain actions = 1 select * from ( + select sum(x) as s, y from (select number as x, number + 1 as y from numbers(10)) group by y + ) where y != 0 and s != 4 + settings enable_optimize_predicate_expression=0" | + grep -o "Aggregating\|Filter column\|Filter column: notEquals(y, 0)\|ALIAS notEquals(s, 4) :: 1 -> and(notEquals(y, 0), notEquals(s, 4))" + +echo "> one condition of filter should be pushed down after aggregating, other condition is casted" +$CLICKHOUSE_CLIENT -q " + explain actions = 1 select * from ( + select sum(x) as s, y from (select number as x, number + 1 as y from numbers(10)) group by y + ) where y != 0 and s - 4 + settings enable_optimize_predicate_expression=0" | + grep -o "Aggregating\|Filter column\|Filter column: notEquals(y, 0)\|FUNCTION CAST(minus(s, 4) :: 1, UInt8 :: 3) -> and(notEquals(y, 0), minus(s, 4))" + +echo "> one condition of filter should be pushed down after aggregating, other two conditions are ANDed" +$CLICKHOUSE_CLIENT -q " + explain actions = 1 select * from ( + select sum(x) as s, y from (select number as x, number + 1 as y from numbers(10)) group by y + ) where y != 0 and s - 8 and s - 4 + settings enable_optimize_predicate_expression=0" | + grep -o "Aggregating\|Filter column\|Filter column: notEquals(y, 0)\|FUNCTION and(minus(s, 4) :: 2, minus(s, 8) :: 1) -> and(notEquals(y, 0), minus(s, 8), minus(s, 4))" + +echo "> two conditions of filter should 
be pushed down after aggregating and ANDed, one condition is aliased" +$CLICKHOUSE_CLIENT -q " + explain optimize = 1, actions = 1 select * from ( + select sum(x) as s, y from (select number as x, number + 1 as y from numbers(10)) group by y + ) where y != 0 and s != 8 and y - 4 + settings enable_optimize_predicate_expression=0" | + grep -o "Aggregating\|Filter column\|Filter column: and(minus(y, 4), notEquals(y, 0))\|ALIAS notEquals(s, 8) :: 1 -> and(notEquals(y, 0), notEquals(s, 8), minus(y, 4))" From 81d324da152b8411396444b760e6332ec8a2315a Mon Sep 17 00:00:00 2001 From: Stig Bakken Date: Fri, 12 Feb 2021 15:12:14 +0800 Subject: [PATCH 0260/2357] MaterializeMySQL: add skipping index for _version column --- .../MySQL/InterpretersMySQLDDLQuery.cpp | 16 +++- .../MySQL/tests/gtest_create_rewritten.cpp | 82 +++++++++++-------- 2 files changed, 64 insertions(+), 34 deletions(-) diff --git a/src/Interpreters/MySQL/InterpretersMySQLDDLQuery.cpp b/src/Interpreters/MySQL/InterpretersMySQLDDLQuery.cpp index 7f4da0638d4..dfc126a6c24 100644 --- a/src/Interpreters/MySQL/InterpretersMySQLDDLQuery.cpp +++ b/src/Interpreters/MySQL/InterpretersMySQLDDLQuery.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -411,13 +412,26 @@ ASTs InterpreterCreateImpl::getRewrittenQueries( return column_declaration; }; - /// Add _sign and _version column. + /// Add _sign and _version columns. String sign_column_name = getUniqueColumnName(columns_name_and_type, "_sign"); String version_column_name = getUniqueColumnName(columns_name_and_type, "_version"); columns->set(columns->columns, InterpreterCreateQuery::formatColumns(columns_name_and_type)); columns->columns->children.emplace_back(create_materialized_column_declaration(sign_column_name, "Int8", UInt64(1))); columns->columns->children.emplace_back(create_materialized_column_declaration(version_column_name, "UInt64", UInt64(1))); + /// Add minmax skipping index for _version column. + auto version_index = std::make_shared(); + version_index->name = version_column_name; + auto index_expr = std::make_shared(version_column_name); + auto index_type = makeASTFunction("minmax"); + index_type->no_empty_args = true; + version_index->set(version_index->expr, index_expr); + version_index->set(version_index->type, index_type); + version_index->granularity = 1; + ASTPtr indices = std::make_shared(); + indices->children.push_back(version_index); + columns->set(columns->indices, indices); + auto storage = std::make_shared(); /// The `partition by` expression must use primary keys, otherwise the primary keys will not be merge. 
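With the `INDEX _version _version TYPE minmax GRANULARITY 1` added above, a query filtering on `_version` can skip whole granules whose [min, max] range cannot match. A small model of that pruning (it illustrates the minmax idea only, not MergeTree internals):

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

struct GranuleMinMax
{
    uint64_t min_version;
    uint64_t max_version;
};

// Return indices of granules that may contain rows with _version >= wanted.
// Granules whose max is below the threshold are skipped without being read.
std::vector<size_t> granulesToRead(const std::vector<GranuleMinMax> & index, uint64_t wanted)
{
    std::vector<size_t> result;
    for (size_t i = 0; i < index.size(); ++i)
        if (index[i].max_version >= wanted)
            result.push_back(i);
    return result;
}

int main()
{
    const std::vector<GranuleMinMax> index = {{1, 10}, {11, 20}, {21, 30}};   // sample per-granule stats
    const auto chosen = granulesToRead(index, /* wanted = */ 15);             // only the last two survive
    std::printf("granules to read: %zu of %zu\n", chosen.size(), index.size());
}
```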
diff --git a/src/Interpreters/MySQL/tests/gtest_create_rewritten.cpp b/src/Interpreters/MySQL/tests/gtest_create_rewritten.cpp index 0d8e57aafc5..5a82a570db0 100644 --- a/src/Interpreters/MySQL/tests/gtest_create_rewritten.cpp +++ b/src/Interpreters/MySQL/tests/gtest_create_rewritten.cpp @@ -28,6 +28,10 @@ static inline ASTPtr tryRewrittenCreateQuery(const String & query, const Context context, "test_database", "test_database")[0]; } +static const char MATERIALIZEMYSQL_TABLE_COLUMNS[] = ", `_sign` Int8() MATERIALIZED 1" + ", `_version` UInt64() MATERIALIZED 1" + ", INDEX _version _version TYPE minmax GRANULARITY 1"; + TEST(MySQLCreateRewritten, ColumnsDataType) { tryRegisterFunctions(); @@ -45,46 +49,46 @@ TEST(MySQLCreateRewritten, ColumnsDataType) { EXPECT_EQ(queryToString(tryRewrittenCreateQuery( "CREATE TABLE `test_database`.`test_table_1`(`key` INT NOT NULL PRIMARY KEY, test " + test_type + ")", context_holder.context)), - "CREATE TABLE test_database.test_table_1 (`key` Int32, `test` Nullable(" + mapped_type + ")" - ", `_sign` Int8() MATERIALIZED 1, `_version` UInt64() MATERIALIZED 1) ENGINE = " + "CREATE TABLE test_database.test_table_1 (`key` Int32, `test` Nullable(" + mapped_type + ")" + + MATERIALIZEMYSQL_TABLE_COLUMNS + ") ENGINE = " "ReplacingMergeTree(_version) PARTITION BY intDiv(key, 4294967) ORDER BY tuple(key)"); EXPECT_EQ(queryToString(tryRewrittenCreateQuery( "CREATE TABLE `test_database`.`test_table_1`(`key` INT NOT NULL PRIMARY KEY, test " + test_type + " NOT NULL)", context_holder.context)), "CREATE TABLE test_database.test_table_1 (`key` Int32, `test` " + mapped_type + - ", `_sign` Int8() MATERIALIZED 1, `_version` UInt64() MATERIALIZED 1) ENGINE = " + MATERIALIZEMYSQL_TABLE_COLUMNS + ") ENGINE = " "ReplacingMergeTree(_version) PARTITION BY intDiv(key, 4294967) ORDER BY tuple(key)"); EXPECT_EQ(queryToString(tryRewrittenCreateQuery( "CREATE TABLE `test_database`.`test_table_1`(`key` INT NOT NULL PRIMARY KEY, test " + test_type + " COMMENT 'test_comment' NOT NULL)", context_holder.context)), "CREATE TABLE test_database.test_table_1 (`key` Int32, `test` " + mapped_type + - ", `_sign` Int8() MATERIALIZED 1, `_version` UInt64() MATERIALIZED 1) ENGINE = " + MATERIALIZEMYSQL_TABLE_COLUMNS + ") ENGINE = " "ReplacingMergeTree(_version) PARTITION BY intDiv(key, 4294967) ORDER BY tuple(key)"); if (Poco::toUpper(test_type).find("INT") != std::string::npos) { EXPECT_EQ(queryToString(tryRewrittenCreateQuery( "CREATE TABLE `test_database`.`test_table_1`(`key` INT NOT NULL PRIMARY KEY, test " + test_type + " UNSIGNED)", context_holder.context)), - "CREATE TABLE test_database.test_table_1 (`key` Int32, `test` Nullable(U" + mapped_type + ")" - ", `_sign` Int8() MATERIALIZED 1, `_version` UInt64() MATERIALIZED 1) ENGINE = " + "CREATE TABLE test_database.test_table_1 (`key` Int32, `test` Nullable(U" + mapped_type + ")" + + MATERIALIZEMYSQL_TABLE_COLUMNS + ") ENGINE = " "ReplacingMergeTree(_version) PARTITION BY intDiv(key, 4294967) ORDER BY tuple(key)"); EXPECT_EQ(queryToString(tryRewrittenCreateQuery( "CREATE TABLE `test_database`.`test_table_1`(`key` INT NOT NULL PRIMARY KEY, test " + test_type + " COMMENT 'test_comment' UNSIGNED)", context_holder.context)), - "CREATE TABLE test_database.test_table_1 (`key` Int32, `test` Nullable(U" + mapped_type + ")" - ", `_sign` Int8() MATERIALIZED 1, `_version` UInt64() MATERIALIZED 1) ENGINE = " + "CREATE TABLE test_database.test_table_1 (`key` Int32, `test` Nullable(U" + mapped_type + ")" + + MATERIALIZEMYSQL_TABLE_COLUMNS + ") ENGINE = " 
"ReplacingMergeTree(_version) PARTITION BY intDiv(key, 4294967) ORDER BY tuple(key)"); EXPECT_EQ(queryToString(tryRewrittenCreateQuery( "CREATE TABLE `test_database`.`test_table_1`(`key` INT NOT NULL PRIMARY KEY, test " + test_type + " NOT NULL UNSIGNED)", context_holder.context)), "CREATE TABLE test_database.test_table_1 (`key` Int32, `test` U" + mapped_type + - ", `_sign` Int8() MATERIALIZED 1, `_version` UInt64() MATERIALIZED 1) ENGINE = " + MATERIALIZEMYSQL_TABLE_COLUMNS + ") ENGINE = " "ReplacingMergeTree(_version) PARTITION BY intDiv(key, 4294967) ORDER BY tuple(key)"); EXPECT_EQ(queryToString(tryRewrittenCreateQuery( "CREATE TABLE `test_database`.`test_table_1`(`key` INT NOT NULL PRIMARY KEY, test " + test_type + " COMMENT 'test_comment' UNSIGNED NOT NULL)", context_holder.context)), "CREATE TABLE test_database.test_table_1 (`key` Int32, `test` U" + mapped_type + - ", `_sign` Int8() MATERIALIZED 1, `_version` UInt64() MATERIALIZED 1) ENGINE = " + MATERIALIZEMYSQL_TABLE_COLUMNS + ") ENGINE = " "ReplacingMergeTree(_version) PARTITION BY intDiv(key, 4294967) ORDER BY tuple(key)"); } } @@ -109,13 +113,15 @@ TEST(MySQLCreateRewritten, PartitionPolicy) { EXPECT_EQ(queryToString(tryRewrittenCreateQuery( "CREATE TABLE `test_database`.`test_table_1` (`key` " + test_type + " PRIMARY KEY)", context_holder.context)), - "CREATE TABLE test_database.test_table_1 (`key` " + mapped_type + ", `_sign` Int8() MATERIALIZED 1, " - "`_version` UInt64() MATERIALIZED 1) ENGINE = ReplacingMergeTree(_version)" + partition_policy + " ORDER BY tuple(key)"); + "CREATE TABLE test_database.test_table_1 (`key` " + mapped_type + + MATERIALIZEMYSQL_TABLE_COLUMNS + + ") ENGINE = ReplacingMergeTree(_version)" + partition_policy + " ORDER BY tuple(key)"); EXPECT_EQ(queryToString(tryRewrittenCreateQuery( "CREATE TABLE `test_database`.`test_table_1` (`key` " + test_type + " NOT NULL PRIMARY KEY)", context_holder.context)), - "CREATE TABLE test_database.test_table_1 (`key` " + mapped_type + ", `_sign` Int8() MATERIALIZED 1, " - "`_version` UInt64() MATERIALIZED 1) ENGINE = ReplacingMergeTree(_version)" + partition_policy + " ORDER BY tuple(key)"); + "CREATE TABLE test_database.test_table_1 (`key` " + mapped_type + + MATERIALIZEMYSQL_TABLE_COLUMNS + + ") ENGINE = ReplacingMergeTree(_version)" + partition_policy + " ORDER BY tuple(key)"); } } @@ -138,23 +144,27 @@ TEST(MySQLCreateRewritten, OrderbyPolicy) { EXPECT_EQ(queryToString(tryRewrittenCreateQuery( "CREATE TABLE `test_database`.`test_table_1` (`key` " + test_type + " PRIMARY KEY, `key2` " + test_type + " UNIQUE KEY)", context_holder.context)), - "CREATE TABLE test_database.test_table_1 (`key` " + mapped_type + ", `key2` Nullable(" + mapped_type + "), `_sign` Int8() MATERIALIZED 1, " - "`_version` UInt64() MATERIALIZED 1) ENGINE = ReplacingMergeTree(_version)" + partition_policy + " ORDER BY (key, assumeNotNull(key2))"); + "CREATE TABLE test_database.test_table_1 (`key` " + mapped_type + ", `key2` Nullable(" + mapped_type + ")" + + MATERIALIZEMYSQL_TABLE_COLUMNS + + ") ENGINE = ReplacingMergeTree(_version)" + partition_policy + " ORDER BY (key, assumeNotNull(key2))"); EXPECT_EQ(queryToString(tryRewrittenCreateQuery( "CREATE TABLE `test_database`.`test_table_1` (`key` " + test_type + " NOT NULL PRIMARY KEY, `key2` " + test_type + " NOT NULL UNIQUE KEY)", context_holder.context)), - "CREATE TABLE test_database.test_table_1 (`key` " + mapped_type + ", `key2` " + mapped_type + ", `_sign` Int8() MATERIALIZED 1, " - "`_version` UInt64() MATERIALIZED 1) ENGINE = 
ReplacingMergeTree(_version)" + partition_policy + " ORDER BY (key, key2)"); + "CREATE TABLE test_database.test_table_1 (`key` " + mapped_type + ", `key2` " + mapped_type + + MATERIALIZEMYSQL_TABLE_COLUMNS + + ") ENGINE = ReplacingMergeTree(_version)" + partition_policy + " ORDER BY (key, key2)"); EXPECT_EQ(queryToString(tryRewrittenCreateQuery( "CREATE TABLE `test_database`.`test_table_1` (`key` " + test_type + " KEY UNIQUE KEY)", context_holder.context)), - "CREATE TABLE test_database.test_table_1 (`key` " + mapped_type + ", `_sign` Int8() MATERIALIZED 1, " - "`_version` UInt64() MATERIALIZED 1) ENGINE = ReplacingMergeTree(_version)" + partition_policy + " ORDER BY tuple(key)"); + "CREATE TABLE test_database.test_table_1 (`key` " + mapped_type + + MATERIALIZEMYSQL_TABLE_COLUMNS + + ") ENGINE = ReplacingMergeTree(_version)" + partition_policy + " ORDER BY tuple(key)"); EXPECT_EQ(queryToString(tryRewrittenCreateQuery( "CREATE TABLE `test_database`.`test_table_1` (`key` " + test_type + ", `key2` " + test_type + " UNIQUE KEY, PRIMARY KEY(`key`, `key2`))", context_holder.context)), - "CREATE TABLE test_database.test_table_1 (`key` " + mapped_type + ", `key2` " + mapped_type + ", `_sign` Int8() MATERIALIZED 1, " - "`_version` UInt64() MATERIALIZED 1) ENGINE = ReplacingMergeTree(_version)" + partition_policy + " ORDER BY (key, key2)"); + "CREATE TABLE test_database.test_table_1 (`key` " + mapped_type + ", `key2` " + mapped_type + + MATERIALIZEMYSQL_TABLE_COLUMNS + + ") ENGINE = ReplacingMergeTree(_version)" + partition_policy + " ORDER BY (key, key2)"); } } @@ -165,23 +175,27 @@ TEST(MySQLCreateRewritten, RewrittenQueryWithPrimaryKey) EXPECT_EQ(queryToString(tryRewrittenCreateQuery( "CREATE TABLE `test_database`.`test_table_1` (`key` int NOT NULL PRIMARY KEY) ENGINE=InnoDB DEFAULT CHARSET=utf8", context_holder.context)), - "CREATE TABLE test_database.test_table_1 (`key` Int32, `_sign` Int8() MATERIALIZED 1, `_version` UInt64() MATERIALIZED 1) ENGINE = ReplacingMergeTree(_version) " - "PARTITION BY intDiv(key, 4294967) ORDER BY tuple(key)"); + "CREATE TABLE test_database.test_table_1 (`key` Int32" + + std::string(MATERIALIZEMYSQL_TABLE_COLUMNS) + + ") ENGINE = ReplacingMergeTree(_version) PARTITION BY intDiv(key, 4294967) ORDER BY tuple(key)"); EXPECT_EQ(queryToString(tryRewrittenCreateQuery( "CREATE TABLE `test_database`.`test_table_1` (`key` int NOT NULL, PRIMARY KEY (`key`)) ENGINE=InnoDB DEFAULT CHARSET=utf8", context_holder.context)), - "CREATE TABLE test_database.test_table_1 (`key` Int32, `_sign` Int8() MATERIALIZED 1, `_version` UInt64() MATERIALIZED 1) ENGINE = ReplacingMergeTree(_version) " - "PARTITION BY intDiv(key, 4294967) ORDER BY tuple(key)"); + "CREATE TABLE test_database.test_table_1 (`key` Int32" + + std::string(MATERIALIZEMYSQL_TABLE_COLUMNS) + + ") ENGINE = ReplacingMergeTree(_version) PARTITION BY intDiv(key, 4294967) ORDER BY tuple(key)"); EXPECT_EQ(queryToString(tryRewrittenCreateQuery( "CREATE TABLE `test_database`.`test_table_1` (`key_1` int NOT NULL, key_2 INT NOT NULL, PRIMARY KEY (`key_1`, `key_2`)) ENGINE=InnoDB DEFAULT CHARSET=utf8", context_holder.context)), - "CREATE TABLE test_database.test_table_1 (`key_1` Int32, `key_2` Int32, `_sign` Int8() MATERIALIZED 1, `_version` UInt64() MATERIALIZED 1) ENGINE = " - "ReplacingMergeTree(_version) PARTITION BY intDiv(key_1, 4294967) ORDER BY (key_1, key_2)"); + "CREATE TABLE test_database.test_table_1 (`key_1` Int32, `key_2` Int32" + + std::string(MATERIALIZEMYSQL_TABLE_COLUMNS) + + ") ENGINE = ReplacingMergeTree(_version) 
PARTITION BY intDiv(key_1, 4294967) ORDER BY (key_1, key_2)"); EXPECT_EQ(queryToString(tryRewrittenCreateQuery( "CREATE TABLE `test_database`.`test_table_1` (`key_1` BIGINT NOT NULL, key_2 INT NOT NULL, PRIMARY KEY (`key_1`, `key_2`)) ENGINE=InnoDB DEFAULT CHARSET=utf8", context_holder.context)), - "CREATE TABLE test_database.test_table_1 (`key_1` Int64, `key_2` Int32, `_sign` Int8() MATERIALIZED 1, `_version` UInt64() MATERIALIZED 1) ENGINE = " - "ReplacingMergeTree(_version) PARTITION BY intDiv(key_2, 4294967) ORDER BY (key_1, key_2)"); + "CREATE TABLE test_database.test_table_1 (`key_1` Int64, `key_2` Int32" + + std::string(MATERIALIZEMYSQL_TABLE_COLUMNS) + + ") ENGINE = ReplacingMergeTree(_version) PARTITION BY intDiv(key_2, 4294967) ORDER BY (key_1, key_2)"); } TEST(MySQLCreateRewritten, RewrittenQueryWithPrefixKey) @@ -191,7 +205,8 @@ TEST(MySQLCreateRewritten, RewrittenQueryWithPrefixKey) EXPECT_EQ(queryToString(tryRewrittenCreateQuery( "CREATE TABLE `test_database`.`test_table_1` (`key` int NOT NULL PRIMARY KEY, `prefix_key` varchar(200) NOT NULL, KEY prefix_key_index(prefix_key(2))) ENGINE=InnoDB DEFAULT CHARSET=utf8", context_holder.context)), - "CREATE TABLE test_database.test_table_1 (`key` Int32, `prefix_key` String, `_sign` Int8() MATERIALIZED 1, `_version` UInt64() MATERIALIZED 1) ENGINE = " + "CREATE TABLE test_database.test_table_1 (`key` Int32, `prefix_key` String" + + std::string(MATERIALIZEMYSQL_TABLE_COLUMNS) + ") ENGINE = " "ReplacingMergeTree(_version) PARTITION BY intDiv(key, 4294967) ORDER BY (key, prefix_key)"); } @@ -204,6 +219,7 @@ TEST(MySQLCreateRewritten, UniqueKeysConvert) "CREATE TABLE `test_database`.`test_table_1` (code varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL,name varchar(255) CHARACTER SET utf8 COLLATE utf8_bin NOT NULL," " id bigint NOT NULL AUTO_INCREMENT, tenant_id bigint NOT NULL, PRIMARY KEY (id), UNIQUE KEY code_id (code, tenant_id), UNIQUE KEY name_id (name, tenant_id))" " ENGINE=InnoDB AUTO_INCREMENT=100 DEFAULT CHARSET=utf8 COLLATE=utf8_bin;", context_holder.context)), - "CREATE TABLE test_database.test_table_1 (`code` String, `name` String, `id` Int64, `tenant_id` Int64, `_sign` Int8() MATERIALIZED 1, `_version` UInt64() MATERIALIZED 1)" - " ENGINE = ReplacingMergeTree(_version) PARTITION BY intDiv(id, 18446744073709551) ORDER BY (code, name, tenant_id, id)"); + "CREATE TABLE test_database.test_table_1 (`code` String, `name` String, `id` Int64, `tenant_id` Int64" + + std::string(MATERIALIZEMYSQL_TABLE_COLUMNS) + + ") ENGINE = ReplacingMergeTree(_version) PARTITION BY intDiv(id, 18446744073709551) ORDER BY (code, name, tenant_id, id)"); } From f0163c2acfe41c78124e49582301e896ee3f8240 Mon Sep 17 00:00:00 2001 From: Pavel Kruglov Date: Fri, 12 Feb 2021 17:02:04 +0300 Subject: [PATCH 0261/2357] Don't create empty parts on INSERT --- src/Storages/MergeTree/MergeTreeBlockOutputStream.cpp | 6 ++++++ src/Storages/MergeTree/MergeTreeDataWriter.cpp | 5 +++++ .../MergeTree/ReplicatedMergeTreeBlockOutputStream.cpp | 5 +++++ .../queries/0_stateless/01560_optimize_on_insert.reference | 1 + tests/queries/0_stateless/01560_optimize_on_insert.sql | 7 +++++++ 5 files changed, 24 insertions(+) diff --git a/src/Storages/MergeTree/MergeTreeBlockOutputStream.cpp b/src/Storages/MergeTree/MergeTreeBlockOutputStream.cpp index 904081cc1df..bb5644567ae 100644 --- a/src/Storages/MergeTree/MergeTreeBlockOutputStream.cpp +++ b/src/Storages/MergeTree/MergeTreeBlockOutputStream.cpp @@ -29,6 +29,12 @@ void MergeTreeBlockOutputStream::write(const Block & 
block) Stopwatch watch; MergeTreeData::MutableDataPartPtr part = storage.writer.writeTempPart(current_block, metadata_snapshot, optimize_on_insert); + + /// If optimize_on_insert setting is true, current_block could become empty after merge + /// and we didn't create part. + if (!part) + continue; + storage.renameTempPartAndAdd(part, &storage.increment); PartLog::addNewPart(storage.global_context, part, watch.elapsed()); diff --git a/src/Storages/MergeTree/MergeTreeDataWriter.cpp b/src/Storages/MergeTree/MergeTreeDataWriter.cpp index 5a9bdd90bc8..5929293d714 100644 --- a/src/Storages/MergeTree/MergeTreeDataWriter.cpp +++ b/src/Storages/MergeTree/MergeTreeDataWriter.cpp @@ -327,6 +327,11 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataWriter::writeTempPart(BlockWithPa /// Size of part would not be greater than block.bytes() + epsilon size_t expected_size = block.bytes(); + /// If optimize_on_insert is true, block may become empty after merge. + /// There is no need to create empty part. + if (expected_size == 0) + return nullptr; + DB::IMergeTreeDataPart::TTLInfos move_ttl_infos; const auto & move_ttl_entries = metadata_snapshot->getMoveTTLs(); for (const auto & ttl_entry : move_ttl_entries) diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeBlockOutputStream.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeBlockOutputStream.cpp index 7046a510f75..6f90d9f00a9 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeBlockOutputStream.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeBlockOutputStream.cpp @@ -144,6 +144,11 @@ void ReplicatedMergeTreeBlockOutputStream::write(const Block & block) MergeTreeData::MutableDataPartPtr part = storage.writer.writeTempPart(current_block, metadata_snapshot, optimize_on_insert); + /// If optimize_on_insert setting is true, current_block could become empty after merge + /// and we didn't create part. 
+ if (!part) + continue; + String block_id; if (deduplicate) diff --git a/tests/queries/0_stateless/01560_optimize_on_insert.reference b/tests/queries/0_stateless/01560_optimize_on_insert.reference index 7ace2043be0..477f48be7a9 100644 --- a/tests/queries/0_stateless/01560_optimize_on_insert.reference +++ b/tests/queries/0_stateless/01560_optimize_on_insert.reference @@ -11,3 +11,4 @@ Summing Merge Tree Aggregating Merge Tree 1 5 2020-01-01 00:00:00 2 5 2020-01-02 00:00:00 +Check creating empty parts diff --git a/tests/queries/0_stateless/01560_optimize_on_insert.sql b/tests/queries/0_stateless/01560_optimize_on_insert.sql index 9f6dac686bb..f64f4c75cfe 100644 --- a/tests/queries/0_stateless/01560_optimize_on_insert.sql +++ b/tests/queries/0_stateless/01560_optimize_on_insert.sql @@ -33,3 +33,10 @@ INSERT INTO aggregating_merge_tree VALUES (1, 1, '2020-01-01'), (2, 1, '2020-01- SELECT * FROM aggregating_merge_tree ORDER BY key; DROP TABLE aggregating_merge_tree; +SELECT 'Check creating empty parts'; +DROP TABLE IF EXISTS empty; +CREATE TABLE empty (key UInt32, val UInt32, date Datetime) ENGINE=SummingMergeTree(val) PARTITION BY date ORDER BY key; +INSERT INTO empty VALUES (1, 1, '2020-01-01'), (1, 1, '2020-01-01'), (1, -2, '2020-01-01'); +SELECT * FROM empty ORDER BY key; +SELECT table, partition, active FROM system.parts where table = 'empty' and active = 1; +DROP TABLE empty; From 09c4bd91ee14f26723ccbecc9d4010c7f6ae5383 Mon Sep 17 00:00:00 2001 From: vdimir Date: Fri, 12 Feb 2021 17:50:09 +0300 Subject: [PATCH 0262/2357] Try to fix cross join rewrite with conjunction of disjunction --- src/Interpreters/CrossToInnerJoinVisitor.cpp | 38 ++++++++++++-------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/src/Interpreters/CrossToInnerJoinVisitor.cpp b/src/Interpreters/CrossToInnerJoinVisitor.cpp index 604bfc7774f..2c80451aedb 100644 --- a/src/Interpreters/CrossToInnerJoinVisitor.cpp +++ b/src/Interpreters/CrossToInnerJoinVisitor.cpp @@ -1,20 +1,21 @@ #include #include #include +#include #include #include #include #include +#include +#include +#include +#include #include #include #include -#include -#include -#include -#include #include +#include #include -#include namespace DB { @@ -102,12 +103,12 @@ public: : joined_tables(tables_) , tables(tables_with_columns) , aliases(aliases_) - , ands_only(true) + , is_complex(false) {} void visit(const ASTFunction & node, const ASTPtr & ast) { - if (!ands_only) + if (is_complex) return; if (node.name == NameAnd::name) @@ -118,9 +119,14 @@ public: for (auto & child : node.arguments->children) { if (const auto * func = child->as()) + { visit(*func, child); + } else - ands_only = false; + { + bool is_literal_or_ident = !child->as() && !child->as(); + is_complex = is_complex || !is_literal_or_ident; + } } } else if (node.name == NameEquals::name) @@ -135,18 +141,22 @@ public: else if (functionIsLikeOperator(node.name) || /// LIKE, NOT LIKE, ILIKE, NOT ILIKE functionIsInOperator(node.name)) /// IN, NOT IN { - /// leave as is. It's not possible to make push down here cause of unknown aliases and not implemented JOIN predicates. - /// select a as b form t1, t2 where t1.x = t2.x and b in(42) - /// select a as b form t1 inner join t2 on t1.x = t2.x and b in(42) + /// Leave as is. It's not possible to make push down here cause of unknown aliases and not implemented JOIN predicates. 
+ /// select a as b from t1, t2 where t1.x = t2.x and b in(42) + /// select a as b from t1 inner join t2 on t1.x = t2.x and b in(42) + } + else if (node.name == NameOr::name) + { + } else { - ands_only = false; + is_complex = true; asts_to_join_on.clear(); } } - bool complex() const { return !ands_only; } + bool complex() const { return is_complex; } bool matchAny(size_t t) const { return asts_to_join_on.count(t); } ASTPtr makeOnExpression(size_t table_pos) @@ -172,7 +182,7 @@ private: const std::vector & tables; std::map> asts_to_join_on; const Aliases & aliases; - bool ands_only; + bool is_complex; size_t canMoveEqualsToJoinOn(const ASTFunction & node) { From bbed905461d9e08adaa1303f71c228d2f62fff8c Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Fri, 12 Feb 2021 18:20:54 +0300 Subject: [PATCH 0263/2357] Fix ActionsDAG::removeUnusedResult --- src/Interpreters/ActionsDAG.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/Interpreters/ActionsDAG.cpp b/src/Interpreters/ActionsDAG.cpp index 6a7dbc47230..255c774bbf9 100644 --- a/src/Interpreters/ActionsDAG.cpp +++ b/src/Interpreters/ActionsDAG.cpp @@ -490,6 +490,11 @@ bool ActionsDAG::removeUnusedResult(const std::string & column_name) if (col == child) return false; + /// Do not remove input if it was mentioned in index several times. + for (const auto * node : index) + if (col == node) + return false; + /// Remove from nodes and inputs. for (auto jt = nodes.begin(); jt != nodes.end(); ++jt) { From 90c7cf5a5293a32654e97cc8b4f8cb1d2090d3be Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Fri, 12 Feb 2021 18:24:31 +0300 Subject: [PATCH 0264/2357] Push down for ArrayJoin --- .../Optimizations/filterPushDown.cpp | 116 ++++++++++++------ 1 file changed, 80 insertions(+), 36 deletions(-) diff --git a/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp b/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp index ec005e59729..98e923249f3 100644 --- a/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp +++ b/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp @@ -3,7 +3,9 @@ #include #include #include +#include #include +#include #include #include @@ -15,6 +17,68 @@ namespace DB::ErrorCodes namespace DB::QueryPlanOptimizations { +static size_t tryAddNewFilterStep( + QueryPlan::Node * parent_node, + QueryPlan::Nodes & nodes, + const Names & allowed_inputs) +{ + QueryPlan::Node * child_node = parent_node->children.front(); + + auto & parent = parent_node->step; + auto & child = child_node->step; + + auto * filter = static_cast(parent.get()); + const auto & expression = filter->getExpression(); + const auto & filter_column_name = filter->getFilterColumnName(); + bool removes_filter = filter->removesFilterColumn(); + + // std::cerr << "Filter: \n" << expression->dumpDAG() << std::endl; + + auto split_filter = expression->splitActionsForFilter(filter_column_name, removes_filter, allowed_inputs); + if (!split_filter) + return 0; + + // std::cerr << "===============\n" << expression->dumpDAG() << std::endl; + // std::cerr << "---------------\n" << split_filter->dumpDAG() << std::endl; + + const auto & index = expression->getIndex(); + auto it = index.begin(); + for (; it != index.end(); ++it) + if ((*it)->result_name == filter_column_name) + break; + + if (it == expression->getIndex().end()) + { + if (!removes_filter) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Filter column {} was removed from ActionsDAG but it is needed in result. 
DAG:\n{}", + filter_column_name, expression->dumpDAG()); + + std::cerr << "replacing to expr because filter " << filter_column_name << " was removed\n"; + parent = std::make_unique(child->getOutputStream(), expression); + } + else if ((*it)->column && isColumnConst(*(*it)->column)) + { + std::cerr << "replacing to expr because filter is const\n"; + parent = std::make_unique(child->getOutputStream(), expression); + } + + /// Add new Filter step before Aggregating. + /// Expression/Filter -> Aggregating -> Something + auto & node = nodes.emplace_back(); + node.children.swap(child_node->children); + child_node->children.emplace_back(&node); + /// Expression/Filter -> Aggregating -> Filter -> Something + + /// New filter column is added to the end. + auto split_filter_column_name = (*split_filter->getIndex().rbegin())->result_name; + node.step = std::make_unique( + node.children.at(0)->step->getOutputStream(), + std::move(split_filter), std::move(split_filter_column_name), true); + + return 3; +} + size_t tryPushDownFilter(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes) { if (parent_node->children.size() != 1) @@ -29,10 +93,6 @@ size_t tryPushDownFilter(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes if (!filter) return 0; - const auto & expression = filter->getExpression(); - const auto & filter_column_name = filter->getFilterColumnName(); - bool removes_filter = filter->removesFilterColumn(); - if (auto * aggregating = typeid_cast(child.get())) { const auto & params = aggregating->getParams(); @@ -42,42 +102,26 @@ size_t tryPushDownFilter(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes for (auto pos : params.keys) keys.push_back(params.src_header.getByPosition(pos).name); - // std::cerr << "Filter: \n" << expression->dumpDAG() << std::endl; - if (auto split_filter = expression->splitActionsForFilter(filter_column_name, removes_filter, keys)) - { - // std::cerr << "===============\n" << expression->dumpDAG() << std::endl; - // std::cerr << "---------------\n" << split_filter->dumpDAG() << std::endl; + if (auto updated_steps = tryAddNewFilterStep(parent_node, nodes, keys)) + return updated_steps; + } - auto it = expression->getIndex().find(filter_column_name); - if (it == expression->getIndex().end()) - { - if (!removes_filter) - throw Exception(ErrorCodes::LOGICAL_ERROR, - "Filter column {} was removed from ActionsDAG but it is needed in result. DAG:\n{}", - filter_column_name, expression->dumpDAG()); + if (auto * array_join = typeid_cast(child.get())) + { + const auto & array_join_actions = array_join->arrayJoin(); + const auto & keys = array_join_actions->columns; + const auto & array_join_header = array_join->getInputStreams().front().header; - parent = std::make_unique(child->getOutputStream(), expression); - } - else if ((*it)->column && isColumnConst(*(*it)->column)) - { - parent = std::make_unique(child->getOutputStream(), expression); - } + Names allowed_inputs; + for (const auto & column : array_join_header) + if (keys.count(column.name) == 0) + allowed_inputs.push_back(column.name); - /// Add new Filter step before Aggregating. - /// Expression/Filter -> Aggregating -> Something - auto & node = nodes.emplace_back(); - node.children.swap(child_node->children); - child_node->children.emplace_back(&node); - /// Expression/Filter -> Aggregating -> Filter -> Something + for (const auto & name : allowed_inputs) + std::cerr << name << std::endl; - /// New filter column is added to the end. 
- auto split_filter_column_name = (*split_filter->getIndex().rbegin())->result_name; - node.step = std::make_unique( - node.children.at(0)->step->getOutputStream(), - std::move(split_filter), std::move(split_filter_column_name), true); - - return 3; - } + if (auto updated_steps = tryAddNewFilterStep(parent_node, nodes, allowed_inputs)) + return updated_steps; } return 0; From 5fd80555aa6241e01737c9a9083f663a8d7ed0eb Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Fri, 12 Feb 2021 19:06:18 +0300 Subject: [PATCH 0265/2357] Update test. --- .../queries/0_stateless/01655_plan_optimizations.reference | 4 ++++ tests/queries/0_stateless/01655_plan_optimizations.sh | 7 +++++++ 2 files changed, 11 insertions(+) diff --git a/tests/queries/0_stateless/01655_plan_optimizations.reference b/tests/queries/0_stateless/01655_plan_optimizations.reference index 510224146ed..1e638829c74 100644 --- a/tests/queries/0_stateless/01655_plan_optimizations.reference +++ b/tests/queries/0_stateless/01655_plan_optimizations.reference @@ -38,3 +38,7 @@ Filter column ALIAS notEquals(s, 8) :: 1 -> and(notEquals(y, 0), notEquals(s, 8), minus(y, 4)) Aggregating Filter column: and(minus(y, 4), notEquals(y, 0)) +> filter is split, one part is filtered before ARRAY JOIN +Filter column: and(notEquals(y, 2), notEquals(x, 0)) +ARRAY JOIN x +Filter column: notEquals(y, 2) diff --git a/tests/queries/0_stateless/01655_plan_optimizations.sh b/tests/queries/0_stateless/01655_plan_optimizations.sh index ea76d15c648..ccd331df45e 100755 --- a/tests/queries/0_stateless/01655_plan_optimizations.sh +++ b/tests/queries/0_stateless/01655_plan_optimizations.sh @@ -55,3 +55,10 @@ $CLICKHOUSE_CLIENT -q " ) where y != 0 and s != 8 and y - 4 settings enable_optimize_predicate_expression=0" | grep -o "Aggregating\|Filter column\|Filter column: and(minus(y, 4), notEquals(y, 0))\|ALIAS notEquals(s, 8) :: 1 -> and(notEquals(y, 0), notEquals(s, 8), minus(y, 4))" + +echo "> filter is split, one part is filtered before ARRAY JOIN" +$CLICKHOUSE_CLIENT -q " + explain actions = 1 select x, y from ( + select range(number) as x, number + 1 as y from numbers(3) + ) array join x where y != 2 and x != 0" | + grep -o "Filter column: and(notEquals(y, 2), notEquals(x, 0))\|ARRAY JOIN x\|Filter column: notEquals(y, 2)" \ No newline at end of file From f64f9b672b472eaf0a0f76447a21cf30c361f816 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Fri, 12 Feb 2021 19:22:01 +0300 Subject: [PATCH 0266/2357] fix --- src/Databases/DatabaseOrdinary.cpp | 5 --- src/Databases/DatabaseReplicated.cpp | 3 +- src/Databases/DatabaseReplicatedWorker.cpp | 10 ++++- src/Databases/DatabaseReplicatedWorker.h | 2 + src/Interpreters/DDLWorker.cpp | 37 +++++++++++++++++-- src/Interpreters/DDLWorker.h | 2 +- .../test_distributed_ddl/cluster.py | 4 +- .../integration/test_distributed_ddl/test.py | 2 +- .../test_replicated_alter.py | 2 +- 9 files changed, 50 insertions(+), 17 deletions(-) diff --git a/src/Databases/DatabaseOrdinary.cpp b/src/Databases/DatabaseOrdinary.cpp index d859578eb46..a94668dacf7 100644 --- a/src/Databases/DatabaseOrdinary.cpp +++ b/src/Databases/DatabaseOrdinary.cpp @@ -33,11 +33,6 @@ static constexpr size_t PRINT_MESSAGE_EACH_N_OBJECTS = 256; static constexpr size_t PRINT_MESSAGE_EACH_N_SECONDS = 5; static constexpr size_t METADATA_FILE_BUFFER_SIZE = 32768; -namespace ErrorCodes -{ - extern const int NOT_IMPLEMENTED; -} - namespace { void tryAttachTable( diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 
1756d33958d..d365ea24bbf 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -33,6 +33,7 @@ namespace ErrorCodes extern const int REPLICA_IS_ALREADY_EXIST; extern const int DATABASE_REPLICATION_FAILED; extern const int UNKNOWN_DATABASE; + extern const int UNKNOWN_TABLE; extern const int NOT_IMPLEMENTED; extern const int INCORRECT_QUERY; extern const int ALL_CONNECTION_TRIES_FAILED; @@ -332,7 +333,7 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep ASTPtr zk_create = parseQuery(parser, in_zk->second, size, depth); if (local_create->as()->uuid == zk_create->as()->uuid) { - /// For ReplicatedMergeTree tables we can compare only UUIDs to ensure that it's tha same table. + /// For ReplicatedMergeTree tables we can compare only UUIDs to ensure that it's the same table. /// Metadata can be different, it's handled on table replication level. /// TODO maybe we should also compare MergeTree SETTINGS? should_detach = false; diff --git a/src/Databases/DatabaseReplicatedWorker.cpp b/src/Databases/DatabaseReplicatedWorker.cpp index 5a350783dcb..521ba5b7cb2 100644 --- a/src/Databases/DatabaseReplicatedWorker.cpp +++ b/src/Databases/DatabaseReplicatedWorker.cpp @@ -41,6 +41,12 @@ void DatabaseReplicatedDDLWorker::initializeMainThread() } } +void DatabaseReplicatedDDLWorker::shutdown() +{ + DDLWorker::shutdown(); + wait_current_task_change.notify_all(); +} + void DatabaseReplicatedDDLWorker::initializeReplication() { /// Check if we need to recover replica. @@ -120,8 +126,8 @@ String DatabaseReplicatedDDLWorker::tryEnqueueAndExecuteEntry(DDLLogEntry & entr "most likely because replica is busy with previous queue entries"); } - if (zookeeper->expired()) - throw Exception(ErrorCodes::DATABASE_REPLICATION_FAILED, "ZooKeeper session expired, try again"); + if (zookeeper->expired() || stop_flag) + throw Exception(ErrorCodes::DATABASE_REPLICATION_FAILED, "ZooKeeper session expired or replication stopped, try again"); processTask(*task); diff --git a/src/Databases/DatabaseReplicatedWorker.h b/src/Databases/DatabaseReplicatedWorker.h index 33806df88ba..1eafe2489e7 100644 --- a/src/Databases/DatabaseReplicatedWorker.h +++ b/src/Databases/DatabaseReplicatedWorker.h @@ -15,6 +15,8 @@ public: String tryEnqueueAndExecuteEntry(DDLLogEntry & entry, const Context & query_context); + void shutdown() override; + private: void initializeMainThread() override; void initializeReplication(); diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index 242ee7ea0e1..1f4c7932329 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -471,16 +471,42 @@ void DDLWorker::processTask(DDLTaskBase & task) String active_node_path = task.getActiveNodePath(); String finished_node_path = task.getFinishedNodePath(); + /// It will tryRemove(...) 
on exception + auto active_node = zkutil::EphemeralNodeHolder::existing(active_node_path, *zookeeper); + + /// Try fast path auto create_active_res = zookeeper->tryCreate(active_node_path, {}, zkutil::CreateMode::Ephemeral); if (create_active_res != Coordination::Error::ZOK) { - if (create_active_res != Coordination::Error::ZNONODE) + if (create_active_res != Coordination::Error::ZNONODE && create_active_res != Coordination::Error::ZNODEEXISTS) + { + assert(Coordination::isHardwareError(create_active_res)); throw Coordination::Exception(create_active_res, active_node_path); - createStatusDirs(task.entry_path, zookeeper); + } + + /// Status dirs were not created in enqueueQuery(...) or someone is removing entry + if (create_active_res == Coordination::Error::ZNONODE) + createStatusDirs(task.entry_path, zookeeper); + + if (create_active_res == Coordination::Error::ZNODEEXISTS) + { + /// Connection has been lost and now we are retrying to write query status, + /// but our previous ephemeral node still exists. + assert(task.was_executed); + zkutil::EventPtr eph_node_disappeared = std::make_shared(); + String dummy; + if (zookeeper->tryGet(active_node_path, dummy, nullptr, eph_node_disappeared)) + { + constexpr int timeout_ms = 5000; + if (!eph_node_disappeared->tryWait(timeout_ms)) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Ephemeral node {} still exists, " + "probably it's owned by someone else", active_node_path); + } + } + zookeeper->create(active_node_path, {}, zkutil::CreateMode::Ephemeral); } - auto active_node = zkutil::EphemeralNodeHolder::existing(active_node_path, *zookeeper); if (!task.was_executed) { @@ -560,10 +586,12 @@ void DDLWorker::processTask(DDLTaskBase & task) if (!status_written) { zookeeper->multi(task.ops); - active_node->reset(); task.ops.clear(); } + /// Active node was removed in multi ops + active_node->reset(); + task.completely_processed = true; } @@ -947,6 +975,7 @@ void DDLWorker::runMainThread() current_tasks.clear(); last_skipped_entry_name.reset(); max_id = 0; + LOG_INFO(log, "Cleaned DDLWorker state"); }; setThreadName("DDLWorker"); diff --git a/src/Interpreters/DDLWorker.h b/src/Interpreters/DDLWorker.h index 1ae4f815b44..03c80e3f669 100644 --- a/src/Interpreters/DDLWorker.h +++ b/src/Interpreters/DDLWorker.h @@ -57,7 +57,7 @@ public: } void startup(); - void shutdown(); + virtual void shutdown(); bool isCurrentlyActive() const { return initialized && !stop_flag; } diff --git a/tests/integration/test_distributed_ddl/cluster.py b/tests/integration/test_distributed_ddl/cluster.py index 45a159ed2b9..24f11fec547 100644 --- a/tests/integration/test_distributed_ddl/cluster.py +++ b/tests/integration/test_distributed_ddl/cluster.py @@ -10,8 +10,8 @@ from helpers.test_tools import TSV class ClickHouseClusterWithDDLHelpers(ClickHouseCluster): - def __init__(self, base_path, config_dir): - ClickHouseCluster.__init__(self, base_path) + def __init__(self, base_path, config_dir, testcase_name): + ClickHouseCluster.__init__(self, base_path, name=testcase_name) self.test_config_dir = config_dir diff --git a/tests/integration/test_distributed_ddl/test.py b/tests/integration/test_distributed_ddl/test.py index f0e78dfec41..58e1d0d06f7 100755 --- a/tests/integration/test_distributed_ddl/test.py +++ b/tests/integration/test_distributed_ddl/test.py @@ -14,7 +14,7 @@ from .cluster import ClickHouseClusterWithDDLHelpers @pytest.fixture(scope="module", params=["configs", "configs_secure"]) def test_cluster(request): - cluster = ClickHouseClusterWithDDLHelpers(__file__, 
request.param) + cluster = ClickHouseClusterWithDDLHelpers(__file__, request.param, request.param) try: cluster.prepare() diff --git a/tests/integration/test_distributed_ddl/test_replicated_alter.py b/tests/integration/test_distributed_ddl/test_replicated_alter.py index bd95f5660b7..148ad5fca5e 100644 --- a/tests/integration/test_distributed_ddl/test_replicated_alter.py +++ b/tests/integration/test_distributed_ddl/test_replicated_alter.py @@ -12,7 +12,7 @@ from .cluster import ClickHouseClusterWithDDLHelpers @pytest.fixture(scope="module", params=["configs", "configs_secure"]) def test_cluster(request): - cluster = ClickHouseClusterWithDDLHelpers(__file__, request.param) + cluster = ClickHouseClusterWithDDLHelpers(__file__, request.param, "alters_" + request.param) try: # TODO: Fix ON CLUSTER alters when nodes have different configs. Need to canonicalize node identity. From eff5bdf3321c4f9ed01017254a914a065a314cc5 Mon Sep 17 00:00:00 2001 From: lehasm Date: Fri, 12 Feb 2021 22:28:03 +0300 Subject: [PATCH 0267/2357] Documented decodeXMLComponent function --- .../functions/string-functions.md | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/docs/en/sql-reference/functions/string-functions.md b/docs/en/sql-reference/functions/string-functions.md index 2b93dd924a3..fa9c84fa9af 100644 --- a/docs/en/sql-reference/functions/string-functions.md +++ b/docs/en/sql-reference/functions/string-functions.md @@ -600,4 +600,48 @@ Hello, "world"! 'foo' ``` +## decodeXMLComponent {#decode-xml-component} + +Replaces XML predefined entities with characters. +Predefined entities are `&quot;` `&amp;` `&apos;` `&gt;` `&lt;`. +This function also replaces numeric character references with Unicode characters. +Both decimal (like `&#10003;`) and hexadecimal (like `&#x2713;`) forms are supported. + +**Syntax** + +``` sql +decodeXMLComponent(x) +``` + +**Parameters** + +- `x` — A sequence of characters. [String](../../sql-reference/data-types/string.md). + +**Returned value** + +- The sequence of characters after replacement. + +Type: [String](../../sql-reference/data-types/string.md). + +**Example** + +Query: + +``` sql +SELECT decodeXMLComponent('&apos;foo&apos;'); +SELECT decodeXMLComponent('&lt; &#x3A3; &gt;'); +``` + +Result: + +``` text +'foo' +< Σ > +``` + +**See Also** + +- [List of XML and HTML character entity references](https://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references) + + [Original article](https://clickhouse.tech/docs/en/query_language/functions/string_functions/) From d9b85874c0139a3936cc15d85c3869ec22959a36 Mon Sep 17 00:00:00 2001 From: lehasm Date: Fri, 12 Feb 2021 22:52:02 +0300 Subject: [PATCH 0268/2357] welchttest, mannwhitneyutest markup fixed --- .../aggregate-functions/reference/mannwhitneyutest.md | 1 + .../en/sql-reference/aggregate-functions/reference/welchttest.md | 1 + 2 files changed, 2 insertions(+) diff --git a/docs/en/sql-reference/aggregate-functions/reference/mannwhitneyutest.md b/docs/en/sql-reference/aggregate-functions/reference/mannwhitneyutest.md index 012df7052aa..bc808ab0a9e 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/mannwhitneyutest.md +++ b/docs/en/sql-reference/aggregate-functions/reference/mannwhitneyutest.md @@ -30,6 +30,7 @@ The null hypothesis is that two populations are stochastically equal. Also one-s **Returned values** [Tuple](../../../sql-reference/data-types/tuple.md) with two elements: + - calculated U-statistic. [Float64](../../../sql-reference/data-types/float.md). - calculated p-value. [Float64](../../../sql-reference/data-types/float.md).
diff --git a/docs/en/sql-reference/aggregate-functions/reference/welchttest.md b/docs/en/sql-reference/aggregate-functions/reference/welchttest.md index 3fe1c9d58b9..44c320c4565 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/welchttest.md +++ b/docs/en/sql-reference/aggregate-functions/reference/welchttest.md @@ -24,6 +24,7 @@ The null hypothesis is that means of populations are equal. Normal distribution **Returned values** [Tuple](../../../sql-reference/data-types/tuple.md) with two elements: + - calculated t-statistic. [Float64](../../../sql-reference/data-types/float.md). - calculated p-value. [Float64](../../../sql-reference/data-types/float.md). From 00ac1e691abbae0f656a4d913ac489d52ad9c3e4 Mon Sep 17 00:00:00 2001 From: lehasm Date: Fri, 12 Feb 2021 23:01:47 +0300 Subject: [PATCH 0269/2357] studentttest, welchttest, mannwhitneyutest markup fixed (ru) --- .../aggregate-functions/reference/mannwhitneyutest.md | 1 + .../sql-reference/aggregate-functions/reference/studentttest.md | 1 + .../ru/sql-reference/aggregate-functions/reference/welchttest.md | 1 + 3 files changed, 3 insertions(+) diff --git a/docs/ru/sql-reference/aggregate-functions/reference/mannwhitneyutest.md b/docs/ru/sql-reference/aggregate-functions/reference/mannwhitneyutest.md index fb73fff5f00..a4647ecfb34 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/mannwhitneyutest.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/mannwhitneyutest.md @@ -31,6 +31,7 @@ mannWhitneyUTest[(alternative[, continuity_correction])](sample_data, sample_ind **Возвращаемые значения** [Кортеж](../../../sql-reference/data-types/tuple.md) с двумя элементами: + - вычисленное значение критерия Манна — Уитни. [Float64](../../../sql-reference/data-types/float.md). - вычисленное p-значение. [Float64](../../../sql-reference/data-types/float.md). diff --git a/docs/ru/sql-reference/aggregate-functions/reference/studentttest.md b/docs/ru/sql-reference/aggregate-functions/reference/studentttest.md index 5361e06c5e2..77378de95d1 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/studentttest.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/studentttest.md @@ -24,6 +24,7 @@ studentTTest(sample_data, sample_index) **Возвращаемые значения** [Кортеж](../../../sql-reference/data-types/tuple.md) с двумя элементами: + - вычисленное значение критерия Стьюдента. [Float64](../../../sql-reference/data-types/float.md). - вычисленное p-значение. [Float64](../../../sql-reference/data-types/float.md). diff --git a/docs/ru/sql-reference/aggregate-functions/reference/welchttest.md b/docs/ru/sql-reference/aggregate-functions/reference/welchttest.md index 1f36b2d04ee..16c122d1b49 100644 --- a/docs/ru/sql-reference/aggregate-functions/reference/welchttest.md +++ b/docs/ru/sql-reference/aggregate-functions/reference/welchttest.md @@ -24,6 +24,7 @@ welchTTest(sample_data, sample_index) **Возвращаемые значения** [Кортеж](../../../sql-reference/data-types/tuple.md) с двумя элементами: + - вычисленное значение критерия Уэлча. [Float64](../../../sql-reference/data-types/float.md). - вычисленное p-значение. [Float64](../../../sql-reference/data-types/float.md). 
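As a hedged usage sketch for the statistical test functions whose docs are touched above (the table, column names and result numbers are made up; the return value is the two-element tuple of statistic and p-value described in the Returned values sections):

``` sql
-- Hypothetical table: one row per observation, sample_index marks the group (0 or 1).
CREATE TABLE welch_ttest_samples (sample_data Float64, sample_index UInt8) ENGINE = Memory;
INSERT INTO welch_ttest_samples VALUES (20.3, 0), (21.1, 0), (21.9, 0), (18.9, 1), (19.0, 1), (20.1, 1);

SELECT welchTTest(sample_data, sample_index) FROM welch_ttest_samples;
-- Returns a tuple such as (2.7, 0.05): (t-statistic, p-value).
```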
From 1c656830fc32606cbc52699beb775f80b7094243 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 13 Feb 2021 00:26:12 +0300 Subject: [PATCH 0270/2357] Fix clang-tidy --- src/Storages/StorageMemory.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Storages/StorageMemory.cpp b/src/Storages/StorageMemory.cpp index 01f70db5edd..d7b0ae055ab 100644 --- a/src/Storages/StorageMemory.cpp +++ b/src/Storages/StorageMemory.cpp @@ -124,7 +124,7 @@ public: if (storage.compress) { Block compressed_block; - for (auto & elem : block) + for (const auto & elem : block) compressed_block.insert({ elem.column->compress(), elem.type, elem.name }); new_blocks.emplace_back(compressed_block); @@ -351,6 +351,7 @@ void registerStorageMemory(StorageFactory & factory) return StorageMemory::create(args.table_id, args.columns, args.constraints, settings.compress); }, { + .supports_settings = true, .supports_parallel_insert = true, }); } From 453450985f9b5452779b6b4a7ec6c0a44105e3dc Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 13 Feb 2021 00:26:25 +0300 Subject: [PATCH 0271/2357] Performance improvement by Nikolai Kochetov --- src/Storages/StorageMemory.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Storages/StorageMemory.h b/src/Storages/StorageMemory.h index 91cf616c57d..db71c13ca99 100644 --- a/src/Storages/StorageMemory.h +++ b/src/Storages/StorageMemory.h @@ -45,6 +45,8 @@ public: /// Smaller blocks (e.g. 64K rows) are better for CPU cache. bool prefersLargeBlocks() const override { return false; } + bool hasEvenlyDistributedRead() const override { return true; } + BlockOutputStreamPtr write(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, const Context & context) override; void drop() override; From b5826121db6379acb5eb54e800ba73bd8cf0cd06 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 13 Feb 2021 00:29:06 +0300 Subject: [PATCH 0272/2357] Fix Arcadia --- src/Columns/ya.make | 1 + src/Columns/ya.make.in | 1 + 2 files changed, 2 insertions(+) diff --git a/src/Columns/ya.make b/src/Columns/ya.make index def9dfd4cb7..061391b5214 100644 --- a/src/Columns/ya.make +++ b/src/Columns/ya.make @@ -13,6 +13,7 @@ PEERDIR( clickhouse/src/Common contrib/libs/icu contrib/libs/pdqsort + contrib/libs/lz4 ) SRCS( diff --git a/src/Columns/ya.make.in b/src/Columns/ya.make.in index 677a5bcbd70..4422d222ce1 100644 --- a/src/Columns/ya.make.in +++ b/src/Columns/ya.make.in @@ -12,6 +12,7 @@ PEERDIR( clickhouse/src/Common contrib/libs/icu contrib/libs/pdqsort + contrib/libs/lz4 ) SRCS( From 10d773d67154d67c2fa975f5c8d46c8f9ccfb5a6 Mon Sep 17 00:00:00 2001 From: lehasm Date: Sat, 13 Feb 2021 22:35:53 +0300 Subject: [PATCH 0273/2357] HTTP compression info updated xz compression method added. Text rearranged and edited. Examples improved. --- docs/en/interfaces/http.md | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/docs/en/interfaces/http.md b/docs/en/interfaces/http.md index 310286e3d44..84c1e268e07 100644 --- a/docs/en/interfaces/http.md +++ b/docs/en/interfaces/http.md @@ -148,25 +148,41 @@ $ echo 'DROP TABLE t' | curl 'http://localhost:8123/' --data-binary @- For successful requests that don’t return a data table, an empty response body is returned. -You can use the internal ClickHouse compression format when transmitting data. 
The compressed data has a non-standard format, and you will need to use the special `clickhouse-compressor` program to work with it (it is installed with the `clickhouse-client` package). To increase the efficiency of data insertion, you can disable server-side checksum verification by using the [http_native_compression_disable_checksumming_on_decompress](../operations/settings/settings.md#settings-http_native_compression_disable_checksumming_on_decompress) setting. -If you specified `compress=1` in the URL, the server compresses the data it sends you. -If you specified `decompress=1` in the URL, the server decompresses the same data that you pass in the `POST` method. +## Compression {#compression} -You can also choose to use [HTTP compression](https://en.wikipedia.org/wiki/HTTP_compression). To send a compressed `POST` request, append the request header `Content-Encoding: compression_method`. In order for ClickHouse to compress the response, you must append `Accept-Encoding: compression_method`. ClickHouse supports `gzip`, `br`, and `deflate` [compression methods](https://en.wikipedia.org/wiki/HTTP_compression#Content-Encoding_tokens). To enable HTTP compression, you must use the ClickHouse [enable_http_compression](../operations/settings/settings.md#settings-enable_http_compression) setting. You can configure the data compression level in the [http_zlib_compression_level](#settings-http_zlib_compression_level) setting for all the compression methods. +You can use compression to reduce network traffic when transmitting a large amount of data or for creating dumps that are immediately compressed. -You can use this to reduce network traffic when transmitting a large amount of data, or for creating dumps that are immediately compressed. +You can use the internal ClickHouse compression format when transmitting data. The compressed data has a non-standard format, and you need `clickhouse-compressor` program to work with it. It is installed with the `clickhouse-client` package. To increase the efficiency of data insertion, you can disable server-side checksum verification by using the [http_native_compression_disable_checksumming_on_decompress](../operations/settings/settings.md#settings-http_native_compression_disable_checksumming_on_decompress) setting. -Examples of sending data with compression: +If you specify `compress=1` in the URL, the server will compress the data it sends to you. +If you specify `decompress=1` in the URL, the server will decompress the data which you pass in the `POST` method. +You can also choose to use [HTTP compression](https://en.wikipedia.org/wiki/HTTP_compression). ClickHouse supports the following [compression methods](https://en.wikipedia.org/wiki/HTTP_compression#Content-Encoding_tokens): + +- `gzip` +- `br` +- `deflate` +- `xz` + +To send a compressed `POST` request, append the request header `Content-Encoding: compression_method`. 
Example: ``` bash -#Sending data to the server: -$ curl -vsS "http://localhost:8123/?enable_http_compression=1" -d 'SELECT number FROM system.numbers LIMIT 10' -H 'Accept-Encoding: gzip' - -#Sending data to the client: -$ echo "SELECT 1" | gzip -c | curl -sS --data-binary @- -H 'Content-Encoding: gzip' 'http://localhost:8123/' +$ echo "SELECT 1" | gzip -c | \ + curl -sS --data-binary @- -H 'Content-Encoding: gzip' 'http://localhost:8123/' ``` +In order for ClickHouse to compress the response, enable compression with [enable_http_compression](../operations/settings/settings.md#settings-enable_http_compression) setting and append `Accept-Encoding: compression_method` header to the request. You can configure the data compression level in the [http_zlib_compression_level](../operations/settings/settings.md#settings-http_zlib_compression_level) setting for all compression methods. +``` bash +$ curl -vsS "http://localhost:8123/?enable_http_compression=1" \ + -H 'Accept-Encoding: gzip' --output result.gz -d 'SELECT number FROM system.numbers LIMIT 3' +$ zcat result.gz +0 +1 +2 +``` + +## Default Database {#default-database} + !!! note "Note" Some HTTP clients might decompress data from the server by default (with `gzip` and `deflate`) and you might get decompressed data even if you use the compression settings correctly. From d4ba07c5c6737f2c978331969d6b7c4ce535613c Mon Sep 17 00:00:00 2001 From: lehasm Date: Sat, 13 Feb 2021 23:26:56 +0300 Subject: [PATCH 0274/2357] Fix missplaced header --- docs/en/interfaces/http.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/en/interfaces/http.md b/docs/en/interfaces/http.md index 84c1e268e07..d82d8baeb75 100644 --- a/docs/en/interfaces/http.md +++ b/docs/en/interfaces/http.md @@ -181,11 +181,12 @@ $ zcat result.gz 2 ``` -## Default Database {#default-database} - !!! note "Note" Some HTTP clients might decompress data from the server by default (with `gzip` and `deflate`) and you might get decompressed data even if you use the compression settings correctly. + +## Default Database {#default-database} + You can use the ‘database’ URL parameter or the ‘X-ClickHouse-Database’ header to specify the default database. 
``` bash From 79592b73f840179faa1efc4ea447f6c2107921fe Mon Sep 17 00:00:00 2001 From: Denis Glazachev Date: Sun, 14 Feb 2021 02:07:13 +0400 Subject: [PATCH 0275/2357] Store filter info in prewhere info instead of multiple prewheres Some cleanups --- src/Interpreters/ExpressionAnalyzer.cpp | 6 +- src/Interpreters/ExpressionAnalyzer.h | 4 +- src/Interpreters/InterpreterSelectQuery.cpp | 140 +++++++++--------- src/Interpreters/InterpreterSelectQuery.h | 4 +- .../getHeaderForProcessingStage.cpp | 22 ++- src/Storages/IStorage.cpp | 8 +- .../MergeTreeBaseSelectProcessor.cpp | 63 +++++--- .../MergeTree/MergeTreeBaseSelectProcessor.h | 8 +- .../MergeTree/MergeTreeBlockReadUtils.cpp | 25 +++- .../MergeTree/MergeTreeBlockReadUtils.h | 7 +- .../MergeTree/MergeTreeDataSelectExecutor.cpp | 30 ++-- .../MergeTree/MergeTreeRangeReader.cpp | 76 ++++++---- src/Storages/MergeTree/MergeTreeRangeReader.h | 7 +- src/Storages/MergeTree/MergeTreeReadPool.cpp | 8 +- src/Storages/MergeTree/MergeTreeReadPool.h | 9 +- .../MergeTreeReverseSelectProcessor.cpp | 11 +- .../MergeTreeReverseSelectProcessor.h | 2 +- .../MergeTree/MergeTreeSelectProcessor.cpp | 11 +- .../MergeTree/MergeTreeSelectProcessor.h | 2 +- ...rgeTreeThreadSelectBlockInputProcessor.cpp | 8 +- ...MergeTreeThreadSelectBlockInputProcessor.h | 2 +- src/Storages/SelectQueryInfo.h | 60 +++++--- src/Storages/StorageBuffer.cpp | 39 +++-- 23 files changed, 320 insertions(+), 232 deletions(-) diff --git a/src/Interpreters/ExpressionAnalyzer.cpp b/src/Interpreters/ExpressionAnalyzer.cpp index 5fdd75d19eb..3b9e317934b 100644 --- a/src/Interpreters/ExpressionAnalyzer.cpp +++ b/src/Interpreters/ExpressionAnalyzer.cpp @@ -1320,7 +1320,7 @@ ExpressionAnalysisResult::ExpressionAnalysisResult( bool first_stage_, bool second_stage_, bool only_types, - const FilterInfoPtr & filter_info_, + const FilterDAGInfoPtr & filter_info_, const Block & source_header) : first_stage(first_stage_) , second_stage(second_stage_) @@ -1383,7 +1383,7 @@ ExpressionAnalysisResult::ExpressionAnalysisResult( if (storage && filter_info_) { filter_info = filter_info_; - query_analyzer.appendPreliminaryFilter(chain, filter_info->actions_dag, filter_info->column_name); + query_analyzer.appendPreliminaryFilter(chain, filter_info->actions, filter_info->column_name); } if (auto actions = query_analyzer.appendPrewhere(chain, !first_stage, additional_required_columns_after_prewhere)) @@ -1583,7 +1583,7 @@ void ExpressionAnalysisResult::finalize(const ExpressionActionsChain & chain, si void ExpressionAnalysisResult::removeExtraColumns() const { if (hasFilter()) - filter_info->actions_dag->projectInput(); + filter_info->actions->projectInput(); if (hasWhere()) before_where->projectInput(); if (hasHaving()) diff --git a/src/Interpreters/ExpressionAnalyzer.h b/src/Interpreters/ExpressionAnalyzer.h index 71301ad64a2..3ba9da534bc 100644 --- a/src/Interpreters/ExpressionAnalyzer.h +++ b/src/Interpreters/ExpressionAnalyzer.h @@ -215,7 +215,7 @@ struct ExpressionAnalysisResult NameSet columns_to_remove_after_prewhere; PrewhereDAGInfoPtr prewhere_info; - FilterInfoPtr filter_info; + FilterDAGInfoPtr filter_info; ConstantFilterDescription prewhere_constant_filter_description; ConstantFilterDescription where_constant_filter_description; /// Actions by every element of ORDER BY @@ -230,7 +230,7 @@ struct ExpressionAnalysisResult bool first_stage, bool second_stage, bool only_types, - const FilterInfoPtr & filter_info, + const FilterDAGInfoPtr & filter_info, const Block & source_header); /// Filter for row-level 
security. diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index 6a0e2515801..3d91b271b82 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -447,10 +447,10 @@ InterpreterSelectQuery::InterpreterSelectQuery( /// Fix source_header for filter actions. if (row_policy_filter) { - filter_info = std::make_shared(); - filter_info->column_name = generateFilterActions(filter_info->actions_dag, row_policy_filter, required_columns); + filter_info = std::make_shared(); + filter_info->column_name = generateFilterActions(filter_info->actions, row_policy_filter, required_columns); source_header = metadata_snapshot->getSampleBlockForColumns( - filter_info->actions_dag->getRequiredColumns().getNames(), storage->getVirtuals(), storage->getStorageID()); + filter_info->actions->getRequiredColumns().getNames(), storage->getVirtuals(), storage->getStorageID()); } } @@ -807,9 +807,12 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu bool intermediate_stage = false; bool to_aggregation_stage = false; bool from_aggregation_stage = false; - const bool filter_in_prewhere = ( - (settings.optimize_move_to_prewhere || expressions.prewhere_info) && - !input && !input_pipe && storage && storage->supportsPrewhere() + const bool execute_row_level_filter_in_prewhere = ( + ( + settings.optimize_move_to_prewhere || // ...when it is allowed to move things to prewhere, so we do it for row-level filter actions too. + expressions.prewhere_info // ...or when we already have prewhere and must execute row-level filter before it. + ) && + !input && !input_pipe && storage && storage->supportsPrewhere() // Check that prewhere can be used at all. ); if (options.only_analyze) @@ -817,11 +820,11 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu auto read_nothing = std::make_unique(source_header); query_plan.addStep(std::move(read_nothing)); - if (expressions.filter_info && filter_in_prewhere) + if (expressions.filter_info && execute_row_level_filter_in_prewhere) { auto row_level_security_step = std::make_unique( query_plan.getCurrentDataStream(), - expressions.filter_info->actions_dag, + expressions.filter_info->actions, expressions.filter_info->column_name, expressions.filter_info->do_remove_column); @@ -880,7 +883,7 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu to_aggregation_stage = true; /// Read the data from Storage. from_stage - to what stage the request was completed in Storage. 
- executeFetchColumns(from_stage, query_plan, filter_in_prewhere); + executeFetchColumns(from_stage, query_plan, execute_row_level_filter_in_prewhere); LOG_TRACE(log, "{} -> {}", QueryProcessingStage::toString(from_stage), QueryProcessingStage::toString(options.to_stage)); } @@ -945,11 +948,11 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu if (expressions.first_stage) { - if (expressions.filter_info && !filter_in_prewhere) + if (expressions.filter_info && !execute_row_level_filter_in_prewhere) { auto row_level_security_step = std::make_unique( query_plan.getCurrentDataStream(), - expressions.filter_info->actions_dag, + expressions.filter_info->actions, expressions.filter_info->column_name, expressions.filter_info->do_remove_column); @@ -1200,40 +1203,55 @@ void InterpreterSelectQuery::addEmptySourceToQueryPlan(QueryPlan & query_plan, c { Pipe pipe(std::make_shared(source_header)); - if (query_info.prewhere_info_list) + if (query_info.prewhere_info) { - for (const auto & prewhere_info : *query_info.prewhere_info_list) + auto & prewhere_info = *query_info.prewhere_info; + + if (prewhere_info.filter_info) { - if (prewhere_info.alias_actions) - { - pipe.addSimpleTransform([&](const Block & header) - { - return std::make_shared( - header, prewhere_info.alias_actions); - }); - } + auto & filter_info = *prewhere_info.filter_info; pipe.addSimpleTransform([&](const Block & header) { return std::make_shared( header, - prewhere_info.prewhere_actions, - prewhere_info.prewhere_column_name, - prewhere_info.remove_prewhere_column); + filter_info.actions, + filter_info.column_name, + filter_info.do_remove_column); }); + } - // To remove additional columns - // In some cases, we did not read any marks so that the pipeline.streams is empty - // Thus, some columns in prewhere are not removed as expected - // This leads to mismatched header in distributed table - if (prewhere_info.remove_columns_actions) + if (prewhere_info.alias_actions) + { + pipe.addSimpleTransform([&](const Block & header) { - pipe.addSimpleTransform([&](const Block & header) - { - return std::make_shared( - header, prewhere_info.remove_columns_actions); - }); - } + return std::make_shared( + header, + prewhere_info.alias_actions); + }); + } + + pipe.addSimpleTransform([&](const Block & header) + { + return std::make_shared( + header, + prewhere_info.prewhere_actions, + prewhere_info.prewhere_column_name, + prewhere_info.remove_prewhere_column); + }); + + // To remove additional columns + // In some cases, we did not read any marks so that the pipeline.streams is empty + // Thus, some columns in prewhere are not removed as expected + // This leads to mismatched header in distributed table + if (prewhere_info.remove_columns_actions) + { + pipe.addSimpleTransform([&](const Block & header) + { + return std::make_shared( + header, + prewhere_info.remove_columns_actions); + }); } } @@ -1242,7 +1260,7 @@ void InterpreterSelectQuery::addEmptySourceToQueryPlan(QueryPlan & query_plan, c query_plan.addStep(std::move(read_from_pipe)); } -void InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum processing_stage, QueryPlan & query_plan, bool filter_in_prewhere) +void InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum processing_stage, QueryPlan & query_plan, bool execute_row_level_filter_in_prewhere) { auto & query = getSelectQuery(); const Settings & settings = context->getSettingsRef(); @@ -1569,47 +1587,33 @@ void 
InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum proc query_info.syntax_analyzer_result = syntax_analyzer_result; query_info.sets = query_analyzer->getPreparedSets(); - if (expressions.filter_info && filter_in_prewhere) - { - if (!query_info.prewhere_info_list) - query_info.prewhere_info_list = std::make_shared(); - - query_info.prewhere_info_list->emplace( - query_info.prewhere_info_list->begin(), - std::make_shared(expressions.filter_info->actions_dag), - expressions.filter_info->column_name); - - if (alias_actions) - { - query_info.prewhere_info_list->back().alias_actions = std::make_shared(alias_actions); - alias_actions = nullptr; - } - - auto & new_filter_info = query_info.prewhere_info_list->front(); - - new_filter_info.remove_prewhere_column = expressions.filter_info->do_remove_column; - new_filter_info.need_filter = true; - } - if (prewhere_info) { - if (!query_info.prewhere_info_list) - query_info.prewhere_info_list = std::make_shared(); + query_info.prewhere_info = std::make_shared(); - query_info.prewhere_info_list->emplace_back( - std::make_shared(prewhere_info->prewhere_actions), - prewhere_info->prewhere_column_name); + if (expressions.filter_info && execute_row_level_filter_in_prewhere) + { + query_info.prewhere_info->filter_info = std::make_shared(); - auto & new_prewhere_info = query_info.prewhere_info_list->back(); + if (expressions.filter_info->actions) + query_info.prewhere_info->filter_info->actions = std::make_shared(expressions.filter_info->actions); + + query_info.prewhere_info->filter_info->column_name = expressions.filter_info->column_name; + query_info.prewhere_info->filter_info->do_remove_column = expressions.filter_info->do_remove_column; + } if (prewhere_info->alias_actions) - new_prewhere_info.alias_actions = std::make_shared(prewhere_info->alias_actions); + query_info.prewhere_info->alias_actions = std::make_shared(prewhere_info->alias_actions); + + if (prewhere_info->prewhere_actions) + query_info.prewhere_info->prewhere_actions = std::make_shared(prewhere_info->prewhere_actions); if (prewhere_info->remove_columns_actions) - new_prewhere_info.remove_columns_actions = std::make_shared(prewhere_info->remove_columns_actions); + query_info.prewhere_info->remove_columns_actions = std::make_shared(prewhere_info->remove_columns_actions); - new_prewhere_info.remove_prewhere_column = prewhere_info->remove_prewhere_column; - new_prewhere_info.need_filter = prewhere_info->need_filter; + query_info.prewhere_info->prewhere_column_name = prewhere_info->prewhere_column_name; + query_info.prewhere_info->remove_prewhere_column = prewhere_info->remove_prewhere_column; + query_info.prewhere_info->need_filter = prewhere_info->need_filter; } /// Create optimizer with prepared actions. diff --git a/src/Interpreters/InterpreterSelectQuery.h b/src/Interpreters/InterpreterSelectQuery.h index 6fcbf102b05..793df612103 100644 --- a/src/Interpreters/InterpreterSelectQuery.h +++ b/src/Interpreters/InterpreterSelectQuery.h @@ -108,7 +108,7 @@ private: /// Different stages of query execution. 
- void executeFetchColumns(QueryProcessingStage::Enum processing_stage, QueryPlan & query_plan, bool filter_in_prewhere); + void executeFetchColumns(QueryProcessingStage::Enum processing_stage, QueryPlan & query_plan, bool execute_row_level_filter_in_prewhere); void executeWhere(QueryPlan & query_plan, const ActionsDAGPtr & expression, bool remove_filter); void executeAggregation(QueryPlan & query_plan, const ActionsDAGPtr & expression, bool overflow_row, bool final, InputOrderInfoPtr group_by_info); void executeMergeAggregated(QueryPlan & query_plan, bool overflow_row, bool final); @@ -157,7 +157,7 @@ private: /// Is calculated in getSampleBlock. Is used later in readImpl. ExpressionAnalysisResult analysis_result; /// For row-level security. - FilterInfoPtr filter_info; + FilterDAGInfoPtr filter_info; QueryProcessingStage::Enum from_stage = QueryProcessingStage::FetchColumns; diff --git a/src/Interpreters/getHeaderForProcessingStage.cpp b/src/Interpreters/getHeaderForProcessingStage.cpp index 761f04e81ee..2aef3c25c3c 100644 --- a/src/Interpreters/getHeaderForProcessingStage.cpp +++ b/src/Interpreters/getHeaderForProcessingStage.cpp @@ -42,14 +42,26 @@ Block getHeaderForProcessingStage( case QueryProcessingStage::FetchColumns: { Block header = metadata_snapshot->getSampleBlockForColumns(column_names, storage.getVirtuals(), storage.getStorageID()); - if (query_info.prewhere_info_list) + if (query_info.prewhere_info) { - for (const auto & prewhere_info : *query_info.prewhere_info_list) + auto & prewhere_info = *query_info.prewhere_info; + + if (prewhere_info.filter_info) { - prewhere_info.prewhere_actions->execute(header); - if (prewhere_info.remove_prewhere_column) - header.erase(prewhere_info.prewhere_column_name); + auto & filter_info = *prewhere_info.filter_info; + + if (filter_info.actions) + filter_info.actions->execute(header); + + if (filter_info.do_remove_column) + header.erase(filter_info.column_name); } + + if (prewhere_info.prewhere_actions) + prewhere_info.prewhere_actions->execute(header); + + if (prewhere_info.remove_prewhere_column) + header.erase(prewhere_info.prewhere_column_name); } return header; } diff --git a/src/Storages/IStorage.cpp b/src/Storages/IStorage.cpp index 5f500518516..b83c33be2aa 100644 --- a/src/Storages/IStorage.cpp +++ b/src/Storages/IStorage.cpp @@ -203,14 +203,14 @@ std::string PrewhereDAGInfo::dump() const return ss.str(); } -std::string FilterInfo::dump() const +std::string FilterDAGInfo::dump() const { WriteBufferFromOwnString ss; - ss << "FilterInfo for column '" << column_name <<"', do_remove_column " + ss << "FilterDAGInfo for column '" << column_name <<"', do_remove_column " << do_remove_column << "\n"; - if (actions_dag) + if (actions) { - ss << "actions_dag " << actions_dag->dumpDAG() << "\n"; + ss << "actions " << actions->dumpDAG() << "\n"; } return ss.str(); diff --git a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp index 3405a211c98..54b343519fa 100644 --- a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp @@ -22,17 +22,17 @@ MergeTreeBaseSelectProcessor::MergeTreeBaseSelectProcessor( Block header, const MergeTreeData & storage_, const StorageMetadataPtr & metadata_snapshot_, - const PrewhereInfoListPtr & prewhere_info_list_, + const PrewhereInfoPtr & prewhere_info_, UInt64 max_block_size_rows_, UInt64 preferred_block_size_bytes_, UInt64 preferred_max_column_in_block_size_bytes_, const 
MergeTreeReaderSettings & reader_settings_, bool use_uncompressed_cache_, const Names & virt_column_names_) - : SourceWithProgress(getHeader(std::move(header), prewhere_info_list_, virt_column_names_)) + : SourceWithProgress(getHeader(std::move(header), prewhere_info_, virt_column_names_)) , storage(storage_) , metadata_snapshot(metadata_snapshot_) - , prewhere_info_list(prewhere_info_list_) + , prewhere_info(prewhere_info_) , max_block_size_rows(max_block_size_rows_) , preferred_block_size_bytes(preferred_block_size_bytes_) , preferred_max_column_in_block_size_bytes(preferred_max_column_in_block_size_bytes_) @@ -70,18 +70,18 @@ Chunk MergeTreeBaseSelectProcessor::generate() void MergeTreeBaseSelectProcessor::initializeRangeReaders(MergeTreeReadTask & current_task) { - if (prewhere_info_list) + if (prewhere_info) { if (reader->getColumns().empty()) { - current_task.range_reader = MergeTreeRangeReader(pre_reader.get(), nullptr, prewhere_info_list, true); + current_task.range_reader = MergeTreeRangeReader(pre_reader.get(), nullptr, prewhere_info, true); } else { MergeTreeRangeReader * pre_reader_ptr = nullptr; if (pre_reader != nullptr) { - current_task.pre_range_reader = MergeTreeRangeReader(pre_reader.get(), nullptr, prewhere_info_list, false); + current_task.pre_range_reader = MergeTreeRangeReader(pre_reader.get(), nullptr, prewhere_info, false); pre_reader_ptr = ¤t_task.pre_range_reader; } @@ -309,37 +309,60 @@ void MergeTreeBaseSelectProcessor::injectVirtualColumns(Chunk & chunk, MergeTree chunk.setColumns(columns, num_rows); } -void MergeTreeBaseSelectProcessor::executePrewhereActions(Block & block, const PrewhereInfoListPtr & prewhere_info_list) +void MergeTreeBaseSelectProcessor::executePrewhereActions(Block & block, const PrewhereInfoPtr & prewhere_info) { - if (!prewhere_info_list) - return; - - for (const auto & prewhere_info : *prewhere_info_list) + if (prewhere_info) { - if (prewhere_info.alias_actions) - prewhere_info.alias_actions->execute(block); + if (prewhere_info->filter_info) + { + auto & filter_info = *prewhere_info->filter_info; - prewhere_info.prewhere_actions->execute(block); - auto & prewhere_column = block.getByName(prewhere_info.prewhere_column_name); + if (filter_info.actions) + filter_info.actions->execute(block); + auto & filter_column = block.getByName(filter_info.column_name); + if (!filter_column.type->canBeUsedInBooleanContext()) + { + throw Exception("Invalid type for row-level security filter: " + filter_column.type->getName(), + ErrorCodes::LOGICAL_ERROR); + } + + if (filter_info.do_remove_column) + block.erase(filter_info.column_name); + else + { + auto & ctn = block.getByName(filter_info.column_name); + ctn.column = ctn.type->createColumnConst(block.rows(), 1u)->convertToFullColumnIfConst(); + } + } + + if (prewhere_info->alias_actions) + prewhere_info->alias_actions->execute(block); + + if (prewhere_info->prewhere_actions) + prewhere_info->prewhere_actions->execute(block); + + auto & prewhere_column = block.getByName(prewhere_info->prewhere_column_name); if (!prewhere_column.type->canBeUsedInBooleanContext()) + { throw Exception("Invalid type for filter in PREWHERE: " + prewhere_column.type->getName(), ErrorCodes::LOGICAL_ERROR); + } - if (prewhere_info.remove_prewhere_column) - block.erase(prewhere_info.prewhere_column_name); + if (prewhere_info->remove_prewhere_column) + block.erase(prewhere_info->prewhere_column_name); else { - auto & ctn = block.getByName(prewhere_info.prewhere_column_name); + auto & ctn = 
block.getByName(prewhere_info->prewhere_column_name); ctn.column = ctn.type->createColumnConst(block.rows(), 1u)->convertToFullColumnIfConst(); } } } Block MergeTreeBaseSelectProcessor::getHeader( - Block block, const PrewhereInfoListPtr & prewhere_info_list, const Names & virtual_columns) + Block block, const PrewhereInfoPtr & prewhere_info, const Names & virtual_columns) { - executePrewhereActions(block, prewhere_info_list); + executePrewhereActions(block, prewhere_info); injectVirtualColumns(block, nullptr, virtual_columns); return block; } diff --git a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h index a3d7520b89a..00ef131ae45 100644 --- a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h +++ b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.h @@ -23,7 +23,7 @@ public: Block header, const MergeTreeData & storage_, const StorageMetadataPtr & metadata_snapshot_, - const PrewhereInfoListPtr & prewhere_info_list_, + const PrewhereInfoPtr & prewhere_info_, UInt64 max_block_size_rows_, UInt64 preferred_block_size_bytes_, UInt64 preferred_max_column_in_block_size_bytes_, @@ -33,7 +33,7 @@ public: ~MergeTreeBaseSelectProcessor() override; - static void executePrewhereActions(Block & block, const PrewhereInfoListPtr & prewhere_info_list); + static void executePrewhereActions(Block & block, const PrewhereInfoPtr & prewhere_info); protected: Chunk generate() final; @@ -49,7 +49,7 @@ protected: static void injectVirtualColumns(Block & block, MergeTreeReadTask * task, const Names & virtual_columns); static void injectVirtualColumns(Chunk & chunk, MergeTreeReadTask * task, const Names & virtual_columns); - static Block getHeader(Block block, const PrewhereInfoListPtr & prewhere_info_list, const Names & virtual_columns); + static Block getHeader(Block block, const PrewhereInfoPtr & prewhere_info, const Names & virtual_columns); void initializeRangeReaders(MergeTreeReadTask & task); @@ -57,7 +57,7 @@ protected: const MergeTreeData & storage; StorageMetadataPtr metadata_snapshot; - PrewhereInfoListPtr prewhere_info_list; + PrewhereInfoPtr prewhere_info; UInt64 max_block_size_rows; UInt64 preferred_block_size_bytes; diff --git a/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp b/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp index f3191a76120..d5fb2f3300c 100644 --- a/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp +++ b/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp @@ -118,10 +118,11 @@ NameSet injectRequiredColumns(const MergeTreeData & storage, const StorageMetada MergeTreeReadTask::MergeTreeReadTask( const MergeTreeData::DataPartPtr & data_part_, const MarkRanges & mark_ranges_, const size_t part_index_in_query_, const Names & ordered_names_, const NameSet & column_name_set_, const NamesAndTypesList & columns_, - const NamesAndTypesList & pre_columns_, const bool should_reorder_, MergeTreeBlockSizePredictorPtr && size_predictor_) + const NamesAndTypesList & pre_columns_, const bool remove_prewhere_column_, const bool should_reorder_, + MergeTreeBlockSizePredictorPtr && size_predictor_) : data_part{data_part_}, mark_ranges{mark_ranges_}, part_index_in_query{part_index_in_query_}, ordered_names{ordered_names_}, column_name_set{column_name_set_}, columns{columns_}, pre_columns{pre_columns_}, - should_reorder{should_reorder_}, size_predictor{std::move(size_predictor_)} + remove_prewhere_column{remove_prewhere_column_}, should_reorder{should_reorder_}, size_predictor{std::move(size_predictor_)} { } @@ -257,7 +258,7 @@ 
MergeTreeReadTaskColumns getReadTaskColumns( const StorageMetadataPtr & metadata_snapshot, const MergeTreeData::DataPartPtr & data_part, const Names & required_columns, - const PrewhereInfoListPtr & prewhere_info_list, + const PrewhereInfoPtr & prewhere_info, bool check_columns) { Names column_names = required_columns; @@ -266,12 +267,22 @@ MergeTreeReadTaskColumns getReadTaskColumns( /// inject columns required for defaults evaluation bool should_reorder = !injectRequiredColumns(storage, metadata_snapshot, data_part, column_names).empty(); - if (prewhere_info_list) + if (prewhere_info) { - for (const auto & prewhere_info : *prewhere_info_list) + if (prewhere_info->filter_info && prewhere_info->filter_info->actions) { - const auto required_column_names = (prewhere_info.alias_actions ? - prewhere_info.alias_actions->getRequiredColumns() : prewhere_info.prewhere_actions->getRequiredColumns()); + const auto required_column_names = prewhere_info->filter_info->actions->getRequiredColumns(); + pre_column_names.insert(pre_column_names.end(), required_column_names.begin(), required_column_names.end()); + } + + if (prewhere_info->alias_actions) + { + const auto required_column_names = prewhere_info->alias_actions->getRequiredColumns(); + pre_column_names.insert(pre_column_names.end(), required_column_names.begin(), required_column_names.end()); + } + else + { + const auto required_column_names = prewhere_info->prewhere_actions->getRequiredColumns(); pre_column_names.insert(pre_column_names.end(), required_column_names.begin(), required_column_names.end()); } diff --git a/src/Storages/MergeTree/MergeTreeBlockReadUtils.h b/src/Storages/MergeTree/MergeTreeBlockReadUtils.h index f2537c554c3..31d609e4242 100644 --- a/src/Storages/MergeTree/MergeTreeBlockReadUtils.h +++ b/src/Storages/MergeTree/MergeTreeBlockReadUtils.h @@ -42,6 +42,8 @@ struct MergeTreeReadTask const NamesAndTypesList & columns; /// column names to read during PREWHERE const NamesAndTypesList & pre_columns; + /// should PREWHERE column be returned to requesting side? 
+ const bool remove_prewhere_column; /// resulting block may require reordering in accordance with `ordered_names` const bool should_reorder; /// Used to satistfy preferred_block_size_bytes limitation @@ -55,7 +57,8 @@ struct MergeTreeReadTask MergeTreeReadTask( const MergeTreeData::DataPartPtr & data_part_, const MarkRanges & mark_ranges_, const size_t part_index_in_query_, const Names & ordered_names_, const NameSet & column_name_set_, const NamesAndTypesList & columns_, - const NamesAndTypesList & pre_columns_, const bool should_reorder_, MergeTreeBlockSizePredictorPtr && size_predictor_); + const NamesAndTypesList & pre_columns_, const bool remove_prewhere_column_, const bool should_reorder_, + MergeTreeBlockSizePredictorPtr && size_predictor_); virtual ~MergeTreeReadTask(); }; @@ -75,7 +78,7 @@ MergeTreeReadTaskColumns getReadTaskColumns( const StorageMetadataPtr & metadata_snapshot, const MergeTreeData::DataPartPtr & data_part, const Names & required_columns, - const PrewhereInfoListPtr & prewhere_info_list, + const PrewhereInfoPtr & prewhere_info, bool check_columns); struct MergeTreeBlockSizePredictor diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp index 9d331f19a56..b44e7197c12 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp @@ -834,20 +834,14 @@ QueryPlanPtr MergeTreeDataSelectExecutor::readFromParts( plan->addStep(std::move(adding_column)); } - if (query_info.prewhere_info_list) + if (query_info.prewhere_info && query_info.prewhere_info->remove_columns_actions) { - for (const auto & prewhere_info : *query_info.prewhere_info_list) - { - if (prewhere_info.remove_columns_actions) - { - auto expression_step = std::make_unique( - plan->getCurrentDataStream(), - prewhere_info.remove_columns_actions->getActionsDAG().clone()); + auto expression_step = std::make_unique( + plan->getCurrentDataStream(), + query_info.prewhere_info->remove_columns_actions->getActionsDAG().clone()); - expression_step->setStepDescription("Remove unused columns after PREWHERE"); - plan->addStep(std::move(expression_step)); - } - } + expression_step->setStepDescription("Remove unused columns after PREWHERE"); + plan->addStep(std::move(expression_step)); } return plan; @@ -983,7 +977,7 @@ QueryPlanPtr MergeTreeDataSelectExecutor::spreadMarkRangesAmongStreams( std::move(parts), data, metadata_snapshot, - query_info.prewhere_info_list, + query_info.prewhere_info, true, column_names, MergeTreeReadPool::BackoffSettings(settings), @@ -999,7 +993,7 @@ QueryPlanPtr MergeTreeDataSelectExecutor::spreadMarkRangesAmongStreams( i, pool, min_marks_for_concurrent_read, max_block_size, settings.preferred_block_size_bytes, settings.preferred_max_column_in_block_size_bytes, data, metadata_snapshot, use_uncompressed_cache, - query_info.prewhere_info_list, reader_settings, virt_columns); + query_info.prewhere_info, reader_settings, virt_columns); if (i == 0) { @@ -1022,7 +1016,7 @@ QueryPlanPtr MergeTreeDataSelectExecutor::spreadMarkRangesAmongStreams( auto source = std::make_shared( data, metadata_snapshot, part.data_part, max_block_size, settings.preferred_block_size_bytes, settings.preferred_max_column_in_block_size_bytes, column_names, part.ranges, use_uncompressed_cache, - query_info.prewhere_info_list, true, reader_settings, virt_columns, part.part_index_in_query); + query_info.prewhere_info, true, reader_settings, virt_columns, part.part_index_in_query); 
res.emplace_back(std::move(source)); } @@ -1223,7 +1217,7 @@ QueryPlanPtr MergeTreeDataSelectExecutor::spreadMarkRangesAmongStreamsWithOrder( column_names, ranges_to_get_from_part, use_uncompressed_cache, - query_info.prewhere_info_list, + query_info.prewhere_info, true, reader_settings, virt_columns, @@ -1241,7 +1235,7 @@ QueryPlanPtr MergeTreeDataSelectExecutor::spreadMarkRangesAmongStreamsWithOrder( column_names, ranges_to_get_from_part, use_uncompressed_cache, - query_info.prewhere_info_list, + query_info.prewhere_info, true, reader_settings, virt_columns, @@ -1395,7 +1389,7 @@ QueryPlanPtr MergeTreeDataSelectExecutor::spreadMarkRangesAmongStreamsFinal( column_names, part_it->ranges, use_uncompressed_cache, - query_info.prewhere_info_list, + query_info.prewhere_info, true, reader_settings, virt_columns, diff --git a/src/Storages/MergeTree/MergeTreeRangeReader.cpp b/src/Storages/MergeTree/MergeTreeRangeReader.cpp index 7c9b1b36b33..8d149d9473f 100644 --- a/src/Storages/MergeTree/MergeTreeRangeReader.cpp +++ b/src/Storages/MergeTree/MergeTreeRangeReader.cpp @@ -543,12 +543,12 @@ size_t MergeTreeRangeReader::ReadResult::countBytesInResultFilter(const IColumn: MergeTreeRangeReader::MergeTreeRangeReader( IMergeTreeReader * merge_tree_reader_, MergeTreeRangeReader * prev_reader_, - const PrewhereInfoListPtr & prewhere_info_list_, + const PrewhereInfoPtr & prewhere_info_, bool last_reader_in_chain_) : merge_tree_reader(merge_tree_reader_) , index_granularity(&(merge_tree_reader->data_part->index_granularity)) , prev_reader(prev_reader_) - , prewhere_info_list(prewhere_info_list_) + , prewhere_info(prewhere_info_) , last_reader_in_chain(last_reader_in_chain_) , is_initialized(true) { @@ -558,19 +558,25 @@ MergeTreeRangeReader::MergeTreeRangeReader( for (const auto & name_and_type : merge_tree_reader->getColumns()) sample_block.insert({name_and_type.type->createColumn(), name_and_type.type, name_and_type.name}); - if (prewhere_info_list) + if (prewhere_info) { - for (const auto & prewhere_info : *prewhere_info_list) + if (prewhere_info->filter_info) { - if (prewhere_info.alias_actions) - prewhere_info.alias_actions->execute(sample_block, true); + if (prewhere_info->filter_info->actions) + prewhere_info->filter_info->actions->execute(sample_block, true); - if (prewhere_info.prewhere_actions) - prewhere_info.prewhere_actions->execute(sample_block, true); - - if (prewhere_info.remove_prewhere_column) - sample_block.erase(prewhere_info.prewhere_column_name); + if (prewhere_info->filter_info->do_remove_column) + sample_block.erase(prewhere_info->filter_info->column_name); } + + if (prewhere_info->alias_actions) + prewhere_info->alias_actions->execute(sample_block, true); + + if (prewhere_info->prewhere_actions) + prewhere_info->prewhere_actions->execute(sample_block, true); + + if (prewhere_info->remove_prewhere_column) + sample_block.erase(prewhere_info->prewhere_column_name); } } @@ -860,7 +866,7 @@ Columns MergeTreeRangeReader::continueReadingChain(ReadResult & result, size_t & void MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(ReadResult & result) { - if (!prewhere_info_list || prewhere_info_list->empty()) + if (!prewhere_info) return; const auto & header = merge_tree_reader->getColumns(); @@ -890,29 +896,37 @@ void MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(ReadResult & r for (auto name_and_type = header.begin(); pos < num_columns; ++pos, ++name_and_type) block.insert({result.columns[pos], name_and_type->type, name_and_type->name}); - for (size_t i = 0; i < 
prewhere_info_list->size(); ++i) + if (prewhere_info->filter_info) { - const auto & prewhere_info = (*prewhere_info_list)[i]; + if (prewhere_info->filter_info->actions) + prewhere_info->filter_info->actions->execute(block); - if (prewhere_info.alias_actions) - prewhere_info.alias_actions->execute(block); + const auto filter_column_pos = block.getPositionByName(prewhere_info->filter_info->column_name); + result.addFilter(block.getByPosition(filter_column_pos).column); - /// Columns might be projected out. We need to store them here so that default columns can be evaluated later. - result.block_before_prewhere = block; - prewhere_info.prewhere_actions->execute(block); - - prewhere_column_pos = block.getPositionByName(prewhere_info.prewhere_column_name); - result.addFilter(block.getByPosition(prewhere_column_pos).column); - - if (i + 1 != prewhere_info_list->size() && prewhere_info.remove_prewhere_column) - block.erase(prewhere_column_pos); + if (prewhere_info->filter_info->do_remove_column) + block.erase(prewhere_info->filter_info->column_name); else - block.getByPosition(prewhere_column_pos).column = block.getByPosition(prewhere_column_pos).type->createColumnConst(result.num_rows, 1); + block.getByPosition(filter_column_pos).column = block.getByPosition(filter_column_pos).type->createColumnConst(result.num_rows, 1); } - block.getByPosition(prewhere_column_pos).column = nullptr; + if (prewhere_info->alias_actions) + prewhere_info->alias_actions->execute(block); + + /// Columns might be projected out. We need to store them here so that default columns can be evaluated later. + result.block_before_prewhere = block; + + if (prewhere_info->prewhere_actions) + prewhere_info->prewhere_actions->execute(block); + + prewhere_column_pos = block.getPositionByName(prewhere_info->prewhere_column_name); + result.addFilter(block.getByPosition(prewhere_column_pos).column); + + block.getByPosition(prewhere_column_pos).column.reset(); + result.columns.clear(); result.columns.reserve(block.columns()); + for (auto & col : block) result.columns.emplace_back(std::move(col.column)); } @@ -925,7 +939,7 @@ void MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(ReadResult & r if (result.totalRowsPerGranule() == 0) result.setFilterConstFalse(); /// If we need to filter in PREWHERE - else if (prewhere_info_list->back().need_filter || result.need_filter) + else if (prewhere_info->need_filter || result.need_filter) { /// If there is a filter and without optimized if (result.getFilter() && last_reader_in_chain) @@ -966,11 +980,11 @@ void MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(ReadResult & r /// Check if the PREWHERE column is needed if (!result.columns.empty()) { - if (prewhere_info_list->back().remove_prewhere_column) + if (prewhere_info->remove_prewhere_column) result.columns.erase(result.columns.begin() + prewhere_column_pos); else result.columns[prewhere_column_pos] = - getSampleBlock().getByName(prewhere_info_list->back().prewhere_column_name).type-> + getSampleBlock().getByName(prewhere_info->prewhere_column_name).type-> createColumnConst(result.num_rows, 1u)->convertToFullColumnIfConst(); } } @@ -978,7 +992,7 @@ void MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(ReadResult & r else { result.columns[prewhere_column_pos] = result.getFilterHolder()->convertToFullColumnIfConst(); - if (getSampleBlock().getByName(prewhere_info_list->back().prewhere_column_name).type->isNullable()) + if (getSampleBlock().getByName(prewhere_info->prewhere_column_name).type->isNullable()) 
result.columns[prewhere_column_pos] = makeNullable(std::move(result.columns[prewhere_column_pos])); result.clearFilter(); // Acting as a flag to not filter in PREWHERE } diff --git a/src/Storages/MergeTree/MergeTreeRangeReader.h b/src/Storages/MergeTree/MergeTreeRangeReader.h index 6ee7c9f3e29..884d2dbafd1 100644 --- a/src/Storages/MergeTree/MergeTreeRangeReader.h +++ b/src/Storages/MergeTree/MergeTreeRangeReader.h @@ -13,8 +13,7 @@ using ColumnUInt8 = ColumnVector; class IMergeTreeReader; class MergeTreeIndexGranularity; struct PrewhereInfo; -using PrewhereInfoList = std::vector; -using PrewhereInfoListPtr = std::shared_ptr; +using PrewhereInfoPtr = std::shared_ptr; /// MergeTreeReader iterator which allows sequential reading for arbitrary number of rows between pairs of marks in the same part. /// Stores reading state, which can be inside granule. Can skip rows in current granule and start reading from next mark. @@ -25,7 +24,7 @@ public: MergeTreeRangeReader( IMergeTreeReader * merge_tree_reader_, MergeTreeRangeReader * prev_reader_, - const PrewhereInfoListPtr & prewhere_info_list, + const PrewhereInfoPtr & prewhere_info_, bool last_reader_in_chain_); MergeTreeRangeReader() = default; @@ -218,7 +217,7 @@ private: IMergeTreeReader * merge_tree_reader = nullptr; const MergeTreeIndexGranularity * index_granularity = nullptr; MergeTreeRangeReader * prev_reader = nullptr; /// If not nullptr, read from prev_reader firstly. - PrewhereInfoListPtr prewhere_info_list; + PrewhereInfoPtr prewhere_info; Stream stream; diff --git a/src/Storages/MergeTree/MergeTreeReadPool.cpp b/src/Storages/MergeTree/MergeTreeReadPool.cpp index a3a580fa7f2..d9a250e3f7a 100644 --- a/src/Storages/MergeTree/MergeTreeReadPool.cpp +++ b/src/Storages/MergeTree/MergeTreeReadPool.cpp @@ -24,7 +24,7 @@ MergeTreeReadPool::MergeTreeReadPool( RangesInDataParts && parts_, const MergeTreeData & data_, const StorageMetadataPtr & metadata_snapshot_, - const PrewhereInfoListPtr & prewhere_info_list_, + const PrewhereInfoPtr & prewhere_info_, const bool check_columns_, const Names & column_names_, const BackoffSettings & backoff_settings_, @@ -37,7 +37,7 @@ MergeTreeReadPool::MergeTreeReadPool( , column_names{column_names_} , do_not_steal_tasks{do_not_steal_tasks_} , predict_block_size_bytes{preferred_block_size_bytes_ > 0} - , prewhere_info_list{prewhere_info_list_} + , prewhere_info{prewhere_info_} , parts_ranges{std::move(parts_)} { /// parts don't contain duplicate MergeTreeDataPart's. 
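The reader-side hunks above (MergeTreeBaseSelectProcessor, MergeTreeRangeReader, MergeTreeReadPool) all drop the loop over a `PrewhereInfoList` and instead handle one `PrewhereInfo` whose optional `filter_info` — the row-level security filter — is executed before the alias and PREWHERE actions. Below is a minimal, self-contained sketch of that fixed order; the types are simplified stand-ins rather than the real ClickHouse classes, and the callable `Actions` alias is an assumption made only for illustration.

```cpp
// Sketch of the single-PrewhereInfo application order introduced by this patch.
// All types here are simplified stand-ins, not the real ClickHouse classes.
#include <functional>
#include <memory>
#include <vector>

struct Block { std::vector<int> rows; };        // stand-in for DB::Block
using Actions = std::function<void(Block &)>;   // stand-in for ExpressionActions::execute

struct FilterStep                                // stand-in for FilterInfo (row-level security)
{
    Actions actions;                             // produces the filter column
    bool do_remove_column = false;
};

struct PrewhereStep                              // stand-in for PrewhereInfo
{
    std::shared_ptr<FilterStep> filter_info;     // optional row-level filter, runs first
    Actions alias_actions;                       // optional alias columns
    Actions prewhere_actions;                    // the PREWHERE condition itself
};

// Mirrors the order used after this patch: row-level filter, then aliases, then PREWHERE.
void applyPrewhere(Block & block, const PrewhereStep & prewhere)
{
    if (prewhere.filter_info && prewhere.filter_info->actions)
        prewhere.filter_info->actions(block);

    if (prewhere.alias_actions)
        prewhere.alias_actions(block);

    if (prewhere.prewhere_actions)
        prewhere.prewhere_actions(block);
}

int main()
{
    Block block{{1, 2, 3}};
    PrewhereStep step;
    step.prewhere_actions = [](Block & b) { (void)b; /* filter rows here */ };
    applyPrewhere(block, step);
}
```

The practical effect of keeping both pieces in one struct is that the row-level filter and the PREWHERE condition travel together, so every consumer applies them in the same fixed order instead of iterating a list of independent steps.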
@@ -139,7 +139,7 @@ MergeTreeReadTaskPtr MergeTreeReadPool::getTask(const size_t min_marks_to_read, return std::make_unique( part.data_part, ranges_to_get_from_part, part.part_index_in_query, ordered_names, per_part_column_name_set[part_idx], per_part_columns[part_idx], per_part_pre_columns[part_idx], - per_part_should_reorder[part_idx], std::move(curr_task_size_predictor)); + prewhere_info && prewhere_info->remove_prewhere_column, per_part_should_reorder[part_idx], std::move(curr_task_size_predictor)); } MarkRanges MergeTreeReadPool::getRestMarks(const IMergeTreeDataPart & part, const MarkRange & from) const @@ -229,7 +229,7 @@ std::vector MergeTreeReadPool::fillPerPartInfo( per_part_sum_marks.push_back(sum_marks); auto [required_columns, required_pre_columns, should_reorder] = - getReadTaskColumns(data, metadata_snapshot, part.data_part, column_names, prewhere_info_list, check_columns); + getReadTaskColumns(data, metadata_snapshot, part.data_part, column_names, prewhere_info, check_columns); /// will be used to distinguish between PREWHERE and WHERE columns when applying filter const auto & required_column_names = required_columns.getNames(); diff --git a/src/Storages/MergeTree/MergeTreeReadPool.h b/src/Storages/MergeTree/MergeTreeReadPool.h index ec9523ccbe3..aa6811661e6 100644 --- a/src/Storages/MergeTree/MergeTreeReadPool.h +++ b/src/Storages/MergeTree/MergeTreeReadPool.h @@ -71,9 +71,10 @@ private: public: MergeTreeReadPool( const size_t threads_, const size_t sum_marks_, const size_t min_marks_for_concurrent_read_, - RangesInDataParts && parts_, const MergeTreeData & data_, const StorageMetadataPtr & metadata_snapshot_, - const PrewhereInfoListPtr & prewhere_info_list, const bool check_columns_, const Names & column_names_, - const BackoffSettings & backoff_settings_, size_t preferred_block_size_bytes_, const bool do_not_steal_tasks_ = false); + RangesInDataParts && parts_, const MergeTreeData & data_, const StorageMetadataPtr & metadata_snapshot_, const PrewhereInfoPtr & prewhere_info_, + const bool check_columns_, const Names & column_names_, + const BackoffSettings & backoff_settings_, size_t preferred_block_size_bytes_, + const bool do_not_steal_tasks_ = false); MergeTreeReadTaskPtr getTask(const size_t min_marks_to_read, const size_t thread, const Names & ordered_names); @@ -106,7 +107,7 @@ private: std::vector per_part_pre_columns; std::vector per_part_should_reorder; std::vector per_part_size_predictor; - PrewhereInfoListPtr prewhere_info_list; + PrewhereInfoPtr prewhere_info; struct Part { diff --git a/src/Storages/MergeTree/MergeTreeReverseSelectProcessor.cpp b/src/Storages/MergeTree/MergeTreeReverseSelectProcessor.cpp index 35df1106339..ee0a77ba3cf 100644 --- a/src/Storages/MergeTree/MergeTreeReverseSelectProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeReverseSelectProcessor.cpp @@ -22,7 +22,7 @@ MergeTreeReverseSelectProcessor::MergeTreeReverseSelectProcessor( Names required_columns_, MarkRanges mark_ranges_, bool use_uncompressed_cache_, - const PrewhereInfoListPtr & prewhere_info_list_, + const PrewhereInfoPtr & prewhere_info_, bool check_columns, const MergeTreeReaderSettings & reader_settings_, const Names & virt_column_names_, @@ -31,7 +31,7 @@ MergeTreeReverseSelectProcessor::MergeTreeReverseSelectProcessor( : MergeTreeBaseSelectProcessor{ metadata_snapshot_->getSampleBlockForColumns(required_columns_, storage_.getVirtuals(), storage_.getStorageID()), - storage_, metadata_snapshot_, prewhere_info_list_, max_block_size_rows_, + storage_, metadata_snapshot_, 
prewhere_info_, max_block_size_rows_, preferred_block_size_bytes_, preferred_max_column_in_block_size_bytes_, reader_settings_, use_uncompressed_cache_, virt_column_names_}, required_columns{std::move(required_columns_)}, @@ -56,7 +56,7 @@ MergeTreeReverseSelectProcessor::MergeTreeReverseSelectProcessor( ordered_names = header_without_virtual_columns.getNames(); - task_columns = getReadTaskColumns(storage, metadata_snapshot, data_part, required_columns, prewhere_info_list, check_columns); + task_columns = getReadTaskColumns(storage, metadata_snapshot, data_part, required_columns, prewhere_info, check_columns); /// will be used to distinguish between PREWHERE and WHERE columns when applying filter const auto & column_names = task_columns.columns.getNames(); @@ -71,7 +71,7 @@ MergeTreeReverseSelectProcessor::MergeTreeReverseSelectProcessor( all_mark_ranges, owned_uncompressed_cache.get(), owned_mark_cache.get(), reader_settings); - if (prewhere_info_list) + if (prewhere_info) pre_reader = data_part->getReader(task_columns.pre_columns, metadata_snapshot, all_mark_ranges, owned_uncompressed_cache.get(), owned_mark_cache.get(), reader_settings); } @@ -100,7 +100,8 @@ try task = std::make_unique( data_part, mark_ranges_for_task, part_index_in_query, ordered_names, column_name_set, - task_columns.columns, task_columns.pre_columns, task_columns.should_reorder, std::move(size_predictor)); + task_columns.columns, task_columns.pre_columns, prewhere_info && prewhere_info->remove_prewhere_column, + task_columns.should_reorder, std::move(size_predictor)); return true; } diff --git a/src/Storages/MergeTree/MergeTreeReverseSelectProcessor.h b/src/Storages/MergeTree/MergeTreeReverseSelectProcessor.h index b6da7166457..c9fd06c5534 100644 --- a/src/Storages/MergeTree/MergeTreeReverseSelectProcessor.h +++ b/src/Storages/MergeTree/MergeTreeReverseSelectProcessor.h @@ -26,7 +26,7 @@ public: Names required_columns_, MarkRanges mark_ranges, bool use_uncompressed_cache, - const PrewhereInfoListPtr & prewhere_info_list, + const PrewhereInfoPtr & prewhere_info, bool check_columns, const MergeTreeReaderSettings & reader_settings, const Names & virt_column_names = {}, diff --git a/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp b/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp index cdb97f47a47..65f9b1eba3b 100644 --- a/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeSelectProcessor.cpp @@ -22,7 +22,7 @@ MergeTreeSelectProcessor::MergeTreeSelectProcessor( Names required_columns_, MarkRanges mark_ranges_, bool use_uncompressed_cache_, - const PrewhereInfoListPtr & prewhere_info_list_, + const PrewhereInfoPtr & prewhere_info_, bool check_columns_, const MergeTreeReaderSettings & reader_settings_, const Names & virt_column_names_, @@ -31,7 +31,7 @@ MergeTreeSelectProcessor::MergeTreeSelectProcessor( : MergeTreeBaseSelectProcessor{ metadata_snapshot_->getSampleBlockForColumns(required_columns_, storage_.getVirtuals(), storage_.getStorageID()), - storage_, metadata_snapshot_, prewhere_info_list_, max_block_size_rows_, + storage_, metadata_snapshot_, prewhere_info_, max_block_size_rows_, preferred_block_size_bytes_, preferred_max_column_in_block_size_bytes_, reader_settings_, use_uncompressed_cache_, virt_column_names_}, required_columns{std::move(required_columns_)}, @@ -69,7 +69,7 @@ try task_columns = getReadTaskColumns( storage, metadata_snapshot, data_part, - required_columns, prewhere_info_list, check_columns); + required_columns, prewhere_info, check_columns); auto 
size_predictor = (preferred_block_size_bytes == 0) ? nullptr @@ -81,7 +81,8 @@ try task = std::make_unique( data_part, all_mark_ranges, part_index_in_query, ordered_names, column_name_set, task_columns.columns, - task_columns.pre_columns, task_columns.should_reorder, std::move(size_predictor)); + task_columns.pre_columns, prewhere_info && prewhere_info->remove_prewhere_column, + task_columns.should_reorder, std::move(size_predictor)); if (!reader) { @@ -93,7 +94,7 @@ try reader = data_part->getReader(task_columns.columns, metadata_snapshot, all_mark_ranges, owned_uncompressed_cache.get(), owned_mark_cache.get(), reader_settings); - if (prewhere_info_list) + if (prewhere_info) pre_reader = data_part->getReader(task_columns.pre_columns, metadata_snapshot, all_mark_ranges, owned_uncompressed_cache.get(), owned_mark_cache.get(), reader_settings); } diff --git a/src/Storages/MergeTree/MergeTreeSelectProcessor.h b/src/Storages/MergeTree/MergeTreeSelectProcessor.h index 521bbbfdba4..925c437f1ce 100644 --- a/src/Storages/MergeTree/MergeTreeSelectProcessor.h +++ b/src/Storages/MergeTree/MergeTreeSelectProcessor.h @@ -26,7 +26,7 @@ public: Names required_columns_, MarkRanges mark_ranges, bool use_uncompressed_cache, - const PrewhereInfoListPtr & prewhere_info_list, + const PrewhereInfoPtr & prewhere_info, bool check_columns, const MergeTreeReaderSettings & reader_settings, const Names & virt_column_names = {}, diff --git a/src/Storages/MergeTree/MergeTreeThreadSelectBlockInputProcessor.cpp b/src/Storages/MergeTree/MergeTreeThreadSelectBlockInputProcessor.cpp index eb1a80acb49..f57247e39ab 100644 --- a/src/Storages/MergeTree/MergeTreeThreadSelectBlockInputProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeThreadSelectBlockInputProcessor.cpp @@ -18,12 +18,12 @@ MergeTreeThreadSelectBlockInputProcessor::MergeTreeThreadSelectBlockInputProcess const MergeTreeData & storage_, const StorageMetadataPtr & metadata_snapshot_, const bool use_uncompressed_cache_, - const PrewhereInfoListPtr & prewhere_info_list_, + const PrewhereInfoPtr & prewhere_info_, const MergeTreeReaderSettings & reader_settings_, const Names & virt_column_names_) : MergeTreeBaseSelectProcessor{ - pool_->getHeader(), storage_, metadata_snapshot_, prewhere_info_list_, + pool_->getHeader(), storage_, metadata_snapshot_, prewhere_info_, max_block_size_rows_, preferred_block_size_bytes_, preferred_max_column_in_block_size_bytes_, reader_settings_, use_uncompressed_cache_, virt_column_names_}, @@ -78,7 +78,7 @@ bool MergeTreeThreadSelectBlockInputProcessor::getNewTask() owned_uncompressed_cache.get(), owned_mark_cache.get(), reader_settings, IMergeTreeReader::ValueSizeMap{}, profile_callback); - if (prewhere_info_list) + if (prewhere_info) pre_reader = task->data_part->getReader(task->pre_columns, metadata_snapshot, rest_mark_ranges, owned_uncompressed_cache.get(), owned_mark_cache.get(), reader_settings, IMergeTreeReader::ValueSizeMap{}, profile_callback); @@ -94,7 +94,7 @@ bool MergeTreeThreadSelectBlockInputProcessor::getNewTask() owned_uncompressed_cache.get(), owned_mark_cache.get(), reader_settings, reader->getAvgValueSizeHints(), profile_callback); - if (prewhere_info_list) + if (prewhere_info) pre_reader = task->data_part->getReader(task->pre_columns, metadata_snapshot, rest_mark_ranges, owned_uncompressed_cache.get(), owned_mark_cache.get(), reader_settings, reader->getAvgValueSizeHints(), profile_callback); diff --git a/src/Storages/MergeTree/MergeTreeThreadSelectBlockInputProcessor.h 
b/src/Storages/MergeTree/MergeTreeThreadSelectBlockInputProcessor.h index dd3ba8c973c..2b2ed36fc18 100644 --- a/src/Storages/MergeTree/MergeTreeThreadSelectBlockInputProcessor.h +++ b/src/Storages/MergeTree/MergeTreeThreadSelectBlockInputProcessor.h @@ -24,7 +24,7 @@ public: const MergeTreeData & storage_, const StorageMetadataPtr & metadata_snapshot_, const bool use_uncompressed_cache_, - const PrewhereInfoListPtr & prewhere_info_list_, + const PrewhereInfoPtr & prewhere_info_, const MergeTreeReaderSettings & reader_settings_, const Names & virt_column_names_); diff --git a/src/Storages/SelectQueryInfo.h b/src/Storages/SelectQueryInfo.h index 68f2f8f1361..325f54435ed 100644 --- a/src/Storages/SelectQueryInfo.h +++ b/src/Storages/SelectQueryInfo.h @@ -15,8 +15,34 @@ using ExpressionActionsPtr = std::shared_ptr; class ActionsDAG; using ActionsDAGPtr = std::shared_ptr; +struct PrewhereInfo; +using PrewhereInfoPtr = std::shared_ptr; + +struct PrewhereDAGInfo; +using PrewhereDAGInfoPtr = std::shared_ptr; + +struct FilterInfo; +using FilterInfoPtr = std::shared_ptr; + +struct FilterDAGInfo; +using FilterDAGInfoPtr = std::shared_ptr; + +struct InputOrderInfo; +using InputOrderInfoPtr = std::shared_ptr; + +struct TreeRewriterResult; +using TreeRewriterResultPtr = std::shared_ptr; + +class ReadInOrderOptimizer; +using ReadInOrderOptimizerPtr = std::shared_ptr; + +class Cluster; +using ClusterPtr = std::shared_ptr; + struct PrewhereInfo { + /// Information about the preliminary filter expression, if any. + FilterInfoPtr filter_info; /// Actions which are executed in order to alias columns are used for prewhere actions. ExpressionActionsPtr alias_actions; /// Actions which are executed on block in order to get filter column for prewhere step. @@ -26,15 +52,9 @@ struct PrewhereInfo String prewhere_column_name; bool remove_prewhere_column = false; bool need_filter = false; - - PrewhereInfo() = default; - explicit PrewhereInfo(ExpressionActionsPtr prewhere_actions_, String prewhere_column_name_) - : prewhere_actions(std::move(prewhere_actions_)), prewhere_column_name(std::move(prewhere_column_name_)) {} }; -using PrewhereInfoList = std::vector; - -/// Same as PrewhereInfo, but with ActionsDAG +/// Same as PrewhereInfo, but with ActionsDAG. struct PrewhereDAGInfo { ActionsDAGPtr alias_actions; @@ -54,7 +74,15 @@ struct PrewhereDAGInfo /// Helper struct to store all the information about the filter expression. struct FilterInfo { - ActionsDAGPtr actions_dag; + ExpressionActionsPtr actions; + String column_name; + bool do_remove_column = false; +}; + +/// Same as FilterInfo, but with ActionsDAG. +struct FilterDAGInfo +{ + ActionsDAGPtr actions; String column_name; bool do_remove_column = false; @@ -77,20 +105,6 @@ struct InputOrderInfo bool operator !=(const InputOrderInfo & other) const { return !(*this == other); } }; -using PrewhereInfoListPtr = std::shared_ptr; -using PrewhereDAGInfoPtr = std::shared_ptr; -using FilterInfoPtr = std::shared_ptr; -using InputOrderInfoPtr = std::shared_ptr; - -struct TreeRewriterResult; -using TreeRewriterResultPtr = std::shared_ptr; - -class ReadInOrderOptimizer; -using ReadInOrderOptimizerPtr = std::shared_ptr; - -class Cluster; -using ClusterPtr = std::shared_ptr; - /** Query along with some additional data, * that can be used during query processing * inside storage engines. 
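The `SelectQueryInfo.h` hunk above splits each filter description into a DAG form used during analysis (`FilterDAGInfo`, `PrewhereDAGInfo`) and an executable form used while reading (`FilterInfo`, `PrewhereInfo`), with `PrewhereInfo` now embedding the optional row-level filter. The sketch below shows, with simplified stand-in types rather than the real classes, how the DAG form might be lowered into the executable form when `query_info.prewhere_info` is filled, mirroring the `executeFetchColumns` hunk earlier in this patch.

```cpp
// Sketch of lowering the analyzer-side FilterDAGInfo into the reader-side FilterInfo.
// Simplified stand-in types; not the real ClickHouse classes.
#include <memory>
#include <string>

struct ActionsDAG {};                                   // stand-in for DB::ActionsDAG

struct ExpressionActions                                // stand-in: built from a DAG
{
    explicit ExpressionActions(std::shared_ptr<ActionsDAG>) {}
};

struct FilterDAGInfo                                    // what query analysis produces
{
    std::shared_ptr<ActionsDAG> actions;
    std::string column_name;
    bool do_remove_column = false;
};

struct FilterInfo                                       // what the reading code executes
{
    std::shared_ptr<ExpressionActions> actions;
    std::string column_name;
    bool do_remove_column = false;
};

// Wrap the DAG into executable actions and copy the column bookkeeping as-is.
std::shared_ptr<FilterInfo> lowerFilter(const FilterDAGInfo & dag_info)
{
    auto result = std::make_shared<FilterInfo>();
    if (dag_info.actions)
        result->actions = std::make_shared<ExpressionActions>(dag_info.actions);
    result->column_name = dag_info.column_name;
    result->do_remove_column = dag_info.do_remove_column;
    return result;
}

int main()
{
    FilterDAGInfo dag_info{std::make_shared<ActionsDAG>(), "_row_policy", true};
    auto executable = lowerFilter(dag_info);
    (void)executable;
}
```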
@@ -106,7 +120,7 @@ struct SelectQueryInfo TreeRewriterResultPtr syntax_analyzer_result; - PrewhereInfoListPtr prewhere_info_list; + PrewhereInfoPtr prewhere_info; ReadInOrderOptimizerPtr order_optimizer; /// Can be modified while reading from storage diff --git a/src/Storages/StorageBuffer.cpp b/src/Storages/StorageBuffer.cpp index 53fee054f4b..6a2acb74192 100644 --- a/src/Storages/StorageBuffer.cpp +++ b/src/Storages/StorageBuffer.cpp @@ -314,27 +314,38 @@ void StorageBuffer::read( } else { - if (query_info.prewhere_info_list) + if (query_info.prewhere_info) { - for (const auto & prewhere_info : *query_info.prewhere_info_list) + if (query_info.prewhere_info->filter_info) { pipe_from_buffers.addSimpleTransform([&](const Block & header) { return std::make_shared( - header, prewhere_info.prewhere_actions, - prewhere_info.prewhere_column_name, - prewhere_info.remove_prewhere_column); + header, + query_info.prewhere_info->filter_info->actions, + query_info.prewhere_info->filter_info->column_name, + query_info.prewhere_info->filter_info->do_remove_column); }); - - if (prewhere_info.alias_actions) - { - pipe_from_buffers.addSimpleTransform([&](const Block & header) - { - return std::make_shared( - header, prewhere_info.alias_actions); - }); - } } + + if (query_info.prewhere_info->alias_actions) + { + pipe_from_buffers.addSimpleTransform([&](const Block & header) + { + return std::make_shared( + header, + query_info.prewhere_info->alias_actions); + }); + } + + pipe_from_buffers.addSimpleTransform([&](const Block & header) + { + return std::make_shared( + header, + query_info.prewhere_info->prewhere_actions, + query_info.prewhere_info->prewhere_column_name, + query_info.prewhere_info->remove_prewhere_column); + }); } auto read_from_buffers = std::make_unique(std::move(pipe_from_buffers)); From b13d1f31422fe52f944ca95fe11276791434815d Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 14 Feb 2021 04:34:42 +0300 Subject: [PATCH 0276/2357] Fix integration test --- tests/integration/test_settings_profile/test.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/integration/test_settings_profile/test.py b/tests/integration/test_settings_profile/test.py index 3ceef9f25cf..1945875bf53 100644 --- a/tests/integration/test_settings_profile/test.py +++ b/tests/integration/test_settings_profile/test.py @@ -46,7 +46,7 @@ def reset_after_test(): def test_smoke(): - # Set settings and constraints via CREATE SETTINGS PROFILE ... TO user + # Set settings and constraints via CREATE SETTINGS PROFILE ... 
TO user instance.query( "CREATE SETTINGS PROFILE xyz SETTINGS max_memory_usage = 100000001 MIN 90000000 MAX 110000000 TO robin") assert instance.query( @@ -194,13 +194,13 @@ def test_show_profiles(): assert instance.query("SHOW CREATE PROFILE xyz") == "CREATE SETTINGS PROFILE xyz\n" assert instance.query( - "SHOW CREATE SETTINGS PROFILE default") == "CREATE SETTINGS PROFILE default SETTINGS max_memory_usage = 10000000000, use_uncompressed_cache = 0, load_balancing = \\'random\\'\n" + "SHOW CREATE SETTINGS PROFILE default") == "CREATE SETTINGS PROFILE default SETTINGS max_memory_usage = 10000000000, load_balancing = \\'random\\'\n" assert instance.query( - "SHOW CREATE PROFILES") == "CREATE SETTINGS PROFILE default SETTINGS max_memory_usage = 10000000000, use_uncompressed_cache = 0, load_balancing = \\'random\\'\n" \ + "SHOW CREATE PROFILES") == "CREATE SETTINGS PROFILE default SETTINGS max_memory_usage = 10000000000, load_balancing = \\'random\\'\n" \ "CREATE SETTINGS PROFILE readonly SETTINGS readonly = 1\n" \ "CREATE SETTINGS PROFILE xyz\n" - expected_access = "CREATE SETTINGS PROFILE default SETTINGS max_memory_usage = 10000000000, use_uncompressed_cache = 0, load_balancing = \\'random\\'\n" \ + expected_access = "CREATE SETTINGS PROFILE default SETTINGS max_memory_usage = 10000000000, load_balancing = \\'random\\'\n" \ "CREATE SETTINGS PROFILE readonly SETTINGS readonly = 1\n" \ "CREATE SETTINGS PROFILE xyz\n" assert expected_access in instance.query("SHOW ACCESS") @@ -210,7 +210,7 @@ def test_allow_ddl(): assert "it's necessary to have grant" in instance.query_and_get_error("CREATE TABLE tbl(a Int32) ENGINE=Log", user="robin") assert "it's necessary to have grant" in instance.query_and_get_error("GRANT CREATE ON tbl TO robin", user="robin") assert "DDL queries are prohibited" in instance.query_and_get_error("CREATE TABLE tbl(a Int32) ENGINE=Log", settings={"allow_ddl": 0}) - + instance.query("GRANT CREATE ON tbl TO robin") instance.query("CREATE TABLE tbl(a Int32) ENGINE=Log", user="robin") instance.query("DROP TABLE tbl") From 0d9578efc98af871604b11ee79abf0938c339a58 Mon Sep 17 00:00:00 2001 From: Dmitriy Date: Sun, 14 Feb 2021 14:14:39 +0300 Subject: [PATCH 0277/2357] Edit and translate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Поправил английскую версию и выполнил перевод. --- docs/en/operations/caches.md | 15 ++-- .../system-tables/distributed_ddl_queue.md | 2 +- docs/en/sql-reference/table-functions/file.md | 20 ++--- .../sql-reference/table-functions/remote.md | 26 +++--- docs/en/sql-reference/table-functions/url.md | 14 +-- docs/ru/operations/caches.md | 29 ++++++ .../system-tables/distributed_ddl_queue.md | 2 +- docs/ru/sql-reference/table-functions/file.md | 89 +++++++++++-------- .../sql-reference/table-functions/remote.md | 78 ++++++++++------ docs/ru/sql-reference/table-functions/url.md | 43 ++++++--- 10 files changed, 206 insertions(+), 112 deletions(-) create mode 100644 docs/ru/operations/caches.md diff --git a/docs/en/operations/caches.md b/docs/en/operations/caches.md index 7b096b76f75..ec7e4239a9d 100644 --- a/docs/en/operations/caches.md +++ b/docs/en/operations/caches.md @@ -8,18 +8,21 @@ toc_title: Caches When performing queries, ClichHouse uses different caches. Main cache types: + - `mark_cache` — Cache of marks used by table engines of the [MergeTree](../engines/table-engines/mergetree-family/mergetree.md) family. 
- `uncompressed_cache` — Cache of uncompressed data used by table engines of the [MergeTree](../engines/table-engines/mergetree-family/mergetree.md) family. Additional cache types: -- DNS cache -- [regexp](../interfaces/formats.md#data-format-regexp) cache -- compiled expressions cache -- [Avro format](../interfaces/formats.md#data-format-avro) schemas cache -- [dictionaries data cache](../sql-reference/dictionaries/index.md) + +- DNS cache. +- [Regexp](../interfaces/formats.md#data-format-regexp) cache. +- Compiled expressions cache. +- [Avro format](../interfaces/formats.md#data-format-avro) schemas cache. +- [Dictionaries](../sql-reference/dictionaries/index.md) data cache. Indirectly used: -- OS page cache + +- OS page cache. To drop cache, use [SYSTEM DROP ... CACHE](../sql-reference/statements/system.md) statements. diff --git a/docs/en/operations/system-tables/distributed_ddl_queue.md b/docs/en/operations/system-tables/distributed_ddl_queue.md index c252458af8a..fa871d215b5 100644 --- a/docs/en/operations/system-tables/distributed_ddl_queue.md +++ b/docs/en/operations/system-tables/distributed_ddl_queue.md @@ -14,7 +14,7 @@ Columns: - `initiator` ([String](../../sql-reference/data-types/string.md)) — Node that executed the query. - `query_start_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Query start time. - `query_finish_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Query finish time. -- `query_duration_ms` ([UInt64](../../sql-reference/data-types/datetime64.md)) — Duration of query execution (in milliseconds). +- `query_duration_ms` ([UInt64](../../sql-reference/data-types/int-uint.md)) — Duration of query execution (in milliseconds). - `exception_code` ([Enum8](../../sql-reference/data-types/enum.md)) — Exception code from [ZooKeeper](../../operations/tips.md#zookeeper). **Example** diff --git a/docs/en/sql-reference/table-functions/file.md b/docs/en/sql-reference/table-functions/file.md index d1eb81e52c6..e4ea59aface 100644 --- a/docs/en/sql-reference/table-functions/file.md +++ b/docs/en/sql-reference/table-functions/file.md @@ -5,7 +5,7 @@ toc_title: file # file {#file} -Creates a table from a file. This table function is similar to [url](../../sql-reference/table-functions/url.md) and [hdfs](../../sql-reference/table-functions/hdfs.md) ones. +Creates a table from a file. This table function is similar to [url](../../sql-reference/table-functions/url.md) and [hdfs](../../sql-reference/table-functions/hdfs.md) ones. `file` function can be used in `SELECT` and `INSERT` queries on data in [File](../../engines/table-engines/special/file.md) tables. @@ -15,9 +15,9 @@ Creates a table from a file. This table function is similar to [url](../../sql-r file(path, format, structure) ``` -**Input parameters** +**Parameters** -- `path` — The relative path to the file from [user_files_path](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-user_files_path). Path to file support following globs in readonly mode: `*`, `?`, `{abc,def}` and `{N..M}` where `N`, `M` — numbers, `'abc', 'def'` — strings. +- `path` — The relative path to the file from [user_files_path](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-user_files_path). Path to file support following globs in read-only mode: `*`, `?`, `{abc,def}` and `{N..M}` where `N`, `M` — numbers, `'abc', 'def'` — strings. - `format` — The [format](../../interfaces/formats.md#formats) of the file. 
- `structure` — Structure of the table. Format: `'column1_name column1_type, column2_name column2_type, ...'`. @@ -39,7 +39,7 @@ $ cat /var/lib/clickhouse/user_files/test.csv 78,43,45 ``` -Getting data from a table in `test.csv` and selecting first two rows from it: +Getting data from a table in `test.csv` and selecting the first two rows from it: ``` sql SELECT * FROM file('test.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 UInt32') LIMIT 2; @@ -51,7 +51,8 @@ SELECT * FROM file('test.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 U │ 3 │ 2 │ 1 │ └─────────┴─────────┴─────────┘ ``` -Getting the first 10 lines of a table that contains 3 columns of UInt32 type from a CSV file: + +Getting the first 10 lines of a table that contains 3 columns of [UInt32](../../sql-reference/data-types/int-uint.md) type from a CSV file: ``` sql SELECT * FROM file('test.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 UInt32') LIMIT 10; @@ -71,7 +72,6 @@ SELECT * FROM file('test.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 U └─────────┴─────────┴─────────┘ ``` - ## Globs in Path {#globs-in-path} Multiple path components can have globs. For being processed file should exists and matches to the whole path pattern (not only suffix or prefix). @@ -81,7 +81,7 @@ Multiple path components can have globs. For being processed file should exists - `{some_string,another_string,yet_another_one}` — Substitutes any of strings `'some_string', 'another_string', 'yet_another_one'`. - `{N..M}` — Substitutes any number in range from N to M including both borders. -Constructions with `{}` are similar to the [remote table function](../../sql-reference/table-functions/remote.md)). +Constructions with `{}` are similar to the [remote](remote.md) table function. **Example** @@ -94,13 +94,13 @@ Suppose we have several files with the following relative paths: - 'another_dir/some_file_2' - 'another_dir/some_file_3' -Query the amount of rows in these files: +Query the number of rows in these files: ``` sql SELECT count(*) FROM file('{some,another}_dir/some_file_{1..3}', 'TSV', 'name String, value UInt32'); ``` -Query the amount of rows in all files of these two directories: +Query the number of rows in all files of these two directories: ``` sql SELECT count(*) FROM file('{some,another}_dir/*', 'TSV', 'name String, value UInt32'); @@ -124,6 +124,6 @@ SELECT count(*) FROM file('big_dir/file{0..9}{0..9}{0..9}', 'CSV', 'name String, **See Also** -- [Virtual columns](https://clickhouse.tech/docs/en/operations/table_engines/#table_engines-virtual_columns) +- [Virtual columns](index.md#table_engines-virtual_columns) [Original article](https://clickhouse.tech/docs/en/sql-reference/table-functions/file/) diff --git a/docs/en/sql-reference/table-functions/remote.md b/docs/en/sql-reference/table-functions/remote.md index 8af5b588412..e80e58a76aa 100644 --- a/docs/en/sql-reference/table-functions/remote.md +++ b/docs/en/sql-reference/table-functions/remote.md @@ -5,7 +5,7 @@ toc_title: remote # remote, remoteSecure {#remote-remotesecure} -Allows to access remote servers without creating a [Distributed](../../engines/table-engines/special/distributed.md) table. `remoteSecure` - same as `remote` but with secured connection. +Allows to access remote servers without creating a [Distributed](../../engines/table-engines/special/distributed.md) table. `remoteSecure` - same as `remote` but with a secured connection. Both functions can be used in `SELECT` and `INSERT` queries. 
@@ -18,31 +18,31 @@ remoteSecure('addresses_expr', db, table[, 'user'[, 'password'], sharding_key]) remoteSecure('addresses_expr', db.table[, 'user'[, 'password'], sharding_key]) ``` -**Input parameters** +**Parameters** -- `addresses_expr` – An expression that generates addresses of remote servers. This may be just one server address. The server address is `host:port`, or just `host`. +- `addresses_expr` — An expression that generates addresses of remote servers. This may be just one server address. The server address is `host:port`, or just `host`. The host can be specified as the server name, or as the IPv4 or IPv6 address. An IPv6 address is specified in square brackets. - The port is the TCP port on the remote server. If the port is omitted, it uses [tcp_port](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-tcp_port) from the server’s config file in `remote` (by default, 9000) and [tcp_port_secure](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-tcp_port_secure) in `remoteSecure` (by default, 9440). + The port is the TCP port on the remote server. If the port is omitted, it uses [tcp_port](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-tcp_port) from the server’s config file in `remote` (by default, 9000) and [tcp_port_secure](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-tcp_port_secure) in `remoteSecure` (by default, 9440). The port is required for an IPv6 address. Type: [String](../../sql-reference/data-types/string.md). -- `db` - Database name. Type: [String](../../sql-reference/data-types/string.md). -- `table` - Table name. Type: [String](../../sql-reference/data-types/string.md). -- `user` - User name. If the user is not specified, `default` is used. Type: [String](../../sql-reference/data-types/string.md). -- `password` - User password. If the password is not specified, an empty password is used. Type: [String](../../sql-reference/data-types/string.md). -- `sharding_key` - Sharding key to support distributing data across nodes. For example: `insert into remote('127.0.0.1:9000,127.0.0.2', db, table, 'default', rand())`. Type: [UInt32](../../sql-reference/data-types/int-uint.md). +- `db` — Database name. Type: [String](../../sql-reference/data-types/string.md). +- `table` — Table name. Type: [String](../../sql-reference/data-types/string.md). +- `user` — User name. If the user is not specified, `default` is used. Type: [String](../../sql-reference/data-types/string.md). +- `password` — User password. If the password is not specified, an empty password is used. Type: [String](../../sql-reference/data-types/string.md). +- `sharding_key` — Sharding key to support distributing data across nodes. For example: `insert into remote('127.0.0.1:9000,127.0.0.2', db, table, 'default', rand())`. Type: [UInt32](../../sql-reference/data-types/int-uint.md). **Returned value** -Dataset from remote servers. +The dataset from remote servers. **Usage** -Using the `remote` table function is less optimal than creating a `Distributed` table, because in this case the server connection is re-established for every request. In addition, if host names are set, the names are resolved, and errors are not counted when working with various replicas. When processing a large number of queries, always create the `Distributed` table ahead of time, and don’t use the `remote` table function. 
+Using the `remote` table function is less optimal than creating a `Distributed` table because in this case the server connection is re-established for every request. Also, if hostnames are set, the names are resolved, and errors are not counted when working with various replicas. When processing a large number of queries, always create the `Distributed` table ahead of time, and don’t use the `remote` table function. The `remote` table function can be useful in the following cases: @@ -62,7 +62,7 @@ localhost [2a02:6b8:0:1111::11]:9000 ``` -Multiple addresses can be comma-separated. In this case, ClickHouse will use distributed processing, so it will send the query to all specified addresses (like to shards with different data). Example: +Multiple addresses can be comma-separated. In this case, ClickHouse will use distributed processing, so it will send the query to all specified addresses (like shards with different data). Example: ``` text example01-01-1,example01-02-1 @@ -82,7 +82,7 @@ example01-{01..02}-1 If you have multiple pairs of curly brackets, it generates the direct product of the corresponding sets. -Addresses and parts of addresses in curly brackets can be separated by the pipe symbol (\|). In this case, the corresponding sets of addresses are interpreted as replicas, and the query will be sent to the first healthy replica. However, the replicas are iterated in the order currently set in the [load_balancing](../../operations/settings/settings.md) setting. This example specifies two shards that each have two replicas: +Addresses and parts of addresses in curly brackets can be separated by the pipe symbol (\|). In this case, the corresponding sets of addresses are interpreted as replicas, and the query will be sent to the first healthy replica. However, the replicas are iterated in the order currently set in the [load_balancing](../../operations/settings/settings.md#settings-load_balancing) setting. This example specifies two shards that each have two replicas: ``` text example01-{01..02}-{1|2} diff --git a/docs/en/sql-reference/table-functions/url.md b/docs/en/sql-reference/table-functions/url.md index d70774b7588..0d004f9601a 100644 --- a/docs/en/sql-reference/table-functions/url.md +++ b/docs/en/sql-reference/table-functions/url.md @@ -15,25 +15,25 @@ toc_title: url url(URL, format, structure) ``` -**Input parameters** +**Parameters** -- `URL` - HTTP or HTTPS server address, which can accept `GET` (for `SELECT`) or `POST` (for `INSERT`) requests. Type: [String](../../sql-reference/data-types/string.md). -- `format` - [Format](../../interfaces/formats.md#formats) of the data. Type: [String](../../sql-reference/data-types/string.md). -- `structure` - Table structure in `'UserID UInt64, Name String'` format. Determines column names and types. Type: [String](../../sql-reference/data-types/string.md). +- `URL` — HTTP or HTTPS server address, which can accept `GET` (for `SELECT`) or `POST` (for `INSERT`) queries. Type: [String](../../sql-reference/data-types/string.md). +- `format` — [Format](../../interfaces/formats.md#formats) of the data. Type: [String](../../sql-reference/data-types/string.md). +- `structure` — Table structure in `'UserID UInt64, Name String'` format. Determines column names and types. Type: [String](../../sql-reference/data-types/string.md). **Returned value** -A table with the specified format and structure and with data from the defined URL. +A table with the specified format and structure and with data from the defined `URL`. 
**Examples** -Getting the first 3 lines of a table that contains columns of `String` and `UInt32` type from HTTP-server which answers in `CSV` format. +Getting the first 3 lines of a table that contains columns of `String` and [UInt32](../../sql-reference/data-types/int-uint.md) type from HTTP-server which answers in [CSV](../../interfaces/formats.md/#csv) format. ``` sql SELECT * FROM url('http://127.0.0.1:12345/', CSV, 'column1 String, column2 UInt32') LIMIT 3; ``` -Inserting data from a URL into a table: +Inserting data from a `URL` into a table: ``` sql CREATE TABLE test_table (column1 String, column2 UInt32) ENGINE=Memory; diff --git a/docs/ru/operations/caches.md b/docs/ru/operations/caches.md new file mode 100644 index 00000000000..cf7118eb1f3 --- /dev/null +++ b/docs/ru/operations/caches.md @@ -0,0 +1,29 @@ +--- +toc_priority: 65 +toc_title: Кеши +--- + +# Типы кеша {#cache-types} + +При выполнении запросов ClickHouse использует различные типы кеша. + +Основные типы кеша: + +- `mark_cache` — кеш меток, используемых движками таблиц семейства [MergeTree](../engines/table-engines/mergetree-family/mergetree.md). +- `uncompressed_cache` — кеш несжатых данных, используемых движками таблиц семейства [MergeTree](../engines/table-engines/mergetree-family/mergetree.md). + +Дополнительные типы кеша: + +- DNS-кеш. +- Кеш данных формата [regexp](../interfaces/formats.md#data-format-regexp). +- Кеш скомпилированных выражений. +- Кеш схем формата [Avro](../interfaces/formats.md#data-format-avro). +- Кеш данных в [словарях](../sql-reference/dictionaries/index.md). + +Непрямое использование: + +- Кеш страницы ОС. + +Чтобы удалить кеш, используйте выражения типа [SYSTEM DROP ... CACHE](../sql-reference/statements/system.md). + +[Оригинальная статья](https://clickhouse.tech/docs/ru/operations/caches/) diff --git a/docs/ru/operations/system-tables/distributed_ddl_queue.md b/docs/ru/operations/system-tables/distributed_ddl_queue.md index 058ed06f639..71be69e98d7 100644 --- a/docs/ru/operations/system-tables/distributed_ddl_queue.md +++ b/docs/ru/operations/system-tables/distributed_ddl_queue.md @@ -14,7 +14,7 @@ - `initiator` ([String](../../sql-reference/data-types/string.md)) — узел, выполнивший запрос. - `query_start_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — время начала запроса. - `query_finish_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — время окончания запроса. -- `query_duration_ms` ([UInt64](../../sql-reference/data-types/datetime64.md)) — продолжительность выполнения запроса (в миллисекундах). +- `query_duration_ms` ([UInt64](../../sql-reference/data-types/int-uint.md)) — продолжительность выполнения запроса (в миллисекундах). - `exception_code` ([Enum8](../../sql-reference/data-types/enum.md)) — код исключения из [ZooKeeper](../../operations/tips.md#zookeeper). **Пример** diff --git a/docs/ru/sql-reference/table-functions/file.md b/docs/ru/sql-reference/table-functions/file.md index d3e6e106125..ca1ac8b29db 100644 --- a/docs/ru/sql-reference/table-functions/file.md +++ b/docs/ru/sql-reference/table-functions/file.md @@ -5,23 +5,27 @@ toc_title: file # file {#file} -Создаёт таблицу из файла. Данная табличная функция похожа на табличные функции [file](file.md) и [hdfs](hdfs.md). +Создаёт таблицу из файла. Данная табличная функция похожа на табличные функции [url](../../sql-reference/table-functions/url.md) and [hdfs](../../sql-reference/table-functions/hdfs.md). 
+ +Функция `file` может использоваться в запросах `SELECT` и `INSERT` движка таблиц [File](../../engines/table-engines/special/file.md). + +**Синтаксис** ``` sql file(path, format, structure) ``` -**Входные параметры** +**Параметры** -- `path` — относительный путь до файла от [user_files_path](../../sql-reference/table-functions/file.md#server_configuration_parameters-user_files_path). Путь к файлу поддерживает следующие шаблоны в режиме доступа только для чтения `*`, `?`, `{abc,def}` и `{N..M}`, где `N`, `M` — числа, \``'abc', 'def'` — строки. +- `path` — относительный путь до файла от [user_files_path](../../sql-reference/table-functions/file.md#server_configuration_parameters-user_files_path). Путь к файлу поддерживает следующие шаблоны в режиме доступа только для чтения `*`, `?`, `{abc,def}` и `{N..M}`, где `N`, `M` — числа, `'abc', 'def'` — строки. - `format` — [формат](../../interfaces/formats.md#formats) файла. -- `structure` — структура таблицы. Формат `'colunmn1_name column1_ype, column2_name column2_type, ...'`. +- `structure` — структура таблицы. Формат: `'colunmn1_name column1_ype, column2_name column2_type, ...'`. **Возвращаемое значение** Таблица с указанной структурой, предназначенная для чтения или записи данных в указанном файле. -**Пример** +**Примеры** Настройка `user_files_path` и содержимое файла `test.csv`: @@ -35,12 +39,10 @@ $ cat /var/lib/clickhouse/user_files/test.csv 78,43,45 ``` -Таблица из `test.csv` и выборка первых двух строк из неё: +Получение данных из таблицы в файле `test.csv` и выборка первых двух строк из неё: ``` sql -SELECT * -FROM file('test.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 UInt32') -LIMIT 2 +SELECT * FROM file('test.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 UInt32') LIMIT 2; ``` ``` text @@ -50,45 +52,61 @@ LIMIT 2 └─────────┴─────────┴─────────┘ ``` +Получение первых 10 строк таблицы, содержащей 3 столбца типа [UInt32](../../sql-reference/data-types/int-uint.md), из CSV-файла: + +``` sql +SELECT * FROM file('test.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 UInt32') LIMIT 10; +``` + +Вставка данных из файла в таблицу: + +``` sql +INSERT INTO FUNCTION file('test.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 UInt32') VALUES (1, 2, 3), (3, 2, 1); +SELECT * FROM file('test.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 UInt32'); +``` + +``` text +┌─column1─┬─column2─┬─column3─┐ +│ 1 │ 2 │ 3 │ +│ 3 │ 2 │ 1 │ +└─────────┴─────────┴─────────┘ +``` + +## Шаблоны в компонентах пути {#globs-in-path} + Шаблоны могут содержаться в нескольких компонентах пути. Обрабатываются только существующие файлы, название которых целиком удовлетворяет шаблону (не только суффиксом или префиксом). -- `*` — Заменяет любое количество любых символов кроме `/`, включая отсутствие символов. -- `?` — Заменяет ровно один любой символ. -- `{some_string,another_string,yet_another_one}` — Заменяет любую из строк `'some_string', 'another_string', 'yet_another_one'`. -- `{N..M}` — Заменяет любое число в интервале от `N` до `M` включительно (может содержать ведущие нули). +- `*` — заменяет любое количество любых символов кроме `/`, включая отсутствие символов. +- `?` — заменяет ровно один любой символ. +- `{some_string,another_string,yet_another_one}` — заменяет любую из строк `'some_string', 'another_string', 'yet_another_one'`. +- `{N..M}` — заменяет любое число в интервале от `N` до `M` включительно (может содержать ведущие нули). Конструкция с `{}` аналогична табличной функции [remote](remote.md). **Пример** -1. 
Предположим у нас есть несколько файлов со следующими относительными путями: +Предположим у нас есть несколько файлов со следующими относительными путями: -- ‘some_dir/some_file_1’ -- ‘some_dir/some_file_2’ -- ‘some_dir/some_file_3’ -- ‘another_dir/some_file_1’ -- ‘another_dir/some_file_2’ -- ‘another_dir/some_file_3’ +- 'some_dir/some_file_1' +- 'some_dir/some_file_2' +- 'some_dir/some_file_3' +- 'another_dir/some_file_1' +- 'another_dir/some_file_2' +- 'another_dir/some_file_3' -1. Запросим количество строк в этих файлах: - - +Запросим количество строк в этих файлах: ``` sql -SELECT count(*) -FROM file('{some,another}_dir/some_file_{1..3}', 'TSV', 'name String, value UInt32') +SELECT count(*) FROM file('{some,another}_dir/some_file_{1..3}', 'TSV', 'name String, value UInt32'); ``` -1. Запросим количество строк во всех файлах этих двух директорий: - - +Запросим количество строк во всех файлах этих двух директорий: ``` sql -SELECT count(*) -FROM file('{some,another}_dir/*', 'TSV', 'name String, value UInt32') +SELECT count(*) FROM file('{some,another}_dir/*', 'TSV', 'name String, value UInt32'); ``` -!!! warning "Warning" +!!! warning "Предупреждение" Если ваш список файлов содержит интервал с ведущими нулями, используйте конструкцию с фигурными скобками для каждой цифры по отдельности или используйте `?`. **Пример** @@ -96,17 +114,16 @@ FROM file('{some,another}_dir/*', 'TSV', 'name String, value UInt32') Запрос данных из файлов с именами `file000`, `file001`, … , `file999`: ``` sql -SELECT count(*) -FROM file('big_dir/file{0..9}{0..9}{0..9}', 'CSV', 'name String, value UInt32') +SELECT count(*) FROM file('big_dir/file{0..9}{0..9}{0..9}', 'CSV', 'name String, value UInt32'); ``` ## Виртуальные столбцы {#virtualnye-stolbtsy} -- `_path` — Путь к файлу. -- `_file` — Имя файла. +- `_path` — путь к файлу. +- `_file` — имя файла. **Смотрите также** - [Виртуальные столбцы](index.md#table_engines-virtual_columns) -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/table_functions/file/) +[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/table-functions/file/) diff --git a/docs/ru/sql-reference/table-functions/remote.md b/docs/ru/sql-reference/table-functions/remote.md index 901317a805d..435fb5bb6d7 100644 --- a/docs/ru/sql-reference/table-functions/remote.md +++ b/docs/ru/sql-reference/table-functions/remote.md @@ -5,9 +5,11 @@ toc_title: remote # remote, remoteSecure {#remote-remotesecure} -Позволяет обратиться к удалённым серверам без создания таблицы типа `Distributed`. +Позволяет обратиться к удалённым серверам без создания таблицы типа [Distributed](../../engines/table-engines/special/distributed.md). Функция `remoteSecure` такая же, как и `remote`, но с защищенным соединением. -Сигнатуры: +Обе функции могут быть использованы в запросах типа `SELECT` и `INSERT`. + +**Синтаксис** ``` sql remote('addresses_expr', db, table[, 'user'[, 'password']]) @@ -16,12 +18,40 @@ remoteSecure('addresses_expr', db, table[, 'user'[, 'password']]) remoteSecure('addresses_expr', db.table[, 'user'[, 'password']]) ``` -`addresses_expr` - выражение, генерирующее адреса удалённых серверов. Это может быть просто один адрес сервера. Адрес сервера - это `хост:порт`, или только `хост`. Хост может быть указан в виде имени сервера, или в виде IPv4 или IPv6 адреса. IPv6 адрес указывается в квадратных скобках. Порт - TCP-порт удалённого сервера. Если порт не указан, используется `tcp_port` из конфигурационного файла сервера (по умолчанию - 9000). 
+**Параметры** + +- `addresses_expr` — выражение, генерирующее адреса удалённых серверов. Это может быть просто один адрес сервера. Адрес сервера — это `хост:порт`, или только `хост`. + + Хост может быть указан в виде имени сервера, или в виде IPv4 или IPv6 адреса. IPv6 адрес указывается в квадратных скобках. + + Порт — TCP-порт удалённого сервера. Если порт не указан, используется [tcp_port](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-tcp_port) из конфигурационного файла сервера, к которому обратились через функцию `remote` (по умолчанию - 9000), и [tcp_port_secure](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-tcp_port_secure), к которому обратились через функцию `remoteSecure` (по умолчанию — 9440). -!!! important "Важно" С IPv6-адресом обязательно нужно указывать порт. -Примеры: + Тип: [String](../../sql-reference/data-types/string.md). + +- `db` — имя базы данных. Тип: [String](../../sql-reference/data-types/string.md). +- `table` — имя таблицы. Тип: [String](../../sql-reference/data-types/string.md). +- `user` — имя пользователя. Если пользователь не указан, то по умолчанию `default`. Тип: [String](../../sql-reference/data-types/string.md). +- `password` — пароль. Если пароль не указан, то используется пустой пароль. Тип: [String](../../sql-reference/data-types/string.md). +- `sharding_key` — ключ шардирования для поддержки распределения данных между узлами. Например: `insert into remote('127.0.0.1:9000,127.0.0.2', db, table, 'default', rand())`. Тип: [UInt32](../../sql-reference/data-types/int-uint.md). + +**Возвращаемое значение** + +Набор данных с удаленных серверов. + +**Использование** + +Использование табличной функции `remote` менее оптимально, чем создание таблицы типа `Distributed`, так как в этом случае соединения с серверами устанавливаются заново при каждом запросе. В случае задания имён хостов делается резолвинг имён, а также не ведётся подсчёт ошибок при работе с разными репликами. При обработке большого количества запросов всегда создавайте таблицу типа `Distributed` заранее, не используйте табличную функцию `remote`. + +Табличная функция `remote` может быть полезна в следующих случаях: + +- Обращение на конкретный сервер в целях сравнения данных, отладки и тестирования. +- Запросы между разными кластерами ClickHouse в целях исследований. +- Нечастые распределённые запросы, задаваемые вручную. +- Распределённые запросы, где набор серверов определяется каждый раз заново. + +**Адреса** ``` text example01-01-1 @@ -32,9 +62,7 @@ localhost [2a02:6b8:0:1111::11]:9000 ``` -Адреса можно указать через запятую, в этом случае ClickHouse обработает запрос как распределённый, т.е. отправит его по всем указанным адресам как на шарды с разными данными. - -Пример: +Адреса можно указать через запятую. В этом случае ClickHouse обработает запрос как распределённый, т.е. отправит его по всем указанным адресам как на шарды с разными данными. Пример: ``` text example01-01-1,example01-02-1 @@ -46,38 +74,36 @@ example01-01-1,example01-02-1 example01-0{1,2}-1 ``` -В фигурных скобках может быть указан диапазон (неотрицательных целых) чисел через две точки. В этом случае, диапазон раскрывается в множество значений, генерирующих адреса шардов. Если запись первого числа начинается с нуля, то значения формируются с таким же выравниванием нулями. Предыдущий пример может быть записан следующим образом: +В фигурных скобках может быть указан диапазон (неотрицательных целых) чисел через две точки. 
В этом случае диапазон раскрывается в множество значений, генерирующих адреса шардов. Если запись первого числа начинается с нуля, то значения формируются с таким же выравниванием нулями. Предыдущий пример может быть записан следующим образом: ``` text example01-{01..02}-1 ``` -При наличии нескольких пар фигурных скобок, генерируется прямое произведение соответствующих множеств. +При наличии нескольких пар фигурных скобок генерируется прямое произведение соответствующих множеств. -Адреса или их фрагменты в фигурных скобках можно указать через символ \|. В этом случае, соответствующие множества адресов понимаются как реплики - запрос будет отправлен на первую живую реплику. При этом, реплики перебираются в порядке, согласно текущей настройке [load_balancing](../../operations/settings/settings.md). - -Пример: +Адреса или их фрагменты в фигурных скобках можно указать через символ \|. В этом случае соответствующие множества адресов понимаются как реплики — запрос будет отправлен на первую живую реплику. При этом реплики перебираются в порядке, согласно текущей настройке [load_balancing](../../operations/settings/settings.md#settings-load_balancing). В этом примере указано два шарда, в каждом из которых имеется две реплики: ``` text example01-{01..02}-{1|2} ``` -В этом примере указано два шарда, в каждом из которых имеется две реплики. +Количество генерируемых адресов ограничено константой. Сейчас это 1000 адресов. -Количество генерируемых адресов ограничено константой - сейчас это 1000 штук. +**Примеры** -Использование табличной функции `remote` менее оптимально, чем создание таблицы типа `Distributed`, так как в этом случае, соединения с серверами устанавливаются заново при каждом запросе, в случае задания имён хостов, делается резолвинг имён, а также не ведётся подсчёт ошибок при работе с разными репликами. При обработке большого количества запросов, всегда создавайте `Distributed` таблицу заранее, не используйте табличную функцию `remote`. +Выборка данных с удаленного сервера: -Табличная функция `remote` может быть полезна для следующих случаях: +``` sql +SELECT * FROM remote('127.0.0.1', db.remote_engine_table) LIMIT 3; +``` -- обращение на конкретный сервер в целях сравнения данных, отладки и тестирования; -- запросы между разными кластерами ClickHouse в целях исследований; -- нечастых распределённых запросов, задаваемых вручную; -- распределённых запросов, где набор серверов определяется каждый раз заново. +Вставка данных с удаленного сервера в таблицу: -Если пользователь не задан,то используется `default`. -Если пароль не задан, то используется пустой пароль. +``` sql +CREATE TABLE remote_table (name String, value UInt32) ENGINE=Memory; +INSERT INTO FUNCTION remote('127.0.0.1', currentDatabase(), 'remote_table') VALUES ('test', 42); +SELECT * FROM remote_table; +``` -`remoteSecure` - аналогично функции `remote`, но с соединением по шифрованному каналу. Порт по умолчанию - `tcp_port_secure` из конфига или 9440. 
- -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/table_functions/remote/) +[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/table-functions/remote/) diff --git a/docs/ru/sql-reference/table-functions/url.md b/docs/ru/sql-reference/table-functions/url.md index 0cd7c24c663..afb4a23b88e 100644 --- a/docs/ru/sql-reference/table-functions/url.md +++ b/docs/ru/sql-reference/table-functions/url.md @@ -5,21 +5,40 @@ toc_title: url # url {#url} -`url(URL, format, structure)` - возвращает таблицу со столбцами, указанными в -`structure`, созданную из данных находящихся по `URL` в формате `format`. +Функция `url` создает таблицу с помощью адреса `URL`, формата данных и структуры таблицы. -URL - адрес, по которому сервер принимает `GET` и/или `POST` запросы по -протоколу HTTP или HTTPS. +Функция `url` может быть использована в запросах `SELECT` и `INSERT` в таблицах движка [URL](../../engines/table-engines/special/url.md). -format - [формат](../../interfaces/formats.md#formats) данных. - -structure - структура таблицы в форме `'UserID UInt64, Name String'`. Определяет имена и типы столбцов. - -**Пример** +**Синтаксис** ``` sql --- получение 3-х строк таблицы, состоящей из двух колонк типа String и UInt32 от сервера, отдающего данные в формате CSV -SELECT * FROM url('http://127.0.0.1:12345/', CSV, 'column1 String, column2 UInt32') LIMIT 3 +url(URL, format, structure) ``` -[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/table_functions/url/) +**Параметры** + +- `URL` — HTTP или HTTPS-адрес сервера, который может принимать запросы `GET` (для `SELECT`) или `POST` (для `INSERT`). Тип: [String](../../sql-reference/data-types/string.md). +- `format` — [формат](../../interfaces/formats.md#formats) данных. Тип: [String](../../sql-reference/data-types/string.md). +- `structure` — структура таблицы в формате `'UserID UInt64, Name String'`. Определяет имена и типы столбцов. Тип: [String](../../sql-reference/data-types/string.md). + +**Возвращаемое значение** + +Таблица с указанными форматом и структурой, а также с данными, полученными из указанного адреса `URL`. + +**Примеры** + +Получение первых 3 строк таблицы, содержащей столбцы типа `String` и [UInt32](../../sql-reference/data-types/int-uint.md), с HTTP-сервера в формате [CSV](../../interfaces/formats.md/#csv). 
+ +``` sql +SELECT * FROM url('http://127.0.0.1:12345/', CSV, 'column1 String, column2 UInt32') LIMIT 3; +``` + +Вставка данных в таблицу: + +``` sql +CREATE TABLE test_table (column1 String, column2 UInt32) ENGINE=Memory; +INSERT INTO FUNCTION url('http://127.0.0.1:8123/?query=INSERT+INTO+test_table+FORMAT+CSV', 'CSV', 'column1 String, column2 UInt32') VALUES ('http interface', 42); +SELECT * FROM test_table; +``` + +[Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/table-functions/url/) From 45e90961f7612f46a315d80b6b8e7a856c2bed33 Mon Sep 17 00:00:00 2001 From: Denis Glazachev Date: Sun, 14 Feb 2021 21:16:40 +0400 Subject: [PATCH 0278/2357] Store and process alias_actions in FilterInfo --- src/Interpreters/InterpreterSelectQuery.cpp | 13 +++++++++++++ .../MergeTree/MergeTreeBaseSelectProcessor.cpp | 3 +++ .../MergeTree/MergeTreeBlockReadUtils.cpp | 16 ++++++++++++---- src/Storages/MergeTree/MergeTreeRangeReader.cpp | 3 +++ src/Storages/MergeTree/MergeTreeReadPool.h | 3 ++- src/Storages/SelectQueryInfo.h | 1 + src/Storages/StorageBuffer.cpp | 10 ++++++++++ 7 files changed, 44 insertions(+), 5 deletions(-) diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index 0a639922e55..d13c28e8ff2 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -1215,6 +1215,16 @@ void InterpreterSelectQuery::addEmptySourceToQueryPlan(QueryPlan & query_plan, c { auto & filter_info = *prewhere_info.filter_info; + if (filter_info.alias_actions) + { + pipe.addSimpleTransform([&](const Block & header) + { + return std::make_shared( + header, + filter_info.alias_actions); + }); + } + pipe.addSimpleTransform([&](const Block & header) { return std::make_shared( @@ -1599,6 +1609,9 @@ void InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum proc { query_info.prewhere_info->filter_info = std::make_shared(); + if (alias_actions) + query_info.prewhere_info->filter_info->alias_actions = std::make_shared(std::move(alias_actions)); + if (expressions.filter_info->actions) query_info.prewhere_info->filter_info->actions = std::make_shared(expressions.filter_info->actions); diff --git a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp index 2ab275a7bd6..96993e4a106 100644 --- a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp @@ -338,6 +338,9 @@ void MergeTreeBaseSelectProcessor::executePrewhereActions(Block & block, const P { auto & filter_info = *prewhere_info->filter_info; + if (filter_info.alias_actions) + filter_info.alias_actions->execute(block); + if (filter_info.actions) filter_info.actions->execute(block); diff --git a/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp b/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp index d5fb2f3300c..f4a5b1fcb9e 100644 --- a/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp +++ b/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp @@ -269,10 +269,18 @@ MergeTreeReadTaskColumns getReadTaskColumns( if (prewhere_info) { - if (prewhere_info->filter_info && prewhere_info->filter_info->actions) + if (prewhere_info->filter_info) { - const auto required_column_names = prewhere_info->filter_info->actions->getRequiredColumns(); - pre_column_names.insert(pre_column_names.end(), required_column_names.begin(), required_column_names.end()); + if (prewhere_info->filter_info->alias_actions) + { + const auto 
required_column_names = prewhere_info->filter_info->alias_actions->getRequiredColumns(); + pre_column_names.insert(pre_column_names.end(), required_column_names.begin(), required_column_names.end()); + } + else if (prewhere_info->filter_info->actions) + { + const auto required_column_names = prewhere_info->filter_info->actions->getRequiredColumns(); + pre_column_names.insert(pre_column_names.end(), required_column_names.begin(), required_column_names.end()); + } } if (prewhere_info->alias_actions) @@ -280,7 +288,7 @@ MergeTreeReadTaskColumns getReadTaskColumns( const auto required_column_names = prewhere_info->alias_actions->getRequiredColumns(); pre_column_names.insert(pre_column_names.end(), required_column_names.begin(), required_column_names.end()); } - else + else if (prewhere_info->prewhere_actions) { const auto required_column_names = prewhere_info->prewhere_actions->getRequiredColumns(); pre_column_names.insert(pre_column_names.end(), required_column_names.begin(), required_column_names.end()); diff --git a/src/Storages/MergeTree/MergeTreeRangeReader.cpp b/src/Storages/MergeTree/MergeTreeRangeReader.cpp index 5995e2318c0..3c79ed73a16 100644 --- a/src/Storages/MergeTree/MergeTreeRangeReader.cpp +++ b/src/Storages/MergeTree/MergeTreeRangeReader.cpp @@ -899,6 +899,9 @@ void MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(ReadResult & r if (prewhere_info->filter_info) { + if (prewhere_info->filter_info->alias_actions) + prewhere_info->filter_info->alias_actions->execute(block); + if (prewhere_info->filter_info->actions) prewhere_info->filter_info->actions->execute(block); diff --git a/src/Storages/MergeTree/MergeTreeReadPool.h b/src/Storages/MergeTree/MergeTreeReadPool.h index aa6811661e6..366e9a2381a 100644 --- a/src/Storages/MergeTree/MergeTreeReadPool.h +++ b/src/Storages/MergeTree/MergeTreeReadPool.h @@ -71,7 +71,8 @@ private: public: MergeTreeReadPool( const size_t threads_, const size_t sum_marks_, const size_t min_marks_for_concurrent_read_, - RangesInDataParts && parts_, const MergeTreeData & data_, const StorageMetadataPtr & metadata_snapshot_, const PrewhereInfoPtr & prewhere_info_, + RangesInDataParts && parts_, const MergeTreeData & data_, const StorageMetadataPtr & metadata_snapshot_, + const PrewhereInfoPtr & prewhere_info_, const bool check_columns_, const Names & column_names_, const BackoffSettings & backoff_settings_, size_t preferred_block_size_bytes_, const bool do_not_steal_tasks_ = false); diff --git a/src/Storages/SelectQueryInfo.h b/src/Storages/SelectQueryInfo.h index 325f54435ed..a87ff2f40d3 100644 --- a/src/Storages/SelectQueryInfo.h +++ b/src/Storages/SelectQueryInfo.h @@ -74,6 +74,7 @@ struct PrewhereDAGInfo /// Helper struct to store all the information about the filter expression. 
struct FilterInfo { + ExpressionActionsPtr alias_actions; ExpressionActionsPtr actions; String column_name; bool do_remove_column = false; diff --git a/src/Storages/StorageBuffer.cpp b/src/Storages/StorageBuffer.cpp index 2ed7fe377c4..64bcdd2d145 100644 --- a/src/Storages/StorageBuffer.cpp +++ b/src/Storages/StorageBuffer.cpp @@ -323,6 +323,16 @@ void StorageBuffer::read( { if (query_info.prewhere_info->filter_info) { + if (query_info.prewhere_info->filter_info->alias_actions) + { + pipe_from_buffers.addSimpleTransform([&](const Block & header) + { + return std::make_shared( + header, + query_info.prewhere_info->filter_info->alias_actions); + }); + } + pipe_from_buffers.addSimpleTransform([&](const Block & header) { return std::make_shared( From 96dc69609c9def6dc5f457e67529e106f55ffccd Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 15 Feb 2021 00:00:18 +0300 Subject: [PATCH 0279/2357] Fix Arcadia --- src/Columns/ya.make | 1 + src/Columns/ya.make.in | 1 + 2 files changed, 2 insertions(+) diff --git a/src/Columns/ya.make b/src/Columns/ya.make index 061391b5214..54dd02609ff 100644 --- a/src/Columns/ya.make +++ b/src/Columns/ya.make @@ -7,6 +7,7 @@ ADDINCL( contrib/libs/icu/common contrib/libs/icu/i18n contrib/libs/pdqsort + contrib/libs/lz4 ) PEERDIR( diff --git a/src/Columns/ya.make.in b/src/Columns/ya.make.in index 4422d222ce1..846e2c6c3bd 100644 --- a/src/Columns/ya.make.in +++ b/src/Columns/ya.make.in @@ -6,6 +6,7 @@ ADDINCL( contrib/libs/icu/common contrib/libs/icu/i18n contrib/libs/pdqsort + contrib/libs/lz4 ) PEERDIR( From c24221b04f1bc511cc0a9524e6e2388c03d08246 Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Mon, 15 Feb 2021 01:53:44 +0300 Subject: [PATCH 0280/2357] Update docs/en/sql-reference/functions/type-conversion-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/en/sql-reference/functions/type-conversion-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 268a7565b81..c80f8934f72 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -546,7 +546,7 @@ accurateCastOrNull(x, T) **Returned value** -- The value in specified data type `T`. +- The value, converted to the specified data type `T`. **Example** From cdac3cf9ce17391479681444b48e005dc24327d7 Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Mon, 15 Feb 2021 01:53:51 +0300 Subject: [PATCH 0281/2357] Update docs/ru/sql-reference/functions/type-conversion-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/functions/type-conversion-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index e16fa438aed..985dd16c231 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -423,7 +423,7 @@ SELECT uuid = uuid2; ## CAST(x, T) {#type_conversion_function-cast} -Преобразует вхожное значение `x` в указананный тип данных `T`. +Преобразует входное значение `x` в указанный тип данных `T`. Поддерживается также синтаксис `CAST(x AS t)`. 
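A short sketch of the two equivalent spellings, and of the wrap-around behaviour described just below (the literal values are arbitrary):

``` sql
-- Both forms are equivalent
SELECT CAST(42, 'String'), CAST(42 AS String);

-- An out-of-range value wraps around instead of raising an error
SELECT CAST(-1, 'UInt8');   -- 255
```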
From cda9dc7600880ee35582cfe1d98d15bd4df43c28 Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Mon, 15 Feb 2021 01:54:02 +0300 Subject: [PATCH 0282/2357] Update docs/ru/sql-reference/functions/type-conversion-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/functions/type-conversion-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index 985dd16c231..3c9d3993120 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -427,7 +427,7 @@ SELECT uuid = uuid2; Поддерживается также синтаксис `CAST(x AS t)`. -Обратите внимание, что если значение `x` не соответствует границам типа `T`, функция переполняется. Например, `CAST(-1, 'UInt8')` возвращает 255. +Обратите внимание, что если значение `x` не может быть преобразовано к типу `T`, возникает переполнение. Например, `CAST(-1, 'UInt8')` возвращает 255. **Пример** From b82bf79c5245092fea0a866f3cae2934262d66d6 Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Mon, 15 Feb 2021 01:54:10 +0300 Subject: [PATCH 0283/2357] Update docs/ru/sql-reference/functions/type-conversion-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/functions/type-conversion-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index 3c9d3993120..16e52efceec 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -494,7 +494,7 @@ SELECT toTypeName(CAST(x, 'Nullable(UInt16)')) FROM t_null; Преобразует входное значение `x` в указанный тип данных `T`. -Отличие от [cast(x, T)](#type_conversion_function-cast) в том, что `accurateCast` не допускает переполнения числовых типов, если значение типа `x` не соответствует границам типа `T`. Например, `accurateCast(-1, 'UInt8')` вернет ошибку. +В отличие от функции [cast(x, T)](#type_conversion_function-cast), `accurateCast` не допускает переполнения при преобразовании числовых типов. Например, `accurateCast(-1, 'UInt8')` вызовет исключение. **Примеры** From 82701ecbeccf88f38a73ccb0ea556267d2fa99a0 Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Mon, 15 Feb 2021 01:54:15 +0300 Subject: [PATCH 0284/2357] Update docs/ru/sql-reference/functions/type-conversion-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/functions/type-conversion-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index 16e52efceec..0723ed2c752 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -527,7 +527,7 @@ Code: 70. DB::Exception: Received from localhost:9000. DB::Exception: Value in c Преобразует входное значение `x` в указанный тип данных `T`. -Всегда возвращает тип [Nullable](../../sql-reference/data-types/nullable.md), и возвращает [NULL](../../sql-reference/syntax.md#null-literal), если приведенное значение не может быть представлено в целевом типе. 
+Всегда возвращает тип [Nullable](../../sql-reference/data-types/nullable.md). Если исходное значение не может быть преобразовано к целевому типу, возвращает [NULL](../../sql-reference/syntax.md#null-literal). **Синтаксис** From 994b998df9863e772b438a858a2cdabdb2ce27ea Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Mon, 15 Feb 2021 01:54:20 +0300 Subject: [PATCH 0285/2357] Update docs/ru/sql-reference/operators/in.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/operators/in.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/ru/sql-reference/operators/in.md b/docs/ru/sql-reference/operators/in.md index c2d88a729be..e0412747898 100644 --- a/docs/ru/sql-reference/operators/in.md +++ b/docs/ru/sql-reference/operators/in.md @@ -17,8 +17,7 @@ SELECT (CounterID, UserID) IN ((34, 123), (101500, 456)) FROM ... В качестве правой части оператора может быть множество константных выражений, множество кортежей с константными выражениями (показано в примерах выше), а также имя таблицы или подзапрос SELECT в скобках. -ClickHouse допускает различные типы в левой и правой частях подзапроса `IN`. -В этом случае он преобразует левую сторону в тип правой стороны, применяя функцию [accurateCastOrNull](../functions/type-conversion-functions.md#type_conversion_function-accurate-cast_or_null). +Если типы данных в левой и правой частях подзапроса `IN` различаются, ClickHouse преобразует значение в левой части к типу данных из правой части. Преобразование выполняется по аналогии с функцией [accurateCastOrNull](../functions/type-conversion-functions.md#type_conversion_function-accurate-cast_or_null), т.е. тип данных становится [Nullable](../../sql-reference/data-types/nullable.md), а если преобразование не может быть выполнено, возвращается значение [NULL](../../sql-reference/syntax.md#null-literal). **Пример** From 2a71053c695ee6deb84d8583c51dec0cc74dcdb1 Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Mon, 15 Feb 2021 01:54:25 +0300 Subject: [PATCH 0286/2357] Update docs/en/sql-reference/operators/in.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/en/sql-reference/operators/in.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/operators/in.md b/docs/en/sql-reference/operators/in.md index 4796c0f6bc0..34866f3d09a 100644 --- a/docs/en/sql-reference/operators/in.md +++ b/docs/en/sql-reference/operators/in.md @@ -17,7 +17,7 @@ Don’t list too many values explicitly (i.e. millions). If a data set is large The right side of the operator can be a set of constant expressions, a set of tuples with constant expressions (shown in the examples above), or the name of a database table or SELECT subquery in brackets. -ClickHouse allows different types in the left and right parts of `IN` subquery. In this case it converts the left hand side to the type of the right hand side as if the [accurateCastOrNull](../functions/type-conversion-functions.md#type_conversion_function-accurate-cast_or_null) function is applied. +ClickHouse allows types to differ in the left and the right parts of `IN` subquery. In this case it converts the left side value to the type of the right side, as if the [accurateCastOrNull](../functions/type-conversion-functions.md#type_conversion_function-accurate-cast_or_null) function is applied. 
That means, that the data type becomes [Nullable](../../sql-reference/data-types/nullable.md), and if the conversion cannot be performed, it returns [NULL](../../sql-reference/syntax.md#null-literal). **Example** From 320fd6b264db77de1ef335c0025c5487868e9ddb Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Mon, 15 Feb 2021 03:04:46 +0300 Subject: [PATCH 0287/2357] startup without zk --- src/Databases/DatabaseReplicated.cpp | 169 ++++++++++++------ src/Databases/DatabaseReplicated.h | 2 + src/Databases/DatabaseReplicatedWorker.cpp | 2 + src/Interpreters/DDLWorker.cpp | 2 +- .../test_replicated_database/test.py | 49 ++++- 5 files changed, 156 insertions(+), 68 deletions(-) diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index d365ea24bbf..24a193d9134 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -82,37 +82,6 @@ DatabaseReplicated::DatabaseReplicated( /// If zookeeper chroot prefix is used, path should start with '/', because chroot concatenates without it. if (zookeeper_path.front() != '/') zookeeper_path = "/" + zookeeper_path; - - if (!context_.hasZooKeeper()) - { - throw Exception("Can't create replicated database without ZooKeeper", ErrorCodes::NO_ZOOKEEPER); - } - //FIXME it will fail on startup if zk is not available - - auto current_zookeeper = global_context.getZooKeeper(); - - if (!current_zookeeper->exists(zookeeper_path)) - { - /// Create new database, multiple nodes can execute it concurrently - createDatabaseNodesInZooKeeper(current_zookeeper); - } - - replica_path = zookeeper_path + "/replicas/" + getFullReplicaName(); - - String replica_host_id; - if (current_zookeeper->tryGet(replica_path, replica_host_id)) - { - String host_id = getHostID(global_context, db_uuid); - if (replica_host_id != host_id) - throw Exception(ErrorCodes::REPLICA_IS_ALREADY_EXIST, - "Replica {} of shard {} of replicated database at {} already exists. Replica host ID: '{}', current host ID: '{}'", - replica_name, shard_name, zookeeper_path, replica_host_id, host_id); - } - else - { - /// Throws if replica with the same name was created concurrently - createReplicaNodesInZooKeeper(current_zookeeper); - } } String DatabaseReplicated::getFullReplicaName() const @@ -203,6 +172,50 @@ ClusterPtr DatabaseReplicated::getCluster() const return std::make_shared(global_context.getSettingsRef(), shards, username, password, global_context.getTCPPort(), false); } +void DatabaseReplicated::tryConnectToZooKeeper(bool force_attach) +{ + try + { + if (!global_context.hasZooKeeper()) + { + throw Exception("Can't create replicated database without ZooKeeper", ErrorCodes::NO_ZOOKEEPER); + } + + auto current_zookeeper = global_context.getZooKeeper(); + + if (!current_zookeeper->exists(zookeeper_path)) + { + /// Create new database, multiple nodes can execute it concurrently + createDatabaseNodesInZooKeeper(current_zookeeper); + } + + replica_path = zookeeper_path + "/replicas/" + getFullReplicaName(); + + String replica_host_id; + if (current_zookeeper->tryGet(replica_path, replica_host_id)) + { + String host_id = getHostID(global_context, db_uuid); + if (replica_host_id != host_id) + throw Exception(ErrorCodes::REPLICA_IS_ALREADY_EXIST, + "Replica {} of shard {} of replicated database at {} already exists. 
Replica host ID: '{}', current host ID: '{}'", + replica_name, shard_name, zookeeper_path, replica_host_id, host_id); + } + else + { + /// Throws if replica with the same name already exists + createReplicaNodesInZooKeeper(current_zookeeper); + } + + is_readonly = false; + } + catch(...) + { + if (!force_attach) + throw; + tryLogCurrentException(log); + } +} + bool DatabaseReplicated::createDatabaseNodesInZooKeeper(const zkutil::ZooKeeperPtr & current_zookeeper) { current_zookeeper->createAncestors(zookeeper_path); @@ -256,6 +269,8 @@ void DatabaseReplicated::createReplicaNodesInZooKeeper(const zkutil::ZooKeeperPt void DatabaseReplicated::loadStoredObjects(Context & context, bool has_force_restore_data_flag, bool force_attach) { + tryConnectToZooKeeper(force_attach); + DatabaseAtomic::loadStoredObjects(context, has_force_restore_data_flag, force_attach); ddl_worker = std::make_unique(this, global_context); @@ -264,6 +279,9 @@ void DatabaseReplicated::loadStoredObjects(Context & context, bool has_force_res BlockIO DatabaseReplicated::propose(const ASTPtr & query, const Context & query_context) { + if (is_readonly) + throw Exception(ErrorCodes::NO_ZOOKEEPER, "Database is in readonly mode, because it cannot connect to ZooKeeper"); + if (query_context.getClientInfo().query_kind != ClientInfo::QueryKind::INITIAL_QUERY) throw Exception(ErrorCodes::INCORRECT_QUERY, "It's not initial query. ON CLUSTER is not allowed for Replicated database."); @@ -297,6 +315,24 @@ BlockIO DatabaseReplicated::propose(const ASTPtr & query, const Context & query_ return io; } +static UUID getTableUUIDIfReplicated(const String & metadata, const Context & context) +{ + bool looks_like_replicated = metadata.find("ReplicatedMergeTree") != std::string::npos; + if (!looks_like_replicated) + return UUIDHelpers::Nil; + + ParserCreateQuery parser; + auto size = context.getSettingsRef().max_query_size; + auto depth = context.getSettingsRef().max_parser_depth; + ASTPtr query = parseQuery(parser, metadata, size, depth); + const ASTCreateQuery & create = query->as(); + if (!create.storage || !create.storage->engine) + return UUIDHelpers::Nil; + if (!startsWith(create.storage->engine->name, "Replicated") || !endsWith(create.storage->engine->name, "MergeTree")) + return UUIDHelpers::Nil; + assert(create.uuid != UUIDHelpers::Nil); + return create.uuid; +} void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeeper, UInt32 our_log_ptr, UInt32 max_log_ptr) { @@ -311,42 +347,44 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep auto table_name_to_metadata = tryGetConsistentMetadataSnapshot(current_zookeeper, max_log_ptr); + /// For ReplicatedMergeTree tables we can compare only UUIDs to ensure that it's the same table. + /// Metadata can be different, it's handled on table replication level. + /// We need to handle only renamed tables. + /// TODO maybe we should also update MergeTree SETTINGS if required? 
+ std::unordered_map zk_replicated_id_to_name; + for (const auto & zk_table : table_name_to_metadata) + { + UUID zk_replicated_id = getTableUUIDIfReplicated(zk_table.second, global_context); + if (zk_replicated_id != UUIDHelpers::Nil) + zk_replicated_id_to_name.emplace(zk_replicated_id, zk_table.first); + } + Strings tables_to_detach; + std::vector> replicated_tables_to_rename; size_t total_tables = 0; - auto existing_tables_it = getTablesIterator(global_context, {}); - while (existing_tables_it->isValid()) + std::vector replicated_ids; + for (auto existing_tables_it = getTablesIterator(global_context, {}); existing_tables_it->isValid(); existing_tables_it->next(), ++total_tables) { String name = existing_tables_it->name(); - auto in_zk = table_name_to_metadata.find(name); - String local_metadata = readMetadataFile(name); - if (in_zk == table_name_to_metadata.end() || in_zk->second != local_metadata) + UUID local_replicated_id = UUIDHelpers::Nil; + if (existing_tables_it->table()->supportsReplication()) { - bool should_detach = true; - bool looks_like_replicated = in_zk->second.find("ReplicatedMergeTree") != std::string::npos; - - if (looks_like_replicated) + local_replicated_id = existing_tables_it->table()->getStorageID().uuid; + auto it = zk_replicated_id_to_name.find(local_replicated_id); + if (it != zk_replicated_id_to_name.end()) { - ParserCreateQuery parser; - auto size = global_context.getSettingsRef().max_query_size; - auto depth = global_context.getSettingsRef().max_parser_depth; - ASTPtr local_create = parseQuery(parser, local_metadata, size, depth); - ASTPtr zk_create = parseQuery(parser, in_zk->second, size, depth); - if (local_create->as()->uuid == zk_create->as()->uuid) - { - /// For ReplicatedMergeTree tables we can compare only UUIDs to ensure that it's the same table. - /// Metadata can be different, it's handled on table replication level. - /// TODO maybe we should also compare MergeTree SETTINGS? 
- should_detach = false; - } + if (name != it->second) + replicated_tables_to_rename.emplace_back(name, it->second); + continue; } + } - if (should_detach) + auto in_zk = table_name_to_metadata.find(name); + if (in_zk == table_name_to_metadata.end() || in_zk->second != readMetadataFile(name)) + { tables_to_detach.emplace_back(std::move(name)); } - existing_tables_it->next(); - ++total_tables; } - existing_tables_it.reset(); String db_name = getDatabaseName(); String to_db_name = getDatabaseName() + BROKEN_TABLES_SUFFIX; @@ -375,17 +413,18 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep if (getDatabaseName() != db_name) throw Exception(ErrorCodes::UNKNOWN_DATABASE, "Database was renamed, will retry"); + auto table = tryGetTable(table_name, global_context); if (isDictionaryExist(table_name)) { LOG_DEBUG(log, "Will DROP DICTIONARY {}", backQuoteIfNeed(table_name)); DatabaseAtomic::removeDictionary(global_context, table_name); ++dropped_dicts; } - else if (!tryGetTable(table_name, global_context)->storesDataOnDisk()) + else if (!table->storesDataOnDisk()) { LOG_DEBUG(log, "Will DROP TABLE {}, because it does not store data on disk and can be safely dropped", backQuoteIfNeed(table_name)); dropped_tables.push_back(tryGetTableUUID(table_name)); - tryGetTable(table_name, global_context)->shutdown(); + table->shutdown(); DatabaseAtomic::dropTable(global_context, table_name, true); } else @@ -401,6 +440,20 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep LOG_WARNING(log, "Cleaned {} outdated objects: dropped {} dictionaries and {} tables, moved {} tables", tables_to_detach.size(), dropped_dicts, dropped_tables.size(), moved_tables); + /// Now database is cleared from outdated tables, let's rename ReplicatedMergeTree tables to actual names + for (const auto & old_to_new : replicated_tables_to_rename) + { + const String & from = old_to_new.first; + const String & to = old_to_new.second; + + LOG_DEBUG(log, "Will RENAME TABLE {} TO {}", backQuoteIfNeed(from), backQuoteIfNeed(to)); + /// TODO Maybe we should do it in two steps: rename all tables to temporary names and then rename them to actual names? 
+ DDLGuardPtr table_guard = DatabaseCatalog::instance().getDDLGuard(db_name, std::min(from, to)); + DDLGuardPtr to_table_guard = DatabaseCatalog::instance().getDDLGuard(db_name, std::max(from, to)); + DatabaseAtomic::renameTable(global_context, from, *this, to, false, false); + } + + for (const auto & id : dropped_tables) DatabaseCatalog::instance().waitTableFinallyDropped(id); diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index 2c998a8bc97..43a6ce15376 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -84,6 +84,7 @@ public: friend struct DatabaseReplicatedTask; friend class DatabaseReplicatedDDLWorker; private: + void tryConnectToZooKeeper(bool force_attach); bool createDatabaseNodesInZooKeeper(const ZooKeeperPtr & current_zookeeper); void createReplicaNodesInZooKeeper(const ZooKeeperPtr & current_zookeeper); @@ -100,6 +101,7 @@ private: zkutil::ZooKeeperPtr getZooKeeper() const; + std::atomic_bool is_readonly = true; std::unique_ptr ddl_worker; }; diff --git a/src/Databases/DatabaseReplicatedWorker.cpp b/src/Databases/DatabaseReplicatedWorker.cpp index 521ba5b7cb2..8751c125383 100644 --- a/src/Databases/DatabaseReplicatedWorker.cpp +++ b/src/Databases/DatabaseReplicatedWorker.cpp @@ -29,6 +29,8 @@ void DatabaseReplicatedDDLWorker::initializeMainThread() try { auto zookeeper = getAndSetZooKeeper(); + if (database->is_readonly) + database->tryConnectToZooKeeper(false); initializeReplication(); initialized = true; return; diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index 1f4c7932329..ac365dbb8d4 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -211,7 +211,7 @@ void DDLWorker::shutdown() DDLWorker::~DDLWorker() { - shutdown(); + DDLWorker::shutdown(); } diff --git a/tests/integration/test_replicated_database/test.py b/tests/integration/test_replicated_database/test.py index faeb436f279..0db6884fbb7 100644 --- a/tests/integration/test_replicated_database/test.py +++ b/tests/integration/test_replicated_database/test.py @@ -196,14 +196,16 @@ def test_recover_staled_replica(started_cluster): dummy_node.query("CREATE TABLE recover.mt2 (n int) ENGINE=MergeTree order by n", settings=settings) main_node.query("CREATE TABLE recover.rmt1 (n int) ENGINE=ReplicatedMergeTree order by n", settings=settings) dummy_node.query("CREATE TABLE recover.rmt2 (n int) ENGINE=ReplicatedMergeTree order by n", settings=settings) + main_node.query("CREATE TABLE recover.rmt3 (n int) ENGINE=ReplicatedMergeTree order by n", settings=settings) + dummy_node.query("CREATE TABLE recover.rmt5 (n int) ENGINE=ReplicatedMergeTree order by n", settings=settings) main_node.query("CREATE DICTIONARY recover.d1 (n int DEFAULT 0, m int DEFAULT 1) PRIMARY KEY n SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'default' TABLE 'rmt1' PASSWORD '' DB 'recover')) LIFETIME(MIN 1 MAX 10) LAYOUT(FLAT())") dummy_node.query("CREATE DICTIONARY recover.d2 (n int DEFAULT 0, m int DEFAULT 1) PRIMARY KEY n SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'default' TABLE 'rmt2' PASSWORD '' DB 'recover')) LIFETIME(MIN 1 MAX 10) LAYOUT(FLAT())") - for table in ['t1', 't2', 'mt1', 'mt2', 'rmt1', 'rmt2']: + for table in ['t1', 't2', 'mt1', 'mt2', 'rmt1', 'rmt2', 'rmt3', 'rmt5']: main_node.query("INSERT INTO recover.{} VALUES (42)".format(table)) for table in ['t1', 't2', 'mt1', 'mt2']: dummy_node.query("INSERT INTO recover.{} VALUES (42)".format(table)) - for table in ['rmt1', 'rmt2']: + for table in 
['rmt1', 'rmt2', 'rmt3', 'rmt5']: main_node.query("SYSTEM SYNC REPLICA recover.{}".format(table)) with PartitionManager() as pm: @@ -212,6 +214,8 @@ def test_recover_staled_replica(started_cluster): main_node.query("RENAME TABLE recover.t1 TO recover.m1", settings=settings) main_node.query("ALTER TABLE recover.mt1 ADD COLUMN m int", settings=settings) main_node.query("ALTER TABLE recover.rmt1 ADD COLUMN m int", settings=settings) + main_node.query("RENAME TABLE recover.rmt3 TO recover.rmt4", settings=settings) + main_node.query("DROP TABLE recover.rmt5", settings=settings) main_node.query("DROP DICTIONARY recover.d2", settings=settings) main_node.query("CREATE DICTIONARY recover.d2 (n int DEFAULT 0, m int DEFAULT 1) PRIMARY KEY n SOURCE(CLICKHOUSE(HOST 'localhost' PORT 9000 USER 'default' TABLE 'rmt1' PASSWORD '' DB 'recover')) LIFETIME(MIN 1 MAX 10) LAYOUT(FLAT());", settings=settings) @@ -223,25 +227,52 @@ def test_recover_staled_replica(started_cluster): main_node.query("DROP TABLE recover.tmp", settings=settings) main_node.query("CREATE TABLE recover.tmp AS recover.m1", settings=settings) - assert main_node.query("SELECT name FROM system.tables WHERE database='recover' ORDER BY name") == "d1\nd2\nm1\nmt1\nmt2\nrmt1\nrmt2\nt2\ntmp\n" + assert main_node.query("SELECT name FROM system.tables WHERE database='recover' ORDER BY name") == "d1\nd2\nm1\nmt1\nmt2\nrmt1\nrmt2\nrmt4\nt2\ntmp\n" query = "SELECT name, uuid, create_table_query FROM system.tables WHERE database='recover' ORDER BY name" expected = main_node.query(query) assert_eq_with_retry(dummy_node, query, expected) - for table in ['m1', 't2', 'mt1', 'mt2', 'rmt1', 'rmt2', 'd1', 'd2']: + for table in ['m1', 't2', 'mt1', 'mt2', 'rmt1', 'rmt2', 'rmt4', 'd1', 'd2']: assert main_node.query("SELECT (*,).1 FROM recover.{}".format(table)) == "42\n" - for table in ['t2', 'rmt1', 'rmt2', 'd1', 'd2', 'mt2']: + for table in ['t2', 'rmt1', 'rmt2', 'rmt4', 'd1', 'd2', 'mt2']: assert dummy_node.query("SELECT (*,).1 FROM recover.{}".format(table)) == "42\n" for table in ['m1', 'mt1']: assert dummy_node.query("SELECT count() FROM recover.{}".format(table)) == "0\n" - assert dummy_node.query("SELECT count() FROM system.tables WHERE database='recover_broken_tables'") == "1\n" - table = dummy_node.query("SHOW TABLES FROM recover_broken_tables").strip() - assert "mt1_22_" in table + assert dummy_node.query("SELECT count() FROM system.tables WHERE database='recover_broken_tables'") == "2\n" + table = dummy_node.query("SHOW TABLES FROM recover_broken_tables LIKE 'mt1_26_%'").strip() + assert dummy_node.query("SELECT (*,).1 FROM recover_broken_tables.{}".format(table)) == "42\n" + table = dummy_node.query("SHOW TABLES FROM recover_broken_tables LIKE 'rmt5_26_%'").strip() assert dummy_node.query("SELECT (*,).1 FROM recover_broken_tables.{}".format(table)) == "42\n" - expected = "Cleaned 3 outdated objects: dropped 1 dictionaries and 1 tables, moved 1 tables" + expected = "Cleaned 4 outdated objects: dropped 1 dictionaries and 1 tables, moved 2 tables" assert_logs_contain(dummy_node, expected) dummy_node.query("DROP TABLE recover.tmp") + assert_eq_with_retry(main_node, "SELECT count() FROM system.tables WHERE database='recover' AND name='tmp'", "0\n") +def test_startup_without_zk(started_cluster): + main_node.query("DROP DATABASE IF EXISTS testdb SYNC") + main_node.query("DROP DATABASE IF EXISTS recover SYNC") + with PartitionManager() as pm: + pm.drop_instance_zk_connections(main_node) + err = main_node.query_and_get_error("CREATE DATABASE startup ENGINE = 
Replicated('/clickhouse/databases/startup', 'shard1', 'replica1');") + assert "ZooKeeper" in err + main_node.query("CREATE DATABASE startup ENGINE = Replicated('/clickhouse/databases/startup', 'shard1', 'replica1');") + #main_node.query("CREATE TABLE startup.rmt (n int) ENGINE=ReplicatedMergeTree order by n") + main_node.query("CREATE TABLE startup.rmt (n int) ENGINE=MergeTree order by n") + main_node.query("INSERT INTO startup.rmt VALUES (42)") + with PartitionManager() as pm: + pm.drop_instance_zk_connections(main_node) + main_node.restart_clickhouse(stop_start_wait_sec=30) + assert main_node.query("SELECT (*,).1 FROM startup.rmt") == "42\n" + + for _ in range(10): + try: + main_node.query("CREATE TABLE startup.m (n int) ENGINE=Memory") + break + except: + time.sleep(1) + + main_node.query("EXCHANGE TABLES startup.rmt AND startup.m") + assert main_node.query("SELECT (*,).1 FROM startup.m") == "42\n" From 9c7cf9e92e8c75bc670abf070397c3aacbcf3193 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Mon, 15 Feb 2021 13:26:34 +0300 Subject: [PATCH 0288/2357] remove some debug code --- docker/test/stateful/run.sh | 4 +++ docker/test/stateless/run.sh | 4 +++ programs/server/Server.cpp | 4 ++- src/Core/Settings.h | 3 ++ src/Databases/DatabaseReplicated.cpp | 3 +- src/Databases/DatabaseReplicated.h | 1 - src/Databases/DatabaseReplicatedWorker.cpp | 4 +-- src/Interpreters/DDLWorker.cpp | 15 ++++----- src/Interpreters/DDLWorker.h | 5 +-- src/Interpreters/InterpreterCreateQuery.cpp | 21 ++++-------- src/Interpreters/executeDDLQueryOnCluster.cpp | 12 +------ tests/ci/ci_config.json | 24 ++++++++++++++ tests/clickhouse-test | 17 +++++++--- tests/config/install.sh | 3 ++ tests/config/users.d/database_replicated.xml | 10 ++++++ .../test_materialize_mysql_database/test.py | 2 +- .../configs/settings.xml | 12 +++++++ .../test_replicated_database/test.py | 10 +++--- tests/queries/skip_list.json | 33 ++++--------------- 19 files changed, 109 insertions(+), 78 deletions(-) create mode 100644 tests/config/users.d/database_replicated.xml create mode 100644 tests/integration/test_replicated_database/configs/settings.xml diff --git a/docker/test/stateful/run.sh b/docker/test/stateful/run.sh index f2fcefd604f..7779f0e9dc2 100755 --- a/docker/test/stateful/run.sh +++ b/docker/test/stateful/run.sh @@ -60,4 +60,8 @@ fi # more idiologically correct. 
read -ra ADDITIONAL_OPTIONS <<< "${ADDITIONAL_OPTIONS:-}" +if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then + ADDITIONAL_OPTIONS+=('--replicated-database') +fi + clickhouse-test --testname --shard --zookeeper --no-stateless --hung-check --print-time "$SKIP_LIST_OPT" "${ADDITIONAL_OPTIONS[@]}" "$SKIP_TESTS_OPTION" 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee test_output/test_result.txt diff --git a/docker/test/stateless/run.sh b/docker/test/stateless/run.sh index 575be721a54..d078f3739fd 100755 --- a/docker/test/stateless/run.sh +++ b/docker/test/stateless/run.sh @@ -57,6 +57,10 @@ function run_tests() ADDITIONAL_OPTIONS+=('4') fi + if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then + ADDITIONAL_OPTIONS+=('--replicated-database') + fi + clickhouse-test --testname --shard --zookeeper --hung-check --print-time \ --test-runs "$NUM_TRIES" \ "$SKIP_LIST_OPT" "${ADDITIONAL_OPTIONS[@]}" 2>&1 \ diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index 2bb5181d348..400796981d5 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -100,6 +100,7 @@ namespace CurrentMetrics extern const Metric Revision; extern const Metric VersionInteger; extern const Metric MemoryTracking; + extern const Metric MaxDDLEntryID; } @@ -997,7 +998,8 @@ int Server::main(const std::vector & /*args*/) int pool_size = config().getInt("distributed_ddl.pool_size", 1); if (pool_size < 1) throw Exception("distributed_ddl.pool_size should be greater then 0", ErrorCodes::ARGUMENT_OUT_OF_BOUND); - global_context->setDDLWorker(std::make_unique(pool_size, ddl_zookeeper_path, *global_context, &config(), "distributed_ddl")); + global_context->setDDLWorker(std::make_unique(pool_size, ddl_zookeeper_path, *global_context, &config(), + "distributed_ddl", "DDLWorker", &CurrentMetrics::MaxDDLEntryID)); } std::unique_ptr dns_cache_updater; diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 96571cedd3f..ba4fcdda48c 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -420,6 +420,9 @@ class IColumn; M(Bool, optimize_rewrite_sum_if_to_count_if, true, "Rewrite sumIf() and sum(if()) function countIf() function when logically equivalent", 0) \ M(UInt64, insert_shard_id, 0, "If non zero, when insert into a distributed table, the data will be inserted into the shard `insert_shard_id` synchronously. Possible values range from 1 to `shards_number` of corresponding distributed table", 0) \ M(Bool, allow_experimental_query_deduplication, false, "Allow sending parts' UUIDs for a query in order to deduplicate data parts if any", 0) \ + M(Bool, allow_experimental_database_replicated, false, "Allow to create databases with Replicated engine", 0) \ + M(UInt64, database_replicated_initial_query_timeout_sec, 300, "How long initial DDL query should wait for Replicated database to precess previous DDL queue entries", 0) \ + M(Bool, database_replicated_ddl_output, true, "Return table with query execution status as a result of DDL query", 0) \ \ /** Obsolete settings that do nothing but left for compatibility reasons. Remove each one after half a year of obsolescence. 
*/ \ \ diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 24a193d9134..dc1203e8cc9 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -311,7 +311,8 @@ BlockIO DatabaseReplicated::propose(const ASTPtr & query, const Context & query_ Strings hosts_to_wait = getZooKeeper()->getChildren(zookeeper_path + "/replicas"); auto stream = std::make_shared(node_path, entry, query_context, hosts_to_wait); - io.in = std::move(stream); + if (query_context.getSettingsRef().database_replicated_ddl_output) + io.in = std::move(stream); return io; } diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index 43a6ce15376..2ae97b0d82a 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -80,7 +80,6 @@ public: ClusterPtr getCluster() const; - //FIXME friend struct DatabaseReplicatedTask; friend class DatabaseReplicatedDDLWorker; private: diff --git a/src/Databases/DatabaseReplicatedWorker.cpp b/src/Databases/DatabaseReplicatedWorker.cpp index 8751c125383..ff15878b136 100644 --- a/src/Databases/DatabaseReplicatedWorker.cpp +++ b/src/Databases/DatabaseReplicatedWorker.cpp @@ -89,7 +89,7 @@ String DatabaseReplicatedDDLWorker::enqueueQuery(DDLLogEntry & entry) return node_path; } -String DatabaseReplicatedDDLWorker::tryEnqueueAndExecuteEntry(DDLLogEntry & entry, const Context & /*query_context*/) +String DatabaseReplicatedDDLWorker::tryEnqueueAndExecuteEntry(DDLLogEntry & entry, const Context & query_context) { /// NOTE Possibly it would be better to execute initial query on the most up-to-date node, /// but it requires more complex logic around /try node. @@ -114,7 +114,7 @@ String DatabaseReplicatedDDLWorker::tryEnqueueAndExecuteEntry(DDLLogEntry & entr task->is_initial_query = true; LOG_DEBUG(log, "Waiting for worker thread to process all entries before {}", entry_name); - UInt64 timeout = 600; + UInt64 timeout = query_context.getSettingsRef().database_replicated_initial_query_timeout_sec; { std::unique_lock lock{mutex}; bool processed = wait_current_task_change.wait_for(lock, std::chrono::seconds(timeout), [&]() diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index ac365dbb8d4..f08f47b1c0e 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -32,10 +32,6 @@ namespace fs = std::filesystem; -namespace CurrentMetrics -{ - extern const Metric MaxDDLEntryID; -} namespace DB { @@ -152,12 +148,14 @@ std::unique_ptr createSimpleZooKeeperLock( DDLWorker::DDLWorker(int pool_size_, const std::string & zk_root_dir, const Context & context_, const Poco::Util::AbstractConfiguration * config, const String & prefix, - const String & logger_name) + const String & logger_name, const CurrentMetrics::Metric * max_entry_metric_) : context(context_) , log(&Poco::Logger::get(logger_name)) , pool_size(pool_size_) + , max_entry_metric(max_entry_metric_) { - CurrentMetrics::set(CurrentMetrics::MaxDDLEntryID, 0); + if (max_entry_metric) + CurrentMetrics::set(*max_entry_metric, 0); if (1 < pool_size) { @@ -456,7 +454,8 @@ void DDLWorker::updateMaxDDLEntryID(const String & entry_name) { if (max_id.compare_exchange_weak(prev_id, id)) { - CurrentMetrics::set(CurrentMetrics::MaxDDLEntryID, id); + if (max_entry_metric) + CurrentMetrics::set(*max_entry_metric, id); break; } } @@ -596,7 +595,7 @@ void DDLWorker::processTask(DDLTaskBase & task) } -bool DDLWorker::taskShouldBeExecutedOnLeader(const ASTPtr ast_ddl, const StoragePtr 
storage) +bool DDLWorker::taskShouldBeExecutedOnLeader(const ASTPtr & ast_ddl, const StoragePtr storage) { /// Pure DROP queries have to be executed on each node separately if (auto * query = ast_ddl->as(); query && query->kind != ASTDropQuery::Kind::Truncate) diff --git a/src/Interpreters/DDLWorker.h b/src/Interpreters/DDLWorker.h index 03c80e3f669..0985884eef7 100644 --- a/src/Interpreters/DDLWorker.h +++ b/src/Interpreters/DDLWorker.h @@ -43,7 +43,7 @@ class DDLWorker { public: DDLWorker(int pool_size_, const std::string & zk_root_dir, const Context & context_, const Poco::Util::AbstractConfiguration * config, const String & prefix, - const String & logger_name = "DDLWorker"); + const String & logger_name = "DDLWorker", const CurrentMetrics::Metric * max_entry_metric_ = nullptr); virtual ~DDLWorker(); /// Pushes query into DDL queue, returns path to created node @@ -81,7 +81,7 @@ protected: void updateMaxDDLEntryID(const String & entry_name); /// Check that query should be executed on leader replica only - static bool taskShouldBeExecutedOnLeader(const ASTPtr ast_ddl, StoragePtr storage); + static bool taskShouldBeExecutedOnLeader(const ASTPtr & ast_ddl, StoragePtr storage); /// Executes query only on leader replica in case of replicated table. /// Queries like TRUNCATE/ALTER .../OPTIMIZE have to be executed only on one node of shard. @@ -144,6 +144,7 @@ protected: size_t max_tasks_in_queue = 1000; std::atomic max_id = 0; + const CurrentMetrics::Metric * max_entry_metric; }; diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index bbe8526ae5b..2021c1f1d60 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -138,20 +138,7 @@ BlockIO InterpreterCreateQuery::createDatabase(ASTCreateQuery & create) bool old_style_database = context.getSettingsRef().default_database_engine.value == DefaultDatabaseEngine::Ordinary; auto engine = std::make_shared(); auto storage = std::make_shared(); - - //FIXME revert it before merge - engine->name = "Atomic"; - if (old_style_database) - { - if (database_name == "test") - engine->name = "Ordinary"; // for stateful tests - else - engine = makeASTFunction("Replicated", - std::make_shared(fmt::format("/clickhouse/db/{}/", create.database)), - std::make_shared("s1"), - std::make_shared("r" + toString(getpid()))); - } - + engine->name = old_style_database ? "Ordinary" : "Atomic"; engine->no_empty_args = true; storage->set(storage->engine, engine); create.set(create.storage, storage); @@ -221,6 +208,12 @@ BlockIO InterpreterCreateQuery::createDatabase(ASTCreateQuery & create) "Enable allow_experimental_database_materialize_mysql to use it.", ErrorCodes::UNKNOWN_DATABASE_ENGINE); } + if (create.storage->engine->name == "Replicated" && !context.getSettingsRef().allow_experimental_database_replicated && !internal) + { + throw Exception("Replicated is an experimental database engine. 
" + "Enable allow_experimental_database_replicated to use it.", ErrorCodes::UNKNOWN_DATABASE_ENGINE); + } + DatabasePtr database = DatabaseFactory::get(create, metadata_path / "", context); if (create.uuid != UUIDHelpers::Nil) diff --git a/src/Interpreters/executeDDLQueryOnCluster.cpp b/src/Interpreters/executeDDLQueryOnCluster.cpp index 2774f78663e..1937fbaf905 100644 --- a/src/Interpreters/executeDDLQueryOnCluster.cpp +++ b/src/Interpreters/executeDDLQueryOnCluster.cpp @@ -205,10 +205,6 @@ DDLQueryStatusInputStream::DDLQueryStatusInputStream(const String & zk_node_path addTotalRowsApprox(waiting_hosts.size()); timeout_seconds = context.getSettingsRef().distributed_ddl_task_timeout; - - //FIXME revert it before merge - if (context.getSettingsRef().default_database_engine.value == DefaultDatabaseEngine::Ordinary) - timeout_seconds = 10; } Block DDLQueryStatusInputStream::readImpl() @@ -252,7 +248,6 @@ Block DDLQueryStatusInputStream::readImpl() sleepForMilliseconds(std::min(1000, 50 * (try_number + 1))); } - /// TODO: add shared lock if (!zookeeper->exists(node_path)) { throw Exception(ErrorCodes::UNFINISHED, @@ -301,12 +296,7 @@ Block DDLQueryStatusInputStream::readImpl() res = sample.cloneWithColumns(std::move(columns)); } - //FIXME revert it before merge - bool is_functional_tests = !by_hostname && context.getSettingsRef().default_database_engine.value == DefaultDatabaseEngine::Ordinary; - if (is_functional_tests) - return {}; - else - return res; + return res; } Strings DDLQueryStatusInputStream::getChildrenAllowNoNode(const std::shared_ptr & zookeeper, const String & node_path) diff --git a/tests/ci/ci_config.json b/tests/ci/ci_config.json index 44b35d61601..0e467319285 100644 --- a/tests/ci/ci_config.json +++ b/tests/ci/ci_config.json @@ -261,6 +261,18 @@ "with_coverage": false } }, + "Functional stateful tests (release, DatabaseReplicated)": { + "required_build_properties": { + "compiler": "clang-11", + "package_type": "deb", + "build_type": "relwithdebuginfo", + "sanitizer": "none", + "bundled": "bundled", + "splitted": "unsplitted", + "clang-tidy": "disable", + "with_coverage": false + } + }, "Functional stateless tests (address)": { "required_build_properties": { "compiler": "clang-11", @@ -381,6 +393,18 @@ "with_coverage": false } }, + "Functional stateless tests (release, DatabaseReplicated)": { + "required_build_properties": { + "compiler": "clang-11", + "package_type": "deb", + "build_type": "relwithdebuginfo", + "sanitizer": "none", + "bundled": "bundled", + "splitted": "unsplitted", + "clang-tidy": "disable", + "with_coverage": false + } + }, "Stress test (address)": { "required_build_properties": { "compiler": "clang-11", diff --git a/tests/clickhouse-test b/tests/clickhouse-test index b2f3f73b6c0..64a93416c41 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -105,7 +105,9 @@ def remove_control_characters(s): s = re.sub(r"[\x00-\x08\x0b\x0e-\x1f\x7f]", "", s) return s -def get_db_engine(args): +def get_db_engine(args, database_name): + if args.replicated_database: + return " ENGINE=Replicated('/test/clickhouse/db/{}', 's1', 'r1')".format(database_name) if args.db_engine: return " ENGINE=" + args.db_engine return "" # Will use default engine @@ -128,7 +130,7 @@ def run_single_test(args, ext, server_logs_level, client_options, case_file, std clickhouse_proc_create = Popen(shlex.split(args.client), stdin=PIPE, stdout=PIPE, stderr=PIPE, universal_newlines=True) try: - clickhouse_proc_create.communicate(("CREATE DATABASE " + database + get_db_engine(args)), 
timeout=args.timeout) + clickhouse_proc_create.communicate(("CREATE DATABASE " + database + get_db_engine(args, database)), timeout=args.timeout) except TimeoutExpired: total_time = (datetime.now() - start_time).total_seconds() return clickhouse_proc_create, "", "Timeout creating database {} before test".format(database), total_time @@ -532,6 +534,8 @@ class BuildFlags(): RELEASE = 'release-build' DATABASE_ORDINARY = 'database-ordinary' POLYMORPHIC_PARTS = 'polymorphic-parts' + ANTLR = 'antlr' + DATABASE_REPLICATED = 'database-replicated' def collect_build_flags(client): @@ -613,7 +617,9 @@ def main(args): build_flags = collect_build_flags(args.client) if args.antlr: - build_flags.append('antlr') + build_flags.append(BuildFlags.ANTLR) + if args.replicated_database: + build_flags.append(BuildFlags.DATABASE_REPLICATED) if args.use_skip_list: tests_to_skip_from_list = collect_tests_to_skip(args.skip_list_path, build_flags) @@ -666,10 +672,10 @@ def main(args): if args.database and args.database != "test": clickhouse_proc_create = Popen(shlex.split(args.client), stdin=PIPE, stdout=PIPE, stderr=PIPE, universal_newlines=True) - clickhouse_proc_create.communicate(("CREATE DATABASE IF NOT EXISTS " + args.database + get_db_engine(args))) + clickhouse_proc_create.communicate(("CREATE DATABASE IF NOT EXISTS " + args.database + get_db_engine(args, args.database))) clickhouse_proc_create = Popen(shlex.split(args.client), stdin=PIPE, stdout=PIPE, stderr=PIPE, universal_newlines=True) - clickhouse_proc_create.communicate(("CREATE DATABASE IF NOT EXISTS test" + get_db_engine(args))) + clickhouse_proc_create.communicate(("CREATE DATABASE IF NOT EXISTS test" + get_db_engine(args, 'test'))) def is_test_from_dir(suite_dir, case): case_file = os.path.join(suite_dir, case) @@ -923,6 +929,7 @@ if __name__ == '__main__': parser.add_argument('--skip-list-path', help="Path to skip-list file") parser.add_argument('--use-skip-list', action='store_true', default=False, help="Use skip list to skip tests if found") parser.add_argument('--db-engine', help='Database engine name') + parser.add_argument('--replicated-database', action='store_true', default=False, help='Run tests with Replicated database engine') parser.add_argument('--antlr', action='store_true', default=False, dest='antlr', help='Use new ANTLR parser in tests') parser.add_argument('--no-stateless', action='store_true', help='Disable all stateless tests') diff --git a/tests/config/install.sh b/tests/config/install.sh index 9965e1fb1ad..de6ba2a7a09 100755 --- a/tests/config/install.sh +++ b/tests/config/install.sh @@ -61,5 +61,8 @@ fi if [[ -n "$USE_DATABASE_ORDINARY" ]] && [[ "$USE_DATABASE_ORDINARY" -eq 1 ]]; then ln -sf $SRC_PATH/users.d/database_ordinary.xml $DEST_SERVER_PATH/users.d/ fi +if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then + ln -sf $SRC_PATH/users.d/database_replicated.xml $DEST_SERVER_PATH/users.d/ +fi ln -sf $SRC_PATH/client_config.xml $DEST_CLIENT_PATH/config.xml diff --git a/tests/config/users.d/database_replicated.xml b/tests/config/users.d/database_replicated.xml new file mode 100644 index 00000000000..23801d00154 --- /dev/null +++ b/tests/config/users.d/database_replicated.xml @@ -0,0 +1,10 @@ + + + + 1 + 0 + 30 + 30 + + + diff --git a/tests/integration/test_materialize_mysql_database/test.py b/tests/integration/test_materialize_mysql_database/test.py index 0175ec78587..e55772d9e1d 100644 --- a/tests/integration/test_materialize_mysql_database/test.py +++ 
b/tests/integration/test_materialize_mysql_database/test.py @@ -14,7 +14,7 @@ DOCKER_COMPOSE_PATH = get_docker_compose_path() cluster = ClickHouseCluster(__file__) -node_db_ordinary = cluster.add_instance('node1', user_configs=["configs/users.xml"], with_mysql=False, stay_alive=True, with_zookeeper=True) #FIXME +node_db_ordinary = cluster.add_instance('node1', user_configs=["configs/users.xml"], with_mysql=False, stay_alive=True) node_db_atomic = cluster.add_instance('node2', user_configs=["configs/users_db_atomic.xml"], with_mysql=False, stay_alive=True) diff --git a/tests/integration/test_replicated_database/configs/settings.xml b/tests/integration/test_replicated_database/configs/settings.xml new file mode 100644 index 00000000000..e0f7e8691e6 --- /dev/null +++ b/tests/integration/test_replicated_database/configs/settings.xml @@ -0,0 +1,12 @@ + + + + 1 + + + + + default + + + diff --git a/tests/integration/test_replicated_database/test.py b/tests/integration/test_replicated_database/test.py index 0db6884fbb7..99e7d6077f8 100644 --- a/tests/integration/test_replicated_database/test.py +++ b/tests/integration/test_replicated_database/test.py @@ -8,11 +8,11 @@ from helpers.network import PartitionManager cluster = ClickHouseCluster(__file__) -main_node = cluster.add_instance('main_node', main_configs=['configs/config.xml'], with_zookeeper=True, stay_alive=True, macros={"shard": 1, "replica": 1}) -dummy_node = cluster.add_instance('dummy_node', main_configs=['configs/config.xml'], with_zookeeper=True, stay_alive=True, macros={"shard": 1, "replica": 2}) -competing_node = cluster.add_instance('competing_node', main_configs=['configs/config.xml'], with_zookeeper=True, macros={"shard": 1, "replica": 3}) -snapshotting_node = cluster.add_instance('snapshotting_node', main_configs=['configs/config.xml'], with_zookeeper=True, macros={"shard": 2, "replica": 1}) -snapshot_recovering_node = cluster.add_instance('snapshot_recovering_node', main_configs=['configs/config.xml'], with_zookeeper=True, macros={"shard": 2, "replica": 2}) +main_node = cluster.add_instance('main_node', main_configs=['configs/config.xml'], user_configs=['configs/settings.xml'], with_zookeeper=True, stay_alive=True, macros={"shard": 1, "replica": 1}) +dummy_node = cluster.add_instance('dummy_node', main_configs=['configs/config.xml'], user_configs=['configs/settings.xml'], with_zookeeper=True, stay_alive=True, macros={"shard": 1, "replica": 2}) +competing_node = cluster.add_instance('competing_node', main_configs=['configs/config.xml'], user_configs=['configs/settings.xml'], with_zookeeper=True, macros={"shard": 1, "replica": 3}) +snapshotting_node = cluster.add_instance('snapshotting_node', main_configs=['configs/config.xml'], user_configs=['configs/settings.xml'], with_zookeeper=True, macros={"shard": 2, "replica": 1}) +snapshot_recovering_node = cluster.add_instance('snapshot_recovering_node', main_configs=['configs/config.xml'], user_configs=['configs/settings.xml'], with_zookeeper=True, macros={"shard": 2, "replica": 2}) all_nodes = [main_node, dummy_node, competing_node, snapshotting_node, snapshot_recovering_node] diff --git a/tests/queries/skip_list.json b/tests/queries/skip_list.json index 2317cdcecac..db7b0631b97 100644 --- a/tests/queries/skip_list.json +++ b/tests/queries/skip_list.json @@ -100,10 +100,15 @@ "00604_show_create_database", "00609_mv_index_in_in", "00510_materizlized_view_and_deduplication_zookeeper", - "memory_tracking", /// FIXME remove it before merge + "00738_lock_for_inner_table" + ], + 
"database-replicated": [ "memory_tracking", "memory_usage", + "live_view", "01188_attach_table_from_pat", + "01415_sticking_mutations", + "01130_in_memory_parts", "01110_dictionary_layout_without_arguments", "01018_ddl_dictionaries_create", "01018_ddl_dictionaries_select", @@ -167,7 +172,6 @@ "01493_alter_remove_properties_zookeeper", "01475_read_subcolumns_storages", "01475_read_subcolumns", - "01463_test_alter_live_view_refresh", "01451_replicated_detach_drop_part", "01451_detach_drop_part", "01440_big_int_exotic_casts", @@ -180,9 +184,6 @@ "01355_alter_column_with_order", "01291_geo_types", "01270_optimize_skip_unused_shards_low_cardinality", - "01237_live_view_over_distributed_with_subquery_select_table_alias", - "01236_distributed_over_live_view_over_distributed", - "01235_live_view_over_distributed", "01182_materialized_view_different_structure", "01150_ddl_guard_rwr", "01148_zookeeper_path_macros_unfolding", @@ -194,7 +195,6 @@ "01073_attach_if_not_exists", "01072_optimize_skip_unused_shards_const_expr_eval", "01071_prohibition_secondary_index_with_old_format_merge_tree", - "01071_live_view_detach_dependency", "01062_alter_on_mutataion_zookeeper", "01060_shutdown_table_after_detach", "01056_create_table_as", @@ -207,27 +207,6 @@ "00989_parallel_parts_loading", "00980_zookeeper_merge_tree_alter_settings", "00980_merge_alter_settings", - "00980_create_temporary_live_view", - "00978_live_view_watch", - "00977_live_view_watch_events", - "00976_live_view_select_version", - "00975_live_view_create", - "00974_live_view_select_with_aggregation", - "00973_live_view_with_subquery_select_with_aggregation_in_subquery", - "00973_live_view_with_subquery_select_with_aggregation", - "00973_live_view_with_subquery_select_table_alias", - "00973_live_view_with_subquery_select_nested_with_aggregation_table_alias", - "00973_live_view_with_subquery_select_nested_with_aggregation", - "00973_live_view_with_subquery_select_nested", - "00973_live_view_with_subquery_select_join_no_alias", - "00973_live_view_with_subquery_select_join", - "00973_live_view_with_subquery_select", - "00973_live_view_select_prewhere", - "00973_live_view_select", - "00972_live_view_select_1", - "00969_live_view_watch_format_jsoneachrowwithprogress", - "00968_live_view_select_format_jsoneachrowwithprogress", - "00961_temporary_live_view_watch", "00955_test_final_mark", "00933_reserved_word", "00926_zookeeper_adaptive_index_granularity_replicated_merge_tree", From df09a5cac91f216ea360dc95b0afdc50b3e6fe44 Mon Sep 17 00:00:00 2001 From: Pavel Kruglov Date: Mon, 15 Feb 2021 16:21:36 +0300 Subject: [PATCH 0289/2357] Refactor 3.0 and fix tests --- src/Client/Connection.cpp | 305 ++++++++------- src/Client/ConnectionPoolWithFailover.cpp | 32 +- src/Client/ConnectionPoolWithFailover.h | 3 - src/Client/HedgedConnections.cpp | 211 +++++----- src/Client/HedgedConnections.h | 60 +-- src/Client/HedgedConnectionsFactory.cpp | 370 +++++++++--------- src/Client/HedgedConnectionsFactory.h | 73 ++-- src/Common/Epoll.cpp | 11 +- src/Common/Epoll.h | 9 +- src/Common/TimerDescriptor.cpp | 8 +- src/Common/TimerDescriptor.h | 2 +- .../configs/remote_servers.xml | 4 + .../integration/test_hedged_requests/test.py | 253 ++++++++++-- .../configs/remote_servers.xml | 4 + .../configs/users1.xml | 1 - .../test_hedged_requests_parallel/test.py | 113 ++++-- 16 files changed, 882 insertions(+), 577 deletions(-) diff --git a/src/Client/Connection.cpp b/src/Client/Connection.cpp index 2820046782f..5ef326acb73 100644 --- a/src/Client/Connection.cpp +++ 
b/src/Client/Connection.cpp @@ -59,19 +59,87 @@ namespace ErrorCodes void Connection::connect(const ConnectionTimeouts & timeouts) +{ + if (connected) + disconnect(); + + prepare(timeouts); + sendHello(); + receiveHello(); + + LOG_TRACE(log_wrapper.get(), "Connected to {} server version {}.{}.{}.", + server_name, server_version_major, server_version_minor, server_version_patch); +} + + +void Connection::disconnect() +{ + maybe_compressed_out = nullptr; + in = nullptr; + last_input_packet_type.reset(); + out = nullptr; // can write to socket + if (socket) + socket->close(); + socket = nullptr; + connected = false; +} + +void Connection::prepare(const ConnectionTimeouts & timeouts) { try { - if (connected) - disconnect(); + LOG_TRACE( + log_wrapper.get(), + "Connecting. Database: {}. User: {}{}{}", + default_database.empty() ? "(not specified)" : default_database, + user, + static_cast(secure) ? ". Secure" : "", + static_cast(compression) ? "" : ". Uncompressed"); - prepare(timeouts); + if (static_cast(secure)) + { +#if USE_SSL + socket = std::make_unique(); - sendHello(); - receiveHello(); + /// we resolve the ip when we open SecureStreamSocket, so to make Server Name Indication (SNI) + /// work we need to pass host name separately. It will be send into TLS Hello packet to let + /// the server know which host we want to talk with (single IP can process requests for multiple hosts using SNI). + static_cast(socket.get())->setPeerHostName(host); +#else + throw Exception{ + "tcp_secure protocol is disabled because poco library was built without NetSSL support.", ErrorCodes::SUPPORT_IS_DISABLED}; +#endif + } + else + { + socket = std::make_unique(); + } - LOG_TRACE(log_wrapper.get(), "Connected to {} server version {}.{}.{}.", - server_name, server_version_major, server_version_minor, server_version_patch); + current_resolved_address = DNSResolver::instance().resolveAddress(host, port); + + const auto & connection_timeout = static_cast(secure) ? timeouts.secure_connection_timeout : timeouts.connection_timeout; + socket->connect(*current_resolved_address, connection_timeout); + socket->setReceiveTimeout(timeouts.receive_timeout); + socket->setSendTimeout(timeouts.send_timeout); + socket->setNoDelay(true); + if (timeouts.tcp_keep_alive_timeout.totalSeconds()) + { + socket->setKeepAlive(true); + socket->setOption( + IPPROTO_TCP, +#if defined(TCP_KEEPALIVE) + TCP_KEEPALIVE +#else + TCP_KEEPIDLE // __APPLE__ +#endif + , + timeouts.tcp_keep_alive_timeout); + } + + in = std::make_shared(*socket); + out = std::make_shared(*socket); + + connected = true; } catch (Poco::Net::NetException & e) { @@ -90,73 +158,11 @@ void Connection::connect(const ConnectionTimeouts & timeouts) } -void Connection::disconnect() -{ - maybe_compressed_out = nullptr; - in = nullptr; - last_input_packet_type.reset(); - out = nullptr; // can write to socket - if (socket) - socket->close(); - socket = nullptr; - connected = false; -} - -void Connection::prepare(const ConnectionTimeouts & timeouts) -{ - LOG_TRACE(log_wrapper.get(), "Connecting. Database: {}. User: {}{}{}", - default_database.empty() ? "(not specified)" : default_database, - user, - static_cast(secure) ? ". Secure" : "", - static_cast(compression) ? "" : ". Uncompressed"); - - if (static_cast(secure)) - { -#if USE_SSL - socket = std::make_unique(); - - /// we resolve the ip when we open SecureStreamSocket, so to make Server Name Indication (SNI) - /// work we need to pass host name separately. 
It will be send into TLS Hello packet to let - /// the server know which host we want to talk with (single IP can process requests for multiple hosts using SNI). - static_cast(socket.get())->setPeerHostName(host); -#else - throw Exception{"tcp_secure protocol is disabled because poco library was built without NetSSL support.", ErrorCodes::SUPPORT_IS_DISABLED}; -#endif - } - else - { - socket = std::make_unique(); - } - - current_resolved_address = DNSResolver::instance().resolveAddress(host, port); - - const auto & connection_timeout = static_cast(secure) ? timeouts.secure_connection_timeout : timeouts.connection_timeout; - socket->connect(*current_resolved_address, connection_timeout); - socket->setReceiveTimeout(timeouts.receive_timeout); - socket->setSendTimeout(timeouts.send_timeout); - socket->setNoDelay(true); - if (timeouts.tcp_keep_alive_timeout.totalSeconds()) - { - socket->setKeepAlive(true); - socket->setOption(IPPROTO_TCP, -#if defined(TCP_KEEPALIVE) - TCP_KEEPALIVE -#else - TCP_KEEPIDLE // __APPLE__ -#endif - , timeouts.tcp_keep_alive_timeout); - } - - in = std::make_shared(*socket); - out = std::make_shared(*socket); - - connected = true; -} - - void Connection::sendHello() { - /** Disallow control characters in user controlled parameters + try + { + /** Disallow control characters in user controlled parameters * to mitigate the possibility of SSRF. * The user may do server side requests with 'remote' table function. * Malicious user with full r/w access to ClickHouse @@ -165,85 +171,116 @@ void Connection::sendHello() * Limiting number of possible characters in user-controlled part of handshake * will mitigate this possibility but doesn't solve it completely. */ - auto has_control_character = [](const std::string & s) - { - for (auto c : s) - if (isControlASCII(c)) - return true; - return false; - }; + auto has_control_character = [](const std::string & s) { + for (auto c : s) + if (isControlASCII(c)) + return true; + return false; + }; - if (has_control_character(default_database) - || has_control_character(user) - || has_control_character(password)) - throw Exception("Parameters 'default_database', 'user' and 'password' must not contain ASCII control characters", ErrorCodes::BAD_ARGUMENTS); + if (has_control_character(default_database) || has_control_character(user) || has_control_character(password)) + throw Exception( + "Parameters 'default_database', 'user' and 'password' must not contain ASCII control characters", + ErrorCodes::BAD_ARGUMENTS); - writeVarUInt(Protocol::Client::Hello, *out); - writeStringBinary((DBMS_NAME " ") + client_name, *out); - writeVarUInt(DBMS_VERSION_MAJOR, *out); - writeVarUInt(DBMS_VERSION_MINOR, *out); - // NOTE For backward compatibility of the protocol, client cannot send its version_patch. - writeVarUInt(DBMS_TCP_PROTOCOL_VERSION, *out); - writeStringBinary(default_database, *out); - /// If interserver-secret is used, one do not need password - /// (NOTE we do not check for DBMS_MIN_REVISION_WITH_INTERSERVER_SECRET, since we cannot ignore inter-server secret if it was requested) - if (!cluster_secret.empty()) - { - writeStringBinary(USER_INTERSERVER_MARKER, *out); - writeStringBinary("" /* password */, *out); + writeVarUInt(Protocol::Client::Hello, *out); + writeStringBinary((DBMS_NAME " ") + client_name, *out); + writeVarUInt(DBMS_VERSION_MAJOR, *out); + writeVarUInt(DBMS_VERSION_MINOR, *out); + // NOTE For backward compatibility of the protocol, client cannot send its version_patch. 
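The writeVarUInt / writeStringBinary calls that follow frame the hello packet as a sequence of varints and length-prefixed strings; the varint form is assumed to be the usual 7-bit little-endian encoding with the high bit as a continuation flag. A minimal standalone sketch of that encoding, for orientation only (not the actual ClickHouse implementation):

#include <cstdint>
#include <vector>

/// Hedged sketch: encode an unsigned integer the way writeVarUInt is assumed to work,
/// seven payload bits per byte, most significant bit set while more bytes follow.
std::vector<uint8_t> encodeVarUInt(uint64_t value)
{
    std::vector<uint8_t> bytes;
    do
    {
        uint8_t byte = value & 0x7F;
        value >>= 7;
        if (value)
            byte |= 0x80;
        bytes.push_back(byte);
    } while (value);
    return bytes;
}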
+ writeVarUInt(DBMS_TCP_PROTOCOL_VERSION, *out); + writeStringBinary(default_database, *out); + /// If interserver-secret is used, one do not need password + /// (NOTE we do not check for DBMS_MIN_REVISION_WITH_INTERSERVER_SECRET, since we cannot ignore inter-server secret if it was requested) + if (!cluster_secret.empty()) + { + writeStringBinary(USER_INTERSERVER_MARKER, *out); + writeStringBinary("" /* password */, *out); #if USE_SSL - sendClusterNameAndSalt(); + sendClusterNameAndSalt(); #else - throw Exception( - "Inter-server secret support is disabled, because ClickHouse was built without SSL library", - ErrorCodes::SUPPORT_IS_DISABLED); + throw Exception( + "Inter-server secret support is disabled, because ClickHouse was built without SSL library", + ErrorCodes::SUPPORT_IS_DISABLED); #endif - } - else - { - writeStringBinary(user, *out); - writeStringBinary(password, *out); - } + } + else + { + writeStringBinary(user, *out); + writeStringBinary(password, *out); + } - out->next(); + out->next(); + } + catch (Poco::Net::NetException & e) + { + disconnect(); + + /// Add server address to exception. Also Exception will remember stack trace. It's a pity that more precise exception type is lost. + throw NetException(e.displayText() + " (" + getDescription() + ")", ErrorCodes::NETWORK_ERROR); + } + catch (Poco::TimeoutException & e) + { + disconnect(); + + /// Add server address to exception. Also Exception will remember stack trace. It's a pity that more precise exception type is lost. + throw NetException(e.displayText() + " (" + getDescription() + ")", ErrorCodes::SOCKET_TIMEOUT); + } } void Connection::receiveHello() { - /// Receive hello packet. - UInt64 packet_type = 0; - - /// Prevent read after eof in readVarUInt in case of reset connection - /// (Poco should throw such exception while reading from socket but - /// sometimes it doesn't for unknown reason) - if (in->eof()) - throw Poco::Net::NetException("Connection reset by peer"); - - readVarUInt(packet_type, *in); - if (packet_type == Protocol::Server::Hello) + try { - readStringBinary(server_name, *in); - readVarUInt(server_version_major, *in); - readVarUInt(server_version_minor, *in); - readVarUInt(server_revision, *in); - if (server_revision >= DBMS_MIN_REVISION_WITH_SERVER_TIMEZONE) - readStringBinary(server_timezone, *in); - if (server_revision >= DBMS_MIN_REVISION_WITH_SERVER_DISPLAY_NAME) - readStringBinary(server_display_name, *in); - if (server_revision >= DBMS_MIN_REVISION_WITH_VERSION_PATCH) - readVarUInt(server_version_patch, *in); + /// Receive hello packet. 
+ UInt64 packet_type = 0; + + /// Prevent read after eof in readVarUInt in case of reset connection + /// (Poco should throw such exception while reading from socket but + /// sometimes it doesn't for unknown reason) + if (in->eof()) + throw Poco::Net::NetException("Connection reset by peer"); + + readVarUInt(packet_type, *in); + if (packet_type == Protocol::Server::Hello) + { + readStringBinary(server_name, *in); + readVarUInt(server_version_major, *in); + readVarUInt(server_version_minor, *in); + readVarUInt(server_revision, *in); + if (server_revision >= DBMS_MIN_REVISION_WITH_SERVER_TIMEZONE) + readStringBinary(server_timezone, *in); + if (server_revision >= DBMS_MIN_REVISION_WITH_SERVER_DISPLAY_NAME) + readStringBinary(server_display_name, *in); + if (server_revision >= DBMS_MIN_REVISION_WITH_VERSION_PATCH) + readVarUInt(server_version_patch, *in); + else + server_version_patch = server_revision; + } + else if (packet_type == Protocol::Server::Exception) + receiveException()->rethrow(); else - server_version_patch = server_revision; + { + /// Close connection, to not stay in unsynchronised state. + disconnect(); + throwUnexpectedPacket(packet_type, "Hello or Exception"); + } } - else if (packet_type == Protocol::Server::Exception) - receiveException()->rethrow(); - else + catch (Poco::Net::NetException & e) { - /// Close connection, to not stay in unsynchronised state. disconnect(); - throwUnexpectedPacket(packet_type, "Hello or Exception"); + + /// Add server address to exception. Also Exception will remember stack trace. It's a pity that more precise exception type is lost. + throw NetException(e.displayText() + " (" + getDescription() + ")", ErrorCodes::NETWORK_ERROR); + } + catch (Poco::TimeoutException & e) + { + disconnect(); + + /// Add server address to exception. Also Exception will remember stack trace. It's a pity that more precise exception type is lost. 
+ throw NetException(e.displayText() + " (" + getDescription() + ")", ErrorCodes::SOCKET_TIMEOUT); } } diff --git a/src/Client/ConnectionPoolWithFailover.cpp b/src/Client/ConnectionPoolWithFailover.cpp index 15344b3b18b..a027f7a186b 100644 --- a/src/Client/ConnectionPoolWithFailover.cpp +++ b/src/Client/ConnectionPoolWithFailover.cpp @@ -358,19 +358,6 @@ void ConnectionEstablisher::resetResult() } } -void ConnectionEstablisher::processFail(bool add_description) -{ - if (action_before_disconnect) - action_before_disconnect(socket_fd); - - fail_message = getCurrentExceptionMessage(/* with_stacktrace = */ false); - if (add_description) - fail_message += " (" + result.entry->getDescription() + ")"; - resetResult(); - socket_fd = -1; - stage = Stage::FAILED; -} - void ConnectionEstablisher::run() { try @@ -463,20 +450,19 @@ void ConnectionEstablisher::run() stage = Stage::FINISHED; } - catch (Poco::Net::NetException &) - { - processFail(true); - } - catch (Poco::TimeoutException &) - { - processFail(true); - } catch (const Exception & e) { - if (e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF) + if (e.code() != ErrorCodes::NETWORK_ERROR && e.code() != ErrorCodes::SOCKET_TIMEOUT + && e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF) throw; - processFail(false); + if (action_before_disconnect) + action_before_disconnect(socket_fd); + + fail_message = getCurrentExceptionMessage(/* with_stacktrace = */ false); + resetResult(); + socket_fd = -1; + stage = Stage::FAILED; } } diff --git a/src/Client/ConnectionPoolWithFailover.h b/src/Client/ConnectionPoolWithFailover.h index 44b06e871ec..b25eee6e33d 100644 --- a/src/Client/ConnectionPoolWithFailover.h +++ b/src/Client/ConnectionPoolWithFailover.h @@ -65,9 +65,6 @@ public: /// disconnect. It may be useful for removing file descriptor from epoll. void setActionBeforeDisconnect(std::function action) { action_before_disconnect = action; } - /// Process fail connection. 
- void processFail(bool add_description = false); - IConnectionPool * pool; const ConnectionTimeouts * timeouts; std::string fail_message; diff --git a/src/Client/HedgedConnections.cpp b/src/Client/HedgedConnections.cpp index 32a91af6179..ad00c60b302 100644 --- a/src/Client/HedgedConnections.cpp +++ b/src/Client/HedgedConnections.cpp @@ -26,15 +26,20 @@ HedgedConnections::HedgedConnections( { std::vector connections = hedged_connections_factory.getManyConnections(pool_mode); - ReplicaState replica; + if (connections.empty()) + return; + for (size_t i = 0; i != connections.size(); ++i) { + ReplicaState replica; replica.connection = connections[i]; replica.connection->setThrottler(throttler_); - int socket_fd = replica.connection->getSocket()->impl()->sockfd(); - epoll.add(socket_fd); - fd_to_replica_location[socket_fd] = ReplicaLocation{i, 0}; - offset_states.push_back(OffsetState{{replica}, 1, false}); + replica.epoll.add(replica.connection->getSocket()->impl()->sockfd()); + epoll.add(replica.epoll.getFileDescriptor()); + fd_to_replica_location[replica.epoll.getFileDescriptor()] = ReplicaLocation{i, 0}; + offset_states.emplace_back(); + offset_states[i].replicas.emplace_back(std::move(replica)); + offset_states[i].active_connection_count = 1; } active_connection_count = connections.size(); @@ -148,12 +153,12 @@ void HedgedConnections::sendQuery( if (offset_states.size() > 1) { modified_settings.parallel_replicas_count = offset_states.size(); - modified_settings.parallel_replica_offset = fd_to_replica_location[replica.connection->getSocket()->impl()->sockfd()].offset; + modified_settings.parallel_replica_offset = fd_to_replica_location[replica.epoll.getFileDescriptor()].offset; } replica.connection->sendQuery(timeouts, query, query_id, stage, &modified_settings, &client_info, with_pending_data); - addTimeoutToReplica(ConnectionTimeoutType::RECEIVE_TIMEOUT, replica); - addTimeoutToReplica(ConnectionTimeoutType::RECEIVE_DATA_TIMEOUT, replica); + replica.receive_timeout.setRelative(timeouts.receive_timeout); + replica.change_replica_timeout.setRelative(timeouts.receive_data_timeout); }; for (auto & offset_status : offset_states) @@ -234,7 +239,8 @@ Packet HedgedConnections::drain() while (!epoll.empty()) { - Packet packet = receivePacketImpl(); + ReplicaLocation location = getReadyReplicaLocation(); + Packet packet = receivePacketFromReplica(location); switch (packet.type) { case Protocol::Server::PartUUIDs: @@ -273,64 +279,110 @@ Packet HedgedConnections::receivePacketUnlocked(AsyncCallback async_callback) if (epoll.empty()) throw Exception("No pending events in epoll.", ErrorCodes::LOGICAL_ERROR); - return receivePacketImpl(std::move(async_callback)); + ReplicaLocation location = getReadyReplicaLocation(async_callback); + return receivePacketFromReplica(location, std::move(async_callback)); } -Packet HedgedConnections::receivePacketImpl(AsyncCallback async_callback) +HedgedConnections::ReplicaLocation HedgedConnections::getReadyReplicaLocation(AsyncCallback async_callback) { int event_fd; - Packet packet; - bool finish = false; - while (!finish) + while (true) { + /// Check connections for pending data. + ReplicaLocation location; + if (checkPendingData(location)) + return location; + + /// Get ready file descriptor from epoll and process it. 
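The epoll handling below leans on a property worth spelling out: an epoll instance is itself a pollable file descriptor, so the per-replica Epoll (grouping the connection socket and the two timer descriptors) can be registered in the outer epoll, and readiness of any grouped descriptor surfaces as readiness of the replica's epoll fd. A bare-bones sketch of that nesting with raw Linux calls (illustrative only, not the Epoll wrapper used here; error handling omitted):

#include <sys/epoll.h>

/// Hedged sketch: group a socket fd and a timer fd under one per-replica epoll.
/// The returned descriptor can in turn be added to an outer epoll, which then
/// wakes up whenever any of the grouped descriptors becomes ready.
int makeReplicaEpoll(int socket_fd, int timer_fd)
{
    int replica_epoll = epoll_create1(0);
    epoll_event event{};
    event.events = EPOLLIN;
    event.data.fd = socket_fd;
    epoll_ctl(replica_epoll, EPOLL_CTL_ADD, socket_fd, &event);
    event.data.fd = timer_fd;
    epoll_ctl(replica_epoll, EPOLL_CTL_ADD, timer_fd, &event);
    return replica_epoll;
}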
event_fd = getReadyFileDescriptor(async_callback); - if (fd_to_replica_location.contains(event_fd)) + if (event_fd == hedged_connections_factory.getFileDescriptor()) { - ReplicaLocation location = fd_to_replica_location[event_fd]; - packet = receivePacketFromReplica(location, async_callback); - finish = true; - } - else if (timeout_fd_to_replica_location.contains(event_fd)) - { - ReplicaLocation location = timeout_fd_to_replica_location[event_fd]; - processTimeoutEvent(location, offset_states[location.offset].replicas[location.index].active_timeouts[event_fd]); - } - else if (event_fd == hedged_connections_factory.getFileDescriptor()) tryGetNewReplica(false); - else - throw Exception("Unknown event from epoll", ErrorCodes::LOGICAL_ERROR); - } + continue; + } - return packet; + if (!fd_to_replica_location.contains(event_fd)) + throw Exception("Unknown event from epoll", ErrorCodes::LOGICAL_ERROR); + + location = fd_to_replica_location[event_fd]; + + /// Read all events from replica epoll. + /// If socket is ready and timeout is alarmed simultaneously, skip timeout. + bool is_socket_ready = false; + bool is_change_replica_timeout_alarmed = false; + bool is_receive_timeout_alarmed = false; + + epoll_event events[3]; + events[0].data.fd = events[1].data.fd = events[2].data.fd = -1; + ReplicaState & replica_state = offset_states[location.offset].replicas[location.index]; + size_t ready_count = replica_state.epoll.getManyReady(3, events, true); + + for (size_t i = 0; i != ready_count; ++i) + { + if (events[i].data.fd == replica_state.connection->getSocket()->impl()->sockfd()) + is_socket_ready = true; + if (events[i].data.fd == replica_state.change_replica_timeout.getDescriptor()) + is_change_replica_timeout_alarmed = true; + if (events[i].data.fd == replica_state.receive_timeout.getDescriptor()) + is_receive_timeout_alarmed = true; + } + + if (is_socket_ready) + return location; + + /// We reach this point only if there is an alarmed timeout. + + if (is_change_replica_timeout_alarmed) + { + replica_state.change_replica_timeout.reset(); + offsets_queue.push(location.offset); + tryGetNewReplica(true); + } + if (is_receive_timeout_alarmed) + { + finishProcessReplica(replica_state, true); + + /// Check if there is no more active connections with the same offset and there is no new replica in process. 
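The receive_timeout and change_replica_timeout consumed above are assumed to be one-shot timer descriptors: setRelative arms them, expiration makes them readable and wakes the replica epoll, and reset() disarms them. A rough sketch of such arming (hypothetical helper, not the real TimerDescriptor API):

#include <sys/timerfd.h>
#include <ctime>

/// Hedged sketch: arm a one-shot timer `seconds` from now; a zero interval means
/// it fires once, after which the descriptor stays readable until read or re-armed.
void armOneShot(int timer_fd, time_t seconds)
{
    itimerspec spec{};
    spec.it_value.tv_sec = seconds;
    timerfd_settime(timer_fd, 0, &spec, nullptr);
}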
+ if (offset_states[location.offset].active_connection_count == 0 && !next_replica_in_process) + throw NetException("Receive timeout expired", ErrorCodes::SOCKET_TIMEOUT); + } + } }; int HedgedConnections::getReadyFileDescriptor(AsyncCallback async_callback) { - for (auto & [fd, location] : fd_to_replica_location) - { - ReplicaState & replica = offset_states[location.offset].replicas[location.index]; - if (replica.connection->hasReadPendingData()) - return replica.connection->getSocket()->impl()->sockfd(); - } - epoll_event event; event.data.fd = -1; epoll.getManyReady(1, &event, true, std::move(async_callback)); return event.data.fd; } -Packet HedgedConnections::receivePacketFromReplica(ReplicaLocation & replica_location, AsyncCallback async_callback) +bool HedgedConnections::checkPendingData(ReplicaLocation & location_out) +{ + for (auto & [fd, location] : fd_to_replica_location) + { + if (offset_states[location.offset].replicas[location.index].connection->hasReadPendingData()) + { + location_out = location; + return true; + } + } + + return false; +} + +Packet HedgedConnections::receivePacketFromReplica(const ReplicaLocation & replica_location, AsyncCallback async_callback) { ReplicaState & replica = offset_states[replica_location.offset].replicas[replica_location.index]; - removeTimeoutFromReplica(ConnectionTimeoutType::RECEIVE_TIMEOUT, replica); + replica.receive_timeout.reset(); Packet packet = replica.connection->receivePacket(std::move(async_callback)); switch (packet.type) { case Protocol::Server::Data: if (!offset_states[replica_location.offset].first_packet_of_data_received) processReceivedFirstDataPacket(replica_location); - addTimeoutToReplica(ConnectionTimeoutType::RECEIVE_TIMEOUT, replica); + replica.receive_timeout.setRelative(hedged_connections_factory.getConnectionTimeouts().receive_timeout); break; case Protocol::Server::PartUUIDs: case Protocol::Server::Progress: @@ -338,7 +390,7 @@ Packet HedgedConnections::receivePacketFromReplica(ReplicaLocation & replica_loc case Protocol::Server::Totals: case Protocol::Server::Extremes: case Protocol::Server::Log: - addTimeoutToReplica(ConnectionTimeoutType::RECEIVE_TIMEOUT, replica); + replica.receive_timeout.setRelative(hedged_connections_factory.getConnectionTimeouts().receive_timeout); break; case Protocol::Server::EndOfStream: @@ -354,12 +406,12 @@ Packet HedgedConnections::receivePacketFromReplica(ReplicaLocation & replica_loc return packet; } -void HedgedConnections::processReceivedFirstDataPacket(ReplicaLocation & replica_location) +void HedgedConnections::processReceivedFirstDataPacket(const ReplicaLocation & replica_location) { /// When we receive first packet of data from replica, we stop working with replicas, that are /// responsible for the same offset. 
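This is the point where hedging resolves: replicas racing for the same offset run only until one of them produces data, and that replica keeps the offset. The same policy in miniature, detached from the classes in this file (a toy sketch, not the actual implementation):

#include <chrono>
#include <functional>
#include <future>
#include <string>

/// Hedged sketch of the overall policy: run the primary attempt, start a backup only
/// if the primary is silent past the hedge delay, and take whichever answers first.
/// Note: futures obtained from std::async join in their destructors, so this function
/// itself returns only after both attempts have finished; good enough for a sketch.
std::string hedgedFetch(
    std::function<std::string()> primary,
    std::function<std::string()> backup,
    std::chrono::milliseconds hedge_delay)
{
    auto first = std::async(std::launch::async, std::move(primary));
    if (first.wait_for(hedge_delay) == std::future_status::ready)
        return first.get();
    auto second = std::async(std::launch::async, std::move(backup));
    while (true)
    {
        if (first.wait_for(std::chrono::milliseconds(1)) == std::future_status::ready)
            return first.get();
        if (second.wait_for(std::chrono::milliseconds(1)) == std::future_status::ready)
            return second.get();
    }
}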
OffsetState & offset_state = offset_states[replica_location.offset]; - removeTimeoutFromReplica(ConnectionTimeoutType::RECEIVE_DATA_TIMEOUT, offset_state.replicas[replica_location.index]); + offset_state.replicas[replica_location.index].change_replica_timeout.reset(); ++offsets_with_received_first_data_packet; offset_state.first_packet_of_data_received = true; @@ -384,28 +436,6 @@ void HedgedConnections::processReceivedFirstDataPacket(ReplicaLocation & replica } } -void HedgedConnections::processTimeoutEvent(ReplicaLocation & replica_location, ConnectionTimeoutDescriptorPtr timeout_descriptor) -{ - ReplicaState & replica = offset_states[replica_location.offset].replicas[replica_location.index]; - epoll.remove(timeout_descriptor->timer.getDescriptor()); - replica.active_timeouts.erase(timeout_descriptor->timer.getDescriptor()); - timeout_fd_to_replica_location.erase(timeout_descriptor->timer.getDescriptor()); - - if (timeout_descriptor->type == ConnectionTimeoutType::RECEIVE_TIMEOUT) - { - finishProcessReplica(replica, true); - - /// Check if there is no active connections with the same offset and there is no new replica in process. - if (offset_states[replica_location.offset].active_connection_count == 0 && !next_replica_in_process) - throw NetException("Receive timeout expired", ErrorCodes::SOCKET_TIMEOUT); - } - else if (timeout_descriptor->type == ConnectionTimeoutType::RECEIVE_DATA_TIMEOUT) - { - offsets_queue.push(replica_location.offset); - tryGetNewReplica(true); - } -} - void HedgedConnections::tryGetNewReplica(bool start_new_connection) { Connection * connection = nullptr; @@ -414,24 +444,22 @@ void HedgedConnections::tryGetNewReplica(bool start_new_connection) /// Skip replicas that doesn't support two-level aggregation if we didn't disable it in sendQuery. 
while (state == HedgedConnectionsFactory::State::READY && !disable_two_level_aggregation && connection->getServerRevision(hedged_connections_factory.getConnectionTimeouts()) - < DBMS_MIN_REVISION_WITH_CURRENT_AGGREGATION_VARIANT_SELECTION_METHOD) + < DBMS_MIN_REVISION_WITH_CURRENT_AGGREGATION_VARIANT_SELECTION_METHOD) state = hedged_connections_factory.getNextConnection(true, false, connection); if (state == HedgedConnectionsFactory::State::READY) { size_t offset = offsets_queue.front(); offsets_queue.pop(); - size_t index = offset_states[offset].replicas.size(); ReplicaState replica; replica.connection = connection; - int socket_fd = replica.connection->getSocket()->impl()->sockfd(); - epoll.add(socket_fd); - fd_to_replica_location[socket_fd] = ReplicaLocation{offset, index}; - ++offset_states[offset].active_connection_count; + replica.epoll.add(replica.connection->getSocket()->impl()->sockfd()); + epoll.add(replica.epoll.getFileDescriptor()); + fd_to_replica_location[replica.epoll.getFileDescriptor()] = ReplicaLocation{offset, offset_states[offset].replicas.size()}; ++active_connection_count; pipeline_for_new_replicas.run(replica); - offset_states[offset].replicas.push_back(replica); + offset_states[offset].replicas.push_back(std::move(replica)); } else if (state == HedgedConnectionsFactory::State::NOT_READY && !next_replica_in_process) { @@ -460,11 +488,9 @@ void HedgedConnections::tryGetNewReplica(bool start_new_connection) void HedgedConnections::finishProcessReplica(ReplicaState & replica, bool disconnect) { - removeTimeoutsFromReplica(replica); - int socket_fd = replica.connection->getSocket()->impl()->sockfd(); - epoll.remove(socket_fd); - --offset_states[fd_to_replica_location[socket_fd].offset].active_connection_count; - fd_to_replica_location.erase(socket_fd); + epoll.remove(replica.epoll.getFileDescriptor()); + --offset_states[fd_to_replica_location[replica.epoll.getFileDescriptor()].offset].active_connection_count; + fd_to_replica_location.erase(replica.epoll.getFileDescriptor()); --active_connection_count; if (disconnect) @@ -472,38 +498,5 @@ void HedgedConnections::finishProcessReplica(ReplicaState & replica, bool discon replica.connection = nullptr; } -void HedgedConnections::addTimeoutToReplica(ConnectionTimeoutType type, ReplicaState & replica) -{ - ConnectionTimeoutDescriptorPtr timeout_descriptor - = createConnectionTimeoutDescriptor(type, hedged_connections_factory.getConnectionTimeouts()); - epoll.add(timeout_descriptor->timer.getDescriptor()); - timeout_fd_to_replica_location[timeout_descriptor->timer.getDescriptor()] - = fd_to_replica_location[replica.connection->getSocket()->impl()->sockfd()]; - replica.active_timeouts[timeout_descriptor->timer.getDescriptor()] = std::move(timeout_descriptor); -} - -void HedgedConnections::removeTimeoutsFromReplica(ReplicaState & replica) -{ - for (auto & [fd, _] : replica.active_timeouts) - { - epoll.remove(fd); - timeout_fd_to_replica_location.erase(fd); - } - replica.active_timeouts.clear(); -} - -void HedgedConnections::removeTimeoutFromReplica(ConnectionTimeoutType type, ReplicaState & replica) -{ - auto it = std::find_if( - replica.active_timeouts.begin(), replica.active_timeouts.end(), [type](auto & value) { return value.second->type == type; }); - - if (it != replica.active_timeouts.end()) - { - epoll.remove(it->first); - timeout_fd_to_replica_location.erase(it->first); - replica.active_timeouts.erase(it); - } -} - } #endif diff --git a/src/Client/HedgedConnections.h b/src/Client/HedgedConnections.h index 
4e3b6a67169..249c41a7a06 100644 --- a/src/Client/HedgedConnections.h +++ b/src/Client/HedgedConnections.h @@ -20,21 +20,40 @@ class HedgedConnections : public IConnections public: struct ReplicaState { - Connection * connection = nullptr; - std::unordered_map active_timeouts; - }; + ReplicaState() + { + epoll.add(receive_timeout.getDescriptor()); + epoll.add(change_replica_timeout.getDescriptor()); + } - struct ReplicaLocation - { - size_t offset; - size_t index; + Connection * connection = nullptr; + TimerDescriptor receive_timeout; + TimerDescriptor change_replica_timeout; + /// We store socket and timeout descriptors in epoll + /// and use it's fd outside. + Epoll epoll; }; struct OffsetState { + /// Replicas with the same offset. std::vector replicas; - size_t active_connection_count; - bool first_packet_of_data_received; + /// An amount of active replicas, when first_packet_of_data_received is true, + /// active_connection_count is always <= 1 (because we stop working with + /// other replicas when we receive first data packet from one of them) + size_t active_connection_count = 0; + bool first_packet_of_data_received = false; + }; + + /// We process events in epoll, so we need to determine replica by it's + /// file descriptor. We store map fd -> replica location. To determine + /// where replica is, we need a replica offset + /// (the same as parallel_replica_offset), and index, which is needed because + /// we can have many replicas with same offset (when receive_data_timeout has expired). + struct ReplicaLocation + { + size_t offset; + size_t index; }; HedgedConnections(const ConnectionPoolWithFailoverPtr & pool_, @@ -75,7 +94,11 @@ public: bool hasActiveConnections() const override { return active_connection_count > 0; } private: - /// We will save actions with replicas in pipeline to perform them on the new replicas. + /// If we don't receive data from replica for receive_data_timeout, we are trying + /// to get new replica and send query to it. Beside sending query, there are some + /// additional actions like sendScalarsData or sendExternalTablesData and we need + /// to perform these actions in the same order on the new replica. So, we will + /// save actions with replicas in pipeline to perform them on the new replicas. 
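In other words, the Pipeline declared just below is a replayable log: each per-replica action is executed and recorded, so a replica attached later (after a receive_data_timeout) can be fast-forwarded through the same steps in the same order. A stripped-down sketch of the idea (illustrative names, not the real interface):

#include <functional>
#include <vector>

struct Replica { int fd = -1; };    /// stand-in for the per-replica state

/// Hedged sketch: remember every action so it can be replayed, in order,
/// on a replica that joins after the query has already been sent.
class ActionLog
{
public:
    void add(std::function<void(Replica &)> action) { actions.push_back(std::move(action)); }

    void replayOn(Replica & replica) const
    {
        for (const auto & action : actions)
            action(replica);
    }

private:
    std::vector<std::function<void(Replica &)>> actions;
};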
class Pipeline { public: @@ -86,13 +109,11 @@ private: std::vector> pipeline; }; - Packet receivePacketFromReplica(ReplicaLocation & replica_location, AsyncCallback async_callback = {}); + Packet receivePacketFromReplica(const ReplicaLocation & replica_location, AsyncCallback async_callback = {}); - Packet receivePacketImpl(AsyncCallback async_callback = {}); + ReplicaLocation getReadyReplicaLocation(AsyncCallback async_callback = {}); - void processReceivedFirstDataPacket(ReplicaLocation & replica_location); - - void processTimeoutEvent(ReplicaLocation & replica_location, ConnectionTimeoutDescriptorPtr timeout_descriptor); + void processReceivedFirstDataPacket(const ReplicaLocation & replica_location); void tryGetNewReplica(bool start_new_connection); @@ -100,12 +121,7 @@ private: int getReadyFileDescriptor(AsyncCallback async_callback = {}); - void addTimeoutToReplica(ConnectionTimeoutType type, ReplicaState & replica); - - void removeTimeoutsFromReplica(ReplicaState & replica); - - void removeTimeoutFromReplica(ConnectionTimeoutType type, ReplicaState & replica); - + bool checkPendingData(ReplicaLocation & location_out); HedgedConnectionsFactory hedged_connections_factory; @@ -116,8 +132,6 @@ private: /// Map socket file descriptor to replica location (it's offset and index in OffsetState.replicas). std::unordered_map fd_to_replica_location; - /// Map timeout file descriptor to replica location (it's offset and index in OffsetState.replicas). - std::unordered_map timeout_fd_to_replica_location; /// A queue of offsets for new replicas. When we get RECEIVE_DATA_TIMEOUT from /// the replica, we push it's offset to this queue and start trying to get diff --git a/src/Client/HedgedConnectionsFactory.cpp b/src/Client/HedgedConnectionsFactory.cpp index 12362635904..c4a10379985 100644 --- a/src/Client/HedgedConnectionsFactory.cpp +++ b/src/Client/HedgedConnectionsFactory.cpp @@ -22,9 +22,10 @@ HedgedConnectionsFactory::HedgedConnectionsFactory( { shuffled_pools = pool->getShuffledPools(settings); for (size_t i = 0; i != shuffled_pools.size(); ++i) - connection_establishers.emplace_back(shuffled_pools[i].pool, &timeouts, settings, table_to_check.get(), log); - - replicas_timeouts.resize(shuffled_pools.size()); + { + ConnectionEstablisher establisher(shuffled_pools[i].pool, &timeouts, settings, table_to_check.get(), log); + replicas.emplace_back(std::move(establisher)); + } max_tries = (settings ? size_t{settings->connections_with_failover_max_tries} : size_t{DBMS_CONNECTION_POOL_WITH_FAILOVER_DEFAULT_MAX_TRIES}); @@ -62,15 +63,12 @@ std::vector HedgedConnectionsFactory::getManyConnections(PoolMode Connection * connection = nullptr; /// Try to start establishing connections with max_entries replicas. 
- int index; for (size_t i = 0; i != max_entries; ++i) { - index = getNextIndex(); + int index = startEstablishingNewConnection(connection); if (index == -1) break; - - auto state = startEstablishingConnection(index, connection); - if (state == State::READY) + if (replicas[index].is_ready) connections.push_back(connection); } @@ -104,27 +102,17 @@ std::vector HedgedConnectionsFactory::getManyConnections(PoolMode HedgedConnectionsFactory::State HedgedConnectionsFactory::getNextConnection(bool start_new_connection, bool blocking, Connection *& connection_out) { - int index = -1; - if (start_new_connection) - index = getNextIndex(); - - while (index != -1 || !epoll.empty()) { - if (index != -1) - { - State state = startEstablishingConnection(index, connection_out); - if (state == State::READY) - return state; - } - - State state = processEpollEvents(blocking, connection_out); - if (state != State::EMPTY) - return state; - - index = getNextIndex(); + int index = startEstablishingNewConnection(connection_out); + if (index != -1 && replicas[index].is_ready) + return State::READY; } + State state = processEpollEvents(blocking, connection_out); + if (state != State::CANNOT_CHOOSE) + return state; + /// We reach this point only if there was no free up to date replica. /// We will try to use usable replica. @@ -139,9 +127,9 @@ void HedgedConnectionsFactory::stopChoosingReplicas() { for (auto & [fd, replica_index] : fd_to_replica_index) { - removeTimeoutsFromReplica(replica_index); + resetReplicaTimeouts(replica_index); epoll.remove(fd); - connection_establishers[replica_index].reset(); + replicas[replica_index].connection_establisher.reset(); } fd_to_replica_index.clear(); @@ -150,7 +138,7 @@ void HedgedConnectionsFactory::stopChoosingReplicas() int HedgedConnectionsFactory::getNextIndex() { /// Check if there is no free replica. - if (entries_count + indexes_in_process.size() + failed_pools_count >= shuffled_pools.size()) + if (entries_count + replicas_in_process_count + failed_pools_count >= shuffled_pools.size()) return -1; /// Check if it's the first time. @@ -167,8 +155,8 @@ int HedgedConnectionsFactory::getNextIndex() next_index = (next_index + 1) % shuffled_pools.size(); /// Check if we can try this replica. - if (indexes_in_process.find(next_index) == indexes_in_process.end() && (max_tries == 0 || shuffled_pools[next_index].error_count < max_tries) - && connection_establishers[next_index].stage != ConnectionEstablisher::Stage::FINISHED) + if (!replicas[next_index].is_in_process && (max_tries == 0 || shuffled_pools[next_index].error_count < max_tries) + && replicas[next_index].connection_establisher.stage != ConnectionEstablisher::Stage::FINISHED) finish = true; /// If we made a complete round, there is no replica to connect. 
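// A minimal sketch of the round-robin selection performed by getNextIndex() above: walk the
// (already shuffled) pool list starting after the last used index, skip candidates rejected
// by a predicate, and give up after one complete round. The error counters, max_tries and
// connection stages of the real code are reduced to a single boolean predicate here.
#include <functional>
#include <iostream>
#include <vector>

int getNextIndex(size_t pool_count, int last_used_index, const std::function<bool(size_t)> & can_try)
{
    if (pool_count == 0)
        return -1;

    size_t next_index = last_used_index == -1 ? 0 : (last_used_index + 1) % pool_count;
    for (size_t step = 0; step < pool_count; ++step)
    {
        if (can_try(next_index))
            return static_cast<int>(next_index);
        next_index = (next_index + 1) % pool_count;
    }
    return -1; /// Made a complete round: there is no replica left to try.
}

int main()
{
    std::vector<bool> in_process = {true, false, true, false};
    int index = getNextIndex(in_process.size(), /* last_used_index = */ 0,
                             [&](size_t i) { return !in_process[i]; });
    std::cout << "next replica index: " << index << "\n"; /// prints 1
}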
@@ -180,83 +168,93 @@ int HedgedConnectionsFactory::getNextIndex() return next_index; } -HedgedConnectionsFactory::State HedgedConnectionsFactory::startEstablishingConnection(int replica_index, Connection *& connection_out) +int HedgedConnectionsFactory::startEstablishingNewConnection(Connection *& connection_out) { - State state; + int index; do { - ConnectionEstablisher & connection_establisher = connection_establishers[replica_index]; + index = getNextIndex(); + if (index == -1) + return -1; - state = State::NOT_READY; - indexes_in_process.insert(replica_index); + ReplicaStatus & replica = replicas[index]; - connection_establisher.reset(); - connection_establisher.run(); + ++replicas_in_process_count; + replica.is_in_process = true; + replica.connection_establisher.reset(); + replica.connection_establisher.run(); - state = processConnectionEstablisherStage(replica_index); + processConnectionEstablisherStage(index); - if (state == State::NOT_READY) + if (replica.is_in_process) { - epoll.add(connection_establisher.socket_fd); - fd_to_replica_index[connection_establisher.socket_fd] = replica_index; - connection_establisher.setActionBeforeDisconnect([&](int fd) - { - epoll.remove(fd); - fd_to_replica_index.erase(fd); - }); - addTimeouts(replica_index); + replica.epoll.add(replica.connection_establisher.socket_fd); + replica.connection_establisher.setActionBeforeDisconnect([&](int fd){ replica.epoll.remove(fd); }); + addTimeouts(index); + epoll.add(replica.epoll.getFileDescriptor()); + fd_to_replica_index[replica.epoll.getFileDescriptor()] = index; } } - while (state == State::EMPTY && (replica_index = getNextIndex()) != -1); + while (!replicas[index].is_ready && !replicas[index].is_in_process); - if (state == State::READY) - connection_out = &*connection_establishers[replica_index].result.entry; + if (replicas[index].is_ready) + connection_out = &*replicas[index].connection_establisher.result.entry; - return state; + return index; } -HedgedConnectionsFactory::State HedgedConnectionsFactory::processConnectionEstablisherStage(int replica_index, bool remove_from_epoll) +void HedgedConnectionsFactory::processConnectionEstablisherStage(int replica_index, bool remove_from_epoll) { - ConnectionEstablisher & connection_establisher = connection_establishers[replica_index]; + ReplicaStatus & replica = replicas[replica_index]; - if (connection_establisher.stage == ConnectionEstablisher::Stage::FINISHED) + if (replica.connection_establisher.stage == ConnectionEstablisher::Stage::FINISHED) { - indexes_in_process.erase(replica_index); + replica.is_in_process = false; + --replicas_in_process_count; ++entries_count; if (remove_from_epoll) { - epoll.remove(connection_establisher.socket_fd); - fd_to_replica_index.erase(connection_establisher.socket_fd); + epoll.remove(replica.epoll.getFileDescriptor()); + fd_to_replica_index.erase(replica.epoll.getFileDescriptor()); } - if (connection_establisher.result.is_usable) + if (replica.connection_establisher.result.is_usable) { ++usable_count; - if (connection_establisher.result.is_up_to_date) + if (replica.connection_establisher.result.is_up_to_date) { - ready_indexes.insert(replica_index); - return State::READY; + ++ready_replicas_count; + replica.is_ready = true; + return; } } - - /// This replica is not up to date, we will try to find up to date. 
- return State::EMPTY; + else + { + std::string & fail_message = replica.connection_establisher.fail_message; + if (!fail_message.empty()) + fail_messages += fail_message + "\n"; + } } - else if (connection_establisher.stage == ConnectionEstablisher::Stage::FAILED) - { - processFailedConnection(replica_index); - return State::EMPTY; - } - - return State::NOT_READY; + else if (replica.connection_establisher.stage == ConnectionEstablisher::Stage::FAILED) + processFailedConnection(replica_index, remove_from_epoll); } -void HedgedConnectionsFactory::processFailedConnection(int replica_index) +void HedgedConnectionsFactory::processFailedConnection(int replica_index, bool remove_from_epoll) { + if (remove_from_epoll) + { + epoll.remove(replicas[replica_index].epoll.getFileDescriptor()); + fd_to_replica_index.erase(replicas[replica_index].epoll.getFileDescriptor()); + } + + std::string & fail_message = replicas[replica_index].connection_establisher.fail_message; + if (!fail_message.empty()) + fail_messages += fail_message + "\n"; + ShuffledPool & shuffled_pool = shuffled_pools[replica_index]; LOG_WARNING( - log, "Connection failed at try №{}, reason: {}", (shuffled_pool.error_count + 1), connection_establishers[replica_index].fail_message); + log, "Connection failed at try №{}, reason: {}", (shuffled_pool.error_count + 1), fail_message); ProfileEvents::increment(ProfileEvents::DistributedConnectionFailTry); shuffled_pool.error_count = std::min(pool->getMaxErrorCup(), shuffled_pool.error_count + 1); @@ -267,139 +265,157 @@ void HedgedConnectionsFactory::processFailedConnection(int replica_index) ProfileEvents::increment(ProfileEvents::DistributedConnectionFailAtAll); } - std::string & fail_message = connection_establishers[replica_index].fail_message; - if (!fail_message.empty()) - fail_messages += fail_message + "\n"; - - indexes_in_process.erase(replica_index); + --replicas_in_process_count; + replicas[replica_index].is_in_process = false; } void HedgedConnectionsFactory::addTimeouts(int replica_index) { - addTimeoutToReplica(ConnectionTimeoutType::RECEIVE_TIMEOUT, replica_index); - - auto stage = connection_establishers[replica_index].stage; + auto stage = replicas[replica_index].connection_establisher.stage; if (stage == ConnectionEstablisher::Stage::RECEIVE_HELLO) - addTimeoutToReplica(ConnectionTimeoutType::RECEIVE_HELLO_TIMEOUT, replica_index); - else if (stage == ConnectionEstablisher::Stage::RECEIVE_TABLES_STATUS) - addTimeoutToReplica(ConnectionTimeoutType::RECEIVE_TABLES_STATUS_TIMEOUT, replica_index); -} - -void HedgedConnectionsFactory::addTimeoutToReplica(ConnectionTimeoutType type, int replica_index) -{ - ConnectionTimeoutDescriptorPtr timeout_descriptor = createConnectionTimeoutDescriptor(type, timeouts); - epoll.add(timeout_descriptor->timer.getDescriptor()); - timeout_fd_to_replica_index[timeout_descriptor->timer.getDescriptor()] = replica_index; - replicas_timeouts[replica_index][timeout_descriptor->timer.getDescriptor()] = std::move(timeout_descriptor); -} - -void HedgedConnectionsFactory::removeTimeoutsFromReplica(int replica_index) -{ - for (auto & [fd, _] : replicas_timeouts[replica_index]) { - epoll.remove(fd); - timeout_fd_to_replica_index.erase(fd); + replicas[replica_index].receive_timeout.setRelative(timeouts.receive_timeout); + replicas[replica_index].change_replica_timeout.setRelative(timeouts.receive_hello_timeout); } - replicas_timeouts[replica_index].clear(); + else if (stage == ConnectionEstablisher::Stage::RECEIVE_TABLES_STATUS) + { + 
replicas[replica_index].receive_timeout.setRelative(Poco::Timespan(DBMS_DEFAULT_SYNC_REQUEST_TIMEOUT_SEC, 0)); + replicas[replica_index].change_replica_timeout.setRelative(timeouts.receive_tables_status_timeout); + } +} + +void HedgedConnectionsFactory::resetReplicaTimeouts(int replica_index) +{ + replicas[replica_index].receive_timeout.reset(); + replicas[replica_index].change_replica_timeout.reset(); } HedgedConnectionsFactory::State HedgedConnectionsFactory::processEpollEvents(bool blocking, Connection *& connection_out) { int event_fd; - while (true) + while (!epoll.empty()) { + /// Firstly, check connections for pending data. + int replica_index = checkPendingData(); + if (replica_index != -1) + { + processSocketEvent(replica_index, connection_out); + /// Return only if replica is ready. + if (replicas[replica_index].is_ready) + return State::READY; + + continue; + } + + /// Get ready descriptor fro epoll. event_fd = getReadyFileDescriptor(blocking); /// Check if there is no events. if (event_fd == -1) return State::NOT_READY; - if (fd_to_replica_index.find(event_fd) != fd_to_replica_index.end()) + if (!fd_to_replica_index.contains(event_fd)) + throw Exception("Unknown event from epoll", ErrorCodes::LOGICAL_ERROR); + + replica_index = fd_to_replica_index[event_fd]; + + /// Read all events from replica epoll. + /// If socket is ready and timeout is alarmed simultaneously, skip timeout. + bool is_socket_ready = false; + bool is_receive_timeout_alarmed = false; + bool is_change_replica_timeout_alarmed = false; + + epoll_event events[3]; + events[0].data.fd = events[1].data.fd = events[2].data.fd = -1; + size_t ready_count = replicas[replica_index].epoll.getManyReady(3, events, true); + for (size_t i = 0; i != ready_count; ++i) { - int replica_index = fd_to_replica_index[event_fd]; - State state = processReplicaEvent(replica_index, connection_out); - /// Return only if replica is ready or we need to try next replica. - if (state != State::NOT_READY) - return state; + if (events[i].data.fd == replicas[replica_index].connection_establisher.socket_fd) + is_socket_ready = true; + if (events[i].data.fd == replicas[replica_index].receive_timeout.getDescriptor()) + is_receive_timeout_alarmed = true; + if (events[i].data.fd == replicas[replica_index].change_replica_timeout.getDescriptor()) + is_change_replica_timeout_alarmed = true; } - else if (timeout_fd_to_replica_index.find(event_fd) != timeout_fd_to_replica_index.end()) + + if (is_socket_ready) { - int replica_index = timeout_fd_to_replica_index[event_fd]; - /// Process received timeout. If retured values is true, we need to try new replica. - if (processTimeoutEvent(replica_index, replicas_timeouts[replica_index][event_fd])) - return State::EMPTY; + processSocketEvent(replica_index, connection_out); + /// Return only if replica is ready. + if (replicas[replica_index].is_ready) + return State::READY; + if (replicas[replica_index].is_in_process) + continue; } else - throw Exception("Unknown event from epoll", ErrorCodes::LOGICAL_ERROR); + { + if (is_receive_timeout_alarmed) + processReceiveTimeout(replica_index); + + if (is_change_replica_timeout_alarmed) + replicas[replica_index].change_replica_timeout.reset(); + } + + /// We reach this point only if we need to start new connection. + replica_index = startEstablishingNewConnection(connection_out); + /// Return only if replica is ready. 
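// Sketch of the timer-descriptor mechanism used above: a timerfd armed with a relative
// timeout is registered in the same epoll as the replica's socket, so a single epoll_wait()
// reports either "socket has data" or "timeout expired", and the caller tells the cases
// apart by file descriptor. Linux-specific and deliberately simplified; this is not the
// ClickHouse TimerDescriptor/Epoll wrappers.
#include <sys/epoll.h>
#include <sys/timerfd.h>
#include <unistd.h>
#include <cstdio>

int main()
{
    int epoll_fd = epoll_create1(0);
    int timer_fd = timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK);

    /// Arm a 2-second one-shot "receive timeout" (analogous to setRelative()).
    itimerspec spec{};
    spec.it_value.tv_sec = 2;
    timerfd_settime(timer_fd, 0, &spec, nullptr);

    epoll_event ev{};
    ev.events = EPOLLIN;
    ev.data.fd = timer_fd;
    epoll_ctl(epoll_fd, EPOLL_CTL_ADD, timer_fd, &ev);
    /// A real implementation would also add the replica socket fd to the same epoll here.

    epoll_event ready{};
    int n = epoll_wait(epoll_fd, &ready, 1, -1);
    if (n == 1 && ready.data.fd == timer_fd)
        std::printf("receive timeout expired -> try to start a hedged connection\n");

    close(timer_fd);
    close(epoll_fd);
}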
+ if (replica_index != -1 && replicas[replica_index].is_ready) + return State::READY; } + + return State::CANNOT_CHOOSE; } int HedgedConnectionsFactory::getReadyFileDescriptor(bool blocking) { - for (auto & [fd, replica_index] : fd_to_replica_index) - if (connection_establishers[replica_index].result.entry->hasReadPendingData()) - return connection_establishers[replica_index].socket_fd; - epoll_event event; event.data.fd = -1; epoll.getManyReady(1, &event, blocking); return event.data.fd; } -HedgedConnectionsFactory::State HedgedConnectionsFactory::processReplicaEvent(int replica_index, Connection *& connection_out) +int HedgedConnectionsFactory::checkPendingData() { - removeTimeoutsFromReplica(replica_index); - connection_establishers[replica_index].run(); - State state = processConnectionEstablisherStage(replica_index, true); - if (state == State::NOT_READY) - addTimeouts(replica_index); - if (state == State::READY) - connection_out = &*connection_establishers[replica_index].result.entry; - return state; + for (auto & [fd, replica_index] : fd_to_replica_index) + if (replicas[replica_index].connection_establisher.result.entry->hasReadPendingData()) + return replica_index; + + return -1; } -bool HedgedConnectionsFactory::processTimeoutEvent(int replica_index, ConnectionTimeoutDescriptorPtr timeout_descriptor) +void HedgedConnectionsFactory::processSocketEvent(int replica_index, Connection *& connection_out) { - epoll.remove(timeout_descriptor->timer.getDescriptor()); - replicas_timeouts[replica_index].erase(timeout_descriptor->timer.getDescriptor()); - timeout_fd_to_replica_index[timeout_descriptor->timer.getDescriptor()]; + resetReplicaTimeouts(replica_index); + replicas[replica_index].connection_establisher.run(); + processConnectionEstablisherStage(replica_index, true); + if (replicas[replica_index].is_in_process) + addTimeouts(replica_index); + if (replicas[replica_index].is_ready) + connection_out = &*replicas[replica_index].connection_establisher.result.entry; +} - if (timeout_descriptor->type == ConnectionTimeoutType::RECEIVE_TIMEOUT) - { - removeTimeoutsFromReplica(replica_index); - int fd = connection_establishers[replica_index].socket_fd; - epoll.remove(fd); - fd_to_replica_index.erase(fd); +void HedgedConnectionsFactory::processReceiveTimeout(int replica_index) +{ + resetReplicaTimeouts(replica_index); + ReplicaStatus & replica = replicas[replica_index]; - ConnectionEstablisher & connection_establisher = connection_establishers[replica_index]; - connection_establisher.fail_message = "Receive timeout expired (" + connection_establisher.result.entry->getDescription() + ")"; - connection_establisher.resetResult(); - connection_establisher.stage = ConnectionEstablisher::Stage::FAILED; - processFailedConnection(replica_index); - return true; - } - - /// Return true if we can try to start one more connection. 
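// Sketch of the idea behind checkPendingData() above: bytes may already have been read from
// the socket into a connection's userspace buffer, in which case epoll will not report the
// socket as ready, so buffered connections must be inspected before blocking in epoll_wait().
// BufferedConnection is a hypothetical stand-in used only for illustration.
#include <iostream>
#include <string>
#include <vector>

struct BufferedConnection
{
    std::string name;
    size_t bytes_in_buffer = 0;
    bool hasPendingData() const { return bytes_in_buffer > 0; }
};

int findConnectionWithPendingData(const std::vector<BufferedConnection> & connections)
{
    for (size_t i = 0; i < connections.size(); ++i)
        if (connections[i].hasPendingData())
            return static_cast<int>(i);
    return -1; /// Nothing buffered: it is now safe to block in epoll_wait().
}

int main()
{
    std::vector<BufferedConnection> connections = {{"r1", 0}, {"r2", 128}};
    int ready = findConnectionWithPendingData(connections);
    if (ready != -1)
        std::cout << connections[ready].name << " already has buffered data\n";
}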
- return entries_count + indexes_in_process.size() + failed_pools_count < shuffled_pools.size(); + replica.connection_establisher.fail_message = + "Code: 209, e.displayText() = DB::NetException: Timeout exceeded while reading from socket (" + replica.connection_establisher.result.entry->getDescription() + ")"; + replica.connection_establisher.resetResult(); + replica.connection_establisher.stage = ConnectionEstablisher::Stage::FAILED; + processFailedConnection(replica_index, true); } HedgedConnectionsFactory::State HedgedConnectionsFactory::setBestUsableReplica(Connection *& connection_out) { - std::vector indexes(connection_establishers.size()); - for (size_t i = 0; i != indexes.size(); ++i) - indexes[i] = i; - - /// Remove unusable, failed replicas and replicas that are ready or in process. - indexes.erase( - std::remove_if( - indexes.begin(), - indexes.end(), - [&](int i) - { - return connection_establishers[i].result.entry.isNull() || !connection_establishers[i].result.is_usable || - indexes_in_process.find(i) != indexes_in_process.end() || ready_indexes.find(i) != ready_indexes.end(); - }), - indexes.end()); + std::vector indexes; + for (size_t i = 0; i != replicas.size(); ++i) + { + /// Don't add unusable, failed replicas and replicas that are ready or in process. + if (!replicas[i].connection_establisher.result.entry.isNull() && replicas[i].connection_establisher.result.is_usable && + !replicas[i].is_in_process && !replicas[i].is_ready) + indexes.push_back(i); + } if (indexes.empty()) return State::CANNOT_CHOOSE; @@ -410,38 +426,14 @@ HedgedConnectionsFactory::State HedgedConnectionsFactory::setBestUsableReplica(C indexes.end(), [&](size_t lhs, size_t rhs) { - return connection_establishers[lhs].result.staleness < connection_establishers[rhs].result.staleness; + return replicas[lhs].connection_establisher.result.staleness < replicas[rhs].connection_establisher.result.staleness; }); - ready_indexes.insert(indexes[0]); - connection_out = &*connection_establishers[indexes[0]].result.entry; + ++ready_replicas_count; + replicas[indexes[0]].is_ready = true; + connection_out = &*replicas[indexes[0]].connection_establisher.result.entry; return State::READY; } -ConnectionTimeoutDescriptorPtr createConnectionTimeoutDescriptor(ConnectionTimeoutType type, const ConnectionTimeouts & timeouts) -{ - Poco::Timespan timeout; - switch (type) - { - case ConnectionTimeoutType::RECEIVE_HELLO_TIMEOUT: - timeout = timeouts.receive_hello_timeout; - break; - case ConnectionTimeoutType::RECEIVE_TABLES_STATUS_TIMEOUT: - timeout = timeouts.receive_tables_status_timeout; - break; - case ConnectionTimeoutType::RECEIVE_DATA_TIMEOUT: - timeout = timeouts.receive_data_timeout; - break; - case ConnectionTimeoutType::RECEIVE_TIMEOUT: - timeout = timeouts.receive_timeout; - break; - } - - ConnectionTimeoutDescriptorPtr timeout_descriptor = std::make_shared(); - timeout_descriptor->type = type; - timeout_descriptor->timer.setRelative(timeout); - return timeout_descriptor; -} - } #endif diff --git a/src/Client/HedgedConnectionsFactory.h b/src/Client/HedgedConnectionsFactory.h index 45a03e212c0..b821a9c925e 100644 --- a/src/Client/HedgedConnectionsFactory.h +++ b/src/Client/HedgedConnectionsFactory.h @@ -12,21 +12,6 @@ namespace DB { -enum class ConnectionTimeoutType -{ - RECEIVE_HELLO_TIMEOUT, - RECEIVE_TABLES_STATUS_TIMEOUT, - RECEIVE_DATA_TIMEOUT, - RECEIVE_TIMEOUT, -}; - -struct ConnectionTimeoutDescriptor -{ - ConnectionTimeoutType type; - TimerDescriptor timer; -}; - -using ConnectionTimeoutDescriptorPtr = 
std::shared_ptr; using TimerDescriptorPtr = std::shared_ptr; /** Class for establishing hedged connections with replicas. @@ -40,12 +25,27 @@ class HedgedConnectionsFactory public: using ShuffledPool = ConnectionPoolWithFailover::Base::ShuffledPool; + struct ReplicaStatus + { + ReplicaStatus(const ConnectionEstablisher & establisher) : connection_establisher(establisher) + { + epoll.add(receive_timeout.getDescriptor()); + epoll.add(change_replica_timeout.getDescriptor()); + } + + ConnectionEstablisher connection_establisher; + TimerDescriptor receive_timeout; + TimerDescriptor change_replica_timeout; + bool is_ready = false; + bool is_in_process = false; + Epoll epoll; + }; + enum class State { - EMPTY = 0, - READY = 1, - NOT_READY = 2, - CANNOT_CHOOSE = 3, + READY, + NOT_READY, + CANNOT_CHOOSE, }; HedgedConnectionsFactory(const ConnectionPoolWithFailoverPtr & pool_, @@ -64,7 +64,7 @@ public: State getNextConnection(bool start_new_connection, bool blocking, Connection *& connection_out); /// Check if we can try to produce new READY replica. - bool canGetNewConnection() const { return ready_indexes.size() + failed_pools_count < shuffled_pools.size(); } + bool canGetNewConnection() const { return ready_replicas_count + failed_pools_count < shuffled_pools.size(); } /// Stop working with all replicas that are not READY. void stopChoosingReplicas(); @@ -78,9 +78,11 @@ public: ~HedgedConnectionsFactory(); private: - State startEstablishingConnection(int index, Connection *& connection_out); + /// Try to start establishing connection to the new replica. Return + /// the index of the new replica or -1 if cannot start new connection. + int startEstablishingNewConnection(Connection *& connection_out); - State processConnectionEstablisherStage(int replica_index, bool remove_from_epoll = false); + void processConnectionEstablisherStage(int replica_index, bool remove_from_epoll = false); /// Find an index of the next free replica to start connection. /// Return -1 if there is no free replica. @@ -88,20 +90,20 @@ private: int getReadyFileDescriptor(bool blocking); + int checkPendingData(); + void addTimeouts(int replica_index); - void addTimeoutToReplica(ConnectionTimeoutType type, int replica_index); + void resetReplicaTimeouts(int replica_index); - void removeTimeoutsFromReplica(int replica_index); + void processFailedConnection(int replica_index, bool remove_from_epoll); - void processFailedConnection(int replica_index); + void processSocketEvent(int replica_index, Connection *& connection_out); - State processReplicaEvent(int replica_index, Connection *& connection_out); - - bool processTimeoutEvent(int replica_index, ConnectionTimeoutDescriptorPtr timeout_descriptor); + void processReceiveTimeout(int replica_index); /// Return NOT_READY state if there is no ready events, READY if replica is ready - /// and EMPTY if we need to try next replica. + /// and CANNOT_CHOOSE if there is no more events in epoll. State processEpollEvents(bool blocking, Connection *& connection_out); State setBestUsableReplica(Connection *& connection_out); @@ -111,20 +113,16 @@ private: const ConnectionTimeouts timeouts; std::shared_ptr table_to_check; - std::vector connection_establishers; + std::vector replicas; std::vector shuffled_pools; - std::vector> replicas_timeouts; - /// Map socket file descriptor to replica index. std::unordered_map fd_to_replica_index; - /// Map timeout file descriptor to replica index. - std::unordered_map timeout_fd_to_replica_index; /// Indexes of replicas, that are in process of connection. 
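// Sketch of the nested-epoll layout used by ReplicaStatus above: every replica owns a small
// epoll holding its own descriptors (socket plus the two timers), and that epoll's file
// descriptor is registered in one outer epoll. A wait on the outer epoll identifies the
// replica; a non-blocking wait on the inner epoll identifies which of its descriptors fired.
// Linux-specific illustration, not the actual ClickHouse Epoll wrapper.
#include <sys/epoll.h>
#include <sys/timerfd.h>
#include <unistd.h>
#include <cstdio>

int main()
{
    int outer = epoll_create1(0);
    int replica_epoll = epoll_create1(0);

    /// The replica's own descriptor set: here just one timer that fires almost immediately.
    int timer_fd = timerfd_create(CLOCK_MONOTONIC, 0);
    itimerspec spec{};
    spec.it_value.tv_nsec = 1;
    timerfd_settime(timer_fd, 0, &spec, nullptr);

    epoll_event ev{};
    ev.events = EPOLLIN;
    ev.data.fd = timer_fd;
    epoll_ctl(replica_epoll, EPOLL_CTL_ADD, timer_fd, &ev);

    /// Register the replica's epoll inside the outer epoll (an epoll fd is itself pollable).
    ev.data.fd = replica_epoll;
    epoll_ctl(outer, EPOLL_CTL_ADD, replica_epoll, &ev);

    epoll_event outer_ready{};
    if (epoll_wait(outer, &outer_ready, 1, -1) == 1 && outer_ready.data.fd == replica_epoll)
    {
        epoll_event inner_ready{};
        if (epoll_wait(replica_epoll, &inner_ready, 1, 0) == 1 && inner_ready.data.fd == timer_fd)
            std::printf("timer of this replica expired\n");
    }

    close(timer_fd);
    close(replica_epoll);
    close(outer);
}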
- std::unordered_set indexes_in_process; + size_t replicas_in_process_count = 0; /// Indexes of ready replicas. - std::unordered_set ready_indexes; + size_t ready_replicas_count = 0; int last_used_index = -1; bool fallback_to_stale_replicas; @@ -137,8 +135,5 @@ private: size_t max_tries; }; -/// Create ConnectionTimeoutDescriptor with particular type. -ConnectionTimeoutDescriptorPtr createConnectionTimeoutDescriptor(ConnectionTimeoutType type, const ConnectionTimeouts & timeouts); - } #endif diff --git a/src/Common/Epoll.cpp b/src/Common/Epoll.cpp index da3a4c4c04b..770807d7c9c 100644 --- a/src/Common/Epoll.cpp +++ b/src/Common/Epoll.cpp @@ -20,6 +20,11 @@ Epoll::Epoll() : events_count(0) throwFromErrno("Cannot open epoll descriptor", DB::ErrorCodes::EPOLL_ERROR); } +Epoll::Epoll(Epoll && other) : epoll_fd(other.epoll_fd), events_count(other.events_count) +{ + other.epoll_fd = -1; +} + void Epoll::add(int fd, void * ptr) { epoll_event event; @@ -45,6 +50,9 @@ void Epoll::remove(int fd) size_t Epoll::getManyReady(int max_events, epoll_event * events_out, bool blocking, AsyncCallback async_callback) const { + if (events_count == 0) + throw Exception("There is no events in epoll", ErrorCodes::LOGICAL_ERROR); + int ready_size = 0; int timeout = blocking && !async_callback ? -1 : 0; do @@ -64,7 +72,8 @@ size_t Epoll::getManyReady(int max_events, epoll_event * events_out, bool blocki Epoll::~Epoll() { - close(epoll_fd); + if (epoll_fd != -1) + close(epoll_fd); } } diff --git a/src/Common/Epoll.h b/src/Common/Epoll.h index 3a91199799b..eb168c22a92 100644 --- a/src/Common/Epoll.h +++ b/src/Common/Epoll.h @@ -11,11 +11,18 @@ namespace DB using AsyncCallback = std::function; -class Epoll : boost::noncopyable +class Epoll { public: Epoll(); + Epoll(const Epoll & other) = delete; + Epoll & operator=(const Epoll & other) = delete; + + Epoll(Epoll && other); + + Epoll & operator=(Epoll && other) = default; + /// Add new file descriptor to epoll. If ptr set to nullptr, epoll_event.data.fd = fd, /// otherwise epoll_event.data.ptr = ptr. void add(int fd, void * ptr = nullptr); diff --git a/src/Common/TimerDescriptor.cpp b/src/Common/TimerDescriptor.cpp index e2b8a0ec928..791e6380a89 100644 --- a/src/Common/TimerDescriptor.cpp +++ b/src/Common/TimerDescriptor.cpp @@ -27,10 +27,16 @@ TimerDescriptor::TimerDescriptor(int clockid, int flags) throwFromErrno("Cannot set O_NONBLOCK for timer_fd", ErrorCodes::CANNOT_FCNTL); } +TimerDescriptor::TimerDescriptor(TimerDescriptor && other) : timer_fd(other.timer_fd) +{ + other.timer_fd = -1; +} + TimerDescriptor::~TimerDescriptor() { /// Do not check for result cause cannot throw exception. 
- close(timer_fd); + if (timer_fd != -1) + close(timer_fd); } void TimerDescriptor::reset() const diff --git a/src/Common/TimerDescriptor.h b/src/Common/TimerDescriptor.h index 6f7003f6980..42f8eb386af 100644 --- a/src/Common/TimerDescriptor.h +++ b/src/Common/TimerDescriptor.h @@ -17,7 +17,7 @@ public: TimerDescriptor(const TimerDescriptor &) = delete; TimerDescriptor & operator=(const TimerDescriptor &) = delete; - TimerDescriptor(TimerDescriptor &&) = default; + TimerDescriptor(TimerDescriptor && other); TimerDescriptor & operator=(TimerDescriptor &&) = default; int getDescriptor() const { return timer_fd; } diff --git a/tests/integration/test_hedged_requests/configs/remote_servers.xml b/tests/integration/test_hedged_requests/configs/remote_servers.xml index 60d2f5891d7..9d753ca2b6a 100644 --- a/tests/integration/test_hedged_requests/configs/remote_servers.xml +++ b/tests/integration/test_hedged_requests/configs/remote_servers.xml @@ -11,6 +11,10 @@ node_2 9000 + + node_3 + 9000 + diff --git a/tests/integration/test_hedged_requests/test.py b/tests/integration/test_hedged_requests/test.py index 20602b1af0a..24dc9202880 100644 --- a/tests/integration/test_hedged_requests/test.py +++ b/tests/integration/test_hedged_requests/test.py @@ -11,60 +11,253 @@ from helpers.network import PartitionManager cluster = ClickHouseCluster(__file__) -# Cluster with 1 shard of 2 replicas. node is the instance with Distributed table. -node = cluster.add_instance( - 'node', with_zookeeper=True, main_configs=['configs/remote_servers.xml'], user_configs=['configs/users.xml']) -node_1 = cluster.add_instance('node_1', with_zookeeper=True, stay_alive=True, user_configs=['configs/users1.xml']) -node_2 = cluster.add_instance('node_2', with_zookeeper=True) +NODES = {'node_' + str(i): None for i in (1, 2, 3)} +NODES['node'] = None -config = ''' - - - <{setting}>30 - - -''' +sleep_time = 30 @pytest.fixture(scope="module") def started_cluster(): + cluster = ClickHouseCluster(__file__) + NODES['node'] = cluster.add_instance( + 'node', with_zookeeper=True, main_configs=['configs/remote_servers.xml'], user_configs=['configs/users.xml']) + + for name in NODES: + if name != 'node': + NODES[name] = cluster.add_instance(name, with_zookeeper=True, stay_alive=True, user_configs=['configs/users1.xml']) + try: cluster.start() - node_1.query('''CREATE TABLE replicated (id UInt32, date Date) ENGINE = - ReplicatedMergeTree('/clickhouse/tables/replicated', 'node_1') ORDER BY id PARTITION BY toYYYYMM(date)''') + for node_id, node in list(NODES.items()): + node.query('''CREATE TABLE replicated (id UInt32, date Date) ENGINE = + ReplicatedMergeTree('/clickhouse/tables/replicated', '{}') ORDER BY id PARTITION BY toYYYYMM(date)'''.format(node_id)) - node_2.query('''CREATE TABLE replicated (id UInt32, date Date) ENGINE = - ReplicatedMergeTree('/clickhouse/tables/replicated', 'node_2') ORDER BY id PARTITION BY toYYYYMM(date)''') - - node.query('''CREATE TABLE distributed (id UInt32, date Date) ENGINE = + NODES['node'].query('''CREATE TABLE distributed (id UInt32, date Date) ENGINE = Distributed('test_cluster', 'default', 'replicated')''') + NODES['node'].query("INSERT INTO distributed VALUES (1, '2020-01-01')") + yield cluster finally: cluster.shutdown() -def process_test(sleep_setting_name, receive_timeout_name): - node_1.replace_config('/etc/clickhouse-server/users.d/users1.xml', config.format(setting=sleep_setting_name)) - # Restart node to make new config relevant - node_1.restart_clickhouse(30) - +config = ''' + + + 
{sleep_before_send_hello} + {sleep_before_send_tables_status} + {sleep_before_send_data} + + +''' + + +def check_query(): # Without hedged requests select query will last more than 30 seconds, # with hedged requests it will last just around 1-2 second start = time.time() - node.query("SELECT * FROM distributed"); + NODES['node'].query("SELECT * FROM distributed"); query_time = time.time() - start - print(query_time) + + assert query_time < 5 -def test_hedged_requests(started_cluster): - node.query("INSERT INTO distributed VALUES (1, '2020-01-01')") +def test_send_hello_sleep(started_cluster): + NODES['node_1'].replace_config( + '/etc/clickhouse-server/users.d/users1.xml', + config.format(sleep_before_send_hello=sleep_time, sleep_before_send_tables_status=0, sleep_before_send_data=0)) - process_test("sleep_before_send_hello", "receive_hello_timeout") - process_test("sleep_before_send_tables_status", "receive_tables_status_timeout") - process_test("sleep_before_send_data", "receive_data_timeout") + NODES['node_1'].restart_clickhouse(sleep_time) + NODES['node_1'].query('SYSTEM SYNC REPLICA replicated') + + check_query() + + +def test_send_hello_sleep2(started_cluster): + NODES['node_1'].replace_config( + '/etc/clickhouse-server/users.d/users1.xml', + config.format(sleep_before_send_hello=sleep_time, sleep_before_send_tables_status=0, sleep_before_send_data=0)) + + NODES['node_1'].restart_clickhouse(sleep_time) + NODES['node_1'].query('SYSTEM SYNC REPLICA replicated') + + + NODES['node_2'].replace_config( + '/etc/clickhouse-server/users.d/users1.xml', + config.format(sleep_before_send_hello=sleep_time, sleep_before_send_tables_status=0, sleep_before_send_data=0)) + + NODES['node_2'].restart_clickhouse(sleep_time) + NODES['node_2'].query('SYSTEM SYNC REPLICA replicated') + + check_query() + + +def test_send_table_status_sleep(started_cluster): + NODES['node_1'].replace_config( + '/etc/clickhouse-server/users.d/users1.xml', + config.format(sleep_before_send_hello=0, sleep_before_send_tables_status=sleep_time, sleep_before_send_data=0)) + + NODES['node_1'].restart_clickhouse(sleep_time) + NODES['node_1'].query('SYSTEM SYNC REPLICA replicated') + + check_query() + + +def test_send_table_status_sleep2(started_cluster): + NODES['node_1'].replace_config( + '/etc/clickhouse-server/users.d/users1.xml', + config.format(sleep_before_send_hello=0, sleep_before_send_tables_status=sleep_time, sleep_before_send_data=0)) + + NODES['node_1'].restart_clickhouse(sleep_time) + NODES['node_1'].query('SYSTEM SYNC REPLICA replicated') + + NODES['node_2'].replace_config( + '/etc/clickhouse-server/users.d/users1.xml', + config.format(sleep_before_send_hello=0, sleep_before_send_tables_status=sleep_time, sleep_before_send_data=0)) + + NODES['node_2'].restart_clickhouse(sleep_time) + NODES['node_2'].query('SYSTEM SYNC REPLICA replicated') + + check_query() + +def test_send_data(started_cluster): + NODES['node_1'].replace_config( + '/etc/clickhouse-server/users.d/users1.xml', + config.format(sleep_before_send_hello=0, sleep_before_send_tables_status=0, sleep_before_send_data=sleep_time)) + + NODES['node_1'].restart_clickhouse(sleep_time) + NODES['node_1'].query('SYSTEM SYNC REPLICA replicated') + + check_query() + + +def test_send_data2(started_cluster): + NODES['node_1'].replace_config( + '/etc/clickhouse-server/users.d/users1.xml', + config.format(sleep_before_send_hello=0, sleep_before_send_tables_status=0, sleep_before_send_data=sleep_time)) + + NODES['node_1'].restart_clickhouse(sleep_time) + 
NODES['node_1'].query('SYSTEM SYNC REPLICA replicated') + + NODES['node_2'].replace_config( + '/etc/clickhouse-server/users.d/users1.xml', + config.format(sleep_before_send_hello=0, sleep_before_send_tables_status=0, sleep_before_send_data=sleep_time)) + + NODES['node_2'].restart_clickhouse(sleep_time) + NODES['node_2'].query('SYSTEM SYNC REPLICA replicated') + + check_query() + + +def test_combination1(started_cluster): + NODES['node_1'].replace_config( + '/etc/clickhouse-server/users.d/users1.xml', + config.format(sleep_before_send_hello=sleep_time, sleep_before_send_tables_status=0, sleep_before_send_data=0)) + + NODES['node_1'].restart_clickhouse(sleep_time) + NODES['node_1'].query('SYSTEM SYNC REPLICA replicated') + + NODES['node_2'].replace_config( + '/etc/clickhouse-server/users.d/users1.xml', + config.format(sleep_before_send_hello=0, sleep_before_send_tables_status=sleep_time, sleep_before_send_data=0)) + + NODES['node_2'].restart_clickhouse(sleep_time) + NODES['node_2'].query('SYSTEM SYNC REPLICA replicated') + + check_query() + + +def test_combination2(started_cluster): + NODES['node_1'].replace_config( + '/etc/clickhouse-server/users.d/users1.xml', + config.format(sleep_before_send_hello=sleep_time, sleep_before_send_tables_status=0, sleep_before_send_data=0)) + + NODES['node_1'].restart_clickhouse(sleep_time) + NODES['node_1'].query('SYSTEM SYNC REPLICA replicated') + + NODES['node_2'].replace_config( + '/etc/clickhouse-server/users.d/users1.xml', + config.format(sleep_before_send_hello=0, sleep_before_send_tables_status=0, sleep_before_send_data=sleep_time)) + + NODES['node_2'].restart_clickhouse(sleep_time) + NODES['node_2'].query('SYSTEM SYNC REPLICA replicated') + + check_query() + + +def test_combination3(started_cluster): + NODES['node_1'].replace_config( + '/etc/clickhouse-server/users.d/users1.xml', + config.format(sleep_before_send_hello=0, sleep_before_send_tables_status=sleep_time, sleep_before_send_data=0)) + + NODES['node_1'].restart_clickhouse(sleep_time) + NODES['node_1'].query('SYSTEM SYNC REPLICA replicated') + + NODES['node_2'].replace_config( + '/etc/clickhouse-server/users.d/users1.xml', + config.format(sleep_before_send_hello=sleep_time, sleep_before_send_tables_status=0, sleep_before_send_data=0)) + + NODES['node_2'].restart_clickhouse(sleep_time) + NODES['node_2'].query('SYSTEM SYNC REPLICA replicated') + + check_query() + + +def test_combination4(started_cluster): + NODES['node_1'].replace_config( + '/etc/clickhouse-server/users.d/users1.xml', + config.format(sleep_before_send_hello=0, sleep_before_send_tables_status=sleep_time, sleep_before_send_data=0)) + + NODES['node_1'].restart_clickhouse(sleep_time) + NODES['node_1'].query('SYSTEM SYNC REPLICA replicated') + + NODES['node_2'].replace_config( + '/etc/clickhouse-server/users.d/users1.xml', + config.format(sleep_before_send_hello=0, sleep_before_send_tables_status=0, sleep_before_send_data=sleep_time)) + + NODES['node_2'].restart_clickhouse(sleep_time) + NODES['node_2'].query('SYSTEM SYNC REPLICA replicated') + + check_query() + +def test_combination5(started_cluster): + NODES['node_1'].replace_config( + '/etc/clickhouse-server/users.d/users1.xml', + config.format(sleep_before_send_hello=0, sleep_before_send_tables_status=0, sleep_before_send_data=sleep_time)) + + NODES['node_1'].restart_clickhouse(sleep_time) + NODES['node_1'].query('SYSTEM SYNC REPLICA replicated') + + NODES['node_2'].replace_config( + '/etc/clickhouse-server/users.d/users1.xml', + config.format(sleep_before_send_hello=sleep_time, 
sleep_before_send_tables_status=0, sleep_before_send_data=0)) + + NODES['node_2'].restart_clickhouse(sleep_time) + NODES['node_2'].query('SYSTEM SYNC REPLICA replicated') + + check_query() + + +def test_combination6(started_cluster): + NODES['node_1'].replace_config( + '/etc/clickhouse-server/users.d/users1.xml', + config.format(sleep_before_send_hello=0, sleep_before_send_tables_status=0, sleep_before_send_data=sleep_time)) + + NODES['node_1'].restart_clickhouse(sleep_time) + NODES['node_1'].query('SYSTEM SYNC REPLICA replicated') + + NODES['node_2'].replace_config( + '/etc/clickhouse-server/users.d/users1.xml', + config.format(sleep_before_send_hello=0, sleep_before_send_tables_status=sleep_time, sleep_before_send_data=0)) + + NODES['node_2'].restart_clickhouse(sleep_time) + NODES['node_2'].query('SYSTEM SYNC REPLICA replicated') + + check_query() diff --git a/tests/integration/test_hedged_requests_parallel/configs/remote_servers.xml b/tests/integration/test_hedged_requests_parallel/configs/remote_servers.xml index 9d753ca2b6a..63767185b34 100644 --- a/tests/integration/test_hedged_requests_parallel/configs/remote_servers.xml +++ b/tests/integration/test_hedged_requests_parallel/configs/remote_servers.xml @@ -15,6 +15,10 @@ node_3 9000 + + node_4 + 9000 +
diff --git a/tests/integration/test_hedged_requests_parallel/configs/users1.xml b/tests/integration/test_hedged_requests_parallel/configs/users1.xml index 5fe444b94ff..2a54396feca 100644 --- a/tests/integration/test_hedged_requests_parallel/configs/users1.xml +++ b/tests/integration/test_hedged_requests_parallel/configs/users1.xml @@ -2,7 +2,6 @@ - 30 diff --git a/tests/integration/test_hedged_requests_parallel/test.py b/tests/integration/test_hedged_requests_parallel/test.py index 08c5c0d3cd1..95e32a0f3fc 100644 --- a/tests/integration/test_hedged_requests_parallel/test.py +++ b/tests/integration/test_hedged_requests_parallel/test.py @@ -11,45 +11,114 @@ from helpers.network import PartitionManager cluster = ClickHouseCluster(__file__) -# Cluster with 1 shard of 3 replicas. node is the instance with Distributed table. -node = cluster.add_instance( - 'node', with_zookeeper=True, main_configs=['configs/remote_servers.xml'], user_configs=['configs/users.xml']) -node_1 = cluster.add_instance('node_1', with_zookeeper=True, user_configs=['configs/users1.xml']) -node_2 = cluster.add_instance('node_2', with_zookeeper=True) -node_3 = cluster.add_instance('node_3', with_zookeeper=True) +NODES = {'node_' + str(i): None for i in (1, 2, 3, 4)} +NODES['node'] = None +sleep_time = 30 @pytest.fixture(scope="module") def started_cluster(): + cluster = ClickHouseCluster(__file__) + NODES['node'] = cluster.add_instance( + 'node', with_zookeeper=True, main_configs=['configs/remote_servers.xml'], user_configs=['configs/users.xml']) + + for name in NODES: + if name != 'node': + NODES[name] = cluster.add_instance(name, with_zookeeper=True, stay_alive=True, user_configs=['configs/users1.xml']) + try: cluster.start() - node_1.query('''CREATE TABLE replicated (id UInt32, date Date) ENGINE = - ReplicatedMergeTree('/clickhouse/tables/replicated', 'node_1') ORDER BY id PARTITION BY toYYYYMM(date)''') + for node_id, node in list(NODES.items()): + node.query('''CREATE TABLE replicated (id UInt32, date Date) ENGINE = + ReplicatedMergeTree('/clickhouse/tables/replicated', '{}') ORDER BY id PARTITION BY toYYYYMM(date)'''.format(node_id)) - node_2.query('''CREATE TABLE replicated (id UInt32, date Date) ENGINE = - ReplicatedMergeTree('/clickhouse/tables/replicated', 'node_2') ORDER BY id PARTITION BY toYYYYMM(date)''') - - node_3.query('''CREATE TABLE replicated (id UInt32, date Date) ENGINE = - ReplicatedMergeTree('/clickhouse/tables/replicated', 'node_3') ORDER BY id PARTITION BY toYYYYMM(date)''') - - node.query('''CREATE TABLE distributed (id UInt32, date Date) ENGINE = + NODES['node'].query('''CREATE TABLE distributed (id UInt32, date Date) ENGINE = Distributed('test_cluster', 'default', 'replicated')''') + NODES['node'].query("INSERT INTO distributed VALUES (1, '2020-01-01'), (2, '2020-01-02')") + yield cluster finally: cluster.shutdown() -def test_hedged_requests_with_max_parallel_replicas(started_cluster): - node.query("INSERT INTO distributed VALUES (1, '2020-01-01')") - - # Without hedged requests select query will last more 30 seconds, - # with hedged requests it will last just over 2 seconds + +config = ''' + + + {sleep_before_send_hello} + {sleep_before_send_tables_status} + {sleep_before_send_data} + + +''' + + +def check_query(): + # Without hedged requests select query will last more than 30 seconds, + # with hedged requests it will last just around 1-2 second start = time.time() - node.query("SELECT * FROM distributed"); + NODES['node'].query("SELECT * FROM distributed"); query_time = time.time() - start - 
print(query_time) + + assert query_time < 5 + + +def test_send_hello_sleep(started_cluster): + NODES['node_1'].replace_config( + '/etc/clickhouse-server/users.d/users1.xml', + config.format(sleep_before_send_hello=sleep_time, sleep_before_send_tables_status=0, sleep_before_send_data=0)) + + NODES['node_1'].restart_clickhouse(sleep_time) + NODES['node_1'].query('SYSTEM SYNC REPLICA replicated') + + + NODES['node_2'].replace_config( + '/etc/clickhouse-server/users.d/users1.xml', + config.format(sleep_before_send_hello=sleep_time, sleep_before_send_tables_status=0, sleep_before_send_data=0)) + + NODES['node_2'].restart_clickhouse(sleep_time) + NODES['node_2'].query('SYSTEM SYNC REPLICA replicated') + + check_query() + + +def test_send_table_status_sleep(started_cluster): + NODES['node_1'].replace_config( + '/etc/clickhouse-server/users.d/users1.xml', + config.format(sleep_before_send_hello=0, sleep_before_send_tables_status=sleep_time, sleep_before_send_data=0)) + + NODES['node_1'].restart_clickhouse(sleep_time) + NODES['node_1'].query('SYSTEM SYNC REPLICA replicated') + + NODES['node_2'].replace_config( + '/etc/clickhouse-server/users.d/users1.xml', + config.format(sleep_before_send_hello=0, sleep_before_send_tables_status=sleep_time, sleep_before_send_data=0)) + + NODES['node_2'].restart_clickhouse(sleep_time) + NODES['node_2'].query('SYSTEM SYNC REPLICA replicated') + + check_query() + + +def test_send_data(started_cluster): + NODES['node_1'].replace_config( + '/etc/clickhouse-server/users.d/users1.xml', + config.format(sleep_before_send_hello=0, sleep_before_send_tables_status=0, sleep_before_send_data=sleep_time)) + + NODES['node_1'].restart_clickhouse(sleep_time) + NODES['node_1'].query('SYSTEM SYNC REPLICA replicated') + + NODES['node_2'].replace_config( + '/etc/clickhouse-server/users.d/users1.xml', + config.format(sleep_before_send_hello=0, sleep_before_send_tables_status=0, sleep_before_send_data=sleep_time)) + + NODES['node_2'].restart_clickhouse(sleep_time) + NODES['node_2'].query('SYSTEM SYNC REPLICA replicated') + + check_query() + From eb0847e2b6c9246b4f6d3e9ecf35242b7622f79e Mon Sep 17 00:00:00 2001 From: Pavel Kruglov Date: Mon, 15 Feb 2021 17:44:05 +0300 Subject: [PATCH 0290/2357] Style --- src/Client/HedgedConnectionsFactory.cpp | 2 +- src/Common/Epoll.cpp | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Client/HedgedConnectionsFactory.cpp b/src/Client/HedgedConnectionsFactory.cpp index c4a10379985..c881c2723df 100644 --- a/src/Client/HedgedConnectionsFactory.cpp +++ b/src/Client/HedgedConnectionsFactory.cpp @@ -307,7 +307,7 @@ HedgedConnectionsFactory::State HedgedConnectionsFactory::processEpollEvents(boo continue; } - /// Get ready descriptor fro epoll. + /// Get ready descriptor from epoll. event_fd = getReadyFileDescriptor(blocking); /// Check if there is no events. 
diff --git a/src/Common/Epoll.cpp b/src/Common/Epoll.cpp index 770807d7c9c..628bb45e796 100644 --- a/src/Common/Epoll.cpp +++ b/src/Common/Epoll.cpp @@ -11,6 +11,7 @@ namespace DB namespace ErrorCodes { extern const int EPOLL_ERROR; + extern const int LOGICAL_ERROR; } Epoll::Epoll() : events_count(0) From 3ce33603795d0649ae4fca41ae11aa9918d8b143 Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 15 Feb 2021 18:36:25 +0300 Subject: [PATCH 0291/2357] Some initial code --- src/Coordination/Changelog.cpp | 315 ++++++++++++++++++++++++++ src/Coordination/Changelog.h | 81 +++++++ src/Coordination/InMemoryLogStore.cpp | 8 +- src/Coordination/NuKeeperLogStore.h | 24 ++ 4 files changed, 424 insertions(+), 4 deletions(-) create mode 100644 src/Coordination/Changelog.cpp create mode 100644 src/Coordination/Changelog.h create mode 100644 src/Coordination/NuKeeperLogStore.h diff --git a/src/Coordination/Changelog.cpp b/src/Coordination/Changelog.cpp new file mode 100644 index 00000000000..a38f039fa40 --- /dev/null +++ b/src/Coordination/Changelog.cpp @@ -0,0 +1,315 @@ +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int CHECKSUM_DOESNT_MATCH; + extern const int CORRUPTED_DATA; + extern const int UNKNOWN_FORMAT_VERSION; + extern const int LOGICAL_ERROR; + extern const int UNIMPLEMENTED; +} + + +std::string toString(const ChangelogVersion & version) +{ + if (version == ChangelogVersion::V0) + return "V0"; + + throw Exception(ErrorCodes::UNKNOWN_FORMAT_VERSION, "Unknown chagelog version {}", static_cast(version)); +} + +ChangelogVersion fromString(const std::string & version_str) +{ + if (version == "V0") + return ChangelogVersion::V0; + + throw Exception(ErrorCodes::UNKNOWN_FORMAT_VERSION, "Unknown chagelog version {}", version_str); +} + +namespace +{ + +static constexpr auto DEFAULT_PREFIX = "changelog"; + +struct ChangelogName +{ + std::string prefix; + ChangelogVersion version; + size_t from_log_idx; + size_t to_log_idx; +}; + +std::string formatChangelogPath(const std::string & prefix, const ChangelogVersion & version, const ChangelogName & name) +{ + std::filesystem::path path(prefix); + path /= std::filesystem::path(name.prefix + "_" + toString(version) + "_" + std::to_string(name.from_log_idx) + "_" + std::to_string(name.to_log_idx) + ".log"); + return path.to_string(); +} + +ChangelogName getChangelogName(const std::string & path_str) +{ + std::filesystem::path path(path_str); + std:string filename = path.stem(); + Strings filename_parts; + boost::split(filename_parts, filename, boost::is_any_of("_")); + if (filename_parts.size() < 4) + throw Exception(ErrorCodes::CORRUPTED_DATA, "Invalid changelog {}", path_str); + + ChangelogName result; + result.prefix = filename_parts[0]; + result.version = fromString(filename_parts[1]); + result.form_log_idx = parse(filename_parts[2]); + result.to_log_idx = parse(filename_parts[3]); + return result; +} + +} + +class ChangelogWriter +{ +public: + ChangelogWriter(const std::string & filepath_, WriteMode mode, size_t start_index_) + : filepath(filepath_) + , plain_buf(filepath, DBMS_DEFAULT_BUFFER_SIZE, mode == WriteMode::Rewrite ? 
-1 : (O_APPEND | O_CREAT | O_WRONLY)) + , start_index(start_index_) + {} + + + off_t appendRecord(ChangelogRecord && record, bool sync) + { + off_t result = plain_buf.count(); + writeIntBinary(record.header.version, plain_buf); + writeIntBinary(record.header.index, plain_buf); + writeIntBinary(record.header.term, plain_buf); + writeIntBinary(record.header.value_type, plain_buf); + writeIntBinary(record.header.blob_size, plain_buf); + writeIntBinary(record.header.blob_checksum, plain_buf); + + if (record.blob_size != 0) + plain_buf.write(reinterpret_cast(record.blob->data_begin()), record.blob->size()); + + entries_written++; + + if (sync) + plain_buf.sync(); + reeturn result; + } + + void truncateToLength(off_t new_length) + { + flush(); + plain_buf.truncate(new_length); + } + + void flush() + { + plain_buf.sync(); + } + + size_t getEntriesWritten() const + { + return entries_written; + } + + size_t setEntriesWritten(size_t entries_written_) + { + entries_written = entries_written_; + } + + size_t getStartIndex() const + { + return start_index; + } + + void setStartIndex(size_t start_index_) + { + start_index = start_index_; + } + +private: + std::string filepath; + WriteBufferFromFile plain_buf; + size_t entries_written = 0; + size_t start_index; +}; + + +class ChangelogReader +{ +public: + explicit ChangelogReader(const std::string & filepath_) + : filepath(filepath_) + , read_buf(filepath) + {} + + size_t readChangelog(Changelog & changelog, IndexToOffset & index_to_offset) + { + size_t total_read = 0; + while (!read_buf.eof()) + { + total_read += 1; + off_t pos = read_buf.count(); + ChangelogRecord record; + readIntBinary(record.header.version, read_buf); + readIntBinary(record.header.index, read_buf); + readIntBinary(record.header.term, read_buf); + readIntBinary(record.header.value_type, read_buf); + readIntBinary(record.header.blob_size, read_buf); + readIntBinary(record.header.blob_checksum, read_buf); + auto buffer = nuraft::buffer::alloc(record.header.blob_size); + auto buffer_begin = reinterpret_cast(buffer->data_begin()); + read_buf.readStrict(buffer_begin, record.header.blob_size); + index_to_offset[record.header.index] = pos; + + Checksum checksum = CityHash_v1_0_2::CityHash128(buffer_begin, record.header.blob_size); + if (checksum != record.header.blob_checksum) + { + throw Exception(ErrorCodes::CHECKSUM_DOESNT_MATCH, + "Checksums doesn't match for log {} (version {}), index {}, blob_size {}", + filepath, record.header.version, record.header.index, record.header.blob_size); + } + + if (changlog.start_idx == 0) + changelog.start_idx = record.header.index; + + if (!changelog.try_emplace(record.header.index, buffer).second) + throw Exception(ErrorCodes::CORRUPTED_DATA, "Duplicated index id {} in log {}", record.header.index, filename); + } + return total_read; + } +private: + std::string filepath; + ReadBufferFromFile read_buf; +}; + +ChangelogOnDiskHelper::ChangelogOnDiskHelper(const std::string & changelogs_dir, size_t rotate_interval_) + : changelogs_dir(chagelogs_dir_) + , rotate_interval(rotate_interval_) +{ + namespace fs = std::filesystem; + for(const auto & p : fs::directory_iterator(changelogs_dir)) + existing_changelogs.push_back(p.path()); +} + +Changelog ChangelogOnDiskHelper::readChangelogAndInitWriter(size_t from_log_idx) +{ + Changelog result; + size_t read_from_last = 0; + for (const std::string & changelog_file : existing_changelogs) + { + ChangelogName parsed_name = getChangelogName(changelog_file); + if (parsed_name.to_log_idx >= from_log_idx) + { + 
ChangelogReader reader(changelog_file); + read_from_last = reader.readChangelog(result, index_to_start_pos); + } + } + if (existing_changelogs.size() > 0 && read_from_last < rotate_interval) + { + auto parsed_name = getChangelogName(existing_changelogs.back()); + current_writer = std::make_unique(existing_changelogs.back(), WriteMode::Append, parsed_name.from_log_idx); + current_writer->setEntriesWritten(read_from_last); + } + else + { + rotate(from_log_idx); + } + return result; +} + +void ChangelogOnDiskHelper::rotate(size_t new_start_log_idx) +{ + if (current_writer) + current_writer->flush(); + + ChangelogName new_name; + new_name.prefix = changelogs_dir; + new_name.version = CURRENT_CHANGELOG_VERSION; + new_name.from_log_idx = new_start_log_idx; + new_name.to_log_idx = new_start_log_idx; + + auto new_log_path = formatChagelogPath(changelogs_dir, CURRENT_CHANGELOG_VERSION, new_name); + existing_changelogs.push_back(new_log_path); + current_writer = std::make_unique(existing_changelogs.back(), WriteMode::Rewrite, new_start_log_idx); +} + +ChangelogRecord ChangelogOnDiskHelper::buildRecord(size_t index, nuraft::ptr log_entry) const +{ + ChangelogRecordHeader header; + header.index = index; + header.term = log_entry->get_term(); + header.value_type = log_entry->get_val_type(); + auto buffer = log_entry->get_buf_ptr(); + if (buffer) + { + header.blob_size = buffer->size(); + header.blob_checksum = CityHash_v1_0_2::CityHash128(reinterpret_cast(buffer->data_begin()), buffer->size()); + } + else + { + header.blob_size = 0; + header.blob_checksum = 0; + } + + return ChangelogRecord{header, buffer}; +} + +void ChangelogOnDiskHelper::appendRecord(size_t index, nuraft::ptr log_entry) +{ + if (!current_writer) + throw Exception(ErrorCodes::LOGICAL_ERROR, "ChangelogOnDiskHelper must be initialized before appending records"); + + if (current_writer->getEntriesWritten() == rotate_interval) + rotate(index); + + auto offset = current_writer->appendRecord(buildRecord(index, log_entry), true); + if (!index_to_start_pos.try_emplace(index, offset).second) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Record with index {} already exists", index); + +} + +void ChangelogOnDiskHelper::writeAt(size_t index, nuraft::ptr log_entry) +{ + if (index < current_writer->getStartIndex()) + throw Exception(ErrorCodes::UNIMPLEMENTED, "Currently cannot overwrite index from previous file"); + + auto entries_written = current_writer->getEntriesWritten(); + current_writer->truncateToLength(index_to_start_pos(index)); + for (auto itr = index_to_start_pos.begin(); itr != index_to_start_pos.end();) + { + if (itr->first >= index) + { + entries_written--; + itr = index_to_start_pos.erase(itr); + } + else + itr++; + } + + current_writer->setEntriesWritten(entries_written); + + appendRecord(index, log_entry); +} + +void ChangelogOnDiskHelper::compact(size_t up_to_log_idx) +{ + for (auto itr = existing_changelogs.begin(); itr != existing_changelogs.end();) + { + ChangelogName parsed_name = getChangelogName(*itr); + if (parsed_name.to_log_idx <= up_to_log_idx) + { + std::filesystem::remove(itr); + itr = existing_changelogs.erase(itr); + for (size_t idx = parsed_name.from_log_idx; idx <= parsed_name.to_log_idx; ++idx) + index_to_start_pos.erase(idx); + } + } +} + +} diff --git a/src/Coordination/Changelog.h b/src/Coordination/Changelog.h new file mode 100644 index 00000000000..ffcd2a353bb --- /dev/null +++ b/src/Coordination/Changelog.h @@ -0,0 +1,81 @@ +#pragma once + +#include // Y_IGNORE +#include +#include +#include +#include 
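// Sketch of the on-disk changelog record layout implemented above: a header (version, index,
// term, value type, blob size, checksum) followed by the blob, with the starting offset of
// every record remembered so that a later writeAt() can truncate the file back to that
// position. Deliberately simplified: the header is dumped in one write here, while the real
// writer serializes each field individually, and the checksum below is a trivial byte hash
// standing in for the CityHash128 of the blob used by the actual code.
#include <cstdint>
#include <fstream>
#include <map>
#include <vector>

struct RecordHeader
{
    uint8_t version = 0;
    uint64_t index = 0;
    uint64_t term = 0;
    uint8_t value_type = 0;
    uint64_t blob_size = 0;
    uint64_t blob_checksum = 0;
};

uint64_t trivialChecksum(const std::vector<char> & blob)
{
    uint64_t sum = 0;
    for (char c : blob)
        sum = sum * 131 + static_cast<unsigned char>(c);
    return sum;
}

int main()
{
    std::map<uint64_t, std::streamoff> index_to_start_pos;
    std::ofstream out("changelog_1_100.log", std::ios::binary);

    std::vector<char> blob = {'s', 'e', 't', ' ', '/', 'a'};
    RecordHeader header;
    header.index = 1;
    header.term = 1;
    header.blob_size = blob.size();
    header.blob_checksum = trivialChecksum(blob);

    index_to_start_pos[header.index] = out.tellp(); /// remember where the record starts
    out.write(reinterpret_cast<const char *>(&header), sizeof(header));
    out.write(blob.data(), static_cast<std::streamsize>(blob.size()));
    out.flush();

    /// To overwrite entry 1 and everything after it, a writer truncates the file back to
    /// index_to_start_pos[1] and appends the new record, which is what writeAt() does above.
}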
+#include + +namespace DB +{ + +using Checksum = CityHash_v1_0_2::uint128; + +enum class ChangelogVersion : uint8_t +{ + V0 = 0, +}; + +std::string toString(const ChangelogVersion & version); +ChangelogVersion fromString(const std::string & version_str); + +static constexpr auto CURRENT_CHANGELOG_VERSION = ChangeLogVersion::V0; + +struct ChangelogRecordHeader +{ + ChangelogVersion version = CURRENT_CHANGELOG_VERSION; + size_t index; + size_t term; + nuraft::log_val_type value_type; + size_t blob_size; + Checksum blob_checksum; +}; + +struct ChangelogRecord +{ + ChangelogRecordHeader header; + nuraft::ptr blob; +}; + +using IndexToOffset = std::unordered_map; +using IndexToLogEntry = std::map>; + +struct Changelog +{ +public: +private: + IndexToLogEntry logs; + size_t start_idx = 0; +}; + +class ChangelogWriter; + +class ChangelogOnDiskHelper +{ + +public: + ChangelogOnDiskHelper(const std::string & changelogs_dir_, size_t rotate_interval_); + + Changelog readChangelogAndInitWriter(size_t from_log_idx); + + void appendRecord(size_t index, nuraft::ptr log_entry); + + void writeAt(size_t index, nuraft::ptr log_entry); + + void compact(size_t up_to_log_idx); + +private: + void rotate(size_t new_start_log_idex); + + ChangelogRecord buildRecord(size_t index, nuraft::ptr log_entry) const; + +private: + std::string changelogs_dir; + std::deque existing_changelogs; + std::unique_ptr current_writer; + IndexToOffset index_to_start_pos; + const size_t rotate_interval; +}; + +} diff --git a/src/Coordination/InMemoryLogStore.cpp b/src/Coordination/InMemoryLogStore.cpp index 101458891e7..877c8a60a2a 100644 --- a/src/Coordination/InMemoryLogStore.cpp +++ b/src/Coordination/InMemoryLogStore.cpp @@ -72,12 +72,12 @@ nuraft::ptr>> InMemoryLogStore::log_e ret->resize(end - start); size_t cc = 0; - for (size_t ii = start; ii < end; ++ii) + for (size_t i = start; i < end; ++i) { nuraft::ptr src = nullptr; { std::lock_guard l(logs_lock); - auto entry = logs.find(ii); + auto entry = logs.find(i); if (entry == logs.end()) { entry = logs.find(0); @@ -152,9 +152,9 @@ void InMemoryLogStore::apply_pack(size_t index, nuraft::buffer & pack) pack.pos(0); Int32 num_logs = pack.get_int(); - for (Int32 ii = 0; ii < num_logs; ++ii) + for (Int32 i = 0; i < num_logs; ++i) { - size_t cur_idx = index + ii; + size_t cur_idx = index + i; Int32 buf_size = pack.get_int(); nuraft::ptr buf_local = nuraft::buffer::alloc(buf_size); diff --git a/src/Coordination/NuKeeperLogStore.h b/src/Coordination/NuKeeperLogStore.h new file mode 100644 index 00000000000..2d066ac3e3a --- /dev/null +++ b/src/Coordination/NuKeeperLogStore.h @@ -0,0 +1,24 @@ +#pragma once +#include // Y_IGNORE +#include +#include +#include +#include + +namespace DB +{ + +class NuKeeperLogStore : public nuraft::log_store +{ +public: + NuKeeperLogStore(const std::string & changelogs_path, size_t rotate_interval_); + + +private: + mutable std::mutex logs_lock; + std::atomic start_idx; + Changelog in_memory_changelog; + ChangelogOnDiskHelper on_disk_changelog_helper; +}; + +} From d38198dade3b79bcfecbee338d719e38d2c68501 Mon Sep 17 00:00:00 2001 From: lehasm Date: Mon, 15 Feb 2021 18:58:46 +0300 Subject: [PATCH 0292/2357] ru translation --- .../functions/string-functions.md | 43 +++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/docs/ru/sql-reference/functions/string-functions.md b/docs/ru/sql-reference/functions/string-functions.md index aeb0652cc18..b1c4012e9f9 100644 --- a/docs/ru/sql-reference/functions/string-functions.md +++ 
b/docs/ru/sql-reference/functions/string-functions.md @@ -597,4 +597,47 @@ Hello, "world"! 'foo' ``` + +## decodeXMLComponent {#decode-xml-component} + +Заменяет символами предопределенные мнемоники XML: `"` `&` `'` `>` `<` +Также эта функция заменяет числовые ссылки соответствующими символами юникод. +Поддерживаются десятичная (например, `✓`) и шестнадцатеричная (`✓`) формы. + +**Синтаксис** + +``` sql +decodeXMLComponent(x) +``` + +**Параметры** + +- `x` — последовательность символов. [String](../../sql-reference/data-types/string.md). + +**Возвращаемое значение** + +- Строка с произведенными заменами. + +Тип: [String](../../sql-reference/data-types/string.md). + +**Пример** + +Запрос: + +``` sql +SELECT decodeXMLComponent(''foo''); +SELECT decodeXMLComponent('< Σ >'); +``` + +Результат: + +``` text +'foo' +< Σ > +``` + +**Смотрите также** + +- [Мнемоники в HTML](https://ru.wikipedia.org/wiki/%D0%9C%D0%BD%D0%B5%D0%BC%D0%BE%D0%BD%D0%B8%D0%BA%D0%B8_%D0%B2_HTML) + [Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/functions/string_functions/) From 28dec516acb4921cbd2703b1706bfd67964fe651 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Mon, 15 Feb 2021 19:12:03 +0300 Subject: [PATCH 0293/2357] Style --- src/Client/Connection.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Client/Connection.cpp b/src/Client/Connection.cpp index 5ef326acb73..0e8b94ef1cb 100644 --- a/src/Client/Connection.cpp +++ b/src/Client/Connection.cpp @@ -171,7 +171,8 @@ void Connection::sendHello() * Limiting number of possible characters in user-controlled part of handshake * will mitigate this possibility but doesn't solve it completely. */ - auto has_control_character = [](const std::string & s) { + auto has_control_character = [](const std::string & s) + { for (auto c : s) if (isControlASCII(c)) return true; From 5401116988b83cee6e4cf136d95843494c5523f0 Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 15 Feb 2021 20:59:40 +0300 Subject: [PATCH 0294/2357] Compileable code --- src/Coordination/Changelog.cpp | 183 +++++++++++++++++---- src/Coordination/Changelog.h | 54 ++++-- src/Coordination/NuKeeperLogStore.h | 31 +++- src/Coordination/tests/gtest_for_build.cpp | 26 ++- 4 files changed, 238 insertions(+), 56 deletions(-) diff --git a/src/Coordination/Changelog.cpp b/src/Coordination/Changelog.cpp index a38f039fa40..f06185124da 100644 --- a/src/Coordination/Changelog.cpp +++ b/src/Coordination/Changelog.cpp @@ -1,7 +1,11 @@ #include #include #include +#include #include +#include +#include +#include namespace DB { @@ -26,7 +30,7 @@ std::string toString(const ChangelogVersion & version) ChangelogVersion fromString(const std::string & version_str) { - if (version == "V0") + if (version_str == "V0") return ChangelogVersion::V0; throw Exception(ErrorCodes::UNKNOWN_FORMAT_VERSION, "Unknown chagelog version {}", version_str); @@ -49,13 +53,13 @@ std::string formatChangelogPath(const std::string & prefix, const ChangelogVersi { std::filesystem::path path(prefix); path /= std::filesystem::path(name.prefix + "_" + toString(version) + "_" + std::to_string(name.from_log_idx) + "_" + std::to_string(name.to_log_idx) + ".log"); - return path.to_string(); + return path; } ChangelogName getChangelogName(const std::string & path_str) { std::filesystem::path path(path_str); - std:string filename = path.stem(); + std::string filename = path.stem(); Strings filename_parts; boost::split(filename_parts, filename, boost::is_any_of("_")); if 
(filename_parts.size() < 4) @@ -64,11 +68,16 @@ ChangelogName getChangelogName(const std::string & path_str) ChangelogName result; result.prefix = filename_parts[0]; result.version = fromString(filename_parts[1]); - result.form_log_idx = parse(filename_parts[2]); + result.from_log_idx = parse(filename_parts[2]); result.to_log_idx = parse(filename_parts[3]); return result; } +LogEntryPtr makeClone(const LogEntryPtr & entry) +{ + return cs_new(entry->get_term(), nuraft::buffer::clone(entry->get_buf()), entry->get_val_type()); +} + } class ChangelogWriter @@ -91,14 +100,14 @@ public: writeIntBinary(record.header.blob_size, plain_buf); writeIntBinary(record.header.blob_checksum, plain_buf); - if (record.blob_size != 0) + if (record.header.blob_size != 0) plain_buf.write(reinterpret_cast(record.blob->data_begin()), record.blob->size()); entries_written++; if (sync) plain_buf.sync(); - reeturn result; + return result; } void truncateToLength(off_t new_length) @@ -117,7 +126,7 @@ public: return entries_written; } - size_t setEntriesWritten(size_t entries_written_) + void setEntriesWritten(size_t entries_written_) { entries_written = entries_written_; } @@ -148,7 +157,7 @@ public: , read_buf(filepath) {} - size_t readChangelog(Changelog & changelog, IndexToOffset & index_to_offset) + size_t readChangelog(IndexToLogEntry & logs, size_t start_log_idx, IndexToOffset & index_to_offset) { size_t total_read = 0; while (!read_buf.eof()) @@ -174,12 +183,12 @@ public: "Checksums doesn't match for log {} (version {}), index {}, blob_size {}", filepath, record.header.version, record.header.index, record.header.blob_size); } + if (record.header.index < start_log_idx) + continue; - if (changlog.start_idx == 0) - changelog.start_idx = record.header.index; - - if (!changelog.try_emplace(record.header.index, buffer).second) - throw Exception(ErrorCodes::CORRUPTED_DATA, "Duplicated index id {} in log {}", record.header.index, filename); + auto log_entry = nuraft::cs_new(record.header.term, buffer, record.header.value_type); + if (!logs.try_emplace(record.header.index, log_entry).second) + throw Exception(ErrorCodes::CORRUPTED_DATA, "Duplicated index id {} in log {}", record.header.index, filepath); } return total_read; } @@ -188,8 +197,8 @@ private: ReadBufferFromFile read_buf; }; -ChangelogOnDiskHelper::ChangelogOnDiskHelper(const std::string & changelogs_dir, size_t rotate_interval_) - : changelogs_dir(chagelogs_dir_) +Changelog::Changelog(const std::string & changelogs_dir_, size_t rotate_interval_) + : changelogs_dir(changelogs_dir_) , rotate_interval(rotate_interval_) { namespace fs = std::filesystem; @@ -197,9 +206,8 @@ ChangelogOnDiskHelper::ChangelogOnDiskHelper(const std::string & changelogs_dir, existing_changelogs.push_back(p.path()); } -Changelog ChangelogOnDiskHelper::readChangelogAndInitWriter(size_t from_log_idx) +void Changelog::readChangelogAndInitWriter(size_t from_log_idx) { - Changelog result; size_t read_from_last = 0; for (const std::string & changelog_file : existing_changelogs) { @@ -207,9 +215,12 @@ Changelog ChangelogOnDiskHelper::readChangelogAndInitWriter(size_t from_log_idx) if (parsed_name.to_log_idx >= from_log_idx) { ChangelogReader reader(changelog_file); - read_from_last = reader.readChangelog(result, index_to_start_pos); + read_from_last = reader.readChangelog(logs, from_log_idx, index_to_start_pos); } } + + start_index = from_log_idx == 0 ? 
1 : from_log_idx; + if (existing_changelogs.size() > 0 && read_from_last < rotate_interval) { auto parsed_name = getChangelogName(existing_changelogs.back()); @@ -220,26 +231,25 @@ Changelog ChangelogOnDiskHelper::readChangelogAndInitWriter(size_t from_log_idx) { rotate(from_log_idx); } - return result; } -void ChangelogOnDiskHelper::rotate(size_t new_start_log_idx) +void Changelog::rotate(size_t new_start_log_idx) { if (current_writer) current_writer->flush(); ChangelogName new_name; - new_name.prefix = changelogs_dir; + new_name.prefix = DEFAULT_PREFIX; new_name.version = CURRENT_CHANGELOG_VERSION; new_name.from_log_idx = new_start_log_idx; new_name.to_log_idx = new_start_log_idx; - auto new_log_path = formatChagelogPath(changelogs_dir, CURRENT_CHANGELOG_VERSION, new_name); + auto new_log_path = formatChangelogPath(changelogs_dir, CURRENT_CHANGELOG_VERSION, new_name); existing_changelogs.push_back(new_log_path); current_writer = std::make_unique(existing_changelogs.back(), WriteMode::Rewrite, new_start_log_idx); } -ChangelogRecord ChangelogOnDiskHelper::buildRecord(size_t index, nuraft::ptr log_entry) const +ChangelogRecord Changelog::buildRecord(size_t index, nuraft::ptr log_entry) const { ChangelogRecordHeader header; header.index = index; @@ -254,16 +264,16 @@ ChangelogRecord ChangelogOnDiskHelper::buildRecord(size_t index, nuraft::ptr log_entry) +void Changelog::appendEntry(size_t index, nuraft::ptr log_entry) { if (!current_writer) - throw Exception(ErrorCodes::LOGICAL_ERROR, "ChangelogOnDiskHelper must be initialized before appending records"); + throw Exception(ErrorCodes::LOGICAL_ERROR, "Changelog must be initialized before appending records"); if (current_writer->getEntriesWritten() == rotate_interval) rotate(index); @@ -271,16 +281,19 @@ void ChangelogOnDiskHelper::appendRecord(size_t index, nuraft::ptrappendRecord(buildRecord(index, log_entry), true); if (!index_to_start_pos.try_emplace(index, offset).second) throw Exception(ErrorCodes::LOGICAL_ERROR, "Record with index {} already exists", index); - + logs[index] = makeClone(log_entry); } -void ChangelogOnDiskHelper::writeAt(size_t index, nuraft::ptr log_entry) +void Changelog::writeAt(size_t index, nuraft::ptr log_entry) { if (index < current_writer->getStartIndex()) throw Exception(ErrorCodes::UNIMPLEMENTED, "Currently cannot overwrite index from previous file"); + if (index_to_start_pos.count(index) == 0) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot write at index {} because changelog doesn't contain it", index); + auto entries_written = current_writer->getEntriesWritten(); - current_writer->truncateToLength(index_to_start_pos(index)); + current_writer->truncateToLength(index_to_start_pos[index]); for (auto itr = index_to_start_pos.begin(); itr != index_to_start_pos.end();) { if (itr->first >= index) @@ -294,22 +307,128 @@ void ChangelogOnDiskHelper::writeAt(size_t index, nuraft::ptr current_writer->setEntriesWritten(entries_written); - appendRecord(index, log_entry); + auto itr = logs.lower_bound(index); + while (itr != logs.end()) + itr = logs.erase(itr); + + appendEntry(index, log_entry); } -void ChangelogOnDiskHelper::compact(size_t up_to_log_idx) +void Changelog::compact(size_t up_to_log_idx) { for (auto itr = existing_changelogs.begin(); itr != existing_changelogs.end();) { ChangelogName parsed_name = getChangelogName(*itr); if (parsed_name.to_log_idx <= up_to_log_idx) { - std::filesystem::remove(itr); + std::filesystem::remove(*itr); itr = existing_changelogs.erase(itr); for (size_t idx = 
parsed_name.from_log_idx; idx <= parsed_name.to_log_idx; ++idx) + { + auto logs_itr = logs.find(idx); + if (logs_itr != logs.end()) + logs.erase(idx); + else + break; index_to_start_pos.erase(idx); + } } } } +LogEntryPtr Changelog::getLastEntry() const +{ + + static LogEntryPtr fake_entry = nuraft::cs_new(0, nuraft::buffer::alloc(sizeof(size_t))); + + size_t next_idx = getNextEntryIndex() - 1; + auto entry = logs.find(next_idx); + if (entry == logs.end()) + return fake_entry; + + return makeClone(entry->second); +} + +LogEntriesPtr Changelog::getLogEntriesBetween(size_t start, size_t end) +{ + LogEntriesPtr ret = nuraft::cs_new>>(); + + ret->resize(end - start); + size_t result_pos = 0; + for (size_t i = start; i < end; ++i) + { + (*ret)[result_pos] = entryAt(i); + result_pos++; + } + return ret; +} + +LogEntryPtr Changelog::entryAt(size_t idx) +{ + nuraft::ptr src = nullptr; + auto entry = logs.find(idx); + if (entry == logs.end()) + return nullptr; + + src = entry->second; + return makeClone(src); +} + +nuraft::ptr Changelog::serializeEntriesToBuffer(size_t index, int32_t cnt) +{ + std::vector> returned_logs; + + size_t size_total = 0; + for (size_t i = index; i < index + cnt; ++i) + { + auto entry = logs.find(i); + if (entry == logs.end()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Don't have log entry {}", i); + + nuraft::ptr buf = entry->second->serialize(); + size_total += buf->size(); + returned_logs.push_back(buf); + } + + nuraft::ptr buf_out = nuraft::buffer::alloc(sizeof(int32_t) + cnt * sizeof(int32_t) + size_total); + buf_out->pos(0); + buf_out->put(static_cast(cnt)); + + for (auto & entry : returned_logs) + { + nuraft::ptr & bb = entry; + buf_out->put(static_cast(bb->size())); + buf_out->put(*bb); + } + return buf_out; +} + +void Changelog::applyEntriesFromBuffer(size_t index, nuraft::buffer & buffer) +{ + buffer.pos(0); + int num_logs = buffer.get_int(); + + for (int i = 0; i < num_logs; ++i) + { + size_t cur_idx = index + i; + int buf_size = buffer.get_int(); + + nuraft::ptr buf_local = nuraft::buffer::alloc(buf_size); + buffer.get(buf_local); + + LogEntryPtr log_entry = nuraft::log_entry::deserialize(*buf_local); + if (i == 0 && logs.count(cur_idx)) + writeAt(cur_idx, log_entry); + else + appendEntry(cur_idx, log_entry); + } +} + +void Changelog::flush() +{ + current_writer->flush(); +} + +Changelog::~Changelog() = default; + } diff --git a/src/Coordination/Changelog.h b/src/Coordination/Changelog.h index ffcd2a353bb..c58f35cb4a1 100644 --- a/src/Coordination/Changelog.h +++ b/src/Coordination/Changelog.h @@ -12,6 +12,13 @@ namespace DB using Checksum = CityHash_v1_0_2::uint128; +using LogEntryPtr = nuraft::ptr; +using LogEntries = std::vector; +using LogEntriesPtr = nuraft::ptr; + +using IndexToOffset = std::unordered_map; +using IndexToLogEntry = std::map; + enum class ChangelogVersion : uint8_t { V0 = 0, @@ -20,7 +27,7 @@ enum class ChangelogVersion : uint8_t std::string toString(const ChangelogVersion & version); ChangelogVersion fromString(const std::string & version_str); -static constexpr auto CURRENT_CHANGELOG_VERSION = ChangeLogVersion::V0; +static constexpr auto CURRENT_CHANGELOG_VERSION = ChangelogVersion::V0; struct ChangelogRecordHeader { @@ -38,33 +45,48 @@ struct ChangelogRecord nuraft::ptr blob; }; -using IndexToOffset = std::unordered_map; -using IndexToLogEntry = std::map>; -struct Changelog -{ -public: -private: - IndexToLogEntry logs; - size_t start_idx = 0; -}; class ChangelogWriter; -class ChangelogOnDiskHelper +class Changelog { public: - 
ChangelogOnDiskHelper(const std::string & changelogs_dir_, size_t rotate_interval_); + Changelog(const std::string & changelogs_dir_, size_t rotate_interval_); - Changelog readChangelogAndInitWriter(size_t from_log_idx); + void readChangelogAndInitWriter(size_t from_log_idx); - void appendRecord(size_t index, nuraft::ptr log_entry); + void appendEntry(size_t index, LogEntryPtr log_entry); - void writeAt(size_t index, nuraft::ptr log_entry); + void writeAt(size_t index, LogEntryPtr log_entry); void compact(size_t up_to_log_idx); + size_t getNextEntryIndex() const + { + return start_index + logs.size() - 1; + } + + size_t getStartIndex() const + { + return start_index; + } + + LogEntryPtr getLastEntry() const; + + LogEntriesPtr getLogEntriesBetween(size_t start_index, size_t end_idx); + + LogEntryPtr entryAt(size_t idx); + + nuraft::ptr serializeEntriesToBuffer(size_t index, Int32 cnt); + + void applyEntriesFromBuffer(size_t index, nuraft::buffer & buffer); + + void flush(); + + ~Changelog(); + private: void rotate(size_t new_start_log_idex); @@ -76,6 +98,8 @@ private: std::unique_ptr current_writer; IndexToOffset index_to_start_pos; const size_t rotate_interval; + IndexToLogEntry logs; + size_t start_index = 0; }; } diff --git a/src/Coordination/NuKeeperLogStore.h b/src/Coordination/NuKeeperLogStore.h index 2d066ac3e3a..981dc3f24e7 100644 --- a/src/Coordination/NuKeeperLogStore.h +++ b/src/Coordination/NuKeeperLogStore.h @@ -13,12 +13,35 @@ class NuKeeperLogStore : public nuraft::log_store public: NuKeeperLogStore(const std::string & changelogs_path, size_t rotate_interval_); + void init(size_t from_log_idx); + + size_t start_index() const override; + + size_t next_slot() const override; + + nuraft::ptr last_entry() const override; + + size_t append(nuraft::ptr & entry) override; + + void write_at(size_t index, nuraft::ptr & entry) override; + + nuraft::ptr>> log_entries(size_t start, size_t end) override; + + nuraft::ptr entry_at(size_t index) override; + + size_t term_at(size_t index) override; + + nuraft::ptr pack(size_t index, int32_t cnt) override; + + void apply_pack(size_t index, nuraft::buffer & pack) override; + + bool compact(size_t last_log_index) override; + + bool flush() override; private: - mutable std::mutex logs_lock; - std::atomic start_idx; - Changelog in_memory_changelog; - ChangelogOnDiskHelper on_disk_changelog_helper; + mutable std::mutex changelog_lock; + Changelog changelog; }; } diff --git a/src/Coordination/tests/gtest_for_build.cpp b/src/Coordination/tests/gtest_for_build.cpp index ed9777350c5..6142ee0b5c0 100644 --- a/src/Coordination/tests/gtest_for_build.cpp +++ b/src/Coordination/tests/gtest_for_build.cpp @@ -22,6 +22,8 @@ #include #include // Y_IGNORE #include +#include +#include TEST(CoordinationTest, BuildTest) @@ -134,7 +136,7 @@ struct SimpliestRaftServer using SummingRaftServer = SimpliestRaftServer; -nuraft::ptr getLogEntry(int64_t number) +nuraft::ptr getBuffer(int64_t number) { nuraft::ptr ret = nuraft::buffer::alloc(sizeof(number)); nuraft::buffer_serializer bs(ret); @@ -151,7 +153,7 @@ TEST(CoordinationTest, TestSummingRaft1) /// Single node is leader EXPECT_EQ(s1.raft_instance->get_leader(), 1); - auto entry1 = getLogEntry(143); + auto entry1 = getBuffer(143); auto ret = s1.raft_instance->append_entries({entry1}); EXPECT_TRUE(ret->get_accepted()) << "failed to replicate: entry 1" << ret->get_result_code(); EXPECT_EQ(ret->get_result_code(), nuraft::cmd_result_code::OK) << "failed to replicate: entry 1" << ret->get_result_code(); @@ -209,7 +211,7 @@ 
TEST(CoordinationTest, TestSummingRaft3) EXPECT_EQ(s3.raft_instance->get_leader(), 2); std::cerr << "Starting to add entries\n"; - auto entry = getLogEntry(1); + auto entry = getBuffer(1); auto ret = s2.raft_instance->append_entries({entry}); EXPECT_TRUE(ret->get_accepted()) << "failed to replicate: entry 1" << ret->get_result_code(); EXPECT_EQ(ret->get_result_code(), nuraft::cmd_result_code::OK) << "failed to replicate: entry 1" << ret->get_result_code(); @@ -236,7 +238,7 @@ TEST(CoordinationTest, TestSummingRaft3) EXPECT_EQ(s2.state_machine->getValue(), 1); EXPECT_EQ(s3.state_machine->getValue(), 1); - auto non_leader_entry = getLogEntry(3); + auto non_leader_entry = getBuffer(3); auto ret_non_leader1 = s1.raft_instance->append_entries({non_leader_entry}); EXPECT_FALSE(ret_non_leader1->get_accepted()); @@ -245,7 +247,7 @@ TEST(CoordinationTest, TestSummingRaft3) EXPECT_FALSE(ret_non_leader3->get_accepted()); - auto leader_entry = getLogEntry(77); + auto leader_entry = getBuffer(77); auto ret_leader = s2.raft_instance->append_entries({leader_entry}); EXPECT_TRUE(ret_leader->get_accepted()) << "failed to replicate: entry 78" << ret_leader->get_result_code(); EXPECT_EQ(ret_leader->get_result_code(), nuraft::cmd_result_code::OK) << "failed to replicate: entry 78" << ret_leader->get_result_code(); @@ -333,4 +335,18 @@ TEST(CoordinationTest, TestStorageSerialization) EXPECT_EQ(new_storage.ephemerals[1].size(), 1); } +DB::LogEntryPtr getLogEntry(const std::string & s) +{ + DB::WriteBufferFromNuraftBuffer bufwriter; + writeText(s, bufwriter); + return nuraft::cs_new(0, bufwriter.getBuffer()); +} + +TEST(CoordinationTest, ChangelogTestSimple) +{ + DB::Changelog changelog("./logs", 5); + auto entry = getLogEntry("hello world"); + changelog.appendEntry(1, entry); +} + #endif From ed9f2b5eb99335471c9f0b60bf9633e1d75a5204 Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 15 Feb 2021 21:01:01 +0300 Subject: [PATCH 0295/2357] Linkable code --- src/Coordination/Changelog.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Coordination/Changelog.cpp b/src/Coordination/Changelog.cpp index f06185124da..d3ba176f209 100644 --- a/src/Coordination/Changelog.cpp +++ b/src/Coordination/Changelog.cpp @@ -16,7 +16,7 @@ namespace ErrorCodes extern const int CORRUPTED_DATA; extern const int UNKNOWN_FORMAT_VERSION; extern const int LOGICAL_ERROR; - extern const int UNIMPLEMENTED; + extern const int NOT_IMPLEMENTED; } @@ -287,7 +287,7 @@ void Changelog::appendEntry(size_t index, nuraft::ptr log_ent void Changelog::writeAt(size_t index, nuraft::ptr log_entry) { if (index < current_writer->getStartIndex()) - throw Exception(ErrorCodes::UNIMPLEMENTED, "Currently cannot overwrite index from previous file"); + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Currently cannot overwrite index from previous file"); if (index_to_start_pos.count(index) == 0) throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot write at index {} because changelog doesn't contain it", index); From e34d6b0f37da637e2fa68fc05945c6a3e4e57e5a Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Mon, 15 Feb 2021 21:25:10 +0300 Subject: [PATCH 0296/2357] Update docs/ru/sql-reference/functions/date-time-functions.md Co-authored-by: Anna <42538400+adevyatova@users.noreply.github.com> --- docs/ru/sql-reference/functions/date-time-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/functions/date-time-functions.md b/docs/ru/sql-reference/functions/date-time-functions.md index 
d019c18a688..bb4c49e898e 100644 --- a/docs/ru/sql-reference/functions/date-time-functions.md +++ b/docs/ru/sql-reference/functions/date-time-functions.md @@ -706,7 +706,7 @@ SELECT FROM_UNIXTIME(423543535); └──────────────────────────┘ ``` -В случае, когда есть два аргумента: первый типа [Integer](../../sql-reference/data-types/int-uint.md) или [DateTime](../../sql-reference/data-types/datetime.md), а второй является строкой постоянного формата — функция работает таким же образом, как [formatDateTime](#formatdatetime), и возвращает значение типа [String](../../sql-reference/data-types/string.md#string). +В случае, когда есть два аргумента: первый типа [Integer](../../sql-reference/data-types/int-uint.md) или [DateTime](../../sql-reference/data-types/datetime.md), а второй является строкой постоянного формата — функция работает также, как [formatDateTime](#formatdatetime), и возвращает значение типа [String](../../sql-reference/data-types/string.md#string). Запрос: From e8889463a6351316c1d0ae1cc0b99c8424c767d5 Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Mon, 15 Feb 2021 21:25:31 +0300 Subject: [PATCH 0297/2357] Update docs/ru/operations/utilities/clickhouse-local.md Co-authored-by: Anna <42538400+adevyatova@users.noreply.github.com> --- docs/ru/operations/utilities/clickhouse-local.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/operations/utilities/clickhouse-local.md b/docs/ru/operations/utilities/clickhouse-local.md index 8ecbbfcce8c..15d069c9acf 100644 --- a/docs/ru/operations/utilities/clickhouse-local.md +++ b/docs/ru/operations/utilities/clickhouse-local.md @@ -77,7 +77,7 @@ $ clickhouse-local --query " 1 2 ``` -Объём оперативной памяти, занимаемой пользователями (Unix): +Объём оперативной памяти, занимаемой процессами, которые запустил пользователь (Unix): Запрос: From 6734df2a014fd8b3b587592ecfe21244f06ef0c4 Mon Sep 17 00:00:00 2001 From: lehasm Date: Mon, 15 Feb 2021 21:25:32 +0300 Subject: [PATCH 0298/2357] Unnecessary new lines removed --- docs/en/sql-reference/functions/string-functions.md | 6 ++---- docs/ru/sql-reference/functions/string-functions.md | 3 +-- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/docs/en/sql-reference/functions/string-functions.md b/docs/en/sql-reference/functions/string-functions.md index fa9c84fa9af..03f6237bfe8 100644 --- a/docs/en/sql-reference/functions/string-functions.md +++ b/docs/en/sql-reference/functions/string-functions.md @@ -602,10 +602,8 @@ Hello, "world"! ## decodeXMLComponent {#decode-xml-component} -Replaces XML predefined entities with characters. -Predefined entities are `"` `&` `'` `>` `<` -This function also replaces numeric character references with Unicode characters. -Both decimal (like `✓`) and hexadecimal (`✓`) forms are supported. +Replaces XML predefined entities with characters. Predefined entities are `"` `&` `'` `>` `<` +This function also replaces numeric character references with Unicode characters. Both decimal (like `✓`) and hexadecimal (`✓`) forms are supported. **Syntax** diff --git a/docs/ru/sql-reference/functions/string-functions.md b/docs/ru/sql-reference/functions/string-functions.md index b1c4012e9f9..236583c211a 100644 --- a/docs/ru/sql-reference/functions/string-functions.md +++ b/docs/ru/sql-reference/functions/string-functions.md @@ -601,8 +601,7 @@ Hello, "world"! ## decodeXMLComponent {#decode-xml-component} Заменяет символами предопределенные мнемоники XML: `"` `&` `'` `>` `<` -Также эта функция заменяет числовые ссылки соответствующими символами юникод. 
-Поддерживаются десятичная (например, `✓`) и шестнадцатеричная (`✓`) формы. +Также эта функция заменяет числовые ссылки соответствующими символами юникод. Поддерживаются десятичная (например, `✓`) и шестнадцатеричная (`✓`) формы. **Синтаксис** From ed9d49abc3645ad49435e0effb649346ecf56390 Mon Sep 17 00:00:00 2001 From: vdimir Date: Mon, 15 Feb 2021 15:00:08 +0300 Subject: [PATCH 0299/2357] Update cross to inner rewrite --- src/Interpreters/CrossToInnerJoinVisitor.cpp | 280 +++++++++--------- .../01083_cross_to_inner_with_like.reference | 7 + .../01083_cross_to_inner_with_like.sql | 1 + 3 files changed, 142 insertions(+), 146 deletions(-) diff --git a/src/Interpreters/CrossToInnerJoinVisitor.cpp b/src/Interpreters/CrossToInnerJoinVisitor.cpp index 2c80451aedb..4d06ad31c03 100644 --- a/src/Interpreters/CrossToInnerJoinVisitor.cpp +++ b/src/Interpreters/CrossToInnerJoinVisitor.cpp @@ -81,156 +81,143 @@ private: ASTTableJoin * join = nullptr; }; -bool isComparison(const String & name) -{ - return name == NameEquals::name || - name == NameNotEquals::name || - name == NameLess::name || - name == NameGreater::name || - name == NameLessOrEquals::name || - name == NameGreaterOrEquals::name; -} - -/// It checks if where expression could be moved to JOIN ON expression partially or entirely. -class CheckExpressionVisitorData +/// Collect all identifiers from ast +class IdentifiersCollector { public: - using TypeToVisit = const ASTFunction; - - CheckExpressionVisitorData(const std::vector & tables_, - const std::vector & tables_with_columns, - const Aliases & aliases_) - : joined_tables(tables_) - , tables(tables_with_columns) - , aliases(aliases_) - , is_complex(false) - {} - - void visit(const ASTFunction & node, const ASTPtr & ast) + using ASTIdentPtr = const ASTIdentifier *; + using ASTIdentifiers = std::vector; + struct Data { - if (is_complex) - return; + ASTIdentifiers idents; + }; - if (node.name == NameAnd::name) - { - if (!node.arguments || node.arguments->children.empty()) - throw Exception("Logical error: function requires argument", ErrorCodes::LOGICAL_ERROR); - - for (auto & child : node.arguments->children) - { - if (const auto * func = child->as()) - { - visit(*func, child); - } - else - { - bool is_literal_or_ident = !child->as() && !child->as(); - is_complex = is_complex || !is_literal_or_ident; - } - } - } - else if (node.name == NameEquals::name) - { - if (size_t min_table = canMoveEqualsToJoinOn(node)) - asts_to_join_on[min_table].push_back(ast); - } - else if (isComparison(node.name)) - { - /// leave other comparisons as is - } - else if (functionIsLikeOperator(node.name) || /// LIKE, NOT LIKE, ILIKE, NOT ILIKE - functionIsInOperator(node.name)) /// IN, NOT IN - { - /// Leave as is. It's not possible to make push down here cause of unknown aliases and not implemented JOIN predicates. 
- /// select a as b from t1, t2 where t1.x = t2.x and b in(42) - /// select a as b from t1 inner join t2 on t1.x = t2.x and b in(42) - } - else if (node.name == NameOr::name) - { - - } - else - { - is_complex = true; - asts_to_join_on.clear(); - } + static void visit(const ASTPtr & node, Data & data) + { + if (const auto * ident = node->as()) + data.idents.push_back(ident); } - bool complex() const { return is_complex; } - bool matchAny(size_t t) const { return asts_to_join_on.count(t); } - - ASTPtr makeOnExpression(size_t table_pos) + static bool needChildVisit(const ASTPtr &, const ASTPtr &) { - if (!asts_to_join_on.count(table_pos)) - return {}; - - std::vector & expressions = asts_to_join_on[table_pos]; - - if (expressions.size() == 1) - return expressions[0]->clone(); - - std::vector arguments; - arguments.reserve(expressions.size()); - for (auto & ast : expressions) - arguments.emplace_back(ast->clone()); - - return makeASTFunction(NameAnd::name, std::move(arguments)); + return true; } -private: - const std::vector & joined_tables; - const std::vector & tables; - std::map> asts_to_join_on; - const Aliases & aliases; - bool is_complex; - - size_t canMoveEqualsToJoinOn(const ASTFunction & node) + static ASTIdentifiers collect(const ASTPtr & node) { - if (!node.arguments) - throw Exception("Logical error: function requires arguments", ErrorCodes::LOGICAL_ERROR); - if (node.arguments->children.size() != 2) - return false; - - const auto * left = node.arguments->children[0]->as(); - const auto * right = node.arguments->children[1]->as(); - if (!left || !right) - return false; - - /// Moving expressions that use column aliases is not supported. - if (left->isShort() && aliases.count(left->shortName())) - return false; - if (right->isShort() && aliases.count(right->shortName())) - return false; - - return checkIdentifiers(*left, *right); - } - - /// Check if the identifiers are from different joined tables. If it's a self joint, tables should have aliases. - /// select * from t1 a cross join t2 b where a.x = b.x - /// @return table position to attach expression to or 0. - size_t checkIdentifiers(const ASTIdentifier & left, const ASTIdentifier & right) - { - std::optional left_table_pos = IdentifierSemantic::getMembership(left); - if (!left_table_pos) - left_table_pos = IdentifierSemantic::chooseTableColumnMatch(left, tables); - - std::optional right_table_pos = IdentifierSemantic::getMembership(right); - if (!right_table_pos) - right_table_pos = IdentifierSemantic::chooseTableColumnMatch(right, tables); - - if (left_table_pos && right_table_pos && (*left_table_pos != *right_table_pos)) - { - size_t table_pos = std::max(*left_table_pos, *right_table_pos); - if (joined_tables[table_pos].canAttachOnExpression()) - return table_pos; - } - return 0; + IdentifiersCollector::Data ident_data; + ConstInDepthNodeVisitor ident_visitor(ident_data); + ident_visitor.visit(node); + return ident_data.idents; } }; -using CheckExpressionMatcher = ConstOneTypeMatcher; -using CheckExpressionVisitor = ConstInDepthNodeVisitor; +/// Split expression `expr_1 AND expr_2 AND ... 
AND expr_n` into vector `[expr_1, expr_2, ..., expr_n]` +void collectConjunctions(const ASTPtr & node, std::vector & members) +{ + if (const auto * func = node->as(); func && func->name == NameAnd::name) + { + for (const auto & child : func->arguments->children) + collectConjunctions(child, members); + return; + } + members.push_back(node); +} +std::optional getIdentMembership(const ASTIdentifier & ident, const std::vector & tables) +{ + std::optional table_pos = IdentifierSemantic::getMembership(ident); + if (table_pos) + return table_pos; + return IdentifierSemantic::chooseTableColumnMatch(ident, tables); +} + +std::optional getIdentsMembership(const ASTPtr ast, + const std::vector & tables, + const Aliases & aliases) +{ + auto idents = IdentifiersCollector::collect(ast); + + std::optional result; + for (const auto * ident : idents) + { + /// Moving expressions that use column aliases is not supported. + if (ident->isShort() && aliases.count(ident->shortName())) + return {}; + const auto pos = getIdentMembership(*ident, tables); + if (!pos) + return {}; + if (result && *pos != *result) + return {}; + result = pos; + } + return result; +} + +bool isAllowedToRewriteCrossJoin(const ASTPtr & node, const Aliases & aliases) +{ + if (const auto * func = node->as()) + { + auto idents = IdentifiersCollector::collect(node); + for (const auto * ident : idents) + { + if (ident->isShort() && aliases.count(ident->shortName())) + return false; + } + return true; + } + return node->as() || node->as(); +} + +bool canMoveExpressionToJoinOn(const ASTPtr & ast, + const std::vector & joined_tables, + const std::vector & tables, + const Aliases & aliases, + std::map> & asts_to_join_on) +{ + std::vector conjuncts; + collectConjunctions(ast, conjuncts); + for (const auto & node : conjuncts) + { + if (const auto * func = node->as(); func && func->name == NameEquals::name) + { + if (!func->arguments || func->arguments->children.size() != 2) + return false; + + /// Check if the identifiers are from different joined tables. + /// If it's a self joint, tables should have aliases. 
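+            /// For example, in `SELECT * FROM n, r WHERE n.k = r.k AND r.name = 'A'` the equality n.k = r.k
+            /// refers to both joined tables, so it can be attached to the ON clause of the rewritten INNER JOIN
+            /// (see the 01083_cross_to_inner_with_like test updated below).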
+ auto left_table_pos = getIdentsMembership(func->arguments->children[0], tables, aliases); + auto right_table_pos = getIdentsMembership(func->arguments->children[1], tables, aliases); + + /// Identifiers from different table move to JOIN ON + if (left_table_pos && right_table_pos && *left_table_pos != *right_table_pos) + { + size_t table_pos = std::max(*left_table_pos, *right_table_pos); + if (joined_tables[table_pos].canAttachOnExpression()) + asts_to_join_on[table_pos].push_back(node); + else + return false; + } + } + + if (!isAllowedToRewriteCrossJoin(node, aliases)) + return false; + } + return true; +} + +ASTPtr makeOnExpression(const std::vector & expressions) +{ + if (expressions.size() == 1) + return expressions[0]->clone(); + + std::vector arguments; + arguments.reserve(expressions.size()); + for (const auto & ast : expressions) + arguments.emplace_back(ast->clone()); + + return makeASTFunction(NameAnd::name, std::move(arguments)); +} bool getTables(ASTSelectQuery & select, std::vector & joined_tables, size_t & num_comma) { @@ -342,18 +329,19 @@ void CrossToInnerJoinMatcher::visit(ASTSelectQuery & select, ASTPtr &, Data & da if (!select.where()) return; - CheckExpressionVisitor::Data visitor_data{joined_tables, data.tables_with_columns, data.aliases}; - CheckExpressionVisitor(visitor_data).visit(select.where()); - - if (visitor_data.complex()) - return; - - for (size_t i = 1; i < joined_tables.size(); ++i) + std::map> asts_to_join_on; + bool can_move_where = canMoveExpressionToJoinOn( + select.where(), joined_tables, data.tables_with_columns, data.aliases, asts_to_join_on); + if (can_move_where) { - if (visitor_data.matchAny(i)) + for (size_t i = 1; i < joined_tables.size(); ++i) { - if (joined_tables[i].rewriteCrossToInner(visitor_data.makeOnExpression(i))) - data.done = true; + const auto & expr_it = asts_to_join_on.find(i); + if (expr_it != asts_to_join_on.end()) + { + if (joined_tables[i].rewriteCrossToInner(makeOnExpression(expr_it->second))) + data.done = true; + } } } } diff --git a/tests/queries/0_stateless/01083_cross_to_inner_with_like.reference b/tests/queries/0_stateless/01083_cross_to_inner_with_like.reference index 42bbeb05ecb..bf043b4668a 100644 --- a/tests/queries/0_stateless/01083_cross_to_inner_with_like.reference +++ b/tests/queries/0_stateless/01083_cross_to_inner_with_like.reference @@ -19,3 +19,10 @@ SELECT FROM n ALL INNER JOIN r ON k = r.k WHERE (k = r.k) AND (name NOT LIKE \'A%\') +SELECT + k, + r.k, + name +FROM n +ALL INNER JOIN r ON (k + 1) = (r.k + 1) +WHERE ((k + 1) = (r.k + 1)) AND ((name = \'A\') OR (name = \'AA\')) diff --git a/tests/queries/0_stateless/01083_cross_to_inner_with_like.sql b/tests/queries/0_stateless/01083_cross_to_inner_with_like.sql index 644190cbddf..c6544553816 100644 --- a/tests/queries/0_stateless/01083_cross_to_inner_with_like.sql +++ b/tests/queries/0_stateless/01083_cross_to_inner_with_like.sql @@ -9,6 +9,7 @@ SET enable_optimize_predicate_expression = 0; EXPLAIN SYNTAX SELECT * FROM n, r WHERE n.k = r.k AND r.name = 'A'; EXPLAIN SYNTAX SELECT * FROM n, r WHERE n.k = r.k AND r.name LIKE 'A%'; EXPLAIN SYNTAX SELECT * FROM n, r WHERE n.k = r.k AND r.name NOT LIKE 'A%'; +EXPLAIN SYNTAX SELECT * FROM n, r WHERE n.k + 1 = r.k + 1 AND (r.name = 'A' OR r.name = 'AA'); DROP TABLE n; DROP TABLE r; From 9afa6b5b1b30ddf74d107f07b90f51dba510e4b3 Mon Sep 17 00:00:00 2001 From: vdimir Date: Mon, 15 Feb 2021 22:40:32 +0300 Subject: [PATCH 0300/2357] Add option cross_to_inner_join_rewrite --- src/Core/Settings.h | 1 + 
src/Interpreters/CrossToInnerJoinVisitor.cpp | 24 ++++++++++---------- src/Interpreters/CrossToInnerJoinVisitor.h | 1 + src/Interpreters/InterpreterSelectQuery.cpp | 5 ++-- 4 files changed, 17 insertions(+), 14 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 9bb9ad30f15..2ddd1e003ca 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -503,6 +503,7 @@ class IColumn; M(Bool, output_format_write_statistics, true, "Write statistics about read rows, bytes, time elapsed in suitable output formats.", 0) \ M(Bool, output_format_pretty_row_numbers, false, "Add row numbers before each row for pretty output format", 0) \ M(Bool, insert_distributed_one_random_shard, false, "If setting is enabled, inserting into distributed table will choose a random shard to write when there is no sharding key", 0) \ + M(Bool, cross_to_inner_join_rewrite, true, "Use inner join instead of comma/cross join if possible", 0) \ // End of FORMAT_FACTORY_SETTINGS diff --git a/src/Interpreters/CrossToInnerJoinVisitor.cpp b/src/Interpreters/CrossToInnerJoinVisitor.cpp index 4d06ad31c03..fc2747af8eb 100644 --- a/src/Interpreters/CrossToInnerJoinVisitor.cpp +++ b/src/Interpreters/CrossToInnerJoinVisitor.cpp @@ -326,21 +326,21 @@ void CrossToInnerJoinMatcher::visit(ASTSelectQuery & select, ASTPtr &, Data & da /// CROSS to INNER - if (!select.where()) - return; - - std::map> asts_to_join_on; - bool can_move_where = canMoveExpressionToJoinOn( - select.where(), joined_tables, data.tables_with_columns, data.aliases, asts_to_join_on); - if (can_move_where) + if (select.where() && data.cross_to_inner_join_rewrite) { - for (size_t i = 1; i < joined_tables.size(); ++i) + std::map> asts_to_join_on; + bool can_move_where + = canMoveExpressionToJoinOn(select.where(), joined_tables, data.tables_with_columns, data.aliases, asts_to_join_on); + if (can_move_where) { - const auto & expr_it = asts_to_join_on.find(i); - if (expr_it != asts_to_join_on.end()) + for (size_t i = 1; i < joined_tables.size(); ++i) { - if (joined_tables[i].rewriteCrossToInner(makeOnExpression(expr_it->second))) - data.done = true; + const auto & expr_it = asts_to_join_on.find(i); + if (expr_it != asts_to_join_on.end()) + { + if (joined_tables[i].rewriteCrossToInner(makeOnExpression(expr_it->second))) + data.done = true; + } } } } diff --git a/src/Interpreters/CrossToInnerJoinVisitor.h b/src/Interpreters/CrossToInnerJoinVisitor.h index 7cd5c93b1e3..885cf8162c1 100644 --- a/src/Interpreters/CrossToInnerJoinVisitor.h +++ b/src/Interpreters/CrossToInnerJoinVisitor.h @@ -19,6 +19,7 @@ public: const Aliases & aliases; const String current_database; bool done = false; + bool cross_to_inner_join_rewrite = true; }; static bool needChildVisit(ASTPtr &, const ASTPtr &); diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index 84de6fa4e6c..37d54e01a71 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -197,7 +197,7 @@ static Context getSubqueryContext(const Context & context) return subquery_context; } -static void rewriteMultipleJoins(ASTPtr & query, const TablesWithColumns & tables, const String & database) +static void rewriteMultipleJoins(ASTPtr & query, const TablesWithColumns & tables, const String & database, const Settings & settings) { ASTSelectQuery & select = query->as(); @@ -207,6 +207,7 @@ static void rewriteMultipleJoins(ASTPtr & query, const TablesWithColumns & table 
QueryAliasesNoSubqueriesVisitor(aliases).visit(select.select()); CrossToInnerJoinVisitor::Data cross_to_inner{tables, aliases, database}; + cross_to_inner.cross_to_inner_join_rewrite = settings.cross_to_inner_join_rewrite; CrossToInnerJoinVisitor(cross_to_inner).visit(query); JoinToSubqueryTransformVisitor::Data join_to_subs_data{tables, aliases}; @@ -324,7 +325,7 @@ InterpreterSelectQuery::InterpreterSelectQuery( /// Rewrite JOINs if (!has_input && joined_tables.tablesCount() > 1) { - rewriteMultipleJoins(query_ptr, joined_tables.tablesWithColumns(), context->getCurrentDatabase()); + rewriteMultipleJoins(query_ptr, joined_tables.tablesWithColumns(), context->getCurrentDatabase(), context->getSettingsRef()); joined_tables.reset(getSelectQuery()); joined_tables.resolveTables(); From ae73600fb0b1b673973199e9213db2b535572458 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Mon, 15 Feb 2021 22:48:06 +0300 Subject: [PATCH 0301/2357] Refactor row level security actions. --- src/Interpreters/ActionsDAG.cpp | 4 +- src/Interpreters/ActionsDAG.h | 4 +- src/Interpreters/ExpressionAnalyzer.cpp | 6 + src/Interpreters/InterpreterSelectQuery.cpp | 155 +++++++++++------- src/Interpreters/InterpreterSelectQuery.h | 2 +- .../getHeaderForProcessingStage.cpp | 12 +- .../MergeTreeBaseSelectProcessor.cpp | 44 +++-- .../MergeTree/MergeTreeBlockReadUtils.cpp | 26 +-- .../MergeTree/MergeTreeRangeReader.cpp | 41 ++--- src/Storages/SelectQueryInfo.h | 8 +- src/Storages/StorageBuffer.cpp | 34 ++-- 11 files changed, 161 insertions(+), 175 deletions(-) diff --git a/src/Interpreters/ActionsDAG.cpp b/src/Interpreters/ActionsDAG.cpp index 12942371d4f..bd092bc0296 100644 --- a/src/Interpreters/ActionsDAG.cpp +++ b/src/Interpreters/ActionsDAG.cpp @@ -80,14 +80,14 @@ ActionsDAG::Node & ActionsDAG::getNode(const std::string & name) return **it; } -const ActionsDAG::Node & ActionsDAG::addInput(std::string name, DataTypePtr type, bool can_replace) +const ActionsDAG::Node & ActionsDAG::addInput(std::string name, DataTypePtr type, bool can_replace, bool add_to_index) { Node node; node.type = ActionType::INPUT; node.result_type = std::move(type); node.result_name = std::move(name); - return addNode(std::move(node), can_replace); + return addNode(std::move(node), can_replace, add_to_index); } const ActionsDAG::Node & ActionsDAG::addInput(ColumnWithTypeAndName column, bool can_replace) diff --git a/src/Interpreters/ActionsDAG.h b/src/Interpreters/ActionsDAG.h index 3c8778e239a..d3f1d65d454 100644 --- a/src/Interpreters/ActionsDAG.h +++ b/src/Interpreters/ActionsDAG.h @@ -196,7 +196,7 @@ public: std::string dumpNames() const; std::string dumpDAG() const; - const Node & addInput(std::string name, DataTypePtr type, bool can_replace = false); + const Node & addInput(std::string name, DataTypePtr type, bool can_replace = false, bool add_to_index = true); const Node & addInput(ColumnWithTypeAndName column, bool can_replace = false); const Node & addColumn(ColumnWithTypeAndName column, bool can_replace = false, bool materialize = false); const Node & addAlias(const std::string & name, std::string alias, bool can_replace = false); @@ -220,7 +220,7 @@ public: /// Return true if column was removed from inputs. 
bool removeUnusedResult(const std::string & column_name); - void projectInput() { settings.project_input = true; } + void projectInput(bool project = true) { settings.project_input = project; } void removeUnusedActions(const Names & required_names); bool hasArrayJoin() const; diff --git a/src/Interpreters/ExpressionAnalyzer.cpp b/src/Interpreters/ExpressionAnalyzer.cpp index d5a6876f8ef..3145df23b95 100644 --- a/src/Interpreters/ExpressionAnalyzer.cpp +++ b/src/Interpreters/ExpressionAnalyzer.cpp @@ -855,6 +855,10 @@ ActionsDAGPtr SelectQueryExpressionAnalyzer::appendPrewhere( if (!select_query->prewhere()) return prewhere_actions; + Names first_action_names; + if (!chain.steps.empty()) + first_action_names = chain.steps.front()->getRequiredColumns().getNames(); + auto & step = chain.lastStep(sourceColumns()); getRootActions(select_query->prewhere(), only_types, step.actions()); String prewhere_column_name = select_query->prewhere()->getColumnName(); @@ -879,6 +883,7 @@ ActionsDAGPtr SelectQueryExpressionAnalyzer::appendPrewhere( auto tmp_actions = std::make_shared(tmp_actions_dag); auto required_columns = tmp_actions->getRequiredColumns(); NameSet required_source_columns(required_columns.begin(), required_columns.end()); + required_source_columns.insert(first_action_names.begin(), first_action_names.end()); /// Add required columns to required output in order not to remove them after prewhere execution. /// TODO: add sampling and final execution to common chain. @@ -1579,6 +1584,7 @@ void ExpressionAnalysisResult::finalize(const ExpressionActionsChain & chain, si { const ExpressionActionsChain::Step & step = *chain.steps.at(next_step_i++); prewhere_info->remove_prewhere_column = step.can_remove_required_output.at(0); + prewhere_info->prewhere_actions->projectInput(false); NameSet columns_to_remove; for (size_t i = 1; i < step.required_output.size(); ++i) diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index d13c28e8ff2..2c960b6983a 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -107,6 +107,10 @@ namespace ErrorCodes String InterpreterSelectQuery::generateFilterActions( ActionsDAGPtr & actions, const ASTPtr & row_policy_filter, const Names & prerequisite_columns) const { + std::cerr << "----- InterpreterSelectQuery::generateFilterActions\n"; + for (const auto & name : prerequisite_columns) + std::cerr << name << std::endl; + const auto & db_name = table_id.getDatabaseName(); const auto & table_name = table_id.getTableName(); @@ -141,6 +145,7 @@ String InterpreterSelectQuery::generateFilterActions( auto syntax_result = TreeRewriter(*context).analyzeSelect(query_ast, TreeRewriterResult({}, storage, metadata_snapshot)); SelectQueryExpressionAnalyzer analyzer(query_ast, syntax_result, *context, metadata_snapshot); actions = analyzer.simpleSelectActions(); + //std::cerr << actions-> return expr_list->children.at(0)->getColumnName(); } @@ -524,6 +529,10 @@ void InterpreterSelectQuery::buildQueryPlan(QueryPlan & query_plan) { executeImpl(query_plan, input, std::move(input_pipe)); + WriteBufferFromOwnString buf; + query_plan.explainPlan(buf, {.header = true, .actions = true}); + std::cerr << buf.str(); + /// We must guarantee that result structure is the same as in getSampleBlock() if (!blocksHaveEqualStructure(query_plan.getCurrentDataStream().header, result_header)) { @@ -811,20 +820,54 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu bool 
intermediate_stage = false; bool to_aggregation_stage = false; bool from_aggregation_stage = false; - const bool execute_row_level_filter_in_prewhere = ( - ( - settings.optimize_move_to_prewhere || // ...when it is allowed to move things to prewhere, so we do it for row-level filter actions too. - expressions.prewhere_info // ...or when we already have prewhere and must execute row-level filter before it. - ) && - !input && !input_pipe && storage && storage->supportsPrewhere() // Check that prewhere can be used at all. - ); + + if (expressions.filter_info) + { + if (!expressions.prewhere_info) + { + const bool does_storage_support_prewhere = !input && !input_pipe && storage && storage->supportsPrewhere(); + if (does_storage_support_prewhere && settings.optimize_move_to_prewhere) + { + /// Execute row level filter in prewhere as a part of "move to prewhere" optimization. + expressions.prewhere_info = std::make_shared( + std::move(expressions.filter_info->actions), + std::move(expressions.filter_info->column_name)); + expressions.prewhere_info->remove_prewhere_column = expressions.filter_info->do_remove_column; + expressions.prewhere_info->need_filter = true; + expressions.filter_info = nullptr; + } + } + else + { + /// Add row level security actions to prewhere. + std::cerr << expressions.filter_info->actions->dumpDAG() << std::endl; + expressions.prewhere_info->row_level_filter_actions = std::move(expressions.filter_info->actions); + expressions.prewhere_info->row_level_column_name = std::move(expressions.filter_info->column_name); + expressions.prewhere_info->row_level_filter_actions->projectInput(false); + if (expressions.filter_info->do_remove_column) + { + /// Instead of removing column, add it to prewhere_actions input (but not in index). + /// It will be removed at prewhere_actions execution. 
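+                /// Here addInput(name, type, /* can_replace = */ true, /* add_to_index = */ false) relies on the
+                /// add_to_index flag added to ActionsDAG::addInput above: the row-level column stays available as
+                /// an input of prewhere_actions without being exposed in its index, so it is dropped once
+                /// prewhere_actions are executed.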
+ const auto & index = expressions.prewhere_info->row_level_filter_actions->getIndex(); + auto it = index.find(expressions.prewhere_info->row_level_column_name); + if (it == index.end()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Not found column {} in row level security filter {}", + expressions.prewhere_info->row_level_column_name, expressions.prewhere_info->row_level_filter_actions->dumpDAG()); + const auto & node = *it; + + expressions.prewhere_info->prewhere_actions->addInput(node->result_name, node->result_type, true, false); + } + + expressions.filter_info = nullptr; + } + } if (options.only_analyze) { auto read_nothing = std::make_unique(source_header); query_plan.addStep(std::move(read_nothing)); - if (expressions.filter_info && execute_row_level_filter_in_prewhere) + if (expressions.filter_info) { auto row_level_security_step = std::make_unique( query_plan.getCurrentDataStream(), @@ -832,12 +875,24 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu expressions.filter_info->column_name, expressions.filter_info->do_remove_column); - row_level_security_step->setStepDescription("Row-level security filter (PREWHERE)"); + row_level_security_step->setStepDescription("Row-level security filter"); query_plan.addStep(std::move(row_level_security_step)); } if (expressions.prewhere_info) { + if (expressions.prewhere_info->row_level_filter_actions) + { + auto row_level_filter_step = std::make_unique( + query_plan.getCurrentDataStream(), + expressions.prewhere_info->row_level_filter_actions, + expressions.prewhere_info->row_level_column_name, + false); + + row_level_filter_step->setStepDescription("Row-level security filter (PREWHERE)"); + query_plan.addStep(std::move(row_level_filter_step)); + } + auto prewhere_step = std::make_unique( query_plan.getCurrentDataStream(), expressions.prewhere_info->prewhere_actions, @@ -887,7 +942,7 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu to_aggregation_stage = true; /// Read the data from Storage. from_stage - to what stage the request was completed in Storage. 
- executeFetchColumns(from_stage, query_plan, execute_row_level_filter_in_prewhere); + executeFetchColumns(from_stage, query_plan); LOG_TRACE(log, "{} -> {}", QueryProcessingStage::toString(from_stage), QueryProcessingStage::toString(options.to_stage)); } @@ -952,7 +1007,7 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu if (expressions.first_stage) { - if (expressions.filter_info && !execute_row_level_filter_in_prewhere) + if (expressions.filter_info) { auto row_level_security_step = std::make_unique( query_plan.getCurrentDataStream(), @@ -1211,30 +1266,6 @@ void InterpreterSelectQuery::addEmptySourceToQueryPlan(QueryPlan & query_plan, c { auto & prewhere_info = *query_info.prewhere_info; - if (prewhere_info.filter_info) - { - auto & filter_info = *prewhere_info.filter_info; - - if (filter_info.alias_actions) - { - pipe.addSimpleTransform([&](const Block & header) - { - return std::make_shared( - header, - filter_info.alias_actions); - }); - } - - pipe.addSimpleTransform([&](const Block & header) - { - return std::make_shared( - header, - filter_info.actions, - filter_info.column_name, - filter_info.do_remove_column); - }); - } - if (prewhere_info.alias_actions) { pipe.addSimpleTransform([&](const Block & header) @@ -1245,6 +1276,18 @@ void InterpreterSelectQuery::addEmptySourceToQueryPlan(QueryPlan & query_plan, c }); } + if (prewhere_info.row_level_filter) + { + pipe.addSimpleTransform([&](const Block & header) + { + return std::make_shared( + header, + prewhere_info.row_level_filter, + prewhere_info.row_level_column_name, + false); + }); + } + pipe.addSimpleTransform([&](const Block & header) { return std::make_shared( @@ -1274,7 +1317,7 @@ void InterpreterSelectQuery::addEmptySourceToQueryPlan(QueryPlan & query_plan, c query_plan.addStep(std::move(read_from_pipe)); } -void InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum processing_stage, QueryPlan & query_plan, bool execute_row_level_filter_in_prewhere) +void InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum processing_stage, QueryPlan & query_plan) { auto & query = getSelectQuery(); const Settings & settings = context->getSettingsRef(); @@ -1351,13 +1394,15 @@ void InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum proc if (storage) { /// Append columns from the table filter to required - auto row_policy_filter = context->getRowPolicyCondition(table_id.getDatabaseName(), table_id.getTableName(), RowPolicy::SELECT_FILTER); - if (row_policy_filter) + ActionsDAG * row_policy_filter = nullptr; + if (expressions.filter_info) + row_policy_filter = expressions.filter_info->actions.get(); + // else if (expressions.prewhere_info && expressions.prewhere_info->row_level_filter_actions) + // row_policy_filter = expressions.prewhere_info->row_level_filter_actions.get(); + + if (expressions.filter_info) { - auto initial_required_columns = required_columns; - ActionsDAGPtr actions_dag; - generateFilterActions(actions_dag, row_policy_filter, initial_required_columns); - auto required_columns_from_filter = actions_dag->getRequiredColumns(); + auto required_columns_from_filter = expressions.filter_info->actions->getRequiredColumns(); for (const auto & column : required_columns_from_filter) { @@ -1394,7 +1439,10 @@ void InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum proc if (prewhere_info) { /// Get some columns directly from PREWHERE expression actions - auto prewhere_required_columns = 
prewhere_info->prewhere_actions->getRequiredColumns().getNames(); + auto prewhere_required_columns = ( + prewhere_info->row_level_filter_actions ? + prewhere_info->row_level_filter_actions : + prewhere_info->prewhere_actions)->getRequiredColumns().getNames(); required_columns_from_prewhere.insert(prewhere_required_columns.begin(), prewhere_required_columns.end()); } @@ -1605,31 +1653,18 @@ void InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum proc { query_info.prewhere_info = std::make_shared(); - if (expressions.filter_info && execute_row_level_filter_in_prewhere) - { - query_info.prewhere_info->filter_info = std::make_shared(); - - if (alias_actions) - query_info.prewhere_info->filter_info->alias_actions = std::make_shared(std::move(alias_actions)); - - if (expressions.filter_info->actions) - query_info.prewhere_info->filter_info->actions = std::make_shared(expressions.filter_info->actions); - - query_info.prewhere_info->filter_info->column_name = expressions.filter_info->column_name; - query_info.prewhere_info->filter_info->do_remove_column = expressions.filter_info->do_remove_column; - } + query_info.prewhere_info->prewhere_actions = std::make_shared(prewhere_info->prewhere_actions); + if (prewhere_info->row_level_filter_actions) + query_info.prewhere_info->row_level_filter = std::make_shared(prewhere_info->row_level_filter_actions); if (prewhere_info->alias_actions) query_info.prewhere_info->alias_actions = std::make_shared(prewhere_info->alias_actions); - - if (prewhere_info->prewhere_actions) - query_info.prewhere_info->prewhere_actions = std::make_shared(prewhere_info->prewhere_actions); - if (prewhere_info->remove_columns_actions) query_info.prewhere_info->remove_columns_actions = std::make_shared(prewhere_info->remove_columns_actions); query_info.prewhere_info->prewhere_column_name = prewhere_info->prewhere_column_name; query_info.prewhere_info->remove_prewhere_column = prewhere_info->remove_prewhere_column; + query_info.prewhere_info->row_level_column_name = prewhere_info->row_level_column_name; query_info.prewhere_info->need_filter = prewhere_info->need_filter; } diff --git a/src/Interpreters/InterpreterSelectQuery.h b/src/Interpreters/InterpreterSelectQuery.h index 793df612103..20cffdf5702 100644 --- a/src/Interpreters/InterpreterSelectQuery.h +++ b/src/Interpreters/InterpreterSelectQuery.h @@ -108,7 +108,7 @@ private: /// Different stages of query execution. 
- void executeFetchColumns(QueryProcessingStage::Enum processing_stage, QueryPlan & query_plan, bool execute_row_level_filter_in_prewhere); + void executeFetchColumns(QueryProcessingStage::Enum processing_stage, QueryPlan & query_plan); void executeWhere(QueryPlan & query_plan, const ActionsDAGPtr & expression, bool remove_filter); void executeAggregation(QueryPlan & query_plan, const ActionsDAGPtr & expression, bool overflow_row, bool final, InputOrderInfoPtr group_by_info); void executeMergeAggregated(QueryPlan & query_plan, bool overflow_row, bool final); diff --git a/src/Interpreters/getHeaderForProcessingStage.cpp b/src/Interpreters/getHeaderForProcessingStage.cpp index 2aef3c25c3c..3adbab8413f 100644 --- a/src/Interpreters/getHeaderForProcessingStage.cpp +++ b/src/Interpreters/getHeaderForProcessingStage.cpp @@ -46,16 +46,8 @@ Block getHeaderForProcessingStage( { auto & prewhere_info = *query_info.prewhere_info; - if (prewhere_info.filter_info) - { - auto & filter_info = *prewhere_info.filter_info; - - if (filter_info.actions) - filter_info.actions->execute(header); - - if (filter_info.do_remove_column) - header.erase(filter_info.column_name); - } + if (prewhere_info.row_level_filter) + prewhere_info.row_level_filter->execute(header); if (prewhere_info.prewhere_actions) prewhere_info.prewhere_actions->execute(header); diff --git a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp index 96993e4a106..5a46ed29e3d 100644 --- a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp @@ -334,38 +334,30 @@ void MergeTreeBaseSelectProcessor::executePrewhereActions(Block & block, const P { if (prewhere_info) { - if (prewhere_info->filter_info) - { - auto & filter_info = *prewhere_info->filter_info; - - if (filter_info.alias_actions) - filter_info.alias_actions->execute(block); - - if (filter_info.actions) - filter_info.actions->execute(block); - - auto & filter_column = block.getByName(filter_info.column_name); - if (!filter_column.type->canBeUsedInBooleanContext()) - { - throw Exception("Invalid type for row-level security filter: " + filter_column.type->getName(), - ErrorCodes::LOGICAL_ERROR); - } - - if (filter_info.do_remove_column) - block.erase(filter_info.column_name); - else - { - auto & ctn = block.getByName(filter_info.column_name); - ctn.column = ctn.type->createColumnConst(block.rows(), 1u)->convertToFullColumnIfConst(); - } - } + std::cerr << "0: " << block.dumpStructure() << std::endl; if (prewhere_info->alias_actions) prewhere_info->alias_actions->execute(block); + std::cerr << "1: " << block.dumpStructure() << std::endl; + + if (prewhere_info->row_level_filter) + { + prewhere_info->row_level_filter->execute(block); + auto & row_level_column = block.getByName(prewhere_info->row_level_column_name); + if (!row_level_column.type->canBeUsedInBooleanContext()) + { + throw Exception("Invalid type for filter in PREWHERE: " + row_level_column.type->getName(), + ErrorCodes::LOGICAL_ERROR); + } + } + std::cerr << "2: " << block.dumpStructure() << std::endl; + if (prewhere_info->prewhere_actions) prewhere_info->prewhere_actions->execute(block); + std::cerr << "3: " << block.dumpStructure() << std::endl; + auto & prewhere_column = block.getByName(prewhere_info->prewhere_column_name); if (!prewhere_column.type->canBeUsedInBooleanContext()) { @@ -380,6 +372,8 @@ void MergeTreeBaseSelectProcessor::executePrewhereActions(Block & block, const P auto & ctn = 
block.getByName(prewhere_info->prewhere_column_name); ctn.column = ctn.type->createColumnConst(block.rows(), 1u)->convertToFullColumnIfConst(); } + + std::cerr << "4: " << block.dumpStructure() << std::endl; } } diff --git a/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp b/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp index f4a5b1fcb9e..ed5fc48dad1 100644 --- a/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp +++ b/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp @@ -269,30 +269,12 @@ MergeTreeReadTaskColumns getReadTaskColumns( if (prewhere_info) { - if (prewhere_info->filter_info) - { - if (prewhere_info->filter_info->alias_actions) - { - const auto required_column_names = prewhere_info->filter_info->alias_actions->getRequiredColumns(); - pre_column_names.insert(pre_column_names.end(), required_column_names.begin(), required_column_names.end()); - } - else if (prewhere_info->filter_info->actions) - { - const auto required_column_names = prewhere_info->filter_info->actions->getRequiredColumns(); - pre_column_names.insert(pre_column_names.end(), required_column_names.begin(), required_column_names.end()); - } - } - if (prewhere_info->alias_actions) - { - const auto required_column_names = prewhere_info->alias_actions->getRequiredColumns(); - pre_column_names.insert(pre_column_names.end(), required_column_names.begin(), required_column_names.end()); - } + pre_column_names = prewhere_info->alias_actions->getRequiredColumns(); + else if (prewhere_info->row_level_filter) + pre_column_names = prewhere_info->row_level_filter->getRequiredColumns(); else if (prewhere_info->prewhere_actions) - { - const auto required_column_names = prewhere_info->prewhere_actions->getRequiredColumns(); - pre_column_names.insert(pre_column_names.end(), required_column_names.begin(), required_column_names.end()); - } + pre_column_names = prewhere_info->prewhere_actions->getRequiredColumns(); if (pre_column_names.empty()) pre_column_names.push_back(column_names[0]); diff --git a/src/Storages/MergeTree/MergeTreeRangeReader.cpp b/src/Storages/MergeTree/MergeTreeRangeReader.cpp index 3c79ed73a16..b4b8e4309b5 100644 --- a/src/Storages/MergeTree/MergeTreeRangeReader.cpp +++ b/src/Storages/MergeTree/MergeTreeRangeReader.cpp @@ -561,18 +561,12 @@ MergeTreeRangeReader::MergeTreeRangeReader( if (prewhere_info) { - if (prewhere_info->filter_info) - { - if (prewhere_info->filter_info->actions) - prewhere_info->filter_info->actions->execute(sample_block, true); - - if (prewhere_info->filter_info->do_remove_column) - sample_block.erase(prewhere_info->filter_info->column_name); - } - if (prewhere_info->alias_actions) prewhere_info->alias_actions->execute(sample_block, true); + if (prewhere_info->row_level_filter) + prewhere_info->row_level_filter->execute(sample_block, true); + if (prewhere_info->prewhere_actions) prewhere_info->prewhere_actions->execute(sample_block, true); @@ -897,31 +891,20 @@ void MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(ReadResult & r for (auto name_and_type = header.begin(); pos < num_columns; ++pos, ++name_and_type) block.insert({result.columns[pos], name_and_type->type, name_and_type->name}); - if (prewhere_info->filter_info) - { - if (prewhere_info->filter_info->alias_actions) - prewhere_info->filter_info->alias_actions->execute(block); - - if (prewhere_info->filter_info->actions) - prewhere_info->filter_info->actions->execute(block); - - const auto filter_column_pos = block.getPositionByName(prewhere_info->filter_info->column_name); - 
result.addFilter(block.getByPosition(filter_column_pos).column); - - if (prewhere_info->filter_info->do_remove_column) - block.erase(prewhere_info->filter_info->column_name); - else - block.getByPosition(filter_column_pos).column = block.getByPosition(filter_column_pos).type->createColumnConst(result.num_rows, 1); - } - if (prewhere_info->alias_actions) prewhere_info->alias_actions->execute(block); /// Columns might be projected out. We need to store them here so that default columns can be evaluated later. result.block_before_prewhere = block; - if (prewhere_info->prewhere_actions) - prewhere_info->prewhere_actions->execute(block); + if (prewhere_info->row_level_filter) + { + prewhere_info->row_level_filter->execute(block); + const auto filter_column_pos = block.getPositionByName(prewhere_info->row_level_column_name); + result.addFilter(block.getByPosition(filter_column_pos).column); + } + + prewhere_info->prewhere_actions->execute(block); prewhere_column_pos = block.getPositionByName(prewhere_info->prewhere_column_name); result.addFilter(block.getByPosition(prewhere_column_pos).column); @@ -943,7 +926,7 @@ void MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(ReadResult & r if (result.totalRowsPerGranule() == 0) result.setFilterConstFalse(); /// If we need to filter in PREWHERE - else if (prewhere_info->need_filter || result.need_filter || prewhere_info->remove_prewhere_column) + else if (prewhere_info->need_filter || result.need_filter || prewhere_info->row_level_filter) { /// If there is a filter and without optimized if (result.getFilter() && last_reader_in_chain) diff --git a/src/Storages/SelectQueryInfo.h b/src/Storages/SelectQueryInfo.h index a87ff2f40d3..fea9a7bad68 100644 --- a/src/Storages/SelectQueryInfo.h +++ b/src/Storages/SelectQueryInfo.h @@ -41,14 +41,16 @@ using ClusterPtr = std::shared_ptr; struct PrewhereInfo { - /// Information about the preliminary filter expression, if any. - FilterInfoPtr filter_info; /// Actions which are executed in order to alias columns are used for prewhere actions. ExpressionActionsPtr alias_actions; + /// Actions for row level security filter. Applied separately before prewhere_actions. + /// This actions are separate because prewhere condition should not be executed over filtered rows. + ExpressionActionsPtr row_level_filter; /// Actions which are executed on block in order to get filter column for prewhere step. ExpressionActionsPtr prewhere_actions; /// Actions which are executed after reading from storage in order to remove unused columns. 
ExpressionActionsPtr remove_columns_actions; + String row_level_column_name; String prewhere_column_name; bool remove_prewhere_column = false; bool need_filter = false; @@ -58,8 +60,10 @@ struct PrewhereInfo struct PrewhereDAGInfo { ActionsDAGPtr alias_actions; + ActionsDAGPtr row_level_filter_actions; ActionsDAGPtr prewhere_actions; ActionsDAGPtr remove_columns_actions; + String row_level_column_name; String prewhere_column_name; bool remove_prewhere_column = false; bool need_filter = false; diff --git a/src/Storages/StorageBuffer.cpp b/src/Storages/StorageBuffer.cpp index 64bcdd2d145..15dec77caf3 100644 --- a/src/Storages/StorageBuffer.cpp +++ b/src/Storages/StorageBuffer.cpp @@ -321,28 +321,6 @@ void StorageBuffer::read( { if (query_info.prewhere_info) { - if (query_info.prewhere_info->filter_info) - { - if (query_info.prewhere_info->filter_info->alias_actions) - { - pipe_from_buffers.addSimpleTransform([&](const Block & header) - { - return std::make_shared( - header, - query_info.prewhere_info->filter_info->alias_actions); - }); - } - - pipe_from_buffers.addSimpleTransform([&](const Block & header) - { - return std::make_shared( - header, - query_info.prewhere_info->filter_info->actions, - query_info.prewhere_info->filter_info->column_name, - query_info.prewhere_info->filter_info->do_remove_column); - }); - } - if (query_info.prewhere_info->alias_actions) { pipe_from_buffers.addSimpleTransform([&](const Block & header) @@ -353,6 +331,18 @@ void StorageBuffer::read( }); } + if (query_info.prewhere_info->row_level_filter) + { + pipe_from_buffers.addSimpleTransform([&](const Block & header) + { + return std::make_shared( + header, + query_info.prewhere_info->row_level_filter, + query_info.prewhere_info->row_level_column_name, + false); + }); + } + pipe_from_buffers.addSimpleTransform([&](const Block & header) { return std::make_shared( From 85277d6a417b633431ac18f2bdbdb7ca7da57568 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Mon, 15 Feb 2021 22:54:47 +0300 Subject: [PATCH 0302/2357] Comment debug output. 
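The executePrewhereActions rework shown in the hunks above fixes the step order: alias actions first, then the row-level security filter (whose result column must be usable in a boolean context), then the PREWHERE condition. A minimal sketch of that control flow, with every type stubbed out (BlockStub, PrewhereStepsStub and the callables are illustrative assumptions, not the real Block/ExpressionActions API):

``` cpp
#include <functional>
#include <stdexcept>
#include <string>

// Minimal stand-ins: the "block" is opaque and each step is a plain callable;
// only the ordering and the boolean-type checks matter here.
struct BlockStub {};

struct PrewhereStepsStub
{
    std::function<void(BlockStub &)> alias_actions;       // may be empty
    std::function<void(BlockStub &)> row_level_filter;    // may be empty
    std::function<void(BlockStub &)> prewhere_actions;    // may be empty
    std::function<bool(const BlockStub &, const std::string &)> column_is_boolean_like;
    std::string row_level_column_name;
    std::string prewhere_column_name;
};

void executePrewhereSketch(BlockStub & block, const PrewhereStepsStub & steps)
{
    auto check_boolean = [&](const std::string & name)
    {
        if (steps.column_is_boolean_like && !steps.column_is_boolean_like(block, name))
            throw std::runtime_error("Invalid type for filter in PREWHERE: " + name);
    };

    if (steps.alias_actions)
        steps.alias_actions(block);             // 1. materialize aliased columns

    if (steps.row_level_filter)
    {
        steps.row_level_filter(block);          // 2. row-level security filter
        check_boolean(steps.row_level_column_name);
    }

    if (steps.prewhere_actions)
        steps.prewhere_actions(block);          // 3. the PREWHERE condition

    check_boolean(steps.prewhere_column_name);
}
```

Keeping the row-level filter as a separate step is what guarantees the PREWHERE condition never runs over rows the security filter would have removed.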
--- .../MergeTree/MergeTreeBaseSelectProcessor.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp index 5a46ed29e3d..90da45cc6d8 100644 --- a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp @@ -334,12 +334,12 @@ void MergeTreeBaseSelectProcessor::executePrewhereActions(Block & block, const P { if (prewhere_info) { - std::cerr << "0: " << block.dumpStructure() << std::endl; + // std::cerr << "0: " << block.dumpStructure() << std::endl; if (prewhere_info->alias_actions) prewhere_info->alias_actions->execute(block); - std::cerr << "1: " << block.dumpStructure() << std::endl; + // std::cerr << "1: " << block.dumpStructure() << std::endl; if (prewhere_info->row_level_filter) { @@ -351,12 +351,12 @@ void MergeTreeBaseSelectProcessor::executePrewhereActions(Block & block, const P ErrorCodes::LOGICAL_ERROR); } } - std::cerr << "2: " << block.dumpStructure() << std::endl; + // std::cerr << "2: " << block.dumpStructure() << std::endl; if (prewhere_info->prewhere_actions) prewhere_info->prewhere_actions->execute(block); - std::cerr << "3: " << block.dumpStructure() << std::endl; + // std::cerr << "3: " << block.dumpStructure() << std::endl; auto & prewhere_column = block.getByName(prewhere_info->prewhere_column_name); if (!prewhere_column.type->canBeUsedInBooleanContext()) @@ -373,7 +373,7 @@ void MergeTreeBaseSelectProcessor::executePrewhereActions(Block & block, const P ctn.column = ctn.type->createColumnConst(block.rows(), 1u)->convertToFullColumnIfConst(); } - std::cerr << "4: " << block.dumpStructure() << std::endl; + // std::cerr << "4: " << block.dumpStructure() << std::endl; } } From cf57c3b4a2b1741a8f12ee41ddb29659e06876de Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Mon, 15 Feb 2021 23:00:59 +0300 Subject: [PATCH 0303/2357] update comments --- src/Common/ZooKeeper/ZooKeeper.cpp | 8 ------ src/Databases/DatabaseFactory.cpp | 12 ++++++--- src/Databases/DatabaseReplicated.cpp | 35 +++++++++++++++++------- src/Databases/DatabaseReplicated.h | 40 ++++++++-------------------- tests/queries/skip_list.json | 1 + 5 files changed, 46 insertions(+), 50 deletions(-) diff --git a/src/Common/ZooKeeper/ZooKeeper.cpp b/src/Common/ZooKeeper/ZooKeeper.cpp index dc6abca6892..a1c6eb9b481 100644 --- a/src/Common/ZooKeeper/ZooKeeper.cpp +++ b/src/Common/ZooKeeper/ZooKeeper.cpp @@ -551,14 +551,6 @@ Coordination::Error ZooKeeper::trySet(const std::string & path, const std::strin Coordination::Error ZooKeeper::multiImpl(const Coordination::Requests & requests, Coordination::Responses & responses) { - String desc; - for (const auto & r : requests) - { - auto & r_ref = *r; - desc += String(typeid(r_ref).name()) + "\t" + r->getPath() + "\n"; - } - LOG_TRACE(&Poco::Logger::get("ZKTX"), "zk multi {}", desc); - if (requests.empty()) return Coordination::Error::ZOK; diff --git a/src/Databases/DatabaseFactory.cpp b/src/Databases/DatabaseFactory.cpp index cbe1b8bb02a..ca2b9bb083e 100644 --- a/src/Databases/DatabaseFactory.cpp +++ b/src/Databases/DatabaseFactory.cpp @@ -14,6 +14,7 @@ #include #include #include +#include #if !defined(ARCADIA_BUILD) # include "config_core.h" @@ -196,10 +197,13 @@ DatabasePtr DatabaseFactory::getImpl(const ASTCreateQuery & create, const String const auto & arguments = engine->arguments->children; - //TODO allow macros in arguments - const auto & zookeeper_path = 
safeGetLiteralValue(arguments[0], "Replicated"); - const auto & shard_name = safeGetLiteralValue(arguments[1], "Replicated"); - const auto & replica_name = safeGetLiteralValue(arguments[2], "Replicated"); + String zookeeper_path = safeGetLiteralValue(arguments[0], "Replicated"); + String shard_name = safeGetLiteralValue(arguments[1], "Replicated"); + String replica_name = safeGetLiteralValue(arguments[2], "Replicated"); + + zookeeper_path = context.getMacros()->expand(zookeeper_path); + shard_name = context.getMacros()->expand(shard_name); + replica_name = context.getMacros()->expand(replica_name); return std::make_shared(database_name, metadata_path, uuid, zookeeper_path, shard_name, replica_name, context); } diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index dc1203e8cc9..441880ae616 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -208,10 +208,13 @@ void DatabaseReplicated::tryConnectToZooKeeper(bool force_attach) is_readonly = false; } - catch(...) + catch (...) { if (!force_attach) throw; + + /// It's server startup, ignore error. + /// Worker thread will try to setup ZooKeeper connection tryLogCurrentException(log); } } @@ -234,10 +237,11 @@ bool DatabaseReplicated::createDatabaseNodesInZooKeeper(const zkutil::ZooKeeperP Coordination::Responses responses; auto res = current_zookeeper->tryMulti(ops, responses); if (res == Coordination::Error::ZOK) - return true; + return true; /// Created new database (it's the first replica) if (res == Coordination::Error::ZNODEEXISTS) - return false; + return false; /// Database exists, we will add new replica + /// Other codes are unexpected, will throw zkutil::KeeperMultiException::check(res, ops, responses); assert(false); __builtin_unreachable(); @@ -285,6 +289,7 @@ BlockIO DatabaseReplicated::propose(const ASTPtr & query, const Context & query_ if (query_context.getClientInfo().query_kind != ClientInfo::QueryKind::INITIAL_QUERY) throw Exception(ErrorCodes::INCORRECT_QUERY, "It's not initial query. ON CLUSTER is not allowed for Replicated database."); + /// Replicas will set correct name of current database in query context (database name can be different on replicas) if (auto * ddl_query = query->as()) ddl_query->database.clear(); @@ -337,6 +342,11 @@ static UUID getTableUUIDIfReplicated(const String & metadata, const Context & co void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeeper, UInt32 our_log_ptr, UInt32 max_log_ptr) { + /// Let's compare local (possibly outdated) metadata with (most actual) metadata stored in ZooKeeper + /// and try to update the set of local tables. + /// We could drop all local tables and create the new ones just like it's new replica. + /// But it will cause all ReplicatedMergeTree tables to fetch all data parts again and data in other tables will be lost. + bool new_replica = our_log_ptr == 0; if (new_replica) LOG_INFO(log, "Will create new replica from log pointer {}", max_log_ptr); @@ -350,7 +360,7 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep /// For ReplicatedMergeTree tables we can compare only UUIDs to ensure that it's the same table. /// Metadata can be different, it's handled on table replication level. - /// We need to handle only renamed tables. + /// We need to handle renamed tables only. /// TODO maybe we should also update MergeTree SETTINGS if required? 
std::unordered_map zk_replicated_id_to_name; for (const auto & zk_table : table_name_to_metadata) @@ -360,6 +370,7 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep zk_replicated_id_to_name.emplace(zk_replicated_id, zk_table.first); } + /// We will drop or move tables which exist only in local metadata Strings tables_to_detach; std::vector> replicated_tables_to_rename; size_t total_tables = 0; @@ -370,12 +381,16 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep UUID local_replicated_id = UUIDHelpers::Nil; if (existing_tables_it->table()->supportsReplication()) { + /// Check if replicated tables have the same UUID local_replicated_id = existing_tables_it->table()->getStorageID().uuid; auto it = zk_replicated_id_to_name.find(local_replicated_id); if (it != zk_replicated_id_to_name.end()) { if (name != it->second) + { + /// Need just update table name replicated_tables_to_rename.emplace_back(name, it->second); + } continue; } } @@ -383,7 +398,8 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep auto in_zk = table_name_to_metadata.find(name); if (in_zk == table_name_to_metadata.end() || in_zk->second != readMetadataFile(name)) { - tables_to_detach.emplace_back(std::move(name)); + /// Local table does not exits in ZooKeeper or has different metadata + tables_to_detach.emplace_back(std::move(name)); } } @@ -407,16 +423,14 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep std::vector dropped_tables; for (const auto & table_name : tables_to_detach) { - String to_name = fmt::format("{}_{}_{}", table_name, max_log_ptr, thread_local_rng() % 1000); - assert(db_name < to_db_name); DDLGuardPtr table_guard = DatabaseCatalog::instance().getDDLGuard(db_name, table_name); - DDLGuardPtr to_table_guard = DatabaseCatalog::instance().getDDLGuard(to_db_name, to_name); if (getDatabaseName() != db_name) throw Exception(ErrorCodes::UNKNOWN_DATABASE, "Database was renamed, will retry"); auto table = tryGetTable(table_name, global_context); if (isDictionaryExist(table_name)) { + /// We can safely drop any dictionaries because they do not store data LOG_DEBUG(log, "Will DROP DICTIONARY {}", backQuoteIfNeed(table_name)); DatabaseAtomic::removeDictionary(global_context, table_name); ++dropped_dicts; @@ -430,7 +444,11 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep } else { + /// Table probably stores some data. Let's move it to another database. 
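The recovery loop above boils down to a per-table decision: keep, rename (same replicated UUID but a different name in ZooKeeper), or detach/move (no matching metadata in ZooKeeper). A condensed sketch of that decision, assuming plain maps in place of the real ZooKeeper lookups (LocalTableStub, RecoveryPlanStub and planRecovery are made-up names for illustration):

``` cpp
#include <map>
#include <optional>
#include <string>
#include <utility>
#include <vector>

// Simplified model: each local table has a name and, for replicated tables,
// a UUID; ZooKeeper provides name -> metadata and replicated UUID -> name maps.
struct LocalTableStub
{
    std::string name;
    std::optional<std::string> replicated_uuid;
    std::string metadata;
};

struct RecoveryPlanStub
{
    std::vector<std::pair<std::string, std::string>> renames;   // from -> to
    std::vector<std::string> to_detach;                         // drop or move away
};

RecoveryPlanStub planRecovery(
    const std::vector<LocalTableStub> & local_tables,
    const std::map<std::string, std::string> & zk_name_to_metadata,
    const std::map<std::string, std::string> & zk_uuid_to_name)
{
    RecoveryPlanStub plan;
    for (const auto & table : local_tables)
    {
        if (table.replicated_uuid)
        {
            // Same UUID means it is the same replicated table; only the name
            // may need to be updated.
            auto it = zk_uuid_to_name.find(*table.replicated_uuid);
            if (it != zk_uuid_to_name.end())
            {
                if (it->second != table.name)
                    plan.renames.emplace_back(table.name, it->second);
                continue;
            }
        }

        auto in_zk = zk_name_to_metadata.find(table.name);
        if (in_zk == zk_name_to_metadata.end() || in_zk->second != table.metadata)
            plan.to_detach.push_back(table.name);   // missing or diverged metadata
    }
    return plan;
}
```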
+ String to_name = fmt::format("{}_{}_{}", table_name, max_log_ptr, thread_local_rng() % 1000); LOG_DEBUG(log, "Will RENAME TABLE {} TO {}.{}", backQuoteIfNeed(table_name), backQuoteIfNeed(to_db_name), backQuoteIfNeed(to_name)); + assert(db_name < to_db_name); + DDLGuardPtr to_table_guard = DatabaseCatalog::instance().getDDLGuard(to_db_name, to_name); auto to_db_ptr = DatabaseCatalog::instance().getDatabase(to_db_name); DatabaseAtomic::renameTable(global_context, table_name, *to_db_ptr, to_name, false, false); ++moved_tables; @@ -454,7 +472,6 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep DatabaseAtomic::renameTable(global_context, from, *this, to, false, false); } - for (const auto & id : dropped_tables) DatabaseCatalog::instance().waitTableFinallyDropped(id); diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index 2ae97b0d82a..83efb24a49d 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -18,28 +18,6 @@ using ZooKeeperPtr = std::shared_ptr; class Cluster; using ClusterPtr = std::shared_ptr; -/** DatabaseReplicated engine - * supports replication of metadata - * via DDL log being written to ZooKeeper - * and executed on all of the replicas - * for a given database. - * - * One Clickhouse server can have multiple - * replicated databases running and updating - * at the same time. - * - * The engine has two parameters ZooKeeper path and - * replica name. - * The same ZooKeeper path corresponds to the same - * database. Replica names MUST be different for all replicas - * of the same database. - * - * Using this engine, creation of Replicated tables - * requires no ZooKeeper path and replica name parameters. - * Table's replica name is the same as database replica name. - * Table's ZooKeeper path is a concatenation of database - * ZooKeeper path, /tables/, and UUID of the table. - */ class DatabaseReplicated : public DatabaseAtomic { public: @@ -49,6 +27,9 @@ public: ~DatabaseReplicated() override; + String getEngineName() const override { return "Replicated"; } + + /// If current query is initial, then the following methods add metadata updating ZooKeeper operations to current MetadataTransaction. void dropTable(const Context &, const String & table_name, bool no_delay) override; void renameTable(const Context & context, const String & table_name, IDatabase & to_database, const String & to_table_name, bool exchange, bool dictionary) override; @@ -64,22 +45,23 @@ public: void removeDictionary(const Context & context, const String & dictionary_name) override; void detachTablePermanently(const Context & context, const String & table_name) override; - void drop(const Context & /*context*/) override; - - String getEngineName() const override { return "Replicated"; } - + /// Try to execute DLL query on current host as initial query. If query is succeed, + /// then it will be executed on all replicas. 
BlockIO propose(const ASTPtr & query, const Context & query_context); void stopReplication(); - void shutdown() override; - - void loadStoredObjects(Context & context, bool has_force_restore_data_flag, bool force_attach) override; String getFullReplicaName() const; static std::pair parseFullReplicaName(const String & name); + /// Returns cluster consisting of database replicas ClusterPtr getCluster() const; + void drop(const Context & /*context*/) override; + + void loadStoredObjects(Context & context, bool has_force_restore_data_flag, bool force_attach) override; + void shutdown() override; + friend struct DatabaseReplicatedTask; friend class DatabaseReplicatedDDLWorker; private: diff --git a/tests/queries/skip_list.json b/tests/queries/skip_list.json index db7b0631b97..f28e2dd7226 100644 --- a/tests/queries/skip_list.json +++ b/tests/queries/skip_list.json @@ -296,6 +296,7 @@ "01015_attach_part", "01015_database_bad_tables", "01017_uniqCombined_memory_usage", + "01018_ddl_dictionaries_concurrent_requrests", /// Cannot parse ATTACH DICTIONARY IF NOT EXISTS "01019_alter_materialized_view_atomic", "01019_alter_materialized_view_consistent", "01019_alter_materialized_view_query", From e7bbb6cb23446791cabdd1ab315d29107e857324 Mon Sep 17 00:00:00 2001 From: sevirov <72220289+sevirov@users.noreply.github.com> Date: Mon, 15 Feb 2021 23:09:06 +0300 Subject: [PATCH 0304/2357] Update docs/en/sql-reference/functions/type-conversion-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/en/sql-reference/functions/type-conversion-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index f752bb9f6cb..189cf74049c 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -701,7 +701,7 @@ parseDateTimeBestEffortUSOrNull(time_string [, time_zone]); **Parameters** -- `time_string` — String containing a date and time to convert. [String](../../sql-reference/data-types/string.md). +- `time_string` — String containing a date or date with time to convert. The date must be in the US date format (`MM/DD/YYYY`). [String](../../sql-reference/data-types/string.md). - `time_zone` — Time zone. The function parses `time_string` according to the time zone. [String](../../sql-reference/data-types/string.md). **Supported non-standard formats** From 5eda6169902306fb4e9f07e28327aff9531b3052 Mon Sep 17 00:00:00 2001 From: sevirov <72220289+sevirov@users.noreply.github.com> Date: Mon, 15 Feb 2021 23:14:01 +0300 Subject: [PATCH 0305/2357] Update docs/en/sql-reference/functions/type-conversion-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/en/sql-reference/functions/type-conversion-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 189cf74049c..06ac64646ae 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -702,7 +702,7 @@ parseDateTimeBestEffortUSOrNull(time_string [, time_zone]); **Parameters** - `time_string` — String containing a date or date with time to convert. The date must be in the US date format (`MM/DD/YYYY`). [String](../../sql-reference/data-types/string.md). 
-- `time_zone` — Time zone. The function parses `time_string` according to the time zone. [String](../../sql-reference/data-types/string.md). +- `time_zone` — [Timezone](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone). The function parses `time_string` according to the timezone. Optional. [String](../../sql-reference/data-types/string.md). **Supported non-standard formats** From a09c9be48b6ba4d42029459486639b3c6b504429 Mon Sep 17 00:00:00 2001 From: sevirov <72220289+sevirov@users.noreply.github.com> Date: Mon, 15 Feb 2021 23:30:39 +0300 Subject: [PATCH 0306/2357] Update docs/en/sql-reference/functions/type-conversion-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- .../en/sql-reference/functions/type-conversion-functions.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 06ac64646ae..24ac8d91d22 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -707,10 +707,10 @@ parseDateTimeBestEffortUSOrNull(time_string [, time_zone]); **Supported non-standard formats** - A string containing 9..10 digit [unix timestamp](https://en.wikipedia.org/wiki/Unix_time). -- A string with a date and a time component: `YYYYMMDDhhmmss`, `MM/DD/YYYY hh:mm:ss`, `MM-DD-YY hh:mm`, `YYYY-MM-DD hh:mm:ss`, etc. +- A string with a date and a time components: `YYYYMMDDhhmmss`, `MM/DD/YYYY hh:mm:ss`, `MM-DD-YY hh:mm`, `YYYY-MM-DD hh:mm:ss`, etc. - A string with a date, but no time component: `YYYY`, `YYYYMM`, `YYYY*MM`, `MM/DD/YYYY`, `MM-DD-YY` etc. -- A string with a day and time: `DD`, `DD hh`, `DD hh:mm`. In this case, `YYYY-MM` are substituted as `2000-01`. -- A string that includes the date and time along with time zone offset information: `YYYY-MM-DD hh:mm:ss ±h:mm`, etc. For example, `2020-12-12 17:36:00 -5:00`. +- A string with a day and time: `DD`, `DD hh`, `DD hh:mm`. In this case, `YYYY-MM` are substituted with `2000-01`. +- A string that includes date and time along with timezone offset information: `YYYY-MM-DD hh:mm:ss ±h:mm`, etc. For example, `2020-12-12 17:36:00 -5:00`. **Returned values** From f6cbad65e82267b6c6e9bc0fcc672f0802085384 Mon Sep 17 00:00:00 2001 From: sevirov <72220289+sevirov@users.noreply.github.com> Date: Mon, 15 Feb 2021 23:33:35 +0300 Subject: [PATCH 0307/2357] Update docs/en/sql-reference/functions/type-conversion-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- .../en/sql-reference/functions/type-conversion-functions.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 24ac8d91d22..6cc0fe52442 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -714,8 +714,10 @@ parseDateTimeBestEffortUSOrNull(time_string [, time_zone]); **Returned values** -- `time_string` converted to the `DateTime` data type. -- `NULL`. +Possible values: + +- `time_string` converted to the [DateTime](../../sql-reference/data-types/datetime.md) data type. +- `NULL` if the input string cannot be converted to the `DateTime` data type. 
**Examples** From c9a6b21fc8c20f08c4abbe62398d635deb5de3d4 Mon Sep 17 00:00:00 2001 From: Dmitriy Date: Mon, 15 Feb 2021 23:47:12 +0300 Subject: [PATCH 0308/2357] Fix the English version MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Поправил английскую версию согласно комментариям в PR. --- .../functions/type-conversion-functions.md | 52 ++++++++----------- 1 file changed, 23 insertions(+), 29 deletions(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 6cc0fe52442..08e83771af7 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -691,12 +691,12 @@ Same as for [parseDateTimeBestEffort](#parsedatetimebesteffort) except that it r ## parseDateTimeBestEffortUSOrNull {#parsedatetimebesteffortusornull} -Same as for [parseDateTimeBestEffortUS](#parsedatetimebesteffortUS) except that it returns `NULL` when it encounters a date format that cannot be processed. +Same as [parseDateTimeBestEffortUS](#parsedatetimebesteffortUS) function except that it returns `NULL` when it encounters a date format that cannot be processed. **Syntax** ``` sql -parseDateTimeBestEffortUSOrNull(time_string [, time_zone]); +parseDateTimeBestEffortUSOrNull(time_string[, time_zone]) ``` **Parameters** @@ -716,16 +716,15 @@ parseDateTimeBestEffortUSOrNull(time_string [, time_zone]); Possible values: -- `time_string` converted to the [DateTime](../../sql-reference/data-types/datetime.md) data type. -- `NULL` if the input string cannot be converted to the `DateTime` data type. +- `time_string` converted to the [DateTime](../../sql-reference/data-types/datetime.md) data type. +- `NULL` if the input string cannot be converted to the `DateTime` data type. **Examples** Query: ``` sql -SELECT parseDateTimeBestEffortUSOrNull('02/10/2021 21:12:57') -AS parseDateTimeBestEffortUSOrNull; +SELECT parseDateTimeBestEffortUSOrNull('02/10/2021 21:12:57') AS parseDateTimeBestEffortUSOrNull; ``` Result: @@ -739,8 +738,7 @@ Result: Query: ``` sql -SELECT parseDateTimeBestEffortUSOrNull('02-10-2021 21:12:57') -AS parseDateTimeBestEffortUSOrNull; +SELECT parseDateTimeBestEffortUSOrNull('02-10-2021 21:12:57') AS parseDateTimeBestEffortUSOrNull; ``` Result: @@ -754,8 +752,7 @@ Result: Query: ``` sql -SELECT parseDateTimeBestEffortUSOrNull('02.10.2021 21:12:57') -AS parseDateTimeBestEffortUSOrNull; +SELECT parseDateTimeBestEffortUSOrNull('02.10.2021 21:12:57') AS parseDateTimeBestEffortUSOrNull; ``` Result: @@ -769,8 +766,7 @@ Result: Query: ``` sql -SELECT parseDateTimeBestEffortUSOrNull('02.2021 21:12:57') -AS parseDateTimeBestEffortUSOrNull; +SELECT parseDateTimeBestEffortUSOrNull('02.10.2021') AS parseDateTimeBestEffortUSOrNull; ``` Result: @@ -783,30 +779,32 @@ Result: ## parseDateTimeBestEffortUSOrZero {#parsedatetimebesteffortusorzero} -Same as for [parseDateTimeBestEffortUS](#parsedatetimebesteffortUS) except that it returns zero date or zero date time when it encounters a date format that cannot be processed. +Same as [parseDateTimeBestEffortUS](#parsedatetimebesteffortUS) function except that it returns zero date or zero date with time when it encounters a date format that cannot be processed. **Syntax** ``` sql -parseDateTimeBestEffortUSOrZero(time_string [, time_zone]); +parseDateTimeBestEffortUSOrZero(time_string[, time_zone]) ``` **Parameters** -- `time_string` — String containing a date and time to convert. 
[String](../../sql-reference/data-types/string.md). -- `time_zone` — Time zone. The function parses `time_string` according to the time zone. [String](../../sql-reference/data-types/string.md). +- `time_string` — String containing a date or date with time to convert. The date must be in the US date format (`MM/DD/YYYY`). [String](../../sql-reference/data-types/string.md). +- `time_zone` — [Timezone](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone). The function parses `time_string` according to the timezone. Optional. [String](../../sql-reference/data-types/string.md). **Supported non-standard formats** - A string containing 9..10 digit [unix timestamp](https://en.wikipedia.org/wiki/Unix_time). -- A string with a date and a time component: `YYYYMMDDhhmmss`, `MM/DD/YYYY hh:mm:ss`, `MM-DD-YY hh:mm`, `YYYY-MM-DD hh:mm:ss`, etc. +- A string with a date and a time components: `YYYYMMDDhhmmss`, `MM/DD/YYYY hh:mm:ss`, `MM-DD-YY hh:mm`, `YYYY-MM-DD hh:mm:ss`, etc. - A string with a date, but no time component: `YYYY`, `YYYYMM`, `YYYY*MM`, `MM/DD/YYYY`, `MM-DD-YY` etc. -- A string with a day and time: `DD`, `DD hh`, `DD hh:mm`. In this case, `YYYY-MM` are substituted as `2000-01`. -- A string that includes the date and time along with time zone offset information: `YYYY-MM-DD hh:mm:ss ±h:mm`, etc. For example, `2020-12-12 17:36:00 -5:00`. +- A string with a day and time: `DD`, `DD hh`, `DD hh:mm`. In this case, `YYYY-MM` are substituted with `2000-01`. +- A string that includes date and time along with timezone offset information: `YYYY-MM-DD hh:mm:ss ±h:mm`, etc. For example, `2020-12-12 17:36:00 -5:00`. -**Returned value** +**Returned values** -- `time_string` converted to the `DateTime` data type. +Possible values: + +- `time_string` converted to the [DateTime](../../sql-reference/data-types/datetime.md) data type. - `zero date time`. 
**Examples** @@ -814,8 +812,7 @@ parseDateTimeBestEffortUSOrZero(time_string [, time_zone]); Query: ``` sql -SELECT parseDateTimeBestEffortUSOrZero('02/10/2021 21:12:57') -AS parseDateTimeBestEffortUSOrZero; +SELECT parseDateTimeBestEffortUSOrZero('02/10/2021 21:12:57') AS parseDateTimeBestEffortUSOrZero; ``` Result: @@ -829,8 +826,7 @@ Result: Query: ``` sql -SELECT parseDateTimeBestEffortUSOrZero('02-10-2021 21:12:57') -AS parseDateTimeBestEffortUSOrZero; +SELECT parseDateTimeBestEffortUSOrZero('02-10-2021 21:12:57') AS parseDateTimeBestEffortUSOrZero; ``` Result: @@ -844,8 +840,7 @@ Result: Query: ``` sql -SELECT parseDateTimeBestEffortUSOrZero('02.10.2021 21:12:57') -AS parseDateTimeBestEffortUS; +SELECT parseDateTimeBestEffortUSOrZero('02.10.2021 21:12:57') AS parseDateTimeBestEffortUS; ``` Result: @@ -859,8 +854,7 @@ Result: Query: ``` sql -SELECT parseDateTimeBestEffortUSOrZero('02.2021 21:12:57') -AS parseDateTimeBestEffortUSOrZero; +SELECT parseDateTimeBestEffortUSOrZero('02.2021 21:12:57') AS parseDateTimeBestEffortUSOrZero; ``` Result: From 937a3192eb6d5fad2ccdb4294f91f5c6d7af53b8 Mon Sep 17 00:00:00 2001 From: Pavel Kruglov Date: Mon, 15 Feb 2021 22:54:23 +0300 Subject: [PATCH 0309/2357] Fix data race --- src/Common/Epoll.cpp | 24 ++++++++++++++++++------ src/Common/Epoll.h | 9 ++++----- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/src/Common/Epoll.cpp b/src/Common/Epoll.cpp index 628bb45e796..d085315b1a0 100644 --- a/src/Common/Epoll.cpp +++ b/src/Common/Epoll.cpp @@ -21,9 +21,21 @@ Epoll::Epoll() : events_count(0) throwFromErrno("Cannot open epoll descriptor", DB::ErrorCodes::EPOLL_ERROR); } -Epoll::Epoll(Epoll && other) : epoll_fd(other.epoll_fd), events_count(other.events_count) +Epoll::Epoll(Epoll && other) { + epoll_fd = other.epoll_fd; other.epoll_fd = -1; + int count = other.events_count; + events_count = count; +} + +Epoll & Epoll::operator=(Epoll && other) +{ + epoll_fd = other.epoll_fd; + other.epoll_fd = -1; + int count = other.events_count; + events_count = count; + return *this; } void Epoll::add(int fd, void * ptr) @@ -35,18 +47,18 @@ void Epoll::add(int fd, void * ptr) else event.data.fd = fd; + ++events_count; + if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, fd, &event) == -1) throwFromErrno("Cannot add new descriptor to epoll", DB::ErrorCodes::EPOLL_ERROR); - - ++events_count; } void Epoll::remove(int fd) { + --events_count; + if (epoll_ctl(epoll_fd, EPOLL_CTL_DEL, fd, nullptr) == -1) throwFromErrno("Cannot remove descriptor from epoll", DB::ErrorCodes::EPOLL_ERROR); - - --events_count; } size_t Epoll::getManyReady(int max_events, epoll_event * events_out, bool blocking, AsyncCallback async_callback) const @@ -54,7 +66,7 @@ size_t Epoll::getManyReady(int max_events, epoll_event * events_out, bool blocki if (events_count == 0) throw Exception("There is no events in epoll", ErrorCodes::LOGICAL_ERROR); - int ready_size = 0; + int ready_size; int timeout = blocking && !async_callback ? -1 : 0; do { diff --git a/src/Common/Epoll.h b/src/Common/Epoll.h index eb168c22a92..a7090bdb9b6 100644 --- a/src/Common/Epoll.h +++ b/src/Common/Epoll.h @@ -16,13 +16,12 @@ class Epoll public: Epoll(); - Epoll(const Epoll & other) = delete; - Epoll & operator=(const Epoll & other) = delete; + Epoll(const Epoll &) = delete; + Epoll & operator=(const Epoll &) = delete; + Epoll & operator=(Epoll && other); Epoll(Epoll && other); - Epoll & operator=(Epoll && other) = default; - /// Add new file descriptor to epoll. 
If ptr set to nullptr, epoll_event.data.fd = fd, /// otherwise epoll_event.data.ptr = ptr. void add(int fd, void * ptr = nullptr); @@ -47,7 +46,7 @@ public: private: int epoll_fd; - int events_count; + std::atomic events_count; }; } From 6eeef74d4389d97fcd614d3ae0b49025c6ac1a91 Mon Sep 17 00:00:00 2001 From: George Date: Tue, 16 Feb 2021 00:32:39 +0300 Subject: [PATCH 0310/2357] first draft --- docs/en/sql-reference/statements/detach.md | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/docs/en/sql-reference/statements/detach.md b/docs/en/sql-reference/statements/detach.md index 62a7c0cc1e0..f3f8b053724 100644 --- a/docs/en/sql-reference/statements/detach.md +++ b/docs/en/sql-reference/statements/detach.md @@ -5,7 +5,9 @@ toc_title: DETACH # DETACH Statement {#detach} -Deletes information about the ‘name’ table from the server. The server stops knowing about the table’s existence. +Deletes information about the `name` table from the server. The server stops knowing about the table’s existence. + +Syntax: ``` sql DETACH TABLE [IF EXISTS] [db.]name [ON CLUSTER cluster] @@ -13,4 +15,20 @@ DETACH TABLE [IF EXISTS] [db.]name [ON CLUSTER cluster] This does not delete the table’s data or metadata. On the next server launch, the server will read the metadata and find out about the table again. -Similarly, a “detached” table can be re-attached using the `ATTACH` query (with the exception of system tables, which do not have metadata stored for them). +Similarly, a “detached” table can be re-attached using the [ATTACH](../../sql-reference/statements/attach) query (with the exception of system tables, which do not have metadata stored for them). + +## DETACH PERMAMENTLY {#detach-permamently} + +Deletes information about `name` table or view from the server. Permamently detached tables won't automatically reappear after the server restart. + +Syntax: + +``` sql +DETACH TABLE/VIEW [IF EXISTS] [db.]name PERMAMENTLY [ON CLUSTER cluster] +``` + +This statement does not delete the table’s data or metadata. + +Permamently detached table or view can be reattached with [ATTACH](../../sql-reference/statements/attach) query and can be shown with [SHOW CREATE TABLE](../../sql-reference/statements/show.md#show-create-table) query. + +[Original article](https://clickhouse.tech/docs/en/sql-reference/statements/detach/) From 21f80a9367760528b12c0639d3c4faacf7c100e0 Mon Sep 17 00:00:00 2001 From: Dmitriy Date: Tue, 16 Feb 2021 00:42:16 +0300 Subject: [PATCH 0311/2357] Add examples MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Добавил примеры. 
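Regarding the Epoll "Fix data race" hunks a bit further above: events_count becomes std::atomic because one thread may poll the counter from getManyReady() while another thread adds or removes descriptors. A tiny self-contained illustration of the same pattern (EventCounterStub is not the real Epoll class):

``` cpp
#include <atomic>
#include <cassert>
#include <thread>

// A counter updated from one thread and polled from another: a plain int here
// would be a data race; std::atomic makes the concurrent accesses well defined.
class EventCounterStub
{
public:
    void add()    { ++events_count; }
    void remove() { --events_count; }
    bool empty() const { return events_count.load() == 0; }

private:
    std::atomic<int> events_count{0};
};

int main()
{
    EventCounterStub counter;
    std::thread writer([&] { for (int i = 0; i < 1000; ++i) { counter.add(); counter.remove(); } });
    std::thread reader([&] { for (int i = 0; i < 1000; ++i) { (void)counter.empty(); } });
    writer.join();
    reader.join();
    assert(counter.empty());
}
```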
--- .../functions/type-conversion-functions.md | 42 +++++++++---------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 08e83771af7..81b5649db32 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -738,28 +738,14 @@ Result: Query: ``` sql -SELECT parseDateTimeBestEffortUSOrNull('02-10-2021 21:12:57') AS parseDateTimeBestEffortUSOrNull; +SELECT parseDateTimeBestEffortUSOrNull('02-10-2021 21:12:57 GMT', 'Europe/Moscow') AS parseDateTimeBestEffortUSOrNull; ``` Result: ``` text ┌─parseDateTimeBestEffortUSOrNull─┐ -│ 2021-02-10 21:12:57 │ -└─────────────────────────────────┘ -``` - -Query: - -``` sql -SELECT parseDateTimeBestEffortUSOrNull('02.10.2021 21:12:57') AS parseDateTimeBestEffortUSOrNull; -``` - -Result: - -``` text -┌─parseDateTimeBestEffortUSOrNull─┐ -│ 2021-02-10 21:12:57 │ +│ 2021-02-11 00:12:57 │ └─────────────────────────────────┘ ``` @@ -771,6 +757,20 @@ SELECT parseDateTimeBestEffortUSOrNull('02.10.2021') AS parseDateTimeBestEffortU Result: +``` text +┌─parseDateTimeBestEffortUSOrNull─┐ +│ 2021-02-10 00:00:00 │ +└─────────────────────────────────┘ +``` + +Query: + +``` sql +SELECT parseDateTimeBestEffortUSOrNull('10.2021') AS parseDateTimeBestEffortUSOrNull; +``` + +Result: + ``` text ┌─parseDateTimeBestEffortUSOrNull─┐ │ ᴺᵁᴸᴸ │ @@ -826,35 +826,35 @@ Result: Query: ``` sql -SELECT parseDateTimeBestEffortUSOrZero('02-10-2021 21:12:57') AS parseDateTimeBestEffortUSOrZero; +SELECT parseDateTimeBestEffortUSOrZero('02-10-2021 21:12:57 GMT', 'Europe/Moscow') AS parseDateTimeBestEffortUSOrZero; ``` Result: ``` text ┌─parseDateTimeBestEffortUSOrZero─┐ -│ 2021-02-10 21:12:57 │ +│ 2021-02-11 00:12:57 │ └─────────────────────────────────┘ ``` Query: ``` sql -SELECT parseDateTimeBestEffortUSOrZero('02.10.2021 21:12:57') AS parseDateTimeBestEffortUS; +SELECT parseDateTimeBestEffortUSOrZero('02.10.2021') AS parseDateTimeBestEffortUSOrZero; ``` Result: ``` text ┌─parseDateTimeBestEffortUSOrZero─┐ -│ 2021-02-10 21:12:57 │ +│ 2021-02-10 00:00:00 │ └─────────────────────────────────┘ ``` Query: ``` sql -SELECT parseDateTimeBestEffortUSOrZero('02.2021 21:12:57') AS parseDateTimeBestEffortUSOrZero; +SELECT parseDateTimeBestEffortUSOrZero('02.2021') AS parseDateTimeBestEffortUSOrZero; ``` Result: From 6dcb306060e0fb70371eb6b5d5fceb1357d29ed9 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Tue, 16 Feb 2021 00:46:51 +0300 Subject: [PATCH 0312/2357] Style --- src/Client/Connection.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/Client/Connection.cpp b/src/Client/Connection.cpp index 0e8b94ef1cb..ee2d4474a0d 100644 --- a/src/Client/Connection.cpp +++ b/src/Client/Connection.cpp @@ -106,8 +106,7 @@ void Connection::prepare(const ConnectionTimeouts & timeouts) /// the server know which host we want to talk with (single IP can process requests for multiple hosts using SNI). 
static_cast(socket.get())->setPeerHostName(host); #else - throw Exception{ - "tcp_secure protocol is disabled because poco library was built without NetSSL support.", ErrorCodes::SUPPORT_IS_DISABLED}; + throw Exception{"tcp_secure protocol is disabled because poco library was built without NetSSL support.", ErrorCodes::SUPPORT_IS_DISABLED}; #endif } else From 2a887b9772180e6d0a731f966dc57572c73f25bd Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 15 Feb 2021 21:56:51 +0000 Subject: [PATCH 0313/2357] Add missing format factory settings --- .../table-engines/integrations/rabbitmq.md | 9 +++- .../table-engines/integrations/rabbitmq.md | 9 +++- src/Storages/RabbitMQ/RabbitMQSettings.h | 7 ++- src/Storages/RabbitMQ/StorageRabbitMQ.cpp | 9 ++++ .../integration/test_storage_rabbitmq/test.py | 53 +++++++++++++++++++ 5 files changed, 82 insertions(+), 5 deletions(-) diff --git a/docs/en/engines/table-engines/integrations/rabbitmq.md b/docs/en/engines/table-engines/integrations/rabbitmq.md index b0901ee6f6e..dbae6b62257 100644 --- a/docs/en/engines/table-engines/integrations/rabbitmq.md +++ b/docs/en/engines/table-engines/integrations/rabbitmq.md @@ -59,6 +59,8 @@ Optional parameters: - `rabbitmq_max_block_size` - `rabbitmq_flush_interval_ms` +Also FormatFactory settings can be added along with rabbitmq-related settings. + Required configuration: The RabbitMQ server configuration should be added using the ClickHouse config file. @@ -75,11 +77,13 @@ Example: ``` sql CREATE TABLE queue ( key UInt64, - value UInt64 + value UInt64, + date DateTime ) ENGINE = RabbitMQ SETTINGS rabbitmq_host_port = 'localhost:5672', rabbitmq_exchange_name = 'exchange1', rabbitmq_format = 'JSONEachRow', - rabbitmq_num_consumers = 5; + rabbitmq_num_consumers = 5, + date_time_input_format = 'best_effort'; ``` ## Description {#description} @@ -105,6 +109,7 @@ Exchange type options: - `consistent_hash` - Data is evenly distributed between all bound tables (where the exchange name is the same). Note that this exchange type must be enabled with RabbitMQ plugin: `rabbitmq-plugins enable rabbitmq_consistent_hash_exchange`. Setting `rabbitmq_queue_base` may be used for the following cases: + - to let different tables share queues, so that multiple consumers could be registered for the same queues, which makes a better performance. If using `rabbitmq_num_consumers` and/or `rabbitmq_num_queues` settings, the exact match of queues is achieved in case these parameters are the same. - to be able to restore reading from certain durable queues when not all messages were successfully consumed. To resume consumption from one specific queue - set its name in `rabbitmq_queue_base` setting and do not specify `rabbitmq_num_consumers` and `rabbitmq_num_queues` (defaults to 1). To resume consumption from all queues, which were declared for a specific table - just specify the same settings: `rabbitmq_queue_base`, `rabbitmq_num_consumers`, `rabbitmq_num_queues`. By default, queue names will be unique to tables. - to reuse queues as they are declared durable and not auto-deleted. (Can be deleted via any of RabbitMQ CLI tools.) 
diff --git a/docs/ru/engines/table-engines/integrations/rabbitmq.md b/docs/ru/engines/table-engines/integrations/rabbitmq.md index dedb5842d68..bc2eda746cf 100644 --- a/docs/ru/engines/table-engines/integrations/rabbitmq.md +++ b/docs/ru/engines/table-engines/integrations/rabbitmq.md @@ -52,6 +52,8 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] - `rabbitmq_max_block_size` - `rabbitmq_flush_interval_ms` +Настройки FormatFactory также могут быть добавлены в списке RabbitMQ настроек. + Требуемая конфигурация: Конфигурация сервера RabbitMQ добавляется с помощью конфигурационного файла ClickHouse. @@ -68,11 +70,13 @@ Example: ``` sql CREATE TABLE queue ( key UInt64, - value UInt64 + value UInt64, + date DateTime ) ENGINE = RabbitMQ SETTINGS rabbitmq_host_port = 'localhost:5672', rabbitmq_exchange_name = 'exchange1', rabbitmq_format = 'JSONEachRow', - rabbitmq_num_consumers = 5; + rabbitmq_num_consumers = 5, + date_time_input_format = 'best_effort'; ``` ## Описание {#description} @@ -98,6 +102,7 @@ Example: - `consistent_hash` - данные равномерно распределяются между всеми связанными таблицами, где имя точки обмена совпадает. Обратите внимание, что этот тип обмена должен быть включен с помощью плагина RabbitMQ: `rabbitmq-plugins enable rabbitmq_consistent_hash_exchange`. Настройка `rabbitmq_queue_base` может быть использована в следующих случаях: + 1. чтобы восстановить чтение из ранее созданных очередей, если оно прекратилось по какой-либо причине, но очереди остались непустыми. Для восстановления чтения из одной конкретной очереди, нужно написать ее имя в `rabbitmq_queue_base` настройку и не указывать настройки `rabbitmq_num_consumers` и `rabbitmq_num_queues`. Чтобы восстановить чтение из всех очередей, которые были созданы для конкретной таблицы, необходимо совпадение следующих настроек: `rabbitmq_queue_base`, `rabbitmq_num_consumers`, `rabbitmq_num_queues`. По умолчанию, если настройка `rabbitmq_queue_base` не указана, будут использованы уникальные для каждой таблицы имена очередей. 2. чтобы объявить одни и те же очереди для разных таблиц, что позволяет создавать несколько параллельных подписчиков на каждую из очередей. То есть обеспечивается лучшая производительность. В данном случае, для таких таблиц также необходимо совпадение настроек: `rabbitmq_num_consumers`, `rabbitmq_num_queues`. 3. чтобы повторно использовать созданные c `durable` настройкой очереди, так как они не удаляются автоматически (но могут быть удалены с помощью любого RabbitMQ CLI). 
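The RabbitMQSettings.h change that follows composes the engine settings list with FORMAT_FACTORY_SETTINGS via the usual X-macro trick: each list is a macro that applies M to every entry, and the combined list simply invokes both. A generic, self-contained illustration of the idiom (all setting names below are made up):

``` cpp
#include <iostream>

// Each list is a macro that applies M to every entry; the combined list
// expands both, so one declaration covers engine and format settings alike.
#define ENGINE_SETTINGS(M) \
    M(exchange_name)       \
    M(num_consumers)

#define FORMAT_SETTINGS(M) \
    M(date_time_input_format)

#define ALL_SETTINGS(M) \
    ENGINE_SETTINGS(M)  \
    FORMAT_SETTINGS(M)

#define PRINT_SETTING(name) std::cout << #name << '\n';

int main()
{
    ALL_SETTINGS(PRINT_SETTING)   // prints every setting from both lists
}
```

This composition is what lets a single SETTINGS clause accept both rabbitmq_* options and format options such as date_time_input_format.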
diff --git a/src/Storages/RabbitMQ/RabbitMQSettings.h b/src/Storages/RabbitMQ/RabbitMQSettings.h index 2f8d6adfa16..66348d61424 100644 --- a/src/Storages/RabbitMQ/RabbitMQSettings.h +++ b/src/Storages/RabbitMQ/RabbitMQSettings.h @@ -1,13 +1,14 @@ #pragma once #include +#include namespace DB { class ASTStorage; -#define LIST_OF_RABBITMQ_SETTINGS(M) \ +#define RABBITMQ_RELATED_SETTINGS(M) \ M(String, rabbitmq_host_port, "", "A host-port to connect to RabbitMQ server.", 0) \ M(String, rabbitmq_exchange_name, "clickhouse-exchange", "The exchange name, to which messages are sent.", 0) \ M(String, rabbitmq_format, "", "The message format.", 0) \ @@ -24,6 +25,10 @@ namespace DB M(UInt64, rabbitmq_max_block_size, 0, "Number of row collected before flushing data from RabbitMQ.", 0) \ M(Milliseconds, rabbitmq_flush_interval_ms, 0, "Timeout for flushing data from RabbitMQ.", 0) \ +#define LIST_OF_RABBITMQ_SETTINGS(M) \ + RABBITMQ_RELATED_SETTINGS(M) \ + FORMAT_FACTORY_SETTINGS(M) + DECLARE_SETTINGS_TRAITS(RabbitMQSettingsTraits, LIST_OF_RABBITMQ_SETTINGS) struct RabbitMQSettings : public BaseSettings diff --git a/src/Storages/RabbitMQ/StorageRabbitMQ.cpp b/src/Storages/RabbitMQ/StorageRabbitMQ.cpp index 3ee9dda2bf3..edce1a4b658 100644 --- a/src/Storages/RabbitMQ/StorageRabbitMQ.cpp +++ b/src/Storages/RabbitMQ/StorageRabbitMQ.cpp @@ -199,6 +199,15 @@ std::shared_ptr StorageRabbitMQ::addSettings(const Context & context) c if (!schema_name.empty()) modified_context->setSetting("format_schema", schema_name); + for (const auto & setting : *rabbitmq_settings) + { + const auto & setting_name = setting.getName(); + + /// check for non-rabbitmq-related settings + if (!setting_name.starts_with("rabbitmq_")) + modified_context->setSetting(setting_name, setting.getValue()); + } + return modified_context; } diff --git a/tests/integration/test_storage_rabbitmq/test.py b/tests/integration/test_storage_rabbitmq/test.py index 911f6d144f9..ca89ebdea0a 100644 --- a/tests/integration/test_storage_rabbitmq/test.py +++ b/tests/integration/test_storage_rabbitmq/test.py @@ -1912,6 +1912,59 @@ def test_rabbitmq_no_connection_at_startup(rabbitmq_cluster): assert int(result) == messages_num, 'ClickHouse lost some messages: {}'.format(result) +@pytest.mark.timeout(120) +def test_rabbitmq_format_factory_settings(rabbitmq_cluster): + instance.query(''' + CREATE TABLE test.format_settings ( + id String, date DateTime + ) ENGINE = RabbitMQ + SETTINGS rabbitmq_host_port = 'rabbitmq1:5672', + rabbitmq_exchange_name = 'format_settings', + rabbitmq_format = 'JSONEachRow', + date_time_input_format = 'best_effort'; + ''') + + credentials = pika.PlainCredentials('root', 'clickhouse') + parameters = pika.ConnectionParameters('localhost', 5672, '/', credentials) + connection = pika.BlockingConnection(parameters) + channel = connection.channel() + + message = json.dumps({"id":"format_settings_test","date":"2021-01-19T14:42:33.1829214Z"}) + expected = instance.query('''SELECT parseDateTimeBestEffort(CAST('2021-01-19T14:42:33.1829214Z', 'String'))''') + + channel.basic_publish(exchange='format_settings', routing_key='', body=message) + result = '' + while True: + result = instance.query('SELECT date FROM test.format_settings') + if result == expected: + break; + + instance.query(''' + DROP TABLE IF EXISTS test.view; + DROP TABLE IF EXISTS test.consumer; + CREATE TABLE test.view ( + id String, date DateTime + ) ENGINE = MergeTree ORDER BY id; + CREATE MATERIALIZED VIEW test.consumer TO test.view AS + SELECT * FROM test.format_settings; + ''') + 
+ channel.basic_publish(exchange='format_settings', routing_key='', body=message) + result = '' + while True: + result = instance.query('SELECT date FROM test.view') + if result == expected: + break; + + connection.close() + instance.query(''' + DROP TABLE test.consumer; + DROP TABLE test.format_settings; + ''') + + assert(result == expected) + + if __name__ == '__main__': cluster.start() input("Cluster created, press any key to destroy...") From d9f66d8d30b35058b9d2fc0fa070ad4c3c1a5cd5 Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 15 Feb 2021 23:25:19 +0000 Subject: [PATCH 0314/2357] Better doc --- docs/en/engines/table-engines/integrations/rabbitmq.md | 2 +- docs/ru/engines/table-engines/integrations/rabbitmq.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/engines/table-engines/integrations/rabbitmq.md b/docs/en/engines/table-engines/integrations/rabbitmq.md index dbae6b62257..946f70f903d 100644 --- a/docs/en/engines/table-engines/integrations/rabbitmq.md +++ b/docs/en/engines/table-engines/integrations/rabbitmq.md @@ -59,7 +59,7 @@ Optional parameters: - `rabbitmq_max_block_size` - `rabbitmq_flush_interval_ms` -Also FormatFactory settings can be added along with rabbitmq-related settings. +Also format settings can be added along with rabbitmq-related settings. Required configuration: diff --git a/docs/ru/engines/table-engines/integrations/rabbitmq.md b/docs/ru/engines/table-engines/integrations/rabbitmq.md index bc2eda746cf..173beecb6e7 100644 --- a/docs/ru/engines/table-engines/integrations/rabbitmq.md +++ b/docs/ru/engines/table-engines/integrations/rabbitmq.md @@ -52,7 +52,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] - `rabbitmq_max_block_size` - `rabbitmq_flush_interval_ms` -Настройки FormatFactory также могут быть добавлены в списке RabbitMQ настроек. +Настройки форматов данных также могут быть добавлены в списке RabbitMQ настроек. Требуемая конфигурация: From f139ad8080f66105eef83a80fef8310d31a85b2f Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 16 Feb 2021 09:15:12 +0300 Subject: [PATCH 0315/2357] Comment debug output. 
--- src/Interpreters/InterpreterSelectQuery.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index 2c960b6983a..45d187c34d5 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -107,9 +107,9 @@ namespace ErrorCodes String InterpreterSelectQuery::generateFilterActions( ActionsDAGPtr & actions, const ASTPtr & row_policy_filter, const Names & prerequisite_columns) const { - std::cerr << "----- InterpreterSelectQuery::generateFilterActions\n"; - for (const auto & name : prerequisite_columns) - std::cerr << name << std::endl; + // std::cerr << "----- InterpreterSelectQuery::generateFilterActions\n"; + // for (const auto & name : prerequisite_columns) + // std::cerr << name << std::endl; const auto & db_name = table_id.getDatabaseName(); const auto & table_name = table_id.getTableName(); @@ -529,9 +529,9 @@ void InterpreterSelectQuery::buildQueryPlan(QueryPlan & query_plan) { executeImpl(query_plan, input, std::move(input_pipe)); - WriteBufferFromOwnString buf; - query_plan.explainPlan(buf, {.header = true, .actions = true}); - std::cerr << buf.str(); + // WriteBufferFromOwnString buf; + // query_plan.explainPlan(buf, {.header = true, .actions = true}); + // std::cerr << buf.str(); /// We must guarantee that result structure is the same as in getSampleBlock() if (!blocksHaveEqualStructure(query_plan.getCurrentDataStream().header, result_header)) @@ -840,7 +840,7 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu else { /// Add row level security actions to prewhere. - std::cerr << expressions.filter_info->actions->dumpDAG() << std::endl; + // std::cerr << expressions.filter_info->actions->dumpDAG() << std::endl; expressions.prewhere_info->row_level_filter_actions = std::move(expressions.filter_info->actions); expressions.prewhere_info->row_level_column_name = std::move(expressions.filter_info->column_name); expressions.prewhere_info->row_level_filter_actions->projectInput(false); From 0e1d67ad9ac17c48fe2c7b44bd8b3a1ad485927e Mon Sep 17 00:00:00 2001 From: Pavel Kruglov Date: Tue, 16 Feb 2021 10:56:45 +0300 Subject: [PATCH 0316/2357] Add LOG_DEBUG to debug test_distributed_load_balancing --- src/Client/Connection.cpp | 16 +++++++++++ src/Client/HedgedConnections.cpp | 11 ++++++++ src/Client/HedgedConnections.h | 2 ++ src/Client/HedgedConnectionsFactory.cpp | 28 +++++++++++++++++++ .../configs/users.xml | 8 ++++++ .../test_distributed_load_balancing/test.py | 1 + 6 files changed, 66 insertions(+) create mode 100644 tests/integration/test_distributed_load_balancing/configs/users.xml diff --git a/src/Client/Connection.cpp b/src/Client/Connection.cpp index 0e8b94ef1cb..65dcdfd5fe7 100644 --- a/src/Client/Connection.cpp +++ b/src/Client/Connection.cpp @@ -74,6 +74,7 @@ void Connection::connect(const ConnectionTimeouts & timeouts) void Connection::disconnect() { + LOG_DEBUG(log, "disconnect"); maybe_compressed_out = nullptr; in = nullptr; last_input_packet_type.reset(); @@ -160,6 +161,7 @@ void Connection::prepare(const ConnectionTimeouts & timeouts) void Connection::sendHello() { + LOG_DEBUG(log_wrapper.get(), "sendHello"); try { /** Disallow control characters in user controlled parameters @@ -233,6 +235,8 @@ void Connection::sendHello() void Connection::receiveHello() { + LOG_DEBUG(log_wrapper.get(), "receiveHello"); + try { /// Receive hello packet. 
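The LOG_DEBUG lines added above trace entry into each protocol method by hand. Purely as a sketch of an alternative, a small RAII helper can log entry and exit once per scope (ScopeTraceStub is hypothetical and not part of ClickHouse):

``` cpp
#include <iostream>
#include <string>

// Hypothetical scoped tracer: logs when a function is entered and left, giving
// the same call trace as manual per-function log lines but from one place.
class ScopeTraceStub
{
public:
    explicit ScopeTraceStub(std::string name) : name_(std::move(name))
    {
        std::cerr << "enter " << name_ << '\n';
    }
    ~ScopeTraceStub() { std::cerr << "leave " << name_ << '\n'; }

private:
    std::string name_;
};

void receivePacketExample()
{
    ScopeTraceStub trace("receivePacketExample");
    // ... protocol handling would go here ...
}

int main()
{
    receivePacketExample();
}
```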
@@ -430,6 +434,8 @@ TablesStatusResponse Connection::getTablesStatus(const ConnectionTimeouts & time void Connection::sendTablesStatusRequest(const TablesStatusRequest & request) { + LOG_DEBUG(log_wrapper.get(), "sendTablesStatusRequest"); + writeVarUInt(Protocol::Client::TablesStatusRequest, *out); request.write(*out, server_revision); out->next(); @@ -437,6 +443,8 @@ void Connection::sendTablesStatusRequest(const TablesStatusRequest & request) TablesStatusResponse Connection::receiveTablesStatusResponse() { + LOG_DEBUG(log_wrapper.get(), "receiveTablesStatusResponse"); + UInt64 response_type = 0; readVarUInt(response_type, *in); @@ -459,6 +467,8 @@ void Connection::sendQuery( const ClientInfo * client_info, bool with_pending_data) { + LOG_DEBUG(log_wrapper.get(), "sendQuery"); + if (!connected) connect(timeouts); @@ -556,6 +566,8 @@ void Connection::sendQuery( void Connection::sendCancel() { + LOG_DEBUG(log_wrapper.get(), "sendCancel"); + /// If we already disconnected. if (!out) return; @@ -806,6 +818,8 @@ std::optional Connection::checkPacket(size_t timeout_microseconds) Packet Connection::receivePacket(AsyncCallback async_callback) { + LOG_DEBUG(log_wrapper.get(), "receivePacket"); + in->setAsyncCallback(std::move(async_callback)); SCOPE_EXIT(in->setAsyncCallback({})); @@ -883,6 +897,8 @@ Packet Connection::receivePacket(AsyncCallback async_callback) Block Connection::receiveData() { + LOG_DEBUG(log_wrapper.get(), "receiveData"); + initBlockInput(); return receiveDataImpl(block_in); } diff --git a/src/Client/HedgedConnections.cpp b/src/Client/HedgedConnections.cpp index ad00c60b302..6d49c0f6749 100644 --- a/src/Client/HedgedConnections.cpp +++ b/src/Client/HedgedConnections.cpp @@ -45,6 +45,8 @@ HedgedConnections::HedgedConnections( active_connection_count = connections.size(); offsets_with_received_first_data_packet = 0; pipeline_for_new_replicas.add([throttler_](ReplicaState & replica_) { replica_.connection->setThrottler(throttler_); }); + + log = &Poco::Logger::get("HedgedConnections"); } void HedgedConnections::Pipeline::add(std::function send_function) @@ -285,6 +287,7 @@ Packet HedgedConnections::receivePacketUnlocked(AsyncCallback async_callback) HedgedConnections::ReplicaLocation HedgedConnections::getReadyReplicaLocation(AsyncCallback async_callback) { + LOG_DEBUG(log, "getReadyReplicaLocation"); int event_fd; while (true) { @@ -374,6 +377,8 @@ bool HedgedConnections::checkPendingData(ReplicaLocation & location_out) Packet HedgedConnections::receivePacketFromReplica(const ReplicaLocation & replica_location, AsyncCallback async_callback) { + LOG_DEBUG(log, "receivePacketFromReplica"); + ReplicaState & replica = offset_states[replica_location.offset].replicas[replica_location.index]; replica.receive_timeout.reset(); Packet packet = replica.connection->receivePacket(std::move(async_callback)); @@ -408,6 +413,8 @@ Packet HedgedConnections::receivePacketFromReplica(const ReplicaLocation & repli void HedgedConnections::processReceivedFirstDataPacket(const ReplicaLocation & replica_location) { + LOG_DEBUG(log, "processReceivedFirstDataPacket"); + /// When we receive first packet of data from replica, we stop working with replicas, that are /// responsible for the same offset. 
OffsetState & offset_state = offset_states[replica_location.offset]; @@ -438,6 +445,8 @@ void HedgedConnections::processReceivedFirstDataPacket(const ReplicaLocation & r void HedgedConnections::tryGetNewReplica(bool start_new_connection) { + LOG_DEBUG(log, "tryGetNewReplica"); + Connection * connection = nullptr; HedgedConnectionsFactory::State state = hedged_connections_factory.getNextConnection(start_new_connection, false, connection); @@ -488,6 +497,8 @@ void HedgedConnections::tryGetNewReplica(bool start_new_connection) void HedgedConnections::finishProcessReplica(ReplicaState & replica, bool disconnect) { + LOG_DEBUG(log, "finishProcessReplica"); + epoll.remove(replica.epoll.getFileDescriptor()); --offset_states[fd_to_replica_location[replica.epoll.getFileDescriptor()].offset].active_connection_count; fd_to_replica_location.erase(replica.epoll.getFileDescriptor()); diff --git a/src/Client/HedgedConnections.h b/src/Client/HedgedConnections.h index 249c41a7a06..41c548de9ef 100644 --- a/src/Client/HedgedConnections.h +++ b/src/Client/HedgedConnections.h @@ -163,6 +163,8 @@ private: bool cancelled = false; mutable std::mutex cancel_mutex; + + Poco::Logger * log; }; } diff --git a/src/Client/HedgedConnectionsFactory.cpp b/src/Client/HedgedConnectionsFactory.cpp index c881c2723df..ba0e4ac7b22 100644 --- a/src/Client/HedgedConnectionsFactory.cpp +++ b/src/Client/HedgedConnectionsFactory.cpp @@ -43,6 +43,7 @@ HedgedConnectionsFactory::~HedgedConnectionsFactory() std::vector HedgedConnectionsFactory::getManyConnections(PoolMode pool_mode) { + LOG_DEBUG(log, "getManyConnections"); size_t min_entries = (settings && settings->skip_unavailable_shards) ? 0 : 1; size_t max_entries; @@ -102,6 +103,8 @@ std::vector HedgedConnectionsFactory::getManyConnections(PoolMode HedgedConnectionsFactory::State HedgedConnectionsFactory::getNextConnection(bool start_new_connection, bool blocking, Connection *& connection_out) { + LOG_DEBUG(log, "getNextConnection"); + if (start_new_connection) { int index = startEstablishingNewConnection(connection_out); @@ -125,6 +128,7 @@ HedgedConnectionsFactory::State HedgedConnectionsFactory::getNextConnection(bool void HedgedConnectionsFactory::stopChoosingReplicas() { + LOG_DEBUG(log, "stopChoosingReplicas"); for (auto & [fd, replica_index] : fd_to_replica_index) { resetReplicaTimeouts(replica_index); @@ -137,6 +141,8 @@ void HedgedConnectionsFactory::stopChoosingReplicas() int HedgedConnectionsFactory::getNextIndex() { + LOG_DEBUG(log, "getNextIndex"); + /// Check if there is no free replica. 
if (entries_count + replicas_in_process_count + failed_pools_count >= shuffled_pools.size()) return -1; @@ -170,9 +176,13 @@ int HedgedConnectionsFactory::getNextIndex() int HedgedConnectionsFactory::startEstablishingNewConnection(Connection *& connection_out) { + LOG_DEBUG(log, "startEstablishingNewConnection"); + int index; do { + LOG_DEBUG(log, "startEstablishingNewConnection loop"); + index = getNextIndex(); if (index == -1) return -1; @@ -205,6 +215,8 @@ int HedgedConnectionsFactory::startEstablishingNewConnection(Connection *& conne void HedgedConnectionsFactory::processConnectionEstablisherStage(int replica_index, bool remove_from_epoll) { + LOG_DEBUG(log, "processConnectionEstablisherStage"); + ReplicaStatus & replica = replicas[replica_index]; if (replica.connection_establisher.stage == ConnectionEstablisher::Stage::FINISHED) @@ -224,6 +236,7 @@ void HedgedConnectionsFactory::processConnectionEstablisherStage(int replica_ind ++usable_count; if (replica.connection_establisher.result.is_up_to_date) { + LOG_DEBUG(log, "READY"); ++ready_replicas_count; replica.is_ready = true; return; @@ -242,6 +255,8 @@ void HedgedConnectionsFactory::processConnectionEstablisherStage(int replica_ind void HedgedConnectionsFactory::processFailedConnection(int replica_index, bool remove_from_epoll) { + LOG_DEBUG(log, "processFailedConnection"); + if (remove_from_epoll) { epoll.remove(replicas[replica_index].epoll.getFileDescriptor()); @@ -271,6 +286,8 @@ void HedgedConnectionsFactory::processFailedConnection(int replica_index, bool r void HedgedConnectionsFactory::addTimeouts(int replica_index) { + LOG_DEBUG(log, "addTimeouts"); + auto stage = replicas[replica_index].connection_establisher.stage; if (stage == ConnectionEstablisher::Stage::RECEIVE_HELLO) { @@ -286,12 +303,16 @@ void HedgedConnectionsFactory::addTimeouts(int replica_index) void HedgedConnectionsFactory::resetReplicaTimeouts(int replica_index) { + LOG_DEBUG(log, "resetReplicaTimeouts"); + replicas[replica_index].receive_timeout.reset(); replicas[replica_index].change_replica_timeout.reset(); } HedgedConnectionsFactory::State HedgedConnectionsFactory::processEpollEvents(bool blocking, Connection *& connection_out) { + LOG_DEBUG(log, "processEpollEvents"); + int event_fd; while (!epoll.empty()) { @@ -353,7 +374,10 @@ HedgedConnectionsFactory::State HedgedConnectionsFactory::processEpollEvents(boo processReceiveTimeout(replica_index); if (is_change_replica_timeout_alarmed) + { + LOG_DEBUG(log, "change_replica_timeout"); replicas[replica_index].change_replica_timeout.reset(); + } } /// We reach this point only if we need to start new connection. 
@@ -385,6 +409,8 @@ int HedgedConnectionsFactory::checkPendingData() void HedgedConnectionsFactory::processSocketEvent(int replica_index, Connection *& connection_out) { + LOG_DEBUG(log, "processSocketEvent"); + resetReplicaTimeouts(replica_index); replicas[replica_index].connection_establisher.run(); processConnectionEstablisherStage(replica_index, true); @@ -396,6 +422,8 @@ void HedgedConnectionsFactory::processSocketEvent(int replica_index, Connection void HedgedConnectionsFactory::processReceiveTimeout(int replica_index) { + LOG_DEBUG(log, "processReceiveTimeout"); + resetReplicaTimeouts(replica_index); ReplicaStatus & replica = replicas[replica_index]; diff --git a/tests/integration/test_distributed_load_balancing/configs/users.xml b/tests/integration/test_distributed_load_balancing/configs/users.xml new file mode 100644 index 00000000000..b2dcdbcd8f3 --- /dev/null +++ b/tests/integration/test_distributed_load_balancing/configs/users.xml @@ -0,0 +1,8 @@ + + + + + 0 + + + diff --git a/tests/integration/test_distributed_load_balancing/test.py b/tests/integration/test_distributed_load_balancing/test.py index df7b74fcae1..d3ac5c132cd 100644 --- a/tests/integration/test_distributed_load_balancing/test.py +++ b/tests/integration/test_distributed_load_balancing/test.py @@ -166,6 +166,7 @@ def test_load_balancing_priority_round_robin(dist_table): def test_distributed_replica_max_ignored_errors(): settings = { + 'use_hedged_requests' : 0, 'load_balancing': 'in_order', 'prefer_localhost_replica': 0, 'connect_timeout': 2, From 1872319d8c2e41b35a93e1c2c930025453d0dc26 Mon Sep 17 00:00:00 2001 From: vdimir Date: Tue, 16 Feb 2021 11:21:54 +0300 Subject: [PATCH 0317/2357] Fix unused variable in isAllowedToRewriteCrossJoin --- src/Interpreters/CrossToInnerJoinVisitor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Interpreters/CrossToInnerJoinVisitor.cpp b/src/Interpreters/CrossToInnerJoinVisitor.cpp index fc2747af8eb..b1e42b23ad5 100644 --- a/src/Interpreters/CrossToInnerJoinVisitor.cpp +++ b/src/Interpreters/CrossToInnerJoinVisitor.cpp @@ -156,7 +156,7 @@ std::optional getIdentsMembership(const ASTPtr ast, bool isAllowedToRewriteCrossJoin(const ASTPtr & node, const Aliases & aliases) { - if (const auto * func = node->as()) + if (node->as()) { auto idents = IdentifiersCollector::collect(node); for (const auto * ident : idents) From a72ef6f026eb955fe43ba9c2d07e3ad6e6646983 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 16 Feb 2021 11:26:24 +0300 Subject: [PATCH 0318/2357] Fix number of threads for scalar subqueries and subqueries for index. 
--- .../ExecuteScalarSubqueriesVisitor.cpp | 16 ++++++++++++---- src/Interpreters/ExpressionAnalyzer.cpp | 7 +++++-- .../Executors/PullingAsyncPipelineExecutor.cpp | 7 ++++++- src/Processors/Formats/LazyOutputFormat.cpp | 9 +++++++-- 4 files changed, 30 insertions(+), 9 deletions(-) diff --git a/src/Interpreters/ExecuteScalarSubqueriesVisitor.cpp b/src/Interpreters/ExecuteScalarSubqueriesVisitor.cpp index e6061aabe94..7ee7bb1f301 100644 --- a/src/Interpreters/ExecuteScalarSubqueriesVisitor.cpp +++ b/src/Interpreters/ExecuteScalarSubqueriesVisitor.cpp @@ -21,7 +21,7 @@ #include -#include +#include namespace DB { @@ -122,8 +122,10 @@ void ExecuteScalarSubqueriesMatcher::visit(const ASTSubquery & subquery, ASTPtr try { - PullingPipelineExecutor executor(io.pipeline); - if (!executor.pull(block)) + PullingAsyncPipelineExecutor executor(io.pipeline); + while (block.rows() == 0 && executor.pull(block)); + + if (block.rows() == 0) { /// Interpret subquery with empty result as Null literal auto ast_new = std::make_unique(Null()); @@ -132,7 +134,13 @@ void ExecuteScalarSubqueriesMatcher::visit(const ASTSubquery & subquery, ASTPtr return; } - if (block.rows() != 1 || executor.pull(block)) + if (block.rows() != 1) + throw Exception("Scalar subquery returned more than one row", ErrorCodes::INCORRECT_RESULT_OF_SCALAR_SUBQUERY); + + Block tmp_block; + while (tmp_block.rows() == 0 && executor.pull(tmp_block)); + + if (tmp_block.rows() != 0) throw Exception("Scalar subquery returned more than one row", ErrorCodes::INCORRECT_RESULT_OF_SCALAR_SUBQUERY); } catch (const Exception & e) diff --git a/src/Interpreters/ExpressionAnalyzer.cpp b/src/Interpreters/ExpressionAnalyzer.cpp index 3f65a6f3f58..cea056d6a21 100644 --- a/src/Interpreters/ExpressionAnalyzer.cpp +++ b/src/Interpreters/ExpressionAnalyzer.cpp @@ -54,7 +54,7 @@ #include #include -#include +#include #include namespace DB @@ -321,7 +321,7 @@ void SelectQueryExpressionAnalyzer::tryMakeSetForIndexFromSubquery(const ASTPtr auto interpreter_subquery = interpretSubquery(subquery_or_table_name, context, {}, query_options); auto io = interpreter_subquery->execute(); - PullingPipelineExecutor executor(io.pipeline); + PullingAsyncPipelineExecutor executor(io.pipeline); SetPtr set = std::make_shared(settings.size_limits_for_set, true, context.getSettingsRef().transform_null_in); set->setHeader(executor.getHeader()); @@ -329,6 +329,9 @@ void SelectQueryExpressionAnalyzer::tryMakeSetForIndexFromSubquery(const ASTPtr Block block; while (executor.pull(block)) { + if (block.rows() == 0) + continue; + /// If the limits have been exceeded, give up and let the default subquery processing actions take place. 
if (!set->insertFromBlock(block)) return; diff --git a/src/Processors/Executors/PullingAsyncPipelineExecutor.cpp b/src/Processors/Executors/PullingAsyncPipelineExecutor.cpp index e4bcf6dc0ab..21741d30dfa 100644 --- a/src/Processors/Executors/PullingAsyncPipelineExecutor.cpp +++ b/src/Processors/Executors/PullingAsyncPipelineExecutor.cpp @@ -133,7 +133,12 @@ bool PullingAsyncPipelineExecutor::pull(Chunk & chunk, uint64_t milliseconds) } chunk.clear(); - data->finish_event.tryWait(milliseconds); + + if (milliseconds) + data->finish_event.tryWait(milliseconds); + else + data->finish_event.wait(); + return true; } diff --git a/src/Processors/Formats/LazyOutputFormat.cpp b/src/Processors/Formats/LazyOutputFormat.cpp index 46287d1cce9..0663ff28f84 100644 --- a/src/Processors/Formats/LazyOutputFormat.cpp +++ b/src/Processors/Formats/LazyOutputFormat.cpp @@ -16,8 +16,13 @@ Chunk LazyOutputFormat::getChunk(UInt64 milliseconds) } Chunk chunk; - if (!queue.tryPop(chunk, milliseconds)) - return {}; + if (milliseconds) + { + if (!queue.tryPop(chunk, milliseconds)) + return {}; + } + else + queue.pop(chunk); if (chunk) info.update(chunk.getNumRows(), chunk.allocatedBytes()); From 10f1432c5cb1dc77c0c31cd960a275480fa380dd Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 16 Feb 2021 11:31:17 +0300 Subject: [PATCH 0319/2357] Added perftest. --- tests/performance/subqueries.xml | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 tests/performance/subqueries.xml diff --git a/tests/performance/subqueries.xml b/tests/performance/subqueries.xml new file mode 100644 index 00000000000..f1481a78c7e --- /dev/null +++ b/tests/performance/subqueries.xml @@ -0,0 +1,7 @@ + + create table tab (a UInt32, b UInt32) engine = MergeTree order by (a, b) + insert into tab values (1, 1) + select a, b from tab where (a, b) in (select toUInt32(number) as x, toUInt32(sleep(0.1) + 1) from numbers_mt(16)) settings max_threads = 2, max_block_size = 4 + select a, b from tab where (1, 1) = (select min(toUInt32(number + 1)) as x, min(toUInt32(sleep(0.1) + 1)) from numbers_mt(16)) settings max_threads = 2, max_block_size = 4 + DROP TABLE tab + \ No newline at end of file From a14b6c2650015c379c811646f345c4c66c1d9afd Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 16 Feb 2021 12:37:19 +0300 Subject: [PATCH 0320/2357] Fix trivial count optimization --- src/Interpreters/InterpreterSelectQuery.cpp | 16 ++++++++-------- src/Interpreters/InterpreterSelectQuery.h | 4 ++-- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index 45d187c34d5..a99f99cfa13 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -104,8 +104,7 @@ namespace ErrorCodes } /// Assumes `storage` is set and the table filter (row-level security) is not empty. 
-String InterpreterSelectQuery::generateFilterActions( - ActionsDAGPtr & actions, const ASTPtr & row_policy_filter, const Names & prerequisite_columns) const +String InterpreterSelectQuery::generateFilterActions(ActionsDAGPtr & actions, const Names & prerequisite_columns) const { // std::cerr << "----- InterpreterSelectQuery::generateFilterActions\n"; // for (const auto & name : prerequisite_columns) @@ -357,7 +356,6 @@ InterpreterSelectQuery::InterpreterSelectQuery( ASTSelectQuery & query = getSelectQuery(); std::shared_ptr table_join = joined_tables.makeTableJoin(query); - ASTPtr row_policy_filter; if (storage) row_policy_filter = context->getRowPolicyCondition(table_id.getDatabaseName(), table_id.getTableName(), RowPolicy::SELECT_FILTER); @@ -457,7 +455,7 @@ InterpreterSelectQuery::InterpreterSelectQuery( if (row_policy_filter) { filter_info = std::make_shared(); - filter_info->column_name = generateFilterActions(filter_info->actions, row_policy_filter, required_columns); + filter_info->column_name = generateFilterActions(filter_info->actions, required_columns); source_header = metadata_snapshot->getSampleBlockForColumns( filter_info->actions->getRequiredColumns().getNames(), storage->getVirtuals(), storage->getStorageID()); } @@ -828,6 +826,7 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu const bool does_storage_support_prewhere = !input && !input_pipe && storage && storage->supportsPrewhere(); if (does_storage_support_prewhere && settings.optimize_move_to_prewhere) { + std::cerr << "----- Moving row level filter to prewhere\n"; /// Execute row level filter in prewhere as a part of "move to prewhere" optimization. expressions.prewhere_info = std::make_shared( std::move(expressions.filter_info->actions), @@ -1331,7 +1330,7 @@ void InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum proc && (settings.max_parallel_replicas <= 1) && storage && storage->getName() != "MaterializeMySQL" - && !expressions.filter_info + && !row_policy_filter && processing_stage == QueryProcessingStage::FetchColumns && query_analyzer->hasAggregation() && (query_analyzer->aggregates().size() == 1) @@ -1394,9 +1393,9 @@ void InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum proc if (storage) { /// Append columns from the table filter to required - ActionsDAG * row_policy_filter = nullptr; - if (expressions.filter_info) - row_policy_filter = expressions.filter_info->actions.get(); + // ActionsDAG * row_policy_filter = nullptr; + // if (expressions.filter_info) + // row_policy_filter = expressions.filter_info->actions.get(); // else if (expressions.prewhere_info && expressions.prewhere_info->row_level_filter_actions) // row_policy_filter = expressions.prewhere_info->row_level_filter_actions.get(); @@ -1651,6 +1650,7 @@ void InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum proc if (prewhere_info) { + std::cerr << "-------- filling prewhere info \n"; query_info.prewhere_info = std::make_shared(); query_info.prewhere_info->prewhere_actions = std::make_shared(prewhere_info->prewhere_actions); diff --git a/src/Interpreters/InterpreterSelectQuery.h b/src/Interpreters/InterpreterSelectQuery.h index 20cffdf5702..49169c66d1b 100644 --- a/src/Interpreters/InterpreterSelectQuery.h +++ b/src/Interpreters/InterpreterSelectQuery.h @@ -131,8 +131,7 @@ private: void executeSubqueriesInSetsAndJoins(QueryPlan & query_plan, std::unordered_map & subqueries_for_sets); void executeMergeSorted(QueryPlan & query_plan, const SortDescription & 
sort_description, UInt64 limit, const std::string & description); - String generateFilterActions( - ActionsDAGPtr & actions, const ASTPtr & row_policy_filter, const Names & prerequisite_columns = {}) const; + String generateFilterActions(ActionsDAGPtr & actions, const Names & prerequisite_columns = {}) const; enum class Modificator { @@ -157,6 +156,7 @@ private: /// Is calculated in getSampleBlock. Is used later in readImpl. ExpressionAnalysisResult analysis_result; /// For row-level security. + ASTPtr row_policy_filter; FilterDAGInfoPtr filter_info; QueryProcessingStage::Enum from_stage = QueryProcessingStage::FetchColumns; From 341e7bc8482e99478a0e40ea1afa446ca15f9312 Mon Sep 17 00:00:00 2001 From: George Date: Tue, 16 Feb 2021 13:23:52 +0300 Subject: [PATCH 0321/2357] Fixed links --- docs/en/sql-reference/statements/detach.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/sql-reference/statements/detach.md b/docs/en/sql-reference/statements/detach.md index f3f8b053724..b2720acaaa5 100644 --- a/docs/en/sql-reference/statements/detach.md +++ b/docs/en/sql-reference/statements/detach.md @@ -15,7 +15,7 @@ DETACH TABLE [IF EXISTS] [db.]name [ON CLUSTER cluster] This does not delete the table’s data or metadata. On the next server launch, the server will read the metadata and find out about the table again. -Similarly, a “detached” table can be re-attached using the [ATTACH](../../sql-reference/statements/attach) query (with the exception of system tables, which do not have metadata stored for them). +Similarly, a “detached” table can be re-attached using the [ATTACH](../../sql-reference/statements/attach.md) query (with the exception of system tables, which do not have metadata stored for them). ## DETACH PERMAMENTLY {#detach-permamently} @@ -29,6 +29,6 @@ DETACH TABLE/VIEW [IF EXISTS] [db.]name PERMAMENTLY [ON CLUSTER cluster] This statement does not delete the table’s data or metadata. -Permamently detached table or view can be reattached with [ATTACH](../../sql-reference/statements/attach) query and can be shown with [SHOW CREATE TABLE](../../sql-reference/statements/show.md#show-create-table) query. +Permamently detached table or view can be reattached with [ATTACH](../../sql-reference/statements/attach.md) query and can be shown with [SHOW CREATE TABLE](../../sql-reference/statements/show.md#show-create-table) query. [Original article](https://clickhouse.tech/docs/en/sql-reference/statements/detach/) From 17d7a49106342536a0348c020ca92e1cafc52434 Mon Sep 17 00:00:00 2001 From: George Date: Tue, 16 Feb 2021 13:33:20 +0300 Subject: [PATCH 0322/2357] Fixed typos --- docs/en/sql-reference/statements/detach.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/en/sql-reference/statements/detach.md b/docs/en/sql-reference/statements/detach.md index b2720acaaa5..adb2df570d7 100644 --- a/docs/en/sql-reference/statements/detach.md +++ b/docs/en/sql-reference/statements/detach.md @@ -17,9 +17,9 @@ This does not delete the table’s data or metadata. On the next server launch, Similarly, a “detached” table can be re-attached using the [ATTACH](../../sql-reference/statements/attach.md) query (with the exception of system tables, which do not have metadata stored for them). -## DETACH PERMAMENTLY {#detach-permamently} +## DETACH PERMANENTLY {#detach-permanently} -Deletes information about `name` table or view from the server. Permamently detached tables won't automatically reappear after the server restart. 
+Deletes information about `name` table or view from the server. Permanently detached tables won't automatically reappear after the server restart. Syntax: @@ -29,6 +29,6 @@ DETACH TABLE/VIEW [IF EXISTS] [db.]name PERMAMENTLY [ON CLUSTER cluster] This statement does not delete the table’s data or metadata. -Permamently detached table or view can be reattached with [ATTACH](../../sql-reference/statements/attach.md) query and can be shown with [SHOW CREATE TABLE](../../sql-reference/statements/show.md#show-create-table) query. +Permanently detached table or view can be reattached with [ATTACH](../../sql-reference/statements/attach.md) query and can be shown with [SHOW CREATE TABLE](../../sql-reference/statements/show.md#show-create-table) query. [Original article](https://clickhouse.tech/docs/en/sql-reference/statements/detach/) From a6322800118f9f9c27b3c239d78707af1025e97d Mon Sep 17 00:00:00 2001 From: George Date: Tue, 16 Feb 2021 13:53:44 +0300 Subject: [PATCH 0323/2357] added alias for nulls --- docs/en/sql-reference/functions/functions-for-nulls.md | 2 ++ docs/ru/sql-reference/functions/functions-for-nulls.md | 2 ++ 2 files changed, 4 insertions(+) diff --git a/docs/en/sql-reference/functions/functions-for-nulls.md b/docs/en/sql-reference/functions/functions-for-nulls.md index c32af7194fb..fbbda2c0ecc 100644 --- a/docs/en/sql-reference/functions/functions-for-nulls.md +++ b/docs/en/sql-reference/functions/functions-for-nulls.md @@ -13,6 +13,8 @@ Checks whether the argument is [NULL](../../sql-reference/syntax.md#null-literal isNull(x) ``` +Alias: `ISNULL`. + **Parameters** - `x` — A value with a non-compound data type. diff --git a/docs/ru/sql-reference/functions/functions-for-nulls.md b/docs/ru/sql-reference/functions/functions-for-nulls.md index 17da1ea9194..0db55847631 100644 --- a/docs/ru/sql-reference/functions/functions-for-nulls.md +++ b/docs/ru/sql-reference/functions/functions-for-nulls.md @@ -13,6 +13,8 @@ toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438\u0020\u0434\u043b\u044f\u isNull(x) ``` +Синоним: `ISNULL`. + **Параметры** - `x` — значение с не составным типом данных. From bc6fdc7d4b09f290a57f7da39ba4abae2532d7c6 Mon Sep 17 00:00:00 2001 From: George Date: Tue, 16 Feb 2021 14:12:12 +0300 Subject: [PATCH 0324/2357] added aliases for date-time functions --- .../functions/date-time-functions.md | 18 ++++++++++++++++++ .../functions/date-time-functions.md | 18 ++++++++++++++++++ 2 files changed, 36 insertions(+) diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index 4a73bdb2546..a0c89ecb035 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -61,40 +61,58 @@ int32samoa: 1546300800 Converts a date or date with time to a UInt16 number containing the year number (AD). +Alias: `Year`. + ## toQuarter {#toquarter} Converts a date or date with time to a UInt8 number containing the quarter number. +Alias: `QUARTER`. + ## toMonth {#tomonth} Converts a date or date with time to a UInt8 number containing the month number (1-12). +Alias: `MONTH`. + ## toDayOfYear {#todayofyear} Converts a date or date with time to a UInt16 number containing the number of the day of the year (1-366). +Alias: `DAYOFYEAR`. + ## toDayOfMonth {#todayofmonth} Converts a date or date with time to a UInt8 number containing the number of the day of the month (1-31). +Aliases: `DAYOFMONTH`, `DAY`. 
+ ## toDayOfWeek {#todayofweek} Converts a date or date with time to a UInt8 number containing the number of the day of the week (Monday is 1, and Sunday is 7). +Alias: `DAYOFWEEK`. + ## toHour {#tohour} Converts a date with time to a UInt8 number containing the number of the hour in 24-hour time (0-23). This function assumes that if clocks are moved ahead, it is by one hour and occurs at 2 a.m., and if clocks are moved back, it is by one hour and occurs at 3 a.m. (which is not always true – even in Moscow the clocks were twice changed at a different time). +Alias: `HOUR`. + ## toMinute {#tominute} Converts a date with time to a UInt8 number containing the number of the minute of the hour (0-59). +Alias: `MINUTE`. + ## toSecond {#tosecond} Converts a date with time to a UInt8 number containing the number of the second in the minute (0-59). Leap seconds are not accounted for. +Alias: `SECOND`. + ## toUnixTimestamp {#to-unix-timestamp} For DateTime argument: converts value to the number with type UInt32 -- Unix Timestamp (https://en.wikipedia.org/wiki/Unix_time). diff --git a/docs/ru/sql-reference/functions/date-time-functions.md b/docs/ru/sql-reference/functions/date-time-functions.md index 31482cde77f..add47e9dad1 100644 --- a/docs/ru/sql-reference/functions/date-time-functions.md +++ b/docs/ru/sql-reference/functions/date-time-functions.md @@ -63,40 +63,58 @@ int32samoa: 1546300800 Переводит дату или дату-с-временем в число типа UInt16, содержащее номер года (AD). +Синоним: `Year`. + ## toQuarter {#toquarter} Переводит дату или дату-с-временем в число типа UInt8, содержащее номер квартала. +Синоним: `QUARTER`. + ## toMonth {#tomonth} Переводит дату или дату-с-временем в число типа UInt8, содержащее номер месяца (1-12). +Синоним: `MONTH`. + ## toDayOfYear {#todayofyear} Переводит дату или дату-с-временем в число типа UInt16, содержащее номер дня года (1-366). +Синоним: `DAYOFYEAR`. + ## toDayOfMonth {#todayofmonth} Переводит дату или дату-с-временем в число типа UInt8, содержащее номер дня в месяце (1-31). +Синонимы: `DAYOFMONTH`, `DAY`. + ## toDayOfWeek {#todayofweek} Переводит дату или дату-с-временем в число типа UInt8, содержащее номер дня в неделе (понедельник - 1, воскресенье - 7). +Синоним: `DAYOFWEEK`. + ## toHour {#tohour} Переводит дату-с-временем в число типа UInt8, содержащее номер часа в сутках (0-23). Функция исходит из допущения, что перевод стрелок вперёд, если осуществляется, то на час, в два часа ночи, а перевод стрелок назад, если осуществляется, то на час, в три часа ночи (что, в общем, не верно - даже в Москве два раза перевод стрелок был осуществлён в другое время). +Синоним: `HOUR`. + ## toMinute {#tominute} Переводит дату-с-временем в число типа UInt8, содержащее номер минуты в часе (0-59). +Синоним: `MINUTE`. + ## toSecond {#tosecond} Переводит дату-с-временем в число типа UInt8, содержащее номер секунды в минуте (0-59). Секунды координации не учитываются. +Синоним: `SECOND`. + ## toUnixTimestamp {#to-unix-timestamp} Переводит дату-с-временем в число типа UInt32 -- Unix Timestamp (https://en.wikipedia.org/wiki/Unix_time). 
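
[Editor's note] The date/time alias patch above (EN and RU documentation) only lists the new names next to the existing `to*` functions; the actual registration happens in the C++ function factory and is not part of these doc-only diffs. A minimal usage sketch, assuming the aliases are registered as case-insensitive synonyms of the corresponding `to*` functions as the documentation implies:

``` sql
-- Sketch only: assumes YEAR / DAYOFMONTH / HOUR resolve to
-- toYear / toDayOfMonth / toHour, as the documentation above states.
SELECT
    toYear(toDate('2021-02-16'))              AS year_fn,
    YEAR(toDate('2021-02-16'))                AS year_alias,
    toDayOfMonth(toDate('2021-02-16'))        AS day_fn,
    DAYOFMONTH(toDate('2021-02-16'))          AS day_alias,
    toHour(toDateTime('2021-02-16 14:12:00')) AS hour_fn,
    HOUR(toDateTime('2021-02-16 14:12:00'))   AS hour_alias;
```

If the aliases behave as documented, each pair of columns should return identical values.
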
From 33e12f7b4a628fdd63f3a30e070cedbb0449473a Mon Sep 17 00:00:00 2001 From: George Date: Tue, 16 Feb 2021 14:12:44 +0300 Subject: [PATCH 0325/2357] added aliases for encoding functions --- docs/en/sql-reference/functions/encoding-functions.md | 2 ++ docs/ru/sql-reference/functions/encoding-functions.md | 2 ++ 2 files changed, 4 insertions(+) diff --git a/docs/en/sql-reference/functions/encoding-functions.md b/docs/en/sql-reference/functions/encoding-functions.md index bc3f5ca4345..3ec6c8ec3dd 100644 --- a/docs/en/sql-reference/functions/encoding-functions.md +++ b/docs/en/sql-reference/functions/encoding-functions.md @@ -75,6 +75,8 @@ Result: Returns a string containing the argument’s hexadecimal representation. +Alias: `HEX`. + **Syntax** ``` sql diff --git a/docs/ru/sql-reference/functions/encoding-functions.md b/docs/ru/sql-reference/functions/encoding-functions.md index 6f1c2aad6cb..8c3065e5a77 100644 --- a/docs/ru/sql-reference/functions/encoding-functions.md +++ b/docs/ru/sql-reference/functions/encoding-functions.md @@ -75,6 +75,8 @@ SELECT char(0xE4, 0xBD, 0xA0, 0xE5, 0xA5, 0xBD) AS hello; Returns a string containing the argument’s hexadecimal representation. +Синоним: `HEX`. + **Syntax** ``` sql From 1bd1a97716264f668659a972861c3f172e3b1cef Mon Sep 17 00:00:00 2001 From: George Date: Tue, 16 Feb 2021 14:13:01 +0300 Subject: [PATCH 0326/2357] added aliases for string functions --- docs/en/sql-reference/functions/string-functions.md | 4 ++++ docs/ru/sql-reference/functions/string-functions.md | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/docs/en/sql-reference/functions/string-functions.md b/docs/en/sql-reference/functions/string-functions.md index 2b93dd924a3..c1f3625c14d 100644 --- a/docs/en/sql-reference/functions/string-functions.md +++ b/docs/en/sql-reference/functions/string-functions.md @@ -276,10 +276,14 @@ Returns the string ‘s’ that was converted from the encoding in ‘from’ to Encodes ‘s’ string into base64 +Alias: `TO_BASE64`. + ## base64Decode(s) {#base64decode} Decode base64-encoded string ‘s’ into original string. In case of failure raises an exception. +Alias: `FROM_BASE64`. + ## tryBase64Decode(s) {#trybase64decode} Similar to base64Decode, but in case of error an empty string would be returned. diff --git a/docs/ru/sql-reference/functions/string-functions.md b/docs/ru/sql-reference/functions/string-functions.md index aeb0652cc18..24edc3618fb 100644 --- a/docs/ru/sql-reference/functions/string-functions.md +++ b/docs/ru/sql-reference/functions/string-functions.md @@ -273,10 +273,14 @@ SELECT concat(key1, key2), sum(value) FROM key_val GROUP BY (key1, key2) Производит кодирование строки s в base64-представление. +Синоним: `TO_BASE64`. + ## base64Decode(s) {#base64decode} Декодирует base64-представление s в исходную строку. При невозможности декодирования выбрасывает исключение +Синоним: `FROM_BASE64`. + ## tryBase64Decode(s) {#trybase64decode} Функционал аналогичен base64Decode, но при невозможности декодирования возвращает пустую строку. 
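
[Editor's note] Like the date/time change, the encoding and string patches above are documentation-only: they state that `HEX`, `TO_BASE64` and `FROM_BASE64` are aliases of `hex`, `base64Encode` and `base64Decode`. A small round-trip sketch under that assumption:

``` sql
-- Sketch only: assumes HEX, TO_BASE64 and FROM_BASE64 are synonyms of
-- hex, base64Encode and base64Decode, as the documentation above states.
SELECT
    hex(255)                             AS hex_fn,     -- 'FF'
    HEX(255)                             AS hex_alias,  -- expected to match hex_fn
    base64Encode('ClickHouse')           AS b64_fn,
    TO_BASE64('ClickHouse')              AS b64_alias,  -- expected to match b64_fn
    FROM_BASE64(TO_BASE64('ClickHouse')) AS roundtrip;  -- expected 'ClickHouse'
```
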
From 3603fbd46a30e5a8f77877de5cac871ebec17564 Mon Sep 17 00:00:00 2001 From: George Date: Tue, 16 Feb 2021 14:13:17 +0300 Subject: [PATCH 0327/2357] added aliases for ip-address functions --- .../sql-reference/functions/ip-address-functions.md | 12 +++++++++++- .../sql-reference/functions/ip-address-functions.md | 12 +++++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/docs/en/sql-reference/functions/ip-address-functions.md b/docs/en/sql-reference/functions/ip-address-functions.md index 0c1f675304b..8e2939e9272 100644 --- a/docs/en/sql-reference/functions/ip-address-functions.md +++ b/docs/en/sql-reference/functions/ip-address-functions.md @@ -9,10 +9,14 @@ toc_title: IP Addresses Takes a UInt32 number. Interprets it as an IPv4 address in big endian. Returns a string containing the corresponding IPv4 address in the format A.B.C.d (dot-separated numbers in decimal form). +Alias: `INET_NTOA`. + ## IPv4StringToNum(s) {#ipv4stringtonums} The reverse function of IPv4NumToString. If the IPv4 address has an invalid format, it returns 0. +Alias: `INET_ATON`. + ## IPv4NumToStringClassC(num) {#ipv4numtostringclasscnum} Similar to IPv4NumToString, but using xxx instead of the last octet. @@ -49,7 +53,11 @@ Since using ‘xxx’ is highly unusual, this may be changed in the future. We r ### IPv6NumToString(x) {#ipv6numtostringx} Accepts a FixedString(16) value containing the IPv6 address in binary format. Returns a string containing this address in text format. -IPv6-mapped IPv4 addresses are output in the format ::ffff:111.222.33.44. Examples: +IPv6-mapped IPv4 addresses are output in the format ::ffff:111.222.33.44. + +Alias: `INET6_NTOA`. + +Examples: ``` sql SELECT IPv6NumToString(toFixedString(unhex('2A0206B8000000000000000000000011'), 16)) AS addr @@ -119,6 +127,8 @@ The reverse function of IPv6NumToString. If the IPv6 address has an invalid form If the IP address is a valid IPv4 address then the IPv6 equivalent of the IPv4 address is returned. HEX can be uppercase or lowercase. +Alias: `INET6_ATON`. + ``` sql SELECT cutIPv6(IPv6StringToNum('127.0.0.1'), 0, 0); ``` diff --git a/docs/ru/sql-reference/functions/ip-address-functions.md b/docs/ru/sql-reference/functions/ip-address-functions.md index 52f0a92bc9f..3b7379e9a65 100644 --- a/docs/ru/sql-reference/functions/ip-address-functions.md +++ b/docs/ru/sql-reference/functions/ip-address-functions.md @@ -9,10 +9,14 @@ toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438\u0020\u0434\u043b\u044f\u Принимает число типа UInt32. Интерпретирует его, как IPv4-адрес в big endian. Возвращает строку, содержащую соответствующий IPv4-адрес в формате A.B.C.D (числа в десятичной форме через точки). +Синоним: `INET_NTOA`. + ## IPv4StringToNum(s) {#ipv4stringtonums} Функция, обратная к IPv4NumToString. Если IPv4 адрес в неправильном формате, то возвращает 0. +Синоним: `INET_ATON`. + ## IPv4NumToStringClassC(num) {#ipv4numtostringclasscnum} Похоже на IPv4NumToString, но вместо последнего октета используется xxx. @@ -49,7 +53,11 @@ LIMIT 10 ### IPv6NumToString(x) {#ipv6numtostringx} Принимает значение типа FixedString(16), содержащее IPv6-адрес в бинарном виде. Возвращает строку, содержащую этот адрес в текстовом виде. -IPv6-mapped IPv4 адреса выводится в формате ::ffff:111.222.33.44. Примеры: +IPv6-mapped IPv4 адреса выводится в формате ::ffff:111.222.33.44. + +Примеры: `INET6_NTOA`. 
+ +Примеры: ``` sql SELECT IPv6NumToString(toFixedString(unhex('2A0206B8000000000000000000000011'), 16)) AS addr @@ -118,6 +126,8 @@ LIMIT 10 Функция, обратная к IPv6NumToString. Если IPv6 адрес в неправильном формате, то возвращает строку из нулевых байт. HEX может быть в любом регистре. +Alias: `INET6_ATON`. + ## IPv4ToIPv6(x) {#ipv4toipv6x} Принимает число типа `UInt32`. Интерпретирует его, как IPv4-адрес в [big endian](https://en.wikipedia.org/wiki/Endianness). Возвращает значение `FixedString(16)`, содержащее адрес IPv6 в двоичном формате. Примеры: From c661760113164e74d7cb5ee5c394de3c57892d6c Mon Sep 17 00:00:00 2001 From: George Date: Tue, 16 Feb 2021 14:27:52 +0300 Subject: [PATCH 0328/2357] fixed a typo --- docs/ru/sql-reference/functions/date-time-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/functions/date-time-functions.md b/docs/ru/sql-reference/functions/date-time-functions.md index add47e9dad1..85d7c275f27 100644 --- a/docs/ru/sql-reference/functions/date-time-functions.md +++ b/docs/ru/sql-reference/functions/date-time-functions.md @@ -63,7 +63,7 @@ int32samoa: 1546300800 Переводит дату или дату-с-временем в число типа UInt16, содержащее номер года (AD). -Синоним: `Year`. +Синоним: `YEAR`. ## toQuarter {#toquarter} From 8a7d59f0fef99281a935cad8e51f40ff8a7341bc Mon Sep 17 00:00:00 2001 From: George Date: Tue, 16 Feb 2021 14:31:24 +0300 Subject: [PATCH 0329/2357] Added aliases for string function --- docs/en/sql-reference/functions/string-functions.md | 2 ++ docs/ru/sql-reference/functions/string-functions.md | 2 ++ 2 files changed, 4 insertions(+) diff --git a/docs/en/sql-reference/functions/string-functions.md b/docs/en/sql-reference/functions/string-functions.md index c1f3625c14d..a4c127507b7 100644 --- a/docs/en/sql-reference/functions/string-functions.md +++ b/docs/en/sql-reference/functions/string-functions.md @@ -98,6 +98,8 @@ SELECT toValidUTF8('\x61\xF0\x80\x80\x80b') Repeats a string as many times as specified and concatenates the replicated values as a single string. +Alias: `REPEAT`. + **Syntax** ``` sql diff --git a/docs/ru/sql-reference/functions/string-functions.md b/docs/ru/sql-reference/functions/string-functions.md index 24edc3618fb..d01d12ac8d5 100644 --- a/docs/ru/sql-reference/functions/string-functions.md +++ b/docs/ru/sql-reference/functions/string-functions.md @@ -95,6 +95,8 @@ SELECT toValidUTF8('\x61\xF0\x80\x80\x80b') Повторяет строку определенное количество раз и объединяет повторяемые значения в одну строку. +Синоним: `REPEAT`. + **Синтаксис** ``` sql From 4315cd8d26cb838553dc38a38ba35380e0eed767 Mon Sep 17 00:00:00 2001 From: George Date: Tue, 16 Feb 2021 14:34:24 +0300 Subject: [PATCH 0330/2357] fixed a typo --- docs/en/sql-reference/functions/date-time-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/functions/date-time-functions.md b/docs/en/sql-reference/functions/date-time-functions.md index a0c89ecb035..880942a02f9 100644 --- a/docs/en/sql-reference/functions/date-time-functions.md +++ b/docs/en/sql-reference/functions/date-time-functions.md @@ -61,7 +61,7 @@ int32samoa: 1546300800 Converts a date or date with time to a UInt16 number containing the year number (AD). -Alias: `Year`. +Alias: `YEAR`. 
## toQuarter {#toquarter} From 243ca5fe58d7b12fee746784c2f8a2f36790ff1e Mon Sep 17 00:00:00 2001 From: George Date: Tue, 16 Feb 2021 14:48:28 +0300 Subject: [PATCH 0331/2357] Added aliases for type conversion functions --- docs/en/sql-reference/functions/type-conversion-functions.md | 2 ++ docs/ru/sql-reference/functions/type-conversion-functions.md | 2 ++ 2 files changed, 4 insertions(+) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 3ca36f41c78..6e21ee9774d 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -124,6 +124,8 @@ SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8) ## toDate {#todate} +Alias: `DATE`. + ## toDateOrZero {#todateorzero} ## toDateOrNull {#todateornull} diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index 4a314bd22d8..022b4c3ebc7 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -124,6 +124,8 @@ SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8) ## toDate {#todate} +Cиноним: `DATE`. + ## toDateOrZero {#todateorzero} ## toDateOrNull {#todateornull} From bcf30d841262fd9316d0de1760d592c426805c5b Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 16 Feb 2021 15:57:00 +0300 Subject: [PATCH 0332/2357] Try fix tests. --- src/Interpreters/InterpreterSelectQuery.cpp | 36 +++++++++++++-------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index a99f99cfa13..826be1e5143 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -826,7 +826,7 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu const bool does_storage_support_prewhere = !input && !input_pipe && storage && storage->supportsPrewhere(); if (does_storage_support_prewhere && settings.optimize_move_to_prewhere) { - std::cerr << "----- Moving row level filter to prewhere\n"; + // std::cerr << "----- Moving row level filter to prewhere\n"; /// Execute row level filter in prewhere as a part of "move to prewhere" optimization. 
expressions.prewhere_info = std::make_shared( std::move(expressions.filter_info->actions), @@ -1393,20 +1393,28 @@ void InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum proc if (storage) { /// Append columns from the table filter to required - // ActionsDAG * row_policy_filter = nullptr; - // if (expressions.filter_info) - // row_policy_filter = expressions.filter_info->actions.get(); - // else if (expressions.prewhere_info && expressions.prewhere_info->row_level_filter_actions) - // row_policy_filter = expressions.prewhere_info->row_level_filter_actions.get(); - - if (expressions.filter_info) + if (row_policy_filter) { - auto required_columns_from_filter = expressions.filter_info->actions->getRequiredColumns(); - - for (const auto & column : required_columns_from_filter) + ActionsDAG * row_policy_dag = nullptr; + if (expressions.filter_info) + row_policy_dag = expressions.filter_info->actions.get(); + else if (expressions.prewhere_info) { - if (required_columns.end() == std::find(required_columns.begin(), required_columns.end(), column.name)) - required_columns.push_back(column.name); + if (expressions.prewhere_info->row_level_filter_actions) + row_policy_dag = expressions.prewhere_info->row_level_filter_actions.get(); + else if (expressions.prewhere_info->prewhere_actions) + row_policy_dag = expressions.prewhere_info->prewhere_actions.get(); + } + + if (row_policy_dag) + { + auto required_columns_from_filter = row_policy_dag->getRequiredColumns(); + + for (const auto & column : required_columns_from_filter) + { + if (required_columns.end() == std::find(required_columns.begin(), required_columns.end(), column.name)) + required_columns.push_back(column.name); + } } } @@ -1650,7 +1658,7 @@ void InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum proc if (prewhere_info) { - std::cerr << "-------- filling prewhere info \n"; + // std::cerr << "-------- filling prewhere info \n"; query_info.prewhere_info = std::make_shared(); query_info.prewhere_info->prewhere_actions = std::make_shared(prewhere_info->prewhere_actions); From 7b54b892b5eed13edfb0963dd02287fbe0d8881f Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Tue, 16 Feb 2021 17:05:58 +0300 Subject: [PATCH 0333/2357] fix --- src/Databases/DatabaseOnDisk.cpp | 2 +- src/Interpreters/Context.cpp | 4 ++-- src/Interpreters/Context.h | 2 +- src/Interpreters/DDLWorker.cpp | 9 +++++++-- src/Interpreters/DDLWorker.h | 2 +- src/Storages/StorageMaterializedView.cpp | 19 +++++++++++++++---- tests/queries/skip_list.json | 7 +++++++ 7 files changed, 34 insertions(+), 11 deletions(-) diff --git a/src/Databases/DatabaseOnDisk.cpp b/src/Databases/DatabaseOnDisk.cpp index 24bab42cad2..e5d2b23ace0 100644 --- a/src/Databases/DatabaseOnDisk.cpp +++ b/src/Databases/DatabaseOnDisk.cpp @@ -460,7 +460,7 @@ void DatabaseOnDisk::renameTable( if (from_atomic_to_ordinary) { - auto & atomic_db = assert_cast(*this); + auto & atomic_db = dynamic_cast(*this); /// Special case: usually no actions with symlinks are required when detaching/attaching table, /// but not when moving from Atomic database to Ordinary if (table->storesDataOnDisk()) diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index d0a1e4d37bf..766b14dea42 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -2553,10 +2553,10 @@ StorageID Context::resolveStorageIDImpl(StorageID storage_id, StorageNamespace w return StorageID::createEmpty(); } -void Context::initMetadataTransaction(MetadataTransactionPtr txn) +void 
Context::initMetadataTransaction(MetadataTransactionPtr txn, [[maybe_unused]] bool attach_existing) { assert(!metadata_transaction); - assert(query_context == this); + assert(attach_existing || query_context == this); metadata_transaction = std::move(txn); } diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index f6ee28aca22..8b59b225480 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -746,7 +746,7 @@ public: IHostContextPtr & getHostContext(); const IHostContextPtr & getHostContext() const; - void initMetadataTransaction(MetadataTransactionPtr txn); + void initMetadataTransaction(MetadataTransactionPtr txn, bool attach_to_context = false); MetadataTransactionPtr getMetadataTransaction() const; struct MySQLWireContext diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index f08f47b1c0e..c342a994395 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -328,6 +328,8 @@ void DDLWorker::scheduleTasks() LOG_TRACE(log, "No tasks to schedule"); return; } + else if (max_tasks_in_queue < queue_nodes.size()) + cleanup_event->set(); bool server_startup = current_tasks.empty(); auto begin_node = queue_nodes.begin(); @@ -489,9 +491,8 @@ void DDLWorker::processTask(DDLTaskBase & task) if (create_active_res == Coordination::Error::ZNODEEXISTS) { - /// Connection has been lost and now we are retrying to write query status, + /// Connection has been lost and now we are retrying, /// but our previous ephemeral node still exists. - assert(task.was_executed); zkutil::EventPtr eph_node_disappeared = std::make_shared(); String dummy; if (zookeeper->tryGet(active_node_path, dummy, nullptr, eph_node_disappeared)) @@ -826,6 +827,7 @@ void DDLWorker::cleanupQueue(Int64, const ZooKeeperPtr & zookeeper) ops.emplace_back(zkutil::makeRemoveRequest(fs::path(node_path) / "finished", -1)); ops.emplace_back(zkutil::makeRemoveRequest(node_path, -1)); auto rm_entry_res = zookeeper->tryMulti(ops, res); + if (rm_entry_res == Coordination::Error::ZNONODE) { /// Most likely both node_path/finished and node_path were removed concurrently. 
@@ -888,8 +890,11 @@ void DDLWorker::createStatusDirs(const std::string & node_path, const ZooKeeperP return; if (is_currently_deleting) + { + cleanup_event->set(); throw Exception(ErrorCodes::UNFINISHED, "Cannot create status dirs for {}, " "most likely because someone is deleting it concurrently", node_path); + } /// Connection lost or entry was removed assert(Coordination::isHardwareError(code) || code == Coordination::Error::ZNONODE); diff --git a/src/Interpreters/DDLWorker.h b/src/Interpreters/DDLWorker.h index 0985884eef7..c39a832c098 100644 --- a/src/Interpreters/DDLWorker.h +++ b/src/Interpreters/DDLWorker.h @@ -102,7 +102,7 @@ protected: virtual bool canRemoveQueueEntry(const String & entry_name, const Coordination::Stat & stat); /// Init task node - static void createStatusDirs(const std::string & node_path, const ZooKeeperPtr & zookeeper); + void createStatusDirs(const std::string & node_path, const ZooKeeperPtr & zookeeper); virtual void initializeMainThread(); diff --git a/src/Storages/StorageMaterializedView.cpp b/src/Storages/StorageMaterializedView.cpp index fb75a933910..32317968fe5 100644 --- a/src/Storages/StorageMaterializedView.cpp +++ b/src/Storages/StorageMaterializedView.cpp @@ -194,9 +194,9 @@ BlockOutputStreamPtr StorageMaterializedView::write(const ASTPtr & query, const } -static void executeDropQuery(ASTDropQuery::Kind kind, const Context & global_context, const StorageID & target_table_id, bool no_delay) +static void executeDropQuery(ASTDropQuery::Kind kind, const Context & global_context, const Context & current_context, const StorageID & target_table_id, bool no_delay) { - if (DatabaseCatalog::instance().tryGetTable(target_table_id, global_context)) + if (DatabaseCatalog::instance().tryGetTable(target_table_id, current_context)) { /// We create and execute `drop` query for internal table. auto drop_query = std::make_shared(); @@ -206,7 +206,18 @@ static void executeDropQuery(ASTDropQuery::Kind kind, const Context & global_con drop_query->no_delay = no_delay; drop_query->if_exists = true; ASTPtr ast_drop_query = drop_query; + /// FIXME We have to use global context to execute DROP query for inner table + /// to avoid "Not enough privileges" error if current user has only DROP VIEW ON mat_view_name privilege + /// and not allowed to drop inner table explicitly. Allowing to drop inner table without explicit grant + /// looks like expected behaviour and we have tests for it. 
auto drop_context = Context(global_context); + drop_context.getClientInfo().query_kind = ClientInfo::QueryKind::SECONDARY_QUERY; + if (auto txn = current_context.getMetadataTransaction()) + { + /// For Replicated database + drop_context.setQueryContext(const_cast(current_context)); + drop_context.initMetadataTransaction(txn, true); + } InterpreterDropQuery drop_interpreter(ast_drop_query, drop_context); drop_interpreter.execute(); } @@ -226,13 +237,13 @@ void StorageMaterializedView::drop() void StorageMaterializedView::dropInnerTable(bool no_delay, const Context & context) { if (has_inner_table && tryGetTargetTable()) - executeDropQuery(ASTDropQuery::Kind::Drop, context, target_table_id, no_delay); + executeDropQuery(ASTDropQuery::Kind::Drop, global_context, context, target_table_id, no_delay); } void StorageMaterializedView::truncate(const ASTPtr &, const StorageMetadataPtr &, const Context & context, TableExclusiveLockHolder &) { if (has_inner_table) - executeDropQuery(ASTDropQuery::Kind::Truncate, context, target_table_id, true); + executeDropQuery(ASTDropQuery::Kind::Truncate, global_context, context, target_table_id, true); } void StorageMaterializedView::checkStatementCanBeForwarded() const diff --git a/tests/queries/skip_list.json b/tests/queries/skip_list.json index 5c75fc0300b..52cef210748 100644 --- a/tests/queries/skip_list.json +++ b/tests/queries/skip_list.json @@ -103,9 +103,16 @@ "00738_lock_for_inner_table" ], "database-replicated": [ + /// Tests with DETACH TABLE (it's not allowed) + /// and tests with SET (session and query settings are not supported) "memory_tracking", "memory_usage", "live_view", + "00152_insert_different_granularity", + "01715_background_checker_blather_zookeeper", + "01714_alter_drop_version", + "01114_materialize_clear_index_compact_parts", + "00814_replicated_minimalistic_part_header_zookeeper", "01188_attach_table_from_pat", "01415_sticking_mutations", "01130_in_memory_parts", From 75117389eccf862b1a08b93a32d4f839846715f6 Mon Sep 17 00:00:00 2001 From: M0r64n Date: Tue, 16 Feb 2021 18:50:11 +0400 Subject: [PATCH 0334/2357] Add a couple of QOL file engine settings --- docs/en/operations/settings/settings.md | 20 +++++++++++++++++++ src/Core/Settings.h | 2 ++ src/Storages/StorageFile.cpp | 12 ++++++++++- ..._engine_file_empty_if_not_exists.reference | 0 .../01720_engine_file_empty_if_not_exists.sql | 15 ++++++++++++++ ...1_engine_file_truncate_on_insert.reference | 13 ++++++++++++ .../01721_engine_file_truncate_on_insert.sql | 20 +++++++++++++++++++ 7 files changed, 81 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/01720_engine_file_empty_if_not_exists.reference create mode 100644 tests/queries/0_stateless/01720_engine_file_empty_if_not_exists.sql create mode 100644 tests/queries/0_stateless/01721_engine_file_truncate_on_insert.reference create mode 100644 tests/queries/0_stateless/01721_engine_file_truncate_on_insert.sql diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 43519bfc8dc..6440f09bb40 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -2659,3 +2659,23 @@ Result: Note that this setting influences [Materialized view](../../sql-reference/statements/create/view.md#materialized) and [MaterializeMySQL](../../engines/database-engines/materialize-mysql.md) behaviour. 
[Original article](https://clickhouse.tech/docs/en/operations/settings/settings/) + +## engine_file_empty_if_not_exists {#engine-file-empty_if-not-exists} + +Allows to select data from a file engine table without file. + +Possible values: +- 0 — `SELECT` throws exception. +- 1 — `SELECT` returns empty result. + +Default value: `0`. + +## engine_file_truncate_on_insert {#engine-file-truncate-on-insert} + +Enables or disables truncate before insert in file engine tables. + +Possible values: +- 0 — Disabled. +- 1 — Enabled. + +Default value: `0`. diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 9bb9ad30f15..98c3b9d1f85 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -421,6 +421,8 @@ class IColumn; M(Bool, optimize_rewrite_sum_if_to_count_if, true, "Rewrite sumIf() and sum(if()) function countIf() function when logically equivalent", 0) \ M(UInt64, insert_shard_id, 0, "If non zero, when insert into a distributed table, the data will be inserted into the shard `insert_shard_id` synchronously. Possible values range from 1 to `shards_number` of corresponding distributed table", 0) \ M(Bool, allow_experimental_query_deduplication, false, "Allow sending parts' UUIDs for a query in order to deduplicate data parts if any", 0) \ + M(Bool, engine_file_empty_if_not_exists, false, "Allows to select data from a file engine table without file", 0) \ + M(Bool, engine_file_truncate_on_insert, false, "Enables or disables truncate before insert in file engine tables", 0) \ \ /** Obsolete settings that do nothing but left for compatibility reasons. Remove each one after half a year of obsolescence. */ \ \ diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index a5935ba3bf4..856d03ea2ce 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -34,6 +34,7 @@ #include #include #include +#include #include namespace fs = std::filesystem; @@ -427,7 +428,12 @@ Pipe StorageFile::read( paths = {""}; /// when use fd, paths are empty else if (paths.size() == 1 && !Poco::File(paths[0]).exists()) - throw Exception("File " + paths[0] + " doesn't exist", ErrorCodes::FILE_DOESNT_EXIST); + { + if (context.getSettingsRef().engine_file_empty_if_not_exists) + return Pipe(std::make_shared(metadata_snapshot->getSampleBlockForColumns(column_names, getVirtuals(), getStorageID()))); + else + throw Exception("File " + paths[0] + " doesn't exist", ErrorCodes::FILE_DOESNT_EXIST); + } auto files_info = std::make_shared(); @@ -547,6 +553,10 @@ BlockOutputStreamPtr StorageFile::write( throw Exception("Method write is not implemented for Distributed format", ErrorCodes::NOT_IMPLEMENTED); std::string path; + if (context.getSettingsRef().engine_file_truncate_on_insert) + if (0 != ::truncate(paths[0].c_str(), 0)) + throwFromErrnoWithPath("Cannot truncate file " + paths[0], paths[0], ErrorCodes::CANNOT_TRUNCATE_FILE); + if (!paths.empty()) { path = paths[0]; diff --git a/tests/queries/0_stateless/01720_engine_file_empty_if_not_exists.reference b/tests/queries/0_stateless/01720_engine_file_empty_if_not_exists.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/01720_engine_file_empty_if_not_exists.sql b/tests/queries/0_stateless/01720_engine_file_empty_if_not_exists.sql new file mode 100644 index 00000000000..c04e01ccc88 --- /dev/null +++ b/tests/queries/0_stateless/01720_engine_file_empty_if_not_exists.sql @@ -0,0 +1,15 @@ +DROP TABLE IF EXISTS file_engine_table; + +CREATE TABLE file_engine_table (id UInt32) ENGINE=File(TSV); + 
+SELECT * FROM file_engine_table; --{ serverError 107 } + +SET engine_file_empty_if_not_exists=0; + +SELECT * FROM file_engine_table; --{ serverError 107 } + +SET engine_file_empty_if_not_exists=1; + +SELECT * FROM file_engine_table; + +SET engine_file_empty_if_not_exists=0; diff --git a/tests/queries/0_stateless/01721_engine_file_truncate_on_insert.reference b/tests/queries/0_stateless/01721_engine_file_truncate_on_insert.reference new file mode 100644 index 00000000000..a25fb4f0e7e --- /dev/null +++ b/tests/queries/0_stateless/01721_engine_file_truncate_on_insert.reference @@ -0,0 +1,13 @@ +1 +2 +3 +4 +1 +2 +3 +4 +5 +6 +0 +1 +2 \ No newline at end of file diff --git a/tests/queries/0_stateless/01721_engine_file_truncate_on_insert.sql b/tests/queries/0_stateless/01721_engine_file_truncate_on_insert.sql new file mode 100644 index 00000000000..65246db7963 --- /dev/null +++ b/tests/queries/0_stateless/01721_engine_file_truncate_on_insert.sql @@ -0,0 +1,20 @@ +INSERT INTO TABLE FUNCTION file('01718_file/test/data.TSV', 'TSV', 'id UInt32') VALUES ('file', 42); +ATTACH TABLE test FROM '01718_file/test' (id UInt8) ENGINE=File(TSV); + +CREATE TABLE file_engine_table (id UInt32) ENGINE=File(TabSeparated); + +INSERT INTO file_engine_table VALUES (1), (2), (3); +INSERT INTO file_engine_table VALUES (4); +SELECT * FROM file_engine_table; + +SET engine_file_truncate_on_insert=0; + +INSERT INTO file_engine_table VALUES (5), (6); +SELECT * FROM file_engine_table; + +SET engine_file_truncate_on_insert=1; + +INSERT INTO file_engine_table VALUES (0), (1), (2); +SELECT * FROM file_engine_table; + +SET engine_file_truncate_on_insert=0; From 16bcd9d247877c55d27936e64a0d3c76dbe9cf7a Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 16 Feb 2021 20:28:54 +0300 Subject: [PATCH 0335/2357] Add changelog tests --- src/Coordination/Changelog.cpp | 102 ++++--- src/Coordination/Changelog.h | 12 +- src/Coordination/tests/gtest_for_build.cpp | 325 ++++++++++++++++++++- 3 files changed, 396 insertions(+), 43 deletions(-) diff --git a/src/Coordination/Changelog.cpp b/src/Coordination/Changelog.cpp index d3ba176f209..6fa3e0e9e03 100644 --- a/src/Coordination/Changelog.cpp +++ b/src/Coordination/Changelog.cpp @@ -44,15 +44,14 @@ static constexpr auto DEFAULT_PREFIX = "changelog"; struct ChangelogName { std::string prefix; - ChangelogVersion version; size_t from_log_idx; size_t to_log_idx; }; -std::string formatChangelogPath(const std::string & prefix, const ChangelogVersion & version, const ChangelogName & name) +std::string formatChangelogPath(const std::string & prefix, const ChangelogName & name) { std::filesystem::path path(prefix); - path /= std::filesystem::path(name.prefix + "_" + toString(version) + "_" + std::to_string(name.from_log_idx) + "_" + std::to_string(name.to_log_idx) + ".log"); + path /= std::filesystem::path(name.prefix + "_" + std::to_string(name.from_log_idx) + "_" + std::to_string(name.to_log_idx) + ".bin"); return path; } @@ -62,14 +61,13 @@ ChangelogName getChangelogName(const std::string & path_str) std::string filename = path.stem(); Strings filename_parts; boost::split(filename_parts, filename, boost::is_any_of("_")); - if (filename_parts.size() < 4) + if (filename_parts.size() < 3) throw Exception(ErrorCodes::CORRUPTED_DATA, "Invalid changelog {}", path_str); ChangelogName result; result.prefix = filename_parts[0]; - result.version = fromString(filename_parts[1]); - result.from_log_idx = parse(filename_parts[2]); - result.to_log_idx = parse(filename_parts[3]); + result.from_log_idx = 
parse(filename_parts[1]); + result.to_log_idx = parse(filename_parts[2]); return result; } @@ -114,6 +112,7 @@ public: { flush(); plain_buf.truncate(new_length); + plain_buf.seek(new_length, SEEK_SET); } void flush() @@ -190,6 +189,7 @@ public: if (!logs.try_emplace(record.header.index, log_entry).second) throw Exception(ErrorCodes::CORRUPTED_DATA, "Duplicated index id {} in log {}", record.header.index, filepath); } + return total_read; } private: @@ -203,13 +203,16 @@ Changelog::Changelog(const std::string & changelogs_dir_, size_t rotate_interval { namespace fs = std::filesystem; for(const auto & p : fs::directory_iterator(changelogs_dir)) - existing_changelogs.push_back(p.path()); + { + auto name = getChangelogName(p.path()); + existing_changelogs[name.from_log_idx] = p.path(); + } } void Changelog::readChangelogAndInitWriter(size_t from_log_idx) { size_t read_from_last = 0; - for (const std::string & changelog_file : existing_changelogs) + for (const auto & [start_id, changelog_file] : existing_changelogs) { ChangelogName parsed_name = getChangelogName(changelog_file); if (parsed_name.to_log_idx >= from_log_idx) @@ -223,8 +226,9 @@ void Changelog::readChangelogAndInitWriter(size_t from_log_idx) if (existing_changelogs.size() > 0 && read_from_last < rotate_interval) { - auto parsed_name = getChangelogName(existing_changelogs.back()); - current_writer = std::make_unique(existing_changelogs.back(), WriteMode::Append, parsed_name.from_log_idx); + auto str_name = existing_changelogs.rbegin()->second; + auto parsed_name = getChangelogName(str_name); + current_writer = std::make_unique(str_name, WriteMode::Append, parsed_name.from_log_idx); current_writer->setEntriesWritten(read_from_last); } else @@ -240,13 +244,12 @@ void Changelog::rotate(size_t new_start_log_idx) ChangelogName new_name; new_name.prefix = DEFAULT_PREFIX; - new_name.version = CURRENT_CHANGELOG_VERSION; new_name.from_log_idx = new_start_log_idx; - new_name.to_log_idx = new_start_log_idx; + new_name.to_log_idx = new_start_log_idx + rotate_interval - 1; - auto new_log_path = formatChangelogPath(changelogs_dir, CURRENT_CHANGELOG_VERSION, new_name); - existing_changelogs.push_back(new_log_path); - current_writer = std::make_unique(existing_changelogs.back(), WriteMode::Rewrite, new_start_log_idx); + auto new_log_path = formatChangelogPath(changelogs_dir, new_name); + existing_changelogs[new_start_log_idx] = new_log_path; + current_writer = std::make_unique(new_log_path, WriteMode::Rewrite, new_start_log_idx); } ChangelogRecord Changelog::buildRecord(size_t index, nuraft::ptr log_entry) const @@ -275,42 +278,62 @@ void Changelog::appendEntry(size_t index, nuraft::ptr log_ent if (!current_writer) throw Exception(ErrorCodes::LOGICAL_ERROR, "Changelog must be initialized before appending records"); + if (logs.empty()) + start_index = index; + if (current_writer->getEntriesWritten() == rotate_interval) rotate(index); auto offset = current_writer->appendRecord(buildRecord(index, log_entry), true); if (!index_to_start_pos.try_emplace(index, offset).second) throw Exception(ErrorCodes::LOGICAL_ERROR, "Record with index {} already exists", index); + logs[index] = makeClone(log_entry); } void Changelog::writeAt(size_t index, nuraft::ptr log_entry) { - if (index < current_writer->getStartIndex()) - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Currently cannot overwrite index from previous file"); - if (index_to_start_pos.count(index) == 0) throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot write at index {} because changelog doesn't 
contain it", index); + bool need_rollback = index < current_writer->getStartIndex(); + if (need_rollback) + { + auto index_changelog = existing_changelogs.lower_bound(index); + std::string fname; + if (index_changelog->first == index) + fname = index_changelog->second; + else + fname = std::prev(index_changelog)->second; + + current_writer = std::make_unique(fname, WriteMode::Append, index_changelog->first); + auto formated_name = getChangelogName(fname); + current_writer->setEntriesWritten(formated_name.to_log_idx - formated_name.from_log_idx + 1); + } + auto entries_written = current_writer->getEntriesWritten(); current_writer->truncateToLength(index_to_start_pos[index]); - for (auto itr = index_to_start_pos.begin(); itr != index_to_start_pos.end();) + + if (need_rollback) { - if (itr->first >= index) + auto to_remove_itr = existing_changelogs.upper_bound(index); + for (auto itr = to_remove_itr; itr != existing_changelogs.end();) { - entries_written--; - itr = index_to_start_pos.erase(itr); + std::filesystem::remove(itr->second); + itr = existing_changelogs.erase(itr); } - else - itr++; + } + + /// Rollback in memory state + for (auto itr = logs.lower_bound(index); itr != logs.end();) + { + index_to_start_pos.erase(itr->first); + itr = logs.erase(itr); + entries_written--; } current_writer->setEntriesWritten(entries_written); - auto itr = logs.lower_bound(index); - while (itr != logs.end()) - itr = logs.erase(itr); - appendEntry(index, log_entry); } @@ -318,22 +341,27 @@ void Changelog::compact(size_t up_to_log_idx) { for (auto itr = existing_changelogs.begin(); itr != existing_changelogs.end();) { - ChangelogName parsed_name = getChangelogName(*itr); + ChangelogName parsed_name = getChangelogName(itr->second); if (parsed_name.to_log_idx <= up_to_log_idx) { - std::filesystem::remove(*itr); - itr = existing_changelogs.erase(itr); + for (size_t idx = parsed_name.from_log_idx; idx <= parsed_name.to_log_idx; ++idx) { - auto logs_itr = logs.find(idx); - if (logs_itr != logs.end()) - logs.erase(idx); - else + auto index_pos = index_to_start_pos.find(idx); + if (index_pos == index_to_start_pos.end()) break; - index_to_start_pos.erase(idx); + index_to_start_pos.erase(index_pos); } + std::filesystem::remove(itr->second); + itr = existing_changelogs.erase(itr); } + else + break; } + auto start = logs.begin(); + auto end = logs.upper_bound(up_to_log_idx); + logs.erase(start, end); + start_index = up_to_log_idx + 1; } LogEntryPtr Changelog::getLastEntry() const diff --git a/src/Coordination/Changelog.h b/src/Coordination/Changelog.h index c58f35cb4a1..97669d1aa19 100644 --- a/src/Coordination/Changelog.h +++ b/src/Coordination/Changelog.h @@ -65,7 +65,7 @@ public: size_t getNextEntryIndex() const { - return start_index + logs.size() - 1; + return start_index + logs.size(); } size_t getStartIndex() const @@ -79,22 +79,28 @@ public: LogEntryPtr entryAt(size_t idx); - nuraft::ptr serializeEntriesToBuffer(size_t index, Int32 cnt); + nuraft::ptr serializeEntriesToBuffer(size_t index, int32_t cnt); void applyEntriesFromBuffer(size_t index, nuraft::buffer & buffer); void flush(); + size_t size() const + { + return logs.size(); + } + ~Changelog(); private: + void rotate(size_t new_start_log_idex); ChangelogRecord buildRecord(size_t index, nuraft::ptr log_entry) const; private: std::string changelogs_dir; - std::deque existing_changelogs; + std::map existing_changelogs; std::unique_ptr current_writer; IndexToOffset index_to_start_pos; const size_t rotate_interval; diff --git 
a/src/Coordination/tests/gtest_for_build.cpp b/src/Coordination/tests/gtest_for_build.cpp index 6142ee0b5c0..6335df4b940 100644 --- a/src/Coordination/tests/gtest_for_build.cpp +++ b/src/Coordination/tests/gtest_for_build.cpp @@ -24,6 +24,7 @@ #include #include #include +#include TEST(CoordinationTest, BuildTest) @@ -335,18 +336,336 @@ TEST(CoordinationTest, TestStorageSerialization) EXPECT_EQ(new_storage.ephemerals[1].size(), 1); } -DB::LogEntryPtr getLogEntry(const std::string & s) +DB::LogEntryPtr getLogEntry(const std::string & s, size_t term) { DB::WriteBufferFromNuraftBuffer bufwriter; writeText(s, bufwriter); - return nuraft::cs_new(0, bufwriter.getBuffer()); + return nuraft::cs_new(term, bufwriter.getBuffer()); } +namespace fs = std::filesystem; +struct ChangelogDirTest +{ + std::string path; + bool drop; + ChangelogDirTest(std::string path_, bool drop_ = true) + : path(path_) + , drop(drop_) + { + if (fs::exists(path)) + EXPECT_TRUE(false) << "Path " << path << " already exists, remove it to run test"; + fs::create_directory(path); + } + + ~ChangelogDirTest() + { + if (fs::exists(path) && drop) + fs::remove_all(path); + } +}; + TEST(CoordinationTest, ChangelogTestSimple) { + ChangelogDirTest test("./logs"); DB::Changelog changelog("./logs", 5); - auto entry = getLogEntry("hello world"); + changelog.readChangelogAndInitWriter(1); + auto entry = getLogEntry("hello world", 77); changelog.appendEntry(1, entry); + EXPECT_EQ(changelog.getNextEntryIndex(), 2); + EXPECT_EQ(changelog.getStartIndex(), 1); + EXPECT_EQ(changelog.getLastEntry()->get_term(), 77); + EXPECT_EQ(changelog.entryAt(1)->get_term(), 77); + EXPECT_EQ(changelog.getLogEntriesBetween(1, 2)->size(), 1); +} + +TEST(CoordinationTest, ChangelogTestFile) +{ + ChangelogDirTest test("./logs"); + DB::Changelog changelog("./logs", 5); + changelog.readChangelogAndInitWriter(1); + auto entry = getLogEntry("hello world", 77); + changelog.appendEntry(1, entry); + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + for(const auto & p : fs::directory_iterator("./logs")) + EXPECT_EQ(p.path(), "./logs/changelog_1_5.bin"); + + changelog.appendEntry(2, entry); + changelog.appendEntry(3, entry); + changelog.appendEntry(4, entry); + changelog.appendEntry(5, entry); + changelog.appendEntry(6, entry); + + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); +} + +TEST(CoordinationTest, ChangelogReadWrite) +{ + ChangelogDirTest test("./logs"); + DB::Changelog changelog("./logs", 1000); + changelog.readChangelogAndInitWriter(1); + for (size_t i = 0; i < 10; ++i) + { + auto entry = getLogEntry("hello world", i * 10); + changelog.appendEntry(changelog.getNextEntryIndex(), entry); + } + EXPECT_EQ(changelog.size(), 10); + DB::Changelog changelog_reader("./logs", 1000); + changelog_reader.readChangelogAndInitWriter(1); + EXPECT_EQ(changelog_reader.size(), 10); + EXPECT_EQ(changelog_reader.getLastEntry()->get_term(), changelog.getLastEntry()->get_term()); + EXPECT_EQ(changelog_reader.getStartIndex(), changelog.getStartIndex()); + EXPECT_EQ(changelog_reader.getNextEntryIndex(), changelog.getNextEntryIndex()); + + for (size_t i = 0; i < 10; ++i) + EXPECT_EQ(changelog_reader.entryAt(i + 1)->get_term(), changelog.entryAt(i + 1)->get_term()); + + auto entries_from_range_read = changelog_reader.getLogEntriesBetween(1, 11); + auto entries_from_range = changelog.getLogEntriesBetween(1, 11); + EXPECT_EQ(entries_from_range_read->size(), entries_from_range->size()); + EXPECT_EQ(10, 
entries_from_range->size()); +} + +TEST(CoordinationTest, ChangelogWriteAt) +{ + ChangelogDirTest test("./logs"); + DB::Changelog changelog("./logs", 1000); + changelog.readChangelogAndInitWriter(1); + for (size_t i = 0; i < 10; ++i) + { + auto entry = getLogEntry("hello world", i * 10); + changelog.appendEntry(changelog.getNextEntryIndex(), entry); + } + EXPECT_EQ(changelog.size(), 10); + + auto entry = getLogEntry("writer", 77); + changelog.writeAt(7, entry); + EXPECT_EQ(changelog.size(), 7); + EXPECT_EQ(changelog.getLastEntry()->get_term(), 77); + EXPECT_EQ(changelog.entryAt(7)->get_term(), 77); + EXPECT_EQ(changelog.getNextEntryIndex(), 8); + + DB::Changelog changelog_reader("./logs", 1000); + changelog_reader.readChangelogAndInitWriter(1); + + EXPECT_EQ(changelog_reader.size(), changelog.size()); + EXPECT_EQ(changelog_reader.getLastEntry()->get_term(), changelog.getLastEntry()->get_term()); + EXPECT_EQ(changelog_reader.getStartIndex(), changelog.getStartIndex()); + EXPECT_EQ(changelog_reader.getNextEntryIndex(), changelog.getNextEntryIndex()); +} + + +TEST(CoordinationTest, ChangelogTestAppendAfterRead) +{ + ChangelogDirTest test("./logs"); + DB::Changelog changelog("./logs", 5); + changelog.readChangelogAndInitWriter(1); + for (size_t i = 0; i < 7; ++i) + { + auto entry = getLogEntry("hello world", i * 10); + changelog.appendEntry(changelog.getNextEntryIndex(), entry); + } + + EXPECT_EQ(changelog.size(), 7); + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); + + DB::Changelog changelog_reader("./logs", 5); + changelog_reader.readChangelogAndInitWriter(1); + + EXPECT_EQ(changelog_reader.size(), 7); + for (size_t i = 7; i < 10; ++i) + { + auto entry = getLogEntry("hello world", i * 10); + changelog_reader.appendEntry(changelog_reader.getNextEntryIndex(), entry); + } + EXPECT_EQ(changelog_reader.size(), 10); + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); + + size_t logs_count = 0; + for(const auto & _ [[maybe_unused]]: fs::directory_iterator("./logs")) + logs_count++; + + EXPECT_EQ(logs_count, 2); + + auto entry = getLogEntry("someentry", 77); + changelog_reader.appendEntry(changelog_reader.getNextEntryIndex(), entry); + EXPECT_EQ(changelog_reader.size(), 11); + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_11_15.bin")); + + logs_count = 0; + for(const auto & _ [[maybe_unused]]: fs::directory_iterator("./logs")) + logs_count++; + + EXPECT_EQ(logs_count, 3); +} + +TEST(CoordinationTest, ChangelogTestCompaction) +{ + ChangelogDirTest test("./logs"); + DB::Changelog changelog("./logs", 5); + changelog.readChangelogAndInitWriter(1); + + for (size_t i = 0; i < 3; ++i) + { + auto entry = getLogEntry("hello world", i * 10); + changelog.appendEntry(changelog.getNextEntryIndex(), entry); + } + + EXPECT_EQ(changelog.size(), 3); + + changelog.compact(2); + + EXPECT_EQ(changelog.size(), 1); + EXPECT_EQ(changelog.getStartIndex(), 3); + EXPECT_EQ(changelog.getNextEntryIndex(), 4); + EXPECT_EQ(changelog.getLastEntry()->get_term(), 20); + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + + changelog.appendEntry(changelog.getNextEntryIndex(), getLogEntry("hello world", 30)); + changelog.appendEntry(changelog.getNextEntryIndex(), getLogEntry("hello world", 40)); + changelog.appendEntry(changelog.getNextEntryIndex(), getLogEntry("hello world", 50)); + 
changelog.appendEntry(changelog.getNextEntryIndex(), getLogEntry("hello world", 60)); + + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); + + changelog.compact(6); + + EXPECT_FALSE(fs::exists("./logs/changelog_1_5.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); + + EXPECT_EQ(changelog.size(), 1); + EXPECT_EQ(changelog.getStartIndex(), 7); + EXPECT_EQ(changelog.getNextEntryIndex(), 8); + EXPECT_EQ(changelog.getLastEntry()->get_term(), 60); + /// And we able to read it + DB::Changelog changelog_reader("./logs", 5); + changelog_reader.readChangelogAndInitWriter(7); + EXPECT_EQ(changelog_reader.size(), 1); + EXPECT_EQ(changelog_reader.getStartIndex(), 7); + EXPECT_EQ(changelog_reader.getNextEntryIndex(), 8); + EXPECT_EQ(changelog_reader.getLastEntry()->get_term(), 60); +} + +TEST(CoordinationTest, ChangelogTestBatchOperations) +{ + ChangelogDirTest test("./logs"); + DB::Changelog changelog("./logs", 100); + changelog.readChangelogAndInitWriter(1); + for (size_t i = 0; i < 10; ++i) + { + auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10); + changelog.appendEntry(changelog.getNextEntryIndex(), entry); + } + + EXPECT_EQ(changelog.size(), 10); + + auto entries = changelog.serializeEntriesToBuffer(1, 5); + + DB::Changelog apply_changelog("./logs", 100); + apply_changelog.readChangelogAndInitWriter(1); + + for (size_t i = 0; i < 10; ++i) + { + EXPECT_EQ(apply_changelog.entryAt(i + 1)->get_term(), i * 10); + } + EXPECT_EQ(apply_changelog.size(), 10); + + apply_changelog.applyEntriesFromBuffer(8, *entries); + + EXPECT_EQ(apply_changelog.size(), 12); + EXPECT_EQ(apply_changelog.getStartIndex(), 1); + EXPECT_EQ(apply_changelog.getNextEntryIndex(), 13); + + for (size_t i = 0; i < 7; ++i) + { + EXPECT_EQ(apply_changelog.entryAt(i + 1)->get_term(), i * 10); + } + + EXPECT_EQ(apply_changelog.entryAt(8)->get_term(), 0); + EXPECT_EQ(apply_changelog.entryAt(9)->get_term(), 10); + EXPECT_EQ(apply_changelog.entryAt(10)->get_term(), 20); + EXPECT_EQ(apply_changelog.entryAt(11)->get_term(), 30); + EXPECT_EQ(apply_changelog.entryAt(12)->get_term(), 40); +} + +TEST(CoordinationTest, ChangelogTestBatchOperationsEmpty) +{ + ChangelogDirTest test("./logs"); + DB::Changelog changelog("./logs", 100); + changelog.readChangelogAndInitWriter(1); + for (size_t i = 0; i < 10; ++i) + { + auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10); + changelog.appendEntry(changelog.getNextEntryIndex(), entry); + } + + EXPECT_EQ(changelog.size(), 10); + + auto entries = changelog.serializeEntriesToBuffer(5, 5); + + ChangelogDirTest test1("./logs1"); + DB::Changelog changelog_new("./logs1", 100); + changelog_new.readChangelogAndInitWriter(1); + EXPECT_EQ(changelog_new.size(), 0); + + changelog_new.applyEntriesFromBuffer(5, *entries); + + EXPECT_EQ(changelog_new.size(), 5); + EXPECT_EQ(changelog_new.getStartIndex(), 5); + EXPECT_EQ(changelog_new.getNextEntryIndex(), 10); + + for (size_t i = 4; i < 9; ++i) + EXPECT_EQ(changelog_new.entryAt(i + 1)->get_term(), i * 10); + + changelog_new.appendEntry(changelog_new.getNextEntryIndex(), getLogEntry("hello_world", 110)); + EXPECT_EQ(changelog_new.size(), 6); + EXPECT_EQ(changelog_new.getStartIndex(), 5); + EXPECT_EQ(changelog_new.getNextEntryIndex(), 11); + + DB::Changelog changelog_reader("./logs1", 100); + changelog_reader.readChangelogAndInitWriter(5); +} + + +TEST(CoordinationTest, ChangelogTestWriteAtPreviousFile) +{ + ChangelogDirTest test("./logs"); + DB::Changelog 
changelog("./logs", 5); + changelog.readChangelogAndInitWriter(1); + + for (size_t i = 0; i < 33; ++i) + { + auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10); + changelog.appendEntry(changelog.getNextEntryIndex(), entry); + } + EXPECT_EQ(changelog.size(), 33); + + changelog.writeAt(7, getLogEntry("helloworld", 5555)); + EXPECT_EQ(changelog.size(), 7); + EXPECT_EQ(changelog.getStartIndex(), 1); + EXPECT_EQ(changelog.getNextEntryIndex(), 8); + EXPECT_EQ(changelog.getLastEntry()->get_term(), 5555); + + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); + + EXPECT_FALSE(fs::exists("./logs/changelog_11_15.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_16_20.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_11_25.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_26_30.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_31_35.bin")); + + DB::Changelog changelog_read("./logs", 5); + changelog_read.readChangelogAndInitWriter(1); + EXPECT_EQ(changelog_read.size(), 7); + EXPECT_EQ(changelog_read.getStartIndex(), 1); + EXPECT_EQ(changelog_read.getNextEntryIndex(), 8); + EXPECT_EQ(changelog_read.getLastEntry()->get_term(), 5555); } #endif From b029f3e5cf4b03df444ee2da007040756cb46570 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 16 Feb 2021 20:32:35 +0300 Subject: [PATCH 0336/2357] Border test --- src/Coordination/tests/gtest_for_build.cpp | 36 ++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/src/Coordination/tests/gtest_for_build.cpp b/src/Coordination/tests/gtest_for_build.cpp index 6335df4b940..f6139ea5de3 100644 --- a/src/Coordination/tests/gtest_for_build.cpp +++ b/src/Coordination/tests/gtest_for_build.cpp @@ -668,4 +668,40 @@ TEST(CoordinationTest, ChangelogTestWriteAtPreviousFile) EXPECT_EQ(changelog_read.getLastEntry()->get_term(), 5555); } +TEST(CoordinationTest, ChangelogTestWriteAtFileBorder) +{ + ChangelogDirTest test("./logs"); + DB::Changelog changelog("./logs", 5); + changelog.readChangelogAndInitWriter(1); + + for (size_t i = 0; i < 33; ++i) + { + auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10); + changelog.appendEntry(changelog.getNextEntryIndex(), entry); + } + EXPECT_EQ(changelog.size(), 33); + + changelog.writeAt(11, getLogEntry("helloworld", 5555)); + EXPECT_EQ(changelog.size(), 11); + EXPECT_EQ(changelog.getStartIndex(), 1); + EXPECT_EQ(changelog.getNextEntryIndex(), 12); + EXPECT_EQ(changelog.getLastEntry()->get_term(), 5555); + + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_11_15.bin")); + + EXPECT_FALSE(fs::exists("./logs/changelog_16_20.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_11_25.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_26_30.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_31_35.bin")); + + DB::Changelog changelog_read("./logs", 5); + changelog_read.readChangelogAndInitWriter(1); + EXPECT_EQ(changelog_read.size(), 11); + EXPECT_EQ(changelog_read.getStartIndex(), 1); + EXPECT_EQ(changelog_read.getNextEntryIndex(), 12); + EXPECT_EQ(changelog_read.getLastEntry()->get_term(), 5555); +} + #endif From b76b8013ba88b081362ab9f31c103a3b6c77bc27 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 16 Feb 2021 20:47:12 +0300 Subject: [PATCH 0337/2357] Fix tests --- src/Coordination/Changelog.cpp | 1 - src/Coordination/tests/gtest_for_build.cpp | 22 ++++++++++++++++++++-- 2 files changed, 20 insertions(+), 3 deletions(-) diff 
--git a/src/Coordination/Changelog.cpp b/src/Coordination/Changelog.cpp index 6fa3e0e9e03..5198382e731 100644 --- a/src/Coordination/Changelog.cpp +++ b/src/Coordination/Changelog.cpp @@ -344,7 +344,6 @@ void Changelog::compact(size_t up_to_log_idx) ChangelogName parsed_name = getChangelogName(itr->second); if (parsed_name.to_log_idx <= up_to_log_idx) { - for (size_t idx = parsed_name.from_log_idx; idx <= parsed_name.to_log_idx; ++idx) { auto index_pos = index_to_start_pos.find(idx); diff --git a/src/Coordination/tests/gtest_for_build.cpp b/src/Coordination/tests/gtest_for_build.cpp index f6139ea5de3..fa8ae8f8b82 100644 --- a/src/Coordination/tests/gtest_for_build.cpp +++ b/src/Coordination/tests/gtest_for_build.cpp @@ -643,6 +643,15 @@ TEST(CoordinationTest, ChangelogTestWriteAtPreviousFile) auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10); changelog.appendEntry(changelog.getNextEntryIndex(), entry); } + + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_11_15.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_16_20.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_21_25.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_26_30.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_31_35.bin")); + EXPECT_EQ(changelog.size(), 33); changelog.writeAt(7, getLogEntry("helloworld", 5555)); @@ -656,7 +665,7 @@ TEST(CoordinationTest, ChangelogTestWriteAtPreviousFile) EXPECT_FALSE(fs::exists("./logs/changelog_11_15.bin")); EXPECT_FALSE(fs::exists("./logs/changelog_16_20.bin")); - EXPECT_FALSE(fs::exists("./logs/changelog_11_25.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_21_25.bin")); EXPECT_FALSE(fs::exists("./logs/changelog_26_30.bin")); EXPECT_FALSE(fs::exists("./logs/changelog_31_35.bin")); @@ -679,6 +688,15 @@ TEST(CoordinationTest, ChangelogTestWriteAtFileBorder) auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10); changelog.appendEntry(changelog.getNextEntryIndex(), entry); } + + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_11_15.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_16_20.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_21_25.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_26_30.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_31_35.bin")); + EXPECT_EQ(changelog.size(), 33); changelog.writeAt(11, getLogEntry("helloworld", 5555)); @@ -692,7 +710,7 @@ TEST(CoordinationTest, ChangelogTestWriteAtFileBorder) EXPECT_TRUE(fs::exists("./logs/changelog_11_15.bin")); EXPECT_FALSE(fs::exists("./logs/changelog_16_20.bin")); - EXPECT_FALSE(fs::exists("./logs/changelog_11_25.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_21_25.bin")); EXPECT_FALSE(fs::exists("./logs/changelog_26_30.bin")); EXPECT_FALSE(fs::exists("./logs/changelog_31_35.bin")); From e93e1911ee0b11278e13a2deb8022bbb456ef15d Mon Sep 17 00:00:00 2001 From: Dmitriy Date: Tue, 16 Feb 2021 21:01:36 +0300 Subject: [PATCH 0338/2357] Translate to Russian MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Выполнил перевод на русский язык. 
--- .../functions/type-conversion-functions.md | 14 +- .../functions/type-conversion-functions.md | 172 ++++++++++++++++++ 2 files changed, 177 insertions(+), 9 deletions(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 81b5649db32..6795b31bd33 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -701,21 +701,19 @@ parseDateTimeBestEffortUSOrNull(time_string[, time_zone]) **Parameters** -- `time_string` — String containing a date or date with time to convert. The date must be in the US date format (`MM/DD/YYYY`). [String](../../sql-reference/data-types/string.md). +- `time_string` — String containing a date or date with time to convert. The date must be in the US date format (`MM/DD/YYYY`, etc). [String](../../sql-reference/data-types/string.md). - `time_zone` — [Timezone](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone). The function parses `time_string` according to the timezone. Optional. [String](../../sql-reference/data-types/string.md). **Supported non-standard formats** - A string containing 9..10 digit [unix timestamp](https://en.wikipedia.org/wiki/Unix_time). - A string with a date and a time components: `YYYYMMDDhhmmss`, `MM/DD/YYYY hh:mm:ss`, `MM-DD-YY hh:mm`, `YYYY-MM-DD hh:mm:ss`, etc. -- A string with a date, but no time component: `YYYY`, `YYYYMM`, `YYYY*MM`, `MM/DD/YYYY`, `MM-DD-YY` etc. +- A string with a date, but no time component: `YYYY`, `YYYYMM`, `YYYY*MM`, `MM/DD/YYYY`, `MM-DD-YY`, etc. - A string with a day and time: `DD`, `DD hh`, `DD hh:mm`. In this case, `YYYY-MM` are substituted with `2000-01`. - A string that includes date and time along with timezone offset information: `YYYY-MM-DD hh:mm:ss ±h:mm`, etc. For example, `2020-12-12 17:36:00 -5:00`. **Returned values** -Possible values: - - `time_string` converted to the [DateTime](../../sql-reference/data-types/datetime.md) data type. - `NULL` if the input string cannot be converted to the `DateTime` data type. @@ -789,23 +787,21 @@ parseDateTimeBestEffortUSOrZero(time_string[, time_zone]) **Parameters** -- `time_string` — String containing a date or date with time to convert. The date must be in the US date format (`MM/DD/YYYY`). [String](../../sql-reference/data-types/string.md). +- `time_string` — String containing a date or date with time to convert. The date must be in the US date format (`MM/DD/YYYY`, etc). [String](../../sql-reference/data-types/string.md). - `time_zone` — [Timezone](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone). The function parses `time_string` according to the timezone. Optional. [String](../../sql-reference/data-types/string.md). **Supported non-standard formats** - A string containing 9..10 digit [unix timestamp](https://en.wikipedia.org/wiki/Unix_time). - A string with a date and a time components: `YYYYMMDDhhmmss`, `MM/DD/YYYY hh:mm:ss`, `MM-DD-YY hh:mm`, `YYYY-MM-DD hh:mm:ss`, etc. -- A string with a date, but no time component: `YYYY`, `YYYYMM`, `YYYY*MM`, `MM/DD/YYYY`, `MM-DD-YY` etc. +- A string with a date, but no time component: `YYYY`, `YYYYMM`, `YYYY*MM`, `MM/DD/YYYY`, `MM-DD-YY`, etc. - A string with a day and time: `DD`, `DD hh`, `DD hh:mm`. In this case, `YYYY-MM` are substituted with `2000-01`. 
- A string that includes date and time along with timezone offset information: `YYYY-MM-DD hh:mm:ss ±h:mm`, etc. For example, `2020-12-12 17:36:00 -5:00`. **Returned values** -Possible values: - - `time_string` converted to the [DateTime](../../sql-reference/data-types/datetime.md) data type. -- `zero date time`. +- Zero date or zero date with time if the input string cannot be converted to the `DateTime` data type. **Examples** diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index 4a314bd22d8..92e674242df 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -658,6 +658,178 @@ AS parseDateTimeBestEffortUS; └─────────────────────────——┘ ``` +## parseDateTimeBestEffortUSOrNull {#parsedatetimebesteffortusornull} + +Похожа на функцию [parseDateTimeBestEffortUS](#parsedatetimebesteffortUS), но разница состоит в том, что возвращает `NULL`, если входная строка не может быть преобразована в тип данных [DateTime](../../sql-reference/data-types/datetime.md). + +**Синтаксис** + +``` sql +parseDateTimeBestEffortUSOrNull(time_string[, time_zone]) +``` + +**Параметры** + +- `time_string` — строка, содержащая дату или дату со временем для преобразования. Дата должна быть в американском формате (`MM/DD/YYYY` и т.д.). [String](../../sql-reference/data-types/string.md). +- `time_zone` — [часовой пояс](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone). Функция анализирует `time_string` в соответствии с заданным часовым поясом. Опциональный параметр. [String](../../sql-reference/data-types/string.md). + +**Поддерживаемые нестандартные форматы** + +- Строка в формате [unix timestamp](https://en.wikipedia.org/wiki/Unix_time), содержащая 9-10 символов. +- Строка, содержащая дату и время: `YYYYMMDDhhmmss`, `MM/DD/YYYY hh:mm:ss`, `MM-DD-YY hh:mm`, `YYYY-MM-DD hh:mm:ss` и т.д. +- Строка, содержащая дату без времени: `YYYY`, `YYYYMM`, `YYYY*MM`, `MM/DD/YYYY`, `MM-DD-YY` и т.д. +- Строка, содержащая день и время: `DD`, `DD hh`, `DD hh:mm`. В этом случае `YYYY-MM` заменяется на `2000-01`. +- Строка, содержащая дату и время, а также информацию о часовом поясе: `YYYY-MM-DD hh:mm:ss ±h:mm` и т.д. Например, `2020-12-12 17:36:00 -5:00`. + +**Возвращаемые значения** + +- `time_string`, преобразованная в тип данных `DateTime`. +- `NULL`, если входная строка не может быть преобразована в тип данных `DateTime`. 
+ +**Примеры** + +Запрос: + +``` sql +SELECT parseDateTimeBestEffortUSOrNull('02/10/2021 21:12:57') AS parseDateTimeBestEffortUSOrNull; +``` + +Результат: + +``` text +┌─parseDateTimeBestEffortUSOrNull─┐ +│ 2021-02-10 21:12:57 │ +└─────────────────────────────────┘ +``` + +Запрос: + +``` sql +SELECT parseDateTimeBestEffortUSOrNull('02-10-2021 21:12:57 GMT', 'Europe/Moscow') AS parseDateTimeBestEffortUSOrNull; +``` + +Результат: + +``` text +┌─parseDateTimeBestEffortUSOrNull─┐ +│ 2021-02-11 00:12:57 │ +└─────────────────────────────────┘ +``` + +Запрос: + +``` sql +SELECT parseDateTimeBestEffortUSOrNull('02.10.2021') AS parseDateTimeBestEffortUSOrNull; +``` + +Результат: + +``` text +┌─parseDateTimeBestEffortUSOrNull─┐ +│ 2021-02-10 00:00:00 │ +└─────────────────────────────────┘ +``` + +Запрос: + +``` sql +SELECT parseDateTimeBestEffortUSOrNull('10.2021') AS parseDateTimeBestEffortUSOrNull; +``` + +Результат: + +``` text +┌─parseDateTimeBestEffortUSOrNull─┐ +│ ᴺᵁᴸᴸ │ +└─────────────────────────────────┘ +``` + +## parseDateTimeBestEffortUSOrZero {#parsedatetimebesteffortusorzero} + +Похожа на функцию [parseDateTimeBestEffortUS](#parsedatetimebesteffortUS), но разница в том, что возвращает нулевую дату или нулевую дату со временем, если входная строка не может быть преобразована в тип данных [DateTime](../../sql-reference/data-types/datetime.md). + +**Синтаксис** + +``` sql +parseDateTimeBestEffortUSOrZero(time_string[, time_zone]) +``` + +**Параметры** + +- `time_string` — строка, содержащая дату или дату со временем для преобразования. Дата должна быть в американском формате (`MM/DD/YYYY` и т.д.). [String](../../sql-reference/data-types/string.md). +- `time_zone` — [часовой пояс](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone). Функция анализирует `time_string` в соответствии с заданным часовым поясом. Опциональный параметр. [String](../../sql-reference/data-types/string.md). + +**Поддерживаемые нестандартные форматы** + +- Строка в формате [unix timestamp](https://en.wikipedia.org/wiki/Unix_time), содержащая 9-10 символов. +- Строка, содержащая дату и время: `YYYYMMDDhhmmss`, `MM/DD/YYYY hh:mm:ss`, `MM-DD-YY hh:mm`, `YYYY-MM-DD hh:mm:ss` и т.д. +- Строка, содержащая дату без времени: `YYYY`, `YYYYMM`, `YYYY*MM`, `MM/DD/YYYY`, `MM-DD-YY` и т.д. +- Строка, содержащая день и время: `DD`, `DD hh`, `DD hh:mm`. В этом случае `YYYY-MM` заменяется на `2000-01`. +- Строка, содержащая дату и время, а также информацию о часовом поясе: `YYYY-MM-DD hh:mm:ss ±h:mm` и т.д. Например, `2020-12-12 17:36:00 -5:00`. + +**Возвращаемые значения** + +- `time_string`, преобразованная в тип данных `DateTime`. +- Нулевая дата или нулевая дата со временем, если входная строка не может быть преобразована в тип данных `DateTime`. 
+ +**Примеры** + +Запрос: + +``` sql +SELECT parseDateTimeBestEffortUSOrZero('02/10/2021 21:12:57') AS parseDateTimeBestEffortUSOrZero; +``` + +Результат: + +``` text +┌─parseDateTimeBestEffortUSOrZero─┐ +│ 2021-02-10 21:12:57 │ +└─────────────────────────────────┘ +``` + +Запрос: + +``` sql +SELECT parseDateTimeBestEffortUSOrZero('02-10-2021 21:12:57 GMT', 'Europe/Moscow') AS parseDateTimeBestEffortUSOrZero; +``` + +Результат: + +``` text +┌─parseDateTimeBestEffortUSOrZero─┐ +│ 2021-02-11 00:12:57 │ +└─────────────────────────────────┘ +``` + +Запрос: + +``` sql +SELECT parseDateTimeBestEffortUSOrZero('02.10.2021') AS parseDateTimeBestEffortUSOrZero; +``` + +Результат: + +``` text +┌─parseDateTimeBestEffortUSOrZero─┐ +│ 2021-02-10 00:00:00 │ +└─────────────────────────────────┘ +``` + +Запрос: + +``` sql +SELECT parseDateTimeBestEffortUSOrZero('02.2021') AS parseDateTimeBestEffortUSOrZero; +``` + +Результат: + +``` text +┌─parseDateTimeBestEffortUSOrZero─┐ +│ 1970-01-01 00:00:00 │ +└─────────────────────────────────┘ +``` + ## toUnixTimestamp64Milli ## toUnixTimestamp64Micro ## toUnixTimestamp64Nano From d3e87701d478c2f779eae5b892c040b1132d8b6c Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 16 Feb 2021 22:02:18 +0300 Subject: [PATCH 0339/2357] Persistent storage --- src/Coordination/Changelog.cpp | 10 ++-- src/Coordination/Changelog.h | 2 - src/Coordination/CoordinationSettings.h | 3 +- src/Coordination/InMemoryStateManager.cpp | 21 ++++--- src/Coordination/InMemoryStateManager.h | 13 +++-- src/Coordination/NuKeeperServer.cpp | 12 +++- src/Coordination/tests/gtest_for_build.cpp | 67 +++++++++++----------- tests/config/config.d/test_keeper_port.xml | 1 + 8 files changed, 77 insertions(+), 52 deletions(-) diff --git a/src/Coordination/Changelog.cpp b/src/Coordination/Changelog.cpp index 5198382e731..e4d8b13ec37 100644 --- a/src/Coordination/Changelog.cpp +++ b/src/Coordination/Changelog.cpp @@ -16,10 +16,8 @@ namespace ErrorCodes extern const int CORRUPTED_DATA; extern const int UNKNOWN_FORMAT_VERSION; extern const int LOGICAL_ERROR; - extern const int NOT_IMPLEMENTED; } - std::string toString(const ChangelogVersion & version) { if (version == ChangelogVersion::V0) @@ -147,7 +145,6 @@ private: size_t start_index; }; - class ChangelogReader { public: @@ -202,7 +199,10 @@ Changelog::Changelog(const std::string & changelogs_dir_, size_t rotate_interval , rotate_interval(rotate_interval_) { namespace fs = std::filesystem; - for(const auto & p : fs::directory_iterator(changelogs_dir)) + if (!fs::exists(changelogs_dir)) + fs::create_directories(changelogs_dir); + + for (const auto & p : fs::directory_iterator(changelogs_dir)) { auto name = getChangelogName(p.path()); existing_changelogs[name.from_log_idx] = p.path(); @@ -233,7 +233,7 @@ void Changelog::readChangelogAndInitWriter(size_t from_log_idx) } else { - rotate(from_log_idx); + rotate(start_index); } } diff --git a/src/Coordination/Changelog.h b/src/Coordination/Changelog.h index 97669d1aa19..7c352e7a91b 100644 --- a/src/Coordination/Changelog.h +++ b/src/Coordination/Changelog.h @@ -45,8 +45,6 @@ struct ChangelogRecord nuraft::ptr blob; }; - - class ChangelogWriter; class Changelog diff --git a/src/Coordination/CoordinationSettings.h b/src/Coordination/CoordinationSettings.h index 441e1a5936f..0f1afb3fffe 100644 --- a/src/Coordination/CoordinationSettings.h +++ b/src/Coordination/CoordinationSettings.h @@ -28,7 +28,8 @@ struct Settings; M(Bool, auto_forwarding, true, "Allow to forward write requests from followers to leader", 0) \ 
M(Milliseconds, shutdown_timeout, 5000, "How many time we will until RAFT shutdown", 0) \ M(Milliseconds, startup_timeout, 30000, "How many time we will until RAFT to start", 0) \ - M(LogsLevel, raft_logs_level, LogsLevel::information, "Log internal RAFT logs into main server log level. Valid values: 'trace', 'debug', 'information', 'warning', 'error', 'fatal', 'none'", 0) + M(LogsLevel, raft_logs_level, LogsLevel::information, "Log internal RAFT logs into main server log level. Valid values: 'trace', 'debug', 'information', 'warning', 'error', 'fatal', 'none'", 0) \ + M(UInt64, rotate_log_storage_interval, 500000, "How many records will be stored in one log storage file", 0) DECLARE_SETTINGS_TRAITS(CoordinationSettingsTraits, LIST_OF_COORDINATION_SETTINGS) diff --git a/src/Coordination/InMemoryStateManager.cpp b/src/Coordination/InMemoryStateManager.cpp index 69e93578cc1..6c4e95b993a 100644 --- a/src/Coordination/InMemoryStateManager.cpp +++ b/src/Coordination/InMemoryStateManager.cpp @@ -9,10 +9,10 @@ namespace ErrorCodes extern const int RAFT_ERROR; } -InMemoryStateManager::InMemoryStateManager(int server_id_, const std::string & host, int port) +InMemoryStateManager::InMemoryStateManager(int server_id_, const std::string & host, int port, const std::string & logs_path) : my_server_id(server_id_) , my_port(port) - , log_store(nuraft::cs_new()) + , log_store(nuraft::cs_new(logs_path, 5000)) , cluster_config(nuraft::cs_new()) { auto peer_config = nuraft::cs_new(my_server_id, host + ":" + std::to_string(port)); @@ -22,17 +22,19 @@ InMemoryStateManager::InMemoryStateManager(int server_id_, const std::string & h InMemoryStateManager::InMemoryStateManager( int my_server_id_, const std::string & config_prefix, - const Poco::Util::AbstractConfiguration & config) + const Poco::Util::AbstractConfiguration & config, + const CoordinationSettingsPtr & coordination_settings) : my_server_id(my_server_id_) - , log_store(nuraft::cs_new()) + , log_store(nuraft::cs_new(config.getString(config_prefix + ".log_storage_path"), coordination_settings->rotate_log_storage_interval)) , cluster_config(nuraft::cs_new()) { + Poco::Util::AbstractConfiguration::Keys keys; - config.keys(config_prefix, keys); + config.keys(config_prefix + ".raft_configuration", keys); for (const auto & server_key : keys) { - std::string full_prefix = config_prefix + "." + server_key; + std::string full_prefix = config_prefix + ".raft_configuration." + server_key; int server_id = config.getInt(full_prefix + ".id"); std::string hostname = config.getString(full_prefix + ".hostname"); int port = config.getInt(full_prefix + ".port"); @@ -53,12 +55,17 @@ InMemoryStateManager::InMemoryStateManager( cluster_config->get_servers().push_back(peer_config); } if (!my_server_config) - throw Exception(ErrorCodes::RAFT_ERROR, "Our server id {} not found in raft_configuration section"); + throw Exception(ErrorCodes::RAFT_ERROR, "Our server id {} not found in raft_configuration section", my_server_id); if (start_as_follower_servers.size() == cluster_config->get_servers().size()) throw Exception(ErrorCodes::RAFT_ERROR, "At least one of servers should be able to start as leader (without )"); } +void InMemoryStateManager::loadLogStore(size_t start_log_index) +{ + log_store->init(start_log_index); +} + void InMemoryStateManager::save_config(const nuraft::cluster_config & config) { // Just keep in memory in this example. 
diff --git a/src/Coordination/InMemoryStateManager.h b/src/Coordination/InMemoryStateManager.h index 2a5c2f00dba..8a7be7d0129 100644 --- a/src/Coordination/InMemoryStateManager.h +++ b/src/Coordination/InMemoryStateManager.h @@ -2,7 +2,8 @@ #include #include -#include +#include +#include #include // Y_IGNORE #include @@ -15,12 +16,16 @@ public: InMemoryStateManager( int server_id_, const std::string & config_prefix, - const Poco::Util::AbstractConfiguration & config); + const Poco::Util::AbstractConfiguration & config, + const CoordinationSettingsPtr & coordination_settings); InMemoryStateManager( int server_id_, const std::string & host, - int port); + int port, + const std::string & logs_path); + + void loadLogStore(size_t start_log_index); nuraft::ptr load_config() override { return cluster_config; } @@ -49,7 +54,7 @@ private: int my_server_id; int my_port; std::unordered_set start_as_follower_servers; - nuraft::ptr log_store; + nuraft::ptr log_store; nuraft::ptr my_server_config; nuraft::ptr cluster_config; nuraft::ptr server_state; diff --git a/src/Coordination/NuKeeperServer.cpp b/src/Coordination/NuKeeperServer.cpp index 7464a06e86f..a4582a5fbb8 100644 --- a/src/Coordination/NuKeeperServer.cpp +++ b/src/Coordination/NuKeeperServer.cpp @@ -26,13 +26,16 @@ NuKeeperServer::NuKeeperServer( : server_id(server_id_) , coordination_settings(coordination_settings_) , state_machine(nuraft::cs_new(responses_queue_, coordination_settings)) - , state_manager(nuraft::cs_new(server_id, "test_keeper_server.raft_configuration", config)) + , state_manager(nuraft::cs_new(server_id, "test_keeper_server", config, coordination_settings)) , responses_queue(responses_queue_) { } void NuKeeperServer::startup() { + + state_manager->loadLogStore(state_machine->last_commit_index()); + nuraft::raft_params params; params.heart_beat_interval_ = coordination_settings->heart_beat_interval_ms.totalMilliseconds(); params.election_timeout_lower_bound_ = coordination_settings->election_timeout_lower_bound_ms.totalMilliseconds(); @@ -172,6 +175,13 @@ void NuKeeperServer::waitInit() int64_t timeout = coordination_settings->startup_timeout.totalMilliseconds(); if (!initialized_cv.wait_for(lock, std::chrono::milliseconds(timeout), [&] { return initialized_flag; })) throw Exception(ErrorCodes::RAFT_ERROR, "Failed to wait RAFT initialization"); + + /// TODO FIXME somehow + while (isLeader() && raft_instance->get_committed_log_idx() != raft_instance->get_last_log_idx()) + { + LOG_WARNING(&Poco::Logger::get("NuKeeperServer"), "Loading from log store {}/{}", raft_instance->get_committed_log_idx(), raft_instance->get_last_log_idx()); + std::this_thread::sleep_for(std::chrono::milliseconds(100)); + } } std::unordered_set NuKeeperServer::getDeadSessions() diff --git a/src/Coordination/tests/gtest_for_build.cpp b/src/Coordination/tests/gtest_for_build.cpp index fa8ae8f8b82..6d91ba95111 100644 --- a/src/Coordination/tests/gtest_for_build.cpp +++ b/src/Coordination/tests/gtest_for_build.cpp @@ -26,6 +26,26 @@ #include #include +namespace fs = std::filesystem; +struct ChangelogDirTest +{ + std::string path; + bool drop; + ChangelogDirTest(std::string path_, bool drop_ = true) + : path(path_) + , drop(drop_) + { + if (fs::exists(path)) + EXPECT_TRUE(false) << "Path " << path << " already exists, remove it to run test"; + fs::create_directory(path); + } + + ~ChangelogDirTest() + { + if (fs::exists(path) && drop) + fs::remove_all(path); + } +}; TEST(CoordinationTest, BuildTest) { @@ -70,14 +90,15 @@ TEST(CoordinationTest, BufferSerde) 
template struct SimpliestRaftServer { - SimpliestRaftServer(int server_id_, const std::string & hostname_, int port_) + SimpliestRaftServer(int server_id_, const std::string & hostname_, int port_, const std::string & logs_path) : server_id(server_id_) , hostname(hostname_) , port(port_) , endpoint(hostname + ":" + std::to_string(port)) , state_machine(nuraft::cs_new()) - , state_manager(nuraft::cs_new(server_id, hostname, port)) + , state_manager(nuraft::cs_new(server_id, hostname, port, logs_path)) { + state_manager->loadLogStore(1); nuraft::raft_params params; params.heart_beat_interval_ = 100; params.election_timeout_lower_bound_ = 200; @@ -126,7 +147,7 @@ struct SimpliestRaftServer nuraft::ptr state_machine; // State manager. - nuraft::ptr state_manager; + nuraft::ptr state_manager; // Raft launcher. nuraft::raft_launcher launcher; @@ -141,7 +162,6 @@ nuraft::ptr getBuffer(int64_t number) { nuraft::ptr ret = nuraft::buffer::alloc(sizeof(number)); nuraft::buffer_serializer bs(ret); - // WARNING: We don't consider endian-safety in this example. bs.put_raw(&number, sizeof(number)); return ret; } @@ -149,7 +169,8 @@ nuraft::ptr getBuffer(int64_t number) TEST(CoordinationTest, TestSummingRaft1) { - SummingRaftServer s1(1, "localhost", 44444); + ChangelogDirTest test("./logs"); + SummingRaftServer s1(1, "localhost", 44444, "./logs"); /// Single node is leader EXPECT_EQ(s1.raft_instance->get_leader(), 1); @@ -172,9 +193,12 @@ TEST(CoordinationTest, TestSummingRaft1) TEST(CoordinationTest, TestSummingRaft3) { - SummingRaftServer s1(1, "localhost", 44444); - SummingRaftServer s2(2, "localhost", 44445); - SummingRaftServer s3(3, "localhost", 44446); + ChangelogDirTest test1("./logs1"); + SummingRaftServer s1(1, "localhost", 44444, "./logs1"); + ChangelogDirTest test2("./logs2"); + SummingRaftServer s2(2, "localhost", 44445, "./logs2"); + ChangelogDirTest test3("./logs3"); + SummingRaftServer s3(3, "localhost", 44446, "./logs3"); nuraft::srv_config first_config(1, "localhost:44444"); auto ret1 = s2.raft_instance->add_srv(first_config); @@ -343,27 +367,6 @@ DB::LogEntryPtr getLogEntry(const std::string & s, size_t term) return nuraft::cs_new(term, bufwriter.getBuffer()); } -namespace fs = std::filesystem; -struct ChangelogDirTest -{ - std::string path; - bool drop; - ChangelogDirTest(std::string path_, bool drop_ = true) - : path(path_) - , drop(drop_) - { - if (fs::exists(path)) - EXPECT_TRUE(false) << "Path " << path << " already exists, remove it to run test"; - fs::create_directory(path); - } - - ~ChangelogDirTest() - { - if (fs::exists(path) && drop) - fs::remove_all(path); - } -}; - TEST(CoordinationTest, ChangelogTestSimple) { ChangelogDirTest test("./logs"); @@ -386,7 +389,7 @@ TEST(CoordinationTest, ChangelogTestFile) auto entry = getLogEntry("hello world", 77); changelog.appendEntry(1, entry); EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); - for(const auto & p : fs::directory_iterator("./logs")) + for (const auto & p : fs::directory_iterator("./logs")) EXPECT_EQ(p.path(), "./logs/changelog_1_5.bin"); changelog.appendEntry(2, entry); @@ -484,7 +487,7 @@ TEST(CoordinationTest, ChangelogTestAppendAfterRead) EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); size_t logs_count = 0; - for(const auto & _ [[maybe_unused]]: fs::directory_iterator("./logs")) + for (const auto & _ [[maybe_unused]]: fs::directory_iterator("./logs")) logs_count++; EXPECT_EQ(logs_count, 2); @@ -497,7 +500,7 @@ TEST(CoordinationTest, ChangelogTestAppendAfterRead) 
EXPECT_TRUE(fs::exists("./logs/changelog_11_15.bin")); logs_count = 0; - for(const auto & _ [[maybe_unused]]: fs::directory_iterator("./logs")) + for (const auto & _ [[maybe_unused]]: fs::directory_iterator("./logs")) logs_count++; EXPECT_EQ(logs_count, 3); diff --git a/tests/config/config.d/test_keeper_port.xml b/tests/config/config.d/test_keeper_port.xml index 97c6d7c2e33..44123ffe9c1 100644 --- a/tests/config/config.d/test_keeper_port.xml +++ b/tests/config/config.d/test_keeper_port.xml @@ -2,6 +2,7 @@ 9181 1 + /var/lib/clickhouse/coordination/log 10000 From a1cd07b9a00ff0ea4bc4e98d03af9b5046e6854f Mon Sep 17 00:00:00 2001 From: gyuton <40863448+gyuton@users.noreply.github.com> Date: Tue, 16 Feb 2021 22:24:49 +0300 Subject: [PATCH 0340/2357] Update docs/ru/sql-reference/aggregate-functions/parametric-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- .../sql-reference/aggregate-functions/parametric-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/aggregate-functions/parametric-functions.md b/docs/ru/sql-reference/aggregate-functions/parametric-functions.md index 2c367882714..d96f7a13bcc 100644 --- a/docs/ru/sql-reference/aggregate-functions/parametric-functions.md +++ b/docs/ru/sql-reference/aggregate-functions/parametric-functions.md @@ -239,7 +239,7 @@ windowFunnel(window, [mode])(timestamp, cond1, cond2, ..., condN) **Параметры** -- `window` — ширина скользящего окна по времени. Единица измерения зависит от `timestamp` и может варьироваться. Определяется выражением `timestamp от cond2 <= timestamp от cond1 + window`. +- `window` — ширина скользящего окна по времени. Единица измерения зависит от `timestamp` и может варьироваться. Должно соблюдаться условие `timestamp события cond2 <= timestamp события cond1 + window`. - `mode` - необязательный параметр. Если установлено значение `'strict'`, то функция `windowFunnel()` применяет условия только для уникальных значений. - `timestamp` — имя столбца, содержащего временные отметки. [Date](../../sql-reference/aggregate-functions/parametric-functions.md), [DateTime](../../sql-reference/aggregate-functions/parametric-functions.md#data_type-datetime) и другие параметры с типом `Integer`. В случае хранения меток времени в столбцах с типом `UInt64`, максимально допустимое значение соответствует ограничению для типа `Int64`, т.е. равно `2^63-1`. - `cond` — условия или данные, описывающие цепочку событий. [UInt8](../../sql-reference/aggregate-functions/parametric-functions.md). 
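A short query may help make the sliding-window condition above concrete. The sketch below is illustrative only: the `funnel_events` table, its columns and the sample rows are hypothetical and are not part of the patch; it assumes `DateTime` timestamps, so the `window` argument is measured in seconds.

``` sql
CREATE TABLE funnel_events (user_id UInt64, ts DateTime, event String) ENGINE = Memory;

INSERT INTO funnel_events VALUES
    (1, '2021-02-16 10:00:00', 'visit'),
    (1, '2021-02-16 10:05:00', 'cart'),   -- within 600 seconds of 'visit', so it extends the chain
    (1, '2021-02-16 10:30:00', 'order'),  -- more than 600 seconds after 'visit', so it is not counted
    (2, '2021-02-16 12:00:00', 'visit');

SELECT
    user_id,
    windowFunnel(600)(ts, event = 'visit', event = 'cart', event = 'order') AS level
FROM funnel_events
GROUP BY user_id
ORDER BY user_id;
```

Because the chain must satisfy `timestamp of cond2 <= timestamp of cond1 + window`, user 1 should reach level 2 (`visit` and `cart` fit inside the 600-second window, `order` does not), and user 2 only level 1 (a lone `visit`).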
From 8717dbd0e222536e6daf709820c3bee1ef395c05 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 16 Feb 2021 22:29:09 +0300 Subject: [PATCH 0341/2357] Missed configs --- .../test_testkeeper_back_to_back/configs/enable_test_keeper.xml | 1 + .../configs/enable_test_keeper1.xml | 1 + .../configs/enable_test_keeper2.xml | 1 + .../configs/enable_test_keeper3.xml | 1 + .../configs/enable_test_keeper1.xml | 1 + .../configs/enable_test_keeper2.xml | 1 + .../configs/enable_test_keeper3.xml | 1 + 7 files changed, 7 insertions(+) diff --git a/tests/integration/test_testkeeper_back_to_back/configs/enable_test_keeper.xml b/tests/integration/test_testkeeper_back_to_back/configs/enable_test_keeper.xml index 1a441909998..a8b8991f959 100644 --- a/tests/integration/test_testkeeper_back_to_back/configs/enable_test_keeper.xml +++ b/tests/integration/test_testkeeper_back_to_back/configs/enable_test_keeper.xml @@ -2,6 +2,7 @@ 9181 1 + /var/lib/clickhouse/coordination/log 5000 diff --git a/tests/integration/test_testkeeper_multinode_blocade_leader/configs/enable_test_keeper1.xml b/tests/integration/test_testkeeper_multinode_blocade_leader/configs/enable_test_keeper1.xml index 4ad76889d1e..a47e5eae09a 100644 --- a/tests/integration/test_testkeeper_multinode_blocade_leader/configs/enable_test_keeper1.xml +++ b/tests/integration/test_testkeeper_multinode_blocade_leader/configs/enable_test_keeper1.xml @@ -2,6 +2,7 @@ 9181 1 + /var/lib/clickhouse/coordination/log 5000 diff --git a/tests/integration/test_testkeeper_multinode_blocade_leader/configs/enable_test_keeper2.xml b/tests/integration/test_testkeeper_multinode_blocade_leader/configs/enable_test_keeper2.xml index a1954a1e639..18681f0dc95 100644 --- a/tests/integration/test_testkeeper_multinode_blocade_leader/configs/enable_test_keeper2.xml +++ b/tests/integration/test_testkeeper_multinode_blocade_leader/configs/enable_test_keeper2.xml @@ -2,6 +2,7 @@ 9181 2 + /var/lib/clickhouse/coordination/log 5000 diff --git a/tests/integration/test_testkeeper_multinode_blocade_leader/configs/enable_test_keeper3.xml b/tests/integration/test_testkeeper_multinode_blocade_leader/configs/enable_test_keeper3.xml index 88d2358138f..184d3724219 100644 --- a/tests/integration/test_testkeeper_multinode_blocade_leader/configs/enable_test_keeper3.xml +++ b/tests/integration/test_testkeeper_multinode_blocade_leader/configs/enable_test_keeper3.xml @@ -2,6 +2,7 @@ 9181 3 + /var/lib/clickhouse/coordination/log 5000 diff --git a/tests/integration/test_testkeeper_multinode_simple/configs/enable_test_keeper1.xml b/tests/integration/test_testkeeper_multinode_simple/configs/enable_test_keeper1.xml index 4ad76889d1e..a47e5eae09a 100644 --- a/tests/integration/test_testkeeper_multinode_simple/configs/enable_test_keeper1.xml +++ b/tests/integration/test_testkeeper_multinode_simple/configs/enable_test_keeper1.xml @@ -2,6 +2,7 @@ 9181 1 + /var/lib/clickhouse/coordination/log 5000 diff --git a/tests/integration/test_testkeeper_multinode_simple/configs/enable_test_keeper2.xml b/tests/integration/test_testkeeper_multinode_simple/configs/enable_test_keeper2.xml index a1954a1e639..18681f0dc95 100644 --- a/tests/integration/test_testkeeper_multinode_simple/configs/enable_test_keeper2.xml +++ b/tests/integration/test_testkeeper_multinode_simple/configs/enable_test_keeper2.xml @@ -2,6 +2,7 @@ 9181 2 + /var/lib/clickhouse/coordination/log 5000 diff --git a/tests/integration/test_testkeeper_multinode_simple/configs/enable_test_keeper3.xml 
b/tests/integration/test_testkeeper_multinode_simple/configs/enable_test_keeper3.xml index 88d2358138f..184d3724219 100644 --- a/tests/integration/test_testkeeper_multinode_simple/configs/enable_test_keeper3.xml +++ b/tests/integration/test_testkeeper_multinode_simple/configs/enable_test_keeper3.xml @@ -2,6 +2,7 @@ 9181 3 + /var/lib/clickhouse/coordination/log 5000 From b8be90cdf9c8505714cfaeb94ac6ffa296a0778d Mon Sep 17 00:00:00 2001 From: gyuton <40863448+gyuton@users.noreply.github.com> Date: Tue, 16 Feb 2021 22:32:59 +0300 Subject: [PATCH 0342/2357] Update docs/ru/sql-reference/functions/encryption-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/functions/encryption-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/functions/encryption-functions.md b/docs/ru/sql-reference/functions/encryption-functions.md index 91b26a2415d..adf084a6b21 100644 --- a/docs/ru/sql-reference/functions/encryption-functions.md +++ b/docs/ru/sql-reference/functions/encryption-functions.md @@ -110,7 +110,7 @@ SELECT comment, hex(secret) FROM encryption_test WHERE comment LIKE '%gcm%'; Совместима с шифрованием myqsl, результат может быть расшифрован функцией [AES_DECRYPT](https://dev.mysql.com/doc/refman/8.0/en/encryption-functions.html#function_aes-decrypt). -При одинаковых вводных зашифрованный текст будет совпадать с результатом `encrypt`. Однако, когда `key` или `iv` длиннее, чем должны быть, `aes_encrypt_mysql` будет работать аналогично MySQL `aes_encrypt`: свернет ключ и проигнорирует лишнюю часть `iv`. +При одинаковых входящих значениях зашифрованный текст будет совпадать с результатом, возвращаемым функцией `encrypt`. Однако если `key` или `iv` длиннее, чем должны быть, `aes_encrypt_mysql` будет работать аналогично функции `aes_encrypt` в MySQL: свернет ключ и проигнорирует лишнюю часть `iv`. Функция поддерживает шифрофание данных следующими режимами: From a642dbce46f1734b1893f6528ad591641edbdc70 Mon Sep 17 00:00:00 2001 From: gyuton <40863448+gyuton@users.noreply.github.com> Date: Tue, 16 Feb 2021 22:33:19 +0300 Subject: [PATCH 0343/2357] Update docs/ru/sql-reference/functions/encryption-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/functions/encryption-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/functions/encryption-functions.md b/docs/ru/sql-reference/functions/encryption-functions.md index adf084a6b21..0e8e7d2a33a 100644 --- a/docs/ru/sql-reference/functions/encryption-functions.md +++ b/docs/ru/sql-reference/functions/encryption-functions.md @@ -140,7 +140,7 @@ aes_encrypt_mysql('mode', 'plaintext', 'key' [, iv]) **Примеры** -При одинаковых вводных результаты шифрования `encrypt` и `aes_encrypt_mysql` будут совпадать. +При одинаковых входящих значениях результаты шифрования у функций `encrypt` и `aes_encrypt_mysql` совпадают. 
Запрос: From 22ab639287ea47b9a2dba80982170e15c9edd3a0 Mon Sep 17 00:00:00 2001 From: gyuton <40863448+gyuton@users.noreply.github.com> Date: Tue, 16 Feb 2021 22:33:32 +0300 Subject: [PATCH 0344/2357] Update docs/ru/sql-reference/functions/encryption-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/functions/encryption-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/functions/encryption-functions.md b/docs/ru/sql-reference/functions/encryption-functions.md index 0e8e7d2a33a..a72866121c4 100644 --- a/docs/ru/sql-reference/functions/encryption-functions.md +++ b/docs/ru/sql-reference/functions/encryption-functions.md @@ -156,7 +156,7 @@ SELECT encrypt('aes-256-cfb128', 'Secret', '12345678910121314151617181920212', ' └───────────────────┘ ``` -Но `encrypt` генерирует исключение, когда `key` или `iv` длиннее, чем нужно: +Функция `encrypt` генерирует исключение, если `key` или `iv` длиннее чем нужно: Запрос: From d213039fe58fa8efe4340fdd4e3b14564139c71f Mon Sep 17 00:00:00 2001 From: gyuton <40863448+gyuton@users.noreply.github.com> Date: Tue, 16 Feb 2021 22:33:57 +0300 Subject: [PATCH 0345/2357] Update docs/ru/sql-reference/functions/encryption-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/functions/encryption-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/functions/encryption-functions.md b/docs/ru/sql-reference/functions/encryption-functions.md index a72866121c4..90aa3268922 100644 --- a/docs/ru/sql-reference/functions/encryption-functions.md +++ b/docs/ru/sql-reference/functions/encryption-functions.md @@ -171,7 +171,7 @@ Received exception from server (version 21.1.2): Code: 36. DB::Exception: Received from localhost:9000. DB::Exception: Invalid key size: 33 expected 32: While processing encrypt('aes-256-cfb128', 'Secret', '123456789101213141516171819202122', 'iviviviviviviviv123'). 
``` -Тогда как `aes_encrypt_mysql` возвращает совместимый с MySQL вывод: +Однако функция `aes_encrypt_mysql` в аналогичном случае возвращает результат, который может быть обработан MySQL: Запрос: From 66d6b7a3a088be7e72cab7ced29b1c7fa5c4f418 Mon Sep 17 00:00:00 2001 From: gyuton <40863448+gyuton@users.noreply.github.com> Date: Tue, 16 Feb 2021 22:34:33 +0300 Subject: [PATCH 0346/2357] Update docs/ru/sql-reference/functions/encryption-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/functions/encryption-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/functions/encryption-functions.md b/docs/ru/sql-reference/functions/encryption-functions.md index 90aa3268922..f75e7bcc1a3 100644 --- a/docs/ru/sql-reference/functions/encryption-functions.md +++ b/docs/ru/sql-reference/functions/encryption-functions.md @@ -220,7 +220,7 @@ mysql> SELECT aes_encrypt('Secret', '123456789101213141516171819202122', 'iviviv ## decrypt {#decrypt} -Функция расшифровывает зашифрованный текст в обычный следующими режимами: +Функция расшифровывает зашифрованный текст и может работать в следующих режимах: - aes-128-ecb, aes-192-ecb, aes-256-ecb - aes-128-cbc, aes-192-cbc, aes-256-cbc From 5edba428658e60f9ee0be3681e17b638e8f2d254 Mon Sep 17 00:00:00 2001 From: gyuton <40863448+gyuton@users.noreply.github.com> Date: Tue, 16 Feb 2021 22:34:43 +0300 Subject: [PATCH 0347/2357] Update docs/ru/sql-reference/functions/encryption-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/functions/encryption-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/functions/encryption-functions.md b/docs/ru/sql-reference/functions/encryption-functions.md index f75e7bcc1a3..c4e0968d6f9 100644 --- a/docs/ru/sql-reference/functions/encryption-functions.md +++ b/docs/ru/sql-reference/functions/encryption-functions.md @@ -203,7 +203,7 @@ SELECT hex(aes_encrypt_mysql('aes-256-cfb128', 'Secret', '1234567891012131415161 └──────────────┘ ``` -Это совпадает с тем, что выводит MySQL с такими же вводными: +Это совпадает с результатом, возвращаемым MySQL при таких же входящих значениях: ``` sql mysql> SET block_encryption_mode='aes-256-cfb128'; From a26f2b77cb84e5d5629a706f42bd5a0c8214c694 Mon Sep 17 00:00:00 2001 From: gyuton <40863448+gyuton@users.noreply.github.com> Date: Tue, 16 Feb 2021 22:35:07 +0300 Subject: [PATCH 0348/2357] Update docs/ru/sql-reference/functions/encryption-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/functions/encryption-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/functions/encryption-functions.md b/docs/ru/sql-reference/functions/encryption-functions.md index c4e0968d6f9..92e8d62faca 100644 --- a/docs/ru/sql-reference/functions/encryption-functions.md +++ b/docs/ru/sql-reference/functions/encryption-functions.md @@ -250,7 +250,7 @@ decrypt('mode', 'ciphertext', 'key' [, iv, aad]) **Примеры** -Используется таблица из [encrypt](#encrypt). +Рассмотрим таблицу из примера для функции [encrypt](#encrypt). 
Запрос: From 7a910d38a10c92f1aae4d13e5de34a73e10e978e Mon Sep 17 00:00:00 2001 From: gyuton <40863448+gyuton@users.noreply.github.com> Date: Tue, 16 Feb 2021 22:35:12 +0300 Subject: [PATCH 0349/2357] Update docs/ru/sql-reference/functions/encryption-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/functions/encryption-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/functions/encryption-functions.md b/docs/ru/sql-reference/functions/encryption-functions.md index 92e8d62faca..faddf314fe7 100644 --- a/docs/ru/sql-reference/functions/encryption-functions.md +++ b/docs/ru/sql-reference/functions/encryption-functions.md @@ -278,7 +278,7 @@ SELECT comment, hex(secret) FROM encryption_test; Запрос: ``` sql -SELECT comment, decrypt('aes-256-cfb128', secret, '12345678910121314151617181920212') as plaintext FROM encryption_test +SELECT comment, decrypt('aes-256-cfb128', secret, '12345678910121314151617181920212') as plaintext FROM encryption_test; ``` Результат: From 07795335cecc9352b7d4164bbd6c63599d19bda1 Mon Sep 17 00:00:00 2001 From: gyuton <40863448+gyuton@users.noreply.github.com> Date: Tue, 16 Feb 2021 22:35:23 +0300 Subject: [PATCH 0350/2357] Update docs/ru/sql-reference/functions/encryption-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/functions/encryption-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/functions/encryption-functions.md b/docs/ru/sql-reference/functions/encryption-functions.md index faddf314fe7..0f46f3c1fd5 100644 --- a/docs/ru/sql-reference/functions/encryption-functions.md +++ b/docs/ru/sql-reference/functions/encryption-functions.md @@ -293,7 +293,7 @@ SELECT comment, decrypt('aes-256-cfb128', secret, '12345678910121314151617181920 └─────────────────────────────────────┴───────────┘ ``` -Обратите внимание, что только часть данных была расшифрована, а остальное является бессмыслицей, как как `mode`, `key`, или `iv` были другими во время шифрования. +Обратите внимание, что только часть данных была расшифрована верно. Оставшаяся часть расшифрована некорректно, так как при шифровании использовались другие значения `mode`, `key`, или `iv`. ## aes_decrypt_mysql {#aes_decrypt_mysql} From 579f8a95bcaa804b4264e8047d68474af5ef3ec6 Mon Sep 17 00:00:00 2001 From: gyuton <40863448+gyuton@users.noreply.github.com> Date: Tue, 16 Feb 2021 22:35:43 +0300 Subject: [PATCH 0351/2357] Update docs/ru/sql-reference/functions/encryption-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/functions/encryption-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/functions/encryption-functions.md b/docs/ru/sql-reference/functions/encryption-functions.md index 0f46f3c1fd5..6cf5b520f23 100644 --- a/docs/ru/sql-reference/functions/encryption-functions.md +++ b/docs/ru/sql-reference/functions/encryption-functions.md @@ -299,7 +299,7 @@ SELECT comment, decrypt('aes-256-cfb128', secret, '12345678910121314151617181920 Совместима с шифрованием myqsl и может расшифровать данные, зашифрованные функцией [AES_ENCRYPT](https://dev.mysql.com/doc/refman/8.0/en/encryption-functions.html#function_aes-encrypt). -При одинаковых вводных расшифрованный текст будет совпадать с результатом `decrypt`. 
Однако, когда `key` или `iv` длиннее, чем должны быть, `aes_decrypt_mysql` будет работать аналогично MySQL `aes_decrypt`: свернет ключ и проигнорирует лишнюю часть `iv`. +При одинаковых входящих значениях расшифрованный текст будет совпадать с результатом, возвращаемым функцией `decrypt`. Однако если `key` или `iv` длиннее, чем должны быть, `aes_decrypt_mysql` будет работать аналогично функции `aes_decrypt` в MySQL: свернет ключ и проигнорирует лишнюю часть `iv`. Функция поддерживает расшифровку данных следующими режимами: From b82e564076203733a292d53ebcf843ad0289ace9 Mon Sep 17 00:00:00 2001 From: gyuton <40863448+gyuton@users.noreply.github.com> Date: Tue, 16 Feb 2021 22:35:48 +0300 Subject: [PATCH 0352/2357] Update docs/ru/sql-reference/functions/encryption-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/functions/encryption-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/functions/encryption-functions.md b/docs/ru/sql-reference/functions/encryption-functions.md index 6cf5b520f23..04a74fe8107 100644 --- a/docs/ru/sql-reference/functions/encryption-functions.md +++ b/docs/ru/sql-reference/functions/encryption-functions.md @@ -301,7 +301,7 @@ SELECT comment, decrypt('aes-256-cfb128', secret, '12345678910121314151617181920 При одинаковых входящих значениях расшифрованный текст будет совпадать с результатом, возвращаемым функцией `decrypt`. Однако если `key` или `iv` длиннее, чем должны быть, `aes_decrypt_mysql` будет работать аналогично функции `aes_decrypt` в MySQL: свернет ключ и проигнорирует лишнюю часть `iv`. -Функция поддерживает расшифровку данных следующими режимами: +Функция поддерживает расшифровку данных в следующих режимах: - aes-128-ecb, aes-192-ecb, aes-256-ecb - aes-128-cbc, aes-192-cbc, aes-256-cbc From c10485d21a29ab7e1ec405ef19fad35ca306185a Mon Sep 17 00:00:00 2001 From: gyuton <40863448+gyuton@users.noreply.github.com> Date: Tue, 16 Feb 2021 22:35:55 +0300 Subject: [PATCH 0353/2357] Update docs/ru/sql-reference/functions/encryption-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/functions/encryption-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/functions/encryption-functions.md b/docs/ru/sql-reference/functions/encryption-functions.md index 04a74fe8107..3c2f9e3e682 100644 --- a/docs/ru/sql-reference/functions/encryption-functions.md +++ b/docs/ru/sql-reference/functions/encryption-functions.md @@ -348,7 +348,7 @@ mysql> SELECT aes_encrypt('Secret', '123456789101213141516171819202122', 'iviviv Запрос: ``` sql -SELECT aes_decrypt_mysql('aes-256-cfb128', unhex('24E9E4966469'), '123456789101213141516171819202122', 'iviviviviviviviv123456') AS plaintext +SELECT aes_decrypt_mysql('aes-256-cfb128', unhex('24E9E4966469'), '123456789101213141516171819202122', 'iviviviviviviviv123456') AS plaintext; ``` Результат: From 236b9cfeff06a9ac5115736041586a9ae119d761 Mon Sep 17 00:00:00 2001 From: gyuton <40863448+gyuton@users.noreply.github.com> Date: Tue, 16 Feb 2021 22:36:01 +0300 Subject: [PATCH 0354/2357] Update docs/ru/sql-reference/functions/encryption-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/functions/encryption-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/functions/encryption-functions.md 
b/docs/ru/sql-reference/functions/encryption-functions.md index 3c2f9e3e682..5406112624f 100644 --- a/docs/ru/sql-reference/functions/encryption-functions.md +++ b/docs/ru/sql-reference/functions/encryption-functions.md @@ -329,7 +329,7 @@ aes_decrypt_mysql('mode', 'ciphertext', 'key' [, iv]) **Примеры** -Расшифруем данные, которые до этого зашифровали с помощью MySQL: +Расшифруем данные, которые до этого были зашифрованы в MySQL: ``` sql From f2c7c38c18b817bf101769d4d69e1ab78075778e Mon Sep 17 00:00:00 2001 From: gyuton <40863448+gyuton@users.noreply.github.com> Date: Tue, 16 Feb 2021 22:38:20 +0300 Subject: [PATCH 0355/2357] Update docs/ru/sql-reference/functions/encryption-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/functions/encryption-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/functions/encryption-functions.md b/docs/ru/sql-reference/functions/encryption-functions.md index 5406112624f..e2c5560e4f6 100644 --- a/docs/ru/sql-reference/functions/encryption-functions.md +++ b/docs/ru/sql-reference/functions/encryption-functions.md @@ -11,7 +11,7 @@ toc_title: "\u0424\u0443\u043d\u043a\u0446\u0438\u0438 \u0434\u043b\u044f \u0448 Длина инициализирующего вектора всегда 16 байт (лишнии байты игнорируются). -Обратите внимание, что до версии Clickhouse 21.1 эти функции работают медленно. +Обратите внимание, что до версии Clickhouse 21.1 эти функции работали медленно. ## encrypt {#encrypt} From 2858151d09b70b018a9626a2c4efda6d1535ec8b Mon Sep 17 00:00:00 2001 From: gyuton <40863448+gyuton@users.noreply.github.com> Date: Wed, 17 Feb 2021 00:25:34 +0300 Subject: [PATCH 0356/2357] Update kafka.md --- docs/ru/engines/table-engines/integrations/kafka.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/ru/engines/table-engines/integrations/kafka.md b/docs/ru/engines/table-engines/integrations/kafka.md index 2b9dfcd49da..a1528edfd1d 100644 --- a/docs/ru/engines/table-engines/integrations/kafka.md +++ b/docs/ru/engines/table-engines/integrations/kafka.md @@ -47,7 +47,9 @@ SETTINGS - `kafka_row_delimiter` — символ-разделитель записей (строк), которым завершается сообщение. - `kafka_schema` — опциональный параметр, необходимый, если используется формат, требующий определения схемы. Например, [Cap’n Proto](https://capnproto.org/) требует путь к файлу со схемой и название корневого объекта `schema.capnp:Message`. - `kafka_num_consumers` — количество потребителей (consumer) на таблицу. По умолчанию: `1`. Укажите больше потребителей, если пропускная способность одного потребителя недостаточна. Общее число потребителей не должно превышать количество партиций в топике, так как на одну партицию может быть назначено не более одного потребителя. +- `kafka_max_block_size` — максимальный размер пачек (в сообщениях) для poll (по умолчанию `max_block_size`). - `kafka_skip_broken_messages` — максимальное количество некорректных сообщений в блоке. Если `kafka_skip_broken_messages = N`, то движок отбрасывает `N` сообщений Кафки, которые не получилось обработать. Одно сообщение в точности соответствует одной записи (строке). Значение по умолчанию – 0. +- `kafka_commit_every_batch` — фиксирует каждый обработанный и потребленный пакет вместо отдельной фиксации после записи целого блока (по умолчанию `0`). - `kafka_thread_per_consumer` — снабжает каждого потребителя независимым потоком (по умолчанию `0`). 
При включенном состоянии каждый потребитель сбрасывает данные независимо и параллельно (иначе — строки от нескольких потребителей склеиваются в один блок). Примеры From e15a080104afc968d041b5894e2260bb3385a29a Mon Sep 17 00:00:00 2001 From: George Date: Wed, 17 Feb 2021 00:36:18 +0300 Subject: [PATCH 0357/2357] First draft --- docs/en/operations/settings/settings.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 963f9fa18bd..3e7694380cb 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -1872,6 +1872,18 @@ Possible values: Default value: `0`. +## insert_shard_id {#insert_shard_id} + +Enables insertion of data into specific shards from [Distributed](../../engines/table-engines/special/distributed.md#distributed) tables. + +This setting allows to insert data into specific shard from distributed table without perceiving local tables. + +Possible values: + +- Any number from `0` (disabled) to `shards_number` of corresponding [Distributed](../../engines/table-engines/special/distributed.md#distributed) table. + +Defauld value: `0` + ## use_compact_format_in_distributed_parts_names {#use_compact_format_in_distributed_parts_names} Uses compact format for storing blocks for async (`insert_distributed_sync`) INSERT into tables with `Distributed` engine. From 23754e46e8a8c54ff00537546908fa629f8ece71 Mon Sep 17 00:00:00 2001 From: gyuton <40863448+gyuton@users.noreply.github.com> Date: Wed, 17 Feb 2021 01:41:47 +0300 Subject: [PATCH 0358/2357] Update docs/ru/engines/table-engines/integrations/kafka.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/engines/table-engines/integrations/kafka.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/ru/engines/table-engines/integrations/kafka.md b/docs/ru/engines/table-engines/integrations/kafka.md index a1528edfd1d..5a6971b1ae6 100644 --- a/docs/ru/engines/table-engines/integrations/kafka.md +++ b/docs/ru/engines/table-engines/integrations/kafka.md @@ -49,8 +49,8 @@ SETTINGS - `kafka_num_consumers` — количество потребителей (consumer) на таблицу. По умолчанию: `1`. Укажите больше потребителей, если пропускная способность одного потребителя недостаточна. Общее число потребителей не должно превышать количество партиций в топике, так как на одну партицию может быть назначено не более одного потребителя. - `kafka_max_block_size` — максимальный размер пачек (в сообщениях) для poll (по умолчанию `max_block_size`). - `kafka_skip_broken_messages` — максимальное количество некорректных сообщений в блоке. Если `kafka_skip_broken_messages = N`, то движок отбрасывает `N` сообщений Кафки, которые не получилось обработать. Одно сообщение в точности соответствует одной записи (строке). Значение по умолчанию – 0. -- `kafka_commit_every_batch` — фиксирует каждый обработанный и потребленный пакет вместо отдельной фиксации после записи целого блока (по умолчанию `0`). -- `kafka_thread_per_consumer` — снабжает каждого потребителя независимым потоком (по умолчанию `0`). При включенном состоянии каждый потребитель сбрасывает данные независимо и параллельно (иначе — строки от нескольких потребителей склеиваются в один блок). +- `kafka_commit_every_batch` — включает или отключает режим записи каждой принятой и обработанной пачки по отдельности вместо единой записи целого блока (по умолчанию `0`). 
+- `kafka_thread_per_consumer` — включает или отключает предоставление отдельного потока каждому потребителю (по умолчанию `0`). При включенном режиме каждый потребитель сбрасывает данные независимо и параллельно, при отключённом — строки с данными от нескольких потребителей собираются в один блок. Примеры From fa200160915ee9c187e5e64a4a1e395d70430b7f Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 17 Feb 2021 09:53:18 +0300 Subject: [PATCH 0359/2357] Enable distributed_aggregation_memory_efficient by default --- src/Core/Settings.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 9bb9ad30f15..6c05d247037 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -100,7 +100,7 @@ class IColumn; M(UInt64, min_count_to_compile_expression, 3, "The number of identical expressions before they are JIT-compiled", 0) \ M(UInt64, group_by_two_level_threshold, 100000, "From what number of keys, a two-level aggregation starts. 0 - the threshold is not set.", 0) \ M(UInt64, group_by_two_level_threshold_bytes, 100000000, "From what size of the aggregation state in bytes, a two-level aggregation begins to be used. 0 - the threshold is not set. Two-level aggregation is used when at least one of the thresholds is triggered.", 0) \ - M(Bool, distributed_aggregation_memory_efficient, false, "Is the memory-saving mode of distributed aggregation enabled.", 0) \ + M(Bool, distributed_aggregation_memory_efficient, true, "Is the memory-saving mode of distributed aggregation enabled.", 0) \ M(UInt64, aggregation_memory_efficient_merge_threads, 0, "Number of threads to use for merge intermediate aggregation results in memory efficient mode. When bigger, then more memory is consumed. 0 means - same as 'max_threads'.", 0) \ \ M(UInt64, max_parallel_replicas, 1, "The maximum number of replicas of each shard used when the query is executed. For consistency (to get different parts of the same partition), this option only works for the specified sampling key. 
The lag of the replicas is not controlled.", 0) \ From dfaa79b88ed8bd5e67df1e510d1a91cb1644a6a5 Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 17 Feb 2021 10:10:46 +0300 Subject: [PATCH 0360/2357] Add missed file --- src/Coordination/NuKeeperLogStore.cpp | 97 +++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 src/Coordination/NuKeeperLogStore.cpp diff --git a/src/Coordination/NuKeeperLogStore.cpp b/src/Coordination/NuKeeperLogStore.cpp new file mode 100644 index 00000000000..fa0631e14ad --- /dev/null +++ b/src/Coordination/NuKeeperLogStore.cpp @@ -0,0 +1,97 @@ +#include + +namespace DB +{ + +NuKeeperLogStore::NuKeeperLogStore(const std::string & changelogs_path, size_t rotate_interval_) + : changelog(changelogs_path, rotate_interval_) +{ +} + +size_t NuKeeperLogStore::start_index() const +{ + std::lock_guard lock(changelog_lock); + return changelog.getStartIndex(); +} + +void NuKeeperLogStore::init(size_t from_log_idx) +{ + std::lock_guard lock(changelog_lock); + changelog.readChangelogAndInitWriter(from_log_idx); +} + +size_t NuKeeperLogStore::next_slot() const +{ + std::lock_guard lock(changelog_lock); + return changelog.getNextEntryIndex(); +} + +nuraft::ptr NuKeeperLogStore::last_entry() const +{ + std::lock_guard lock(changelog_lock); + return changelog.getLastEntry(); +} + +size_t NuKeeperLogStore::append(nuraft::ptr & entry) +{ + std::lock_guard lock(changelog_lock); + size_t idx = changelog.getNextEntryIndex(); + changelog.appendEntry(idx, entry); + return idx; +} + + +void NuKeeperLogStore::write_at(size_t index, nuraft::ptr & entry) +{ + std::lock_guard lock(changelog_lock); + changelog.writeAt(index, entry); +} + +nuraft::ptr>> NuKeeperLogStore::log_entries(size_t start, size_t end) +{ + std::lock_guard lock(changelog_lock); + return changelog.getLogEntriesBetween(start, end); +} + +nuraft::ptr NuKeeperLogStore::entry_at(size_t index) +{ + std::lock_guard lock(changelog_lock); + return changelog.entryAt(index); +} + +size_t NuKeeperLogStore::term_at(size_t index) +{ + std::lock_guard lock(changelog_lock); + auto entry = changelog.entryAt(index); + if (entry) + return entry->get_term(); + return 0; +} + +nuraft::ptr NuKeeperLogStore::pack(size_t index, int32_t cnt) +{ + std::lock_guard lock(changelog_lock); + return changelog.serializeEntriesToBuffer(index, cnt); +} + +bool NuKeeperLogStore::compact(size_t last_log_index) +{ + std::lock_guard lock(changelog_lock); + changelog.compact(last_log_index); + return true; +} + +bool NuKeeperLogStore::flush() +{ + std::lock_guard lock(changelog_lock); + changelog.flush(); + return true; +} + +void NuKeeperLogStore::apply_pack(size_t index, nuraft::buffer & pack) +{ + std::lock_guard lock(changelog_lock); + changelog.applyEntriesFromBuffer(index, pack); +} + +} From af95db2fcf8ac6c974e9a3d546392419b1ba6a5f Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 17 Feb 2021 11:00:17 +0300 Subject: [PATCH 0361/2357] Test log storage instead of changelog --- src/Coordination/Changelog.cpp | 7 +- src/Coordination/NuKeeperLogStore.cpp | 6 + src/Coordination/NuKeeperLogStore.h | 2 + src/Coordination/tests/gtest_for_build.cpp | 327 +++++++++++++-------- 4 files changed, 218 insertions(+), 124 deletions(-) diff --git a/src/Coordination/Changelog.cpp b/src/Coordination/Changelog.cpp index e4d8b13ec37..4f095974836 100644 --- a/src/Coordination/Changelog.cpp +++ b/src/Coordination/Changelog.cpp @@ -212,6 +212,8 @@ Changelog::Changelog(const std::string & changelogs_dir_, size_t rotate_interval void 
Changelog::readChangelogAndInitWriter(size_t from_log_idx) { size_t read_from_last = 0; + start_index = from_log_idx == 0 ? 1 : from_log_idx; + size_t total_read = 0; for (const auto & [start_id, changelog_file] : existing_changelogs) { ChangelogName parsed_name = getChangelogName(changelog_file); @@ -219,11 +221,10 @@ void Changelog::readChangelogAndInitWriter(size_t from_log_idx) { ChangelogReader reader(changelog_file); read_from_last = reader.readChangelog(logs, from_log_idx, index_to_start_pos); + total_read += read_from_last; } } - start_index = from_log_idx == 0 ? 1 : from_log_idx; - if (existing_changelogs.size() > 0 && read_from_last < rotate_interval) { auto str_name = existing_changelogs.rbegin()->second; @@ -233,7 +234,7 @@ void Changelog::readChangelogAndInitWriter(size_t from_log_idx) } else { - rotate(start_index); + rotate(start_index + total_read); } } diff --git a/src/Coordination/NuKeeperLogStore.cpp b/src/Coordination/NuKeeperLogStore.cpp index fa0631e14ad..fa8d6d6c299 100644 --- a/src/Coordination/NuKeeperLogStore.cpp +++ b/src/Coordination/NuKeeperLogStore.cpp @@ -94,4 +94,10 @@ void NuKeeperLogStore::apply_pack(size_t index, nuraft::buffer & pack) changelog.applyEntriesFromBuffer(index, pack); } +size_t NuKeeperLogStore::size() const +{ + std::lock_guard lock(changelog_lock); + return changelog.size(); +} + } diff --git a/src/Coordination/NuKeeperLogStore.h b/src/Coordination/NuKeeperLogStore.h index 981dc3f24e7..49d5dbfdf7c 100644 --- a/src/Coordination/NuKeeperLogStore.h +++ b/src/Coordination/NuKeeperLogStore.h @@ -39,6 +39,8 @@ public: bool flush() override; + size_t size() const; + private: mutable std::mutex changelog_lock; Changelog changelog; diff --git a/src/Coordination/tests/gtest_for_build.cpp b/src/Coordination/tests/gtest_for_build.cpp index 6d91ba95111..8328d93d9cf 100644 --- a/src/Coordination/tests/gtest_for_build.cpp +++ b/src/Coordination/tests/gtest_for_build.cpp @@ -114,10 +114,10 @@ struct SimpliestRaftServer if (!raft_instance) { - std::cerr << "Failed to initialize launcher (see the message " - "in the log file)." 
<< std::endl; + std::cerr << "Failed to initialize launcher" << std::endl; exit(-1); } + std::cout << "init Raft instance " << server_id; for (size_t ii = 0; ii < 20; ++ii) { @@ -370,33 +370,33 @@ DB::LogEntryPtr getLogEntry(const std::string & s, size_t term) TEST(CoordinationTest, ChangelogTestSimple) { ChangelogDirTest test("./logs"); - DB::Changelog changelog("./logs", 5); - changelog.readChangelogAndInitWriter(1); + DB::NuKeeperLogStore changelog("./logs", 5); + changelog.init(1); auto entry = getLogEntry("hello world", 77); - changelog.appendEntry(1, entry); - EXPECT_EQ(changelog.getNextEntryIndex(), 2); - EXPECT_EQ(changelog.getStartIndex(), 1); - EXPECT_EQ(changelog.getLastEntry()->get_term(), 77); - EXPECT_EQ(changelog.entryAt(1)->get_term(), 77); - EXPECT_EQ(changelog.getLogEntriesBetween(1, 2)->size(), 1); + changelog.append(entry); + EXPECT_EQ(changelog.next_slot(), 2); + EXPECT_EQ(changelog.start_index(), 1); + EXPECT_EQ(changelog.last_entry()->get_term(), 77); + EXPECT_EQ(changelog.entry_at(1)->get_term(), 77); + EXPECT_EQ(changelog.log_entries(1, 2)->size(), 1); } TEST(CoordinationTest, ChangelogTestFile) { ChangelogDirTest test("./logs"); - DB::Changelog changelog("./logs", 5); - changelog.readChangelogAndInitWriter(1); + DB::NuKeeperLogStore changelog("./logs", 5); + changelog.init(1); auto entry = getLogEntry("hello world", 77); - changelog.appendEntry(1, entry); + changelog.append(entry); EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); for (const auto & p : fs::directory_iterator("./logs")) EXPECT_EQ(p.path(), "./logs/changelog_1_5.bin"); - changelog.appendEntry(2, entry); - changelog.appendEntry(3, entry); - changelog.appendEntry(4, entry); - changelog.appendEntry(5, entry); - changelog.appendEntry(6, entry); + changelog.append(entry); + changelog.append(entry); + changelog.append(entry); + changelog.append(entry); + changelog.append(entry); EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); @@ -405,26 +405,26 @@ TEST(CoordinationTest, ChangelogTestFile) TEST(CoordinationTest, ChangelogReadWrite) { ChangelogDirTest test("./logs"); - DB::Changelog changelog("./logs", 1000); - changelog.readChangelogAndInitWriter(1); + DB::NuKeeperLogStore changelog("./logs", 1000); + changelog.init(1); for (size_t i = 0; i < 10; ++i) { auto entry = getLogEntry("hello world", i * 10); - changelog.appendEntry(changelog.getNextEntryIndex(), entry); + changelog.append(entry); } EXPECT_EQ(changelog.size(), 10); - DB::Changelog changelog_reader("./logs", 1000); - changelog_reader.readChangelogAndInitWriter(1); + DB::NuKeeperLogStore changelog_reader("./logs", 1000); + changelog_reader.init(1); EXPECT_EQ(changelog_reader.size(), 10); - EXPECT_EQ(changelog_reader.getLastEntry()->get_term(), changelog.getLastEntry()->get_term()); - EXPECT_EQ(changelog_reader.getStartIndex(), changelog.getStartIndex()); - EXPECT_EQ(changelog_reader.getNextEntryIndex(), changelog.getNextEntryIndex()); + EXPECT_EQ(changelog_reader.last_entry()->get_term(), changelog.last_entry()->get_term()); + EXPECT_EQ(changelog_reader.start_index(), changelog.start_index()); + EXPECT_EQ(changelog_reader.next_slot(), changelog.next_slot()); for (size_t i = 0; i < 10; ++i) - EXPECT_EQ(changelog_reader.entryAt(i + 1)->get_term(), changelog.entryAt(i + 1)->get_term()); + EXPECT_EQ(changelog_reader.entry_at(i + 1)->get_term(), changelog.entry_at(i + 1)->get_term()); - auto entries_from_range_read = changelog_reader.getLogEntriesBetween(1, 11); - auto entries_from_range = 
changelog.getLogEntriesBetween(1, 11); + auto entries_from_range_read = changelog_reader.log_entries(1, 11); + auto entries_from_range = changelog.log_entries(1, 11); EXPECT_EQ(entries_from_range_read->size(), entries_from_range->size()); EXPECT_EQ(10, entries_from_range->size()); } @@ -432,55 +432,55 @@ TEST(CoordinationTest, ChangelogReadWrite) TEST(CoordinationTest, ChangelogWriteAt) { ChangelogDirTest test("./logs"); - DB::Changelog changelog("./logs", 1000); - changelog.readChangelogAndInitWriter(1); + DB::NuKeeperLogStore changelog("./logs", 1000); + changelog.init(1); for (size_t i = 0; i < 10; ++i) { auto entry = getLogEntry("hello world", i * 10); - changelog.appendEntry(changelog.getNextEntryIndex(), entry); + changelog.append(entry); } EXPECT_EQ(changelog.size(), 10); auto entry = getLogEntry("writer", 77); - changelog.writeAt(7, entry); + changelog.write_at(7, entry); EXPECT_EQ(changelog.size(), 7); - EXPECT_EQ(changelog.getLastEntry()->get_term(), 77); - EXPECT_EQ(changelog.entryAt(7)->get_term(), 77); - EXPECT_EQ(changelog.getNextEntryIndex(), 8); + EXPECT_EQ(changelog.last_entry()->get_term(), 77); + EXPECT_EQ(changelog.entry_at(7)->get_term(), 77); + EXPECT_EQ(changelog.next_slot(), 8); - DB::Changelog changelog_reader("./logs", 1000); - changelog_reader.readChangelogAndInitWriter(1); + DB::NuKeeperLogStore changelog_reader("./logs", 1000); + changelog_reader.init(1); EXPECT_EQ(changelog_reader.size(), changelog.size()); - EXPECT_EQ(changelog_reader.getLastEntry()->get_term(), changelog.getLastEntry()->get_term()); - EXPECT_EQ(changelog_reader.getStartIndex(), changelog.getStartIndex()); - EXPECT_EQ(changelog_reader.getNextEntryIndex(), changelog.getNextEntryIndex()); + EXPECT_EQ(changelog_reader.last_entry()->get_term(), changelog.last_entry()->get_term()); + EXPECT_EQ(changelog_reader.start_index(), changelog.start_index()); + EXPECT_EQ(changelog_reader.next_slot(), changelog.next_slot()); } TEST(CoordinationTest, ChangelogTestAppendAfterRead) { ChangelogDirTest test("./logs"); - DB::Changelog changelog("./logs", 5); - changelog.readChangelogAndInitWriter(1); + DB::NuKeeperLogStore changelog("./logs", 5); + changelog.init(1); for (size_t i = 0; i < 7; ++i) { auto entry = getLogEntry("hello world", i * 10); - changelog.appendEntry(changelog.getNextEntryIndex(), entry); + changelog.append(entry); } EXPECT_EQ(changelog.size(), 7); EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); - DB::Changelog changelog_reader("./logs", 5); - changelog_reader.readChangelogAndInitWriter(1); + DB::NuKeeperLogStore changelog_reader("./logs", 5); + changelog_reader.init(1); EXPECT_EQ(changelog_reader.size(), 7); for (size_t i = 7; i < 10; ++i) { auto entry = getLogEntry("hello world", i * 10); - changelog_reader.appendEntry(changelog_reader.getNextEntryIndex(), entry); + changelog_reader.append(entry); } EXPECT_EQ(changelog_reader.size(), 10); EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); @@ -493,7 +493,7 @@ TEST(CoordinationTest, ChangelogTestAppendAfterRead) EXPECT_EQ(logs_count, 2); auto entry = getLogEntry("someentry", 77); - changelog_reader.appendEntry(changelog_reader.getNextEntryIndex(), entry); + changelog_reader.append(entry); EXPECT_EQ(changelog_reader.size(), 11); EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); @@ -509,13 +509,13 @@ TEST(CoordinationTest, ChangelogTestAppendAfterRead) TEST(CoordinationTest, ChangelogTestCompaction) { ChangelogDirTest 
test("./logs"); - DB::Changelog changelog("./logs", 5); - changelog.readChangelogAndInitWriter(1); + DB::NuKeeperLogStore changelog("./logs", 5); + changelog.init(1); for (size_t i = 0; i < 3; ++i) { auto entry = getLogEntry("hello world", i * 10); - changelog.appendEntry(changelog.getNextEntryIndex(), entry); + changelog.append(entry); } EXPECT_EQ(changelog.size(), 3); @@ -523,15 +523,19 @@ TEST(CoordinationTest, ChangelogTestCompaction) changelog.compact(2); EXPECT_EQ(changelog.size(), 1); - EXPECT_EQ(changelog.getStartIndex(), 3); - EXPECT_EQ(changelog.getNextEntryIndex(), 4); - EXPECT_EQ(changelog.getLastEntry()->get_term(), 20); + EXPECT_EQ(changelog.start_index(), 3); + EXPECT_EQ(changelog.next_slot(), 4); + EXPECT_EQ(changelog.last_entry()->get_term(), 20); EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); - changelog.appendEntry(changelog.getNextEntryIndex(), getLogEntry("hello world", 30)); - changelog.appendEntry(changelog.getNextEntryIndex(), getLogEntry("hello world", 40)); - changelog.appendEntry(changelog.getNextEntryIndex(), getLogEntry("hello world", 50)); - changelog.appendEntry(changelog.getNextEntryIndex(), getLogEntry("hello world", 60)); + auto e1 = getLogEntry("hello world", 30); + changelog.append(e1); + auto e2 = getLogEntry("hello world", 40); + changelog.append(e2); + auto e3 = getLogEntry("hello world", 50); + changelog.append(e3); + auto e4 = getLogEntry("hello world", 60); + changelog.append(e4); EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); @@ -542,109 +546,110 @@ TEST(CoordinationTest, ChangelogTestCompaction) EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); EXPECT_EQ(changelog.size(), 1); - EXPECT_EQ(changelog.getStartIndex(), 7); - EXPECT_EQ(changelog.getNextEntryIndex(), 8); - EXPECT_EQ(changelog.getLastEntry()->get_term(), 60); + EXPECT_EQ(changelog.start_index(), 7); + EXPECT_EQ(changelog.next_slot(), 8); + EXPECT_EQ(changelog.last_entry()->get_term(), 60); /// And we able to read it - DB::Changelog changelog_reader("./logs", 5); - changelog_reader.readChangelogAndInitWriter(7); + DB::NuKeeperLogStore changelog_reader("./logs", 5); + changelog_reader.init(7); EXPECT_EQ(changelog_reader.size(), 1); - EXPECT_EQ(changelog_reader.getStartIndex(), 7); - EXPECT_EQ(changelog_reader.getNextEntryIndex(), 8); - EXPECT_EQ(changelog_reader.getLastEntry()->get_term(), 60); + EXPECT_EQ(changelog_reader.start_index(), 7); + EXPECT_EQ(changelog_reader.next_slot(), 8); + EXPECT_EQ(changelog_reader.last_entry()->get_term(), 60); } TEST(CoordinationTest, ChangelogTestBatchOperations) { ChangelogDirTest test("./logs"); - DB::Changelog changelog("./logs", 100); - changelog.readChangelogAndInitWriter(1); + DB::NuKeeperLogStore changelog("./logs", 100); + changelog.init(1); for (size_t i = 0; i < 10; ++i) { auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10); - changelog.appendEntry(changelog.getNextEntryIndex(), entry); + changelog.append(entry); } EXPECT_EQ(changelog.size(), 10); - auto entries = changelog.serializeEntriesToBuffer(1, 5); + auto entries = changelog.pack(1, 5); - DB::Changelog apply_changelog("./logs", 100); - apply_changelog.readChangelogAndInitWriter(1); + DB::NuKeeperLogStore apply_changelog("./logs", 100); + apply_changelog.init(1); for (size_t i = 0; i < 10; ++i) { - EXPECT_EQ(apply_changelog.entryAt(i + 1)->get_term(), i * 10); + EXPECT_EQ(apply_changelog.entry_at(i + 1)->get_term(), i * 10); } EXPECT_EQ(apply_changelog.size(), 10); - 
apply_changelog.applyEntriesFromBuffer(8, *entries); + apply_changelog.apply_pack(8, *entries); EXPECT_EQ(apply_changelog.size(), 12); - EXPECT_EQ(apply_changelog.getStartIndex(), 1); - EXPECT_EQ(apply_changelog.getNextEntryIndex(), 13); + EXPECT_EQ(apply_changelog.start_index(), 1); + EXPECT_EQ(apply_changelog.next_slot(), 13); for (size_t i = 0; i < 7; ++i) { - EXPECT_EQ(apply_changelog.entryAt(i + 1)->get_term(), i * 10); + EXPECT_EQ(apply_changelog.entry_at(i + 1)->get_term(), i * 10); } - EXPECT_EQ(apply_changelog.entryAt(8)->get_term(), 0); - EXPECT_EQ(apply_changelog.entryAt(9)->get_term(), 10); - EXPECT_EQ(apply_changelog.entryAt(10)->get_term(), 20); - EXPECT_EQ(apply_changelog.entryAt(11)->get_term(), 30); - EXPECT_EQ(apply_changelog.entryAt(12)->get_term(), 40); + EXPECT_EQ(apply_changelog.entry_at(8)->get_term(), 0); + EXPECT_EQ(apply_changelog.entry_at(9)->get_term(), 10); + EXPECT_EQ(apply_changelog.entry_at(10)->get_term(), 20); + EXPECT_EQ(apply_changelog.entry_at(11)->get_term(), 30); + EXPECT_EQ(apply_changelog.entry_at(12)->get_term(), 40); } TEST(CoordinationTest, ChangelogTestBatchOperationsEmpty) { ChangelogDirTest test("./logs"); - DB::Changelog changelog("./logs", 100); - changelog.readChangelogAndInitWriter(1); + DB::NuKeeperLogStore changelog("./logs", 100); + changelog.init(1); for (size_t i = 0; i < 10; ++i) { auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10); - changelog.appendEntry(changelog.getNextEntryIndex(), entry); + changelog.append(entry); } EXPECT_EQ(changelog.size(), 10); - auto entries = changelog.serializeEntriesToBuffer(5, 5); + auto entries = changelog.pack(5, 5); ChangelogDirTest test1("./logs1"); - DB::Changelog changelog_new("./logs1", 100); - changelog_new.readChangelogAndInitWriter(1); + DB::NuKeeperLogStore changelog_new("./logs1", 100); + changelog_new.init(1); EXPECT_EQ(changelog_new.size(), 0); - changelog_new.applyEntriesFromBuffer(5, *entries); + changelog_new.apply_pack(5, *entries); EXPECT_EQ(changelog_new.size(), 5); - EXPECT_EQ(changelog_new.getStartIndex(), 5); - EXPECT_EQ(changelog_new.getNextEntryIndex(), 10); + EXPECT_EQ(changelog_new.start_index(), 5); + EXPECT_EQ(changelog_new.next_slot(), 10); for (size_t i = 4; i < 9; ++i) - EXPECT_EQ(changelog_new.entryAt(i + 1)->get_term(), i * 10); + EXPECT_EQ(changelog_new.entry_at(i + 1)->get_term(), i * 10); - changelog_new.appendEntry(changelog_new.getNextEntryIndex(), getLogEntry("hello_world", 110)); + auto e = getLogEntry("hello_world", 110); + changelog_new.append(e); EXPECT_EQ(changelog_new.size(), 6); - EXPECT_EQ(changelog_new.getStartIndex(), 5); - EXPECT_EQ(changelog_new.getNextEntryIndex(), 11); + EXPECT_EQ(changelog_new.start_index(), 5); + EXPECT_EQ(changelog_new.next_slot(), 11); - DB::Changelog changelog_reader("./logs1", 100); - changelog_reader.readChangelogAndInitWriter(5); + DB::NuKeeperLogStore changelog_reader("./logs1", 100); + changelog_reader.init(5); } TEST(CoordinationTest, ChangelogTestWriteAtPreviousFile) { ChangelogDirTest test("./logs"); - DB::Changelog changelog("./logs", 5); - changelog.readChangelogAndInitWriter(1); + DB::NuKeeperLogStore changelog("./logs", 5); + changelog.init(1); for (size_t i = 0; i < 33; ++i) { auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10); - changelog.appendEntry(changelog.getNextEntryIndex(), entry); + changelog.append(entry); } EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); @@ -657,11 +662,12 @@ TEST(CoordinationTest, ChangelogTestWriteAtPreviousFile) EXPECT_EQ(changelog.size(), 
33); - changelog.writeAt(7, getLogEntry("helloworld", 5555)); + auto e1 = getLogEntry("helloworld", 5555); + changelog.write_at(7, e1); EXPECT_EQ(changelog.size(), 7); - EXPECT_EQ(changelog.getStartIndex(), 1); - EXPECT_EQ(changelog.getNextEntryIndex(), 8); - EXPECT_EQ(changelog.getLastEntry()->get_term(), 5555); + EXPECT_EQ(changelog.start_index(), 1); + EXPECT_EQ(changelog.next_slot(), 8); + EXPECT_EQ(changelog.last_entry()->get_term(), 5555); EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); @@ -672,24 +678,24 @@ TEST(CoordinationTest, ChangelogTestWriteAtPreviousFile) EXPECT_FALSE(fs::exists("./logs/changelog_26_30.bin")); EXPECT_FALSE(fs::exists("./logs/changelog_31_35.bin")); - DB::Changelog changelog_read("./logs", 5); - changelog_read.readChangelogAndInitWriter(1); + DB::NuKeeperLogStore changelog_read("./logs", 5); + changelog_read.init(1); EXPECT_EQ(changelog_read.size(), 7); - EXPECT_EQ(changelog_read.getStartIndex(), 1); - EXPECT_EQ(changelog_read.getNextEntryIndex(), 8); - EXPECT_EQ(changelog_read.getLastEntry()->get_term(), 5555); + EXPECT_EQ(changelog_read.start_index(), 1); + EXPECT_EQ(changelog_read.next_slot(), 8); + EXPECT_EQ(changelog_read.last_entry()->get_term(), 5555); } TEST(CoordinationTest, ChangelogTestWriteAtFileBorder) { ChangelogDirTest test("./logs"); - DB::Changelog changelog("./logs", 5); - changelog.readChangelogAndInitWriter(1); + DB::NuKeeperLogStore changelog("./logs", 5); + changelog.init(1); for (size_t i = 0; i < 33; ++i) { auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10); - changelog.appendEntry(changelog.getNextEntryIndex(), entry); + changelog.append(entry); } EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); @@ -702,11 +708,12 @@ TEST(CoordinationTest, ChangelogTestWriteAtFileBorder) EXPECT_EQ(changelog.size(), 33); - changelog.writeAt(11, getLogEntry("helloworld", 5555)); + auto e1 = getLogEntry("helloworld", 5555); + changelog.write_at(11, e1); EXPECT_EQ(changelog.size(), 11); - EXPECT_EQ(changelog.getStartIndex(), 1); - EXPECT_EQ(changelog.getNextEntryIndex(), 12); - EXPECT_EQ(changelog.getLastEntry()->get_term(), 5555); + EXPECT_EQ(changelog.start_index(), 1); + EXPECT_EQ(changelog.next_slot(), 12); + EXPECT_EQ(changelog.last_entry()->get_term(), 5555); EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); @@ -717,12 +724,90 @@ TEST(CoordinationTest, ChangelogTestWriteAtFileBorder) EXPECT_FALSE(fs::exists("./logs/changelog_26_30.bin")); EXPECT_FALSE(fs::exists("./logs/changelog_31_35.bin")); - DB::Changelog changelog_read("./logs", 5); - changelog_read.readChangelogAndInitWriter(1); + DB::NuKeeperLogStore changelog_read("./logs", 5); + changelog_read.init(1); EXPECT_EQ(changelog_read.size(), 11); - EXPECT_EQ(changelog_read.getStartIndex(), 1); - EXPECT_EQ(changelog_read.getNextEntryIndex(), 12); - EXPECT_EQ(changelog_read.getLastEntry()->get_term(), 5555); + EXPECT_EQ(changelog_read.start_index(), 1); + EXPECT_EQ(changelog_read.next_slot(), 12); + EXPECT_EQ(changelog_read.last_entry()->get_term(), 5555); +} + +TEST(CoordinationTest, ChangelogTestWriteAtAllFiles) +{ + ChangelogDirTest test("./logs"); + DB::NuKeeperLogStore changelog("./logs", 5); + changelog.init(1); + + for (size_t i = 0; i < 33; ++i) + { + auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10); + changelog.append(entry); + } + + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + 
EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_11_15.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_16_20.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_21_25.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_26_30.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_31_35.bin")); + + EXPECT_EQ(changelog.size(), 33); + + auto e1 = getLogEntry("helloworld", 5555); + changelog.write_at(1, e1); + EXPECT_EQ(changelog.size(), 1); + EXPECT_EQ(changelog.start_index(), 1); + EXPECT_EQ(changelog.next_slot(), 2); + EXPECT_EQ(changelog.last_entry()->get_term(), 5555); + + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + + EXPECT_FALSE(fs::exists("./logs/changelog_6_10.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_11_15.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_16_20.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_21_25.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_26_30.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_31_35.bin")); +} + +TEST(CoordinationTest, ChangelogTestStartNewLogAfterRead) +{ + ChangelogDirTest test("./logs"); + DB::NuKeeperLogStore changelog("./logs", 5); + changelog.init(1); + + for (size_t i = 0; i < 35; ++i) + { + auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10); + changelog.append(entry); + } + EXPECT_EQ(changelog.size(), 35); + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_11_15.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_16_20.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_21_25.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_26_30.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_31_35.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_36_40.bin")); + + + DB::NuKeeperLogStore changelog_reader("./logs", 5); + changelog_reader.init(1); + + auto entry = getLogEntry("36_hello_world", 360); + changelog_reader.append(entry); + + EXPECT_EQ(changelog_reader.size(), 36); + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_11_15.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_16_20.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_21_25.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_26_30.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_31_35.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_36_40.bin")); } #endif From 5f88f5817f4a348051e7aeaa93b8bdb589b8805a Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Wed, 17 Feb 2021 11:23:24 +0300 Subject: [PATCH 0362/2357] Rename untyped function reinterpretAs into reinterpret --- src/Functions/reinterpretAs.cpp | 50 +++++++++---------- .../01676_reinterpret_as.reference | 6 +-- .../0_stateless/01676_reinterpret_as.sql | 42 ++++++++-------- 3 files changed, 49 insertions(+), 49 deletions(-) diff --git a/src/Functions/reinterpretAs.cpp b/src/Functions/reinterpretAs.cpp index 363455cb38f..1d105f4ce38 100644 --- a/src/Functions/reinterpretAs.cpp +++ b/src/Functions/reinterpretAs.cpp @@ -39,12 +39,12 @@ namespace * 3. Types that can be interpreted as numeric (Integers, Float, Date, DateTime, UUID) into FixedString, * String, and types that can be interpreted as numeric (Integers, Float, Date, DateTime, UUID). 
*/ -class FunctionReinterpretAs : public IFunction +class FunctionReinterpret : public IFunction { public: - static constexpr auto name = "reinterpretAs"; + static constexpr auto name = "reinterpret"; - static FunctionPtr create(const Context &) { return std::make_shared(); } + static FunctionPtr create(const Context &) { return std::make_shared(); } String getName() const override { return name; } @@ -308,11 +308,11 @@ private: }; template -class FunctionReinterpretAsTyped : public IFunction +class FunctionReinterpretAs : public IFunction { public: static constexpr auto name = Name::name; - static FunctionPtr create(const Context &) { return std::make_shared(); } + static FunctionPtr create(const Context &) { return std::make_shared(); } String getName() const override { return name; } @@ -365,7 +365,7 @@ public: return impl.executeImpl(arguments_with_type, return_type, input_rows_count); } - FunctionReinterpretAs impl; + FunctionReinterpret impl; }; struct NameReinterpretAsUInt8 { static constexpr auto name = "reinterpretAsUInt8"; }; @@ -387,26 +387,26 @@ struct NameReinterpretAsUUID { static constexpr auto name = "reinterpretA struct NameReinterpretAsString { static constexpr auto name = "reinterpretAsString"; }; struct NameReinterpretAsFixedString { static constexpr auto name = "reinterpretAsFixedString"; }; -using FunctionReinterpretAsUInt8 = FunctionReinterpretAsTyped; -using FunctionReinterpretAsUInt16 = FunctionReinterpretAsTyped; -using FunctionReinterpretAsUInt32 = FunctionReinterpretAsTyped; -using FunctionReinterpretAsUInt64 = FunctionReinterpretAsTyped; -using FunctionReinterpretAsUInt256 = FunctionReinterpretAsTyped; -using FunctionReinterpretAsInt8 = FunctionReinterpretAsTyped; -using FunctionReinterpretAsInt16 = FunctionReinterpretAsTyped; -using FunctionReinterpretAsInt32 = FunctionReinterpretAsTyped; -using FunctionReinterpretAsInt64 = FunctionReinterpretAsTyped; -using FunctionReinterpretAsInt128 = FunctionReinterpretAsTyped; -using FunctionReinterpretAsInt256 = FunctionReinterpretAsTyped; -using FunctionReinterpretAsFloat32 = FunctionReinterpretAsTyped; -using FunctionReinterpretAsFloat64 = FunctionReinterpretAsTyped; -using FunctionReinterpretAsDate = FunctionReinterpretAsTyped; -using FunctionReinterpretAsDateTime = FunctionReinterpretAsTyped; -using FunctionReinterpretAsUUID = FunctionReinterpretAsTyped; +using FunctionReinterpretAsUInt8 = FunctionReinterpretAs; +using FunctionReinterpretAsUInt16 = FunctionReinterpretAs; +using FunctionReinterpretAsUInt32 = FunctionReinterpretAs; +using FunctionReinterpretAsUInt64 = FunctionReinterpretAs; +using FunctionReinterpretAsUInt256 = FunctionReinterpretAs; +using FunctionReinterpretAsInt8 = FunctionReinterpretAs; +using FunctionReinterpretAsInt16 = FunctionReinterpretAs; +using FunctionReinterpretAsInt32 = FunctionReinterpretAs; +using FunctionReinterpretAsInt64 = FunctionReinterpretAs; +using FunctionReinterpretAsInt128 = FunctionReinterpretAs; +using FunctionReinterpretAsInt256 = FunctionReinterpretAs; +using FunctionReinterpretAsFloat32 = FunctionReinterpretAs; +using FunctionReinterpretAsFloat64 = FunctionReinterpretAs; +using FunctionReinterpretAsDate = FunctionReinterpretAs; +using FunctionReinterpretAsDateTime = FunctionReinterpretAs; +using FunctionReinterpretAsUUID = FunctionReinterpretAs; -using FunctionReinterpretAsString = FunctionReinterpretAsTyped; +using FunctionReinterpretAsString = FunctionReinterpretAs; -using FunctionReinterpretAsFixedString = FunctionReinterpretAsTyped; +using 
FunctionReinterpretAsFixedString = FunctionReinterpretAs; } @@ -433,7 +433,7 @@ void registerFunctionsReinterpretAs(FunctionFactory & factory) factory.registerFunction(); - factory.registerFunction(); + factory.registerFunction(); } } diff --git a/tests/queries/0_stateless/01676_reinterpret_as.reference b/tests/queries/0_stateless/01676_reinterpret_as.reference index bbde2d5ed57..f7ca2bbedfa 100644 --- a/tests/queries/0_stateless/01676_reinterpret_as.reference +++ b/tests/queries/0_stateless/01676_reinterpret_as.reference @@ -25,6 +25,6 @@ Integer and Float types 0.2 1045220557 0.2 4596373779694328218 Integer and String types -1 49 -1 49 -11 12593 +1 1 49 +1 1 49 +11 11 12593 diff --git a/tests/queries/0_stateless/01676_reinterpret_as.sql b/tests/queries/0_stateless/01676_reinterpret_as.sql index 88dc6437043..cc5dba1e110 100644 --- a/tests/queries/0_stateless/01676_reinterpret_as.sql +++ b/tests/queries/0_stateless/01676_reinterpret_as.sql @@ -1,30 +1,30 @@ SELECT 'Into String'; -SELECT reinterpretAs(49, 'String'); +SELECT reinterpret(49, 'String'); SELECT 'Into FixedString'; -SELECT reinterpretAs(49, 'FixedString(1)'); -SELECT reinterpretAs(49, 'FixedString(2)'); -SELECT reinterpretAs(49, 'FixedString(3)'); -SELECT reinterpretAs(49, 'FixedString(4)'); +SELECT reinterpret(49, 'FixedString(1)'); +SELECT reinterpret(49, 'FixedString(2)'); +SELECT reinterpret(49, 'FixedString(3)'); +SELECT reinterpret(49, 'FixedString(4)'); SELECT reinterpretAsFixedString(49); SELECT 'Into Numeric Representable'; SELECT 'Integer and Integer types'; -SELECT reinterpretAs(257, 'UInt8'), reinterpretAsUInt8(257); -SELECT reinterpretAs(257, 'Int8'), reinterpretAsInt8(257); -SELECT reinterpretAs(257, 'UInt16'), reinterpretAsUInt16(257); -SELECT reinterpretAs(257, 'Int16'), reinterpretAsInt16(257); -SELECT reinterpretAs(257, 'UInt32'), reinterpretAsUInt32(257); -SELECT reinterpretAs(257, 'Int32'), reinterpretAsInt32(257); -SELECT reinterpretAs(257, 'UInt64'), reinterpretAsUInt64(257); -SELECT reinterpretAs(257, 'Int64'), reinterpretAsInt64(257); -SELECT reinterpretAs(257, 'Int128'), reinterpretAsInt128(257); -SELECT reinterpretAs(257, 'UInt256'), reinterpretAsUInt256(257); -SELECT reinterpretAs(257, 'Int256'), reinterpretAsInt256(257); +SELECT reinterpret(257, 'UInt8'), reinterpretAsUInt8(257); +SELECT reinterpret(257, 'Int8'), reinterpretAsInt8(257); +SELECT reinterpret(257, 'UInt16'), reinterpretAsUInt16(257); +SELECT reinterpret(257, 'Int16'), reinterpretAsInt16(257); +SELECT reinterpret(257, 'UInt32'), reinterpretAsUInt32(257); +SELECT reinterpret(257, 'Int32'), reinterpretAsInt32(257); +SELECT reinterpret(257, 'UInt64'), reinterpretAsUInt64(257); +SELECT reinterpret(257, 'Int64'), reinterpretAsInt64(257); +SELECT reinterpret(257, 'Int128'), reinterpretAsInt128(257); +SELECT reinterpret(257, 'UInt256'), reinterpretAsUInt256(257); +SELECT reinterpret(257, 'Int256'), reinterpretAsInt256(257); SELECT 'Integer and Float types'; -SELECT reinterpretAs(toFloat32(0.2), 'UInt32'), reinterpretAsUInt32(toFloat32(0.2)); -SELECT reinterpretAs(toFloat64(0.2), 'UInt64'), reinterpretAsUInt64(toFloat64(0.2)); +SELECT reinterpret(toFloat32(0.2), 'UInt32'), reinterpretAsUInt32(toFloat32(0.2)); +SELECT reinterpret(toFloat64(0.2), 'UInt64'), reinterpretAsUInt64(toFloat64(0.2)); SELECT reinterpretAsFloat32(a), reinterpretAsUInt32(toFloat32(0.2)) as a; SELECT reinterpretAsFloat64(a), reinterpretAsUInt64(toFloat64(0.2)) as a; SELECT 'Integer and String types'; -SELECT reinterpretAsString(a), reinterpretAsUInt8('1') as a; -SELECT 
reinterpretAsString(a), reinterpretAsUInt8('11') as a; -SELECT reinterpretAsString(a), reinterpretAsUInt16('11') as a; +SELECT reinterpret(a, 'String'), reinterpretAsString(a), reinterpretAsUInt8('1') as a; +SELECT reinterpret(a, 'String'), reinterpretAsString(a), reinterpretAsUInt8('11') as a; +SELECT reinterpret(a, 'String'), reinterpretAsString(a), reinterpretAsUInt16('11') as a; From e52cc1ac1fe7b3c937cc16d75dbcf623fca86c2c Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Wed, 17 Feb 2021 11:31:20 +0300 Subject: [PATCH 0363/2357] Updated documentation --- .../functions/type-conversion-functions.md | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 3ca36f41c78..6bc274eba73 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -303,7 +303,7 @@ SELECT toFixedString('foo\0bar', 8) AS s, toStringCutToZero(s) AS s_cut └────────────┴───────┘ ``` -## reinterpretAs(x, T) {#type_conversion_function-cast} +## reinterpret(x, T) {#type_conversion_function-reinterpret} Performs byte reinterpretation of ‘x’ as ‘t’ data type. @@ -313,9 +313,9 @@ Following reinterpretations are allowed: 3. FixedString, String, types that can be interpreted as numeric (Integers, Float, Date, DateTime, UUID) into types that can be interpreted as numeric (Integers, Float, Date, DateTime, UUID) into FixedString, ``` sql -SELECT reinterpretAs(toInt8(-1), 'UInt8') as int_to_uint, - reinterpretAs(toInt8(1), 'Float32') as int_to_float, - reinterpretAs('1', 'UInt32') as string_to_int; +SELECT reinterpret(toInt8(-1), 'UInt8') as int_to_uint, + reinterpret(toInt8(1), 'Float32') as int_to_float, + reinterpret('1', 'UInt32') as string_to_int; ``` ``` text @@ -324,23 +324,23 @@ SELECT reinterpretAs(toInt8(-1), 'UInt8') as int_to_uint, └─────────────┴──────────────┴───────────────┘ ``` -## reinterpretAsUInt(8\|16\|32\|64\|256) {#reinterpretasuint8163264256} +## reinterpretAsUInt(8\|16\|32\|64\|256) {#type_conversion_function-reinterpretAsUInt8163264256} -## reinterpretAsInt(8\|16\|32\|64\|128\|256) {#reinterpretasint8163264128256} +## reinterpretAsInt(8\|16\|32\|64\|128\|256) {#type_conversion_function-reinterpretAsInt8163264128256} -## reinterpretAsFloat(32\|64) {#reinterpretasfloat3264} +## reinterpretAsFloat(32\|64) {##type_conversion_function-reinterpretAsFloat} -## reinterpretAsDate {#reinterpretasdate} +## reinterpretAsDate {#type_conversion_function-reinterpretAsDate} -## reinterpretAsDateTime {#reinterpretasdatetime} +## reinterpretAsDateTime {#type_conversion_function-reinterpretAsDateTime} -## reinterpretAsString {#type_conversion_functions-reinterpretAsString} +## reinterpretAsString {#type_conversion_function-reinterpretAsString} -## reinterpretAsFixedString {#reinterpretasfixedstring} +## reinterpretAsFixedString {#type_conversion_function-reinterpretAsFixedString} -## reinterpretAsUUID {#reinterpretasuuid} +## reinterpretAsUUID {#type_conversion_function-reinterpretAsUUID} -These functions are aliases for `reinterpretAs`function. +These functions are aliases for `reinterpret` function. ## CAST(x, T) {#type_conversion_function-cast} @@ -401,7 +401,7 @@ bounds of type T. Example ``` sql -SELECT cast(-1, 'UInt8') as uint8; +SELECT cast(-1, 'UInt8') as uint8; ``` @@ -422,7 +422,7 @@ Code: 70. DB::Exception: Received from localhost:9000. 
DB::Exception: Value in c ## accurateCastOrNull(x, T) {#type_conversion_function-accurate-cast_or_null} -Converts ‘x’ to the ‘t’ data type. Always returns nullable type and returns NULL +Converts ‘x’ to the ‘t’ data type. Always returns nullable type and returns NULL if the casted value is not representable in the target type. Example: @@ -817,9 +817,9 @@ SELECT fromUnixTimestamp64Milli(i64, 'UTC') ## formatRow {#formatrow} -Converts arbitrary expressions into a string via given format. +Converts arbitrary expressions into a string via given format. -**Syntax** +**Syntax** ``` sql formatRow(format, x, y, ...) @@ -860,7 +860,7 @@ Result: Converts arbitrary expressions into a string via given format. The function trims the last `\n` if any. -**Syntax** +**Syntax** ``` sql formatRowNoNewline(format, x, y, ...) From acf843a01a9ff7677188dfabbebd4a861a2a7d5a Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 17 Feb 2021 12:00:12 +0300 Subject: [PATCH 0364/2357] Slightly more optimal --- src/Coordination/Changelog.cpp | 88 ++++++++++++---------- src/Coordination/Changelog.h | 11 ++- src/Coordination/tests/gtest_for_build.cpp | 57 ++++++++++++++ 3 files changed, 116 insertions(+), 40 deletions(-) diff --git a/src/Coordination/Changelog.cpp b/src/Coordination/Changelog.cpp index 4f095974836..9e1ed557430 100644 --- a/src/Coordination/Changelog.cpp +++ b/src/Coordination/Changelog.cpp @@ -39,21 +39,15 @@ namespace static constexpr auto DEFAULT_PREFIX = "changelog"; -struct ChangelogName -{ - std::string prefix; - size_t from_log_idx; - size_t to_log_idx; -}; - -std::string formatChangelogPath(const std::string & prefix, const ChangelogName & name) +std::string formatChangelogPath(const std::string & prefix, const ChangelogFileDescription & name) { std::filesystem::path path(prefix); path /= std::filesystem::path(name.prefix + "_" + std::to_string(name.from_log_idx) + "_" + std::to_string(name.to_log_idx) + ".bin"); return path; } -ChangelogName getChangelogName(const std::string & path_str) + +ChangelogFileDescription getChangelogFileDescription(const std::string & path_str) { std::filesystem::path path(path_str); std::string filename = path.stem(); @@ -62,10 +56,11 @@ ChangelogName getChangelogName(const std::string & path_str) if (filename_parts.size() < 3) throw Exception(ErrorCodes::CORRUPTED_DATA, "Invalid changelog {}", path_str); - ChangelogName result; + ChangelogFileDescription result; result.prefix = filename_parts[0]; result.from_log_idx = parse(filename_parts[1]); result.to_log_idx = parse(filename_parts[2]); + result.path = path_str; return result; } @@ -204,8 +199,8 @@ Changelog::Changelog(const std::string & changelogs_dir_, size_t rotate_interval for (const auto & p : fs::directory_iterator(changelogs_dir)) { - auto name = getChangelogName(p.path()); - existing_changelogs[name.from_log_idx] = p.path(); + auto file_description = getChangelogFileDescription(p.path()); + existing_changelogs[file_description.from_log_idx] = file_description; } } @@ -214,22 +209,40 @@ void Changelog::readChangelogAndInitWriter(size_t from_log_idx) size_t read_from_last = 0; start_index = from_log_idx == 0 ? 
1 : from_log_idx; size_t total_read = 0; - for (const auto & [start_id, changelog_file] : existing_changelogs) + size_t entries_in_last = 0; + size_t incomplete_log_idx = 0; + for (const auto & [start_idx, changelog_description] : existing_changelogs) { - ChangelogName parsed_name = getChangelogName(changelog_file); - if (parsed_name.to_log_idx >= from_log_idx) + entries_in_last = changelog_description.to_log_idx - changelog_description.from_log_idx + 1; + + if (changelog_description.to_log_idx >= from_log_idx) { - ChangelogReader reader(changelog_file); + ChangelogReader reader(changelog_description.path); read_from_last = reader.readChangelog(logs, from_log_idx, index_to_start_pos); total_read += read_from_last; + + /// May happen after truncate and crash + if (read_from_last < entries_in_last) + { + incomplete_log_idx = start_idx; + break; + } } } - if (existing_changelogs.size() > 0 && read_from_last < rotate_interval) + if (incomplete_log_idx != 0) { - auto str_name = existing_changelogs.rbegin()->second; - auto parsed_name = getChangelogName(str_name); - current_writer = std::make_unique(str_name, WriteMode::Append, parsed_name.from_log_idx); + for (auto itr = existing_changelogs.upper_bound(incomplete_log_idx); itr != existing_changelogs.end();) + { + std::filesystem::remove(itr->second.path); + itr = existing_changelogs.erase(itr); + } + } + + if (existing_changelogs.size() > 0 && read_from_last < entries_in_last) + { + auto description = existing_changelogs.rbegin()->second; + current_writer = std::make_unique(description.path, WriteMode::Append, description.from_log_idx); current_writer->setEntriesWritten(read_from_last); } else @@ -243,14 +256,14 @@ void Changelog::rotate(size_t new_start_log_idx) if (current_writer) current_writer->flush(); - ChangelogName new_name; - new_name.prefix = DEFAULT_PREFIX; - new_name.from_log_idx = new_start_log_idx; - new_name.to_log_idx = new_start_log_idx + rotate_interval - 1; + ChangelogFileDescription new_description; + new_description.prefix = DEFAULT_PREFIX; + new_description.from_log_idx = new_start_log_idx; + new_description.to_log_idx = new_start_log_idx + rotate_interval - 1; - auto new_log_path = formatChangelogPath(changelogs_dir, new_name); - existing_changelogs[new_start_log_idx] = new_log_path; - current_writer = std::make_unique(new_log_path, WriteMode::Rewrite, new_start_log_idx); + new_description.path = formatChangelogPath(changelogs_dir, new_description); + existing_changelogs[new_start_log_idx] = new_description; + current_writer = std::make_unique(new_description.path, WriteMode::Rewrite, new_start_log_idx); } ChangelogRecord Changelog::buildRecord(size_t index, nuraft::ptr log_entry) const @@ -301,15 +314,14 @@ void Changelog::writeAt(size_t index, nuraft::ptr log_entry) if (need_rollback) { auto index_changelog = existing_changelogs.lower_bound(index); - std::string fname; + ChangelogFileDescription description; if (index_changelog->first == index) - fname = index_changelog->second; + description = index_changelog->second; else - fname = std::prev(index_changelog)->second; + description = std::prev(index_changelog)->second; - current_writer = std::make_unique(fname, WriteMode::Append, index_changelog->first); - auto formated_name = getChangelogName(fname); - current_writer->setEntriesWritten(formated_name.to_log_idx - formated_name.from_log_idx + 1); + current_writer = std::make_unique(description.path, WriteMode::Append, index_changelog->first); + current_writer->setEntriesWritten(description.to_log_idx - 
description.from_log_idx + 1); } auto entries_written = current_writer->getEntriesWritten(); @@ -320,7 +332,7 @@ void Changelog::writeAt(size_t index, nuraft::ptr log_entry) auto to_remove_itr = existing_changelogs.upper_bound(index); for (auto itr = to_remove_itr; itr != existing_changelogs.end();) { - std::filesystem::remove(itr->second); + std::filesystem::remove(itr->second.path); itr = existing_changelogs.erase(itr); } } @@ -342,17 +354,16 @@ void Changelog::compact(size_t up_to_log_idx) { for (auto itr = existing_changelogs.begin(); itr != existing_changelogs.end();) { - ChangelogName parsed_name = getChangelogName(itr->second); - if (parsed_name.to_log_idx <= up_to_log_idx) + if (itr->second.to_log_idx <= up_to_log_idx) { - for (size_t idx = parsed_name.from_log_idx; idx <= parsed_name.to_log_idx; ++idx) + for (size_t idx = itr->second.from_log_idx; idx <= itr->second.to_log_idx; ++idx) { auto index_pos = index_to_start_pos.find(idx); if (index_pos == index_to_start_pos.end()) break; index_to_start_pos.erase(index_pos); } - std::filesystem::remove(itr->second); + std::filesystem::remove(itr->second.path); itr = existing_changelogs.erase(itr); } else @@ -366,7 +377,6 @@ void Changelog::compact(size_t up_to_log_idx) LogEntryPtr Changelog::getLastEntry() const { - static LogEntryPtr fake_entry = nuraft::cs_new(0, nuraft::buffer::alloc(sizeof(size_t))); size_t next_idx = getNextEntryIndex() - 1; diff --git a/src/Coordination/Changelog.h b/src/Coordination/Changelog.h index 7c352e7a91b..e154c1c70c6 100644 --- a/src/Coordination/Changelog.h +++ b/src/Coordination/Changelog.h @@ -45,6 +45,15 @@ struct ChangelogRecord nuraft::ptr blob; }; +struct ChangelogFileDescription +{ + std::string prefix; + size_t from_log_idx; + size_t to_log_idx; + + std::string path; +}; + class ChangelogWriter; class Changelog @@ -98,7 +107,7 @@ private: private: std::string changelogs_dir; - std::map existing_changelogs; + std::map existing_changelogs; std::unique_ptr current_writer; IndexToOffset index_to_start_pos; const size_t rotate_interval; diff --git a/src/Coordination/tests/gtest_for_build.cpp b/src/Coordination/tests/gtest_for_build.cpp index 8328d93d9cf..76dd08a6d33 100644 --- a/src/Coordination/tests/gtest_for_build.cpp +++ b/src/Coordination/tests/gtest_for_build.cpp @@ -36,7 +36,9 @@ struct ChangelogDirTest , drop(drop_) { if (fs::exists(path)) + { EXPECT_TRUE(false) << "Path " << path << " already exists, remove it to run test"; + } fs::create_directory(path); } @@ -810,4 +812,59 @@ TEST(CoordinationTest, ChangelogTestStartNewLogAfterRead) EXPECT_TRUE(fs::exists("./logs/changelog_36_40.bin")); } + +TEST(CoordinationTest, ChangelogTestReadAfterBrokenTruncate) +{ + ChangelogDirTest test("./logs"); + + DB::NuKeeperLogStore changelog("./logs", 5); + changelog.init(1); + + for (size_t i = 0; i < 35; ++i) + { + auto entry = getLogEntry(std::to_string(i) + "_hello_world", i * 10); + changelog.append(entry); + } + EXPECT_EQ(changelog.size(), 35); + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_11_15.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_16_20.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_21_25.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_26_30.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_31_35.bin")); + + DB::WriteBufferFromFile plain_buf("./logs/changelog_11_15.bin", DBMS_DEFAULT_BUFFER_SIZE, O_APPEND | O_CREAT | O_WRONLY); + plain_buf.truncate(0); + + 
DB::NuKeeperLogStore changelog_reader("./logs", 5); + changelog_reader.init(1); + + EXPECT_EQ(changelog_reader.size(), 10); + EXPECT_EQ(changelog_reader.last_entry()->get_term(), 90); + + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_11_15.bin")); + + EXPECT_FALSE(fs::exists("./logs/changelog_16_20.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_21_25.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_26_30.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_31_35.bin")); + + auto entry = getLogEntry("h", 7777); + changelog_reader.append(entry); + EXPECT_EQ(changelog_reader.size(), 11); + EXPECT_EQ(changelog_reader.last_entry()->get_term(), 7777); + + EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_11_15.bin")); + + EXPECT_FALSE(fs::exists("./logs/changelog_16_20.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_21_25.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_26_30.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_31_35.bin")); +} + #endif From e5cef576e589f4307f35074cf45e8dbb08801c65 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Wed, 17 Feb 2021 12:39:40 +0300 Subject: [PATCH 0365/2357] Update subqueries.xml --- tests/performance/subqueries.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/performance/subqueries.xml b/tests/performance/subqueries.xml index f1481a78c7e..0d41099841b 100644 --- a/tests/performance/subqueries.xml +++ b/tests/performance/subqueries.xml @@ -1,7 +1,7 @@ - create table tab (a UInt32, b UInt32) engine = MergeTree order by (a, b) + create table tab (a UInt32, b UInt32) engine = MergeTree order by (a, b) insert into tab values (1, 1) select a, b from tab where (a, b) in (select toUInt32(number) as x, toUInt32(sleep(0.1) + 1) from numbers_mt(16)) settings max_threads = 2, max_block_size = 4 select a, b from tab where (1, 1) = (select min(toUInt32(number + 1)) as x, min(toUInt32(sleep(0.1) + 1)) from numbers_mt(16)) settings max_threads = 2, max_block_size = 4 DROP TABLE tab - \ No newline at end of file + From 527210b5e48af7d65fa726c49d4062cbf730f697 Mon Sep 17 00:00:00 2001 From: vdimir Date: Wed, 17 Feb 2021 12:44:53 +0300 Subject: [PATCH 0366/2357] Support old cross to inner join rewrite behaviour --- src/Core/Settings.h | 2 +- src/Interpreters/CrossToInnerJoinVisitor.cpp | 73 +++++++++++++------- src/Interpreters/CrossToInnerJoinVisitor.h | 2 +- 3 files changed, 51 insertions(+), 26 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 2ddd1e003ca..d533223852a 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -503,7 +503,7 @@ class IColumn; M(Bool, output_format_write_statistics, true, "Write statistics about read rows, bytes, time elapsed in suitable output formats.", 0) \ M(Bool, output_format_pretty_row_numbers, false, "Add row numbers before each row for pretty output format", 0) \ M(Bool, insert_distributed_one_random_shard, false, "If setting is enabled, inserting into distributed table will choose a random shard to write when there is no sharding key", 0) \ - M(Bool, cross_to_inner_join_rewrite, true, "Use inner join instead of comma/cross join if possible", 0) \ + M(UInt64, cross_to_inner_join_rewrite, 1, "Use inner join instead of comma/cross join if possible. 
Possible values: 0 - no rewrite, 1 - conservative mode, move only simple expressions to ON section, 2 - optimistic mode, move as much as possible", 0) \ // End of FORMAT_FACTORY_SETTINGS diff --git a/src/Interpreters/CrossToInnerJoinVisitor.cpp b/src/Interpreters/CrossToInnerJoinVisitor.cpp index b1e42b23ad5..3f3e9adc605 100644 --- a/src/Interpreters/CrossToInnerJoinVisitor.cpp +++ b/src/Interpreters/CrossToInnerJoinVisitor.cpp @@ -29,6 +29,8 @@ namespace ErrorCodes namespace { +using TablesWithColumnNamesAndTypes = std::vector; + struct JoinedElement { explicit JoinedElement(const ASTTablesInSelectQueryElement & table_element) @@ -124,27 +126,21 @@ void collectConjunctions(const ASTPtr & node, std::vector & members) members.push_back(node); } -std::optional getIdentMembership(const ASTIdentifier & ident, const std::vector & tables) -{ - std::optional table_pos = IdentifierSemantic::getMembership(ident); - if (table_pos) - return table_pos; - return IdentifierSemantic::chooseTableColumnMatch(ident, tables); -} - -std::optional getIdentsMembership(const ASTPtr ast, - const std::vector & tables, +std::optional getIdentsMembership(const std::vector idents, + const TablesWithColumnNamesAndTypes & tables, const Aliases & aliases) { - auto idents = IdentifiersCollector::collect(ast); - std::optional result; for (const auto * ident : idents) { /// Moving expressions that use column aliases is not supported. if (ident->isShort() && aliases.count(ident->shortName())) return {}; - const auto pos = getIdentMembership(*ident, tables); + + std::optional pos = IdentifierSemantic::getMembership(*ident); + if (!pos) + pos = IdentifierSemantic::chooseTableColumnMatch(*ident, tables); + if (!pos) return {}; if (result && *pos != *result) @@ -154,6 +150,33 @@ std::optional getIdentsMembership(const ASTPtr ast, return result; } +std::optional> getArgumentsMembership( + const ASTPtr & left, const ASTPtr & right, const TablesWithColumnNamesAndTypes & tables, const Aliases & aliases, bool recursive) +{ + std::optional left_table_pos, right_table_pos; + if (recursive) + { + /// Collect all nested identifies + left_table_pos = getIdentsMembership(IdentifiersCollector::collect(left), tables, aliases); + right_table_pos = getIdentsMembership(IdentifiersCollector::collect(right), tables, aliases); + } + else + { + /// Use identifier only if it's on the top level + const auto * left_ident = left->as(); + const auto * right_ident = right->as(); + if (left_ident && right_ident) + { + left_table_pos = getIdentsMembership({left_ident}, tables, aliases); + right_table_pos = getIdentsMembership({right_ident}, tables, aliases); + } + } + + if (left_table_pos && right_table_pos) + return std::make_pair(*left_table_pos, *right_table_pos); + return {}; +} + bool isAllowedToRewriteCrossJoin(const ASTPtr & node, const Aliases & aliases) { if (node->as()) @@ -173,6 +196,7 @@ bool canMoveExpressionToJoinOn(const ASTPtr & ast, const std::vector & joined_tables, const std::vector & tables, const Aliases & aliases, + int rewrite_mode, std::map> & asts_to_join_on) { std::vector conjuncts; @@ -184,17 +208,18 @@ bool canMoveExpressionToJoinOn(const ASTPtr & ast, if (!func->arguments || func->arguments->children.size() != 2) return false; + bool optimistic_rewrite = rewrite_mode >= 2; + auto table_pos = getArgumentsMembership(func->arguments->children[0], func->arguments->children[1], + tables, aliases, optimistic_rewrite); + /// Check if the identifiers are from different joined tables. 
/// If it's a self joint, tables should have aliases. - auto left_table_pos = getIdentsMembership(func->arguments->children[0], tables, aliases); - auto right_table_pos = getIdentsMembership(func->arguments->children[1], tables, aliases); - - /// Identifiers from different table move to JOIN ON - if (left_table_pos && right_table_pos && *left_table_pos != *right_table_pos) + if (table_pos && table_pos->first != table_pos->second) { - size_t table_pos = std::max(*left_table_pos, *right_table_pos); - if (joined_tables[table_pos].canAttachOnExpression()) - asts_to_join_on[table_pos].push_back(node); + /// Identifiers from different table move to JOIN ON + size_t max_table_pos = std::max(table_pos->first, table_pos->second); + if (joined_tables[max_table_pos].canAttachOnExpression()) + asts_to_join_on[max_table_pos].push_back(node); else return false; } @@ -326,11 +351,11 @@ void CrossToInnerJoinMatcher::visit(ASTSelectQuery & select, ASTPtr &, Data & da /// CROSS to INNER - if (select.where() && data.cross_to_inner_join_rewrite) + if (select.where() && data.cross_to_inner_join_rewrite > 0) { std::map> asts_to_join_on; - bool can_move_where - = canMoveExpressionToJoinOn(select.where(), joined_tables, data.tables_with_columns, data.aliases, asts_to_join_on); + bool can_move_where = canMoveExpressionToJoinOn( + select.where(), joined_tables, data.tables_with_columns, data.aliases, data.cross_to_inner_join_rewrite, asts_to_join_on); if (can_move_where) { for (size_t i = 1; i < joined_tables.size(); ++i) diff --git a/src/Interpreters/CrossToInnerJoinVisitor.h b/src/Interpreters/CrossToInnerJoinVisitor.h index 885cf8162c1..db9dd7ba79b 100644 --- a/src/Interpreters/CrossToInnerJoinVisitor.h +++ b/src/Interpreters/CrossToInnerJoinVisitor.h @@ -19,7 +19,7 @@ public: const Aliases & aliases; const String current_database; bool done = false; - bool cross_to_inner_join_rewrite = true; + int cross_to_inner_join_rewrite = true; }; static bool needChildVisit(ASTPtr &, const ASTPtr &); From c608fa1e6a3539f74e8956e441e4f68b99367982 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Wed, 17 Feb 2021 12:53:12 +0300 Subject: [PATCH 0367/2357] Added error reinterpretation tests --- src/Functions/reinterpretAs.cpp | 4 ++++ tests/queries/0_stateless/01676_reinterpret_as.reference | 1 + tests/queries/0_stateless/01676_reinterpret_as.sql | 4 ++++ 3 files changed, 9 insertions(+) diff --git a/src/Functions/reinterpretAs.cpp b/src/Functions/reinterpretAs.cpp index 1d105f4ce38..c15ba969fdb 100644 --- a/src/Functions/reinterpretAs.cpp +++ b/src/Functions/reinterpretAs.cpp @@ -93,6 +93,10 @@ public: + " because only Numeric, String or FixedString can be reinterpreted in Numeric", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); } + else + throw Exception("Cannot reinterpret " + from_type->getName() + " as " + to_type->getName() + + " because only reinterpretation in String, FixedString and Numeric types is supported", + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); return to_type; } diff --git a/tests/queries/0_stateless/01676_reinterpret_as.reference b/tests/queries/0_stateless/01676_reinterpret_as.reference index f7ca2bbedfa..b39deb55a7f 100644 --- a/tests/queries/0_stateless/01676_reinterpret_as.reference +++ b/tests/queries/0_stateless/01676_reinterpret_as.reference @@ -28,3 +28,4 @@ Integer and String types 1 1 49 1 1 49 11 11 12593 +ReinterpretErrors diff --git a/tests/queries/0_stateless/01676_reinterpret_as.sql b/tests/queries/0_stateless/01676_reinterpret_as.sql index cc5dba1e110..ff727f284bb 100644 --- 
a/tests/queries/0_stateless/01676_reinterpret_as.sql +++ b/tests/queries/0_stateless/01676_reinterpret_as.sql @@ -28,3 +28,7 @@ SELECT 'Integer and String types'; SELECT reinterpret(a, 'String'), reinterpretAsString(a), reinterpretAsUInt8('1') as a; SELECT reinterpret(a, 'String'), reinterpretAsString(a), reinterpretAsUInt8('11') as a; SELECT reinterpret(a, 'String'), reinterpretAsString(a), reinterpretAsUInt16('11') as a; +SELECT 'ReinterpretErrors'; +SELECT reinterpret(toDecimal64(1, 2), 'UInt8'); -- {serverError 43} +SELECT reinterpret('123', 'FixedString(1)'); -- {serverError 43} +SELECT reinterpret(toDateTime('9922337203.6854775808', 1), 'Decimal64(1)'); -- {serverError 43} From a8647096ed96fb348aea73edf54b5e7bedea4284 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Wed, 17 Feb 2021 13:27:47 +0300 Subject: [PATCH 0368/2357] Try fix tests. --- src/Interpreters/ActionsDAG.cpp | 20 +++++++++++++------ .../Optimizations/filterPushDown.cpp | 4 ++-- .../QueryPlan/Optimizations/optimizeTree.cpp | 8 ++++++++ 3 files changed, 24 insertions(+), 8 deletions(-) diff --git a/src/Interpreters/ActionsDAG.cpp b/src/Interpreters/ActionsDAG.cpp index e9e9d1628a8..691905bed27 100644 --- a/src/Interpreters/ActionsDAG.cpp +++ b/src/Interpreters/ActionsDAG.cpp @@ -1245,14 +1245,14 @@ ActionsDAGPtr ActionsDAG::splitActionsForFilter(const std::string & filter_name, { struct Frame { - const Node * node; + Node * node; bool is_predicate = false; size_t next_child_to_visit = 0; size_t num_allowed_children = 0; }; std::stack stack; - std::unordered_set visited_nodes; + std::unordered_set visited_nodes; stack.push(Frame{.node = *it, .is_predicate = true}); visited_nodes.insert(*it); @@ -1290,12 +1290,12 @@ ActionsDAGPtr ActionsDAG::splitActionsForFilter(const std::string & filter_name, else if (is_conjunction) { for (auto * child : cur.node->children) - { if (allowed_nodes.count(child)) selected_predicates.insert(child); - else - other_predicates.insert(child); - } + } + else if (cur.is_predicate) + { + other_predicates.insert(cur.node); } stack.pop(); @@ -1311,6 +1311,14 @@ ActionsDAGPtr ActionsDAG::splitActionsForFilter(const std::string & filter_name, return nullptr; } + // std::cerr << "************* Selectecd predicates\n"; + // for (const auto * p : selected_predicates) + // std::cerr << p->result_name << std::endl; + + // std::cerr << "............. 
Other predicates\n"; + // for (const auto * p : other_predicates) + // std::cerr << p->result_name << std::endl; + auto actions = cloneEmpty(); actions->settings.project_input = false; diff --git a/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp b/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp index 98e923249f3..39f24a32b45 100644 --- a/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp +++ b/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp @@ -117,8 +117,8 @@ size_t tryPushDownFilter(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes if (keys.count(column.name) == 0) allowed_inputs.push_back(column.name); - for (const auto & name : allowed_inputs) - std::cerr << name << std::endl; + // for (const auto & name : allowed_inputs) + // std::cerr << name << std::endl; if (auto updated_steps = tryAddNewFilterStep(parent_node, nodes, allowed_inputs)) return updated_steps; diff --git a/src/Processors/QueryPlan/Optimizations/optimizeTree.cpp b/src/Processors/QueryPlan/Optimizations/optimizeTree.cpp index e5ccc173ed8..cc81a7f39fc 100644 --- a/src/Processors/QueryPlan/Optimizations/optimizeTree.cpp +++ b/src/Processors/QueryPlan/Optimizations/optimizeTree.cpp @@ -23,6 +23,9 @@ void optimizeTree(QueryPlan::Node & root, QueryPlan::Nodes & nodes) std::stack stack; stack.push(Frame{.node = &root}); + size_t max_optimizations_to_apply = 0; + size_t total_applied_optimizations = 0; + while (!stack.empty()) { auto & frame = stack.top(); @@ -54,8 +57,13 @@ void optimizeTree(QueryPlan::Node & root, QueryPlan::Nodes & nodes) if (!optimization.apply) continue; + if (max_optimizations_to_apply && max_optimizations_to_apply < total_applied_optimizations) + continue; + /// Try to apply optimization. auto update_depth = optimization.apply(frame.node, nodes); + if (update_depth) + ++total_applied_optimizations; max_update_depth = std::max(max_update_depth, update_depth); } From 9396bae2e2051e2d50faa0d8c1005465171db481 Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 17 Feb 2021 14:53:47 +0300 Subject: [PATCH 0369/2357] More reliable test keeper tests --- src/Coordination/tests/gtest_for_build.cpp | 2 +- .../test_testkeeper_back_to_back/test.py | 536 +++++++++--------- .../__init__.py | 1 + .../configs/enable_test_keeper.xml | 21 + .../configs/logs_conf.xml | 12 + .../configs/use_test_keeper.xml | 8 + .../test_testkeeper_persistent_log/test.py | 124 ++++ 7 files changed, 444 insertions(+), 260 deletions(-) create mode 100644 tests/integration/test_testkeeper_persistent_log/__init__.py create mode 100644 tests/integration/test_testkeeper_persistent_log/configs/enable_test_keeper.xml create mode 100644 tests/integration/test_testkeeper_persistent_log/configs/logs_conf.xml create mode 100644 tests/integration/test_testkeeper_persistent_log/configs/use_test_keeper.xml create mode 100644 tests/integration/test_testkeeper_persistent_log/test.py diff --git a/src/Coordination/tests/gtest_for_build.cpp b/src/Coordination/tests/gtest_for_build.cpp index 76dd08a6d33..81e1751c08c 100644 --- a/src/Coordination/tests/gtest_for_build.cpp +++ b/src/Coordination/tests/gtest_for_build.cpp @@ -31,7 +31,7 @@ struct ChangelogDirTest { std::string path; bool drop; - ChangelogDirTest(std::string path_, bool drop_ = true) + explicit ChangelogDirTest(std::string path_, bool drop_ = true) : path(path_) , drop(drop_) { diff --git a/tests/integration/test_testkeeper_back_to_back/test.py b/tests/integration/test_testkeeper_back_to_back/test.py index 8ec54f1a883..dd4e1f98cfd 100644 --- 
a/tests/integration/test_testkeeper_back_to_back/test.py +++ b/tests/integration/test_testkeeper_back_to_back/test.py @@ -8,32 +8,23 @@ from multiprocessing.dummy import Pool cluster = ClickHouseCluster(__file__) node = cluster.add_instance('node', main_configs=['configs/enable_test_keeper.xml', 'configs/logs_conf.xml'], with_zookeeper=True) -from kazoo.client import KazooClient, KazooState - -_genuine_zk_instance = None -_fake_zk_instance = None +from kazoo.client import KazooClient, KazooState, KeeperState def get_genuine_zk(): - global _genuine_zk_instance - if not _genuine_zk_instance: - print("Zoo1", cluster.get_instance_ip("zoo1")) - _genuine_zk_instance = cluster.get_kazoo_client('zoo1') - return _genuine_zk_instance - + print("Zoo1", cluster.get_instance_ip("zoo1")) + return cluster.get_kazoo_client('zoo1') def get_fake_zk(): - global _fake_zk_instance - if not _fake_zk_instance: - print("node", cluster.get_instance_ip("node")) - _fake_zk_instance = KazooClient(hosts=cluster.get_instance_ip("node") + ":9181", timeout=30.0) - def reset_last_zxid_listener(state): - print("Fake zk callback called for state", state) - global _fake_zk_instance - if state != KazooState.CONNECTED: - _fake_zk_instance._reset() + print("node", cluster.get_instance_ip("node")) + _fake_zk_instance = KazooClient(hosts=cluster.get_instance_ip("node") + ":9181", timeout=30.0) + def reset_last_zxid_listener(state): + print("Fake zk callback called for state", state) + nonlocal _fake_zk_instance + if state != KazooState.CONNECTED: + _fake_zk_instance._reset() - _fake_zk_instance.add_listener(reset_last_zxid_listener) - _fake_zk_instance.start() + _fake_zk_instance.add_listener(reset_last_zxid_listener) + _fake_zk_instance.start() return _fake_zk_instance def random_string(length): @@ -44,6 +35,15 @@ def create_random_path(prefix="", depth=1): return prefix return create_random_path(os.path.join(prefix, random_string(3)), depth - 1) +def stop_zk(zk): + try: + if zk: + zk.stop() + zk.close() + except: + pass + + @pytest.fixture(scope="module") def started_cluster(): try: @@ -53,44 +53,46 @@ def started_cluster(): finally: cluster.shutdown() - if _genuine_zk_instance: - _genuine_zk_instance.stop() - _genuine_zk_instance.close() - if _fake_zk_instance: - _fake_zk_instance.stop() - _fake_zk_instance.close() def test_simple_commands(started_cluster): - genuine_zk = get_genuine_zk() - fake_zk = get_fake_zk() + try: + genuine_zk = get_genuine_zk() + fake_zk = get_fake_zk() - for zk in [genuine_zk, fake_zk]: - zk.create("/test_simple_commands", b"") - zk.create("/test_simple_commands/somenode1", b"hello") - zk.set("/test_simple_commands/somenode1", b"world") + for zk in [genuine_zk, fake_zk]: + zk.create("/test_simple_commands", b"") + zk.create("/test_simple_commands/somenode1", b"hello") + zk.set("/test_simple_commands/somenode1", b"world") - for zk in [genuine_zk, fake_zk]: - assert zk.exists("/test_simple_commands") - assert zk.exists("/test_simple_commands/somenode1") - print(zk.get("/test_simple_commands/somenode1")) - assert zk.get("/test_simple_commands/somenode1")[0] == b"world" + for zk in [genuine_zk, fake_zk]: + assert zk.exists("/test_simple_commands") + assert zk.exists("/test_simple_commands/somenode1") + print(zk.get("/test_simple_commands/somenode1")) + assert zk.get("/test_simple_commands/somenode1")[0] == b"world" + finally: + for zk in [genuine_zk, fake_zk]: + stop_zk(zk) def test_sequential_nodes(started_cluster): - genuine_zk = get_genuine_zk() - fake_zk = get_fake_zk() - 
genuine_zk.create("/test_sequential_nodes") - fake_zk.create("/test_sequential_nodes") - for i in range(1, 11): - genuine_zk.create("/test_sequential_nodes/" + ("a" * i) + "-", sequence=True) - genuine_zk.create("/test_sequential_nodes/" + ("b" * i)) - fake_zk.create("/test_sequential_nodes/" + ("a" * i) + "-", sequence=True) - fake_zk.create("/test_sequential_nodes/" + ("b" * i)) + try: + genuine_zk = get_genuine_zk() + fake_zk = get_fake_zk() + genuine_zk.create("/test_sequential_nodes") + fake_zk.create("/test_sequential_nodes") + for i in range(1, 11): + genuine_zk.create("/test_sequential_nodes/" + ("a" * i) + "-", sequence=True) + genuine_zk.create("/test_sequential_nodes/" + ("b" * i)) + fake_zk.create("/test_sequential_nodes/" + ("a" * i) + "-", sequence=True) + fake_zk.create("/test_sequential_nodes/" + ("b" * i)) - genuine_childs = list(sorted(genuine_zk.get_children("/test_sequential_nodes"))) - fake_childs = list(sorted(fake_zk.get_children("/test_sequential_nodes"))) - assert genuine_childs == fake_childs + genuine_childs = list(sorted(genuine_zk.get_children("/test_sequential_nodes"))) + fake_childs = list(sorted(fake_zk.get_children("/test_sequential_nodes"))) + assert genuine_childs == fake_childs + finally: + for zk in [genuine_zk, fake_zk]: + stop_zk(zk) def assert_eq_stats(stat1, stat2): @@ -102,130 +104,141 @@ def assert_eq_stats(stat1, stat2): assert stat1.numChildren == stat2.numChildren def test_stats(started_cluster): - genuine_zk = get_genuine_zk() - fake_zk = get_fake_zk() - genuine_zk.create("/test_stats_nodes") - fake_zk.create("/test_stats_nodes") - genuine_stats = genuine_zk.exists("/test_stats_nodes") - fake_stats = fake_zk.exists("/test_stats_nodes") - assert_eq_stats(genuine_stats, fake_stats) - for i in range(1, 11): - genuine_zk.create("/test_stats_nodes/" + ("a" * i) + "-", sequence=True) - genuine_zk.create("/test_stats_nodes/" + ("b" * i)) - fake_zk.create("/test_stats_nodes/" + ("a" * i) + "-", sequence=True) - fake_zk.create("/test_stats_nodes/" + ("b" * i)) + try: + genuine_zk = get_genuine_zk() + fake_zk = get_fake_zk() + genuine_zk.create("/test_stats_nodes") + fake_zk.create("/test_stats_nodes") + genuine_stats = genuine_zk.exists("/test_stats_nodes") + fake_stats = fake_zk.exists("/test_stats_nodes") + assert_eq_stats(genuine_stats, fake_stats) + for i in range(1, 11): + genuine_zk.create("/test_stats_nodes/" + ("a" * i) + "-", sequence=True) + genuine_zk.create("/test_stats_nodes/" + ("b" * i)) + fake_zk.create("/test_stats_nodes/" + ("a" * i) + "-", sequence=True) + fake_zk.create("/test_stats_nodes/" + ("b" * i)) - genuine_stats = genuine_zk.exists("/test_stats_nodes") - fake_stats = fake_zk.exists("/test_stats_nodes") - assert_eq_stats(genuine_stats, fake_stats) - for i in range(1, 11): - print("/test_stats_nodes/" + ("a" * i) + "-" + "{:010d}".format((i - 1) * 2)) - genuine_zk.delete("/test_stats_nodes/" + ("a" * i) + "-" + "{:010d}".format((i - 1) * 2)) - genuine_zk.delete("/test_stats_nodes/" + ("b" * i)) - fake_zk.delete("/test_stats_nodes/" + ("a" * i) + "-" + "{:010d}".format((i - 1) * 2)) - fake_zk.delete("/test_stats_nodes/" + ("b" * i)) + genuine_stats = genuine_zk.exists("/test_stats_nodes") + fake_stats = fake_zk.exists("/test_stats_nodes") + assert_eq_stats(genuine_stats, fake_stats) + for i in range(1, 11): + print("/test_stats_nodes/" + ("a" * i) + "-" + "{:010d}".format((i - 1) * 2)) + genuine_zk.delete("/test_stats_nodes/" + ("a" * i) + "-" + "{:010d}".format((i - 1) * 2)) + genuine_zk.delete("/test_stats_nodes/" + ("b" * i)) 
+ fake_zk.delete("/test_stats_nodes/" + ("a" * i) + "-" + "{:010d}".format((i - 1) * 2)) + fake_zk.delete("/test_stats_nodes/" + ("b" * i)) - genuine_stats = genuine_zk.exists("/test_stats_nodes") - fake_stats = fake_zk.exists("/test_stats_nodes") - print(genuine_stats) - print(fake_stats) - assert_eq_stats(genuine_stats, fake_stats) - for i in range(100): - genuine_zk.set("/test_stats_nodes", ("q" * i).encode()) - fake_zk.set("/test_stats_nodes", ("q" * i).encode()) + genuine_stats = genuine_zk.exists("/test_stats_nodes") + fake_stats = fake_zk.exists("/test_stats_nodes") + print(genuine_stats) + print(fake_stats) + assert_eq_stats(genuine_stats, fake_stats) + for i in range(100): + genuine_zk.set("/test_stats_nodes", ("q" * i).encode()) + fake_zk.set("/test_stats_nodes", ("q" * i).encode()) - genuine_stats = genuine_zk.exists("/test_stats_nodes") - fake_stats = fake_zk.exists("/test_stats_nodes") - print(genuine_stats) - print(fake_stats) - assert_eq_stats(genuine_stats, fake_stats) + genuine_stats = genuine_zk.exists("/test_stats_nodes") + fake_stats = fake_zk.exists("/test_stats_nodes") + print(genuine_stats) + print(fake_stats) + assert_eq_stats(genuine_stats, fake_stats) + finally: + for zk in [genuine_zk, fake_zk]: + stop_zk(zk) def test_watchers(started_cluster): - genuine_zk = get_genuine_zk() - fake_zk = get_fake_zk() - genuine_zk.create("/test_data_watches") - fake_zk.create("/test_data_watches") - genuine_data_watch_data = None + try: + genuine_zk = get_genuine_zk() + fake_zk = get_fake_zk() + genuine_zk.create("/test_data_watches") + fake_zk.create("/test_data_watches") + genuine_data_watch_data = None - def genuine_callback(event): - print("Genuine data watch called") - nonlocal genuine_data_watch_data - genuine_data_watch_data = event + def genuine_callback(event): + print("Genuine data watch called") + nonlocal genuine_data_watch_data + genuine_data_watch_data = event - fake_data_watch_data = None - def fake_callback(event): - print("Fake data watch called") - nonlocal fake_data_watch_data - fake_data_watch_data = event + fake_data_watch_data = None + def fake_callback(event): + print("Fake data watch called") + nonlocal fake_data_watch_data + fake_data_watch_data = event - genuine_zk.get("/test_data_watches", watch=genuine_callback) - fake_zk.get("/test_data_watches", watch=fake_callback) + genuine_zk.get("/test_data_watches", watch=genuine_callback) + fake_zk.get("/test_data_watches", watch=fake_callback) - print("Calling set genuine") - genuine_zk.set("/test_data_watches", b"a") - print("Calling set fake") - fake_zk.set("/test_data_watches", b"a") - time.sleep(3) + print("Calling set genuine") + genuine_zk.set("/test_data_watches", b"a") + print("Calling set fake") + fake_zk.set("/test_data_watches", b"a") + time.sleep(3) - print("Genuine data", genuine_data_watch_data) - print("Fake data", fake_data_watch_data) - assert genuine_data_watch_data == fake_data_watch_data + print("Genuine data", genuine_data_watch_data) + print("Fake data", fake_data_watch_data) + assert genuine_data_watch_data == fake_data_watch_data - genuine_children = None - def genuine_child_callback(event): - print("Genuine child watch called") - nonlocal genuine_children - genuine_children = event + genuine_children = None + def genuine_child_callback(event): + print("Genuine child watch called") + nonlocal genuine_children + genuine_children = event - fake_children = None - def fake_child_callback(event): - print("Fake child watch called") - nonlocal fake_children - fake_children = event + 
fake_children = None + def fake_child_callback(event): + print("Fake child watch called") + nonlocal fake_children + fake_children = event - genuine_zk.get_children("/test_data_watches", watch=genuine_child_callback) - fake_zk.get_children("/test_data_watches", watch=fake_child_callback) + genuine_zk.get_children("/test_data_watches", watch=genuine_child_callback) + fake_zk.get_children("/test_data_watches", watch=fake_child_callback) - print("Calling genuine child") - genuine_zk.create("/test_data_watches/child", b"b") - print("Calling fake child") - fake_zk.create("/test_data_watches/child", b"b") + print("Calling genuine child") + genuine_zk.create("/test_data_watches/child", b"b") + print("Calling fake child") + fake_zk.create("/test_data_watches/child", b"b") - time.sleep(3) + time.sleep(3) - print("Genuine children", genuine_children) - print("Fake children", fake_children) - assert genuine_children == fake_children + print("Genuine children", genuine_children) + print("Fake children", fake_children) + assert genuine_children == fake_children + finally: + for zk in [genuine_zk, fake_zk]: + stop_zk(zk) def test_multitransactions(started_cluster): - genuine_zk = get_genuine_zk() - fake_zk = get_fake_zk() - for zk in [genuine_zk, fake_zk]: - zk.create('/test_multitransactions') - t = zk.transaction() - t.create('/test_multitransactions/freddy') - t.create('/test_multitransactions/fred', ephemeral=True) - t.create('/test_multitransactions/smith', sequence=True) - results = t.commit() - assert len(results) == 3 - assert results[0] == '/test_multitransactions/freddy' - assert results[2].startswith('/test_multitransactions/smith0') is True - - from kazoo.exceptions import RolledBackError, NoNodeError - for i, zk in enumerate([genuine_zk, fake_zk]): - print("Processing ZK", i) - t = zk.transaction() - t.create('/test_multitransactions/q') - t.delete('/test_multitransactions/a') - t.create('/test_multitransactions/x') - results = t.commit() - print("Results", results) - assert results[0].__class__ == RolledBackError - assert results[1].__class__ == NoNodeError - assert zk.exists('/test_multitransactions/q') is None - assert zk.exists('/test_multitransactions/a') is None - assert zk.exists('/test_multitransactions/x') is None + try: + genuine_zk = get_genuine_zk() + fake_zk = get_fake_zk() + for zk in [genuine_zk, fake_zk]: + zk.create('/test_multitransactions') + t = zk.transaction() + t.create('/test_multitransactions/freddy') + t.create('/test_multitransactions/fred', ephemeral=True) + t.create('/test_multitransactions/smith', sequence=True) + results = t.commit() + assert len(results) == 3 + assert results[0] == '/test_multitransactions/freddy' + assert results[2].startswith('/test_multitransactions/smith0') is True + from kazoo.exceptions import RolledBackError, NoNodeError + for i, zk in enumerate([genuine_zk, fake_zk]): + print("Processing ZK", i) + t = zk.transaction() + t.create('/test_multitransactions/q') + t.delete('/test_multitransactions/a') + t.create('/test_multitransactions/x') + results = t.commit() + print("Results", results) + assert results[0].__class__ == RolledBackError + assert results[1].__class__ == NoNodeError + assert zk.exists('/test_multitransactions/q') is None + assert zk.exists('/test_multitransactions/a') is None + assert zk.exists('/test_multitransactions/x') is None + finally: + for zk in [genuine_zk, fake_zk]: + stop_zk(zk) def exists(zk, path): result = zk.exists(path) @@ -278,13 +291,13 @@ class Request(object): arg_str = ', '.join([str(k) + "=" + str(v) 
for k, v in self.arguments.items()]) return "ZKRequest name {} with arguments {}".format(self.name, arg_str) -def generate_requests(iters=1): +def generate_requests(prefix="/", iters=1): requests = [] existing_paths = [] for i in range(iters): for _ in range(100): rand_length = random.randint(0, 10) - path = "/" + path = prefix for j in range(1, rand_length): path = create_random_path(path, 1) existing_paths.append(path) @@ -322,31 +335,43 @@ def generate_requests(iters=1): def test_random_requests(started_cluster): - requests = generate_requests(10) - genuine_zk = get_genuine_zk() - fake_zk = get_fake_zk() - for i, request in enumerate(requests): - genuine_throw = False - fake_throw = False - fake_result = None - genuine_result = None - try: - genuine_result = request.callback(genuine_zk) - except Exception as ex: - genuine_throw = True + try: + requests = generate_requests("/test_random_requests", 10) + print("Generated", len(requests), "requests") + genuine_zk = get_genuine_zk() + fake_zk = get_fake_zk() + genuine_zk.create("/test_random_requests") + fake_zk.create("/test_random_requests") + for i, request in enumerate(requests): + genuine_throw = False + fake_throw = False + fake_result = None + genuine_result = None + try: + genuine_result = request.callback(genuine_zk) + except Exception as ex: + print("i", i, "request", request) + print("Genuine exception", str(ex)) + genuine_throw = True - try: - fake_result = request.callback(fake_zk) - except Exception as ex: - fake_throw = True + try: + fake_result = request.callback(fake_zk) + except Exception as ex: + print("i", i, "request", request) + print("Fake exception", str(ex)) + fake_throw = True - assert fake_throw == genuine_throw, "Fake throw genuine not or vise versa" - assert fake_result == genuine_result, "Zookeeper results differ" - root_children_genuine = [elem for elem in list(sorted(genuine_zk.get_children("/"))) if elem not in ('clickhouse', 'zookeeper')] - root_children_fake = [elem for elem in list(sorted(fake_zk.get_children("/"))) if elem not in ('clickhouse', 'zookeeper')] - assert root_children_fake == root_children_genuine + assert fake_throw == genuine_throw, "Fake throw genuine not or vise versa request {}" + assert fake_result == genuine_result, "Zookeeper results differ" + root_children_genuine = [elem for elem in list(sorted(genuine_zk.get_children("/test_random_requests"))) if elem not in ('clickhouse', 'zookeeper')] + root_children_fake = [elem for elem in list(sorted(fake_zk.get_children("/test_random_requests"))) if elem not in ('clickhouse', 'zookeeper')] + assert root_children_fake == root_children_genuine + finally: + for zk in [genuine_zk, fake_zk]: + stop_zk(zk) def test_end_of_session(started_cluster): + fake_zk1 = None fake_zk2 = None genuine_zk1 = None @@ -401,13 +426,8 @@ def test_end_of_session(started_cluster): assert fake_ephemeral_event == genuine_ephemeral_event finally: - try: - for zk in [fake_zk1, fake_zk2, genuine_zk1, genuine_zk2]: - if zk: - zk.stop() - zk.close() - except: - pass + for zk in [fake_zk1, fake_zk2, genuine_zk1, genuine_zk2]: + stop_zk(zk) def test_end_of_watches_session(started_cluster): fake_zk1 = None @@ -442,91 +462,89 @@ def test_end_of_watches_session(started_cluster): assert dummy_set == 2 finally: - try: - for zk in [fake_zk1, fake_zk2]: - if zk: - zk.stop() - zk.close() - except: - pass + for zk in [fake_zk1, fake_zk2]: + stop_zk(zk) def test_concurrent_watches(started_cluster): - fake_zk = get_fake_zk() - fake_zk.restart() - global_path = 
"/test_concurrent_watches_0" - fake_zk.create(global_path) + try: + fake_zk = get_fake_zk() + fake_zk.restart() + global_path = "/test_concurrent_watches_0" + fake_zk.create(global_path) - dumb_watch_triggered_counter = 0 - all_paths_triggered = [] + dumb_watch_triggered_counter = 0 + all_paths_triggered = [] - existing_path = [] - all_paths_created = [] - watches_created = 0 - def create_path_and_watch(i): - nonlocal watches_created - nonlocal all_paths_created - fake_zk.ensure_path(global_path + "/" + str(i)) - # new function each time - def dumb_watch(event): - nonlocal dumb_watch_triggered_counter - dumb_watch_triggered_counter += 1 - nonlocal all_paths_triggered - all_paths_triggered.append(event.path) + existing_path = [] + all_paths_created = [] + watches_created = 0 + def create_path_and_watch(i): + nonlocal watches_created + nonlocal all_paths_created + fake_zk.ensure_path(global_path + "/" + str(i)) + # new function each time + def dumb_watch(event): + nonlocal dumb_watch_triggered_counter + dumb_watch_triggered_counter += 1 + nonlocal all_paths_triggered + all_paths_triggered.append(event.path) - fake_zk.get(global_path + "/" + str(i), watch=dumb_watch) - all_paths_created.append(global_path + "/" + str(i)) - watches_created += 1 - existing_path.append(i) + fake_zk.get(global_path + "/" + str(i), watch=dumb_watch) + all_paths_created.append(global_path + "/" + str(i)) + watches_created += 1 + existing_path.append(i) - trigger_called = 0 - def trigger_watch(i): - nonlocal trigger_called - trigger_called += 1 - fake_zk.set(global_path + "/" + str(i), b"somevalue") - try: - existing_path.remove(i) - except: - pass - - def call(total): - for i in range(total): - create_path_and_watch(random.randint(0, 1000)) - time.sleep(random.random() % 0.5) + trigger_called = 0 + def trigger_watch(i): + nonlocal trigger_called + trigger_called += 1 + fake_zk.set(global_path + "/" + str(i), b"somevalue") try: - rand_num = random.choice(existing_path) - trigger_watch(rand_num) - except: - pass - while existing_path: - try: - rand_num = random.choice(existing_path) - trigger_watch(rand_num) + existing_path.remove(i) except: pass - p = Pool(10) - arguments = [100] * 10 - watches_must_be_created = sum(arguments) - watches_trigger_must_be_called = sum(arguments) - watches_must_be_triggered = sum(arguments) - p.map(call, arguments) - p.close() + def call(total): + for i in range(total): + create_path_and_watch(random.randint(0, 1000)) + time.sleep(random.random() % 0.5) + try: + rand_num = random.choice(existing_path) + trigger_watch(rand_num) + except: + pass + while existing_path: + try: + rand_num = random.choice(existing_path) + trigger_watch(rand_num) + except: + pass - # waiting for late watches - for i in range(50): - if dumb_watch_triggered_counter == watches_must_be_triggered: - break + p = Pool(10) + arguments = [100] * 10 + watches_must_be_created = sum(arguments) + watches_trigger_must_be_called = sum(arguments) + watches_must_be_triggered = sum(arguments) + p.map(call, arguments) + p.close() - time.sleep(0.1) + # waiting for late watches + for i in range(50): + if dumb_watch_triggered_counter == watches_must_be_triggered: + break - assert watches_created == watches_must_be_created - assert trigger_called >= watches_trigger_must_be_called - assert len(existing_path) == 0 - if dumb_watch_triggered_counter != watches_must_be_triggered: - print("All created paths", all_paths_created) - print("All triggerred paths", all_paths_triggered) - print("All paths len", len(all_paths_created)) - 
print("All triggered len", len(all_paths_triggered)) - print("Diff", list(set(all_paths_created) - set(all_paths_triggered))) + time.sleep(0.1) - assert dumb_watch_triggered_counter == watches_must_be_triggered + assert watches_created == watches_must_be_created + assert trigger_called >= watches_trigger_must_be_called + assert len(existing_path) == 0 + if dumb_watch_triggered_counter != watches_must_be_triggered: + print("All created paths", all_paths_created) + print("All triggerred paths", all_paths_triggered) + print("All paths len", len(all_paths_created)) + print("All triggered len", len(all_paths_triggered)) + print("Diff", list(set(all_paths_created) - set(all_paths_triggered))) + + assert dumb_watch_triggered_counter == watches_must_be_triggered + finally: + stop_zk(fake_zk) diff --git a/tests/integration/test_testkeeper_persistent_log/__init__.py b/tests/integration/test_testkeeper_persistent_log/__init__.py new file mode 100644 index 00000000000..e5a0d9b4834 --- /dev/null +++ b/tests/integration/test_testkeeper_persistent_log/__init__.py @@ -0,0 +1 @@ +#!/usr/bin/env python3 diff --git a/tests/integration/test_testkeeper_persistent_log/configs/enable_test_keeper.xml b/tests/integration/test_testkeeper_persistent_log/configs/enable_test_keeper.xml new file mode 100644 index 00000000000..a8b8991f959 --- /dev/null +++ b/tests/integration/test_testkeeper_persistent_log/configs/enable_test_keeper.xml @@ -0,0 +1,21 @@ + + + 9181 + 1 + /var/lib/clickhouse/coordination/log + + + 5000 + 10000 + trace + + + + + 1 + localhost + 44444 + + + + diff --git a/tests/integration/test_testkeeper_persistent_log/configs/logs_conf.xml b/tests/integration/test_testkeeper_persistent_log/configs/logs_conf.xml new file mode 100644 index 00000000000..318a6bca95d --- /dev/null +++ b/tests/integration/test_testkeeper_persistent_log/configs/logs_conf.xml @@ -0,0 +1,12 @@ + + 3 + + trace + /var/log/clickhouse-server/log.log + /var/log/clickhouse-server/log.err.log + 1000M + 10 + /var/log/clickhouse-server/stderr.log + /var/log/clickhouse-server/stdout.log + + diff --git a/tests/integration/test_testkeeper_persistent_log/configs/use_test_keeper.xml b/tests/integration/test_testkeeper_persistent_log/configs/use_test_keeper.xml new file mode 100644 index 00000000000..12dc7fd9447 --- /dev/null +++ b/tests/integration/test_testkeeper_persistent_log/configs/use_test_keeper.xml @@ -0,0 +1,8 @@ + + + + node1 + 9181 + + + diff --git a/tests/integration/test_testkeeper_persistent_log/test.py b/tests/integration/test_testkeeper_persistent_log/test.py new file mode 100644 index 00000000000..71fee94088f --- /dev/null +++ b/tests/integration/test_testkeeper_persistent_log/test.py @@ -0,0 +1,124 @@ +#!/usr/bin/env python3 +import pytest +from helpers.cluster import ClickHouseCluster +import random +import string +import os +import time +from kazoo.client import KazooClient, KazooState + + +cluster = ClickHouseCluster(__file__) + +node = cluster.add_instance('node', main_configs=['configs/enable_test_keeper.xml', 'configs/logs_conf.xml', 'configs/use_test_keeper.xml'], stay_alive=True) + + +def random_string(length): + return ''.join(random.choices(string.ascii_lowercase + string.digits, k=length)) + +def create_random_path(prefix="", depth=1): + if depth == 0: + return prefix + return create_random_path(os.path.join(prefix, random_string(3)), depth - 1) + +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster.start() + + yield cluster + + finally: + cluster.shutdown() + +def get_connection_zk(nodename, 
timeout=30.0): + _fake_zk_instance = KazooClient(hosts=cluster.get_instance_ip(nodename) + ":9181", timeout=timeout) + def reset_listener(state): + nonlocal _fake_zk_instance + print("Fake zk callback called for state", state) + if state != KazooState.CONNECTED: + _fake_zk_instance._reset() + + _fake_zk_instance.add_listener(reset_listener) + _fake_zk_instance.start() + return _fake_zk_instance + +def test_state_after_restart(started_cluster): + try: + node_zk = None + node_zk2 = None + node_zk = get_connection_zk("node") + + node_zk.create("/test_state_after_restart", b"somevalue") + strs = [] + for i in range(100): + strs.append(random_string(123).encode()) + node_zk.create("/test_state_after_restart/node" + str(i), strs[i]) + + for i in range(100): + if i % 7 == 0: + node_zk.delete("/test_state_after_restart/node" + str(i)) + + node.restart_clickhouse(kill=True) + + node_zk2 = get_connection_zk("node") + + assert node_zk2.get("/test_state_after_restart")[0] == b"somevalue" + for i in range(100): + if i % 7 == 0: + assert node_zk2.exists("/test_state_after_restart/node" + str(i)) is None + else: + assert len(node_zk2.get("/test_state_after_restart/node" + str(i))[0]) == 123 + assert node_zk2.get("/test_state_after_restart/node" + str(i))[0] == strs[i] + finally: + try: + if node_zk is not None: + node_zk.stop() + node_zk.close() + + if node_zk2 is not None: + node_zk2.stop() + node_zk2.close() + except: + pass + + +# http://zookeeper-user.578899.n2.nabble.com/Why-are-ephemeral-nodes-written-to-disk-tp7583403p7583418.html +def test_ephemeral_after_restart(started_cluster): + try: + node_zk = None + node_zk2 = None + node_zk = get_connection_zk("node") + + node_zk.create("/test_ephemeral_after_restart", b"somevalue") + strs = [] + for i in range(100): + strs.append(random_string(123).encode()) + node_zk.create("/test_ephemeral_after_restart/node" + str(i), strs[i], ephemeral=True) + + for i in range(100): + if i % 7 == 0: + node_zk.delete("/test_ephemeral_after_restart/node" + str(i)) + + node.restart_clickhouse(kill=True) + + node_zk2 = get_connection_zk("node") + + assert node_zk2.get("/test_ephemeral_after_restart")[0] == b"somevalue" + for i in range(100): + if i % 7 == 0: + assert node_zk2.exists("/test_ephemeral_after_restart/node" + str(i)) is None + else: + assert len(node_zk2.get("/test_ephemeral_after_restart/node" + str(i))[0]) == 123 + assert node_zk2.get("/test_ephemeral_after_restart/node" + str(i))[0] == strs[i] + finally: + try: + if node_zk is not None: + node_zk.stop() + node_zk.close() + + if node_zk2 is not None: + node_zk2.stop() + node_zk2.close() + except: + pass From b2c09f002f592a2bec866ff7e698aa0f0a89ff57 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Wed, 17 Feb 2021 15:26:00 +0300 Subject: [PATCH 0370/2357] Dictionary create source with functions crash fix --- .../getDictionaryConfigurationFromAST.cpp | 6 +++- ...ary_create_source_with_functions.reference | 1 + ...ictionary_create_source_with_functions.sql | 28 +++++++++++++++++++ 3 files changed, 34 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/01720_dictionary_create_source_with_functions.reference create mode 100644 tests/queries/0_stateless/01720_dictionary_create_source_with_functions.sql diff --git a/src/Dictionaries/getDictionaryConfigurationFromAST.cpp b/src/Dictionaries/getDictionaryConfigurationFromAST.cpp index 2d4f971ef58..acfb11787de 100644 --- a/src/Dictionaries/getDictionaryConfigurationFromAST.cpp +++ b/src/Dictionaries/getDictionaryConfigurationFromAST.cpp @@ 
-401,10 +401,14 @@ void buildConfigurationFromFunctionWithKeyValueArguments( { auto builder = FunctionFactory::instance().tryGet(func->name, context); auto function = builder->build({}); - auto result = function->execute({}, {}, 0); + function->prepare({}); + + size_t input_rows_count = 1; + auto result = function->execute({}, function->getResultType(), input_rows_count); Field value; result->get(0, value); + AutoPtr text_value(doc->createTextNode(getFieldAsString(value))); current_xml_element->appendChild(text_value); } diff --git a/tests/queries/0_stateless/01720_dictionary_create_source_with_functions.reference b/tests/queries/0_stateless/01720_dictionary_create_source_with_functions.reference new file mode 100644 index 00000000000..38abe3c9f52 --- /dev/null +++ b/tests/queries/0_stateless/01720_dictionary_create_source_with_functions.reference @@ -0,0 +1 @@ +1 First diff --git a/tests/queries/0_stateless/01720_dictionary_create_source_with_functions.sql b/tests/queries/0_stateless/01720_dictionary_create_source_with_functions.sql new file mode 100644 index 00000000000..a0a4fbbfab9 --- /dev/null +++ b/tests/queries/0_stateless/01720_dictionary_create_source_with_functions.sql @@ -0,0 +1,28 @@ +DROP DATABASE IF EXISTS 01720_dictionary_db; +CREATE DATABASE 01720_dictionary_db; + +CREATE TABLE 01720_dictionary_db.dictionary_source_table +( + key UInt8, + value String +) +ENGINE = TinyLog; + +INSERT INTO 01720_dictionary_db.dictionary_source_table VALUES (1, 'First'); + +CREATE DICTIONARY 01720_dictionary_db.dictionary +( + key UInt64, + value String +) +PRIMARY KEY key +SOURCE(CLICKHOUSE(DB '01720_dictionary_db' TABLE 'dictionary_source_table' HOST hostName() PORT tcpPort())) +LIFETIME(0) +LAYOUT(FLAT()); + +SELECT * FROM 01720_dictionary_db.dictionary; + +DROP DICTIONARY 01720_dictionary_db.dictionary; +DROP TABLE 01720_dictionary_db.dictionary_source_table; + +DROP DATABASE 01720_dictionary_db; From e82bd824d7818279db000f2019f5d2c82fefbb38 Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 17 Feb 2021 16:07:59 +0300 Subject: [PATCH 0371/2357] Fix restart replica in test --- .../test.py | 36 ++++++++++++++----- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/tests/integration/test_testkeeper_multinode_blocade_leader/test.py b/tests/integration/test_testkeeper_multinode_blocade_leader/test.py index 3b2867ef3c7..a1fd066ab83 100644 --- a/tests/integration/test_testkeeper_multinode_blocade_leader/test.py +++ b/tests/integration/test_testkeeper_multinode_blocade_leader/test.py @@ -87,7 +87,7 @@ def test_blocade_leader(started_cluster): for i in range(100): try: - node2.query("SYSTEM RESTART REPLICA ordinary.t1") + restart_replica_for_sure(node2, "ordinary.t1", "/clickhouse/t1/replicas/2") node2.query("INSERT INTO ordinary.t1 SELECT rand() FROM numbers(100)") break except Exception as ex: @@ -104,7 +104,7 @@ def test_blocade_leader(started_cluster): for i in range(100): try: - node3.query("SYSTEM RESTART REPLICA ordinary.t1") + restart_replica_for_sure(node3, "ordinary.t1", "/clickhouse/t1/replicas/3") node3.query("INSERT INTO ordinary.t1 SELECT rand() FROM numbers(100)") break except Exception as ex: @@ -122,7 +122,7 @@ def test_blocade_leader(started_cluster): for n, node in enumerate([node1, node2, node3]): for i in range(100): try: - node.query("SYSTEM RESTART REPLICA ordinary.t1") + restart_replica_for_sure(node, "ordinary.t1", "/clickhouse/t1/replicas/{}".format(n + 1)) break except Exception as ex: try: @@ -150,7 +150,7 @@ def test_blocade_leader(started_cluster): for n, 
node in enumerate([node1, node2, node3]): for i in range(100): try: - node.query("SYSTEM RESTART REPLICA ordinary.t1") + restart_replica_for_sure(node, "ordinary.t1", "/clickhouse/t1/replicas/{}".format(n + 1)) node.query("SYSTEM SYNC REPLICA ordinary.t1", timeout=10) break except Exception as ex: @@ -188,6 +188,25 @@ def dump_zk(node, zk_path, replica_path): print("Parts") print(node.query("SELECT name FROM system.zookeeper WHERE path = '{}/parts' FORMAT Vertical".format(replica_path))) +def restart_replica_for_sure(node, table_name, zk_replica_path): + fake_zk = None + try: + node.query("DETACH TABLE {}".format(table_name)) + fake_zk = get_fake_zk(node.name) + if fake_zk.exists(zk_replica_path + "/is_active") is not None: + fake_zk.delete(zk_replica_path + "/is_active") + + node.query("ATTACH TABLE {}".format(table_name)) + except Exception as ex: + print("Exception", ex) + raise ex + finally: + if fake_zk: + fake_zk.stop() + fake_zk.close() + + + # in extremely rare case it can take more than 5 minutes in debug build with sanitizer @pytest.mark.timeout(600) def test_blocade_leader_twice(started_cluster): @@ -211,7 +230,7 @@ def test_blocade_leader_twice(started_cluster): for i in range(100): try: - node2.query("SYSTEM RESTART REPLICA ordinary.t2") + restart_replica_for_sure(node2, "ordinary.t2", "/clickhouse/t2/replicas/2") node2.query("INSERT INTO ordinary.t2 SELECT rand() FROM numbers(100)") break except Exception as ex: @@ -228,7 +247,8 @@ def test_blocade_leader_twice(started_cluster): for i in range(100): try: - node3.query("SYSTEM RESTART REPLICA ordinary.t2") + + restart_replica_for_sure(node3, "ordinary.t2", "/clickhouse/t2/replicas/3") node3.query("INSERT INTO ordinary.t2 SELECT rand() FROM numbers(100)") break except Exception as ex: @@ -265,7 +285,7 @@ def test_blocade_leader_twice(started_cluster): for n, node in enumerate([node1, node2, node3]): for i in range(100): try: - node.query("SYSTEM RESTART REPLICA ordinary.t2") + restart_replica_for_sure(node, "ordinary.t2", "/clickhouse/t2/replicas/{}".format(n + 1)) break except Exception as ex: try: @@ -296,7 +316,7 @@ def test_blocade_leader_twice(started_cluster): for n, node in enumerate([node1, node2, node3]): for i in range(100): try: - node.query("SYSTEM RESTART REPLICA ordinary.t2") + restart_replica_for_sure(node, "ordinary.t2", "/clickhouse/t2/replicas/{}".format(n + 1)) node.query("SYSTEM SYNC REPLICA ordinary.t2", timeout=10) break except Exception as ex: From ee4d3f7aa485f851831b9ce96c8d1b4b78f90589 Mon Sep 17 00:00:00 2001 From: Evgeniia Sudarikova Date: Wed, 17 Feb 2021 16:23:10 +0300 Subject: [PATCH 0372/2357] edited ; in queries, edited after review --- docs/en/sql-reference/functions/array-functions.md | 12 ++++++------ .../example-datasets/brown-benchmark.md | 6 +++--- docs/ru/sql-reference/functions/array-functions.md | 12 ++++++------ 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/docs/en/sql-reference/functions/array-functions.md b/docs/en/sql-reference/functions/array-functions.md index 48c5176f0e1..528d81b0a0b 100644 --- a/docs/en/sql-reference/functions/array-functions.md +++ b/docs/en/sql-reference/functions/array-functions.md @@ -1315,7 +1315,7 @@ Type: [Int](../../sql-reference/data-types/int-uint.md) or [Float](../../sql-ref Query: ``` sql -SELECT arrayMin([1, 2, 4]) AS res +SELECT arrayMin([1, 2, 4]) AS res; ``` Result: @@ -1329,7 +1329,7 @@ Result: Query: ``` sql -SELECT arrayMin(x -> (-x), [1, 2, 4]) AS res +SELECT arrayMin(x -> (-x), [1, 2, 4]) AS res; ``` Result: @@ -1367,7 
+1367,7 @@ Type: [Int](../../sql-reference/data-types/int-uint.md) or [Float](../../sql-ref Query: ```sql -SELECT arrayMax([1, 2, 4]) AS res +SELECT arrayMax([1, 2, 4]) AS res; ``` Result: @@ -1381,7 +1381,7 @@ Result: Query: ``` sql -SELECT arrayMax(x -> (-x), [1, 2, 4]) AS res +SELECT arrayMax(x -> (-x), [1, 2, 4]) AS res; ``` Result: @@ -1419,7 +1419,7 @@ Type: [Int](../../sql-reference/data-types/int-uint.md) or [Float](../../sql-ref Query: ```sql -SELECT arraySum([2,3]) AS res +SELECT arraySum([2,3]) AS res; ``` Result: @@ -1433,7 +1433,7 @@ Result: Query: ``` sql -SELECT arraySum(x -> x*x, [2, 3]) AS res +SELECT arraySum(x -> x*x, [2, 3]) AS res; ``` Result: diff --git a/docs/ru/getting-started/example-datasets/brown-benchmark.md b/docs/ru/getting-started/example-datasets/brown-benchmark.md index e4fe00ace93..23702e07fcd 100644 --- a/docs/ru/getting-started/example-datasets/brown-benchmark.md +++ b/docs/ru/getting-started/example-datasets/brown-benchmark.md @@ -5,7 +5,7 @@ toc_title: Brown University Benchmark # Brown University Benchmark -`MgBench` — это новый аналитический бенчмарк для данных журнала событий, сгенерированных машиной. Бенчмарк разработан [Andrew Crotty](http://cs.brown.edu/people/acrotty/). +`MgBench` — это аналитический тест производительности для данных журнала событий, сгенерированных машиной. Бенчмарк разработан [Andrew Crotty](http://cs.brown.edu/people/acrotty/). Скачать данные: ``` @@ -74,7 +74,7 @@ ENGINE = MergeTree() ORDER BY (event_type, log_time); ``` -Insert data: +Вставка данных: ``` clickhouse-client --query "INSERT INTO mgbench.logs1 FORMAT CSVWithNames" < mgbench1.csv @@ -82,7 +82,7 @@ clickhouse-client --query "INSERT INTO mgbench.logs2 FORMAT CSVWithNames" < mgbe clickhouse-client --query "INSERT INTO mgbench.logs3 FORMAT CSVWithNames" < mgbench3.csv ``` -Run benchmark queries: +Запуск тестов производительности: ``` -- Q1.1: What is the CPU/network utilization for each web server since midnight? 
diff --git a/docs/ru/sql-reference/functions/array-functions.md b/docs/ru/sql-reference/functions/array-functions.md index 7afd9da471e..9702ab13d5e 100644 --- a/docs/ru/sql-reference/functions/array-functions.md +++ b/docs/ru/sql-reference/functions/array-functions.md @@ -1162,7 +1162,7 @@ arrayMin(arr) Запрос: ``` sql -SELECT arrayMin([1, 2, 4]) AS res +SELECT arrayMin([1, 2, 4]) AS res; ``` Результат: @@ -1176,7 +1176,7 @@ SELECT arrayMin([1, 2, 4]) AS res Запрос: ``` sql -SELECT arrayMin(x -> (-x), [1, 2, 4]) AS res +SELECT arrayMin(x -> (-x), [1, 2, 4]) AS res; ``` Результат: @@ -1214,7 +1214,7 @@ arrayMax(arr) Запрос: ```sql -SELECT arrayMax([1, 2, 4]) AS res +SELECT arrayMax([1, 2, 4]) AS res; ``` Результат: @@ -1228,7 +1228,7 @@ SELECT arrayMax([1, 2, 4]) AS res Запрос: ``` sql -SELECT arrayMax(x -> (-x), [1, 2, 4]) AS res +SELECT arrayMax(x -> (-x), [1, 2, 4]) AS res; ``` Результат: @@ -1266,7 +1266,7 @@ arraySum(arr) Запрос: ```sql -SELECT arraySum([2,3]) AS res +SELECT arraySum([2,3]) AS res; ``` Результат: @@ -1280,7 +1280,7 @@ SELECT arraySum([2,3]) AS res Запрос: ``` sql -SELECT arraySum(x -> x*x, [2, 3]) AS res +SELECT arraySum(x -> x*x, [2, 3]) AS res; ``` Результат: From 499c100b12233e3a6fbd31066a4bac3914a650e1 Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 17 Feb 2021 16:41:43 +0300 Subject: [PATCH 0373/2357] Better test --- .../test.py | 37 +++++++++++-------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/tests/integration/test_testkeeper_multinode_blocade_leader/test.py b/tests/integration/test_testkeeper_multinode_blocade_leader/test.py index a1fd066ab83..49d86ab9fe8 100644 --- a/tests/integration/test_testkeeper_multinode_blocade_leader/test.py +++ b/tests/integration/test_testkeeper_multinode_blocade_leader/test.py @@ -55,7 +55,6 @@ def get_fake_zk(nodename, timeout=30.0): _fake_zk_instance = KazooClient(hosts=cluster.get_instance_ip(nodename) + ":9181", timeout=timeout) def reset_listener(state): nonlocal _fake_zk_instance - print("Fake zk callback called for state", state) if state != KazooState.CONNECTED: _fake_zk_instance._reset() @@ -247,8 +246,8 @@ def test_blocade_leader_twice(started_cluster): for i in range(100): try: - restart_replica_for_sure(node3, "ordinary.t2", "/clickhouse/t2/replicas/3") + node3.query("SYSTEM SYNC REPLICA ordinary.t2", timeout=10) node3.query("INSERT INTO ordinary.t2 SELECT rand() FROM numbers(100)") break except Exception as ex: @@ -263,6 +262,10 @@ def test_blocade_leader_twice(started_cluster): dump_zk(node, '/clickhouse/t2', '/clickhouse/t2/replicas/{}'.format(num + 1)) assert False, "Cannot reconnect for node3" + node2.query("SYSTEM SYNC REPLICA ordinary.t2", timeout=10) + + assert node2.query("SELECT COUNT() FROM ordinary.t2") == "210\n" + assert node3.query("SELECT COUNT() FROM ordinary.t2") == "210\n" # Total network partition pm.partition_instances(node3, node2) @@ -281,7 +284,6 @@ def test_blocade_leader_twice(started_cluster): except Exception as ex: time.sleep(0.5) - for n, node in enumerate([node1, node2, node3]): for i in range(100): try: @@ -313,24 +315,29 @@ def test_blocade_leader_twice(started_cluster): dump_zk(node, '/clickhouse/t2', '/clickhouse/t2/replicas/{}'.format(num + 1)) assert False, "Cannot reconnect for node{}".format(n + 1) - for n, node in enumerate([node1, node2, node3]): for i in range(100): - try: - restart_replica_for_sure(node, "ordinary.t2", "/clickhouse/t2/replicas/{}".format(n + 1)) - node.query("SYSTEM SYNC REPLICA ordinary.t2", timeout=10) - break - except Exception as ex: 
+ all_done = True + for n, node in enumerate([node1, node2, node3]): try: - node.query("ATTACH TABLE ordinary.t2") - except Exception as attach_ex: - print("Got exception node{}".format(n + 1), smaller_exception(attach_ex)) + restart_replica_for_sure(node, "ordinary.t2", "/clickhouse/t2/replicas/{}".format(n + 1)) + node.query("SYSTEM SYNC REPLICA ordinary.t2", timeout=10) + break + except Exception as ex: + all_done = False + try: + node.query("ATTACH TABLE ordinary.t2") + except Exception as attach_ex: + print("Got exception node{}".format(n + 1), smaller_exception(attach_ex)) - print("Got exception node{}".format(n + 1), smaller_exception(ex)) - time.sleep(0.5) + print("Got exception node{}".format(n + 1), smaller_exception(ex)) + time.sleep(0.5) + + if all_done: + break else: for num, node in enumerate([node1, node2, node3]): dump_zk(node, '/clickhouse/t2', '/clickhouse/t2/replicas/{}'.format(num + 1)) - assert False, "Cannot reconnect for node{}".format(n + 1) + assert False, "Cannot reconnect in i {} retries".format(i) assert node1.query("SELECT COUNT() FROM ordinary.t2") == "510\n" if node2.query("SELECT COUNT() FROM ordinary.t2") != "510\n": From 8cecb533ca53038fe70a55fc4aa46e7ab2b0bef9 Mon Sep 17 00:00:00 2001 From: Marvin Taschenberger <45663148+Taschenbergerm@users.noreply.github.com> Date: Wed, 17 Feb 2021 15:03:09 +0100 Subject: [PATCH 0374/2357] Update argmax.md --- .../aggregate-functions/reference/argmax.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/en/sql-reference/aggregate-functions/reference/argmax.md b/docs/en/sql-reference/aggregate-functions/reference/argmax.md index 7639117042f..1af188ad026 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/argmax.md +++ b/docs/en/sql-reference/aggregate-functions/reference/argmax.md @@ -17,12 +17,12 @@ argMax(arg, val) or ``` sql -argMax(tuple(arg, val)) +argMax(tuple(arg1, arg2), val) ``` **Arguments** -- `arg` — Argument. +- `arg{i}` — Argument. - `val` — Value. **Returned value** @@ -33,7 +33,7 @@ Type: matches `arg` type. For tuple in the input: -- Tuple `(arg, val)`, where `val` is the maximum value and `arg` is a corresponding value. +- Tuple `(arg1, arg2)`, where `arg1` and `arg2` are the corresponding values. Type: [Tuple](../../../sql-reference/data-types/tuple.md). 
@@ -52,13 +52,13 @@ Input table: Query: ``` sql -SELECT argMax(user, salary), argMax(tuple(user, salary)) FROM salary; +SELECT argMax(user, salary), argMax(tuple(user, salary), salary) FROM salary; ``` Result: ``` text -┌─argMax(user, salary)─┬─argMax(tuple(user, salary))─┐ +┌─argMax(user, salary)─┬─argMax(tuple(user, salary), salary)─┐ │ director │ ('director',5000) │ └──────────────────────┴─────────────────────────────┘ ``` From 13ae988efffc41213445a7a7185a956fb00076df Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Wed, 17 Feb 2021 17:23:07 +0300 Subject: [PATCH 0375/2357] make force_drop_table work with materialized view --- programs/server/Server.cpp | 2 +- src/Databases/DatabaseAtomic.cpp | 13 +++-- src/Storages/StorageMaterializedView.cpp | 26 ---------- src/Storages/StorageMaterializedView.h | 3 -- .../test_force_drop_table/__init__.py | 0 .../test_force_drop_table/configs/config.xml | 4 ++ .../integration/test_force_drop_table/test.py | 49 +++++++++++++++++++ 7 files changed, 62 insertions(+), 35 deletions(-) create mode 100644 tests/integration/test_force_drop_table/__init__.py create mode 100644 tests/integration/test_force_drop_table/configs/config.xml create mode 100644 tests/integration/test_force_drop_table/test.py diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index a96cb2b8973..1bd6becfb37 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -759,7 +759,7 @@ int Server::main(const std::vector & /*args*/) global_context->updateStorageConfiguration(*config); }, - /* already_loaded = */ true); + /* already_loaded = */ false); auto & access_control = global_context->getAccessControlManager(); if (config().has("custom_settings_prefixes")) diff --git a/src/Databases/DatabaseAtomic.cpp b/src/Databases/DatabaseAtomic.cpp index 17a91a1fff9..6e6c281d26d 100644 --- a/src/Databases/DatabaseAtomic.cpp +++ b/src/Databases/DatabaseAtomic.cpp @@ -106,8 +106,15 @@ StoragePtr DatabaseAtomic::detachTable(const String & name) return table; } -void DatabaseAtomic::dropTable(const Context &, const String & table_name, bool no_delay) +void DatabaseAtomic::dropTable(const Context & context, const String & table_name, bool no_delay) { + if (auto * mv = dynamic_cast(tryGetTable(table_name, context).get())) + { + /// Remove the inner table (if any) to avoid deadlock + /// (due to attempt to execute DROP from the worker thread) + mv->dropInnerTable(no_delay); + } + String table_metadata_path = getObjectMetadataPath(table_name); String table_metadata_path_drop; StoragePtr table; @@ -121,10 +128,6 @@ void DatabaseAtomic::dropTable(const Context &, const String & table_name, bool } if (table->storesDataOnDisk()) tryRemoveSymlink(table_name); - /// Remove the inner table (if any) to avoid deadlock - /// (due to attempt to execute DROP from the worker thread) - if (auto * mv = dynamic_cast(table.get())) - mv->dropInnerTable(no_delay); /// Notify DatabaseCatalog that table was dropped. It will remove table data in background. /// Cleanup is performed outside of database to allow easily DROP DATABASE without waiting for cleanup to complete. 
DatabaseCatalog::instance().enqueueDroppedTableCleanup(table->getStorageID(), table, table_metadata_path_drop, no_delay); diff --git a/src/Storages/StorageMaterializedView.cpp b/src/Storages/StorageMaterializedView.cpp index af00b37b1d5..2482f59c0d0 100644 --- a/src/Storages/StorageMaterializedView.cpp +++ b/src/Storages/StorageMaterializedView.cpp @@ -389,32 +389,6 @@ Strings StorageMaterializedView::getDataPaths() const return {}; } -void StorageMaterializedView::checkTableCanBeDropped() const -{ - /// Don't drop the target table if it was created manually via 'TO inner_table' statement - if (!has_inner_table) - return; - - auto target_table = tryGetTargetTable(); - if (!target_table) - return; - - target_table->checkTableCanBeDropped(); -} - -void StorageMaterializedView::checkPartitionCanBeDropped(const ASTPtr & partition) -{ - /// Don't drop the partition in target table if it was created manually via 'TO inner_table' statement - if (!has_inner_table) - return; - - auto target_table = tryGetTargetTable(); - if (!target_table) - return; - - target_table->checkPartitionCanBeDropped(partition); -} - ActionLock StorageMaterializedView::getActionLock(StorageActionBlockType type) { return has_inner_table ? getTargetTable()->getActionLock(type) : ActionLock{}; diff --git a/src/Storages/StorageMaterializedView.h b/src/Storages/StorageMaterializedView.h index fab9e28afe3..1e0425a356f 100644 --- a/src/Storages/StorageMaterializedView.h +++ b/src/Storages/StorageMaterializedView.h @@ -64,9 +64,6 @@ public: void shutdown() override; - void checkTableCanBeDropped() const override; - void checkPartitionCanBeDropped(const ASTPtr & partition) override; - QueryProcessingStage::Enum getQueryProcessingStage(const Context &, QueryProcessingStage::Enum /*to_stage*/, SelectQueryInfo &) const override; StoragePtr getTargetTable() const; diff --git a/tests/integration/test_force_drop_table/__init__.py b/tests/integration/test_force_drop_table/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_force_drop_table/configs/config.xml b/tests/integration/test_force_drop_table/configs/config.xml new file mode 100644 index 00000000000..e5f133953a6 --- /dev/null +++ b/tests/integration/test_force_drop_table/configs/config.xml @@ -0,0 +1,4 @@ + + 1 + 1 + diff --git a/tests/integration/test_force_drop_table/test.py b/tests/integration/test_force_drop_table/test.py new file mode 100644 index 00000000000..ad8316493e4 --- /dev/null +++ b/tests/integration/test_force_drop_table/test.py @@ -0,0 +1,49 @@ +import pytest + +from helpers.cluster import ClickHouseCluster + +cluster = ClickHouseCluster(__file__) +node = cluster.add_instance('node', main_configs=["configs/config.xml"], with_zookeeper=True) + + +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster.start() + yield cluster + finally: + cluster.shutdown() + +def create_force_drop_flag(node): + force_drop_flag_path = "/var/lib/clickhouse/flags/force_drop_table" + node.exec_in_container(["bash", "-c", "touch {} && chmod a=rw {}".format(force_drop_flag_path, force_drop_flag_path)], user="root") + +@pytest.mark.parametrize("engine", ['Ordinary', 'Atomic']) +def test_drop_materialized_view(started_cluster, engine): + node.query("CREATE DATABASE d ENGINE={}".format(engine)) + node.query("CREATE TABLE d.rmt (n UInt64) ENGINE=ReplicatedMergeTree('/test/rmt', 'r1') ORDER BY n PARTITION BY n % 2") + node.query("CREATE MATERIALIZED VIEW d.mv (n UInt64, s String) ENGINE=MergeTree ORDER BY n PARTITION BY n % 2 AS 
SELECT n, toString(n) AS s FROM d.rmt") + node.query("INSERT INTO d.rmt VALUES (1), (2)") + assert "is greater than max" in node.query_and_get_error("DROP TABLE d.rmt") + assert "is greater than max" in node.query_and_get_error("DROP TABLE d.mv") + assert "is greater than max" in node.query_and_get_error("TRUNCATE TABLE d.rmt") + assert "is greater than max" in node.query_and_get_error("TRUNCATE TABLE d.mv") + assert "is greater than max" in node.query_and_get_error("ALTER TABLE d.rmt DROP PARTITION '0'") + assert node.query("SELECT * FROM d.rmt ORDER BY n") == "1\n2\n" + assert node.query("SELECT * FROM d.mv ORDER BY n") == "1\t1\n2\t2\n" + + create_force_drop_flag(node) + node.query("ALTER TABLE d.rmt DROP PARTITION '0'") + assert node.query("SELECT * FROM d.rmt ORDER BY n") == "1\n" + assert "is greater than max" in node.query_and_get_error("ALTER TABLE d.mv DROP PARTITION '0'") + create_force_drop_flag(node) + node.query("ALTER TABLE d.mv DROP PARTITION '0'") + assert node.query("SELECT * FROM d.mv ORDER BY n") == "1\t1\n" + assert "is greater than max" in node.query_and_get_error("DROP TABLE d.rmt SYNC") + create_force_drop_flag(node) + node.query("DROP TABLE d.rmt SYNC") + assert "is greater than max" in node.query_and_get_error("DROP TABLE d.mv SYNC") + create_force_drop_flag(node) + node.query("DROP TABLE d.mv SYNC") + node.query("DROP DATABASE d") + From e0980fd0b73b5c819b6206292c0334f11e6d8e11 Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 17 Feb 2021 17:41:21 +0300 Subject: [PATCH 0376/2357] Fix fasttest retry for failed tests --- docker/test/fasttest/run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/test/fasttest/run.sh b/docker/test/fasttest/run.sh index e6294b5d74d..90663102f17 100755 --- a/docker/test/fasttest/run.sh +++ b/docker/test/fasttest/run.sh @@ -344,7 +344,7 @@ function run_tests 01666_blns ) - time clickhouse-test --hung-check -j 8 --order=random --use-skip-list --no-long --testname --shard --zookeeper --skip "${TESTS_TO_SKIP[@]}" -- "$FASTTEST_FOCUS" 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/test_log.txt" + (time clickhouse-test --hung-check -j 8 --order=random --use-skip-list --no-long --testname --shard --zookeeper --skip "${TESTS_TO_SKIP[@]}" -- "$FASTTEST_FOCUS" 2>&1 ||:) | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/test_log.txt" # substr is to remove semicolon after test name readarray -t FAILED_TESTS < <(awk '/\[ FAIL|TIMEOUT|ERROR \]/ { print substr($3, 1, length($3)-1) }' "$FASTTEST_OUTPUT/test_log.txt" | tee "$FASTTEST_OUTPUT/failed-parallel-tests.txt") From bb4ced05f9da997c987c7f520f423fd3892bb7d0 Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 17 Feb 2021 17:52:32 +0300 Subject: [PATCH 0377/2357] Fix fast test --- docker/test/fasttest/run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/test/fasttest/run.sh b/docker/test/fasttest/run.sh index 90663102f17..202e2f12a1a 100755 --- a/docker/test/fasttest/run.sh +++ b/docker/test/fasttest/run.sh @@ -361,7 +361,7 @@ function run_tests stop_server ||: # Clean the data so that there is no interference from the previous test run. 
- rm -rf "$FASTTEST_DATA"/{{meta,}data,user_files} ||: + rm -rf "$FASTTEST_DATA"/{{meta,}data,user_files,coordination} ||: start_server From 50e135db0f925b33d44be562af3cc71dabdf8daf Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Wed, 17 Feb 2021 19:24:04 +0300 Subject: [PATCH 0378/2357] Added comment --- src/Dictionaries/getDictionaryConfigurationFromAST.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Dictionaries/getDictionaryConfigurationFromAST.cpp b/src/Dictionaries/getDictionaryConfigurationFromAST.cpp index acfb11787de..04ba1db09fc 100644 --- a/src/Dictionaries/getDictionaryConfigurationFromAST.cpp +++ b/src/Dictionaries/getDictionaryConfigurationFromAST.cpp @@ -403,6 +403,8 @@ void buildConfigurationFromFunctionWithKeyValueArguments( auto function = builder->build({}); function->prepare({}); + /// We assume that function will not take arguments and will return constant value like tcpPort or hostName + /// Such functions will return column with size equal to input_rows_count. size_t input_rows_count = 1; auto result = function->execute({}, function->getResultType(), input_rows_count); From 6522bfc402260b2b4edfd4c2f0ab55a662296e63 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Wed, 17 Feb 2021 19:54:11 +0300 Subject: [PATCH 0379/2357] Support for DIstinct, sorting steps. --- src/Interpreters/ActionsDAG.cpp | 2 +- src/Processors/QueryPlan/CreatingSetsStep.h | 2 +- src/Processors/QueryPlan/CubeStep.cpp | 5 ++ src/Processors/QueryPlan/CubeStep.h | 2 + src/Processors/QueryPlan/FillingStep.h | 2 + .../Optimizations/filterPushDown.cpp | 68 +++++++++++++++++-- 6 files changed, 74 insertions(+), 7 deletions(-) diff --git a/src/Interpreters/ActionsDAG.cpp b/src/Interpreters/ActionsDAG.cpp index 691905bed27..8b6013a4365 100644 --- a/src/Interpreters/ActionsDAG.cpp +++ b/src/Interpreters/ActionsDAG.cpp @@ -1389,7 +1389,7 @@ ActionsDAGPtr ActionsDAG::splitActionsForFilter(const std::string & filter_name, for (const auto * predicate : selected_predicates) args.emplace_back(nodes_mapping[predicate]); - result_predicate = &actions->addFunction(func_builder_and, args, {}, true); + result_predicate = &actions->addFunction(func_builder_and, args, {}, true, false); } actions->index.insert(result_predicate); diff --git a/src/Processors/QueryPlan/CreatingSetsStep.h b/src/Processors/QueryPlan/CreatingSetsStep.h index ec13ab2052e..97821cb63d3 100644 --- a/src/Processors/QueryPlan/CreatingSetsStep.h +++ b/src/Processors/QueryPlan/CreatingSetsStep.h @@ -34,7 +34,7 @@ private: class CreatingSetsStep : public IQueryPlanStep { public: - CreatingSetsStep(DataStreams input_streams_); + explicit CreatingSetsStep(DataStreams input_streams_); String getName() const override { return "CreatingSets"; } diff --git a/src/Processors/QueryPlan/CubeStep.cpp b/src/Processors/QueryPlan/CubeStep.cpp index de8bb2b3d43..6a0ec33402b 100644 --- a/src/Processors/QueryPlan/CubeStep.cpp +++ b/src/Processors/QueryPlan/CubeStep.cpp @@ -43,4 +43,9 @@ void CubeStep::transformPipeline(QueryPipeline & pipeline) }); } +const Aggregator::Params & CubeStep::getParams() const +{ + return params->params; +} + } diff --git a/src/Processors/QueryPlan/CubeStep.h b/src/Processors/QueryPlan/CubeStep.h index 707f62ce7d6..f67a03dc7e2 100644 --- a/src/Processors/QueryPlan/CubeStep.h +++ b/src/Processors/QueryPlan/CubeStep.h @@ -1,6 +1,7 @@ #pragma once #include #include +#include namespace DB { @@ -18,6 +19,7 @@ public: void transformPipeline(QueryPipeline & pipeline) override; + const Aggregator::Params & getParams() const; 
private: AggregatingTransformParamsPtr params; }; diff --git a/src/Processors/QueryPlan/FillingStep.h b/src/Processors/QueryPlan/FillingStep.h index 85736464a6c..c8d1f74c6ca 100644 --- a/src/Processors/QueryPlan/FillingStep.h +++ b/src/Processors/QueryPlan/FillingStep.h @@ -17,6 +17,8 @@ public: void describeActions(FormatSettings & settings) const override; + const SortDescription & getSortDescription() const { return sort_description; } + private: SortDescription sort_description; }; diff --git a/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp b/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp index 39f24a32b45..74c4fa6f329 100644 --- a/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp +++ b/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp @@ -4,9 +4,15 @@ #include #include #include +#include #include #include #include +#include "Processors/QueryPlan/FinishSortingStep.h" +#include "Processors/QueryPlan/MergeSortingStep.h" +#include "Processors/QueryPlan/MergingSortedStep.h" +#include "Processors/QueryPlan/PartialSortingStep.h" +#include #include namespace DB::ErrorCodes @@ -79,6 +85,30 @@ static size_t tryAddNewFilterStep( return 3; } +static Names getAggregatinKeys(const Aggregator::Params & params) +{ + Names keys; + keys.reserve(params.keys.size()); + for (auto pos : params.keys) + keys.push_back(params.src_header.getByPosition(pos).name); + + return keys; +} + +// static NameSet getColumnNamesFromSortDescription(const SortDescription & sort_desc, const Block & header) +// { +// NameSet names; +// for (const auto & column : sort_desc) +// { +// if (!column.column_name.empty()) +// names.insert(column.column_name); +// else +// names.insert(header.safeGetByPosition(column.column_number).name); +// } + +// return names; +// } + size_t tryPushDownFilter(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes) { if (parent_node->children.size() != 1) @@ -96,11 +126,7 @@ size_t tryPushDownFilter(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes if (auto * aggregating = typeid_cast(child.get())) { const auto & params = aggregating->getParams(); - - Names keys; - keys.reserve(params.keys.size()); - for (auto pos : params.keys) - keys.push_back(params.src_header.getByPosition(pos).name); + Names keys = getAggregatinKeys(params); if (auto updated_steps = tryAddNewFilterStep(parent_node, nodes, keys)) return updated_steps; @@ -124,6 +150,38 @@ size_t tryPushDownFilter(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes return updated_steps; } + if (auto * distinct = typeid_cast(child.get())) + { + Names allowed_inputs = distinct->getOutputStream().header.getNames(); + if (auto updated_steps = tryAddNewFilterStep(parent_node, nodes, allowed_inputs)) + return updated_steps; + } + + /// TODO. + /// We can filter earlier if expression does not depend on WITH FILL columns. + /// But we cannot just push down condition, because other column may be filled with defaults. + /// + /// It is possible to filter columns before and after WITH FILL, but such change is not idempotent. + /// So, appliying this to pair (Filter -> Filling) several times will create several similar filters. 
+ // if (auto * filling = typeid_cast(child.get())) + // { + // } + + /// Same reason for Cube + // if (auto * cube = typeid_cast(child.get())) + // { + // } + + if (typeid_cast(child.get()) + || typeid_cast(child.get()) + || typeid_cast(child.get()) + || typeid_cast(child.get())) + { + Names allowed_inputs = child->getOutputStream().header.getNames(); + if (auto updated_steps = tryAddNewFilterStep(parent_node, nodes, allowed_inputs)) + return updated_steps; + } + return 0; } From e5b9c42860cce08b0b94f7863dbeb6f38b066d83 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Wed, 17 Feb 2021 19:54:37 +0300 Subject: [PATCH 0380/2357] Update test. --- .../01655_plan_optimizations.reference | 70 +++++++++++++++ .../0_stateless/01655_plan_optimizations.sh | 85 +++++++++++++++++-- 2 files changed, 149 insertions(+), 6 deletions(-) diff --git a/tests/queries/0_stateless/01655_plan_optimizations.reference b/tests/queries/0_stateless/01655_plan_optimizations.reference index 1e638829c74..7bc75dc0bf6 100644 --- a/tests/queries/0_stateless/01655_plan_optimizations.reference +++ b/tests/queries/0_stateless/01655_plan_optimizations.reference @@ -13,32 +13,102 @@ Limit 10 > filter should be pushed down after aggregating Aggregating Filter +0 1 +1 2 +2 3 +3 4 +4 5 +5 6 +6 7 +7 8 +8 9 +9 10 > filter should be pushed down after aggregating, column after aggregation is const COLUMN Const(UInt8) -> notEquals(y, 0) Aggregating Filter Filter +0 1 1 +1 2 1 +2 3 1 +3 4 1 +4 5 1 +5 6 1 +6 7 1 +7 8 1 +8 9 1 +9 10 1 > one condition of filter should be pushed down after aggregating, other condition is aliased Filter column ALIAS notEquals(s, 4) :: 1 -> and(notEquals(y, 0), notEquals(s, 4)) Aggregating Filter column: notEquals(y, 0) +0 1 +1 2 +2 3 +3 4 +5 6 +6 7 +7 8 +8 9 +9 10 > one condition of filter should be pushed down after aggregating, other condition is casted Filter column FUNCTION CAST(minus(s, 4) :: 1, UInt8 :: 3) -> and(notEquals(y, 0), minus(s, 4)) Aggregating Filter column: notEquals(y, 0) +0 1 +1 2 +2 3 +3 4 +5 6 +6 7 +7 8 +8 9 +9 10 > one condition of filter should be pushed down after aggregating, other two conditions are ANDed Filter column FUNCTION and(minus(s, 4) :: 2, minus(s, 8) :: 1) -> and(notEquals(y, 0), minus(s, 8), minus(s, 4)) Aggregating Filter column: notEquals(y, 0) +0 1 +1 2 +2 3 +3 4 +5 6 +6 7 +7 8 +9 10 > two conditions of filter should be pushed down after aggregating and ANDed, one condition is aliased Filter column ALIAS notEquals(s, 8) :: 1 -> and(notEquals(y, 0), notEquals(s, 8), minus(y, 4)) Aggregating Filter column: and(minus(y, 4), notEquals(y, 0)) +0 1 +1 2 +2 3 +4 5 +5 6 +6 7 +7 8 +9 10 > filter is split, one part is filtered before ARRAY JOIN Filter column: and(notEquals(y, 2), notEquals(x, 0)) ARRAY JOIN x Filter column: notEquals(y, 2) +1 3 +> filter is pushed down before Distinct +Distinct +Distinct +Filter column: notEquals(y, 2) +0 0 +0 1 +1 0 +1 1 +> filter is pushed down before sorting steps +MergingSorted +MergeSorting +PartialSorting +Filter column: and(notEquals(x, 0), notEquals(y, 0)) +1 2 +1 1 diff --git a/tests/queries/0_stateless/01655_plan_optimizations.sh b/tests/queries/0_stateless/01655_plan_optimizations.sh index ccd331df45e..f770643fc41 100755 --- a/tests/queries/0_stateless/01655_plan_optimizations.sh +++ b/tests/queries/0_stateless/01655_plan_optimizations.sh @@ -16,49 +16,122 @@ $CLICKHOUSE_CLIENT -q " select number as x, number + 1 as y from numbers(10)) group by y ) where y != 0 settings enable_optimize_predicate_expression=0" | grep -o 
"Aggregating\|Filter" +$CLICKHOUSE_CLIENT -q " + select s, y from (select sum(x) as s, y from ( + select number as x, number + 1 as y from numbers(10)) group by y + ) where y != 0 order by s, y + settings enable_optimize_predicate_expression=0" echo "> filter should be pushed down after aggregating, column after aggregation is const" $CLICKHOUSE_CLIENT -q " - explain actions = 1 select *, y != 0 from (select sum(x), y from ( + explain actions = 1 select s, y, y != 0 from (select sum(x) as s, y from ( select number as x, number + 1 as y from numbers(10)) group by y ) where y != 0 settings enable_optimize_predicate_expression=0" | grep -o "Aggregating\|Filter\|COLUMN Const(UInt8) -> notEquals(y, 0)" +$CLICKHOUSE_CLIENT -q " + select s, y, y != 0 from (select sum(x) as s, y from ( + select number as x, number + 1 as y from numbers(10)) group by y + ) where y != 0 order by s, y, y != 0 + settings enable_optimize_predicate_expression=0" echo "> one condition of filter should be pushed down after aggregating, other condition is aliased" $CLICKHOUSE_CLIENT -q " - explain actions = 1 select * from ( + explain actions = 1 select s, y from ( select sum(x) as s, y from (select number as x, number + 1 as y from numbers(10)) group by y ) where y != 0 and s != 4 settings enable_optimize_predicate_expression=0" | grep -o "Aggregating\|Filter column\|Filter column: notEquals(y, 0)\|ALIAS notEquals(s, 4) :: 1 -> and(notEquals(y, 0), notEquals(s, 4))" +$CLICKHOUSE_CLIENT -q " + select s, y from ( + select sum(x) as s, y from (select number as x, number + 1 as y from numbers(10)) group by y + ) where y != 0 and s != 4 order by s, y + settings enable_optimize_predicate_expression=0" echo "> one condition of filter should be pushed down after aggregating, other condition is casted" $CLICKHOUSE_CLIENT -q " - explain actions = 1 select * from ( + explain actions = 1 select s, y from ( select sum(x) as s, y from (select number as x, number + 1 as y from numbers(10)) group by y ) where y != 0 and s - 4 settings enable_optimize_predicate_expression=0" | grep -o "Aggregating\|Filter column\|Filter column: notEquals(y, 0)\|FUNCTION CAST(minus(s, 4) :: 1, UInt8 :: 3) -> and(notEquals(y, 0), minus(s, 4))" +$CLICKHOUSE_CLIENT -q " + select s, y from ( + select sum(x) as s, y from (select number as x, number + 1 as y from numbers(10)) group by y + ) where y != 0 and s - 4 order by s, y + settings enable_optimize_predicate_expression=0" echo "> one condition of filter should be pushed down after aggregating, other two conditions are ANDed" $CLICKHOUSE_CLIENT -q " - explain actions = 1 select * from ( + explain actions = 1 select s, y from ( select sum(x) as s, y from (select number as x, number + 1 as y from numbers(10)) group by y ) where y != 0 and s - 8 and s - 4 settings enable_optimize_predicate_expression=0" | grep -o "Aggregating\|Filter column\|Filter column: notEquals(y, 0)\|FUNCTION and(minus(s, 4) :: 2, minus(s, 8) :: 1) -> and(notEquals(y, 0), minus(s, 8), minus(s, 4))" +$CLICKHOUSE_CLIENT -q " + select s, y from ( + select sum(x) as s, y from (select number as x, number + 1 as y from numbers(10)) group by y + ) where y != 0 and s - 8 and s - 4 order by s, y + settings enable_optimize_predicate_expression=0" echo "> two conditions of filter should be pushed down after aggregating and ANDed, one condition is aliased" $CLICKHOUSE_CLIENT -q " - explain optimize = 1, actions = 1 select * from ( + explain actions = 1 select s, y from ( select sum(x) as s, y from (select number as x, number + 1 as y from 
numbers(10)) group by y ) where y != 0 and s != 8 and y - 4 settings enable_optimize_predicate_expression=0" | grep -o "Aggregating\|Filter column\|Filter column: and(minus(y, 4), notEquals(y, 0))\|ALIAS notEquals(s, 8) :: 1 -> and(notEquals(y, 0), notEquals(s, 8), minus(y, 4))" +$CLICKHOUSE_CLIENT -q " + select s, y from ( + select sum(x) as s, y from (select number as x, number + 1 as y from numbers(10)) group by y + ) where y != 0 and s != 8 and y - 4 order by s, y + settings enable_optimize_predicate_expression=0" echo "> filter is split, one part is filtered before ARRAY JOIN" $CLICKHOUSE_CLIENT -q " explain actions = 1 select x, y from ( select range(number) as x, number + 1 as y from numbers(3) ) array join x where y != 2 and x != 0" | - grep -o "Filter column: and(notEquals(y, 2), notEquals(x, 0))\|ARRAY JOIN x\|Filter column: notEquals(y, 2)" \ No newline at end of file + grep -o "Filter column: and(notEquals(y, 2), notEquals(x, 0))\|ARRAY JOIN x\|Filter column: notEquals(y, 2)" +$CLICKHOUSE_CLIENT -q " + select x, y from ( + select range(number) as x, number + 1 as y from numbers(3) + ) array join x where y != 2 and x != 0 order by x, y" + +# echo "> filter is split, one part is filtered before Aggregating and Cube" +# $CLICKHOUSE_CLIENT -q " +# explain actions = 1 select * from ( +# select sum(x) as s, x, y from (select number as x, number + 1 as y from numbers(10)) group by x, y with cube +# ) where y != 0 and s != 4 +# settings enable_optimize_predicate_expression=0" | +# grep -o "Cube\|Aggregating\|Filter column: notEquals(y, 0)" +# $CLICKHOUSE_CLIENT -q " +# select s, x, y from ( +# select sum(x) as s, x, y from (select number as x, number + 1 as y from numbers(10)) group by x, y with cube +# ) where y != 0 and s != 4 order by s, x, y +# settings enable_optimize_predicate_expression=0" + +echo "> filter is pushed down before Distinct" +$CLICKHOUSE_CLIENT -q " + explain actions = 1 select x, y from ( + select distinct x, y from (select number % 2 as x, number % 3 as y from numbers(10)) + ) where y != 2 + settings enable_optimize_predicate_expression=0" | + grep -o "Distinct\|Filter column: notEquals(y, 2)" +$CLICKHOUSE_CLIENT -q " + select x, y from ( + select distinct x, y from (select number % 2 as x, number % 3 as y from numbers(10)) + ) where y != 2 order by x, y + settings enable_optimize_predicate_expression=0" + +echo "> filter is pushed down before sorting steps" +$CLICKHOUSE_CLIENT -q " + explain actions = 1 select x, y from ( + select number % 2 as x, number % 3 as y from numbers(6) order by y desc + ) where x != 0 and y != 0 + settings enable_optimize_predicate_expression = 0" | + grep -o "MergingSorted\|MergeSorting\|PartialSorting\|Filter column: and(notEquals(x, 0), notEquals(y, 0))" +$CLICKHOUSE_CLIENT -q " + select x, y from ( + select number % 2 as x, number % 3 as y from numbers(6) order by y desc + ) where x != 0 and y != 0 + settings enable_optimize_predicate_expression = 0" From f6278ed429dc2231d68aa5179e63b3bb635d081a Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Wed, 17 Feb 2021 19:56:17 +0300 Subject: [PATCH 0381/2357] Support for DIstinct, sorting steps. 
--- src/Processors/QueryPlan/Optimizations/filterPushDown.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp b/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp index 74c4fa6f329..02e1914504d 100644 --- a/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp +++ b/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp @@ -60,12 +60,12 @@ static size_t tryAddNewFilterStep( "Filter column {} was removed from ActionsDAG but it is needed in result. DAG:\n{}", filter_column_name, expression->dumpDAG()); - std::cerr << "replacing to expr because filter " << filter_column_name << " was removed\n"; + // std::cerr << "replacing to expr because filter " << filter_column_name << " was removed\n"; parent = std::make_unique(child->getOutputStream(), expression); } else if ((*it)->column && isColumnConst(*(*it)->column)) { - std::cerr << "replacing to expr because filter is const\n"; + // std::cerr << "replacing to expr because filter is const\n"; parent = std::make_unique(child->getOutputStream(), expression); } From c704a8cc45a298f363c9b5de2349ca8dcdd45d1f Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Wed, 17 Feb 2021 20:05:52 +0300 Subject: [PATCH 0382/2357] Log stdout and stderr when failed to start docker in integration tests. --- tests/integration/helpers/cluster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/helpers/cluster.py b/tests/integration/helpers/cluster.py index 14aa2f252c5..aaba3a34555 100644 --- a/tests/integration/helpers/cluster.py +++ b/tests/integration/helpers/cluster.py @@ -730,7 +730,7 @@ class ClickHouseCluster: clickhouse_start_cmd = self.base_cmd + ['up', '-d', '--no-recreate'] print(("Trying to create ClickHouse instance by command %s", ' '.join(map(str, clickhouse_start_cmd)))) - subprocess.check_output(clickhouse_start_cmd) + subprocess_check_call(clickhouse_start_cmd) print("ClickHouse instance created") start_deadline = time.time() + 20.0 # seconds From 1b78de2142062edcdd0f8f084758a54e3f03867d Mon Sep 17 00:00:00 2001 From: Pavel Kruglov Date: Wed, 17 Feb 2021 20:34:52 +0300 Subject: [PATCH 0383/2357] Use fibers in HedgedRequests --- src/Client/Connection.cpp | 258 +++++++---------- src/Client/Connection.h | 41 ++- src/Client/ConnectionEstablisher.cpp | 233 ++++++++++++++++ src/Client/ConnectionEstablisher.h | 116 ++++++++ src/Client/ConnectionPoolWithFailover.cpp | 213 +------------- src/Client/ConnectionPoolWithFailover.h | 45 --- src/Client/HedgedConnections.cpp | 147 +++++----- src/Client/HedgedConnections.h | 25 +- src/Client/HedgedConnectionsFactory.cpp | 261 ++++++------------ src/Client/HedgedConnectionsFactory.h | 55 ++-- src/Client/MultiplexedConnections.cpp | 6 +- src/Client/PacketReceiver.h | 143 ++++++++++ src/Client/ya.make | 1 + src/Common/Epoll.cpp | 8 +- src/Core/Defines.h | 3 +- src/Core/Settings.h | 8 +- src/IO/ConnectionTimeouts.h | 19 +- src/IO/ConnectionTimeoutsContext.h | 3 +- src/Server/TCPHandler.cpp | 35 +-- .../test_distributed_load_balancing/test.py | 4 + .../integration/test_hedged_requests/test.py | 169 +++--------- .../test_hedged_requests_parallel/test.py | 92 +++--- 22 files changed, 931 insertions(+), 954 deletions(-) create mode 100644 src/Client/ConnectionEstablisher.cpp create mode 100644 src/Client/ConnectionEstablisher.h create mode 100644 src/Client/PacketReceiver.h diff --git a/src/Client/Connection.cpp b/src/Client/Connection.cpp index a68ab6df34e..d30a6555da5 100644 --- 
a/src/Client/Connection.cpp +++ b/src/Client/Connection.cpp @@ -59,43 +59,17 @@ namespace ErrorCodes void Connection::connect(const ConnectionTimeouts & timeouts) -{ - if (connected) - disconnect(); - - prepare(timeouts); - sendHello(); - receiveHello(); - - LOG_TRACE(log_wrapper.get(), "Connected to {} server version {}.{}.{}.", - server_name, server_version_major, server_version_minor, server_version_patch); -} - - -void Connection::disconnect() -{ - LOG_DEBUG(log, "disconnect"); - maybe_compressed_out = nullptr; - in = nullptr; - last_input_packet_type.reset(); - out = nullptr; // can write to socket - if (socket) - socket->close(); - socket = nullptr; - connected = false; -} - -void Connection::prepare(const ConnectionTimeouts & timeouts) { try { - LOG_TRACE( - log_wrapper.get(), - "Connecting. Database: {}. User: {}{}{}", - default_database.empty() ? "(not specified)" : default_database, - user, - static_cast(secure) ? ". Secure" : "", - static_cast(compression) ? "" : ". Uncompressed"); + if (connected) + disconnect(); + + LOG_TRACE(log_wrapper.get(), "Connecting. Database: {}. User: {}{}{}", + default_database.empty() ? "(not specified)" : default_database, + user, + static_cast(secure) ? ". Secure" : "", + static_cast(compression) ? "" : ". Uncompressed"); if (static_cast(secure)) { @@ -105,7 +79,7 @@ void Connection::prepare(const ConnectionTimeouts & timeouts) /// we resolve the ip when we open SecureStreamSocket, so to make Server Name Indication (SNI) /// work we need to pass host name separately. It will be send into TLS Hello packet to let /// the server know which host we want to talk with (single IP can process requests for multiple hosts using SNI). - static_cast(socket.get())->setPeerHostName(host); + static_cast(socket.get())->setPeerHostName(host); #else throw Exception{"tcp_secure protocol is disabled because poco library was built without NetSSL support.", ErrorCodes::SUPPORT_IS_DISABLED}; #endif @@ -125,21 +99,28 @@ void Connection::prepare(const ConnectionTimeouts & timeouts) if (timeouts.tcp_keep_alive_timeout.totalSeconds()) { socket->setKeepAlive(true); - socket->setOption( - IPPROTO_TCP, + socket->setOption(IPPROTO_TCP, #if defined(TCP_KEEPALIVE) TCP_KEEPALIVE #else - TCP_KEEPIDLE // __APPLE__ + TCP_KEEPIDLE // __APPLE__ #endif - , - timeouts.tcp_keep_alive_timeout); + , timeouts.tcp_keep_alive_timeout); } in = std::make_shared(*socket); + if (async_callback) + in->setAsyncCallback(std::move(async_callback)); + out = std::make_shared(*socket); connected = true; + + sendHello(); + receiveHello(); + + LOG_TRACE(log_wrapper.get(), "Connected to {} server version {}.{}.{}.", + server_name, server_version_major, server_version_minor, server_version_patch); } catch (Poco::Net::NetException & e) { @@ -158,12 +139,21 @@ void Connection::prepare(const ConnectionTimeouts & timeouts) } +void Connection::disconnect() +{ + maybe_compressed_out = nullptr; + in = nullptr; + last_input_packet_type.reset(); + out = nullptr; // can write to socket + if (socket) + socket->close(); + socket = nullptr; + connected = false; +} + void Connection::sendHello() { - LOG_DEBUG(log_wrapper.get(), "sendHello"); - try - { - /** Disallow control characters in user controlled parameters + /** Disallow control characters in user controlled parameters * to mitigate the possibility of SSRF. * The user may do server side requests with 'remote' table function. 
* Malicious user with full r/w access to ClickHouse @@ -172,119 +162,84 @@ void Connection::sendHello() * Limiting number of possible characters in user-controlled part of handshake * will mitigate this possibility but doesn't solve it completely. */ - auto has_control_character = [](const std::string & s) - { - for (auto c : s) - if (isControlASCII(c)) - return true; - return false; - }; + auto has_control_character = [](const std::string & s) + { + for (auto c : s) + if (isControlASCII(c)) + return true; + return false; + }; - if (has_control_character(default_database) || has_control_character(user) || has_control_character(password)) - throw Exception( - "Parameters 'default_database', 'user' and 'password' must not contain ASCII control characters", - ErrorCodes::BAD_ARGUMENTS); + if (has_control_character(default_database) + || has_control_character(user) + || has_control_character(password)) + throw Exception("Parameters 'default_database', 'user' and 'password' must not contain ASCII control characters", ErrorCodes::BAD_ARGUMENTS); - writeVarUInt(Protocol::Client::Hello, *out); - writeStringBinary((DBMS_NAME " ") + client_name, *out); - writeVarUInt(DBMS_VERSION_MAJOR, *out); - writeVarUInt(DBMS_VERSION_MINOR, *out); - // NOTE For backward compatibility of the protocol, client cannot send its version_patch. - writeVarUInt(DBMS_TCP_PROTOCOL_VERSION, *out); - writeStringBinary(default_database, *out); - /// If interserver-secret is used, one do not need password - /// (NOTE we do not check for DBMS_MIN_REVISION_WITH_INTERSERVER_SECRET, since we cannot ignore inter-server secret if it was requested) - if (!cluster_secret.empty()) - { - writeStringBinary(USER_INTERSERVER_MARKER, *out); - writeStringBinary("" /* password */, *out); + writeVarUInt(Protocol::Client::Hello, *out); + writeStringBinary((DBMS_NAME " ") + client_name, *out); + writeVarUInt(DBMS_VERSION_MAJOR, *out); + writeVarUInt(DBMS_VERSION_MINOR, *out); + // NOTE For backward compatibility of the protocol, client cannot send its version_patch. + writeVarUInt(DBMS_TCP_PROTOCOL_VERSION, *out); + writeStringBinary(default_database, *out); + /// If interserver-secret is used, one do not need password + /// (NOTE we do not check for DBMS_MIN_REVISION_WITH_INTERSERVER_SECRET, since we cannot ignore inter-server secret if it was requested) + if (!cluster_secret.empty()) + { + writeStringBinary(USER_INTERSERVER_MARKER, *out); + writeStringBinary("" /* password */, *out); #if USE_SSL - sendClusterNameAndSalt(); + sendClusterNameAndSalt(); #else - throw Exception( - "Inter-server secret support is disabled, because ClickHouse was built without SSL library", - ErrorCodes::SUPPORT_IS_DISABLED); + throw Exception( + "Inter-server secret support is disabled, because ClickHouse was built without SSL library", + ErrorCodes::SUPPORT_IS_DISABLED); #endif - } - else - { - writeStringBinary(user, *out); - writeStringBinary(password, *out); - } - - out->next(); } - catch (Poco::Net::NetException & e) + else { - disconnect(); - - /// Add server address to exception. Also Exception will remember stack trace. It's a pity that more precise exception type is lost. - throw NetException(e.displayText() + " (" + getDescription() + ")", ErrorCodes::NETWORK_ERROR); + writeStringBinary(user, *out); + writeStringBinary(password, *out); } - catch (Poco::TimeoutException & e) - { - disconnect(); - /// Add server address to exception. Also Exception will remember stack trace. It's a pity that more precise exception type is lost. 
- throw NetException(e.displayText() + " (" + getDescription() + ")", ErrorCodes::SOCKET_TIMEOUT); - } + out->next(); } - void Connection::receiveHello() { - LOG_DEBUG(log_wrapper.get(), "receiveHello"); + /// Receive hello packet. + UInt64 packet_type = 0; - try + /// Prevent read after eof in readVarUInt in case of reset connection + /// (Poco should throw such exception while reading from socket but + /// sometimes it doesn't for unknown reason) + if (in->eof()) + throw Poco::Net::NetException("Connection reset by peer"); + + readVarUInt(packet_type, *in); + if (packet_type == Protocol::Server::Hello) { - /// Receive hello packet. - UInt64 packet_type = 0; - - /// Prevent read after eof in readVarUInt in case of reset connection - /// (Poco should throw such exception while reading from socket but - /// sometimes it doesn't for unknown reason) - if (in->eof()) - throw Poco::Net::NetException("Connection reset by peer"); - - readVarUInt(packet_type, *in); - if (packet_type == Protocol::Server::Hello) - { - readStringBinary(server_name, *in); - readVarUInt(server_version_major, *in); - readVarUInt(server_version_minor, *in); - readVarUInt(server_revision, *in); - if (server_revision >= DBMS_MIN_REVISION_WITH_SERVER_TIMEZONE) - readStringBinary(server_timezone, *in); - if (server_revision >= DBMS_MIN_REVISION_WITH_SERVER_DISPLAY_NAME) - readStringBinary(server_display_name, *in); - if (server_revision >= DBMS_MIN_REVISION_WITH_VERSION_PATCH) - readVarUInt(server_version_patch, *in); - else - server_version_patch = server_revision; - } - else if (packet_type == Protocol::Server::Exception) - receiveException()->rethrow(); + readStringBinary(server_name, *in); + readVarUInt(server_version_major, *in); + readVarUInt(server_version_minor, *in); + readVarUInt(server_revision, *in); + if (server_revision >= DBMS_MIN_REVISION_WITH_SERVER_TIMEZONE) + readStringBinary(server_timezone, *in); + if (server_revision >= DBMS_MIN_REVISION_WITH_SERVER_DISPLAY_NAME) + readStringBinary(server_display_name, *in); + if (server_revision >= DBMS_MIN_REVISION_WITH_VERSION_PATCH) + readVarUInt(server_version_patch, *in); else - { - /// Close connection, to not stay in unsynchronised state. - disconnect(); - throwUnexpectedPacket(packet_type, "Hello or Exception"); - } + server_version_patch = server_revision; } - catch (Poco::Net::NetException & e) + else if (packet_type == Protocol::Server::Exception) + receiveException()->rethrow(); + else { + /// Close connection, to not stay in unsynchronised state. disconnect(); - - /// Add server address to exception. Also Exception will remember stack trace. It's a pity that more precise exception type is lost. - throw NetException(e.displayText() + " (" + getDescription() + ")", ErrorCodes::NETWORK_ERROR); - } - catch (Poco::TimeoutException & e) - { - disconnect(); - - /// Add server address to exception. Also Exception will remember stack trace. It's a pity that more precise exception type is lost. 
- throw NetException(e.displayText() + " (" + getDescription() + ")", ErrorCodes::SOCKET_TIMEOUT); + throwUnexpectedPacket(packet_type, "Hello or Exception"); } } @@ -425,24 +380,9 @@ TablesStatusResponse Connection::getTablesStatus(const ConnectionTimeouts & time TimeoutSetter timeout_setter(*socket, sync_request_timeout, true); - sendTablesStatusRequest(request); - TablesStatusResponse response = receiveTablesStatusResponse(); - - return response; -} - -void Connection::sendTablesStatusRequest(const TablesStatusRequest & request) -{ - LOG_DEBUG(log_wrapper.get(), "sendTablesStatusRequest"); - writeVarUInt(Protocol::Client::TablesStatusRequest, *out); request.write(*out, server_revision); out->next(); -} - -TablesStatusResponse Connection::receiveTablesStatusResponse() -{ - LOG_DEBUG(log_wrapper.get(), "receiveTablesStatusResponse"); UInt64 response_type = 0; readVarUInt(response_type, *in); @@ -457,6 +397,7 @@ TablesStatusResponse Connection::receiveTablesStatusResponse() return response; } + void Connection::sendQuery( const ConnectionTimeouts & timeouts, const String & query, @@ -466,8 +407,6 @@ void Connection::sendQuery( const ClientInfo * client_info, bool with_pending_data) { - LOG_DEBUG(log_wrapper.get(), "sendQuery"); - if (!connected) connect(timeouts); @@ -565,8 +504,6 @@ void Connection::sendQuery( void Connection::sendCancel() { - LOG_DEBUG(log_wrapper.get(), "sendCancel"); - /// If we already disconnected. if (!out) return; @@ -815,13 +752,8 @@ std::optional Connection::checkPacket(size_t timeout_microseconds) } -Packet Connection::receivePacket(AsyncCallback async_callback) +Packet Connection::receivePacket() { - LOG_DEBUG(log_wrapper.get(), "receivePacket"); - - in->setAsyncCallback(std::move(async_callback)); - SCOPE_EXIT(in->setAsyncCallback({})); - try { Packet res; @@ -896,8 +828,6 @@ Packet Connection::receivePacket(AsyncCallback async_callback) Block Connection::receiveData() { - LOG_DEBUG(log_wrapper.get(), "receiveData"); - initBlockInput(); return receiveDataImpl(block_in); } diff --git a/src/Client/Connection.h b/src/Client/Connection.h index 1546f42f382..dd501b5f6ef 100644 --- a/src/Client/Connection.h +++ b/src/Client/Connection.h @@ -175,8 +175,7 @@ public: std::optional checkPacket(size_t timeout_microseconds = 0); /// Receive packet from server. - /// Each time read blocks and async_callback is set, it will be called. You can poll socket inside it. - Packet receivePacket(AsyncCallback async_callback = {}); + Packet receivePacket(); /// If not connected yet, or if connection is broken - then connect. If cannot connect - throw an exception. void forceConnected(const ConnectionTimeouts & timeouts); @@ -195,19 +194,15 @@ public: size_t outBytesCount() const { return out ? out->count() : 0; } size_t inBytesCount() const { return in ? 
in->count() : 0; } - /// Make preparation before sending Hello in connect - void prepare(const ConnectionTimeouts & timeouts); - - void sendHello(); - - void receiveHello(); - - void sendTablesStatusRequest(const TablesStatusRequest & request); - - TablesStatusResponse receiveTablesStatusResponse(); - Poco::Net::Socket * getSocket() { return socket.get(); } + void setAsyncCallback(AsyncCallback async_callback_) + { + async_callback = std::move(async_callback_); + if (in) + in->setAsyncCallback(std::move(async_callback)); + } + private: String host; UInt16 port; @@ -295,7 +290,11 @@ private: LoggerWrapper log_wrapper; + AsyncCallback async_callback; + void connect(const ConnectionTimeouts & timeouts); + void sendHello(); + void receiveHello(); #if USE_SSL void sendClusterNameAndSalt(); @@ -318,4 +317,20 @@ private: [[noreturn]] void throwUnexpectedPacket(UInt64 packet_type, const char * expected) const; }; +class AsyncCallbackSetter +{ +public: + AsyncCallbackSetter(Connection * connection_, AsyncCallback async_callback) : connection(connection_) + { + connection->setAsyncCallback(std::move(async_callback)); + } + + ~AsyncCallbackSetter() + { + connection->setAsyncCallback({}); + } +private: + Connection * connection; +}; + } diff --git a/src/Client/ConnectionEstablisher.cpp b/src/Client/ConnectionEstablisher.cpp new file mode 100644 index 00000000000..e529d366fdc --- /dev/null +++ b/src/Client/ConnectionEstablisher.cpp @@ -0,0 +1,233 @@ +#include +#include +#include + +namespace ProfileEvents +{ + extern const Event DistributedConnectionMissingTable; + extern const Event DistributedConnectionStaleReplica; +} + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int ATTEMPT_TO_READ_AFTER_EOF; + extern const int NETWORK_ERROR; + extern const int SOCKET_TIMEOUT; +} + + +ConnectionEstablisher::ConnectionEstablisher( + IConnectionPool * pool_, + const ConnectionTimeouts * timeouts_, + const Settings * settings_, + const QualifiedTableName * table_to_check_) + : pool(pool_), timeouts(timeouts_), settings(settings_), table_to_check(table_to_check_), + stage(Stage::INITIAL), log(&Poco::Logger::get("ConnectionEstablisher")) +{ +#if defined(OS_LINUX) + epoll.add(receive_timeout.getDescriptor()); +#endif +} + +void ConnectionEstablisher::Routine::ReadCallback::operator()(int fd, const Poco::Timespan & timeout, const std::string &) +{ +#if defined(OS_LINUX) + if (connection_establisher.socket_fd != fd) + { + if (connection_establisher.socket_fd != -1) + connection_establisher.epoll.remove(connection_establisher.socket_fd); + + connection_establisher.epoll.add(fd); + connection_establisher.socket_fd = fd; + } + + connection_establisher.receive_timeout.setRelative(timeout); + fiber = std::move(fiber).resume(); + connection_establisher.receive_timeout.reset(); +#endif +} + +Fiber ConnectionEstablisher::Routine::operator()(Fiber && sink) +{ + try + { + connection_establisher.establishConnection(ReadCallback{connection_establisher, sink}); + } + catch (const boost::context::detail::forced_unwind &) + { + /// This exception is thrown by fiber implementation in case if fiber is being deleted but hasn't exited + /// It should not be caught or it will segfault. + /// Other exceptions must be caught + throw; + } + catch (...) 
+ { + connection_establisher.exception = std::current_exception(); + } + + return std::move(sink); +} + +void ConnectionEstablisher::resume() +{ + if (!fiber_created) + { + reset(); + fiber = boost::context::fiber(std::allocator_arg_t(), fiber_stack, Routine{*this}); + fiber_created = true; + resumeFiber(); + return; + } + +#if defined(OS_LINUX) + bool is_socket_ready = false; + bool is_receive_timeout_alarmed = false; + + epoll_event events[2]; + events[0].data.fd = events[1].data.fd; + size_t ready_count = epoll.getManyReady(2, events, true); + for (size_t i = 0; i != ready_count; ++i) + { + if (events[i].data.fd == socket_fd) + is_socket_ready = true; + if (events[i].data.fd == receive_timeout.getDescriptor()) + is_receive_timeout_alarmed = true; + } + + if (is_receive_timeout_alarmed && !is_socket_ready) + processReceiveTimeout(); +#endif + + resumeFiber(); +} + +void ConnectionEstablisher::cancel() +{ + destroyFiber(); + reset(); +} + +void ConnectionEstablisher::processReceiveTimeout() +{ +#if defined(OS_LINUX) + destroyFiber(); + stage = Stage::FAILED; + fail_message = "Code: 209, e.displayText() = DB::NetException: Timeout exceeded while reading from socket (" + result.entry->getDescription() + ")"; + epoll.remove(socket_fd); + resetResult(); +#endif +} + +void ConnectionEstablisher::resetResult() +{ + if (!result.entry.isNull()) + { + result.entry->disconnect(); + result.reset(); + } +} + +void ConnectionEstablisher::reset() +{ + stage = Stage::INITIAL; + resetResult(); + fail_message.clear(); + socket_fd = -1; +} + +void ConnectionEstablisher::resumeFiber() +{ + fiber = std::move(fiber).resume(); + + if (exception) + std::rethrow_exception(std::move(exception)); + + if (stage == Stage::FAILED) + destroyFiber(); +} + +void ConnectionEstablisher::destroyFiber() +{ + Fiber to_destroy = std::move(fiber); + fiber_created = false; +} + +void ConnectionEstablisher::establishConnection(AsyncCallback async_callback) +{ + try + { + stage = Stage::IN_PROCESS; + result.entry = pool->get(*timeouts, settings, /* force_connected = */ false); + AsyncCallbackSetter async_setter(&*result.entry, std::move(async_callback)); + + UInt64 server_revision = 0; + if (table_to_check) + server_revision = result.entry->getServerRevision(*timeouts); + + if (!table_to_check || server_revision < DBMS_MIN_REVISION_WITH_TABLES_STATUS) + { + result.entry->forceConnected(*timeouts); + result.is_usable = true; + result.is_up_to_date = true; + stage = Stage::FINISHED; + return; + } + + /// Only status of the remote table corresponding to the Distributed table is taken into account. + /// TODO: request status for joined tables also. + TablesStatusRequest status_request; + status_request.tables.emplace(*table_to_check); + + TablesStatusResponse status_response = result.entry->getTablesStatus(*timeouts, status_request); + auto table_status_it = status_response.table_states_by_id.find(*table_to_check); + if (table_status_it == status_response.table_states_by_id.end()) + { + const char * message_pattern = "There is no table {}.{} on server: {}"; + fail_message = fmt::format(message_pattern, backQuote(table_to_check->database), backQuote(table_to_check->table), result.entry->getDescription()); + LOG_WARNING(log, fail_message); + ProfileEvents::increment(ProfileEvents::DistributedConnectionMissingTable); + + stage = Stage::FINISHED; + return; + } + + result.is_usable = true; + + UInt64 max_allowed_delay = settings ? 
UInt64(settings->max_replica_delay_for_distributed_queries) : 0; + if (!max_allowed_delay) + { + result.is_up_to_date = true; + stage = Stage::FINISHED; + return; + } + + UInt32 delay = table_status_it->second.absolute_delay; + + if (delay < max_allowed_delay) + result.is_up_to_date = true; + else + { + result.is_up_to_date = false; + result.staleness = delay; + + LOG_TRACE(log, "Server {} has unacceptable replica delay for table {}.{}: {}", result.entry->getDescription(), table_to_check->database, table_to_check->table, delay); + ProfileEvents::increment(ProfileEvents::DistributedConnectionStaleReplica); + } + stage = Stage::FINISHED; + } + catch (const Exception & e) + { + if (e.code() != ErrorCodes::NETWORK_ERROR && e.code() != ErrorCodes::SOCKET_TIMEOUT + && e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF) + throw; + + fail_message = getCurrentExceptionMessage(/* with_stacktrace = */ false); + resetResult(); + stage = Stage::FAILED; + } +} + +} diff --git a/src/Client/ConnectionEstablisher.h b/src/Client/ConnectionEstablisher.h new file mode 100644 index 00000000000..8d10126b3da --- /dev/null +++ b/src/Client/ConnectionEstablisher.h @@ -0,0 +1,116 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +/// Class for nonblocking establishing connection to the replica. +/// It runs establishing connection process in fiber and sets special +/// read callback which is called when reading from socket blocks. +/// When read callback is called, socket and receive timeout are added in epoll +/// and execution returns to the main program. +/// So, you can poll this epoll file descriptor to determine when to resume. +class ConnectionEstablisher +{ +public: + using TryResult = PoolWithFailoverBase::TryResult; + + ConnectionEstablisher(IConnectionPool * pool_, + const ConnectionTimeouts * timeouts_, + const Settings * settings_, + const QualifiedTableName * table_to_check = nullptr); + + /// Establish connection with replica, call async_callbeck when + /// reading from socket blocks. + void establishConnection(AsyncCallback async_callback = {}); + + /// In the first call create fiber with establishConnection function, + /// in the next - check timeout and resume fiber. + void resume(); + + /// Cancel establishing connections. Fiber will be destroyed, + /// class will be set in initial stage. 
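A rough caller-side sketch of how the nonblocking interface declared in this header could be driven, assuming the declarations above are visible and a Linux build (elsewhere getFileDescriptor() returns -1). In the actual patch the driving loop lives in HedgedConnectionsFactory, so this is purely illustrative:

#include <poll.h>
#include <stdexcept>

ConnectionEstablisher::TryResult tryConnectNonBlocking(ConnectionEstablisher & establisher)
{
    establisher.resume();                                        /// First call creates the fiber and starts connecting.
    while (establisher.isInProcess())
    {
        pollfd pfd{establisher.getFileDescriptor(), POLLIN, 0};  /// Socket or receive-timeout timer, whichever fires first.
        ::poll(&pfd, 1, /* timeout_ms = */ 100);
        establisher.resume();                                    /// The fiber either finishes or blocks again.
    }
    if (establisher.isFailed())
        throw std::runtime_error(establisher.getFailMessage());
    return establisher.getResult();
}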
+ void cancel(); + + bool isInProcess() const { return stage == Stage::IN_PROCESS; } + + bool isFinished() const { return stage == Stage::FINISHED; } + + bool isFailed() const { return stage == Stage::FAILED; } + + int getFileDescriptor() const + { + int fd = -1; +#if defined(OS_LINUX) + fd = epoll.getFileDescriptor(); +#endif + return fd; + } + + const std::string & getFailMessage() const { return fail_message; } + + TryResult getResult() { return result; } + + Connection * getConnection() { return &*result.entry; } + + +private: + void processReceiveTimeout(); + + enum class Stage + { + INITIAL, + IN_PROCESS, + FINISHED, + FAILED, + }; + + struct Routine + { + ConnectionEstablisher & connection_establisher; + + struct ReadCallback + { + ConnectionEstablisher & connection_establisher; + Fiber & fiber; + + void operator()(int fd, const Poco::Timespan & timeout, const std::string &); + }; + + Fiber operator()(Fiber && sink); + }; + + void resetResult(); + + void reset(); + + void destroyFiber(); + + void resumeFiber(); + + IConnectionPool * pool; + const ConnectionTimeouts * timeouts; + std::string fail_message; + const Settings * settings; + const QualifiedTableName * table_to_check; + TryResult result; + Stage stage; + Poco::Logger * log; + Fiber fiber; + FiberStack fiber_stack; + std::exception_ptr exception; + int socket_fd = -1; + bool fiber_created = false; +#if defined(OS_LINUX) + TimerDescriptor receive_timeout; + Epoll epoll; +#endif +}; + +} diff --git a/src/Client/ConnectionPoolWithFailover.cpp b/src/Client/ConnectionPoolWithFailover.cpp index a027f7a186b..ec9215e3bc1 100644 --- a/src/Client/ConnectionPoolWithFailover.cpp +++ b/src/Client/ConnectionPoolWithFailover.cpp @@ -1,4 +1,5 @@ #include +#include #include #include @@ -249,77 +250,10 @@ ConnectionPoolWithFailover::tryGetEntry( const Settings * settings, const QualifiedTableName * table_to_check) { - TryResult result; - try - { - result.entry = pool.get(timeouts, settings, /* force_connected = */ false); - - UInt64 server_revision = 0; - if (table_to_check) - server_revision = result.entry->getServerRevision(timeouts); - - if (!table_to_check || server_revision < DBMS_MIN_REVISION_WITH_TABLES_STATUS) - { - result.entry->forceConnected(timeouts); - result.is_usable = true; - result.is_up_to_date = true; - return result; - } - - /// Only status of the remote table corresponding to the Distributed table is taken into account. - /// TODO: request status for joined tables also. - TablesStatusRequest status_request; - status_request.tables.emplace(*table_to_check); - - TablesStatusResponse status_response = result.entry->getTablesStatus(timeouts, status_request); - auto table_status_it = status_response.table_states_by_id.find(*table_to_check); - if (table_status_it == status_response.table_states_by_id.end()) - { - const char * message_pattern = "There is no table {}.{} on server: {}"; - fail_message = fmt::format(message_pattern, backQuote(table_to_check->database), backQuote(table_to_check->table), result.entry->getDescription()); - LOG_WARNING(log, fail_message); - ProfileEvents::increment(ProfileEvents::DistributedConnectionMissingTable); - - return result; - } - - result.is_usable = true; - - UInt64 max_allowed_delay = settings ? 
UInt64(settings->max_replica_delay_for_distributed_queries) : 0; - if (!max_allowed_delay) - { - result.is_up_to_date = true; - return result; - } - - UInt32 delay = table_status_it->second.absolute_delay; - - if (delay < max_allowed_delay) - result.is_up_to_date = true; - else - { - result.is_up_to_date = false; - result.staleness = delay; - - LOG_TRACE(log, "Server {} has unacceptable replica delay for table {}.{}: {}", result.entry->getDescription(), table_to_check->database, table_to_check->table, delay); - ProfileEvents::increment(ProfileEvents::DistributedConnectionStaleReplica); - } - } - catch (const Exception & e) - { - if (e.code() != ErrorCodes::NETWORK_ERROR && e.code() != ErrorCodes::SOCKET_TIMEOUT - && e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF) - throw; - - fail_message = getCurrentExceptionMessage(/* with_stacktrace = */ false); - - if (!result.entry.isNull()) - { - result.entry->disconnect(); - result.reset(); - } - } - return result; + ConnectionEstablisher connection_establisher(&pool, &timeouts, settings, table_to_check); + connection_establisher.establishConnection(); + fail_message = connection_establisher.getFailMessage(); + return connection_establisher.getResult(); } std::vector ConnectionPoolWithFailover::getShuffledPools(const Settings * settings) @@ -329,141 +263,4 @@ std::vector ConnectionPoolWithFa return Base::getShuffledPools(max_ignored_errors, get_priority); } -ConnectionEstablisher::ConnectionEstablisher( - IConnectionPool * pool_, - const ConnectionTimeouts * timeouts_, - const Settings * settings_, - const QualifiedTableName * table_to_check_, - Poco::Logger * log_) : - pool(pool_), timeouts(timeouts_), settings(settings_), - table_to_check(table_to_check_), log(log_), stage(Stage::CONNECT), socket_fd(-1) -{ -} - -void ConnectionEstablisher::reset() -{ - resetResult(); - stage = Stage::CONNECT; - action_before_disconnect = nullptr; - socket_fd = -1; - fail_message.clear(); -} - -void ConnectionEstablisher::resetResult() -{ - if (!result.entry.isNull()) - { - result.entry->disconnect(); - result.reset(); - } -} - -void ConnectionEstablisher::run() -{ - try - { - if (stage == Stage::CONNECT) - { - result.entry = pool->get(*timeouts, settings, /* force_connected = */ false); - - if (!result.entry->isConnected()) - { - result.entry->prepare(*timeouts); - socket_fd = result.entry->getSocket()->impl()->sockfd(); - result.entry->sendHello(); - stage = Stage::RECEIVE_HELLO; - /// We are waiting for hello from replica. - return; - } - - socket_fd = result.entry->getSocket()->impl()->sockfd(); - stage = Stage::START_CHECK_TABLE; - } - - if (stage == Stage::RECEIVE_HELLO) - { - result.entry->receiveHello(); - stage = Stage::START_CHECK_TABLE; - } - - if (stage == Stage::START_CHECK_TABLE) - { - UInt64 server_revision = 0; - if (table_to_check) - server_revision = result.entry->getServerRevision(*timeouts); - - if (!table_to_check || server_revision < DBMS_MIN_REVISION_WITH_TABLES_STATUS) - { - result.entry->forceConnected(*timeouts); - result.is_usable = true; - result.is_up_to_date = true; - stage = FINISHED; - return; - } - - TablesStatusRequest status_request; - status_request.tables.emplace(*table_to_check); - - result.entry->sendTablesStatusRequest(status_request); - stage = Stage::RECEIVE_TABLES_STATUS; - /// We are waiting for tables status response. 
- return; - } - - if (stage == Stage::RECEIVE_TABLES_STATUS) - { - TablesStatusResponse status_response = result.entry->receiveTablesStatusResponse(); - auto table_status_it = status_response.table_states_by_id.find(*table_to_check); - if (table_status_it == status_response.table_states_by_id.end()) - { - const char * message_pattern = "There is no table {}.{} on server: {}"; - fail_message = fmt::format(message_pattern, backQuote(table_to_check->database), backQuote(table_to_check->table), result.entry->getDescription()); - LOG_WARNING(log, fail_message); - ProfileEvents::increment(ProfileEvents::DistributedConnectionMissingTable); - stage = Stage::FINISHED; - return; - } - - result.is_usable = true; - - UInt64 max_allowed_delay = settings ? UInt64(settings->max_replica_delay_for_distributed_queries) : 0; - if (!max_allowed_delay) - { - result.is_up_to_date = true; - stage = Stage::FINISHED; - return; - } - - UInt32 delay = table_status_it->second.absolute_delay; - - if (delay < max_allowed_delay) - result.is_up_to_date = true; - else - { - result.is_up_to_date = false; - result.staleness = delay; - - LOG_TRACE(log, "Server {} has unacceptable replica delay for table {}.{}: {}", result.entry->getDescription(), table_to_check->database, table_to_check->table, delay); - ProfileEvents::increment(ProfileEvents::DistributedConnectionStaleReplica); - } - } - - stage = Stage::FINISHED; - } - catch (const Exception & e) - { - if (e.code() != ErrorCodes::NETWORK_ERROR && e.code() != ErrorCodes::SOCKET_TIMEOUT - && e.code() != ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF) - throw; - - if (action_before_disconnect) - action_before_disconnect(socket_fd); - - fail_message = getCurrentExceptionMessage(/* with_stacktrace = */ false); - resetResult(); - socket_fd = -1; - stage = Stage::FAILED; - } -} - } diff --git a/src/Client/ConnectionPoolWithFailover.h b/src/Client/ConnectionPoolWithFailover.h index b25eee6e33d..2ecb0492747 100644 --- a/src/Client/ConnectionPoolWithFailover.h +++ b/src/Client/ConnectionPoolWithFailover.h @@ -31,51 +31,6 @@ enum class PoolMode GET_ALL }; -/// Class for establishing connection with replica without blocking using different stages. -class ConnectionEstablisher -{ -public: - enum Stage - { - CONNECT = 0, - RECEIVE_HELLO = 1, - START_CHECK_TABLE = 2, - RECEIVE_TABLES_STATUS = 3, - FINISHED = 4, - FAILED = 5, - }; - - using TryResult = PoolWithFailoverBase::TryResult; - - ConnectionEstablisher(IConnectionPool * pool_, - const ConnectionTimeouts * timeouts_, - const Settings * settings_, - const QualifiedTableName * table_to_check = nullptr, - Poco::Logger * log_ = nullptr); - - /// Continue connecting to replica from previous stage. Initial stage is CONNECT. - void run(); - - void resetResult(); - - /// Reset class to initial stage. - void reset(); - - /// If action_before_disconnect is set, action_before_disconnect(socket_fd) will be called before - /// disconnect. It may be useful for removing file descriptor from epoll. 
- void setActionBeforeDisconnect(std::function action) { action_before_disconnect = action; } - - IConnectionPool * pool; - const ConnectionTimeouts * timeouts; - std::string fail_message; - const Settings * settings; - const QualifiedTableName * table_to_check; - Poco::Logger * log; - TryResult result; - Stage stage; - int socket_fd; - std::function action_before_disconnect; -}; class ConnectionPoolWithFailover : public IConnectionPool, private PoolWithFailoverBase { diff --git a/src/Client/HedgedConnections.cpp b/src/Client/HedgedConnections.cpp index 6d49c0f6749..61d6d317c6e 100644 --- a/src/Client/HedgedConnections.cpp +++ b/src/Client/HedgedConnections.cpp @@ -29,24 +29,26 @@ HedgedConnections::HedgedConnections( if (connections.empty()) return; + offset_states.reserve(connections.size()); for (size_t i = 0; i != connections.size(); ++i) { - ReplicaState replica; - replica.connection = connections[i]; - replica.connection->setThrottler(throttler_); - replica.epoll.add(replica.connection->getSocket()->impl()->sockfd()); - epoll.add(replica.epoll.getFileDescriptor()); - fd_to_replica_location[replica.epoll.getFileDescriptor()] = ReplicaLocation{i, 0}; offset_states.emplace_back(); - offset_states[i].replicas.emplace_back(std::move(replica)); + offset_states[i].replicas.emplace_back(connections[i]); offset_states[i].active_connection_count = 1; + + ReplicaState & replica = offset_states[i].replicas.back(); + replica.connection->setThrottler(throttler_); + + epoll.add(replica.packet_receiver.getFileDescriptor()); + fd_to_replica_location[replica.packet_receiver.getFileDescriptor()] = ReplicaLocation{i, 0}; + + epoll.add(replica.change_replica_timeout.getDescriptor()); + timeout_fd_to_replica_location[replica.change_replica_timeout.getDescriptor()] = ReplicaLocation{i, 0}; } active_connection_count = connections.size(); offsets_with_received_first_data_packet = 0; pipeline_for_new_replicas.add([throttler_](ReplicaState & replica_) { replica_.connection->setThrottler(throttler_); }); - - log = &Poco::Logger::get("HedgedConnections"); } void HedgedConnections::Pipeline::add(std::function send_function) @@ -155,11 +157,10 @@ void HedgedConnections::sendQuery( if (offset_states.size() > 1) { modified_settings.parallel_replicas_count = offset_states.size(); - modified_settings.parallel_replica_offset = fd_to_replica_location[replica.epoll.getFileDescriptor()].offset; + modified_settings.parallel_replica_offset = fd_to_replica_location[replica.packet_receiver.getFileDescriptor()].offset; } replica.connection->sendQuery(timeouts, query, query_id, stage, &modified_settings, &client_info, with_pending_data); - replica.receive_timeout.setRelative(timeouts.receive_timeout); replica.change_replica_timeout.setRelative(timeouts.receive_data_timeout); }; @@ -281,75 +282,60 @@ Packet HedgedConnections::receivePacketUnlocked(AsyncCallback async_callback) if (epoll.empty()) throw Exception("No pending events in epoll.", ErrorCodes::LOGICAL_ERROR); - ReplicaLocation location = getReadyReplicaLocation(async_callback); - return receivePacketFromReplica(location, std::move(async_callback)); + ReplicaLocation location = getReadyReplicaLocation(std::move(async_callback)); + return receivePacketFromReplica(location); } HedgedConnections::ReplicaLocation HedgedConnections::getReadyReplicaLocation(AsyncCallback async_callback) { - LOG_DEBUG(log, "getReadyReplicaLocation"); int event_fd; while (true) { - /// Check connections for pending data. + /// Check connections for pending data in buffer. 
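The buffered-data check that this loop performs before touching epoll matters: a packet that was already read into the connection's userspace buffer together with a previous one will never generate another epoll event. A generic, self-contained sketch of the same principle, using a toy line-based protocol and illustrative names:

#include <sys/epoll.h>
#include <unistd.h>
#include <cstdio>
#include <string>

static void consumeLine(const std::string & line) { std::printf("%s\n", line.c_str()); }   /// Toy handler.

void drainBeforePollingSketch(int sock_fd, int epoll_fd)
{
    std::string buffer;
    while (true)
    {
        /// Messages already sitting in the userspace buffer will never wake epoll again, so drain them first.
        size_t pos;
        while ((pos = buffer.find('\n')) != std::string::npos)
        {
            consumeLine(buffer.substr(0, pos));
            buffer.erase(0, pos + 1);
        }

        /// Only block in epoll when no complete message is buffered.
        epoll_event event{};
        if (epoll_wait(epoll_fd, &event, 1, /* timeout_ms = */ -1) <= 0)
            continue;

        char chunk[4096];
        ssize_t bytes = ::read(sock_fd, chunk, sizeof(chunk));
        if (bytes <= 0)
            return;
        buffer.append(chunk, static_cast<size_t>(bytes));
    }
}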
ReplicaLocation location; if (checkPendingData(location)) - return location; + { + ReplicaState & replica_state = offset_states[location.offset].replicas[location.index]; + + replica_state.packet_receiver.resume(); + if (replica_state.packet_receiver.isPacketReady()) + return location; + continue; + } /// Get ready file descriptor from epoll and process it. event_fd = getReadyFileDescriptor(async_callback); if (event_fd == hedged_connections_factory.getFileDescriptor()) - { tryGetNewReplica(false); - continue; - } - - if (!fd_to_replica_location.contains(event_fd)) - throw Exception("Unknown event from epoll", ErrorCodes::LOGICAL_ERROR); - - location = fd_to_replica_location[event_fd]; - - /// Read all events from replica epoll. - /// If socket is ready and timeout is alarmed simultaneously, skip timeout. - bool is_socket_ready = false; - bool is_change_replica_timeout_alarmed = false; - bool is_receive_timeout_alarmed = false; - - epoll_event events[3]; - events[0].data.fd = events[1].data.fd = events[2].data.fd = -1; - ReplicaState & replica_state = offset_states[location.offset].replicas[location.index]; - size_t ready_count = replica_state.epoll.getManyReady(3, events, true); - - for (size_t i = 0; i != ready_count; ++i) + else if (fd_to_replica_location.contains(event_fd)) { - if (events[i].data.fd == replica_state.connection->getSocket()->impl()->sockfd()) - is_socket_ready = true; - if (events[i].data.fd == replica_state.change_replica_timeout.getDescriptor()) - is_change_replica_timeout_alarmed = true; - if (events[i].data.fd == replica_state.receive_timeout.getDescriptor()) - is_receive_timeout_alarmed = true; + location = fd_to_replica_location[event_fd]; + ReplicaState & replica_state = offset_states[location.offset].replicas[location.index]; + replica_state.packet_receiver.resume(); + + if (replica_state.packet_receiver.isPacketReady()) + return location; + + if (replica_state.packet_receiver.isReceiveTimeoutExpired()) + { + finishProcessReplica(replica_state, true); + + /// Check if there is no more active connections with the same offset and there is no new replica in process. + if (offset_states[location.offset].active_connection_count == 0 && !offset_states[location.offset].next_replica_in_process) + throw NetException("Receive timeout expired", ErrorCodes::SOCKET_TIMEOUT); + } } - - if (is_socket_ready) - return location; - - /// We reach this point only if there is an alarmed timeout. - - if (is_change_replica_timeout_alarmed) + else if (timeout_fd_to_replica_location.contains(event_fd)) { - replica_state.change_replica_timeout.reset(); + location = timeout_fd_to_replica_location[event_fd]; + offset_states[location.offset].replicas[location.index].change_replica_timeout.reset(); + offset_states[location.offset].next_replica_in_process = true; offsets_queue.push(location.offset); tryGetNewReplica(true); } - if (is_receive_timeout_alarmed) - { - finishProcessReplica(replica_state, true); - - /// Check if there is no more active connections with the same offset and there is no new replica in process. 
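Both the per-replica receive timeout and change_replica_timeout are TimerDescriptor objects, i.e. timeouts represented as pollable file descriptors living in the same epoll as the sockets. A minimal sketch of the underlying Linux timerfd pattern, assuming one-shot semantics roughly matching setRelative()/reset() (an illustrative wrapper, not the real TimerDescriptor):

#include <sys/timerfd.h>
#include <unistd.h>
#include <cstdint>
#include <stdexcept>

class TimerFdSketch
{
public:
    TimerFdSketch() : fd(timerfd_create(CLOCK_MONOTONIC, TFD_NONBLOCK | TFD_CLOEXEC))
    {
        if (fd == -1)
            throw std::runtime_error("timerfd_create failed");
    }
    ~TimerFdSketch() { ::close(fd); }

    int getDescriptor() const { return fd; }

    void setRelative(uint64_t microseconds) const
    {
        itimerspec spec{};                                        /// it_interval stays zero: one-shot timer.
        spec.it_value.tv_sec = microseconds / 1000000;
        spec.it_value.tv_nsec = (microseconds % 1000000) * 1000;
        timerfd_settime(fd, 0, &spec, nullptr);
    }

    void reset() const
    {
        itimerspec spec{};                                        /// Zero it_value disarms the timer.
        timerfd_settime(fd, 0, &spec, nullptr);
        uint64_t expirations;
        (void)::read(fd, &expirations, sizeof(expirations));      /// Drain a pending expiration, if any.
    }

private:
    int fd;
};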
- if (offset_states[location.offset].active_connection_count == 0 && !next_replica_in_process) - throw NetException("Receive timeout expired", ErrorCodes::SOCKET_TIMEOUT); - } + else + throw Exception("Unknown event from epoll", ErrorCodes::LOGICAL_ERROR); } }; @@ -375,19 +361,15 @@ bool HedgedConnections::checkPendingData(ReplicaLocation & location_out) return false; } -Packet HedgedConnections::receivePacketFromReplica(const ReplicaLocation & replica_location, AsyncCallback async_callback) +Packet HedgedConnections::receivePacketFromReplica(const ReplicaLocation & replica_location) { - LOG_DEBUG(log, "receivePacketFromReplica"); - ReplicaState & replica = offset_states[replica_location.offset].replicas[replica_location.index]; - replica.receive_timeout.reset(); - Packet packet = replica.connection->receivePacket(std::move(async_callback)); + Packet packet = replica.packet_receiver.getPacket(); switch (packet.type) { case Protocol::Server::Data: if (!offset_states[replica_location.offset].first_packet_of_data_received) processReceivedFirstDataPacket(replica_location); - replica.receive_timeout.setRelative(hedged_connections_factory.getConnectionTimeouts().receive_timeout); break; case Protocol::Server::PartUUIDs: case Protocol::Server::Progress: @@ -395,7 +377,6 @@ Packet HedgedConnections::receivePacketFromReplica(const ReplicaLocation & repli case Protocol::Server::Totals: case Protocol::Server::Extremes: case Protocol::Server::Log: - replica.receive_timeout.setRelative(hedged_connections_factory.getConnectionTimeouts().receive_timeout); break; case Protocol::Server::EndOfStream: @@ -413,8 +394,6 @@ Packet HedgedConnections::receivePacketFromReplica(const ReplicaLocation & repli void HedgedConnections::processReceivedFirstDataPacket(const ReplicaLocation & replica_location) { - LOG_DEBUG(log, "processReceivedFirstDataPacket"); - /// When we receive first packet of data from replica, we stop working with replicas, that are /// responsible for the same offset. 
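A condensed model of the step described in the comment above: the first replica to deliver data for an offset wins, and every other hedged connection racing for the same offset is cancelled. Names and types below are illustrative only:

#include <vector>
#include <functional>

struct OffsetModel
{
    std::vector<int> replica_fds;         /// Connections currently racing for this offset.
    bool first_data_received = false;
};

void onFirstDataPacket(OffsetModel & offset, int winner_fd, const std::function<void(int)> & cancel)
{
    if (offset.first_data_received)
        return;
    offset.first_data_received = true;
    for (int fd : offset.replica_fds)
        if (fd != winner_fd)
            cancel(fd);                   /// Stop hedged duplicates once the winner is known.
    offset.replica_fds = {winner_fd};
}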
OffsetState & offset_state = offset_states[replica_location.offset]; @@ -445,8 +424,6 @@ void HedgedConnections::processReceivedFirstDataPacket(const ReplicaLocation & r void HedgedConnections::tryGetNewReplica(bool start_new_connection) { - LOG_DEBUG(log, "tryGetNewReplica"); - Connection * connection = nullptr; HedgedConnectionsFactory::State state = hedged_connections_factory.getNextConnection(start_new_connection, false, connection); @@ -461,14 +438,18 @@ void HedgedConnections::tryGetNewReplica(bool start_new_connection) size_t offset = offsets_queue.front(); offsets_queue.pop(); - ReplicaState replica; - replica.connection = connection; - replica.epoll.add(replica.connection->getSocket()->impl()->sockfd()); - epoll.add(replica.epoll.getFileDescriptor()); - fd_to_replica_location[replica.epoll.getFileDescriptor()] = ReplicaLocation{offset, offset_states[offset].replicas.size()}; + offset_states[offset].replicas.emplace_back(connection); + ++offset_states[offset].active_connection_count; + offset_states[offset].next_replica_in_process = false; ++active_connection_count; + + ReplicaState & replica = offset_states[offset].replicas.back(); + epoll.add(replica.packet_receiver.getFileDescriptor()); + fd_to_replica_location[replica.packet_receiver.getFileDescriptor()] = ReplicaLocation{offset, offset_states[offset].replicas.size() - 1}; + epoll.add(replica.change_replica_timeout.getDescriptor()); + timeout_fd_to_replica_location[replica.change_replica_timeout.getDescriptor()] = ReplicaLocation{offset, offset_states[offset].replicas.size() - 1}; + pipeline_for_new_replicas.run(replica); - offset_states[offset].replicas.push_back(std::move(replica)); } else if (state == HedgedConnectionsFactory::State::NOT_READY && !next_replica_in_process) { @@ -483,6 +464,7 @@ void HedgedConnections::tryGetNewReplica(bool start_new_connection) { if (offset_states[offsets_queue.front()].active_connection_count == 0) throw Exception("Cannot find enough connections to replicas", ErrorCodes::ALL_CONNECTION_TRIES_FAILED); + offset_states[offsets_queue.front()].next_replica_in_process = false; offsets_queue.pop(); } } @@ -497,11 +479,16 @@ void HedgedConnections::tryGetNewReplica(bool start_new_connection) void HedgedConnections::finishProcessReplica(ReplicaState & replica, bool disconnect) { - LOG_DEBUG(log, "finishProcessReplica"); + replica.packet_receiver.cancel(); + replica.change_replica_timeout.reset(); + + epoll.remove(replica.packet_receiver.getFileDescriptor()); + --offset_states[fd_to_replica_location[replica.packet_receiver.getFileDescriptor()].offset].active_connection_count; + fd_to_replica_location.erase(replica.packet_receiver.getFileDescriptor()); + + epoll.remove(replica.change_replica_timeout.getDescriptor()); + timeout_fd_to_replica_location.erase(replica.change_replica_timeout.getDescriptor()); - epoll.remove(replica.epoll.getFileDescriptor()); - --offset_states[fd_to_replica_location[replica.epoll.getFileDescriptor()].offset].active_connection_count; - fd_to_replica_location.erase(replica.epoll.getFileDescriptor()); --active_connection_count; if (disconnect) diff --git a/src/Client/HedgedConnections.h b/src/Client/HedgedConnections.h index 41c548de9ef..bfd5a36c500 100644 --- a/src/Client/HedgedConnections.h +++ b/src/Client/HedgedConnections.h @@ -5,6 +5,9 @@ #include #include #include +#include +#include +#include namespace DB { @@ -20,18 +23,13 @@ class HedgedConnections : public IConnections public: struct ReplicaState { - ReplicaState() + ReplicaState(Connection * connection_) : 
connection(connection_), packet_receiver(connection_) { - epoll.add(receive_timeout.getDescriptor()); - epoll.add(change_replica_timeout.getDescriptor()); } Connection * connection = nullptr; - TimerDescriptor receive_timeout; + PacketReceiver packet_receiver; TimerDescriptor change_replica_timeout; - /// We store socket and timeout descriptors in epoll - /// and use it's fd outside. - Epoll epoll; }; struct OffsetState @@ -43,6 +41,12 @@ public: /// other replicas when we receive first data packet from one of them) size_t active_connection_count = 0; bool first_packet_of_data_received = false; + + /// This flag is true when this offset is in queue for + /// new replicas. It's needed to process receive timeout + /// (throw an exception when receive timeout expired and there is no + /// new replica in process) + bool next_replica_in_process = false; }; /// We process events in epoll, so we need to determine replica by it's @@ -109,7 +113,7 @@ private: std::vector> pipeline; }; - Packet receivePacketFromReplica(const ReplicaLocation & replica_location, AsyncCallback async_callback = {}); + Packet receivePacketFromReplica(const ReplicaLocation & replica_location); ReplicaLocation getReadyReplicaLocation(AsyncCallback async_callback = {}); @@ -133,6 +137,9 @@ private: /// Map socket file descriptor to replica location (it's offset and index in OffsetState.replicas). std::unordered_map fd_to_replica_location; + /// Map receive data timeout file descriptor to replica location. + std::unordered_map timeout_fd_to_replica_location; + /// A queue of offsets for new replicas. When we get RECEIVE_DATA_TIMEOUT from /// the replica, we push it's offset to this queue and start trying to get /// new replica. @@ -163,8 +170,6 @@ private: bool cancelled = false; mutable std::mutex cancel_mutex; - - Poco::Logger * log; }; } diff --git a/src/Client/HedgedConnectionsFactory.cpp b/src/Client/HedgedConnectionsFactory.cpp index ba0e4ac7b22..3551814d603 100644 --- a/src/Client/HedgedConnectionsFactory.cpp +++ b/src/Client/HedgedConnectionsFactory.cpp @@ -22,10 +22,7 @@ HedgedConnectionsFactory::HedgedConnectionsFactory( { shuffled_pools = pool->getShuffledPools(settings); for (size_t i = 0; i != shuffled_pools.size(); ++i) - { - ConnectionEstablisher establisher(shuffled_pools[i].pool, &timeouts, settings, table_to_check.get(), log); - replicas.emplace_back(std::move(establisher)); - } + replicas.emplace_back(ConnectionEstablisher(shuffled_pools[i].pool, &timeouts, settings, table_to_check.get())); max_tries = (settings ? size_t{settings->connections_with_failover_max_tries} : size_t{DBMS_CONNECTION_POOL_WITH_FAILOVER_DEFAULT_MAX_TRIES}); @@ -43,7 +40,6 @@ HedgedConnectionsFactory::~HedgedConnectionsFactory() std::vector HedgedConnectionsFactory::getManyConnections(PoolMode pool_mode) { - LOG_DEBUG(log, "getManyConnections"); size_t min_entries = (settings && settings->skip_unavailable_shards) ? 0 : 1; size_t max_entries; @@ -103,8 +99,6 @@ std::vector HedgedConnectionsFactory::getManyConnections(PoolMode HedgedConnectionsFactory::State HedgedConnectionsFactory::getNextConnection(bool start_new_connection, bool blocking, Connection *& connection_out) { - LOG_DEBUG(log, "getNextConnection"); - if (start_new_connection) { int index = startEstablishingNewConnection(connection_out); @@ -120,7 +114,7 @@ HedgedConnectionsFactory::State HedgedConnectionsFactory::getNextConnection(bool /// We will try to use usable replica. /// Check if we are not allowed to use usable replicas or there is no even a free usable replica. 
- if (!fallback_to_stale_replicas || !canGetNewConnection()) + if (!fallback_to_stale_replicas) return State::CANNOT_CHOOSE; return setBestUsableReplica(connection_out); @@ -128,21 +122,24 @@ HedgedConnectionsFactory::State HedgedConnectionsFactory::getNextConnection(bool void HedgedConnectionsFactory::stopChoosingReplicas() { - LOG_DEBUG(log, "stopChoosingReplicas"); - for (auto & [fd, replica_index] : fd_to_replica_index) + for (auto & [fd, index] : fd_to_replica_index) { - resetReplicaTimeouts(replica_index); epoll.remove(fd); - replicas[replica_index].connection_establisher.reset(); + replicas[index].connection_establisher.cancel(); + } + + for (auto & [fd, index] : timeout_fd_to_replica_index) + { + replicas[index].change_replica_timeout.reset(); + epoll.remove(fd); } fd_to_replica_index.clear(); + timeout_fd_to_replica_index.clear(); } int HedgedConnectionsFactory::getNextIndex() { - LOG_DEBUG(log, "getNextIndex"); - /// Check if there is no free replica. if (entries_count + replicas_in_process_count + failed_pools_count >= shuffled_pools.size()) return -1; @@ -161,8 +158,9 @@ int HedgedConnectionsFactory::getNextIndex() next_index = (next_index + 1) % shuffled_pools.size(); /// Check if we can try this replica. - if (!replicas[next_index].is_in_process && (max_tries == 0 || shuffled_pools[next_index].error_count < max_tries) - && replicas[next_index].connection_establisher.stage != ConnectionEstablisher::Stage::FINISHED) + if (!replicas[next_index].connection_establisher.isInProcess() + && !replicas[next_index].connection_establisher.isFinished() + && (max_tries == 0 || shuffled_pools[next_index].error_count < max_tries)) finish = true; /// If we made a complete round, there is no replica to connect. @@ -176,100 +174,80 @@ int HedgedConnectionsFactory::getNextIndex() int HedgedConnectionsFactory::startEstablishingNewConnection(Connection *& connection_out) { - LOG_DEBUG(log, "startEstablishingNewConnection"); - int index; do { - LOG_DEBUG(log, "startEstablishingNewConnection loop"); - index = getNextIndex(); if (index == -1) return -1; ReplicaStatus & replica = replicas[index]; - ++replicas_in_process_count; - replica.is_in_process = true; - replica.connection_establisher.reset(); - replica.connection_establisher.run(); + replica.connection_establisher.resume(); processConnectionEstablisherStage(index); - if (replica.is_in_process) + if (replica.connection_establisher.isInProcess()) { - replica.epoll.add(replica.connection_establisher.socket_fd); - replica.connection_establisher.setActionBeforeDisconnect([&](int fd){ replica.epoll.remove(fd); }); - addTimeouts(index); - epoll.add(replica.epoll.getFileDescriptor()); - fd_to_replica_index[replica.epoll.getFileDescriptor()] = index; + epoll.add(replica.connection_establisher.getFileDescriptor()); + fd_to_replica_index[replica.connection_establisher.getFileDescriptor()] = index; + + /// Add timeout for changing replica. 
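getNextIndex() in the hunk above walks the shuffled pools round-robin, skipping replicas that are already connecting, already finished, or have exhausted their error budget, and gives up after one full circle. A simplified standalone model of that selection, with illustrative types rather than the factory's own:

#include <cstddef>
#include <vector>

struct ReplicaModel
{
    bool in_process = false;     /// Connection attempt already running.
    bool finished = false;       /// Already produced a usable result.
    size_t error_count = 0;      /// Failed attempts so far.
};

int pickNextReplica(const std::vector<ReplicaModel> & replicas, int last_used_index, size_t max_tries)
{
    if (replicas.empty())
        return -1;

    size_t next_index = last_used_index == -1 ? 0 : (static_cast<size_t>(last_used_index) + 1) % replicas.size();
    for (size_t attempts = 0; attempts != replicas.size(); ++attempts)
    {
        const ReplicaModel & replica = replicas[next_index];
        if (!replica.in_process && !replica.finished
            && (max_tries == 0 || replica.error_count < max_tries))
            return static_cast<int>(next_index);                  /// Found a candidate to try next.
        next_index = (next_index + 1) % replicas.size();
    }
    return -1;                                                    /// Full circle: nothing left to try.
}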
+ replica.change_replica_timeout.setRelative(timeouts.hedged_connection_timeout); + epoll.add(replica.change_replica_timeout.getDescriptor()); + timeout_fd_to_replica_index[replica.change_replica_timeout.getDescriptor()] = index; } } - while (!replicas[index].is_ready && !replicas[index].is_in_process); + while (!replicas[index].connection_establisher.isInProcess() && !replicas[index].is_ready); if (replicas[index].is_ready) - connection_out = &*replicas[index].connection_establisher.result.entry; + connection_out = replicas[index].connection_establisher.getConnection(); return index; } -void HedgedConnectionsFactory::processConnectionEstablisherStage(int replica_index, bool remove_from_epoll) +void HedgedConnectionsFactory::processConnectionEstablisherStage(int index, bool remove_from_epoll) { - LOG_DEBUG(log, "processConnectionEstablisherStage"); + ReplicaStatus & replica = replicas[index]; - ReplicaStatus & replica = replicas[replica_index]; - - if (replica.connection_establisher.stage == ConnectionEstablisher::Stage::FINISHED) + if (replica.connection_establisher.isFinished()) { - replica.is_in_process = false; --replicas_in_process_count; ++entries_count; if (remove_from_epoll) - { - epoll.remove(replica.epoll.getFileDescriptor()); - fd_to_replica_index.erase(replica.epoll.getFileDescriptor()); - } + removeReplicaFromEpoll(index); - if (replica.connection_establisher.result.is_usable) + if (replica.connection_establisher.getResult().is_usable) { ++usable_count; - if (replica.connection_establisher.result.is_up_to_date) - { - LOG_DEBUG(log, "READY"); - ++ready_replicas_count; + if (replica.connection_establisher.getResult().is_up_to_date) replica.is_ready = true; - return; - } - } - else - { - std::string & fail_message = replica.connection_establisher.fail_message; - if (!fail_message.empty()) - fail_messages += fail_message + "\n"; + + return; } + + /// If replica is not usable, we need to save fail message. 
+ if (!replica.connection_establisher.getFailMessage().empty()) + fail_messages += replica.connection_establisher.getFailMessage() + "\n"; } - else if (replica.connection_establisher.stage == ConnectionEstablisher::Stage::FAILED) - processFailedConnection(replica_index, remove_from_epoll); + else if (replica.connection_establisher.isFailed()) + processFailedConnection(index, remove_from_epoll); } -void HedgedConnectionsFactory::processFailedConnection(int replica_index, bool remove_from_epoll) +void HedgedConnectionsFactory::processFailedConnection(int index, bool remove_from_epoll) { - LOG_DEBUG(log, "processFailedConnection"); - + ConnectionEstablisher & connection_establisher = replicas[index].connection_establisher; + if (remove_from_epoll) - { - epoll.remove(replicas[replica_index].epoll.getFileDescriptor()); - fd_to_replica_index.erase(replicas[replica_index].epoll.getFileDescriptor()); - } + removeReplicaFromEpoll(index); - std::string & fail_message = replicas[replica_index].connection_establisher.fail_message; - if (!fail_message.empty()) - fail_messages += fail_message + "\n"; + if (!connection_establisher.getFailMessage().empty()) + fail_messages += connection_establisher.getFailMessage() + "\n"; - ShuffledPool & shuffled_pool = shuffled_pools[replica_index]; + ShuffledPool & shuffled_pool = shuffled_pools[index]; LOG_WARNING( - log, "Connection failed at try №{}, reason: {}", (shuffled_pool.error_count + 1), fail_message); + log, "Connection failed at try №{}, reason: {}", (shuffled_pool.error_count + 1), connection_establisher.getFailMessage()); ProfileEvents::increment(ProfileEvents::DistributedConnectionFailTry); shuffled_pool.error_count = std::min(pool->getMaxErrorCup(), shuffled_pool.error_count + 1); @@ -281,109 +259,39 @@ void HedgedConnectionsFactory::processFailedConnection(int replica_index, bool r } --replicas_in_process_count; - replicas[replica_index].is_in_process = false; -} - -void HedgedConnectionsFactory::addTimeouts(int replica_index) -{ - LOG_DEBUG(log, "addTimeouts"); - - auto stage = replicas[replica_index].connection_establisher.stage; - if (stage == ConnectionEstablisher::Stage::RECEIVE_HELLO) - { - replicas[replica_index].receive_timeout.setRelative(timeouts.receive_timeout); - replicas[replica_index].change_replica_timeout.setRelative(timeouts.receive_hello_timeout); - } - else if (stage == ConnectionEstablisher::Stage::RECEIVE_TABLES_STATUS) - { - replicas[replica_index].receive_timeout.setRelative(Poco::Timespan(DBMS_DEFAULT_SYNC_REQUEST_TIMEOUT_SEC, 0)); - replicas[replica_index].change_replica_timeout.setRelative(timeouts.receive_tables_status_timeout); - } -} - -void HedgedConnectionsFactory::resetReplicaTimeouts(int replica_index) -{ - LOG_DEBUG(log, "resetReplicaTimeouts"); - - replicas[replica_index].receive_timeout.reset(); - replicas[replica_index].change_replica_timeout.reset(); } HedgedConnectionsFactory::State HedgedConnectionsFactory::processEpollEvents(bool blocking, Connection *& connection_out) { - LOG_DEBUG(log, "processEpollEvents"); - int event_fd; while (!epoll.empty()) { - /// Firstly, check connections for pending data. - int replica_index = checkPendingData(); - if (replica_index != -1) - { - processSocketEvent(replica_index, connection_out); - /// Return only if replica is ready. - if (replicas[replica_index].is_ready) - return State::READY; - - continue; - } - - /// Get ready descriptor from epoll. event_fd = getReadyFileDescriptor(blocking); - /// Check if there is no events. 
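processEpollEvents() keeps a single epoll holding two kinds of descriptors and uses two maps to translate whichever fd fired back to a replica index. A reduced sketch of that dispatch decision (map names mirror the ones in the hunks around here, everything else is illustrative):

#include <unordered_map>
#include <utility>

enum class EventKind { Connection, Timeout, Unknown };

std::pair<EventKind, int> classifyEpollEvent(
    int event_fd,
    const std::unordered_map<int, int> & fd_to_replica_index,
    const std::unordered_map<int, int> & timeout_fd_to_replica_index)
{
    if (auto it = fd_to_replica_index.find(event_fd); it != fd_to_replica_index.end())
        return {EventKind::Connection, it->second};   /// Socket progressed: resume that establisher.
    if (auto it = timeout_fd_to_replica_index.find(event_fd); it != timeout_fd_to_replica_index.end())
        return {EventKind::Timeout, it->second};      /// Hedge timer fired: try to start one more replica.
    return {EventKind::Unknown, -1};                  /// Any other fd indicates a logical error.
}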
if (event_fd == -1) return State::NOT_READY; - if (!fd_to_replica_index.contains(event_fd)) - throw Exception("Unknown event from epoll", ErrorCodes::LOGICAL_ERROR); - - replica_index = fd_to_replica_index[event_fd]; - - /// Read all events from replica epoll. - /// If socket is ready and timeout is alarmed simultaneously, skip timeout. - bool is_socket_ready = false; - bool is_receive_timeout_alarmed = false; - bool is_change_replica_timeout_alarmed = false; - - epoll_event events[3]; - events[0].data.fd = events[1].data.fd = events[2].data.fd = -1; - size_t ready_count = replicas[replica_index].epoll.getManyReady(3, events, true); - for (size_t i = 0; i != ready_count; ++i) + if (fd_to_replica_index.contains(event_fd)) { - if (events[i].data.fd == replicas[replica_index].connection_establisher.socket_fd) - is_socket_ready = true; - if (events[i].data.fd == replicas[replica_index].receive_timeout.getDescriptor()) - is_receive_timeout_alarmed = true; - if (events[i].data.fd == replicas[replica_index].change_replica_timeout.getDescriptor()) - is_change_replica_timeout_alarmed = true; - } + int index = fd_to_replica_index[event_fd]; + processConnectionEstablisherEvent(index, connection_out); - if (is_socket_ready) - { - processSocketEvent(replica_index, connection_out); - /// Return only if replica is ready. - if (replicas[replica_index].is_ready) + if (replicas[index].is_ready) return State::READY; - if (replicas[replica_index].is_in_process) + if (replicas[index].connection_establisher.isInProcess()) continue; } + else if (timeout_fd_to_replica_index.contains(event_fd)) + replicas[timeout_fd_to_replica_index[event_fd]].change_replica_timeout.reset(); else - { - if (is_receive_timeout_alarmed) - processReceiveTimeout(replica_index); + throw Exception("Unknown event from epoll", ErrorCodes::LOGICAL_ERROR); - if (is_change_replica_timeout_alarmed) - { - LOG_DEBUG(log, "change_replica_timeout"); - replicas[replica_index].change_replica_timeout.reset(); - } - } + /// We reach this point only if we need to start new connection + /// (Special timeout expired or one of the previous connections failed). + int index = startEstablishingNewConnection(connection_out); - /// We reach this point only if we need to start new connection. - replica_index = startEstablishingNewConnection(connection_out); /// Return only if replica is ready. 
- if (replica_index != -1 && replicas[replica_index].is_ready) + if (index != -1 && replicas[index].is_ready) return State::READY; } @@ -398,40 +306,23 @@ int HedgedConnectionsFactory::getReadyFileDescriptor(bool blocking) return event.data.fd; } -int HedgedConnectionsFactory::checkPendingData() +void HedgedConnectionsFactory::removeReplicaFromEpoll(int index) { - for (auto & [fd, replica_index] : fd_to_replica_index) - if (replicas[replica_index].connection_establisher.result.entry->hasReadPendingData()) - return replica_index; + ReplicaStatus & replica = replicas[index]; + epoll.remove(replica.connection_establisher.getFileDescriptor()); + fd_to_replica_index.erase(replica.connection_establisher.getFileDescriptor()); - return -1; + replica.change_replica_timeout.reset(); + epoll.remove(replica.change_replica_timeout.getDescriptor()); + timeout_fd_to_replica_index.erase(replica.change_replica_timeout.getDescriptor()); } -void HedgedConnectionsFactory::processSocketEvent(int replica_index, Connection *& connection_out) +void HedgedConnectionsFactory::processConnectionEstablisherEvent(int index, Connection *& connection_out) { - LOG_DEBUG(log, "processSocketEvent"); - - resetReplicaTimeouts(replica_index); - replicas[replica_index].connection_establisher.run(); - processConnectionEstablisherStage(replica_index, true); - if (replicas[replica_index].is_in_process) - addTimeouts(replica_index); - if (replicas[replica_index].is_ready) - connection_out = &*replicas[replica_index].connection_establisher.result.entry; -} - -void HedgedConnectionsFactory::processReceiveTimeout(int replica_index) -{ - LOG_DEBUG(log, "processReceiveTimeout"); - - resetReplicaTimeouts(replica_index); - ReplicaStatus & replica = replicas[replica_index]; - - replica.connection_establisher.fail_message = - "Code: 209, e.displayText() = DB::NetException: Timeout exceeded while reading from socket (" + replica.connection_establisher.result.entry->getDescription() + ")"; - replica.connection_establisher.resetResult(); - replica.connection_establisher.stage = ConnectionEstablisher::Stage::FAILED; - processFailedConnection(replica_index, true); + replicas[index].connection_establisher.resume(); + processConnectionEstablisherStage(index, true); + if (replicas[index].is_ready) + connection_out = replicas[index].connection_establisher.getConnection(); } HedgedConnectionsFactory::State HedgedConnectionsFactory::setBestUsableReplica(Connection *& connection_out) @@ -440,8 +331,10 @@ HedgedConnectionsFactory::State HedgedConnectionsFactory::setBestUsableReplica(C for (size_t i = 0; i != replicas.size(); ++i) { /// Don't add unusable, failed replicas and replicas that are ready or in process. 
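setBestUsableReplica() only needs the single least-stale usable replica, so collecting candidate indexes and sorting them is equivalent to taking a minimum. A sketch of the same idea with std::min_element and an illustrative replica struct:

#include <algorithm>
#include <cstddef>
#include <vector>

struct CandidateReplica
{
    bool usable = false;
    double staleness = 0.0;     /// Replication delay; smaller is better.
};

int chooseLeastStale(const std::vector<CandidateReplica> & replicas)
{
    std::vector<size_t> candidates;
    for (size_t i = 0; i != replicas.size(); ++i)
        if (replicas[i].usable)
            candidates.push_back(i);

    if (candidates.empty())
        return -1;                                    /// Caller reports CANNOT_CHOOSE.

    auto best = std::min_element(candidates.begin(), candidates.end(),
        [&](size_t lhs, size_t rhs) { return replicas[lhs].staleness < replicas[rhs].staleness; });
    return static_cast<int>(*best);
}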
- if (!replicas[i].connection_establisher.result.entry.isNull() && replicas[i].connection_establisher.result.is_usable && - !replicas[i].is_in_process && !replicas[i].is_ready) + if (!replicas[i].connection_establisher.getResult().entry.isNull() + && replicas[i].connection_establisher.getResult().is_usable + && !replicas[i].connection_establisher.isInProcess() + && !replicas[i].is_ready) indexes.push_back(i); } @@ -454,12 +347,12 @@ HedgedConnectionsFactory::State HedgedConnectionsFactory::setBestUsableReplica(C indexes.end(), [&](size_t lhs, size_t rhs) { - return replicas[lhs].connection_establisher.result.staleness < replicas[rhs].connection_establisher.result.staleness; + return replicas[lhs].connection_establisher.getResult().staleness < replicas[rhs].connection_establisher.getResult().staleness; }); ++ready_replicas_count; replicas[indexes[0]].is_ready = true; - connection_out = &*replicas[indexes[0]].connection_establisher.result.entry; + connection_out = replicas[indexes[0]].connection_establisher.getConnection(); return State::READY; } diff --git a/src/Client/HedgedConnectionsFactory.h b/src/Client/HedgedConnectionsFactory.h index b821a9c925e..896774daae3 100644 --- a/src/Client/HedgedConnectionsFactory.h +++ b/src/Client/HedgedConnectionsFactory.h @@ -3,17 +3,18 @@ #if defined(OS_LINUX) #include +#include +#include +#include +#include #include #include -#include #include #include namespace DB { -using TimerDescriptorPtr = std::shared_ptr; - /** Class for establishing hedged connections with replicas. * The process of establishing connection is divided on stages, on each stage if * replica doesn't respond for a long time, we start establishing connection with @@ -25,22 +26,6 @@ class HedgedConnectionsFactory public: using ShuffledPool = ConnectionPoolWithFailover::Base::ShuffledPool; - struct ReplicaStatus - { - ReplicaStatus(const ConnectionEstablisher & establisher) : connection_establisher(establisher) - { - epoll.add(receive_timeout.getDescriptor()); - epoll.add(change_replica_timeout.getDescriptor()); - } - - ConnectionEstablisher connection_establisher; - TimerDescriptor receive_timeout; - TimerDescriptor change_replica_timeout; - bool is_ready = false; - bool is_in_process = false; - Epoll epoll; - }; - enum class State { READY, @@ -48,6 +33,17 @@ public: CANNOT_CHOOSE, }; + struct ReplicaStatus + { + ReplicaStatus(ConnectionEstablisher connection_stablisher_) : connection_establisher(std::move(connection_stablisher_)) + { + } + + ConnectionEstablisher connection_establisher; + TimerDescriptor change_replica_timeout; + bool is_ready = false; + }; + HedgedConnectionsFactory(const ConnectionPoolWithFailoverPtr & pool_, const Settings * settings_, const ConnectionTimeouts & timeouts_, @@ -64,12 +60,12 @@ public: State getNextConnection(bool start_new_connection, bool blocking, Connection *& connection_out); /// Check if we can try to produce new READY replica. - bool canGetNewConnection() const { return ready_replicas_count + failed_pools_count < shuffled_pools.size(); } +// bool canGetNewConnection() const { return ready_replicas_count + failed_pools_count < shuffled_pools.size(); } /// Stop working with all replicas that are not READY. 
void stopChoosingReplicas(); - bool hasEventsInProcess() const { return epoll.size() > 0; } + bool hasEventsInProcess() const { return !epoll.empty(); } int getFileDescriptor() const { return epoll.getFileDescriptor(); } @@ -90,17 +86,11 @@ private: int getReadyFileDescriptor(bool blocking); - int checkPendingData(); - - void addTimeouts(int replica_index); - - void resetReplicaTimeouts(int replica_index); - void processFailedConnection(int replica_index, bool remove_from_epoll); - void processSocketEvent(int replica_index, Connection *& connection_out); + void processConnectionEstablisherEvent(int replica_index, Connection *& connection_out); - void processReceiveTimeout(int replica_index); + void removeReplicaFromEpoll(int index); /// Return NOT_READY state if there is no ready events, READY if replica is ready /// and CANNOT_CHOOSE if there is no more events in epoll. @@ -111,19 +101,22 @@ private: const ConnectionPoolWithFailoverPtr pool; const Settings * settings; const ConnectionTimeouts timeouts; - std::shared_ptr table_to_check; - std::vector replicas; std::vector shuffled_pools; + std::vector replicas; /// Map socket file descriptor to replica index. std::unordered_map fd_to_replica_index; + /// Map timeout for changing replica to replica index. + std::unordered_map timeout_fd_to_replica_index; + /// Indexes of replicas, that are in process of connection. size_t replicas_in_process_count = 0; /// Indexes of ready replicas. size_t ready_replicas_count = 0; + std::shared_ptr table_to_check; int last_used_index = -1; bool fallback_to_stale_replicas; Epoll epoll; diff --git a/src/Client/MultiplexedConnections.cpp b/src/Client/MultiplexedConnections.cpp index 577210792e9..8b2b7c49f26 100644 --- a/src/Client/MultiplexedConnections.cpp +++ b/src/Client/MultiplexedConnections.cpp @@ -265,7 +265,11 @@ Packet MultiplexedConnections::receivePacketUnlocked(AsyncCallback async_callbac if (current_connection == nullptr) throw Exception("Logical error: no available replica", ErrorCodes::NO_AVAILABLE_REPLICA); - Packet packet = current_connection->receivePacket(std::move(async_callback)); + Packet packet; + { + AsyncCallbackSetter async_setter(current_connection, std::move(async_callback)); + packet = current_connection->receivePacket(); + } switch (packet.type) { diff --git a/src/Client/PacketReceiver.h b/src/Client/PacketReceiver.h new file mode 100644 index 00000000000..4d42804d0a2 --- /dev/null +++ b/src/Client/PacketReceiver.h @@ -0,0 +1,143 @@ +#pragma once + +#if defined(OS_LINUX) + +#include +#include +#include + +namespace DB +{ + +/// Class for nonblocking packet receiving. It runs connection->receivePacket +/// in fiber and sets special read callback which is called when +/// reading from socket blocks. When read callback is called, +/// socket and receive timeout are added in epoll and execution returns to the main program. +/// So, you can poll this epoll file descriptor to determine when to resume +/// packet receiving (beside polling epoll descriptor, you also need to check connection->hasPendingData(), +/// because small packet can be read in buffer with the previous one, so new packet will be ready in buffer, +/// but there is no data socket to poll). +class PacketReceiver +{ +public: + PacketReceiver(Connection * connection_) : connection(connection_) + { + epoll.add(receive_timeout.getDescriptor()); + epoll.add(connection->getSocket()->impl()->sockfd()); + fiber = boost::context::fiber(std::allocator_arg_t(), fiber_stack, Routine{*this}); + } + + /// Resume packet receiving. 
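A rough caller-side sketch for PacketReceiver, using only members this new header declares (getFileDescriptor(), resume(), isPacketReady(), isReceiveTimeoutExpired(), getPacket()) plus Connection::hasReadPendingData(). The real driving loop is HedgedConnections::getReadyReplicaLocation(); treat this as a Linux-only illustration:

#include <poll.h>

bool tryReceiveOnePacket(PacketReceiver & receiver, Connection & connection, Packet & packet_out)
{
    if (!connection.hasReadPendingData())
    {
        /// Nothing buffered: wait until the receiver's epoll descriptor (socket or receive timer) fires.
        pollfd pfd{receiver.getFileDescriptor(), POLLIN, 0};
        if (::poll(&pfd, 1, /* timeout_ms = */ 100) <= 0)
            return false;
    }

    receiver.resume();                      /// Runs the fiber until it blocks again or a packet is ready.

    if (receiver.isReceiveTimeoutExpired())
        return false;                       /// Caller decides whether to drop this replica.
    if (!receiver.isPacketReady())
        return false;                       /// Read blocked mid-packet; poll and resume again later.

    packet_out = receiver.getPacket();
    return true;
}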
+ void resume() + { + /// If there is no pending data, check receive timeout. + if (!connection->hasReadPendingData() && !checkReceiveTimeout()) + return; + + fiber = std::move(fiber).resume(); + if (exception) + std::rethrow_exception(std::move(exception)); + } + + void cancel() + { + Fiber to_destroy = std::move(fiber); + connection = nullptr; + } + + Packet getPacket() { return std::move(packet); } + + int getFileDescriptor() const { return epoll.getFileDescriptor(); } + + bool isPacketReady() const { return !is_read_in_process; } + + bool isReceiveTimeoutExpired() const { return is_receive_timeout_expired; } + +private: + /// When epoll file descriptor is ready, check if it's an expired timeout + bool checkReceiveTimeout() + { + bool is_socket_ready = false; + is_receive_timeout_expired = false; + + epoll_event events[2]; + events[0].data.fd = events[1].data.fd = -1; + size_t ready_count = epoll.getManyReady(2, events, true); + + for (size_t i = 0; i != ready_count; ++i) + { + if (events[i].data.fd == connection->getSocket()->impl()->sockfd()) + is_socket_ready = true; + if (events[i].data.fd == receive_timeout.getDescriptor()) + is_receive_timeout_expired = true; + } + + if (is_receive_timeout_expired && !is_socket_ready) + { + receive_timeout.reset(); + return false; + } + + return true; + } + + struct Routine + { + PacketReceiver & receiver; + + struct ReadCallback + { + PacketReceiver & receiver; + Fiber & sink; + + void operator()(int, const Poco::Timespan & timeout, const std::string &) + { + receiver.receive_timeout.setRelative(timeout); + receiver.is_read_in_process = true; + sink = std::move(sink).resume(); + receiver.is_read_in_process = false; + receiver.receive_timeout.reset(); + } + }; + + Fiber operator()(Fiber && sink) + { + try + { + AsyncCallbackSetter async_setter(receiver.connection, ReadCallback{receiver, sink}); + while (true) + { + receiver.packet = receiver.connection->receivePacket(); + sink = std::move(sink).resume(); + } + + } + catch (const boost::context::detail::forced_unwind &) + { + /// This exception is thrown by fiber implementation in case if fiber is being deleted but hasn't exited + /// It should not be caught or it will segfault. + /// Other exceptions must be caught + throw; + } + catch (...) 
+ { + receiver.exception = std::current_exception(); + } + + return std::move(sink); + } + }; + + Connection * connection; + TimerDescriptor receive_timeout; + Epoll epoll; + Fiber fiber; + FiberStack fiber_stack; + Packet packet; + bool is_read_in_process = false; + bool is_receive_timeout_expired = false; + std::exception_ptr exception; +}; + +} +#endif diff --git a/src/Client/ya.make b/src/Client/ya.make index 7a664f328f7..af1dd05f1d4 100644 --- a/src/Client/ya.make +++ b/src/Client/ya.make @@ -11,6 +11,7 @@ PEERDIR( SRCS( Connection.cpp + ConnectionEstablisher.cpp ConnectionPoolWithFailover.cpp HedgedConnections.cpp HedgedConnectionsFactory.cpp diff --git a/src/Common/Epoll.cpp b/src/Common/Epoll.cpp index d085315b1a0..89c6c8abc49 100644 --- a/src/Common/Epoll.cpp +++ b/src/Common/Epoll.cpp @@ -21,20 +21,16 @@ Epoll::Epoll() : events_count(0) throwFromErrno("Cannot open epoll descriptor", DB::ErrorCodes::EPOLL_ERROR); } -Epoll::Epoll(Epoll && other) +Epoll::Epoll(Epoll && other) : epoll_fd(other.epoll_fd), events_count(other.events_count.load()) { - epoll_fd = other.epoll_fd; other.epoll_fd = -1; - int count = other.events_count; - events_count = count; } Epoll & Epoll::operator=(Epoll && other) { epoll_fd = other.epoll_fd; other.epoll_fd = -1; - int count = other.events_count; - events_count = count; + events_count.store(other.events_count.load()); return *this; } diff --git a/src/Core/Defines.h b/src/Core/Defines.h index 1e1e86171a1..8fd8e0d6bdf 100644 --- a/src/Core/Defines.h +++ b/src/Core/Defines.h @@ -12,8 +12,7 @@ #define DBMS_DEFAULT_SEND_TIMEOUT_SEC 300 #define DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC 300 /// Timeouts for hedged requests. -#define DBMS_DEFAULT_RECEIVE_HELLO_TIMEOUT_MS 100 -#define DBMS_DEFAULT_RECEIVE_TABLES_STATUS_TIMEOUT_MS 100 +#define DBMS_DEFAULT_HEDGED_CONNECTION_TIMEOUT_MS 100 #define DBMS_DEFAULT_RECEIVE_DATA_TIMEOUT_SEC 2 /// Timeout for synchronous request-result protocol call (like Ping or TablesStatus). 
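The Epoll change earlier in this patch exists because std::atomic has no copy or move constructor, so the events counter has to be transferred explicitly through load()/store(). A minimal standalone illustration of that pattern, with made-up names:

#include <atomic>

struct MovableCounterHolder
{
    std::atomic<int> count{0};

    MovableCounterHolder() = default;

    MovableCounterHolder(MovableCounterHolder && other) noexcept
        : count(other.count.load())          /// An atomic can be constructed from a plain value...
    {
        other.count.store(0);
    }

    MovableCounterHolder & operator=(MovableCounterHolder && other) noexcept
    {
        count.store(other.count.load());     /// ...but must be assigned through store()/load().
        other.count.store(0);
        return *this;
    }
};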
#define DBMS_DEFAULT_SYNC_REQUEST_TIMEOUT_SEC 5 diff --git a/src/Core/Settings.h b/src/Core/Settings.h index d4f5602122c..98483fceced 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -55,8 +55,7 @@ class IColumn; M(Seconds, receive_timeout, DBMS_DEFAULT_RECEIVE_TIMEOUT_SEC, "", 0) \ M(Seconds, send_timeout, DBMS_DEFAULT_SEND_TIMEOUT_SEC, "", 0) \ M(Seconds, tcp_keep_alive_timeout, 0, "The time in seconds the connection needs to remain idle before TCP starts sending keepalive probes", 0) \ - M(Milliseconds, receive_hello_timeout, DBMS_DEFAULT_RECEIVE_HELLO_TIMEOUT_MS, "Connection timeout for receiving hello from replica", 0) \ - M(Milliseconds, receive_tables_status_timeout, DBMS_DEFAULT_RECEIVE_TABLES_STATUS_TIMEOUT_MS, "Connection timeout for receiving tables status from replica", 0) \ + M(Milliseconds, hedged_connection_timeout, DBMS_DEFAULT_HEDGED_CONNECTION_TIMEOUT_MS, "Connection timeout for establishing connection with replica for Hedged requests", 0) \ M(Seconds, receive_data_timeout, DBMS_DEFAULT_RECEIVE_DATA_TIMEOUT_SEC, "Connection timeout for receiving first packet of data from replica", 0) \ M(Bool, use_hedged_requests, true, "Use hedged requests for distributed queries", 0) \ M(Milliseconds, queue_max_wait_ms, 0, "The wait time in the request queue, if the number of concurrent requests exceeds the maximum.", 0) \ @@ -220,9 +219,8 @@ class IColumn; M(Milliseconds, stream_poll_timeout_ms, 500, "Timeout for polling data from/to streaming storages.", 0) \ \ /** Settings for testing hedged requests */ \ - M(Int64, sleep_before_send_hello, 0, "Time to sleep before sending hello in TCPHandler", 0) \ - M(Int64, sleep_before_send_tables_status, 0, "Time to sleep before sending tables status response in TCPHandler", 0) \ - M(Int64, sleep_before_send_data, 0, "Time to sleep before sending data in TCPHandler", 0) \ + M(Int64, sleep_in_send_tables_status, 0, "Time to sleep in sending tables status response in TCPHandler", 0) \ + M(Int64, sleep_in_send_data, 0, "Time to sleep in sending data in TCPHandler", 0) \ \ M(Bool, insert_allow_materialized_columns, 0, "If setting is enabled, Allow materialized columns in INSERT.", 0) \ M(Seconds, http_connection_timeout, DEFAULT_HTTP_READ_BUFFER_CONNECTION_TIMEOUT, "HTTP connection timeout.", 0) \ diff --git a/src/IO/ConnectionTimeouts.h b/src/IO/ConnectionTimeouts.h index a92f75bf980..5f12a4edc79 100644 --- a/src/IO/ConnectionTimeouts.h +++ b/src/IO/ConnectionTimeouts.h @@ -18,8 +18,7 @@ struct ConnectionTimeouts Poco::Timespan secure_connection_timeout; /// Timeouts for HedgedConnections - Poco::Timespan receive_hello_timeout; - Poco::Timespan receive_tables_status_timeout; + Poco::Timespan hedged_connection_timeout; Poco::Timespan receive_data_timeout; ConnectionTimeouts() = default; @@ -33,8 +32,7 @@ struct ConnectionTimeouts tcp_keep_alive_timeout(0), http_keep_alive_timeout(0), secure_connection_timeout(connection_timeout), - receive_hello_timeout(receive_timeout_), - receive_tables_status_timeout(receive_timeout_), + hedged_connection_timeout(receive_timeout_), receive_data_timeout(receive_timeout_) { } @@ -49,8 +47,7 @@ struct ConnectionTimeouts tcp_keep_alive_timeout(tcp_keep_alive_timeout_), http_keep_alive_timeout(0), secure_connection_timeout(connection_timeout), - receive_hello_timeout(receive_timeout_), - receive_tables_status_timeout(receive_timeout_), + hedged_connection_timeout(receive_timeout_), receive_data_timeout(receive_timeout_) { } @@ -65,8 +62,7 @@ struct ConnectionTimeouts 
tcp_keep_alive_timeout(tcp_keep_alive_timeout_), http_keep_alive_timeout(http_keep_alive_timeout_), secure_connection_timeout(connection_timeout), - receive_hello_timeout(receive_timeout_), - receive_tables_status_timeout(receive_timeout_), + hedged_connection_timeout(receive_timeout_), receive_data_timeout(receive_timeout_) { } @@ -78,7 +74,6 @@ struct ConnectionTimeouts const Poco::Timespan & http_keep_alive_timeout_, const Poco::Timespan & secure_connection_timeout_, const Poco::Timespan & receive_hello_timeout_, - const Poco::Timespan & receive_tables_status_timeout_, const Poco::Timespan & receive_data_timeout_) : connection_timeout(connection_timeout_), send_timeout(send_timeout_), @@ -86,8 +81,7 @@ struct ConnectionTimeouts tcp_keep_alive_timeout(tcp_keep_alive_timeout_), http_keep_alive_timeout(http_keep_alive_timeout_), secure_connection_timeout(secure_connection_timeout_), - receive_hello_timeout(receive_hello_timeout_), - receive_tables_status_timeout(receive_tables_status_timeout_), + hedged_connection_timeout(receive_hello_timeout_), receive_data_timeout(receive_data_timeout_) { } @@ -108,8 +102,7 @@ struct ConnectionTimeouts saturate(tcp_keep_alive_timeout, limit), saturate(http_keep_alive_timeout, limit), saturate(secure_connection_timeout, limit), - saturate(receive_hello_timeout, limit), - saturate(receive_tables_status_timeout, limit), + saturate(hedged_connection_timeout, limit), saturate(receive_data_timeout, limit)); } diff --git a/src/IO/ConnectionTimeoutsContext.h b/src/IO/ConnectionTimeoutsContext.h index c6daae39e7a..c08ec2e8b92 100644 --- a/src/IO/ConnectionTimeoutsContext.h +++ b/src/IO/ConnectionTimeoutsContext.h @@ -23,8 +23,7 @@ inline ConnectionTimeouts ConnectionTimeouts::getTCPTimeoutsWithFailover(const S settings.tcp_keep_alive_timeout, 0, settings.connect_timeout_with_failover_secure_ms, - settings.receive_hello_timeout, - settings.receive_tables_status_timeout, + settings.hedged_connection_timeout, settings.receive_data_timeout); } diff --git a/src/Server/TCPHandler.cpp b/src/Server/TCPHandler.cpp index fdbb6d0e3c7..8500959ec69 100644 --- a/src/Server/TCPHandler.cpp +++ b/src/Server/TCPHandler.cpp @@ -717,16 +717,18 @@ void TCPHandler::processTablesStatusRequest() response.table_states_by_id.emplace(table_name, std::move(status)); } + + writeVarUInt(Protocol::Server::TablesStatusResponse, *out); + /// For testing hedged requests const Settings & settings = query_context->getSettingsRef(); - if (settings.sleep_before_send_tables_status) + if (settings.sleep_in_send_tables_status) { - std::chrono::seconds sec(settings.sleep_before_send_tables_status); + out->next(); + std::chrono::seconds sec(settings.sleep_in_send_tables_status); std::this_thread::sleep_for(sec); } - - writeVarUInt(Protocol::Server::TablesStatusResponse, *out); response.write(*out, client_tcp_protocol_version); } @@ -940,14 +942,6 @@ void TCPHandler::receiveUnexpectedHello() void TCPHandler::sendHello() { - /// For testing hedged requests - const Settings & settings = query_context->getSettingsRef(); - if (settings.sleep_before_send_hello) - { - std::chrono::seconds sec(settings.sleep_before_send_hello); - std::this_thread::sleep_for(sec); - } - writeVarUInt(Protocol::Server::Hello, *out); writeStringBinary(DBMS_NAME, *out); writeVarUInt(DBMS_VERSION_MAJOR, *out); @@ -1405,20 +1399,21 @@ bool TCPHandler::isQueryCancelled() void TCPHandler::sendData(const Block & block) { - /// For testing hedged requests - const Settings & settings = query_context->getSettingsRef(); - if 
(settings.sleep_before_send_data) - { - std::chrono::seconds sec(settings.sleep_before_send_data); - std::this_thread::sleep_for(sec); - } - initBlockOutput(block); writeVarUInt(Protocol::Server::Data, *out); /// Send external table name (empty name is the main table) writeStringBinary("", *out); + /// For testing hedged requests + const Settings & settings = query_context->getSettingsRef(); + if (settings.sleep_in_send_data) + { + out->next(); + std::chrono::seconds sec(settings.sleep_in_send_data); + std::this_thread::sleep_for(sec); + } + state.block_out->write(block); state.maybe_compressed_out->next(); out->next(); diff --git a/tests/integration/test_distributed_load_balancing/test.py b/tests/integration/test_distributed_load_balancing/test.py index d3ac5c132cd..c7fd6d9ab26 100644 --- a/tests/integration/test_distributed_load_balancing/test.py +++ b/tests/integration/test_distributed_load_balancing/test.py @@ -166,7 +166,11 @@ def test_load_balancing_priority_round_robin(dist_table): def test_distributed_replica_max_ignored_errors(): settings = { +<<<<<<< Updated upstream 'use_hedged_requests' : 0, +======= + 'use_hedged_requests': 0, +>>>>>>> Stashed changes 'load_balancing': 'in_order', 'prefer_localhost_replica': 0, 'connect_timeout': 2, diff --git a/tests/integration/test_hedged_requests/test.py b/tests/integration/test_hedged_requests/test.py index 24dc9202880..dc64b8a7ba9 100644 --- a/tests/integration/test_hedged_requests/test.py +++ b/tests/integration/test_hedged_requests/test.py @@ -10,23 +10,22 @@ from helpers.cluster import ClickHouseCluster from helpers.network import PartitionManager cluster = ClickHouseCluster(__file__) - NODES = {'node_' + str(i): None for i in (1, 2, 3)} + NODES['node'] = None sleep_time = 30 - @pytest.fixture(scope="module") def started_cluster(): cluster = ClickHouseCluster(__file__) NODES['node'] = cluster.add_instance( - 'node', with_zookeeper=True, main_configs=['configs/remote_servers.xml'], user_configs=['configs/users.xml']) + 'node', with_zookeeper=True, stay_alive=True, main_configs=['configs/remote_servers.xml'], user_configs=['configs/users.xml']) for name in NODES: if name != 'node': - NODES[name] = cluster.add_instance(name, with_zookeeper=True, stay_alive=True, user_configs=['configs/users1.xml']) - + NODES[name] = cluster.add_instance(name, with_zookeeper=True, user_configs=['configs/users1.xml']) + try: cluster.start() @@ -48,216 +47,128 @@ def started_cluster(): config = ''' - {sleep_before_send_hello} - {sleep_before_send_tables_status} - {sleep_before_send_data} + {sleep_in_send_tables_status} + {sleep_in_send_data} ''' def check_query(): + NODES['node'].restart_clickhouse() + # Without hedged requests select query will last more than 30 seconds, # with hedged requests it will last just around 1-2 second start = time.time() NODES['node'].query("SELECT * FROM distributed"); query_time = time.time() - start - print(query_time) + print("Query time:", query_time) - assert query_time < 5 - - -def test_send_hello_sleep(started_cluster): - NODES['node_1'].replace_config( - '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_before_send_hello=sleep_time, sleep_before_send_tables_status=0, sleep_before_send_data=0)) - - NODES['node_1'].restart_clickhouse(sleep_time) - NODES['node_1'].query('SYSTEM SYNC REPLICA replicated') - - check_query() - - -def test_send_hello_sleep2(started_cluster): - NODES['node_1'].replace_config( - '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_before_send_hello=sleep_time, 
sleep_before_send_tables_status=0, sleep_before_send_data=0)) - - NODES['node_1'].restart_clickhouse(sleep_time) - NODES['node_1'].query('SYSTEM SYNC REPLICA replicated') - - - NODES['node_2'].replace_config( - '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_before_send_hello=sleep_time, sleep_before_send_tables_status=0, sleep_before_send_data=0)) - - NODES['node_2'].restart_clickhouse(sleep_time) - NODES['node_2'].query('SYSTEM SYNC REPLICA replicated') - - check_query() + assert query_time < 10 def test_send_table_status_sleep(started_cluster): NODES['node_1'].replace_config( '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_before_send_hello=0, sleep_before_send_tables_status=sleep_time, sleep_before_send_data=0)) - - NODES['node_1'].restart_clickhouse(sleep_time) - NODES['node_1'].query('SYSTEM SYNC REPLICA replicated') + config.format(sleep_in_send_tables_status=sleep_time, sleep_in_send_data=0)) + time.sleep(2) check_query() def test_send_table_status_sleep2(started_cluster): NODES['node_1'].replace_config( '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_before_send_hello=0, sleep_before_send_tables_status=sleep_time, sleep_before_send_data=0)) - - NODES['node_1'].restart_clickhouse(sleep_time) - NODES['node_1'].query('SYSTEM SYNC REPLICA replicated') + config.format(sleep_in_send_tables_status=sleep_time, sleep_in_send_data=0)) NODES['node_2'].replace_config( '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_before_send_hello=0, sleep_before_send_tables_status=sleep_time, sleep_before_send_data=0)) - - NODES['node_2'].restart_clickhouse(sleep_time) - NODES['node_2'].query('SYSTEM SYNC REPLICA replicated') + config.format(sleep_in_send_tables_status=sleep_time, sleep_in_send_data=0)) + time.sleep(2) check_query() + def test_send_data(started_cluster): NODES['node_1'].replace_config( '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_before_send_hello=0, sleep_before_send_tables_status=0, sleep_before_send_data=sleep_time)) - - NODES['node_1'].restart_clickhouse(sleep_time) - NODES['node_1'].query('SYSTEM SYNC REPLICA replicated') + config.format(sleep_in_send_tables_status=0, sleep_in_send_data=sleep_time)) + time.sleep(2) + check_query() def test_send_data2(started_cluster): NODES['node_1'].replace_config( '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_before_send_hello=0, sleep_before_send_tables_status=0, sleep_before_send_data=sleep_time)) - - NODES['node_1'].restart_clickhouse(sleep_time) - NODES['node_1'].query('SYSTEM SYNC REPLICA replicated') + config.format(sleep_in_send_tables_status=0, sleep_in_send_data=sleep_time)) NODES['node_2'].replace_config( '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_before_send_hello=0, sleep_before_send_tables_status=0, sleep_before_send_data=sleep_time)) - - NODES['node_2'].restart_clickhouse(sleep_time) - NODES['node_2'].query('SYSTEM SYNC REPLICA replicated') + config.format(sleep_in_send_tables_status=0, sleep_in_send_data=sleep_time)) + time.sleep(2) check_query() def test_combination1(started_cluster): NODES['node_1'].replace_config( '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_before_send_hello=sleep_time, sleep_before_send_tables_status=0, sleep_before_send_data=0)) - - NODES['node_1'].restart_clickhouse(sleep_time) - NODES['node_1'].query('SYSTEM SYNC REPLICA replicated') + config.format(sleep_in_send_tables_status=sleep_time, sleep_in_send_data=0)) NODES['node_2'].replace_config( 
'/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_before_send_hello=0, sleep_before_send_tables_status=sleep_time, sleep_before_send_data=0)) - - NODES['node_2'].restart_clickhouse(sleep_time) - NODES['node_2'].query('SYSTEM SYNC REPLICA replicated') + config.format(sleep_in_send_tables_status=0, sleep_in_send_data=sleep_time)) + time.sleep(2) check_query() def test_combination2(started_cluster): NODES['node_1'].replace_config( '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_before_send_hello=sleep_time, sleep_before_send_tables_status=0, sleep_before_send_data=0)) - - NODES['node_1'].restart_clickhouse(sleep_time) - NODES['node_1'].query('SYSTEM SYNC REPLICA replicated') + config.format(sleep_in_send_tables_status=0, sleep_in_send_data=sleep_time)) NODES['node_2'].replace_config( '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_before_send_hello=0, sleep_before_send_tables_status=0, sleep_before_send_data=sleep_time)) - - NODES['node_2'].restart_clickhouse(sleep_time) - NODES['node_2'].query('SYSTEM SYNC REPLICA replicated') + config.format(sleep_in_send_tables_status=sleep_time, sleep_in_send_data=0)) + time.sleep(2) check_query() def test_combination3(started_cluster): NODES['node_1'].replace_config( '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_before_send_hello=0, sleep_before_send_tables_status=sleep_time, sleep_before_send_data=0)) - - NODES['node_1'].restart_clickhouse(sleep_time) - NODES['node_1'].query('SYSTEM SYNC REPLICA replicated') + config.format(sleep_in_send_tables_status=0, sleep_in_send_data=sleep_time)) NODES['node_2'].replace_config( '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_before_send_hello=sleep_time, sleep_before_send_tables_status=0, sleep_before_send_data=0)) + config.format(sleep_in_send_tables_status=1, sleep_in_send_data=0)) - NODES['node_2'].restart_clickhouse(sleep_time) - NODES['node_2'].query('SYSTEM SYNC REPLICA replicated') + NODES['node_3'].replace_config( + '/etc/clickhouse-server/users.d/users1.xml', + config.format(sleep_in_send_tables_status=0, sleep_in_send_data=sleep_time)) + time.sleep(2) check_query() def test_combination4(started_cluster): NODES['node_1'].replace_config( '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_before_send_hello=0, sleep_before_send_tables_status=sleep_time, sleep_before_send_data=0)) - - NODES['node_1'].restart_clickhouse(sleep_time) - NODES['node_1'].query('SYSTEM SYNC REPLICA replicated') + config.format(sleep_in_send_tables_status=1, sleep_in_send_data=sleep_time)) NODES['node_2'].replace_config( '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_before_send_hello=0, sleep_before_send_tables_status=0, sleep_before_send_data=sleep_time)) + config.format(sleep_in_send_tables_status=1, sleep_in_send_data=0)) - NODES['node_2'].restart_clickhouse(sleep_time) - NODES['node_2'].query('SYSTEM SYNC REPLICA replicated') - - check_query() - -def test_combination5(started_cluster): - NODES['node_1'].replace_config( - '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_before_send_hello=0, sleep_before_send_tables_status=0, sleep_before_send_data=sleep_time)) - - NODES['node_1'].restart_clickhouse(sleep_time) - NODES['node_1'].query('SYSTEM SYNC REPLICA replicated') - - NODES['node_2'].replace_config( - '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_before_send_hello=sleep_time, sleep_before_send_tables_status=0, sleep_before_send_data=0)) - - 
NODES['node_2'].restart_clickhouse(sleep_time) - NODES['node_2'].query('SYSTEM SYNC REPLICA replicated') - - check_query() - - -def test_combination6(started_cluster): - NODES['node_1'].replace_config( - '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_before_send_hello=0, sleep_before_send_tables_status=0, sleep_before_send_data=sleep_time)) - - NODES['node_1'].restart_clickhouse(sleep_time) - NODES['node_1'].query('SYSTEM SYNC REPLICA replicated') - - NODES['node_2'].replace_config( - '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_before_send_hello=0, sleep_before_send_tables_status=sleep_time, sleep_before_send_data=0)) - - NODES['node_2'].restart_clickhouse(sleep_time) - NODES['node_2'].query('SYSTEM SYNC REPLICA replicated') + NODES['node_3'].replace_config( + '/etc/clickhouse-server/users.d/users1.xml', + config.format(sleep_in_send_tables_status=2, sleep_in_send_data=0)) + time.sleep(2) check_query() diff --git a/tests/integration/test_hedged_requests_parallel/test.py b/tests/integration/test_hedged_requests_parallel/test.py index 95e32a0f3fc..b713cf14af4 100644 --- a/tests/integration/test_hedged_requests_parallel/test.py +++ b/tests/integration/test_hedged_requests_parallel/test.py @@ -20,11 +20,11 @@ sleep_time = 30 def started_cluster(): cluster = ClickHouseCluster(__file__) NODES['node'] = cluster.add_instance( - 'node', with_zookeeper=True, main_configs=['configs/remote_servers.xml'], user_configs=['configs/users.xml']) + 'node', with_zookeeper=True, stay_alive=True, main_configs=['configs/remote_servers.xml'], user_configs=['configs/users.xml']) for name in NODES: if name != 'node': - NODES[name] = cluster.add_instance(name, with_zookeeper=True, stay_alive=True, user_configs=['configs/users1.xml']) + NODES[name] = cluster.add_instance(name, with_zookeeper=True, user_configs=['configs/users1.xml']) try: cluster.start() @@ -47,78 +47,88 @@ def started_cluster(): config = ''' - {sleep_before_send_hello} - {sleep_before_send_tables_status} - {sleep_before_send_data} + {sleep_in_send_tables_status} + {sleep_in_send_data} ''' def check_query(): + NODES['node'].restart_clickhouse() + # Without hedged requests select query will last more than 30 seconds, # with hedged requests it will last just around 1-2 second start = time.time() NODES['node'].query("SELECT * FROM distributed"); query_time = time.time() - start - print(query_time) + print("Query time:", query_time) assert query_time < 5 -def test_send_hello_sleep(started_cluster): - NODES['node_1'].replace_config( - '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_before_send_hello=sleep_time, sleep_before_send_tables_status=0, sleep_before_send_data=0)) - - NODES['node_1'].restart_clickhouse(sleep_time) - NODES['node_1'].query('SYSTEM SYNC REPLICA replicated') - - - NODES['node_2'].replace_config( - '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_before_send_hello=sleep_time, sleep_before_send_tables_status=0, sleep_before_send_data=0)) - - NODES['node_2'].restart_clickhouse(sleep_time) - NODES['node_2'].query('SYSTEM SYNC REPLICA replicated') - - check_query() - - def test_send_table_status_sleep(started_cluster): NODES['node_1'].replace_config( '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_before_send_hello=0, sleep_before_send_tables_status=sleep_time, sleep_before_send_data=0)) - - NODES['node_1'].restart_clickhouse(sleep_time) - NODES['node_1'].query('SYSTEM SYNC REPLICA replicated') + 
config.format(sleep_in_send_tables_status=sleep_time, sleep_in_send_data=0)) NODES['node_2'].replace_config( '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_before_send_hello=0, sleep_before_send_tables_status=sleep_time, sleep_before_send_data=0)) - - NODES['node_2'].restart_clickhouse(sleep_time) - NODES['node_2'].query('SYSTEM SYNC REPLICA replicated') + config.format(sleep_in_send_tables_status=sleep_time, sleep_in_send_data=0)) + time.sleep(2) check_query() def test_send_data(started_cluster): NODES['node_1'].replace_config( '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_before_send_hello=0, sleep_before_send_tables_status=0, sleep_before_send_data=sleep_time)) - - NODES['node_1'].restart_clickhouse(sleep_time) - NODES['node_1'].query('SYSTEM SYNC REPLICA replicated') + config.format(sleep_in_send_tables_status=0, sleep_in_send_data=sleep_time)) NODES['node_2'].replace_config( '/etc/clickhouse-server/users.d/users1.xml', - config.format(sleep_before_send_hello=0, sleep_before_send_tables_status=0, sleep_before_send_data=sleep_time)) - - NODES['node_2'].restart_clickhouse(sleep_time) - NODES['node_2'].query('SYSTEM SYNC REPLICA replicated') + config.format(sleep_in_send_tables_status=0, sleep_in_send_data=sleep_time)) + time.sleep(2) + check_query() + + +def test_combination1(started_cluster): + NODES['node_1'].replace_config( + '/etc/clickhouse-server/users.d/users1.xml', + config.format(sleep_in_send_tables_status=1, sleep_in_send_data=0)) + + NODES['node_2'].replace_config( + '/etc/clickhouse-server/users.d/users1.xml', + config.format(sleep_in_send_tables_status=1, sleep_in_send_data=0)) + + NODES['node_3'].replace_config( + '/etc/clickhouse-server/users.d/users1.xml', + config.format(sleep_in_send_tables_status=0, sleep_in_send_data=sleep_time)) + + time.sleep(2) + check_query() + + +def test_combination2(started_cluster): + NODES['node_1'].replace_config( + '/etc/clickhouse-server/users.d/users1.xml', + config.format(sleep_in_send_tables_status=0, sleep_in_send_data=sleep_time)) + + NODES['node_2'].replace_config( + '/etc/clickhouse-server/users.d/users1.xml', + config.format(sleep_in_send_tables_status=1, sleep_in_send_data=0)) + + NODES['node_3'].replace_config( + '/etc/clickhouse-server/users.d/users1.xml', + config.format(sleep_in_send_tables_status=0, sleep_in_send_data=sleep_time)) + + NODES['node_4'].replace_config( + '/etc/clickhouse-server/users.d/users1.xml', + config.format(sleep_in_send_tables_status=1, sleep_in_send_data=0)) + + time.sleep(2) check_query() From bc6fe4f28c4a991c05bf87610f3350c7a5512e75 Mon Sep 17 00:00:00 2001 From: Pavel Kruglov Date: Wed, 17 Feb 2021 20:44:11 +0300 Subject: [PATCH 0384/2357] Minor fixes --- src/Client/Connection.cpp | 16 ++++++++++------ src/Client/Connection.h | 1 + src/Client/ConnectionPoolWithFailover.h | 1 - .../test_distributed_load_balancing/test.py | 4 ---- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/Client/Connection.cpp b/src/Client/Connection.cpp index d30a6555da5..b6903ae6c92 100644 --- a/src/Client/Connection.cpp +++ b/src/Client/Connection.cpp @@ -66,10 +66,10 @@ void Connection::connect(const ConnectionTimeouts & timeouts) disconnect(); LOG_TRACE(log_wrapper.get(), "Connecting. Database: {}. User: {}{}{}", - default_database.empty() ? "(not specified)" : default_database, - user, - static_cast(secure) ? ". Secure" : "", - static_cast(compression) ? "" : ". Uncompressed"); + default_database.empty() ? 
"(not specified)" : default_database, + user, + static_cast(secure) ? ". Secure" : "", + static_cast(compression) ? "" : ". Uncompressed"); if (static_cast(secure)) { @@ -103,7 +103,7 @@ void Connection::connect(const ConnectionTimeouts & timeouts) #if defined(TCP_KEEPALIVE) TCP_KEEPALIVE #else - TCP_KEEPIDLE // __APPLE__ + TCP_KEEPIDLE // __APPLE__ #endif , timeouts.tcp_keep_alive_timeout); } @@ -120,7 +120,7 @@ void Connection::connect(const ConnectionTimeouts & timeouts) receiveHello(); LOG_TRACE(log_wrapper.get(), "Connected to {} server version {}.{}.{}.", - server_name, server_version_major, server_version_minor, server_version_patch); + server_name, server_version_major, server_version_minor, server_version_patch); } catch (Poco::Net::NetException & e) { @@ -151,6 +151,7 @@ void Connection::disconnect() connected = false; } + void Connection::sendHello() { /** Disallow control characters in user controlled parameters @@ -206,6 +207,7 @@ void Connection::sendHello() out->next(); } + void Connection::receiveHello() { /// Receive hello packet. @@ -337,6 +339,8 @@ void Connection::sendClusterNameAndSalt() bool Connection::ping() { + // LOG_TRACE(log_wrapper.get(), "Ping"); + TimeoutSetter timeout_setter(*socket, sync_request_timeout, true); try { diff --git a/src/Client/Connection.h b/src/Client/Connection.h index dd501b5f6ef..d317ecb56b3 100644 --- a/src/Client/Connection.h +++ b/src/Client/Connection.h @@ -196,6 +196,7 @@ public: Poco::Net::Socket * getSocket() { return socket.get(); } + /// Each time read from socket blocks and async_callback is set, it will be called. You can poll socket inside it. void setAsyncCallback(AsyncCallback async_callback_) { async_callback = std::move(async_callback_); diff --git a/src/Client/ConnectionPoolWithFailover.h b/src/Client/ConnectionPoolWithFailover.h index 2ecb0492747..023ef863bdf 100644 --- a/src/Client/ConnectionPoolWithFailover.h +++ b/src/Client/ConnectionPoolWithFailover.h @@ -31,7 +31,6 @@ enum class PoolMode GET_ALL }; - class ConnectionPoolWithFailover : public IConnectionPool, private PoolWithFailoverBase { public: diff --git a/tests/integration/test_distributed_load_balancing/test.py b/tests/integration/test_distributed_load_balancing/test.py index c7fd6d9ab26..d3ac5c132cd 100644 --- a/tests/integration/test_distributed_load_balancing/test.py +++ b/tests/integration/test_distributed_load_balancing/test.py @@ -166,11 +166,7 @@ def test_load_balancing_priority_round_robin(dist_table): def test_distributed_replica_max_ignored_errors(): settings = { -<<<<<<< Updated upstream 'use_hedged_requests' : 0, -======= - 'use_hedged_requests': 0, ->>>>>>> Stashed changes 'load_balancing': 'in_order', 'prefer_localhost_replica': 0, 'connect_timeout': 2, From 18e036d19b1402007c2e5806c89ce435ced96517 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Mon, 11 Jan 2021 04:50:30 +0300 Subject: [PATCH 0385/2357] Improved serialization for data types combined of Arrays and Tuples. Improved matching enum data types to protobuf enum type. Fixed serialization of the Map data type. Omitted values are now set by default. 
--- docker/test/stateless/Dockerfile | 1 + src/Columns/ColumnFixedString.cpp | 14 + src/Columns/ColumnFixedString.h | 3 +- src/Common/ErrorCodes.cpp | 6 +- src/DataTypes/DataTypeAggregateFunction.cpp | 41 - src/DataTypes/DataTypeAggregateFunction.h | 2 - src/DataTypes/DataTypeArray.cpp | 50 - src/DataTypes/DataTypeArray.h | 9 - src/DataTypes/DataTypeDate.cpp | 26 - src/DataTypes/DataTypeDate.h | 2 - src/DataTypes/DataTypeDateTime.cpp | 28 - src/DataTypes/DataTypeDateTime.h | 2 - src/DataTypes/DataTypeDateTime64.cpp | 26 - src/DataTypes/DataTypeDateTime64.h | 2 - src/DataTypes/DataTypeDecimalBase.cpp | 2 - src/DataTypes/DataTypeEnum.cpp | 30 - src/DataTypes/DataTypeEnum.h | 3 - src/DataTypes/DataTypeFixedString.cpp | 61 +- src/DataTypes/DataTypeFixedString.h | 3 - src/DataTypes/DataTypeLowCardinality.cpp | 25 - src/DataTypes/DataTypeLowCardinality.h | 2 - src/DataTypes/DataTypeMap.cpp | 10 - src/DataTypes/DataTypeMap.h | 5 +- src/DataTypes/DataTypeNullable.cpp | 27 - src/DataTypes/DataTypeNullable.h | 3 - src/DataTypes/DataTypeNumberBase.cpp | 30 - src/DataTypes/DataTypeNumberBase.h | 3 - src/DataTypes/DataTypeString.cpp | 51 - src/DataTypes/DataTypeString.h | 3 - src/DataTypes/DataTypeTuple.cpp | 27 - src/DataTypes/DataTypeTuple.h | 3 - src/DataTypes/DataTypeUUID.cpp | 26 - src/DataTypes/DataTypeUUID.h | 2 - src/DataTypes/DataTypesDecimal.cpp | 29 - src/DataTypes/DataTypesDecimal.h | 3 - src/DataTypes/IDataType.h | 7 - src/DataTypes/IDataTypeDummy.h | 2 - src/Formats/FormatSettings.h | 3 +- src/Formats/ProtobufColumnMatcher.cpp | 55 - src/Formats/ProtobufColumnMatcher.h | 196 -- src/Formats/ProtobufReader.cpp | 945 +----- src/Formats/ProtobufReader.h | 294 +- src/Formats/ProtobufSerializer.cpp | 2921 +++++++++++++++++ src/Formats/ProtobufSerializer.h | 52 + src/Formats/ProtobufWriter.cpp | 843 +---- src/Formats/ProtobufWriter.h | 322 +- src/Formats/ya.make | 2 +- .../Formats/Impl/ProtobufRowInputFormat.cpp | 73 +- .../Formats/Impl/ProtobufRowInputFormat.h | 13 +- .../Formats/Impl/ProtobufRowOutputFormat.cpp | 71 +- .../Formats/Impl/ProtobufRowOutputFormat.h | 29 +- src/Storages/Kafka/KafkaBlockOutputStream.cpp | 2 +- .../RabbitMQ/RabbitMQBlockOutputStream.cpp | 2 +- .../00825_protobuf_format_array_3dim.proto | 14 + ...00825_protobuf_format_array_3dim.reference | 52 + .../00825_protobuf_format_array_3dim.sh | 35 + ...0825_protobuf_format_array_of_arrays.proto | 9 + ..._protobuf_format_array_of_arrays.reference | 41 + .../00825_protobuf_format_array_of_arrays.sh | 38 + .../00825_protobuf_format_enum_mapping.proto | 13 + ...825_protobuf_format_enum_mapping.reference | 31 + .../00825_protobuf_format_enum_mapping.sh | 37 + .../00825_protobuf_format_map.proto | 5 + .../00825_protobuf_format_map.reference | 19 + .../0_stateless/00825_protobuf_format_map.sh | 40 + ...0825_protobuf_format_nested_optional.proto | 10 + ..._protobuf_format_nested_optional.reference | 25 + .../00825_protobuf_format_nested_optional.sh | 41 + .../00825_protobuf_format_table_default.proto | 6 + ...25_protobuf_format_table_default.reference | 37 + .../00825_protobuf_format_table_default.sh | 38 + .../protobuf_length_delimited_encoder.py | 180 + tests/queries/skip_list.json | 6 + 73 files changed, 3990 insertions(+), 3079 deletions(-) delete mode 100644 src/Formats/ProtobufColumnMatcher.cpp delete mode 100644 src/Formats/ProtobufColumnMatcher.h create mode 100644 src/Formats/ProtobufSerializer.cpp create mode 100644 src/Formats/ProtobufSerializer.h create mode 100644 
tests/queries/0_stateless/00825_protobuf_format_array_3dim.proto create mode 100644 tests/queries/0_stateless/00825_protobuf_format_array_3dim.reference create mode 100755 tests/queries/0_stateless/00825_protobuf_format_array_3dim.sh create mode 100644 tests/queries/0_stateless/00825_protobuf_format_array_of_arrays.proto create mode 100644 tests/queries/0_stateless/00825_protobuf_format_array_of_arrays.reference create mode 100755 tests/queries/0_stateless/00825_protobuf_format_array_of_arrays.sh create mode 100644 tests/queries/0_stateless/00825_protobuf_format_enum_mapping.proto create mode 100644 tests/queries/0_stateless/00825_protobuf_format_enum_mapping.reference create mode 100755 tests/queries/0_stateless/00825_protobuf_format_enum_mapping.sh create mode 100644 tests/queries/0_stateless/00825_protobuf_format_map.proto create mode 100644 tests/queries/0_stateless/00825_protobuf_format_map.reference create mode 100755 tests/queries/0_stateless/00825_protobuf_format_map.sh create mode 100644 tests/queries/0_stateless/00825_protobuf_format_nested_optional.proto create mode 100644 tests/queries/0_stateless/00825_protobuf_format_nested_optional.reference create mode 100755 tests/queries/0_stateless/00825_protobuf_format_nested_optional.sh create mode 100644 tests/queries/0_stateless/00825_protobuf_format_table_default.proto create mode 100644 tests/queries/0_stateless/00825_protobuf_format_table_default.reference create mode 100755 tests/queries/0_stateless/00825_protobuf_format_table_default.sh create mode 100755 tests/queries/0_stateless/helpers/protobuf_length_delimited_encoder.py diff --git a/docker/test/stateless/Dockerfile b/docker/test/stateless/Dockerfile index b063f8d81f6..10b213803c9 100644 --- a/docker/test/stateless/Dockerfile +++ b/docker/test/stateless/Dockerfile @@ -13,6 +13,7 @@ RUN apt-get update -y \ ncdu \ netcat-openbsd \ openssl \ + protobuf-compiler \ python3 \ python3-lxml \ python3-requests \ diff --git a/src/Columns/ColumnFixedString.cpp b/src/Columns/ColumnFixedString.cpp index 55e387ff2ee..6cfec89a5dc 100644 --- a/src/Columns/ColumnFixedString.cpp +++ b/src/Columns/ColumnFixedString.cpp @@ -446,4 +446,18 @@ void ColumnFixedString::getExtremes(Field & min, Field & max) const get(max_idx, max); } +void ColumnFixedString::alignStringLength(ColumnFixedString::Chars & data, size_t n, size_t old_size) +{ + size_t length = data.size() - old_size; + if (length < n) + { + data.resize_fill(old_size + n); + } + else if (length > n) + { + data.resize_assume_reserved(old_size); + throw Exception("Too large value for FixedString(" + std::to_string(n) + ")", ErrorCodes::TOO_LARGE_STRING_SIZE); + } +} + } diff --git a/src/Columns/ColumnFixedString.h b/src/Columns/ColumnFixedString.h index 286b3a752dc..24a99c27b13 100644 --- a/src/Columns/ColumnFixedString.h +++ b/src/Columns/ColumnFixedString.h @@ -182,7 +182,8 @@ public: const Chars & getChars() const { return chars; } size_t getN() const { return n; } + + static void alignStringLength(ColumnFixedString::Chars & data, size_t n, size_t old_size); }; - } diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp index d0d83448b68..52c22c2e371 100644 --- a/src/Common/ErrorCodes.cpp +++ b/src/Common/ErrorCodes.cpp @@ -404,7 +404,7 @@ M(432, UNKNOWN_CODEC) \ M(433, ILLEGAL_CODEC_PARAMETER) \ M(434, CANNOT_PARSE_PROTOBUF_SCHEMA) \ - M(435, NO_DATA_FOR_REQUIRED_PROTOBUF_FIELD) \ + M(435, NO_COLUMN_SERIALIZED_TO_REQUIRED_PROTOBUF_FIELD) \ M(436, PROTOBUF_BAD_CAST) \ M(437, PROTOBUF_FIELD_NOT_REPEATED) \ M(438, 
DATA_TYPE_CANNOT_BE_PROMOTED) \ @@ -412,7 +412,7 @@ M(440, INVALID_LIMIT_EXPRESSION) \ M(441, CANNOT_PARSE_DOMAIN_VALUE_FROM_STRING) \ M(442, BAD_DATABASE_FOR_TEMPORARY_TABLE) \ - M(443, NO_COMMON_COLUMNS_WITH_PROTOBUF_SCHEMA) \ + M(443, NO_COLUMNS_SERIALIZED_TO_PROTOBUF_FIELDS) \ M(444, UNKNOWN_PROTOBUF_FORMAT) \ M(445, CANNOT_MPROTECT) \ M(446, FUNCTION_NOT_ALLOWED) \ @@ -535,6 +535,8 @@ M(566, CANNOT_RMDIR) \ M(567, DUPLICATED_PART_UUIDS) \ M(568, RAFT_ERROR) \ + M(569, MULTIPLE_COLUMNS_SERIALIZED_TO_SAME_PROTOBUF_FIELD) \ + M(570, DATA_TYPE_INCOMPATIBLE_WITH_PROTOBUF_FIELD) \ \ M(999, KEEPER_EXCEPTION) \ M(1000, POCO_EXCEPTION) \ diff --git a/src/DataTypes/DataTypeAggregateFunction.cpp b/src/DataTypes/DataTypeAggregateFunction.cpp index 9104c12120f..e92994ae979 100644 --- a/src/DataTypes/DataTypeAggregateFunction.cpp +++ b/src/DataTypes/DataTypeAggregateFunction.cpp @@ -10,8 +10,6 @@ #include #include -#include -#include #include #include #include @@ -261,45 +259,6 @@ void DataTypeAggregateFunction::deserializeTextCSV(IColumn & column, ReadBuffer } -void DataTypeAggregateFunction::serializeProtobuf(const IColumn & column, size_t row_num, ProtobufWriter & protobuf, size_t & value_index) const -{ - if (value_index) - return; - value_index = static_cast( - protobuf.writeAggregateFunction(function, assert_cast(column).getData()[row_num])); -} - -void DataTypeAggregateFunction::deserializeProtobuf(IColumn & column, ProtobufReader & protobuf, bool allow_add_row, bool & row_added) const -{ - row_added = false; - ColumnAggregateFunction & column_concrete = assert_cast(column); - Arena & arena = column_concrete.createOrGetArena(); - size_t size_of_state = function->sizeOfData(); - AggregateDataPtr place = arena.alignedAlloc(size_of_state, function->alignOfData()); - function->create(place); - try - { - if (!protobuf.readAggregateFunction(function, place, arena)) - { - function->destroy(place); - return; - } - auto & container = column_concrete.getData(); - if (allow_add_row) - { - container.emplace_back(place); - row_added = true; - } - else - container.back() = place; - } - catch (...) 
- { - function->destroy(place); - throw; - } -} - MutableColumnPtr DataTypeAggregateFunction::createColumn() const { return ColumnAggregateFunction::create(function); diff --git a/src/DataTypes/DataTypeAggregateFunction.h b/src/DataTypes/DataTypeAggregateFunction.h index 9ae7c67a803..d07d46fd3ee 100644 --- a/src/DataTypes/DataTypeAggregateFunction.h +++ b/src/DataTypes/DataTypeAggregateFunction.h @@ -59,8 +59,6 @@ public: void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; - void serializeProtobuf(const IColumn & column, size_t row_num, ProtobufWriter & protobuf, size_t & value_index) const override; - void deserializeProtobuf(IColumn & column, ProtobufReader & protobuf, bool allow_add_row, bool & row_added) const override; MutableColumnPtr createColumn() const override; diff --git a/src/DataTypes/DataTypeArray.cpp b/src/DataTypes/DataTypeArray.cpp index 3ad84a8fcd7..27088ab822c 100644 --- a/src/DataTypes/DataTypeArray.cpp +++ b/src/DataTypes/DataTypeArray.cpp @@ -6,7 +6,6 @@ #include #include -#include #include #include #include @@ -522,55 +521,6 @@ void DataTypeArray::deserializeTextCSV(IColumn & column, ReadBuffer & istr, cons } -void DataTypeArray::serializeProtobuf(const IColumn & column, size_t row_num, ProtobufWriter & protobuf, size_t & value_index) const -{ - const ColumnArray & column_array = assert_cast(column); - const ColumnArray::Offsets & offsets = column_array.getOffsets(); - size_t offset = offsets[row_num - 1] + value_index; - size_t next_offset = offsets[row_num]; - const IColumn & nested_column = column_array.getData(); - size_t i; - for (i = offset; i < next_offset; ++i) - { - size_t element_stored = 0; - nested->serializeProtobuf(nested_column, i, protobuf, element_stored); - if (!element_stored) - break; - } - value_index += i - offset; -} - - -void DataTypeArray::deserializeProtobuf(IColumn & column, ProtobufReader & protobuf, bool allow_add_row, bool & row_added) const -{ - row_added = false; - ColumnArray & column_array = assert_cast(column); - IColumn & nested_column = column_array.getData(); - ColumnArray::Offsets & offsets = column_array.getOffsets(); - size_t old_size = offsets.size(); - try - { - bool nested_row_added; - do - nested->deserializeProtobuf(nested_column, protobuf, true, nested_row_added); - while (nested_row_added && protobuf.canReadMoreValues()); - if (allow_add_row) - { - offsets.emplace_back(nested_column.size()); - row_added = true; - } - else - offsets.back() = nested_column.size(); - } - catch (...) 
- { - offsets.resize_assume_reserved(old_size); - nested_column.popBack(nested_column.size() - offsets.back()); - throw; - } -} - - MutableColumnPtr DataTypeArray::createColumn() const { return ColumnArray::create(nested->createColumn(), ColumnArray::ColumnOffsets::create()); diff --git a/src/DataTypes/DataTypeArray.h b/src/DataTypes/DataTypeArray.h index ba19ad021be..4185163e2e7 100644 --- a/src/DataTypes/DataTypeArray.h +++ b/src/DataTypes/DataTypeArray.h @@ -85,15 +85,6 @@ public: DeserializeBinaryBulkStatePtr & state, SubstreamsCache * cache) const override; - void serializeProtobuf(const IColumn & column, - size_t row_num, - ProtobufWriter & protobuf, - size_t & value_index) const override; - void deserializeProtobuf(IColumn & column, - ProtobufReader & protobuf, - bool allow_add_row, - bool & row_added) const override; - MutableColumnPtr createColumn() const override; Field getDefault() const override; diff --git a/src/DataTypes/DataTypeDate.cpp b/src/DataTypes/DataTypeDate.cpp index 2c1dfcbb0fe..192a89cc454 100644 --- a/src/DataTypes/DataTypeDate.cpp +++ b/src/DataTypes/DataTypeDate.cpp @@ -4,8 +4,6 @@ #include #include #include -#include -#include #include @@ -81,30 +79,6 @@ void DataTypeDate::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const assert_cast(column).getData().push_back(value.getDayNum()); } -void DataTypeDate::serializeProtobuf(const IColumn & column, size_t row_num, ProtobufWriter & protobuf, size_t & value_index) const -{ - if (value_index) - return; - value_index = static_cast(protobuf.writeDate(DayNum(assert_cast(column).getData()[row_num]))); -} - -void DataTypeDate::deserializeProtobuf(IColumn & column, ProtobufReader & protobuf, bool allow_add_row, bool & row_added) const -{ - row_added = false; - DayNum d; - if (!protobuf.readDate(d)) - return; - - auto & container = assert_cast(column).getData(); - if (allow_add_row) - { - container.emplace_back(d); - row_added = true; - } - else - container.back() = d; -} - bool DataTypeDate::equals(const IDataType & rhs) const { return typeid(rhs) == typeid(*this); diff --git a/src/DataTypes/DataTypeDate.h b/src/DataTypes/DataTypeDate.h index 00afba424e4..496d7fe0b22 100644 --- a/src/DataTypes/DataTypeDate.h +++ b/src/DataTypes/DataTypeDate.h @@ -24,8 +24,6 @@ public: void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; - void serializeProtobuf(const IColumn & column, size_t row_num, ProtobufWriter & protobuf, size_t & value_index) const override; - void deserializeProtobuf(IColumn & column, ProtobufReader & protobuf, bool allow_add_row, bool & row_added) const override; bool canBeUsedAsVersion() const override { return true; } bool canBeInsideNullable() const override { return true; } diff --git a/src/DataTypes/DataTypeDateTime.cpp b/src/DataTypes/DataTypeDateTime.cpp index bfb4473e429..d2bbb4a1efa 100644 --- a/src/DataTypes/DataTypeDateTime.cpp +++ b/src/DataTypes/DataTypeDateTime.cpp @@ -5,8 +5,6 @@ #include #include #include -#include -#include #include #include #include @@ -164,32 +162,6 @@ void DataTypeDateTime::deserializeTextCSV(IColumn & column, ReadBuffer & istr, c assert_cast(column).getData().push_back(x); } -void DataTypeDateTime::serializeProtobuf(const IColumn & column, size_t row_num, ProtobufWriter & protobuf, size_t & 
value_index) const -{ - if (value_index) - return; - - // On some platforms `time_t` is `long` but not `unsigned int` (UInt32 that we store in column), hence static_cast. - value_index = static_cast(protobuf.writeDateTime(static_cast(assert_cast(column).getData()[row_num]))); -} - -void DataTypeDateTime::deserializeProtobuf(IColumn & column, ProtobufReader & protobuf, bool allow_add_row, bool & row_added) const -{ - row_added = false; - time_t t; - if (!protobuf.readDateTime(t)) - return; - - auto & container = assert_cast(column).getData(); - if (allow_add_row) - { - container.emplace_back(t); - row_added = true; - } - else - container.back() = t; -} - bool DataTypeDateTime::equals(const IDataType & rhs) const { /// DateTime with different timezones are equal, because: diff --git a/src/DataTypes/DataTypeDateTime.h b/src/DataTypes/DataTypeDateTime.h index 47c7f361091..edec889309b 100644 --- a/src/DataTypes/DataTypeDateTime.h +++ b/src/DataTypes/DataTypeDateTime.h @@ -68,8 +68,6 @@ public: void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; - void serializeProtobuf(const IColumn & column, size_t row_num, ProtobufWriter & protobuf, size_t & value_index) const override; - void deserializeProtobuf(IColumn & column, ProtobufReader & protobuf, bool allow_add_row, bool & row_added) const override; bool canBeUsedAsVersion() const override { return true; } bool canBeInsideNullable() const override { return true; } diff --git a/src/DataTypes/DataTypeDateTime64.cpp b/src/DataTypes/DataTypeDateTime64.cpp index ef1a971510a..09e39c2de1a 100644 --- a/src/DataTypes/DataTypeDateTime64.cpp +++ b/src/DataTypes/DataTypeDateTime64.cpp @@ -6,8 +6,6 @@ #include #include #include -#include -#include #include #include #include @@ -182,30 +180,6 @@ void DataTypeDateTime64::deserializeTextCSV(IColumn & column, ReadBuffer & istr, assert_cast(column).getData().push_back(x); } -void DataTypeDateTime64::serializeProtobuf(const IColumn & column, size_t row_num, ProtobufWriter & protobuf, size_t & value_index) const -{ - if (value_index) - return; - value_index = static_cast(protobuf.writeDateTime64(assert_cast(column).getData()[row_num], scale)); -} - -void DataTypeDateTime64::deserializeProtobuf(IColumn & column, ProtobufReader & protobuf, bool allow_add_row, bool & row_added) const -{ - row_added = false; - DateTime64 t = 0; - if (!protobuf.readDateTime64(t, scale)) - return; - - auto & container = assert_cast(column).getData(); - if (allow_add_row) - { - container.emplace_back(t); - row_added = true; - } - else - container.back() = t; -} - bool DataTypeDateTime64::equals(const IDataType & rhs) const { if (const auto * ptype = typeid_cast(&rhs)) diff --git a/src/DataTypes/DataTypeDateTime64.h b/src/DataTypes/DataTypeDateTime64.h index 003e83b7195..198c3739f58 100644 --- a/src/DataTypes/DataTypeDateTime64.h +++ b/src/DataTypes/DataTypeDateTime64.h @@ -42,8 +42,6 @@ public: void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; - void serializeProtobuf(const IColumn & 
column, size_t row_num, ProtobufWriter & protobuf, size_t & value_index) const override; - void deserializeProtobuf(IColumn & column, ProtobufReader & protobuf, bool allow_add_row, bool & row_added) const override; bool equals(const IDataType & rhs) const override; diff --git a/src/DataTypes/DataTypeDecimalBase.cpp b/src/DataTypes/DataTypeDecimalBase.cpp index 9fb445ab00d..ab17996167c 100644 --- a/src/DataTypes/DataTypeDecimalBase.cpp +++ b/src/DataTypes/DataTypeDecimalBase.cpp @@ -4,8 +4,6 @@ #include #include #include -#include -#include #include #include #include diff --git a/src/DataTypes/DataTypeEnum.cpp b/src/DataTypes/DataTypeEnum.cpp index 650a1da6407..043c971266c 100644 --- a/src/DataTypes/DataTypeEnum.cpp +++ b/src/DataTypes/DataTypeEnum.cpp @@ -1,7 +1,5 @@ #include #include -#include -#include #include #include #include @@ -254,34 +252,6 @@ void DataTypeEnum::deserializeBinaryBulk( x.resize(initial_size + size / sizeof(FieldType)); } -template -void DataTypeEnum::serializeProtobuf(const IColumn & column, size_t row_num, ProtobufWriter & protobuf, size_t & value_index) const -{ - if (value_index) - return; - protobuf.prepareEnumMapping(values); - value_index = static_cast(protobuf.writeEnum(assert_cast(column).getData()[row_num])); -} - -template -void DataTypeEnum::deserializeProtobuf(IColumn & column, ProtobufReader & protobuf, bool allow_add_row, bool & row_added) const -{ - protobuf.prepareEnumMapping(values); - row_added = false; - Type value; - if (!protobuf.readEnum(value)) - return; - - auto & container = assert_cast(column).getData(); - if (allow_add_row) - { - container.emplace_back(value); - row_added = true; - } - else - container.back() = value; -} - template Field DataTypeEnum::getDefault() const { diff --git a/src/DataTypes/DataTypeEnum.h b/src/DataTypes/DataTypeEnum.h index c75d348f15c..003613edb98 100644 --- a/src/DataTypes/DataTypeEnum.h +++ b/src/DataTypes/DataTypeEnum.h @@ -132,9 +132,6 @@ public: void serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, const size_t offset, size_t limit) const override; void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, const size_t limit, const double avg_value_size_hint) const override; - void serializeProtobuf(const IColumn & column, size_t row_num, ProtobufWriter & protobuf, size_t & value_index) const override; - void deserializeProtobuf(IColumn & column, ProtobufReader & protobuf, bool allow_add_row, bool & row_added) const override; - MutableColumnPtr createColumn() const override { return ColumnType::create(); } Field getDefault() const override; diff --git a/src/DataTypes/DataTypeFixedString.cpp b/src/DataTypes/DataTypeFixedString.cpp index 585c5709be7..21cfe855169 100644 --- a/src/DataTypes/DataTypeFixedString.cpp +++ b/src/DataTypes/DataTypeFixedString.cpp @@ -2,8 +2,6 @@ #include #include -#include -#include #include #include @@ -25,7 +23,6 @@ namespace DB namespace ErrorCodes { extern const int CANNOT_READ_ALL_DATA; - extern const int TOO_LARGE_STRING_SIZE; extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int UNEXPECTED_AST_STRUCTURE; } @@ -127,16 +124,7 @@ static inline void alignStringLength(const DataTypeFixedString & type, ColumnFixedString::Chars & data, size_t string_start) { - size_t length = data.size() - string_start; - if (length < type.getN()) - { - data.resize_fill(string_start + type.getN()); - } - else if (length > type.getN()) - { - data.resize_assume_reserved(string_start); - throw Exception("Too large value for " + type.getName(), 
ErrorCodes::TOO_LARGE_STRING_SIZE); - } + ColumnFixedString::alignStringLength(data, type.getN(), string_start); } template @@ -215,53 +203,6 @@ void DataTypeFixedString::deserializeTextCSV(IColumn & column, ReadBuffer & istr } -void DataTypeFixedString::serializeProtobuf(const IColumn & column, size_t row_num, ProtobufWriter & protobuf, size_t & value_index) const -{ - if (value_index) - return; - const char * pos = reinterpret_cast(&assert_cast(column).getChars()[n * row_num]); - value_index = static_cast(protobuf.writeString(StringRef(pos, n))); -} - - -void DataTypeFixedString::deserializeProtobuf(IColumn & column, ProtobufReader & protobuf, bool allow_add_row, bool & row_added) const -{ - row_added = false; - auto & column_string = assert_cast(column); - ColumnFixedString::Chars & data = column_string.getChars(); - size_t old_size = data.size(); - try - { - if (allow_add_row) - { - if (protobuf.readStringInto(data)) - { - alignStringLength(*this, data, old_size); - row_added = true; - } - else - data.resize_assume_reserved(old_size); - } - else - { - ColumnFixedString::Chars temp_data; - if (protobuf.readStringInto(temp_data)) - { - alignStringLength(*this, temp_data, 0); - column_string.popBack(1); - old_size = data.size(); - data.insertSmallAllowReadWriteOverflow15(temp_data.begin(), temp_data.end()); - } - } - } - catch (...) - { - data.resize_assume_reserved(old_size); - throw; - } -} - - MutableColumnPtr DataTypeFixedString::createColumn() const { return ColumnFixedString::create(n); diff --git a/src/DataTypes/DataTypeFixedString.h b/src/DataTypes/DataTypeFixedString.h index e410d1b0596..af82e4b5d11 100644 --- a/src/DataTypes/DataTypeFixedString.h +++ b/src/DataTypes/DataTypeFixedString.h @@ -66,9 +66,6 @@ public: void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void serializeProtobuf(const IColumn & column, size_t row_num, ProtobufWriter & protobuf, size_t & value_index) const override; - void deserializeProtobuf(IColumn & column, ProtobufReader & protobuf, bool allow_add_row, bool & row_added) const override; - MutableColumnPtr createColumn() const override; Field getDefault() const override; diff --git a/src/DataTypes/DataTypeLowCardinality.cpp b/src/DataTypes/DataTypeLowCardinality.cpp index 9614c150c7d..1b21b7de4bc 100644 --- a/src/DataTypes/DataTypeLowCardinality.cpp +++ b/src/DataTypes/DataTypeLowCardinality.cpp @@ -808,31 +808,6 @@ void DataTypeLowCardinality::serializeTextXML(const IColumn & column, size_t row serializeImpl(column, row_num, &IDataType::serializeAsTextXML, ostr, settings); } -void DataTypeLowCardinality::serializeProtobuf(const IColumn & column, size_t row_num, ProtobufWriter & protobuf, size_t & value_index) const -{ - serializeImpl(column, row_num, &IDataType::serializeProtobuf, protobuf, value_index); -} - -void DataTypeLowCardinality::deserializeProtobuf(IColumn & column, ProtobufReader & protobuf, bool allow_add_row, bool & row_added) const -{ - if (allow_add_row) - { - deserializeImpl(column, &IDataType::deserializeProtobuf, protobuf, true, row_added); - return; - } - - row_added = false; - auto & low_cardinality_column= getColumnLowCardinality(column); - auto nested_column = low_cardinality_column.getDictionary().getNestedColumn(); - auto temp_column = nested_column->cloneEmpty(); - size_t unique_row_number = 
low_cardinality_column.getIndexes().getUInt(low_cardinality_column.size() - 1); - temp_column->insertFrom(*nested_column, unique_row_number); - bool dummy; - dictionary_type.get()->deserializeProtobuf(*temp_column, protobuf, false, dummy); - low_cardinality_column.popBack(1); - low_cardinality_column.insertFromFullColumn(*temp_column, 0); -} - template void DataTypeLowCardinality::serializeImpl( const IColumn & column, size_t row_num, DataTypeLowCardinality::SerializeFunctionPtr func, Args &&... args) const diff --git a/src/DataTypes/DataTypeLowCardinality.h b/src/DataTypes/DataTypeLowCardinality.h index 6ed2b792ce3..14beb423f1f 100644 --- a/src/DataTypes/DataTypeLowCardinality.h +++ b/src/DataTypes/DataTypeLowCardinality.h @@ -65,8 +65,6 @@ public: void serializeTextJSON(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const override; - void serializeProtobuf(const IColumn & column, size_t row_num, ProtobufWriter & protobuf, size_t & value_index) const override; - void deserializeProtobuf(IColumn & column, ProtobufReader & protobuf, bool allow_add_row, bool & row_added) const override; MutableColumnPtr createColumn() const override; diff --git a/src/DataTypes/DataTypeMap.cpp b/src/DataTypes/DataTypeMap.cpp index af2ed8805e8..9972452862f 100644 --- a/src/DataTypes/DataTypeMap.cpp +++ b/src/DataTypes/DataTypeMap.cpp @@ -336,16 +336,6 @@ void DataTypeMap::deserializeBinaryBulkWithMultipleStreamsImpl( nested->deserializeBinaryBulkWithMultipleStreams(column_map.getNestedColumnPtr(), limit, settings, state, cache); } -void DataTypeMap::serializeProtobuf(const IColumn & column, size_t row_num, ProtobufWriter & protobuf, size_t & value_index) const -{ - nested->serializeProtobuf(extractNestedColumn(column), row_num, protobuf, value_index); -} - -void DataTypeMap::deserializeProtobuf(IColumn & column, ProtobufReader & protobuf, bool allow_add_row, bool & row_added) const -{ - nested->deserializeProtobuf(extractNestedColumn(column), protobuf, allow_add_row, row_added); -} - MutableColumnPtr DataTypeMap::createColumn() const { return ColumnMap::create(nested->createColumn()); diff --git a/src/DataTypes/DataTypeMap.h b/src/DataTypes/DataTypeMap.h index ea495f05548..88ea44a0d5a 100644 --- a/src/DataTypes/DataTypeMap.h +++ b/src/DataTypes/DataTypeMap.h @@ -76,9 +76,6 @@ public: DeserializeBinaryBulkStatePtr & state, SubstreamsCache * cache) const override; - void serializeProtobuf(const IColumn & column, size_t row_num, ProtobufWriter & protobuf, size_t & value_index) const override; - void deserializeProtobuf(IColumn & column, ProtobufReader & protobuf, bool allow_add_row, bool & row_added) const override; - MutableColumnPtr createColumn() const override; Field getDefault() const override; @@ -92,6 +89,8 @@ public: const DataTypePtr & getValueType() const { return value_type; } DataTypes getKeyValueTypes() const { return {key_type, value_type}; } + const DataTypePtr & getNestedType() const { return nested; } + private: template void serializeTextImpl(const IColumn & column, size_t row_num, WriteBuffer & ostr, Writer && writer) const; diff --git a/src/DataTypes/DataTypeNullable.cpp b/src/DataTypes/DataTypeNullable.cpp index c3b734686f8..903ebeb3ddc 100644 --- a/src/DataTypes/DataTypeNullable.cpp +++ 
b/src/DataTypes/DataTypeNullable.cpp @@ -486,33 +486,6 @@ void DataTypeNullable::serializeTextXML(const IColumn & column, size_t row_num, nested_data_type->serializeAsTextXML(col.getNestedColumn(), row_num, ostr, settings); } -void DataTypeNullable::serializeProtobuf(const IColumn & column, size_t row_num, ProtobufWriter & protobuf, size_t & value_index) const -{ - const ColumnNullable & col = assert_cast(column); - if (!col.isNullAt(row_num)) - nested_data_type->serializeProtobuf(col.getNestedColumn(), row_num, protobuf, value_index); -} - -void DataTypeNullable::deserializeProtobuf(IColumn & column, ProtobufReader & protobuf, bool allow_add_row, bool & row_added) const -{ - ColumnNullable & col = assert_cast(column); - IColumn & nested_column = col.getNestedColumn(); - size_t old_size = nested_column.size(); - try - { - nested_data_type->deserializeProtobuf(nested_column, protobuf, allow_add_row, row_added); - if (row_added) - col.getNullMapData().push_back(0); - } - catch (...) - { - nested_column.popBack(nested_column.size() - old_size); - col.getNullMapData().resize_assume_reserved(old_size); - row_added = false; - throw; - } -} - MutableColumnPtr DataTypeNullable::createColumn() const { return ColumnNullable::create(nested_data_type->createColumn(), ColumnUInt8::create()); diff --git a/src/DataTypes/DataTypeNullable.h b/src/DataTypes/DataTypeNullable.h index db641faf0af..5e71a1bee4d 100644 --- a/src/DataTypes/DataTypeNullable.h +++ b/src/DataTypes/DataTypeNullable.h @@ -73,9 +73,6 @@ public: void serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; - void serializeProtobuf(const IColumn & column, size_t row_num, ProtobufWriter & protobuf, size_t & value_index) const override; - void deserializeProtobuf(IColumn & column, ProtobufReader & protobuf, bool allow_add_row, bool & row_added) const override; - MutableColumnPtr createColumn() const override; Field getDefault() const override; diff --git a/src/DataTypes/DataTypeNumberBase.cpp b/src/DataTypes/DataTypeNumberBase.cpp index a9b9bbc8090..ae3e6762d27 100644 --- a/src/DataTypes/DataTypeNumberBase.cpp +++ b/src/DataTypes/DataTypeNumberBase.cpp @@ -8,8 +8,6 @@ #include #include #include -#include -#include namespace DB @@ -205,34 +203,6 @@ void DataTypeNumberBase::deserializeBinaryBulk(IColumn & column, ReadBuffer & } -template -void DataTypeNumberBase::serializeProtobuf(const IColumn & column, size_t row_num, ProtobufWriter & protobuf, size_t & value_index) const -{ - if (value_index) - return; - value_index = static_cast(protobuf.writeNumber(assert_cast &>(column).getData()[row_num])); -} - - -template -void DataTypeNumberBase::deserializeProtobuf(IColumn & column, ProtobufReader & protobuf, bool allow_add_row, bool & row_added) const -{ - row_added = false; - T value; - if (!protobuf.readNumber(value)) - return; - - auto & container = typeid_cast &>(column).getData(); - if (allow_add_row) - { - container.emplace_back(value); - row_added = true; - } - else - container.back() = value; -} - - template MutableColumnPtr DataTypeNumberBase::createColumn() const { diff --git a/src/DataTypes/DataTypeNumberBase.h b/src/DataTypes/DataTypeNumberBase.h index 1491eabfbd5..22a70ac7277 100644 --- a/src/DataTypes/DataTypeNumberBase.h +++ b/src/DataTypes/DataTypeNumberBase.h @@ -45,9 +45,6 @@ public: void serializeBinaryBulk(const IColumn & column, WriteBuffer & ostr, 
size_t offset, size_t limit) const override; void deserializeBinaryBulk(IColumn & column, ReadBuffer & istr, size_t limit, double avg_value_size_hint) const override; - void serializeProtobuf(const IColumn & column, size_t row_num, ProtobufWriter & protobuf, size_t & value_index) const override; - void deserializeProtobuf(IColumn & column, ProtobufReader & protobuf, bool allow_add_row, bool & row_added) const override; - MutableColumnPtr createColumn() const override; bool isParametric() const override { return false; } diff --git a/src/DataTypes/DataTypeString.cpp b/src/DataTypes/DataTypeString.cpp index c752d136642..d760df5075d 100644 --- a/src/DataTypes/DataTypeString.cpp +++ b/src/DataTypes/DataTypeString.cpp @@ -9,8 +9,6 @@ #include #include -#include -#include #include #include @@ -311,55 +309,6 @@ void DataTypeString::deserializeTextCSV(IColumn & column, ReadBuffer & istr, con } -void DataTypeString::serializeProtobuf(const IColumn & column, size_t row_num, ProtobufWriter & protobuf, size_t & value_index) const -{ - if (value_index) - return; - value_index = static_cast(protobuf.writeString(assert_cast(column).getDataAt(row_num))); -} - - -void DataTypeString::deserializeProtobuf(IColumn & column, ProtobufReader & protobuf, bool allow_add_row, bool & row_added) const -{ - row_added = false; - auto & column_string = assert_cast(column); - ColumnString::Chars & data = column_string.getChars(); - ColumnString::Offsets & offsets = column_string.getOffsets(); - size_t old_size = offsets.size(); - try - { - if (allow_add_row) - { - if (protobuf.readStringInto(data)) - { - data.emplace_back(0); - offsets.emplace_back(data.size()); - row_added = true; - } - else - data.resize_assume_reserved(offsets.back()); - } - else - { - ColumnString::Chars temp_data; - if (protobuf.readStringInto(temp_data)) - { - temp_data.emplace_back(0); - column_string.popBack(1); - old_size = offsets.size(); - data.insertSmallAllowReadWriteOverflow15(temp_data.begin(), temp_data.end()); - offsets.emplace_back(data.size()); - } - } - } - catch (...) 
- { - offsets.resize_assume_reserved(old_size); - data.resize_assume_reserved(offsets.back()); - throw; - } -} - Field DataTypeString::getDefault() const { return String(); diff --git a/src/DataTypes/DataTypeString.h b/src/DataTypes/DataTypeString.h index f6db8fe73d4..7f8aa1fd0cf 100644 --- a/src/DataTypes/DataTypeString.h +++ b/src/DataTypes/DataTypeString.h @@ -47,9 +47,6 @@ public: void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void serializeProtobuf(const IColumn & column, size_t row_num, ProtobufWriter & protobuf, size_t & value_index) const override; - void deserializeProtobuf(IColumn & column, ProtobufReader & protobuf, bool allow_add_row, bool & row_added) const override; - MutableColumnPtr createColumn() const override; Field getDefault() const override; diff --git a/src/DataTypes/DataTypeTuple.cpp b/src/DataTypes/DataTypeTuple.cpp index c62aa1c1187..2261e776ea2 100644 --- a/src/DataTypes/DataTypeTuple.cpp +++ b/src/DataTypes/DataTypeTuple.cpp @@ -504,33 +504,6 @@ void DataTypeTuple::deserializeBinaryBulkWithMultipleStreamsImpl( settings.path.pop_back(); } -void DataTypeTuple::serializeProtobuf(const IColumn & column, size_t row_num, ProtobufWriter & protobuf, size_t & value_index) const -{ - for (; value_index < elems.size(); ++value_index) - { - size_t stored = 0; - elems[value_index]->serializeProtobuf(extractElementColumn(column, value_index), row_num, protobuf, stored); - if (!stored) - break; - } -} - -void DataTypeTuple::deserializeProtobuf(IColumn & column, ProtobufReader & protobuf, bool allow_add_row, bool & row_added) const -{ - row_added = false; - bool all_elements_get_row = true; - addElementSafe(elems, column, [&] - { - for (const auto & i : ext::range(0, ext::size(elems))) - { - bool element_row_added; - elems[i]->deserializeProtobuf(extractElementColumn(column, i), protobuf, allow_add_row, element_row_added); - all_elements_get_row &= element_row_added; - } - }); - row_added = all_elements_get_row; -} - MutableColumnPtr DataTypeTuple::createColumn() const { size_t size = elems.size(); diff --git a/src/DataTypes/DataTypeTuple.h b/src/DataTypes/DataTypeTuple.h index 0b28ebe5a63..12ccf574c0e 100644 --- a/src/DataTypes/DataTypeTuple.h +++ b/src/DataTypes/DataTypeTuple.h @@ -81,9 +81,6 @@ public: DeserializeBinaryBulkStatePtr & state, SubstreamsCache * cache) const override; - void serializeProtobuf(const IColumn & column, size_t row_num, ProtobufWriter & protobuf, size_t & value_index) const override; - void deserializeProtobuf(IColumn & column, ProtobufReader & protobuf, bool allow_add_row, bool & row_added) const override; - MutableColumnPtr createColumn() const override; Field getDefault() const override; diff --git a/src/DataTypes/DataTypeUUID.cpp b/src/DataTypes/DataTypeUUID.cpp index 94a043eb472..b66cbadaef0 100644 --- a/src/DataTypes/DataTypeUUID.cpp +++ b/src/DataTypes/DataTypeUUID.cpp @@ -1,8 +1,6 @@ #include #include #include -#include -#include #include #include #include @@ -79,30 +77,6 @@ void DataTypeUUID::deserializeTextCSV(IColumn & column, ReadBuffer & istr, const assert_cast(column).getData().push_back(value); } -void DataTypeUUID::serializeProtobuf(const IColumn & column, size_t row_num, ProtobufWriter & protobuf, size_t & value_index) const -{ - if (value_index) - return; - value_index = static_cast(protobuf.writeUUID(UUID(assert_cast(column).getData()[row_num]))); -} - 
-void DataTypeUUID::deserializeProtobuf(IColumn & column, ProtobufReader & protobuf, bool allow_add_row, bool & row_added) const -{ - row_added = false; - UUID uuid; - if (!protobuf.readUUID(uuid)) - return; - - auto & container = assert_cast(column).getData(); - if (allow_add_row) - { - container.emplace_back(uuid); - row_added = true; - } - else - container.back() = uuid; -} - bool DataTypeUUID::equals(const IDataType & rhs) const { return typeid(rhs) == typeid(*this); diff --git a/src/DataTypes/DataTypeUUID.h b/src/DataTypes/DataTypeUUID.h index 6290d05cc3b..de0c7c7d8cf 100644 --- a/src/DataTypes/DataTypeUUID.h +++ b/src/DataTypes/DataTypeUUID.h @@ -26,8 +26,6 @@ public: void deserializeTextJSON(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings & settings) const override; - void serializeProtobuf(const IColumn & column, size_t row_num, ProtobufWriter & protobuf, size_t & value_index) const override; - void deserializeProtobuf(IColumn & column, ProtobufReader & protobuf, bool allow_add_row, bool & row_added) const override; bool canBeUsedInBitOperations() const override { return true; } bool canBeInsideNullable() const override { return true; } diff --git a/src/DataTypes/DataTypesDecimal.cpp b/src/DataTypes/DataTypesDecimal.cpp index 6c325c5d371..e174a242462 100644 --- a/src/DataTypes/DataTypesDecimal.cpp +++ b/src/DataTypes/DataTypesDecimal.cpp @@ -4,8 +4,6 @@ #include #include #include -#include -#include #include #include #include @@ -111,33 +109,6 @@ T DataTypeDecimal::parseFromString(const String & str) const return x; } -template -void DataTypeDecimal::serializeProtobuf(const IColumn & column, size_t row_num, ProtobufWriter & protobuf, size_t & value_index) const -{ - if (value_index) - return; - value_index = static_cast(protobuf.writeDecimal(assert_cast(column).getData()[row_num], this->scale)); -} - - -template -void DataTypeDecimal::deserializeProtobuf(IColumn & column, ProtobufReader & protobuf, bool allow_add_row, bool & row_added) const -{ - row_added = false; - T decimal; - if (!protobuf.readDecimal(decimal, this->precision, this->scale)) - return; - - auto & container = assert_cast(column).getData(); - if (allow_add_row) - { - container.emplace_back(decimal); - row_added = true; - } - else - container.back() = decimal; -} - static DataTypePtr create(const ASTPtr & arguments) { diff --git a/src/DataTypes/DataTypesDecimal.h b/src/DataTypes/DataTypesDecimal.h index 3f7b4e2ac63..08f44c60c41 100644 --- a/src/DataTypes/DataTypesDecimal.h +++ b/src/DataTypes/DataTypesDecimal.h @@ -46,9 +46,6 @@ public: void deserializeText(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; void deserializeTextCSV(IColumn & column, ReadBuffer & istr, const FormatSettings &) const override; - void serializeProtobuf(const IColumn & column, size_t row_num, ProtobufWriter & protobuf, size_t & value_index) const override; - void deserializeProtobuf(IColumn & column, ProtobufReader & protobuf, bool allow_add_row, bool & row_added) const override; - bool equals(const IDataType & rhs) const override; T parseFromString(const String & str) const; diff --git a/src/DataTypes/IDataType.h b/src/DataTypes/IDataType.h index dba5bc3f5a9..c9c848a8037 100644 --- a/src/DataTypes/IDataType.h +++ b/src/DataTypes/IDataType.h @@ -26,9 +26,6 @@ class Field; using 
DataTypePtr = std::shared_ptr; using DataTypes = std::vector; -class ProtobufReader; -class ProtobufWriter; - struct NameAndTypePair; @@ -235,10 +232,6 @@ public: /// If method will throw an exception, then column will be in same state as before call to method. virtual void deserializeBinary(IColumn & column, ReadBuffer & istr) const = 0; - /** Serialize to a protobuf. */ - virtual void serializeProtobuf(const IColumn & column, size_t row_num, ProtobufWriter & protobuf, size_t & value_index) const = 0; - virtual void deserializeProtobuf(IColumn & column, ProtobufReader & protobuf, bool allow_add_row, bool & row_added) const = 0; - /** Text serialization with escaping but without quoting. */ void serializeAsTextEscaped(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings &) const; diff --git a/src/DataTypes/IDataTypeDummy.h b/src/DataTypes/IDataTypeDummy.h index f27359e5f74..08cc0778a6e 100644 --- a/src/DataTypes/IDataTypeDummy.h +++ b/src/DataTypes/IDataTypeDummy.h @@ -34,8 +34,6 @@ public: void deserializeBinaryBulk(IColumn &, ReadBuffer &, size_t, double) const override { throwNoSerialization(); } void serializeText(const IColumn &, size_t, WriteBuffer &, const FormatSettings &) const override { throwNoSerialization(); } void deserializeText(IColumn &, ReadBuffer &, const FormatSettings &) const override { throwNoSerialization(); } - void serializeProtobuf(const IColumn &, size_t, ProtobufWriter &, size_t &) const override { throwNoSerialization(); } - void deserializeProtobuf(IColumn &, ProtobufReader &, bool, bool &) const override { throwNoSerialization(); } MutableColumnPtr createColumn() const override { diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index 3f031fa2311..c1f02c65748 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -120,7 +120,6 @@ struct FormatSettings struct { - bool write_row_delimiters = true; /** * Some buffers (kafka / rabbit) split the rows internally using callback, * and always send one row per message, so we can push there formats @@ -128,7 +127,7 @@ struct FormatSettings * we have to enforce exporting at most one row in the format output, * because Protobuf without delimiters is not generally useful. 
*/ - bool allow_many_rows_no_delimiters = false; + bool allow_multiple_rows_without_delimiter = false; } protobuf; struct diff --git a/src/Formats/ProtobufColumnMatcher.cpp b/src/Formats/ProtobufColumnMatcher.cpp deleted file mode 100644 index f4803d1af10..00000000000 --- a/src/Formats/ProtobufColumnMatcher.cpp +++ /dev/null @@ -1,55 +0,0 @@ -#include "ProtobufColumnMatcher.h" -#if USE_PROTOBUF -#include -#include -#include - - -namespace DB -{ -namespace ErrorCodes -{ - extern const int NO_COMMON_COLUMNS_WITH_PROTOBUF_SCHEMA; -} - - -namespace -{ - String columnNameToSearchableForm(const String & str) - { - return Poco::replace(Poco::toUpper(str), ".", "_"); - } -} - -namespace ProtobufColumnMatcher -{ - namespace details - { - ColumnNameMatcher::ColumnNameMatcher(const std::vector & column_names) : column_usage(column_names.size()) - { - column_usage.resize(column_names.size(), false); - for (size_t i = 0; i != column_names.size(); ++i) - column_name_to_index_map.emplace(columnNameToSearchableForm(column_names[i]), i); - } - - size_t ColumnNameMatcher::findColumn(const String & field_name) - { - auto it = column_name_to_index_map.find(columnNameToSearchableForm(field_name)); - if (it == column_name_to_index_map.end()) - return -1; - size_t column_index = it->second; - if (column_usage[column_index]) - return -1; - column_usage[column_index] = true; - return column_index; - } - - void throwNoCommonColumns() - { - throw Exception("No common columns with provided protobuf schema", ErrorCodes::NO_COMMON_COLUMNS_WITH_PROTOBUF_SCHEMA); - } - } -} - -} -#endif diff --git a/src/Formats/ProtobufColumnMatcher.h b/src/Formats/ProtobufColumnMatcher.h deleted file mode 100644 index 35521be7a9b..00000000000 --- a/src/Formats/ProtobufColumnMatcher.h +++ /dev/null @@ -1,196 +0,0 @@ -#pragma once - -#if !defined(ARCADIA_BUILD) -# include "config_formats.h" -#endif - -#if USE_PROTOBUF -# include -# include -# include -# include -# include -# include -# include - -namespace google -{ -namespace protobuf -{ - class Descriptor; - class FieldDescriptor; -} -} - - -namespace DB -{ -namespace ProtobufColumnMatcher -{ - struct DefaultTraits - { - using MessageData = boost::blank; - using FieldData = boost::blank; - }; - - template - struct Message; - - /// Represents a field in a protobuf message. - template - struct Field - { - const google::protobuf::FieldDescriptor * field_descriptor = nullptr; - - /// Same as field_descriptor->number(). - UInt32 field_number = 0; - - /// Index of a column; either 'column_index' or 'nested_message' is set. - size_t column_index = -1; - std::unique_ptr> nested_message; - - typename Traits::FieldData data; - }; - - /// Represents a protobuf message. - template - struct Message - { - std::vector> fields; - - /// Points to the parent message if this is a nested message. - Message * parent = nullptr; - size_t index_in_parent = -1; - - typename Traits::MessageData data; - }; - - /// Utility function finding matching columns for each protobuf field. 
- template - static std::unique_ptr> matchColumns( - const std::vector & column_names, - const google::protobuf::Descriptor * message_type); - - template - static std::unique_ptr> matchColumns( - const std::vector & column_names, - const google::protobuf::Descriptor * message_type, - std::vector & field_descriptors_without_match); - - namespace details - { - [[noreturn]] void throwNoCommonColumns(); - - class ColumnNameMatcher - { - public: - ColumnNameMatcher(const std::vector & column_names); - size_t findColumn(const String & field_name); - - private: - std::unordered_map column_name_to_index_map; - std::vector column_usage; - }; - - template - std::unique_ptr> matchColumnsRecursive( - ColumnNameMatcher & name_matcher, - const google::protobuf::Descriptor * message_type, - const String & field_name_prefix, - std::vector * field_descriptors_without_match) - { - auto message = std::make_unique>(); - for (int i = 0; i != message_type->field_count(); ++i) - { - const google::protobuf::FieldDescriptor * field_descriptor = message_type->field(i); - if ((field_descriptor->type() == google::protobuf::FieldDescriptor::TYPE_MESSAGE) - || (field_descriptor->type() == google::protobuf::FieldDescriptor::TYPE_GROUP)) - { - auto nested_message = matchColumnsRecursive( - name_matcher, - field_descriptor->message_type(), - field_name_prefix + field_descriptor->name() + ".", - field_descriptors_without_match); - if (nested_message) - { - message->fields.emplace_back(); - auto & current_field = message->fields.back(); - current_field.field_number = field_descriptor->number(); - current_field.field_descriptor = field_descriptor; - current_field.nested_message = std::move(nested_message); - current_field.nested_message->parent = message.get(); - } - } - else - { - size_t column_index = name_matcher.findColumn(field_name_prefix + field_descriptor->name()); - if (column_index == static_cast(-1)) - { - if (field_descriptors_without_match) - field_descriptors_without_match->emplace_back(field_descriptor); - } - else - { - message->fields.emplace_back(); - auto & current_field = message->fields.back(); - current_field.field_number = field_descriptor->number(); - current_field.field_descriptor = field_descriptor; - current_field.column_index = column_index; - } - } - } - - if (message->fields.empty()) - return nullptr; - - // Columns should be sorted by field_number, it's necessary for writing protobufs and useful reading protobufs. 
- std::sort(message->fields.begin(), message->fields.end(), [](const Field & left, const Field & right) - { - return left.field_number < right.field_number; - }); - - for (size_t i = 0; i != message->fields.size(); ++i) - { - auto & field = message->fields[i]; - if (field.nested_message) - field.nested_message->index_in_parent = i; - } - - return message; - } - } - - template - static std::unique_ptr> matchColumnsImpl( - const std::vector & column_names, - const google::protobuf::Descriptor * message_type, - std::vector * field_descriptors_without_match) - { - details::ColumnNameMatcher name_matcher(column_names); - auto message = details::matchColumnsRecursive(name_matcher, message_type, "", field_descriptors_without_match); - if (!message) - details::throwNoCommonColumns(); - return message; - } - - template - static std::unique_ptr> matchColumns( - const std::vector & column_names, - const google::protobuf::Descriptor * message_type) - { - return matchColumnsImpl(column_names, message_type, nullptr); - } - - template - static std::unique_ptr> matchColumns( - const std::vector & column_names, - const google::protobuf::Descriptor * message_type, - std::vector & field_descriptors_without_match) - { - return matchColumnsImpl(column_names, message_type, &field_descriptors_without_match); - } -} - -} - -#endif diff --git a/src/Formats/ProtobufReader.cpp b/src/Formats/ProtobufReader.cpp index 8f28d279c06..0e05b59badf 100644 --- a/src/Formats/ProtobufReader.cpp +++ b/src/Formats/ProtobufReader.cpp @@ -1,14 +1,7 @@ #include "ProtobufReader.h" #if USE_PROTOBUF -# include -# include -# include -# include -# include -# include -# include -# include +# include namespace DB @@ -16,7 +9,6 @@ namespace DB namespace ErrorCodes { extern const int UNKNOWN_PROTOBUF_FORMAT; - extern const int PROTOBUF_BAD_CAST; } @@ -41,36 +33,21 @@ namespace constexpr Int64 END_OF_FILE = -3; Int64 decodeZigZag(UInt64 n) { return static_cast((n >> 1) ^ (~(n & 1) + 1)); } - } -// SimpleReader is an utility class to deserialize protobufs. -// Knows nothing about protobuf schemas, just provides useful functions to deserialize data. -ProtobufReader::SimpleReader::SimpleReader(ReadBuffer & in_, const bool use_length_delimiters_) +ProtobufReader::ProtobufReader(ReadBuffer & in_) : in(in_) - , cursor(0) - , current_message_level(0) - , current_message_end(0) - , field_end(0) - , last_string_pos(-1) - , use_length_delimiters(use_length_delimiters_) { } -[[noreturn]] void ProtobufReader::SimpleReader::throwUnknownFormat() const -{ - throw Exception(std::string("Protobuf messages are corrupted or don't match the provided schema.") + (use_length_delimiters ? " Please note that Protobuf stream is length-delimited: every message is prefixed by its length in varint." : ""), ErrorCodes::UNKNOWN_PROTOBUF_FORMAT); -} - -bool ProtobufReader::SimpleReader::startMessage() +void ProtobufReader::startMessage(bool with_length_delimiter_) { // Start reading a root message. 
assert(!current_message_level); - if (unlikely(in.eof())) - return false; - if (use_length_delimiters) + root_message_has_length_delimiter = with_length_delimiter_; + if (root_message_has_length_delimiter) { size_t size_of_message = readVarint(); current_message_end = cursor + size_of_message; @@ -80,11 +57,11 @@ bool ProtobufReader::SimpleReader::startMessage() current_message_end = END_OF_FILE; } ++current_message_level; + field_number = next_field_number = 0; field_end = cursor; - return true; } -void ProtobufReader::SimpleReader::endMessage(bool ignore_errors) +void ProtobufReader::endMessage(bool ignore_errors) { if (!current_message_level) return; @@ -94,6 +71,8 @@ void ProtobufReader::SimpleReader::endMessage(bool ignore_errors) { if (cursor < root_message_end) ignore(root_message_end - cursor); + else if (root_message_end == END_OF_FILE) + ignoreAll(); else if (ignore_errors) moveCursorBackward(cursor - root_message_end); else @@ -104,7 +83,7 @@ void ProtobufReader::SimpleReader::endMessage(bool ignore_errors) parent_message_ends.clear(); } -void ProtobufReader::SimpleReader::startNestedMessage() +void ProtobufReader::startNestedMessage() { assert(current_message_level >= 1); if ((cursor > field_end) && (field_end != END_OF_GROUP)) @@ -115,10 +94,11 @@ void ProtobufReader::SimpleReader::startNestedMessage() parent_message_ends.emplace_back(current_message_end); current_message_end = field_end; ++current_message_level; + field_number = next_field_number = 0; field_end = cursor; } -void ProtobufReader::SimpleReader::endNestedMessage() +void ProtobufReader::endNestedMessage() { assert(current_message_level >= 2); if (cursor != current_message_end) @@ -137,12 +117,20 @@ void ProtobufReader::SimpleReader::endNestedMessage() --current_message_level; current_message_end = parent_message_ends.back(); parent_message_ends.pop_back(); + field_number = next_field_number = 0; field_end = cursor; } -bool ProtobufReader::SimpleReader::readFieldNumber(UInt32 & field_number) +bool ProtobufReader::readFieldNumber(int & field_number_) { assert(current_message_level); + if (next_field_number) + { + field_number_ = field_number = next_field_number; + next_field_number = 0; + return true; + } + if (field_end != cursor) { if (field_end == END_OF_VARINT) @@ -183,7 +171,8 @@ bool ProtobufReader::SimpleReader::readFieldNumber(UInt32 & field_number) if (unlikely(varint & (static_cast(0xFFFFFFFF) << 32))) throwUnknownFormat(); UInt32 key = static_cast(varint); - field_number = (key >> 3); + field_number_ = field_number = (key >> 3); + next_field_number = 0; WireType wire_type = static_cast(key & 0x07); switch (wire_type) { @@ -224,77 +213,91 @@ bool ProtobufReader::SimpleReader::readFieldNumber(UInt32 & field_number) throwUnknownFormat(); } -bool ProtobufReader::SimpleReader::readUInt(UInt64 & value) +UInt64 ProtobufReader::readUInt() { + UInt64 value; if (field_end == END_OF_VARINT) { value = readVarint(); field_end = cursor; - return true; } - - if (unlikely(cursor >= field_end)) - return false; - - value = readVarint(); - return true; + else + { + value = readVarint(); + if (cursor < field_end) + next_field_number = field_number; + else if (unlikely(cursor) > field_end) + throwUnknownFormat(); + } + return value; } -bool ProtobufReader::SimpleReader::readInt(Int64 & value) +Int64 ProtobufReader::readInt() { - UInt64 varint; - if (!readUInt(varint)) - return false; - value = static_cast(varint); - return true; + return static_cast(readUInt()); } -bool ProtobufReader::SimpleReader::readSInt(Int64 & value) 
+Int64 ProtobufReader::readSInt() { - UInt64 varint; - if (!readUInt(varint)) - return false; - value = decodeZigZag(varint); - return true; + return decodeZigZag(readUInt()); } template -bool ProtobufReader::SimpleReader::readFixed(T & value) +T ProtobufReader::readFixed() { - if (unlikely(cursor >= field_end)) - return false; - + if (unlikely(cursor + static_cast(sizeof(T)) > field_end)) + throwUnknownFormat(); + T value; readBinary(&value, sizeof(T)); - return true; + if (cursor < field_end) + next_field_number = field_number; + return value; } -bool ProtobufReader::SimpleReader::readStringInto(PaddedPODArray & str) +template Int32 ProtobufReader::readFixed(); +template UInt32 ProtobufReader::readFixed(); +template Int64 ProtobufReader::readFixed(); +template UInt64 ProtobufReader::readFixed(); +template Float32 ProtobufReader::readFixed(); +template Float64 ProtobufReader::readFixed(); + +void ProtobufReader::readString(String & str) +{ + if (unlikely(cursor > field_end)) + throwUnknownFormat(); + size_t length = field_end - cursor; + str.resize(length); + readBinary(reinterpret_cast(str.data()), length); +} + +void ProtobufReader::readStringAndAppend(PaddedPODArray & str) { - if (unlikely(cursor == last_string_pos)) - return false; /// We don't want to read the same empty string again. - last_string_pos = cursor; if (unlikely(cursor > field_end)) throwUnknownFormat(); size_t length = field_end - cursor; size_t old_size = str.size(); str.resize(old_size + length); readBinary(reinterpret_cast(str.data() + old_size), length); - return true; } -void ProtobufReader::SimpleReader::readBinary(void* data, size_t size) +void ProtobufReader::readBinary(void* data, size_t size) { in.readStrict(reinterpret_cast(data), size); cursor += size; } -void ProtobufReader::SimpleReader::ignore(UInt64 num_bytes) +void ProtobufReader::ignore(UInt64 num_bytes) { in.ignore(num_bytes); cursor += num_bytes; } -void ProtobufReader::SimpleReader::moveCursorBackward(UInt64 num_bytes) +void ProtobufReader::ignoreAll() +{ + cursor += in.tryIgnore(std::numeric_limits::max()); +} + +void ProtobufReader::moveCursorBackward(UInt64 num_bytes) { if (in.offset() < num_bytes) throwUnknownFormat(); @@ -302,7 +305,7 @@ void ProtobufReader::SimpleReader::moveCursorBackward(UInt64 num_bytes) cursor -= num_bytes; } -UInt64 ProtobufReader::SimpleReader::continueReadingVarint(UInt64 first_byte) +UInt64 ProtobufReader::continueReadingVarint(UInt64 first_byte) { UInt64 result = (first_byte & ~static_cast(0x80)); char c; @@ -342,7 +345,7 @@ UInt64 ProtobufReader::SimpleReader::continueReadingVarint(UInt64 first_byte) throwUnknownFormat(); } -void ProtobufReader::SimpleReader::ignoreVarint() +void ProtobufReader::ignoreVarint() { char c; @@ -379,7 +382,7 @@ void ProtobufReader::SimpleReader::ignoreVarint() throwUnknownFormat(); } -void ProtobufReader::SimpleReader::ignoreGroup() +void ProtobufReader::ignoreGroup() { size_t level = 1; while (true) @@ -424,803 +427,15 @@ void ProtobufReader::SimpleReader::ignoreGroup() } } -// Implementation for a converter from any protobuf field type to any DB data type. 
-class ProtobufReader::ConverterBaseImpl : public ProtobufReader::IConverter +[[noreturn]] void ProtobufReader::throwUnknownFormat() const { -public: - ConverterBaseImpl(SimpleReader & simple_reader_, const google::protobuf::FieldDescriptor * field_) - : simple_reader(simple_reader_), field(field_) {} - - bool readStringInto(PaddedPODArray &) override - { - cannotConvertType("String"); - } - - bool readInt8(Int8 &) override - { - cannotConvertType("Int8"); - } - - bool readUInt8(UInt8 &) override - { - cannotConvertType("UInt8"); - } - - bool readInt16(Int16 &) override - { - cannotConvertType("Int16"); - } - - bool readUInt16(UInt16 &) override - { - cannotConvertType("UInt16"); - } - - bool readInt32(Int32 &) override - { - cannotConvertType("Int32"); - } - - bool readUInt32(UInt32 &) override - { - cannotConvertType("UInt32"); - } - - bool readInt64(Int64 &) override - { - cannotConvertType("Int64"); - } - - bool readUInt64(UInt64 &) override - { - cannotConvertType("UInt64"); - } - - bool readUInt128(UInt128 &) override - { - cannotConvertType("UInt128"); - } - - bool readInt128(Int128 &) override { cannotConvertType("Int128"); } - bool readInt256(Int256 &) override { cannotConvertType("Int256"); } - bool readUInt256(UInt256 &) override { cannotConvertType("UInt256"); } - - bool readFloat32(Float32 &) override - { - cannotConvertType("Float32"); - } - - bool readFloat64(Float64 &) override - { - cannotConvertType("Float64"); - } - - void prepareEnumMapping8(const std::vector> &) override {} - void prepareEnumMapping16(const std::vector> &) override {} - - bool readEnum8(Int8 &) override - { - cannotConvertType("Enum"); - } - - bool readEnum16(Int16 &) override - { - cannotConvertType("Enum"); - } - - bool readUUID(UUID &) override - { - cannotConvertType("UUID"); - } - - bool readDate(DayNum &) override - { - cannotConvertType("Date"); - } - - bool readDateTime(time_t &) override - { - cannotConvertType("DateTime"); - } - - bool readDateTime64(DateTime64 &, UInt32) override - { - cannotConvertType("DateTime64"); - } - - bool readDecimal32(Decimal32 &, UInt32, UInt32) override - { - cannotConvertType("Decimal32"); - } - - bool readDecimal64(Decimal64 &, UInt32, UInt32) override - { - cannotConvertType("Decimal64"); - } - - bool readDecimal128(Decimal128 &, UInt32, UInt32) override - { - cannotConvertType("Decimal128"); - } - - bool readDecimal256(Decimal256 &, UInt32, UInt32) override - { - cannotConvertType("Decimal256"); - } - - - bool readAggregateFunction(const AggregateFunctionPtr &, AggregateDataPtr, Arena &) override - { - cannotConvertType("AggregateFunction"); - } - -protected: - [[noreturn]] void cannotConvertType(const String & type_name) - { - throw Exception( - String("Could not convert type '") + field->type_name() + "' from protobuf field '" + field->name() + "' to data type '" - + type_name + "'", - ErrorCodes::PROTOBUF_BAD_CAST); - } - - [[noreturn]] void cannotConvertValue(const String & value, const String & type_name) - { - throw Exception( - "Could not convert value '" + value + "' from protobuf field '" + field->name() + "' to data type '" + type_name + "'", - ErrorCodes::PROTOBUF_BAD_CAST); - } - - template - To numericCast(From value) - { - if constexpr (std::is_same_v) - return value; - To result; - try - { - result = boost::numeric_cast(value); - } - catch (boost::numeric::bad_numeric_cast &) - { - cannotConvertValue(toString(value), TypeName::get()); - } - return result; - } - - template - To parseFromString(const PaddedPODArray & str) - { - try - { - To 
result; - ReadBufferFromString buf(str); - readText(result, buf); - return result; - } - catch (...) - { - cannotConvertValue(StringRef(str.data(), str.size()).toString(), TypeName::get()); - } - } - - SimpleReader & simple_reader; - const google::protobuf::FieldDescriptor * field; -}; - - -class ProtobufReader::ConverterFromString : public ConverterBaseImpl -{ -public: - using ConverterBaseImpl::ConverterBaseImpl; - - bool readStringInto(PaddedPODArray & str) override { return simple_reader.readStringInto(str); } - - bool readInt8(Int8 & value) override { return readNumeric(value); } - bool readUInt8(UInt8 & value) override { return readNumeric(value); } - bool readInt16(Int16 & value) override { return readNumeric(value); } - bool readUInt16(UInt16 & value) override { return readNumeric(value); } - bool readInt32(Int32 & value) override { return readNumeric(value); } - bool readUInt32(UInt32 & value) override { return readNumeric(value); } - bool readInt64(Int64 & value) override { return readNumeric(value); } - bool readUInt64(UInt64 & value) override { return readNumeric(value); } - bool readFloat32(Float32 & value) override { return readNumeric(value); } - bool readFloat64(Float64 & value) override { return readNumeric(value); } - - void prepareEnumMapping8(const std::vector> & name_value_pairs) override - { - prepareEnumNameToValueMap(name_value_pairs); - } - void prepareEnumMapping16(const std::vector> & name_value_pairs) override - { - prepareEnumNameToValueMap(name_value_pairs); - } - - bool readEnum8(Int8 & value) override { return readEnum(value); } - bool readEnum16(Int16 & value) override { return readEnum(value); } - - bool readUUID(UUID & uuid) override - { - if (!readTempString()) - return false; - ReadBufferFromString buf(temp_string); - readUUIDText(uuid, buf); - return true; - } - - bool readDate(DayNum & date) override - { - if (!readTempString()) - return false; - ReadBufferFromString buf(temp_string); - readDateText(date, buf); - return true; - } - - bool readDateTime(time_t & tm) override - { - if (!readTempString()) - return false; - ReadBufferFromString buf(temp_string); - readDateTimeText(tm, buf); - return true; - } - - bool readDateTime64(DateTime64 & date_time, UInt32 scale) override - { - if (!readTempString()) - return false; - ReadBufferFromString buf(temp_string); - readDateTime64Text(date_time, scale, buf); - return true; - } - - bool readDecimal32(Decimal32 & decimal, UInt32 precision, UInt32 scale) override { return readDecimal(decimal, precision, scale); } - bool readDecimal64(Decimal64 & decimal, UInt32 precision, UInt32 scale) override { return readDecimal(decimal, precision, scale); } - bool readDecimal128(Decimal128 & decimal, UInt32 precision, UInt32 scale) override { return readDecimal(decimal, precision, scale); } - bool readDecimal256(Decimal256 & decimal, UInt32 precision, UInt32 scale) override { return readDecimal(decimal, precision, scale); } - - bool readAggregateFunction(const AggregateFunctionPtr & function, AggregateDataPtr place, Arena & arena) override - { - if (!readTempString()) - return false; - ReadBufferFromString buf(temp_string); - function->deserialize(place, buf, &arena); - return true; - } - -private: - bool readTempString() - { - temp_string.clear(); - return simple_reader.readStringInto(temp_string); - } - - template - bool readNumeric(T & value) - { - if (!readTempString()) - return false; - value = parseFromString(temp_string); - return true; - } - - template - bool readEnum(T & value) - { - if (!readTempString()) - 
return false; - StringRef ref(temp_string.data(), temp_string.size()); - auto it = enum_name_to_value_map->find(ref); - if (it == enum_name_to_value_map->end()) - cannotConvertValue(ref.toString(), "Enum"); - value = static_cast(it->second); - return true; - } - - template - bool readDecimal(Decimal & decimal, UInt32 precision, UInt32 scale) - { - if (!readTempString()) - return false; - ReadBufferFromString buf(temp_string); - DataTypeDecimal>::readText(decimal, buf, precision, scale); - return true; - } - - template - void prepareEnumNameToValueMap(const std::vector> & name_value_pairs) - { - if (likely(enum_name_to_value_map.has_value())) - return; - enum_name_to_value_map.emplace(); - for (const auto & name_value_pair : name_value_pairs) - enum_name_to_value_map->emplace(name_value_pair.first, name_value_pair.second); - } - - PaddedPODArray temp_string; - std::optional> enum_name_to_value_map; -}; - -# define PROTOBUF_READER_CREATE_CONVERTER_SPECIALIZATION_FOR_STRINGS(field_type_id) \ - template <> \ - std::unique_ptr ProtobufReader::createConverter( \ - const google::protobuf::FieldDescriptor * field) \ - { \ - return std::make_unique(simple_reader, field); \ - } -PROTOBUF_READER_CREATE_CONVERTER_SPECIALIZATION_FOR_STRINGS(google::protobuf::FieldDescriptor::TYPE_STRING) -PROTOBUF_READER_CREATE_CONVERTER_SPECIALIZATION_FOR_STRINGS(google::protobuf::FieldDescriptor::TYPE_BYTES) - -# undef PROTOBUF_READER_CREATE_CONVERTER_SPECIALIZATION_FOR_STRINGS - - -template -class ProtobufReader::ConverterFromNumber : public ConverterBaseImpl -{ -public: - using ConverterBaseImpl::ConverterBaseImpl; - - bool readStringInto(PaddedPODArray & str) override - { - FromType number; - if (!readField(number)) - return false; - WriteBufferFromVector> buf(str); - writeText(number, buf); - return true; - } - - bool readInt8(Int8 & value) override { return readNumeric(value); } - bool readUInt8(UInt8 & value) override { return readNumeric(value); } - bool readInt16(Int16 & value) override { return readNumeric(value); } - bool readUInt16(UInt16 & value) override { return readNumeric(value); } - bool readInt32(Int32 & value) override { return readNumeric(value); } - bool readUInt32(UInt32 & value) override { return readNumeric(value); } - bool readInt64(Int64 & value) override { return readNumeric(value); } - bool readUInt64(UInt64 & value) override { return readNumeric(value); } - bool readFloat32(Float32 & value) override { return readNumeric(value); } - bool readFloat64(Float64 & value) override { return readNumeric(value); } - - bool readEnum8(Int8 & value) override { return readEnum(value); } - bool readEnum16(Int16 & value) override { return readEnum(value); } - - void prepareEnumMapping8(const std::vector> & name_value_pairs) override - { - prepareSetOfEnumValues(name_value_pairs); - } - void prepareEnumMapping16(const std::vector> & name_value_pairs) override - { - prepareSetOfEnumValues(name_value_pairs); - } - - bool readDate(DayNum & date) override - { - UInt16 number; - if (!readNumeric(number)) - return false; - date = DayNum(number); - return true; - } - - bool readDateTime(time_t & tm) override - { - UInt32 number; - if (!readNumeric(number)) - return false; - tm = number; - return true; - } - - bool readDateTime64(DateTime64 & date_time, UInt32 scale) override - { - return readDecimal(date_time, scale); - } - - bool readDecimal32(Decimal32 & decimal, UInt32, UInt32 scale) override { return readDecimal(decimal, scale); } - bool readDecimal64(Decimal64 & decimal, UInt32, UInt32 scale) override { 
return readDecimal(decimal, scale); } - bool readDecimal128(Decimal128 & decimal, UInt32, UInt32 scale) override { return readDecimal(decimal, scale); } - -private: - template - bool readNumeric(To & value) - { - FromType number; - if (!readField(number)) - return false; - value = numericCast(number); - return true; - } - - template - bool readEnum(EnumType & value) - { - if constexpr (!is_integer_v) - cannotConvertType("Enum"); // It's not correct to convert floating point to enum. - FromType number; - if (!readField(number)) - return false; - value = numericCast(number); - if (set_of_enum_values->find(value) == set_of_enum_values->end()) - cannotConvertValue(toString(value), "Enum"); - return true; - } - - template - void prepareSetOfEnumValues(const std::vector> & name_value_pairs) - { - if (likely(set_of_enum_values.has_value())) - return; - set_of_enum_values.emplace(); - for (const auto & name_value_pair : name_value_pairs) - set_of_enum_values->emplace(name_value_pair.second); - } - - template - bool readDecimal(Decimal & decimal, UInt32 scale) - { - FromType number; - if (!readField(number)) - return false; - decimal.value = convertToDecimal, DataTypeDecimal>>(number, scale); - return true; - } - - bool readField(FromType & value) - { - if constexpr (((field_type_id == google::protobuf::FieldDescriptor::TYPE_INT32) && std::is_same_v) - || ((field_type_id == google::protobuf::FieldDescriptor::TYPE_INT64) && std::is_same_v)) - { - return simple_reader.readInt(value); - } - else if constexpr (((field_type_id == google::protobuf::FieldDescriptor::TYPE_UINT32) && std::is_same_v) - || ((field_type_id == google::protobuf::FieldDescriptor::TYPE_UINT64) && std::is_same_v)) - { - return simple_reader.readUInt(value); - } - - else if constexpr (((field_type_id == google::protobuf::FieldDescriptor::TYPE_SINT32) && std::is_same_v) - || ((field_type_id == google::protobuf::FieldDescriptor::TYPE_SINT64) && std::is_same_v)) - { - return simple_reader.readSInt(value); - } - else - { - static_assert(((field_type_id == google::protobuf::FieldDescriptor::TYPE_FIXED32) && std::is_same_v) - || ((field_type_id == google::protobuf::FieldDescriptor::TYPE_SFIXED32) && std::is_same_v) - || ((field_type_id == google::protobuf::FieldDescriptor::TYPE_FIXED64) && std::is_same_v) - || ((field_type_id == google::protobuf::FieldDescriptor::TYPE_SFIXED64) && std::is_same_v) - || ((field_type_id == google::protobuf::FieldDescriptor::TYPE_FLOAT) && std::is_same_v) - || ((field_type_id == google::protobuf::FieldDescriptor::TYPE_DOUBLE) && std::is_same_v)); - return simple_reader.readFixed(value); - } - } - - std::optional> set_of_enum_values; -}; - -# define PROTOBUF_READER_CREATE_CONVERTER_SPECIALIZATION_FOR_NUMBERS(field_type_id, field_type) \ - template <> \ - std::unique_ptr ProtobufReader::createConverter( \ - const google::protobuf::FieldDescriptor * field) \ - { \ - return std::make_unique>(simple_reader, field); /* NOLINT */ \ - } - -PROTOBUF_READER_CREATE_CONVERTER_SPECIALIZATION_FOR_NUMBERS(google::protobuf::FieldDescriptor::TYPE_INT32, Int64); -PROTOBUF_READER_CREATE_CONVERTER_SPECIALIZATION_FOR_NUMBERS(google::protobuf::FieldDescriptor::TYPE_SINT32, Int64); -PROTOBUF_READER_CREATE_CONVERTER_SPECIALIZATION_FOR_NUMBERS(google::protobuf::FieldDescriptor::TYPE_UINT32, UInt64); -PROTOBUF_READER_CREATE_CONVERTER_SPECIALIZATION_FOR_NUMBERS(google::protobuf::FieldDescriptor::TYPE_INT64, Int64); -PROTOBUF_READER_CREATE_CONVERTER_SPECIALIZATION_FOR_NUMBERS(google::protobuf::FieldDescriptor::TYPE_SINT64, Int64); 
-PROTOBUF_READER_CREATE_CONVERTER_SPECIALIZATION_FOR_NUMBERS(google::protobuf::FieldDescriptor::TYPE_UINT64, UInt64); -PROTOBUF_READER_CREATE_CONVERTER_SPECIALIZATION_FOR_NUMBERS(google::protobuf::FieldDescriptor::TYPE_FIXED32, UInt32); -PROTOBUF_READER_CREATE_CONVERTER_SPECIALIZATION_FOR_NUMBERS(google::protobuf::FieldDescriptor::TYPE_SFIXED32, Int32); -PROTOBUF_READER_CREATE_CONVERTER_SPECIALIZATION_FOR_NUMBERS(google::protobuf::FieldDescriptor::TYPE_FIXED64, UInt64); -PROTOBUF_READER_CREATE_CONVERTER_SPECIALIZATION_FOR_NUMBERS(google::protobuf::FieldDescriptor::TYPE_SFIXED64, Int64); -PROTOBUF_READER_CREATE_CONVERTER_SPECIALIZATION_FOR_NUMBERS(google::protobuf::FieldDescriptor::TYPE_FLOAT, float); -PROTOBUF_READER_CREATE_CONVERTER_SPECIALIZATION_FOR_NUMBERS(google::protobuf::FieldDescriptor::TYPE_DOUBLE, double); - -# undef PROTOBUF_READER_CREATE_CONVERTER_SPECIALIZATION_FOR_NUMBERS - - -class ProtobufReader::ConverterFromBool : public ConverterBaseImpl -{ -public: - using ConverterBaseImpl::ConverterBaseImpl; - - bool readStringInto(PaddedPODArray & str) override - { - bool b; - if (!readField(b)) - return false; - StringRef ref(b ? "true" : "false"); - str.insert(ref.data, ref.data + ref.size); - return true; - } - - bool readInt8(Int8 & value) override { return readNumeric(value); } - bool readUInt8(UInt8 & value) override { return readNumeric(value); } - bool readInt16(Int16 & value) override { return readNumeric(value); } - bool readUInt16(UInt16 & value) override { return readNumeric(value); } - bool readInt32(Int32 & value) override { return readNumeric(value); } - bool readUInt32(UInt32 & value) override { return readNumeric(value); } - bool readInt64(Int64 & value) override { return readNumeric(value); } - bool readUInt64(UInt64 & value) override { return readNumeric(value); } - bool readFloat32(Float32 & value) override { return readNumeric(value); } - bool readFloat64(Float64 & value) override { return readNumeric(value); } - bool readDecimal32(Decimal32 & decimal, UInt32, UInt32) override { return readNumeric(decimal.value); } - bool readDecimal64(Decimal64 & decimal, UInt32, UInt32) override { return readNumeric(decimal.value); } - bool readDecimal128(Decimal128 & decimal, UInt32, UInt32) override { return readNumeric(decimal.value); } - -private: - template - bool readNumeric(T & value) - { - bool b; - if (!readField(b)) - return false; - value = b ? 1 : 0; - return true; - } - - bool readField(bool & b) - { - UInt64 number; - if (!simple_reader.readUInt(number)) - return false; - b = static_cast(number); - return true; - } -}; - -template <> -std::unique_ptr ProtobufReader::createConverter( - const google::protobuf::FieldDescriptor * field) -{ - return std::make_unique(simple_reader, field); + throw Exception( + std::string("Protobuf messages are corrupted or don't match the provided schema.") + + (root_message_has_length_delimiter + ? " Please note that Protobuf stream is length-delimited: every message is prefixed by its length in varint." 
+ : ""), + ErrorCodes::UNKNOWN_PROTOBUF_FORMAT); } - - -class ProtobufReader::ConverterFromEnum : public ConverterBaseImpl -{ -public: - using ConverterBaseImpl::ConverterBaseImpl; - - bool readStringInto(PaddedPODArray & str) override - { - prepareEnumPbNumberToNameMap(); - Int64 pbnumber; - if (!readField(pbnumber)) - return false; - auto it = enum_pbnumber_to_name_map->find(pbnumber); - if (it == enum_pbnumber_to_name_map->end()) - cannotConvertValue(toString(pbnumber), "Enum"); - const auto & ref = it->second; - str.insert(ref.data, ref.data + ref.size); - return true; - } - - bool readInt8(Int8 & value) override { return readNumeric(value); } - bool readUInt8(UInt8 & value) override { return readNumeric(value); } - bool readInt16(Int16 & value) override { return readNumeric(value); } - bool readUInt16(UInt16 & value) override { return readNumeric(value); } - bool readInt32(Int32 & value) override { return readNumeric(value); } - bool readUInt32(UInt32 & value) override { return readNumeric(value); } - bool readInt64(Int64 & value) override { return readNumeric(value); } - bool readUInt64(UInt64 & value) override { return readNumeric(value); } - - void prepareEnumMapping8(const std::vector> & name_value_pairs) override - { - prepareEnumPbNumberToValueMap(name_value_pairs); - } - void prepareEnumMapping16(const std::vector> & name_value_pairs) override - { - prepareEnumPbNumberToValueMap(name_value_pairs); - } - - bool readEnum8(Int8 & value) override { return readEnum(value); } - bool readEnum16(Int16 & value) override { return readEnum(value); } - -private: - template - bool readNumeric(T & value) - { - Int64 pbnumber; - if (!readField(pbnumber)) - return false; - value = numericCast(pbnumber); - return true; - } - - template - bool readEnum(T & value) - { - Int64 pbnumber; - if (!readField(pbnumber)) - return false; - if (enum_pbnumber_always_equals_value) - value = static_cast(pbnumber); - else - { - auto it = enum_pbnumber_to_value_map->find(pbnumber); - if (it == enum_pbnumber_to_value_map->end()) - cannotConvertValue(toString(pbnumber), "Enum"); - value = static_cast(it->second); - } - return true; - } - - void prepareEnumPbNumberToNameMap() - { - if (likely(enum_pbnumber_to_name_map.has_value())) - return; - enum_pbnumber_to_name_map.emplace(); - const auto * enum_type = field->enum_type(); - for (int i = 0; i != enum_type->value_count(); ++i) - { - const auto * enum_value = enum_type->value(i); - enum_pbnumber_to_name_map->emplace(enum_value->number(), enum_value->name()); - } - } - - template - void prepareEnumPbNumberToValueMap(const std::vector> & name_value_pairs) - { - if (likely(enum_pbnumber_to_value_map.has_value())) - return; - enum_pbnumber_to_value_map.emplace(); - enum_pbnumber_always_equals_value = true; - for (const auto & name_value_pair : name_value_pairs) - { - Int16 value = name_value_pair.second; // NOLINT - const auto * enum_descriptor = field->enum_type()->FindValueByName(name_value_pair.first); - if (enum_descriptor) - { - enum_pbnumber_to_value_map->emplace(enum_descriptor->number(), value); - if (enum_descriptor->number() != value) - enum_pbnumber_always_equals_value = false; - } - else - enum_pbnumber_always_equals_value = false; - } - } - - bool readField(Int64 & enum_pbnumber) - { - return simple_reader.readInt(enum_pbnumber); - } - - std::optional> enum_pbnumber_to_name_map; - std::optional> enum_pbnumber_to_value_map; - bool enum_pbnumber_always_equals_value; -}; - -template <> -std::unique_ptr ProtobufReader::createConverter( - const 
google::protobuf::FieldDescriptor * field) -{ - return std::make_unique(simple_reader, field); -} - - -ProtobufReader::ProtobufReader( - ReadBuffer & in_, const google::protobuf::Descriptor * message_type, const std::vector & column_names, const bool use_length_delimiters_) - : simple_reader(in_, use_length_delimiters_) -{ - root_message = ProtobufColumnMatcher::matchColumns(column_names, message_type); - setTraitsDataAfterMatchingColumns(root_message.get()); -} - -ProtobufReader::~ProtobufReader() = default; - -void ProtobufReader::setTraitsDataAfterMatchingColumns(Message * message) -{ - for (Field & field : message->fields) - { - if (field.nested_message) - { - setTraitsDataAfterMatchingColumns(field.nested_message.get()); - continue; - } - switch (field.field_descriptor->type()) - { -# define PROTOBUF_READER_CONVERTER_CREATING_CASE(field_type_id) \ - case field_type_id: \ - field.data.converter = createConverter(field.field_descriptor); \ - break - PROTOBUF_READER_CONVERTER_CREATING_CASE(google::protobuf::FieldDescriptor::TYPE_STRING); - PROTOBUF_READER_CONVERTER_CREATING_CASE(google::protobuf::FieldDescriptor::TYPE_BYTES); - PROTOBUF_READER_CONVERTER_CREATING_CASE(google::protobuf::FieldDescriptor::TYPE_INT32); - PROTOBUF_READER_CONVERTER_CREATING_CASE(google::protobuf::FieldDescriptor::TYPE_SINT32); - PROTOBUF_READER_CONVERTER_CREATING_CASE(google::protobuf::FieldDescriptor::TYPE_UINT32); - PROTOBUF_READER_CONVERTER_CREATING_CASE(google::protobuf::FieldDescriptor::TYPE_FIXED32); - PROTOBUF_READER_CONVERTER_CREATING_CASE(google::protobuf::FieldDescriptor::TYPE_SFIXED32); - PROTOBUF_READER_CONVERTER_CREATING_CASE(google::protobuf::FieldDescriptor::TYPE_INT64); - PROTOBUF_READER_CONVERTER_CREATING_CASE(google::protobuf::FieldDescriptor::TYPE_SINT64); - PROTOBUF_READER_CONVERTER_CREATING_CASE(google::protobuf::FieldDescriptor::TYPE_UINT64); - PROTOBUF_READER_CONVERTER_CREATING_CASE(google::protobuf::FieldDescriptor::TYPE_FIXED64); - PROTOBUF_READER_CONVERTER_CREATING_CASE(google::protobuf::FieldDescriptor::TYPE_SFIXED64); - PROTOBUF_READER_CONVERTER_CREATING_CASE(google::protobuf::FieldDescriptor::TYPE_FLOAT); - PROTOBUF_READER_CONVERTER_CREATING_CASE(google::protobuf::FieldDescriptor::TYPE_DOUBLE); - PROTOBUF_READER_CONVERTER_CREATING_CASE(google::protobuf::FieldDescriptor::TYPE_BOOL); - PROTOBUF_READER_CONVERTER_CREATING_CASE(google::protobuf::FieldDescriptor::TYPE_ENUM); -# undef PROTOBUF_READER_CONVERTER_CREATING_CASE - default: - __builtin_unreachable(); - } - message->data.field_number_to_field_map.emplace(field.field_number, &field); - } -} - -bool ProtobufReader::startMessage() -{ - if (!simple_reader.startMessage()) - return false; - current_message = root_message.get(); - current_field_index = 0; - return true; -} - -void ProtobufReader::endMessage(bool try_ignore_errors) -{ - simple_reader.endMessage(try_ignore_errors); - current_message = nullptr; - current_converter = nullptr; -} - -bool ProtobufReader::readColumnIndex(size_t & column_index) -{ - while (true) - { - UInt32 field_number; - if (!simple_reader.readFieldNumber(field_number)) - { - if (!current_message->parent) - { - current_converter = nullptr; - return false; - } - simple_reader.endNestedMessage(); - current_field_index = current_message->index_in_parent; - current_message = current_message->parent; - continue; - } - - const Field * field = nullptr; - for (; current_field_index < current_message->fields.size(); ++current_field_index) - { - const Field & f = current_message->fields[current_field_index]; - if 
(f.field_number == field_number) - { - field = &f; - break; - } - if (f.field_number > field_number) - break; - } - - if (!field) - { - const auto & field_number_to_field_map = current_message->data.field_number_to_field_map; - auto it = field_number_to_field_map.find(field_number); - if (it == field_number_to_field_map.end()) - continue; - field = it->second; - } - - if (field->nested_message) - { - simple_reader.startNestedMessage(); - current_message = field->nested_message.get(); - current_field_index = 0; - continue; - } - - column_index = field->column_index; - current_converter = field->data.converter.get(); - return true; - } -} - } #endif diff --git a/src/Formats/ProtobufReader.h b/src/Formats/ProtobufReader.h index b2a0714a57a..31d6f9a08e0 100644 --- a/src/Formats/ProtobufReader.h +++ b/src/Formats/ProtobufReader.h @@ -1,258 +1,72 @@ #pragma once -#include -#include -#include -#include - #if !defined(ARCADIA_BUILD) -# include "config_formats.h" +# include "config_formats.h" #endif #if USE_PROTOBUF -# include -# include -# include -# include "ProtobufColumnMatcher.h" +# include +# include -namespace google -{ -namespace protobuf -{ - class Descriptor; -} -} namespace DB { -class Arena; -class IAggregateFunction; class ReadBuffer; -using AggregateDataPtr = char *; -using AggregateFunctionPtr = std::shared_ptr; - - -/** Deserializes a protobuf, tries to cast data types if necessarily. - */ -class ProtobufReader : private boost::noncopyable -{ -public: - ProtobufReader(ReadBuffer & in_, const google::protobuf::Descriptor * message_type, const std::vector & column_names, const bool use_length_delimiters_); - ~ProtobufReader(); - - /// Should be called when we start reading a new message. - bool startMessage(); - - /// Ends reading a message. - void endMessage(bool ignore_errors = false); - - /// Reads the column index. - /// The function returns false if there are no more columns to read (call endMessage() in this case). - bool readColumnIndex(size_t & column_index); - - /// Reads a value which should be put to column at index received with readColumnIndex(). - /// The function returns false if there are no more values to read now (call readColumnIndex() in this case). 
- bool readNumber(Int8 & value) { return current_converter->readInt8(value); } - bool readNumber(UInt8 & value) { return current_converter->readUInt8(value); } - bool readNumber(Int16 & value) { return current_converter->readInt16(value); } - bool readNumber(UInt16 & value) { return current_converter->readUInt16(value); } - bool readNumber(Int32 & value) { return current_converter->readInt32(value); } - bool readNumber(UInt32 & value) { return current_converter->readUInt32(value); } - bool readNumber(Int64 & value) { return current_converter->readInt64(value); } - bool readNumber(UInt64 & value) { return current_converter->readUInt64(value); } - bool readNumber(Int128 & value) { return current_converter->readInt128(value); } - bool readNumber(UInt128 & value) { return current_converter->readUInt128(value); } - bool readNumber(Int256 & value) { return current_converter->readInt256(value); } - bool readNumber(UInt256 & value) { return current_converter->readUInt256(value); } - bool readNumber(Float32 & value) { return current_converter->readFloat32(value); } - bool readNumber(Float64 & value) { return current_converter->readFloat64(value); } - - bool readStringInto(PaddedPODArray & str) { return current_converter->readStringInto(str); } - - void prepareEnumMapping(const std::vector> & name_value_pairs) { current_converter->prepareEnumMapping8(name_value_pairs); } - void prepareEnumMapping(const std::vector> & name_value_pairs) { current_converter->prepareEnumMapping16(name_value_pairs); } - bool readEnum(Int8 & value) { return current_converter->readEnum8(value); } - bool readEnum(Int16 & value) { return current_converter->readEnum16(value); } - - bool readUUID(UUID & uuid) { return current_converter->readUUID(uuid); } - bool readDate(DayNum & date) { return current_converter->readDate(date); } - bool readDateTime(time_t & tm) { return current_converter->readDateTime(tm); } - bool readDateTime64(DateTime64 & tm, UInt32 scale) { return current_converter->readDateTime64(tm, scale); } - - bool readDecimal(Decimal32 & decimal, UInt32 precision, UInt32 scale) { return current_converter->readDecimal32(decimal, precision, scale); } - bool readDecimal(Decimal64 & decimal, UInt32 precision, UInt32 scale) { return current_converter->readDecimal64(decimal, precision, scale); } - bool readDecimal(Decimal128 & decimal, UInt32 precision, UInt32 scale) { return current_converter->readDecimal128(decimal, precision, scale); } - bool readDecimal(Decimal256 & decimal, UInt32 precision, UInt32 scale) { return current_converter->readDecimal256(decimal, precision, scale); } - - bool readAggregateFunction(const AggregateFunctionPtr & function, AggregateDataPtr place, Arena & arena) { return current_converter->readAggregateFunction(function, place, arena); } - - /// Call it after calling one of the read*() function to determine if there are more values available for reading. 
- bool ALWAYS_INLINE canReadMoreValues() const { return simple_reader.canReadMoreValues(); } - -private: - class SimpleReader - { - public: - SimpleReader(ReadBuffer & in_, const bool use_length_delimiters_); - bool startMessage(); - void endMessage(bool ignore_errors); - void startNestedMessage(); - void endNestedMessage(); - bool readFieldNumber(UInt32 & field_number); - bool readInt(Int64 & value); - bool readSInt(Int64 & value); - bool readUInt(UInt64 & value); - template bool readFixed(T & value); - bool readStringInto(PaddedPODArray & str); - - bool ALWAYS_INLINE canReadMoreValues() const { return cursor < field_end; } - - private: - void readBinary(void * data, size_t size); - void ignore(UInt64 num_bytes); - void moveCursorBackward(UInt64 num_bytes); - - UInt64 ALWAYS_INLINE readVarint() - { - char c; - in.readStrict(c); - UInt64 first_byte = static_cast(c); - ++cursor; - if (likely(!(c & 0x80))) - return first_byte; - return continueReadingVarint(first_byte); - } - - UInt64 continueReadingVarint(UInt64 first_byte); - void ignoreVarint(); - void ignoreGroup(); - [[noreturn]] void throwUnknownFormat() const; - - ReadBuffer & in; - Int64 cursor; - size_t current_message_level; - Int64 current_message_end; - std::vector parent_message_ends; - Int64 field_end; - Int64 last_string_pos; - const bool use_length_delimiters; - }; - - class IConverter - { - public: - virtual ~IConverter() = default; - virtual bool readStringInto(PaddedPODArray &) = 0; - virtual bool readInt8(Int8&) = 0; - virtual bool readUInt8(UInt8 &) = 0; - virtual bool readInt16(Int16 &) = 0; - virtual bool readUInt16(UInt16 &) = 0; - virtual bool readInt32(Int32 &) = 0; - virtual bool readUInt32(UInt32 &) = 0; - virtual bool readInt64(Int64 &) = 0; - virtual bool readUInt64(UInt64 &) = 0; - virtual bool readInt128(Int128 &) = 0; - virtual bool readUInt128(UInt128 &) = 0; - - virtual bool readInt256(Int256 &) = 0; - virtual bool readUInt256(UInt256 &) = 0; - - virtual bool readFloat32(Float32 &) = 0; - virtual bool readFloat64(Float64 &) = 0; - virtual void prepareEnumMapping8(const std::vector> &) = 0; - virtual void prepareEnumMapping16(const std::vector> &) = 0; - virtual bool readEnum8(Int8 &) = 0; - virtual bool readEnum16(Int16 &) = 0; - virtual bool readUUID(UUID &) = 0; - virtual bool readDate(DayNum &) = 0; - virtual bool readDateTime(time_t &) = 0; - virtual bool readDateTime64(DateTime64 &, UInt32) = 0; - virtual bool readDecimal32(Decimal32 &, UInt32, UInt32) = 0; - virtual bool readDecimal64(Decimal64 &, UInt32, UInt32) = 0; - virtual bool readDecimal128(Decimal128 &, UInt32, UInt32) = 0; - virtual bool readDecimal256(Decimal256 &, UInt32, UInt32) = 0; - virtual bool readAggregateFunction(const AggregateFunctionPtr &, AggregateDataPtr, Arena &) = 0; - }; - - class ConverterBaseImpl; - class ConverterFromString; - template class ConverterFromNumber; - class ConverterFromBool; - class ConverterFromEnum; - - struct ColumnMatcherTraits - { - struct FieldData - { - std::unique_ptr converter; - }; - struct MessageData - { - std::unordered_map*> field_number_to_field_map; - }; - }; - using Message = ProtobufColumnMatcher::Message; - using Field = ProtobufColumnMatcher::Field; - - void setTraitsDataAfterMatchingColumns(Message * message); - - template - std::unique_ptr createConverter(const google::protobuf::FieldDescriptor * field); - - SimpleReader simple_reader; - std::unique_ptr root_message; - Message* current_message = nullptr; - size_t current_field_index = 0; - IConverter* current_converter = nullptr; -}; - 
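Both the removed SimpleReader above and the simplified ProtobufReader that replaces it in the new header decode the protobuf wire format by hand, with a one-byte fast path in readVarint(). As a standalone reference (not part of the patch), a minimal sketch of the three primitives involved:

#include <cstdint>
#include <cstddef>
#include <stdexcept>

// Decodes one base-128 varint starting at data[pos]; advances pos.
// Each byte carries 7 payload bits, least-significant group first;
// the high bit of a byte signals that another byte follows.
uint64_t decodeVarint(const uint8_t * data, size_t size, size_t & pos)
{
    uint64_t result = 0;
    for (unsigned shift = 0; shift < 64; shift += 7)
    {
        if (pos >= size)
            throw std::runtime_error("unexpected end of data in varint");
        uint8_t byte = data[pos++];
        result |= static_cast<uint64_t>(byte & 0x7F) << shift;
        if (!(byte & 0x80))   // most values fit in a single byte
            return result;
    }
    throw std::runtime_error("malformed varint (more than 10 bytes)");
}

// Field keys are varints too: the field number sits in the upper bits and the
// low 3 bits hold the wire type (0 varint, 1 fixed64, 2 length-delimited, 5 fixed32).
struct FieldKey { uint32_t field_number; uint8_t wire_type; };
FieldKey decodeKey(uint64_t key) { return {static_cast<uint32_t>(key >> 3), static_cast<uint8_t>(key & 7)}; }

// sint32/sint64 fields additionally use ZigZag encoding, so small negative
// numbers stay short on the wire: 0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, ...
int64_t zigZagDecode(uint64_t value) { return static_cast<int64_t>(value >> 1) ^ -static_cast<int64_t>(value & 1); }

The ALWAYS_INLINE readVarint() in the patch covers exactly the single-byte case and defers longer values to continueReadingVarint().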
-} - -#else - -namespace DB -{ -class Arena; -class IAggregateFunction; -class ReadBuffer; -using AggregateDataPtr = char *; -using AggregateFunctionPtr = std::shared_ptr; +/// Utility class for reading in the Protobuf format. +/// Knows nothing about protobuf schemas, just provides useful functions to serialize data. class ProtobufReader { public: - bool startMessage() { return false; } - void endMessage() {} - bool readColumnIndex(size_t &) { return false; } - bool readNumber(Int8 &) { return false; } - bool readNumber(UInt8 &) { return false; } - bool readNumber(Int16 &) { return false; } - bool readNumber(UInt16 &) { return false; } - bool readNumber(Int32 &) { return false; } - bool readNumber(UInt32 &) { return false; } - bool readNumber(Int64 &) { return false; } - bool readNumber(UInt64 &) { return false; } - bool readNumber(Int128 &) { return false; } - bool readNumber(UInt128 &) { return false; } - bool readNumber(Int256 &) { return false; } - bool readNumber(UInt256 &) { return false; } - bool readNumber(Float32 &) { return false; } - bool readNumber(Float64 &) { return false; } - bool readStringInto(PaddedPODArray &) { return false; } - void prepareEnumMapping(const std::vector> &) {} - void prepareEnumMapping(const std::vector> &) {} - bool readEnum(Int8 &) { return false; } - bool readEnum(Int16 &) { return false; } - bool readUUID(UUID &) { return false; } - bool readDate(DayNum &) { return false; } - bool readDateTime(time_t &) { return false; } - bool readDateTime64(DateTime64 & /*tm*/, UInt32 /*scale*/) { return false; } - bool readDecimal(Decimal32 &, UInt32, UInt32) { return false; } - bool readDecimal(Decimal64 &, UInt32, UInt32) { return false; } - bool readDecimal(Decimal128 &, UInt32, UInt32) { return false; } - bool readDecimal(Decimal256 &, UInt32, UInt32) { return false; } - bool readAggregateFunction(const AggregateFunctionPtr &, AggregateDataPtr, Arena &) { return false; } - bool canReadMoreValues() const { return false; } + ProtobufReader(ReadBuffer & in_); + + void startMessage(bool with_length_delimiter_); + void endMessage(bool ignore_errors); + void startNestedMessage(); + void endNestedMessage(); + + bool readFieldNumber(int & field_number); + Int64 readInt(); + Int64 readSInt(); + UInt64 readUInt(); + template T readFixed(); + + void readString(String & str); + void readStringAndAppend(PaddedPODArray & str); + + bool eof() const { return in.eof(); } + +private: + void readBinary(void * data, size_t size); + void ignore(UInt64 num_bytes); + void ignoreAll(); + void moveCursorBackward(UInt64 num_bytes); + + UInt64 ALWAYS_INLINE readVarint() + { + char c; + in.readStrict(c); + UInt64 first_byte = static_cast(c); + ++cursor; + if (likely(!(c & 0x80))) + return first_byte; + return continueReadingVarint(first_byte); + } + + UInt64 continueReadingVarint(UInt64 first_byte); + void ignoreVarint(); + void ignoreGroup(); + [[noreturn]] void throwUnknownFormat() const; + + ReadBuffer & in; + Int64 cursor = 0; + bool root_message_has_length_delimiter = false; + size_t current_message_level = 0; + Int64 current_message_end = 0; + std::vector parent_message_ends; + int field_number = 0; + int next_field_number = 0; + Int64 field_end = 0; }; } diff --git a/src/Formats/ProtobufSerializer.cpp b/src/Formats/ProtobufSerializer.cpp new file mode 100644 index 00000000000..82149460773 --- /dev/null +++ b/src/Formats/ProtobufSerializer.cpp @@ -0,0 +1,2921 @@ +#include + +#if USE_PROTOBUF +# include +# include +# include +# include +# include +# include +# include +# include +# 
include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +# include + +namespace DB +{ +namespace ErrorCodes +{ + extern const int NO_COLUMNS_SERIALIZED_TO_PROTOBUF_FIELDS; + extern const int MULTIPLE_COLUMNS_SERIALIZED_TO_SAME_PROTOBUF_FIELD; + extern const int NO_COLUMN_SERIALIZED_TO_REQUIRED_PROTOBUF_FIELD; + extern const int DATA_TYPE_INCOMPATIBLE_WITH_PROTOBUF_FIELD; + extern const int PROTOBUF_FIELD_NOT_REPEATED; + extern const int PROTOBUF_BAD_CAST; + extern const int LOGICAL_ERROR; +} + +namespace +{ + using FieldDescriptor = google::protobuf::FieldDescriptor; + using MessageDescriptor = google::protobuf::Descriptor; + using FieldTypeId = google::protobuf::FieldDescriptor::Type; + + + /// Compares column's name with protobuf field's name. + /// This comparison is case-insensitive and ignores the difference between '.' and '_' + struct ColumnNameWithProtobufFieldNameComparator + { + static bool equals(char c1, char c2) + { + return convertChar(c1) == convertChar(c2); + } + + static bool equals(const std::string_view & s1, const std::string_view & s2) + { + return (s1.length() == s2.length()) + && std::equal(s1.begin(), s1.end(), s2.begin(), [](char c1, char c2) { return convertChar(c1) == convertChar(c2); }); + } + + static bool less(const std::string_view & s1, const std::string_view & s2) + { + return std::lexicographical_compare(s1.begin(), s1.end(), s2.begin(), s2.end(), [](char c1, char c2) { return convertChar(c1) < convertChar(c2); }); + } + + static bool startsWith(const std::string_view & s1, const std::string_view & s2) + { + return (s1.length() >= s2.length()) && equals(s1.substr(0, s2.length()), s2); + } + + static char convertChar(char c) + { + c = tolower(c); + if (c == '.') + c = '_'; + return c; + } + }; + + + // Should we omit null values (zero for numbers / empty string for strings) while storing them. + bool shouldSkipZeroOrEmpty(const FieldDescriptor & field_descriptor) + { + if (!field_descriptor.is_optional()) + return false; + if (field_descriptor.containing_type()->options().map_entry()) + return false; + return field_descriptor.message_type() || (field_descriptor.file()->syntax() == google::protobuf::FileDescriptor::SYNTAX_PROTO3); + } + + // Should we pack repeated values while storing them. 
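Context for the decision the next helper (shouldPackRepeated) makes: proto3 packs repeated scalar numerics by default, writing one length-delimited record instead of one key per element. A standalone sketch of the packed layout, using a hypothetical field number that is not taken from the patch:

#include <cstdint>
#include <vector>

static void appendVarint(std::vector<uint8_t> & out, uint64_t value)
{
    while (value >= 0x80)
    {
        out.push_back(static_cast<uint8_t>(value) | 0x80);
        value >>= 7;
    }
    out.push_back(static_cast<uint8_t>(value));
}

// Packed encoding: a single key with wire type 2 (length-delimited),
// followed by the byte length and the concatenated varint payloads.
// Unpacked encoding would instead repeat "key, value" once per element.
std::vector<uint8_t> encodePackedInt32(int field_number, const std::vector<int32_t> & values)
{
    std::vector<uint8_t> payload;
    for (int32_t v : values)
        appendVarint(payload, static_cast<uint64_t>(static_cast<int64_t>(v)));  // negative int32 sign-extends per the spec

    std::vector<uint8_t> message;
    appendVarint(message, (static_cast<uint64_t>(field_number) << 3) | 2);
    appendVarint(message, payload.size());
    message.insert(message.end(), payload.begin(), payload.end());
    return message;
}

For field number 4 and values {3, 270} this produces 0x22 0x03 0x03 0x8E 0x02, whereas the unpacked form would be 0x20 0x03 0x20 0x8E 0x02.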
+ bool shouldPackRepeated(const FieldDescriptor & field_descriptor) + { + if (!field_descriptor.is_repeated()) + return false; + switch (field_descriptor.type()) + { + case FieldTypeId::TYPE_INT32: + case FieldTypeId::TYPE_UINT32: + case FieldTypeId::TYPE_SINT32: + case FieldTypeId::TYPE_INT64: + case FieldTypeId::TYPE_UINT64: + case FieldTypeId::TYPE_SINT64: + case FieldTypeId::TYPE_FIXED32: + case FieldTypeId::TYPE_SFIXED32: + case FieldTypeId::TYPE_FIXED64: + case FieldTypeId::TYPE_SFIXED64: + case FieldTypeId::TYPE_FLOAT: + case FieldTypeId::TYPE_DOUBLE: + case FieldTypeId::TYPE_BOOL: + case FieldTypeId::TYPE_ENUM: + break; + default: + return false; + } + if (field_descriptor.options().has_packed()) + return field_descriptor.options().packed(); + return field_descriptor.file()->syntax() == google::protobuf::FileDescriptor::SYNTAX_PROTO3; + } + + + struct ProtobufReaderOrWriter + { + ProtobufReaderOrWriter(ProtobufReader & reader_) : reader(&reader_) {} // NOLINT(google-explicit-constructor) + ProtobufReaderOrWriter(ProtobufWriter & writer_) : writer(&writer_) {} // NOLINT(google-explicit-constructor) + ProtobufReader * const reader = nullptr; + ProtobufWriter * const writer = nullptr; + }; + + + /// Base class for all serializers which serialize a single value. + class ProtobufSerializerSingleValue : public ProtobufSerializer + { + protected: + ProtobufSerializerSingleValue(const FieldDescriptor & field_descriptor_, const ProtobufReaderOrWriter & reader_or_writer_) + : field_descriptor(field_descriptor_) + , field_typeid(field_descriptor_.type()) + , field_tag(field_descriptor.number()) + , reader(reader_or_writer_.reader) + , writer(reader_or_writer_.writer) + , skip_zero_or_empty(shouldSkipZeroOrEmpty(field_descriptor)) + { + } + + void setColumns(const ColumnPtr * columns, [[maybe_unused]] size_t num_columns) override + { + assert(num_columns == 1); + column = columns[0]; + } + + void setColumns(const MutableColumnPtr * columns, [[maybe_unused]] size_t num_columns) override + { + assert(num_columns == 1); + column = columns[0]->getPtr(); + } + + template + void writeInt(NumberType value) + { + auto casted = castNumber(value); + if (casted || !skip_zero_or_empty) + writer->writeInt(field_tag, casted); + } + + template + void writeSInt(NumberType value) + { + auto casted = castNumber(value); + if (casted || !skip_zero_or_empty) + writer->writeSInt(field_tag, casted); + } + + template + void writeUInt(NumberType value) + { + auto casted = castNumber(value); + if (casted || !skip_zero_or_empty) + writer->writeUInt(field_tag, casted); + } + + template + void writeFixed(NumberType value) + { + auto casted = castNumber(value); + if (casted || !skip_zero_or_empty) + writer->writeFixed(field_tag, casted); + } + + Int64 readInt() { return reader->readInt(); } + Int64 readSInt() { return reader->readSInt(); } + UInt64 readUInt() { return reader->readUInt(); } + + template + FieldType readFixed() + { + return reader->readFixed(); + } + + void writeStr(const std::string_view & str) + { + if (!str.empty() || !skip_zero_or_empty) + writer->writeString(field_tag, str); + } + + void readStr(String & str) { reader->readString(str); } + void readStrAndAppend(PaddedPODArray & str) { reader->readStringAndAppend(str); } + + template + DestType parseFromStr(const std::string_view & str) const + { + try + { + DestType result; + ReadBufferFromMemory buf(str.data(), str.length()); + readText(result, buf); + return result; + } + catch (...) 
+ { + cannotConvertValue(str, "String", TypeName::get()); + } + } + + template + DestType castNumber(SrcType value) const + { + if constexpr (std::is_same_v) + return value; + DestType result; + try + { + /// TODO: use accurate::convertNumeric() maybe? + result = boost::numeric_cast(value); + } + catch (boost::numeric::bad_numeric_cast &) + { + cannotConvertValue(toString(value), TypeName::get(), TypeName::get()); + } + return result; + } + + [[noreturn]] void cannotConvertValue(const std::string_view & src_value, const std::string_view & src_type_name, const std::string_view & dest_type_name) const + { + throw Exception( + "Could not convert value '" + String{src_value} + "' from type " + String{src_type_name} + " to type " + String{dest_type_name} + + " while " + (reader ? "reading" : "writing") + " field " + field_descriptor.name(), + ErrorCodes::PROTOBUF_BAD_CAST); + } + + const FieldDescriptor & field_descriptor; + const FieldTypeId field_typeid; + const int field_tag; + ProtobufReader * const reader; + ProtobufWriter * const writer; + ColumnPtr column; + + private: + const bool skip_zero_or_empty; + }; + + + /// Serializes any ColumnVector to a field of any type except TYPE_MESSAGE, TYPE_GROUP. + /// NumberType must be one of the following types: Int8, UInt8, Int16, UInt16, Int32, UInt32, Int64, UInt64, + /// Int128, UInt128, Int256, UInt256, Float32, Float64. + /// And the field's type cannot be TYPE_ENUM if NumberType is Float32 or Float64. + template + class ProtobufSerializerNumber : public ProtobufSerializerSingleValue + { + public: + using ColumnType = ColumnVector; + + ProtobufSerializerNumber(const FieldDescriptor & field_descriptor_, const ProtobufReaderOrWriter & reader_or_writer_) + : ProtobufSerializerSingleValue(field_descriptor_, reader_or_writer_) + { + setFunctions(); + } + + void writeRow(size_t row_num) override + { + const auto & column_vector = assert_cast(*column); + write_function(column_vector.getElement(row_num)); + } + + void readRow(size_t row_num) override + { + NumberType value = read_function(); + auto & column_vector = assert_cast(column->assumeMutableRef()); + if (row_num < column_vector.size()) + column_vector.getElement(row_num) = value; + else + column_vector.insertValue(value); + } + + void insertDefaults(size_t row_num) override + { + auto & column_vector = assert_cast(column->assumeMutableRef()); + if (row_num < column_vector.size()) + return; + column_vector.insertValue(getDefaultNumber()); + } + + private: + void setFunctions() + { + switch (field_typeid) + { + case FieldTypeId::TYPE_INT32: + { + write_function = [this](NumberType value) { writeInt(value); }; + read_function = [this]() -> NumberType { return castNumber(readInt()); }; + default_function = [this]() -> NumberType { return castNumber(field_descriptor.default_value_int32()); }; + break; + } + + case FieldTypeId::TYPE_SINT32: + { + write_function = [this](NumberType value) { writeSInt(value); }; + read_function = [this]() -> NumberType { return castNumber(readSInt()); }; + default_function = [this]() -> NumberType { return castNumber(field_descriptor.default_value_int32()); }; + break; + } + + case FieldTypeId::TYPE_UINT32: + { + write_function = [this](NumberType value) { writeUInt(value); }; + read_function = [this]() -> NumberType { return castNumber(readUInt()); }; + default_function = [this]() -> NumberType { return castNumber(field_descriptor.default_value_uint32()); }; + break; + } + + case FieldTypeId::TYPE_INT64: + { + write_function = [this](NumberType value) { 
writeInt(value); }; + read_function = [this]() -> NumberType { return castNumber(readInt()); }; + default_function = [this]() -> NumberType { return castNumber(field_descriptor.default_value_int64()); }; + break; + } + + case FieldTypeId::TYPE_SINT64: + { + write_function = [this](NumberType value) { writeSInt(value); }; + read_function = [this]() -> NumberType { return castNumber(readSInt()); }; + default_function = [this]() -> NumberType { return castNumber(field_descriptor.default_value_int64()); }; + break; + } + + case FieldTypeId::TYPE_UINT64: + { + write_function = [this](NumberType value) { writeUInt(value); }; + read_function = [this]() -> NumberType { return castNumber(readUInt()); }; + default_function = [this]() -> NumberType { return castNumber(field_descriptor.default_value_uint64()); }; + break; + } + + case FieldTypeId::TYPE_FIXED32: + { + write_function = [this](NumberType value) { writeFixed(value); }; + read_function = [this]() -> NumberType { return castNumber(readFixed()); }; + default_function = [this]() -> NumberType { return castNumber(field_descriptor.default_value_uint32()); }; + break; + } + + case FieldTypeId::TYPE_SFIXED32: + { + write_function = [this](NumberType value) { writeFixed(value); }; + read_function = [this]() -> NumberType { return castNumber(readFixed()); }; + default_function = [this]() -> NumberType { return castNumber(field_descriptor.default_value_int32()); }; + break; + } + + case FieldTypeId::TYPE_FIXED64: + { + write_function = [this](NumberType value) { writeFixed(value); }; + read_function = [this]() -> NumberType { return castNumber(readFixed()); }; + default_function = [this]() -> NumberType { return castNumber(field_descriptor.default_value_uint64()); }; + break; + } + + case FieldTypeId::TYPE_SFIXED64: + { + write_function = [this](NumberType value) { writeFixed(value); }; + read_function = [this]() -> NumberType { return castNumber(readFixed()); }; + default_function = [this]() -> NumberType { return castNumber(field_descriptor.default_value_int64()); }; + break; + } + + case FieldTypeId::TYPE_FLOAT: + { + write_function = [this](NumberType value) { writeFixed(value); }; + read_function = [this]() -> NumberType { return castNumber(readFixed()); }; + default_function = [this]() -> NumberType { return castNumber(field_descriptor.default_value_float()); }; + break; + } + + case FieldTypeId::TYPE_DOUBLE: + { + write_function = [this](NumberType value) { writeFixed(value); }; + read_function = [this]() -> NumberType { return castNumber(readFixed()); }; + default_function = [this]() -> NumberType { return castNumber(field_descriptor.default_value_double()); }; + break; + } + + case FieldTypeId::TYPE_BOOL: + { + write_function = [this](NumberType value) + { + if (value == 0) + writeUInt(0); + else if (value == 1) + writeUInt(1); + else + cannotConvertValue(toString(value), TypeName::get(), field_descriptor.type_name()); + }; + + read_function = [this]() -> NumberType + { + UInt64 u64 = readUInt(); + if (u64 < 2) + return static_cast(u64); + else + cannotConvertValue(toString(u64), field_descriptor.type_name(), TypeName::get()); + }; + + default_function = [this]() -> NumberType { return static_cast(field_descriptor.default_value_bool()); }; + break; + } + + case FieldTypeId::TYPE_STRING: + case FieldTypeId::TYPE_BYTES: + { + write_function = [this](NumberType value) + { + WriteBufferFromString buf{text_buffer}; + writeText(value, buf); + buf.finalize(); + writeStr(text_buffer); + }; + + read_function = [this]() -> NumberType + { + 
readStr(text_buffer); + return parseFromStr(text_buffer); + }; + + default_function = [this]() -> NumberType { return parseFromStr(field_descriptor.default_value_string()); }; + break; + } + + case FieldTypeId::TYPE_ENUM: + { + if (std::is_floating_point_v) + failedToSetFunctions(); + + write_function = [this](NumberType value) + { + int number = castNumber(value); + checkProtobufEnumValue(number); + writeInt(number); + }; + + read_function = [this]() -> NumberType { return castNumber(readInt()); }; + default_function = [this]() -> NumberType { return castNumber(field_descriptor.default_value_enum()->number()); }; + break; + } + + default: + failedToSetFunctions(); + } + } + + [[noreturn]] void failedToSetFunctions() const + { + throw Exception( + "The field " + quoteString(field_descriptor.full_name()) + " has an incompatible type " + field_descriptor.type_name() + + " for serialization of the data type " + quoteString(TypeName::get()), + ErrorCodes::DATA_TYPE_INCOMPATIBLE_WITH_PROTOBUF_FIELD); + } + + NumberType getDefaultNumber() + { + if (!default_number) + default_number = default_function(); + return *default_number; + } + + void checkProtobufEnumValue(int value) const + { + const auto * enum_value_descriptor = field_descriptor.enum_type()->FindValueByNumber(value); + if (!enum_value_descriptor) + cannotConvertValue(toString(value), TypeName::get(), field_descriptor.type_name()); + } + + protected: + std::function write_function; + std::function read_function; + std::function default_function; + String text_buffer; + + private: + std::optional default_number; + }; + + + /// Serializes ColumnString or ColumnFixedString to a field of any type except TYPE_MESSAGE, TYPE_GROUP. + template + class ProtobufSerializerString : public ProtobufSerializerSingleValue + { + public: + using ColumnType = std::conditional_t; + using StringDataType = std::conditional_t; + + ProtobufSerializerString( + const StringDataType & string_data_type_, + const google::protobuf::FieldDescriptor & field_descriptor_, + const ProtobufReaderOrWriter & reader_or_writer_) + : ProtobufSerializerSingleValue(field_descriptor_, reader_or_writer_) + { + static_assert(is_fixed_string, "This constructor for FixedString only"); + n = string_data_type_.getN(); + setFunctions(); + prepareEnumMapping(); + } + + ProtobufSerializerString( + const google::protobuf::FieldDescriptor & field_descriptor_, const ProtobufReaderOrWriter & reader_or_writer_) + : ProtobufSerializerSingleValue(field_descriptor_, reader_or_writer_) + { + static_assert(!is_fixed_string, "This constructor for String only"); + setFunctions(); + prepareEnumMapping(); + } + + void writeRow(size_t row_num) override + { + const auto & column_string = assert_cast(*column); + write_function(std::string_view{column_string.getDataAt(row_num)}); + } + + void readRow(size_t row_num) override + { + auto & column_string = assert_cast(column->assumeMutableRef()); + const size_t old_size = column_string.size(); + typename ColumnType::Chars & data = column_string.getChars(); + const size_t old_data_size = data.size(); + + if (row_num < old_size) + { + text_buffer.clear(); + read_function(text_buffer); + } + else + { + try + { + read_function(data); + } + catch (...) 
+ { + data.resize_assume_reserved(old_data_size); + throw; + } + } + + if constexpr (is_fixed_string) + { + if (row_num < old_size) + { + ColumnFixedString::alignStringLength(text_buffer, n, 0); + memcpy(data.data() + row_num * n, text_buffer.data(), n); + } + else + ColumnFixedString::alignStringLength(data, n, old_data_size); + } + else + { + if (row_num < old_size) + { + if (row_num != old_size - 1) + throw Exception("Cannot replace a string in the middle of ColumnString", ErrorCodes::LOGICAL_ERROR); + column_string.popBack(1); + } + try + { + data.push_back(0 /* terminating zero */); + column_string.getOffsets().push_back(data.size()); + } + catch (...) + { + data.resize_assume_reserved(old_data_size); + column_string.getOffsets().resize_assume_reserved(old_size); + throw; + } + } + } + + void insertDefaults(size_t row_num) override + { + auto & column_string = assert_cast(column->assumeMutableRef()); + const size_t old_size = column_string.size(); + if (row_num < old_size) + return; + + const auto & default_str = getDefaultString(); + typename ColumnType::Chars & data = column_string.getChars(); + const size_t old_data_size = data.size(); + try + { + data.insert(default_str.data(), default_str.data() + default_str.size()); + } + catch (...) + { + data.resize_assume_reserved(old_data_size); + throw; + } + + if constexpr (!is_fixed_string) + { + try + { + data.push_back(0 /* terminating zero */); + column_string.getOffsets().push_back(data.size()); + } + catch (...) + { + data.resize_assume_reserved(old_data_size); + column_string.getOffsets().resize_assume_reserved(old_size); + throw; + } + } + } + + private: + void setFunctions() + { + switch (field_typeid) + { + case FieldTypeId::TYPE_INT32: + { + write_function = [this](const std::string_view & str) { writeInt(parseFromStr(str)); }; + read_function = [this](PaddedPODArray & str) { toStringAppend(readInt(), str); }; + default_function = [this]() -> String { return toString(field_descriptor.default_value_int32()); }; + break; + } + + case FieldTypeId::TYPE_SINT32: + { + write_function = [this](const std::string_view & str) { writeSInt(parseFromStr(str)); }; + read_function = [this](PaddedPODArray & str) { toStringAppend(readSInt(), str); }; + default_function = [this]() -> String { return toString(field_descriptor.default_value_int32()); }; + break; + } + + case FieldTypeId::TYPE_UINT32: + { + write_function = [this](const std::string_view & str) { writeUInt(parseFromStr(str)); }; + read_function = [this](PaddedPODArray & str) { toStringAppend(readUInt(), str); }; + default_function = [this]() -> String { return toString(field_descriptor.default_value_uint32()); }; + break; + } + + case FieldTypeId::TYPE_INT64: + { + write_function = [this](const std::string_view & str) { writeInt(parseFromStr(str)); }; + read_function = [this](PaddedPODArray & str) { toStringAppend(readInt(), str); }; + default_function = [this]() -> String { return toString(field_descriptor.default_value_int64()); }; + break; + } + + case FieldTypeId::TYPE_SINT64: + { + write_function = [this](const std::string_view & str) { writeSInt(parseFromStr(str)); }; + read_function = [this](PaddedPODArray & str) { toStringAppend(readSInt(), str); }; + default_function = [this]() -> String { return toString(field_descriptor.default_value_int64()); }; + break; + } + + case FieldTypeId::TYPE_UINT64: + { + write_function = [this](const std::string_view & str) { writeUInt(parseFromStr(str)); }; + read_function = [this](PaddedPODArray & str) { toStringAppend(readUInt(), str); }; + 
default_function = [this]() -> String { return toString(field_descriptor.default_value_uint64()); }; + break; + } + + case FieldTypeId::TYPE_FIXED32: + { + write_function = [this](const std::string_view & str) { writeFixed(parseFromStr(str)); }; + read_function = [this](PaddedPODArray & str) { toStringAppend(readFixed(), str); }; + default_function = [this]() -> String { return toString(field_descriptor.default_value_uint32()); }; + break; + } + + case FieldTypeId::TYPE_SFIXED32: + { + write_function = [this](const std::string_view & str) { writeFixed(parseFromStr(str)); }; + read_function = [this](PaddedPODArray & str) { toStringAppend(readFixed(), str); }; + default_function = [this]() -> String { return toString(field_descriptor.default_value_int32()); }; + break; + } + + case FieldTypeId::TYPE_FIXED64: + { + write_function = [this](const std::string_view & str) { writeFixed(parseFromStr(str)); }; + read_function = [this](PaddedPODArray & str) { toStringAppend(readFixed(), str); }; + default_function = [this]() -> String { return toString(field_descriptor.default_value_uint64()); }; + break; + } + + case FieldTypeId::TYPE_SFIXED64: + { + write_function = [this](const std::string_view & str) { writeFixed(parseFromStr(str)); }; + read_function = [this](PaddedPODArray & str) { toStringAppend(readFixed(), str); }; + default_function = [this]() -> String { return toString(field_descriptor.default_value_int64()); }; + break; + } + + case FieldTypeId::TYPE_FLOAT: + { + write_function = [this](const std::string_view & str) { writeFixed(parseFromStr(str)); }; + read_function = [this](PaddedPODArray & str) { toStringAppend(readFixed(), str); }; + default_function = [this]() -> String { return toString(field_descriptor.default_value_float()); }; + break; + } + + case FieldTypeId::TYPE_DOUBLE: + { + write_function = [this](const std::string_view & str) { writeFixed(parseFromStr(str)); }; + read_function = [this](PaddedPODArray & str) { toStringAppend(readFixed(), str); }; + default_function = [this]() -> String { return toString(field_descriptor.default_value_double()); }; + break; + } + + case FieldTypeId::TYPE_BOOL: + { + write_function = [this](const std::string_view & str) + { + if (str == "true") + writeUInt(1); + else if (str == "false") + writeUInt(0); + else + cannotConvertValue(str, "String", field_descriptor.type_name()); + }; + + read_function = [this](PaddedPODArray & str) + { + UInt64 u64 = readUInt(); + if (u64 < 2) + { + std::string_view ref(u64 ? "true" : "false"); + str.insert(ref.data(), ref.data() + ref.length()); + } + else + cannotConvertValue(toString(u64), field_descriptor.type_name(), "String"); + }; + + default_function = [this]() -> String + { + return field_descriptor.default_value_bool() ? 
"true" : "false"; + }; + break; + } + + case FieldTypeId::TYPE_STRING: + case FieldTypeId::TYPE_BYTES: + { + write_function = [this](const std::string_view & str) { writeStr(str); }; + read_function = [this](PaddedPODArray & str) { readStrAndAppend(str); }; + default_function = [this]() -> String { return field_descriptor.default_value_string(); }; + break; + } + + case FieldTypeId::TYPE_ENUM: + { + write_function = [this](const std::string_view & str) { writeInt(stringToProtobufEnumValue(str)); }; + read_function = [this](PaddedPODArray & str) { protobufEnumValueToStringAppend(readInt(), str); }; + default_function = [this]() -> String { return field_descriptor.default_value_enum()->name(); }; + break; + } + + default: + failedToSetFunctions(); + } + } + + [[noreturn]] void failedToSetFunctions() + { + throw Exception( + "The field " + quoteString(field_descriptor.full_name()) + " has an incompatible type " + field_descriptor.type_name() + + " for serialization of the data type " + quoteString(is_fixed_string ? "FixedString" : "String"), + ErrorCodes::DATA_TYPE_INCOMPATIBLE_WITH_PROTOBUF_FIELD); + } + + const PaddedPODArray & getDefaultString() + { + if (!default_string) + { + PaddedPODArray arr; + auto str = default_function(); + arr.insert(str.data(), str.data() + str.size()); + if constexpr (is_fixed_string) + ColumnFixedString::alignStringLength(arr, n, 0); + default_string = std::move(arr); + } + return *default_string; + } + + template + void toStringAppend(NumberType value, PaddedPODArray & str) + { + WriteBufferFromVector buf{str, WriteBufferFromVector>::AppendModeTag{}}; + writeText(value, buf); + } + + void prepareEnumMapping() + { + if ((field_typeid == google::protobuf::FieldDescriptor::TYPE_ENUM) && writer) + { + const auto & enum_descriptor = *field_descriptor.enum_type(); + for (int i = 0; i != enum_descriptor.value_count(); ++i) + { + const auto & enum_value_descriptor = *enum_descriptor.value(i); + string_to_protobuf_enum_value_map.emplace(enum_value_descriptor.name(), enum_value_descriptor.number()); + } + } + } + + int stringToProtobufEnumValue(const std::string_view & str) const + { + auto it = string_to_protobuf_enum_value_map.find(str); + if (it == string_to_protobuf_enum_value_map.end()) + cannotConvertValue(str, "String", field_descriptor.type_name()); + return it->second; + } + + std::string_view protobufEnumValueToString(int value) const + { + const auto * enum_value_descriptor = field_descriptor.enum_type()->FindValueByNumber(value); + if (!enum_value_descriptor) + cannotConvertValue(toString(value), field_descriptor.type_name(), "String"); + return enum_value_descriptor->name(); + } + + void protobufEnumValueToStringAppend(int value, PaddedPODArray & str) const + { + auto name = protobufEnumValueToString(value); + str.insert(name.data(), name.data() + name.length()); + } + + size_t n = 0; + std::function write_function; + std::function &)> read_function; + std::function default_function; + std::unordered_map string_to_protobuf_enum_value_map; + PaddedPODArray text_buffer; + std::optional> default_string; + }; + + + /// Serializes ColumnVector containing enum values to a field of any type + /// except TYPE_MESSAGE, TYPE_GROUP, TYPE_FLOAT, TYPE_DOUBLE, TYPE_BOOL. + /// NumberType can be either Int8 or Int16. 
+ template + class ProtobufSerializerEnum : public ProtobufSerializerNumber + { + public: + using ColumnType = ColumnVector; + using EnumDataType = DataTypeEnum; + using BaseClass = ProtobufSerializerNumber; + + ProtobufSerializerEnum( + const std::shared_ptr & enum_data_type_, + const FieldDescriptor & field_descriptor_, + const ProtobufReaderOrWriter & reader_or_writer_) + : BaseClass(field_descriptor_, reader_or_writer_), enum_data_type(enum_data_type_) + { + assert(enum_data_type); + setFunctions(); + prepareEnumMapping(); + } + + private: + void setFunctions() + { + switch (this->field_typeid) + { + case FieldTypeId::TYPE_INT32: + case FieldTypeId::TYPE_SINT32: + case FieldTypeId::TYPE_UINT32: + case FieldTypeId::TYPE_INT64: + case FieldTypeId::TYPE_SINT64: + case FieldTypeId::TYPE_UINT64: + case FieldTypeId::TYPE_FIXED32: + case FieldTypeId::TYPE_SFIXED32: + case FieldTypeId::TYPE_FIXED64: + case FieldTypeId::TYPE_SFIXED64: + { + auto base_read_function = this->read_function; + this->read_function = [this, base_read_function]() -> NumberType + { + NumberType value = base_read_function(); + checkEnumDataTypeValue(value); + return value; + }; + + auto base_default_function = this->default_function; + this->default_function = [this, base_default_function]() -> NumberType + { + auto value = base_default_function(); + checkEnumDataTypeValue(value); + return value; + }; + break; + } + + case FieldTypeId::TYPE_STRING: + case FieldTypeId::TYPE_BYTES: + { + this->write_function = [this](NumberType value) + { + writeStr(enumDataTypeValueToString(value)); + }; + + this->read_function = [this]() -> NumberType + { + readStr(this->text_buffer); + return stringToEnumDataTypeValue(this->text_buffer); + }; + + this->default_function = [this]() -> NumberType + { + return stringToEnumDataTypeValue(this->field_descriptor.default_value_string()); + }; + break; + } + + case FieldTypeId::TYPE_ENUM: + { + this->write_function = [this](NumberType value) { writeInt(enumDataTypeValueToProtobufEnumValue(value)); }; + this->read_function = [this]() -> NumberType { return protobufEnumValueToEnumDataTypeValue(readInt()); }; + this->default_function = [this]() -> NumberType { return protobufEnumValueToEnumDataTypeValue(this->field_descriptor.default_value_enum()->number()); }; + break; + } + + default: + failedToSetFunctions(); + } + } + + [[noreturn]] void failedToSetFunctions() + { + throw Exception( + "The field " + quoteString(this->field_descriptor.full_name()) + " has an incompatible type " + this->field_descriptor.type_name() + + " for serialization of the data type " + quoteString(enum_data_type->getName()), + ErrorCodes::DATA_TYPE_INCOMPATIBLE_WITH_PROTOBUF_FIELD); + } + + void checkEnumDataTypeValue(NumberType value) + { + enum_data_type->findByValue(value); /// Throws an exception if the value isn't defined in the DataTypeEnum. + } + + std::string_view enumDataTypeValueToString(NumberType value) const { return std::string_view{enum_data_type->getNameForValue(value)}; } + NumberType stringToEnumDataTypeValue(const String & str) const { return enum_data_type->getValue(str); } + + void prepareEnumMapping() + { + if (this->field_typeid != FieldTypeId::TYPE_ENUM) + return; + + const auto & enum_descriptor = *this->field_descriptor.enum_type(); + + /// We have two mappings: + /// enum_data_type: "string->NumberType" and protobuf_enum: string->int". 
+ /// And here we want to make from those two mapping a new mapping "NumberType->int" (if we're writing protobuf data), + /// or "int->NumberType" (if we're reading protobuf data). + + auto add_to_mapping = [&](NumberType enum_data_type_value, int protobuf_enum_value) + { + if (this->writer) + enum_data_type_value_to_protobuf_enum_value_map.emplace(enum_data_type_value, protobuf_enum_value); + else + protobuf_enum_value_to_enum_data_type_value_map.emplace(protobuf_enum_value, enum_data_type_value); + }; + + auto iless = [](const std::string_view & s1, const std::string_view & s2) { return ColumnNameWithProtobufFieldNameComparator::less(s1, s2); }; + boost::container::flat_map string_to_protobuf_enum_value_map; + typename decltype(string_to_protobuf_enum_value_map)::sequence_type string_to_protobuf_enum_value_seq; + for (int i : ext::range(enum_descriptor.value_count())) + string_to_protobuf_enum_value_seq.emplace_back(enum_descriptor.value(i)->name(), enum_descriptor.value(i)->number()); + string_to_protobuf_enum_value_map.adopt_sequence(std::move(string_to_protobuf_enum_value_seq)); + + std::vector not_found_by_name_values; + not_found_by_name_values.reserve(enum_data_type->getValues().size()); + + /// Find mapping between enum_data_type and protobuf_enum by name (case insensitively), + /// i.e. we add to the mapping + /// NumberType(enum_data_type) -> "NAME"(enum_data_type) -> + /// -> "NAME"(protobuf_enum, same name) -> int(protobuf_enum) + for (const auto & [name, value] : enum_data_type->getValues()) + { + auto it = string_to_protobuf_enum_value_map.find(name); + if (it != string_to_protobuf_enum_value_map.end()) + add_to_mapping(value, it->second); + else + not_found_by_name_values.push_back(value); + } + + if (!not_found_by_name_values.empty()) + { + /// Find mapping between two enum_data_type and protobuf_enum by value. + /// If the same value has different names in enum_data_type and protobuf_enum + /// we can still add it to our mapping, i.e. we add to the mapping + /// NumberType(enum_data_type) -> int(protobuf_enum, same value) + for (NumberType value : not_found_by_name_values) + { + if (enum_descriptor.FindValueByNumber(value)) + add_to_mapping(value, value); + } + } + + size_t num_mapped_values = this->writer ? 
enum_data_type_value_to_protobuf_enum_value_map.size() + : protobuf_enum_value_to_enum_data_type_value_map.size(); + + if (!num_mapped_values && !enum_data_type->getValues().empty() && enum_descriptor.value_count()) + { + throw Exception( + "Couldn't find mapping between data type " + enum_data_type->getName() + " and the enum " + quoteString(enum_descriptor.full_name()) + + " in the protobuf schema", + ErrorCodes::DATA_TYPE_INCOMPATIBLE_WITH_PROTOBUF_FIELD); + } + } + + int enumDataTypeValueToProtobufEnumValue(NumberType value) const + { + auto it = enum_data_type_value_to_protobuf_enum_value_map.find(value); + if (it == enum_data_type_value_to_protobuf_enum_value_map.end()) + cannotConvertValue(toString(value), enum_data_type->getName(), this->field_descriptor.type_name()); + return it->second; + } + + NumberType protobufEnumValueToEnumDataTypeValue(int value) const + { + auto it = protobuf_enum_value_to_enum_data_type_value_map.find(value); + if (it == protobuf_enum_value_to_enum_data_type_value_map.end()) + cannotConvertValue(toString(value), this->field_descriptor.type_name(), enum_data_type->getName()); + return it->second; + } + + Int64 readInt() { return ProtobufSerializerSingleValue::readInt(); } + void writeInt(Int64 value) { ProtobufSerializerSingleValue::writeInt(value); } + void writeStr(const std::string_view & str) { ProtobufSerializerSingleValue::writeStr(str); } + void readStr(String & str) { ProtobufSerializerSingleValue::readStr(str); } + [[noreturn]] void cannotConvertValue(const std::string_view & src_value, const std::string_view & src_type_name, const std::string_view & dest_type_name) const { ProtobufSerializerSingleValue::cannotConvertValue(src_value, src_type_name, dest_type_name); } + + const std::shared_ptr enum_data_type; + std::unordered_map enum_data_type_value_to_protobuf_enum_value_map; + std::unordered_map protobuf_enum_value_to_enum_data_type_value_map; + }; + + + /// Serializes a ColumnDecimal to any field except TYPE_MESSAGE, TYPE_GROUP, TYPE_ENUM. + /// DecimalType must be one of the following types: Decimal32, Decimal64, Decimal128, Decimal256, DateTime64. 
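The ProtobufSerializerDecimal that follows converts between the column's scaled-integer representation and the field's plain numeric or textual form through convertToDecimal / DecimalUtils::convertTo. A standalone sketch of the underlying arithmetic (simplified; the real helpers also check precision and overflow):

#include <cmath>
#include <cstdint>
#include <iostream>

// A Decimal is stored as an integer equal to "value * 10^scale".
struct SimpleDecimal64
{
    int64_t value;   // e.g. 1234500
    uint32_t scale;  // e.g. 4, so the pair represents 123.4500
};

double decimalToDouble(const SimpleDecimal64 & d)
{
    return static_cast<double>(d.value) / std::pow(10.0, d.scale);
}

SimpleDecimal64 doubleToDecimal(double x, uint32_t scale)
{
    return {static_cast<int64_t>(std::llround(x * std::pow(10.0, scale))), scale};
}

int main()
{
    SimpleDecimal64 d{1234500, 4};
    std::cout << decimalToDouble(d) << '\n';               // 123.45
    std::cout << doubleToDecimal(123.45, 4).value << '\n'; // 1234500
}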
+ template + class ProtobufSerializerDecimal : public ProtobufSerializerSingleValue + { + public: + using ColumnType = ColumnDecimal; + + ProtobufSerializerDecimal( + const DataTypeDecimalBase & decimal_data_type_, + const FieldDescriptor & field_descriptor_, + const ProtobufReaderOrWriter & reader_or_writer_) + : ProtobufSerializerSingleValue(field_descriptor_, reader_or_writer_) + , precision(decimal_data_type_.getPrecision()) + , scale(decimal_data_type_.getScale()) + { + setFunctions(); + } + + void writeRow(size_t row_num) override + { + const auto & column_decimal = assert_cast(*column); + write_function(column_decimal.getElement(row_num)); + } + + void readRow(size_t row_num) override + { + DecimalType decimal = read_function(); + auto & column_decimal = assert_cast(column->assumeMutableRef()); + if (row_num < column_decimal.size()) + column_decimal.getElement(row_num) = decimal; + else + column_decimal.insertValue(decimal); + } + + void insertDefaults(size_t row_num) override + { + auto & column_decimal = assert_cast(column->assumeMutableRef()); + if (row_num < column_decimal.size()) + return; + column_decimal.insertValue(getDefaultDecimal()); + } + + private: + void setFunctions() + { + switch (field_typeid) + { + case FieldTypeId::TYPE_INT32: + { + write_function = [this](const DecimalType & decimal) { writeInt(decimalToNumber(decimal)); }; + read_function = [this]() -> DecimalType { return numberToDecimal(readInt()); }; + default_function = [this]() -> DecimalType { return numberToDecimal(field_descriptor.default_value_int32()); }; + break; + } + + case FieldTypeId::TYPE_SINT32: + { + write_function = [this](const DecimalType & decimal) { writeSInt(decimalToNumber(decimal)); }; + read_function = [this]() -> DecimalType { return numberToDecimal(readSInt()); }; + default_function = [this]() -> DecimalType { return numberToDecimal(field_descriptor.default_value_int32()); }; + break; + } + + case FieldTypeId::TYPE_UINT32: + { + write_function = [this](const DecimalType & decimal) { writeUInt(decimalToNumber(decimal)); }; + read_function = [this]() -> DecimalType { return numberToDecimal(readUInt()); }; + default_function = [this]() -> DecimalType { return numberToDecimal(field_descriptor.default_value_uint32()); }; + break; + } + + case FieldTypeId::TYPE_INT64: + { + write_function = [this](const DecimalType & decimal) { writeInt(decimalToNumber(decimal)); }; + read_function = [this]() -> DecimalType { return numberToDecimal(readInt()); }; + default_function = [this]() -> DecimalType { return numberToDecimal(field_descriptor.default_value_int64()); }; + break; + } + + case FieldTypeId::TYPE_SINT64: + { + write_function = [this](const DecimalType & decimal) { writeSInt(decimalToNumber(decimal)); }; + read_function = [this]() -> DecimalType { return numberToDecimal(readSInt()); }; + default_function = [this]() -> DecimalType { return numberToDecimal(field_descriptor.default_value_int64()); }; + break; + } + + case FieldTypeId::TYPE_UINT64: + { + write_function = [this](const DecimalType & decimal) { writeUInt(decimalToNumber(decimal)); }; + read_function = [this]() -> DecimalType { return numberToDecimal(readUInt()); }; + default_function = [this]() -> DecimalType { return numberToDecimal(field_descriptor.default_value_uint64()); }; + break; + } + + case FieldTypeId::TYPE_FIXED32: + { + write_function = [this](const DecimalType & decimal) { writeFixed(decimalToNumber(decimal)); }; + read_function = [this]() -> DecimalType { return numberToDecimal(readFixed()); }; + default_function = 
[this]() -> DecimalType { return numberToDecimal(field_descriptor.default_value_uint32()); }; + break; + } + + case FieldTypeId::TYPE_SFIXED32: + { + write_function = [this](const DecimalType & decimal) { writeFixed(decimalToNumber(decimal)); }; + read_function = [this]() -> DecimalType { return numberToDecimal(readFixed()); }; + default_function = [this]() -> DecimalType { return numberToDecimal(field_descriptor.default_value_int32()); }; + break; + } + + case FieldTypeId::TYPE_FIXED64: + { + write_function = [this](const DecimalType & decimal) { writeFixed(decimalToNumber(decimal)); }; + read_function = [this]() -> DecimalType { return numberToDecimal(readFixed()); }; + default_function = [this]() -> DecimalType { return numberToDecimal(field_descriptor.default_value_uint64()); }; + break; + } + + case FieldTypeId::TYPE_SFIXED64: + { + write_function = [this](const DecimalType & decimal) { writeFixed(decimalToNumber(decimal)); }; + read_function = [this]() -> DecimalType { return numberToDecimal(readFixed()); }; + default_function = [this]() -> DecimalType { return numberToDecimal(field_descriptor.default_value_int64()); }; + break; + } + + case FieldTypeId::TYPE_FLOAT: + { + write_function = [this](const DecimalType & decimal) { writeFixed(decimalToNumber(decimal)); }; + read_function = [this]() -> DecimalType { return numberToDecimal(readFixed()); }; + default_function = [this]() -> DecimalType { return numberToDecimal(field_descriptor.default_value_float()); }; + break; + } + + case FieldTypeId::TYPE_DOUBLE: + { + write_function = [this](const DecimalType & decimal) { writeFixed(decimalToNumber(decimal)); }; + read_function = [this]() -> DecimalType { return numberToDecimal(readFixed()); }; + default_function = [this]() -> DecimalType { return numberToDecimal(field_descriptor.default_value_double()); }; + break; + } + + case FieldTypeId::TYPE_BOOL: + { + if (std::is_same_v) + failedToSetFunctions(); + else + { + write_function = [this](const DecimalType & decimal) + { + if (decimal.value == 0) + writeInt(0); + else if (DecimalComparison::compare(decimal, 1, scale, 0)) + writeInt(1); + else + { + WriteBufferFromOwnString buf; + writeText(decimal, scale, buf); + cannotConvertValue(buf.str(), TypeName::get(), field_descriptor.type_name()); + } + }; + + read_function = [this]() -> DecimalType + { + UInt64 u64 = readUInt(); + if (u64 < 2) + return numberToDecimal(static_cast(u64 != 0)); + else + cannotConvertValue(toString(u64), field_descriptor.type_name(), TypeName::get()); + }; + + default_function = [this]() -> DecimalType + { + return numberToDecimal(static_cast(field_descriptor.default_value_bool())); + }; + } + break; + } + + case FieldTypeId::TYPE_STRING: + case FieldTypeId::TYPE_BYTES: + { + write_function = [this](const DecimalType & decimal) + { + decimalToString(decimal, text_buffer); + writeStr(text_buffer); + }; + + read_function = [this]() -> DecimalType + { + readStr(text_buffer); + return stringToDecimal(text_buffer); + }; + + default_function = [this]() -> DecimalType { return stringToDecimal(field_descriptor.default_value_string()); }; + break; + } + + default: + failedToSetFunctions(); + } + } + + [[noreturn]] void failedToSetFunctions() + { + throw Exception( + "The field " + quoteString(field_descriptor.full_name()) + " has an incompatible type " + field_descriptor.type_name() + + " for serialization of the data type " + quoteString(TypeName::get()), + ErrorCodes::DATA_TYPE_INCOMPATIBLE_WITH_PROTOBUF_FIELD); + } + + DecimalType getDefaultDecimal() + { + if 
(!default_decimal) + default_decimal = default_function(); + return *default_decimal; + } + + template + DecimalType numberToDecimal(NumberType value) const + { + return convertToDecimal, DataTypeDecimal>(value, scale); + } + + template + NumberType decimalToNumber(const DecimalType & decimal) const + { + return DecimalUtils::convertTo(decimal, scale); + } + + void decimalToString(const DecimalType & decimal, String & str) const + { + WriteBufferFromString buf{str}; + if constexpr (std::is_same_v) + writeDateTimeText(decimal, scale, buf); + else + writeText(decimal, scale, buf); + } + + DecimalType stringToDecimal(const String & str) const + { + ReadBufferFromString buf(str); + DecimalType decimal{0}; + if constexpr (std::is_same_v) + readDateTime64Text(decimal, scale, buf); + else + DataTypeDecimal::readText(decimal, buf, precision, scale); + return decimal; + } + + const UInt32 precision; + const UInt32 scale; + std::function write_function; + std::function read_function; + std::function default_function; + std::optional default_decimal; + String text_buffer; + }; + + using ProtobufSerializerDateTime64 = ProtobufSerializerDecimal; + + + /// Serializes a ColumnVector containing dates to a field of any type except TYPE_MESSAGE, TYPE_GROUP, TYPE_BOOL, TYPE_ENUM. + class ProtobufSerializerDate : public ProtobufSerializerNumber + { + public: + ProtobufSerializerDate( + const FieldDescriptor & field_descriptor_, + const ProtobufReaderOrWriter & reader_or_writer_) + : ProtobufSerializerNumber(field_descriptor_, reader_or_writer_) + { + setFunctions(); + } + + private: + void setFunctions() + { + switch (field_typeid) + { + case FieldTypeId::TYPE_INT32: + case FieldTypeId::TYPE_SINT32: + case FieldTypeId::TYPE_UINT32: + case FieldTypeId::TYPE_INT64: + case FieldTypeId::TYPE_SINT64: + case FieldTypeId::TYPE_UINT64: + case FieldTypeId::TYPE_FIXED32: + case FieldTypeId::TYPE_SFIXED32: + case FieldTypeId::TYPE_FIXED64: + case FieldTypeId::TYPE_SFIXED64: + case FieldTypeId::TYPE_FLOAT: + case FieldTypeId::TYPE_DOUBLE: + break; /// already set in ProtobufSerializerNumber::setFunctions(). + + case FieldTypeId::TYPE_STRING: + case FieldTypeId::TYPE_BYTES: + { + write_function = [this](UInt16 value) + { + dateToString(static_cast(value), text_buffer); + writeStr(text_buffer); + }; + + read_function = [this]() -> UInt16 + { + readStr(text_buffer); + return stringToDate(text_buffer); + }; + + default_function = [this]() -> UInt16 { return stringToDate(field_descriptor.default_value_string()); }; + break; + } + + default: + failedToSetFunctions(); + } + } + + static void dateToString(DayNum date, String & str) + { + WriteBufferFromString buf{str}; + writeText(date, buf); + } + + static DayNum stringToDate(const String & str) + { + DayNum date; + ReadBufferFromString buf{str}; + readDateText(date, buf); + return date; + } + + [[noreturn]] void failedToSetFunctions() + { + throw Exception( + "The field " + quoteString(field_descriptor.full_name()) + " has an incompatible type " + field_descriptor.type_name() + + " for serialization of the data type 'Date'", + ErrorCodes::DATA_TYPE_INCOMPATIBLE_WITH_PROTOBUF_FIELD); + } + }; + + + /// Serializes a ColumnVector containing dates to a field of any type except TYPE_MESSAGE, TYPE_GROUP, TYPE_BOOL, TYPE_ENUM. 
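The DateTime serializer that follows treats a ClickHouse DateTime as a UInt32 Unix timestamp: numeric protobuf fields receive the raw number, while string/bytes fields receive "YYYY-MM-DD hh:mm:ss" text. A standalone sketch of the text conversion; the patch uses writeDateTimeText/readDateTimeText with the server time zone, this sketch uses UTC and a hypothetical timestamp:

#include <cstdio>
#include <ctime>

int main()
{
    std::time_t ts = 1609459200;          // 2021-01-01 00:00:00 UTC
    std::tm tm_utc = *std::gmtime(&ts);   // sketch only; std::gmtime is not thread-safe
    char buf[32];
    std::strftime(buf, sizeof(buf), "%Y-%m-%d %H:%M:%S", &tm_utc);
    std::puts(buf);                       // prints 2021-01-01 00:00:00
}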
+ class ProtobufSerializerDateTime : public ProtobufSerializerNumber + { + public: + ProtobufSerializerDateTime( + const FieldDescriptor & field_descriptor_, const ProtobufReaderOrWriter & reader_or_writer_) + : ProtobufSerializerNumber(field_descriptor_, reader_or_writer_) + { + setFunctions(); + } + + protected: + void setFunctions() + { + switch (field_typeid) + { + case FieldTypeId::TYPE_INT32: + case FieldTypeId::TYPE_SINT32: + case FieldTypeId::TYPE_UINT32: + case FieldTypeId::TYPE_INT64: + case FieldTypeId::TYPE_SINT64: + case FieldTypeId::TYPE_UINT64: + case FieldTypeId::TYPE_FIXED32: + case FieldTypeId::TYPE_SFIXED32: + case FieldTypeId::TYPE_FIXED64: + case FieldTypeId::TYPE_SFIXED64: + case FieldTypeId::TYPE_FLOAT: + case FieldTypeId::TYPE_DOUBLE: + break; /// already set in ProtobufSerializerNumber::setFunctions(). + + case FieldTypeId::TYPE_STRING: + case FieldTypeId::TYPE_BYTES: + { + write_function = [this](UInt32 value) + { + dateTimeToString(value, text_buffer); + writeStr(text_buffer); + }; + + read_function = [this]() -> UInt32 + { + readStr(text_buffer); + return stringToDateTime(text_buffer); + }; + + default_function = [this]() -> UInt32 { return stringToDateTime(field_descriptor.default_value_string()); }; + break; + } + + default: + failedToSetFunctions(); + } + } + + static void dateTimeToString(time_t tm, String & str) + { + WriteBufferFromString buf{str}; + writeDateTimeText(tm, buf); + } + + static time_t stringToDateTime(const String & str) + { + ReadBufferFromString buf{str}; + time_t tm = 0; + readDateTimeText(tm, buf); + return tm; + } + + [[noreturn]] void failedToSetFunctions() + { + throw Exception( + "The field " + quoteString(field_descriptor.full_name()) + " has an incompatible type " + field_descriptor.type_name() + + " for serialization of the data type 'DateTime'", + ErrorCodes::DATA_TYPE_INCOMPATIBLE_WITH_PROTOBUF_FIELD); + } + }; + + + /// Serializes a ColumnVector containing UUIDs to a field of type TYPE_STRING or TYPE_BYTES. + class ProtobufSerializerUUID : public ProtobufSerializerNumber + { + public: + ProtobufSerializerUUID( + const google::protobuf::FieldDescriptor & field_descriptor_, + const ProtobufReaderOrWriter & reader_or_writer_) + : ProtobufSerializerNumber(field_descriptor_, reader_or_writer_) + { + setFunctions(); + } + + private: + void setFunctions() + { + if ((field_typeid != FieldTypeId::TYPE_STRING) && (field_typeid != FieldTypeId::TYPE_BYTES)) + { + throw Exception( + "The field " + quoteString(field_descriptor.full_name()) + " has an incompatible type " + field_descriptor.type_name() + + " for serialization of the data type UUID", + ErrorCodes::DATA_TYPE_INCOMPATIBLE_WITH_PROTOBUF_FIELD); + } + + write_function = [this](UInt128 value) + { + uuidToString(static_cast(value), text_buffer); + writeStr(text_buffer); + }; + + read_function = [this]() -> UInt128 + { + readStr(text_buffer); + return stringToUUID(text_buffer); + }; + + default_function = [this]() -> UInt128 { return stringToUUID(field_descriptor.default_value_string()); }; + } + + static void uuidToString(const UUID & uuid, String & str) + { + WriteBufferFromString buf{str}; + writeText(uuid, buf); + } + + static UUID stringToUUID(const String & str) + { + ReadBufferFromString buf{str}; + UUID uuid; + readUUIDText(uuid, buf); + return uuid; + } + }; + + + using ProtobufSerializerInterval = ProtobufSerializerNumber; + + + /// Serializes a ColumnAggregateFunction to a field of type TYPE_STRING or TYPE_BYTES. 
+ class ProtobufSerializerAggregateFunction : public ProtobufSerializerSingleValue + { + public: + ProtobufSerializerAggregateFunction( + const std::shared_ptr & aggregate_function_data_type_, + const google::protobuf::FieldDescriptor & field_descriptor_, + const ProtobufReaderOrWriter & reader_or_writer_) + : ProtobufSerializerSingleValue(field_descriptor_, reader_or_writer_) + , aggregate_function_data_type(aggregate_function_data_type_) + , aggregate_function(aggregate_function_data_type->getFunction()) + { + if ((field_typeid != FieldTypeId::TYPE_STRING) && (field_typeid != FieldTypeId::TYPE_BYTES)) + { + throw Exception( + "The field " + quoteString(field_descriptor.full_name()) + " has an incompatible type " + field_descriptor.type_name() + + " for serialization of the data type " + quoteString(aggregate_function_data_type->getName()), + ErrorCodes::DATA_TYPE_INCOMPATIBLE_WITH_PROTOBUF_FIELD); + } + } + + void writeRow(size_t row_num) override + { + const auto & column_af = assert_cast(*column); + dataToString(column_af.getData()[row_num], text_buffer); + writeStr(text_buffer); + } + + void readRow(size_t row_num) override + { + auto & column_af = assert_cast(column->assumeMutableRef()); + Arena & arena = column_af.createOrGetArena(); + AggregateDataPtr data; + readStr(text_buffer); + data = stringToData(text_buffer, arena); + + if (row_num < column_af.size()) + { + auto * old_data = std::exchange(column_af.getData()[row_num], data); + aggregate_function->destroy(old_data); + } + else + column_af.getData().push_back(data); + } + + void insertDefaults(size_t row_num) override + { + auto & column_af = assert_cast(column->assumeMutableRef()); + if (row_num < column_af.size()) + return; + + Arena & arena = column_af.createOrGetArena(); + AggregateDataPtr data = stringToData(field_descriptor.default_value_string(), arena); + column_af.getData().push_back(data); + } + + private: + void dataToString(ConstAggregateDataPtr data, String & str) const + { + WriteBufferFromString buf{str}; + aggregate_function->serialize(data, buf); + } + + AggregateDataPtr stringToData(const String & str, Arena & arena) const + { + size_t size_of_state = aggregate_function->sizeOfData(); + AggregateDataPtr data = arena.alignedAlloc(size_of_state, aggregate_function->alignOfData()); + try + { + aggregate_function->create(data); + ReadBufferFromMemory buf(str.data(), str.length()); + aggregate_function->deserialize(data, buf, &arena); + return data; + } + catch (...) + { + aggregate_function->destroy(data); + throw; + } + } + + const std::shared_ptr aggregate_function_data_type; + const AggregateFunctionPtr aggregate_function; + String text_buffer; + }; + + + /// Serializes a ColumnNullable. 
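Protobuf has no NULL value, so the Nullable serializer that follows maps nullability onto field presence: a NULL row writes nothing at all, and on input a row whose field never arrived is left at the Nullable column's default, which is NULL. A rough standalone model of that behaviour:

#include <cstdint>
#include <iostream>
#include <optional>
#include <vector>

int main()
{
    std::vector<std::optional<int32_t>> column{42, std::nullopt, 7};
    for (const auto & v : column)
    {
        if (v)
            std::cout << "write field: " << *v << '\n';  // non-NULL rows are forwarded to the nested serializer
        else
            std::cout << "skip field (NULL)\n";          // NULL rows simply omit the field
    }
}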
+ class ProtobufSerializerNullable : public ProtobufSerializer + { + public: + explicit ProtobufSerializerNullable(std::unique_ptr nested_serializer_) + : nested_serializer(std::move(nested_serializer_)) + { + } + + void setColumns(const ColumnPtr * columns, [[maybe_unused]] size_t num_columns) override + { + assert(num_columns == 1); + column = columns[0]; + const auto & column_nullable = assert_cast(*column); + ColumnPtr nested_column = column_nullable.getNestedColumnPtr(); + nested_serializer->setColumns(&nested_column, 1); + } + + void setColumns(const MutableColumnPtr * columns, [[maybe_unused]] size_t num_columns) override + { + assert(num_columns == 1); + ColumnPtr column0 = columns[0]->getPtr(); + setColumns(&column0, 1); + } + + void writeRow(size_t row_num) override + { + const auto & column_nullable = assert_cast(*column); + const auto & null_map = column_nullable.getNullMapData(); + if (!null_map[row_num]) + nested_serializer->writeRow(row_num); + } + + void readRow(size_t row_num) override + { + auto & column_nullable = assert_cast(column->assumeMutableRef()); + auto & nested_column = column_nullable.getNestedColumn(); + auto & null_map = column_nullable.getNullMapData(); + size_t old_size = null_map.size(); + + nested_serializer->readRow(row_num); + + if (row_num < old_size) + { + null_map[row_num] = false; + } + else + { + size_t new_size = nested_column.size(); + if (new_size != old_size + 1) + throw Exception("Size of ColumnNullable is unexpected", ErrorCodes::LOGICAL_ERROR); + try + { + null_map.push_back(false); + } + catch (...) + { + nested_column.popBack(1); + throw; + } + } + } + + void insertDefaults(size_t row_num) override + { + auto & column_nullable = assert_cast(column->assumeMutableRef()); + if (row_num < column_nullable.size()) + return; + column_nullable.insertDefault(); + } + + private: + const std::unique_ptr nested_serializer; + ColumnPtr column; + }; + + + /// Serializes a ColumnMap. + class ProtobufSerializerMap : public ProtobufSerializer + { + public: + explicit ProtobufSerializerMap(std::unique_ptr nested_serializer_) + : nested_serializer(std::move(nested_serializer_)) + { + } + + void setColumns(const ColumnPtr * columns, [[maybe_unused]] size_t num_columns) override + { + assert(num_columns == 1); + const auto & column_map = assert_cast(*columns[0]); + ColumnPtr nested_column = column_map.getNestedColumnPtr(); + nested_serializer->setColumns(&nested_column, 1); + } + + void setColumns(const MutableColumnPtr * columns, [[maybe_unused]] size_t num_columns) override + { + assert(num_columns == 1); + ColumnPtr column0 = columns[0]->getPtr(); + setColumns(&column0, 1); + } + + void writeRow(size_t row_num) override { nested_serializer->writeRow(row_num); } + void readRow(size_t row_num) override { nested_serializer->readRow(row_num); } + void insertDefaults(size_t row_num) override { nested_serializer->insertDefaults(row_num); } + + private: + const std::unique_ptr nested_serializer; + }; + + + /// Serializes a ColumnLowCardinality. 
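A LowCardinality column keeps a dictionary of distinct values plus a per-row index into it, so the serializer that follows resolves each row through the index and serializes the dictionary value; reading goes the other way, decoding into a scratch column and inserting via insertFromFullColumn. A standalone sketch of the write-side lookup with plain vectors as toy stand-ins for the dictionary and index columns:

#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

int main()
{
    // Dictionary of unique values and per-row indexes into it,
    // roughly the way ColumnLowCardinality keeps them.
    std::vector<std::string> dictionary = {"", "red", "green", "blue"};
    std::vector<size_t>      indexes    = {1, 1, 3, 2, 1};

    // Write path: writeRow(row) serializes dictionary[indexes[row]].
    for (size_t row = 0; row != indexes.size(); ++row)
        std::cout << "row " << row << " -> " << dictionary[indexes[row]] << '\n';
}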
+ class ProtobufSerializerLowCardinality : public ProtobufSerializer + { + public: + explicit ProtobufSerializerLowCardinality(std::unique_ptr nested_serializer_) + : nested_serializer(std::move(nested_serializer_)) + { + } + + void setColumns(const ColumnPtr * columns, [[maybe_unused]] size_t num_columns) override + { + assert(num_columns == 1); + column = columns[0]; + const auto & column_lc = assert_cast(*column); + ColumnPtr nested_column = column_lc.getDictionary().getNestedColumn(); + nested_serializer->setColumns(&nested_column, 1); + read_value_column_set = false; + } + + void setColumns(const MutableColumnPtr * columns, [[maybe_unused]] size_t num_columns) override + { + assert(num_columns == 1); + ColumnPtr column0 = columns[0]->getPtr(); + setColumns(&column0, 1); + } + + void writeRow(size_t row_num) override + { + const auto & column_lc = assert_cast(*column); + size_t unique_row_number = column_lc.getIndexes().getUInt(row_num); + nested_serializer->writeRow(unique_row_number); + } + + void readRow(size_t row_num) override + { + auto & column_lc = assert_cast(column->assumeMutableRef()); + + if (!read_value_column_set) + { + if (!read_value_column) + { + ColumnPtr nested_column = column_lc.getDictionary().getNestedColumn(); + read_value_column = nested_column->cloneEmpty(); + } + nested_serializer->setColumns(&read_value_column, 1); + read_value_column_set = true; + } + + read_value_column->popBack(read_value_column->size()); + nested_serializer->readRow(0); + + if (row_num < column_lc.size()) + { + if (row_num != column_lc.size() - 1) + throw Exception("Cannot replace an element in the middle of ColumnLowCardinality", ErrorCodes::LOGICAL_ERROR); + column_lc.popBack(1); + } + + column_lc.insertFromFullColumn(*read_value_column, 0); + } + + void insertDefaults(size_t row_num) override + { + auto & column_lc = assert_cast(column->assumeMutableRef()); + if (row_num < column_lc.size()) + return; + + if (!default_value_column) + { + ColumnPtr nested_column = column_lc.getDictionary().getNestedColumn(); + default_value_column = nested_column->cloneEmpty(); + nested_serializer->setColumns(&default_value_column, 1); + nested_serializer->insertDefaults(0); + read_value_column_set = false; + } + + column_lc.insertFromFullColumn(*default_value_column, 0); + } + + private: + const std::unique_ptr nested_serializer; + ColumnPtr column; + MutableColumnPtr read_value_column; + bool read_value_column_set = false; + MutableColumnPtr default_value_column; + }; + + + /// Serializes a ColumnArray to a repeated field. 
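ColumnArray stores a flat data column plus cumulative offsets; row N is the half-open slice [offsets[N-1], offsets[N]) of the data, and the serializer that follows writes each element of that slice as one entry of the repeated field. A standalone sketch of slicing by offsets (ClickHouse's offsets array makes offsets[-1] read as 0; here the first row is handled explicitly):

#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
    // Flat data for three array rows: [10, 20], [], [30, 40, 50]
    std::vector<int>    data    = {10, 20, 30, 40, 50};
    std::vector<size_t> offsets = {2, 2, 5}; // cumulative end positions

    for (size_t row = 0; row != offsets.size(); ++row)
    {
        size_t begin = (row == 0) ? 0 : offsets[row - 1];
        std::cout << "row " << row << ":";
        for (size_t i = begin; i != offsets[row]; ++i)
            std::cout << ' ' << data[i]; // each element becomes one entry of the repeated field
        std::cout << '\n';
    }
}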
+ class ProtobufSerializerArray : public ProtobufSerializer + { + public: + explicit ProtobufSerializerArray(std::unique_ptr element_serializer_) + : element_serializer(std::move(element_serializer_)) + { + } + + void setColumns(const ColumnPtr * columns, [[maybe_unused]] size_t num_columns) override + { + assert(num_columns == 1); + column = columns[0]; + const auto & column_array = assert_cast(*column); + ColumnPtr data_column = column_array.getDataPtr(); + element_serializer->setColumns(&data_column, 1); + } + + void setColumns(const MutableColumnPtr * columns, [[maybe_unused]] size_t num_columns) override + { + assert(num_columns == 1); + ColumnPtr column0 = columns[0]->getPtr(); + setColumns(&column0, 1); + } + + void writeRow(size_t row_num) override + { + const auto & column_array = assert_cast(*column); + const auto & offsets = column_array.getOffsets(); + for (size_t i : ext::range(offsets[row_num - 1], offsets[row_num])) + element_serializer->writeRow(i); + } + + void readRow(size_t row_num) override + { + auto & column_array = assert_cast(column->assumeMutableRef()); + auto & offsets = column_array.getOffsets(); + size_t old_size = offsets.size(); + if (row_num + 1 < old_size) + throw Exception("Cannot replace an element in the middle of ColumnArray", ErrorCodes::LOGICAL_ERROR); + auto data_column = column_array.getDataPtr(); + size_t old_data_size = data_column->size(); + + try + { + element_serializer->readRow(old_data_size); + size_t data_size = data_column->size(); + if (data_size != old_data_size + 1) + throw Exception("Size of ColumnArray is unexpected", ErrorCodes::LOGICAL_ERROR); + + if (row_num < old_size) + offsets.back() = data_size; + else + offsets.push_back(data_size); + } + catch (...) + { + if (data_column->size() > old_data_size) + data_column->assumeMutableRef().popBack(data_column->size() - old_data_size); + if (offsets.size() > old_size) + column_array.getOffsetsColumn().popBack(offsets.size() - old_size); + throw; + } + } + + void insertDefaults(size_t row_num) override + { + auto & column_array = assert_cast(column->assumeMutableRef()); + if (row_num < column_array.size()) + return; + column_array.insertDefault(); + } + + private: + const std::unique_ptr element_serializer; + ColumnPtr column; + }; + + + /// Serializes a ColumnTuple as a repeated field (just like we serialize arrays). 
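When a tuple cannot be mapped to a nested message it is flattened into a single repeated field: writeRow emits the elements in declaration order, and readRow fills them one by one, resetting the element cursor at the start of each row and failing if more values arrive than the tuple can hold. A standalone sketch with a toy Tuple of three Float64 values:

#include <array>
#include <iostream>
#include <stdexcept>
#include <vector>

int main()
{
    constexpr size_t tuple_size = 3;

    // One row of Tuple(Float64, Float64, Float64) written as a repeated double field.
    std::array<double, tuple_size> row = {1.0, 2.5, -3.0};
    std::vector<double> wire(row.begin(), row.end());

    // Read path: consume repeated values in order, error out on overflow.
    std::array<double, tuple_size> decoded{};
    size_t current_element = 0;
    for (double value : wire)
    {
        if (current_element >= tuple_size)
            throw std::runtime_error("too many elements for the tuple");
        decoded[current_element++] = value;
    }

    for (double v : decoded)
        std::cout << v << ' ';
    std::cout << '\n';
}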
+ class ProtobufSerializerTupleAsArray : public ProtobufSerializer + { + public: + ProtobufSerializerTupleAsArray( + const std::shared_ptr & tuple_data_type_, + const FieldDescriptor & field_descriptor_, + std::vector> element_serializers_) + : tuple_data_type(tuple_data_type_) + , tuple_size(tuple_data_type->getElements().size()) + , field_descriptor(field_descriptor_) + , element_serializers(std::move(element_serializers_)) + { + assert(tuple_size); + assert(tuple_size == element_serializers.size()); + } + + void setColumns(const ColumnPtr * columns, [[maybe_unused]] size_t num_columns) override + { + assert(num_columns == 1); + column = columns[0]; + const auto & column_tuple = assert_cast(*column); + for (size_t i : ext::range(tuple_size)) + { + auto element_column = column_tuple.getColumnPtr(i); + element_serializers[i]->setColumns(&element_column, 1); + } + current_element_index = 0; + } + + void setColumns(const MutableColumnPtr * columns, [[maybe_unused]] size_t num_columns) override + { + assert(num_columns == 1); + ColumnPtr column0 = columns[0]->getPtr(); + setColumns(&column0, 1); + } + + void writeRow(size_t row_num) override + { + for (size_t i : ext::range(tuple_size)) + element_serializers[i]->writeRow(row_num); + } + + void readRow(size_t row_num) override + { + auto & column_tuple = assert_cast(column->assumeMutableRef()); + + size_t old_size = column_tuple.size(); + if (row_num >= old_size) + current_element_index = 0; + + insertDefaults(row_num); + + if (current_element_index >= tuple_size) + { + throw Exception( + "Too many (" + std::to_string(current_element_index) + ") elements was read from the field " + + field_descriptor.full_name() + " to fit in the data type " + tuple_data_type->getName(), + ErrorCodes::PROTOBUF_BAD_CAST); + } + + element_serializers[current_element_index]->readRow(row_num); + ++current_element_index; + } + + void insertDefaults(size_t row_num) override + { + auto & column_tuple = assert_cast(column->assumeMutableRef()); + size_t old_size = column_tuple.size(); + + if (row_num > old_size) + return; + + try + { + for (size_t i : ext::range(tuple_size)) + element_serializers[i]->insertDefaults(row_num); + } + catch (...) + { + for (size_t i : ext::range(tuple_size)) + { + auto element_column = column_tuple.getColumnPtr(i)->assumeMutable(); + if (element_column->size() > old_size) + element_column->popBack(element_column->size() - old_size); + } + throw; + } + } + + private: + const std::shared_ptr tuple_data_type; + const size_t tuple_size; + const FieldDescriptor & field_descriptor; + const std::vector> element_serializers; + ColumnPtr column; + size_t current_element_index = 0; + }; + + + /// Serializes a message (root or nested) in the protobuf schema. + class ProtobufSerializerMessage : public ProtobufSerializer + { + public: + struct FieldDesc + { + size_t column_index; + size_t num_columns; + const FieldDescriptor * field_descriptor; + std::unique_ptr field_serializer; + }; + + ProtobufSerializerMessage( + std::vector field_descs_, + const FieldDescriptor * parent_field_descriptor_, + bool with_length_delimiter_, + const ProtobufReaderOrWriter & reader_or_writer_) + : parent_field_descriptor(parent_field_descriptor_) + , with_length_delimiter(with_length_delimiter_) + , should_skip_if_empty(parent_field_descriptor ? 
shouldSkipZeroOrEmpty(*parent_field_descriptor) : false) + , reader(reader_or_writer_.reader) + , writer(reader_or_writer_.writer) + { + field_infos.reserve(field_descs_.size()); + for (auto & desc : field_descs_) + field_infos.emplace_back(desc.column_index, desc.num_columns, *desc.field_descriptor, std::move(desc.field_serializer)); + + std::sort(field_infos.begin(), field_infos.end(), + [](const FieldInfo & lhs, const FieldInfo & rhs) { return lhs.field_tag < rhs.field_tag; }); + + for (size_t i : ext::range(field_infos.size())) + field_index_by_field_tag.emplace(field_infos[i].field_tag, i); + } + + void setColumns(const ColumnPtr * columns_, size_t num_columns_) override + { + columns.assign(columns_, columns_ + num_columns_); + + for (const FieldInfo & info : field_infos) + info.field_serializer->setColumns(columns.data() + info.column_index, info.num_columns); + + if (reader) + { + missing_column_indices.clear(); + missing_column_indices.reserve(num_columns_); + size_t current_idx = 0; + for (const FieldInfo & info : field_infos) + { + while (current_idx < info.column_index) + missing_column_indices.push_back(current_idx++); + current_idx = info.column_index + info.num_columns; + } + while (current_idx < num_columns_) + missing_column_indices.push_back(current_idx++); + } + } + + void setColumns(const MutableColumnPtr * columns_, size_t num_columns_) override + { + Columns cols; + cols.reserve(num_columns_); + for (size_t i : ext::range(num_columns_)) + cols.push_back(columns_[i]->getPtr()); + setColumns(cols.data(), cols.size()); + } + + void writeRow(size_t row_num) override + { + if (parent_field_descriptor) + writer->startNestedMessage(); + else + writer->startMessage(); + + for (const FieldInfo & info : field_infos) + { + if (info.should_pack_repeated) + writer->startRepeatedPack(); + info.field_serializer->writeRow(row_num); + if (info.should_pack_repeated) + writer->endRepeatedPack(info.field_tag, true); + } + + if (parent_field_descriptor) + { + bool is_group = (parent_field_descriptor->type() == FieldTypeId::TYPE_GROUP); + writer->endNestedMessage(parent_field_descriptor->number(), is_group, should_skip_if_empty); + } + else + writer->endMessage(with_length_delimiter); + } + + void readRow(size_t row_num) override + { + if (parent_field_descriptor) + reader->startNestedMessage(); + else + reader->startMessage(with_length_delimiter); + + if (!field_infos.empty()) + { + last_field_index = 0; + last_field_tag = field_infos[0].field_tag; + size_t old_size = columns.empty() ? 0 : columns[0]->size(); + + try + { + int field_tag; + while (reader->readFieldNumber(field_tag)) + { + size_t field_index = findFieldIndexByFieldTag(field_tag); + if (field_index == static_cast(-1)) + continue; + auto * field_serializer = field_infos[field_index].field_serializer.get(); + field_serializer->readRow(row_num); + field_infos[field_index].field_read = true; + } + + for (auto & info : field_infos) + { + if (info.field_read) + info.field_read = false; + else + info.field_serializer->insertDefaults(row_num); + } + } + catch (...) 
+ { + for (auto & column : columns) + { + if (column->size() > old_size) + column->assumeMutableRef().popBack(column->size() - old_size); + } + throw; + } + } + + if (parent_field_descriptor) + reader->endNestedMessage(); + else + reader->endMessage(false); + addDefaultsToMissingColumns(row_num); + } + + void insertDefaults(size_t row_num) override + { + for (const FieldInfo & info : field_infos) + info.field_serializer->insertDefaults(row_num); + addDefaultsToMissingColumns(row_num); + } + + private: + size_t findFieldIndexByFieldTag(int field_tag) + { + while (true) + { + if (field_tag == last_field_tag) + return last_field_index; + if (field_tag < last_field_tag) + break; + if (++last_field_index >= field_infos.size()) + break; + last_field_tag = field_infos[last_field_index].field_tag; + } + last_field_tag = field_tag; + auto it = field_index_by_field_tag.find(field_tag); + if (it == field_index_by_field_tag.end()) + last_field_index = static_cast(-1); + else + last_field_index = it->second; + return last_field_index; + } + + void addDefaultsToMissingColumns(size_t row_num) + { + for (size_t column_idx : missing_column_indices) + { + auto & column = columns[column_idx]; + size_t old_size = column->size(); + if (row_num >= old_size) + column->assumeMutableRef().insertDefault(); + } + } + + struct FieldInfo + { + FieldInfo( + size_t column_index_, + size_t num_columns_, + const FieldDescriptor & field_descriptor_, + std::unique_ptr field_serializer_) + : column_index(column_index_) + , num_columns(num_columns_) + , field_descriptor(&field_descriptor_) + , field_tag(field_descriptor_.number()) + , should_pack_repeated(shouldPackRepeated(field_descriptor_)) + , field_serializer(std::move(field_serializer_)) + { + } + size_t column_index; + size_t num_columns; + const FieldDescriptor * field_descriptor; + int field_tag; + bool should_pack_repeated; + std::unique_ptr field_serializer; + bool field_read = false; + }; + + const FieldDescriptor * const parent_field_descriptor; + const bool with_length_delimiter; + const bool should_skip_if_empty; + ProtobufReader * const reader; + ProtobufWriter * const writer; + std::vector field_infos; + std::unordered_map field_index_by_field_tag; + Columns columns; + std::vector missing_column_indices; + int last_field_tag = 0; + size_t last_field_index = static_cast(-1); + }; + + + /// Serializes a tuple with explicit names as a nested message. 
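Before the named-tuple serializer below, one detail of the message serializer above is worth spelling out: its fields are kept sorted by tag, and findFieldIndexByFieldTag first tries to continue scanning from the previously matched position (fields usually arrive in tag order) and only then falls back to the hash map. A standalone sketch of that lookup; the TagLookup name and the sample tags are invented for illustration:

#include <cstddef>
#include <iostream>
#include <unordered_map>
#include <vector>

struct TagLookup
{
    static constexpr size_t NOT_FOUND = static_cast<size_t>(-1);

    std::vector<int> sorted_tags;                  // field tags sorted ascending
    std::unordered_map<int, size_t> index_by_tag;  // fallback map
    size_t last_index = 0;
    int last_tag = 0;

    explicit TagLookup(std::vector<int> tags) : sorted_tags(std::move(tags))
    {
        for (size_t i = 0; i != sorted_tags.size(); ++i)
            index_by_tag.emplace(sorted_tags[i], i);
        last_tag = sorted_tags[0];
    }

    // Fast path: continue from the last match; fall back to the map when the
    // incoming tag jumps backwards or is unknown.
    size_t find(int tag)
    {
        while (true)
        {
            if (tag == last_tag)
                return last_index;
            if (tag < last_tag || ++last_index >= sorted_tags.size())
                break;
            last_tag = sorted_tags[last_index];
        }
        last_tag = tag;
        auto it = index_by_tag.find(tag);
        last_index = (it == index_by_tag.end()) ? NOT_FOUND : it->second;
        return last_index;
    }
};

int main()
{
    TagLookup lookup({1, 3, 7, 10});
    for (int tag : {1, 3, 7, 2, 10})
    {
        size_t idx = lookup.find(tag);
        std::cout << "tag " << tag << " -> " << (idx == TagLookup::NOT_FOUND ? -1 : static_cast<long long>(idx)) << '\n';
    }
}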
+ class ProtobufSerializerTupleAsNestedMessage : public ProtobufSerializer + { + public: + explicit ProtobufSerializerTupleAsNestedMessage(std::unique_ptr nested_message_serializer_) + : nested_message_serializer(std::move(nested_message_serializer_)) + { + } + + void setColumns(const ColumnPtr * columns, [[maybe_unused]] size_t num_columns) override + { + assert(num_columns == 1); + const auto & column_tuple = assert_cast(*columns[0]); + size_t tuple_size = column_tuple.tupleSize(); + assert(tuple_size); + Columns element_columns; + element_columns.reserve(tuple_size); + for (size_t i : ext::range(tuple_size)) + element_columns.emplace_back(column_tuple.getColumnPtr(i)); + nested_message_serializer->setColumns(element_columns.data(), element_columns.size()); + } + + void setColumns(const MutableColumnPtr * columns, [[maybe_unused]] size_t num_columns) override + { + assert(num_columns == 1); + ColumnPtr column0 = columns[0]->getPtr(); + setColumns(&column0, 1); + } + + void writeRow(size_t row_num) override { nested_message_serializer->writeRow(row_num); } + void readRow(size_t row_num) override { nested_message_serializer->readRow(row_num); } + void insertDefaults(size_t row_num) override { nested_message_serializer->insertDefaults(row_num); } + + private: + const std::unique_ptr nested_message_serializer; + }; + + + /// Serializes a flattened Nested data type (an array of tuples with explicit names) + /// as a repeated nested message. + class ProtobufSerializerFlattenedNestedAsArrayOfNestedMessages : public ProtobufSerializer + { + public: + explicit ProtobufSerializerFlattenedNestedAsArrayOfNestedMessages( + std::unique_ptr nested_message_serializer_) + : nested_message_serializer(std::move(nested_message_serializer_)) + { + } + + void setColumns(const ColumnPtr * columns, size_t num_columns) override + { + assert(num_columns); + data_columns.clear(); + data_columns.reserve(num_columns); + offset_columns.clear(); + offset_columns.reserve(num_columns); + + for (size_t i : ext::range(num_columns)) + { + const auto & column_array = assert_cast(*columns[i]); + data_columns.emplace_back(column_array.getDataPtr()); + offset_columns.emplace_back(column_array.getOffsetsPtr()); + } + + std::sort(offset_columns.begin(), offset_columns.end()); + offset_columns.erase(std::unique(offset_columns.begin(), offset_columns.end()), offset_columns.end()); + + nested_message_serializer->setColumns(data_columns.data(), data_columns.size()); + } + + void setColumns(const MutableColumnPtr * columns, size_t num_columns) override + { + Columns cols; + cols.reserve(num_columns); + for (size_t i : ext::range(num_columns)) + cols.push_back(columns[i]->getPtr()); + setColumns(cols.data(), cols.size()); + } + + void writeRow(size_t row_num) override + { + const auto & offset_column0 = assert_cast(*offset_columns[0]); + size_t start_offset = offset_column0.getElement(row_num - 1); + size_t end_offset = offset_column0.getElement(row_num); + for (size_t i : ext::range(1, offset_columns.size())) + { + const auto & offset_column = assert_cast(*offset_columns[i]); + if (offset_column.getElement(row_num) != end_offset) + throw Exception("Components of FlattenedNested have different sizes", ErrorCodes::PROTOBUF_BAD_CAST); + } + for (size_t i : ext::range(start_offset, end_offset)) + nested_message_serializer->writeRow(i); + } + + void readRow(size_t row_num) override + { + size_t old_size = offset_columns[0]->size(); + if (row_num + 1 < old_size) + throw Exception("Cannot replace an element in the middle of ColumnArray", 
ErrorCodes::LOGICAL_ERROR); + + size_t old_data_size = data_columns[0]->size(); + + try + { + nested_message_serializer->readRow(old_data_size); + size_t data_size = data_columns[0]->size(); + if (data_size != old_data_size + 1) + throw Exception("Unexpected number of elements of ColumnArray has been read", ErrorCodes::LOGICAL_ERROR); + + if (row_num < old_size) + { + for (auto & offset_column : offset_columns) + assert_cast(offset_column->assumeMutableRef()).getData().back() = data_size; + } + else + { + for (auto & offset_column : offset_columns) + assert_cast(offset_column->assumeMutableRef()).getData().push_back(data_size); + } + } + catch (...) + { + for (auto & data_column : data_columns) + { + if (data_column->size() > old_data_size) + data_column->assumeMutableRef().popBack(data_column->size() - old_data_size); + } + for (auto & offset_column : offset_columns) + { + if (offset_column->size() > old_size) + offset_column->assumeMutableRef().popBack(offset_column->size() - old_size); + } + throw; + } + } + + void insertDefaults(size_t row_num) override + { + size_t old_size = offset_columns[0]->size(); + if (row_num < old_size) + return; + + try + { + size_t data_size = data_columns[0]->size(); + for (auto & offset_column : offset_columns) + assert_cast(offset_column->assumeMutableRef()).getData().push_back(data_size); + } + catch (...) + { + for (auto & offset_column : offset_columns) + { + if (offset_column->size() > old_size) + offset_column->assumeMutableRef().popBack(offset_column->size() - old_size); + } + throw; + } + } + + private: + const std::unique_ptr nested_message_serializer; + Columns data_columns; + Columns offset_columns; + }; + + + /// Produces a tree of ProtobufSerializers which serializes a row as a protobuf message. + class ProtobufSerializerBuilder + { + public: + explicit ProtobufSerializerBuilder(const ProtobufReaderOrWriter & reader_or_writer_) : reader_or_writer(reader_or_writer_) {} + + std::unique_ptr buildMessageSerializer( + const Strings & column_names, + const DataTypes & data_types, + std::vector & missing_column_indices, + const MessageDescriptor & message_descriptor, + bool with_length_delimiter) + { + std::vector used_column_indices; + auto serializer = buildMessageSerializerImpl( + /* num_columns = */ column_names.size(), + column_names.data(), + data_types.data(), + used_column_indices, + message_descriptor, + with_length_delimiter, + /* parent_field_descriptor = */ nullptr); + + if (!serializer) + { + throw Exception( + "Not found matches between the names of the columns {" + boost::algorithm::join(column_names, ", ") + + "} and the fields {" + boost::algorithm::join(getFieldNames(message_descriptor), ", ") + "} of the message " + + quoteString(message_descriptor.full_name()) + " in the protobuf schema", + ErrorCodes::NO_COLUMNS_SERIALIZED_TO_PROTOBUF_FIELDS); + } + + missing_column_indices.clear(); + missing_column_indices.reserve(column_names.size() - used_column_indices.size()); + boost::range::set_difference(ext::range(column_names.size()), used_column_indices, + std::back_inserter(missing_column_indices)); + + return serializer; + } + + private: + /// Collects all field names from the message (used only to format error messages). 
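The private helpers below deal with collecting and matching names; before them, note how buildMessageSerializer above derives missing_column_indices: it is the set difference between all column indices and the ones actually bound to protobuf fields, and those leftover columns later receive per-row defaults. A standalone sketch of the same computation with the standard library (std::set_difference over sorted index vectors instead of boost::range::set_difference):

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <iterator>
#include <numeric>
#include <vector>

int main()
{
    const size_t num_columns = 6;

    // Indices of columns that matched some field in the protobuf schema (sorted).
    std::vector<size_t> used = {0, 2, 3};

    // 0, 1, ..., num_columns - 1
    std::vector<size_t> all(num_columns);
    std::iota(all.begin(), all.end(), 0);

    std::vector<size_t> missing;
    std::set_difference(all.begin(), all.end(), used.begin(), used.end(), std::back_inserter(missing));

    for (size_t idx : missing)
        std::cout << idx << ' '; // 1 4 5 : these columns will be filled with defaults
    std::cout << '\n';
}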
+ static Strings getFieldNames(const MessageDescriptor & message_descriptor) + { + Strings field_names; + field_names.reserve(message_descriptor.field_count()); + for (int i : ext::range(message_descriptor.field_count())) + field_names.emplace_back(message_descriptor.field(i)->name()); + return field_names; + } + + static bool columnNameEqualsToFieldName(const std::string_view & column_name, const FieldDescriptor & field_descriptor) + { + std::string_view suffix; + return columnNameStartsWithFieldName(column_name, field_descriptor, suffix) && suffix.empty(); + } + + /// Checks if a passed column's name starts with a specified field's name. + /// The function also assigns `suffix` to the rest part of the column's name + /// which doesn't match to the field's name. + /// The function requires that rest part of the column's name to be started with a dot '.' or underline '_', + /// but doesn't include those '.' or '_' characters into `suffix`. + static bool columnNameStartsWithFieldName(const std::string_view & column_name, const FieldDescriptor & field_descriptor, std::string_view & suffix) + { + size_t matching_length = 0; + const MessageDescriptor & containing_type = *field_descriptor.containing_type(); + if (containing_type.options().map_entry()) + { + /// Special case. Elements of the data type Map are named as "keys" and "values", + /// but they're internally named as "key" and "value" in protobuf schema. + if (field_descriptor.number() == 1) + { + if (ColumnNameWithProtobufFieldNameComparator::startsWith(column_name, "keys")) + matching_length = strlen("keys"); + else if (ColumnNameWithProtobufFieldNameComparator::startsWith(column_name, "key")) + matching_length = strlen("key"); + } + else if (field_descriptor.number() == 2) + { + if (ColumnNameWithProtobufFieldNameComparator::startsWith(column_name, "values")) + matching_length = strlen("values"); + else if (ColumnNameWithProtobufFieldNameComparator::startsWith(column_name, "value")) + matching_length = strlen("value"); + } + } + if (!matching_length && ColumnNameWithProtobufFieldNameComparator::startsWith(column_name, field_descriptor.name())) + { + matching_length = field_descriptor.name().length(); + } + if (column_name.length() == matching_length) + return true; + if ((column_name.length() < matching_length + 2) || !field_descriptor.message_type()) + return false; + char first_char_after_matching = column_name[matching_length]; + if (!ColumnNameWithProtobufFieldNameComparator::equals(first_char_after_matching, '.')) + return false; + suffix = column_name.substr(matching_length + 1); + return true; + } + + /// Finds fields in the protobuf message which can be considered as matching + /// for a specified column's name. The found fields can be nested messages, + /// for that case suffixes are also returned. + /// This is only the first filter, buildMessageSerializerImpl() does other checks after calling this function. + static bool findFieldsByColumnName( + const std::string_view & column_name, + const MessageDescriptor & message_descriptor, + std::vector> & out_field_descriptors_with_suffixes) + { + out_field_descriptors_with_suffixes.clear(); + + /// Find all fields which have the same name as column's name (case-insensitively); i.e. we're checking + /// field_name == column_name. 
+ for (int i : ext::range(message_descriptor.field_count())) + { + const auto & field_descriptor = *message_descriptor.field(i); + if (columnNameEqualsToFieldName(column_name, field_descriptor)) + { + out_field_descriptors_with_suffixes.emplace_back(&field_descriptor, std::string_view{}); + break; + } + } + + if (!out_field_descriptors_with_suffixes.empty()) + return true; /// We have an exact match, no need to compare prefixes. + + /// Find all fields which name is used as prefix in column's name; i.e. we're checking + /// column_name == field_name + '.' + nested_message_field_name + for (int i : ext::range(message_descriptor.field_count())) + { + const auto & field_descriptor = *message_descriptor.field(i); + std::string_view suffix; + if (columnNameStartsWithFieldName(column_name, field_descriptor, suffix)) + { + out_field_descriptors_with_suffixes.emplace_back(&field_descriptor, suffix); + } + } + + /// Shorter suffixes first. + std::sort(out_field_descriptors_with_suffixes.begin(), out_field_descriptors_with_suffixes.end(), + [](const std::pair & f1, + const std::pair & f2) + { + return f1.second.length() < f2.second.length(); + }); + + return !out_field_descriptors_with_suffixes.empty(); + } + + /// Builds a serializer for a protobuf message (root or nested). + template + std::unique_ptr buildMessageSerializerImpl( + size_t num_columns, + const StringOrStringViewT * column_names, + const DataTypePtr * data_types, + std::vector & used_column_indices, + const MessageDescriptor & message_descriptor, + bool with_length_delimiter, + const FieldDescriptor * parent_field_descriptor) + { + std::vector field_descs; + boost::container::flat_map field_descriptors_in_use; + + used_column_indices.clear(); + used_column_indices.reserve(num_columns); + + auto add_field_serializer = [&](size_t column_index_, + const std::string_view & column_name_, + size_t num_columns_, + const FieldDescriptor & field_descriptor_, + std::unique_ptr field_serializer_) + { + auto it = field_descriptors_in_use.find(&field_descriptor_); + if (it != field_descriptors_in_use.end()) + { + throw Exception( + "Multiple columns (" + backQuote(StringRef{field_descriptors_in_use[&field_descriptor_]}) + ", " + + backQuote(StringRef{column_name_}) + ") cannot be serialized to a single protobuf field " + + quoteString(field_descriptor_.full_name()), + ErrorCodes::MULTIPLE_COLUMNS_SERIALIZED_TO_SAME_PROTOBUF_FIELD); + } + + field_descs.push_back({column_index_, num_columns_, &field_descriptor_, std::move(field_serializer_)}); + field_descriptors_in_use.emplace(&field_descriptor_, column_name_); + }; + + std::vector> field_descriptors_with_suffixes; + + /// We're going through all the passed columns. + size_t column_idx = 0; + size_t next_column_idx = 1; + for (; column_idx != num_columns; column_idx = next_column_idx++) + { + auto column_name = column_names[column_idx]; + const auto & data_type = data_types[column_idx]; + + if (!findFieldsByColumnName(column_name, message_descriptor, field_descriptors_with_suffixes)) + continue; + + if ((field_descriptors_with_suffixes.size() == 1) && field_descriptors_with_suffixes[0].second.empty()) + { + /// Simple case: one column is serialized as one field. 
+ const auto & field_descriptor = *field_descriptors_with_suffixes[0].first; + auto field_serializer = buildFieldSerializer(column_name, data_type, field_descriptor, field_descriptor.is_repeated()); + + if (field_serializer) + { + add_field_serializer(column_idx, column_name, 1, field_descriptor, std::move(field_serializer)); + used_column_indices.push_back(column_idx); + continue; + } + } + + for (const auto & [field_descriptor, suffix] : field_descriptors_with_suffixes) + { + if (!suffix.empty()) + { + /// Complex case: one or more columns are serialized as a nested message. + std::vector names_relative_to_nested_message; + names_relative_to_nested_message.reserve(num_columns - column_idx); + names_relative_to_nested_message.emplace_back(suffix); + + for (size_t j : ext::range(column_idx + 1, num_columns)) + { + std::string_view next_suffix; + if (!columnNameStartsWithFieldName(column_names[j], *field_descriptor, next_suffix)) + break; + names_relative_to_nested_message.emplace_back(next_suffix); + } + + /// Now we have up to `names_relative_to_nested_message.size()` sequential columns + /// which can be serialized as a nested message. + + /// Calculate how many of those sequential columns are arrays. + size_t num_arrays = 0; + for (size_t j : ext::range(column_idx, column_idx + names_relative_to_nested_message.size())) + { + if (data_types[j]->getTypeId() != TypeIndex::Array) + break; + ++num_arrays; + } + + /// We will try to serialize the sequential columns as one nested message, + /// then, if failed, as an array of nested messages (on condition those columns are array). + bool has_fallback_to_array_of_nested_messages = num_arrays && field_descriptor->is_repeated(); + + /// Try to serialize the sequential columns as one nested message. + try + { + std::vector used_column_indices_in_nested; + auto nested_message_serializer = buildMessageSerializerImpl( + names_relative_to_nested_message.size(), + names_relative_to_nested_message.data(), + &data_types[column_idx], + used_column_indices_in_nested, + *field_descriptor->message_type(), + false, + field_descriptor); + + if (nested_message_serializer) + { + for (size_t & idx_in_nested : used_column_indices_in_nested) + used_column_indices.push_back(idx_in_nested + column_idx); + + next_column_idx = used_column_indices.back() + 1; + add_field_serializer(column_idx, column_name, next_column_idx - column_idx, *field_descriptor, std::move(nested_message_serializer)); + break; + } + } + catch (Exception & e) + { + if ((e.code() != ErrorCodes::PROTOBUF_FIELD_NOT_REPEATED) || !has_fallback_to_array_of_nested_messages) + throw; + } + + if (has_fallback_to_array_of_nested_messages) + { + /// Try to serialize the sequential columns as an array of nested messages. 
+ DataTypes array_nested_data_types; + array_nested_data_types.reserve(num_arrays); + for (size_t j : ext::range(column_idx, column_idx + num_arrays)) + array_nested_data_types.emplace_back(assert_cast(*data_types[j]).getNestedType()); + + std::vector used_column_indices_in_nested; + auto nested_message_serializer = buildMessageSerializerImpl( + array_nested_data_types.size(), + names_relative_to_nested_message.data(), + array_nested_data_types.data(), + used_column_indices_in_nested, + *field_descriptor->message_type(), + false, + field_descriptor); + + if (nested_message_serializer) + { + auto field_serializer = std::make_unique(std::move(nested_message_serializer)); + + for (size_t & idx_in_nested : used_column_indices_in_nested) + used_column_indices.push_back(idx_in_nested + column_idx); + + next_column_idx = used_column_indices.back() + 1; + add_field_serializer(column_idx, column_name, next_column_idx - column_idx, *field_descriptor, std::move(field_serializer)); + break; + } + } + } + } + } + + /// Check that we've found matching columns for all the required fields. + if ((message_descriptor.file()->syntax() == google::protobuf::FileDescriptor::SYNTAX_PROTO2) + && reader_or_writer.writer) + { + for (int i : ext::range(message_descriptor.field_count())) + { + const auto & field_descriptor = *message_descriptor.field(i); + if (field_descriptor.is_required() && !field_descriptors_in_use.count(&field_descriptor)) + throw Exception( + "Field " + quoteString(field_descriptor.full_name()) + " is required to be set", + ErrorCodes::NO_COLUMN_SERIALIZED_TO_REQUIRED_PROTOBUF_FIELD); + } + } + + if (field_descs.empty()) + return nullptr; + + return std::make_unique( + std::move(field_descs), parent_field_descriptor, with_length_delimiter, reader_or_writer); + } + + /// Builds a serializer for one-to-one match: + /// one column is serialized as one field in the protobuf message. 
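buildFieldSerializer below covers the one-to-one case; all of the nested cases above hinge on the name matching done by columnNameStartsWithFieldName, where a column name either equals a field name or starts with it, and the remainder after the separator becomes the suffix used to descend into the nested message. A standalone sketch of that prefix/suffix split, simplified on purpose (case-sensitive, '.' separator only, none of the Map "keys"/"values" special cases):

#include <iostream>
#include <optional>
#include <string>
#include <string_view>

// Returns the suffix of column_name after field_name and a '.' separator,
// an empty suffix for an exact match, or nullopt when the names don't match.
std::optional<std::string_view> matchField(std::string_view column_name, std::string_view field_name)
{
    if (column_name.substr(0, field_name.size()) != field_name)
        return std::nullopt;
    if (column_name.size() == field_name.size())
        return std::string_view{};                    // exact match: plain field
    if (column_name[field_name.size()] != '.')
        return std::nullopt;                          // e.g. "identifier" vs field "id"
    return column_name.substr(field_name.size() + 1); // nested: descend with this suffix
}

int main()
{
    for (std::string_view column : {"id", "address.city", "identifier"})
    {
        auto s1 = matchField(column, "id");
        auto s2 = matchField(column, "address");
        std::cout << column
                  << ": 'id' -> " << (s1 ? "suffix '" + std::string(*s1) + "'" : std::string("no match"))
                  << ", 'address' -> " << (s2 ? "suffix '" + std::string(*s2) + "'" : std::string("no match")) << '\n';
    }
}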
+ std::unique_ptr buildFieldSerializer( + const std::string_view & column_name, + const DataTypePtr & data_type, + const FieldDescriptor & field_descriptor, + bool allow_repeat) + { + auto data_type_id = data_type->getTypeId(); + switch (data_type_id) + { + case TypeIndex::UInt8: return std::make_unique>(field_descriptor, reader_or_writer); + case TypeIndex::UInt16: return std::make_unique>(field_descriptor, reader_or_writer); + case TypeIndex::UInt32: return std::make_unique>(field_descriptor, reader_or_writer); + case TypeIndex::UInt64: return std::make_unique>(field_descriptor, reader_or_writer); + case TypeIndex::UInt128: return std::make_unique>(field_descriptor, reader_or_writer); + case TypeIndex::UInt256: return std::make_unique>(field_descriptor, reader_or_writer); + case TypeIndex::Int8: return std::make_unique>(field_descriptor, reader_or_writer); + case TypeIndex::Int16: return std::make_unique>(field_descriptor, reader_or_writer); + case TypeIndex::Int32: return std::make_unique>(field_descriptor, reader_or_writer); + case TypeIndex::Int64: return std::make_unique>(field_descriptor, reader_or_writer); + case TypeIndex::Int128: return std::make_unique>(field_descriptor, reader_or_writer); + case TypeIndex::Int256: return std::make_unique>(field_descriptor, reader_or_writer); + case TypeIndex::Float32: return std::make_unique>(field_descriptor, reader_or_writer); + case TypeIndex::Float64: return std::make_unique>(field_descriptor, reader_or_writer); + case TypeIndex::Date: return std::make_unique(field_descriptor, reader_or_writer); + case TypeIndex::DateTime: return std::make_unique(field_descriptor, reader_or_writer); + case TypeIndex::DateTime64: return std::make_unique(assert_cast(*data_type), field_descriptor, reader_or_writer); + case TypeIndex::String: return std::make_unique>(field_descriptor, reader_or_writer); + case TypeIndex::FixedString: return std::make_unique>(assert_cast(*data_type), field_descriptor, reader_or_writer); + case TypeIndex::Enum8: return std::make_unique>(typeid_cast>(data_type), field_descriptor, reader_or_writer); + case TypeIndex::Enum16: return std::make_unique>(typeid_cast>(data_type), field_descriptor, reader_or_writer); + case TypeIndex::Decimal32: return std::make_unique>(assert_cast &>(*data_type), field_descriptor, reader_or_writer); + case TypeIndex::Decimal64: return std::make_unique>(assert_cast &>(*data_type), field_descriptor, reader_or_writer); + case TypeIndex::Decimal128: return std::make_unique>(assert_cast &>(*data_type), field_descriptor, reader_or_writer); + case TypeIndex::Decimal256: return std::make_unique>(assert_cast &>(*data_type), field_descriptor, reader_or_writer); + case TypeIndex::UUID: return std::make_unique(field_descriptor, reader_or_writer); + case TypeIndex::Interval: return std::make_unique(field_descriptor, reader_or_writer); + case TypeIndex::AggregateFunction: return std::make_unique(typeid_cast>(data_type), field_descriptor, reader_or_writer); + + case TypeIndex::Nullable: + { + const auto & nullable_data_type = assert_cast(*data_type); + auto nested_serializer = buildFieldSerializer(column_name, nullable_data_type.getNestedType(), field_descriptor, allow_repeat); + if (!nested_serializer) + return nullptr; + return std::make_unique(std::move(nested_serializer)); + } + + case TypeIndex::LowCardinality: + { + const auto & low_cardinality_data_type = assert_cast(*data_type); + auto nested_serializer + = buildFieldSerializer(column_name, low_cardinality_data_type.getDictionaryType(), field_descriptor, 
allow_repeat); + if (!nested_serializer) + return nullptr; + return std::make_unique(std::move(nested_serializer)); + } + + case TypeIndex::Map: + { + const auto & map_data_type = assert_cast(*data_type); + auto nested_serializer = buildFieldSerializer(column_name, map_data_type.getNestedType(), field_descriptor, allow_repeat); + if (!nested_serializer) + return nullptr; + return std::make_unique(std::move(nested_serializer)); + } + + case TypeIndex::Array: + { + /// Array is serialized as a repeated field. + const auto & array_data_type = assert_cast(*data_type); + + if (!allow_repeat) + { + throw Exception( + "The field " + quoteString(field_descriptor.full_name()) + + " must be repeated in the protobuf schema to match the column " + backQuote(StringRef{column_name}), + ErrorCodes::PROTOBUF_FIELD_NOT_REPEATED); + } + + auto nested_serializer = buildFieldSerializer(column_name, array_data_type.getNestedType(), field_descriptor, + /* allow_repeat = */ false); // We do our repeating now, so for nested type we forget about the repeating. + if (!nested_serializer) + return nullptr; + return std::make_unique(std::move(nested_serializer)); + } + + case TypeIndex::Tuple: + { + /// Tuple is serialized in one of two ways: + /// 1) If the tuple has explicit names then it can be serialized as a nested message. + /// 2) Any tuple can be serialized as a repeated field, just like Array. + const auto & tuple_data_type = assert_cast(*data_type); + size_t size_of_tuple = tuple_data_type.getElements().size(); + + if (tuple_data_type.haveExplicitNames() && field_descriptor.message_type()) + { + /// Try to serialize as a nested message. + std::vector used_column_indices; + auto nested_message_serializer = buildMessageSerializerImpl( + size_of_tuple, + tuple_data_type.getElementNames().data(), + tuple_data_type.getElements().data(), + used_column_indices, + *field_descriptor.message_type(), + false, + &field_descriptor); + + if (!nested_message_serializer) + { + throw Exception( + "Not found matches between the names of the tuple's elements {" + + boost::algorithm::join(tuple_data_type.getElementNames(), ", ") + "} and the fields {" + + boost::algorithm::join(getFieldNames(*field_descriptor.message_type()), ", ") + "} of the message " + + quoteString(field_descriptor.message_type()->full_name()) + " in the protobuf schema", + ErrorCodes::NO_COLUMNS_SERIALIZED_TO_PROTOBUF_FIELDS); + } + + return std::make_unique(std::move(nested_message_serializer)); + } + + /// Serialize as a repeated field. + if (!allow_repeat && (size_of_tuple > 1)) + { + throw Exception( + "The field " + quoteString(field_descriptor.full_name()) + + " must be repeated in the protobuf schema to match the column " + backQuote(StringRef{column_name}), + ErrorCodes::PROTOBUF_FIELD_NOT_REPEATED); + } + + std::vector> nested_serializers; + for (const auto & nested_data_type : tuple_data_type.getElements()) + { + auto nested_serializer = buildFieldSerializer(column_name, nested_data_type, field_descriptor, + /* allow_repeat = */ false); // We do our repeating now, so for nested type we forget about the repeating. 
+ if (!nested_serializer) + break; + nested_serializers.push_back(std::move(nested_serializer)); + } + + if (nested_serializers.size() != size_of_tuple) + return nullptr; + + return std::make_unique( + typeid_cast>(data_type), + field_descriptor, + std::move(nested_serializers)); + } + + default: + throw Exception("Unknown data type: " + data_type->getName(), ErrorCodes::LOGICAL_ERROR); + } + } + + const ProtobufReaderOrWriter reader_or_writer; + }; +} + + +std::unique_ptr ProtobufSerializer::create( + const Strings & column_names, + const DataTypes & data_types, + std::vector & missing_column_indices, + const google::protobuf::Descriptor & message_descriptor, + bool with_length_delimiter, + ProtobufReader & reader) +{ + return ProtobufSerializerBuilder(reader).buildMessageSerializer(column_names, data_types, missing_column_indices, message_descriptor, with_length_delimiter); +} + +std::unique_ptr ProtobufSerializer::create( + const Strings & column_names, + const DataTypes & data_types, + const google::protobuf::Descriptor & message_descriptor, + bool with_length_delimiter, + ProtobufWriter & writer) +{ + std::vector missing_column_indices; + return ProtobufSerializerBuilder(writer).buildMessageSerializer(column_names, data_types, missing_column_indices, message_descriptor, with_length_delimiter); +} +} +#endif diff --git a/src/Formats/ProtobufSerializer.h b/src/Formats/ProtobufSerializer.h new file mode 100644 index 00000000000..86a2f2f36dd --- /dev/null +++ b/src/Formats/ProtobufSerializer.h @@ -0,0 +1,52 @@ +#pragma once + +#if !defined(ARCADIA_BUILD) +# include "config_formats.h" +#endif + +#if USE_PROTOBUF +# include + + +namespace google::protobuf { class Descriptor; } + +namespace DB +{ +class ProtobufReader; +class ProtobufWriter; +class IDataType; +using DataTypePtr = std::shared_ptr; +using DataTypes = std::vector; + + +/// Utility class, does all the work for serialization in the Protobuf format. 
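The interface declared below is what every node of the serializer tree implements, and the builder above composes those nodes recursively: wrapper data types (Array, Nullable, LowCardinality) become wrapper serializers around the serializer built for their nested type. A toy standalone sketch of that composition pattern; the class names and members here are invented for illustration and are much simpler than the real ones:

#include <cstddef>
#include <iostream>
#include <memory>
#include <vector>

// Toy counterpart of the per-node serializer interface.
struct Serializer
{
    virtual ~Serializer() = default;
    virtual void writeRow(size_t row_num) = 0;
};

// Leaf: writes one numeric value per row.
struct NumberSerializer : Serializer
{
    const std::vector<int> & data;
    explicit NumberSerializer(const std::vector<int> & data_) : data(data_) {}
    void writeRow(size_t row_num) override { std::cout << data[row_num] << ' '; }
};

// Wrapper: a repeated field writes a whole slice per row through the nested serializer.
struct ArraySerializer : Serializer
{
    std::unique_ptr<Serializer> element_serializer;
    const std::vector<size_t> & offsets;
    ArraySerializer(std::unique_ptr<Serializer> element, const std::vector<size_t> & offsets_)
        : element_serializer(std::move(element)), offsets(offsets_) {}

    void writeRow(size_t row_num) override
    {
        size_t begin = (row_num == 0) ? 0 : offsets[row_num - 1];
        for (size_t i = begin; i != offsets[row_num]; ++i)
            element_serializer->writeRow(i);
        std::cout << '\n';
    }
};

int main()
{
    // Array(Int32) column with rows [1, 2] and [3].
    std::vector<int> data = {1, 2, 3};
    std::vector<size_t> offsets = {2, 3};

    // What the builder effectively does for Array(Int32): wrap the element serializer.
    auto serializer = std::make_unique<ArraySerializer>(std::make_unique<NumberSerializer>(data), offsets);
    serializer->writeRow(0);
    serializer->writeRow(1);
}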
+class ProtobufSerializer +{ +public: + virtual ~ProtobufSerializer() = default; + + virtual void setColumns(const ColumnPtr * columns, size_t num_columns) = 0; + virtual void writeRow(size_t row_num) = 0; + + virtual void setColumns(const MutableColumnPtr * columns, size_t num_columns) = 0; + virtual void readRow(size_t row_num) = 0; + virtual void insertDefaults(size_t row_num) = 0; + + static std::unique_ptr create( + const Strings & column_names, + const DataTypes & data_types, + std::vector & missing_column_indices, + const google::protobuf::Descriptor & message_descriptor, + bool with_length_delimiter, + ProtobufReader & reader); + + static std::unique_ptr create( + const Strings & column_names, + const DataTypes & data_types, + const google::protobuf::Descriptor & message_descriptor, + bool with_length_delimiter, + ProtobufWriter & writer); +}; + +} +#endif diff --git a/src/Formats/ProtobufWriter.cpp b/src/Formats/ProtobufWriter.cpp index e62d8fc4a58..ece4f78b1c8 100644 --- a/src/Formats/ProtobufWriter.cpp +++ b/src/Formats/ProtobufWriter.cpp @@ -1,29 +1,11 @@ #include "ProtobufWriter.h" #if USE_PROTOBUF -# include -# include -# include -# include -# include -# include -# include -# include -# include -# include +# include namespace DB { -namespace ErrorCodes -{ - extern const int NOT_IMPLEMENTED; - extern const int NO_DATA_FOR_REQUIRED_PROTOBUF_FIELD; - extern const int PROTOBUF_BAD_CAST; - extern const int PROTOBUF_FIELD_NOT_REPEATED; -} - - namespace { constexpr size_t MAX_VARINT_SIZE = 10; @@ -81,66 +63,24 @@ namespace } void writeFieldNumber(UInt32 field_number, WireType wire_type, PODArray & buf) { writeVarint((field_number << 3) | wire_type, buf); } - - // Should we pack repeated values while storing them. - // It depends on type of the field in the protobuf schema and the syntax of that schema. - bool shouldPackRepeated(const google::protobuf::FieldDescriptor * field) - { - if (!field->is_repeated()) - return false; - switch (field->type()) - { - case google::protobuf::FieldDescriptor::TYPE_INT32: - case google::protobuf::FieldDescriptor::TYPE_UINT32: - case google::protobuf::FieldDescriptor::TYPE_SINT32: - case google::protobuf::FieldDescriptor::TYPE_INT64: - case google::protobuf::FieldDescriptor::TYPE_UINT64: - case google::protobuf::FieldDescriptor::TYPE_SINT64: - case google::protobuf::FieldDescriptor::TYPE_FIXED32: - case google::protobuf::FieldDescriptor::TYPE_SFIXED32: - case google::protobuf::FieldDescriptor::TYPE_FIXED64: - case google::protobuf::FieldDescriptor::TYPE_SFIXED64: - case google::protobuf::FieldDescriptor::TYPE_FLOAT: - case google::protobuf::FieldDescriptor::TYPE_DOUBLE: - case google::protobuf::FieldDescriptor::TYPE_BOOL: - case google::protobuf::FieldDescriptor::TYPE_ENUM: - break; - default: - return false; - } - if (field->options().has_packed()) - return field->options().packed(); - return field->file()->syntax() == google::protobuf::FileDescriptor::SYNTAX_PROTO3; - } - - // Should we omit null values (zero for numbers / empty string for strings) while storing them. - bool shouldSkipNullValue(const google::protobuf::FieldDescriptor * field) - { - return field->is_optional() && (field->file()->syntax() == google::protobuf::FileDescriptor::SYNTAX_PROTO3); - } } -// SimpleWriter is an utility class to serialize protobufs. -// Knows nothing about protobuf schemas, just provides useful functions to serialize data. 
-ProtobufWriter::SimpleWriter::SimpleWriter(WriteBuffer & out_, const bool use_length_delimiters_) +ProtobufWriter::ProtobufWriter(WriteBuffer & out_) : out(out_) - , current_piece_start(0) - , num_bytes_skipped(0) - , use_length_delimiters(use_length_delimiters_) { } -ProtobufWriter::SimpleWriter::~SimpleWriter() = default; +ProtobufWriter::~ProtobufWriter() = default; -void ProtobufWriter::SimpleWriter::startMessage() +void ProtobufWriter::startMessage() { } -void ProtobufWriter::SimpleWriter::endMessage() +void ProtobufWriter::endMessage(bool with_length_delimiter) { pieces.emplace_back(current_piece_start, buffer.size()); - if (use_length_delimiters) + if (with_length_delimiter) { size_t size_of_message = buffer.size() - num_bytes_skipped; writeVarint(size_of_message, out); @@ -154,7 +94,7 @@ void ProtobufWriter::SimpleWriter::endMessage() current_piece_start = 0; } -void ProtobufWriter::SimpleWriter::startNestedMessage() +void ProtobufWriter::startNestedMessage() { nested_infos.emplace_back(pieces.size(), num_bytes_skipped); pieces.emplace_back(current_piece_start, buffer.size()); @@ -167,7 +107,7 @@ void ProtobufWriter::SimpleWriter::startNestedMessage() num_bytes_skipped = NESTED_MESSAGE_PADDING; } -void ProtobufWriter::SimpleWriter::endNestedMessage(UInt32 field_number, bool is_group, bool skip_if_empty) +void ProtobufWriter::endNestedMessage(int field_number, bool is_group, bool skip_if_empty) { const auto & nested_info = nested_infos.back(); size_t num_pieces_at_start = nested_info.num_pieces_at_start; @@ -203,8 +143,13 @@ void ProtobufWriter::SimpleWriter::endNestedMessage(UInt32 field_number, bool is num_bytes_skipped += num_bytes_skipped_at_start - num_bytes_inserted; } -void ProtobufWriter::SimpleWriter::writeUInt(UInt32 field_number, UInt64 value) +void ProtobufWriter::writeUInt(int field_number, UInt64 value) { + if (in_repeated_pack) + { + writeVarint(value, buffer); + return; + } size_t old_size = buffer.size(); buffer.reserve(old_size + 2 * MAX_VARINT_SIZE); UInt8 * ptr = buffer.data() + old_size; @@ -213,20 +158,27 @@ void ProtobufWriter::SimpleWriter::writeUInt(UInt32 field_number, UInt64 value) buffer.resize_assume_reserved(ptr - buffer.data()); } -void ProtobufWriter::SimpleWriter::writeInt(UInt32 field_number, Int64 value) +void ProtobufWriter::writeInt(int field_number, Int64 value) { writeUInt(field_number, static_cast(value)); } -void ProtobufWriter::SimpleWriter::writeSInt(UInt32 field_number, Int64 value) +void ProtobufWriter::writeSInt(int field_number, Int64 value) { writeUInt(field_number, encodeZigZag(value)); } template -void ProtobufWriter::SimpleWriter::writeFixed(UInt32 field_number, T value) +void ProtobufWriter::writeFixed(int field_number, T value) { static_assert((sizeof(T) == 4) || (sizeof(T) == 8)); + if (in_repeated_pack) + { + size_t old_size = buffer.size(); + buffer.resize(old_size + sizeof(T)); + memcpy(buffer.data() + old_size, &value, sizeof(T)); + return; + } constexpr WireType wire_type = (sizeof(T) == 4) ? 
BITS32 : BITS64; size_t old_size = buffer.size(); buffer.reserve(old_size + MAX_VARINT_SIZE + sizeof(T)); @@ -237,19 +189,27 @@ void ProtobufWriter::SimpleWriter::writeFixed(UInt32 field_number, T value) buffer.resize_assume_reserved(ptr - buffer.data()); } -void ProtobufWriter::SimpleWriter::writeString(UInt32 field_number, const StringRef & str) +template void ProtobufWriter::writeFixed(int field_number, Int32 value); +template void ProtobufWriter::writeFixed(int field_number, UInt32 value); +template void ProtobufWriter::writeFixed(int field_number, Int64 value); +template void ProtobufWriter::writeFixed(int field_number, UInt64 value); +template void ProtobufWriter::writeFixed(int field_number, Float32 value); +template void ProtobufWriter::writeFixed(int field_number, Float64 value); + +void ProtobufWriter::writeString(int field_number, const std::string_view & str) { + size_t length = str.length(); size_t old_size = buffer.size(); - buffer.reserve(old_size + 2 * MAX_VARINT_SIZE + str.size); + buffer.reserve(old_size + 2 * MAX_VARINT_SIZE + length); UInt8 * ptr = buffer.data() + old_size; ptr = writeFieldNumber(field_number, LENGTH_DELIMITED, ptr); - ptr = writeVarint(str.size, ptr); - memcpy(ptr, str.data, str.size); - ptr += str.size; + ptr = writeVarint(length, ptr); + memcpy(ptr, str.data(), length); + ptr += length; buffer.resize_assume_reserved(ptr - buffer.data()); } -void ProtobufWriter::SimpleWriter::startRepeatedPack() +void ProtobufWriter::startRepeatedPack() { pieces.emplace_back(current_piece_start, buffer.size()); @@ -259,17 +219,19 @@ void ProtobufWriter::SimpleWriter::startRepeatedPack() current_piece_start = buffer.size() + REPEATED_PACK_PADDING; buffer.resize(current_piece_start); num_bytes_skipped += REPEATED_PACK_PADDING; + in_repeated_pack = true; } -void ProtobufWriter::SimpleWriter::endRepeatedPack(UInt32 field_number) +void ProtobufWriter::endRepeatedPack(int field_number, bool skip_if_empty) { size_t size = buffer.size() - current_piece_start; - if (!size) + if (!size && skip_if_empty) { current_piece_start = pieces.back().start; buffer.resize(pieces.back().end); pieces.pop_back(); num_bytes_skipped -= REPEATED_PACK_PADDING; + in_repeated_pack = false; return; } UInt8 * ptr = &buffer[pieces.back().end]; @@ -278,726 +240,7 @@ void ProtobufWriter::SimpleWriter::endRepeatedPack(UInt32 field_number) size_t num_bytes_inserted = endptr - ptr; pieces.back().end += num_bytes_inserted; num_bytes_skipped -= num_bytes_inserted; -} - -void ProtobufWriter::SimpleWriter::addUIntToRepeatedPack(UInt64 value) -{ - writeVarint(value, buffer); -} - -void ProtobufWriter::SimpleWriter::addIntToRepeatedPack(Int64 value) -{ - writeVarint(static_cast(value), buffer); -} - -void ProtobufWriter::SimpleWriter::addSIntToRepeatedPack(Int64 value) -{ - writeVarint(encodeZigZag(value), buffer); -} - -template -void ProtobufWriter::SimpleWriter::addFixedToRepeatedPack(T value) -{ - static_assert((sizeof(T) == 4) || (sizeof(T) == 8)); - size_t old_size = buffer.size(); - buffer.resize(old_size + sizeof(T)); - memcpy(buffer.data() + old_size, &value, sizeof(T)); -} - - -// Implementation for a converter from any DB data type to any protobuf field type. 
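The converter classes removed below, like the new serializers above, all bottom out in the same wire-format primitives of ProtobufWriter: unsigned values are base-128 varints, sint32/sint64 values are zigzag-encoded first, and packed repeated scalars become one length-delimited blob of concatenated encodings. A standalone sketch of those three primitives, independent of the ProtobufWriter class itself:

#include <cstdint>
#include <iomanip>
#include <iostream>
#include <string>

// Base-128 varint: 7 bits per byte, high bit set while more bytes follow.
void writeVarint(uint64_t value, std::string & out)
{
    while (value >= 0x80)
    {
        out.push_back(static_cast<char>(value | 0x80));
        value >>= 7;
    }
    out.push_back(static_cast<char>(value));
}

// ZigZag maps signed to unsigned so small negatives stay short: 0,-1,1,-2,... -> 0,1,2,3,...
uint64_t encodeZigZag(int64_t value)
{
    return (static_cast<uint64_t>(value) << 1) ^ static_cast<uint64_t>(value >> 63);
}

void dump(const std::string & bytes)
{
    for (unsigned char c : bytes)
        std::cout << std::hex << std::setw(2) << std::setfill('0') << int(c) << ' ';
    std::cout << std::dec << '\n';
}

int main()
{
    std::string buf;
    writeVarint(300, buf);               // ac 02
    dump(buf);

    buf.clear();
    writeVarint(encodeZigZag(-2), buf);  // zigzag(-2) = 3 -> 03
    dump(buf);

    // Packed repeated uint64 field number 4: key byte (4 << 3 | 2 = 0x22),
    // then the byte length of the payload, then the concatenated varints.
    std::string payload;
    for (uint64_t v : {1u, 150u, 3u})
        writeVarint(v, payload);
    std::string message;
    writeVarint((4u << 3) | 2u, message);
    writeVarint(payload.size(), message);
    message += payload;
    dump(message);                       // 22 04 01 96 01 03
}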
-class ProtobufWriter::ConverterBaseImpl : public IConverter -{ -public: - ConverterBaseImpl(SimpleWriter & simple_writer_, const google::protobuf::FieldDescriptor * field_) - : simple_writer(simple_writer_), field(field_) - { - field_number = field->number(); - } - - virtual void writeString(const StringRef &) override { cannotConvertType("String"); } - virtual void writeInt8(Int8) override { cannotConvertType("Int8"); } - virtual void writeUInt8(UInt8) override { cannotConvertType("UInt8"); } - virtual void writeInt16(Int16) override { cannotConvertType("Int16"); } - virtual void writeUInt16(UInt16) override { cannotConvertType("UInt16"); } - virtual void writeInt32(Int32) override { cannotConvertType("Int32"); } - virtual void writeUInt32(UInt32) override { cannotConvertType("UInt32"); } - virtual void writeInt64(Int64) override { cannotConvertType("Int64"); } - virtual void writeUInt64(UInt64) override { cannotConvertType("UInt64"); } - virtual void writeInt128(Int128) override { cannotConvertType("Int128"); } - virtual void writeUInt128(const UInt128 &) override { cannotConvertType("UInt128"); } - virtual void writeInt256(const Int256 &) override { cannotConvertType("Int256"); } - virtual void writeUInt256(const UInt256 &) override { cannotConvertType("UInt256"); } - virtual void writeFloat32(Float32) override { cannotConvertType("Float32"); } - virtual void writeFloat64(Float64) override { cannotConvertType("Float64"); } - virtual void prepareEnumMapping8(const std::vector> &) override {} - virtual void prepareEnumMapping16(const std::vector> &) override {} - virtual void writeEnum8(Int8) override { cannotConvertType("Enum"); } - virtual void writeEnum16(Int16) override { cannotConvertType("Enum"); } - virtual void writeUUID(const UUID &) override { cannotConvertType("UUID"); } - virtual void writeDate(DayNum) override { cannotConvertType("Date"); } - virtual void writeDateTime(time_t) override { cannotConvertType("DateTime"); } - virtual void writeDateTime64(DateTime64, UInt32) override { cannotConvertType("DateTime64"); } - virtual void writeDecimal32(Decimal32, UInt32) override { cannotConvertType("Decimal32"); } - virtual void writeDecimal64(Decimal64, UInt32) override { cannotConvertType("Decimal64"); } - virtual void writeDecimal128(const Decimal128 &, UInt32) override { cannotConvertType("Decimal128"); } - virtual void writeDecimal256(const Decimal256 &, UInt32) override { cannotConvertType("Decimal256"); } - - virtual void writeAggregateFunction(const AggregateFunctionPtr &, ConstAggregateDataPtr) override { cannotConvertType("AggregateFunction"); } - -protected: - [[noreturn]] void cannotConvertType(const String & type_name) - { - throw Exception( - "Could not convert data type '" + type_name + "' to protobuf type '" + field->type_name() + "' (field: " + field->name() + ")", - ErrorCodes::PROTOBUF_BAD_CAST); - } - - [[noreturn]] void cannotConvertValue(const String & value) - { - throw Exception( - "Could not convert value '" + value + "' to protobuf type '" + field->type_name() + "' (field: " + field->name() + ")", - ErrorCodes::PROTOBUF_BAD_CAST); - } - - template - To numericCast(From value) - { - if constexpr (std::is_same_v) - return value; - To result; - try - { - result = boost::numeric_cast(value); - } - catch (boost::numeric::bad_numeric_cast &) - { - cannotConvertValue(toString(value)); - } - return result; - } - - template - To parseFromString(const StringRef & str) - { - To result; - try - { - result = ::DB::parse(str.data, str.size); - } - catch (...) 
- { - cannotConvertValue(str.toString()); - } - return result; - } - - SimpleWriter & simple_writer; - const google::protobuf::FieldDescriptor * field; - UInt32 field_number; -}; - - -template -class ProtobufWriter::ConverterToString : public ConverterBaseImpl -{ -public: - using ConverterBaseImpl::ConverterBaseImpl; - - void writeString(const StringRef & str) override { writeField(str); } - - void writeInt8(Int8 value) override { convertToStringAndWriteField(value); } - void writeUInt8(UInt8 value) override { convertToStringAndWriteField(value); } - void writeInt16(Int16 value) override { convertToStringAndWriteField(value); } - void writeUInt16(UInt16 value) override { convertToStringAndWriteField(value); } - void writeInt32(Int32 value) override { convertToStringAndWriteField(value); } - void writeUInt32(UInt32 value) override { convertToStringAndWriteField(value); } - void writeInt64(Int64 value) override { convertToStringAndWriteField(value); } - void writeUInt64(UInt64 value) override { convertToStringAndWriteField(value); } - void writeFloat32(Float32 value) override { convertToStringAndWriteField(value); } - void writeFloat64(Float64 value) override { convertToStringAndWriteField(value); } - - void prepareEnumMapping8(const std::vector> & name_value_pairs) override - { - prepareEnumValueToNameMap(name_value_pairs); - } - void prepareEnumMapping16(const std::vector> & name_value_pairs) override - { - prepareEnumValueToNameMap(name_value_pairs); - } - - void writeEnum8(Int8 value) override { writeEnum16(value); } - - void writeEnum16(Int16 value) override - { - auto it = enum_value_to_name_map->find(value); - if (it == enum_value_to_name_map->end()) - cannotConvertValue(toString(value)); - writeField(it->second); - } - - void writeUUID(const UUID & uuid) override { convertToStringAndWriteField(uuid); } - void writeDate(DayNum date) override { convertToStringAndWriteField(date); } - - void writeDateTime(time_t tm) override - { - writeDateTimeText(tm, text_buffer); - writeField(text_buffer.stringRef()); - text_buffer.restart(); - } - - void writeDateTime64(DateTime64 date_time, UInt32 scale) override - { - writeDateTimeText(date_time, scale, text_buffer); - writeField(text_buffer.stringRef()); - text_buffer.restart(); - } - - void writeDecimal32(Decimal32 decimal, UInt32 scale) override { writeDecimal(decimal, scale); } - void writeDecimal64(Decimal64 decimal, UInt32 scale) override { writeDecimal(decimal, scale); } - void writeDecimal128(const Decimal128 & decimal, UInt32 scale) override { writeDecimal(decimal, scale); } - - void writeAggregateFunction(const AggregateFunctionPtr & function, ConstAggregateDataPtr place) override - { - function->serialize(place, text_buffer); - writeField(text_buffer.stringRef()); - text_buffer.restart(); - } - -private: - template - void convertToStringAndWriteField(T value) - { - writeText(value, text_buffer); - writeField(text_buffer.stringRef()); - text_buffer.restart(); - } - - template - void writeDecimal(const Decimal & decimal, UInt32 scale) - { - writeText(decimal, scale, text_buffer); - writeField(text_buffer.stringRef()); - text_buffer.restart(); - } - - template - void prepareEnumValueToNameMap(const std::vector> & name_value_pairs) - { - if (enum_value_to_name_map.has_value()) - return; - enum_value_to_name_map.emplace(); - for (const auto & name_value_pair : name_value_pairs) - enum_value_to_name_map->emplace(name_value_pair.second, name_value_pair.first); - } - - void writeField(const StringRef & str) - { - if constexpr (skip_null_value) 
- { - if (!str.size) - return; - } - simple_writer.writeString(field_number, str); - } - - WriteBufferFromOwnString text_buffer; - std::optional> enum_value_to_name_map; -}; - -# define PROTOBUF_WRITER_CREATE_CONVERTER_SPECIALIZATION_FOR_STRINGS(field_type_id) \ - template <> \ - std::unique_ptr ProtobufWriter::createConverter( \ - const google::protobuf::FieldDescriptor * field) \ - { \ - if (shouldSkipNullValue(field)) \ - return std::make_unique>(simple_writer, field); \ - else \ - return std::make_unique>(simple_writer, field); \ - } -PROTOBUF_WRITER_CREATE_CONVERTER_SPECIALIZATION_FOR_STRINGS(google::protobuf::FieldDescriptor::TYPE_STRING) -PROTOBUF_WRITER_CREATE_CONVERTER_SPECIALIZATION_FOR_STRINGS(google::protobuf::FieldDescriptor::TYPE_BYTES) -# undef PROTOBUF_WRITER_CREATE_CONVERTER_SPECIALIZATION_FOR_STRINGS - - -template -class ProtobufWriter::ConverterToNumber : public ConverterBaseImpl -{ -public: - using ConverterBaseImpl::ConverterBaseImpl; - - void writeString(const StringRef & str) override { writeField(parseFromString(str)); } - - void writeInt8(Int8 value) override { castNumericAndWriteField(value); } - void writeUInt8(UInt8 value) override { castNumericAndWriteField(value); } - void writeInt16(Int16 value) override { castNumericAndWriteField(value); } - void writeUInt16(UInt16 value) override { castNumericAndWriteField(value); } - void writeInt32(Int32 value) override { castNumericAndWriteField(value); } - void writeUInt32(UInt32 value) override { castNumericAndWriteField(value); } - void writeInt64(Int64 value) override { castNumericAndWriteField(value); } - void writeUInt64(UInt64 value) override { castNumericAndWriteField(value); } - void writeFloat32(Float32 value) override { castNumericAndWriteField(value); } - void writeFloat64(Float64 value) override { castNumericAndWriteField(value); } - - void writeEnum8(Int8 value) override { writeEnum16(value); } - - void writeEnum16(Int16 value) override - { - if constexpr (!is_integer_v) - cannotConvertType("Enum"); // It's not correct to convert enum to floating point. 
- castNumericAndWriteField(value); - } - - void writeDate(DayNum date) override { castNumericAndWriteField(static_cast(date)); } - void writeDateTime(time_t tm) override { castNumericAndWriteField(tm); } - void writeDateTime64(DateTime64 date_time, UInt32 scale) override { writeDecimal(date_time, scale); } - void writeDecimal32(Decimal32 decimal, UInt32 scale) override { writeDecimal(decimal, scale); } - void writeDecimal64(Decimal64 decimal, UInt32 scale) override { writeDecimal(decimal, scale); } - void writeDecimal128(const Decimal128 & decimal, UInt32 scale) override { writeDecimal(decimal, scale); } - -private: - template - void castNumericAndWriteField(FromType value) - { - writeField(numericCast(value)); - } - - template - void writeDecimal(const Decimal & decimal, UInt32 scale) - { - castNumericAndWriteField(DecimalUtils::convertTo(decimal, scale)); - } - - void writeField(ToType value) - { - if constexpr (skip_null_value) - { - if (value == 0) - return; - } - if constexpr (((field_type_id == google::protobuf::FieldDescriptor::TYPE_INT32) && std::is_same_v) - || ((field_type_id == google::protobuf::FieldDescriptor::TYPE_INT64) && std::is_same_v)) - { - if constexpr (pack_repeated) - simple_writer.addIntToRepeatedPack(value); - else - simple_writer.writeInt(field_number, value); - } - else if constexpr (((field_type_id == google::protobuf::FieldDescriptor::TYPE_SINT32) && std::is_same_v) - || ((field_type_id == google::protobuf::FieldDescriptor::TYPE_SINT64) && std::is_same_v)) - { - if constexpr (pack_repeated) - simple_writer.addSIntToRepeatedPack(value); - else - simple_writer.writeSInt(field_number, value); - } - else if constexpr (((field_type_id == google::protobuf::FieldDescriptor::TYPE_UINT32) && std::is_same_v) - || ((field_type_id == google::protobuf::FieldDescriptor::TYPE_UINT64) && std::is_same_v)) - { - if constexpr (pack_repeated) - simple_writer.addUIntToRepeatedPack(value); - else - simple_writer.writeUInt(field_number, value); - } - else - { - static_assert(((field_type_id == google::protobuf::FieldDescriptor::TYPE_FIXED32) && std::is_same_v) - || ((field_type_id == google::protobuf::FieldDescriptor::TYPE_SFIXED32) && std::is_same_v) - || ((field_type_id == google::protobuf::FieldDescriptor::TYPE_FIXED64) && std::is_same_v) - || ((field_type_id == google::protobuf::FieldDescriptor::TYPE_SFIXED64) && std::is_same_v) - || ((field_type_id == google::protobuf::FieldDescriptor::TYPE_FLOAT) && std::is_same_v) - || ((field_type_id == google::protobuf::FieldDescriptor::TYPE_DOUBLE) && std::is_same_v)); - if constexpr (pack_repeated) - simple_writer.addFixedToRepeatedPack(value); - else - simple_writer.writeFixed(field_number, value); - } - } -}; - -# define PROTOBUF_WRITER_CREATE_CONVERTER_SPECIALIZATION_FOR_NUMBERS(field_type_id, field_type) \ - template <> \ - std::unique_ptr ProtobufWriter::createConverter( \ - const google::protobuf::FieldDescriptor * field) \ - { \ - if (shouldSkipNullValue(field)) \ - return std::make_unique>(simple_writer, field); \ - else if (shouldPackRepeated(field)) \ - return std::make_unique>(simple_writer, field); \ - else \ - return std::make_unique>(simple_writer, field); \ - } - -PROTOBUF_WRITER_CREATE_CONVERTER_SPECIALIZATION_FOR_NUMBERS(google::protobuf::FieldDescriptor::TYPE_INT32, Int32); -PROTOBUF_WRITER_CREATE_CONVERTER_SPECIALIZATION_FOR_NUMBERS(google::protobuf::FieldDescriptor::TYPE_SINT32, Int32); -PROTOBUF_WRITER_CREATE_CONVERTER_SPECIALIZATION_FOR_NUMBERS(google::protobuf::FieldDescriptor::TYPE_UINT32, UInt32); 
-PROTOBUF_WRITER_CREATE_CONVERTER_SPECIALIZATION_FOR_NUMBERS(google::protobuf::FieldDescriptor::TYPE_INT64, Int64); -PROTOBUF_WRITER_CREATE_CONVERTER_SPECIALIZATION_FOR_NUMBERS(google::protobuf::FieldDescriptor::TYPE_SINT64, Int64); -PROTOBUF_WRITER_CREATE_CONVERTER_SPECIALIZATION_FOR_NUMBERS(google::protobuf::FieldDescriptor::TYPE_UINT64, UInt64); -PROTOBUF_WRITER_CREATE_CONVERTER_SPECIALIZATION_FOR_NUMBERS(google::protobuf::FieldDescriptor::TYPE_FIXED32, UInt32); -PROTOBUF_WRITER_CREATE_CONVERTER_SPECIALIZATION_FOR_NUMBERS(google::protobuf::FieldDescriptor::TYPE_SFIXED32, Int32); -PROTOBUF_WRITER_CREATE_CONVERTER_SPECIALIZATION_FOR_NUMBERS(google::protobuf::FieldDescriptor::TYPE_FIXED64, UInt64); -PROTOBUF_WRITER_CREATE_CONVERTER_SPECIALIZATION_FOR_NUMBERS(google::protobuf::FieldDescriptor::TYPE_SFIXED64, Int64); -PROTOBUF_WRITER_CREATE_CONVERTER_SPECIALIZATION_FOR_NUMBERS(google::protobuf::FieldDescriptor::TYPE_FLOAT, float); -PROTOBUF_WRITER_CREATE_CONVERTER_SPECIALIZATION_FOR_NUMBERS(google::protobuf::FieldDescriptor::TYPE_DOUBLE, double); -# undef PROTOBUF_WRITER_CREATE_CONVERTER_SPECIALIZATION_FOR_NUMBERS - - -template -class ProtobufWriter::ConverterToBool : public ConverterBaseImpl -{ -public: - using ConverterBaseImpl::ConverterBaseImpl; - - void writeString(const StringRef & str) override - { - if (str == "true") - writeField(true); - else if (str == "false") - writeField(false); - else - cannotConvertValue(str.toString()); - } - - void writeInt8(Int8 value) override { convertToBoolAndWriteField(value); } - void writeUInt8(UInt8 value) override { convertToBoolAndWriteField(value); } - void writeInt16(Int16 value) override { convertToBoolAndWriteField(value); } - void writeUInt16(UInt16 value) override { convertToBoolAndWriteField(value); } - void writeInt32(Int32 value) override { convertToBoolAndWriteField(value); } - void writeUInt32(UInt32 value) override { convertToBoolAndWriteField(value); } - void writeInt64(Int64 value) override { convertToBoolAndWriteField(value); } - void writeUInt64(UInt64 value) override { convertToBoolAndWriteField(value); } - void writeFloat32(Float32 value) override { convertToBoolAndWriteField(value); } - void writeFloat64(Float64 value) override { convertToBoolAndWriteField(value); } - void writeDecimal32(Decimal32 decimal, UInt32) override { convertToBoolAndWriteField(decimal.value); } - void writeDecimal64(Decimal64 decimal, UInt32) override { convertToBoolAndWriteField(decimal.value); } - void writeDecimal128(const Decimal128 & decimal, UInt32) override { convertToBoolAndWriteField(decimal.value); } - -private: - template - void convertToBoolAndWriteField(T value) - { - writeField(static_cast(value)); - } - - void writeField(bool b) - { - if constexpr (skip_null_value) - { - if (!b) - return; - } - if constexpr (pack_repeated) - simple_writer.addUIntToRepeatedPack(b); - else - simple_writer.writeUInt(field_number, b); - } -}; - -template <> -std::unique_ptr ProtobufWriter::createConverter( - const google::protobuf::FieldDescriptor * field) -{ - if (shouldSkipNullValue(field)) - return std::make_unique>(simple_writer, field); - else if (shouldPackRepeated(field)) - return std::make_unique>(simple_writer, field); - else - return std::make_unique>(simple_writer, field); -} - - -template -class ProtobufWriter::ConverterToEnum : public ConverterBaseImpl -{ -public: - using ConverterBaseImpl::ConverterBaseImpl; - - void writeString(const StringRef & str) override - { - prepareEnumNameToPbNumberMap(); - auto it = enum_name_to_pbnumber_map->find(str); 
- if (it == enum_name_to_pbnumber_map->end()) - cannotConvertValue(str.toString()); - writeField(it->second); - } - - void writeInt8(Int8 value) override { convertToEnumAndWriteField(value); } - void writeUInt8(UInt8 value) override { convertToEnumAndWriteField(value); } - void writeInt16(Int16 value) override { convertToEnumAndWriteField(value); } - void writeUInt16(UInt16 value) override { convertToEnumAndWriteField(value); } - void writeInt32(Int32 value) override { convertToEnumAndWriteField(value); } - void writeUInt32(UInt32 value) override { convertToEnumAndWriteField(value); } - void writeInt64(Int64 value) override { convertToEnumAndWriteField(value); } - void writeUInt64(UInt64 value) override { convertToEnumAndWriteField(value); } - - void prepareEnumMapping8(const std::vector> & name_value_pairs) override - { - prepareEnumValueToPbNumberMap(name_value_pairs); - } - void prepareEnumMapping16(const std::vector> & name_value_pairs) override - { - prepareEnumValueToPbNumberMap(name_value_pairs); - } - - void writeEnum8(Int8 value) override { writeEnum16(value); } - - void writeEnum16(Int16 value) override - { - int pbnumber; - if (enum_value_always_equals_pbnumber) - pbnumber = value; - else - { - auto it = enum_value_to_pbnumber_map->find(value); - if (it == enum_value_to_pbnumber_map->end()) - cannotConvertValue(toString(value)); - pbnumber = it->second; - } - writeField(pbnumber); - } - -private: - template - void convertToEnumAndWriteField(T value) - { - const auto * enum_descriptor = field->enum_type()->FindValueByNumber(numericCast(value)); - if (!enum_descriptor) - cannotConvertValue(toString(value)); - writeField(enum_descriptor->number()); - } - - void prepareEnumNameToPbNumberMap() - { - if (enum_name_to_pbnumber_map.has_value()) - return; - enum_name_to_pbnumber_map.emplace(); - const auto * enum_type = field->enum_type(); - for (int i = 0; i != enum_type->value_count(); ++i) - { - const auto * enum_value = enum_type->value(i); - enum_name_to_pbnumber_map->emplace(enum_value->name(), enum_value->number()); - } - } - - template - void prepareEnumValueToPbNumberMap(const std::vector> & name_value_pairs) - { - if (enum_value_to_pbnumber_map.has_value()) - return; - enum_value_to_pbnumber_map.emplace(); - enum_value_always_equals_pbnumber = true; - for (const auto & name_value_pair : name_value_pairs) - { - Int16 value = name_value_pair.second; // NOLINT - const auto * enum_descriptor = field->enum_type()->FindValueByName(name_value_pair.first); - if (enum_descriptor) - { - enum_value_to_pbnumber_map->emplace(value, enum_descriptor->number()); - if (value != enum_descriptor->number()) - enum_value_always_equals_pbnumber = false; - } - else - enum_value_always_equals_pbnumber = false; - } - } - - void writeField(int enum_pbnumber) - { - if constexpr (skip_null_value) - { - if (!enum_pbnumber) - return; - } - if constexpr (pack_repeated) - simple_writer.addUIntToRepeatedPack(enum_pbnumber); - else - simple_writer.writeUInt(field_number, enum_pbnumber); - } - - std::optional> enum_name_to_pbnumber_map; - std::optional> enum_value_to_pbnumber_map; - bool enum_value_always_equals_pbnumber; -}; - -template <> -std::unique_ptr ProtobufWriter::createConverter( - const google::protobuf::FieldDescriptor * field) -{ - if (shouldSkipNullValue(field)) - return std::make_unique>(simple_writer, field); - else if (shouldPackRepeated(field)) - return std::make_unique>(simple_writer, field); - else - return std::make_unique>(simple_writer, field); -} - - -ProtobufWriter::ProtobufWriter( - 
WriteBuffer & out, const google::protobuf::Descriptor * message_type, const std::vector & column_names, const bool use_length_delimiters_) - : simple_writer(out, use_length_delimiters_) -{ - std::vector field_descriptors_without_match; - root_message = ProtobufColumnMatcher::matchColumns(column_names, message_type, field_descriptors_without_match); - for (const auto * field_descriptor_without_match : field_descriptors_without_match) - { - if (field_descriptor_without_match->is_required()) - throw Exception( - "Output doesn't have a column named '" + field_descriptor_without_match->name() - + "' which is required to write the output in the protobuf format.", - ErrorCodes::NO_DATA_FOR_REQUIRED_PROTOBUF_FIELD); - } - setTraitsDataAfterMatchingColumns(root_message.get()); -} - -ProtobufWriter::~ProtobufWriter() = default; - -void ProtobufWriter::setTraitsDataAfterMatchingColumns(Message * message) -{ - Field * parent_field = message->parent ? &message->parent->fields[message->index_in_parent] : nullptr; - message->data.parent_field_number = parent_field ? parent_field->field_number : 0; - message->data.is_required = parent_field && parent_field->data.is_required; - - if (parent_field && parent_field->data.is_repeatable) - message->data.repeatable_container_message = message; - else if (message->parent) - message->data.repeatable_container_message = message->parent->data.repeatable_container_message; - else - message->data.repeatable_container_message = nullptr; - - message->data.is_group = parent_field && (parent_field->field_descriptor->type() == google::protobuf::FieldDescriptor::TYPE_GROUP); - - for (auto & field : message->fields) - { - field.data.is_repeatable = field.field_descriptor->is_repeated(); - field.data.is_required = field.field_descriptor->is_required(); - field.data.repeatable_container_message = message->data.repeatable_container_message; - field.data.should_pack_repeated = shouldPackRepeated(field.field_descriptor); - - if (field.nested_message) - { - setTraitsDataAfterMatchingColumns(field.nested_message.get()); - continue; - } - switch (field.field_descriptor->type()) - { -# define PROTOBUF_WRITER_CONVERTER_CREATING_CASE(field_type_id) \ - case field_type_id: \ - field.data.converter = createConverter(field.field_descriptor); \ - break - PROTOBUF_WRITER_CONVERTER_CREATING_CASE(google::protobuf::FieldDescriptor::TYPE_STRING); - PROTOBUF_WRITER_CONVERTER_CREATING_CASE(google::protobuf::FieldDescriptor::TYPE_BYTES); - PROTOBUF_WRITER_CONVERTER_CREATING_CASE(google::protobuf::FieldDescriptor::TYPE_INT32); - PROTOBUF_WRITER_CONVERTER_CREATING_CASE(google::protobuf::FieldDescriptor::TYPE_SINT32); - PROTOBUF_WRITER_CONVERTER_CREATING_CASE(google::protobuf::FieldDescriptor::TYPE_UINT32); - PROTOBUF_WRITER_CONVERTER_CREATING_CASE(google::protobuf::FieldDescriptor::TYPE_FIXED32); - PROTOBUF_WRITER_CONVERTER_CREATING_CASE(google::protobuf::FieldDescriptor::TYPE_SFIXED32); - PROTOBUF_WRITER_CONVERTER_CREATING_CASE(google::protobuf::FieldDescriptor::TYPE_INT64); - PROTOBUF_WRITER_CONVERTER_CREATING_CASE(google::protobuf::FieldDescriptor::TYPE_SINT64); - PROTOBUF_WRITER_CONVERTER_CREATING_CASE(google::protobuf::FieldDescriptor::TYPE_UINT64); - PROTOBUF_WRITER_CONVERTER_CREATING_CASE(google::protobuf::FieldDescriptor::TYPE_FIXED64); - PROTOBUF_WRITER_CONVERTER_CREATING_CASE(google::protobuf::FieldDescriptor::TYPE_SFIXED64); - PROTOBUF_WRITER_CONVERTER_CREATING_CASE(google::protobuf::FieldDescriptor::TYPE_FLOAT); - 
PROTOBUF_WRITER_CONVERTER_CREATING_CASE(google::protobuf::FieldDescriptor::TYPE_DOUBLE); - PROTOBUF_WRITER_CONVERTER_CREATING_CASE(google::protobuf::FieldDescriptor::TYPE_BOOL); - PROTOBUF_WRITER_CONVERTER_CREATING_CASE(google::protobuf::FieldDescriptor::TYPE_ENUM); -# undef PROTOBUF_WRITER_CONVERTER_CREATING_CASE - default: - throw Exception( - String("Protobuf type '") + field.field_descriptor->type_name() + "' isn't supported", ErrorCodes::NOT_IMPLEMENTED); - } - } -} - -void ProtobufWriter::startMessage() -{ - current_message = root_message.get(); - current_field_index = 0; - simple_writer.startMessage(); -} - -void ProtobufWriter::endMessage() -{ - if (!current_message) - return; - endWritingField(); - while (current_message->parent) - { - simple_writer.endNestedMessage( - current_message->data.parent_field_number, current_message->data.is_group, !current_message->data.is_required); - current_message = current_message->parent; - } - simple_writer.endMessage(); - current_message = nullptr; -} - -bool ProtobufWriter::writeField(size_t & column_index) -{ - endWritingField(); - while (true) - { - if (current_field_index < current_message->fields.size()) - { - Field & field = current_message->fields[current_field_index]; - if (!field.nested_message) - { - current_field = ¤t_message->fields[current_field_index]; - current_converter = current_field->data.converter.get(); - column_index = current_field->column_index; - if (current_field->data.should_pack_repeated) - simple_writer.startRepeatedPack(); - return true; - } - simple_writer.startNestedMessage(); - current_message = field.nested_message.get(); - current_message->data.need_repeat = false; - current_field_index = 0; - continue; - } - if (current_message->parent) - { - simple_writer.endNestedMessage( - current_message->data.parent_field_number, current_message->data.is_group, !current_message->data.is_required); - if (current_message->data.need_repeat) - { - simple_writer.startNestedMessage(); - current_message->data.need_repeat = false; - current_field_index = 0; - continue; - } - current_field_index = current_message->index_in_parent + 1; - current_message = current_message->parent; - continue; - } - return false; - } -} - -void ProtobufWriter::endWritingField() -{ - if (!current_field) - return; - if (current_field->data.should_pack_repeated) - simple_writer.endRepeatedPack(current_field->field_number); - else if ((num_values == 0) && current_field->data.is_required) - throw Exception( - "No data for the required field '" + current_field->field_descriptor->name() + "'", - ErrorCodes::NO_DATA_FOR_REQUIRED_PROTOBUF_FIELD); - - current_field = nullptr; - current_converter = nullptr; - num_values = 0; - ++current_field_index; -} - -void ProtobufWriter::setNestedMessageNeedsRepeat() -{ - if (current_field->data.repeatable_container_message) - current_field->data.repeatable_container_message->data.need_repeat = true; - else - throw Exception( - "Cannot write more than single value to the non-repeated field '" + current_field->field_descriptor->name() + "'", - ErrorCodes::PROTOBUF_FIELD_NOT_REPEATED); + in_repeated_pack = false; } } diff --git a/src/Formats/ProtobufWriter.h b/src/Formats/ProtobufWriter.h index 52bb453aa73..6af1a237fbd 100644 --- a/src/Formats/ProtobufWriter.h +++ b/src/Formats/ProtobufWriter.h @@ -1,290 +1,68 @@ #pragma once -#include -#include -#include - #if !defined(ARCADIA_BUILD) # include "config_formats.h" #endif #if USE_PROTOBUF -# include -# include -# include -# include "ProtobufColumnMatcher.h" - - -namespace 
google -{ -namespace protobuf -{ - class Descriptor; - class FieldDescriptor; -} -} - -namespace DB -{ -class IAggregateFunction; -using AggregateFunctionPtr = std::shared_ptr; -using ConstAggregateDataPtr = const char *; - - -/** Serializes a protobuf, tries to cast types if necessarily. - */ -class ProtobufWriter : private boost::noncopyable -{ -public: - ProtobufWriter(WriteBuffer & out, const google::protobuf::Descriptor * message_type, const std::vector & column_names, const bool use_length_delimiters_); - ~ProtobufWriter(); - - /// Should be called at the beginning of writing a message. - void startMessage(); - - /// Should be called at the end of writing a message. - void endMessage(); - - /// Prepares for writing values of a field. - /// Returns true and sets 'column_index' to the corresponding column's index. - /// Returns false if there are no more fields to write in the message type (call endMessage() in this case). - bool writeField(size_t & column_index); - - /// Writes a value. This function should be called one or multiple times after writeField(). - /// Returns false if there are no more place for the values in the protobuf's field. - /// This can happen if the protobuf's field is not declared as repeated in the protobuf schema. - bool writeNumber(Int8 value) { return writeValueIfPossible(&IConverter::writeInt8, value); } - bool writeNumber(UInt8 value) { return writeValueIfPossible(&IConverter::writeUInt8, value); } - bool writeNumber(Int16 value) { return writeValueIfPossible(&IConverter::writeInt16, value); } - bool writeNumber(UInt16 value) { return writeValueIfPossible(&IConverter::writeUInt16, value); } - bool writeNumber(Int32 value) { return writeValueIfPossible(&IConverter::writeInt32, value); } - bool writeNumber(UInt32 value) { return writeValueIfPossible(&IConverter::writeUInt32, value); } - bool writeNumber(Int64 value) { return writeValueIfPossible(&IConverter::writeInt64, value); } - bool writeNumber(UInt64 value) { return writeValueIfPossible(&IConverter::writeUInt64, value); } - bool writeNumber(Int128 value) { return writeValueIfPossible(&IConverter::writeInt128, value); } - bool writeNumber(UInt128 value) { return writeValueIfPossible(&IConverter::writeUInt128, value); } - - bool writeNumber(Int256 value) { return writeValueIfPossible(&IConverter::writeInt256, value); } - bool writeNumber(UInt256 value) { return writeValueIfPossible(&IConverter::writeUInt256, value); } - - bool writeNumber(Float32 value) { return writeValueIfPossible(&IConverter::writeFloat32, value); } - bool writeNumber(Float64 value) { return writeValueIfPossible(&IConverter::writeFloat64, value); } - bool writeString(const StringRef & str) { return writeValueIfPossible(&IConverter::writeString, str); } - void prepareEnumMapping(const std::vector> & enum_values) { current_converter->prepareEnumMapping8(enum_values); } - void prepareEnumMapping(const std::vector> & enum_values) { current_converter->prepareEnumMapping16(enum_values); } - bool writeEnum(Int8 value) { return writeValueIfPossible(&IConverter::writeEnum8, value); } - bool writeEnum(Int16 value) { return writeValueIfPossible(&IConverter::writeEnum16, value); } - bool writeUUID(const UUID & uuid) { return writeValueIfPossible(&IConverter::writeUUID, uuid); } - bool writeDate(DayNum date) { return writeValueIfPossible(&IConverter::writeDate, date); } - bool writeDateTime(time_t tm) { return writeValueIfPossible(&IConverter::writeDateTime, tm); } - bool writeDateTime64(DateTime64 tm, UInt32 scale) { return 
writeValueIfPossible(&IConverter::writeDateTime64, tm, scale); } - bool writeDecimal(Decimal32 decimal, UInt32 scale) { return writeValueIfPossible(&IConverter::writeDecimal32, decimal, scale); } - bool writeDecimal(Decimal64 decimal, UInt32 scale) { return writeValueIfPossible(&IConverter::writeDecimal64, decimal, scale); } - bool writeDecimal(const Decimal128 & decimal, UInt32 scale) { return writeValueIfPossible(&IConverter::writeDecimal128, decimal, scale); } - bool writeDecimal(const Decimal256 & decimal, UInt32 scale) { return writeValueIfPossible(&IConverter::writeDecimal256, decimal, scale); } - bool writeAggregateFunction(const AggregateFunctionPtr & function, ConstAggregateDataPtr place) { return writeValueIfPossible(&IConverter::writeAggregateFunction, function, place); } - -private: - class SimpleWriter - { - public: - SimpleWriter(WriteBuffer & out_, const bool use_length_delimiters_); - ~SimpleWriter(); - - void startMessage(); - void endMessage(); - - void startNestedMessage(); - void endNestedMessage(UInt32 field_number, bool is_group, bool skip_if_empty); - - void writeInt(UInt32 field_number, Int64 value); - void writeUInt(UInt32 field_number, UInt64 value); - void writeSInt(UInt32 field_number, Int64 value); - template - void writeFixed(UInt32 field_number, T value); - void writeString(UInt32 field_number, const StringRef & str); - - void startRepeatedPack(); - void addIntToRepeatedPack(Int64 value); - void addUIntToRepeatedPack(UInt64 value); - void addSIntToRepeatedPack(Int64 value); - template - void addFixedToRepeatedPack(T value); - void endRepeatedPack(UInt32 field_number); - - private: - struct Piece - { - size_t start; - size_t end; - Piece(size_t start_, size_t end_) : start(start_), end(end_) {} - Piece() = default; - }; - - struct NestedInfo - { - size_t num_pieces_at_start; - size_t num_bytes_skipped_at_start; - NestedInfo(size_t num_pieces_at_start_, size_t num_bytes_skipped_at_start_) - : num_pieces_at_start(num_pieces_at_start_), num_bytes_skipped_at_start(num_bytes_skipped_at_start_) - { - } - }; - - WriteBuffer & out; - PODArray buffer; - std::vector pieces; - size_t current_piece_start; - size_t num_bytes_skipped; - std::vector nested_infos; - const bool use_length_delimiters; - }; - - class IConverter - { - public: - virtual ~IConverter() = default; - virtual void writeString(const StringRef &) = 0; - virtual void writeInt8(Int8) = 0; - virtual void writeUInt8(UInt8) = 0; - virtual void writeInt16(Int16) = 0; - virtual void writeUInt16(UInt16) = 0; - virtual void writeInt32(Int32) = 0; - virtual void writeUInt32(UInt32) = 0; - virtual void writeInt64(Int64) = 0; - virtual void writeUInt64(UInt64) = 0; - virtual void writeInt128(Int128) = 0; - virtual void writeUInt128(const UInt128 &) = 0; - - virtual void writeInt256(const Int256 &) = 0; - virtual void writeUInt256(const UInt256 &) = 0; - - virtual void writeFloat32(Float32) = 0; - virtual void writeFloat64(Float64) = 0; - virtual void prepareEnumMapping8(const std::vector> &) = 0; - virtual void prepareEnumMapping16(const std::vector> &) = 0; - virtual void writeEnum8(Int8) = 0; - virtual void writeEnum16(Int16) = 0; - virtual void writeUUID(const UUID &) = 0; - virtual void writeDate(DayNum) = 0; - virtual void writeDateTime(time_t) = 0; - virtual void writeDateTime64(DateTime64, UInt32 scale) = 0; - virtual void writeDecimal32(Decimal32, UInt32) = 0; - virtual void writeDecimal64(Decimal64, UInt32) = 0; - virtual void writeDecimal128(const Decimal128 &, UInt32) = 0; - virtual void 
writeDecimal256(const Decimal256 &, UInt32) = 0; - virtual void writeAggregateFunction(const AggregateFunctionPtr &, ConstAggregateDataPtr) = 0; - }; - - class ConverterBaseImpl; - template - class ConverterToString; - template - class ConverterToNumber; - template - class ConverterToBool; - template - class ConverterToEnum; - - struct ColumnMatcherTraits - { - struct FieldData - { - std::unique_ptr converter; - bool is_required; - bool is_repeatable; - bool should_pack_repeated; - ProtobufColumnMatcher::Message * repeatable_container_message; - }; - struct MessageData - { - UInt32 parent_field_number; - bool is_group; - bool is_required; - ProtobufColumnMatcher::Message * repeatable_container_message; - bool need_repeat; - }; - }; - using Message = ProtobufColumnMatcher::Message; - using Field = ProtobufColumnMatcher::Field; - - void setTraitsDataAfterMatchingColumns(Message * message); - - template - std::unique_ptr createConverter(const google::protobuf::FieldDescriptor * field); - - template - using WriteValueFunctionPtr = void (IConverter::*)(Params...); - - template - bool writeValueIfPossible(WriteValueFunctionPtr func, Args &&... args) - { - if (num_values && !current_field->data.is_repeatable) - { - setNestedMessageNeedsRepeat(); - return false; - } - (current_converter->*func)(std::forward(args)...); - ++num_values; - return true; - } - - void setNestedMessageNeedsRepeat(); - void endWritingField(); - - SimpleWriter simple_writer; - std::unique_ptr root_message; - - Message * current_message; - size_t current_field_index = 0; - const Field * current_field = nullptr; - IConverter * current_converter = nullptr; - size_t num_values = 0; -}; - -} - -#else -# include +# include +# include namespace DB { -class IAggregateFunction; -using AggregateFunctionPtr = std::shared_ptr; -using ConstAggregateDataPtr = const char *; +class WriteBuffer; +/// Utility class for writing in the Protobuf format. +/// Knows nothing about protobuf schemas, just provides useful functions to serialize data. 
class ProtobufWriter { public: - bool writeNumber(Int8 /* value */) { return false; } - bool writeNumber(UInt8 /* value */) { return false; } - bool writeNumber(Int16 /* value */) { return false; } - bool writeNumber(UInt16 /* value */) { return false; } - bool writeNumber(Int32 /* value */) { return false; } - bool writeNumber(UInt32 /* value */) { return false; } - bool writeNumber(Int64 /* value */) { return false; } - bool writeNumber(UInt64 /* value */) { return false; } - bool writeNumber(Int128 /* value */) { return false; } - bool writeNumber(UInt128 /* value */) { return false; } - bool writeNumber(Int256 /* value */) { return false; } - bool writeNumber(UInt256 /* value */) { return false; } - bool writeNumber(Float32 /* value */) { return false; } - bool writeNumber(Float64 /* value */) { return false; } - bool writeString(const StringRef & /* value */) { return false; } - void prepareEnumMapping(const std::vector> & /* name_value_pairs */) {} - void prepareEnumMapping(const std::vector> & /* name_value_pairs */) {} - bool writeEnum(Int8 /* value */) { return false; } - bool writeEnum(Int16 /* value */) { return false; } - bool writeUUID(const UUID & /* value */) { return false; } - bool writeDate(DayNum /* date */) { return false; } - bool writeDateTime(time_t /* tm */) { return false; } - bool writeDateTime64(DateTime64 /*tm*/, UInt32 /*scale*/) { return false; } - bool writeDecimal(Decimal32 /* decimal */, UInt32 /* scale */) { return false; } - bool writeDecimal(Decimal64 /* decimal */, UInt32 /* scale */) { return false; } - bool writeDecimal(const Decimal128 & /* decimal */, UInt32 /* scale */) { return false; } - bool writeDecimal(const Decimal256 & /* decimal */, UInt32 /* scale */) { return false; } - bool writeAggregateFunction(const AggregateFunctionPtr & /* function */, ConstAggregateDataPtr /* place */) { return false; } + ProtobufWriter(WriteBuffer & out_); + ~ProtobufWriter(); + + void startMessage(); + void endMessage(bool with_length_delimiter); + + void startNestedMessage(); + void endNestedMessage(int field_number, bool is_group, bool skip_if_empty); + + void writeInt(int field_number, Int64 value); + void writeUInt(int field_number, UInt64 value); + void writeSInt(int field_number, Int64 value); + template + void writeFixed(int field_number, T value); + void writeString(int field_number, const std::string_view & str); + + void startRepeatedPack(); + void endRepeatedPack(int field_number, bool skip_if_empty); + +private: + struct Piece + { + size_t start; + size_t end; + Piece(size_t start_, size_t end_) : start(start_), end(end_) {} + Piece() = default; + }; + + struct NestedInfo + { + size_t num_pieces_at_start; + size_t num_bytes_skipped_at_start; + NestedInfo(size_t num_pieces_at_start_, size_t num_bytes_skipped_at_start_) + : num_pieces_at_start(num_pieces_at_start_), num_bytes_skipped_at_start(num_bytes_skipped_at_start_) + { + } + }; + + WriteBuffer & out; + PODArray buffer; + std::vector pieces; + size_t current_piece_start = 0; + size_t num_bytes_skipped = 0; + std::vector nested_infos; + bool in_repeated_pack = false; }; } diff --git a/src/Formats/ya.make b/src/Formats/ya.make index 6b72ec397d5..8fe938be125 100644 --- a/src/Formats/ya.make +++ b/src/Formats/ya.make @@ -20,9 +20,9 @@ SRCS( NativeFormat.cpp NullFormat.cpp ParsedTemplateFormatString.cpp - ProtobufColumnMatcher.cpp ProtobufReader.cpp ProtobufSchemas.cpp + ProtobufSerializer.cpp ProtobufWriter.cpp registerFormats.cpp verbosePrintString.cpp diff --git 
a/src/Processors/Formats/Impl/ProtobufRowInputFormat.cpp b/src/Processors/Formats/Impl/ProtobufRowInputFormat.cpp index d1420d0d38e..22a758b80f6 100644 --- a/src/Processors/Formats/Impl/ProtobufRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/ProtobufRowInputFormat.cpp @@ -1,57 +1,48 @@ #include "ProtobufRowInputFormat.h" #if USE_PROTOBUF -#include -#include -#include -#include -#include +# include +# include +# include +# include +# include +# include +# include +# include namespace DB { - -ProtobufRowInputFormat::ProtobufRowInputFormat(ReadBuffer & in_, const Block & header_, Params params_, const FormatSchemaInfo & info_, const bool use_length_delimiters_) +ProtobufRowInputFormat::ProtobufRowInputFormat(ReadBuffer & in_, const Block & header_, const Params & params_, const FormatSchemaInfo & schema_info_, bool with_length_delimiter_) : IRowInputFormat(header_, in_, params_) - , data_types(header_.getDataTypes()) - , reader(in, ProtobufSchemas::instance().getMessageTypeForFormatSchema(info_), header_.getNames(), use_length_delimiters_) + , reader(std::make_unique(in_)) + , serializer(ProtobufSerializer::create( + header_.getNames(), + header_.getDataTypes(), + missing_column_indices, + *ProtobufSchemas::instance().getMessageTypeForFormatSchema(schema_info_), + with_length_delimiter_, + *reader)) { } ProtobufRowInputFormat::~ProtobufRowInputFormat() = default; -bool ProtobufRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & extra) +bool ProtobufRowInputFormat::readRow(MutableColumns & columns, RowReadExtension & row_read_extension) { - if (!reader.startMessage()) - return false; // EOF reached, no more messages. + if (reader->eof()) + return false; - // Set of columns for which the values were read. The rest will be filled with default values. - auto & read_columns = extra.read_columns; - read_columns.assign(columns.size(), false); + size_t row_num = columns.empty() ? 0 : columns[0]->size(); + if (!row_num) + serializer->setColumns(columns.data(), columns.size()); - // Read values from this message and put them to the columns while it's possible. - size_t column_index; - while (reader.readColumnIndex(column_index)) - { - bool allow_add_row = !static_cast(read_columns[column_index]); - do - { - bool row_added; - data_types[column_index]->deserializeProtobuf(*columns[column_index], reader, allow_add_row, row_added); - if (row_added) - { - read_columns[column_index] = true; - allow_add_row = false; - } - } while (reader.canReadMoreValues()); - } + serializer->readRow(row_num); - // Fill non-visited columns with the default values. - for (column_index = 0; column_index < read_columns.size(); ++column_index) - if (!read_columns[column_index]) - data_types[column_index]->insertDefaultInto(*columns[column_index]); - - reader.endMessage(); + row_read_extension.read_columns.clear(); + row_read_extension.read_columns.resize(columns.size(), true); + for (size_t column_idx : missing_column_indices) + row_read_extension.read_columns[column_idx] = false; return true; } @@ -62,14 +53,14 @@ bool ProtobufRowInputFormat::allowSyncAfterError() const void ProtobufRowInputFormat::syncAfterError() { - reader.endMessage(true); + reader->endMessage(true); } void registerInputFormatProcessorProtobuf(FormatFactory & factory) { - for (bool use_length_delimiters : {false, true}) + for (bool with_length_delimiter : {false, true}) { - factory.registerInputFormatProcessor(use_length_delimiters ? 
"Protobuf" : "ProtobufSingle", [use_length_delimiters]( + factory.registerInputFormatProcessor(with_length_delimiter ? "Protobuf" : "ProtobufSingle", [with_length_delimiter]( ReadBuffer & buf, const Block & sample, IRowInputFormat::Params params, @@ -78,7 +69,7 @@ void registerInputFormatProcessorProtobuf(FormatFactory & factory) return std::make_shared(buf, sample, std::move(params), FormatSchemaInfo(settings.schema.format_schema, "Protobuf", true, settings.schema.is_server, settings.schema.format_schema_path), - use_length_delimiters); + with_length_delimiter); }); } } diff --git a/src/Processors/Formats/Impl/ProtobufRowInputFormat.h b/src/Processors/Formats/Impl/ProtobufRowInputFormat.h index c6bc350e893..b2eabd4f37c 100644 --- a/src/Processors/Formats/Impl/ProtobufRowInputFormat.h +++ b/src/Processors/Formats/Impl/ProtobufRowInputFormat.h @@ -5,14 +5,14 @@ #endif #if USE_PROTOBUF -# include -# include # include namespace DB { class Block; class FormatSchemaInfo; +class ProtobufReader; +class ProtobufSerializer; /** Stream designed to deserialize data from the google protobuf format. @@ -29,18 +29,19 @@ class FormatSchemaInfo; class ProtobufRowInputFormat : public IRowInputFormat { public: - ProtobufRowInputFormat(ReadBuffer & in_, const Block & header_, Params params_, const FormatSchemaInfo & info_, const bool use_length_delimiters_); + ProtobufRowInputFormat(ReadBuffer & in_, const Block & header_, const Params & params_, const FormatSchemaInfo & schema_info_, bool with_length_delimiter_); ~ProtobufRowInputFormat() override; String getName() const override { return "ProtobufRowInputFormat"; } - bool readRow(MutableColumns & columns, RowReadExtension & extra) override; + bool readRow(MutableColumns & columns, RowReadExtension &) override; bool allowSyncAfterError() const override; void syncAfterError() override; private: - DataTypes data_types; - ProtobufReader reader; + std::unique_ptr reader; + std::vector missing_column_indices; + std::unique_ptr serializer; }; } diff --git a/src/Processors/Formats/Impl/ProtobufRowOutputFormat.cpp b/src/Processors/Formats/Impl/ProtobufRowOutputFormat.cpp index 3c885e80e31..d3b9a0124c1 100644 --- a/src/Processors/Formats/Impl/ProtobufRowOutputFormat.cpp +++ b/src/Processors/Formats/Impl/ProtobufRowOutputFormat.cpp @@ -1,13 +1,13 @@ -#include #include "ProtobufRowOutputFormat.h" #if USE_PROTOBUF - -#include -#include -#include -#include -#include +# include +# include +# include +# include +# include +# include +# include namespace DB @@ -20,58 +20,55 @@ namespace ErrorCodes ProtobufRowOutputFormat::ProtobufRowOutputFormat( WriteBuffer & out_, - const Block & header, + const Block & header_, const RowOutputFormatParams & params_, - const FormatSchemaInfo & format_schema, - const FormatSettings & settings) - : IRowOutputFormat(header, out_, params_) - , data_types(header.getDataTypes()) - , writer(out, - ProtobufSchemas::instance().getMessageTypeForFormatSchema(format_schema), - header.getNames(), settings.protobuf.write_row_delimiters) - , allow_only_one_row( - !settings.protobuf.write_row_delimiters - && !settings.protobuf.allow_many_rows_no_delimiters) + const FormatSchemaInfo & schema_info_, + const FormatSettings & settings_, + bool with_length_delimiter_) + : IRowOutputFormat(header_, out_, params_) + , writer(std::make_unique(out)) + , serializer(ProtobufSerializer::create( + header_.getNames(), + header_.getDataTypes(), + *ProtobufSchemas::instance().getMessageTypeForFormatSchema(schema_info_), + with_length_delimiter_, + *writer)) + , 
allow_multiple_rows(with_length_delimiter_ || settings_.protobuf.allow_multiple_rows_without_delimiter) { - value_indices.resize(header.columns()); } void ProtobufRowOutputFormat::write(const Columns & columns, size_t row_num) { - if (allow_only_one_row && !first_row) - { - throw Exception("The ProtobufSingle format can't be used to write multiple rows because this format doesn't have any row delimiter.", ErrorCodes::NO_ROW_DELIMITER); - } + if (!allow_multiple_rows && !first_row) + throw Exception( + "The ProtobufSingle format can't be used to write multiple rows because this format doesn't have any row delimiter.", + ErrorCodes::NO_ROW_DELIMITER); - writer.startMessage(); - std::fill(value_indices.begin(), value_indices.end(), 0); - size_t column_index; - while (writer.writeField(column_index)) - data_types[column_index]->serializeProtobuf( - *columns[column_index], row_num, writer, value_indices[column_index]); - writer.endMessage(); + if (!row_num) + serializer->setColumns(columns.data(), columns.size()); + + serializer->writeRow(row_num); } void registerOutputFormatProcessorProtobuf(FormatFactory & factory) { - for (bool write_row_delimiters : {false, true}) + for (bool with_length_delimiter : {false, true}) { factory.registerOutputFormatProcessor( - write_row_delimiters ? "Protobuf" : "ProtobufSingle", - [write_row_delimiters](WriteBuffer & buf, + with_length_delimiter ? "Protobuf" : "ProtobufSingle", + [with_length_delimiter](WriteBuffer & buf, const Block & header, const RowOutputFormatParams & params, - const FormatSettings & _settings) + const FormatSettings & settings) { - FormatSettings settings = _settings; - settings.protobuf.write_row_delimiters = write_row_delimiters; return std::make_shared( buf, header, params, FormatSchemaInfo(settings.schema.format_schema, "Protobuf", true, settings.schema.is_server, settings.schema.format_schema_path), - settings); + settings, + with_length_delimiter); }); } } diff --git a/src/Processors/Formats/Impl/ProtobufRowOutputFormat.h b/src/Processors/Formats/Impl/ProtobufRowOutputFormat.h index 847f7607ff5..5f82950e891 100644 --- a/src/Processors/Formats/Impl/ProtobufRowOutputFormat.h +++ b/src/Processors/Formats/Impl/ProtobufRowOutputFormat.h @@ -8,21 +8,16 @@ # include # include # include -# include # include -namespace google -{ -namespace protobuf -{ - class Message; -} -} - - namespace DB { +class ProtobufWriter; +class ProtobufSerializer; +class FormatSchemaInfo; +struct FormatSettings; + /** Stream designed to serialize data in the google protobuf format. * Each row is written as a separated message. 
* @@ -38,10 +33,11 @@ class ProtobufRowOutputFormat : public IRowOutputFormat public: ProtobufRowOutputFormat( WriteBuffer & out_, - const Block & header, + const Block & header_, const RowOutputFormatParams & params_, - const FormatSchemaInfo & format_schema, - const FormatSettings & settings); + const FormatSchemaInfo & schema_info_, + const FormatSettings & settings_, + bool with_length_delimiter_); String getName() const override { return "ProtobufRowOutputFormat"; } @@ -50,10 +46,9 @@ public: std::string getContentType() const override { return "application/octet-stream"; } private: - DataTypes data_types; - ProtobufWriter writer; - std::vector value_indices; - const bool allow_only_one_row; + std::unique_ptr writer; + std::unique_ptr serializer; + const bool allow_multiple_rows; }; } diff --git a/src/Storages/Kafka/KafkaBlockOutputStream.cpp b/src/Storages/Kafka/KafkaBlockOutputStream.cpp index cfbb7ad2523..2cb0fd98c71 100644 --- a/src/Storages/Kafka/KafkaBlockOutputStream.cpp +++ b/src/Storages/Kafka/KafkaBlockOutputStream.cpp @@ -26,7 +26,7 @@ void KafkaBlockOutputStream::writePrefix() buffer = storage.createWriteBuffer(getHeader()); auto format_settings = getFormatSettings(*context); - format_settings.protobuf.allow_many_rows_no_delimiters = true; + format_settings.protobuf.allow_multiple_rows_without_delimiter = true; child = FormatFactory::instance().getOutputStream(storage.getFormatName(), *buffer, getHeader(), *context, diff --git a/src/Storages/RabbitMQ/RabbitMQBlockOutputStream.cpp b/src/Storages/RabbitMQ/RabbitMQBlockOutputStream.cpp index d239586bb65..a987fff3c64 100644 --- a/src/Storages/RabbitMQ/RabbitMQBlockOutputStream.cpp +++ b/src/Storages/RabbitMQ/RabbitMQBlockOutputStream.cpp @@ -34,7 +34,7 @@ void RabbitMQBlockOutputStream::writePrefix() buffer->activateWriting(); auto format_settings = getFormatSettings(context); - format_settings.protobuf.allow_many_rows_no_delimiters = true; + format_settings.protobuf.allow_multiple_rows_without_delimiter = true; child = FormatFactory::instance().getOutputStream(storage.getFormatName(), *buffer, getHeader(), context, diff --git a/tests/queries/0_stateless/00825_protobuf_format_array_3dim.proto b/tests/queries/0_stateless/00825_protobuf_format_array_3dim.proto new file mode 100644 index 00000000000..8673924c929 --- /dev/null +++ b/tests/queries/0_stateless/00825_protobuf_format_array_3dim.proto @@ -0,0 +1,14 @@ +syntax = "proto3"; + +message ABC +{ + message nested + { + message nested + { + repeated int32 c = 1; + } + repeated nested b = 1; + } + repeated nested a = 1; +} \ No newline at end of file diff --git a/tests/queries/0_stateless/00825_protobuf_format_array_3dim.reference b/tests/queries/0_stateless/00825_protobuf_format_array_3dim.reference new file mode 100644 index 00000000000..69e7d5e1da8 --- /dev/null +++ b/tests/queries/0_stateless/00825_protobuf_format_array_3dim.reference @@ -0,0 +1,52 @@ +[[],[[]],[[1]],[[2,3],[4]]] +[[[5,6,7]],[[8,9,10]]] + +Binary representation: +00000000 1a 0a 00 0a 02 0a 00 0a 05 0a 03 0a 01 01 0a 0b |................| +00000010 0a 04 0a 02 02 03 0a 03 0a 01 04 12 0a 07 0a 05 |................| +00000020 0a 03 05 06 07 0a 07 0a 05 0a 03 08 09 0a |..............| +0000002e + +MESSAGE #1 AT 0x00000001 +a { +} +a { + b { + } +} +a { + b { + c: 1 + } +} +a { + b { + c: 2 + c: 3 + } + b { + c: 4 + } +} +MESSAGE #2 AT 0x0000001C +a { + b { + c: 5 + c: 6 + c: 7 + } +} +a { + b { + c: 8 + c: 9 + c: 10 + } +} + +Binary representation is as expected + +[[],[[]],[[1]],[[2,3],[4]]] 
+[[[5,6,7]],[[8,9,10]]] +[[],[[]],[[1]],[[2,3],[4]]] +[[[5,6,7]],[[8,9,10]]] diff --git a/tests/queries/0_stateless/00825_protobuf_format_array_3dim.sh b/tests/queries/0_stateless/00825_protobuf_format_array_3dim.sh new file mode 100755 index 00000000000..903217ca939 --- /dev/null +++ b/tests/queries/0_stateless/00825_protobuf_format_array_3dim.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +set -eo pipefail + +# Run the client. +$CLICKHOUSE_CLIENT --multiquery <<'EOF' +DROP TABLE IF EXISTS array_3dim_protobuf_00825; + +CREATE TABLE array_3dim_protobuf_00825 +( + `a_b_c` Array(Array(Array(Int32))) +) ENGINE = MergeTree ORDER BY tuple(); + +INSERT INTO array_3dim_protobuf_00825 VALUES ([[], [[]], [[1]], [[2,3],[4]]]), ([[[5, 6, 7]], [[8, 9, 10]]]); + +SELECT * FROM array_3dim_protobuf_00825; +EOF + +BINARY_FILE_PATH=$(mktemp "$CURDIR/00825_protobuf_format_array_3dim.XXXXXX.binary") +$CLICKHOUSE_CLIENT --query "SELECT * FROM array_3dim_protobuf_00825 FORMAT Protobuf SETTINGS format_schema = '$CURDIR/00825_protobuf_format_array_3dim:ABC'" > "$BINARY_FILE_PATH" + +# Check the output in the protobuf format +echo +$CURDIR/helpers/protobuf_length_delimited_encoder.py --decode_and_check --format_schema "$CURDIR/00825_protobuf_format_array_3dim:ABC" --input "$BINARY_FILE_PATH" + +# Check the input in the protobuf format (now the table contains the same data twice). +echo +$CLICKHOUSE_CLIENT --query "INSERT INTO array_3dim_protobuf_00825 FORMAT Protobuf SETTINGS format_schema='$CURDIR/00825_protobuf_format_array_3dim:ABC'" < "$BINARY_FILE_PATH" +$CLICKHOUSE_CLIENT --query "SELECT * FROM array_3dim_protobuf_00825" + +rm "$BINARY_FILE_PATH" diff --git a/tests/queries/0_stateless/00825_protobuf_format_array_of_arrays.proto b/tests/queries/0_stateless/00825_protobuf_format_array_of_arrays.proto new file mode 100644 index 00000000000..8f84164da2a --- /dev/null +++ b/tests/queries/0_stateless/00825_protobuf_format_array_of_arrays.proto @@ -0,0 +1,9 @@ +syntax = "proto3"; + +message AA { + message nested_array { + repeated double c = 2; + } + string a = 1; + repeated nested_array b = 2; +} \ No newline at end of file diff --git a/tests/queries/0_stateless/00825_protobuf_format_array_of_arrays.reference b/tests/queries/0_stateless/00825_protobuf_format_array_of_arrays.reference new file mode 100644 index 00000000000..5ea6780a3ba --- /dev/null +++ b/tests/queries/0_stateless/00825_protobuf_format_array_of_arrays.reference @@ -0,0 +1,41 @@ +one [[1,2,3],[0.5,0.25],[],[4,5],[0.125,0.0625],[6]] + +Binary representation: +00000000 6b 0a 03 6f 6e 65 12 1a 12 18 00 00 00 00 00 00 |k..one..........| +00000010 f0 3f 00 00 00 00 00 00 00 40 00 00 00 00 00 00 |.?.......@......| +00000020 08 40 12 12 12 10 00 00 00 00 00 00 e0 3f 00 00 |.@...........?..| +00000030 00 00 00 00 d0 3f 12 00 12 12 12 10 00 00 00 00 |.....?..........| +00000040 00 00 10 40 00 00 00 00 00 00 14 40 12 12 12 10 |...@.......@....| +00000050 00 00 00 00 00 00 c0 3f 00 00 00 00 00 00 b0 3f |.......?.......?| +00000060 12 0a 12 08 00 00 00 00 00 00 18 40 |...........@| +0000006c + +MESSAGE #1 AT 0x00000001 +a: "one" +b { + c: 1 + c: 2 + c: 3 +} +b { + c: 0.5 + c: 0.25 +} +b { +} +b { + c: 4 + c: 5 +} +b { + c: 0.125 + c: 0.0625 +} +b { + c: 6 +} + +Binary representation is as expected + +one [[1,2,3],[0.5,0.25],[],[4,5],[0.125,0.0625],[6]] +one [[1,2,3],[0.5,0.25],[],[4,5],[0.125,0.0625],[6]] diff --git 
a/tests/queries/0_stateless/00825_protobuf_format_array_of_arrays.sh b/tests/queries/0_stateless/00825_protobuf_format_array_of_arrays.sh new file mode 100755 index 00000000000..0b386723091 --- /dev/null +++ b/tests/queries/0_stateless/00825_protobuf_format_array_of_arrays.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash + +# https://github.com/ClickHouse/ClickHouse/issues/9069 + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +set -eo pipefail + +# Run the client. +$CLICKHOUSE_CLIENT --multiquery <<'EOF' +CREATE TABLE array_of_arrays_protobuf_00825 +( + `a` String, + `b` Nested ( + `c` Array(Float64) + ) +) ENGINE = MergeTree ORDER BY tuple(); + +INSERT INTO array_of_arrays_protobuf_00825 VALUES ('one', [[1,2,3],[0.5,0.25],[],[4,5],[0.125,0.0625],[6]]); + +SELECT * FROM array_of_arrays_protobuf_00825; +EOF + +BINARY_FILE_PATH=$(mktemp "$CURDIR/00825_protobuf_format_array_of_arrays.XXXXXX.binary") +$CLICKHOUSE_CLIENT --query "SELECT * FROM array_of_arrays_protobuf_00825 FORMAT Protobuf SETTINGS format_schema = '$CURDIR/00825_protobuf_format_array_of_arrays:AA'" > "$BINARY_FILE_PATH" + +# Check the output in the protobuf format +echo +$CURDIR/helpers/protobuf_length_delimited_encoder.py --decode_and_check --format_schema "$CURDIR/00825_protobuf_format_array_of_arrays:AA" --input "$BINARY_FILE_PATH" + +# Check the input in the protobuf format (now the table contains the same data twice). +echo +$CLICKHOUSE_CLIENT --query "INSERT INTO array_of_arrays_protobuf_00825 FORMAT Protobuf SETTINGS format_schema='$CURDIR/00825_protobuf_format_array_of_arrays:AA'" < "$BINARY_FILE_PATH" +$CLICKHOUSE_CLIENT --query "SELECT * FROM array_of_arrays_protobuf_00825" + +rm "$BINARY_FILE_PATH" diff --git a/tests/queries/0_stateless/00825_protobuf_format_enum_mapping.proto b/tests/queries/0_stateless/00825_protobuf_format_enum_mapping.proto new file mode 100644 index 00000000000..ba558dbbadb --- /dev/null +++ b/tests/queries/0_stateless/00825_protobuf_format_enum_mapping.proto @@ -0,0 +1,13 @@ +syntax = "proto3"; + +message Message +{ + enum Enum + { + FIRST = 0; + SECOND = 1; + TEN = 10; + HUNDRED = 100; + }; + Enum x = 1; +}; \ No newline at end of file diff --git a/tests/queries/0_stateless/00825_protobuf_format_enum_mapping.reference b/tests/queries/0_stateless/00825_protobuf_format_enum_mapping.reference new file mode 100644 index 00000000000..ef8059bac28 --- /dev/null +++ b/tests/queries/0_stateless/00825_protobuf_format_enum_mapping.reference @@ -0,0 +1,31 @@ +Second +Third +First +First +Second + +Binary representation: +00000000 02 08 01 02 08 64 00 00 02 08 01 |.....d.....| +0000000b + +MESSAGE #1 AT 0x00000001 +x: SECOND +MESSAGE #2 AT 0x00000004 +x: HUNDRED +MESSAGE #3 AT 0x00000007 +MESSAGE #4 AT 0x00000008 +MESSAGE #5 AT 0x00000009 +x: SECOND + +Binary representation is as expected + +Second +Third +First +First +Second +Second +Third +First +First +Second diff --git a/tests/queries/0_stateless/00825_protobuf_format_enum_mapping.sh b/tests/queries/0_stateless/00825_protobuf_format_enum_mapping.sh new file mode 100755 index 00000000000..cbb387a62a5 --- /dev/null +++ b/tests/queries/0_stateless/00825_protobuf_format_enum_mapping.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash + +# https://github.com/ClickHouse/ClickHouse/issues/7438 + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +set -eo pipefail + +# Run the client. 
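+# Note: the Enum16 values used below (-100, 0, 100) do not coincide with the numbers
+# declared in the .proto enum (0, 1, 10, 100), so values have to be remapped on the way
+# out and back in (e.g. 'Second' = 0 is written as SECOND = 1, see the reference file).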
+$CLICKHOUSE_CLIENT --multiquery <<'EOF' +DROP TABLE IF EXISTS enum_mapping_protobuf_00825; + +CREATE TABLE enum_mapping_protobuf_00825 +( + x Enum16('First'=-100, 'Second'=0, 'Third'=100) +) ENGINE = MergeTree ORDER BY tuple(); + +INSERT INTO enum_mapping_protobuf_00825 VALUES ('Second'), ('Third'), ('First'), ('First'), ('Second'); + +SELECT * FROM enum_mapping_protobuf_00825; +EOF + +BINARY_FILE_PATH=$(mktemp "$CURDIR/00825_protobuf_format_enum_mapping.XXXXXX.binary") +$CLICKHOUSE_CLIENT --query "SELECT * FROM enum_mapping_protobuf_00825 FORMAT Protobuf SETTINGS format_schema = '$CURDIR/00825_protobuf_format_enum_mapping:Message'" > "$BINARY_FILE_PATH" + +# Check the output in the protobuf format +echo +$CURDIR/helpers/protobuf_length_delimited_encoder.py --decode_and_check --format_schema "$CURDIR/00825_protobuf_format_enum_mapping:Message" --input "$BINARY_FILE_PATH" + +# Check the input in the protobuf format (now the table contains the same data twice). +echo +$CLICKHOUSE_CLIENT --query "INSERT INTO enum_mapping_protobuf_00825 FORMAT Protobuf SETTINGS format_schema='$CURDIR/00825_protobuf_format_enum_mapping:Message'" < "$BINARY_FILE_PATH" +$CLICKHOUSE_CLIENT --query "SELECT * FROM enum_mapping_protobuf_00825" + +rm "$BINARY_FILE_PATH" diff --git a/tests/queries/0_stateless/00825_protobuf_format_map.proto b/tests/queries/0_stateless/00825_protobuf_format_map.proto new file mode 100644 index 00000000000..561b409b733 --- /dev/null +++ b/tests/queries/0_stateless/00825_protobuf_format_map.proto @@ -0,0 +1,5 @@ +syntax = "proto3"; + +message Message { + map a = 1; +}; diff --git a/tests/queries/0_stateless/00825_protobuf_format_map.reference b/tests/queries/0_stateless/00825_protobuf_format_map.reference new file mode 100644 index 00000000000..e3f17cb1095 --- /dev/null +++ b/tests/queries/0_stateless/00825_protobuf_format_map.reference @@ -0,0 +1,19 @@ +{'x':5,'y':7} +{'z':11} +{'temp':0} +{'':0} + +Binary representation: +00000000 0e 0a 05 0a 01 78 10 05 0a 05 0a 01 79 10 07 07 |.....x......y...| +00000010 0a 05 0a 01 7a 10 0b 0a 0a 08 0a 04 74 65 6d 70 |....z.......temp| +00000020 10 00 06 0a 04 0a 00 10 00 |.........| +00000029 + +{'x':5,'y':7} +{'z':11} +{'temp':0} +{'':0} +{'x':5,'y':7} +{'z':11} +{'temp':0} +{'':0} diff --git a/tests/queries/0_stateless/00825_protobuf_format_map.sh b/tests/queries/0_stateless/00825_protobuf_format_map.sh new file mode 100755 index 00000000000..5df25c41750 --- /dev/null +++ b/tests/queries/0_stateless/00825_protobuf_format_map.sh @@ -0,0 +1,40 @@ +#!/usr/bin/env bash + +# https://github.com/ClickHouse/ClickHouse/issues/6497 + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +set -eo pipefail + +# Run the client. +$CLICKHOUSE_CLIENT --multiquery <<'EOF' +SET allow_experimental_map_type = 1; + +DROP TABLE IF EXISTS map_00825; + +CREATE TABLE map_00825 +( + a Map(String, UInt32) +) ENGINE = MergeTree ORDER BY tuple(); + +INSERT INTO map_00825 VALUES ({'x':5, 'y':7}), ({'z':11}), ({'temp':0}), ({'':0}); + +SELECT * FROM map_00825; +EOF + +BINARY_FILE_PATH=$(mktemp "$CURDIR/00825_protobuf_format_map.XXXXXX.binary") +$CLICKHOUSE_CLIENT --query "SELECT * FROM map_00825 FORMAT Protobuf SETTINGS format_schema = '$CURDIR/00825_protobuf_format_map:Message'" > "$BINARY_FILE_PATH" + +# Check the output in the protobuf format +echo +echo "Binary representation:" +hexdump -C $BINARY_FILE_PATH + +# Check the input in the protobuf format (now the table contains the same data twice). 
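+# (On the wire a proto3 map field is a repeated nested message with the key as field 1
+#  and the value as field 2, which is the pattern visible in the hex dump above, so the
+#  same binary file can simply be read back through FORMAT Protobuf.)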
+echo +$CLICKHOUSE_CLIENT --query "INSERT INTO map_00825 FORMAT Protobuf SETTINGS format_schema='$CURDIR/00825_protobuf_format_map:Message'" < "$BINARY_FILE_PATH" +$CLICKHOUSE_CLIENT --query "SELECT * FROM map_00825" + +rm "$BINARY_FILE_PATH" diff --git a/tests/queries/0_stateless/00825_protobuf_format_nested_optional.proto b/tests/queries/0_stateless/00825_protobuf_format_nested_optional.proto new file mode 100644 index 00000000000..052741f504b --- /dev/null +++ b/tests/queries/0_stateless/00825_protobuf_format_nested_optional.proto @@ -0,0 +1,10 @@ +syntax = "proto3"; + +message Repeated { + string foo = 1; + int64 bar = 2; +} + +message Message { + repeated Repeated messages = 1; +}; \ No newline at end of file diff --git a/tests/queries/0_stateless/00825_protobuf_format_nested_optional.reference b/tests/queries/0_stateless/00825_protobuf_format_nested_optional.reference new file mode 100644 index 00000000000..6cdd56a5b7f --- /dev/null +++ b/tests/queries/0_stateless/00825_protobuf_format_nested_optional.reference @@ -0,0 +1,25 @@ +['1'] [0] +['1',''] [0,1] + +Binary representation: +00000000 05 0a 03 0a 01 31 09 0a 03 0a 01 31 0a 02 10 01 |.....1.....1....| +00000010 + +MESSAGE #1 AT 0x00000001 +messages { + foo: "1" +} +MESSAGE #2 AT 0x00000007 +messages { + foo: "1" +} +messages { + bar: 1 +} + +Binary representation is as expected + +['1'] [0] +['1',''] [0,1] +['1'] [0] +['1',''] [0,1] diff --git a/tests/queries/0_stateless/00825_protobuf_format_nested_optional.sh b/tests/queries/0_stateless/00825_protobuf_format_nested_optional.sh new file mode 100755 index 00000000000..58ded92f2c1 --- /dev/null +++ b/tests/queries/0_stateless/00825_protobuf_format_nested_optional.sh @@ -0,0 +1,41 @@ +#!/usr/bin/env bash + +# https://github.com/ClickHouse/ClickHouse/issues/6497 + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +set -eo pipefail + +# Run the client. +$CLICKHOUSE_CLIENT --multiquery <<'EOF' +DROP TABLE IF EXISTS nested_optional_protobuf_00825; + +CREATE TABLE nested_optional_protobuf_00825 +( + messages Nested + ( + foo String, + bar Int64 + ) +) ENGINE = MergeTree ORDER BY tuple(); + +INSERT INTO nested_optional_protobuf_00825 VALUES (['1'], [0]), (['1', ''], [0, 1]); + +SELECT * FROM nested_optional_protobuf_00825; +EOF + +BINARY_FILE_PATH=$(mktemp "$CURDIR/00825_protobuf_format_nested_optional.XXXXXX.binary") +$CLICKHOUSE_CLIENT --query "SELECT * FROM nested_optional_protobuf_00825 FORMAT Protobuf SETTINGS format_schema = '$CURDIR/00825_protobuf_format_nested_optional:Message'" > "$BINARY_FILE_PATH" + +# Check the output in the protobuf format +echo +$CURDIR/helpers/protobuf_length_delimited_encoder.py --decode_and_check --format_schema "$CURDIR/00825_protobuf_format_nested_optional:Message" --input "$BINARY_FILE_PATH" + +# Check the input in the protobuf format (now the table contains the same data twice). 
+echo +$CLICKHOUSE_CLIENT --query "INSERT INTO nested_optional_protobuf_00825 FORMAT Protobuf SETTINGS format_schema='$CURDIR/00825_protobuf_format_nested_optional:Message'" < "$BINARY_FILE_PATH" +$CLICKHOUSE_CLIENT --query "SELECT * FROM nested_optional_protobuf_00825" + +rm "$BINARY_FILE_PATH" diff --git a/tests/queries/0_stateless/00825_protobuf_format_table_default.proto b/tests/queries/0_stateless/00825_protobuf_format_table_default.proto new file mode 100644 index 00000000000..08e6049ffe0 --- /dev/null +++ b/tests/queries/0_stateless/00825_protobuf_format_table_default.proto @@ -0,0 +1,6 @@ +syntax = "proto3"; + +message Message { + sint32 x = 1; + sint32 z = 2; +}; \ No newline at end of file diff --git a/tests/queries/0_stateless/00825_protobuf_format_table_default.reference b/tests/queries/0_stateless/00825_protobuf_format_table_default.reference new file mode 100644 index 00000000000..5472f3bfa14 --- /dev/null +++ b/tests/queries/0_stateless/00825_protobuf_format_table_default.reference @@ -0,0 +1,37 @@ +0 0 0 +2 4 8 +3 9 27 +5 25 125 +101 102 103 + +Binary representation: +00000000 00 04 08 04 10 10 04 08 06 10 36 05 08 0a 10 fa |..........6.....| +00000010 01 06 08 ca 01 10 ce 01 |........| +00000018 + +MESSAGE #1 AT 0x00000001 +MESSAGE #2 AT 0x00000002 +x: 2 +z: 8 +MESSAGE #3 AT 0x00000007 +x: 3 +z: 27 +MESSAGE #4 AT 0x0000000C +x: 5 +z: 125 +MESSAGE #5 AT 0x00000012 +x: 101 +z: 103 + +Binary representation is as expected + +0 0 0 +0 0 0 +2 4 8 +2 4 8 +3 9 27 +3 9 27 +5 25 125 +5 25 125 +101 102 103 +101 10201 103 diff --git a/tests/queries/0_stateless/00825_protobuf_format_table_default.sh b/tests/queries/0_stateless/00825_protobuf_format_table_default.sh new file mode 100755 index 00000000000..97f7769269a --- /dev/null +++ b/tests/queries/0_stateless/00825_protobuf_format_table_default.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +set -eo pipefail + +# Run the client. +$CLICKHOUSE_CLIENT --multiquery <<'EOF' +DROP TABLE IF EXISTS table_default_protobuf_00825; + +CREATE TABLE table_default_protobuf_00825 +( + x Int64, + y Int64 DEFAULT x * x, + z Int64 DEFAULT x * x * x +) ENGINE = MergeTree ORDER BY tuple(); + +INSERT INTO table_default_protobuf_00825 (x) VALUES (0), (2), (3), (5); +INSERT INTO table_default_protobuf_00825 VALUES (101, 102, 103); + +SELECT * FROM table_default_protobuf_00825 ORDER BY x,y,z; +EOF + +BINARY_FILE_PATH=$(mktemp "$CURDIR/00825_protobuf_format_table_default.XXXXXX.binary") +$CLICKHOUSE_CLIENT --query "SELECT * FROM table_default_protobuf_00825 ORDER BY x,y,z FORMAT Protobuf SETTINGS format_schema = '$CURDIR/00825_protobuf_format_table_default:Message'" > "$BINARY_FILE_PATH" + +# Check the output in the protobuf format +echo +$CURDIR/helpers/protobuf_length_delimited_encoder.py --decode_and_check --format_schema "$CURDIR/00825_protobuf_format_table_default:Message" --input "$BINARY_FILE_PATH" + +# Check the input in the protobuf format (now the table contains the same data twice). 
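Two details of the table_default test above are easy to miss. First, the .proto schema carries only x and z, so when the binary file is re-imported, y is filled in from its DEFAULT x * x expression; that is why the last line of the reference reads 101 10201 103 rather than the originally inserted 101 102 103 (for the other rows the recomputed default happens to equal the stored value, so the duplicates look identical). Second, the sint32 fields are ZigZag-encoded varints, which is why x = 2 and z = 8 appear in the hex dump as 04 and 10. A minimal standalone sketch of that decoding, with the constants read off the dump (an illustration only, not code from the patch):

#include <cassert>
#include <cstdint>

/// ZigZag maps signed values onto unsigned ones so that small magnitudes
/// stay small on the wire: 0, -1, 1, -2, 2, ... -> 0, 1, 2, 3, 4, ...
static int32_t zigzagDecode(uint32_t n)
{
    return static_cast<int32_t>((n >> 1) ^ (0 - (n & 1)));
}

int main()
{
    assert(zigzagDecode(4) == 2);      /// byte 04 in the dump -> x = 2
    assert(zigzagDecode(16) == 8);     /// byte 10 -> z = 8
    assert(zigzagDecode(202) == 101);  /// varint bytes ca 01 -> 202 -> x = 101
    assert(zigzagDecode(206) == 103);  /// varint bytes ce 01 -> 206 -> z = 103
}
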
+echo +$CLICKHOUSE_CLIENT --query "INSERT INTO table_default_protobuf_00825 FORMAT Protobuf SETTINGS format_schema='$CURDIR/00825_protobuf_format_table_default:Message'" < "$BINARY_FILE_PATH" +$CLICKHOUSE_CLIENT --query "SELECT * FROM table_default_protobuf_00825 ORDER BY x,y,z" + +rm "$BINARY_FILE_PATH" diff --git a/tests/queries/0_stateless/helpers/protobuf_length_delimited_encoder.py b/tests/queries/0_stateless/helpers/protobuf_length_delimited_encoder.py new file mode 100755 index 00000000000..3ed42f1c820 --- /dev/null +++ b/tests/queries/0_stateless/helpers/protobuf_length_delimited_encoder.py @@ -0,0 +1,180 @@ +#!/usr/bin/env python3 + +# The protobuf compiler protoc doesn't support encoding or decoding length-delimited protobuf message. +# To do that this script has been written. + +import argparse +import os.path +import struct +import subprocess +import sys +import tempfile + +def read_varint(input): + res = 0 + shift = 0 + while True: + c = input.read(1) + if len(c) == 0: + return None + b = c[0] + if b < 0x80: + res += b << shift + break + b -= 0x80 + res += b << shift + shift = shift << 7 + return res + +def write_varint(output, value): + while True: + if value < 0x80: + b = value + output.write(b.to_bytes(1, byteorder='little')) + break + b = (value & 0x7F) + 0x80 + output.write(b.to_bytes(1, byteorder='little')) + value = value >> 7 + +def write_hexdump(output, data): + with subprocess.Popen(["hexdump", "-C"], stdin=subprocess.PIPE, stdout=output, shell=False) as proc: + proc.communicate(data) + if proc.returncode != 0: + raise RuntimeError("hexdump returned code " + str(proc.returncode)) + output.flush() + +class FormatSchemaSplitted: + def __init__(self, format_schema): + self.format_schema = format_schema + splitted = self.format_schema.split(':') + if len(splitted) < 2: + raise RuntimeError('The format schema must have the format "schemafile:MessageType"') + path = splitted[0] + self.schemadir = os.path.dirname(path) + self.schemaname = os.path.basename(path) + if not self.schemaname.endswith(".proto"): + self.schemaname = self.schemaname + ".proto" + self.message_type = splitted[1] + +def decode(input, output, format_schema): + if not type(format_schema) is FormatSchemaSplitted: + format_schema = FormatSchemaSplitted(format_schema) + msgindex = 1 + while True: + sz = read_varint(input) + if sz is None: + break + output.write("MESSAGE #{msgindex} AT 0x{msgoffset:08X}\n".format(msgindex=msgindex, msgoffset=input.tell()).encode()) + output.flush() + msg = input.read(sz) + if len(msg) < sz: + raise EOFError('Unexpected end of file') + with subprocess.Popen(["protoc", + "--decode", format_schema.message_type, format_schema.schemaname], + cwd=format_schema.schemadir, + stdin=subprocess.PIPE, + stdout=output, + shell=False) as proc: + proc.communicate(msg) + if proc.returncode != 0: + raise RuntimeError("protoc returned code " + str(proc.returncode)) + output.flush() + msgindex = msgindex + 1 + +def encode(input, output, format_schema): + if not type(format_schema) is FormatSchemaSplitted: + format_schema = FormatSchemaSplitted(format_schema) + line_offset = input.tell() + line = input.readline() + while True: + if len(line) == 0: + break + if not line.startswith(b"MESSAGE #"): + raise RuntimeError("The line at 0x{line_offset:08X} must start with the text 'MESSAGE #'".format(line_offset=line_offset)) + msg = b"" + while True: + line_offset = input.tell() + line = input.readline() + if line.startswith(b"MESSAGE #") or len(line) == 0: + break + msg += line + with 
subprocess.Popen(["protoc", + "--encode", format_schema.message_type, format_schema.schemaname], + cwd=format_schema.schemadir, + stdin=subprocess.PIPE, + stdout=subprocess.PIPE, + shell=False) as proc: + msgbin = proc.communicate(msg)[0] + if proc.returncode != 0: + raise RuntimeError("protoc returned code " + str(proc.returncode)) + write_varint(output, len(msgbin)) + output.write(msgbin) + output.flush() + +def decode_and_check(input, output, format_schema): + input_data = input.read() + output.write(b"Binary representation:\n") + output.flush() + write_hexdump(output, input_data) + output.write(b"\n") + output.flush() + + with tempfile.TemporaryFile() as tmp_input, tempfile.TemporaryFile() as tmp_decoded, tempfile.TemporaryFile() as tmp_encoded: + tmp_input.write(input_data) + tmp_input.flush() + tmp_input.seek(0) + decode(tmp_input, tmp_decoded, format_schema) + tmp_decoded.seek(0) + decoded_text = tmp_decoded.read() + output.write(decoded_text) + output.flush() + tmp_decoded.seek(0) + encode(tmp_decoded, tmp_encoded, format_schema) + tmp_encoded.seek(0) + encoded_data = tmp_encoded.read() + + if encoded_data == input_data: + output.write(b"\nBinary representation is as expected\n") + output.flush() + else: + output.write(b"\nBinary representation differs from the expected one (listed below):\n") + output.flush() + write_hexdump(output, encoded_data) + sys.exit(1) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Encodes or decodes length-delimited protobuf messages.') + parser.add_argument('--input', help='The input file, the standard input will be used if not specified.') + parser.add_argument('--output', help='The output file, the standard output will be used if not specified') + parser.add_argument('--format_schema', required=True, help='Format schema in the format "schemafile:MessageType"') + group = parser.add_mutually_exclusive_group(required=True) + group.add_argument('--encode', action='store_true', help='Specify to encode length-delimited messages.' + 'The utility will read text-format messages of the given type from the input and write it in binary to the output.') + group.add_argument('--decode', action='store_true', help='Specify to decode length-delimited messages.' 
+ 'The utility will read messages in binary from the input and write text-format messages to the output.') + group.add_argument('--decode_and_check', action='store_true', help='The same as --decode, and the utility will then encode ' + ' the decoded data back to the binary form to check that the result of that encoding is the same as the input was.') + args = parser.parse_args() + + custom_input_file = None + custom_output_file = None + try: + if args.input: + custom_input_file = open(args.input, "rb") + if args.output: + custom_output_file = open(args.output, "wb") + input = custom_input_file if custom_input_file else sys.stdin.buffer + output = custom_output_file if custom_output_file else sys.stdout.buffer + + if args.encode: + encode(input, output, args.format_schema) + elif args.decode: + decode(input, output, args.format_schema) + elif args.decode_and_check: + decode_and_check(input, output, args.format_schema) + + finally: + if custom_input_file: + custom_input_file.close() + if custom_output_file: + custom_output_file.close() diff --git a/tests/queries/skip_list.json b/tests/queries/skip_list.json index ee25bee6a0a..0e470e14916 100644 --- a/tests/queries/skip_list.json +++ b/tests/queries/skip_list.json @@ -131,6 +131,12 @@ "00763_create_query_as_table_engine_bug", "00765_sql_compatibility_aliases", "00825_protobuf_format_input", + "00825_protobuf_format_nested_optional", + "00825_protobuf_format_array_3dim", + "00825_protobuf_format_map", + "00825_protobuf_format_array_of_arrays", + "00825_protobuf_format_table_default", + "00825_protobuf_format_enum_mapping", "00826_cross_to_inner_join", "00834_not_between", "00909_kill_not_initialized_query", From acb5fb8179c2845890635582332790c94995df83 Mon Sep 17 00:00:00 2001 From: Alexander Kazakov Date: Wed, 17 Feb 2021 20:58:04 +0300 Subject: [PATCH 0386/2357] Randomly shuffle replicas withing the same priority --- base/mysqlxx/PoolWithFailover.cpp | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/base/mysqlxx/PoolWithFailover.cpp b/base/mysqlxx/PoolWithFailover.cpp index 5bee75aab1b..e2d612d6bc4 100644 --- a/base/mysqlxx/PoolWithFailover.cpp +++ b/base/mysqlxx/PoolWithFailover.cpp @@ -1,3 +1,6 @@ +#include +#include + #include @@ -7,6 +10,8 @@ static bool startsWith(const std::string & s, const char * prefix) return s.size() >= strlen(prefix) && 0 == memcmp(s.data(), prefix, strlen(prefix)); } +/// This is thread-safe +std::random_device rd; using namespace mysqlxx; @@ -33,6 +38,13 @@ PoolWithFailover::PoolWithFailover(const Poco::Util::AbstractConfiguration & con std::make_shared(config_, replica_name, default_connections_, max_connections_, config_name_.c_str())); } } + + static thread_local std::mt19937 rnd_generator(rd()); + for (auto & [_, replicas] : replicas_by_priority) + { + if (replicas.size() > 1) + std::shuffle(replicas.begin(), replicas.end(), rnd_generator); + } } else { From 56a5d1dafaa7cb08719277886000349490c47eda Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Wed, 17 Feb 2021 21:48:26 +0300 Subject: [PATCH 0387/2357] Skip stateful functions --- src/Processors/QueryPlan/Optimizations/filterPushDown.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp b/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp index 02e1914504d..456faeb72c2 100644 --- a/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp +++ b/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp @@ -123,6 +123,9 @@ size_t tryPushDownFilter(QueryPlan::Node * 
parent_node, QueryPlan::Nodes & nodes if (!filter) return 0; + if (filter->getExpression()->hasStatefulFunctions()) + return 0; + if (auto * aggregating = typeid_cast(child.get())) { const auto & params = aggregating->getParams(); From 0296d7d026ab3fb1a335d1a97a5154add718ad89 Mon Sep 17 00:00:00 2001 From: Alexander Kazakov Date: Wed, 17 Feb 2021 21:51:05 +0300 Subject: [PATCH 0388/2357] Added some explanations on randomization --- base/mysqlxx/PoolWithFailover.cpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/base/mysqlxx/PoolWithFailover.cpp b/base/mysqlxx/PoolWithFailover.cpp index e2d612d6bc4..9132773f727 100644 --- a/base/mysqlxx/PoolWithFailover.cpp +++ b/base/mysqlxx/PoolWithFailover.cpp @@ -10,7 +10,7 @@ static bool startsWith(const std::string & s, const char * prefix) return s.size() >= strlen(prefix) && 0 == memcmp(s.data(), prefix, strlen(prefix)); } -/// This is thread-safe +/// This reads from "/dev/urandom" and thus is thread-safe std::random_device rd; using namespace mysqlxx; @@ -39,6 +39,11 @@ PoolWithFailover::PoolWithFailover(const Poco::Util::AbstractConfiguration & con } } + /// PoolWithFailover objects are stored in a cache inside PoolFactory. + /// This cache is reset by ExternalDictionariesLoader after every SYSTEM RELOAD DICTIONAR{Y|IES} + /// which triggers massive re-constructing of connection pools. + /// The state of PRNDGs like std::mt19937 is considered to be quite heavy + /// thus here we attempt to optimize its construction. static thread_local std::mt19937 rnd_generator(rd()); for (auto & [_, replicas] : replicas_by_priority) { From 62486d6e06eb0eb23ab3a0c3b640bb1895a76181 Mon Sep 17 00:00:00 2001 From: kssenii Date: Wed, 17 Feb 2021 18:40:25 +0000 Subject: [PATCH 0389/2357] Add test --- .../integration/test_odbc_interaction/test.py | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tests/integration/test_odbc_interaction/test.py b/tests/integration/test_odbc_interaction/test.py index 084fc407f39..6bb6a6ee777 100644 --- a/tests/integration/test_odbc_interaction/test.py +++ b/tests/integration/test_odbc_interaction/test.py @@ -342,3 +342,25 @@ def test_bridge_dies_with_parent(started_cluster): assert clickhouse_pid is None assert bridge_pid is None + + +def test_odbc_postgres_date_data_type(started_cluster): + conn = get_postgres_conn(); + cursor = conn.cursor() + cursor.execute("CREATE TABLE IF NOT EXISTS clickhouse.test_date (column1 integer, column2 date)") + + cursor.execute("INSERT INTO clickhouse.test_date VALUES (1, '2020-12-01')") + cursor.execute("INSERT INTO clickhouse.test_date VALUES (2, '2020-12-02')") + cursor.execute("INSERT INTO clickhouse.test_date VALUES (3, '2020-12-03')") + conn.commit() + + node1.query( + ''' + CREATE TABLE test_date (column1 UInt64, column2 Date) + ENGINE=ODBC('DSN=postgresql_odbc; Servername=postgre-sql.local', 'clickhouse', 'test_date')''') + + expected = '1\t2020-12-01\n2\t2020-12-02\n3\t2020-12-03\n' + result = node1.query('SELECT * FROM test_date'); + assert(result == expected) + + From 80b6db7f729063778de0e2a5fc4d33d3fef27583 Mon Sep 17 00:00:00 2001 From: Pavel Kruglov Date: Wed, 17 Feb 2021 22:07:14 +0300 Subject: [PATCH 0390/2357] Style --- src/Client/ConnectionPoolWithFailover.cpp | 3 --- src/Client/HedgedConnectionsFactory.cpp | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/src/Client/ConnectionPoolWithFailover.cpp b/src/Client/ConnectionPoolWithFailover.cpp index ec9215e3bc1..acbb678d870 100644 --- a/src/Client/ConnectionPoolWithFailover.cpp 
+++ b/src/Client/ConnectionPoolWithFailover.cpp @@ -24,9 +24,6 @@ namespace DB namespace ErrorCodes { - extern const int ATTEMPT_TO_READ_AFTER_EOF; - extern const int NETWORK_ERROR; - extern const int SOCKET_TIMEOUT; extern const int LOGICAL_ERROR; } diff --git a/src/Client/HedgedConnectionsFactory.cpp b/src/Client/HedgedConnectionsFactory.cpp index 3551814d603..6519e0c9a94 100644 --- a/src/Client/HedgedConnectionsFactory.cpp +++ b/src/Client/HedgedConnectionsFactory.cpp @@ -238,7 +238,7 @@ void HedgedConnectionsFactory::processConnectionEstablisherStage(int index, bool void HedgedConnectionsFactory::processFailedConnection(int index, bool remove_from_epoll) { ConnectionEstablisher & connection_establisher = replicas[index].connection_establisher; - + if (remove_from_epoll) removeReplicaFromEpoll(index); From ec4dafaa5f914e99acc8cede5b60e85458eab134 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Wed, 17 Feb 2021 22:19:39 +0300 Subject: [PATCH 0391/2357] Fix build. --- src/CMakeLists.txt | 4 ++-- src/Processors/ya.make | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 86db7742c97..7a7f160dd81 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -100,8 +100,8 @@ endif() list (APPEND clickhouse_common_io_sources ${CONFIG_BUILD}) list (APPEND clickhouse_common_io_headers ${CONFIG_VERSION} ${CONFIG_COMMON}) -list (APPEND dbms_sources Functions/IFunction.cpp Functions/FunctionFactory.cpp Functions/FunctionHelpers.cpp Functions/extractTimeZoneFromFunctionArguments.cpp Functions/replicate.cpp) -list (APPEND dbms_headers Functions/IFunctionImpl.h Functions/FunctionFactory.h Functions/FunctionHelpers.h Functions/extractTimeZoneFromFunctionArguments.h Functions/replicate.h) +list (APPEND dbms_sources Functions/IFunction.cpp Functions/FunctionFactory.cpp Functions/FunctionHelpers.cpp Functions/extractTimeZoneFromFunctionArguments.cpp Functions/replicate.cpp Functions/FunctionsLogical.cpp) +list (APPEND dbms_headers Functions/IFunctionImpl.h Functions/FunctionFactory.h Functions/FunctionHelpers.h Functions/extractTimeZoneFromFunctionArguments.h Functions/replicate.h Functions/FunctionsLogical.h) list (APPEND dbms_sources AggregateFunctions/AggregateFunctionFactory.cpp diff --git a/src/Processors/ya.make b/src/Processors/ya.make index 34ff61d03c5..71ddd07f6a2 100644 --- a/src/Processors/ya.make +++ b/src/Processors/ya.make @@ -114,6 +114,7 @@ SRCS( QueryPlan/MergingFinal.cpp QueryPlan/MergingSortedStep.cpp QueryPlan/OffsetStep.cpp + QueryPlan/Optimizations/filterPushDown.cpp QueryPlan/Optimizations/liftUpArrayJoin.cpp QueryPlan/Optimizations/limitPushDown.cpp QueryPlan/Optimizations/mergeExpressions.cpp From 6e244e7bb1722e23a9e616c7e8048ac2c8306885 Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 17 Feb 2021 22:32:38 +0300 Subject: [PATCH 0392/2357] Trying without fsync --- src/Coordination/Changelog.cpp | 2 +- src/Coordination/Changelog.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Coordination/Changelog.cpp b/src/Coordination/Changelog.cpp index 9e1ed557430..a9693b2a47b 100644 --- a/src/Coordination/Changelog.cpp +++ b/src/Coordination/Changelog.cpp @@ -298,7 +298,7 @@ void Changelog::appendEntry(size_t index, nuraft::ptr log_ent if (current_writer->getEntriesWritten() == rotate_interval) rotate(index); - auto offset = current_writer->appendRecord(buildRecord(index, log_entry), true); + auto offset = current_writer->appendRecord(buildRecord(index, log_entry), false); if 
(!index_to_start_pos.try_emplace(index, offset).second) throw Exception(ErrorCodes::LOGICAL_ERROR, "Record with index {} already exists", index); diff --git a/src/Coordination/Changelog.h b/src/Coordination/Changelog.h index e154c1c70c6..5f38f68750e 100644 --- a/src/Coordination/Changelog.h +++ b/src/Coordination/Changelog.h @@ -101,7 +101,7 @@ public: private: - void rotate(size_t new_start_log_idex); + void rotate(size_t new_start_log_idx); ChangelogRecord buildRecord(size_t index, nuraft::ptr log_entry) const; From ff663dc511a5daf955e559cdff0d47fa6a07f104 Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 17 Feb 2021 23:36:25 +0300 Subject: [PATCH 0393/2357] Fsync at server shutdown --- src/Coordination/Changelog.cpp | 13 ++++++++++++- src/Coordination/InMemoryStateManager.cpp | 5 +++++ src/Coordination/InMemoryStateManager.h | 2 ++ src/Coordination/NuKeeperServer.cpp | 1 + 4 files changed, 20 insertions(+), 1 deletion(-) diff --git a/src/Coordination/Changelog.cpp b/src/Coordination/Changelog.cpp index a9693b2a47b..2d1bbfb4440 100644 --- a/src/Coordination/Changelog.cpp +++ b/src/Coordination/Changelog.cpp @@ -467,6 +467,17 @@ void Changelog::flush() current_writer->flush(); } -Changelog::~Changelog() = default; +Changelog::~Changelog() +{ + try + { + if (current_writer) + current_writer->flush(); + } + catch (...) + { + tryLogCurrentException(__PRETTY_FUNCTION__); + } +} } diff --git a/src/Coordination/InMemoryStateManager.cpp b/src/Coordination/InMemoryStateManager.cpp index 6c4e95b993a..0423d2466f2 100644 --- a/src/Coordination/InMemoryStateManager.cpp +++ b/src/Coordination/InMemoryStateManager.cpp @@ -66,6 +66,11 @@ void InMemoryStateManager::loadLogStore(size_t start_log_index) log_store->init(start_log_index); } +void InMemoryStateManager::flushLogStore() +{ + log_store->flush(); +} + void InMemoryStateManager::save_config(const nuraft::cluster_config & config) { // Just keep in memory in this example. diff --git a/src/Coordination/InMemoryStateManager.h b/src/Coordination/InMemoryStateManager.h index 8a7be7d0129..c53f00702d4 100644 --- a/src/Coordination/InMemoryStateManager.h +++ b/src/Coordination/InMemoryStateManager.h @@ -27,6 +27,8 @@ public: void loadLogStore(size_t start_log_index); + void flushLogStore(); + nuraft::ptr load_config() override { return cluster_config; } void save_config(const nuraft::cluster_config & config) override; diff --git a/src/Coordination/NuKeeperServer.cpp b/src/Coordination/NuKeeperServer.cpp index a4582a5fbb8..8556fa85231 100644 --- a/src/Coordination/NuKeeperServer.cpp +++ b/src/Coordination/NuKeeperServer.cpp @@ -67,6 +67,7 @@ void NuKeeperServer::startup() void NuKeeperServer::shutdown() { state_machine->shutdownStorage(); + state_manager->flushLogStore(); if (!launcher.shutdown(coordination_settings->shutdown_timeout.totalSeconds())) LOG_WARNING(&Poco::Logger::get("NuKeeperServer"), "Failed to shutdown RAFT server in {} seconds", 5); } From f483cd091a5dbc71c7e507ab87d0d6fad307eb39 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Sun, 14 Feb 2021 23:31:58 +0300 Subject: [PATCH 0394/2357] test/stress: use clickhouse builtin start/stop to run server from the same user This will allow to attach with gdb for better diagnosis. 
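One detail of the Changelog shutdown change above deserves a comment: the new destructor flushes buffered records, but the call is wrapped in try/catch and only logged on failure, because an exception escaping a C++ destructor terminates the process. A stripped-down sketch of the same pattern; the class and names here are invented for illustration and are not taken from the patch:

#include <iostream>

class BufferedLog
{
public:
    void flush()
    {
        /// Stand-in for the real flush, which may throw on I/O errors.
    }

    ~BufferedLog()
    {
        try
        {
            flush();   /// Best effort: persist whatever is still buffered at shutdown.
        }
        catch (...)    /// Swallow and log; never let an exception escape the destructor.
        {
            std::cerr << "flush failed during shutdown\n";
        }
    }
};

int main()
{
    BufferedLog log;   /// Flushed automatically when it goes out of scope.
}
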
--- docker/test/stress/run.sh | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/docker/test/stress/run.sh b/docker/test/stress/run.sh index 88a633ac488..44612a83504 100755 --- a/docker/test/stress/run.sh +++ b/docker/test/stress/run.sh @@ -10,14 +10,7 @@ dpkg -i package_folder/clickhouse-test_*.deb function stop() { - timeout 120 service clickhouse-server stop - - # Wait for process to disappear from processlist and also try to kill zombies. - while kill -9 "$(pidof clickhouse-server)" - do - echo "Killed clickhouse-server" - sleep 0.5 - done + clickhouse stop } function start() @@ -33,7 +26,8 @@ function start() tail -n1000 /var/log/clickhouse-server/clickhouse-server.log break fi - timeout 120 service clickhouse-server start + # use root to match with current uid + clickhouse start --user root >/var/log/clickhouse-server/stdout.log 2>/var/log/clickhouse-server/stderr.log sleep 0.5 counter=$((counter + 1)) done From 63eff6e8c812a8770fc54fa987c68e7fb681abe0 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Sat, 13 Feb 2021 11:41:00 +0300 Subject: [PATCH 0395/2357] test/stress: improve backtrace catching on server failures Otherwise sometimes stracktraces may be lost [1]: [1]: https://clickhouse-test-reports.s3.yandex.net/19580/6aecb62416ece880cbb8ee3a803e14d841388dde/stress_test_(thread).html#fail1 --- docker/test/stress/run.sh | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docker/test/stress/run.sh b/docker/test/stress/run.sh index 44612a83504..60e9ffd265c 100755 --- a/docker/test/stress/run.sh +++ b/docker/test/stress/run.sh @@ -31,6 +31,18 @@ function start() sleep 0.5 counter=$((counter + 1)) done + + echo " +handle all noprint +handle SIGSEGV stop print +handle SIGBUS stop print +handle SIGABRT stop print +continue +thread apply all backtrace +continue +" > script.gdb + + gdb -batch -command script.gdb -p "$(cat /var/run/clickhouse-server/clickhouse-server.pid)" & } # install test configs From 770c3406df6d55541dcb59b9146206b2558cbe86 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Mon, 15 Feb 2021 21:02:21 +0300 Subject: [PATCH 0396/2357] test/stress: fix permissions for clickhouse directories --- docker/test/stress/run.sh | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/docker/test/stress/run.sh b/docker/test/stress/run.sh index 60e9ffd265c..dc1e4db4477 100755 --- a/docker/test/stress/run.sh +++ b/docker/test/stress/run.sh @@ -8,6 +8,20 @@ dpkg -i package_folder/clickhouse-server_*.deb dpkg -i package_folder/clickhouse-client_*.deb dpkg -i package_folder/clickhouse-test_*.deb +function configure() +{ + # install test configs + /usr/share/clickhouse-test/config/install.sh + + # for clickhouse-server (via service) + echo "ASAN_OPTIONS='malloc_context_size=10 verbosity=1 allocator_release_to_os_interval_ms=10000'" >> /etc/environment + # for clickhouse-client + export ASAN_OPTIONS='malloc_context_size=10 allocator_release_to_os_interval_ms=10000' + + # since we run clickhouse from root + sudo chown root: /var/lib/clickhouse +} + function stop() { clickhouse stop @@ -45,13 +59,7 @@ continue gdb -batch -command script.gdb -p "$(cat /var/run/clickhouse-server/clickhouse-server.pid)" & } -# install test configs -/usr/share/clickhouse-test/config/install.sh - -# for clickhouse-server (via service) -echo "ASAN_OPTIONS='malloc_context_size=10 verbosity=1 allocator_release_to_os_interval_ms=10000'" >> /etc/environment -# for clickhouse-client -export ASAN_OPTIONS='malloc_context_size=10 
allocator_release_to_os_interval_ms=10000' +configure start From 65f2b6a0449f19e0488c5c66e013e9002b4949d3 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Mon, 15 Feb 2021 10:18:37 +0300 Subject: [PATCH 0397/2357] test/fasttest: add gdb into docker image --- docker/test/fasttest/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/test/fasttest/Dockerfile b/docker/test/fasttest/Dockerfile index 03b7b2fc53a..64be52d8e30 100644 --- a/docker/test/fasttest/Dockerfile +++ b/docker/test/fasttest/Dockerfile @@ -47,6 +47,7 @@ RUN apt-get update \ expect \ fakeroot \ git \ + gdb \ gperf \ lld-${LLVM_VERSION} \ llvm-${LLVM_VERSION} \ From ee18f6a7ec23304c7ebc5128882d163d510525e0 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Sun, 14 Feb 2021 23:34:14 +0300 Subject: [PATCH 0398/2357] test/fasttest: collect diagnosis by attaching with gdb in background Otherwise sometimes stacktraces may be lost [1]: [1]: https://clickhouse-test-reports.s3.yandex.net/20477/8ad20fcee5aaa642c2a2dd873d02103692d554f4/fast_test.html#fail1 --- docker/test/fasttest/run.sh | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docker/test/fasttest/run.sh b/docker/test/fasttest/run.sh index e6294b5d74d..fbdad93a553 100755 --- a/docker/test/fasttest/run.sh +++ b/docker/test/fasttest/run.sh @@ -107,6 +107,18 @@ function start_server fi echo "ClickHouse server pid '$server_pid' started and responded" + + echo " +handle all noprint +handle SIGSEGV stop print +handle SIGBUS stop print +handle SIGABRT stop print +continue +thread apply all backtrace +continue +" > script.gdb + + gdb -batch -command script.gdb -p "$server_pid" & } function clone_root From f1fa110d486ce59b072df3c98b1d9cbf50296868 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Thu, 18 Feb 2021 01:05:31 +0300 Subject: [PATCH 0399/2357] fix type map with integer keys --- src/DataTypes/DataTypeMap.cpp | 17 ++++++++- src/DataTypes/DataTypeMap.h | 2 ++ src/Functions/array/arrayElement.cpp | 9 +++-- src/Interpreters/convertFieldToType.cpp | 35 +++++++++++++++++++ src/Parsers/ExpressionElementParsers.cpp | 1 - src/Parsers/ExpressionElementParsers.h | 12 ------- .../Impl/ConstantExpressionTemplate.cpp | 10 +----- 7 files changed, 61 insertions(+), 25 deletions(-) diff --git a/src/DataTypes/DataTypeMap.cpp b/src/DataTypes/DataTypeMap.cpp index af2ed8805e8..246d781b097 100644 --- a/src/DataTypes/DataTypeMap.cpp +++ b/src/DataTypes/DataTypeMap.cpp @@ -29,6 +29,7 @@ namespace ErrorCodes { extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int CANNOT_READ_MAP_FROM_TEXT; + extern const int BAD_ARGUMENTS; } @@ -38,6 +39,8 @@ DataTypeMap::DataTypeMap(const DataTypes & elems_) key_type = elems_[0]; value_type = elems_[1]; + assertKeyType(); + nested = std::make_shared( std::make_shared(DataTypes{key_type, value_type}, Names{"keys", "values"})); } @@ -45,7 +48,19 @@ DataTypeMap::DataTypeMap(const DataTypes & elems_) DataTypeMap::DataTypeMap(const DataTypePtr & key_type_, const DataTypePtr & value_type_) : key_type(key_type_), value_type(value_type_) , nested(std::make_shared( - std::make_shared(DataTypes{key_type_, value_type_}, Names{"keys", "values"}))) {} + std::make_shared(DataTypes{key_type_, value_type_}, Names{"keys", "values"}))) +{ + assertKeyType(); +} + +void DataTypeMap::assertKeyType() const +{ + if (!key_type->isValueRepresentedByInteger() && !isStringOrFixedString(*key_type) && !WhichDataType(key_type).isNothing()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Type of Map key must be a type, that can be represented by integer 
or string," + " but {} given", key_type->getName()); +} + std::string DataTypeMap::doGetName() const { diff --git a/src/DataTypes/DataTypeMap.h b/src/DataTypes/DataTypeMap.h index ea495f05548..2a9173dceae 100644 --- a/src/DataTypes/DataTypeMap.h +++ b/src/DataTypes/DataTypeMap.h @@ -98,6 +98,8 @@ private: template void deserializeTextImpl(IColumn & column, ReadBuffer & istr, bool need_safe_get_int_key, Reader && reader) const; + + void assertKeyType() const; }; } diff --git a/src/Functions/array/arrayElement.cpp b/src/Functions/array/arrayElement.cpp index 7d053988cae..cca252216b3 100644 --- a/src/Functions/array/arrayElement.cpp +++ b/src/Functions/array/arrayElement.cpp @@ -872,7 +872,7 @@ bool FunctionArrayElement::matchKeyToIndexNumberConst( if (!data_numeric) return false; - if (index.getType() != Field::Types::UInt64 && index.getType() != Field::Types::Int64) + if (index.getType() != Field::Types::UInt64 && index.getType() != Field::Types::Int64 && index.getType() != Field::Types::Int128) return false; MatcherNumberConst matcher{data_numeric->getData(), get(index)}; @@ -910,6 +910,7 @@ bool FunctionArrayElement::matchKeyToIndex( || matchKeyToIndexNumber(data, offsets, arguments, matched_idxs) || matchKeyToIndexNumber(data, offsets, arguments, matched_idxs) || matchKeyToIndexNumber(data, offsets, arguments, matched_idxs) + || matchKeyToIndexNumber(data, offsets, arguments, matched_idxs) || matchKeyToIndexString(data, offsets, arguments, matched_idxs); } @@ -925,6 +926,7 @@ bool FunctionArrayElement::matchKeyToIndexConst( || matchKeyToIndexNumberConst(data, offsets, index, matched_idxs) || matchKeyToIndexNumberConst(data, offsets, index, matched_idxs) || matchKeyToIndexNumberConst(data, offsets, index, matched_idxs) + || matchKeyToIndexNumberConst(data, offsets, index, matched_idxs) || matchKeyToIndexStringConst(data, offsets, index, matched_idxs); } @@ -945,11 +947,14 @@ ColumnPtr FunctionArrayElement::executeMap( indices_column->reserve(input_rows_count); auto & indices_data = assert_cast &>(*indices_column).getData(); + std::cerr << "types: " << arguments[0].type->getName() << " " << arguments[1].type->getName() << "\n"; + std::cerr << "columns: " << arguments[0].column->dumpStructure() << " " << arguments[1].column->dumpStructure() << "\n"; + if (!isColumnConst(*arguments[1].column)) { if (input_rows_count > 0 && !matchKeyToIndex(keys_data, offsets, arguments, indices_data)) throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Illegal types of arguments: {}, {} for function ", + "Illegal types of arguments: {}, {} for function {}", arguments[0].type->getName(), arguments[1].type->getName(), getName()); } else diff --git a/src/Interpreters/convertFieldToType.cpp b/src/Interpreters/convertFieldToType.cpp index 73bf493fa65..5bde9c8ec1a 100644 --- a/src/Interpreters/convertFieldToType.cpp +++ b/src/Interpreters/convertFieldToType.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -276,6 +277,40 @@ Field convertFieldToTypeImpl(const Field & src, const IDataType & type, const ID return have_unconvertible_element ? 
Field(Null()) : Field(res); } } + else if (const DataTypeMap * type_map = typeid_cast(&type)) + { + if (src.getType() == Field::Types::Map) + { + const auto & src_map = src.get(); + + const auto & key_type = *type_map->getKeyType(); + const auto & value_type = *type_map->getValueType(); + + bool have_unconvertible_element = false; + Map res(src_map.size()); + + for (size_t i = 0; i < src_map.size(); ++i) + { + const auto & src_tuple = src_map[i].safeGet(); + assert(src_tuple.size() == 2); + Tuple res_tuple(2); + + res_tuple[0] = convertFieldToType(src_tuple[0], key_type); + res_tuple[1] = convertFieldToType(src_tuple[1], value_type); + + if ((res_tuple[0].isNull() && !key_type.isNullable()) + || (res_tuple[1].isNull() && !value_type.isNullable())) + { + // See the comment for Tuples above. + have_unconvertible_element = true; + } + + res[i] = std::move(res_tuple); + } + + return have_unconvertible_element ? Field(Null()) : Field(res); + } + } else if (const DataTypeAggregateFunction * agg_func_type = typeid_cast(&type)) { if (src.getType() != Field::Types::AggregateFunctionState) diff --git a/src/Parsers/ExpressionElementParsers.cpp b/src/Parsers/ExpressionElementParsers.cpp index e7cd85798b9..fcb9a55c260 100644 --- a/src/Parsers/ExpressionElementParsers.cpp +++ b/src/Parsers/ExpressionElementParsers.cpp @@ -1979,7 +1979,6 @@ bool ParserExpressionElement::parseImpl(Pos & pos, ASTPtr & node, Expected & exp { return ParserSubquery().parse(pos, node, expected) || ParserTupleOfLiterals().parse(pos, node, expected) - || ParserMapOfLiterals().parse(pos, node, expected) || ParserParenthesisExpression().parse(pos, node, expected) || ParserArrayOfLiterals().parse(pos, node, expected) || ParserArray().parse(pos, node, expected) diff --git a/src/Parsers/ExpressionElementParsers.h b/src/Parsers/ExpressionElementParsers.h index ba18fc2cddd..02bd5d896e4 100644 --- a/src/Parsers/ExpressionElementParsers.h +++ b/src/Parsers/ExpressionElementParsers.h @@ -324,18 +324,6 @@ protected: } }; -class ParserMapOfLiterals : public IParserBase -{ -public: - ParserCollectionOfLiterals map_parser{TokenType::OpeningCurlyBrace, TokenType::ClosingCurlyBrace}; -protected: - const char * getName() const override { return "map"; } - bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override - { - return map_parser.parse(pos, node, expected); - } -}; - class ParserArrayOfLiterals : public IParserBase { public: diff --git a/src/Processors/Formats/Impl/ConstantExpressionTemplate.cpp b/src/Processors/Formats/Impl/ConstantExpressionTemplate.cpp index d7a65c2f15d..c00dcd7a579 100644 --- a/src/Processors/Formats/Impl/ConstantExpressionTemplate.cpp +++ b/src/Processors/Formats/Impl/ConstantExpressionTemplate.cpp @@ -203,12 +203,6 @@ private: if (not_null == array.end()) return true; } - else if (literal->value.getType() == Field::Types::Map) - { - const Map & map = literal->value.get(); - if (map.size() % 2) - return false; - } String column_name = "_dummy_" + std::to_string(replaced_literals.size()); replaced_literals.emplace_back(literal, column_name, force_nullable); @@ -481,14 +475,12 @@ bool ConstantExpressionTemplate::parseLiteralAndAssertType(ReadBuffer & istr, co /// TODO faster way to check types without using Parsers ParserArrayOfLiterals parser_array; ParserTupleOfLiterals parser_tuple; - ParserMapOfLiterals parser_map; Tokens tokens_number(istr.position(), istr.buffer().end()); IParser::Pos iterator(tokens_number, settings.max_parser_depth); Expected expected; ASTPtr ast; - if (!parser_array.parse(iterator, 
ast, expected) && !parser_tuple.parse(iterator, ast, expected) - && !parser_map.parse(iterator, ast, expected)) + if (!parser_array.parse(iterator, ast, expected) && !parser_tuple.parse(iterator, ast, expected)) return false; istr.position() = const_cast(iterator->begin); From 9b72255ca4fd4d1ec7fd090dd9b39ab16ec6965e Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 18 Feb 2021 02:09:46 +0300 Subject: [PATCH 0400/2357] Implement compression for all columns except LowCardinality --- src/Columns/ColumnArray.cpp | 25 +++++++++++++- src/Columns/ColumnArray.h | 5 ++- src/Columns/ColumnDecimal.cpp | 25 ++++++++++++++ src/Columns/ColumnDecimal.h | 2 ++ src/Columns/ColumnFixedString.cpp | 30 ++++++++++++++++- src/Columns/ColumnFixedString.h | 2 ++ src/Columns/ColumnMap.h | 2 ++ src/Columns/ColumnNullable.cpp | 15 +++++++++ src/Columns/ColumnNullable.h | 2 ++ src/Columns/ColumnString.cpp | 54 +++++++++++++++++++++++++++++++ src/Columns/ColumnString.h | 2 ++ src/Columns/ColumnTuple.cpp | 24 +++++++++++++- src/Columns/ColumnTuple.h | 1 + src/Columns/ColumnUnique.h | 5 +++ 14 files changed, 188 insertions(+), 6 deletions(-) diff --git a/src/Columns/ColumnArray.cpp b/src/Columns/ColumnArray.cpp index 8c0e06424e7..e8a48672435 100644 --- a/src/Columns/ColumnArray.cpp +++ b/src/Columns/ColumnArray.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -369,8 +370,12 @@ void ColumnArray::compareColumn(const IColumn & rhs, size_t rhs_row_num, compare_results, direction, nan_direction_hint); } + +namespace +{ + template -struct ColumnArray::Cmp +struct Cmp { const ColumnArray & parent; int nan_direction_hint; @@ -390,6 +395,9 @@ struct ColumnArray::Cmp } }; +} + + void ColumnArray::reserve(size_t n) { getOffsets().reserve(n); @@ -912,6 +920,21 @@ void ColumnArray::updatePermutationWithCollation(const Collator & collator, bool updatePermutationImpl(limit, res, equal_range, Cmp(*this, nan_direction_hint, &collator)); } +ColumnPtr ColumnArray::compress() const +{ + ColumnPtr data_compressed = data->compress(); + ColumnPtr offsets_compressed = offsets->compress(); + + size_t byte_size = data_compressed->byteSize() + offsets_compressed->byteSize(); + + return ColumnCompressed::create(size(), byte_size, + [data_compressed = std::move(data_compressed), offsets_compressed = std::move(offsets_compressed)] + { + return ColumnArray::create(data_compressed->decompress(), offsets_compressed->decompress()); + }); +} + + ColumnPtr ColumnArray::replicate(const Offsets & replicate_offsets) const { if (replicate_offsets.empty()) diff --git a/src/Columns/ColumnArray.h b/src/Columns/ColumnArray.h index e81ecbc1ca0..1caaf672d49 100644 --- a/src/Columns/ColumnArray.h +++ b/src/Columns/ColumnArray.h @@ -123,6 +123,8 @@ public: void gather(ColumnGathererStream & gatherer_stream) override; + ColumnPtr compress() const override; + void forEachSubcolumn(ColumnCallback callback) override { callback(offsets); @@ -183,9 +185,6 @@ private: template void updatePermutationImpl(size_t limit, Permutation & res, EqualRanges & equal_range, Comparator cmp) const; - - template - struct Cmp; }; diff --git a/src/Columns/ColumnDecimal.cpp b/src/Columns/ColumnDecimal.cpp index ddc971032b6..bb61f60706e 100644 --- a/src/Columns/ColumnDecimal.cpp +++ b/src/Columns/ColumnDecimal.cpp @@ -14,6 +14,7 @@ #include #include +#include #include @@ -346,6 +347,30 @@ void ColumnDecimal::gather(ColumnGathererStream & gatherer) gatherer.gather(*this); } +template +ColumnPtr ColumnDecimal::compress() const +{ + size_t source_size = 
data.size() * sizeof(T); + + /// Don't compress small blocks. + if (source_size < 4096) /// A wild guess. + return ColumnCompressed::wrap(this->getPtr()); + + auto compressed = ColumnCompressed::compressBuffer(data.data(), source_size); + + if (!compressed) + return ColumnCompressed::wrap(this->getPtr()); + + return ColumnCompressed::create(data.size(), compressed->size(), + [compressed = std::move(compressed), column_size = data.size(), scale = this->scale] + { + auto res = ColumnDecimal::create(column_size, scale); + ColumnCompressed::decompressBuffer( + compressed->data(), res->getData().data(), compressed->size(), column_size * sizeof(T)); + return res; + }); +} + template void ColumnDecimal::getExtremes(Field & min, Field & max) const { diff --git a/src/Columns/ColumnDecimal.h b/src/Columns/ColumnDecimal.h index ef841292a7d..5016ddca791 100644 --- a/src/Columns/ColumnDecimal.h +++ b/src/Columns/ColumnDecimal.h @@ -172,6 +172,8 @@ public: return false; } + ColumnPtr compress() const override; + void insertValue(const T value) { data.push_back(value); } Container & getData() { return data; } diff --git a/src/Columns/ColumnFixedString.cpp b/src/Columns/ColumnFixedString.cpp index 55e387ff2ee..278c2fef5f8 100644 --- a/src/Columns/ColumnFixedString.cpp +++ b/src/Columns/ColumnFixedString.cpp @@ -1,6 +1,7 @@ #include - #include +#include + #include #include #include @@ -446,4 +447,31 @@ void ColumnFixedString::getExtremes(Field & min, Field & max) const get(max_idx, max); } +ColumnPtr ColumnFixedString::compress() const +{ + size_t source_size = chars.size() * n; + + /// Don't compress small blocks. + if (source_size < 4096) /// A wild guess. + return ColumnCompressed::wrap(this->getPtr()); + + auto compressed = ColumnCompressed::compressBuffer(chars.data(), source_size); + + if (!compressed) + return ColumnCompressed::wrap(this->getPtr()); + + size_t column_size = size(); + + return ColumnCompressed::create(column_size, compressed->size(), + [compressed = std::move(compressed), column_size, n = n] + { + size_t chars_size = n * column_size; + auto res = ColumnFixedString::create(n); + res->getChars().resize(chars_size); + ColumnCompressed::decompressBuffer( + compressed->data(), res->getChars().data(), compressed->size(), chars_size); + return res; + }); +} + } diff --git a/src/Columns/ColumnFixedString.h b/src/Columns/ColumnFixedString.h index 286b3a752dc..1bb7f922f3e 100644 --- a/src/Columns/ColumnFixedString.h +++ b/src/Columns/ColumnFixedString.h @@ -156,6 +156,8 @@ public: void gather(ColumnGathererStream & gatherer_stream) override; + ColumnPtr compress() const override; + void reserve(size_t size) override { chars.reserve(n * size); diff --git a/src/Columns/ColumnMap.h b/src/Columns/ColumnMap.h index c1948491db5..a970f67bd46 100644 --- a/src/Columns/ColumnMap.h +++ b/src/Columns/ColumnMap.h @@ -91,6 +91,8 @@ public: const ColumnTuple & getNestedData() const { return assert_cast(getNestedColumn().getData()); } ColumnTuple & getNestedData() { return assert_cast(getNestedColumn().getData()); } + + ColumnPtr compress() const override { return nested->compress(); } }; } diff --git a/src/Columns/ColumnNullable.cpp b/src/Columns/ColumnNullable.cpp index 35ce005073a..4e5cc2b4cf7 100644 --- a/src/Columns/ColumnNullable.cpp +++ b/src/Columns/ColumnNullable.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include @@ -511,6 +512,20 @@ void ColumnNullable::protect() getNullMapColumn().protect(); } +ColumnPtr ColumnNullable::compress() const +{ + ColumnPtr nested_compressed = 
nested_column->compress(); + ColumnPtr null_map_compressed = null_map->compress(); + + size_t byte_size = nested_column->byteSize() + null_map->byteSize(); + + return ColumnCompressed::create(size(), byte_size, + [nested_column = std::move(nested_column), null_map = std::move(null_map)] + { + return ColumnNullable::create(nested_column->decompress(), null_map->decompress()); + }); +} + namespace { diff --git a/src/Columns/ColumnNullable.h b/src/Columns/ColumnNullable.h index ade2c106627..8d267de8644 100644 --- a/src/Columns/ColumnNullable.h +++ b/src/Columns/ColumnNullable.h @@ -117,6 +117,8 @@ public: void gather(ColumnGathererStream & gatherer_stream) override; + ColumnPtr compress() const override; + void forEachSubcolumn(ColumnCallback callback) override { callback(nested_column); diff --git a/src/Columns/ColumnString.cpp b/src/Columns/ColumnString.cpp index 00d6349408f..190517bfeb9 100644 --- a/src/Columns/ColumnString.cpp +++ b/src/Columns/ColumnString.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -525,6 +526,59 @@ void ColumnString::getExtremes(Field & min, Field & max) const } +ColumnPtr ColumnString::compress() const +{ + size_t source_chars_size = chars.size(); + size_t source_offsets_size = offsets.size() * sizeof(Offset); + + /// Don't compress small blocks. + if (source_chars_size < 4096) /// A wild guess. + return ColumnCompressed::wrap(this->getPtr()); + + auto chars_compressed = ColumnCompressed::compressBuffer(chars.data(), source_chars_size); + auto offsets_compressed = ColumnCompressed::compressBuffer(offsets.data(), source_offsets_size); + + /// Return original column if not compressable. + if (!chars_compressed && !offsets_compressed) + return ColumnCompressed::wrap(this->getPtr()); + + if (!chars_compressed) + { + chars_compressed = std::make_shared>(source_chars_size); + memcpy(chars_compressed->data(), chars.data(), source_chars_size); + } + + if (!offsets_compressed) + { + offsets_compressed = std::make_shared>(source_offsets_size); + memcpy(offsets_compressed->data(), offsets.data(), source_offsets_size); + } + + return ColumnCompressed::create(offsets.size(), chars_compressed->size() + offsets_compressed->size(), + [ + chars_compressed = std::move(chars_compressed), + offsets_compressed = std::move(offsets_compressed), + source_chars_size, + source_offsets_elements = offsets.size() + ] + { + auto res = ColumnString::create(); + + res->getChars().resize(source_chars_size); + res->getOffsets().resize(source_offsets_elements); + + ColumnCompressed::decompressBuffer( + chars_compressed->data(), res->getChars().data(), chars_compressed->size(), source_chars_size); + + ColumnCompressed::decompressBuffer( + offsets_compressed->data(), res->getOffsets().data(), offsets_compressed->size(), source_offsets_elements * sizeof(Offset)); + + return res; + }); + +} + + int ColumnString::compareAtWithCollation(size_t n, size_t m, const IColumn & rhs_, int, const Collator & collator) const { const ColumnString & rhs = assert_cast(rhs_); diff --git a/src/Columns/ColumnString.h b/src/Columns/ColumnString.h index c1e76c5e28e..843e445d1a0 100644 --- a/src/Columns/ColumnString.h +++ b/src/Columns/ColumnString.h @@ -261,6 +261,8 @@ public: void gather(ColumnGathererStream & gatherer_stream) override; + ColumnPtr compress() const override; + void reserve(size_t n) override; void getExtremes(Field & min, Field & max) const override; diff --git a/src/Columns/ColumnTuple.cpp b/src/Columns/ColumnTuple.cpp index fa5a15d0351..1d85c67e7c6 100644 --- 
a/src/Columns/ColumnTuple.cpp +++ b/src/Columns/ColumnTuple.cpp @@ -1,6 +1,7 @@ #include #include +#include #include #include #include @@ -486,7 +487,7 @@ bool ColumnTuple::structureEquals(const IColumn & rhs) const bool ColumnTuple::isCollationSupported() const { - for (const auto& column : columns) + for (const auto & column : columns) { if (column->isCollationSupported()) return true; @@ -495,4 +496,25 @@ bool ColumnTuple::isCollationSupported() const } +ColumnPtr ColumnTuple::compress() const +{ + size_t byte_size = 0; + Columns compressed; + compressed.reserve(columns.size()); + for (const auto & column : columns) + { + auto compressed_column = column->compress(); + byte_size += compressed_column->byteSize(); + compressed.emplace_back(std::move(compressed_column)); + } + + return ColumnCompressed::create(size(), byte_size, + [compressed = std::move(compressed)] + { + for (auto & column : compressed) + column = column->decompress(); + return ColumnTuple::create(compressed); + }); +} + } diff --git a/src/Columns/ColumnTuple.h b/src/Columns/ColumnTuple.h index f763ca3fcba..818b29937bd 100644 --- a/src/Columns/ColumnTuple.h +++ b/src/Columns/ColumnTuple.h @@ -89,6 +89,7 @@ public: void forEachSubcolumn(ColumnCallback callback) override; bool structureEquals(const IColumn & rhs) const override; bool isCollationSupported() const override; + ColumnPtr compress() const override; size_t tupleSize() const { return columns.size(); } diff --git a/src/Columns/ColumnUnique.h b/src/Columns/ColumnUnique.h index 5d58b2484e0..d1c4a4e1183 100644 --- a/src/Columns/ColumnUnique.h +++ b/src/Columns/ColumnUnique.h @@ -28,6 +28,11 @@ namespace ErrorCodes extern const int ILLEGAL_COLUMN; } +/** Stores another column with unique values + * and also an index that allows to find position by value. + * + * This column is not used on it's own but only as implementation detail of ColumnLowCardinality. + */ template class ColumnUnique final : public COWHelper> { From 1781a64370c86c93be915db8673644cffe0e58df Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 18 Feb 2021 02:11:41 +0300 Subject: [PATCH 0401/2357] Whitespaces --- src/Columns/ColumnUnique.h | 2 +- src/Columns/ReverseIndex.h | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/Columns/ColumnUnique.h b/src/Columns/ColumnUnique.h index d1c4a4e1183..fbd3c3641b5 100644 --- a/src/Columns/ColumnUnique.h +++ b/src/Columns/ColumnUnique.h @@ -39,7 +39,7 @@ class ColumnUnique final : public COWHelper>; private: - explicit ColumnUnique(MutableColumnPtr && holder, bool is_nullable); + ColumnUnique(MutableColumnPtr && holder, bool is_nullable); explicit ColumnUnique(const IDataType & type); ColumnUnique(const ColumnUnique & other); diff --git a/src/Columns/ReverseIndex.h b/src/Columns/ReverseIndex.h index 154293acf99..35b0029fc7b 100644 --- a/src/Columns/ReverseIndex.h +++ b/src/Columns/ReverseIndex.h @@ -316,8 +316,8 @@ template class ReverseIndex { public: - explicit ReverseIndex(UInt64 num_prefix_rows_to_skip_, UInt64 base_index_) - : num_prefix_rows_to_skip(num_prefix_rows_to_skip_), base_index(base_index_), saved_hash_ptr(nullptr) {} + ReverseIndex(UInt64 num_prefix_rows_to_skip_, UInt64 base_index_) + : num_prefix_rows_to_skip(num_prefix_rows_to_skip_), base_index(base_index_), saved_hash_ptr(nullptr) {} void setColumn(ColumnType * column_); @@ -329,14 +329,16 @@ public: /// Returns the found data's index in the dictionary. If index is not built, builds it. 
UInt64 getInsertionPoint(StringRef data) { - if (!index) buildIndex(); + if (!index) + buildIndex(); return getIndexImpl(data); } /// Returns the found data's index in the dictionary if the #index is built, otherwise, returns a std::nullopt. std::optional getIndex(StringRef data) const { - if (!index) return {}; + if (!index) + return {}; return getIndexImpl(data); } From dcba99f4b1d3c1ed8b4838d00458271cfb2be8d4 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Thu, 18 Feb 2021 02:19:58 +0300 Subject: [PATCH 0402/2357] fix usage of 'distinct' combinator with 'state' combinator --- src/AggregateFunctions/AggregateFunctionDistinct.h | 5 +++++ .../01259_combinator_distinct_distributed.reference | 4 ++++ .../01259_combinator_distinct_distributed.sql | 9 +++++++++ 3 files changed, 18 insertions(+) diff --git a/src/AggregateFunctions/AggregateFunctionDistinct.h b/src/AggregateFunctions/AggregateFunctionDistinct.h index b481e2a28e7..b587bbebf6e 100644 --- a/src/AggregateFunctions/AggregateFunctionDistinct.h +++ b/src/AggregateFunctions/AggregateFunctionDistinct.h @@ -236,6 +236,11 @@ public: return true; } + bool isState() const override + { + return nested_func->isState(); + } + AggregateFunctionPtr getNestedFunction() const override { return nested_func; } }; diff --git a/tests/queries/0_stateless/01259_combinator_distinct_distributed.reference b/tests/queries/0_stateless/01259_combinator_distinct_distributed.reference index 096d5703292..72a41ac1d84 100644 --- a/tests/queries/0_stateless/01259_combinator_distinct_distributed.reference +++ b/tests/queries/0_stateless/01259_combinator_distinct_distributed.reference @@ -2,3 +2,7 @@ [0,1,2,3,4,5,6,7,8,9,10,11,12] 20 0.49237 +78 +[0,1,2,3,4,5,6,7,8,9,10,11,12] +20 +0.49237 diff --git a/tests/queries/0_stateless/01259_combinator_distinct_distributed.sql b/tests/queries/0_stateless/01259_combinator_distinct_distributed.sql index f851e64dbcb..f95d2d87b8e 100644 --- a/tests/queries/0_stateless/01259_combinator_distinct_distributed.sql +++ b/tests/queries/0_stateless/01259_combinator_distinct_distributed.sql @@ -1,3 +1,12 @@ +SET distributed_aggregation_memory_efficient = 1; + +SELECT sum(DISTINCT number % 13) FROM remote('127.0.0.{1,2}', numbers_mt(100000)); +SELECT arraySort(groupArray(DISTINCT number % 13)) FROM remote('127.0.0.{1,2}', numbers_mt(100000)); +SELECT finalizeAggregation(countState(DISTINCT toString(number % 20))) FROM remote('127.0.0.{1,2}', numbers_mt(100000)); +SELECT round(corrStable(DISTINCT x, y), 5) FROM (SELECT number % 10 AS x, number % 5 AS y FROM remote('127.0.0.{1,2}', numbers(1000))); + +SET distributed_aggregation_memory_efficient = 0; + SELECT sum(DISTINCT number % 13) FROM remote('127.0.0.{1,2}', numbers_mt(100000)); SELECT arraySort(groupArray(DISTINCT number % 13)) FROM remote('127.0.0.{1,2}', numbers_mt(100000)); SELECT finalizeAggregation(countState(DISTINCT toString(number % 20))) FROM remote('127.0.0.{1,2}', numbers_mt(100000)); From b7011f4f9c2a6df4144e9dec4a45c12e7fa62ec8 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 18 Feb 2021 02:52:07 +0300 Subject: [PATCH 0403/2357] Fix build --- src/Columns/ColumnTuple.cpp | 2 +- src/DataTypes/DataTypeLowCardinality.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/Columns/ColumnTuple.cpp b/src/Columns/ColumnTuple.cpp index 1d85c67e7c6..c7c5f7b97c6 100644 --- a/src/Columns/ColumnTuple.cpp +++ b/src/Columns/ColumnTuple.cpp @@ -509,7 +509,7 @@ ColumnPtr ColumnTuple::compress() const } return ColumnCompressed::create(size(), byte_size, - 
[compressed = std::move(compressed)] + [compressed = std::move(compressed)]() mutable { for (auto & column : compressed) column = column->decompress(); diff --git a/src/DataTypes/DataTypeLowCardinality.h b/src/DataTypes/DataTypeLowCardinality.h index 6ed2b792ce3..fc28ce0a59d 100644 --- a/src/DataTypes/DataTypeLowCardinality.h +++ b/src/DataTypes/DataTypeLowCardinality.h @@ -1,7 +1,9 @@ #pragma once + #include #include + namespace DB { From 634be2b933d87926fe79ce54bc037b4740dcf7de Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 18 Feb 2021 03:52:09 +0300 Subject: [PATCH 0404/2357] Fix error --- src/Columns/ColumnCompressed.cpp | 4 ++-- src/Columns/ColumnCompressed.h | 5 +++-- src/Columns/ColumnDecimal.cpp | 2 +- src/Columns/ColumnFixedString.cpp | 4 ++-- src/Columns/ColumnString.cpp | 18 +++--------------- src/Columns/ColumnVector.cpp | 2 +- 6 files changed, 12 insertions(+), 23 deletions(-) diff --git a/src/Columns/ColumnCompressed.cpp b/src/Columns/ColumnCompressed.cpp index d7d30745868..292c6968b86 100644 --- a/src/Columns/ColumnCompressed.cpp +++ b/src/Columns/ColumnCompressed.cpp @@ -15,7 +15,7 @@ namespace ErrorCodes } -std::shared_ptr> ColumnCompressed::compressBuffer(const void * data, size_t data_size) +std::shared_ptr> ColumnCompressed::compressBuffer(const void * data, size_t data_size, bool always_compress) { size_t max_dest_size = LZ4_COMPRESSBOUND(data_size); @@ -34,7 +34,7 @@ std::shared_ptr> ColumnCompressed::compressBuffer(const void * data, si throw Exception(ErrorCodes::CANNOT_COMPRESS, "Cannot compress column"); /// If compression is inefficient. - if (static_cast(compressed_size) * 2 > data_size) + if (!always_compress && static_cast(compressed_size) * 2 > data_size) return {}; /// Shrink to fit. diff --git a/src/Columns/ColumnCompressed.h b/src/Columns/ColumnCompressed.h index bd70005ac5d..f6b6bf22177 100644 --- a/src/Columns/ColumnCompressed.h +++ b/src/Columns/ColumnCompressed.h @@ -65,8 +65,9 @@ public: /// Helper methods for compression. - /// If data is not worth to be compressed - returns nullptr. Note: shared_ptr is to allow to be captured by std::function. - static std::shared_ptr> compressBuffer(const void * data, size_t data_size); + /// If data is not worth to be compressed and not 'always_compress' - returns nullptr. + /// Note: shared_ptr is to allow to be captured by std::function. + static std::shared_ptr> compressBuffer(const void * data, size_t data_size, bool always_compress); static void decompressBuffer( const void * compressed_data, void * decompressed_data, size_t compressed_size, size_t decompressed_size); diff --git a/src/Columns/ColumnDecimal.cpp b/src/Columns/ColumnDecimal.cpp index bb61f60706e..bad3a4c3402 100644 --- a/src/Columns/ColumnDecimal.cpp +++ b/src/Columns/ColumnDecimal.cpp @@ -356,7 +356,7 @@ ColumnPtr ColumnDecimal::compress() const if (source_size < 4096) /// A wild guess. 
return ColumnCompressed::wrap(this->getPtr()); - auto compressed = ColumnCompressed::compressBuffer(data.data(), source_size); + auto compressed = ColumnCompressed::compressBuffer(data.data(), source_size, false); if (!compressed) return ColumnCompressed::wrap(this->getPtr()); diff --git a/src/Columns/ColumnFixedString.cpp b/src/Columns/ColumnFixedString.cpp index 278c2fef5f8..84bd0561f01 100644 --- a/src/Columns/ColumnFixedString.cpp +++ b/src/Columns/ColumnFixedString.cpp @@ -449,13 +449,13 @@ void ColumnFixedString::getExtremes(Field & min, Field & max) const ColumnPtr ColumnFixedString::compress() const { - size_t source_size = chars.size() * n; + size_t source_size = chars.size(); /// Don't compress small blocks. if (source_size < 4096) /// A wild guess. return ColumnCompressed::wrap(this->getPtr()); - auto compressed = ColumnCompressed::compressBuffer(chars.data(), source_size); + auto compressed = ColumnCompressed::compressBuffer(chars.data(), source_size, false); if (!compressed) return ColumnCompressed::wrap(this->getPtr()); diff --git a/src/Columns/ColumnString.cpp b/src/Columns/ColumnString.cpp index 190517bfeb9..f46c96caf8c 100644 --- a/src/Columns/ColumnString.cpp +++ b/src/Columns/ColumnString.cpp @@ -535,24 +535,13 @@ ColumnPtr ColumnString::compress() const if (source_chars_size < 4096) /// A wild guess. return ColumnCompressed::wrap(this->getPtr()); - auto chars_compressed = ColumnCompressed::compressBuffer(chars.data(), source_chars_size); - auto offsets_compressed = ColumnCompressed::compressBuffer(offsets.data(), source_offsets_size); + auto chars_compressed = ColumnCompressed::compressBuffer(chars.data(), source_chars_size, false); /// Return original column if not compressable. - if (!chars_compressed && !offsets_compressed) + if (!chars_compressed) return ColumnCompressed::wrap(this->getPtr()); - if (!chars_compressed) - { - chars_compressed = std::make_shared>(source_chars_size); - memcpy(chars_compressed->data(), chars.data(), source_chars_size); - } - - if (!offsets_compressed) - { - offsets_compressed = std::make_shared>(source_offsets_size); - memcpy(offsets_compressed->data(), offsets.data(), source_offsets_size); - } + auto offsets_compressed = ColumnCompressed::compressBuffer(offsets.data(), source_offsets_size, true); return ColumnCompressed::create(offsets.size(), chars_compressed->size() + offsets_compressed->size(), [ @@ -575,7 +564,6 @@ ColumnPtr ColumnString::compress() const return res; }); - } diff --git a/src/Columns/ColumnVector.cpp b/src/Columns/ColumnVector.cpp index b8bfef7258e..19ba86c5120 100644 --- a/src/Columns/ColumnVector.cpp +++ b/src/Columns/ColumnVector.cpp @@ -533,7 +533,7 @@ ColumnPtr ColumnVector::compress() const if (source_size < 4096) /// A wild guess. 
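// The "wild guess" size threshold above and the new `always_compress` flag
// decide whether a column keeps its raw data or stores an LZ4-compressed copy.
// A simplified standalone sketch of that decision (the real
// ColumnCompressed::compressBuffer returns a shared buffer and throws a
// DB::Exception when LZ4 fails; this version just signals "keep as is" with
// std::nullopt):

#include <lz4.h>
#include <cstring>
#include <optional>
#include <vector>

std::optional<std::vector<char>> compressBufferSketch(const void * data, size_t data_size, bool always_compress)
{
    std::vector<char> compressed(LZ4_COMPRESSBOUND(data_size));

    int compressed_size = LZ4_compress_default(
        static_cast<const char *>(data), compressed.data(),
        static_cast<int>(data_size), static_cast<int>(compressed.size()));

    if (compressed_size <= 0)
        return std::nullopt;  /// LZ4 could not compress this block at all.

    /// If compression saves less than half of the bytes, it is not worth it,
    /// unless the caller insists (e.g. ColumnString offsets must always come
    /// back compressed so the decompression lambda can rely on it).
    if (!always_compress && static_cast<size_t>(compressed_size) * 2 > data_size)
        return std::nullopt;

    compressed.resize(compressed_size);  /// Shrink to fit.
    return compressed;
}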
return ColumnCompressed::wrap(this->getPtr()); - auto compressed = ColumnCompressed::compressBuffer(data.data(), source_size); + auto compressed = ColumnCompressed::compressBuffer(data.data(), source_size, false); if (!compressed) return ColumnCompressed::wrap(this->getPtr()); From 5007f7f0183f3cc6ce2b3580b99748ff7a3649ae Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 18 Feb 2021 05:57:15 +0300 Subject: [PATCH 0405/2357] Fix typo --- src/Columns/ColumnString.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Columns/ColumnString.cpp b/src/Columns/ColumnString.cpp index f46c96caf8c..8fd22e85e10 100644 --- a/src/Columns/ColumnString.cpp +++ b/src/Columns/ColumnString.cpp @@ -537,7 +537,7 @@ ColumnPtr ColumnString::compress() const auto chars_compressed = ColumnCompressed::compressBuffer(chars.data(), source_chars_size, false); - /// Return original column if not compressable. + /// Return original column if not compressible. if (!chars_compressed) return ColumnCompressed::wrap(this->getPtr()); From 04cb91a0fd1e3dc0f3a1b00d752d93b19a116e97 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 18 Feb 2021 06:02:41 +0300 Subject: [PATCH 0406/2357] Fix error --- src/Columns/ColumnMap.cpp | 10 ++++++++++ src/Columns/ColumnMap.h | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/src/Columns/ColumnMap.cpp b/src/Columns/ColumnMap.cpp index 1cfd7e6c4ef..cc2640a9cf6 100644 --- a/src/Columns/ColumnMap.cpp +++ b/src/Columns/ColumnMap.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -243,4 +244,13 @@ bool ColumnMap::structureEquals(const IColumn & rhs) const return false; } +ColumnPtr ColumnMap::compress() const +{ + auto compressed = nested->compress(); + return ColumnCompressed::create(size(), compressed->byteSize(), [compressed = std::move(compressed)] + { + return ColumnMap::create(compressed->decompress()); + }); +} + } diff --git a/src/Columns/ColumnMap.h b/src/Columns/ColumnMap.h index a970f67bd46..acae1574f4c 100644 --- a/src/Columns/ColumnMap.h +++ b/src/Columns/ColumnMap.h @@ -92,7 +92,7 @@ public: const ColumnTuple & getNestedData() const { return assert_cast(getNestedColumn().getData()); } ColumnTuple & getNestedData() { return assert_cast(getNestedColumn().getData()); } - ColumnPtr compress() const override { return nested->compress(); } + ColumnPtr compress() const override; }; } From adf5d24177b6d23d4788e531fa2267378c07aae6 Mon Sep 17 00:00:00 2001 From: M0r64n Date: Thu, 18 Feb 2021 11:36:17 +0400 Subject: [PATCH 0407/2357] Correct file engine settings tests --- .../01720_engine_file_empty_if_not_exists.sql | 1 + .../01721_engine_file_truncate_on_insert.sql | 21 ++++++++++--------- 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/tests/queries/0_stateless/01720_engine_file_empty_if_not_exists.sql b/tests/queries/0_stateless/01720_engine_file_empty_if_not_exists.sql index c04e01ccc88..d665dbc722f 100644 --- a/tests/queries/0_stateless/01720_engine_file_empty_if_not_exists.sql +++ b/tests/queries/0_stateless/01720_engine_file_empty_if_not_exists.sql @@ -13,3 +13,4 @@ SET engine_file_empty_if_not_exists=1; SELECT * FROM file_engine_table; SET engine_file_empty_if_not_exists=0; +DROP TABLE file_engine_table; diff --git a/tests/queries/0_stateless/01721_engine_file_truncate_on_insert.sql b/tests/queries/0_stateless/01721_engine_file_truncate_on_insert.sql index 65246db7963..42d935cc0dd 100644 --- a/tests/queries/0_stateless/01721_engine_file_truncate_on_insert.sql +++ 
b/tests/queries/0_stateless/01721_engine_file_truncate_on_insert.sql @@ -1,20 +1,21 @@ -INSERT INTO TABLE FUNCTION file('01718_file/test/data.TSV', 'TSV', 'id UInt32') VALUES ('file', 42); +DROP TABLE IF EXISTS test; + +INSERT INTO TABLE FUNCTION file('01718_file/test/data.TSV', 'TSV', 'id UInt32') VALUES (1); ATTACH TABLE test FROM '01718_file/test' (id UInt8) ENGINE=File(TSV); -CREATE TABLE file_engine_table (id UInt32) ENGINE=File(TabSeparated); - -INSERT INTO file_engine_table VALUES (1), (2), (3); -INSERT INTO file_engine_table VALUES (4); -SELECT * FROM file_engine_table; +INSERT INTO test VALUES (2), (3); +INSERT INTO test VALUES (4); +SELECT * FROM test; SET engine_file_truncate_on_insert=0; -INSERT INTO file_engine_table VALUES (5), (6); -SELECT * FROM file_engine_table; +INSERT INTO test VALUES (5), (6); +SELECT * FROM test; SET engine_file_truncate_on_insert=1; -INSERT INTO file_engine_table VALUES (0), (1), (2); -SELECT * FROM file_engine_table; +INSERT INTO test VALUES (0), (1), (2); +SELECT * FROM test; SET engine_file_truncate_on_insert=0; +DROP TABLE test; From 1ce9570fcb4919880c19b05986dd9f7691fefb6f Mon Sep 17 00:00:00 2001 From: M0r64n Date: Thu, 18 Feb 2021 07:50:15 +0000 Subject: [PATCH 0408/2357] Fix 01721_engine_file_truncate_on_insert.reference --- .../0_stateless/01721_engine_file_truncate_on_insert.reference | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/01721_engine_file_truncate_on_insert.reference b/tests/queries/0_stateless/01721_engine_file_truncate_on_insert.reference index a25fb4f0e7e..578661c9194 100644 --- a/tests/queries/0_stateless/01721_engine_file_truncate_on_insert.reference +++ b/tests/queries/0_stateless/01721_engine_file_truncate_on_insert.reference @@ -10,4 +10,4 @@ 6 0 1 -2 \ No newline at end of file +2 From 4278098f9a243c740961248ad2232e425bd567d9 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Thu, 18 Feb 2021 13:09:01 +0300 Subject: [PATCH 0409/2357] Reinterpret function added Decimal, DateTim64 support --- .../functions/type-conversion-functions.md | 10 ++- src/Functions/reinterpretAs.cpp | 65 ++++++++++++++----- .../01676_reinterpret_as.reference | 10 +++ .../0_stateless/01676_reinterpret_as.sql | 12 +++- 4 files changed, 76 insertions(+), 21 deletions(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 6bc274eba73..0cfeb282bb3 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -324,16 +324,20 @@ SELECT reinterpret(toInt8(-1), 'UInt8') as int_to_uint, └─────────────┴──────────────┴───────────────┘ ``` -## reinterpretAsUInt(8\|16\|32\|64\|256) {#type_conversion_function-reinterpretAsUInt8163264256} +## reinterpretAsUInt(8\|16\|32\|64\|256) {#reinterpretAsUInt8163264256} -## reinterpretAsInt(8\|16\|32\|64\|128\|256) {#type_conversion_function-reinterpretAsInt8163264128256} +## reinterpretAsInt(8\|16\|32\|64\|128\|256) {#reinterpretAsInt8163264128256} -## reinterpretAsFloat(32\|64) {##type_conversion_function-reinterpretAsFloat} +## reinterpretAsDecimal(32\|64\|128\|256) {#reinterpretAsDecimal3264128256} + +## reinterpretAsFloat(32\|64) {#type_conversion_function-reinterpretAsFloat} ## reinterpretAsDate {#type_conversion_function-reinterpretAsDate} ## reinterpretAsDateTime {#type_conversion_function-reinterpretAsDateTime} +## reinterpretAsDateTime64 {#type_conversion_function-reinterpretAsDateTime64} + ## 
reinterpretAsString {#type_conversion_function-reinterpretAsString} ## reinterpretAsFixedString {#type_conversion_function-reinterpretAsFixedString} diff --git a/src/Functions/reinterpretAs.cpp b/src/Functions/reinterpretAs.cpp index c15ba969fdb..3f4ba3d23e1 100644 --- a/src/Functions/reinterpretAs.cpp +++ b/src/Functions/reinterpretAs.cpp @@ -11,10 +11,13 @@ #include #include #include +#include +#include #include #include #include #include +#include #include #include @@ -158,7 +161,7 @@ public: { const auto * col_from = assert_cast(arguments[0].column.get()); - auto col_res = ToColumnType::create(); + auto col_res = numericColumnCreateHelper(static_cast(*result_type.get())); const ColumnString::Chars & data_from = col_from->getChars(); const ColumnString::Offsets & offsets_from = col_from->getOffsets(); @@ -185,7 +188,7 @@ public: { const auto * col_from_fixed = assert_cast(arguments[0].column.get()); - auto col_res = ToColumnType::create(); + auto col_res = numericColumnCreateHelper(static_cast(*result_type.get())); const ColumnString::Chars & data_from = col_from_fixed->getChars(); size_t step = col_from_fixed->getN(); @@ -209,12 +212,27 @@ public: } else if constexpr (CanBeReinterpretedAsNumeric) { - using FromTypeFieldType = typename FromType::FieldType; - const auto * col = assert_cast*>(arguments[0].column.get()); + using From = typename FromType::FieldType; + using To = typename ToType::FieldType; - auto col_res = ToColumnType::create(); - reinterpretImpl(col->getData(), col_res->getData()); - result = std::move(col_res); + using FromColumnType = std::conditional_t, ColumnDecimal, ColumnVector>; + + const auto * column_from = assert_cast(arguments[0].column.get()); + + auto column_to = numericColumnCreateHelper(static_cast(*result_type.get())); + + auto & from = column_from->getData(); + auto & to = column_to->getData(); + + size_t size = from.size(); + to.resize_fill(size); + + static constexpr size_t copy_size = std::min(sizeof(From), sizeof(To)); + + for (size_t i = 0; i < size; ++i) + memcpy(static_cast(&to[i]), static_cast(&from[i]), copy_size); + + result = std::move(column_to); return true; } @@ -232,7 +250,7 @@ public: private: template static constexpr auto CanBeReinterpretedAsNumeric = - IsDataTypeNumber || + IsDataTypeDecimalOrNumber || std::is_same_v || std::is_same_v || std::is_same_v; @@ -243,7 +261,8 @@ private: type.isInt() || type.isDateOrDateTime() || type.isFloat() || - type.isUUID(); + type.isUUID() || + type.isDecimal(); } static void NO_INLINE executeToFixedString(const IColumn & src, ColumnFixedString & dst, size_t n) @@ -296,18 +315,32 @@ private: } } - template - static void reinterpretImpl(const PaddedPODArray & from, PaddedPODArray & to) + template + static typename Type::ColumnType::MutablePtr numericColumnCreateHelper(const Type & type) { + size_t column_size = 0; + + using ColumnType = typename Type::ColumnType; + + if constexpr (IsDataTypeDecimal) + return ColumnType::create(column_size, type.getScale()); + else + return ColumnType::create(column_size); + } + + template + static void reinterpretImpl(const FromContainer & from, ToContainer & to) + { + using From = typename FromContainer::value_type; + using To = typename ToContainer::value_type; + size_t size = from.size(); to.resize_fill(size); + static constexpr size_t copy_size = std::min(sizeof(From), sizeof(To)); + for (size_t i = 0; i < size; ++i) - { - memcpy(static_cast(&to[i]), - static_cast(&from[i]), - std::min(sizeof(From), sizeof(To))); - } + memcpy(static_cast(&to[i]), 
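// The memcpy in this loop is the whole trick: reinterpret(x, 'T') copies
// min(sizeof(From), sizeof(To)) raw bytes into a zero-filled destination,
// which is why Decimal and DateTime64 can now go through the same numeric
// path (IsDataTypeDecimalOrNumber) instead of being rejected. A standalone
// sketch of the byte-level operation, for illustration only (not the actual
// ClickHouse helper):

#include <algorithm>
#include <cstring>

template <typename To, typename From>
To reinterpretValueSketch(const From & from)
{
    To to{};  // value-initialised, so a wider target gets zero padding
    constexpr size_t copy_size = std::min(sizeof(From), sizeof(To));
    std::memcpy(&to, &from, copy_size);  // reuse the low (little-endian) bytes
    return to;
}

// e.g. reinterpretValueSketch<unsigned char>(static_cast<signed char>(-1)) == 255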
static_cast(&from[i]), copy_size); } }; diff --git a/tests/queries/0_stateless/01676_reinterpret_as.reference b/tests/queries/0_stateless/01676_reinterpret_as.reference index b39deb55a7f..459ca166dc1 100644 --- a/tests/queries/0_stateless/01676_reinterpret_as.reference +++ b/tests/queries/0_stateless/01676_reinterpret_as.reference @@ -28,4 +28,14 @@ Integer and String types 1 1 49 1 1 49 11 11 12593 +Dates +1970-01-01 1970-01-01 +1970-01-01 03:00:00 1970-01-01 03:00:00 +1970-01-01 03:00:00.000 1970-01-01 03:00:00.000 +Decimals +5.00 0.49 +5.00 0.49 +5.00 0.49 +5.00 0.49 +0.00 ReinterpretErrors diff --git a/tests/queries/0_stateless/01676_reinterpret_as.sql b/tests/queries/0_stateless/01676_reinterpret_as.sql index ff727f284bb..5eb94ed0a13 100644 --- a/tests/queries/0_stateless/01676_reinterpret_as.sql +++ b/tests/queries/0_stateless/01676_reinterpret_as.sql @@ -28,7 +28,15 @@ SELECT 'Integer and String types'; SELECT reinterpret(a, 'String'), reinterpretAsString(a), reinterpretAsUInt8('1') as a; SELECT reinterpret(a, 'String'), reinterpretAsString(a), reinterpretAsUInt8('11') as a; SELECT reinterpret(a, 'String'), reinterpretAsString(a), reinterpretAsUInt16('11') as a; +SELECT 'Dates'; +SELECT reinterpret(0, 'Date'), reinterpret('', 'Date'); +SELECT reinterpret(0, 'DateTime'), reinterpret('', 'DateTime'); +SELECT reinterpret(0, 'DateTime64'), reinterpret('', 'DateTime64'); +SELECT 'Decimals'; +SELECT reinterpret(toDecimal32(5, 2), 'Decimal32(2)'), reinterpret('1', 'Decimal32(2)'); +SELECT reinterpret(toDecimal64(5, 2), 'Decimal64(2)'), reinterpret('1', 'Decimal64(2)');; +SELECT reinterpret(toDecimal128(5, 2), 'Decimal128(2)'), reinterpret('1', 'Decimal128(2)'); +SELECT reinterpret(toDecimal256(5, 2), 'Decimal256(2)'), reinterpret('1', 'Decimal256(2)'); +SELECT reinterpret(toDateTime64(0, 0), 'Decimal64(2)'); SELECT 'ReinterpretErrors'; -SELECT reinterpret(toDecimal64(1, 2), 'UInt8'); -- {serverError 43} SELECT reinterpret('123', 'FixedString(1)'); -- {serverError 43} -SELECT reinterpret(toDateTime('9922337203.6854775808', 1), 'Decimal64(1)'); -- {serverError 43} From 5b597fdf446bb2039ae45d722ad423445a063a96 Mon Sep 17 00:00:00 2001 From: alesapin Date: Thu, 18 Feb 2021 13:23:48 +0300 Subject: [PATCH 0410/2357] Force sync setting and ability to start with broken log --- src/Coordination/Changelog.cpp | 90 +++++++++++-------- src/Coordination/Changelog.h | 6 +- src/Coordination/CoordinationSettings.h | 3 +- src/Coordination/InMemoryStateManager.cpp | 6 +- src/Coordination/NuKeeperLogStore.cpp | 9 +- src/Coordination/NuKeeperLogStore.h | 3 +- src/Coordination/tests/gtest_for_build.cpp | 89 ++++++++++++------ tests/config/config.d/test_keeper_port.xml | 1 + .../configs/enable_test_keeper.xml | 1 + 9 files changed, 137 insertions(+), 71 deletions(-) diff --git a/src/Coordination/Changelog.cpp b/src/Coordination/Changelog.cpp index 2d1bbfb4440..4358fa062e8 100644 --- a/src/Coordination/Changelog.cpp +++ b/src/Coordination/Changelog.cpp @@ -6,6 +6,8 @@ #include #include #include +#include +#include namespace DB { @@ -37,7 +39,7 @@ ChangelogVersion fromString(const std::string & version_str) namespace { -static constexpr auto DEFAULT_PREFIX = "changelog"; +constexpr auto DEFAULT_PREFIX = "changelog"; std::string formatChangelogPath(const std::string & prefix, const ChangelogFileDescription & name) { @@ -151,39 +153,56 @@ public: size_t readChangelog(IndexToLogEntry & logs, size_t start_log_idx, IndexToOffset & index_to_offset) { size_t total_read = 0; - while (!read_buf.eof()) + try { - total_read 
+= 1; - off_t pos = read_buf.count(); - ChangelogRecord record; - readIntBinary(record.header.version, read_buf); - readIntBinary(record.header.index, read_buf); - readIntBinary(record.header.term, read_buf); - readIntBinary(record.header.value_type, read_buf); - readIntBinary(record.header.blob_size, read_buf); - readIntBinary(record.header.blob_checksum, read_buf); - auto buffer = nuraft::buffer::alloc(record.header.blob_size); - auto buffer_begin = reinterpret_cast(buffer->data_begin()); - read_buf.readStrict(buffer_begin, record.header.blob_size); - index_to_offset[record.header.index] = pos; - - Checksum checksum = CityHash_v1_0_2::CityHash128(buffer_begin, record.header.blob_size); - if (checksum != record.header.blob_checksum) + while (!read_buf.eof()) { - throw Exception(ErrorCodes::CHECKSUM_DOESNT_MATCH, - "Checksums doesn't match for log {} (version {}), index {}, blob_size {}", - filepath, record.header.version, record.header.index, record.header.blob_size); - } - if (record.header.index < start_log_idx) - continue; + off_t pos = read_buf.count(); + ChangelogRecord record; + readIntBinary(record.header.version, read_buf); + readIntBinary(record.header.index, read_buf); + readIntBinary(record.header.term, read_buf); + readIntBinary(record.header.value_type, read_buf); + readIntBinary(record.header.blob_size, read_buf); + readIntBinary(record.header.blob_checksum, read_buf); + auto buffer = nuraft::buffer::alloc(record.header.blob_size); + auto buffer_begin = reinterpret_cast(buffer->data_begin()); + read_buf.readStrict(buffer_begin, record.header.blob_size); + index_to_offset[record.header.index] = pos; - auto log_entry = nuraft::cs_new(record.header.term, buffer, record.header.value_type); - if (!logs.try_emplace(record.header.index, log_entry).second) - throw Exception(ErrorCodes::CORRUPTED_DATA, "Duplicated index id {} in log {}", record.header.index, filepath); + Checksum checksum = CityHash_v1_0_2::CityHash128(buffer_begin, record.header.blob_size); + if (checksum != record.header.blob_checksum) + { + throw Exception(ErrorCodes::CHECKSUM_DOESNT_MATCH, + "Checksums doesn't match for log {} (version {}), index {}, blob_size {}", + filepath, record.header.version, record.header.index, record.header.blob_size); + } + + if (logs.count(record.header.index) != 0) + throw Exception(ErrorCodes::CORRUPTED_DATA, "Duplicated index id {} in log {}", record.header.index, filepath); + + total_read += 1; + + if (record.header.index < start_log_idx) + continue; + + auto log_entry = nuraft::cs_new(record.header.term, buffer, record.header.value_type); + + logs.emplace(record.header.index, log_entry); + } + } + catch (const Exception & ex) + { + LOG_WARNING(&Poco::Logger::get("RaftChangelog"), "Cannot completely read changelog on path {}, error: {}", filepath, ex.message()); + } + catch (...) 
+ { + tryLogCurrentException(&Poco::Logger::get("RaftChangelog")); } return total_read; } + private: std::string filepath; ReadBufferFromFile read_buf; @@ -239,11 +258,12 @@ void Changelog::readChangelogAndInitWriter(size_t from_log_idx) } } - if (existing_changelogs.size() > 0 && read_from_last < entries_in_last) + if (!existing_changelogs.empty() && read_from_last < entries_in_last) { auto description = existing_changelogs.rbegin()->second; current_writer = std::make_unique(description.path, WriteMode::Append, description.from_log_idx); current_writer->setEntriesWritten(read_from_last); + current_writer->truncateToLength(index_to_start_pos[read_from_last]); } else { @@ -287,7 +307,7 @@ ChangelogRecord Changelog::buildRecord(size_t index, nuraft::ptr log_entry) +void Changelog::appendEntry(size_t index, nuraft::ptr log_entry, bool force_sync) { if (!current_writer) throw Exception(ErrorCodes::LOGICAL_ERROR, "Changelog must be initialized before appending records"); @@ -298,14 +318,14 @@ void Changelog::appendEntry(size_t index, nuraft::ptr log_ent if (current_writer->getEntriesWritten() == rotate_interval) rotate(index); - auto offset = current_writer->appendRecord(buildRecord(index, log_entry), false); + auto offset = current_writer->appendRecord(buildRecord(index, log_entry), force_sync); if (!index_to_start_pos.try_emplace(index, offset).second) throw Exception(ErrorCodes::LOGICAL_ERROR, "Record with index {} already exists", index); logs[index] = makeClone(log_entry); } -void Changelog::writeAt(size_t index, nuraft::ptr log_entry) +void Changelog::writeAt(size_t index, nuraft::ptr log_entry, bool force_sync) { if (index_to_start_pos.count(index) == 0) throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot write at index {} because changelog doesn't contain it", index); @@ -347,7 +367,7 @@ void Changelog::writeAt(size_t index, nuraft::ptr log_entry) current_writer->setEntriesWritten(entries_written); - appendEntry(index, log_entry); + appendEntry(index, log_entry, force_sync); } void Changelog::compact(size_t up_to_log_idx) @@ -441,7 +461,7 @@ nuraft::ptr Changelog::serializeEntriesToBuffer(size_t index, in return buf_out; } -void Changelog::applyEntriesFromBuffer(size_t index, nuraft::buffer & buffer) +void Changelog::applyEntriesFromBuffer(size_t index, nuraft::buffer & buffer, bool force_sync) { buffer.pos(0); int num_logs = buffer.get_int(); @@ -456,9 +476,9 @@ void Changelog::applyEntriesFromBuffer(size_t index, nuraft::buffer & buffer) LogEntryPtr log_entry = nuraft::log_entry::deserialize(*buf_local); if (i == 0 && logs.count(cur_idx)) - writeAt(cur_idx, log_entry); + writeAt(cur_idx, log_entry, force_sync); else - appendEntry(cur_idx, log_entry); + appendEntry(cur_idx, log_entry, force_sync); } } diff --git a/src/Coordination/Changelog.h b/src/Coordination/Changelog.h index 5f38f68750e..38d83819da2 100644 --- a/src/Coordination/Changelog.h +++ b/src/Coordination/Changelog.h @@ -64,9 +64,9 @@ public: void readChangelogAndInitWriter(size_t from_log_idx); - void appendEntry(size_t index, LogEntryPtr log_entry); + void appendEntry(size_t index, LogEntryPtr log_entry, bool force_sync); - void writeAt(size_t index, LogEntryPtr log_entry); + void writeAt(size_t index, LogEntryPtr log_entry, bool force_sync); void compact(size_t up_to_log_idx); @@ -88,7 +88,7 @@ public: nuraft::ptr serializeEntriesToBuffer(size_t index, int32_t cnt); - void applyEntriesFromBuffer(size_t index, nuraft::buffer & buffer); + void applyEntriesFromBuffer(size_t index, nuraft::buffer & buffer, bool force_sync); 
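// What the truncate-on-broken-read logic added in this commit amounts to:
// parse records until the first checksum/EOF failure, remember how many bytes
// belonged to complete records, and cut the file back to that point so the
// writer can append again instead of the node refusing to start. A standalone
// sketch with a hypothetical fixed-size record type (the real changelog
// records are variable-length ChangelogRecord entries with a CityHash
// checksum):

#include <cstdint>
#include <filesystem>
#include <fstream>
#include <string>

size_t recoverLogSketch(const std::string & path)
{
    struct Record { uint64_t index; uint64_t term; };

    std::ifstream in(path, std::ios::binary);
    uint64_t good_bytes = 0;
    size_t entries_read = 0;

    Record record;
    while (in.read(reinterpret_cast<char *>(&record), sizeof(record)))
    {
        good_bytes += sizeof(record);  // this record was read completely
        ++entries_read;
    }
    in.close();

    /// Drop the torn tail, if any, so subsequent appends start from a clean offset.
    if (std::filesystem::file_size(path) > good_bytes)
        std::filesystem::resize_file(path, good_bytes);

    return entries_read;
}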
void flush(); diff --git a/src/Coordination/CoordinationSettings.h b/src/Coordination/CoordinationSettings.h index 0f1afb3fffe..ba3d3a7141a 100644 --- a/src/Coordination/CoordinationSettings.h +++ b/src/Coordination/CoordinationSettings.h @@ -29,7 +29,8 @@ struct Settings; M(Milliseconds, shutdown_timeout, 5000, "How many time we will until RAFT shutdown", 0) \ M(Milliseconds, startup_timeout, 30000, "How many time we will until RAFT to start", 0) \ M(LogsLevel, raft_logs_level, LogsLevel::information, "Log internal RAFT logs into main server log level. Valid values: 'trace', 'debug', 'information', 'warning', 'error', 'fatal', 'none'", 0) \ - M(UInt64, rotate_log_storage_interval, 500000, "How many records will be stored in one log storage file", 0) + M(UInt64, rotate_log_storage_interval, 500000, "How many records will be stored in one log storage file", 0) \ + M(Bool, force_sync, true, " Call fsync on each change in RAFT changelog", 0) DECLARE_SETTINGS_TRAITS(CoordinationSettingsTraits, LIST_OF_COORDINATION_SETTINGS) diff --git a/src/Coordination/InMemoryStateManager.cpp b/src/Coordination/InMemoryStateManager.cpp index 0423d2466f2..084ab043d12 100644 --- a/src/Coordination/InMemoryStateManager.cpp +++ b/src/Coordination/InMemoryStateManager.cpp @@ -12,7 +12,7 @@ namespace ErrorCodes InMemoryStateManager::InMemoryStateManager(int server_id_, const std::string & host, int port, const std::string & logs_path) : my_server_id(server_id_) , my_port(port) - , log_store(nuraft::cs_new(logs_path, 5000)) + , log_store(nuraft::cs_new(logs_path, 5000, true)) , cluster_config(nuraft::cs_new()) { auto peer_config = nuraft::cs_new(my_server_id, host + ":" + std::to_string(port)); @@ -25,7 +25,9 @@ InMemoryStateManager::InMemoryStateManager( const Poco::Util::AbstractConfiguration & config, const CoordinationSettingsPtr & coordination_settings) : my_server_id(my_server_id_) - , log_store(nuraft::cs_new(config.getString(config_prefix + ".log_storage_path"), coordination_settings->rotate_log_storage_interval)) + , log_store(nuraft::cs_new( + config.getString(config_prefix + ".log_storage_path"), + coordination_settings->rotate_log_storage_interval, coordination_settings->force_sync)) , cluster_config(nuraft::cs_new()) { diff --git a/src/Coordination/NuKeeperLogStore.cpp b/src/Coordination/NuKeeperLogStore.cpp index fa8d6d6c299..8834bdc4d69 100644 --- a/src/Coordination/NuKeeperLogStore.cpp +++ b/src/Coordination/NuKeeperLogStore.cpp @@ -3,8 +3,9 @@ namespace DB { -NuKeeperLogStore::NuKeeperLogStore(const std::string & changelogs_path, size_t rotate_interval_) +NuKeeperLogStore::NuKeeperLogStore(const std::string & changelogs_path, size_t rotate_interval_, bool force_sync_) : changelog(changelogs_path, rotate_interval_) + , force_sync(force_sync_) { } @@ -36,7 +37,7 @@ size_t NuKeeperLogStore::append(nuraft::ptr & entry) { std::lock_guard lock(changelog_lock); size_t idx = changelog.getNextEntryIndex(); - changelog.appendEntry(idx, entry); + changelog.appendEntry(idx, entry, force_sync); return idx; } @@ -44,7 +45,7 @@ size_t NuKeeperLogStore::append(nuraft::ptr & entry) void NuKeeperLogStore::write_at(size_t index, nuraft::ptr & entry) { std::lock_guard lock(changelog_lock); - changelog.writeAt(index, entry); + changelog.writeAt(index, entry, force_sync); } nuraft::ptr>> NuKeeperLogStore::log_entries(size_t start, size_t end) @@ -91,7 +92,7 @@ bool NuKeeperLogStore::flush() void NuKeeperLogStore::apply_pack(size_t index, nuraft::buffer & pack) { std::lock_guard lock(changelog_lock); - 
changelog.applyEntriesFromBuffer(index, pack); + changelog.applyEntriesFromBuffer(index, pack, force_sync); } size_t NuKeeperLogStore::size() const diff --git a/src/Coordination/NuKeeperLogStore.h b/src/Coordination/NuKeeperLogStore.h index 49d5dbfdf7c..0ff92220316 100644 --- a/src/Coordination/NuKeeperLogStore.h +++ b/src/Coordination/NuKeeperLogStore.h @@ -11,7 +11,7 @@ namespace DB class NuKeeperLogStore : public nuraft::log_store { public: - NuKeeperLogStore(const std::string & changelogs_path, size_t rotate_interval_); + NuKeeperLogStore(const std::string & changelogs_path, size_t rotate_interval_, bool force_sync_); void init(size_t from_log_idx); @@ -44,6 +44,7 @@ public: private: mutable std::mutex changelog_lock; Changelog changelog; + bool force_sync; }; } diff --git a/src/Coordination/tests/gtest_for_build.cpp b/src/Coordination/tests/gtest_for_build.cpp index 81e1751c08c..3fd2db84e3e 100644 --- a/src/Coordination/tests/gtest_for_build.cpp +++ b/src/Coordination/tests/gtest_for_build.cpp @@ -6,7 +6,8 @@ #endif #if USE_NURAFT - +#include +#include #include #include #include @@ -20,6 +21,7 @@ #include #include #include +#include #include // Y_IGNORE #include #include @@ -372,7 +374,7 @@ DB::LogEntryPtr getLogEntry(const std::string & s, size_t term) TEST(CoordinationTest, ChangelogTestSimple) { ChangelogDirTest test("./logs"); - DB::NuKeeperLogStore changelog("./logs", 5); + DB::NuKeeperLogStore changelog("./logs", 5, true); changelog.init(1); auto entry = getLogEntry("hello world", 77); changelog.append(entry); @@ -386,7 +388,7 @@ TEST(CoordinationTest, ChangelogTestSimple) TEST(CoordinationTest, ChangelogTestFile) { ChangelogDirTest test("./logs"); - DB::NuKeeperLogStore changelog("./logs", 5); + DB::NuKeeperLogStore changelog("./logs", 5, true); changelog.init(1); auto entry = getLogEntry("hello world", 77); changelog.append(entry); @@ -407,7 +409,7 @@ TEST(CoordinationTest, ChangelogTestFile) TEST(CoordinationTest, ChangelogReadWrite) { ChangelogDirTest test("./logs"); - DB::NuKeeperLogStore changelog("./logs", 1000); + DB::NuKeeperLogStore changelog("./logs", 1000, true); changelog.init(1); for (size_t i = 0; i < 10; ++i) { @@ -415,7 +417,7 @@ TEST(CoordinationTest, ChangelogReadWrite) changelog.append(entry); } EXPECT_EQ(changelog.size(), 10); - DB::NuKeeperLogStore changelog_reader("./logs", 1000); + DB::NuKeeperLogStore changelog_reader("./logs", 1000, true); changelog_reader.init(1); EXPECT_EQ(changelog_reader.size(), 10); EXPECT_EQ(changelog_reader.last_entry()->get_term(), changelog.last_entry()->get_term()); @@ -434,7 +436,7 @@ TEST(CoordinationTest, ChangelogReadWrite) TEST(CoordinationTest, ChangelogWriteAt) { ChangelogDirTest test("./logs"); - DB::NuKeeperLogStore changelog("./logs", 1000); + DB::NuKeeperLogStore changelog("./logs", 1000, true); changelog.init(1); for (size_t i = 0; i < 10; ++i) { @@ -450,7 +452,7 @@ TEST(CoordinationTest, ChangelogWriteAt) EXPECT_EQ(changelog.entry_at(7)->get_term(), 77); EXPECT_EQ(changelog.next_slot(), 8); - DB::NuKeeperLogStore changelog_reader("./logs", 1000); + DB::NuKeeperLogStore changelog_reader("./logs", 1000, true); changelog_reader.init(1); EXPECT_EQ(changelog_reader.size(), changelog.size()); @@ -463,7 +465,7 @@ TEST(CoordinationTest, ChangelogWriteAt) TEST(CoordinationTest, ChangelogTestAppendAfterRead) { ChangelogDirTest test("./logs"); - DB::NuKeeperLogStore changelog("./logs", 5); + DB::NuKeeperLogStore changelog("./logs", 5, true); changelog.init(1); for (size_t i = 0; i < 7; ++i) { @@ -475,7 +477,7 @@ 
TEST(CoordinationTest, ChangelogTestAppendAfterRead) EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin")); EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin")); - DB::NuKeeperLogStore changelog_reader("./logs", 5); + DB::NuKeeperLogStore changelog_reader("./logs", 5, true); changelog_reader.init(1); EXPECT_EQ(changelog_reader.size(), 7); @@ -511,7 +513,7 @@ TEST(CoordinationTest, ChangelogTestAppendAfterRead) TEST(CoordinationTest, ChangelogTestCompaction) { ChangelogDirTest test("./logs"); - DB::NuKeeperLogStore changelog("./logs", 5); + DB::NuKeeperLogStore changelog("./logs", 5, true); changelog.init(1); for (size_t i = 0; i < 3; ++i) @@ -552,7 +554,7 @@ TEST(CoordinationTest, ChangelogTestCompaction) EXPECT_EQ(changelog.next_slot(), 8); EXPECT_EQ(changelog.last_entry()->get_term(), 60); /// And we able to read it - DB::NuKeeperLogStore changelog_reader("./logs", 5); + DB::NuKeeperLogStore changelog_reader("./logs", 5, true); changelog_reader.init(7); EXPECT_EQ(changelog_reader.size(), 1); EXPECT_EQ(changelog_reader.start_index(), 7); @@ -563,7 +565,7 @@ TEST(CoordinationTest, ChangelogTestCompaction) TEST(CoordinationTest, ChangelogTestBatchOperations) { ChangelogDirTest test("./logs"); - DB::NuKeeperLogStore changelog("./logs", 100); + DB::NuKeeperLogStore changelog("./logs", 100, true); changelog.init(1); for (size_t i = 0; i < 10; ++i) { @@ -575,7 +577,7 @@ TEST(CoordinationTest, ChangelogTestBatchOperations) auto entries = changelog.pack(1, 5); - DB::NuKeeperLogStore apply_changelog("./logs", 100); + DB::NuKeeperLogStore apply_changelog("./logs", 100, true); apply_changelog.init(1); for (size_t i = 0; i < 10; ++i) @@ -605,7 +607,7 @@ TEST(CoordinationTest, ChangelogTestBatchOperations) TEST(CoordinationTest, ChangelogTestBatchOperationsEmpty) { ChangelogDirTest test("./logs"); - DB::NuKeeperLogStore changelog("./logs", 100); + DB::NuKeeperLogStore changelog("./logs", 100, true); changelog.init(1); for (size_t i = 0; i < 10; ++i) { @@ -618,7 +620,7 @@ TEST(CoordinationTest, ChangelogTestBatchOperationsEmpty) auto entries = changelog.pack(5, 5); ChangelogDirTest test1("./logs1"); - DB::NuKeeperLogStore changelog_new("./logs1", 100); + DB::NuKeeperLogStore changelog_new("./logs1", 100, true); changelog_new.init(1); EXPECT_EQ(changelog_new.size(), 0); @@ -637,7 +639,7 @@ TEST(CoordinationTest, ChangelogTestBatchOperationsEmpty) EXPECT_EQ(changelog_new.start_index(), 5); EXPECT_EQ(changelog_new.next_slot(), 11); - DB::NuKeeperLogStore changelog_reader("./logs1", 100); + DB::NuKeeperLogStore changelog_reader("./logs1", 100, true); changelog_reader.init(5); } @@ -645,7 +647,7 @@ TEST(CoordinationTest, ChangelogTestBatchOperationsEmpty) TEST(CoordinationTest, ChangelogTestWriteAtPreviousFile) { ChangelogDirTest test("./logs"); - DB::NuKeeperLogStore changelog("./logs", 5); + DB::NuKeeperLogStore changelog("./logs", 5, true); changelog.init(1); for (size_t i = 0; i < 33; ++i) @@ -680,7 +682,7 @@ TEST(CoordinationTest, ChangelogTestWriteAtPreviousFile) EXPECT_FALSE(fs::exists("./logs/changelog_26_30.bin")); EXPECT_FALSE(fs::exists("./logs/changelog_31_35.bin")); - DB::NuKeeperLogStore changelog_read("./logs", 5); + DB::NuKeeperLogStore changelog_read("./logs", 5, true); changelog_read.init(1); EXPECT_EQ(changelog_read.size(), 7); EXPECT_EQ(changelog_read.start_index(), 1); @@ -691,7 +693,7 @@ TEST(CoordinationTest, ChangelogTestWriteAtPreviousFile) TEST(CoordinationTest, ChangelogTestWriteAtFileBorder) { ChangelogDirTest test("./logs"); - DB::NuKeeperLogStore changelog("./logs", 5); + 
DB::NuKeeperLogStore changelog("./logs", 5, true); changelog.init(1); for (size_t i = 0; i < 33; ++i) @@ -726,7 +728,7 @@ TEST(CoordinationTest, ChangelogTestWriteAtFileBorder) EXPECT_FALSE(fs::exists("./logs/changelog_26_30.bin")); EXPECT_FALSE(fs::exists("./logs/changelog_31_35.bin")); - DB::NuKeeperLogStore changelog_read("./logs", 5); + DB::NuKeeperLogStore changelog_read("./logs", 5, true); changelog_read.init(1); EXPECT_EQ(changelog_read.size(), 11); EXPECT_EQ(changelog_read.start_index(), 1); @@ -737,7 +739,7 @@ TEST(CoordinationTest, ChangelogTestWriteAtFileBorder) TEST(CoordinationTest, ChangelogTestWriteAtAllFiles) { ChangelogDirTest test("./logs"); - DB::NuKeeperLogStore changelog("./logs", 5); + DB::NuKeeperLogStore changelog("./logs", 5, true); changelog.init(1); for (size_t i = 0; i < 33; ++i) @@ -776,7 +778,7 @@ TEST(CoordinationTest, ChangelogTestWriteAtAllFiles) TEST(CoordinationTest, ChangelogTestStartNewLogAfterRead) { ChangelogDirTest test("./logs"); - DB::NuKeeperLogStore changelog("./logs", 5); + DB::NuKeeperLogStore changelog("./logs", 5, true); changelog.init(1); for (size_t i = 0; i < 35; ++i) @@ -795,7 +797,7 @@ TEST(CoordinationTest, ChangelogTestStartNewLogAfterRead) EXPECT_FALSE(fs::exists("./logs/changelog_36_40.bin")); - DB::NuKeeperLogStore changelog_reader("./logs", 5); + DB::NuKeeperLogStore changelog_reader("./logs", 5, true); changelog_reader.init(1); auto entry = getLogEntry("36_hello_world", 360); @@ -817,7 +819,7 @@ TEST(CoordinationTest, ChangelogTestReadAfterBrokenTruncate) { ChangelogDirTest test("./logs"); - DB::NuKeeperLogStore changelog("./logs", 5); + DB::NuKeeperLogStore changelog("./logs", 5, true); changelog.init(1); for (size_t i = 0; i < 35; ++i) @@ -837,7 +839,7 @@ TEST(CoordinationTest, ChangelogTestReadAfterBrokenTruncate) DB::WriteBufferFromFile plain_buf("./logs/changelog_11_15.bin", DBMS_DEFAULT_BUFFER_SIZE, O_APPEND | O_CREAT | O_WRONLY); plain_buf.truncate(0); - DB::NuKeeperLogStore changelog_reader("./logs", 5); + DB::NuKeeperLogStore changelog_reader("./logs", 5, true); changelog_reader.init(1); EXPECT_EQ(changelog_reader.size(), 10); @@ -867,4 +869,41 @@ TEST(CoordinationTest, ChangelogTestReadAfterBrokenTruncate) EXPECT_FALSE(fs::exists("./logs/changelog_31_35.bin")); } +TEST(CoordinationTest, ChangelogTestReadAfterBrokenTruncate2) +{ + ChangelogDirTest test("./logs"); + + DB::NuKeeperLogStore changelog("./logs", 20, true); + changelog.init(1); + + for (size_t i = 0; i < 35; ++i) + { + auto entry = getLogEntry(std::to_string(i) + "_hello_world", (i + 44) * 10); + changelog.append(entry); + } + + EXPECT_TRUE(fs::exists("./logs/changelog_1_20.bin")); + EXPECT_TRUE(fs::exists("./logs/changelog_21_40.bin")); + + DB::WriteBufferFromFile plain_buf("./logs/changelog_1_20.bin", DBMS_DEFAULT_BUFFER_SIZE, O_APPEND | O_CREAT | O_WRONLY); + plain_buf.truncate(140); + + DB::NuKeeperLogStore changelog_reader("./logs", 20, true); + changelog_reader.init(1); + + EXPECT_EQ(changelog_reader.size(), 2); + EXPECT_EQ(changelog_reader.last_entry()->get_term(), 450); + EXPECT_TRUE(fs::exists("./logs/changelog_1_20.bin")); + EXPECT_FALSE(fs::exists("./logs/changelog_21_40.bin")); +} + +int main(int argc, char ** argv) +{ + Poco::AutoPtr channel(new Poco::ConsoleChannel(std::cerr)); + Poco::Logger::root().setChannel(channel); + Poco::Logger::root().setLevel("trace"); + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + #endif diff --git a/tests/config/config.d/test_keeper_port.xml b/tests/config/config.d/test_keeper_port.xml index 
44123ffe9c1..88fbf027ce7 100644 --- a/tests/config/config.d/test_keeper_port.xml +++ b/tests/config/config.d/test_keeper_port.xml @@ -9,6 +9,7 @@ 30000 0 0 + false diff --git a/tests/integration/test_testkeeper_back_to_back/configs/enable_test_keeper.xml b/tests/integration/test_testkeeper_back_to_back/configs/enable_test_keeper.xml index a8b8991f959..2cf9f8022d1 100644 --- a/tests/integration/test_testkeeper_back_to_back/configs/enable_test_keeper.xml +++ b/tests/integration/test_testkeeper_back_to_back/configs/enable_test_keeper.xml @@ -8,6 +8,7 @@ 5000 10000 trace + false From 7231a97085b34d0ee6fa14a23a085a0bd60cc01f Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Thu, 18 Feb 2021 14:15:16 +0300 Subject: [PATCH 0411/2357] Remove MaterializingStep --- .../QueryPlan/MaterializingStep.cpp | 39 ------------------- src/Processors/QueryPlan/MaterializingStep.h | 18 --------- src/Processors/ya.make | 1 - src/Storages/StorageView.cpp | 6 ++- 4 files changed, 4 insertions(+), 60 deletions(-) delete mode 100644 src/Processors/QueryPlan/MaterializingStep.cpp delete mode 100644 src/Processors/QueryPlan/MaterializingStep.h diff --git a/src/Processors/QueryPlan/MaterializingStep.cpp b/src/Processors/QueryPlan/MaterializingStep.cpp deleted file mode 100644 index f5313369020..00000000000 --- a/src/Processors/QueryPlan/MaterializingStep.cpp +++ /dev/null @@ -1,39 +0,0 @@ -#include -#include -#include - -#include - -namespace DB -{ - -static ITransformingStep::Traits getTraits() -{ - return ITransformingStep::Traits - { - { - .preserves_distinct_columns = true, - .returns_single_stream = false, - .preserves_number_of_streams = true, - .preserves_sorting = true, - }, - { - .preserves_number_of_rows = true, - } - }; -} - -MaterializingStep::MaterializingStep(const DataStream & input_stream_) - : ITransformingStep(input_stream_, materializeBlock(input_stream_.header), getTraits()) -{ -} - -void MaterializingStep::transformPipeline(QueryPipeline & pipeline) -{ - pipeline.addSimpleTransform([&](const Block & header) - { - return std::make_shared(header); - }); -} - -} diff --git a/src/Processors/QueryPlan/MaterializingStep.h b/src/Processors/QueryPlan/MaterializingStep.h deleted file mode 100644 index 72b3133dfe4..00000000000 --- a/src/Processors/QueryPlan/MaterializingStep.h +++ /dev/null @@ -1,18 +0,0 @@ -#pragma once -#include - -namespace DB -{ - -/// Materialize constants. See MaterializingTransform. -class MaterializingStep : public ITransformingStep -{ -public: - explicit MaterializingStep(const DataStream & input_stream_); - - String getName() const override { return "Materializing"; } - - void transformPipeline(QueryPipeline & pipeline) override; -}; - -} diff --git a/src/Processors/ya.make b/src/Processors/ya.make index 71ddd07f6a2..a44272cf9c0 100644 --- a/src/Processors/ya.make +++ b/src/Processors/ya.make @@ -108,7 +108,6 @@ SRCS( QueryPlan/ITransformingStep.cpp QueryPlan/LimitByStep.cpp QueryPlan/LimitStep.cpp - QueryPlan/MaterializingStep.cpp QueryPlan/MergeSortingStep.cpp QueryPlan/MergingAggregatedStep.cpp QueryPlan/MergingFinal.cpp diff --git a/src/Storages/StorageView.cpp b/src/Storages/StorageView.cpp index 38349ef8df9..1ee5ab3d0ca 100644 --- a/src/Storages/StorageView.cpp +++ b/src/Storages/StorageView.cpp @@ -15,7 +15,6 @@ #include #include -#include #include #include @@ -87,7 +86,10 @@ void StorageView::read( /// It's expected that the columns read from storage are not constant. 
/// Because method 'getSampleBlockForColumns' is used to obtain a structure of result in InterpreterSelectQuery. - auto materializing = std::make_unique(query_plan.getCurrentDataStream()); + auto materializing_actions = std::make_shared(query_plan.getCurrentDataStream().header.getColumnsWithTypeAndName()); + materializing_actions->addMaterializingOutputActions(); + + auto materializing = std::make_unique(query_plan.getCurrentDataStream(), std::move(materializing_actions)); materializing->setStepDescription("Materialize constants after VIEW subquery"); query_plan.addStep(std::move(materializing)); From 2f5b4c20aee69acc1172d8637dfa5b98cca8d4c2 Mon Sep 17 00:00:00 2001 From: Pavel Kruglov Date: Thu, 18 Feb 2021 14:21:48 +0300 Subject: [PATCH 0412/2357] Fix --- src/Client/ConnectionEstablisher.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/Client/ConnectionEstablisher.cpp b/src/Client/ConnectionEstablisher.cpp index e529d366fdc..f92d878b670 100644 --- a/src/Client/ConnectionEstablisher.cpp +++ b/src/Client/ConnectionEstablisher.cpp @@ -47,6 +47,9 @@ void ConnectionEstablisher::Routine::ReadCallback::operator()(int fd, const Poco connection_establisher.receive_timeout.setRelative(timeout); fiber = std::move(fiber).resume(); connection_establisher.receive_timeout.reset(); +#else + (void) fd; + (void) timeout; #endif } @@ -87,7 +90,7 @@ void ConnectionEstablisher::resume() bool is_receive_timeout_alarmed = false; epoll_event events[2]; - events[0].data.fd = events[1].data.fd; + events[0].data.fd = events[1].data.fd = -1; size_t ready_count = epoll.getManyReady(2, events, true); for (size_t i = 0; i != ready_count; ++i) { From 7f815325ba92e487712488e6a368ab12133421b7 Mon Sep 17 00:00:00 2001 From: alesapin Date: Thu, 18 Feb 2021 14:42:09 +0300 Subject: [PATCH 0413/2357] More tests for broken changelog read --- src/Coordination/Changelog.cpp | 42 +++++++++++++------ src/Coordination/tests/gtest_for_build.cpp | 15 +++++++ .../configs/use_test_keeper.xml | 2 +- 3 files changed, 45 insertions(+), 14 deletions(-) diff --git a/src/Coordination/Changelog.cpp b/src/Coordination/Changelog.cpp index 4358fa062e8..12943bd9272 100644 --- a/src/Coordination/Changelog.cpp +++ b/src/Coordination/Changelog.cpp @@ -142,6 +142,13 @@ private: size_t start_index; }; +struct ChangelogReadResult +{ + size_t entries_read; + off_t last_position; + bool error; +}; + class ChangelogReader { public: @@ -150,14 +157,15 @@ public: , read_buf(filepath) {} - size_t readChangelog(IndexToLogEntry & logs, size_t start_log_idx, IndexToOffset & index_to_offset) + ChangelogReadResult readChangelog(IndexToLogEntry & logs, size_t start_log_idx, IndexToOffset & index_to_offset) { - size_t total_read = 0; + size_t previous_index = 0; + ChangelogReadResult result{}; try { while (!read_buf.eof()) { - off_t pos = read_buf.count(); + result.last_position = read_buf.count(); ChangelogRecord record; readIntBinary(record.header.version, read_buf); readIntBinary(record.header.index, read_buf); @@ -168,7 +176,11 @@ public: auto buffer = nuraft::buffer::alloc(record.header.blob_size); auto buffer_begin = reinterpret_cast(buffer->data_begin()); read_buf.readStrict(buffer_begin, record.header.blob_size); - index_to_offset[record.header.index] = pos; + + if (previous_index != 0 && previous_index + 1 != record.header.index) + throw Exception(ErrorCodes::CORRUPTED_DATA, "Previous log entry {}, next log entry {}, seems like some entries skipped", previous_index, record.header.index); + + previous_index = 
record.header.index; Checksum checksum = CityHash_v1_0_2::CityHash128(buffer_begin, record.header.blob_size); if (checksum != record.header.blob_checksum) @@ -181,7 +193,7 @@ public: if (logs.count(record.header.index) != 0) throw Exception(ErrorCodes::CORRUPTED_DATA, "Duplicated index id {} in log {}", record.header.index, filepath); - total_read += 1; + result.entries_read += 1; if (record.header.index < start_log_idx) continue; @@ -189,18 +201,21 @@ public: auto log_entry = nuraft::cs_new(record.header.term, buffer, record.header.value_type); logs.emplace(record.header.index, log_entry); + index_to_offset[record.header.index] = result.last_position; } } catch (const Exception & ex) { + result.error = true; LOG_WARNING(&Poco::Logger::get("RaftChangelog"), "Cannot completely read changelog on path {}, error: {}", filepath, ex.message()); } catch (...) { + result.error = true; tryLogCurrentException(&Poco::Logger::get("RaftChangelog")); } - return total_read; + return result; } private: @@ -225,11 +240,11 @@ Changelog::Changelog(const std::string & changelogs_dir_, size_t rotate_interval void Changelog::readChangelogAndInitWriter(size_t from_log_idx) { - size_t read_from_last = 0; start_index = from_log_idx == 0 ? 1 : from_log_idx; size_t total_read = 0; size_t entries_in_last = 0; size_t incomplete_log_idx = 0; + ChangelogReadResult result{}; for (const auto & [start_idx, changelog_description] : existing_changelogs) { entries_in_last = changelog_description.to_log_idx - changelog_description.from_log_idx + 1; @@ -237,11 +252,11 @@ void Changelog::readChangelogAndInitWriter(size_t from_log_idx) if (changelog_description.to_log_idx >= from_log_idx) { ChangelogReader reader(changelog_description.path); - read_from_last = reader.readChangelog(logs, from_log_idx, index_to_start_pos); - total_read += read_from_last; + result = reader.readChangelog(logs, from_log_idx, index_to_start_pos); + total_read += result.entries_read; /// May happen after truncate and crash - if (read_from_last < entries_in_last) + if (result.entries_read < entries_in_last) { incomplete_log_idx = start_idx; break; @@ -258,12 +273,13 @@ void Changelog::readChangelogAndInitWriter(size_t from_log_idx) } } - if (!existing_changelogs.empty() && read_from_last < entries_in_last) + if (!existing_changelogs.empty() && result.entries_read < entries_in_last) { auto description = existing_changelogs.rbegin()->second; current_writer = std::make_unique(description.path, WriteMode::Append, description.from_log_idx); - current_writer->setEntriesWritten(read_from_last); - current_writer->truncateToLength(index_to_start_pos[read_from_last]); + current_writer->setEntriesWritten(result.entries_read); + if (result.error) + current_writer->truncateToLength(result.last_position); } else { diff --git a/src/Coordination/tests/gtest_for_build.cpp b/src/Coordination/tests/gtest_for_build.cpp index 3fd2db84e3e..457d0dbc52a 100644 --- a/src/Coordination/tests/gtest_for_build.cpp +++ b/src/Coordination/tests/gtest_for_build.cpp @@ -867,6 +867,11 @@ TEST(CoordinationTest, ChangelogTestReadAfterBrokenTruncate) EXPECT_FALSE(fs::exists("./logs/changelog_21_25.bin")); EXPECT_FALSE(fs::exists("./logs/changelog_26_30.bin")); EXPECT_FALSE(fs::exists("./logs/changelog_31_35.bin")); + + DB::NuKeeperLogStore changelog_reader2("./logs", 5, true); + changelog_reader2.init(1); + EXPECT_EQ(changelog_reader2.size(), 11); + EXPECT_EQ(changelog_reader2.last_entry()->get_term(), 7777); } TEST(CoordinationTest, ChangelogTestReadAfterBrokenTruncate2) @@ -895,6 +900,16 
@@ TEST(CoordinationTest, ChangelogTestReadAfterBrokenTruncate2) EXPECT_EQ(changelog_reader.last_entry()->get_term(), 450); EXPECT_TRUE(fs::exists("./logs/changelog_1_20.bin")); EXPECT_FALSE(fs::exists("./logs/changelog_21_40.bin")); + auto entry = getLogEntry("hello_world", 7777); + changelog_reader.append(entry); + EXPECT_EQ(changelog_reader.size(), 3); + EXPECT_EQ(changelog_reader.last_entry()->get_term(), 7777); + + + DB::NuKeeperLogStore changelog_reader2("./logs", 20, true); + changelog_reader2.init(1); + EXPECT_EQ(changelog_reader2.size(), 3); + EXPECT_EQ(changelog_reader2.last_entry()->get_term(), 7777); } int main(int argc, char ** argv) diff --git a/tests/integration/test_testkeeper_persistent_log/configs/use_test_keeper.xml b/tests/integration/test_testkeeper_persistent_log/configs/use_test_keeper.xml index 12dc7fd9447..2e48e91bca5 100644 --- a/tests/integration/test_testkeeper_persistent_log/configs/use_test_keeper.xml +++ b/tests/integration/test_testkeeper_persistent_log/configs/use_test_keeper.xml @@ -1,7 +1,7 @@ - node1 + node 9181 From 904b4754ccbd5a63b95402ae913c57ea2a260b5c Mon Sep 17 00:00:00 2001 From: alesapin Date: Thu, 18 Feb 2021 14:47:37 +0300 Subject: [PATCH 0414/2357] Fix tidy --- src/Coordination/Changelog.cpp | 2 +- src/Coordination/Changelog.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Coordination/Changelog.cpp b/src/Coordination/Changelog.cpp index 12943bd9272..a332ce37a8c 100644 --- a/src/Coordination/Changelog.cpp +++ b/src/Coordination/Changelog.cpp @@ -302,7 +302,7 @@ void Changelog::rotate(size_t new_start_log_idx) current_writer = std::make_unique(new_description.path, WriteMode::Rewrite, new_start_log_idx); } -ChangelogRecord Changelog::buildRecord(size_t index, nuraft::ptr log_entry) const +ChangelogRecord Changelog::buildRecord(size_t index, nuraft::ptr log_entry) { ChangelogRecordHeader header; header.index = index; diff --git a/src/Coordination/Changelog.h b/src/Coordination/Changelog.h index 38d83819da2..779d057d285 100644 --- a/src/Coordination/Changelog.h +++ b/src/Coordination/Changelog.h @@ -103,7 +103,7 @@ private: void rotate(size_t new_start_log_idx); - ChangelogRecord buildRecord(size_t index, nuraft::ptr log_entry) const; + static ChangelogRecord buildRecord(size_t index, nuraft::ptr log_entry); private: std::string changelogs_dir; From c92e613b82545c8ed13641b69a9e5ab9c2665b74 Mon Sep 17 00:00:00 2001 From: zlx19950903 <76729556+zlx19950903@users.noreply.github.com> Date: Thu, 18 Feb 2021 20:05:55 +0800 Subject: [PATCH 0415/2357] Add a function `htmlOrXmlCoarseParse` to extract content from html or xml format string. 
(#19600) * add html and xml coarse parse * add test file * add conditional check: hyperscan * fix style error * add conditional check * bug fix * delete unit * typos check fix * add unit test * style check fix * fix build error: case style * acradis_skip test fix * LINT error fix * Remove comments Co-authored-by: guojiantao Co-authored-by: Ivan <5627721+abyss7@users.noreply.github.com> Co-authored-by: Ivan Lezhankin --- docker/test/fasttest/run.sh | 1 + src/Functions/htmlOrXmlCoarseParse.cpp | 582 ++++++++++++++++++ src/Functions/registerFunctionsString.cpp | 7 +- src/Functions/ya.make | 1 + .../01674_htm_xml_coarse_parse.reference | 9 + .../01674_htm_xml_coarse_parse.sql | 15 + .../queries/0_stateless/arcadia_skip_list.txt | 1 + 7 files changed, 615 insertions(+), 1 deletion(-) create mode 100644 src/Functions/htmlOrXmlCoarseParse.cpp create mode 100644 tests/queries/0_stateless/01674_htm_xml_coarse_parse.reference create mode 100644 tests/queries/0_stateless/01674_htm_xml_coarse_parse.sql diff --git a/docker/test/fasttest/run.sh b/docker/test/fasttest/run.sh index 90663102f17..1c5f62a9e46 100755 --- a/docker/test/fasttest/run.sh +++ b/docker/test/fasttest/run.sh @@ -342,6 +342,7 @@ function run_tests # JSON functions 01666_blns + 01674_htm_xml_coarse_parse ) (time clickhouse-test --hung-check -j 8 --order=random --use-skip-list --no-long --testname --shard --zookeeper --skip "${TESTS_TO_SKIP[@]}" -- "$FASTTEST_FOCUS" 2>&1 ||:) | ts '%Y-%m-%d %H:%M:%S' | tee "$FASTTEST_OUTPUT/test_log.txt" diff --git a/src/Functions/htmlOrXmlCoarseParse.cpp b/src/Functions/htmlOrXmlCoarseParse.cpp new file mode 100644 index 00000000000..442de3d36b0 --- /dev/null +++ b/src/Functions/htmlOrXmlCoarseParse.cpp @@ -0,0 +1,582 @@ +#include +#include +#include +#include + +#include +#include +#include + +#if USE_HYPERSCAN +# include + +namespace DB +{ +namespace ErrorCodes +{ + extern const int ILLEGAL_COLUMN; + extern const int ILLEGAL_TYPE_OF_ARGUMENT; + extern const int CANNOT_ALLOCATE_MEMORY; + extern const int NOT_IMPLEMENTED; +} + +namespace +{ +struct HxCoarseParseImpl +{ +private: + struct SpanInfo + { + SpanInfo(): id(0), match_space(std::pair(0, 0)) {} // NOLINT + SpanInfo(unsigned int matchId, std::pair matchSpan): id(matchId), match_space(matchSpan){} // NOLINT + SpanInfo(const SpanInfo& obj) + { + id = obj.id; + match_space = obj.match_space; + } + SpanInfo& operator=(const SpanInfo& obj) = default; + + unsigned int id; + std::pair match_space; // NOLINT + }; + using SpanElement = std::vector; + struct Span + { + Span(): set_script(false), set_style(false), set_semi(false), is_finding_cdata(false) {} + + SpanElement copy_stack; // copy area + SpanElement tag_stack; // regexp area + SpanInfo script_ptr; // script pointer + bool set_script; // whether set script + SpanInfo style_ptr; // style pointer + bool set_style; // whether set style + SpanInfo semi_ptr; // tag ptr + bool set_semi; // whether set semi + + bool is_finding_cdata; + }; + + static inline void copyZone( + ColumnString::Offset& current_dst_string_offset, + ColumnString::Offset& current_copy_loc, + ColumnString::Chars& dst_chars, + const ColumnString::Chars& src_chars, + size_t bytes_to_copy, + unsigned is_space + ) + { + bool is_last_space = false; + if (current_dst_string_offset == 0 || dst_chars[current_dst_string_offset - 1] == 0 || dst_chars[current_dst_string_offset - 1] == ' ') + { + is_last_space = true; + } + if (bytes_to_copy == 0) + { + if (is_space && !is_last_space) + { + dst_chars[current_dst_string_offset++] = ' '; + } + 
} + else + { + if (is_last_space && src_chars[current_copy_loc] == ' ') + { + --bytes_to_copy; + ++current_copy_loc; + } + if (bytes_to_copy > 0) + { + memcpySmallAllowReadWriteOverflow15( + &dst_chars[current_dst_string_offset], &src_chars[current_copy_loc], bytes_to_copy); + current_dst_string_offset += bytes_to_copy; + } + + // separator is space and last character is not space. + if (is_space && !(current_dst_string_offset == 0 || dst_chars[current_dst_string_offset - 1] == 0 || dst_chars[current_dst_string_offset - 1] == ' ')) + { + dst_chars[current_dst_string_offset++] = ' '; + } + } + // return; + } + static inline void popArea(SpanElement& stack, unsigned long long from, unsigned long long to) //NOLINT + { + while (!stack.empty()) + { + if (to > stack.back().match_space.second && from < stack.back().match_space.second) + { + stack.pop_back(); + } + else + { + break; + } + } + // return; + } + + static void dealCommonTag(Span* matches) + { + while (!matches->copy_stack.empty() && matches->copy_stack.back().id != 10) + { + matches->copy_stack.pop_back(); + } + if (!matches->copy_stack.empty()) + { + matches->copy_stack.pop_back(); + } + unsigned long long from; // NOLINT + unsigned long long to; // NOLINT + unsigned id; + for (auto begin = matches->tag_stack.begin(); begin != matches->tag_stack.end(); ++begin) + { + from = begin->match_space.first; + to = begin->match_space.second; + id = begin->id; + switch (id) + { + case 12: + case 13: + { + popArea(matches->copy_stack, from, to); + if (matches->copy_stack.empty() || from >= matches->copy_stack.back().match_space.second) + matches->copy_stack.push_back(SpanInfo(id, std::make_pair(from, to))); + break; + } + case 0: + case 2: + case 3: + case 4: + case 5: + case 6: + case 7: + case 8: + case 9: + case 10: + { + if (!matches->set_semi || (matches->set_semi && from == matches->semi_ptr.match_space.first)) + { + matches->set_semi = true; + matches->semi_ptr = SpanInfo(id, std::make_pair(from, to)); + } + break; + } + case 1: + { + if (matches->set_semi) + { + switch (matches->semi_ptr.id) + { + case 0: + case 2: + case 3: + case 6: + case 7: + case 10: + { + if (matches->semi_ptr.id == 2 || (matches->semi_ptr.id == 3 && matches->semi_ptr.match_space.second == from)) + { + if (!matches->set_script) + { + matches->set_script = true; + matches->script_ptr = SpanInfo(matches->semi_ptr.id, std::make_pair(matches->semi_ptr.match_space.first, to)); + } + } + else if (matches->semi_ptr.id == 6 || (matches->semi_ptr.id == 7 && matches->semi_ptr.match_space.second == from)) + { + if (!matches->set_style) + { + matches->set_style = true; + matches->style_ptr = SpanInfo(matches->semi_ptr.id, std::make_pair(matches->semi_ptr.match_space.first, to)); + } + } + popArea(matches->copy_stack, matches->semi_ptr.match_space.first, to); + matches->copy_stack.push_back(SpanInfo(0, std::make_pair(matches->semi_ptr.match_space.first, to))); + matches->set_semi = false; + break; + } + case 4: + case 5: + case 8: + case 9: + { + SpanInfo complete_zone; + + complete_zone.match_space.second = to; + if (matches->set_script && (matches->semi_ptr.id == 4 || (matches->semi_ptr.id == 5 && matches->semi_ptr.match_space.second == from))) + { + complete_zone.id = matches->script_ptr.id; + complete_zone.match_space.first = matches->script_ptr.match_space.first; + matches->set_script = false; + } + else if (matches->set_style && (matches->semi_ptr.id == 8 || (matches->semi_ptr.id == 9 && matches->semi_ptr.match_space.second == from))) + { + complete_zone.id = 
matches->style_ptr.id; + complete_zone.match_space.first = matches->style_ptr.match_space.first; + matches->set_style = false; + } + else + { + complete_zone.id = matches->semi_ptr.id; + complete_zone.match_space.first = matches->semi_ptr.match_space.first; + } + popArea(matches->copy_stack, complete_zone.match_space.first, complete_zone.match_space.second); + matches->copy_stack.push_back(complete_zone); + matches->set_semi = false; + break; + } + } + } + break; + } + default: + { + break; + } + } + } + // return; + } + static int spanCollect(unsigned int id, + unsigned long long from, // NOLINT + unsigned long long to, // NOLINT + unsigned int , void * ctx) + { + Span* matches = static_cast(ctx); + from = id == 12 ? from : to - patterns_length[id]; + + if (matches->is_finding_cdata) + { + if (id == 11) + { + matches->copy_stack.push_back(SpanInfo(id, std::make_pair(from, to))); + matches->is_finding_cdata = false; + matches->tag_stack.clear(); + if (matches->semi_ptr.id == 10) + { + matches->set_semi = false; + } + } + else if (id == 12 || id == 13) + { + popArea(matches->copy_stack, from, to); + if (matches->copy_stack.empty() || from >= matches->copy_stack.back().match_space.second) + matches->copy_stack.push_back(SpanInfo(id, std::make_pair(from, to))); + + popArea(matches->tag_stack, from, to); + if (matches->tag_stack.empty() || from >= matches->tag_stack.back().match_space.second) + matches->tag_stack.push_back(SpanInfo(id, std::make_pair(from, to))); + } + else + { + popArea(matches->tag_stack, from, to); + matches->tag_stack.push_back(SpanInfo(id, std::make_pair(from, to))); + } + } + else + { + switch (id) + { + case 12: + case 13: + { + popArea(matches->copy_stack, from, to); + if (matches->copy_stack.empty() || from >= matches->copy_stack.back().match_space.second) + matches->copy_stack.push_back(SpanInfo(id, std::make_pair(from, to))); + break; + } + case 0: + case 2: + case 3: + case 4: + case 5: + case 6: + case 7: + case 8: + case 9: + { + if (!matches->set_semi || (matches->set_semi && from == matches->semi_ptr.match_space.first)) + { + matches->set_semi = true; + matches->semi_ptr = SpanInfo(id, std::make_pair(from, to)); + } + break; + } + case 10: + { + if (!matches->set_semi || (matches->set_semi && from == matches->semi_ptr.match_space.first)) + { + matches->set_semi = true; + matches->semi_ptr = SpanInfo(id, std::make_pair(from, to)); + } + matches->is_finding_cdata = true; + matches->copy_stack.push_back(SpanInfo(id, std::make_pair(from, to))); + matches->tag_stack.push_back(SpanInfo(id, std::make_pair(from, to))); + break; + } + case 1: + { + if (matches->set_semi) + { + switch (matches->semi_ptr.id) + { + case 0: + case 2: + case 3: + case 6: + case 7: + case 10: + { + if (matches->semi_ptr.id == 2 || (matches->semi_ptr.id == 3 && matches->semi_ptr.match_space.second == from)) + { + if (!matches->set_script) + { + matches->set_script = true; + matches->script_ptr = SpanInfo(matches->semi_ptr.id, std::make_pair(matches->semi_ptr.match_space.first, to)); + } + } + else if (matches->semi_ptr.id == 6 || (matches->semi_ptr.id == 7 && matches->semi_ptr.match_space.second == from)) + { + if (!matches->set_style) + { + matches->set_style = true; + matches->style_ptr = SpanInfo(matches->semi_ptr.id, std::make_pair(matches->semi_ptr.match_space.first, to)); + } + } + popArea(matches->copy_stack, matches->semi_ptr.match_space.first, to); + matches->copy_stack.push_back(SpanInfo(matches->semi_ptr.id, std::make_pair(matches->semi_ptr.match_space.first, to))); + 
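+                            // The opening tag recorded in semi_ptr has now been closed by this '>', so the pending-tag state is reset below.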
matches->set_semi = false; + break; + } + case 4: + case 5: + case 8: + case 9: + { + SpanInfo complete_zone; + complete_zone.match_space.second = to; + if (matches->set_script && (matches->semi_ptr.id == 4 || (matches->semi_ptr.id == 5 && matches->semi_ptr.match_space.second == from))) + { + complete_zone.id = matches->script_ptr.id; + complete_zone.match_space.first = matches->script_ptr.match_space.first; + matches->set_script = false; + } + else if (matches->set_style && (matches->semi_ptr.id == 8 || (matches->semi_ptr.id == 9 && matches->semi_ptr.match_space.second == from))) + { + complete_zone.id = matches->style_ptr.id; + complete_zone.match_space.first = matches->style_ptr.match_space.first; + matches->set_style = false; + } + else + { + complete_zone.id = matches->semi_ptr.id; + complete_zone.match_space.first = matches->semi_ptr.match_space.first; + } + popArea(matches->copy_stack, complete_zone.match_space.first, complete_zone.match_space.second); + matches->copy_stack.push_back(complete_zone); + matches->set_semi = false; + break; + } + } + } + break; + } + default: + { + break; + } + } + } + return 0; + } + #if USE_HYPERSCAN + static hs_database_t* buildDatabase(const std::vector &expressions, + const std::vector &flags, + const std::vector &id, + unsigned int mode) + { + hs_database_t *db; + hs_compile_error_t *compile_err; + hs_error_t err; + err = hs_compile_multi(expressions.data(), flags.data(), id.data(), + expressions.size(), mode, nullptr, &db, &compile_err); + + if (err != HS_SUCCESS) + { + hs_free_compile_error(compile_err); + throw Exception("Hyper scan database cannot be compiled.", ErrorCodes::CANNOT_ALLOCATE_MEMORY); + } + return db; + } + #endif + static std::vector patterns; + static std::vector patterns_length; + static std::vector patterns_flag; + static std::vector ids; + +public: + static void executeInternal( + const ColumnString::Chars & src_chars, + const ColumnString::Offsets & src_offsets, + ColumnString::Chars & dst_chars, + ColumnString::Offsets & dst_offsets) + { + #if USE_HYPERSCAN + hs_database_t * db = buildDatabase(patterns, patterns_flag, ids, HS_MODE_BLOCK); + hs_scratch_t* scratch = nullptr; + if (hs_alloc_scratch(db, &scratch) != HS_SUCCESS) + { + hs_free_database(db); + throw Exception("Unable to allocate scratch space.", ErrorCodes::CANNOT_ALLOCATE_MEMORY); + } + dst_chars.resize(src_chars.size()); + dst_offsets.resize(src_offsets.size()); + + ColumnString::Offset current_src_string_offset = 0; + ColumnString::Offset current_dst_string_offset = 0; + ColumnString::Offset current_copy_loc; + ColumnString::Offset current_copy_end; + unsigned is_space; + size_t bytes_to_copy; + Span match_zoneall; + + for (size_t off = 0; off < src_offsets.size(); ++off) + { + hs_scan(db, reinterpret_cast(&src_chars[current_src_string_offset]), src_offsets[off] - current_src_string_offset, 0, scratch, spanCollect, &match_zoneall); + if (match_zoneall.is_finding_cdata) + { + dealCommonTag(&match_zoneall); + } + SpanElement& match_zone = match_zoneall.copy_stack; + current_copy_loc = current_src_string_offset; + if (match_zone.empty()) + { + current_copy_end = src_offsets[off]; + is_space = 0; + } + else + { + current_copy_end = current_src_string_offset + match_zone.begin()->match_space.first; + is_space = (match_zone.begin()->id == 12 || match_zone.begin()->id == 13)?1:0; + } + + bytes_to_copy = current_copy_end - current_copy_loc; + copyZone(current_dst_string_offset, current_copy_loc, dst_chars, src_chars, bytes_to_copy, is_space); + for (auto begin = 
match_zone.begin(); begin != match_zone.end(); ++begin) + { + current_copy_loc = current_src_string_offset + begin->match_space.second; + if (begin + 1 >= match_zone.end()) + { + current_copy_end = src_offsets[off]; + is_space = 0; + } + else + { + current_copy_end = current_src_string_offset + (begin+1)->match_space.first; + is_space = ((begin+1)->id == 12 || (begin+1)->id == 13)?1:0; + } + bytes_to_copy = current_copy_end - current_copy_loc; + copyZone(current_dst_string_offset, current_copy_loc, dst_chars, src_chars, bytes_to_copy, is_space); + } + if (current_dst_string_offset > 1 && dst_chars[current_dst_string_offset - 2] == ' ') + { + dst_chars[current_dst_string_offset - 2] = 0; + --current_dst_string_offset; + } + dst_offsets[off] = current_dst_string_offset; + current_src_string_offset = src_offsets[off]; + match_zoneall.copy_stack.clear(); + match_zoneall.tag_stack.clear(); + } + dst_chars.resize(dst_chars.size()); + hs_free_scratch(scratch); + hs_free_database(db); + #else + (void)src_chars; + (void)src_offsets; + (void)dst_chars; + (void)dst_offsets; + throw Exception( + "htmlOrXmlCoarseParse is not implemented when hyperscan is off (is it x86 processor?)", + ErrorCodes::NOT_IMPLEMENTED); + #endif + } +}; + +std::vector HxCoarseParseImpl::patterns = + { + "<[^\\s<>]", // 0 "<", except "< ", "<<", "<>" + ">", // 1 ">" + " + " + " + " + " + " + " + " + " + "\\]\\]>", // 11 ]]> + "\\s{2,}", // 12 " ", continuous blanks + "[^\\S ]" // 13 "\n", "\t" and other white space, it does not include single ' '. + }; +std::vector HxCoarseParseImpl::patterns_length = + { + 2, 1, 8, 7, 9, 8, 7, 6, 8, 7, 9, 3, 0, 1 + }; +#if USE_HYPERSCAN +std::vector HxCoarseParseImpl::patterns_flag = + { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, HS_FLAG_SOM_LEFTMOST, 0 + }; +#endif +std::vector HxCoarseParseImpl::ids = + { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 + }; + +class FunctionHtmlOrXmlCoarseParse : public IFunction +{ +public: + static constexpr auto name = "htmlOrXmlCoarseParse"; + + static FunctionPtr create(const Context &) {return std::make_shared(); } + + String getName() const override {return name;} + + size_t getNumberOfArguments() const override {return 1;} + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + if (!isString(arguments[0])) + throw Exception( + "Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); + return arguments[0]; + } + + bool useDefaultImplementationForConstants() const override {return true;} + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & , size_t) const override + { + const auto & strcolumn = arguments[0].column; + if (const ColumnString* html_sentence = checkAndGetColumn(strcolumn.get())) + { + auto col_res = ColumnString::create(); + HxCoarseParseImpl::executeInternal(html_sentence->getChars(), html_sentence->getOffsets(), col_res->getChars(), col_res->getOffsets()); + return col_res; + } + else + { + throw Exception("First argument for function " + getName() + " must be string.", ErrorCodes::ILLEGAL_COLUMN); + } + } +}; +} + +void registerFunctionHtmlOrXmlCoarseParse(FunctionFactory & factory) +{ + factory.registerFunction(); +} + +} +#endif diff --git a/src/Functions/registerFunctionsString.cpp b/src/Functions/registerFunctionsString.cpp index 5cf30dd83a6..b6327dfb92f 100644 --- a/src/Functions/registerFunctionsString.cpp +++ b/src/Functions/registerFunctionsString.cpp @@ -6,7 +6,9 @@ namespace DB { class FunctionFactory; 
- +#if USE_HYPERSCAN +void registerFunctionHtmlOrXmlCoarseParse(FunctionFactory &); +#endif void registerFunctionRepeat(FunctionFactory &); void registerFunctionEmpty(FunctionFactory &); void registerFunctionNotEmpty(FunctionFactory &); @@ -45,6 +47,9 @@ void registerFunctionTryBase64Decode(FunctionFactory &); void registerFunctionsString(FunctionFactory & factory) { +#if USE_HYPERSCAN + registerFunctionHtmlOrXmlCoarseParse(factory); +#endif registerFunctionRepeat(factory); registerFunctionEmpty(factory); registerFunctionNotEmpty(factory); diff --git a/src/Functions/ya.make b/src/Functions/ya.make index ea975901077..20ba5f846a3 100644 --- a/src/Functions/ya.make +++ b/src/Functions/ya.make @@ -291,6 +291,7 @@ SRCS( hasToken.cpp hasTokenCaseInsensitive.cpp hostName.cpp + htmlOrXmlCoarseParse.cpp hypot.cpp identity.cpp if.cpp diff --git a/tests/queries/0_stateless/01674_htm_xml_coarse_parse.reference b/tests/queries/0_stateless/01674_htm_xml_coarse_parse.reference new file mode 100644 index 00000000000..63b3707b9b4 --- /dev/null +++ b/tests/queries/0_stateless/01674_htm_xml_coarse_parse.reference @@ -0,0 +1,9 @@ + + +Here is CDTATA. +This is a white space test. +This is a complex test. world '); +SELECT htmlOrXmlCoarseParse(''); +SELECT htmlOrXmlCoarseParse('This is a white space test.'); +SELECT htmlOrXmlCoarseParse('This is a complex test. Hello, world ]]>world ]]> hello\n]]>hello\n'); +DROP TABLE IF EXISTS defaults; +CREATE TABLE defaults +( + stringColumn String +) ENGINE = Memory(); + +INSERT INTO defaults values ('hello, world'), (''), (''), ('white space collapse'); + +SELECT htmlOrXmlCoarseParse(stringColumn) FROM defaults; +DROP table defaults; diff --git a/tests/queries/0_stateless/arcadia_skip_list.txt b/tests/queries/0_stateless/arcadia_skip_list.txt index b141443a979..5466fb4bfb8 100644 --- a/tests/queries/0_stateless/arcadia_skip_list.txt +++ b/tests/queries/0_stateless/arcadia_skip_list.txt @@ -197,6 +197,7 @@ 01181_db_atomic_drop_on_cluster 01658_test_base64Encode_mysql_compatibility 01659_test_base64Decode_mysql_compatibility +01674_htm_xml_coarse_parse 01675_data_type_coroutine 01676_clickhouse_client_autocomplete 01671_aggregate_function_group_bitmap_data From 2aad067e7c092af8162f1048b93c80216ec2d8f9 Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 18 Feb 2021 12:16:58 +0000 Subject: [PATCH 0416/2357] Support conversion for postgres numeric without precision and scale --- .../fetchPostgreSQLTableStructure.cpp | 35 ++++++++++++------- .../test_storage_postgresql/test.py | 8 ++--- 2 files changed, 27 insertions(+), 16 deletions(-) diff --git a/src/Databases/PostgreSQL/fetchPostgreSQLTableStructure.cpp b/src/Databases/PostgreSQL/fetchPostgreSQLTableStructure.cpp index ec23cfc8794..15ce9a1baed 100644 --- a/src/Databases/PostgreSQL/fetchPostgreSQLTableStructure.cpp +++ b/src/Databases/PostgreSQL/fetchPostgreSQLTableStructure.cpp @@ -54,19 +54,30 @@ static DataTypePtr convertPostgreSQLDataType(std::string & type, bool is_nullabl res = std::make_shared(); else if (type.starts_with("numeric")) { - /// Numeric and decimal will both end up here as numeric. - res = DataTypeFactory::instance().get(type); - uint32_t precision = getDecimalPrecision(*res); - uint32_t scale = getDecimalScale(*res); + /// Numeric and decimal will both end up here as numeric. 
If it has type and precision, + /// there will be Numeric(x, y), otherwise just Numeric + uint32_t precision, scale; + if (type.ends_with(")")) + { + res = DataTypeFactory::instance().get(type); + precision = getDecimalPrecision(*res); + scale = getDecimalScale(*res); + + if (precision <= DecimalUtils::maxPrecision()) + res = std::make_shared>(precision, scale); + else if (precision <= DecimalUtils::maxPrecision()) + res = std::make_shared>(precision, scale); + else if (precision <= DecimalUtils::maxPrecision()) + res = std::make_shared>(precision, scale); + else if (precision <= DecimalUtils::maxPrecision()) + res = std::make_shared>(precision, scale); + } + else + { + precision = DecimalUtils::maxPrecision(); + res = std::make_shared>(precision, precision); + } - if (precision <= DecimalUtils::maxPrecision()) - res = std::make_shared>(precision, scale); - else if (precision <= DecimalUtils::maxPrecision()) - res = std::make_shared>(precision, scale); - else if (precision <= DecimalUtils::maxPrecision()) - res = std::make_shared>(precision, scale); - else if (precision <= DecimalUtils::maxPrecision()) - res = std::make_shared>(precision, scale); } if (!res) diff --git a/tests/integration/test_storage_postgresql/test.py b/tests/integration/test_storage_postgresql/test.py index 4f567c19f2b..03af32a4803 100644 --- a/tests/integration/test_storage_postgresql/test.py +++ b/tests/integration/test_storage_postgresql/test.py @@ -63,13 +63,13 @@ def test_postgres_conversions(started_cluster): cursor.execute( '''CREATE TABLE IF NOT EXISTS test_types ( a smallint, b integer, c bigint, d real, e double precision, f serial, g bigserial, - h timestamp, i date, j numeric(5, 5), k decimal(5, 5))''') + h timestamp, i date, j decimal(5, 5), k numeric)''') node1.query(''' INSERT INTO TABLE FUNCTION postgresql('postgres1:5432', 'clickhouse', 'test_types', 'postgres', 'mysecretpassword') VALUES - (-32768, -2147483648, -9223372036854775808, 1.12345, 1.1234567890, 2147483647, 9223372036854775807, '2000-05-12 12:12:12', '2000-05-12', 0.2, 0.2)''') + (-32768, -2147483648, -9223372036854775808, 1.12345, 1.1234567890, 2147483647, 9223372036854775807, '2000-05-12 12:12:12', '2000-05-12', 0.22222, 0.22222)''') result = node1.query(''' - SELECT * FROM postgresql('postgres1:5432', 'clickhouse', 'test_types', 'postgres', 'mysecretpassword')''') - assert(result == '-32768\t-2147483648\t-9223372036854775808\t1.12345\t1.123456789\t2147483647\t9223372036854775807\t2000-05-12 12:12:12\t2000-05-12\t0.20000\t0.20000\n') + SELECT a, b, c, d, e, f, g, h, i, j, toDecimal32(k, 5) FROM postgresql('postgres1:5432', 'clickhouse', 'test_types', 'postgres', 'mysecretpassword')''') + assert(result == '-32768\t-2147483648\t-9223372036854775808\t1.12345\t1.123456789\t2147483647\t9223372036854775807\t2000-05-12 12:12:12\t2000-05-12\t0.22222\t0.22222\n') cursor.execute( '''CREATE TABLE IF NOT EXISTS test_array_dimensions From 77fd060665751fc6528dd9f77e0fdea41cbc23bc Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Sun, 14 Feb 2021 19:09:36 +0800 Subject: [PATCH 0417/2357] Normalize function names --- .../AggregateFunctionFactory.cpp | 16 +++-- src/Common/IFactoryWithAliases.h | 14 ++++ src/Functions/FunctionFactory.cpp | 15 +++-- src/Functions/FunctionsRound.cpp | 2 +- src/Functions/extractAllGroupsVertical.cpp | 2 +- src/Interpreters/FunctionNameNormalizer.cpp | 18 +++++ src/Interpreters/FunctionNameNormalizer.h | 14 ++++ src/Interpreters/MutationsInterpreter.cpp | 4 +- src/Interpreters/TreeRewriter.cpp | 4 ++ 
src/Interpreters/addTypeConversionToAST.cpp | 2 +- src/Interpreters/inplaceBlockConversions.cpp | 2 +- .../Impl/ConstantExpressionTemplate.cpp | 2 +- tests/integration/test_mysql_protocol/test.py | 2 +- .../00597_push_down_predicate.reference | 2 +- .../01029_early_constant_folding.reference | 2 +- ...1611_constant_folding_subqueries.reference | 2 +- ..._case_insensitive_function_names.reference | 66 +++++++++++++++++++ ...malize_case_insensitive_function_names.sql | 1 + 18 files changed, 151 insertions(+), 19 deletions(-) create mode 100644 src/Interpreters/FunctionNameNormalizer.cpp create mode 100644 src/Interpreters/FunctionNameNormalizer.h create mode 100644 tests/queries/0_stateless/01705_normalize_case_insensitive_function_names.reference create mode 100644 tests/queries/0_stateless/01705_normalize_case_insensitive_function_names.sql diff --git a/src/AggregateFunctions/AggregateFunctionFactory.cpp b/src/AggregateFunctions/AggregateFunctionFactory.cpp index 5fc690d59f2..061077dd8fa 100644 --- a/src/AggregateFunctions/AggregateFunctionFactory.cpp +++ b/src/AggregateFunctions/AggregateFunctionFactory.cpp @@ -30,6 +30,10 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } +const String & getAggregateFunctionCanonicalNameIfAny(const String & name) +{ + return AggregateFunctionFactory::instance().getCanonicalNameIfAny(name); +} void AggregateFunctionFactory::registerFunction(const String & name, Value creator_with_properties, CaseSensitiveness case_sensitiveness) { @@ -41,10 +45,14 @@ void AggregateFunctionFactory::registerFunction(const String & name, Value creat throw Exception("AggregateFunctionFactory: the aggregate function name '" + name + "' is not unique", ErrorCodes::LOGICAL_ERROR); - if (case_sensitiveness == CaseInsensitive - && !case_insensitive_aggregate_functions.emplace(Poco::toLower(name), creator_with_properties).second) - throw Exception("AggregateFunctionFactory: the case insensitive aggregate function name '" + name + "' is not unique", - ErrorCodes::LOGICAL_ERROR); + if (case_sensitiveness == CaseInsensitive) + { + auto key = Poco::toLower(name); + if (!case_insensitive_aggregate_functions.emplace(key, creator_with_properties).second) + throw Exception("AggregateFunctionFactory: the case insensitive aggregate function name '" + name + "' is not unique", + ErrorCodes::LOGICAL_ERROR); + case_insensitive_name_mapping[key] = name; + } } static DataTypes convertLowCardinalityTypesToNested(const DataTypes & types) diff --git a/src/Common/IFactoryWithAliases.h b/src/Common/IFactoryWithAliases.h index 49c03049b92..5ef795c92d0 100644 --- a/src/Common/IFactoryWithAliases.h +++ b/src/Common/IFactoryWithAliases.h @@ -35,6 +35,8 @@ protected: return name; } + std::unordered_map case_insensitive_name_mapping; + public: /// For compatibility with SQL, it's possible to specify that certain function name is case insensitive. 
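    /// Case-insensitive registrations (and case-insensitive aliases) also record the canonical spelling in case_insensitive_name_mapping above, so getCanonicalNameIfAny() can map any spelling back to the name used at registration.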
enum CaseSensitiveness @@ -68,9 +70,12 @@ public: factory_name + ": the alias name '" + alias_name + "' is already registered as real name", ErrorCodes::LOGICAL_ERROR); if (case_sensitiveness == CaseInsensitive) + { if (!case_insensitive_aliases.emplace(alias_name_lowercase, real_dict_name).second) throw Exception( factory_name + ": case insensitive alias name '" + alias_name + "' is not unique", ErrorCodes::LOGICAL_ERROR); + case_insensitive_name_mapping[alias_name_lowercase] = real_name; + } if (!aliases.emplace(alias_name, real_dict_name).second) throw Exception(factory_name + ": alias name '" + alias_name + "' is not unique", ErrorCodes::LOGICAL_ERROR); @@ -111,6 +116,15 @@ public: return getMap().count(name) || getCaseInsensitiveMap().count(name) || isAlias(name); } + /// Return the canonical name (the name used in registration) if it's different from `name`. + const String & getCanonicalNameIfAny(const String & name) const + { + auto it = case_insensitive_name_mapping.find(Poco::toLower(name)); + if (it != case_insensitive_name_mapping.end()) + return it->second; + return name; + } + virtual ~IFactoryWithAliases() override {} private: diff --git a/src/Functions/FunctionFactory.cpp b/src/Functions/FunctionFactory.cpp index 768f1cfe487..09fd360a925 100644 --- a/src/Functions/FunctionFactory.cpp +++ b/src/Functions/FunctionFactory.cpp @@ -21,6 +21,10 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } +const String & getFunctionCanonicalNameIfAny(const String & name) +{ + return FunctionFactory::instance().getCanonicalNameIfAny(name); +} void FunctionFactory::registerFunction(const std::string & name, @@ -36,10 +40,13 @@ void FunctionFactory::registerFunction(const throw Exception("FunctionFactory: the function name '" + name + "' is already registered as alias", ErrorCodes::LOGICAL_ERROR); - if (case_sensitiveness == CaseInsensitive - && !case_insensitive_functions.emplace(function_name_lowercase, creator).second) - throw Exception("FunctionFactory: the case insensitive function name '" + name + "' is not unique", - ErrorCodes::LOGICAL_ERROR); + if (case_sensitiveness == CaseInsensitive) + { + if (!case_insensitive_functions.emplace(function_name_lowercase, creator).second) + throw Exception("FunctionFactory: the case insensitive function name '" + name + "' is not unique", + ErrorCodes::LOGICAL_ERROR); + case_insensitive_name_mapping[function_name_lowercase] = name; + } } diff --git a/src/Functions/FunctionsRound.cpp b/src/Functions/FunctionsRound.cpp index b1349bd2164..c5ad27a0b90 100644 --- a/src/Functions/FunctionsRound.cpp +++ b/src/Functions/FunctionsRound.cpp @@ -8,7 +8,7 @@ namespace DB void registerFunctionsRound(FunctionFactory & factory) { factory.registerFunction("round", FunctionFactory::CaseInsensitive); - factory.registerFunction("roundBankers", FunctionFactory::CaseInsensitive); + factory.registerFunction("roundBankers", FunctionFactory::CaseSensitive); factory.registerFunction("floor", FunctionFactory::CaseInsensitive); factory.registerFunction("ceil", FunctionFactory::CaseInsensitive); factory.registerFunction("trunc", FunctionFactory::CaseInsensitive); diff --git a/src/Functions/extractAllGroupsVertical.cpp b/src/Functions/extractAllGroupsVertical.cpp index 9cbd148b016..bf33eef70f3 100644 --- a/src/Functions/extractAllGroupsVertical.cpp +++ b/src/Functions/extractAllGroupsVertical.cpp @@ -18,7 +18,7 @@ namespace DB void registerFunctionExtractAllGroupsVertical(FunctionFactory & factory) { factory.registerFunction>(); - factory.registerAlias("extractAllGroups", 
VerticalImpl::Name, FunctionFactory::CaseInsensitive); + factory.registerAlias("extractAllGroups", VerticalImpl::Name, FunctionFactory::CaseSensitive); } } diff --git a/src/Interpreters/FunctionNameNormalizer.cpp b/src/Interpreters/FunctionNameNormalizer.cpp new file mode 100644 index 00000000000..f22f72b5e03 --- /dev/null +++ b/src/Interpreters/FunctionNameNormalizer.cpp @@ -0,0 +1,18 @@ +#include + +namespace DB +{ + +const String & getFunctionCanonicalNameIfAny(const String & name); +const String & getAggregateFunctionCanonicalNameIfAny(const String & name); + +void FunctionNameNormalizer::visit(ASTPtr & ast) +{ + if (auto * node_func = ast->as()) + node_func->name = getAggregateFunctionCanonicalNameIfAny(getFunctionCanonicalNameIfAny(node_func->name)); + + for (auto & child : ast->children) + visit(child); +} + +} diff --git a/src/Interpreters/FunctionNameNormalizer.h b/src/Interpreters/FunctionNameNormalizer.h new file mode 100644 index 00000000000..2b20c28bce0 --- /dev/null +++ b/src/Interpreters/FunctionNameNormalizer.h @@ -0,0 +1,14 @@ +#pragma once + +#include +#include + +namespace DB +{ + +struct FunctionNameNormalizer +{ + static void visit(ASTPtr &); +}; + +} diff --git a/src/Interpreters/MutationsInterpreter.cpp b/src/Interpreters/MutationsInterpreter.cpp index 528b5ec6d8e..c393b214ee8 100644 --- a/src/Interpreters/MutationsInterpreter.cpp +++ b/src/Interpreters/MutationsInterpreter.cpp @@ -442,10 +442,10 @@ ASTPtr MutationsInterpreter::prepare(bool dry_run) auto type_literal = std::make_shared(columns_desc.getPhysical(column).type->getName()); const auto & update_expr = kv.second; - auto updated_column = makeASTFunction("cast", + auto updated_column = makeASTFunction("CAST", makeASTFunction("if", getPartitionAndPredicateExpressionForMutationCommand(command), - makeASTFunction("cast", + makeASTFunction("CAST", update_expr->clone(), type_literal), std::make_shared(column)), diff --git a/src/Interpreters/TreeRewriter.cpp b/src/Interpreters/TreeRewriter.cpp index fd87d86bf97..cf4db8f174e 100644 --- a/src/Interpreters/TreeRewriter.cpp +++ b/src/Interpreters/TreeRewriter.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -934,6 +935,9 @@ void TreeRewriter::normalize(ASTPtr & query, Aliases & aliases, const Settings & MarkTableIdentifiersVisitor::Data identifiers_data{aliases}; MarkTableIdentifiersVisitor(identifiers_data).visit(query); + /// Rewrite function names to their canonical ones. + FunctionNameNormalizer().visit(query); + /// Common subexpression elimination. Rewrite rules. 
QueryNormalizer::Data normalizer_data(aliases, settings); QueryNormalizer(normalizer_data).visit(query); diff --git a/src/Interpreters/addTypeConversionToAST.cpp b/src/Interpreters/addTypeConversionToAST.cpp index bb42ad79daa..18591fd732c 100644 --- a/src/Interpreters/addTypeConversionToAST.cpp +++ b/src/Interpreters/addTypeConversionToAST.cpp @@ -20,7 +20,7 @@ namespace ErrorCodes ASTPtr addTypeConversionToAST(ASTPtr && ast, const String & type_name) { - auto func = makeASTFunction("cast", ast, std::make_shared(type_name)); + auto func = makeASTFunction("CAST", ast, std::make_shared(type_name)); if (ASTWithAlias * ast_with_alias = dynamic_cast(ast.get())) { diff --git a/src/Interpreters/inplaceBlockConversions.cpp b/src/Interpreters/inplaceBlockConversions.cpp index eba03d7aa61..c9a96a81b48 100644 --- a/src/Interpreters/inplaceBlockConversions.cpp +++ b/src/Interpreters/inplaceBlockConversions.cpp @@ -43,7 +43,7 @@ void addDefaultRequiredExpressionsRecursively(const Block & block, const String RequiredSourceColumnsVisitor(columns_context).visit(column_default_expr); NameSet required_columns_names = columns_context.requiredColumns(); - auto cast_func = makeASTFunction("cast", column_default_expr, std::make_shared(columns.get(required_column).type->getName())); + auto cast_func = makeASTFunction("CAST", column_default_expr, std::make_shared(columns.get(required_column).type->getName())); default_expr_list_accum->children.emplace_back(setAlias(cast_func, required_column)); added_columns.emplace(required_column); diff --git a/src/Processors/Formats/Impl/ConstantExpressionTemplate.cpp b/src/Processors/Formats/Impl/ConstantExpressionTemplate.cpp index d7a65c2f15d..1685688f02d 100644 --- a/src/Processors/Formats/Impl/ConstantExpressionTemplate.cpp +++ b/src/Processors/Formats/Impl/ConstantExpressionTemplate.cpp @@ -626,7 +626,7 @@ void ConstantExpressionTemplate::TemplateStructure::addNodesToCastResult(const I expr = makeASTFunction("assumeNotNull", std::move(expr)); } - expr = makeASTFunction("cast", std::move(expr), std::make_shared(result_column_type.getName())); + expr = makeASTFunction("CAST", std::move(expr), std::make_shared(result_column_type.getName())); if (null_as_default) { diff --git a/tests/integration/test_mysql_protocol/test.py b/tests/integration/test_mysql_protocol/test.py index 9532d4b8ba2..7f7d59674bc 100644 --- a/tests/integration/test_mysql_protocol/test.py +++ b/tests/integration/test_mysql_protocol/test.py @@ -217,7 +217,7 @@ def test_mysql_replacement_query(mysql_client, server_address): --password=123 -e "select database();" '''.format(host=server_address, port=server_port), demux=True) assert code == 0 - assert stdout.decode() == 'database()\ndefault\n' + assert stdout.decode() == 'DATABASE()\ndefault\n' code, (stdout, stderr) = mysql_client.exec_run(''' mysql --protocol tcp -h {host} -P {port} default -u default diff --git a/tests/queries/0_stateless/00597_push_down_predicate.reference b/tests/queries/0_stateless/00597_push_down_predicate.reference index 794d9e7af5f..bd1c4791df4 100644 --- a/tests/queries/0_stateless/00597_push_down_predicate.reference +++ b/tests/queries/0_stateless/00597_push_down_predicate.reference @@ -114,7 +114,7 @@ FROM ( SELECT 1 AS id, - identity(cast(1, \'UInt8\')) AS subquery + identity(CAST(1, \'UInt8\')) AS subquery WHERE subquery = 1 ) WHERE subquery = 1 diff --git a/tests/queries/0_stateless/01029_early_constant_folding.reference b/tests/queries/0_stateless/01029_early_constant_folding.reference index 8a1d4cec388..8a2d7e6c61a 100644 --- 
a/tests/queries/0_stateless/01029_early_constant_folding.reference +++ b/tests/queries/0_stateless/01029_early_constant_folding.reference @@ -2,7 +2,7 @@ SELECT 1 WHERE 0 SELECT 1 SELECT 1 -WHERE (1 IN (0, 2)) AND (2 = (identity(cast(2, \'UInt8\')) AS subquery)) +WHERE (1 IN (0, 2)) AND (2 = (identity(CAST(2, \'UInt8\')) AS subquery)) SELECT 1 WHERE 1 IN ( ( diff --git a/tests/queries/0_stateless/01611_constant_folding_subqueries.reference b/tests/queries/0_stateless/01611_constant_folding_subqueries.reference index d10502c5860..e46fd479413 100644 --- a/tests/queries/0_stateless/01611_constant_folding_subqueries.reference +++ b/tests/queries/0_stateless/01611_constant_folding_subqueries.reference @@ -5,7 +5,7 @@ SELECT (SELECT * FROM system.numbers LIMIT 1 OFFSET 1) AS n, toUInt64(10 / n) FO 1,10 EXPLAIN SYNTAX SELECT (SELECT * FROM system.numbers LIMIT 1 OFFSET 1) AS n, toUInt64(10 / n); SELECT - identity(cast(0, \'UInt64\')) AS n, + identity(CAST(0, \'UInt64\')) AS n, toUInt64(10 / n) SELECT * FROM (WITH (SELECT * FROM system.numbers LIMIT 1 OFFSET 1) AS n, toUInt64(10 / n) as q SELECT * FROM system.one WHERE q > 0); 0 diff --git a/tests/queries/0_stateless/01705_normalize_case_insensitive_function_names.reference b/tests/queries/0_stateless/01705_normalize_case_insensitive_function_names.reference new file mode 100644 index 00000000000..5b0f7bdeb2d --- /dev/null +++ b/tests/queries/0_stateless/01705_normalize_case_insensitive_function_names.reference @@ -0,0 +1,66 @@ +SELECT + CAST(1, 'INT'), + ceil(1), + ceil(1), + char(49), + CHAR_LENGTH('1'), + CHARACTER_LENGTH('1'), + coalesce(1), + concat('1', '1'), + corr(1, 1), + cos(1), + count(), + covarPop(1, 1), + covarSamp(1, 1), + DATABASE(), + dateDiff('DAY', toDate('2020-10-24'), toDate('2019-10-24')), + exp(1), + arrayFlatten([[1]]), + floor(1), + FQDN(), + greatest(1), + 1, + ifNull(1, 1), + lower('A'), + least(1), + length('1'), + log(1), + position('1', '1'), + log(1), + log10(1), + log2(1), + lower('A'), + max(1), + substring('123', 1, 1), + min(1), + 1 % 1, + NOT 1, + now(), + now64(), + nullIf(1, 1), + pi(), + position('123', '2'), + pow(1, 1), + pow(1, 1), + rand(), + replaceAll('1', '1', '2'), + reverse('123'), + round(1), + sin(1), + sqrt(1), + stddevPop(1), + stddevSamp(1), + substring('123', 2), + substring('123', 2), + count(), + tan(1), + tanh(1), + trunc(1), + trunc(1), + upper('A'), + upper('A'), + currentUser(), + varPop(1), + varSamp(1), + toWeek(toDate('2020-10-24')), + toYearWeek(toDate('2020-10-24')) diff --git a/tests/queries/0_stateless/01705_normalize_case_insensitive_function_names.sql b/tests/queries/0_stateless/01705_normalize_case_insensitive_function_names.sql new file mode 100644 index 00000000000..9b35087182c --- /dev/null +++ b/tests/queries/0_stateless/01705_normalize_case_insensitive_function_names.sql @@ -0,0 +1 @@ +EXPLAIN SYNTAX SELECT CAST(1 AS INT), CEIL(1), CEILING(1), CHAR(49), CHAR_LENGTH('1'), CHARACTER_LENGTH('1'), COALESCE(1), CONCAT('1', '1'), CORR(1, 1), COS(1), COUNT(1), COVAR_POP(1, 1), COVAR_SAMP(1, 1), DATABASE(), DATEDIFF('DAY', toDate('2020-10-24'), toDate('2019-10-24')), EXP(1), FLATTEN([[1]]), FLOOR(1), FQDN(), GREATEST(1), IF(1, 1, 1), IFNULL(1, 1), LCASE('A'), LEAST(1), LENGTH('1'), LN(1), LOCATE('1', '1'), LOG(1), LOG10(1), LOG2(1), LOWER('A'), MAX(1), MID('123', 1, 1), MIN(1), MOD(1, 1), NOT(1), NOW(), NOW64(), NULLIF(1, 1), PI(), POSITION('123', '2'), POW(1, 1), POWER(1, 1), RAND(), REPLACE('1', '1', '2'), REVERSE('123'), ROUND(1), SIN(1), SQRT(1), STDDEV_POP(1), STDDEV_SAMP(1), 
SUBSTR('123', 2), SUBSTRING('123', 2), SUM(1), TAN(1), TANH(1), TRUNC(1), TRUNCATE(1), UCASE('A'), UPPER('A'), USER(), VAR_POP(1), VAR_SAMP(1), WEEK(toDate('2020-10-24')), YEARWEEK(toDate('2020-10-24')) format TSVRaw; From 2dc7ba160a3bdc61765b12336edf753a0100f923 Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Sun, 14 Feb 2021 20:53:50 +0800 Subject: [PATCH 0418/2357] Better --- src/Interpreters/FunctionNameNormalizer.cpp | 27 +++++++++++++++++-- src/Interpreters/FunctionNameNormalizer.h | 2 +- src/Interpreters/InterpreterCreateQuery.cpp | 2 ++ ...OptimizeIfWithConstantConditionVisitor.cpp | 2 +- src/Interpreters/TreeRewriter.cpp | 2 +- src/Interpreters/inplaceBlockConversions.cpp | 2 +- src/Parsers/ExpressionElementParsers.cpp | 2 +- ...56_test_query_log_factories_info.reference | 2 +- 8 files changed, 33 insertions(+), 8 deletions(-) diff --git a/src/Interpreters/FunctionNameNormalizer.cpp b/src/Interpreters/FunctionNameNormalizer.cpp index f22f72b5e03..36ccc9340ea 100644 --- a/src/Interpreters/FunctionNameNormalizer.cpp +++ b/src/Interpreters/FunctionNameNormalizer.cpp @@ -1,18 +1,41 @@ #include +#include +#include + namespace DB { const String & getFunctionCanonicalNameIfAny(const String & name); const String & getAggregateFunctionCanonicalNameIfAny(const String & name); -void FunctionNameNormalizer::visit(ASTPtr & ast) +void FunctionNameNormalizer::visit(IAST * ast) { + if (!ast) + return; + + if (auto * node_storage = ast->as()) + { + visit(node_storage->partition_by); + visit(node_storage->primary_key); + visit(node_storage->order_by); + visit(node_storage->sample_by); + visit(node_storage->ttl_table); + return; + } + + if (auto * node_decl = ast->as()) + { + visit(node_decl->default_expression.get()); + visit(node_decl->ttl.get()); + return; + } + if (auto * node_func = ast->as()) node_func->name = getAggregateFunctionCanonicalNameIfAny(getFunctionCanonicalNameIfAny(node_func->name)); for (auto & child : ast->children) - visit(child); + visit(child.get()); } } diff --git a/src/Interpreters/FunctionNameNormalizer.h b/src/Interpreters/FunctionNameNormalizer.h index 2b20c28bce0..3f22bb2f627 100644 --- a/src/Interpreters/FunctionNameNormalizer.h +++ b/src/Interpreters/FunctionNameNormalizer.h @@ -8,7 +8,7 @@ namespace DB struct FunctionNameNormalizer { - static void visit(ASTPtr &); + static void visit(IAST *); }; } diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index e9a11b9eb0d..bc38d4e3821 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -56,6 +56,7 @@ #include #include #include +#include #include #include @@ -1118,6 +1119,7 @@ void InterpreterCreateQuery::prepareOnClusterQuery(ASTCreateQuery & create, cons BlockIO InterpreterCreateQuery::execute() { + FunctionNameNormalizer().visit(query_ptr.get()); auto & create = query_ptr->as(); if (!create.cluster.empty()) { diff --git a/src/Interpreters/OptimizeIfWithConstantConditionVisitor.cpp b/src/Interpreters/OptimizeIfWithConstantConditionVisitor.cpp index dee4c69118b..cdcf6f7dddd 100644 --- a/src/Interpreters/OptimizeIfWithConstantConditionVisitor.cpp +++ b/src/Interpreters/OptimizeIfWithConstantConditionVisitor.cpp @@ -29,7 +29,7 @@ static bool tryExtractConstValueFromCondition(const ASTPtr & condition, bool & v /// cast of numeric constant in condition to UInt8 if (const auto * function = condition->as()) { - if (function->name == "cast") + if (function->name == "CAST") { if (const auto * expr_list = 
function->arguments->as()) { diff --git a/src/Interpreters/TreeRewriter.cpp b/src/Interpreters/TreeRewriter.cpp index cf4db8f174e..7b1a960d435 100644 --- a/src/Interpreters/TreeRewriter.cpp +++ b/src/Interpreters/TreeRewriter.cpp @@ -936,7 +936,7 @@ void TreeRewriter::normalize(ASTPtr & query, Aliases & aliases, const Settings & MarkTableIdentifiersVisitor(identifiers_data).visit(query); /// Rewrite function names to their canonical ones. - FunctionNameNormalizer().visit(query); + FunctionNameNormalizer().visit(query.get()); /// Common subexpression elimination. Rewrite rules. QueryNormalizer::Data normalizer_data(aliases, settings); diff --git a/src/Interpreters/inplaceBlockConversions.cpp b/src/Interpreters/inplaceBlockConversions.cpp index c9a96a81b48..d06cde99425 100644 --- a/src/Interpreters/inplaceBlockConversions.cpp +++ b/src/Interpreters/inplaceBlockConversions.cpp @@ -79,7 +79,7 @@ ASTPtr convertRequiredExpressions(Block & block, const NamesAndTypesList & requi continue; auto cast_func = makeASTFunction( - "cast", std::make_shared(required_column.name), std::make_shared(required_column.type->getName())); + "CAST", std::make_shared(required_column.name), std::make_shared(required_column.type->getName())); conversion_expr_list->children.emplace_back(setAlias(cast_func, required_column.name)); diff --git a/src/Parsers/ExpressionElementParsers.cpp b/src/Parsers/ExpressionElementParsers.cpp index 3d868812304..7a426e7774d 100644 --- a/src/Parsers/ExpressionElementParsers.cpp +++ b/src/Parsers/ExpressionElementParsers.cpp @@ -864,7 +864,7 @@ bool ParserCastExpression::parseImpl(Pos & pos, ASTPtr & node, Expected & expect expr_list_args->children.push_back(std::move(type_literal)); auto func_node = std::make_shared(); - func_node->name = "cast"; + func_node->name = "CAST"; func_node->arguments = std::move(expr_list_args); func_node->children.push_back(func_node->arguments); diff --git a/tests/queries/0_stateless/01656_test_query_log_factories_info.reference b/tests/queries/0_stateless/01656_test_query_log_factories_info.reference index 3c93cd9ec26..324890c0a5a 100644 --- a/tests/queries/0_stateless/01656_test_query_log_factories_info.reference +++ b/tests/queries/0_stateless/01656_test_query_log_factories_info.reference @@ -11,7 +11,7 @@ arraySort(used_table_functions) ['numbers'] arraySort(used_functions) -['addDays','array','arrayFlatten','cast','modulo','plus','substring','toDate','toDayOfYear','toTypeName','toWeek'] +['CAST','addDays','array','arrayFlatten','modulo','plus','substring','toDate','toDayOfYear','toTypeName','toWeek'] arraySort(used_data_type_families) ['Array','Int32','Nullable','String'] From cac9c7fc079835b4e26cf2b5ff8ad776b1369c5d Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Mon, 15 Feb 2021 00:00:47 +0800 Subject: [PATCH 0419/2357] Fix tests --- tests/queries/0_stateless/00642_cast.reference | 4 ++-- tests/queries/0_stateless/00643_cast_zookeeper.reference | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/queries/0_stateless/00642_cast.reference b/tests/queries/0_stateless/00642_cast.reference index 3d5572932fb..7f5333f590e 100644 --- a/tests/queries/0_stateless/00642_cast.reference +++ b/tests/queries/0_stateless/00642_cast.reference @@ -10,11 +10,11 @@ hello CREATE TABLE default.cast ( `x` UInt8, - `e` Enum8('hello' = 1, 'world' = 2) DEFAULT cast(x, 'Enum8(\'hello\' = 1, \'world\' = 2)') + `e` Enum8('hello' = 1, 'world' = 2) DEFAULT CAST(x, 'Enum8(\'hello\' = 1, \'world\' = 2)') ) ENGINE = MergeTree ORDER BY e SETTINGS index_granularity 
= 8192 x UInt8 -e Enum8(\'hello\' = 1, \'world\' = 2) DEFAULT cast(x, \'Enum8(\\\'hello\\\' = 1, \\\'world\\\' = 2)\') +e Enum8(\'hello\' = 1, \'world\' = 2) DEFAULT CAST(x, \'Enum8(\\\'hello\\\' = 1, \\\'world\\\' = 2)\') 1 hello diff --git a/tests/queries/0_stateless/00643_cast_zookeeper.reference b/tests/queries/0_stateless/00643_cast_zookeeper.reference index 658233be742..9123463de1a 100644 --- a/tests/queries/0_stateless/00643_cast_zookeeper.reference +++ b/tests/queries/0_stateless/00643_cast_zookeeper.reference @@ -1,12 +1,12 @@ CREATE TABLE default.cast1 ( `x` UInt8, - `e` Enum8('hello' = 1, 'world' = 2) DEFAULT cast(x, 'Enum8(\'hello\' = 1, \'world\' = 2)') + `e` Enum8('hello' = 1, 'world' = 2) DEFAULT CAST(x, 'Enum8(\'hello\' = 1, \'world\' = 2)') ) ENGINE = ReplicatedMergeTree('/clickhouse/tables/test_00643/cast', 'r1') ORDER BY e SETTINGS index_granularity = 8192 x UInt8 -e Enum8(\'hello\' = 1, \'world\' = 2) DEFAULT cast(x, \'Enum8(\\\'hello\\\' = 1, \\\'world\\\' = 2)\') +e Enum8(\'hello\' = 1, \'world\' = 2) DEFAULT CAST(x, \'Enum8(\\\'hello\\\' = 1, \\\'world\\\' = 2)\') 1 hello 1 hello From f402aa4057814078b7b7ef2e0175ab2753d2bced Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Wed, 17 Feb 2021 23:36:37 +0800 Subject: [PATCH 0420/2357] Normalize constant expression --- src/Interpreters/evaluateConstantExpression.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Interpreters/evaluateConstantExpression.cpp b/src/Interpreters/evaluateConstantExpression.cpp index 02ef3426483..70b9baa544f 100644 --- a/src/Interpreters/evaluateConstantExpression.cpp +++ b/src/Interpreters/evaluateConstantExpression.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -35,6 +36,7 @@ std::pair> evaluateConstantExpression(co auto ast = node->clone(); ReplaceQueryParameterVisitor param_visitor(context.getQueryParameters()); param_visitor.visit(ast); + FunctionNameNormalizer().visit(ast.get()); String name = ast->getColumnName(); auto syntax_result = TreeRewriter(context).analyze(ast, source_columns); ExpressionActionsPtr expr_for_constant_folding = ExpressionAnalyzer(ast, syntax_result, context).getConstActions(); From 2c4bc43014c510292340954647fbebf0f72620e9 Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Thu, 18 Feb 2021 11:27:24 +0800 Subject: [PATCH 0421/2357] Backward compatible --- src/Core/Settings.h | 1 + src/Interpreters/TreeRewriter.cpp | 3 ++- src/Interpreters/evaluateConstantExpression.cpp | 5 ++++- src/Server/TCPHandler.cpp | 6 ++++++ 4 files changed, 13 insertions(+), 2 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 9bb9ad30f15..4c5fe93bb03 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -383,6 +383,7 @@ class IColumn; M(Bool, optimize_if_chain_to_multiif, false, "Replace if(cond1, then1, if(cond2, ...)) chains to multiIf. Currently it's not beneficial for numeric types.", 0) \ M(Bool, optimize_if_transform_strings_to_enum, false, "Replaces string-type arguments in If and Transform to enum. Disabled by default cause it could make inconsistent change in distributed query that would lead to its fail.", 0) \ M(Bool, optimize_monotonous_functions_in_order_by, true, "Replace monotonous function with its argument in ORDER BY", 0) \ + M(Bool, normalize_function_names, true, "Normalize function names to their canonical names", 0) \ M(Bool, allow_experimental_alter_materialized_view_structure, false, "Allow atomic alter on Materialized views. 
Work in progress.", 0) \ M(Bool, enable_early_constant_folding, true, "Enable query optimization where we analyze function and subqueries results and rewrite query if there're constants there", 0) \ M(Bool, deduplicate_blocks_in_dependent_materialized_views, false, "Should deduplicate blocks for materialized views if the block is not a duplicate for the table. Use true to always deduplicate in dependent tables.", 0) \ diff --git a/src/Interpreters/TreeRewriter.cpp b/src/Interpreters/TreeRewriter.cpp index 7b1a960d435..37f49874e0a 100644 --- a/src/Interpreters/TreeRewriter.cpp +++ b/src/Interpreters/TreeRewriter.cpp @@ -936,7 +936,8 @@ void TreeRewriter::normalize(ASTPtr & query, Aliases & aliases, const Settings & MarkTableIdentifiersVisitor(identifiers_data).visit(query); /// Rewrite function names to their canonical ones. - FunctionNameNormalizer().visit(query.get()); + if (settings.normalize_function_names) + FunctionNameNormalizer().visit(query.get()); /// Common subexpression elimination. Rewrite rules. QueryNormalizer::Data normalizer_data(aliases, settings); diff --git a/src/Interpreters/evaluateConstantExpression.cpp b/src/Interpreters/evaluateConstantExpression.cpp index 70b9baa544f..42e96bae07b 100644 --- a/src/Interpreters/evaluateConstantExpression.cpp +++ b/src/Interpreters/evaluateConstantExpression.cpp @@ -36,7 +36,10 @@ std::pair> evaluateConstantExpression(co auto ast = node->clone(); ReplaceQueryParameterVisitor param_visitor(context.getQueryParameters()); param_visitor.visit(ast); - FunctionNameNormalizer().visit(ast.get()); + + if (context.getSettingsRef().normalize_function_names) + FunctionNameNormalizer().visit(ast.get()); + String name = ast->getColumnName(); auto syntax_result = TreeRewriter(context).analyze(ast, source_columns); ExpressionActionsPtr expr_for_constant_folding = ExpressionAnalyzer(ast, syntax_result, context).getConstActions(); diff --git a/src/Server/TCPHandler.cpp b/src/Server/TCPHandler.cpp index c207d188a85..430a01bb97a 100644 --- a/src/Server/TCPHandler.cpp +++ b/src/Server/TCPHandler.cpp @@ -1133,6 +1133,12 @@ void TCPHandler::receiveQuery() } query_context->applySettingsChanges(settings_changes); + /// Disable function name normalization it's not an initial query. + if (client_info.query_kind != ClientInfo::QueryKind::INITIAL_QUERY) + { + query_context->setSetting("normalize_function_names", Field(0)); + } + // Use the received query id, or generate a random default. It is convenient // to also generate the default OpenTelemetry trace id at the same time, and // set the trace parent. From 0449546bca7319132a99693b6634ca8684aa41f3 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Thu, 18 Feb 2021 16:13:09 +0300 Subject: [PATCH 0422/2357] Support TotalsHaving. Update test. 
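
Filter push-down now also looks through TotalsHavingStep: when the step carries no HAVING expression, the filter can be moved below it, using the non-aggregate columns of the step's input header as the allowed keys. Rows removed this way no longer contribute to the TOTALS row (the same effect already exists with enable_optimize_predicate_expression = 1). A sketch of the behaviour, based on the test added below:

    SELECT * FROM (
        SELECT y, sum(x)
        FROM (SELECT number AS x, number % 4 AS y FROM numbers(10))
        GROUP BY y WITH TOTALS
    ) WHERE y != 2;

    -- Without the push-down the TOTALS value of sum(x) is 45 (sum over all rows);
    -- with it the group y = 2 is filtered out before totals are accumulated, so TOTALS becomes 37.
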
--- .../Optimizations/filterPushDown.cpp | 38 ++++++++++++++++--- src/Processors/QueryPlan/TotalsHavingStep.h | 2 + .../01655_plan_optimizations.reference | 9 +++++ .../0_stateless/01655_plan_optimizations.sh | 12 ++++++ 4 files changed, 56 insertions(+), 5 deletions(-) diff --git a/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp b/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp index 456faeb72c2..4d01235e2fc 100644 --- a/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp +++ b/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp @@ -5,14 +5,17 @@ #include #include #include +#include +#include +#include +#include +#include +#include #include #include #include -#include "Processors/QueryPlan/FinishSortingStep.h" -#include "Processors/QueryPlan/MergeSortingStep.h" -#include "Processors/QueryPlan/MergingSortedStep.h" -#include "Processors/QueryPlan/PartialSortingStep.h" -#include +#include + #include namespace DB::ErrorCodes @@ -135,6 +138,31 @@ size_t tryPushDownFilter(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes return updated_steps; } + if (auto * totals_having = typeid_cast(child.get())) + { + /// If totals step has HAVING expression, skip it for now. + /// TODO: + /// We can merge HAING expression with current filer. + /// Alos, we can push down part of HAVING which depend only on aggregation keys. + if (totals_having->getActions()) + return 0; + + Names keys; + const auto & header = totals_having->getInputStreams().front().header; + for (const auto & column : header) + if (typeid_cast(column.type.get()) == nullptr) + keys.push_back(column.name); + + /// NOTE: this optimization changes TOTALS value. Example: + /// `select * from (select y, sum(x) from ( + /// select number as x, number % 4 as y from numbers(10) + /// ) group by y with totals) where y != 2` + /// Optimization will replace totals row `y, sum(x)` from `(0, 45)` to `(0, 37)`. + /// It is expected to ok, cause AST optimization `enable_optimize_predicate_expression = 1` also brakes it. 
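+        /// (Rows removed by the pushed-down filter never reach TotalsHaving, so they are not accumulated into the TOTALS row.)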
+ if (auto updated_steps = tryAddNewFilterStep(parent_node, nodes, keys)) + return updated_steps; + } + if (auto * array_join = typeid_cast(child.get())) { const auto & array_join_actions = array_join->arrayJoin(); diff --git a/src/Processors/QueryPlan/TotalsHavingStep.h b/src/Processors/QueryPlan/TotalsHavingStep.h index 7c1638013e5..57d5cf7aad5 100644 --- a/src/Processors/QueryPlan/TotalsHavingStep.h +++ b/src/Processors/QueryPlan/TotalsHavingStep.h @@ -28,6 +28,8 @@ public: void describeActions(FormatSettings & settings) const override; + const ActionsDAGPtr & getActions() const { return actions_dag; } + private: bool overflow_row; ActionsDAGPtr actions_dag; diff --git a/tests/queries/0_stateless/01655_plan_optimizations.reference b/tests/queries/0_stateless/01655_plan_optimizations.reference index 7bc75dc0bf6..fa83c098412 100644 --- a/tests/queries/0_stateless/01655_plan_optimizations.reference +++ b/tests/queries/0_stateless/01655_plan_optimizations.reference @@ -112,3 +112,12 @@ PartialSorting Filter column: and(notEquals(x, 0), notEquals(y, 0)) 1 2 1 1 +> filter is pushed down before TOTALS HAVING and aggregating +TotalsHaving +Aggregating +Filter column: notEquals(y, 2) +0 12 +1 15 +3 10 + +0 37 diff --git a/tests/queries/0_stateless/01655_plan_optimizations.sh b/tests/queries/0_stateless/01655_plan_optimizations.sh index f770643fc41..e47b03661e4 100755 --- a/tests/queries/0_stateless/01655_plan_optimizations.sh +++ b/tests/queries/0_stateless/01655_plan_optimizations.sh @@ -135,3 +135,15 @@ $CLICKHOUSE_CLIENT -q " select number % 2 as x, number % 3 as y from numbers(6) order by y desc ) where x != 0 and y != 0 settings enable_optimize_predicate_expression = 0" + +echo "> filter is pushed down before TOTALS HAVING and aggregating" +$CLICKHOUSE_CLIENT -q " + explain actions = 1 select * from ( + select y, sum(x) from (select number as x, number % 4 as y from numbers(10)) group by y with totals + ) where y != 2 + settings enable_optimize_predicate_expression=0" | + grep -o "TotalsHaving\|Aggregating\|Filter column: notEquals(y, 2)" +$CLICKHOUSE_CLIENT -q " + select * from ( + select y, sum(x) from (select number as x, number % 4 as y from numbers(10)) group by y with totals + ) where y != 2" \ No newline at end of file From 97f4c457ec979fc489892472dfb50a93062b4ce5 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Thu, 18 Feb 2021 16:27:51 +0300 Subject: [PATCH 0423/2357] fix MySQL COMM_FIELD_LIST response --- docker/test/fasttest/run.sh | 1 + docker/test/stateless/Dockerfile | 3 ++- src/Core/MySQL/PacketsProtocolText.cpp | 22 +++++++++++++--- src/Core/MySQL/PacketsProtocolText.h | 5 +++- src/Server/MySQLHandler.cpp | 2 +- .../01176_mysql_client_interactive.expect | 26 +++++++++++++++++++ .../01176_mysql_client_interactive.reference | 0 tests/queries/shell_config.sh | 13 ++++++++++ 8 files changed, 65 insertions(+), 7 deletions(-) create mode 100755 tests/queries/0_stateless/01176_mysql_client_interactive.expect create mode 100644 tests/queries/0_stateless/01176_mysql_client_interactive.reference diff --git a/docker/test/fasttest/run.sh b/docker/test/fasttest/run.sh index e6294b5d74d..7e7c8116901 100755 --- a/docker/test/fasttest/run.sh +++ b/docker/test/fasttest/run.sh @@ -259,6 +259,7 @@ function run_tests 00929_multi_match_edit_distance 01681_hyperscan_debug_assertion + 01176_mysql_client_interactive # requires mysql client 01031_mutations_interpreter_and_context 01053_ssd_dictionary # this test mistakenly requires acces to /var/lib/clickhouse -- can't run this locally, 
disabled 01083_expressions_in_engine_arguments diff --git a/docker/test/stateless/Dockerfile b/docker/test/stateless/Dockerfile index b063f8d81f6..f2e3016692f 100644 --- a/docker/test/stateless/Dockerfile +++ b/docker/test/stateless/Dockerfile @@ -23,7 +23,8 @@ RUN apt-get update -y \ telnet \ tree \ unixodbc \ - wget + wget \ + mysql-client-5.7 RUN pip3 install numpy scipy pandas diff --git a/src/Core/MySQL/PacketsProtocolText.cpp b/src/Core/MySQL/PacketsProtocolText.cpp index ad34cd8c28d..62efe549b33 100644 --- a/src/Core/MySQL/PacketsProtocolText.cpp +++ b/src/Core/MySQL/PacketsProtocolText.cpp @@ -62,10 +62,10 @@ ColumnDefinition::ColumnDefinition() ColumnDefinition::ColumnDefinition( String schema_, String table_, String org_table_, String name_, String org_name_, uint16_t character_set_, uint32_t column_length_, - ColumnType column_type_, uint16_t flags_, uint8_t decimals_) + ColumnType column_type_, uint16_t flags_, uint8_t decimals_, bool with_defaults_) : schema(std::move(schema_)), table(std::move(table_)), org_table(std::move(org_table_)), name(std::move(name_)), org_name(std::move(org_name_)), character_set(character_set_), column_length(column_length_), column_type(column_type_), - flags(flags_), decimals(decimals_) + flags(flags_), decimals(decimals_), is_comm_field_list_response(with_defaults_) { } @@ -77,8 +77,15 @@ ColumnDefinition::ColumnDefinition( size_t ColumnDefinition::getPayloadSize() const { - return 12 + getLengthEncodedStringSize("def") + getLengthEncodedStringSize(schema) + getLengthEncodedStringSize(table) + getLengthEncodedStringSize(org_table) + \ - getLengthEncodedStringSize(name) + getLengthEncodedStringSize(org_name) + getLengthEncodedNumberSize(next_length); + return 12 + + getLengthEncodedStringSize("def") + + getLengthEncodedStringSize(schema) + + getLengthEncodedStringSize(table) + + getLengthEncodedStringSize(org_table) + + getLengthEncodedStringSize(name) + + getLengthEncodedStringSize(org_name) + + getLengthEncodedNumberSize(next_length) + + is_comm_field_list_response; } void ColumnDefinition::readPayloadImpl(ReadBuffer & payload) @@ -115,6 +122,13 @@ void ColumnDefinition::writePayloadImpl(WriteBuffer & buffer) const buffer.write(reinterpret_cast(&flags), 2); buffer.write(reinterpret_cast(&decimals), 1); writeChar(0x0, 2, buffer); + if (is_comm_field_list_response) + { + /// We should write length encoded int with string size + /// followed by string with some "default values" (possibly it's column defaults). + /// But we just send NULL for simplicity. 
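+            /// 0xfb is the length-encoded representation of NULL in the MySQL wire protocol.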
+ writeChar(0xfb, buffer); + } } ColumnDefinition getColumnDefinition(const String & column_name, const TypeIndex type_index) diff --git a/src/Core/MySQL/PacketsProtocolText.h b/src/Core/MySQL/PacketsProtocolText.h index d449e94cff1..b54b1c5ca19 100644 --- a/src/Core/MySQL/PacketsProtocolText.h +++ b/src/Core/MySQL/PacketsProtocolText.h @@ -101,6 +101,9 @@ public: ColumnType column_type; uint16_t flags; uint8_t decimals = 0x00; + /// https://dev.mysql.com/doc/internals/en/com-query-response.html#column-definition + /// There are extra fields in the packet for column defaults + bool is_comm_field_list_response = false; protected: size_t getPayloadSize() const override; @@ -114,7 +117,7 @@ public: ColumnDefinition( String schema_, String table_, String org_table_, String name_, String org_name_, uint16_t character_set_, uint32_t column_length_, - ColumnType column_type_, uint16_t flags_, uint8_t decimals_); + ColumnType column_type_, uint16_t flags_, uint8_t decimals_, bool with_defaults_ = false); /// Should be used when column metadata (original name, table, original table, database) is unknown. ColumnDefinition( diff --git a/src/Server/MySQLHandler.cpp b/src/Server/MySQLHandler.cpp index 3cbe285615e..ea2813cf639 100644 --- a/src/Server/MySQLHandler.cpp +++ b/src/Server/MySQLHandler.cpp @@ -289,7 +289,7 @@ void MySQLHandler::comFieldList(ReadBuffer & payload) for (const NameAndTypePair & column : metadata_snapshot->getColumns().getAll()) { ColumnDefinition column_definition( - database, packet.table, packet.table, column.name, column.name, CharacterSet::binary, 100, ColumnType::MYSQL_TYPE_STRING, 0, 0 + database, packet.table, packet.table, column.name, column.name, CharacterSet::binary, 100, ColumnType::MYSQL_TYPE_STRING, 0, 0, true ); packet_endpoint->sendPacket(column_definition); } diff --git a/tests/queries/0_stateless/01176_mysql_client_interactive.expect b/tests/queries/0_stateless/01176_mysql_client_interactive.expect new file mode 100755 index 00000000000..d592bbe1ce2 --- /dev/null +++ b/tests/queries/0_stateless/01176_mysql_client_interactive.expect @@ -0,0 +1,26 @@ +#!/usr/bin/expect -f + +log_user 0 +set timeout 5 +match_max 100000 +# A default timeout action is to do nothing, change it to fail +expect_after { + timeout { + exit 1 + } +} + +set basedir [file dirname $argv0] +spawn bash -c "source $basedir/../shell_config.sh ; \$MYSQL_CLIENT_BINARY \$MYSQL_CLIENT_OPT" +expect "mysql> " + +send -- "USE system;\r" +expect "Database changed" + +send -- "SELECT * FROM one;\r" +expect "| dummy |" +expect "| 0 |" +expect "1 row in set" + +send -- "quit;\r" +expect eof diff --git a/tests/queries/0_stateless/01176_mysql_client_interactive.reference b/tests/queries/0_stateless/01176_mysql_client_interactive.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/shell_config.sh b/tests/queries/shell_config.sh index eed77fb107d..d20b5669cc5 100644 --- a/tests/queries/shell_config.sh +++ b/tests/queries/shell_config.sh @@ -54,6 +54,8 @@ export CLICKHOUSE_PORT_HTTP=${CLICKHOUSE_PORT_HTTP:="8123"} export CLICKHOUSE_PORT_HTTPS=${CLICKHOUSE_PORT_HTTPS:=$(${CLICKHOUSE_EXTRACT_CONFIG} --try --key=https_port 2>/dev/null)} 2>/dev/null export CLICKHOUSE_PORT_HTTPS=${CLICKHOUSE_PORT_HTTPS:="8443"} export CLICKHOUSE_PORT_HTTP_PROTO=${CLICKHOUSE_PORT_HTTP_PROTO:="http"} +export CLICKHOUSE_PORT_MYSQL=${CLICKHOUSE_PORT_MYSQL:=$(${CLICKHOUSE_EXTRACT_CONFIG} --try --key=mysql_port 2>/dev/null)} 2>/dev/null +export CLICKHOUSE_PORT_MYSQL=${CLICKHOUSE_PORT_MYSQL:="9004"} # 
Add database and log comment to url params if [ -v CLICKHOUSE_URL_PARAMS ] @@ -87,6 +89,17 @@ export CLICKHOUSE_CURL=${CLICKHOUSE_CURL:="${CLICKHOUSE_CURL_COMMAND} -q -s --ma export CLICKHOUSE_TMP=${CLICKHOUSE_TMP:="."} mkdir -p ${CLICKHOUSE_TMP} +export MYSQL_CLIENT_BINARY=${MYSQL_CLIENT_BINARY:="mysql"} +export MYSQL_CLIENT_CLICKHOUSE_USER=${MYSQL_CLIENT_CLICKHOUSE_USER:="default"} +# Avoids "Can't connect to local MySQL server through socket '/var/run/mysqld/mysqld.sock'" when connecting to localhost +[ -v CLICKHOUSE_HOST ] && MYSQL_CLIENT_OPT0+=" --protocol tcp " +[ -v CLICKHOUSE_HOST ] && MYSQL_CLIENT_OPT0+=" --host ${CLICKHOUSE_HOST} " +[ -v CLICKHOUSE_PORT_MYSQL ] && MYSQL_CLIENT_OPT0+=" --port ${CLICKHOUSE_PORT_MYSQL} " +[ -v CLICKHOUSE_DATABASE ] && MYSQL_CLIENT_OPT0+=" --database ${CLICKHOUSE_DATABASE} " +MYSQL_CLIENT_OPT0+=" --user ${MYSQL_CLIENT_CLICKHOUSE_USER} " +export MYSQL_CLIENT_OPT="${MYSQL_CLIENT_OPT0:-} ${MYSQL_CLIENT_OPT:-}" +export MYSQL_CLIENT=${MYSQL_CLIENT:="$MYSQL_CLIENT_BINARY ${MYSQL_CLIENT_OPT:-}"} + function clickhouse_client_removed_host_parameter() { # removing only `--host=value` and `--host value` (removing '-hvalue' feels to dangerous) with python regex. From 556dc81ab990803f082dc6365656e5aac58a0a03 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Thu, 18 Feb 2021 16:32:01 +0300 Subject: [PATCH 0424/2357] Fix undefined-behavior in ReservoirSamplerDeterministic.h --- .../ReservoirSamplerDeterministic.h | 40 +++++++++++++------ 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/src/AggregateFunctions/ReservoirSamplerDeterministic.h b/src/AggregateFunctions/ReservoirSamplerDeterministic.h index 3b7817e9308..3013a17e1ca 100644 --- a/src/AggregateFunctions/ReservoirSamplerDeterministic.h +++ b/src/AggregateFunctions/ReservoirSamplerDeterministic.h @@ -56,7 +56,7 @@ class ReservoirSamplerDeterministic { bool good(const UInt32 hash) { - return hash == ((hash >> skip_degree) << skip_degree); + return !(hash & skip_mask); } public: @@ -135,11 +135,8 @@ public: throw Poco::Exception("Cannot merge ReservoirSamplerDeterministic's with different max sample size"); sorted = false; - if (b.skip_degree > skip_degree) - { - skip_degree = b.skip_degree; - thinOut(); - } + if (skip_degree < b.skip_degree) + setSkipDegree(b.skip_degree); for (const auto & sample : b.samples) if (good(sample.second)) @@ -184,22 +181,39 @@ private: size_t total_values = 0; /// How many values were inserted (regardless if they remain in sample or not). bool sorted = false; Array samples; - UInt8 skip_degree = 0; /// The number N determining that we save only one per 2^N elements in average. + + /// The number N determining that we store only one per 2^N elements in average. + UInt8 skip_degree = 0; + + /// skip_mask is calculated as (2 ^ skip_degree - 1). We store an element only if (hash & skip_mask) == 0. + /// For example, if skip_degree==0 then skip_mask==0 means we store each element; + /// if skip_degree==1 then skip_mask==0b0001 means we store one per 2 elements in average; + /// if skip_degree==4 then skip_mask==0b1111 means we store one per 16 elements in average. + UInt32 skip_mask = 0; void insertImpl(const T & v, const UInt32 hash) { /// Make a room for plus one element. 
while (samples.size() >= max_sample_size) - { - ++skip_degree; - if (skip_degree > detail::MAX_SKIP_DEGREE) - throw DB::Exception{"skip_degree exceeds maximum value", DB::ErrorCodes::MEMORY_LIMIT_EXCEEDED}; - thinOut(); - } + setSkipDegree(skip_degree + 1); samples.emplace_back(v, hash); } + void setSkipDegree(UInt8 skip_degree_) + { + if (skip_degree_ == skip_degree) + return; + if (skip_degree_ > detail::MAX_SKIP_DEGREE) + throw DB::Exception{"skip_degree exceeds maximum value", DB::ErrorCodes::MEMORY_LIMIT_EXCEEDED}; + skip_degree = skip_degree_; + if (skip_degree == detail::MAX_SKIP_DEGREE) + skip_mask = static_cast(-1); + else + skip_mask = (1 << skip_degree) - 1; + thinOut(); + } + void thinOut() { samples.resize(std::distance(samples.begin(), From cd91ec8de11f7faf936b3141251a7a0f03e090ed Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Thu, 18 Feb 2021 17:13:23 +0300 Subject: [PATCH 0425/2357] fix --- programs/server/Server.cpp | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index 1bd6becfb37..991cd9699f9 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -704,6 +704,7 @@ int Server::main(const std::vector & /*args*/) main_config_zk_changed_event, [&](ConfigurationPtr config) { + static bool initial_loading = true; Settings::checkNoSettingNamesAtTopLevel(*config, config_path); /// Limit on total memory usage @@ -752,14 +753,20 @@ int Server::main(const std::vector & /*args*/) if (config->has("max_partition_size_to_drop")) global_context->setMaxPartitionSizeToDrop(config->getUInt64("max_partition_size_to_drop")); - if (config->has("zookeeper")) - global_context->reloadZooKeeperIfChanged(config); + if (!initial_loading) + { + /// We do not load ZooKeeper configuration on the first config loading + /// because TestKeeper server is not started yet. 
+ if (config->has("zookeeper")) + global_context->reloadZooKeeperIfChanged(config); - global_context->reloadAuxiliaryZooKeepersConfigIfChanged(config); + global_context->reloadAuxiliaryZooKeepersConfigIfChanged(config); + initial_loading = false; + } global_context->updateStorageConfiguration(*config); }, - /* already_loaded = */ false); + /* already_loaded = */ false); /// Reload it right now (initial loading) auto & access_control = global_context->getAccessControlManager(); if (config().has("custom_settings_prefixes")) From 1bad1e3a7ca49af3c990999ae414bc1bcc4fc3ea Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Thu, 18 Feb 2021 17:37:51 +0300 Subject: [PATCH 0426/2357] fix dockerfile --- docker/test/stateless/Dockerfile | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docker/test/stateless/Dockerfile b/docker/test/stateless/Dockerfile index f2e3016692f..ba3355db89b 100644 --- a/docker/test/stateless/Dockerfile +++ b/docker/test/stateless/Dockerfile @@ -3,6 +3,9 @@ FROM yandex/clickhouse-test-base ARG odbc_driver_url="https://github.com/ClickHouse/clickhouse-odbc/releases/download/v1.1.4.20200302/clickhouse-odbc-1.1.4-Linux.tar.gz" +RUN echo "deb [trusted=yes] http://repo.mysql.com/apt/ubuntu/ bionic mysql-5.7" >> /etc/apt/sources.list \ + && apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 8C718D3B5072E1F5 + RUN apt-get update -y \ && env DEBIAN_FRONTEND=noninteractive \ apt-get install --yes --no-install-recommends \ @@ -24,7 +27,7 @@ RUN apt-get update -y \ tree \ unixodbc \ wget \ - mysql-client-5.7 + mysql-client=5.7* RUN pip3 install numpy scipy pandas From 0336764426a2e5950dcc6ce27b6d89de09209368 Mon Sep 17 00:00:00 2001 From: alesapin Date: Thu, 18 Feb 2021 18:51:16 +0300 Subject: [PATCH 0427/2357] Fix tidy one more time --- src/Coordination/Changelog.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Coordination/Changelog.cpp b/src/Coordination/Changelog.cpp index a332ce37a8c..4a3955e23ab 100644 --- a/src/Coordination/Changelog.cpp +++ b/src/Coordination/Changelog.cpp @@ -174,7 +174,7 @@ public: readIntBinary(record.header.blob_size, read_buf); readIntBinary(record.header.blob_checksum, read_buf); auto buffer = nuraft::buffer::alloc(record.header.blob_size); - auto buffer_begin = reinterpret_cast(buffer->data_begin()); + auto * buffer_begin = reinterpret_cast(buffer->data_begin()); read_buf.readStrict(buffer_begin, record.header.blob_size); if (previous_index != 0 && previous_index + 1 != record.header.index) From 5cfe245e2203cf4ca62bc5e72897ebd358a64b5b Mon Sep 17 00:00:00 2001 From: sevirov <72220289+sevirov@users.noreply.github.com> Date: Thu, 18 Feb 2021 21:21:12 +0300 Subject: [PATCH 0428/2357] Update docs/ru/sql-reference/functions/type-conversion-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/functions/type-conversion-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index 92e674242df..1edebc26ccc 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -660,7 +660,7 @@ AS parseDateTimeBestEffortUS; ## parseDateTimeBestEffortUSOrNull {#parsedatetimebesteffortusornull} -Похожа на функцию [parseDateTimeBestEffortUS](#parsedatetimebesteffortUS), но разница состоит в том, что возвращает `NULL`, если входная строка не 
может быть преобразована в тип данных [DateTime](../../sql-reference/data-types/datetime.md). +Работает аналогично функции [parseDateTimeBestEffortUS](#parsedatetimebesteffortUS), но в отличие от нее возвращает `NULL`, если входная строка не может быть преобразована в тип данных [DateTime](../../sql-reference/data-types/datetime.md). **Синтаксис** From 1626833987b869c36096becebafbbb516939397d Mon Sep 17 00:00:00 2001 From: sevirov <72220289+sevirov@users.noreply.github.com> Date: Thu, 18 Feb 2021 21:21:25 +0300 Subject: [PATCH 0429/2357] Update docs/ru/sql-reference/functions/type-conversion-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/functions/type-conversion-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index 1edebc26ccc..80f24d53515 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -746,7 +746,7 @@ SELECT parseDateTimeBestEffortUSOrNull('10.2021') AS parseDateTimeBestEffortUSOr ## parseDateTimeBestEffortUSOrZero {#parsedatetimebesteffortusorzero} -Похожа на функцию [parseDateTimeBestEffortUS](#parsedatetimebesteffortUS), но разница в том, что возвращает нулевую дату или нулевую дату со временем, если входная строка не может быть преобразована в тип данных [DateTime](../../sql-reference/data-types/datetime.md). +Работает аналогично функции [parseDateTimeBestEffortUS](#parsedatetimebesteffortUS), но в отличие от нее возвращает нулевую дату или нулевую дату со временем, если входная строка не может быть преобразована в тип данных [DateTime](../../sql-reference/data-types/datetime.md). **Синтаксис** From 03640221a84828043770dd89e9fa2011af0ed126 Mon Sep 17 00:00:00 2001 From: Dmitriy Date: Thu, 18 Feb 2021 21:33:30 +0300 Subject: [PATCH 0430/2357] Add the zero date MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Добавил нулевую дату. --- docs/en/sql-reference/functions/type-conversion-functions.md | 2 +- docs/ru/sql-reference/functions/type-conversion-functions.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 11d54790ac2..def37cef366 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -777,7 +777,7 @@ Result: ## parseDateTimeBestEffortUSOrZero {#parsedatetimebesteffortusorzero} -Same as [parseDateTimeBestEffortUS](#parsedatetimebesteffortUS) function except that it returns zero date or zero date with time when it encounters a date format that cannot be processed. +Same as [parseDateTimeBestEffortUS](#parsedatetimebesteffortUS) function except that it returns zero date (`1970-01-01`) or zero date with time (`1970-01-01 00:00:00`) when it encounters a date format that cannot be processed. 
**Syntax** diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index 80f24d53515..4de2b5c6e3e 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -746,7 +746,7 @@ SELECT parseDateTimeBestEffortUSOrNull('10.2021') AS parseDateTimeBestEffortUSOr ## parseDateTimeBestEffortUSOrZero {#parsedatetimebesteffortusorzero} -Работает аналогично функции [parseDateTimeBestEffortUS](#parsedatetimebesteffortUS), но в отличие от нее возвращает нулевую дату или нулевую дату со временем, если входная строка не может быть преобразована в тип данных [DateTime](../../sql-reference/data-types/datetime.md). +Работает аналогично функции [parseDateTimeBestEffortUS](#parsedatetimebesteffortUS), но в отличие от нее возвращает нулевую дату (`1970-01-01`) или нулевую дату со временем (`1970-01-01 00:00:00`), если входная строка не может быть преобразована в тип данных [DateTime](../../sql-reference/data-types/datetime.md). **Синтаксис** From 0e8a951ac59d5d78f0bb7d9f1a1b78f7993560c4 Mon Sep 17 00:00:00 2001 From: sevirov <72220289+sevirov@users.noreply.github.com> Date: Thu, 18 Feb 2021 21:38:54 +0300 Subject: [PATCH 0431/2357] Update docs/ru/operations/caches.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/operations/caches.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/operations/caches.md b/docs/ru/operations/caches.md index cf7118eb1f3..9a8092c3c39 100644 --- a/docs/ru/operations/caches.md +++ b/docs/ru/operations/caches.md @@ -24,6 +24,6 @@ toc_title: Кеши - Кеш страницы ОС. -Чтобы удалить кеш, используйте выражения типа [SYSTEM DROP ... CACHE](../sql-reference/statements/system.md). +Чтобы очистить кеш, используйте выражение [SYSTEM DROP ... CACHE](../sql-reference/statements/system.md). [Оригинальная статья](https://clickhouse.tech/docs/ru/operations/caches/) From 500d3561cf9433edb2ee58542b3ebd75cdd23b33 Mon Sep 17 00:00:00 2001 From: sevirov <72220289+sevirov@users.noreply.github.com> Date: Thu, 18 Feb 2021 21:41:47 +0300 Subject: [PATCH 0432/2357] Update docs/ru/sql-reference/table-functions/file.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/table-functions/file.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/table-functions/file.md b/docs/ru/sql-reference/table-functions/file.md index ca1ac8b29db..b0b31e76098 100644 --- a/docs/ru/sql-reference/table-functions/file.md +++ b/docs/ru/sql-reference/table-functions/file.md @@ -7,7 +7,7 @@ toc_title: file Создаёт таблицу из файла. Данная табличная функция похожа на табличные функции [url](../../sql-reference/table-functions/url.md) and [hdfs](../../sql-reference/table-functions/hdfs.md). -Функция `file` может использоваться в запросах `SELECT` и `INSERT` движка таблиц [File](../../engines/table-engines/special/file.md). +Функция `file` может использоваться в запросах `SELECT` и `INSERT` при работе с движком таблиц [File](../../engines/table-engines/special/file.md). 
**Синтаксис** From b854a7b7f8e80b9701b02e5218e37965631541f7 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Thu, 18 Feb 2021 21:41:50 +0300 Subject: [PATCH 0433/2357] Add some details into comment for first_stage/second_stage Regardless distributed_group_by_no_merge=2/optimize_distributed_group_by_sharding_key --- src/Interpreters/InterpreterSelectQuery.cpp | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index 9f97160f77f..9f48a9a193b 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -561,10 +561,20 @@ Block InterpreterSelectQuery::getSampleBlockImpl() if (storage && !options.only_analyze) from_stage = storage->getQueryProcessingStage(*context, options.to_stage, query_info); - /// Do I need to perform the first part of the pipeline - running on remote servers during distributed processing. + /// Do I need to perform the first part of the pipeline? + /// Running on remote servers during distributed processing or if query is not distributed. + /// + /// Also note that with distributed_group_by_no_merge=1 or when there is + /// only one remote server, it is equal to local query in terms of query + /// stages (or when due to optimize_distributed_group_by_sharding_key the query was processed up to Complete stage). bool first_stage = from_stage < QueryProcessingStage::WithMergeableState && options.to_stage >= QueryProcessingStage::WithMergeableState; - /// Do I need to execute the second part of the pipeline - running on the initiating server during distributed processing. + /// Do I need to execute the second part of the pipeline? + /// Running on the initiating server during distributed processing or if query is not distributed. + /// + /// Also note that with distributed_group_by_no_merge=2 (i.e. when optimize_distributed_group_by_sharding_key takes place) + /// the query on the remote server will be processed up to WithMergeableStateAfterAggregation, + /// So it will do partial second stage (second_stage=true), and initiator will do the final part. bool second_stage = from_stage <= QueryProcessingStage::WithMergeableState && options.to_stage > QueryProcessingStage::WithMergeableState; From 4b4c37a7198a612367fa615b99db6d78c7978fce Mon Sep 17 00:00:00 2001 From: sevirov <72220289+sevirov@users.noreply.github.com> Date: Thu, 18 Feb 2021 21:42:38 +0300 Subject: [PATCH 0434/2357] Update docs/en/sql-reference/table-functions/file.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/en/sql-reference/table-functions/file.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/table-functions/file.md b/docs/en/sql-reference/table-functions/file.md index e4ea59aface..da0999e66eb 100644 --- a/docs/en/sql-reference/table-functions/file.md +++ b/docs/en/sql-reference/table-functions/file.md @@ -74,7 +74,7 @@ SELECT * FROM file('test.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 U ## Globs in Path {#globs-in-path} -Multiple path components can have globs. For being processed file should exists and matches to the whole path pattern (not only suffix or prefix). +Multiple path components can have globs. For being processed file must exist and match to the whole path pattern (not only suffix or prefix). - `*` — Substitutes any number of any characters except `/` including empty string. - `?` — Substitutes any single character. 
From af660140c320ca45bca0edfd89000b3c6da8ee6a Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Thu, 18 Feb 2021 21:41:50 +0300 Subject: [PATCH 0435/2357] Do only merging of sorted blocks on initiator with distributed_group_by_no_merge=2 When distributed_group_by_no_merge=2 is used (or when optimize_distributed_group_by_sharding_key takes place), remote servers will do full ORDER BY, so initiator can skip this step and do only merge of ordered blocks. --- src/Interpreters/InterpreterSelectQuery.cpp | 8 +++++++- ...buted_group_by_no_merge_order_by.reference | 20 +++++++++++++++++++ ...distributed_group_by_no_merge_order_by.sql | 20 +++++++++++++++++++ 3 files changed, 47 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/01730_distributed_group_by_no_merge_order_by.reference create mode 100644 tests/queries/0_stateless/01730_distributed_group_by_no_merge_order_by.sql diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index 9f48a9a193b..3008c55973d 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -1103,9 +1103,15 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu /** If there is an ORDER BY for distributed query processing, * but there is no aggregation, then on the remote servers ORDER BY was made * - therefore, we merge the sorted streams from remote servers. + * + * Also in case of remote servers was process the query up to WithMergeableStateAfterAggregation + * (distributed_group_by_no_merge=2 or optimize_distributed_group_by_sharding_key=1 takes place), + * then merge the sorted streams is enough, since remote servers already did full ORDER BY. */ - if (!expressions.first_stage && !expressions.need_aggregate && !(query.group_by_with_totals && !aggregate_final)) + if (from_aggregation_stage) + executeMergeSorted(query_plan, "for ORDER BY"); + else if (!expressions.first_stage && !expressions.need_aggregate && !(query.group_by_with_totals && !aggregate_final)) executeMergeSorted(query_plan, "for ORDER BY"); else /// Otherwise, just sort. 
executeOrder(query_plan, query_info.input_order_info); diff --git a/tests/queries/0_stateless/01730_distributed_group_by_no_merge_order_by.reference b/tests/queries/0_stateless/01730_distributed_group_by_no_merge_order_by.reference new file mode 100644 index 00000000000..02ae8a37e52 --- /dev/null +++ b/tests/queries/0_stateless/01730_distributed_group_by_no_merge_order_by.reference @@ -0,0 +1,20 @@ +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 diff --git a/tests/queries/0_stateless/01730_distributed_group_by_no_merge_order_by.sql b/tests/queries/0_stateless/01730_distributed_group_by_no_merge_order_by.sql new file mode 100644 index 00000000000..e43b81dca48 --- /dev/null +++ b/tests/queries/0_stateless/01730_distributed_group_by_no_merge_order_by.sql @@ -0,0 +1,20 @@ +drop table if exists data_01730; + +-- does not use 127.1 due to prefer_localhost_replica + +select * from remote('127.{2..11}', view(select * from numbers(1e6))) group by number order by number limit 20 settings distributed_group_by_no_merge=0, max_memory_usage='100Mi'; -- { serverError 241 } +-- no memory limit error, because with distributed_group_by_no_merge=2 remote servers will do ORDER BY and will cut to the LIMIT +select * from remote('127.{2..11}', view(select * from numbers(1e6))) group by number order by number limit 20 settings distributed_group_by_no_merge=2, max_memory_usage='100Mi'; + +-- since the MergingSortedTransform will start processing only when all ports (remotes) will have some data, +-- and the query with GROUP BY on remote servers will first do GROUP BY and then send the block, +-- so the initiator will first receive all blocks from remotes and only after start merging, +-- and will hit the memory limit. +select * from remote('127.{2..11}', view(select * from numbers(1e6))) group by number order by number limit 1e6 settings distributed_group_by_no_merge=2, max_memory_usage='100Mi'; -- { serverError 241 } + +-- with optimize_aggregation_in_order=1 remote servers will produce blocks more frequently, +-- since they don't need to wait until the aggregation will be finished, +-- and so the query will not hit the memory limit error. +create table data_01730 engine=MergeTree() order by key as select number key from numbers(1e6); +select * from remote('127.{2..11}', currentDatabase(), data_01730) group by key order by key limit 1e6 settings distributed_group_by_no_merge=2, max_memory_usage='100Mi', optimize_aggregation_in_order=1 format Null; +drop table data_01730; From d79ea4f38361046d4916b780f59ee893410af32e Mon Sep 17 00:00:00 2001 From: sevirov <72220289+sevirov@users.noreply.github.com> Date: Thu, 18 Feb 2021 21:44:03 +0300 Subject: [PATCH 0436/2357] Update docs/ru/sql-reference/table-functions/file.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/table-functions/file.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/table-functions/file.md b/docs/ru/sql-reference/table-functions/file.md index b0b31e76098..a36fc1411b2 100644 --- a/docs/ru/sql-reference/table-functions/file.md +++ b/docs/ru/sql-reference/table-functions/file.md @@ -74,7 +74,7 @@ SELECT * FROM file('test.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 U ## Шаблоны в компонентах пути {#globs-in-path} -Шаблоны могут содержаться в нескольких компонентах пути. Обрабатываются только существующие файлы, название которых целиком удовлетворяет шаблону (не только суффиксом или префиксом). 
+При описании пути к файлу могут использоваться шаблоны. Обрабатываются только те файлы, у которых путь и название соответствуют шаблону полностью (а не только префикс или суффикс). - `*` — заменяет любое количество любых символов кроме `/`, включая отсутствие символов. - `?` — заменяет ровно один любой символ. From 44bd6670da8511e7bae3a64d3a966c7a481ca291 Mon Sep 17 00:00:00 2001 From: sevirov <72220289+sevirov@users.noreply.github.com> Date: Thu, 18 Feb 2021 21:44:40 +0300 Subject: [PATCH 0437/2357] Update docs/ru/sql-reference/table-functions/file.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/table-functions/file.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/table-functions/file.md b/docs/ru/sql-reference/table-functions/file.md index a36fc1411b2..3cb7043929a 100644 --- a/docs/ru/sql-reference/table-functions/file.md +++ b/docs/ru/sql-reference/table-functions/file.md @@ -85,7 +85,7 @@ SELECT * FROM file('test.csv', 'CSV', 'column1 UInt32, column2 UInt32, column3 U **Пример** -Предположим у нас есть несколько файлов со следующими относительными путями: +Предположим, у нас есть несколько файлов со следующими относительными путями: - 'some_dir/some_file_1' - 'some_dir/some_file_2' From fe4419b220f802577b69be2ffb2c42acbe7ad037 Mon Sep 17 00:00:00 2001 From: sevirov <72220289+sevirov@users.noreply.github.com> Date: Thu, 18 Feb 2021 21:45:25 +0300 Subject: [PATCH 0438/2357] Update docs/ru/sql-reference/table-functions/remote.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/table-functions/remote.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/table-functions/remote.md b/docs/ru/sql-reference/table-functions/remote.md index 435fb5bb6d7..a3ddd4cfe68 100644 --- a/docs/ru/sql-reference/table-functions/remote.md +++ b/docs/ru/sql-reference/table-functions/remote.md @@ -5,7 +5,7 @@ toc_title: remote # remote, remoteSecure {#remote-remotesecure} -Позволяет обратиться к удалённым серверам без создания таблицы типа [Distributed](../../engines/table-engines/special/distributed.md). Функция `remoteSecure` такая же, как и `remote`, но с защищенным соединением. +Позволяет обратиться к удалённым серверам без создания таблицы типа [Distributed](../../engines/table-engines/special/distributed.md). Функция `remoteSecure` работает аналогично `remote`, но использует защищенное соединение. Обе функции могут быть использованы в запросах типа `SELECT` и `INSERT`. From ee168507dbef224012d8d367181ad3591c40595b Mon Sep 17 00:00:00 2001 From: sevirov <72220289+sevirov@users.noreply.github.com> Date: Thu, 18 Feb 2021 21:45:48 +0300 Subject: [PATCH 0439/2357] Update docs/ru/sql-reference/table-functions/remote.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/table-functions/remote.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/table-functions/remote.md b/docs/ru/sql-reference/table-functions/remote.md index a3ddd4cfe68..a48a176d75f 100644 --- a/docs/ru/sql-reference/table-functions/remote.md +++ b/docs/ru/sql-reference/table-functions/remote.md @@ -7,7 +7,7 @@ toc_title: remote Позволяет обратиться к удалённым серверам без создания таблицы типа [Distributed](../../engines/table-engines/special/distributed.md). Функция `remoteSecure` работает аналогично `remote`, но использует защищенное соединение. 
-Обе функции могут быть использованы в запросах типа `SELECT` и `INSERT`. +Обе функции могут использоваться в запросах `SELECT` и `INSERT`. **Синтаксис** From 95c07b19ecc933e38d82958a59996f51b4ad9e39 Mon Sep 17 00:00:00 2001 From: sevirov <72220289+sevirov@users.noreply.github.com> Date: Thu, 18 Feb 2021 21:46:13 +0300 Subject: [PATCH 0440/2357] Update docs/ru/sql-reference/table-functions/remote.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/table-functions/remote.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/table-functions/remote.md b/docs/ru/sql-reference/table-functions/remote.md index a48a176d75f..dd04f8458da 100644 --- a/docs/ru/sql-reference/table-functions/remote.md +++ b/docs/ru/sql-reference/table-functions/remote.md @@ -20,7 +20,7 @@ remoteSecure('addresses_expr', db.table[, 'user'[, 'password']]) **Параметры** -- `addresses_expr` — выражение, генерирующее адреса удалённых серверов. Это может быть просто один адрес сервера. Адрес сервера — это `хост:порт`, или только `хост`. +- `addresses_expr` — выражение, генерирующее адреса удалённых серверов. Это может быть просто один адрес сервера. Адрес сервера — это `host:port` или только `host`. Хост может быть указан в виде имени сервера, или в виде IPv4 или IPv6 адреса. IPv6 адрес указывается в квадратных скобках. From ce6263220a44ce5260a2ff28c55092ffd715c3ef Mon Sep 17 00:00:00 2001 From: sevirov <72220289+sevirov@users.noreply.github.com> Date: Thu, 18 Feb 2021 21:46:35 +0300 Subject: [PATCH 0441/2357] Update docs/ru/sql-reference/table-functions/remote.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/table-functions/remote.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/table-functions/remote.md b/docs/ru/sql-reference/table-functions/remote.md index dd04f8458da..4dbb5863cdf 100644 --- a/docs/ru/sql-reference/table-functions/remote.md +++ b/docs/ru/sql-reference/table-functions/remote.md @@ -22,7 +22,7 @@ remoteSecure('addresses_expr', db.table[, 'user'[, 'password']]) - `addresses_expr` — выражение, генерирующее адреса удалённых серверов. Это может быть просто один адрес сервера. Адрес сервера — это `host:port` или только `host`. - Хост может быть указан в виде имени сервера, или в виде IPv4 или IPv6 адреса. IPv6 адрес указывается в квадратных скобках. + Вместо параметра `host' может быть указано имя сервера или его адрес в формате IPv4 или IPv6. IPv6 адрес указывается в квадратных скобках. Порт — TCP-порт удалённого сервера. Если порт не указан, используется [tcp_port](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-tcp_port) из конфигурационного файла сервера, к которому обратились через функцию `remote` (по умолчанию - 9000), и [tcp_port_secure](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-tcp_port_secure), к которому обратились через функцию `remoteSecure` (по умолчанию — 9440). 
From ef263d03569cdf3702b97215cef4ea1810404ff2 Mon Sep 17 00:00:00 2001 From: sevirov <72220289+sevirov@users.noreply.github.com> Date: Thu, 18 Feb 2021 21:46:56 +0300 Subject: [PATCH 0442/2357] Update docs/ru/sql-reference/table-functions/remote.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/table-functions/remote.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/table-functions/remote.md b/docs/ru/sql-reference/table-functions/remote.md index 4dbb5863cdf..48eb8d0c254 100644 --- a/docs/ru/sql-reference/table-functions/remote.md +++ b/docs/ru/sql-reference/table-functions/remote.md @@ -24,7 +24,7 @@ remoteSecure('addresses_expr', db.table[, 'user'[, 'password']]) Вместо параметра `host' может быть указано имя сервера или его адрес в формате IPv4 или IPv6. IPv6 адрес указывается в квадратных скобках. - Порт — TCP-порт удалённого сервера. Если порт не указан, используется [tcp_port](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-tcp_port) из конфигурационного файла сервера, к которому обратились через функцию `remote` (по умолчанию - 9000), и [tcp_port_secure](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-tcp_port_secure), к которому обратились через функцию `remoteSecure` (по умолчанию — 9440). + `port` — TCP-порт удалённого сервера. Если порт не указан, используется [tcp_port](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-tcp_port) из конфигурационного файла сервера, к которому обратились через функцию `remote` (по умолчанию - 9000), и [tcp_port_secure](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-tcp_port_secure), к которому обратились через функцию `remoteSecure` (по умолчанию — 9440). С IPv6-адресом обязательно нужно указывать порт. From 3ba3faa156274d0ded2875c7d922b6a38ca21462 Mon Sep 17 00:00:00 2001 From: sevirov <72220289+sevirov@users.noreply.github.com> Date: Thu, 18 Feb 2021 21:47:28 +0300 Subject: [PATCH 0443/2357] Update docs/ru/sql-reference/table-functions/remote.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/table-functions/remote.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/table-functions/remote.md b/docs/ru/sql-reference/table-functions/remote.md index 48eb8d0c254..05d5938c40d 100644 --- a/docs/ru/sql-reference/table-functions/remote.md +++ b/docs/ru/sql-reference/table-functions/remote.md @@ -42,7 +42,7 @@ remoteSecure('addresses_expr', db.table[, 'user'[, 'password']]) **Использование** -Использование табличной функции `remote` менее оптимально, чем создание таблицы типа `Distributed`, так как в этом случае соединения с серверами устанавливаются заново при каждом запросе. В случае задания имён хостов делается резолвинг имён, а также не ведётся подсчёт ошибок при работе с разными репликами. При обработке большого количества запросов всегда создавайте таблицу типа `Distributed` заранее, не используйте табличную функцию `remote`. +Использование табличной функции `remote` менее оптимально, чем создание таблицы типа `Distributed`, так как в этом случае соединения с серверами устанавливаются заново при каждом запросе. Если указываются имена серверов, то приходится также выполнять поиск сервера по имени. Кроме того, не ведётся сквозной подсчёт ошибок при работе с разными репликами. 
При обработке большого количества запросов всегда создавайте таблицу типа `Distributed`, использовать табличную функцию `remote` в таких случаях не рекомендуется. Табличная функция `remote` может быть полезна в следующих случаях: From d10c9f1bd37128a37a0b9e6c416f6b6e5d8d3f80 Mon Sep 17 00:00:00 2001 From: sevirov <72220289+sevirov@users.noreply.github.com> Date: Thu, 18 Feb 2021 21:47:50 +0300 Subject: [PATCH 0444/2357] Update docs/ru/sql-reference/table-functions/remote.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/table-functions/remote.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/ru/sql-reference/table-functions/remote.md b/docs/ru/sql-reference/table-functions/remote.md index 05d5938c40d..a174bd12e94 100644 --- a/docs/ru/sql-reference/table-functions/remote.md +++ b/docs/ru/sql-reference/table-functions/remote.md @@ -46,8 +46,8 @@ remoteSecure('addresses_expr', db.table[, 'user'[, 'password']]) Табличная функция `remote` может быть полезна в следующих случаях: -- Обращение на конкретный сервер в целях сравнения данных, отладки и тестирования. -- Запросы между разными кластерами ClickHouse в целях исследований. +- Обращение на конкретный сервер для сравнения данных, отладки и тестирования. +- Запросы между разными кластерами ClickHouse для исследований. - Нечастые распределённые запросы, задаваемые вручную. - Распределённые запросы, где набор серверов определяется каждый раз заново. From 234ec940beba4aec7ae435d205acf7cfc232002e Mon Sep 17 00:00:00 2001 From: sevirov <72220289+sevirov@users.noreply.github.com> Date: Thu, 18 Feb 2021 21:48:13 +0300 Subject: [PATCH 0445/2357] Update docs/ru/sql-reference/table-functions/remote.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/table-functions/remote.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/table-functions/remote.md b/docs/ru/sql-reference/table-functions/remote.md index a174bd12e94..0cb32861d1e 100644 --- a/docs/ru/sql-reference/table-functions/remote.md +++ b/docs/ru/sql-reference/table-functions/remote.md @@ -82,7 +82,7 @@ example01-{01..02}-1 При наличии нескольких пар фигурных скобок генерируется прямое произведение соответствующих множеств. -Адреса или их фрагменты в фигурных скобках можно указать через символ \|. В этом случае соответствующие множества адресов понимаются как реплики — запрос будет отправлен на первую живую реплику. При этом реплики перебираются в порядке, согласно текущей настройке [load_balancing](../../operations/settings/settings.md#settings-load_balancing). В этом примере указано два шарда, в каждом из которых имеется две реплики: +Адреса или их фрагменты в фигурных скобках можно указать через символ \|. В этом случае соответствующие множества адресов понимаются как реплики — запрос будет отправлен на первую живую реплику. При этом реплики перебираются в порядке, согласно текущей настройке [load_balancing](../../operations/settings/settings.md#settings-load_balancing). 
В этом примере указаны два шарда, в каждом из которых имеются две реплики: ``` text example01-{01..02}-{1|2} From 3222a9aecd0c47ec232dc2277edbaec192604431 Mon Sep 17 00:00:00 2001 From: sevirov <72220289+sevirov@users.noreply.github.com> Date: Thu, 18 Feb 2021 21:48:32 +0300 Subject: [PATCH 0446/2357] Update docs/ru/sql-reference/table-functions/url.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/table-functions/url.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/table-functions/url.md b/docs/ru/sql-reference/table-functions/url.md index afb4a23b88e..ef97e269fbb 100644 --- a/docs/ru/sql-reference/table-functions/url.md +++ b/docs/ru/sql-reference/table-functions/url.md @@ -7,7 +7,7 @@ toc_title: url Функция `url` создает таблицу с помощью адреса `URL`, формата данных и структуры таблицы. -Функция `url` может быть использована в запросах `SELECT` и `INSERT` в таблицах движка [URL](../../engines/table-engines/special/url.md). +Функция `url` может быть использована в запросах `SELECT` и `INSERT` с таблицами на движке [URL](../../engines/table-engines/special/url.md). **Синтаксис** From 31e78ef9983d4e8de703f84fe3be069feb0f4297 Mon Sep 17 00:00:00 2001 From: sevirov <72220289+sevirov@users.noreply.github.com> Date: Thu, 18 Feb 2021 21:48:54 +0300 Subject: [PATCH 0447/2357] Update docs/ru/sql-reference/table-functions/url.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/table-functions/url.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/table-functions/url.md b/docs/ru/sql-reference/table-functions/url.md index ef97e269fbb..8a4f6fe7d98 100644 --- a/docs/ru/sql-reference/table-functions/url.md +++ b/docs/ru/sql-reference/table-functions/url.md @@ -27,7 +27,7 @@ url(URL, format, structure) **Примеры** -Получение первых 3 строк таблицы, содержащей столбцы типа `String` и [UInt32](../../sql-reference/data-types/int-uint.md), с HTTP-сервера в формате [CSV](../../interfaces/formats.md/#csv). +Получение с HTTP-сервера первых 3 строк таблицы с данными в формате [CSV](../../interfaces/formats.md/#csv), содержащей столбцы типа [String](../../sql-reference/data-types/string.md) и [UInt32](../../sql-reference/data-types/int-uint.md). ``` sql SELECT * FROM url('http://127.0.0.1:12345/', CSV, 'column1 String, column2 UInt32') LIMIT 3; From 5eb5180ef8de2b0d7b0751b9bf765c3abe9c0ba0 Mon Sep 17 00:00:00 2001 From: sevirov <72220289+sevirov@users.noreply.github.com> Date: Thu, 18 Feb 2021 21:49:33 +0300 Subject: [PATCH 0448/2357] Update docs/ru/sql-reference/table-functions/url.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/table-functions/url.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/table-functions/url.md b/docs/ru/sql-reference/table-functions/url.md index 8a4f6fe7d98..fe80f466f54 100644 --- a/docs/ru/sql-reference/table-functions/url.md +++ b/docs/ru/sql-reference/table-functions/url.md @@ -5,7 +5,7 @@ toc_title: url # url {#url} -Функция `url` создает таблицу с помощью адреса `URL`, формата данных и структуры таблицы. +Функция `url` берет данные по указанному адресу `URL` и создает из них таблицу указанной структуры со столбцами указанного формата. Функция `url` может быть использована в запросах `SELECT` и `INSERT` с таблицами на движке [URL](../../engines/table-engines/special/url.md). 
From 9c01869090e873603b3bb7ec1cd17fbcf264bc4f Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Thu, 18 Feb 2021 21:28:42 +0300 Subject: [PATCH 0449/2357] Fix 'Empty task was returned from async task queue' on query cancellation --- src/Processors/Executors/PipelineExecutor.cpp | 5 +++++ .../01731_async_task_queue_wait.reference | 0 .../0_stateless/01731_async_task_queue_wait.sh | 12 ++++++++++++ 3 files changed, 17 insertions(+) create mode 100644 tests/queries/0_stateless/01731_async_task_queue_wait.reference create mode 100755 tests/queries/0_stateless/01731_async_task_queue_wait.sh diff --git a/src/Processors/Executors/PipelineExecutor.cpp b/src/Processors/Executors/PipelineExecutor.cpp index 6192828784f..a724f22ed31 100644 --- a/src/Processors/Executors/PipelineExecutor.cpp +++ b/src/Processors/Executors/PipelineExecutor.cpp @@ -540,7 +540,12 @@ void PipelineExecutor::executeStepImpl(size_t thread_num, size_t num_threads, st /// If we execute in single thread, wait for async tasks here. auto res = async_task_queue.wait(lock); if (!res) + { + /// The query had been cancelled (finished is also set) + if (finished) + break; throw Exception("Empty task was returned from async task queue", ErrorCodes::LOGICAL_ERROR); + } node = static_cast(res.data); break; diff --git a/tests/queries/0_stateless/01731_async_task_queue_wait.reference b/tests/queries/0_stateless/01731_async_task_queue_wait.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/01731_async_task_queue_wait.sh b/tests/queries/0_stateless/01731_async_task_queue_wait.sh new file mode 100755 index 00000000000..eddbfdf5322 --- /dev/null +++ b/tests/queries/0_stateless/01731_async_task_queue_wait.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +# regression for 'Empty task was returned from async task queue' during query +# cancellation with async_socket_for_remote=1 (that ignores +# max_distributed_connections) +timeout 5s ${CLICKHOUSE_CLIENT} --max_distributed_connections=1 --format Null -q "select * from remote('127.{2..11}', view(select * from numbers(1e9))) group by number format Null" +# timedout +test $? -eq 124 From 865dca0b0d7c2327e56b609a56f0693d6b43c6d7 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Thu, 18 Feb 2021 22:38:21 +0300 Subject: [PATCH 0450/2357] ccache 4.2+ does not requires any quirks for SOURCE_DATE_EPOCH And besides "ccache compiler" does not work, since it interpret everything as ccache options. Refs: https://github.com/ccache/ccache/commit/cad2416291c042443cf0c045047c34a2e07e103a --- cmake/find/ccache.cmake | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/cmake/find/ccache.cmake b/cmake/find/ccache.cmake index d8e9cf9588d..d9ccd1a9ac6 100644 --- a/cmake/find/ccache.cmake +++ b/cmake/find/ccache.cmake @@ -37,15 +37,13 @@ if (CCACHE_FOUND AND NOT COMPILER_MATCHES_CCACHE) # # - 4.0+ ccache always includes this environment variable into the hash # of the manifest, which do not allow to use previous cache, - # - 4.2+ ccache ignores SOURCE_DATE_EPOCH under time_macros sloppiness. + # - 4.2+ ccache ignores SOURCE_DATE_EPOCH for every file w/o __DATE__/__TIME__ # # So for: - # - 4.2+ time_macros sloppiness is used, + # - 4.2+ does not require any sloppiness # - 4.0+ will ignore SOURCE_DATE_EPOCH environment variable. 
if (CCACHE_VERSION VERSION_GREATER_EQUAL "4.2") - message(STATUS "Use time_macros sloppiness for ccache") - set_property (GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${CCACHE_FOUND} --set-config=sloppiness=time_macros") - set_property (GLOBAL PROPERTY RULE_LAUNCH_LINK "${CCACHE_FOUND} --set-config=sloppiness=time_macros") + message(STATUS "ccache is 4.2+ no quirks for SOURCE_DATE_EPOCH required") elseif (CCACHE_VERSION VERSION_GREATER_EQUAL "4.0") message(STATUS "Ignore SOURCE_DATE_EPOCH for ccache") set_property (GLOBAL PROPERTY RULE_LAUNCH_COMPILE "env -u SOURCE_DATE_EPOCH ${CCACHE_FOUND}") From 7bcfe92cd7ba75f7d2ee2d58be3ec51f627a807f Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Thu, 18 Feb 2021 23:29:38 +0300 Subject: [PATCH 0451/2357] Mark 01730_distributed_group_by_no_merge_order_by as long https://clickhouse-test-reports.s3.yandex.net/20882/af660140c320ca45bca0edfd89000b3c6da8ee6a/functional_stateless_tests_flaky_check_(address).html#fail1 --- ...> 01730_distributed_group_by_no_merge_order_by_long.reference} | 0 ....sql => 01730_distributed_group_by_no_merge_order_by_long.sql} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename tests/queries/0_stateless/{01730_distributed_group_by_no_merge_order_by.reference => 01730_distributed_group_by_no_merge_order_by_long.reference} (100%) rename tests/queries/0_stateless/{01730_distributed_group_by_no_merge_order_by.sql => 01730_distributed_group_by_no_merge_order_by_long.sql} (100%) diff --git a/tests/queries/0_stateless/01730_distributed_group_by_no_merge_order_by.reference b/tests/queries/0_stateless/01730_distributed_group_by_no_merge_order_by_long.reference similarity index 100% rename from tests/queries/0_stateless/01730_distributed_group_by_no_merge_order_by.reference rename to tests/queries/0_stateless/01730_distributed_group_by_no_merge_order_by_long.reference diff --git a/tests/queries/0_stateless/01730_distributed_group_by_no_merge_order_by.sql b/tests/queries/0_stateless/01730_distributed_group_by_no_merge_order_by_long.sql similarity index 100% rename from tests/queries/0_stateless/01730_distributed_group_by_no_merge_order_by.sql rename to tests/queries/0_stateless/01730_distributed_group_by_no_merge_order_by_long.sql From f4b0b1110cb77c6901243cc1120615d9735a2da3 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Thu, 18 Feb 2021 23:53:40 +0300 Subject: [PATCH 0452/2357] Fix test. 
--- tests/queries/0_stateless/01272_totals_and_filter_bug.reference | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/01272_totals_and_filter_bug.reference b/tests/queries/0_stateless/01272_totals_and_filter_bug.reference index 0db840561fd..5b407738cb8 100644 --- a/tests/queries/0_stateless/01272_totals_and_filter_bug.reference +++ b/tests/queries/0_stateless/01272_totals_and_filter_bug.reference @@ -1,6 +1,6 @@ 1 1 -0 2 +0 1 - test1 10 0 From 6e9bf682179229b4ae3d7f97ec3ab5c83229704b Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Thu, 18 Feb 2021 23:54:42 +0300 Subject: [PATCH 0453/2357] Fix typo --- src/Processors/QueryPlan/Optimizations/filterPushDown.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp b/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp index 4d01235e2fc..1b84fee4857 100644 --- a/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp +++ b/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp @@ -143,7 +143,7 @@ size_t tryPushDownFilter(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes /// If totals step has HAVING expression, skip it for now. /// TODO: /// We can merge HAING expression with current filer. - /// Alos, we can push down part of HAVING which depend only on aggregation keys. + /// Also, we can push down part of HAVING which depend only on aggregation keys. if (totals_having->getActions()) return 0; From ee98b2a472aa05d28d36f859eefff0d359b45910 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 19 Feb 2021 00:03:16 +0300 Subject: [PATCH 0454/2357] Better list requests --- src/Coordination/NuKeeperStorage.cpp | 37 ++++++++++++++-------------- src/Coordination/NuKeeperStorage.h | 8 +++--- 2 files changed, 24 insertions(+), 21 deletions(-) diff --git a/src/Coordination/NuKeeperStorage.cpp b/src/Coordination/NuKeeperStorage.cpp index 631f975cddc..fa57b8141a7 100644 --- a/src/Coordination/NuKeeperStorage.cpp +++ b/src/Coordination/NuKeeperStorage.cpp @@ -25,10 +25,10 @@ static String parentPath(const String & path) return "/"; } -static String baseName(const String & path) +static std::string_view getBaseNameView(const String & path) { - auto rslash_pos = path.rfind('/'); - return path.substr(rslash_pos + 1); + size_t basename_start = path.rfind('/'); + return std::string_view{&path[basename_start + 1], path.length() - basename_start - 1}; } static NuKeeperStorage::ResponsesForSessions processWatchesImpl(const String & path, NuKeeperStorage::Watches & watches, NuKeeperStorage::Watches & list_watches, Coordination::Event event_type) @@ -167,14 +167,17 @@ struct NuKeeperStorageCreateRequest final : public NuKeeperStorageRequest /// Increment sequential number even if node is not sequential ++it->second.seq_num; - response.path_created = path_created; - container.emplace(path_created, std::move(created_node)); + + auto [child_itr, created] = container.emplace(path_created, std::move(created_node)); + + auto child_path_view = getBaseNameView(child_itr->first); + it->second.children.insert(child_path_view); if (request.is_ephemeral) ephemerals[session_id].emplace(path_created); - undo = [&container, &ephemerals, session_id, path_created, is_ephemeral = request.is_ephemeral, parent_path = it->first] + undo = [&container, &ephemerals, session_id, path_created, is_ephemeral = request.is_ephemeral, parent_path = it->first, child_path_view] { container.erase(path_created); if (is_ephemeral) @@ -183,6 +186,7 @@ struct 
NuKeeperStorageCreateRequest final : public NuKeeperStorageRequest --undo_parent.stat.cversion; --undo_parent.stat.numChildren; --undo_parent.seq_num; + undo_parent.children.erase(child_path_view); }; ++it->second.stat.cversion; @@ -250,21 +254,25 @@ struct NuKeeperStorageRemoveRequest final : public NuKeeperStorageRequest if (prev_node.is_ephemeral) ephemerals[session_id].erase(request.path); - container.erase(it); + auto child_basename_view = getBaseNameView(it->first); auto & parent = container.at(parentPath(request.path)); --parent.stat.numChildren; ++parent.stat.cversion; + parent.children.erase(child_basename_view); response.error = Coordination::Error::ZOK; + container.erase(it); + undo = [prev_node, &container, &ephemerals, session_id, path = request.path] { if (prev_node.is_ephemeral) ephemerals[session_id].emplace(path); - container.emplace(path, prev_node); + auto [itr, inserted] = container.emplace(path, prev_node); auto & undo_parent = container.at(parentPath(path)); ++undo_parent.stat.numChildren; --undo_parent.stat.cversion; + undo_parent.children.insert(getBaseNameView(itr->first)); }; } @@ -370,17 +378,10 @@ struct NuKeeperStorageListRequest final : public NuKeeperStorageRequest if (path_prefix.empty()) throw DB::Exception("Logical error: path cannot be empty", ErrorCodes::LOGICAL_ERROR); - if (path_prefix.back() != '/') - path_prefix += '/'; + for (const auto & name : it->second.children) + response.names.emplace_back(name); - /// Fairly inefficient. - for (auto child_it = container.upper_bound(path_prefix); - child_it != container.end() && startsWith(child_it->first, path_prefix); - ++child_it) - { - if (parentPath(child_it->first) == request.path) - response.names.emplace_back(baseName(child_it->first)); - } + std::sort(response.names.begin(), response.names.end()); response.stat = it->second.stat; response.error = Coordination::Error::ZOK; diff --git a/src/Coordination/NuKeeperStorage.h b/src/Coordination/NuKeeperStorage.h index 20ab1982b4e..bd1fc087d09 100644 --- a/src/Coordination/NuKeeperStorage.h +++ b/src/Coordination/NuKeeperStorage.h @@ -16,6 +16,7 @@ using namespace DB; struct NuKeeperStorageRequest; using NuKeeperStorageRequestPtr = std::shared_ptr; using ResponseCallback = std::function; +using ChildrenRefSet = std::unordered_set; class NuKeeperStorage { @@ -30,6 +31,7 @@ public: bool is_sequental = false; Coordination::Stat stat{}; int32_t seq_num = 0; + ChildrenRefSet children; }; struct ResponseForSession @@ -48,9 +50,9 @@ public: using RequestsForSessions = std::vector; - using Container = std::map; - using Ephemerals = std::unordered_map>; - using SessionAndWatcher = std::unordered_map>; + using Container = std::unordered_map; + using Ephemerals = std::unordered_map>; + using SessionAndWatcher = std::unordered_map>; using SessionAndTimeout = std::unordered_map; using SessionIDs = std::vector; From 839d6f7072d6de6b71cc497027ca40715968535e Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 19 Feb 2021 00:09:41 +0300 Subject: [PATCH 0455/2357] Revert "Better list requests" This reverts commit ee98b2a472aa05d28d36f859eefff0d359b45910. 
--- src/Coordination/NuKeeperStorage.cpp | 37 ++++++++++++++-------------- src/Coordination/NuKeeperStorage.h | 8 +++--- 2 files changed, 21 insertions(+), 24 deletions(-) diff --git a/src/Coordination/NuKeeperStorage.cpp b/src/Coordination/NuKeeperStorage.cpp index fa57b8141a7..631f975cddc 100644 --- a/src/Coordination/NuKeeperStorage.cpp +++ b/src/Coordination/NuKeeperStorage.cpp @@ -25,10 +25,10 @@ static String parentPath(const String & path) return "/"; } -static std::string_view getBaseNameView(const String & path) +static String baseName(const String & path) { - size_t basename_start = path.rfind('/'); - return std::string_view{&path[basename_start + 1], path.length() - basename_start - 1}; + auto rslash_pos = path.rfind('/'); + return path.substr(rslash_pos + 1); } static NuKeeperStorage::ResponsesForSessions processWatchesImpl(const String & path, NuKeeperStorage::Watches & watches, NuKeeperStorage::Watches & list_watches, Coordination::Event event_type) @@ -167,17 +167,14 @@ struct NuKeeperStorageCreateRequest final : public NuKeeperStorageRequest /// Increment sequential number even if node is not sequential ++it->second.seq_num; + response.path_created = path_created; - - auto [child_itr, created] = container.emplace(path_created, std::move(created_node)); - - auto child_path_view = getBaseNameView(child_itr->first); - it->second.children.insert(child_path_view); + container.emplace(path_created, std::move(created_node)); if (request.is_ephemeral) ephemerals[session_id].emplace(path_created); - undo = [&container, &ephemerals, session_id, path_created, is_ephemeral = request.is_ephemeral, parent_path = it->first, child_path_view] + undo = [&container, &ephemerals, session_id, path_created, is_ephemeral = request.is_ephemeral, parent_path = it->first] { container.erase(path_created); if (is_ephemeral) @@ -186,7 +183,6 @@ struct NuKeeperStorageCreateRequest final : public NuKeeperStorageRequest --undo_parent.stat.cversion; --undo_parent.stat.numChildren; --undo_parent.seq_num; - undo_parent.children.erase(child_path_view); }; ++it->second.stat.cversion; @@ -254,25 +250,21 @@ struct NuKeeperStorageRemoveRequest final : public NuKeeperStorageRequest if (prev_node.is_ephemeral) ephemerals[session_id].erase(request.path); - auto child_basename_view = getBaseNameView(it->first); + container.erase(it); auto & parent = container.at(parentPath(request.path)); --parent.stat.numChildren; ++parent.stat.cversion; - parent.children.erase(child_basename_view); response.error = Coordination::Error::ZOK; - container.erase(it); - undo = [prev_node, &container, &ephemerals, session_id, path = request.path] { if (prev_node.is_ephemeral) ephemerals[session_id].emplace(path); - auto [itr, inserted] = container.emplace(path, prev_node); + container.emplace(path, prev_node); auto & undo_parent = container.at(parentPath(path)); ++undo_parent.stat.numChildren; --undo_parent.stat.cversion; - undo_parent.children.insert(getBaseNameView(itr->first)); }; } @@ -378,10 +370,17 @@ struct NuKeeperStorageListRequest final : public NuKeeperStorageRequest if (path_prefix.empty()) throw DB::Exception("Logical error: path cannot be empty", ErrorCodes::LOGICAL_ERROR); - for (const auto & name : it->second.children) - response.names.emplace_back(name); + if (path_prefix.back() != '/') + path_prefix += '/'; - std::sort(response.names.begin(), response.names.end()); + /// Fairly inefficient. 
+ for (auto child_it = container.upper_bound(path_prefix); + child_it != container.end() && startsWith(child_it->first, path_prefix); + ++child_it) + { + if (parentPath(child_it->first) == request.path) + response.names.emplace_back(baseName(child_it->first)); + } response.stat = it->second.stat; response.error = Coordination::Error::ZOK; diff --git a/src/Coordination/NuKeeperStorage.h b/src/Coordination/NuKeeperStorage.h index bd1fc087d09..20ab1982b4e 100644 --- a/src/Coordination/NuKeeperStorage.h +++ b/src/Coordination/NuKeeperStorage.h @@ -16,7 +16,6 @@ using namespace DB; struct NuKeeperStorageRequest; using NuKeeperStorageRequestPtr = std::shared_ptr; using ResponseCallback = std::function; -using ChildrenRefSet = std::unordered_set; class NuKeeperStorage { @@ -31,7 +30,6 @@ public: bool is_sequental = false; Coordination::Stat stat{}; int32_t seq_num = 0; - ChildrenRefSet children; }; struct ResponseForSession @@ -50,9 +48,9 @@ public: using RequestsForSessions = std::vector; - using Container = std::unordered_map; - using Ephemerals = std::unordered_map>; - using SessionAndWatcher = std::unordered_map>; + using Container = std::map; + using Ephemerals = std::unordered_map>; + using SessionAndWatcher = std::unordered_map>; using SessionAndTimeout = std::unordered_map; using SessionIDs = std::vector; From afed8a8192dbc118292072fd1286cc9af226c4c1 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Fri, 19 Feb 2021 01:08:44 +0300 Subject: [PATCH 0456/2357] process stress test results --- docker/test/stress/run.sh | 45 +++++++++++++++++++++++++++++++++++++-- docker/test/stress/stress | 3 ++- 2 files changed, 45 insertions(+), 3 deletions(-) diff --git a/docker/test/stress/run.sh b/docker/test/stress/run.sh index 88a633ac488..1e46adca966 100755 --- a/docker/test/stress/run.sh +++ b/docker/test/stress/run.sh @@ -64,9 +64,50 @@ clickhouse-client --query "RENAME TABLE datasets.hits_v1 TO test.hits" clickhouse-client --query "RENAME TABLE datasets.visits_v1 TO test.visits" clickhouse-client --query "SHOW TABLES FROM test" -./stress --hung-check --output-folder test_output --skip-func-tests "$SKIP_TESTS_OPTION" && echo "OK" > /test_output/script_exit_code.txt || echo "FAIL" > /test_output/script_exit_code.txt +./stress --hung-check --output-folder test_output --skip-func-tests "$SKIP_TESTS_OPTION" \ + && echo -e 'Test script exit code\tOK' >> /test_output/test_results.tsv \ + || echo -e 'Test script failed\tFAIL' >> /test_output/test_results.tsv stop start -clickhouse-client --query "SELECT 'Server successfuly started'" > /test_output/alive_check.txt || echo 'Server failed to start' > /test_output/alive_check.txt +clickhouse-client --query "SELECT 'Server successfully started', 'OK'" >> /test_output/test_results.tsv \ + || echo -e 'Server failed to start\tFAIL' >> /test_output/test_results.tsv + +[ -f /var/log/clickhouse-server/clickhouse-server.log ] || echo -e "Server log does not exist\tFAIL" +[ -f /var/log/clickhouse-server/stderr.log ] || echo -e "Stderr log does not exist\tFAIL" + +# Print Fatal log messages to stdout +zgrep -Fa " " /var/log/clickhouse-server/clickhouse-server.log + +# Grep logs for sanitizer asserts, crashes and other critical errors + +# Sanitizer asserts +zgrep -Fa "==================" /var/log/clickhouse-server/stderr.log >> /test_output/tmp +zgrep -Fa "WARNING" /var/log/clickhouse-server/stderr.log >> /test_output/tmp +zgrep -Fav "ASan doesn't fully support makecontext/swapcontext functions" > /dev/null \ + && echo -e 'Sanitizer assert (in stderr.log)\tFAIL' >> 
/test_output/test_results.tsv \ + || echo -e 'No sanitizer asserts\tOK' >> /test_output/test_results.tsv +rm -f /test_output/tmp + +# Logical errors +zgrep -Fa "Code: 49, e.displayText() = DB::Exception:" /var/log/clickhouse-server/clickhouse-server.log > /dev/null \ + && echo -e 'Logical error thrown (see clickhouse-server.log)\tFAIL' >> /test_output/test_results.tsv \ + || echo -e 'No logical errors\tOK' >> /test_output/test_results.tsv + +# Crash +zgrep -Fa "########################################" /var/log/clickhouse-server/clickhouse-server.log > /dev/null \ + && echo -e 'Killed by signal (in clickhouse-server.log)\tFAIL' >> /test_output/test_results.tsv \ + || echo -e 'Not crashed\tOK' >> /test_output/test_results.tsv + +# It also checks for OOM or crash without stacktrace (printed by watchdog) +zgrep -Fa " " /var/log/clickhouse-server/clickhouse-server.log > /dev/null \ + && echo -e 'Fatal message in clickhouse-server.log\tFAIL' >> /test_output/test_results.tsv \ + || echo -e 'No fatal messages in clickhouse-server.log\tOK' >> /test_output/test_results.tsv + +zgrep -Fa "########################################" /test_output/* > /dev/null \ + && echo -e 'Killed by signal (output files)\tFAIL' >> /test_output/test_results.tsv + +# Write check result into check_status.tsv +clickhouse-local --structure "test String, res String" -q "SELECT 'failure', test FROM table WHERE res != 'OK' LIMIT 1" < /test_output/test_results.tsv > /test_output/check_status.tsv +[ -s /test_output/check_status.tsv ] || echo -e "success\tNo errors found" diff --git a/docker/test/stress/stress b/docker/test/stress/stress index d2ec86b4421..8fad49ba5ee 100755 --- a/docker/test/stress/stress +++ b/docker/test/stress/stress @@ -88,8 +88,9 @@ if __name__ == "__main__": logging.info("Checking if some queries hung") cmd = "{} {} {}".format(args.test_cmd, "--hung-check", "00001_select_1") res = call(cmd, shell=True, stderr=STDOUT) + hung_check_status = "Hung check\t{}\n".format('FAIL' if res else 'OK') + open(os.path.join(args.output_folder, "test_results.tsv"), 'w+').write() if res != 0: logging.info("Hung check failed with exit code {}".format(res)) - sys.exit(1) logging.info("Stress test finished") From 879d9206c1a30eb04a4444cc5a30cf1455089176 Mon Sep 17 00:00:00 2001 From: Denis Glazachev Date: Fri, 19 Feb 2021 02:47:28 +0400 Subject: [PATCH 0457/2357] Add tests for row-level security with prewhere --- tests/integration/test_row_policy/test.py | 33 +++++++++++++++++++ .../prewhere_with_row_level_filter.xml | 16 +++++++++ 2 files changed, 49 insertions(+) create mode 100644 tests/performance/prewhere_with_row_level_filter.xml diff --git a/tests/integration/test_row_policy/test.py b/tests/integration/test_row_policy/test.py index 8919aeab0c5..c11e1b1e21c 100644 --- a/tests/integration/test_row_policy/test.py +++ b/tests/integration/test_row_policy/test.py @@ -107,6 +107,7 @@ def test_cannot_trick_row_policy_with_keyword_with(): assert node.query("WITH 0 AS a SELECT a, b FROM mydb.filtered_table1") == TSV([[0, 0], [0, 1]]) assert node.query("WITH 0 AS a SELECT a FROM mydb.filtered_table1") == TSV([[0], [0]]) assert node.query("WITH 0 AS a SELECT b FROM mydb.filtered_table1") == TSV([[0], [1]]) + assert node.query("WITH 0 AS a SELECT * FROM mydb.filtered_table1 PREWHERE a IN(0, 1) WHERE b IN(0, 1)") == TSV([[0], [1]]) def test_policy_from_users_xml_affects_only_user_assigned(): @@ -121,6 +122,38 @@ def test_policy_from_users_xml_affects_only_user_assigned(): assert node.query("SELECT * FROM mydb.local", user="another") 
== TSV([[1, 0], [1, 1]]) +def test_with_prewhere(): + copy_policy_xml('normal_filters.xml') + assert node.query("SELECT * FROM mydb.filtered_table2 WHERE a > 1") == TSV([[4, 5, 2, 1]]) + assert node.query("SELECT a FROM mydb.filtered_table2 WHERE a > 1") == TSV([[4]]) + assert node.query("SELECT a, b FROM mydb.filtered_table2 WHERE a > 1") == TSV([[4, 5]]) + assert node.query("SELECT b, c FROM mydb.filtered_table2 WHERE a > 1") == TSV([[5, 2]]) + assert node.query("SELECT d FROM mydb.filtered_table2 WHERE a > 1") == TSV([[1]]) + + assert node.query("SELECT * FROM mydb.filtered_table2 PREWHERE a > 1") == TSV([[4, 5, 2, 1]]) + assert node.query("SELECT a FROM mydb.filtered_table2 PREWHERE a > 1") == TSV([[4]]) + assert node.query("SELECT a, b FROM mydb.filtered_table2 PREWHERE a > 1") == TSV([[4, 5]]) + assert node.query("SELECT b, c FROM mydb.filtered_table2 PREWHERE a > 1") == TSV([[5, 2]]) + assert node.query("SELECT d FROM mydb.filtered_table2 PREWHERE a > 1") == TSV([[1]]) + + assert node.query("SELECT * FROM mydb.filtered_table2 PREWHERE a < 4 WHERE b < 10") == TSV([[1, 2, 3, 4]]) + assert node.query("SELECT a FROM mydb.filtered_table2 PREWHERE a < 4 WHERE b < 10") == TSV([[1]]) + assert node.query("SELECT b FROM mydb.filtered_table2 PREWHERE a < 4 WHERE b < 10") == TSV([[2]]) + assert node.query("SELECT a, b FROM mydb.filtered_table2 PREWHERE a < 4 WHERE b < 10") == TSV([[1, 2]]) + assert node.query("SELECT a, c FROM mydb.filtered_table2 PREWHERE a < 4 WHERE b < 10") == TSV([[1, 3]]) + assert node.query("SELECT b, d FROM mydb.filtered_table2 PREWHERE a < 4 WHERE b < 10") == TSV([[2, 4]]) + assert node.query("SELECT c, d FROM mydb.filtered_table2 PREWHERE a < 4 WHERE b < 10") == TSV([[3, 4]]) + + +def test_with_throwif_in_prewhere(): + copy_policy_xml('no_filters.xml') + assert 'expected' in node.query_and_get_error("SELECT throwIf(a = 0, 'expected') FROM mydb.filtered_table2 PREWHERE b < 10") + + copy_policy_xml('normal_filters.xml') + assert node.query("SELECT throwIf(a = 0, 'pwned') FROM mydb.filtered_table2 PREWHERE b < 10") == TSV([ + [4, 5, 2, 1], [1, 2, 3, 4]]) + + def test_change_of_users_xml_changes_row_policies(): copy_policy_xml('normal_filters.xml') assert node.query("SELECT * FROM mydb.filtered_table1") == TSV([[1, 0], [1, 1]]) diff --git a/tests/performance/prewhere_with_row_level_filter.xml b/tests/performance/prewhere_with_row_level_filter.xml new file mode 100644 index 00000000000..d73690ca811 --- /dev/null +++ b/tests/performance/prewhere_with_row_level_filter.xml @@ -0,0 +1,16 @@ + + DROP TABLE IF EXISTS test_prl; + CREATE TABLE test_prl (n UInt64) ENGINE MergeTree ORDER BY n; + CREATE ROW POLICY OR REPLACE test_prl_policy ON test_prl AS PERMISSIVE FOR SELECT USING n % 7 TO ALL; + + INSERT INTO test_prl SELECT number FROM numbers(50000000); + + SELECT * FROM test_prl; + SELECT * FROM test_prl WHERE n % 3 AND n % 5; + SELECT * FROM test_prl PREWHERE n % 3 AND n % 5; + SELECT * FROM test_prl PREWHERE n % 3 WHERE n % 5; + SELECT * FROM test_prl PREWHERE n % 5 WHERE n % 3; + + DROP ROW POLICY IF EXISTS test_prl_policy ON test_prl; + DROP TABLE IF EXISTS test_prl; + From fc185e5fb73dc0ac82ab8b0b7a79518832401379 Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Fri, 19 Feb 2021 11:56:24 +0800 Subject: [PATCH 0458/2357] Another try --- src/Server/TCPHandler.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Server/TCPHandler.cpp b/src/Server/TCPHandler.cpp index 430a01bb97a..9794a86d3e3 100644 --- a/src/Server/TCPHandler.cpp +++ 
b/src/Server/TCPHandler.cpp @@ -1133,8 +1133,8 @@ void TCPHandler::receiveQuery() } query_context->applySettingsChanges(settings_changes); - /// Disable function name normalization it's not an initial query. - if (client_info.query_kind != ClientInfo::QueryKind::INITIAL_QUERY) + /// Disable function name normalization it's a secondary query. + if (client_info.query_kind == ClientInfo::QueryKind::SECONDARY_QUERY) { query_context->setSetting("normalize_function_names", Field(0)); } From 88a6d4e206c362dcafc0d8751cb2a6a450178ee8 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 19 Feb 2021 10:05:52 +0300 Subject: [PATCH 0459/2357] Revert "Revert "Better list requests"" This reverts commit 839d6f7072d6de6b71cc497027ca40715968535e. --- src/Coordination/NuKeeperStorage.cpp | 37 ++++++++++++++-------------- src/Coordination/NuKeeperStorage.h | 8 +++--- 2 files changed, 24 insertions(+), 21 deletions(-) diff --git a/src/Coordination/NuKeeperStorage.cpp b/src/Coordination/NuKeeperStorage.cpp index 631f975cddc..fa57b8141a7 100644 --- a/src/Coordination/NuKeeperStorage.cpp +++ b/src/Coordination/NuKeeperStorage.cpp @@ -25,10 +25,10 @@ static String parentPath(const String & path) return "/"; } -static String baseName(const String & path) +static std::string_view getBaseNameView(const String & path) { - auto rslash_pos = path.rfind('/'); - return path.substr(rslash_pos + 1); + size_t basename_start = path.rfind('/'); + return std::string_view{&path[basename_start + 1], path.length() - basename_start - 1}; } static NuKeeperStorage::ResponsesForSessions processWatchesImpl(const String & path, NuKeeperStorage::Watches & watches, NuKeeperStorage::Watches & list_watches, Coordination::Event event_type) @@ -167,14 +167,17 @@ struct NuKeeperStorageCreateRequest final : public NuKeeperStorageRequest /// Increment sequential number even if node is not sequential ++it->second.seq_num; - response.path_created = path_created; - container.emplace(path_created, std::move(created_node)); + + auto [child_itr, created] = container.emplace(path_created, std::move(created_node)); + + auto child_path_view = getBaseNameView(child_itr->first); + it->second.children.insert(child_path_view); if (request.is_ephemeral) ephemerals[session_id].emplace(path_created); - undo = [&container, &ephemerals, session_id, path_created, is_ephemeral = request.is_ephemeral, parent_path = it->first] + undo = [&container, &ephemerals, session_id, path_created, is_ephemeral = request.is_ephemeral, parent_path = it->first, child_path_view] { container.erase(path_created); if (is_ephemeral) @@ -183,6 +186,7 @@ struct NuKeeperStorageCreateRequest final : public NuKeeperStorageRequest --undo_parent.stat.cversion; --undo_parent.stat.numChildren; --undo_parent.seq_num; + undo_parent.children.erase(child_path_view); }; ++it->second.stat.cversion; @@ -250,21 +254,25 @@ struct NuKeeperStorageRemoveRequest final : public NuKeeperStorageRequest if (prev_node.is_ephemeral) ephemerals[session_id].erase(request.path); - container.erase(it); + auto child_basename_view = getBaseNameView(it->first); auto & parent = container.at(parentPath(request.path)); --parent.stat.numChildren; ++parent.stat.cversion; + parent.children.erase(child_basename_view); response.error = Coordination::Error::ZOK; + container.erase(it); + undo = [prev_node, &container, &ephemerals, session_id, path = request.path] { if (prev_node.is_ephemeral) ephemerals[session_id].emplace(path); - container.emplace(path, prev_node); + auto [itr, inserted] = container.emplace(path, prev_node); auto 
& undo_parent = container.at(parentPath(path)); ++undo_parent.stat.numChildren; --undo_parent.stat.cversion; + undo_parent.children.insert(getBaseNameView(itr->first)); }; } @@ -370,17 +378,10 @@ struct NuKeeperStorageListRequest final : public NuKeeperStorageRequest if (path_prefix.empty()) throw DB::Exception("Logical error: path cannot be empty", ErrorCodes::LOGICAL_ERROR); - if (path_prefix.back() != '/') - path_prefix += '/'; + for (const auto & name : it->second.children) + response.names.emplace_back(name); - /// Fairly inefficient. - for (auto child_it = container.upper_bound(path_prefix); - child_it != container.end() && startsWith(child_it->first, path_prefix); - ++child_it) - { - if (parentPath(child_it->first) == request.path) - response.names.emplace_back(baseName(child_it->first)); - } + std::sort(response.names.begin(), response.names.end()); response.stat = it->second.stat; response.error = Coordination::Error::ZOK; diff --git a/src/Coordination/NuKeeperStorage.h b/src/Coordination/NuKeeperStorage.h index 20ab1982b4e..bd1fc087d09 100644 --- a/src/Coordination/NuKeeperStorage.h +++ b/src/Coordination/NuKeeperStorage.h @@ -16,6 +16,7 @@ using namespace DB; struct NuKeeperStorageRequest; using NuKeeperStorageRequestPtr = std::shared_ptr; using ResponseCallback = std::function; +using ChildrenRefSet = std::unordered_set; class NuKeeperStorage { @@ -30,6 +31,7 @@ public: bool is_sequental = false; Coordination::Stat stat{}; int32_t seq_num = 0; + ChildrenRefSet children; }; struct ResponseForSession @@ -48,9 +50,9 @@ public: using RequestsForSessions = std::vector; - using Container = std::map; - using Ephemerals = std::unordered_map>; - using SessionAndWatcher = std::unordered_map>; + using Container = std::unordered_map; + using Ephemerals = std::unordered_map>; + using SessionAndWatcher = std::unordered_map>; using SessionAndTimeout = std::unordered_map; using SessionIDs = std::vector; From b72b13bab05fc6f90396f335471023673c98c31f Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 19 Feb 2021 10:25:55 +0300 Subject: [PATCH 0460/2357] Better list performance --- src/Coordination/NuKeeperStorage.cpp | 27 +++++++++++++-------------- src/Coordination/NuKeeperStorage.h | 4 ++-- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/src/Coordination/NuKeeperStorage.cpp b/src/Coordination/NuKeeperStorage.cpp index fa57b8141a7..bb433474dc9 100644 --- a/src/Coordination/NuKeeperStorage.cpp +++ b/src/Coordination/NuKeeperStorage.cpp @@ -25,10 +25,10 @@ static String parentPath(const String & path) return "/"; } -static std::string_view getBaseNameView(const String & path) +static std::string getBaseName(const String & path) { size_t basename_start = path.rfind('/'); - return std::string_view{&path[basename_start + 1], path.length() - basename_start - 1}; + return std::string{&path[basename_start + 1], path.length() - basename_start - 1}; } static NuKeeperStorage::ResponsesForSessions processWatchesImpl(const String & path, NuKeeperStorage::Watches & watches, NuKeeperStorage::Watches & list_watches, Coordination::Event event_type) @@ -169,15 +169,15 @@ struct NuKeeperStorageCreateRequest final : public NuKeeperStorageRequest ++it->second.seq_num; response.path_created = path_created; - auto [child_itr, created] = container.emplace(path_created, std::move(created_node)); + container.emplace(path_created, std::move(created_node)); - auto child_path_view = getBaseNameView(child_itr->first); - it->second.children.insert(child_path_view); + auto child_path = 
getBaseName(path_created); + it->second.children.insert(child_path); if (request.is_ephemeral) ephemerals[session_id].emplace(path_created); - undo = [&container, &ephemerals, session_id, path_created, is_ephemeral = request.is_ephemeral, parent_path = it->first, child_path_view] + undo = [&container, &ephemerals, session_id, path_created, is_ephemeral = request.is_ephemeral, parent_path = it->first, child_path] { container.erase(path_created); if (is_ephemeral) @@ -186,7 +186,7 @@ struct NuKeeperStorageCreateRequest final : public NuKeeperStorageRequest --undo_parent.stat.cversion; --undo_parent.stat.numChildren; --undo_parent.seq_num; - undo_parent.children.erase(child_path_view); + undo_parent.children.erase(child_path); }; ++it->second.stat.cversion; @@ -254,25 +254,25 @@ struct NuKeeperStorageRemoveRequest final : public NuKeeperStorageRequest if (prev_node.is_ephemeral) ephemerals[session_id].erase(request.path); - auto child_basename_view = getBaseNameView(it->first); + auto child_basename = getBaseName(it->first); auto & parent = container.at(parentPath(request.path)); --parent.stat.numChildren; ++parent.stat.cversion; - parent.children.erase(child_basename_view); + parent.children.erase(child_basename); response.error = Coordination::Error::ZOK; container.erase(it); - undo = [prev_node, &container, &ephemerals, session_id, path = request.path] + undo = [prev_node, &container, &ephemerals, session_id, path = request.path, child_basename] { if (prev_node.is_ephemeral) ephemerals[session_id].emplace(path); - auto [itr, inserted] = container.emplace(path, prev_node); + container.emplace(path, prev_node); auto & undo_parent = container.at(parentPath(path)); ++undo_parent.stat.numChildren; --undo_parent.stat.cversion; - undo_parent.children.insert(getBaseNameView(itr->first)); + undo_parent.children.insert(child_basename); }; } @@ -378,8 +378,7 @@ struct NuKeeperStorageListRequest final : public NuKeeperStorageRequest if (path_prefix.empty()) throw DB::Exception("Logical error: path cannot be empty", ErrorCodes::LOGICAL_ERROR); - for (const auto & name : it->second.children) - response.names.emplace_back(name); + response.names.insert(response.names.end(), it->second.children.begin(), it->second.children.end()); std::sort(response.names.begin(), response.names.end()); diff --git a/src/Coordination/NuKeeperStorage.h b/src/Coordination/NuKeeperStorage.h index bd1fc087d09..299fad4eea0 100644 --- a/src/Coordination/NuKeeperStorage.h +++ b/src/Coordination/NuKeeperStorage.h @@ -16,7 +16,7 @@ using namespace DB; struct NuKeeperStorageRequest; using NuKeeperStorageRequestPtr = std::shared_ptr; using ResponseCallback = std::function; -using ChildrenRefSet = std::unordered_set; +using ChildrenSet = std::unordered_set; class NuKeeperStorage { @@ -31,7 +31,7 @@ public: bool is_sequental = false; Coordination::Stat stat{}; int32_t seq_num = 0; - ChildrenRefSet children; + ChildrenSet children; }; struct ResponseForSession From b9d6df9618c6a1b0efcd17c66cfa22aaa023d97a Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Fri, 19 Feb 2021 11:49:41 +0300 Subject: [PATCH 0461/2357] Check for eintr in epoll_wait --- src/Client/PacketReceiver.h | 145 ++++++++++++++++++ .../RemoteQueryExecutorReadContext.cpp | 10 +- src/Processors/Executors/PollingQueue.cpp | 7 +- 3 files changed, 156 insertions(+), 6 deletions(-) create mode 100644 src/Client/PacketReceiver.h diff --git a/src/Client/PacketReceiver.h b/src/Client/PacketReceiver.h new file mode 100644 index 00000000000..c9475bafa71 --- /dev/null +++ 
b/src/Client/PacketReceiver.h @@ -0,0 +1,145 @@ +#pragma once + +#if defined(OS_LINUX) + +#include +#include +#include +#include +#include + +namespace DB +{ + +/// Class for nonblocking packet receiving. It runs connection->receivePacket +/// in fiber and sets special read callback which is called when +/// reading from socket blocks. When read callback is called, +/// socket and receive timeout are added in epoll and execution returns to the main program. +/// So, you can poll this epoll file descriptor to determine when to resume +/// packet receiving (beside polling epoll descriptor, you also need to check connection->hasPendingData(), +/// because small packet can be read in buffer with the previous one, so new packet will be ready in buffer, +/// but there is no data socket to poll). +class PacketReceiver +{ +public: + PacketReceiver(Connection * connection_) : connection(connection_) + { + epoll.add(receive_timeout.getDescriptor()); + epoll.add(connection->getSocket()->impl()->sockfd()); + fiber = boost::context::fiber(std::allocator_arg_t(), fiber_stack, Routine{*this}); + } + + /// Resume packet receiving. + void resume() + { + /// If there is no pending data, check receive timeout. + if (!connection->hasReadPendingData() && !checkReceiveTimeout()) + return; + + fiber = std::move(fiber).resume(); + if (exception) + std::rethrow_exception(std::move(exception)); + } + + void cancel() + { + Fiber to_destroy = std::move(fiber); + connection = nullptr; + } + + Packet getPacket() { return std::move(packet); } + + int getFileDescriptor() const { return epoll.getFileDescriptor(); } + + bool isPacketReady() const { return !is_read_in_process; } + + bool isReceiveTimeoutExpired() const { return is_receive_timeout_expired; } + +private: + /// When epoll file descriptor is ready, check if it's an expired timeout + bool checkReceiveTimeout() + { + bool is_socket_ready = false; + is_receive_timeout_expired = false; + + epoll_event events[2]; + events[0].data.fd = events[1].data.fd = -1; + size_t ready_count = epoll.getManyReady(2, events, true); + + for (size_t i = 0; i != ready_count; ++i) + { + if (events[i].data.fd == connection->getSocket()->impl()->sockfd()) + is_socket_ready = true; + if (events[i].data.fd == receive_timeout.getDescriptor()) + is_receive_timeout_expired = true; + } + + if (is_receive_timeout_expired && !is_socket_ready) + { + receive_timeout.reset(); + return false; + } + + return true; + } + + struct Routine + { + PacketReceiver & receiver; + + struct ReadCallback + { + PacketReceiver & receiver; + Fiber & sink; + + void operator()(int, const Poco::Timespan & timeout, const std::string &) + { + receiver.receive_timeout.setRelative(timeout); + receiver.is_read_in_process = true; + sink = std::move(sink).resume(); + receiver.is_read_in_process = false; + receiver.receive_timeout.reset(); + } + }; + + Fiber operator()(Fiber && sink) + { + try + { + AsyncCallbackSetter async_setter(receiver.connection, ReadCallback{receiver, sink}); + while (true) + { + receiver.packet = receiver.connection->receivePacket(); + sink = std::move(sink).resume(); + } + + } + catch (const boost::context::detail::forced_unwind &) + { + /// This exception is thrown by fiber implementation in case if fiber is being deleted but hasn't exited + /// It should not be caught or it will segfault. + /// Other exceptions must be caught + throw; + } + catch (...) 
+ { + receiver.exception = std::current_exception(); + } + + return std::move(sink); + } + }; + + Connection * connection; + TimerDescriptor receive_timeout; + Epoll epoll; + Fiber fiber; + FiberStack fiber_stack; + Packet packet; + bool is_read_in_process = false; + bool is_receive_timeout_expired = false; + std::exception_ptr exception; +}; + +} +#endif diff --git a/src/DataStreams/RemoteQueryExecutorReadContext.cpp b/src/DataStreams/RemoteQueryExecutorReadContext.cpp index bc47b049407..c79fffafcb1 100644 --- a/src/DataStreams/RemoteQueryExecutorReadContext.cpp +++ b/src/DataStreams/RemoteQueryExecutorReadContext.cpp @@ -146,9 +146,13 @@ bool RemoteQueryExecutorReadContext::checkTimeoutImpl() const events[0].data.fd = events[1].data.fd = events[2].data.fd = -1; /// Wait for epoll_fd will not block if it was polled externally. - int num_events = epoll_wait(epoll_fd, events, 3, 0); - if (num_events == -1) - throwFromErrno("Failed to epoll_wait", ErrorCodes::CANNOT_READ_FROM_SOCKET); + int num_events = 0; + while (num_events <= 0) + { + num_events = epoll_wait(epoll_fd, events, 3, 0); + if (num_events == -1 && errno != EINTR) + throwFromErrno("Failed to epoll_wait", ErrorCodes::CANNOT_READ_FROM_SOCKET); + } bool is_socket_ready = false; bool is_pipe_alarmed = false; diff --git a/src/Processors/Executors/PollingQueue.cpp b/src/Processors/Executors/PollingQueue.cpp index 93edfe53987..b9c7bdade2d 100644 --- a/src/Processors/Executors/PollingQueue.cpp +++ b/src/Processors/Executors/PollingQueue.cpp @@ -88,11 +88,12 @@ PollingQueue::TaskData PollingQueue::wait(std::unique_lock & lock) event.data.ptr = nullptr; int num_events = 0; - while (num_events == 0) + while (num_events <= 0) { num_events = epoll_wait(epoll_fd, &event, 1, 0); - if (num_events == -1) - throwFromErrno("Failed to epoll_wait", ErrorCodes::CANNOT_READ_FROM_SOCKET); + + if (num_events == -1 && errno != EINTR) + throwFromErrno("Failed to epoll_wait", ErrorCodes::CANNOT_READ_FROM_SOCKET); } lock.lock(); From 7d1119680e7881af7f5934773721cb48f40b35e7 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Fri, 19 Feb 2021 11:52:33 +0300 Subject: [PATCH 0462/2357] Remove not needed file. --- src/Client/PacketReceiver.h | 145 ------------------------------------ 1 file changed, 145 deletions(-) delete mode 100644 src/Client/PacketReceiver.h diff --git a/src/Client/PacketReceiver.h b/src/Client/PacketReceiver.h deleted file mode 100644 index c9475bafa71..00000000000 --- a/src/Client/PacketReceiver.h +++ /dev/null @@ -1,145 +0,0 @@ -#pragma once - -#if defined(OS_LINUX) - -#include -#include -#include -#include -#include - -namespace DB -{ - -/// Class for nonblocking packet receiving. It runs connection->receivePacket -/// in fiber and sets special read callback which is called when -/// reading from socket blocks. When read callback is called, -/// socket and receive timeout are added in epoll and execution returns to the main program. -/// So, you can poll this epoll file descriptor to determine when to resume -/// packet receiving (beside polling epoll descriptor, you also need to check connection->hasPendingData(), -/// because small packet can be read in buffer with the previous one, so new packet will be ready in buffer, -/// but there is no data socket to poll). 
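The comment above is the whole design in miniature: park the connection socket and a receive-timeout timer in one epoll instance, and let the caller poll that single descriptor to know when to resume. Reduced to plain Linux APIs it looks roughly like this (an illustrative sketch only; the descriptors and the 3-second timeout are invented for the example and are not taken from the patch):

#include <sys/epoll.h>
#include <sys/timerfd.h>
#include <unistd.h>
#include <cstdio>

int main()
{
    int epoll_fd = epoll_create1(0);
    int timer_fd = timerfd_create(CLOCK_MONOTONIC, 0);
    int socket_fd = 0;  /// stand-in for the connection socket (stdin here, so the sketch runs as-is)

    /// Arm the "receive timeout" timer: fires once after 3 seconds.
    itimerspec timeout{};
    timeout.it_value.tv_sec = 3;
    timerfd_settime(timer_fd, 0, &timeout, nullptr);

    /// Watch both descriptors through one epoll instance, as the comment describes.
    epoll_event ev{};
    ev.events = EPOLLIN;
    ev.data.fd = socket_fd;
    epoll_ctl(epoll_fd, EPOLL_CTL_ADD, socket_fd, &ev);
    ev.data.fd = timer_fd;
    epoll_ctl(epoll_fd, EPOLL_CTL_ADD, timer_fd, &ev);

    epoll_event events[2];
    int ready = epoll_wait(epoll_fd, events, 2, -1);
    for (int i = 0; i < ready; ++i)
    {
        if (events[i].data.fd == socket_fd)
            printf("socket readable: resume receiving\n");
        else if (events[i].data.fd == timer_fd)
            printf("receive timeout expired\n");
    }

    close(timer_fd);
    close(epoll_fd);
    return 0;
}

The class being removed layered a fiber on top of this pattern so that receivePacket() could be suspended at the exact point where a read would block and resumed once the epoll descriptor reports readiness.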
-class PacketReceiver -{ -public: - PacketReceiver(Connection * connection_) : connection(connection_) - { - epoll.add(receive_timeout.getDescriptor()); - epoll.add(connection->getSocket()->impl()->sockfd()); - fiber = boost::context::fiber(std::allocator_arg_t(), fiber_stack, Routine{*this}); - } - - /// Resume packet receiving. - void resume() - { - /// If there is no pending data, check receive timeout. - if (!connection->hasReadPendingData() && !checkReceiveTimeout()) - return; - - fiber = std::move(fiber).resume(); - if (exception) - std::rethrow_exception(std::move(exception)); - } - - void cancel() - { - Fiber to_destroy = std::move(fiber); - connection = nullptr; - } - - Packet getPacket() { return std::move(packet); } - - int getFileDescriptor() const { return epoll.getFileDescriptor(); } - - bool isPacketReady() const { return !is_read_in_process; } - - bool isReceiveTimeoutExpired() const { return is_receive_timeout_expired; } - -private: - /// When epoll file descriptor is ready, check if it's an expired timeout - bool checkReceiveTimeout() - { - bool is_socket_ready = false; - is_receive_timeout_expired = false; - - epoll_event events[2]; - events[0].data.fd = events[1].data.fd = -1; - size_t ready_count = epoll.getManyReady(2, events, true); - - for (size_t i = 0; i != ready_count; ++i) - { - if (events[i].data.fd == connection->getSocket()->impl()->sockfd()) - is_socket_ready = true; - if (events[i].data.fd == receive_timeout.getDescriptor()) - is_receive_timeout_expired = true; - } - - if (is_receive_timeout_expired && !is_socket_ready) - { - receive_timeout.reset(); - return false; - } - - return true; - } - - struct Routine - { - PacketReceiver & receiver; - - struct ReadCallback - { - PacketReceiver & receiver; - Fiber & sink; - - void operator()(int, const Poco::Timespan & timeout, const std::string &) - { - receiver.receive_timeout.setRelative(timeout); - receiver.is_read_in_process = true; - sink = std::move(sink).resume(); - receiver.is_read_in_process = false; - receiver.receive_timeout.reset(); - } - }; - - Fiber operator()(Fiber && sink) - { - try - { - AsyncCallbackSetter async_setter(receiver.connection, ReadCallback{receiver, sink}); - while (true) - { - receiver.packet = receiver.connection->receivePacket(); - sink = std::move(sink).resume(); - } - - } - catch (const boost::context::detail::forced_unwind &) - { - /// This exception is thrown by fiber implementation in case if fiber is being deleted but hasn't exited - /// It should not be caught or it will segfault. - /// Other exceptions must be caught - throw; - } - catch (...) 
- { - receiver.exception = std::current_exception(); - } - - return std::move(sink); - } - }; - - Connection * connection; - TimerDescriptor receive_timeout; - Epoll epoll; - Fiber fiber; - FiberStack fiber_stack; - Packet packet; - bool is_read_in_process = false; - bool is_receive_timeout_expired = false; - std::exception_ptr exception; -}; - -} -#endif From 39f07d62a42288b83f8c5e46e026ebf9d051601d Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 19 Feb 2021 12:02:18 +0300 Subject: [PATCH 0463/2357] Disable in-memory compression by default --- src/Storages/MemorySettings.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/MemorySettings.h b/src/Storages/MemorySettings.h index 4a1ba57475f..5e3b5f81ba5 100644 --- a/src/Storages/MemorySettings.h +++ b/src/Storages/MemorySettings.h @@ -9,7 +9,7 @@ class ASTStorage; #define MEMORY_SETTINGS(M) \ - M(Bool, compress, true, "Compress data in memory", 0) \ + M(Bool, compress, false, "Compress data in memory", 0) \ DECLARE_SETTINGS_TRAITS(memorySettingsTraits, MEMORY_SETTINGS) From d438d7e390648d6be1c9718b58a18389d4d68650 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Fri, 19 Feb 2021 12:07:34 +0300 Subject: [PATCH 0464/2357] Fix timeout in epoll_wait for PollingQueue --- src/Processors/Executors/PollingQueue.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Processors/Executors/PollingQueue.cpp b/src/Processors/Executors/PollingQueue.cpp index b9c7bdade2d..3636fa82f73 100644 --- a/src/Processors/Executors/PollingQueue.cpp +++ b/src/Processors/Executors/PollingQueue.cpp @@ -90,7 +90,7 @@ PollingQueue::TaskData PollingQueue::wait(std::unique_lock & lock) while (num_events <= 0) { - num_events = epoll_wait(epoll_fd, &event, 1, 0); + num_events = epoll_wait(epoll_fd, &event, 1, -1); if (num_events == -1 && errno != EINTR) throwFromErrno("Failed to epoll_wait", ErrorCodes::CANNOT_READ_FROM_SOCKET); From ed4697cffc83c3b4c34d11189e9e300c969da618 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Fri, 19 Feb 2021 12:20:24 +0300 Subject: [PATCH 0465/2357] Fix timeout in epoll_wait for RemoteQueryExecutorReadContext --- src/DataStreams/RemoteQueryExecutorReadContext.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/DataStreams/RemoteQueryExecutorReadContext.cpp b/src/DataStreams/RemoteQueryExecutorReadContext.cpp index c79fffafcb1..3cc24ad5056 100644 --- a/src/DataStreams/RemoteQueryExecutorReadContext.cpp +++ b/src/DataStreams/RemoteQueryExecutorReadContext.cpp @@ -149,7 +149,7 @@ bool RemoteQueryExecutorReadContext::checkTimeoutImpl() const int num_events = 0; while (num_events <= 0) { - num_events = epoll_wait(epoll_fd, events, 3, 0); + num_events = epoll_wait(epoll_fd, events, 3, -1); if (num_events == -1 && errno != EINTR) throwFromErrno("Failed to epoll_wait", ErrorCodes::CANNOT_READ_FROM_SOCKET); } From 86a74ca6b5cd3618d574431d0c94a44ebac93baf Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 19 Feb 2021 12:24:50 +0300 Subject: [PATCH 0466/2357] Fix size deserialization --- src/Coordination/NuKeeperStorage.h | 2 +- src/Coordination/NuKeeperStorageSerializer.cpp | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/Coordination/NuKeeperStorage.h b/src/Coordination/NuKeeperStorage.h index 299fad4eea0..1a2e6202bf0 100644 --- a/src/Coordination/NuKeeperStorage.h +++ b/src/Coordination/NuKeeperStorage.h @@ -31,7 +31,7 @@ public: bool is_sequental = false; Coordination::Stat stat{}; int32_t seq_num = 0; - ChildrenSet children; + 
ChildrenSet children{}; }; struct ResponseForSession diff --git a/src/Coordination/NuKeeperStorageSerializer.cpp b/src/Coordination/NuKeeperStorageSerializer.cpp index 298df45cde0..c29d0d1f1fa 100644 --- a/src/Coordination/NuKeeperStorageSerializer.cpp +++ b/src/Coordination/NuKeeperStorageSerializer.cpp @@ -59,13 +59,16 @@ void NuKeeperStorageSerializer::deserialize(NuKeeperStorage & storage, ReadBuffe size_t container_size; Coordination::read(container_size, in); - while (storage.container.size() < container_size) + + size_t current_size = 0; + while (current_size < container_size) { std::string path; Coordination::read(path, in); NuKeeperStorage::Node node; readNode(node, in); storage.container[path] = node; + current_size++; } size_t ephemerals_size; Coordination::read(ephemerals_size, in); From fc1885ea9b01714290fba8ee8fbbe1a78894e573 Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Fri, 19 Feb 2021 17:28:01 +0800 Subject: [PATCH 0467/2357] Try fixing flaky tests --- tests/queries/0_stateless/00643_cast_zookeeper.sql | 2 ++ .../queries/0_stateless/01656_test_query_log_factories_info.sql | 2 ++ 2 files changed, 4 insertions(+) diff --git a/tests/queries/0_stateless/00643_cast_zookeeper.sql b/tests/queries/0_stateless/00643_cast_zookeeper.sql index c52d44bd88b..c9760f00ca7 100644 --- a/tests/queries/0_stateless/00643_cast_zookeeper.sql +++ b/tests/queries/0_stateless/00643_cast_zookeeper.sql @@ -1,3 +1,5 @@ +SET database_atomic_wait_for_drop_and_detach_synchronously=1; + DROP TABLE IF EXISTS cast1; DROP TABLE IF EXISTS cast2; diff --git a/tests/queries/0_stateless/01656_test_query_log_factories_info.sql b/tests/queries/0_stateless/01656_test_query_log_factories_info.sql index 9f374def8b5..17657cf60f5 100644 --- a/tests/queries/0_stateless/01656_test_query_log_factories_info.sql +++ b/tests/queries/0_stateless/01656_test_query_log_factories_info.sql @@ -1,3 +1,5 @@ +SET database_atomic_wait_for_drop_and_detach_synchronously=1; + SELECT uniqArray([1, 1, 2]), SUBSTRING('Hello, world', 7, 5), flatten([[[BIT_AND(123)]], [[mod(3, 2)], [CAST('1' AS INTEGER)]]]), From 5bbd6f7480281a7acdf5c16ac1efc4626ba51175 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Fri, 19 Feb 2021 12:37:00 +0300 Subject: [PATCH 0468/2357] Fixed documentation --- docs/en/sql-reference/functions/hash-functions.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/sql-reference/functions/hash-functions.md b/docs/en/sql-reference/functions/hash-functions.md index 9394426b20b..14ac288339b 100644 --- a/docs/en/sql-reference/functions/hash-functions.md +++ b/docs/en/sql-reference/functions/hash-functions.md @@ -9,7 +9,7 @@ Hash functions can be used for the deterministic pseudo-random shuffling of elem ## halfMD5 {#hash-functions-halfmd5} -[Interprets](../../sql-reference/functions/type-conversion-functions.md#type_conversion_functions-reinterpretAsString) all the input parameters as strings and calculates the [MD5](https://en.wikipedia.org/wiki/MD5) hash value for each of them. Then combines hashes, takes the first 8 bytes of the hash of the resulting string, and interprets them as `UInt64` in big-endian byte order. +[Interprets](../../sql-reference/functions/type-conversion-functions.md#type_conversion_function-reinterpretAsString) all the input parameters as strings and calculates the [MD5](https://en.wikipedia.org/wiki/MD5) hash value for each of them. Then combines hashes, takes the first 8 bytes of the hash of the resulting string, and interprets them as `UInt64` in big-endian byte order. 
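The final interpretation step that sentence describes, shown on a made-up 16-byte digest (a standalone sketch, not ClickHouse code): the first 8 digest bytes become the most significant bytes of the resulting UInt64.

#include <cstdint>
#include <cstdio>

int main()
{
    /// Invented digest values, only to make the byte order visible.
    unsigned char digest[16] = {0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
                                0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10};
    uint64_t value = 0;
    for (int i = 0; i < 8; ++i)
        value = (value << 8) | digest[i];  /// big-endian: digest[0] ends up most significant
    printf("%llu\n", static_cast<unsigned long long>(value));
    return 0;
}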
``` sql halfMD5(par1, ...) @@ -54,7 +54,7 @@ sipHash64(par1,...) This is a cryptographic hash function. It works at least three times faster than the [MD5](#hash_functions-md5) function. -Function [interprets](../../sql-reference/functions/type-conversion-functions.md#type_conversion_functions-reinterpretAsString) all the input parameters as strings and calculates the hash value for each of them. Then combines hashes by the following algorithm: +Function [interprets](../../sql-reference/functions/type-conversion-functions.md#type_conversion_function-reinterpretAsString) all the input parameters as strings and calculates the hash value for each of them. Then combines hashes by the following algorithm: 1. After hashing all the input parameters, the function gets the array of hashes. 2. Function takes the first and the second elements and calculates a hash for the array of them. From 4493c39bf72448b125919b6bf4beb904038c1e73 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Fri, 19 Feb 2021 12:57:09 +0300 Subject: [PATCH 0469/2357] fix --- docker/test/stress/run.sh | 4 ++++ docker/test/stress/stress | 2 +- tests/clickhouse-test | 6 ++++-- .../0_stateless/01079_parallel_alter_modify_zookeeper.sh | 1 + 4 files changed, 10 insertions(+), 3 deletions(-) diff --git a/docker/test/stress/run.sh b/docker/test/stress/run.sh index 1e46adca966..963b204c4c0 100755 --- a/docker/test/stress/run.sh +++ b/docker/test/stress/run.sh @@ -108,6 +108,10 @@ zgrep -Fa " " /var/log/clickhouse-server/clickhouse-server.log > /dev/nu zgrep -Fa "########################################" /test_output/* > /dev/null \ && echo -e 'Killed by signal (output files)\tFAIL' >> /test_output/test_results.tsv +# Put logs into /test_output/ +pigz < /var/log/clickhouse-server/clickhouse-server.log > /test_output/clickhouse-server.log.gz +mv /var/log/clickhouse-server/stderr.log /test_output/ + # Write check result into check_status.tsv clickhouse-local --structure "test String, res String" -q "SELECT 'failure', test FROM table WHERE res != 'OK' LIMIT 1" < /test_output/test_results.tsv > /test_output/check_status.tsv [ -s /test_output/check_status.tsv ] || echo -e "success\tNo errors found" diff --git a/docker/test/stress/stress b/docker/test/stress/stress index 8fad49ba5ee..e0189072f7d 100755 --- a/docker/test/stress/stress +++ b/docker/test/stress/stress @@ -89,7 +89,7 @@ if __name__ == "__main__": cmd = "{} {} {}".format(args.test_cmd, "--hung-check", "00001_select_1") res = call(cmd, shell=True, stderr=STDOUT) hung_check_status = "Hung check\t{}\n".format('FAIL' if res else 'OK') - open(os.path.join(args.output_folder, "test_results.tsv"), 'w+').write() + open(os.path.join(args.output_folder, "test_results.tsv"), 'w+').write(hung_check_status) if res != 0: logging.info("Hung check failed with exit code {}".format(res)) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index 74f5f07eb9d..fa8d2891224 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -229,13 +229,15 @@ def get_stacktraces_from_clickhouse(client): def get_server_pid(server_tcp_port): cmd = "lsof -i tcp:{port} -s tcp:LISTEN -Fp | awk '/^p[0-9]+$/{{print substr($0, 2)}}'".format(port=server_tcp_port) + output = None try: - output = subprocess.check_output(cmd, shell=True) + output = subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT) if output: return int(output) else: return None # server dead - except Exception: + except Exception as e: + print("Cannot get server pid, got {}: {}", output, e) return None diff --git 
a/tests/queries/0_stateless/01079_parallel_alter_modify_zookeeper.sh b/tests/queries/0_stateless/01079_parallel_alter_modify_zookeeper.sh index 5b14c5a8543..0749dc14dfa 100755 --- a/tests/queries/0_stateless/01079_parallel_alter_modify_zookeeper.sh +++ b/tests/queries/0_stateless/01079_parallel_alter_modify_zookeeper.sh @@ -14,6 +14,7 @@ for i in $(seq $REPLICAS); do $CLICKHOUSE_CLIENT --query "CREATE TABLE concurrent_alter_mt_$i (key UInt64, value1 UInt64, value2 Int32) ENGINE = ReplicatedMergeTree('/clickhouse/tables/test_01079/concurrent_alter_mt', '$i') ORDER BY key SETTINGS max_replicated_mutations_in_queue=1000, number_of_free_entries_in_pool_to_execute_mutation=0,max_replicated_merges_in_queue=1000" done + $CLICKHOUSE_CLIENT --query "INSERT INTO concurrent_alter_mt_1 SELECT number, number + 10, number from numbers(10)" $CLICKHOUSE_CLIENT --query "INSERT INTO concurrent_alter_mt_1 SELECT number, number + 10, number from numbers(10, 40)" From 6c9322bb2e779067d005879592157b5dba5074ac Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 19 Feb 2021 12:57:39 +0300 Subject: [PATCH 0470/2357] Sane constant while reading requests --- src/Server/NuKeeperTCPHandler.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/Server/NuKeeperTCPHandler.cpp b/src/Server/NuKeeperTCPHandler.cpp index e855e2c68f7..f25ca4a42ce 100644 --- a/src/Server/NuKeeperTCPHandler.cpp +++ b/src/Server/NuKeeperTCPHandler.cpp @@ -342,6 +342,7 @@ void NuKeeperTCPHandler::runImpl() PollResult result = poll_wrapper->poll(session_timeout); if (result.has_requests && !close_received) { + size_t requests_read = 0; do { auto [received_op, received_xid] = receiveRequest(); @@ -358,6 +359,10 @@ void NuKeeperTCPHandler::runImpl() LOG_TRACE(log, "Received heartbeat for session #{}", session_id); session_stopwatch.restart(); } + + if (requests_read > 50) + break; + requests_read++; } while (in->available()); } From 1c5b10de41a8266b623f5bcc7f3b8d3b72c6982d Mon Sep 17 00:00:00 2001 From: kssenii Date: Fri, 19 Feb 2021 09:23:51 +0000 Subject: [PATCH 0471/2357] Use fixed version for aerospike --- docker/test/integration/runner/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/test/integration/runner/Dockerfile b/docker/test/integration/runner/Dockerfile index 502dc3736b2..e0e5e36a3d6 100644 --- a/docker/test/integration/runner/Dockerfile +++ b/docker/test/integration/runner/Dockerfile @@ -58,7 +58,7 @@ RUN dockerd --version; docker --version RUN python3 -m pip install \ PyMySQL \ - aerospike \ + aerospike==4.0.0 \ avro \ cassandra-driver \ confluent-kafka==1.5.0 \ From 8f8a4f64235e6df11717fb9cb91be55c0673b3f5 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Fri, 19 Feb 2021 13:59:38 +0300 Subject: [PATCH 0472/2357] Update 01731_async_task_queue_wait.sh --- tests/queries/0_stateless/01731_async_task_queue_wait.sh | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/queries/0_stateless/01731_async_task_queue_wait.sh b/tests/queries/0_stateless/01731_async_task_queue_wait.sh index eddbfdf5322..7545ad1e81a 100755 --- a/tests/queries/0_stateless/01731_async_task_queue_wait.sh +++ b/tests/queries/0_stateless/01731_async_task_queue_wait.sh @@ -7,6 +7,4 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # regression for 'Empty task was returned from async task queue' during query # cancellation with async_socket_for_remote=1 (that ignores # max_distributed_connections) -timeout 5s ${CLICKHOUSE_CLIENT} --max_distributed_connections=1 --format Null -q "select * from 
remote('127.{2..11}', view(select * from numbers(1e9))) group by number format Null" -# timedout -test $? -eq 124 +$(timeout --signal=SIGINT 1 clickhouse client --max_distributed_connections=1 --max_block_size=2 --interactive_delay=900000 -q "select x from remote('127.{2,3}', view(select number + sleep(0.3) as x from numbers(16))) settings max_block_size = 2") 2>&1 | grep "Empty task was returned from async task queue" From 3d954c43142b28c0643b504a7f4d9333142b3fe0 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 19 Feb 2021 14:06:17 +0300 Subject: [PATCH 0473/2357] Better request/response logic --- src/Server/NuKeeperTCPHandler.cpp | 70 ++++++++++++++----------------- 1 file changed, 32 insertions(+), 38 deletions(-) diff --git a/src/Server/NuKeeperTCPHandler.cpp b/src/Server/NuKeeperTCPHandler.cpp index f25ca4a42ce..081821504d3 100644 --- a/src/Server/NuKeeperTCPHandler.cpp +++ b/src/Server/NuKeeperTCPHandler.cpp @@ -40,7 +40,7 @@ namespace ErrorCodes struct PollResult { - size_t ready_responses_count{0}; + bool has_response{false}; bool has_requests{false}; bool error{false}; }; @@ -92,8 +92,22 @@ struct SocketInterruptablePollWrapper return pipe.fds_rw[1]; } - PollResult poll(Poco::Timespan remaining_time) + PollResult poll(Poco::Timespan remaining_time, const std::shared_ptr & in) { + PollResult result{}; + if (response_in.available() != 0) + { + UInt8 dummy; + readIntBinary(dummy, response_in); + result.has_response = true; + } + + if (in->available() != 0) + result.has_requests = true; + + if (result.has_response) + return result; + std::array outputs = {-1, -1}; #if defined(POCO_HAVE_FD_EPOLL) int rc; @@ -148,7 +162,6 @@ struct SocketInterruptablePollWrapper outputs[1] = pipe.fds_rw[0]; #endif - PollResult result{}; if (rc < 0) { result.error = true; @@ -169,16 +182,8 @@ struct SocketInterruptablePollWrapper else { UInt8 dummy; - do - { - /// All ready responses stored in responses queue, - /// but we have to count amount of ready responses in pipe - /// and process them only. Otherwise states of response_in - /// and response queue will be inconsistent and race condition is possible. 
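The comments above state the invariant that both the old and the new code rely on: every ready response is paired with exactly one byte written to the signaling pipe, so reading one byte licenses popping exactly one response. A self-contained sketch of that pairing (illustrative names and data, not the actual ClickHouse classes):

#include <unistd.h>
#include <cstdint>
#include <cstdio>
#include <queue>
#include <string>

int main()
{
    int fds[2];
    if (pipe(fds) != 0)
        return 1;

    std::queue<std::string> responses;

    /// Writer side: enqueue the response first, then signal with a single byte.
    responses.push("watch-response");
    uint8_t single_byte = 0;
    if (write(fds[1], &single_byte, 1) != 1)
        return 1;

    /// Reader side: one byte read means exactly one response may be popped.
    uint8_t dummy;
    if (read(fds[0], &dummy, 1) == 1)
    {
        printf("got response: %s\n", responses.front().c_str());
        responses.pop();
    }

    close(fds[0]);
    close(fds[1]);
    return 0;
}

Consuming more bytes than responses popped, or the other way around, is exactly the inconsistency the removed comment warns about.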
- readIntBinary(dummy, response_in); - result.ready_responses_count++; - } - while (response_in.available()); + readIntBinary(dummy, response_in); + result.has_response = true; } } } @@ -339,42 +344,32 @@ void NuKeeperTCPHandler::runImpl() { using namespace std::chrono_literals; - PollResult result = poll_wrapper->poll(session_timeout); + PollResult result = poll_wrapper->poll(session_timeout, in); if (result.has_requests && !close_received) { - size_t requests_read = 0; - do + auto [received_op, received_xid] = receiveRequest(); + + if (received_op == Coordination::OpNum::Close) { - auto [received_op, received_xid] = receiveRequest(); - - if (received_op == Coordination::OpNum::Close) - { - LOG_DEBUG(log, "Received close event with xid {} for session id #{}", received_xid, session_id); - close_xid = received_xid; - close_received = true; - break; - } - else if (received_op == Coordination::OpNum::Heartbeat) - { - LOG_TRACE(log, "Received heartbeat for session #{}", session_id); - session_stopwatch.restart(); - } - - if (requests_read > 50) - break; - requests_read++; + LOG_DEBUG(log, "Received close event with xid {} for session id #{}", received_xid, session_id); + close_xid = received_xid; + close_received = true; + } + else if (received_op == Coordination::OpNum::Heartbeat) + { + LOG_TRACE(log, "Received heartbeat for session #{}", session_id); + session_stopwatch.restart(); } - while (in->available()); } /// Process exact amount of responses from pipe /// otherwise state of responses queue and signaling pipe /// became inconsistent and race condition is possible. - while (result.ready_responses_count != 0) + if (result.has_response) { Coordination::ZooKeeperResponsePtr response; if (!responses->tryPop(response)) - throw Exception(ErrorCodes::LOGICAL_ERROR, "We must have at least {} ready responses, but queue is empty. It's a bug.", result.ready_responses_count); + throw Exception(ErrorCodes::LOGICAL_ERROR, "We must have ready response, but queue is empty. 
It's a bug."); if (response->xid == close_xid) { @@ -388,7 +383,6 @@ void NuKeeperTCPHandler::runImpl() nu_keeper_storage_dispatcher->finishSession(session_id); return; } - result.ready_responses_count--; } if (result.error) From df1cf481cf118283c4d9b6afc6eaa419c5834d71 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Fri, 19 Feb 2021 14:14:31 +0300 Subject: [PATCH 0474/2357] Update 01731_async_task_queue_wait.sh --- tests/queries/0_stateless/01731_async_task_queue_wait.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/01731_async_task_queue_wait.sh b/tests/queries/0_stateless/01731_async_task_queue_wait.sh index 7545ad1e81a..936f850791d 100755 --- a/tests/queries/0_stateless/01731_async_task_queue_wait.sh +++ b/tests/queries/0_stateless/01731_async_task_queue_wait.sh @@ -7,4 +7,4 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # regression for 'Empty task was returned from async task queue' during query # cancellation with async_socket_for_remote=1 (that ignores # max_distributed_connections) -$(timeout --signal=SIGINT 1 clickhouse client --max_distributed_connections=1 --max_block_size=2 --interactive_delay=900000 -q "select x from remote('127.{2,3}', view(select number + sleep(0.3) as x from numbers(16))) settings max_block_size = 2") 2>&1 | grep "Empty task was returned from async task queue" +$(timeout --signal=SIGINT 1 ${CLICKHOUSE_CLIENT} --max_distributed_connections=1 --max_block_size=2 --interactive_delay=900000 -q "select x from remote('127.{2,3}', view(select number + sleep(0.3) as x from numbers(16))) settings max_block_size = 2") 2>&1 | grep "Empty task was returned from async task queue" From 4c36cd17373ee373029397d5da8d50c7616689b2 Mon Sep 17 00:00:00 2001 From: vdimir Date: Sun, 31 Jan 2021 22:25:47 +0300 Subject: [PATCH 0475/2357] Add converting step for 'join using' --- contrib/hyperscan | 2 +- src/Interpreters/ExpressionAnalyzer.cpp | 54 ++-- src/Interpreters/ExpressionAnalyzer.h | 13 +- src/Interpreters/InterpreterSelectQuery.cpp | 5 +- src/Interpreters/SubqueryForSet.cpp | 14 +- src/Interpreters/SubqueryForSet.h | 2 +- src/Interpreters/join_common.cpp | 78 ++++++ src/Interpreters/join_common.h | 7 + .../01674_join_implicit_cast.reference | 236 ++++++++++++++++++ .../0_stateless/01674_join_implicit_cast.sql | 128 ++++++++++ 10 files changed, 509 insertions(+), 30 deletions(-) create mode 100644 tests/queries/0_stateless/01674_join_implicit_cast.reference create mode 100644 tests/queries/0_stateless/01674_join_implicit_cast.sql diff --git a/contrib/hyperscan b/contrib/hyperscan index e9f08df0213..3907fd00ee8 160000 --- a/contrib/hyperscan +++ b/contrib/hyperscan @@ -1 +1 @@ -Subproject commit e9f08df0213fc637aac0a5bbde9beeaeba2fe9fa +Subproject commit 3907fd00ee8b2538739768fa9533f8635a276531 diff --git a/src/Interpreters/ExpressionAnalyzer.cpp b/src/Interpreters/ExpressionAnalyzer.cpp index 660718549b3..057be7467f0 100644 --- a/src/Interpreters/ExpressionAnalyzer.cpp +++ b/src/Interpreters/ExpressionAnalyzer.cpp @@ -42,14 +42,13 @@ #include #include -#include -#include -#include #include - -#include #include +#include +#include +#include +#include #include #include @@ -714,23 +713,32 @@ ArrayJoinActionPtr SelectQueryExpressionAnalyzer::appendArrayJoin(ExpressionActi return array_join; } -bool SelectQueryExpressionAnalyzer::appendJoinLeftKeys(ExpressionActionsChain & chain, bool only_types) +bool SelectQueryExpressionAnalyzer::appendJoinLeftKeys(ExpressionActionsChain & chain, bool only_types, Block & block) { 
ExpressionActionsChain::Step & step = chain.lastStep(columns_after_array_join); getRootActions(analyzedJoin().leftKeysList(), only_types, step.actions()); + ExpressionActionsPtr actions = std::make_shared(step.actions()); + actions->execute(block); return true; } -JoinPtr SelectQueryExpressionAnalyzer::appendJoin(ExpressionActionsChain & chain) +JoinPtr +SelectQueryExpressionAnalyzer::appendJoin(ExpressionActionsChain & chain, const Block & sample_block, ActionsDAGPtr & before_join_dag) { - JoinPtr table_join = makeTableJoin(*syntax->ast_join); + JoinCommon::JoinConvertActions converting_actions; + JoinPtr table_join = makeTableJoin(*syntax->ast_join, sample_block, converting_actions); + + if (converting_actions.first) + { + before_join_dag = ActionsDAG::merge(std::move(*before_join_dag->clone()), std::move(*converting_actions.first->clone())); + + chain.steps.push_back(std::make_unique(converting_actions.first)); + chain.addStep(); + } ExpressionActionsChain::Step & step = chain.lastStep(columns_after_array_join); - - chain.steps.push_back(std::make_unique( - syntax->analyzed_join, table_join, step.getResultColumns())); - + chain.steps.push_back(std::make_unique(syntax->analyzed_join, table_join, step.getResultColumns())); chain.addStep(); return table_join; } @@ -795,7 +803,9 @@ static std::shared_ptr makeJoin(std::shared_ptr analyzed_join, return std::make_shared(analyzed_join, sample_block); } -JoinPtr SelectQueryExpressionAnalyzer::makeTableJoin(const ASTTablesInSelectQueryElement & join_element) +JoinPtr SelectQueryExpressionAnalyzer::makeTableJoin(const ASTTablesInSelectQueryElement & join_element, + const Block & left_sample_block, + JoinCommon::JoinConvertActions & converting_actions) { /// Two JOINs are not supported with the same subquery, but different USINGs. auto join_hash = join_element.getTreeHash(); @@ -831,7 +841,17 @@ JoinPtr SelectQueryExpressionAnalyzer::makeTableJoin(const ASTTablesInSelectQuer } /// TODO You do not need to set this up when JOIN is only needed on remote servers. - subquery_for_join.setJoinActions(joined_block_actions); /// changes subquery_for_join.sample_block inside + subquery_for_join.addJoinActions(joined_block_actions); /// changes subquery_for_join.sample_block inside + + const Block & right_sample_block = subquery_for_join.sample_block; + bool has_using = syntax->analyzed_join->hasUsing(); + converting_actions = JoinCommon::columnsNeedConvert( + left_sample_block, syntax->analyzed_join->keyNamesLeft(), + right_sample_block, syntax->analyzed_join->keyNamesRight(), + has_using); + if (converting_actions.second) + subquery_for_join.addJoinActions(std::make_shared(converting_actions.second)); + subquery_for_join.join = makeJoin(syntax->analyzed_join, subquery_for_join.sample_block, context); /// Do not make subquery for join over dictionary. 
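A self-contained illustration (plain C++ with invented data, not ClickHouse code) of what the extra conversion step buys: joining a UInt16 key column against an Int16 one is only safe once both sides are widened to a common type, otherwise 65535 and -1 share a bit pattern and would falsely match.

#include <cstdint>
#include <cstdio>
#include <map>
#include <vector>

int main()
{
    std::vector<uint16_t> left_keys = {1, 2, 3, 65535};
    std::vector<int16_t> right_keys = {-1, 1, 2, 3};

    /// Build the right-side index on the widened key type.
    std::map<int64_t, int16_t> right_index;
    for (int16_t k : right_keys)
        right_index[static_cast<int64_t>(k)] = k;

    /// Probe with the left keys widened to the same type: 65535 must not
    /// collide with -1, which is what would happen if both sides were forced into int16_t.
    for (uint16_t k : left_keys)
    {
        auto it = right_index.find(static_cast<int64_t>(k));
        printf("%u -> %s\n", static_cast<unsigned>(k), it != right_index.end() ? "matched" : "no match");
    }
    return 0;
}

This is the same reason the patch casts both key columns to their least common supertype instead of reusing either side's type as-is.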
@@ -1425,10 +1445,10 @@ ExpressionAnalysisResult::ExpressionAnalysisResult( if (query_analyzer.hasTableJoin()) { - query_analyzer.appendJoinLeftKeys(chain, only_types || !first_stage); - + Block left_block_sample = source_header; + query_analyzer.appendJoinLeftKeys(chain, only_types || !first_stage, left_block_sample); before_join = chain.getLastActions(); - join = query_analyzer.appendJoin(chain); + join = query_analyzer.appendJoin(chain, left_block_sample, before_join); chain.addStep(); } diff --git a/src/Interpreters/ExpressionAnalyzer.h b/src/Interpreters/ExpressionAnalyzer.h index 319be9c1409..13561d128d4 100644 --- a/src/Interpreters/ExpressionAnalyzer.h +++ b/src/Interpreters/ExpressionAnalyzer.h @@ -1,15 +1,17 @@ #pragma once -#include #include +#include #include #include #include +#include #include +#include +#include #include #include #include -#include namespace DB { @@ -313,7 +315,8 @@ private: /// Create Set-s that we make from IN section to use index on them. void makeSetsForIndex(const ASTPtr & node); - JoinPtr makeTableJoin(const ASTTablesInSelectQueryElement & join_element); + JoinPtr makeTableJoin(const ASTTablesInSelectQueryElement & join_element, const Block & left_sample_block, + JoinCommon::JoinConvertActions & converting_actions); const ASTSelectQuery * getAggregatingQuery() const; @@ -333,8 +336,8 @@ private: /// Before aggregation: ArrayJoinActionPtr appendArrayJoin(ExpressionActionsChain & chain, ActionsDAGPtr & before_array_join, bool only_types); - bool appendJoinLeftKeys(ExpressionActionsChain & chain, bool only_types); - JoinPtr appendJoin(ExpressionActionsChain & chain); + bool appendJoinLeftKeys(ExpressionActionsChain & chain, bool only_types, Block & block); + JoinPtr appendJoin(ExpressionActionsChain & chain, const Block & sample_block, ActionsDAGPtr & before_join_dag); /// Add preliminary rows filtration. Actions are created in other expression analyzer to prevent any possible alias injection. 
void appendPreliminaryFilter(ExpressionActionsChain & chain, ActionsDAGPtr actions_dag, String column_name); /// remove_filter is set in ExpressionActionsChain::finalize(); diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index 9f97160f77f..47addcc8d6c 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -994,12 +994,8 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu if (expressions.hasJoin()) { - Block join_result_sample; JoinPtr join = expressions.join; - join_result_sample = JoiningTransform::transformHeader( - query_plan.getCurrentDataStream().header, expressions.join); - QueryPlanStepPtr join_step = std::make_unique( query_plan.getCurrentDataStream(), expressions.join); @@ -1009,6 +1005,7 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu if (expressions.join_has_delayed_stream) { + const Block & join_result_sample = query_plan.getCurrentDataStream().header; auto stream = std::make_shared(*join, join_result_sample, settings.max_block_size); auto source = std::make_shared(std::move(stream)); auto add_non_joined_rows_step = std::make_unique( diff --git a/src/Interpreters/SubqueryForSet.cpp b/src/Interpreters/SubqueryForSet.cpp index 6ca0ecc50c8..c81b7a710ae 100644 --- a/src/Interpreters/SubqueryForSet.cpp +++ b/src/Interpreters/SubqueryForSet.cpp @@ -39,10 +39,20 @@ void SubqueryForSet::renameColumns(Block & block) } } -void SubqueryForSet::setJoinActions(ExpressionActionsPtr actions) +void SubqueryForSet::addJoinActions(ExpressionActionsPtr actions) { actions->execute(sample_block); - joined_block_actions = actions; + if (joined_block_actions == nullptr) + { + joined_block_actions = actions; + } + else + { + auto new_dag = ActionsDAG::merge( + std::move(*joined_block_actions->getActionsDAG().clone()), + std::move(*actions->getActionsDAG().clone())); + joined_block_actions = std::make_shared(new_dag); + } } bool SubqueryForSet::insertJoinedBlock(Block & block) diff --git a/src/Interpreters/SubqueryForSet.h b/src/Interpreters/SubqueryForSet.h index fd073500dc2..a42bf296d6c 100644 --- a/src/Interpreters/SubqueryForSet.h +++ b/src/Interpreters/SubqueryForSet.h @@ -40,7 +40,7 @@ struct SubqueryForSet void makeSource(std::shared_ptr & interpreter, NamesWithAliases && joined_block_aliases_); - void setJoinActions(ExpressionActionsPtr actions); + void addJoinActions(ExpressionActionsPtr actions); bool insertJoinedBlock(Block & block); void setTotals(Block totals); diff --git a/src/Interpreters/join_common.cpp b/src/Interpreters/join_common.cpp index a4c39a45efa..17ff5666ec2 100644 --- a/src/Interpreters/join_common.cpp +++ b/src/Interpreters/join_common.cpp @@ -1,9 +1,11 @@ #include #include +#include #include #include #include #include +#include #include #include @@ -283,6 +285,82 @@ void addDefaultValues(IColumn & column, const DataTypePtr & type, size_t count) type->insertDefaultInto(column); } +bool typesEqualUpToNullability(DataTypePtr left_type, DataTypePtr right_type) +{ + DataTypePtr left_type_strict = removeNullable(recursiveRemoveLowCardinality(left_type)); + DataTypePtr right_type_strict = removeNullable(recursiveRemoveLowCardinality(right_type)); + return left_type_strict->equals(*right_type_strict); +} + +JoinConvertActions columnsNeedConvert(const Block & left_block, const Names & left_keys, + const Block & right_block, const Names & right_keys, + bool has_using) +{ + assert(left_keys.size() == 
right_keys.size()); + + /// only JOIN USING supported + if (!has_using) + return {}; + + Block left_block_dst = left_block; + Block right_block_dst = right_block; + + std::unordered_set visited_left; + std::unordered_set visited_right; + bool any_need_cast = false; + for (size_t i = 0; i < left_keys.size(); ++i) + { + if (visited_left.contains(left_keys[i]) || visited_right.contains(right_keys[i])) + { + /// if one column joined with multiple different others do not perform conversion + /// e.g. `JOIN ... ON t1.a == t2.a AND t1.a == t2.b` + return {}; + } + visited_left.insert(left_keys[i]); + visited_right.insert(right_keys[i]); + + DataTypePtr ltype = left_block.getByName(left_keys[i]).type; + DataTypePtr rtype = right_block.getByName(right_keys[i]).type; + + if (typesEqualUpToNullability(ltype, rtype)) + continue; + + any_need_cast = true; + DataTypePtr supertype; + try + { + supertype = DB::getLeastSupertype({ltype, rtype}); + } + catch (DB::Exception &) + { + throw Exception("Type mismatch of columns to JOIN by: " + + left_keys[i] + ": " + ltype->getName() + " at left, " + + right_keys[i] + ": " + rtype->getName() + " at right", + ErrorCodes::TYPE_MISMATCH); + } + auto & lcol_dst = left_block_dst.getByName(left_keys[i]); + auto & rcol_dst = right_block_dst.getByName(right_keys[i]); + lcol_dst.column = rcol_dst.column = nullptr; + lcol_dst.type = rcol_dst.type = supertype; + } + + if (!any_need_cast) + return {}; + + auto convert_left_actions_dag = ActionsDAG::makeConvertingActions( + left_block.getColumnsWithTypeAndName(), + left_block_dst.getColumnsWithTypeAndName(), + ActionsDAG::MatchColumnsMode::Name, + true); + auto convert_right_actions_dag = ActionsDAG::makeConvertingActions( + right_block.getColumnsWithTypeAndName(), + right_block_dst.getColumnsWithTypeAndName(), + ActionsDAG::MatchColumnsMode::Name, + true); + + return std::make_pair(convert_left_actions_dag, convert_right_actions_dag); +} + } diff --git a/src/Interpreters/join_common.h b/src/Interpreters/join_common.h index 6f9f7dd1210..3115b4ae2d2 100644 --- a/src/Interpreters/join_common.h +++ b/src/Interpreters/join_common.h @@ -2,6 +2,8 @@ #include #include +#include +#include namespace DB { @@ -14,6 +16,8 @@ using ColumnRawPtrs = std::vector; namespace JoinCommon { +using JoinConvertActions = std::pair; + void convertColumnToNullable(ColumnWithTypeAndName & column, bool low_card_nullability = false); void convertColumnsToNullable(Block & block, size_t starting_pos = 0); void removeColumnNullability(ColumnWithTypeAndName & column); @@ -36,6 +40,9 @@ void joinTotals(const Block & totals, const Block & columns_to_add, const Names void addDefaultValues(IColumn & column, const DataTypePtr & type, size_t count); +JoinConvertActions columnsNeedConvert(const Block & left_block, const Names & left_keys, + const Block & right_block, const Names & right_keys, + bool has_using); } /// Creates result from right table data in RIGHT and FULL JOIN when keys are not present in left table. 
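A quick SQL-level illustration of what the converting actions above provide may help before the test file: for JOIN ... USING, each pair of key columns is cast to its least common supertype before the join, while JOIN ... ON keeps the original types and still rejects mismatched ones. This is a hypothetical standalone sketch (table names l and r are made up, and it assumes this patch is applied); the expected types and the error code mirror the assertions in the 01674_join_implicit_cast test added just below.

-- UInt16 and Int16 keys are promoted to their common supertype Int32 under USING
CREATE TABLE l (a UInt16) ENGINE = TinyLog;
CREATE TABLE r (a Int16) ENGINE = TinyLog;
INSERT INTO l VALUES (1);
INSERT INTO r VALUES (1);
SELECT toTypeName(a) FROM l INNER JOIN r USING (a); -- returns 'Int32'
-- The same join written with ON performs no conversion and fails with a type mismatch
SELECT * FROM l INNER JOIN r ON l.a = r.a; -- { serverError 53 }
DROP TABLE l;
DROP TABLE r;
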
diff --git a/tests/queries/0_stateless/01674_join_implicit_cast.reference b/tests/queries/0_stateless/01674_join_implicit_cast.reference new file mode 100644 index 00000000000..644e7a3aa3b --- /dev/null +++ b/tests/queries/0_stateless/01674_join_implicit_cast.reference @@ -0,0 +1,236 @@ +--- hash --- +- full - +-4 0 196 +-3 0 197 +-2 0 198 +-1 0 199 +0 0 200 +1 101 201 +2 102 202 +3 103 203 +4 104 204 +5 105 205 +6 106 \N +7 107 \N +8 108 \N +9 109 \N +10 110 \N +- left - +1 101 201 +2 102 202 +3 103 203 +4 104 204 +5 105 205 +6 106 \N +7 107 \N +8 108 \N +9 109 \N +10 110 \N +- right - +-4 0 196 +-3 0 197 +-2 0 198 +-1 0 199 +0 0 200 +1 101 201 +2 102 202 +3 103 203 +4 104 204 +5 105 205 +- inner - +1 101 201 +2 102 202 +3 103 203 +4 104 204 +5 105 205 +- full - +0 0 -4 +0 0 -3 +0 0 -2 +0 0 -1 +0 0 0 +1 1 1 +2 2 2 +3 3 3 +4 4 4 +5 5 5 +6 6 0 +7 7 0 +8 8 0 +9 9 0 +10 10 0 +- left - +1 1 1 +2 2 2 +3 3 3 +4 4 4 +5 5 5 +6 6 0 +7 7 0 +8 8 0 +9 9 0 +10 10 0 +- right - +0 0 -4 +0 0 -3 +0 0 -2 +0 0 -1 +0 0 0 +1 1 1 +2 2 2 +3 3 3 +4 4 4 +5 5 5 +- inner - +1 1 1 +2 2 2 +3 3 3 +4 4 4 +5 5 5 +- types - +1 +1 +1 +1 +--- partial_merge --- +- full - +-4 0 196 +-3 0 197 +-2 0 198 +-1 0 199 +0 0 200 +1 101 201 +2 102 202 +3 103 203 +4 104 204 +5 105 205 +6 106 \N +7 107 \N +8 108 \N +9 109 \N +10 110 \N +- left - +1 101 201 +2 102 202 +3 103 203 +4 104 204 +5 105 205 +6 106 \N +7 107 \N +8 108 \N +9 109 \N +10 110 \N +- right - +-4 0 196 +-3 0 197 +-2 0 198 +-1 0 199 +0 0 200 +1 101 201 +2 102 202 +3 103 203 +4 104 204 +5 105 205 +- inner - +1 101 201 +2 102 202 +3 103 203 +4 104 204 +5 105 205 +- full - +0 0 -4 +0 0 -3 +0 0 -2 +0 0 -1 +0 0 0 +1 1 1 +2 2 2 +3 3 3 +4 4 4 +5 5 5 +6 6 0 +7 7 0 +8 8 0 +9 9 0 +10 10 0 +- left - +1 1 1 +2 2 2 +3 3 3 +4 4 4 +5 5 5 +6 6 0 +7 7 0 +8 8 0 +9 9 0 +10 10 0 +- right - +0 0 -4 +0 0 -3 +0 0 -2 +0 0 -1 +0 0 0 +1 1 1 +2 2 2 +3 3 3 +4 4 4 +5 5 5 +- inner - +1 1 1 +2 2 2 +3 3 3 +4 4 4 +5 5 5 +- types - +1 +1 +1 +1 +--- hash --- +- full - +1 1 +2 2 +-1 1 +1 \N +1 257 +1 -1 +- left - +1 1 +2 2 +- right - +1 1 +-1 1 +1 \N +1 257 +1 -1 +- inner - +1 1 +- types - +1 +1 +1 +1 +--- partial_merge --- +- full - +1 1 +2 2 +-1 1 +1 \N +1 257 +1 -1 +- left - +1 1 +2 2 +- right - +1 1 +-1 1 +1 \N +1 257 +1 -1 +- inner - +1 1 +- types - +1 +1 +1 +1 diff --git a/tests/queries/0_stateless/01674_join_implicit_cast.sql b/tests/queries/0_stateless/01674_join_implicit_cast.sql new file mode 100644 index 00000000000..45bffbf1808 --- /dev/null +++ b/tests/queries/0_stateless/01674_join_implicit_cast.sql @@ -0,0 +1,128 @@ +CREATE DATABASE IF NOT EXISTS test_01655; +USE test_01655; + +DROP TABLE IF EXISTS t1; +DROP TABLE IF EXISTS t2; + +CREATE TABLE t1 (a UInt16, b UInt16) ENGINE = TinyLog; +CREATE TABLE t2 (a Int16, b Nullable(Int64)) ENGINE = TinyLog; + +INSERT INTO t1 SELECT number as a, 100 + number as b FROM system.numbers LIMIT 1, 10; +INSERT INTO t2 SELECT number - 5 as a, 200 + number - 5 as b FROM system.numbers LIMIT 1, 10; + +SELECT '--- hash ---'; +SET join_algorithm = 'hash'; + +SELECT '- full -'; +SELECT a, b, t2.b FROM t1 FULL JOIN t2 USING (a) ORDER BY (a); +SELECT '- left -'; +SELECT a, b, t2.b FROM t1 LEFT JOIN t2 USING (a) ORDER BY (a); +SELECT '- right -'; +SELECT a, b, t2.b FROM t1 RIGHT JOIN t2 USING (a) ORDER BY (a); +SELECT '- inner -'; +SELECT a, b, t2.b FROM t1 INNER JOIN t2 USING (a) ORDER BY (a); + +SELECT '- full -'; +SELECT a, t1.a, t2.a FROM t1 FULL JOIN t2 USING (a) ORDER BY (t1.a, t2.a); +SELECT '- left -'; +SELECT a, t1.a, t2.a FROM t1 LEFT JOIN t2 USING (a) ORDER 
BY (t1.a, t2.a); +SELECT '- right -'; +SELECT a, t1.a, t2.a FROM t1 RIGHT JOIN t2 USING (a) ORDER BY (t1.a, t2.a); +SELECT '- inner -'; +SELECT a, t1.a, t2.a FROM t1 INNER JOIN t2 USING (a) ORDER BY (t1.a, t2.a); + +SELECT '- types -'; +SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 FULL JOIN t2 USING (a); +SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 LEFT JOIN t2 USING (a); +SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 RIGHT JOIN t2 USING (a); +SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 INNER JOIN t2 USING (a); + +SELECT * FROM t1 FULL JOIN t2 ON (t1.a == t2.a) ORDER BY (a); -- { serverError 53 } +SELECT * FROM t1 LEFT JOIN t2 ON(t1.a == t2.a) ORDER BY (a); -- { serverError 53 } +SELECT * FROM t1 RIGHT JOIN t2 ON (t1.a == t2.a) ORDER BY (a); -- { serverError 53 } +SELECT * FROM t1 INNER JOIN t2 ON (t1.a == t2.a) ORDER BY (a); -- { serverError 53 } + +SELECT '--- partial_merge ---'; + +SET join_algorithm = 'partial_merge'; + +SELECT '- full -'; +SELECT a, b, t2.b FROM t1 FULL JOIN t2 USING (a) ORDER BY (a); +SELECT '- left -'; +SELECT a, b, t2.b FROM t1 LEFT JOIN t2 USING (a) ORDER BY (a); +SELECT '- right -'; +SELECT a, b, t2.b FROM t1 RIGHT JOIN t2 USING (a) ORDER BY (a); +SELECT '- inner -'; +SELECT a, b, t2.b FROM t1 INNER JOIN t2 USING (a) ORDER BY (a); + + +SELECT '- full -'; +SELECT a, t1.a, t2.a FROM t1 FULL JOIN t2 USING (a) ORDER BY (t1.a, t2.a); +SELECT '- left -'; +SELECT a, t1.a, t2.a FROM t1 LEFT JOIN t2 USING (a) ORDER BY (t1.a, t2.a); +SELECT '- right -'; +SELECT a, t1.a, t2.a FROM t1 RIGHT JOIN t2 USING (a) ORDER BY (t1.a, t2.a); +SELECT '- inner -'; +SELECT a, t1.a, t2.a FROM t1 INNER JOIN t2 USING (a) ORDER BY (t1.a, t2.a); + +SELECT '- types -'; +SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 FULL JOIN t2 USING (a); +SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 LEFT JOIN t2 USING (a); +SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 RIGHT JOIN t2 USING (a); +SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 INNER JOIN t2 USING (a); + +SELECT * FROM t1 FULL JOIN t2 ON (t1.a == t2.a) ORDER BY (a); -- { serverError 53 } +SELECT * FROM t1 LEFT JOIN t2 ON(t1.a == t2.a) ORDER BY (a); -- { serverError 53 } +SELECT * FROM t1 RIGHT JOIN t2 ON (t1.a == t2.a) ORDER BY (a); -- { serverError 53 } +SELECT * FROM t1 INNER JOIN t2 ON (t1.a == t2.a) ORDER BY (a); -- { serverError 53 } + +DROP TABLE IF EXISTS t1; +DROP TABLE IF EXISTS t2; + +CREATE TABLE t1 (id Nullable(Int32), a UInt16, b UInt8) ENGINE = TinyLog; +CREATE TABLE t2 (id Nullable(Int32), a Int16, b Nullable(Int64)) ENGINE = TinyLog; +INSERT INTO t1 VALUES (0, 1, 1), (1, 2, 2); +INSERT INTO t2 VALUES (2, -1, 1), (3, 1, NULL), (4, 1, 257), (5, 1, -1), (6, 1, 1); + +SELECT '--- hash ---'; + +SELECT '- full -'; +SELECT a, b FROM t1 FULL JOIN t2 USING (a, b) ORDER BY ifNull(t1.id, t2.id); +SELECT '- left -'; +SELECT a, b FROM t1 LEFT JOIN t2 USING (a, b) ORDER BY ifNull(t1.id, t2.id); +SELECT '- right -'; +SELECT a, b FROM t1 RIGHT JOIN t2 USING (a, b) ORDER BY ifNull(t1.id, t2.id); +SELECT '- inner -'; +SELECT a, b FROM t1 INNER JOIN t2 USING (a, b) ORDER BY ifNull(t1.id, t2.id); + +SELECT '- types -'; + +SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(b)) == 'Nullable(Int64)' FROM t1 FULL JOIN t2 USING (a, b); +SELECT any(toTypeName(a)) 
== 'Int32' AND any(toTypeName(b)) == 'Nullable(Int64)' FROM t1 LEFT JOIN t2 USING (a, b); +SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(b)) == 'Nullable(Int64)' FROM t1 RIGHT JOIN t2 USING (a, b); +SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(b)) == 'Nullable(Int64)' FROM t1 INNER JOIN t2 USING (a, b); + +SELECT '--- partial_merge ---'; + +SET join_algorithm = 'partial_merge'; + +SELECT '- full -'; +SELECT a, b FROM t1 FULL JOIN t2 USING (a, b) ORDER BY ifNull(t1.id, t2.id); +SELECT '- left -'; +SELECT a, b FROM t1 LEFT JOIN t2 USING (a, b) ORDER BY ifNull(t1.id, t2.id); +SELECT '- right -'; +SELECT a, b FROM t1 RIGHT JOIN t2 USING (a, b) ORDER BY ifNull(t1.id, t2.id); +SELECT '- inner -'; +SELECT a, b FROM t1 INNER JOIN t2 USING (a, b) ORDER BY ifNull(t1.id, t2.id); + +SELECT '- types -'; + +SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(b)) == 'Nullable(Int64)' FROM t1 FULL JOIN t2 USING (a, b); +SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(b)) == 'Nullable(Int64)' FROM t1 LEFT JOIN t2 USING (a, b); +SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(b)) == 'Nullable(Int64)' FROM t1 RIGHT JOIN t2 USING (a, b); +SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(b)) == 'Nullable(Int64)' FROM t1 INNER JOIN t2 USING (a, b); + +DROP TABLE IF EXISTS t1; +DROP TABLE IF EXISTS t2; + +DROP DATABASE IF EXISTS test_01655; From f5b98015a800ab8f0db163d1e264470d767c46d0 Mon Sep 17 00:00:00 2001 From: vdimir Date: Mon, 1 Feb 2021 16:53:54 +0300 Subject: [PATCH 0476/2357] Fix converting right key type in join using --- src/Interpreters/ExpressionAnalyzer.cpp | 13 ++--- src/Interpreters/TableJoin.cpp | 6 +++ src/Interpreters/TableJoin.h | 6 ++- src/Interpreters/join_common.cpp | 11 ++-- src/Interpreters/join_common.h | 16 ++++-- .../0_stateless/01674_join_implicit_cast.sql | 50 +++++++++---------- 6 files changed, 64 insertions(+), 38 deletions(-) diff --git a/src/Interpreters/ExpressionAnalyzer.cpp b/src/Interpreters/ExpressionAnalyzer.cpp index 057be7467f0..e29eb5fbe69 100644 --- a/src/Interpreters/ExpressionAnalyzer.cpp +++ b/src/Interpreters/ExpressionAnalyzer.cpp @@ -728,12 +728,13 @@ SelectQueryExpressionAnalyzer::appendJoin(ExpressionActionsChain & chain, const { JoinCommon::JoinConvertActions converting_actions; JoinPtr table_join = makeTableJoin(*syntax->ast_join, sample_block, converting_actions); - - if (converting_actions.first) + if (converting_actions.needConvert()) { - before_join_dag = ActionsDAG::merge(std::move(*before_join_dag->clone()), std::move(*converting_actions.first->clone())); + syntax->analyzed_join->setConvertedRightType(converting_actions.right_target_types); - chain.steps.push_back(std::make_unique(converting_actions.first)); + before_join_dag = ActionsDAG::merge(std::move(*before_join_dag->clone()), std::move(*converting_actions.left_actions->clone())); + + chain.steps.push_back(std::make_unique(converting_actions.left_actions)); chain.addStep(); } @@ -849,8 +850,8 @@ JoinPtr SelectQueryExpressionAnalyzer::makeTableJoin(const ASTTablesInSelectQuer left_sample_block, syntax->analyzed_join->keyNamesLeft(), right_sample_block, syntax->analyzed_join->keyNamesRight(), has_using); - if (converting_actions.second) - subquery_for_join.addJoinActions(std::make_shared(converting_actions.second)); + if (converting_actions.needConvert()) + subquery_for_join.addJoinActions(std::make_shared(converting_actions.right_actions)); subquery_for_join.join = makeJoin(syntax->analyzed_join, subquery_for_join.sample_block, context); diff --git 
a/src/Interpreters/TableJoin.cpp b/src/Interpreters/TableJoin.cpp index c1777711d9e..aee38dbc322 100644 --- a/src/Interpreters/TableJoin.cpp +++ b/src/Interpreters/TableJoin.cpp @@ -241,9 +241,15 @@ void TableJoin::addJoinedColumnsAndCorrectNullability(ColumnsWithTypeAndName & c } } + std::unordered_map type_map; + for (const auto & [name, type] : converted_right_types) + type_map[name] = type; + for (const auto & col : columns_added_by_join) { auto res_type = col.type; + if (const auto it = type_map.find(col.name); it != type_map.end()) + res_type = it->second; if (rightBecomeNullable(res_type)) res_type = makeNullable(res_type); diff --git a/src/Interpreters/TableJoin.h b/src/Interpreters/TableJoin.h index 9dcbc30f07b..8b2c23c206b 100644 --- a/src/Interpreters/TableJoin.h +++ b/src/Interpreters/TableJoin.h @@ -68,10 +68,12 @@ class TableJoin NamesAndTypesList columns_from_joined_table; /// Columns will be added to block by JOIN. It's a subset of columns_from_joined_table with corrected Nullability NamesAndTypesList columns_added_by_join; + /// Columns from right table that requres type conversion + NamesAndTypesList converted_right_types; /// Name -> original name. Names are the same as in columns_from_joined_table list. std::unordered_map original_names; - /// Original name -> name. Only ranamed columns. + /// Original name -> name. Only renamed columns. std::unordered_map renames; VolumePtr tmp_volume; @@ -124,6 +126,7 @@ public: bool hasUsing() const { return table_join.using_expression_list != nullptr; } bool hasOn() const { return table_join.on_expression != nullptr; } + bool hasJoinedStorage() const { return joined_storage != nullptr; } NameSet getQualifiedColumnsSet() const; NamesWithAliases getNamesWithAliases(const NameSet & required_columns) const; @@ -137,6 +140,7 @@ public: bool rightBecomeNullable(const DataTypePtr & column_type) const; void addJoinedColumn(const NameAndTypePair & joined_column); void addJoinedColumnsAndCorrectNullability(ColumnsWithTypeAndName & columns) const; + void setConvertedRightType(NamesAndTypesList columns) { converted_right_types = columns; } void setAsofInequality(ASOF::Inequality inequality) { asof_inequality = inequality; } ASOF::Inequality getAsofInequality() { return asof_inequality; } diff --git a/src/Interpreters/join_common.cpp b/src/Interpreters/join_common.cpp index 17ff5666ec2..10cc637531f 100644 --- a/src/Interpreters/join_common.cpp +++ b/src/Interpreters/join_common.cpp @@ -302,6 +302,8 @@ JoinConvertActions columnsNeedConvert(const Block & left_block, const Names & le if (!has_using) return {}; + JoinConvertActions actions; + Block left_block_dst = left_block; Block right_block_dst = right_block; @@ -338,6 +340,9 @@ JoinConvertActions columnsNeedConvert(const Block & left_block, const Names & le + right_keys[i] + ": " + rtype->getName() + " at right", ErrorCodes::TYPE_MISMATCH); } + actions.left_target_types.emplace_back(left_keys[i], supertype); + actions.right_target_types.emplace_back(right_keys[i], supertype); + auto & lcol_dst = left_block_dst.getByName(left_keys[i]); auto & rcol_dst = right_block_dst.getByName(right_keys[i]); lcol_dst.column = rcol_dst.column = nullptr; @@ -347,18 +352,18 @@ JoinConvertActions columnsNeedConvert(const Block & left_block, const Names & le if (!any_need_cast) return {}; - auto convert_left_actions_dag = ActionsDAG::makeConvertingActions( + actions.left_actions = ActionsDAG::makeConvertingActions( left_block.getColumnsWithTypeAndName(), left_block_dst.getColumnsWithTypeAndName(), 
ActionsDAG::MatchColumnsMode::Name, true); - auto convert_right_actions_dag = ActionsDAG::makeConvertingActions( + actions.right_actions = ActionsDAG::makeConvertingActions( right_block.getColumnsWithTypeAndName(), right_block_dst.getColumnsWithTypeAndName(), ActionsDAG::MatchColumnsMode::Name, true); - return std::make_pair(convert_left_actions_dag, convert_right_actions_dag); + return actions; } } diff --git a/src/Interpreters/join_common.h b/src/Interpreters/join_common.h index 3115b4ae2d2..13df2cc7027 100644 --- a/src/Interpreters/join_common.h +++ b/src/Interpreters/join_common.h @@ -16,7 +16,16 @@ using ColumnRawPtrs = std::vector; namespace JoinCommon { -using JoinConvertActions = std::pair; +struct JoinConvertActions +{ + ActionsDAGPtr left_actions; + ActionsDAGPtr right_actions; + + NamesAndTypesList left_target_types; + NamesAndTypesList right_target_types; + + bool needConvert() const { return left_actions && right_actions; } +}; void convertColumnToNullable(ColumnWithTypeAndName & column, bool low_card_nullability = false); void convertColumnsToNullable(Block & block, size_t starting_pos = 0); @@ -40,9 +49,10 @@ void joinTotals(const Block & totals, const Block & columns_to_add, const Names void addDefaultValues(IColumn & column, const DataTypePtr & type, size_t count); +/// Return converting actions for left and right tables that need to be performed before join JoinConvertActions columnsNeedConvert(const Block & left_block, const Names & left_keys, - const Block & right_block, const Names & right_keys, - bool has_using); + const Block & right_block, const Names & right_keys, + bool has_using); } /// Creates result from right table data in RIGHT and FULL JOIN when keys are not present in left table. diff --git a/tests/queries/0_stateless/01674_join_implicit_cast.sql b/tests/queries/0_stateless/01674_join_implicit_cast.sql index 45bffbf1808..348f1af476b 100644 --- a/tests/queries/0_stateless/01674_join_implicit_cast.sql +++ b/tests/queries/0_stateless/01674_join_implicit_cast.sql @@ -1,5 +1,5 @@ -CREATE DATABASE IF NOT EXISTS test_01655; -USE test_01655; +CREATE DATABASE IF NOT EXISTS test_01674; +USE test_01674; DROP TABLE IF EXISTS t1; DROP TABLE IF EXISTS t2; @@ -79,50 +79,50 @@ SELECT * FROM t1 INNER JOIN t2 ON (t1.a == t2.a) ORDER BY (a); -- { serverError DROP TABLE IF EXISTS t1; DROP TABLE IF EXISTS t2; -CREATE TABLE t1 (id Nullable(Int32), a UInt16, b UInt8) ENGINE = TinyLog; -CREATE TABLE t2 (id Nullable(Int32), a Int16, b Nullable(Int64)) ENGINE = TinyLog; -INSERT INTO t1 VALUES (0, 1, 1), (1, 2, 2); -INSERT INTO t2 VALUES (2, -1, 1), (3, 1, NULL), (4, 1, 257), (5, 1, -1), (6, 1, 1); +CREATE TABLE t_ab1 (id Nullable(Int32), a UInt16, b UInt8) ENGINE = TinyLog; +CREATE TABLE t_ab2 (id Nullable(Int32), a Int16, b Nullable(Int64)) ENGINE = TinyLog; +INSERT INTO t_ab1 VALUES (0, 1, 1), (1, 2, 2); +INSERT INTO t_ab2 VALUES (2, -1, 1), (3, 1, NULL), (4, 1, 257), (5, 1, -1), (6, 1, 1); SELECT '--- hash ---'; SELECT '- full -'; -SELECT a, b FROM t1 FULL JOIN t2 USING (a, b) ORDER BY ifNull(t1.id, t2.id); +SELECT a, b FROM t_ab1 FULL JOIN t_ab2 USING (a, b) ORDER BY ifNull(t_ab1.id, t_ab2.id); SELECT '- left -'; -SELECT a, b FROM t1 LEFT JOIN t2 USING (a, b) ORDER BY ifNull(t1.id, t2.id); +SELECT a, b FROM t_ab1 LEFT JOIN t_ab2 USING (a, b) ORDER BY ifNull(t_ab1.id, t_ab2.id); SELECT '- right -'; -SELECT a, b FROM t1 RIGHT JOIN t2 USING (a, b) ORDER BY ifNull(t1.id, t2.id); +SELECT a, b FROM t_ab1 RIGHT JOIN t_ab2 USING (a, b) ORDER BY ifNull(t_ab1.id, t_ab2.id); SELECT '- 
inner -'; -SELECT a, b FROM t1 INNER JOIN t2 USING (a, b) ORDER BY ifNull(t1.id, t2.id); +SELECT a, b FROM t_ab1 INNER JOIN t_ab2 USING (a, b) ORDER BY ifNull(t_ab1.id, t_ab2.id); SELECT '- types -'; -SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(b)) == 'Nullable(Int64)' FROM t1 FULL JOIN t2 USING (a, b); -SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(b)) == 'Nullable(Int64)' FROM t1 LEFT JOIN t2 USING (a, b); -SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(b)) == 'Nullable(Int64)' FROM t1 RIGHT JOIN t2 USING (a, b); -SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(b)) == 'Nullable(Int64)' FROM t1 INNER JOIN t2 USING (a, b); +SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(b)) == 'Nullable(Int64)' FROM t_ab1 FULL JOIN t_ab2 USING (a, b); +SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(b)) == 'Nullable(Int64)' FROM t_ab1 LEFT JOIN t_ab2 USING (a, b); +SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(b)) == 'Nullable(Int64)' FROM t_ab1 RIGHT JOIN t_ab2 USING (a, b); +SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(b)) == 'Nullable(Int64)' FROM t_ab1 INNER JOIN t_ab2 USING (a, b); SELECT '--- partial_merge ---'; SET join_algorithm = 'partial_merge'; SELECT '- full -'; -SELECT a, b FROM t1 FULL JOIN t2 USING (a, b) ORDER BY ifNull(t1.id, t2.id); +SELECT a, b FROM t_ab1 FULL JOIN t_ab2 USING (a, b) ORDER BY ifNull(t_ab1.id, t_ab2.id); SELECT '- left -'; -SELECT a, b FROM t1 LEFT JOIN t2 USING (a, b) ORDER BY ifNull(t1.id, t2.id); +SELECT a, b FROM t_ab1 LEFT JOIN t_ab2 USING (a, b) ORDER BY ifNull(t_ab1.id, t_ab2.id); SELECT '- right -'; -SELECT a, b FROM t1 RIGHT JOIN t2 USING (a, b) ORDER BY ifNull(t1.id, t2.id); +SELECT a, b FROM t_ab1 RIGHT JOIN t_ab2 USING (a, b) ORDER BY ifNull(t_ab1.id, t_ab2.id); SELECT '- inner -'; -SELECT a, b FROM t1 INNER JOIN t2 USING (a, b) ORDER BY ifNull(t1.id, t2.id); +SELECT a, b FROM t_ab1 INNER JOIN t_ab2 USING (a, b) ORDER BY ifNull(t_ab1.id, t_ab2.id); SELECT '- types -'; -SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(b)) == 'Nullable(Int64)' FROM t1 FULL JOIN t2 USING (a, b); -SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(b)) == 'Nullable(Int64)' FROM t1 LEFT JOIN t2 USING (a, b); -SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(b)) == 'Nullable(Int64)' FROM t1 RIGHT JOIN t2 USING (a, b); -SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(b)) == 'Nullable(Int64)' FROM t1 INNER JOIN t2 USING (a, b); +SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(b)) == 'Nullable(Int64)' FROM t_ab1 FULL JOIN t_ab2 USING (a, b); +SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(b)) == 'Nullable(Int64)' FROM t_ab1 LEFT JOIN t_ab2 USING (a, b); +SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(b)) == 'Nullable(Int64)' FROM t_ab1 RIGHT JOIN t_ab2 USING (a, b); +SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(b)) == 'Nullable(Int64)' FROM t_ab1 INNER JOIN t_ab2 USING (a, b); -DROP TABLE IF EXISTS t1; -DROP TABLE IF EXISTS t2; +DROP TABLE IF EXISTS t_ab1; +DROP TABLE IF EXISTS t_ab2; -DROP DATABASE IF EXISTS test_01655; +DROP DATABASE IF EXISTS test_01674; From d15c1a203bb2f662adb969f5d9eaaa2fc1e96eb6 Mon Sep 17 00:00:00 2001 From: vdimir Date: Wed, 3 Feb 2021 14:38:59 +0300 Subject: [PATCH 0477/2357] Fix TableJoin, upd 01675_join_implicit_cast --- contrib/hyperscan | 2 +- src/Interpreters/ExpressionAnalyzer.h | 3 +-- src/Interpreters/TableJoin.h | 2 +- ..._cast.reference => 01686_join_implicit_cast.reference} | 0 ...oin_implicit_cast.sql => 
01686_join_implicit_cast.sql} | 8 +++----- 5 files changed, 6 insertions(+), 9 deletions(-) rename tests/queries/0_stateless/{01674_join_implicit_cast.reference => 01686_join_implicit_cast.reference} (100%) rename tests/queries/0_stateless/{01674_join_implicit_cast.sql => 01686_join_implicit_cast.sql} (98%) diff --git a/contrib/hyperscan b/contrib/hyperscan index 3907fd00ee8..e9f08df0213 160000 --- a/contrib/hyperscan +++ b/contrib/hyperscan @@ -1 +1 @@ -Subproject commit 3907fd00ee8b2538739768fa9533f8635a276531 +Subproject commit e9f08df0213fc637aac0a5bbde9beeaeba2fe9fa diff --git a/src/Interpreters/ExpressionAnalyzer.h b/src/Interpreters/ExpressionAnalyzer.h index 13561d128d4..5c69bcafc65 100644 --- a/src/Interpreters/ExpressionAnalyzer.h +++ b/src/Interpreters/ExpressionAnalyzer.h @@ -3,11 +3,10 @@ #include #include #include -#include -#include #include #include #include +#include #include #include #include diff --git a/src/Interpreters/TableJoin.h b/src/Interpreters/TableJoin.h index 8b2c23c206b..f2c8bc2d839 100644 --- a/src/Interpreters/TableJoin.h +++ b/src/Interpreters/TableJoin.h @@ -68,7 +68,7 @@ class TableJoin NamesAndTypesList columns_from_joined_table; /// Columns will be added to block by JOIN. It's a subset of columns_from_joined_table with corrected Nullability NamesAndTypesList columns_added_by_join; - /// Columns from right table that requres type conversion + /// Columns from right table that requires type conversion NamesAndTypesList converted_right_types; /// Name -> original name. Names are the same as in columns_from_joined_table list. diff --git a/tests/queries/0_stateless/01674_join_implicit_cast.reference b/tests/queries/0_stateless/01686_join_implicit_cast.reference similarity index 100% rename from tests/queries/0_stateless/01674_join_implicit_cast.reference rename to tests/queries/0_stateless/01686_join_implicit_cast.reference diff --git a/tests/queries/0_stateless/01674_join_implicit_cast.sql b/tests/queries/0_stateless/01686_join_implicit_cast.sql similarity index 98% rename from tests/queries/0_stateless/01674_join_implicit_cast.sql rename to tests/queries/0_stateless/01686_join_implicit_cast.sql index 348f1af476b..e5bab3e9243 100644 --- a/tests/queries/0_stateless/01674_join_implicit_cast.sql +++ b/tests/queries/0_stateless/01686_join_implicit_cast.sql @@ -1,6 +1,3 @@ -CREATE DATABASE IF NOT EXISTS test_01674; -USE test_01674; - DROP TABLE IF EXISTS t1; DROP TABLE IF EXISTS t2; @@ -79,6 +76,9 @@ SELECT * FROM t1 INNER JOIN t2 ON (t1.a == t2.a) ORDER BY (a); -- { serverError DROP TABLE IF EXISTS t1; DROP TABLE IF EXISTS t2; +DROP TABLE IF EXISTS t_ab1; +DROP TABLE IF EXISTS t_ab2; + CREATE TABLE t_ab1 (id Nullable(Int32), a UInt16, b UInt8) ENGINE = TinyLog; CREATE TABLE t_ab2 (id Nullable(Int32), a Int16, b Nullable(Int64)) ENGINE = TinyLog; INSERT INTO t_ab1 VALUES (0, 1, 1), (1, 2, 2); @@ -124,5 +124,3 @@ SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(b)) == 'Nullable(Int64)' DROP TABLE IF EXISTS t_ab1; DROP TABLE IF EXISTS t_ab2; - -DROP DATABASE IF EXISTS test_01674; From 435f63f42b319c52b4e1e2b6de51397c15b047a4 Mon Sep 17 00:00:00 2001 From: vdimir Date: Tue, 9 Feb 2021 16:17:42 +0300 Subject: [PATCH 0478/2357] Calculate common type for join using in TreeRewriter Split before_join and converting_join_columns dags Add more detailed comments for column type conversion for join using --- src/Interpreters/ExpressionActions.cpp | 2 +- src/Interpreters/ExpressionAnalyzer.cpp | 65 +++++++------ src/Interpreters/ExpressionAnalyzer.h | 8 +- 
src/Interpreters/InterpreterSelectQuery.cpp | 12 +++ src/Interpreters/TableJoin.cpp | 97 ++++++++++++++----- src/Interpreters/TableJoin.h | 33 ++++++- src/Interpreters/TreeRewriter.cpp | 6 ++ src/Interpreters/join_common.cpp | 74 -------------- src/Interpreters/join_common.h | 17 +--- .../01686_join_implicit_cast.reference | 24 +++++ .../0_stateless/01686_join_implicit_cast.sql | 33 +++++++ 11 files changed, 219 insertions(+), 152 deletions(-) diff --git a/src/Interpreters/ExpressionActions.cpp b/src/Interpreters/ExpressionActions.cpp index b1fd23d4311..9dd29e2c37d 100644 --- a/src/Interpreters/ExpressionActions.cpp +++ b/src/Interpreters/ExpressionActions.cpp @@ -726,7 +726,7 @@ ExpressionActionsChain::JoinStep::JoinStep( for (const auto & column : result_columns) required_columns.emplace_back(column.name, column.type); - analyzed_join->addJoinedColumnsAndCorrectNullability(result_columns); + analyzed_join->addJoinedColumnsAndCorrectTypes(result_columns); } void ExpressionActionsChain::JoinStep::finalize(const Names & required_output_) diff --git a/src/Interpreters/ExpressionAnalyzer.cpp b/src/Interpreters/ExpressionAnalyzer.cpp index e29eb5fbe69..8e25c6e9d43 100644 --- a/src/Interpreters/ExpressionAnalyzer.cpp +++ b/src/Interpreters/ExpressionAnalyzer.cpp @@ -95,6 +95,23 @@ bool allowEarlyConstantFolding(const ActionsDAG & actions, const Settings & sett return true; } + +/// Returns converting actions for tables that need to be performed before join +ActionsDAGPtr createJoinConvertingActions(const ColumnsWithTypeAndName & cols_src, const TableJoin::NameToTypeMap & mapping) +{ + ColumnsWithTypeAndName cols_dst = cols_src; + for (auto & col : cols_dst) + { + if (auto it = mapping.find(col.name); it != mapping.end()) + { + col.type = it->second; + col.column = nullptr; + } + } + return ActionsDAG::makeConvertingActions(cols_src, cols_dst, ActionsDAG::MatchColumnsMode::Name, true); +}; + + } bool sanitizeBlock(Block & block, bool throw_if_cannot_create_column) @@ -207,13 +224,12 @@ void ExpressionAnalyzer::analyzeAggregation() { getRootActionsNoMakeSet(analyzedJoin().leftKeysList(), true, temp_actions, false); auto sample_columns = temp_actions->getResultColumns(); - analyzedJoin().addJoinedColumnsAndCorrectNullability(sample_columns); + analyzedJoin().addJoinedColumnsAndCorrectTypes(sample_columns); temp_actions = std::make_shared(sample_columns); } columns_after_join = columns_after_array_join; - const auto & added_by_join = analyzedJoin().columnsAddedByJoin(); - columns_after_join.insert(columns_after_join.end(), added_by_join.begin(), added_by_join.end()); + analyzedJoin().addJoinedColumnsAndCorrectTypes(columns_after_join, false); } has_aggregation = makeAggregateDescriptions(temp_actions); @@ -713,28 +729,22 @@ ArrayJoinActionPtr SelectQueryExpressionAnalyzer::appendArrayJoin(ExpressionActi return array_join; } -bool SelectQueryExpressionAnalyzer::appendJoinLeftKeys(ExpressionActionsChain & chain, bool only_types, Block & block) +bool SelectQueryExpressionAnalyzer::appendJoinLeftKeys(ExpressionActionsChain & chain, bool only_types) { ExpressionActionsChain::Step & step = chain.lastStep(columns_after_array_join); getRootActions(analyzedJoin().leftKeysList(), only_types, step.actions()); - ExpressionActionsPtr actions = std::make_shared(step.actions()); - actions->execute(block); return true; } -JoinPtr -SelectQueryExpressionAnalyzer::appendJoin(ExpressionActionsChain & chain, const Block & sample_block, ActionsDAGPtr & before_join_dag) +JoinPtr 
SelectQueryExpressionAnalyzer::appendJoin(ExpressionActionsChain & chain, ActionsDAGPtr & left_actions) { - JoinCommon::JoinConvertActions converting_actions; - JoinPtr table_join = makeTableJoin(*syntax->ast_join, sample_block, converting_actions); - if (converting_actions.needConvert()) + JoinPtr table_join = makeTableJoin(*syntax->ast_join); + if (syntax->analyzed_join->needConvert()) { - syntax->analyzed_join->setConvertedRightType(converting_actions.right_target_types); - - before_join_dag = ActionsDAG::merge(std::move(*before_join_dag->clone()), std::move(*converting_actions.left_actions->clone())); - - chain.steps.push_back(std::make_unique(converting_actions.left_actions)); + left_actions = createJoinConvertingActions(chain.getLastStep().getResultColumns(), + syntax->analyzed_join->getLeftMapping()); + chain.steps.push_back(std::make_unique(left_actions)); chain.addStep(); } @@ -804,9 +814,7 @@ static std::shared_ptr makeJoin(std::shared_ptr analyzed_join, return std::make_shared(analyzed_join, sample_block); } -JoinPtr SelectQueryExpressionAnalyzer::makeTableJoin(const ASTTablesInSelectQueryElement & join_element, - const Block & left_sample_block, - JoinCommon::JoinConvertActions & converting_actions) +JoinPtr SelectQueryExpressionAnalyzer::makeTableJoin(const ASTTablesInSelectQueryElement & join_element) { /// Two JOINs are not supported with the same subquery, but different USINGs. auto join_hash = join_element.getTreeHash(); @@ -844,14 +852,12 @@ JoinPtr SelectQueryExpressionAnalyzer::makeTableJoin(const ASTTablesInSelectQuer /// TODO You do not need to set this up when JOIN is only needed on remote servers. subquery_for_join.addJoinActions(joined_block_actions); /// changes subquery_for_join.sample_block inside - const Block & right_sample_block = subquery_for_join.sample_block; - bool has_using = syntax->analyzed_join->hasUsing(); - converting_actions = JoinCommon::columnsNeedConvert( - left_sample_block, syntax->analyzed_join->keyNamesLeft(), - right_sample_block, syntax->analyzed_join->keyNamesRight(), - has_using); - if (converting_actions.needConvert()) - subquery_for_join.addJoinActions(std::make_shared(converting_actions.right_actions)); + if (syntax->analyzed_join->needConvert()) + { + auto right_actions = createJoinConvertingActions(subquery_for_join.sample_block.getColumnsWithTypeAndName(), + syntax->analyzed_join->getRightMapping()); + subquery_for_join.addJoinActions(std::make_shared(right_actions)); + } subquery_for_join.join = makeJoin(syntax->analyzed_join, subquery_for_join.sample_block, context); @@ -1446,10 +1452,9 @@ ExpressionAnalysisResult::ExpressionAnalysisResult( if (query_analyzer.hasTableJoin()) { - Block left_block_sample = source_header; - query_analyzer.appendJoinLeftKeys(chain, only_types || !first_stage, left_block_sample); + query_analyzer.appendJoinLeftKeys(chain, only_types || !first_stage); before_join = chain.getLastActions(); - join = query_analyzer.appendJoin(chain, left_block_sample, before_join); + join = query_analyzer.appendJoin(chain, converting_join_columns); chain.addStep(); } diff --git a/src/Interpreters/ExpressionAnalyzer.h b/src/Interpreters/ExpressionAnalyzer.h index 5c69bcafc65..20470553044 100644 --- a/src/Interpreters/ExpressionAnalyzer.h +++ b/src/Interpreters/ExpressionAnalyzer.h @@ -200,6 +200,7 @@ struct ExpressionAnalysisResult ActionsDAGPtr before_array_join; ArrayJoinActionPtr array_join; ActionsDAGPtr before_join; + ActionsDAGPtr converting_join_columns; JoinPtr join; ActionsDAGPtr before_where; ActionsDAGPtr 
before_aggregation; @@ -314,8 +315,7 @@ private: /// Create Set-s that we make from IN section to use index on them. void makeSetsForIndex(const ASTPtr & node); - JoinPtr makeTableJoin(const ASTTablesInSelectQueryElement & join_element, const Block & left_sample_block, - JoinCommon::JoinConvertActions & converting_actions); + JoinPtr makeTableJoin(const ASTTablesInSelectQueryElement & join_element); const ASTSelectQuery * getAggregatingQuery() const; @@ -335,8 +335,8 @@ private: /// Before aggregation: ArrayJoinActionPtr appendArrayJoin(ExpressionActionsChain & chain, ActionsDAGPtr & before_array_join, bool only_types); - bool appendJoinLeftKeys(ExpressionActionsChain & chain, bool only_types, Block & block); - JoinPtr appendJoin(ExpressionActionsChain & chain, const Block & sample_block, ActionsDAGPtr & before_join_dag); + bool appendJoinLeftKeys(ExpressionActionsChain & chain, bool only_types); + JoinPtr appendJoin(ExpressionActionsChain & chain, ActionsDAGPtr & before_join_dag); /// Add preliminary rows filtration. Actions are created in other expression analyzer to prevent any possible alias injection. void appendPreliminaryFilter(ExpressionActionsChain & chain, ActionsDAGPtr actions_dag, String column_name); /// remove_filter is set in ExpressionActionsChain::finalize(); diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index 47addcc8d6c..0b0243fe917 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -992,6 +992,18 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu query_plan.addStep(std::move(before_join_step)); } + /// Optional step to convert key columns to common supertype. + /// Columns with changed types will be returned to user, + /// so its only suitable for `USING` join. 
+ if (expressions.converting_join_columns) + { + QueryPlanStepPtr convert_join_step = std::make_unique( + query_plan.getCurrentDataStream(), + expressions.converting_join_columns); + convert_join_step->setStepDescription("Convert JOIN columns"); + query_plan.addStep(std::move(convert_join_step)); + } + if (expressions.hasJoin()) { JoinPtr join = expressions.join; diff --git a/src/Interpreters/TableJoin.cpp b/src/Interpreters/TableJoin.cpp index aee38dbc322..c99d6f6ea79 100644 --- a/src/Interpreters/TableJoin.cpp +++ b/src/Interpreters/TableJoin.cpp @@ -221,41 +221,48 @@ bool TableJoin::rightBecomeNullable(const DataTypePtr & column_type) const void TableJoin::addJoinedColumn(const NameAndTypePair & joined_column) { - if (rightBecomeNullable(joined_column.type)) - columns_added_by_join.emplace_back(NameAndTypePair(joined_column.name, makeNullable(joined_column.type))); - else - columns_added_by_join.push_back(joined_column); + DataTypePtr type = joined_column.type; + + if (auto it = right_type_map.find(joined_column.name); it != right_type_map.end()) + type = it->second; + + if (rightBecomeNullable(type)) + type = makeNullable(joined_column.type); + + columns_added_by_join.emplace_back(joined_column.name, type); } -void TableJoin::addJoinedColumnsAndCorrectNullability(ColumnsWithTypeAndName & columns) const +void TableJoin::addJoinedColumnsAndCorrectTypes(NamesAndTypesList & names_and_types, bool correct_nullability) const +{ + ColumnsWithTypeAndName columns; + for (auto & pair : names_and_types) + columns.emplace_back(nullptr, std::move(pair.type), std::move(pair.name)); + names_and_types.clear(); + + addJoinedColumnsAndCorrectTypes(columns, correct_nullability); + + for (auto & col : columns) + names_and_types.emplace_back(std::move(col.name), std::move(col.type)); +} + +void TableJoin::addJoinedColumnsAndCorrectTypes(ColumnsWithTypeAndName & columns, bool correct_nullability) const { for (auto & col : columns) { - if (leftBecomeNullable(col.type)) + if (auto it = left_type_map.find(col.name); it != left_type_map.end()) + col.type = it->second; + if (correct_nullability && leftBecomeNullable(col.type)) { /// No need to nullify constants - if (!(col.column && isColumnConst(*col.column))) - { + bool is_column_const = col.column && isColumnConst(*col.column); + if (!is_column_const) col.type = makeNullable(col.type); - } } } - std::unordered_map type_map; - for (const auto & [name, type] : converted_right_types) - type_map[name] = type; - + /// Types in columns_added_by_join already converted and set nullable if needed for (const auto & col : columns_added_by_join) - { - auto res_type = col.type; - if (const auto it = type_map.find(col.name); it != type_map.end()) - res_type = it->second; - - if (rightBecomeNullable(res_type)) - res_type = makeNullable(res_type); - - columns.emplace_back(nullptr, res_type, col.name); - } + columns.emplace_back(nullptr, col.type, col.name); } bool TableJoin::sameJoin(const TableJoin * x, const TableJoin * y) @@ -342,4 +349,48 @@ bool TableJoin::allowDictJoin(const String & dict_key, const Block & sample_bloc return true; } +bool TableJoin::inferJoinKeyCommonType(const NamesAndTypesList & left, const NamesAndTypesList & right) +{ + std::unordered_map left_types; + for (const auto & pair : left) + { + left_types[pair.name] = pair.type; + } + + std::unordered_map right_types; + for (const auto & pair : right) + { + if (auto it = renames.find(pair.name); it != renames.end()) + right_types[it->second] = pair.type; + else + right_types[pair.name] = pair.type; + } + 
+ for (size_t i = 0; i < key_names_left.size(); ++i) + { + auto ltype = left_types[key_names_left[i]]; + auto rtype = right_types[key_names_right[i]]; + + if (JoinCommon::typesEqualUpToNullability(ltype, rtype)) + continue; + + DataTypePtr supertype; + try + { + supertype = DB::getLeastSupertype({ltype, rtype}); + } + catch (DB::Exception &) + { + throw Exception( + "Type mismatch of columns to JOIN by: " + + key_names_left[i] + ": " + ltype->getName() + " at left, " + + key_names_right[i] + ": " + rtype->getName() + " at right", + ErrorCodes::TYPE_MISMATCH); + } + left_type_map[key_names_left[i]] = right_type_map[key_names_right[i]] = supertype; + } + + return !left_type_map.empty(); +} + } diff --git a/src/Interpreters/TableJoin.h b/src/Interpreters/TableJoin.h index f2c8bc2d839..17a3eb5b177 100644 --- a/src/Interpreters/TableJoin.h +++ b/src/Interpreters/TableJoin.h @@ -5,9 +5,11 @@ #include #include #include +#include #include #include #include +#include #include #include @@ -16,6 +18,11 @@ namespace DB { +namespace ErrorCodes +{ + extern const int TYPE_MISMATCH; +} + class Context; class ASTSelectQuery; struct DatabaseAndTableWithAlias; @@ -32,6 +39,11 @@ using VolumePtr = std::shared_ptr; class TableJoin { + +public: + using NameToTypeMap = std::unordered_map; + +private: /** Query of the form `SELECT expr(x) AS k FROM t1 ANY LEFT JOIN (SELECT expr(x) AS k FROM t2) USING k` * The join is made by column k. * During the JOIN, @@ -66,10 +78,13 @@ class TableJoin /// All columns which can be read from joined table. Duplicating names are qualified. NamesAndTypesList columns_from_joined_table; - /// Columns will be added to block by JOIN. It's a subset of columns_from_joined_table with corrected Nullability + /// Columns will be added to block by JOIN. + /// It's a subset of columns_from_joined_table with corrected Nullability and type (if type conversion is required) NamesAndTypesList columns_added_by_join; - /// Columns from right table that requires type conversion - NamesAndTypesList converted_right_types; + + /// Target type to convert key columns before join + NameToTypeMap left_type_map; + NameToTypeMap right_type_map; /// Name -> original name. Names are the same as in columns_from_joined_table list. std::unordered_map original_names; @@ -139,8 +154,16 @@ public: bool leftBecomeNullable(const DataTypePtr & column_type) const; bool rightBecomeNullable(const DataTypePtr & column_type) const; void addJoinedColumn(const NameAndTypePair & joined_column); - void addJoinedColumnsAndCorrectNullability(ColumnsWithTypeAndName & columns) const; - void setConvertedRightType(NamesAndTypesList columns) { converted_right_types = columns; } + + void addJoinedColumnsAndCorrectTypes(NamesAndTypesList & names_and_types, bool correct_nullability = true) const; + void addJoinedColumnsAndCorrectTypes(ColumnsWithTypeAndName & columns, bool correct_nullability = true) const; + + /// Calculates common supertypes for corresponding join key columns. + bool inferJoinKeyCommonType(const NamesAndTypesList & left, const NamesAndTypesList & right); + bool needConvert() const { return !left_type_map.empty(); } + /// Key columns should be converted according to this mapping before join. 
+ const NameToTypeMap & getLeftMapping() const { return left_type_map; } + const NameToTypeMap & getRightMapping() const { return right_type_map; } void setAsofInequality(ASOF::Inequality inequality) { asof_inequality = inequality; } ASOF::Inequality getAsofInequality() { return asof_inequality; } diff --git a/src/Interpreters/TreeRewriter.cpp b/src/Interpreters/TreeRewriter.cpp index fd87d86bf97..8487959ffb0 100644 --- a/src/Interpreters/TreeRewriter.cpp +++ b/src/Interpreters/TreeRewriter.cpp @@ -417,6 +417,12 @@ void collectJoinedColumns(TableJoin & analyzed_join, const ASTSelectQuery & sele const auto & keys = table_join.using_expression_list->as(); for (const auto & key : keys.children) analyzed_join.addUsingKey(key); + + /// `USING` semantic allows to have columns with changed types in result table. + /// `JOIN ON key1 = key2` should preserve types from original table, so do not perform conversion at all. + /// TODO: Conversion for `JOIN ON` can be added with additional maintenance for types and columns. + /// Or maybe it's possible to perform it on ast level? Not implemented yet. + analyzed_join.inferJoinKeyCommonType(tables[0].columns, tables[1].columns); } else if (table_join.on_expression) { diff --git a/src/Interpreters/join_common.cpp b/src/Interpreters/join_common.cpp index 10cc637531f..e48cf3cd80d 100644 --- a/src/Interpreters/join_common.cpp +++ b/src/Interpreters/join_common.cpp @@ -292,80 +292,6 @@ bool typesEqualUpToNullability(DataTypePtr left_type, DataTypePtr right_type) return left_type_strict->equals(*right_type_strict); } -JoinConvertActions columnsNeedConvert(const Block & left_block, const Names & left_keys, - const Block & right_block, const Names & right_keys, - bool has_using) -{ - assert(left_keys.size() == right_keys.size()); - - /// only JOIN USING supported - if (!has_using) - return {}; - - JoinConvertActions actions; - - Block left_block_dst = left_block; - Block right_block_dst = right_block; - - std::unordered_set visited_left; - std::unordered_set visited_right; - bool any_need_cast = false; - for (size_t i = 0; i < left_keys.size(); ++i) - { - if (visited_left.contains(left_keys[i]) || visited_right.contains(right_keys[i])) - { - /// if one column joined with multiple different others do not perform conversion - /// e.g. `JOIN ... 
ON t1.a == t2.a AND t1.a == t2.b` - return {}; - } - visited_left.insert(left_keys[i]); - visited_right.insert(right_keys[i]); - - DataTypePtr ltype = left_block.getByName(left_keys[i]).type; - DataTypePtr rtype = right_block.getByName(right_keys[i]).type; - - if (typesEqualUpToNullability(ltype, rtype)) - continue; - - any_need_cast = true; - DataTypePtr supertype; - try - { - supertype = DB::getLeastSupertype({ltype, rtype}); - } - catch (DB::Exception &) - { - throw Exception("Type mismatch of columns to JOIN by: " - + left_keys[i] + ": " + ltype->getName() + " at left, " - + right_keys[i] + ": " + rtype->getName() + " at right", - ErrorCodes::TYPE_MISMATCH); - } - actions.left_target_types.emplace_back(left_keys[i], supertype); - actions.right_target_types.emplace_back(right_keys[i], supertype); - - auto & lcol_dst = left_block_dst.getByName(left_keys[i]); - auto & rcol_dst = right_block_dst.getByName(right_keys[i]); - lcol_dst.column = rcol_dst.column = nullptr; - lcol_dst.type = rcol_dst.type = supertype; - } - - if (!any_need_cast) - return {}; - - actions.left_actions = ActionsDAG::makeConvertingActions( - left_block.getColumnsWithTypeAndName(), - left_block_dst.getColumnsWithTypeAndName(), - ActionsDAG::MatchColumnsMode::Name, - true); - actions.right_actions = ActionsDAG::makeConvertingActions( - right_block.getColumnsWithTypeAndName(), - right_block_dst.getColumnsWithTypeAndName(), - ActionsDAG::MatchColumnsMode::Name, - true); - - return actions; -} - } diff --git a/src/Interpreters/join_common.h b/src/Interpreters/join_common.h index 13df2cc7027..6a90670a4c6 100644 --- a/src/Interpreters/join_common.h +++ b/src/Interpreters/join_common.h @@ -16,17 +16,6 @@ using ColumnRawPtrs = std::vector; namespace JoinCommon { -struct JoinConvertActions -{ - ActionsDAGPtr left_actions; - ActionsDAGPtr right_actions; - - NamesAndTypesList left_target_types; - NamesAndTypesList right_target_types; - - bool needConvert() const { return left_actions && right_actions; } -}; - void convertColumnToNullable(ColumnWithTypeAndName & column, bool low_card_nullability = false); void convertColumnsToNullable(Block & block, size_t starting_pos = 0); void removeColumnNullability(ColumnWithTypeAndName & column); @@ -49,10 +38,8 @@ void joinTotals(const Block & totals, const Block & columns_to_add, const Names void addDefaultValues(IColumn & column, const DataTypePtr & type, size_t count); -/// Return converting actions for left and right tables that need to be performed before join -JoinConvertActions columnsNeedConvert(const Block & left_block, const Names & left_keys, - const Block & right_block, const Names & right_keys, - bool has_using); +bool typesEqualUpToNullability(DataTypePtr left_type, DataTypePtr right_type); + } /// Creates result from right table data in RIGHT and FULL JOIN when keys are not present in left table. 
diff --git a/tests/queries/0_stateless/01686_join_implicit_cast.reference b/tests/queries/0_stateless/01686_join_implicit_cast.reference index 644e7a3aa3b..517b356e519 100644 --- a/tests/queries/0_stateless/01686_join_implicit_cast.reference +++ b/tests/queries/0_stateless/01686_join_implicit_cast.reference @@ -87,11 +87,18 @@ 3 3 3 4 4 4 5 5 5 +- agg - +1 +1 +1 +1 - types - 1 1 1 1 +1 +1 --- partial_merge --- - full - -4 0 196 @@ -181,11 +188,18 @@ 3 3 3 4 4 4 5 5 5 +- agg - +1 +1 +1 +1 - types - 1 1 1 1 +1 +1 --- hash --- - full - 1 1 @@ -205,6 +219,11 @@ 1 -1 - inner - 1 1 +- agg - +5 260 +3 3 +3 258 +1 1 - types - 1 1 @@ -229,6 +248,11 @@ 1 -1 - inner - 1 1 +- agg - +5 260 +3 3 +3 258 +1 1 - types - 1 1 diff --git a/tests/queries/0_stateless/01686_join_implicit_cast.sql b/tests/queries/0_stateless/01686_join_implicit_cast.sql index e5bab3e9243..f4c10e6517d 100644 --- a/tests/queries/0_stateless/01686_join_implicit_cast.sql +++ b/tests/queries/0_stateless/01686_join_implicit_cast.sql @@ -28,12 +28,23 @@ SELECT a, t1.a, t2.a FROM t1 RIGHT JOIN t2 USING (a) ORDER BY (t1.a, t2.a); SELECT '- inner -'; SELECT a, t1.a, t2.a FROM t1 INNER JOIN t2 USING (a) ORDER BY (t1.a, t2.a); +select '- agg -'; +SELECT sum(a) == 7 FROM t1 FULL JOIN t2 USING (a) WHERE b > 102 AND t2.b <= 204; +SELECT sum(a) == 7 FROM t1 INNER JOIN t2 USING (a) WHERE b > 102 AND t2.b <= 204; + +SELECT sum(b) = 103 FROM t1 LEFT JOIN t2 USING (a) WHERE b > 102 AND t2.b < 204; +SELECT sum(t2.b) = 203 FROM t1 RIGHT JOIN t2 USING (a) WHERE b > 102 AND t2.b < 204; + SELECT '- types -'; + SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 FULL JOIN t2 USING (a); SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 LEFT JOIN t2 USING (a); SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 RIGHT JOIN t2 USING (a); SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 INNER JOIN t2 USING (a); +SELECT toTypeName(any(a)) == 'Int32' AND toTypeName(any(t2.a)) == 'Int32' FROM t1 FULL JOIN t2 USING (a); +SELECT min(toTypeName(a) == 'Int32' AND toTypeName(t2.a) == 'Int32') FROM t1 FULL JOIN t2 USING (a); + SELECT * FROM t1 FULL JOIN t2 ON (t1.a == t2.a) ORDER BY (a); -- { serverError 53 } SELECT * FROM t1 LEFT JOIN t2 ON(t1.a == t2.a) ORDER BY (a); -- { serverError 53 } SELECT * FROM t1 RIGHT JOIN t2 ON (t1.a == t2.a) ORDER BY (a); -- { serverError 53 } @@ -62,12 +73,22 @@ SELECT a, t1.a, t2.a FROM t1 RIGHT JOIN t2 USING (a) ORDER BY (t1.a, t2.a); SELECT '- inner -'; SELECT a, t1.a, t2.a FROM t1 INNER JOIN t2 USING (a) ORDER BY (t1.a, t2.a); +select '- agg -'; +SELECT sum(a) == 7 FROM t1 FULL JOIN t2 USING (a) WHERE b > 102 AND t2.b <= 204; +SELECT sum(a) == 7 FROM t1 INNER JOIN t2 USING (a) WHERE b > 102 AND t2.b <= 204; + +SELECT sum(b) = 103 FROM t1 LEFT JOIN t2 USING (a) WHERE b > 102 AND t2.b < 204; +SELECT sum(t2.b) = 203 FROM t1 RIGHT JOIN t2 USING (a) WHERE b > 102 AND t2.b < 204; + SELECT '- types -'; SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 FULL JOIN t2 USING (a); SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 LEFT JOIN t2 USING (a); SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 RIGHT JOIN t2 USING (a); SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 INNER JOIN t2 USING (a); +SELECT toTypeName(any(a)) == 'Int32' AND toTypeName(any(t2.a)) == 'Int32' FROM t1 FULL JOIN t2 
USING (a); +SELECT min(toTypeName(a) == 'Int32' AND toTypeName(t2.a) == 'Int32') FROM t1 FULL JOIN t2 USING (a); + SELECT * FROM t1 FULL JOIN t2 ON (t1.a == t2.a) ORDER BY (a); -- { serverError 53 } SELECT * FROM t1 LEFT JOIN t2 ON(t1.a == t2.a) ORDER BY (a); -- { serverError 53 } SELECT * FROM t1 RIGHT JOIN t2 ON (t1.a == t2.a) ORDER BY (a); -- { serverError 53 } @@ -95,6 +116,12 @@ SELECT a, b FROM t_ab1 RIGHT JOIN t_ab2 USING (a, b) ORDER BY ifNull(t_ab1.id, t SELECT '- inner -'; SELECT a, b FROM t_ab1 INNER JOIN t_ab2 USING (a, b) ORDER BY ifNull(t_ab1.id, t_ab2.id); +select '- agg -'; +SELECT sum(a), sum(b) FROM t_ab1 FULL JOIN t_ab2 USING (a, b); +SELECT sum(a), sum(b) FROM t_ab1 LEFT JOIN t_ab2 USING (a, b); +SELECT sum(a), sum(b) FROM t_ab1 RIGHT JOIN t_ab2 USING (a, b); +SELECT sum(a), sum(b) FROM t_ab1 INNER JOIN t_ab2 USING (a, b); + SELECT '- types -'; SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(b)) == 'Nullable(Int64)' FROM t_ab1 FULL JOIN t_ab2 USING (a, b); @@ -115,6 +142,12 @@ SELECT a, b FROM t_ab1 RIGHT JOIN t_ab2 USING (a, b) ORDER BY ifNull(t_ab1.id, t SELECT '- inner -'; SELECT a, b FROM t_ab1 INNER JOIN t_ab2 USING (a, b) ORDER BY ifNull(t_ab1.id, t_ab2.id); +select '- agg -'; +SELECT sum(a), sum(b) FROM t_ab1 FULL JOIN t_ab2 USING (a, b); +SELECT sum(a), sum(b) FROM t_ab1 LEFT JOIN t_ab2 USING (a, b); +SELECT sum(a), sum(b) FROM t_ab1 RIGHT JOIN t_ab2 USING (a, b); +SELECT sum(a), sum(b) FROM t_ab1 INNER JOIN t_ab2 USING (a, b); + SELECT '- types -'; SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(b)) == 'Nullable(Int64)' FROM t_ab1 FULL JOIN t_ab2 USING (a, b); From 4203dd5e38b114032e55fdfd89f4d2320da7893e Mon Sep 17 00:00:00 2001 From: vdimir Date: Tue, 9 Feb 2021 18:28:06 +0300 Subject: [PATCH 0479/2357] Give up on name mismatch in inferJoinKeyCommonType --- src/Interpreters/TableJoin.cpp | 24 ++++++++++++++++++------ src/Interpreters/TableJoin.h | 5 ----- 2 files changed, 18 insertions(+), 11 deletions(-) diff --git a/src/Interpreters/TableJoin.cpp b/src/Interpreters/TableJoin.cpp index c99d6f6ea79..f4ca60ce7c8 100644 --- a/src/Interpreters/TableJoin.cpp +++ b/src/Interpreters/TableJoin.cpp @@ -15,6 +15,11 @@ namespace DB { +namespace ErrorCodes +{ + extern const int TYPE_MISMATCH; +} + TableJoin::TableJoin(const Settings & settings, VolumePtr tmp_volume_) : size_limits(SizeLimits{settings.max_rows_in_join, settings.max_bytes_in_join, settings.join_overflow_mode}) , default_max_bytes(settings.default_max_bytes_in_join) @@ -368,23 +373,30 @@ bool TableJoin::inferJoinKeyCommonType(const NamesAndTypesList & left, const Nam for (size_t i = 0; i < key_names_left.size(); ++i) { - auto ltype = left_types[key_names_left[i]]; - auto rtype = right_types[key_names_right[i]]; + auto ltype = left_types.find(key_names_left[i]); + auto rtype = right_types.find(key_names_right[i]); + if (ltype == left_types.end() || rtype == right_types.end()) + { + /// Name mismatch, give up + left_type_map.clear(); + right_type_map.clear(); + return false; + } - if (JoinCommon::typesEqualUpToNullability(ltype, rtype)) + if (JoinCommon::typesEqualUpToNullability(ltype->second, rtype->second)) continue; DataTypePtr supertype; try { - supertype = DB::getLeastSupertype({ltype, rtype}); + supertype = DB::getLeastSupertype({ltype->second, rtype->second}); } catch (DB::Exception &) { throw Exception( "Type mismatch of columns to JOIN by: " + - key_names_left[i] + ": " + ltype->getName() + " at left, " + - key_names_right[i] + ": " + rtype->getName() + " at right", + 
key_names_left[i] + ": " + ltype->second->getName() + " at left, " + + key_names_right[i] + ": " + rtype->second->getName() + " at right", ErrorCodes::TYPE_MISMATCH); } left_type_map[key_names_left[i]] = right_type_map[key_names_right[i]] = supertype; diff --git a/src/Interpreters/TableJoin.h b/src/Interpreters/TableJoin.h index 17a3eb5b177..89f852642c0 100644 --- a/src/Interpreters/TableJoin.h +++ b/src/Interpreters/TableJoin.h @@ -18,11 +18,6 @@ namespace DB { -namespace ErrorCodes -{ - extern const int TYPE_MISMATCH; -} - class Context; class ASTSelectQuery; struct DatabaseAndTableWithAlias; From 9b79ab2ac03162c488c08d8a1f4a8e3ab5f986dd Mon Sep 17 00:00:00 2001 From: vdimir Date: Wed, 10 Feb 2021 09:14:25 +0300 Subject: [PATCH 0480/2357] Fix style in appendJoin --- src/Interpreters/ExpressionAnalyzer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Interpreters/ExpressionAnalyzer.h b/src/Interpreters/ExpressionAnalyzer.h index 20470553044..62adcf50c5a 100644 --- a/src/Interpreters/ExpressionAnalyzer.h +++ b/src/Interpreters/ExpressionAnalyzer.h @@ -336,7 +336,7 @@ private: /// Before aggregation: ArrayJoinActionPtr appendArrayJoin(ExpressionActionsChain & chain, ActionsDAGPtr & before_array_join, bool only_types); bool appendJoinLeftKeys(ExpressionActionsChain & chain, bool only_types); - JoinPtr appendJoin(ExpressionActionsChain & chain, ActionsDAGPtr & before_join_dag); + JoinPtr appendJoin(ExpressionActionsChain & chain, ActionsDAGPtr & left_actions); /// Add preliminary rows filtration. Actions are created in other expression analyzer to prevent any possible alias injection. void appendPreliminaryFilter(ExpressionActionsChain & chain, ActionsDAGPtr actions_dag, String column_name); /// remove_filter is set in ExpressionActionsChain::finalize(); From cd7d9584bdd06fe92e2ea567bc4004516af3231a Mon Sep 17 00:00:00 2001 From: vdimir Date: Wed, 17 Feb 2021 17:19:30 +0300 Subject: [PATCH 0481/2357] Add tests to 01720_join_implicit_cast - switch, use_nulls --- ...nce => 01720_join_implicit_cast.reference} | 130 ++++++++++++++++++ ..._cast.sql => 01720_join_implicit_cast.sql} | 63 +++++++++ 2 files changed, 193 insertions(+) rename tests/queries/0_stateless/{01686_join_implicit_cast.reference => 01720_join_implicit_cast.reference} (63%) rename tests/queries/0_stateless/{01686_join_implicit_cast.sql => 01720_join_implicit_cast.sql} (75%) diff --git a/tests/queries/0_stateless/01686_join_implicit_cast.reference b/tests/queries/0_stateless/01720_join_implicit_cast.reference similarity index 63% rename from tests/queries/0_stateless/01686_join_implicit_cast.reference rename to tests/queries/0_stateless/01720_join_implicit_cast.reference index 517b356e519..21b02c112d9 100644 --- a/tests/queries/0_stateless/01686_join_implicit_cast.reference +++ b/tests/queries/0_stateless/01720_join_implicit_cast.reference @@ -200,6 +200,136 @@ 1 1 1 +--- switch --- +- full - +-4 0 196 +-3 0 197 +-2 0 198 +-1 0 199 +0 0 200 +1 101 201 +2 102 202 +3 103 203 +4 104 204 +5 105 205 +6 106 \N +7 107 \N +8 108 \N +9 109 \N +10 110 \N +- left - +1 101 201 +2 102 202 +3 103 203 +4 104 204 +5 105 205 +6 106 \N +7 107 \N +8 108 \N +9 109 \N +10 110 \N +- right - +-4 0 196 +-3 0 197 +-2 0 198 +-1 0 199 +0 0 200 +1 101 201 +2 102 202 +3 103 203 +4 104 204 +5 105 205 +- inner - +1 101 201 +2 102 202 +3 103 203 +4 104 204 +5 105 205 +- full - +0 0 -4 +0 0 -3 +0 0 -2 +0 0 -1 +0 0 0 +1 1 1 +2 2 2 +3 3 3 +4 4 4 +5 5 5 +6 6 0 +7 7 0 +8 8 0 +9 9 0 +10 10 0 +- left - +1 1 1 +2 2 2 +3 3 3 +4 4 4 +5 5 5 +6 6 0 +7 
7 0 +8 8 0 +9 9 0 +10 10 0 +- right - +0 0 -4 +0 0 -3 +0 0 -2 +0 0 -1 +0 0 0 +1 1 1 +2 2 2 +3 3 3 +4 4 4 +5 5 5 +- inner - +1 1 1 +2 2 2 +3 3 3 +4 4 4 +5 5 5 +- agg - +1 +1 +1 +1 +- types - +1 +1 +1 +1 +1 +1 +--- join use nulls --- +- full - +-4 \N 196 +-3 \N 197 +-2 \N 198 +-1 \N 199 +0 \N 200 +1 101 201 +2 102 202 +3 103 203 +4 104 204 +5 105 205 +6 106 \N +7 107 \N +8 108 \N +9 109 \N +10 110 \N +- right - +-4 \N 196 +-3 \N 197 +-2 \N 198 +-1 \N 199 +0 \N 200 +1 101 201 +2 102 202 +3 103 203 +4 104 204 +5 105 205 +--- --- hash --- - full - 1 1 diff --git a/tests/queries/0_stateless/01686_join_implicit_cast.sql b/tests/queries/0_stateless/01720_join_implicit_cast.sql similarity index 75% rename from tests/queries/0_stateless/01686_join_implicit_cast.sql rename to tests/queries/0_stateless/01720_join_implicit_cast.sql index f4c10e6517d..b2fa227f0d5 100644 --- a/tests/queries/0_stateless/01686_join_implicit_cast.sql +++ b/tests/queries/0_stateless/01720_join_implicit_cast.sql @@ -94,9 +94,70 @@ SELECT * FROM t1 LEFT JOIN t2 ON(t1.a == t2.a) ORDER BY (a); -- { serverError 53 SELECT * FROM t1 RIGHT JOIN t2 ON (t1.a == t2.a) ORDER BY (a); -- { serverError 53 } SELECT * FROM t1 INNER JOIN t2 ON (t1.a == t2.a) ORDER BY (a); -- { serverError 53 } + +SELECT '--- switch ---'; + +SET join_algorithm = 'auto'; +SET max_bytes_in_join = 100; + +SELECT '- full -'; +SELECT a, b, t2.b FROM t1 FULL JOIN t2 USING (a) ORDER BY (a); +SELECT '- left -'; +SELECT a, b, t2.b FROM t1 LEFT JOIN t2 USING (a) ORDER BY (a); +SELECT '- right -'; +SELECT a, b, t2.b FROM t1 RIGHT JOIN t2 USING (a) ORDER BY (a); +SELECT '- inner -'; +SELECT a, b, t2.b FROM t1 INNER JOIN t2 USING (a) ORDER BY (a); + + +SELECT '- full -'; +SELECT a, t1.a, t2.a FROM t1 FULL JOIN t2 USING (a) ORDER BY (t1.a, t2.a); +SELECT '- left -'; +SELECT a, t1.a, t2.a FROM t1 LEFT JOIN t2 USING (a) ORDER BY (t1.a, t2.a); +SELECT '- right -'; +SELECT a, t1.a, t2.a FROM t1 RIGHT JOIN t2 USING (a) ORDER BY (t1.a, t2.a); +SELECT '- inner -'; +SELECT a, t1.a, t2.a FROM t1 INNER JOIN t2 USING (a) ORDER BY (t1.a, t2.a); + +select '- agg -'; +SELECT sum(a) == 7 FROM t1 FULL JOIN t2 USING (a) WHERE b > 102 AND t2.b <= 204; +SELECT sum(a) == 7 FROM t1 INNER JOIN t2 USING (a) WHERE b > 102 AND t2.b <= 204; + +SELECT sum(b) = 103 FROM t1 LEFT JOIN t2 USING (a) WHERE b > 102 AND t2.b < 204; +SELECT sum(t2.b) = 203 FROM t1 RIGHT JOIN t2 USING (a) WHERE b > 102 AND t2.b < 204; + +SELECT '- types -'; +SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 FULL JOIN t2 USING (a); +SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 LEFT JOIN t2 USING (a); +SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 RIGHT JOIN t2 USING (a); +SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 INNER JOIN t2 USING (a); + +SELECT toTypeName(any(a)) == 'Int32' AND toTypeName(any(t2.a)) == 'Int32' FROM t1 FULL JOIN t2 USING (a); +SELECT min(toTypeName(a) == 'Int32' AND toTypeName(t2.a) == 'Int32') FROM t1 FULL JOIN t2 USING (a); + +SELECT * FROM t1 FULL JOIN t2 ON (t1.a == t2.a) ORDER BY (a); -- { serverError 53 } +SELECT * FROM t1 LEFT JOIN t2 ON(t1.a == t2.a) ORDER BY (a); -- { serverError 53 } +SELECT * FROM t1 RIGHT JOIN t2 ON (t1.a == t2.a) ORDER BY (a); -- { serverError 53 } +SELECT * FROM t1 INNER JOIN t2 ON (t1.a == t2.a) ORDER BY (a); -- { serverError 53 } + +SET max_bytes_in_join = 0; + +SELECT '--- join use nulls ---'; + +SET join_use_nulls = 1; + +SELECT '- 
full -'; +SELECT a, b, t2.b FROM t1 FULL JOIN t2 USING (a) ORDER BY (a); +SELECT '- right -'; +SELECT a, b, t2.b FROM t1 RIGHT JOIN t2 USING (a) ORDER BY (a); + +SET join_use_nulls = 0; + DROP TABLE IF EXISTS t1; DROP TABLE IF EXISTS t2; +select '---'; + DROP TABLE IF EXISTS t_ab1; DROP TABLE IF EXISTS t_ab2; @@ -107,6 +168,8 @@ INSERT INTO t_ab2 VALUES (2, -1, 1), (3, 1, NULL), (4, 1, 257), (5, 1, -1), (6, SELECT '--- hash ---'; +SET join_algorithm = 'hash'; + SELECT '- full -'; SELECT a, b FROM t_ab1 FULL JOIN t_ab2 USING (a, b) ORDER BY ifNull(t_ab1.id, t_ab2.id); SELECT '- left -'; From a378bd08aa935853a5485e88904d54169e1c1cdb Mon Sep 17 00:00:00 2001 From: vdimir Date: Thu, 18 Feb 2021 14:49:32 +0300 Subject: [PATCH 0482/2357] Perform implicit type conversion for JOIN ON keys --- src/Interpreters/ActionsDAG.cpp | 58 +++++++++---- src/Interpreters/ActionsDAG.h | 6 +- src/Interpreters/ActionsVisitor.cpp | 2 +- src/Interpreters/ExpressionActions.cpp | 4 +- src/Interpreters/ExpressionAnalyzer.cpp | 38 ++++++-- src/Interpreters/ExpressionAnalyzer.h | 5 +- src/Interpreters/TableJoin.cpp | 86 ++++++++++++------- src/Interpreters/TableJoin.h | 29 +++++-- src/Interpreters/TreeRewriter.cpp | 16 +++- src/Interpreters/join_common.cpp | 18 ++-- .../01710_join_use_nulls.reference | 1 + .../0_stateless/01710_join_use_nulls.sql | 5 +- .../0_stateless/01720_join_implicit_cast.sql | 24 +++--- 13 files changed, 199 insertions(+), 93 deletions(-) diff --git a/src/Interpreters/ActionsDAG.cpp b/src/Interpreters/ActionsDAG.cpp index 12942371d4f..91862fbe8ba 100644 --- a/src/Interpreters/ActionsDAG.cpp +++ b/src/Interpreters/ActionsDAG.cpp @@ -679,7 +679,9 @@ ActionsDAGPtr ActionsDAG::makeConvertingActions( const ColumnsWithTypeAndName & source, const ColumnsWithTypeAndName & result, MatchColumnsMode mode, - bool ignore_constant_values) + bool ignore_constant_values, + bool add_casted_columns, + NameToNameMap * new_names) { size_t num_input_columns = source.size(); size_t num_result_columns = result.size(); @@ -687,6 +689,9 @@ ActionsDAGPtr ActionsDAG::makeConvertingActions( if (mode == MatchColumnsMode::Position && num_input_columns != num_result_columns) throw Exception("Number of columns doesn't match", ErrorCodes::NUMBER_OF_COLUMNS_DOESNT_MATCH); + if (add_casted_columns && mode != MatchColumnsMode::Name) + throw Exception("Converting with add_casted_columns supported only for MatchColumnsMode::Name", ErrorCodes::LOGICAL_ERROR); + auto actions_dag = std::make_shared(source); std::vector projection(num_result_columns); @@ -706,12 +711,13 @@ ActionsDAGPtr ActionsDAG::makeConvertingActions( { const auto & res_elem = result[result_col_num]; Node * src_node = nullptr; + Node * dst_node = nullptr; switch (mode) { case MatchColumnsMode::Position: { - src_node = actions_dag->inputs[result_col_num]; + src_node = dst_node = actions_dag->inputs[result_col_num]; break; } @@ -722,7 +728,7 @@ ActionsDAGPtr ActionsDAG::makeConvertingActions( throw Exception("Cannot find column " + backQuote(res_elem.name) + " in source stream", ErrorCodes::THERE_IS_NO_COLUMN); - src_node = actions_dag->inputs[input.front()]; + src_node = dst_node = actions_dag->inputs[input.front()]; input.pop_front(); break; } @@ -731,10 +737,10 @@ ActionsDAGPtr ActionsDAG::makeConvertingActions( /// Check constants. 
if (const auto * res_const = typeid_cast(res_elem.column.get())) { - if (const auto * src_const = typeid_cast(src_node->column.get())) + if (const auto * src_const = typeid_cast(dst_node->column.get())) { if (ignore_constant_values) - src_node = const_cast(&actions_dag->addColumn(res_elem, true)); + dst_node = const_cast(&actions_dag->addColumn(res_elem, true)); else if (res_const->getField() != src_const->getField()) throw Exception("Cannot convert column " + backQuote(res_elem.name) + " because " "it is constant but values of constants are different in source and result", @@ -747,7 +753,7 @@ ActionsDAGPtr ActionsDAG::makeConvertingActions( } /// Add CAST function to convert into result type if needed. - if (!res_elem.type->equals(*src_node->result_type)) + if (!res_elem.type->equals(*dst_node->result_type)) { ColumnWithTypeAndName column; column.name = res_elem.type->getName(); @@ -755,27 +761,49 @@ ActionsDAGPtr ActionsDAG::makeConvertingActions( column.type = std::make_shared(); auto * right_arg = const_cast(&actions_dag->addColumn(std::move(column), true)); - auto * left_arg = src_node; + auto * left_arg = dst_node; - FunctionCast::Diagnostic diagnostic = {src_node->result_name, res_elem.name}; + FunctionCast::Diagnostic diagnostic = {dst_node->result_name, res_elem.name}; FunctionOverloadResolverPtr func_builder_cast = std::make_shared( CastOverloadResolver::createImpl(false, std::move(diagnostic))); Inputs children = { left_arg, right_arg }; - src_node = &actions_dag->addFunction(func_builder_cast, std::move(children), {}, true); + dst_node = &actions_dag->addFunction(func_builder_cast, std::move(children), {}, true); } - if (src_node->column && isColumnConst(*src_node->column) && !(res_elem.column && isColumnConst(*res_elem.column))) + if (dst_node->column && isColumnConst(*dst_node->column) && !(res_elem.column && isColumnConst(*res_elem.column))) { - Inputs children = {src_node}; - src_node = &actions_dag->addFunction(func_builder_materialize, std::move(children), {}, true); + Inputs children = {dst_node}; + dst_node = &actions_dag->addFunction(func_builder_materialize, std::move(children), {}, true); } - if (src_node->result_name != res_elem.name) - src_node = &actions_dag->addAlias(*src_node, res_elem.name, true); + if (dst_node->result_name != res_elem.name) + { + if (add_casted_columns) + { + if (inputs.contains(dst_node->result_name)) + throw Exception("Cannot convert column " + backQuote(res_elem.name) + + " to "+ backQuote(dst_node->result_name) + + " because other column have same name", + ErrorCodes::ILLEGAL_COLUMN); + if (new_names) + new_names->emplace(res_elem.name, dst_node->result_name); - projection[result_col_num] = src_node; + /// Leave current column on same place, add converted to back + projection[result_col_num] = src_node; + projection.push_back(dst_node); + } + else + { + dst_node = &actions_dag->addAlias(*dst_node, res_elem.name, true); + projection[result_col_num] = dst_node; + } + } + else + { + projection[result_col_num] = dst_node; + } } actions_dag->removeUnusedActions(projection); diff --git a/src/Interpreters/ActionsDAG.h b/src/Interpreters/ActionsDAG.h index 3c8778e239a..48cc3449d39 100644 --- a/src/Interpreters/ActionsDAG.h +++ b/src/Interpreters/ActionsDAG.h @@ -248,11 +248,15 @@ public: /// Create ActionsDAG which converts block structure from source to result. /// It is needed to convert result from different sources to the same structure, e.g. for UNION query. /// Conversion should be possible with only usage of CAST function and renames. 
+ /// @param ignore_constant_values - Do not check that constants are same. Use value from result_header. + /// @param add_casted_columns - Create new columns with converted values instead of replacing original. static ActionsDAGPtr makeConvertingActions( const ColumnsWithTypeAndName & source, const ColumnsWithTypeAndName & result, MatchColumnsMode mode, - bool ignore_constant_values = false); /// Do not check that constants are same. Use value from result_header. + bool ignore_constant_values = false, + bool add_casted_columns = false, + NameToNameMap * new_names = nullptr); /// Create expression which add const column and then materialize it. static ActionsDAGPtr makeAddingColumnActions(ColumnWithTypeAndName column); diff --git a/src/Interpreters/ActionsVisitor.cpp b/src/Interpreters/ActionsVisitor.cpp index ca78c370834..cf9f937736d 100644 --- a/src/Interpreters/ActionsVisitor.cpp +++ b/src/Interpreters/ActionsVisitor.cpp @@ -674,7 +674,7 @@ void ActionsMatcher::visit(const ASTIdentifier & identifier, const ASTPtr & ast, if (column_name_type.name == column_name) { throw Exception("Column " + backQuote(column_name) + " is not under aggregate function and not in GROUP BY", - ErrorCodes::NOT_AN_AGGREGATE); + ErrorCodes::NOT_AN_AGGREGATE); } } diff --git a/src/Interpreters/ExpressionActions.cpp b/src/Interpreters/ExpressionActions.cpp index 9dd29e2c37d..f06f8248fec 100644 --- a/src/Interpreters/ExpressionActions.cpp +++ b/src/Interpreters/ExpressionActions.cpp @@ -747,8 +747,8 @@ void ExpressionActionsChain::JoinStep::finalize(const Names & required_output_) } /// Result will also contain joined columns. - for (const auto & column : analyzed_join->columnsAddedByJoin()) - required_names.emplace(column.name); + for (const auto & column_name : analyzed_join->columnsAddedByJoin()) + required_names.emplace(column_name); for (const auto & column : result_columns) { diff --git a/src/Interpreters/ExpressionAnalyzer.cpp b/src/Interpreters/ExpressionAnalyzer.cpp index 8e25c6e9d43..56bae974094 100644 --- a/src/Interpreters/ExpressionAnalyzer.cpp +++ b/src/Interpreters/ExpressionAnalyzer.cpp @@ -97,7 +97,10 @@ bool allowEarlyConstantFolding(const ActionsDAG & actions, const Settings & sett /// Returns converting actions for tables that need to be performed before join -ActionsDAGPtr createJoinConvertingActions(const ColumnsWithTypeAndName & cols_src, const TableJoin::NameToTypeMap & mapping) +ActionsDAGPtr createJoinConvertingActions(const ColumnsWithTypeAndName & cols_src, + const TableJoin::NameToTypeMap & mapping, + bool has_using, + NameToNameMap & renames) { ColumnsWithTypeAndName cols_dst = cols_src; for (auto & col : cols_dst) @@ -108,7 +111,8 @@ ActionsDAGPtr createJoinConvertingActions(const ColumnsWithTypeAndName & cols_sr col.column = nullptr; } } - return ActionsDAG::makeConvertingActions(cols_src, cols_dst, ActionsDAG::MatchColumnsMode::Name, true); + return ActionsDAG::makeConvertingActions( + cols_src, cols_dst, ActionsDAG::MatchColumnsMode::Name, true, !has_using, &renames); }; @@ -739,11 +743,12 @@ bool SelectQueryExpressionAnalyzer::appendJoinLeftKeys(ExpressionActionsChain & JoinPtr SelectQueryExpressionAnalyzer::appendJoin(ExpressionActionsChain & chain, ActionsDAGPtr & left_actions) { - JoinPtr table_join = makeTableJoin(*syntax->ast_join); + const ColumnsWithTypeAndName & left_sample_columns = chain.getLastStep().getResultColumns(); + + JoinPtr table_join = makeTableJoin(*syntax->ast_join, left_sample_columns, left_actions); if (syntax->analyzed_join->needConvert()) { - 
left_actions = createJoinConvertingActions(chain.getLastStep().getResultColumns(), - syntax->analyzed_join->getLeftMapping()); + assert(left_actions); chain.steps.push_back(std::make_unique(left_actions)); chain.addStep(); } @@ -814,7 +819,8 @@ static std::shared_ptr makeJoin(std::shared_ptr analyzed_join, return std::make_shared(analyzed_join, sample_block); } -JoinPtr SelectQueryExpressionAnalyzer::makeTableJoin(const ASTTablesInSelectQueryElement & join_element) +JoinPtr SelectQueryExpressionAnalyzer::makeTableJoin( + const ASTTablesInSelectQueryElement & join_element, const ColumnsWithTypeAndName & left_sample_columns, ActionsDAGPtr & left_actions) { /// Two JOINs are not supported with the same subquery, but different USINGs. auto join_hash = join_element.getTreeHash(); @@ -852,10 +858,26 @@ JoinPtr SelectQueryExpressionAnalyzer::makeTableJoin(const ASTTablesInSelectQuer /// TODO You do not need to set this up when JOIN is only needed on remote servers. subquery_for_join.addJoinActions(joined_block_actions); /// changes subquery_for_join.sample_block inside + const ColumnsWithTypeAndName & right_sample_columns = subquery_for_join.sample_block.getColumnsWithTypeAndName(); + /// For `USING` we already inferred common type an syntax analyzer stage + if (!syntax->analyzed_join->hasUsing()) + syntax->analyzed_join->inferJoinKeyCommonType(left_sample_columns, right_sample_columns); if (syntax->analyzed_join->needConvert()) { - auto right_actions = createJoinConvertingActions(subquery_for_join.sample_block.getColumnsWithTypeAndName(), - syntax->analyzed_join->getRightMapping()); + NameToNameMap left_column_rename; + left_actions = createJoinConvertingActions(left_sample_columns, + syntax->analyzed_join->getLeftMapping(), + syntax->analyzed_join->hasUsing(), + left_column_rename); + syntax->analyzed_join->applyKeyColumnRename(left_column_rename, TableJoin::TableSide::Left); + + NameToNameMap right_renames; + auto right_actions = createJoinConvertingActions(right_sample_columns, + syntax->analyzed_join->getRightMapping(), + syntax->analyzed_join->hasUsing(), + right_renames); + syntax->analyzed_join->applyKeyColumnRename(right_renames, TableJoin::TableSide::Right); + subquery_for_join.addJoinActions(std::make_shared(right_actions)); } diff --git a/src/Interpreters/ExpressionAnalyzer.h b/src/Interpreters/ExpressionAnalyzer.h index 62adcf50c5a..6af4be8ae41 100644 --- a/src/Interpreters/ExpressionAnalyzer.h +++ b/src/Interpreters/ExpressionAnalyzer.h @@ -315,7 +315,10 @@ private: /// Create Set-s that we make from IN section to use index on them. 
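    /// Editorial sketch (not part of the patch): a minimal illustration of how the extended
    /// ActionsDAG::makeConvertingActions signature introduced above is driven for JOIN ON key
    /// conversion. The wrapper name and argument names are hypothetical; the signature and flag
    /// semantics are the ones added to ActionsDAG.h in this commit.
    /// Assumed headers: <Interpreters/ActionsDAG.h>, <Core/Names.h>, <Core/ColumnsWithTypeAndName.h>.
    static DB::ActionsDAGPtr sketchKeyCastActions(
        const DB::ColumnsWithTypeAndName & src_columns,   /// current block structure of one join side
        const DB::ColumnsWithTypeAndName & dst_columns,   /// same columns, key types replaced by the common supertype
        DB::NameToNameMap & key_rename)                    /// filled with original key name -> casted column name
    {
        return DB::ActionsDAG::makeConvertingActions(
            src_columns, dst_columns,
            DB::ActionsDAG::MatchColumnsMode::Name,
            /* ignore_constant_values = */ true,
            /* add_casted_columns = */ true,   /// keep original key columns, append casted copies (ON syntax)
            &key_rename);
    }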
void makeSetsForIndex(const ASTPtr & node); - JoinPtr makeTableJoin(const ASTTablesInSelectQueryElement & join_element); + JoinPtr makeTableJoin( + const ASTTablesInSelectQueryElement & join_element, + const ColumnsWithTypeAndName & left_sample_columns, + ActionsDAGPtr & left_actions); const ASTSelectQuery * getAggregatingQuery() const; diff --git a/src/Interpreters/TableJoin.cpp b/src/Interpreters/TableJoin.cpp index f4ca60ce7c8..a394fc4ce53 100644 --- a/src/Interpreters/TableJoin.cpp +++ b/src/Interpreters/TableJoin.cpp @@ -110,14 +110,6 @@ void TableJoin::deduplicateAndQualifyColumnNames(const NameSet & left_table_colu columns_from_joined_table.swap(dedup_columns); } -NameSet TableJoin::getQualifiedColumnsSet() const -{ - NameSet out; - for (const auto & names : original_names) - out.insert(names.first); - return out; -} - NamesWithAliases TableJoin::getNamesWithAliases(const NameSet & required_columns) const { NamesWithAliases out; @@ -228,8 +220,11 @@ void TableJoin::addJoinedColumn(const NameAndTypePair & joined_column) { DataTypePtr type = joined_column.type; - if (auto it = right_type_map.find(joined_column.name); it != right_type_map.end()) - type = it->second; + if (hasUsing()) + { + if (auto it = right_type_map.find(joined_column.name); it != right_type_map.end()) + type = it->second; + } if (rightBecomeNullable(type)) type = makeNullable(joined_column.type); @@ -237,6 +232,11 @@ void TableJoin::addJoinedColumn(const NameAndTypePair & joined_column) columns_added_by_join.emplace_back(joined_column.name, type); } +void TableJoin::addRequiredLeftColumn(const String & left_column) +{ + required_left_keys.emplace(left_column); +} + void TableJoin::addJoinedColumnsAndCorrectTypes(NamesAndTypesList & names_and_types, bool correct_nullability) const { ColumnsWithTypeAndName columns; @@ -254,8 +254,11 @@ void TableJoin::addJoinedColumnsAndCorrectTypes(ColumnsWithTypeAndName & columns { for (auto & col : columns) { - if (auto it = left_type_map.find(col.name); it != left_type_map.end()) - col.type = it->second; + if (hasUsing()) + { + if (auto it = left_type_map.find(col.name); it != left_type_map.end()) + col.type = it->second; + } if (correct_nullability && leftBecomeNullable(col.type)) { /// No need to nullify constants @@ -270,20 +273,6 @@ void TableJoin::addJoinedColumnsAndCorrectTypes(ColumnsWithTypeAndName & columns columns.emplace_back(nullptr, col.type, col.name); } -bool TableJoin::sameJoin(const TableJoin * x, const TableJoin * y) -{ - if (!x && !y) - return true; - if (!x || !y) - return false; - - return x->table_join.kind == y->table_join.kind - && x->table_join.strictness == y->table_join.strictness - && x->key_names_left == y->key_names_left - && x->key_names_right == y->key_names_right - && x->columns_added_by_join == y->columns_added_by_join; -} - bool TableJoin::sameStrictnessAndKind(ASTTableJoin::Strictness strictness_, ASTTableJoin::Kind kind_) const { if (strictness_ == strictness() && kind_ == kind()) @@ -354,21 +343,35 @@ bool TableJoin::allowDictJoin(const String & dict_key, const Block & sample_bloc return true; } +bool TableJoin::inferJoinKeyCommonType(const ColumnsWithTypeAndName & left, const ColumnsWithTypeAndName & right) +{ + NamesAndTypesList left_list; + NamesAndTypesList right_list; + + for (const auto & col : left) + left_list.emplace_back(col.name, col.type); + + for (const auto & col : right) + right_list.emplace_back(col.name, col.type); + + return inferJoinKeyCommonType(left_list, right_list); +} + bool TableJoin::inferJoinKeyCommonType(const 
NamesAndTypesList & left, const NamesAndTypesList & right) { std::unordered_map left_types; - for (const auto & pair : left) + for (const auto & col : left) { - left_types[pair.name] = pair.type; + left_types[col.name] = col.type; } std::unordered_map right_types; - for (const auto & pair : right) + for (const auto & col : right) { - if (auto it = renames.find(pair.name); it != renames.end()) - right_types[it->second] = pair.type; + if (auto it = renames.find(col.name); it != renames.end()) + right_types[it->second] = col.type; else - right_types[pair.name] = pair.type; + right_types[col.name] = col.type; } for (size_t i = 0; i < key_names_left.size(); ++i) @@ -405,4 +408,23 @@ bool TableJoin::inferJoinKeyCommonType(const NamesAndTypesList & left, const Nam return !left_type_map.empty(); } +void TableJoin::applyKeyColumnRename(const NameToNameMap & name_map, TableJoin::TableSide side) +{ + assert(!hasUsing() || name_map.empty()); + + Names & names = side == TableSide::Left ? key_names_left : key_names_right; + for (auto & name : names) + { + const auto it = name_map.find(name); + if (it != name_map.end()) + { +// if (side == TableSide::Left && required_left_keys.contains(name)) +// { +// columns_added_by_join.emplace_back(name, nullptr); +// } + name = it->second; + } + } +} + } diff --git a/src/Interpreters/TableJoin.h b/src/Interpreters/TableJoin.h index 89f852642c0..5d0d5382775 100644 --- a/src/Interpreters/TableJoin.h +++ b/src/Interpreters/TableJoin.h @@ -65,16 +65,18 @@ private: const String temporary_files_codec = "LZ4"; Names key_names_left; + Names key_names_right; /// Duplicating names are qualified. ASTs key_asts_left; ASTs key_asts_right; ASTTableJoin table_join; ASOF::Inequality asof_inequality = ASOF::Inequality::GreaterOrEquals; + NameSet required_left_keys; /// All columns which can be read from joined table. Duplicating names are qualified. NamesAndTypesList columns_from_joined_table; /// Columns will be added to block by JOIN. 
- /// It's a subset of columns_from_joined_table with corrected Nullability and type (if type conversion is required) + /// It's a subset of columns_from_joined_table with corrected Nullability and type (if inplace type conversion is required) NamesAndTypesList columns_added_by_join; /// Target type to convert key columns before join @@ -88,7 +90,15 @@ private: VolumePtr tmp_volume; + Names requiredJoinedNames() const; + public: + enum class TableSide + { + Left, + Right + }; + TableJoin() = default; TableJoin(const Settings &, VolumePtr tmp_volume); @@ -138,7 +148,6 @@ public: bool hasOn() const { return table_join.on_expression != nullptr; } bool hasJoinedStorage() const { return joined_storage != nullptr; } - NameSet getQualifiedColumnsSet() const; NamesWithAliases getNamesWithAliases(const NameSet & required_columns) const; NamesWithAliases getRequiredColumns(const Block & sample, const Names & action_required_columns) const; @@ -149,12 +158,17 @@ public: bool leftBecomeNullable(const DataTypePtr & column_type) const; bool rightBecomeNullable(const DataTypePtr & column_type) const; void addJoinedColumn(const NameAndTypePair & joined_column); + void addRequiredLeftColumn(const String & left_column); + + void applyKeyColumnRename(const NameToNameMap & name_map, TableSide side); void addJoinedColumnsAndCorrectTypes(NamesAndTypesList & names_and_types, bool correct_nullability = true) const; void addJoinedColumnsAndCorrectTypes(ColumnsWithTypeAndName & columns, bool correct_nullability = true) const; /// Calculates common supertypes for corresponding join key columns. bool inferJoinKeyCommonType(const NamesAndTypesList & left, const NamesAndTypesList & right); + bool inferJoinKeyCommonType(const ColumnsWithTypeAndName & left, const ColumnsWithTypeAndName & right); + bool needConvert() const { return !left_type_map.empty(); } /// Key columns should be converted according to this mapping before join. const NameToTypeMap & getLeftMapping() const { return left_type_map; } @@ -166,11 +180,16 @@ public: ASTPtr leftKeysList() const; ASTPtr rightKeysList() const; /// For ON syntax only - Names requiredJoinedNames() const; const Names & keyNamesLeft() const { return key_names_left; } const Names & keyNamesRight() const { return key_names_right; } const NamesAndTypesList & columnsFromJoinedTable() const { return columns_from_joined_table; } - const NamesAndTypesList & columnsAddedByJoin() const { return columns_added_by_join; } + Names columnsAddedByJoin() const + { + Names res; + for (const auto & col : columns_added_by_join) + res.push_back(col.name); + return res; + } /// StorageJoin overrides key names (cause of different names qualification) void setRightKeys(const Names & keys) { key_names_right = keys; } @@ -178,8 +197,6 @@ public: /// Split key and other columns by keys name list void splitAdditionalColumns(const Block & sample_block, Block & block_keys, Block & block_others) const; Block getRequiredRightKeys(const Block & right_table_keys, std::vector & keys_sources) const; - - static bool sameJoin(const TableJoin * x, const TableJoin * y); }; } diff --git a/src/Interpreters/TreeRewriter.cpp b/src/Interpreters/TreeRewriter.cpp index 8487959ffb0..28c1864ac6c 100644 --- a/src/Interpreters/TreeRewriter.cpp +++ b/src/Interpreters/TreeRewriter.cpp @@ -419,9 +419,10 @@ void collectJoinedColumns(TableJoin & analyzed_join, const ASTSelectQuery & sele analyzed_join.addUsingKey(key); /// `USING` semantic allows to have columns with changed types in result table. 
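        /// Editorial sketch (not part of the patch): what "changed types in result table" means in
        /// terms of the supertype inference used here. The concrete column types are only an example
        /// and the helper name is hypothetical.
        /// Assumed headers: <DataTypes/DataTypesNumber.h>, <DataTypes/getLeastSupertype.h>.
        static DB::DataTypePtr exampleUsingKeyType()
        {
            /// For `SELECT a FROM t1 JOIN t2 USING (a)` with t1.a UInt16 and t2.a Int16, the result
            /// column `a` is reported as Int32 -- the least common supertype -- not as either source type.
            return DB::getLeastSupertype({std::make_shared<DB::DataTypeUInt16>(), std::make_shared<DB::DataTypeInt16>()});
        }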
- /// `JOIN ON key1 = key2` should preserve types from original table, so do not perform conversion at all. - /// TODO: Conversion for `JOIN ON` can be added with additional maintenance for types and columns. - /// Or maybe it's possible to perform it on ast level? Not implemented yet. + /// `JOIN ON` should preserve types from original table + /// We can infer common type on syntax stage, because join only by columns (not expression) is possible + /// We need to know that types in result tables changed because some analysis (e.g. analyzeAggregation) performed before we will create join + /// For `JOIN ON expr1 == expr2` we will infer common type on join createion, when types of expression will be known analyzed_join.inferJoinKeyCommonType(tables[0].columns, tables[1].columns); } else if (table_join.on_expression) @@ -576,13 +577,20 @@ void TreeRewriterResult::collectUsedColumns(const ASTPtr & query, bool is_select source_column_names.insert(column.name); NameSet required = columns_context.requiredColumns(); - if (columns_context.has_table_join) { NameSet available_columns; for (const auto & name : source_columns) available_columns.insert(name.name); + for (const auto & name : analyzed_join->keyNamesLeft()) + { + if (available_columns.count(name)) + continue; + if (required.count(name)) + analyzed_join->addRequiredLeftColumn(name); + } + /// Add columns obtained by JOIN (if needed). for (const auto & joined_column : analyzed_join->columnsFromJoinedTable()) { diff --git a/src/Interpreters/join_common.cpp b/src/Interpreters/join_common.cpp index e48cf3cd80d..e2285836cd1 100644 --- a/src/Interpreters/join_common.cpp +++ b/src/Interpreters/join_common.cpp @@ -306,19 +306,21 @@ NotJoined::NotJoined(const TableJoin & table_join, const Block & saved_block_sam table_join.splitAdditionalColumns(right_sample_block, right_table_keys, sample_block_with_columns_to_add); Block required_right_keys = table_join.getRequiredRightKeys(right_table_keys, tmp); - bool remap_keys = table_join.hasUsing(); std::unordered_map left_to_right_key_remap; - for (size_t i = 0; i < table_join.keyNamesLeft().size(); ++i) + if (table_join.hasUsing()) { - const String & left_key_name = table_join.keyNamesLeft()[i]; - const String & right_key_name = table_join.keyNamesRight()[i]; + for (size_t i = 0; i < table_join.keyNamesLeft().size(); ++i) + { + const String & left_key_name = table_join.keyNamesLeft()[i]; + const String & right_key_name = table_join.keyNamesRight()[i]; - size_t left_key_pos = result_sample_block.getPositionByName(left_key_name); - size_t right_key_pos = saved_block_sample.getPositionByName(right_key_name); + size_t left_key_pos = result_sample_block.getPositionByName(left_key_name); + size_t right_key_pos = saved_block_sample.getPositionByName(right_key_name); - if (remap_keys && !required_right_keys.has(right_key_name)) - left_to_right_key_remap[left_key_pos] = right_key_pos; + if (!required_right_keys.has(right_key_name)) + left_to_right_key_remap[left_key_pos] = right_key_pos; + } } /// result_sample_block: left_sample_block + left expressions, right not key columns, required right keys diff --git a/tests/queries/0_stateless/01710_join_use_nulls.reference b/tests/queries/0_stateless/01710_join_use_nulls.reference index 8bd111e0416..069117803a7 100644 --- a/tests/queries/0_stateless/01710_join_use_nulls.reference +++ b/tests/queries/0_stateless/01710_join_use_nulls.reference @@ -1,3 +1,4 @@ 3 +3 1 1 diff --git a/tests/queries/0_stateless/01710_join_use_nulls.sql 
b/tests/queries/0_stateless/01710_join_use_nulls.sql index b024227d4e2..f9fc4f31b6b 100644 --- a/tests/queries/0_stateless/01710_join_use_nulls.sql +++ b/tests/queries/0_stateless/01710_join_use_nulls.sql @@ -4,15 +4,14 @@ DROP TABLE IF EXISTS Y; CREATE TABLE X (id Int) ENGINE=Memory; CREATE TABLE Y (id Int) ENGINE=Memory; --- Type mismatch of columns to JOIN by: plus(id, 1) Int64 at left, Y.id Int32 at right. -SELECT Y.id - 1 FROM X RIGHT JOIN Y ON (X.id + 1) = Y.id SETTINGS join_use_nulls=1; -- { serverError 53 } +SELECT Y.id - 1 FROM X RIGHT JOIN Y ON (X.id + 1) = Y.id SETTINGS join_use_nulls=1; SELECT Y.id - 1 FROM X RIGHT JOIN Y ON (X.id + 1) = toInt64(Y.id) SETTINGS join_use_nulls=1; -- Logical error: 'Arguments of 'plus' have incorrect data types: '2' of type 'UInt8', '1' of type 'UInt8''. -- Because 1 became toNullable(1), i.e.: -- 2 UInt8 Const(size = 1, UInt8(size = 1)) -- 1 UInt8 Const(size = 1, Nullable(size = 1, UInt8(size = 1), UInt8(size = 1))) -SELECT 2+1 FROM system.one X RIGHT JOIN system.one Y ON X.dummy+1 = Y.dummy SETTINGS join_use_nulls = 1; -- { serverError 53 } +SELECT 2+1 FROM system.one X RIGHT JOIN system.one Y ON X.dummy+1 = Y.dummy SETTINGS join_use_nulls = 1; SELECT 2+1 FROM system.one X RIGHT JOIN system.one Y ON X.dummy+1 = toUInt16(Y.dummy) SETTINGS join_use_nulls = 1; SELECT X.dummy+1 FROM system.one X RIGHT JOIN system.one Y ON X.dummy = Y.dummy SETTINGS join_use_nulls = 1; SELECT Y.dummy+1 FROM system.one X RIGHT JOIN system.one Y ON X.dummy = Y.dummy SETTINGS join_use_nulls = 1; diff --git a/tests/queries/0_stateless/01720_join_implicit_cast.sql b/tests/queries/0_stateless/01720_join_implicit_cast.sql index b2fa227f0d5..71014466f4c 100644 --- a/tests/queries/0_stateless/01720_join_implicit_cast.sql +++ b/tests/queries/0_stateless/01720_join_implicit_cast.sql @@ -45,10 +45,10 @@ SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t SELECT toTypeName(any(a)) == 'Int32' AND toTypeName(any(t2.a)) == 'Int32' FROM t1 FULL JOIN t2 USING (a); SELECT min(toTypeName(a) == 'Int32' AND toTypeName(t2.a) == 'Int32') FROM t1 FULL JOIN t2 USING (a); -SELECT * FROM t1 FULL JOIN t2 ON (t1.a == t2.a) ORDER BY (a); -- { serverError 53 } -SELECT * FROM t1 LEFT JOIN t2 ON(t1.a == t2.a) ORDER BY (a); -- { serverError 53 } -SELECT * FROM t1 RIGHT JOIN t2 ON (t1.a == t2.a) ORDER BY (a); -- { serverError 53 } -SELECT * FROM t1 INNER JOIN t2 ON (t1.a == t2.a) ORDER BY (a); -- { serverError 53 } +-- SELECT * FROM t1 FULL JOIN t2 ON (t1.a == t2.a) ORDER BY (a); +-- SELECT * FROM t1 LEFT JOIN t2 ON(t1.a == t2.a) ORDER BY (a); +-- SELECT * FROM t1 RIGHT JOIN t2 ON (t1.a == t2.a) ORDER BY (a); +-- SELECT * FROM t1 INNER JOIN t2 ON (t1.a == t2.a) ORDER BY (a); SELECT '--- partial_merge ---'; @@ -89,10 +89,10 @@ SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t SELECT toTypeName(any(a)) == 'Int32' AND toTypeName(any(t2.a)) == 'Int32' FROM t1 FULL JOIN t2 USING (a); SELECT min(toTypeName(a) == 'Int32' AND toTypeName(t2.a) == 'Int32') FROM t1 FULL JOIN t2 USING (a); -SELECT * FROM t1 FULL JOIN t2 ON (t1.a == t2.a) ORDER BY (a); -- { serverError 53 } -SELECT * FROM t1 LEFT JOIN t2 ON(t1.a == t2.a) ORDER BY (a); -- { serverError 53 } -SELECT * FROM t1 RIGHT JOIN t2 ON (t1.a == t2.a) ORDER BY (a); -- { serverError 53 } -SELECT * FROM t1 INNER JOIN t2 ON (t1.a == t2.a) ORDER BY (a); -- { serverError 53 } +-- SELECT * FROM t1 FULL JOIN t2 ON (t1.a == t2.a) ORDER BY (a); +-- SELECT * FROM t1 LEFT JOIN t2 ON(t1.a == t2.a) ORDER BY (a); 
+-- SELECT * FROM t1 RIGHT JOIN t2 ON (t1.a == t2.a) ORDER BY (a); +-- SELECT * FROM t1 INNER JOIN t2 ON (t1.a == t2.a) ORDER BY (a); SELECT '--- switch ---'; @@ -135,10 +135,10 @@ SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t SELECT toTypeName(any(a)) == 'Int32' AND toTypeName(any(t2.a)) == 'Int32' FROM t1 FULL JOIN t2 USING (a); SELECT min(toTypeName(a) == 'Int32' AND toTypeName(t2.a) == 'Int32') FROM t1 FULL JOIN t2 USING (a); -SELECT * FROM t1 FULL JOIN t2 ON (t1.a == t2.a) ORDER BY (a); -- { serverError 53 } -SELECT * FROM t1 LEFT JOIN t2 ON(t1.a == t2.a) ORDER BY (a); -- { serverError 53 } -SELECT * FROM t1 RIGHT JOIN t2 ON (t1.a == t2.a) ORDER BY (a); -- { serverError 53 } -SELECT * FROM t1 INNER JOIN t2 ON (t1.a == t2.a) ORDER BY (a); -- { serverError 53 } +-- SELECT * FROM t1 FULL JOIN t2 ON (t1.a == t2.a) ORDER BY (a); +-- SELECT * FROM t1 LEFT JOIN t2 ON(t1.a == t2.a) ORDER BY (a); +-- SELECT * FROM t1 RIGHT JOIN t2 ON (t1.a == t2.a) ORDER BY (a); +-- SELECT * FROM t1 INNER JOIN t2 ON (t1.a == t2.a) ORDER BY (a); SET max_bytes_in_join = 0; From 3a7eddcf3aca14b921d26c92718f58ecb5e1feee Mon Sep 17 00:00:00 2001 From: vdimir Date: Thu, 18 Feb 2021 17:46:02 +0300 Subject: [PATCH 0483/2357] Remove addRequiredLeftColumn, some tests for join on different types --- src/Interpreters/ExpressionAnalyzer.cpp | 2 +- src/Interpreters/TableJoin.cpp | 11 -- src/Interpreters/TableJoin.h | 3 - src/Interpreters/TreeRewriter.cpp | 8 -- .../01720_join_implicit_cast.reference | 135 ++++++++++++++++++ .../0_stateless/01720_join_implicit_cast.sql | 60 ++++---- 6 files changed, 172 insertions(+), 47 deletions(-) diff --git a/src/Interpreters/ExpressionAnalyzer.cpp b/src/Interpreters/ExpressionAnalyzer.cpp index 56bae974094..eda695ad190 100644 --- a/src/Interpreters/ExpressionAnalyzer.cpp +++ b/src/Interpreters/ExpressionAnalyzer.cpp @@ -744,8 +744,8 @@ bool SelectQueryExpressionAnalyzer::appendJoinLeftKeys(ExpressionActionsChain & JoinPtr SelectQueryExpressionAnalyzer::appendJoin(ExpressionActionsChain & chain, ActionsDAGPtr & left_actions) { const ColumnsWithTypeAndName & left_sample_columns = chain.getLastStep().getResultColumns(); - JoinPtr table_join = makeTableJoin(*syntax->ast_join, left_sample_columns, left_actions); + if (syntax->analyzed_join->needConvert()) { assert(left_actions); diff --git a/src/Interpreters/TableJoin.cpp b/src/Interpreters/TableJoin.cpp index a394fc4ce53..bf83bcd06f9 100644 --- a/src/Interpreters/TableJoin.cpp +++ b/src/Interpreters/TableJoin.cpp @@ -232,11 +232,6 @@ void TableJoin::addJoinedColumn(const NameAndTypePair & joined_column) columns_added_by_join.emplace_back(joined_column.name, type); } -void TableJoin::addRequiredLeftColumn(const String & left_column) -{ - required_left_keys.emplace(left_column); -} - void TableJoin::addJoinedColumnsAndCorrectTypes(NamesAndTypesList & names_and_types, bool correct_nullability) const { ColumnsWithTypeAndName columns; @@ -417,13 +412,7 @@ void TableJoin::applyKeyColumnRename(const NameToNameMap & name_map, TableJoin:: { const auto it = name_map.find(name); if (it != name_map.end()) - { -// if (side == TableSide::Left && required_left_keys.contains(name)) -// { -// columns_added_by_join.emplace_back(name, nullptr); -// } name = it->second; - } } } diff --git a/src/Interpreters/TableJoin.h b/src/Interpreters/TableJoin.h index 5d0d5382775..93a824efac8 100644 --- a/src/Interpreters/TableJoin.h +++ b/src/Interpreters/TableJoin.h @@ -72,7 +72,6 @@ private: ASTTableJoin table_join; ASOF::Inequality 
asof_inequality = ASOF::Inequality::GreaterOrEquals; - NameSet required_left_keys; /// All columns which can be read from joined table. Duplicating names are qualified. NamesAndTypesList columns_from_joined_table; /// Columns will be added to block by JOIN. @@ -146,7 +145,6 @@ public: bool hasUsing() const { return table_join.using_expression_list != nullptr; } bool hasOn() const { return table_join.on_expression != nullptr; } - bool hasJoinedStorage() const { return joined_storage != nullptr; } NamesWithAliases getNamesWithAliases(const NameSet & required_columns) const; NamesWithAliases getRequiredColumns(const Block & sample, const Names & action_required_columns) const; @@ -158,7 +156,6 @@ public: bool leftBecomeNullable(const DataTypePtr & column_type) const; bool rightBecomeNullable(const DataTypePtr & column_type) const; void addJoinedColumn(const NameAndTypePair & joined_column); - void addRequiredLeftColumn(const String & left_column); void applyKeyColumnRename(const NameToNameMap & name_map, TableSide side); diff --git a/src/Interpreters/TreeRewriter.cpp b/src/Interpreters/TreeRewriter.cpp index 28c1864ac6c..2a331686949 100644 --- a/src/Interpreters/TreeRewriter.cpp +++ b/src/Interpreters/TreeRewriter.cpp @@ -583,14 +583,6 @@ void TreeRewriterResult::collectUsedColumns(const ASTPtr & query, bool is_select for (const auto & name : source_columns) available_columns.insert(name.name); - for (const auto & name : analyzed_join->keyNamesLeft()) - { - if (available_columns.count(name)) - continue; - if (required.count(name)) - analyzed_join->addRequiredLeftColumn(name); - } - /// Add columns obtained by JOIN (if needed). for (const auto & joined_column : analyzed_join->columnsFromJoinedTable()) { diff --git a/tests/queries/0_stateless/01720_join_implicit_cast.reference b/tests/queries/0_stateless/01720_join_implicit_cast.reference index 21b02c112d9..a30023e44f1 100644 --- a/tests/queries/0_stateless/01720_join_implicit_cast.reference +++ b/tests/queries/0_stateless/01720_join_implicit_cast.reference @@ -87,6 +87,51 @@ 3 3 3 4 4 4 5 5 5 +- join on - +- full - +0 0 -4 196 +0 0 -3 197 +0 0 -2 198 +0 0 -1 199 +0 0 0 200 +1 101 1 201 +2 102 2 202 +3 103 3 203 +4 104 4 204 +5 105 5 205 +6 106 0 \N +7 107 0 \N +8 108 0 \N +9 109 0 \N +10 110 0 \N +- left - +1 101 1 201 +2 102 2 202 +3 103 3 203 +4 104 4 204 +5 105 5 205 +6 106 0 \N +7 107 0 \N +8 108 0 \N +9 109 0 \N +10 110 0 \N +- right - +0 0 -4 196 +0 0 -3 197 +0 0 -2 198 +0 0 -1 199 +0 0 0 200 +1 101 1 201 +2 102 2 202 +3 103 3 203 +4 104 4 204 +5 105 5 205 +- inner - +1 101 1 201 +2 102 2 202 +3 103 3 203 +4 104 4 204 +5 105 5 205 - agg - 1 1 @@ -188,6 +233,51 @@ 3 3 3 4 4 4 5 5 5 +- join on - +- full - +0 0 -4 196 +0 0 -3 197 +0 0 -2 198 +0 0 -1 199 +0 0 0 200 +1 101 1 201 +2 102 2 202 +3 103 3 203 +4 104 4 204 +5 105 5 205 +6 106 0 \N +7 107 0 \N +8 108 0 \N +9 109 0 \N +10 110 0 \N +- left - +1 101 1 201 +2 102 2 202 +3 103 3 203 +4 104 4 204 +5 105 5 205 +6 106 0 \N +7 107 0 \N +8 108 0 \N +9 109 0 \N +10 110 0 \N +- right - +0 0 -4 196 +0 0 -3 197 +0 0 -2 198 +0 0 -1 199 +0 0 0 200 +1 101 1 201 +2 102 2 202 +3 103 3 203 +4 104 4 204 +5 105 5 205 +- inner - +1 101 1 201 +2 102 2 202 +3 103 3 203 +4 104 4 204 +5 105 5 205 - agg - 1 1 @@ -289,6 +379,51 @@ 3 3 3 4 4 4 5 5 5 +- join on - +- full - +0 0 -4 196 +0 0 -3 197 +0 0 -2 198 +0 0 -1 199 +0 0 0 200 +1 101 1 201 +2 102 2 202 +3 103 3 203 +4 104 4 204 +5 105 5 205 +6 106 0 \N +7 107 0 \N +8 108 0 \N +9 109 0 \N +10 110 0 \N +- left - +1 101 1 201 +2 102 2 202 +3 103 3 203 +4 104 4 
204 +5 105 5 205 +6 106 0 \N +7 107 0 \N +8 108 0 \N +9 109 0 \N +10 110 0 \N +- right - +0 0 -4 196 +0 0 -3 197 +0 0 -2 198 +0 0 -1 199 +0 0 0 200 +1 101 1 201 +2 102 2 202 +3 103 3 203 +4 104 4 204 +5 105 5 205 +- inner - +1 101 1 201 +2 102 2 202 +3 103 3 203 +4 104 4 204 +5 105 5 205 - agg - 1 1 diff --git a/tests/queries/0_stateless/01720_join_implicit_cast.sql b/tests/queries/0_stateless/01720_join_implicit_cast.sql index 71014466f4c..b011dff6598 100644 --- a/tests/queries/0_stateless/01720_join_implicit_cast.sql +++ b/tests/queries/0_stateless/01720_join_implicit_cast.sql @@ -28,7 +28,17 @@ SELECT a, t1.a, t2.a FROM t1 RIGHT JOIN t2 USING (a) ORDER BY (t1.a, t2.a); SELECT '- inner -'; SELECT a, t1.a, t2.a FROM t1 INNER JOIN t2 USING (a) ORDER BY (t1.a, t2.a); -select '- agg -'; +SELECT '- join on -'; +SELECT '- full -'; +SELECT a, b, t2.a, t2.b FROM t1 FULL JOIN t2 ON (t1.a == t2.a) ORDER BY (t1.a, t2.a); +SELECT '- left -'; +SELECT a, b, t2.a, t2.b FROM t1 LEFT JOIN t2 ON (t1.a == t2.a) ORDER BY (t1.a, t2.a); +SELECT '- right -'; +SELECT a, b, t2.a, t2.b FROM t1 RIGHT JOIN t2 ON (t1.a == t2.a) ORDER BY (t1.a, t2.a); +SELECT '- inner -'; +SELECT a, b, t2.a, t2.b FROM t1 INNER JOIN t2 ON (t1.a == t2.a) ORDER BY (t1.a, t2.a); + +SELECT '- agg -'; SELECT sum(a) == 7 FROM t1 FULL JOIN t2 USING (a) WHERE b > 102 AND t2.b <= 204; SELECT sum(a) == 7 FROM t1 INNER JOIN t2 USING (a) WHERE b > 102 AND t2.b <= 204; @@ -45,11 +55,6 @@ SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t SELECT toTypeName(any(a)) == 'Int32' AND toTypeName(any(t2.a)) == 'Int32' FROM t1 FULL JOIN t2 USING (a); SELECT min(toTypeName(a) == 'Int32' AND toTypeName(t2.a) == 'Int32') FROM t1 FULL JOIN t2 USING (a); --- SELECT * FROM t1 FULL JOIN t2 ON (t1.a == t2.a) ORDER BY (a); --- SELECT * FROM t1 LEFT JOIN t2 ON(t1.a == t2.a) ORDER BY (a); --- SELECT * FROM t1 RIGHT JOIN t2 ON (t1.a == t2.a) ORDER BY (a); --- SELECT * FROM t1 INNER JOIN t2 ON (t1.a == t2.a) ORDER BY (a); - SELECT '--- partial_merge ---'; SET join_algorithm = 'partial_merge'; @@ -63,7 +68,6 @@ SELECT a, b, t2.b FROM t1 RIGHT JOIN t2 USING (a) ORDER BY (a); SELECT '- inner -'; SELECT a, b, t2.b FROM t1 INNER JOIN t2 USING (a) ORDER BY (a); - SELECT '- full -'; SELECT a, t1.a, t2.a FROM t1 FULL JOIN t2 USING (a) ORDER BY (t1.a, t2.a); SELECT '- left -'; @@ -73,7 +77,17 @@ SELECT a, t1.a, t2.a FROM t1 RIGHT JOIN t2 USING (a) ORDER BY (t1.a, t2.a); SELECT '- inner -'; SELECT a, t1.a, t2.a FROM t1 INNER JOIN t2 USING (a) ORDER BY (t1.a, t2.a); -select '- agg -'; +SELECT '- join on -'; +SELECT '- full -'; +SELECT a, b, t2.a, t2.b FROM t1 FULL JOIN t2 ON (t1.a == t2.a) ORDER BY (t1.a, t2.a); +SELECT '- left -'; +SELECT a, b, t2.a, t2.b FROM t1 LEFT JOIN t2 ON (t1.a == t2.a) ORDER BY (t1.a, t2.a); +SELECT '- right -'; +SELECT a, b, t2.a, t2.b FROM t1 RIGHT JOIN t2 ON (t1.a == t2.a) ORDER BY (t1.a, t2.a); +SELECT '- inner -'; +SELECT a, b, t2.a, t2.b FROM t1 INNER JOIN t2 ON (t1.a == t2.a) ORDER BY (t1.a, t2.a); + +SELECT '- agg -'; SELECT sum(a) == 7 FROM t1 FULL JOIN t2 USING (a) WHERE b > 102 AND t2.b <= 204; SELECT sum(a) == 7 FROM t1 INNER JOIN t2 USING (a) WHERE b > 102 AND t2.b <= 204; @@ -89,12 +103,6 @@ SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t SELECT toTypeName(any(a)) == 'Int32' AND toTypeName(any(t2.a)) == 'Int32' FROM t1 FULL JOIN t2 USING (a); SELECT min(toTypeName(a) == 'Int32' AND toTypeName(t2.a) == 'Int32') FROM t1 FULL JOIN t2 USING (a); --- SELECT * FROM t1 FULL JOIN t2 ON 
(t1.a == t2.a) ORDER BY (a); --- SELECT * FROM t1 LEFT JOIN t2 ON(t1.a == t2.a) ORDER BY (a); --- SELECT * FROM t1 RIGHT JOIN t2 ON (t1.a == t2.a) ORDER BY (a); --- SELECT * FROM t1 INNER JOIN t2 ON (t1.a == t2.a) ORDER BY (a); - - SELECT '--- switch ---'; SET join_algorithm = 'auto'; @@ -109,7 +117,6 @@ SELECT a, b, t2.b FROM t1 RIGHT JOIN t2 USING (a) ORDER BY (a); SELECT '- inner -'; SELECT a, b, t2.b FROM t1 INNER JOIN t2 USING (a) ORDER BY (a); - SELECT '- full -'; SELECT a, t1.a, t2.a FROM t1 FULL JOIN t2 USING (a) ORDER BY (t1.a, t2.a); SELECT '- left -'; @@ -119,7 +126,17 @@ SELECT a, t1.a, t2.a FROM t1 RIGHT JOIN t2 USING (a) ORDER BY (t1.a, t2.a); SELECT '- inner -'; SELECT a, t1.a, t2.a FROM t1 INNER JOIN t2 USING (a) ORDER BY (t1.a, t2.a); -select '- agg -'; +SELECT '- join on -'; +SELECT '- full -'; +SELECT a, b, t2.a, t2.b FROM t1 FULL JOIN t2 ON (t1.a == t2.a) ORDER BY (t1.a, t2.a); +SELECT '- left -'; +SELECT a, b, t2.a, t2.b FROM t1 LEFT JOIN t2 ON (t1.a == t2.a) ORDER BY (t1.a, t2.a); +SELECT '- right -'; +SELECT a, b, t2.a, t2.b FROM t1 RIGHT JOIN t2 ON (t1.a == t2.a) ORDER BY (t1.a, t2.a); +SELECT '- inner -'; +SELECT a, b, t2.a, t2.b FROM t1 INNER JOIN t2 ON (t1.a == t2.a) ORDER BY (t1.a, t2.a); + +SELECT '- agg -'; SELECT sum(a) == 7 FROM t1 FULL JOIN t2 USING (a) WHERE b > 102 AND t2.b <= 204; SELECT sum(a) == 7 FROM t1 INNER JOIN t2 USING (a) WHERE b > 102 AND t2.b <= 204; @@ -135,11 +152,6 @@ SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t SELECT toTypeName(any(a)) == 'Int32' AND toTypeName(any(t2.a)) == 'Int32' FROM t1 FULL JOIN t2 USING (a); SELECT min(toTypeName(a) == 'Int32' AND toTypeName(t2.a) == 'Int32') FROM t1 FULL JOIN t2 USING (a); --- SELECT * FROM t1 FULL JOIN t2 ON (t1.a == t2.a) ORDER BY (a); --- SELECT * FROM t1 LEFT JOIN t2 ON(t1.a == t2.a) ORDER BY (a); --- SELECT * FROM t1 RIGHT JOIN t2 ON (t1.a == t2.a) ORDER BY (a); --- SELECT * FROM t1 INNER JOIN t2 ON (t1.a == t2.a) ORDER BY (a); - SET max_bytes_in_join = 0; SELECT '--- join use nulls ---'; @@ -156,7 +168,7 @@ SET join_use_nulls = 0; DROP TABLE IF EXISTS t1; DROP TABLE IF EXISTS t2; -select '---'; +SELECT '---'; DROP TABLE IF EXISTS t_ab1; DROP TABLE IF EXISTS t_ab2; @@ -179,7 +191,7 @@ SELECT a, b FROM t_ab1 RIGHT JOIN t_ab2 USING (a, b) ORDER BY ifNull(t_ab1.id, t SELECT '- inner -'; SELECT a, b FROM t_ab1 INNER JOIN t_ab2 USING (a, b) ORDER BY ifNull(t_ab1.id, t_ab2.id); -select '- agg -'; +SELECT '- agg -'; SELECT sum(a), sum(b) FROM t_ab1 FULL JOIN t_ab2 USING (a, b); SELECT sum(a), sum(b) FROM t_ab1 LEFT JOIN t_ab2 USING (a, b); SELECT sum(a), sum(b) FROM t_ab1 RIGHT JOIN t_ab2 USING (a, b); @@ -205,7 +217,7 @@ SELECT a, b FROM t_ab1 RIGHT JOIN t_ab2 USING (a, b) ORDER BY ifNull(t_ab1.id, t SELECT '- inner -'; SELECT a, b FROM t_ab1 INNER JOIN t_ab2 USING (a, b) ORDER BY ifNull(t_ab1.id, t_ab2.id); -select '- agg -'; +SELECT '- agg -'; SELECT sum(a), sum(b) FROM t_ab1 FULL JOIN t_ab2 USING (a, b); SELECT sum(a), sum(b) FROM t_ab1 LEFT JOIN t_ab2 USING (a, b); SELECT sum(a), sum(b) FROM t_ab1 RIGHT JOIN t_ab2 USING (a, b); From 456414beea5519934ec731ace35af2640a64ece8 Mon Sep 17 00:00:00 2001 From: vdimir Date: Thu, 18 Feb 2021 18:51:38 +0300 Subject: [PATCH 0484/2357] Fix converting join on keys, move actions into TableJoin --- src/Interpreters/ExpressionAnalyzer.cpp | 57 ++++--------------------- src/Interpreters/ExpressionAnalyzer.h | 5 +-- src/Interpreters/TableJoin.cpp | 57 ++++++++++++++++++------- src/Interpreters/TableJoin.h | 26 +++++------ 4 
files changed, 67 insertions(+), 78 deletions(-) diff --git a/src/Interpreters/ExpressionAnalyzer.cpp b/src/Interpreters/ExpressionAnalyzer.cpp index eda695ad190..bee5c2df158 100644 --- a/src/Interpreters/ExpressionAnalyzer.cpp +++ b/src/Interpreters/ExpressionAnalyzer.cpp @@ -95,27 +95,6 @@ bool allowEarlyConstantFolding(const ActionsDAG & actions, const Settings & sett return true; } - -/// Returns converting actions for tables that need to be performed before join -ActionsDAGPtr createJoinConvertingActions(const ColumnsWithTypeAndName & cols_src, - const TableJoin::NameToTypeMap & mapping, - bool has_using, - NameToNameMap & renames) -{ - ColumnsWithTypeAndName cols_dst = cols_src; - for (auto & col : cols_dst) - { - if (auto it = mapping.find(col.name); it != mapping.end()) - { - col.type = it->second; - col.column = nullptr; - } - } - return ActionsDAG::makeConvertingActions( - cols_src, cols_dst, ActionsDAG::MatchColumnsMode::Name, true, !has_using, &renames); -}; - - } bool sanitizeBlock(Block & block, bool throw_if_cannot_create_column) @@ -741,15 +720,14 @@ bool SelectQueryExpressionAnalyzer::appendJoinLeftKeys(ExpressionActionsChain & return true; } -JoinPtr SelectQueryExpressionAnalyzer::appendJoin(ExpressionActionsChain & chain, ActionsDAGPtr & left_actions) +JoinPtr SelectQueryExpressionAnalyzer::appendJoin(ExpressionActionsChain & chain) { const ColumnsWithTypeAndName & left_sample_columns = chain.getLastStep().getResultColumns(); - JoinPtr table_join = makeTableJoin(*syntax->ast_join, left_sample_columns, left_actions); + JoinPtr table_join = makeTableJoin(*syntax->ast_join, left_sample_columns); if (syntax->analyzed_join->needConvert()) { - assert(left_actions); - chain.steps.push_back(std::make_unique(left_actions)); + chain.steps.push_back(std::make_unique(syntax->analyzed_join->leftConvertingActions())); chain.addStep(); } @@ -820,7 +798,7 @@ static std::shared_ptr makeJoin(std::shared_ptr analyzed_join, } JoinPtr SelectQueryExpressionAnalyzer::makeTableJoin( - const ASTTablesInSelectQueryElement & join_element, const ColumnsWithTypeAndName & left_sample_columns, ActionsDAGPtr & left_actions) + const ASTTablesInSelectQueryElement & join_element, const ColumnsWithTypeAndName & left_sample_columns) { /// Two JOINs are not supported with the same subquery, but different USINGs. 
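    /// Editorial sketch (not part of the patch): the TableJoin-side contract this refactoring
    /// converges on, using applyJoinKeyConvert / leftConvertingActions / rightConvertingActions
    /// declared in TableJoin.h further down in this commit. Wrapper and variable names are
    /// hypothetical. Assumed header: <Interpreters/TableJoin.h>.
    static std::pair<DB::ActionsDAGPtr, DB::ActionsDAGPtr> sketchJoinKeyConversion(
        DB::TableJoin & analyzed_join,
        const DB::ColumnsWithTypeAndName & left_sample_columns,
        const DB::ColumnsWithTypeAndName & right_sample_columns)
    {
        /// Infers common key types for ON joins (USING keys were already inferred during syntax
        /// analysis) and, when any key needs a cast, builds converting ActionsDAGs for both sides.
        if (!analyzed_join.applyJoinKeyConvert(left_sample_columns, right_sample_columns))
            return {};
        /// The left DAG is applied to the left-hand stream before the join step; the right DAG is
        /// appended to the joined subquery's actions (see appendJoin / makeTableJoin above).
        return {analyzed_join.leftConvertingActions(), analyzed_join.rightConvertingActions()};
    }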
auto join_hash = join_element.getTreeHash(); @@ -859,27 +837,9 @@ JoinPtr SelectQueryExpressionAnalyzer::makeTableJoin( subquery_for_join.addJoinActions(joined_block_actions); /// changes subquery_for_join.sample_block inside const ColumnsWithTypeAndName & right_sample_columns = subquery_for_join.sample_block.getColumnsWithTypeAndName(); - /// For `USING` we already inferred common type an syntax analyzer stage - if (!syntax->analyzed_join->hasUsing()) - syntax->analyzed_join->inferJoinKeyCommonType(left_sample_columns, right_sample_columns); - if (syntax->analyzed_join->needConvert()) - { - NameToNameMap left_column_rename; - left_actions = createJoinConvertingActions(left_sample_columns, - syntax->analyzed_join->getLeftMapping(), - syntax->analyzed_join->hasUsing(), - left_column_rename); - syntax->analyzed_join->applyKeyColumnRename(left_column_rename, TableJoin::TableSide::Left); - - NameToNameMap right_renames; - auto right_actions = createJoinConvertingActions(right_sample_columns, - syntax->analyzed_join->getRightMapping(), - syntax->analyzed_join->hasUsing(), - right_renames); - syntax->analyzed_join->applyKeyColumnRename(right_renames, TableJoin::TableSide::Right); - - subquery_for_join.addJoinActions(std::make_shared(right_actions)); - } + bool need_convert = syntax->analyzed_join->applyJoinKeyConvert(left_sample_columns, right_sample_columns); + if (need_convert) + subquery_for_join.addJoinActions(std::make_shared(syntax->analyzed_join->rightConvertingActions())); subquery_for_join.join = makeJoin(syntax->analyzed_join, subquery_for_join.sample_block, context); @@ -1476,7 +1436,8 @@ ExpressionAnalysisResult::ExpressionAnalysisResult( { query_analyzer.appendJoinLeftKeys(chain, only_types || !first_stage); before_join = chain.getLastActions(); - join = query_analyzer.appendJoin(chain, converting_join_columns); + join = query_analyzer.appendJoin(chain); + converting_join_columns = query_analyzer.analyzedJoin().leftConvertingActions(); chain.addStep(); } diff --git a/src/Interpreters/ExpressionAnalyzer.h b/src/Interpreters/ExpressionAnalyzer.h index 6af4be8ae41..1a0f88f95ac 100644 --- a/src/Interpreters/ExpressionAnalyzer.h +++ b/src/Interpreters/ExpressionAnalyzer.h @@ -317,8 +317,7 @@ private: JoinPtr makeTableJoin( const ASTTablesInSelectQueryElement & join_element, - const ColumnsWithTypeAndName & left_sample_columns, - ActionsDAGPtr & left_actions); + const ColumnsWithTypeAndName & left_sample_columns); const ASTSelectQuery * getAggregatingQuery() const; @@ -339,7 +338,7 @@ private: /// Before aggregation: ArrayJoinActionPtr appendArrayJoin(ExpressionActionsChain & chain, ActionsDAGPtr & before_array_join, bool only_types); bool appendJoinLeftKeys(ExpressionActionsChain & chain, bool only_types); - JoinPtr appendJoin(ExpressionActionsChain & chain, ActionsDAGPtr & left_actions); + JoinPtr appendJoin(ExpressionActionsChain & chain); /// Add preliminary rows filtration. Actions are created in other expression analyzer to prevent any possible alias injection. 
void appendPreliminaryFilter(ExpressionActionsChain & chain, ActionsDAGPtr actions_dag, String column_name); /// remove_filter is set in ExpressionActionsChain::finalize(); diff --git a/src/Interpreters/TableJoin.cpp b/src/Interpreters/TableJoin.cpp index bf83bcd06f9..691da9cd450 100644 --- a/src/Interpreters/TableJoin.cpp +++ b/src/Interpreters/TableJoin.cpp @@ -338,18 +338,29 @@ bool TableJoin::allowDictJoin(const String & dict_key, const Block & sample_bloc return true; } -bool TableJoin::inferJoinKeyCommonType(const ColumnsWithTypeAndName & left, const ColumnsWithTypeAndName & right) +bool TableJoin::applyJoinKeyConvert(const ColumnsWithTypeAndName & left_sample_columns, const ColumnsWithTypeAndName & right_sample_columns) { - NamesAndTypesList left_list; - NamesAndTypesList right_list; + bool need_convert = needConvert(); + if (!need_convert && !hasUsing()) + { + /// For `USING` we already inferred common type an syntax analyzer stage + NamesAndTypesList left_list; + NamesAndTypesList right_list; + for (const auto & col : left_sample_columns) + left_list.emplace_back(col.name, col.type); + for (const auto & col : right_sample_columns) + right_list.emplace_back(col.name, col.type); - for (const auto & col : left) - left_list.emplace_back(col.name, col.type); + need_convert = inferJoinKeyCommonType(left_list, right_list); + } - for (const auto & col : right) - right_list.emplace_back(col.name, col.type); + if (need_convert) + { + left_converting_actions = applyKeyConvertToTable(left_sample_columns, left_type_map, key_names_left); + right_converting_actions = applyKeyConvertToTable(right_sample_columns, right_type_map, key_names_right); + } - return inferJoinKeyCommonType(left_list, right_list); + return need_convert; } bool TableJoin::inferJoinKeyCommonType(const NamesAndTypesList & left, const NamesAndTypesList & right) @@ -403,17 +414,33 @@ bool TableJoin::inferJoinKeyCommonType(const NamesAndTypesList & left, const Nam return !left_type_map.empty(); } -void TableJoin::applyKeyColumnRename(const NameToNameMap & name_map, TableJoin::TableSide side) +ActionsDAGPtr +TableJoin::applyKeyConvertToTable(const ColumnsWithTypeAndName & cols_src, const NameToTypeMap & type_mapping, Names & names_to_rename) { - assert(!hasUsing() || name_map.empty()); - - Names & names = side == TableSide::Left ? key_names_left : key_names_right; - for (auto & name : names) + ColumnsWithTypeAndName cols_dst = cols_src; + for (auto & col : cols_dst) { - const auto it = name_map.find(name); - if (it != name_map.end()) + if (auto it = type_mapping.find(col.name); it != type_mapping.end()) + { + col.type = it->second; + col.column = nullptr; + } + } + + NameToNameMap key_column_rename; + /// Returns converting actions for tables that need to be performed before join + auto dag = ActionsDAG::makeConvertingActions( + cols_src, cols_dst, ActionsDAG::MatchColumnsMode::Name, true, !hasUsing(), &key_column_rename); + + assert(!hasUsing() || key_column_rename.empty()); + + for (auto & name : names_to_rename) + { + const auto it = key_column_rename.find(name); + if (it != key_column_rename.end()) name = it->second; } + return dag; } } diff --git a/src/Interpreters/TableJoin.h b/src/Interpreters/TableJoin.h index 93a824efac8..c492892e6b5 100644 --- a/src/Interpreters/TableJoin.h +++ b/src/Interpreters/TableJoin.h @@ -82,6 +82,9 @@ private: NameToTypeMap left_type_map; NameToTypeMap right_type_map; + ActionsDAGPtr left_converting_actions; + ActionsDAGPtr right_converting_actions; + /// Name -> original name. 
Names are the same as in columns_from_joined_table list. std::unordered_map original_names; /// Original name -> name. Only renamed columns. @@ -91,13 +94,12 @@ private: Names requiredJoinedNames() const; -public: - enum class TableSide - { - Left, - Right - }; + /// Create converting actions and change key column names if required + ActionsDAGPtr applyKeyConvertToTable(const ColumnsWithTypeAndName & cols_src, + const NameToTypeMap & type_mapping, + Names & names_to_rename); +public: TableJoin() = default; TableJoin(const Settings &, VolumePtr tmp_volume); @@ -157,19 +159,19 @@ public: bool rightBecomeNullable(const DataTypePtr & column_type) const; void addJoinedColumn(const NameAndTypePair & joined_column); - void applyKeyColumnRename(const NameToNameMap & name_map, TableSide side); - void addJoinedColumnsAndCorrectTypes(NamesAndTypesList & names_and_types, bool correct_nullability = true) const; void addJoinedColumnsAndCorrectTypes(ColumnsWithTypeAndName & columns, bool correct_nullability = true) const; /// Calculates common supertypes for corresponding join key columns. bool inferJoinKeyCommonType(const NamesAndTypesList & left, const NamesAndTypesList & right); - bool inferJoinKeyCommonType(const ColumnsWithTypeAndName & left, const ColumnsWithTypeAndName & right); + /// + bool applyJoinKeyConvert(const ColumnsWithTypeAndName & left_sample_columns, const ColumnsWithTypeAndName & right_sample_columns); bool needConvert() const { return !left_type_map.empty(); } - /// Key columns should be converted according to this mapping before join. - const NameToTypeMap & getLeftMapping() const { return left_type_map; } - const NameToTypeMap & getRightMapping() const { return right_type_map; } + + /// Key columns should be converted before join. + ActionsDAGPtr leftConvertingActions() const { return left_converting_actions; } + ActionsDAGPtr rightConvertingActions() const { return right_converting_actions; } void setAsofInequality(ASOF::Inequality inequality) { asof_inequality = inequality; } ASOF::Inequality getAsofInequality() { return asof_inequality; } From 1e37d7c84f6c8cc8b00c2517d94abaf5b27de661 Mon Sep 17 00:00:00 2001 From: vdimir Date: Thu, 18 Feb 2021 19:43:41 +0300 Subject: [PATCH 0485/2357] Add comments to TableJoin::inferJoinKeyCommonType --- src/Interpreters/TableJoin.cpp | 11 +++++------ src/Interpreters/TableJoin.h | 10 ++++++---- src/Interpreters/TreeRewriter.cpp | 6 +++--- 3 files changed, 14 insertions(+), 13 deletions(-) diff --git a/src/Interpreters/TableJoin.cpp b/src/Interpreters/TableJoin.cpp index 691da9cd450..cdf16356f4c 100644 --- a/src/Interpreters/TableJoin.cpp +++ b/src/Interpreters/TableJoin.cpp @@ -400,12 +400,13 @@ bool TableJoin::inferJoinKeyCommonType(const NamesAndTypesList & left, const Nam { supertype = DB::getLeastSupertype({ltype->second, rtype->second}); } - catch (DB::Exception &) + catch (DB::Exception & ex) { throw Exception( "Type mismatch of columns to JOIN by: " + key_names_left[i] + ": " + ltype->second->getName() + " at left, " + - key_names_right[i] + ": " + rtype->second->getName() + " at right", + key_names_right[i] + ": " + rtype->second->getName() + " at right. 
" + + "Can't get supertype: " + ex.message(), ErrorCodes::TYPE_MISMATCH); } left_type_map[key_names_left[i]] = right_type_map[key_names_right[i]] = supertype; @@ -414,8 +415,8 @@ bool TableJoin::inferJoinKeyCommonType(const NamesAndTypesList & left, const Nam return !left_type_map.empty(); } -ActionsDAGPtr -TableJoin::applyKeyConvertToTable(const ColumnsWithTypeAndName & cols_src, const NameToTypeMap & type_mapping, Names & names_to_rename) +ActionsDAGPtr TableJoin::applyKeyConvertToTable( + const ColumnsWithTypeAndName & cols_src, const NameToTypeMap & type_mapping, Names & names_to_rename) const { ColumnsWithTypeAndName cols_dst = cols_src; for (auto & col : cols_dst) @@ -432,8 +433,6 @@ TableJoin::applyKeyConvertToTable(const ColumnsWithTypeAndName & cols_src, const auto dag = ActionsDAG::makeConvertingActions( cols_src, cols_dst, ActionsDAG::MatchColumnsMode::Name, true, !hasUsing(), &key_column_rename); - assert(!hasUsing() || key_column_rename.empty()); - for (auto & name : names_to_rename) { const auto it = key_column_rename.find(name); diff --git a/src/Interpreters/TableJoin.h b/src/Interpreters/TableJoin.h index c492892e6b5..a74bbd104c4 100644 --- a/src/Interpreters/TableJoin.h +++ b/src/Interpreters/TableJoin.h @@ -95,9 +95,8 @@ private: Names requiredJoinedNames() const; /// Create converting actions and change key column names if required - ActionsDAGPtr applyKeyConvertToTable(const ColumnsWithTypeAndName & cols_src, - const NameToTypeMap & type_mapping, - Names & names_to_rename); + ActionsDAGPtr applyKeyConvertToTable( + const ColumnsWithTypeAndName & cols_src, const NameToTypeMap & type_mapping, Names & names_to_rename) const; public: TableJoin() = default; @@ -164,7 +163,10 @@ public: /// Calculates common supertypes for corresponding join key columns. bool inferJoinKeyCommonType(const NamesAndTypesList & left, const NamesAndTypesList & right); - /// + + /// Calculate converting actions, rename key columns in required + /// For `USING` join we will convert key columns inplace and affect into types in the result table + /// For `JOIN ON` we will create new columns with converted keys to join by. bool applyJoinKeyConvert(const ColumnsWithTypeAndName & left_sample_columns, const ColumnsWithTypeAndName & right_sample_columns); bool needConvert() const { return !left_type_map.empty(); } diff --git a/src/Interpreters/TreeRewriter.cpp b/src/Interpreters/TreeRewriter.cpp index 2a331686949..b1043a9ad0d 100644 --- a/src/Interpreters/TreeRewriter.cpp +++ b/src/Interpreters/TreeRewriter.cpp @@ -420,9 +420,9 @@ void collectJoinedColumns(TableJoin & analyzed_join, const ASTSelectQuery & sele /// `USING` semantic allows to have columns with changed types in result table. /// `JOIN ON` should preserve types from original table - /// We can infer common type on syntax stage, because join only by columns (not expression) is possible - /// We need to know that types in result tables changed because some analysis (e.g. analyzeAggregation) performed before we will create join - /// For `JOIN ON expr1 == expr2` we will infer common type on join createion, when types of expression will be known + /// We can infer common type on syntax stage for `USING` because join is performed only by columns (not expressions) + /// We need to know changed types in result tables because some analysis (e.g. 
analyzeAggregation) performed before join + /// For `JOIN ON expr1 == expr2` we will infer common type later in ExpressionAnalyzer, when types of expression will be known analyzed_join.inferJoinKeyCommonType(tables[0].columns, tables[1].columns); } else if (table_join.on_expression) From ab0719caf0f64980102481717136e5c7bfcb18fd Mon Sep 17 00:00:00 2001 From: vdimir Date: Fri, 19 Feb 2021 13:56:08 +0300 Subject: [PATCH 0486/2357] More tests for join on key type convert --- .../01720_join_implicit_cast.reference | 374 ++++++++++++++---- .../0_stateless/01720_join_implicit_cast.sql | 237 +++++++---- 2 files changed, 461 insertions(+), 150 deletions(-) diff --git a/tests/queries/0_stateless/01720_join_implicit_cast.reference b/tests/queries/0_stateless/01720_join_implicit_cast.reference index a30023e44f1..a2a7dcef980 100644 --- a/tests/queries/0_stateless/01720_join_implicit_cast.reference +++ b/tests/queries/0_stateless/01720_join_implicit_cast.reference @@ -1,5 +1,5 @@ ---- hash --- -- full - +=== hash === += full = -4 0 196 -3 0 197 -2 0 198 @@ -15,7 +15,7 @@ 8 108 \N 9 109 \N 10 110 \N -- left - += left = 1 101 201 2 102 202 3 103 203 @@ -26,7 +26,7 @@ 8 108 \N 9 109 \N 10 110 \N -- right - += right = -4 0 196 -3 0 197 -2 0 198 @@ -37,13 +37,13 @@ 3 103 203 4 104 204 5 105 205 -- inner - += inner = 1 101 201 2 102 202 3 103 203 4 104 204 5 105 205 -- full - += full = 0 0 -4 0 0 -3 0 0 -2 @@ -59,7 +59,7 @@ 8 8 0 9 9 0 10 10 0 -- left - += left = 1 1 1 2 2 2 3 3 3 @@ -70,7 +70,7 @@ 8 8 0 9 9 0 10 10 0 -- right - += right = 0 0 -4 0 0 -3 0 0 -2 @@ -81,14 +81,14 @@ 3 3 3 4 4 4 5 5 5 -- inner - += inner = 1 1 1 2 2 2 3 3 3 4 4 4 5 5 5 -- join on - -- full - += join on = += full = 0 0 -4 196 0 0 -3 197 0 0 -2 198 @@ -104,7 +104,7 @@ 8 108 0 \N 9 109 0 \N 10 110 0 \N -- left - += left = 1 101 1 201 2 102 2 202 3 103 3 203 @@ -115,7 +115,7 @@ 8 108 0 \N 9 109 0 \N 10 110 0 \N -- right - += right = 0 0 -4 196 0 0 -3 197 0 0 -2 198 @@ -126,26 +126,80 @@ 3 103 3 203 4 104 4 204 5 105 5 205 -- inner - += inner = 1 101 1 201 2 102 2 202 3 103 3 203 4 104 4 204 5 105 5 205 -- agg - += full = +0 0 -4 196 +0 0 -3 197 +0 0 -2 198 +0 0 -1 199 +0 0 0 200 +1 101 1 201 +2 102 2 202 +3 103 3 203 +4 104 4 204 +5 105 5 205 +6 106 0 \N +7 107 0 \N +8 108 0 \N +9 109 0 \N +10 110 0 \N += left = +1 101 1 201 +2 102 2 202 +3 103 3 203 +4 104 4 204 +5 105 5 205 +6 106 0 \N +7 107 0 \N +8 108 0 \N +9 109 0 \N +10 110 0 \N += right = +0 0 -4 196 +0 0 -3 197 +0 0 -2 198 +0 0 -1 199 +0 0 0 200 +1 101 1 201 +2 102 2 202 +3 103 3 203 +4 104 4 204 +5 105 5 205 += inner = +1 101 1 201 +2 102 2 202 +3 103 3 203 +4 104 4 204 +5 105 5 205 += agg = 1 1 1 1 -- types - +1 +0 -10 0 +1 55 1055 +0 0 -10 0 990 +1 55 15 1055 1015 += types = 1 1 1 1 1 1 ---- partial_merge --- -- full - +1 +1 +1 +1 +1 +=== partial_merge === += full = -4 0 196 -3 0 197 -2 0 198 @@ -161,7 +215,7 @@ 8 108 \N 9 109 \N 10 110 \N -- left - += left = 1 101 201 2 102 202 3 103 203 @@ -172,7 +226,7 @@ 8 108 \N 9 109 \N 10 110 \N -- right - += right = -4 0 196 -3 0 197 -2 0 198 @@ -183,13 +237,13 @@ 3 103 203 4 104 204 5 105 205 -- inner - += inner = 1 101 201 2 102 202 3 103 203 4 104 204 5 105 205 -- full - += full = 0 0 -4 0 0 -3 0 0 -2 @@ -205,7 +259,7 @@ 8 8 0 9 9 0 10 10 0 -- left - += left = 1 1 1 2 2 2 3 3 3 @@ -216,7 +270,7 @@ 8 8 0 9 9 0 10 10 0 -- right - += right = 0 0 -4 0 0 -3 0 0 -2 @@ -227,14 +281,14 @@ 3 3 3 4 4 4 5 5 5 -- inner - += inner = 1 1 1 2 2 2 3 3 3 4 4 4 5 5 5 -- join on - -- full - += join on = += full = 0 0 -4 196 0 0 -3 197 0 0 -2 198 
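The comments added above to TableJoin.h and TreeRewriter.cpp describe how key typing differs between `USING` and `JOIN ON`; a minimal SQL sketch of that behaviour (the table names l/r and the Memory engine are assumptions here; it mirrors what the t1/t2 type checks in the tests below assert):

CREATE TABLE l (k UInt16) ENGINE = Memory;
CREATE TABLE r (k Int16) ENGINE = Memory;
INSERT INTO l VALUES (1);
INSERT INTO r VALUES (1);
-- USING converts the key in place, so the result column gets the common supertype
SELECT toTypeName(k) FROM l JOIN r USING (k);                        -- Int32
-- JOIN ON keeps the original column types; the converted keys live in internal copies
SELECT toTypeName(l.k), toTypeName(r.k) FROM l JOIN r ON l.k = r.k;  -- UInt16, Int16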
@@ -250,7 +304,7 @@ 8 108 0 \N 9 109 0 \N 10 110 0 \N -- left - += left = 1 101 1 201 2 102 2 202 3 103 3 203 @@ -261,7 +315,7 @@ 8 108 0 \N 9 109 0 \N 10 110 0 \N -- right - += right = 0 0 -4 196 0 0 -3 197 0 0 -2 198 @@ -272,26 +326,80 @@ 3 103 3 203 4 104 4 204 5 105 5 205 -- inner - += inner = 1 101 1 201 2 102 2 202 3 103 3 203 4 104 4 204 5 105 5 205 -- agg - += full = +0 0 -4 196 +0 0 -3 197 +0 0 -2 198 +0 0 -1 199 +0 0 0 200 +1 101 1 201 +2 102 2 202 +3 103 3 203 +4 104 4 204 +5 105 5 205 +6 106 0 \N +7 107 0 \N +8 108 0 \N +9 109 0 \N +10 110 0 \N += left = +1 101 1 201 +2 102 2 202 +3 103 3 203 +4 104 4 204 +5 105 5 205 +6 106 0 \N +7 107 0 \N +8 108 0 \N +9 109 0 \N +10 110 0 \N += right = +0 0 -4 196 +0 0 -3 197 +0 0 -2 198 +0 0 -1 199 +0 0 0 200 +1 101 1 201 +2 102 2 202 +3 103 3 203 +4 104 4 204 +5 105 5 205 += inner = +1 101 1 201 +2 102 2 202 +3 103 3 203 +4 104 4 204 +5 105 5 205 += agg = 1 1 1 1 -- types - +1 +0 -10 0 +1 55 1055 +0 0 -10 0 990 +1 55 15 1055 1015 += types = 1 1 1 1 1 1 ---- switch --- -- full - +1 +1 +1 +1 +1 +=== switch === += full = -4 0 196 -3 0 197 -2 0 198 @@ -307,7 +415,7 @@ 8 108 \N 9 109 \N 10 110 \N -- left - += left = 1 101 201 2 102 202 3 103 203 @@ -318,7 +426,7 @@ 8 108 \N 9 109 \N 10 110 \N -- right - += right = -4 0 196 -3 0 197 -2 0 198 @@ -329,13 +437,13 @@ 3 103 203 4 104 204 5 105 205 -- inner - += inner = 1 101 201 2 102 202 3 103 203 4 104 204 5 105 205 -- full - += full = 0 0 -4 0 0 -3 0 0 -2 @@ -351,7 +459,7 @@ 8 8 0 9 9 0 10 10 0 -- left - += left = 1 1 1 2 2 2 3 3 3 @@ -362,7 +470,7 @@ 8 8 0 9 9 0 10 10 0 -- right - += right = 0 0 -4 0 0 -3 0 0 -2 @@ -373,14 +481,14 @@ 3 3 3 4 4 4 5 5 5 -- inner - += inner = 1 1 1 2 2 2 3 3 3 4 4 4 5 5 5 -- join on - -- full - += join on = += full = 0 0 -4 196 0 0 -3 197 0 0 -2 198 @@ -396,7 +504,7 @@ 8 108 0 \N 9 109 0 \N 10 110 0 \N -- left - += left = 1 101 1 201 2 102 2 202 3 103 3 203 @@ -407,7 +515,7 @@ 8 108 0 \N 9 109 0 \N 10 110 0 \N -- right - += right = 0 0 -4 196 0 0 -3 197 0 0 -2 198 @@ -418,26 +526,80 @@ 3 103 3 203 4 104 4 204 5 105 5 205 -- inner - += inner = 1 101 1 201 2 102 2 202 3 103 3 203 4 104 4 204 5 105 5 205 -- agg - += full = +0 0 -4 196 +0 0 -3 197 +0 0 -2 198 +0 0 -1 199 +0 0 0 200 +1 101 1 201 +2 102 2 202 +3 103 3 203 +4 104 4 204 +5 105 5 205 +6 106 0 \N +7 107 0 \N +8 108 0 \N +9 109 0 \N +10 110 0 \N += left = +1 101 1 201 +2 102 2 202 +3 103 3 203 +4 104 4 204 +5 105 5 205 +6 106 0 \N +7 107 0 \N +8 108 0 \N +9 109 0 \N +10 110 0 \N += right = +0 0 -4 196 +0 0 -3 197 +0 0 -2 198 +0 0 -1 199 +0 0 0 200 +1 101 1 201 +2 102 2 202 +3 103 3 203 +4 104 4 204 +5 105 5 205 += inner = +1 101 1 201 +2 102 2 202 +3 103 3 203 +4 104 4 204 +5 105 5 205 += agg = 1 1 1 1 -- types - +1 +0 -10 0 +1 55 1055 +0 0 -10 0 990 +1 55 15 1055 1015 += types = 1 1 1 1 1 1 ---- join use nulls --- -- full - +1 +1 +1 +1 +1 +=== join use nulls === += full = -4 \N 196 -3 \N 197 -2 \N 198 @@ -453,7 +615,7 @@ 8 108 \N 9 109 \N 10 110 \N -- right - += right = -4 \N 196 -3 \N 197 -2 \N 198 @@ -464,61 +626,105 @@ 3 103 203 4 104 204 5 105 205 ---- ---- hash --- -- full - +========== +=== hash === += full = 1 1 2 2 -1 1 1 \N 1 257 1 -1 -- left - += left = 1 1 2 2 -- right - += right = 1 1 -1 1 1 \N 1 257 1 -1 -- inner - += inner = 1 1 -- agg - += full = +1 1 1 1 +2 2 0 \N +0 0 -1 1 +0 0 1 \N +0 0 1 257 +0 0 1 -1 += left = +1 1 1 1 +2 2 0 \N += right = +1 1 1 1 +0 0 -1 1 +0 0 1 \N +0 0 1 257 +0 0 1 -1 += inner = +1 1 1 1 += agg = 5 260 3 3 3 258 1 1 -- types - -1 -1 -1 -1 ---- partial_merge --- -- full 
- -1 1 -2 2 --1 1 -1 \N -1 257 -1 -1 -- left - -1 1 -2 2 -- right - -1 1 --1 1 -1 \N -1 257 -1 -1 -- inner - -1 1 -- agg - 5 260 3 3 3 258 1 1 -- types - += types = +1 +1 +1 +1 +=== partial_merge === += full = +1 1 +2 2 +-1 1 +1 \N +1 257 +1 -1 += left = +1 1 +2 2 += right = +1 1 +-1 1 +1 \N +1 257 +1 -1 += inner = +1 1 += full = +1 1 1 1 +2 2 0 \N +0 0 -1 1 +0 0 1 \N +0 0 1 257 +0 0 1 -1 += left = +1 1 1 1 +2 2 0 \N += right = +1 1 1 1 +0 0 -1 1 +0 0 1 \N +0 0 1 257 +0 0 1 -1 += inner = +1 1 1 1 += agg = +5 260 +3 3 +3 258 +1 1 +5 260 +3 3 +3 258 +1 1 += types = 1 1 1 diff --git a/tests/queries/0_stateless/01720_join_implicit_cast.sql b/tests/queries/0_stateless/01720_join_implicit_cast.sql index b011dff6598..aff78c14a77 100644 --- a/tests/queries/0_stateless/01720_join_implicit_cast.sql +++ b/tests/queries/0_stateless/01720_join_implicit_cast.sql @@ -7,46 +7,65 @@ CREATE TABLE t2 (a Int16, b Nullable(Int64)) ENGINE = TinyLog; INSERT INTO t1 SELECT number as a, 100 + number as b FROM system.numbers LIMIT 1, 10; INSERT INTO t2 SELECT number - 5 as a, 200 + number - 5 as b FROM system.numbers LIMIT 1, 10; -SELECT '--- hash ---'; +SELECT '=== hash ==='; SET join_algorithm = 'hash'; -SELECT '- full -'; +SELECT '= full ='; SELECT a, b, t2.b FROM t1 FULL JOIN t2 USING (a) ORDER BY (a); -SELECT '- left -'; +SELECT '= left ='; SELECT a, b, t2.b FROM t1 LEFT JOIN t2 USING (a) ORDER BY (a); -SELECT '- right -'; +SELECT '= right ='; SELECT a, b, t2.b FROM t1 RIGHT JOIN t2 USING (a) ORDER BY (a); -SELECT '- inner -'; +SELECT '= inner ='; SELECT a, b, t2.b FROM t1 INNER JOIN t2 USING (a) ORDER BY (a); -SELECT '- full -'; +SELECT '= full ='; SELECT a, t1.a, t2.a FROM t1 FULL JOIN t2 USING (a) ORDER BY (t1.a, t2.a); -SELECT '- left -'; +SELECT '= left ='; SELECT a, t1.a, t2.a FROM t1 LEFT JOIN t2 USING (a) ORDER BY (t1.a, t2.a); -SELECT '- right -'; +SELECT '= right ='; SELECT a, t1.a, t2.a FROM t1 RIGHT JOIN t2 USING (a) ORDER BY (t1.a, t2.a); -SELECT '- inner -'; +SELECT '= inner ='; SELECT a, t1.a, t2.a FROM t1 INNER JOIN t2 USING (a) ORDER BY (t1.a, t2.a); -SELECT '- join on -'; -SELECT '- full -'; +SELECT '= join on ='; +SELECT '= full ='; SELECT a, b, t2.a, t2.b FROM t1 FULL JOIN t2 ON (t1.a == t2.a) ORDER BY (t1.a, t2.a); -SELECT '- left -'; +SELECT '= left ='; SELECT a, b, t2.a, t2.b FROM t1 LEFT JOIN t2 ON (t1.a == t2.a) ORDER BY (t1.a, t2.a); -SELECT '- right -'; +SELECT '= right ='; SELECT a, b, t2.a, t2.b FROM t1 RIGHT JOIN t2 ON (t1.a == t2.a) ORDER BY (t1.a, t2.a); -SELECT '- inner -'; +SELECT '= inner ='; SELECT a, b, t2.a, t2.b FROM t1 INNER JOIN t2 ON (t1.a == t2.a) ORDER BY (t1.a, t2.a); -SELECT '- agg -'; +SELECT '= full ='; +SELECT * FROM t1 FULL JOIN t2 ON (t1.a + t1.b = t2.a + t2.b - 100) ORDER BY (t1.a, t2.a); +SELECT '= left ='; +SELECT * FROM t1 LEFT JOIN t2 ON (t1.a + t1.b = t2.a + t2.b - 100) ORDER BY (t1.a, t2.a); +SELECT '= right ='; +SELECT * FROM t1 RIGHT JOIN t2 ON (t1.a + t1.b = t2.a + t2.b - 100) ORDER BY (t1.a, t2.a); +SELECT '= inner ='; +SELECT * FROM t1 INNER JOIN t2 ON (t1.a + t1.b = t2.a + t2.b - 100) ORDER BY (t1.a, t2.a); + +-- Int64 and UInt64 has no supertype +SELECT * FROM t1 FULL JOIN t2 ON (t1.a + t1.b + 100 = t2.a + t2.b) ORDER BY (t1.a, t2.a); -- { serverError 53 } +SELECT * FROM t1 LEFT JOIN t2 ON (t1.a + t1.b + 100 = t2.a + t2.b) ORDER BY (t1.a, t2.a); -- { serverError 53 } +SELECT * FROM t1 RIGHT JOIN t2 ON (t1.a + t1.b + 100 = t2.a + t2.b) ORDER BY (t1.a, t2.a); -- { serverError 53 } +SELECT * FROM t1 INNER JOIN t2 ON (t1.a + t1.b + 100 = t2.a + t2.b) 
ORDER BY (t1.a, t2.a); -- { serverError 53 } + +SELECT '= agg ='; SELECT sum(a) == 7 FROM t1 FULL JOIN t2 USING (a) WHERE b > 102 AND t2.b <= 204; SELECT sum(a) == 7 FROM t1 INNER JOIN t2 USING (a) WHERE b > 102 AND t2.b <= 204; SELECT sum(b) = 103 FROM t1 LEFT JOIN t2 USING (a) WHERE b > 102 AND t2.b < 204; SELECT sum(t2.b) = 203 FROM t1 RIGHT JOIN t2 USING (a) WHERE b > 102 AND t2.b < 204; -SELECT '- types -'; +SELECT sum(a) == 2 + 3 + 4 FROM t1 FULL JOIN t2 ON (t1.a + t1.b = t2.a + t2.b - 100) WHERE t1.b < 105 AND t2.b > 201; +SELECT a > 0, sum(a), sum(b) FROM t1 FULL JOIN t2 USING (a) GROUP BY (a > 0) ORDER BY a > 0; +SELECT a > 0, sum(a), sum(t2.a), sum(b), sum(t2.b) FROM t1 FULL JOIN t2 ON (t1.a == t2.a) GROUP BY (a > 0) ORDER BY a > 0; + +SELECT '= types ='; SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 FULL JOIN t2 USING (a); SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 LEFT JOIN t2 USING (a); SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 RIGHT JOIN t2 USING (a); @@ -55,46 +74,72 @@ SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t SELECT toTypeName(any(a)) == 'Int32' AND toTypeName(any(t2.a)) == 'Int32' FROM t1 FULL JOIN t2 USING (a); SELECT min(toTypeName(a) == 'Int32' AND toTypeName(t2.a) == 'Int32') FROM t1 FULL JOIN t2 USING (a); -SELECT '--- partial_merge ---'; +SELECT any(toTypeName(a)) == 'UInt16' AND any(toTypeName(t2.a)) == 'Int16' FROM t1 FULL JOIN t2 ON (t1.a == t2.a); +SELECT any(toTypeName(a)) == 'UInt16' AND any(toTypeName(t2.a)) == 'Int16' FROM t1 LEFT JOIN t2 ON (t1.a == t2.a); +SELECT any(toTypeName(a)) == 'UInt16' AND any(toTypeName(t2.a)) == 'Int16' FROM t1 RIGHT JOIN t2 ON (t1.a == t2.a); +SELECT any(toTypeName(a)) == 'UInt16' AND any(toTypeName(t2.a)) == 'Int16' FROM t1 INNER JOIN t2 ON (t1.a == t2.a); +SELECT toTypeName(any(a)) == 'UInt16' AND toTypeName(any(t2.a)) == 'Int16' FROM t1 FULL JOIN t2 ON (t1.a == t2.a); + +SELECT '=== partial_merge ==='; SET join_algorithm = 'partial_merge'; -SELECT '- full -'; +SELECT '= full ='; SELECT a, b, t2.b FROM t1 FULL JOIN t2 USING (a) ORDER BY (a); -SELECT '- left -'; +SELECT '= left ='; SELECT a, b, t2.b FROM t1 LEFT JOIN t2 USING (a) ORDER BY (a); -SELECT '- right -'; +SELECT '= right ='; SELECT a, b, t2.b FROM t1 RIGHT JOIN t2 USING (a) ORDER BY (a); -SELECT '- inner -'; +SELECT '= inner ='; SELECT a, b, t2.b FROM t1 INNER JOIN t2 USING (a) ORDER BY (a); -SELECT '- full -'; +SELECT '= full ='; SELECT a, t1.a, t2.a FROM t1 FULL JOIN t2 USING (a) ORDER BY (t1.a, t2.a); -SELECT '- left -'; +SELECT '= left ='; SELECT a, t1.a, t2.a FROM t1 LEFT JOIN t2 USING (a) ORDER BY (t1.a, t2.a); -SELECT '- right -'; +SELECT '= right ='; SELECT a, t1.a, t2.a FROM t1 RIGHT JOIN t2 USING (a) ORDER BY (t1.a, t2.a); -SELECT '- inner -'; +SELECT '= inner ='; SELECT a, t1.a, t2.a FROM t1 INNER JOIN t2 USING (a) ORDER BY (t1.a, t2.a); -SELECT '- join on -'; -SELECT '- full -'; +SELECT '= join on ='; +SELECT '= full ='; SELECT a, b, t2.a, t2.b FROM t1 FULL JOIN t2 ON (t1.a == t2.a) ORDER BY (t1.a, t2.a); -SELECT '- left -'; +SELECT '= left ='; SELECT a, b, t2.a, t2.b FROM t1 LEFT JOIN t2 ON (t1.a == t2.a) ORDER BY (t1.a, t2.a); -SELECT '- right -'; +SELECT '= right ='; SELECT a, b, t2.a, t2.b FROM t1 RIGHT JOIN t2 ON (t1.a == t2.a) ORDER BY (t1.a, t2.a); -SELECT '- inner -'; +SELECT '= inner ='; SELECT a, b, t2.a, t2.b FROM t1 INNER JOIN t2 ON (t1.a == t2.a) ORDER BY (t1.a, t2.a); -SELECT '- agg -'; 
+SELECT '= full ='; +SELECT * FROM t1 FULL JOIN t2 ON (t1.a + t1.b = t2.a + t2.b - 100) ORDER BY (t1.a, t2.a); +SELECT '= left ='; +SELECT * FROM t1 LEFT JOIN t2 ON (t1.a + t1.b = t2.a + t2.b - 100) ORDER BY (t1.a, t2.a); +SELECT '= right ='; +SELECT * FROM t1 RIGHT JOIN t2 ON (t1.a + t1.b = t2.a + t2.b - 100) ORDER BY (t1.a, t2.a); +SELECT '= inner ='; +SELECT * FROM t1 INNER JOIN t2 ON (t1.a + t1.b = t2.a + t2.b - 100) ORDER BY (t1.a, t2.a); + +-- Int64 and UInt64 has no supertype +SELECT * FROM t1 FULL JOIN t2 ON (t1.a + t1.b + 100 = t2.a + t2.b) ORDER BY (t1.a, t2.a); -- { serverError 53 } +SELECT * FROM t1 LEFT JOIN t2 ON (t1.a + t1.b + 100 = t2.a + t2.b) ORDER BY (t1.a, t2.a); -- { serverError 53 } +SELECT * FROM t1 RIGHT JOIN t2 ON (t1.a + t1.b + 100 = t2.a + t2.b) ORDER BY (t1.a, t2.a); -- { serverError 53 } +SELECT * FROM t1 INNER JOIN t2 ON (t1.a + t1.b + 100 = t2.a + t2.b) ORDER BY (t1.a, t2.a); -- { serverError 53 } + +SELECT '= agg ='; SELECT sum(a) == 7 FROM t1 FULL JOIN t2 USING (a) WHERE b > 102 AND t2.b <= 204; SELECT sum(a) == 7 FROM t1 INNER JOIN t2 USING (a) WHERE b > 102 AND t2.b <= 204; SELECT sum(b) = 103 FROM t1 LEFT JOIN t2 USING (a) WHERE b > 102 AND t2.b < 204; SELECT sum(t2.b) = 203 FROM t1 RIGHT JOIN t2 USING (a) WHERE b > 102 AND t2.b < 204; -SELECT '- types -'; +SELECT sum(a) == 2 + 3 + 4 FROM t1 FULL JOIN t2 ON (t1.a + t1.b = t2.a + t2.b - 100) WHERE t1.b < 105 AND t2.b > 201; + +SELECT a > 0, sum(a), sum(b) FROM t1 FULL JOIN t2 USING (a) GROUP BY (a > 0) ORDER BY a > 0; +SELECT a > 0, sum(a), sum(t2.a), sum(b), sum(t2.b) FROM t1 FULL JOIN t2 ON (t1.a == t2.a) GROUP BY (a > 0) ORDER BY a > 0; + +SELECT '= types ='; SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 FULL JOIN t2 USING (a); SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 LEFT JOIN t2 USING (a); SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 RIGHT JOIN t2 USING (a); @@ -103,47 +148,73 @@ SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t SELECT toTypeName(any(a)) == 'Int32' AND toTypeName(any(t2.a)) == 'Int32' FROM t1 FULL JOIN t2 USING (a); SELECT min(toTypeName(a) == 'Int32' AND toTypeName(t2.a) == 'Int32') FROM t1 FULL JOIN t2 USING (a); -SELECT '--- switch ---'; +SELECT any(toTypeName(a)) == 'UInt16' AND any(toTypeName(t2.a)) == 'Int16' FROM t1 FULL JOIN t2 ON (t1.a == t2.a); +SELECT any(toTypeName(a)) == 'UInt16' AND any(toTypeName(t2.a)) == 'Int16' FROM t1 LEFT JOIN t2 ON (t1.a == t2.a); +SELECT any(toTypeName(a)) == 'UInt16' AND any(toTypeName(t2.a)) == 'Int16' FROM t1 RIGHT JOIN t2 ON (t1.a == t2.a); +SELECT any(toTypeName(a)) == 'UInt16' AND any(toTypeName(t2.a)) == 'Int16' FROM t1 INNER JOIN t2 ON (t1.a == t2.a); +SELECT toTypeName(any(a)) == 'UInt16' AND toTypeName(any(t2.a)) == 'Int16' FROM t1 FULL JOIN t2 ON (t1.a == t2.a); + +SELECT '=== switch ==='; SET join_algorithm = 'auto'; SET max_bytes_in_join = 100; -SELECT '- full -'; +SELECT '= full ='; SELECT a, b, t2.b FROM t1 FULL JOIN t2 USING (a) ORDER BY (a); -SELECT '- left -'; +SELECT '= left ='; SELECT a, b, t2.b FROM t1 LEFT JOIN t2 USING (a) ORDER BY (a); -SELECT '- right -'; +SELECT '= right ='; SELECT a, b, t2.b FROM t1 RIGHT JOIN t2 USING (a) ORDER BY (a); -SELECT '- inner -'; +SELECT '= inner ='; SELECT a, b, t2.b FROM t1 INNER JOIN t2 USING (a) ORDER BY (a); -SELECT '- full -'; +SELECT '= full ='; SELECT a, t1.a, t2.a FROM t1 FULL JOIN t2 USING (a) ORDER BY (t1.a, t2.a); -SELECT '- left -'; 
+SELECT '= left ='; SELECT a, t1.a, t2.a FROM t1 LEFT JOIN t2 USING (a) ORDER BY (t1.a, t2.a); -SELECT '- right -'; +SELECT '= right ='; SELECT a, t1.a, t2.a FROM t1 RIGHT JOIN t2 USING (a) ORDER BY (t1.a, t2.a); -SELECT '- inner -'; +SELECT '= inner ='; SELECT a, t1.a, t2.a FROM t1 INNER JOIN t2 USING (a) ORDER BY (t1.a, t2.a); -SELECT '- join on -'; -SELECT '- full -'; +SELECT '= join on ='; +SELECT '= full ='; SELECT a, b, t2.a, t2.b FROM t1 FULL JOIN t2 ON (t1.a == t2.a) ORDER BY (t1.a, t2.a); -SELECT '- left -'; +SELECT '= left ='; SELECT a, b, t2.a, t2.b FROM t1 LEFT JOIN t2 ON (t1.a == t2.a) ORDER BY (t1.a, t2.a); -SELECT '- right -'; +SELECT '= right ='; SELECT a, b, t2.a, t2.b FROM t1 RIGHT JOIN t2 ON (t1.a == t2.a) ORDER BY (t1.a, t2.a); -SELECT '- inner -'; +SELECT '= inner ='; SELECT a, b, t2.a, t2.b FROM t1 INNER JOIN t2 ON (t1.a == t2.a) ORDER BY (t1.a, t2.a); -SELECT '- agg -'; +SELECT '= full ='; +SELECT * FROM t1 FULL JOIN t2 ON (t1.a + t1.b = t2.a + t2.b - 100) ORDER BY (t1.a, t2.a); +SELECT '= left ='; +SELECT * FROM t1 LEFT JOIN t2 ON (t1.a + t1.b = t2.a + t2.b - 100) ORDER BY (t1.a, t2.a); +SELECT '= right ='; +SELECT * FROM t1 RIGHT JOIN t2 ON (t1.a + t1.b = t2.a + t2.b - 100) ORDER BY (t1.a, t2.a); +SELECT '= inner ='; +SELECT * FROM t1 INNER JOIN t2 ON (t1.a + t1.b = t2.a + t2.b - 100) ORDER BY (t1.a, t2.a); + +-- Int64 and UInt64 has no supertype +SELECT * FROM t1 FULL JOIN t2 ON (t1.a + t1.b + 100 = t2.a + t2.b) ORDER BY (t1.a, t2.a); -- { serverError 53 } +SELECT * FROM t1 LEFT JOIN t2 ON (t1.a + t1.b + 100 = t2.a + t2.b) ORDER BY (t1.a, t2.a); -- { serverError 53 } +SELECT * FROM t1 RIGHT JOIN t2 ON (t1.a + t1.b + 100 = t2.a + t2.b) ORDER BY (t1.a, t2.a); -- { serverError 53 } +SELECT * FROM t1 INNER JOIN t2 ON (t1.a + t1.b + 100 = t2.a + t2.b) ORDER BY (t1.a, t2.a); -- { serverError 53 } + +SELECT '= agg ='; SELECT sum(a) == 7 FROM t1 FULL JOIN t2 USING (a) WHERE b > 102 AND t2.b <= 204; SELECT sum(a) == 7 FROM t1 INNER JOIN t2 USING (a) WHERE b > 102 AND t2.b <= 204; SELECT sum(b) = 103 FROM t1 LEFT JOIN t2 USING (a) WHERE b > 102 AND t2.b < 204; SELECT sum(t2.b) = 203 FROM t1 RIGHT JOIN t2 USING (a) WHERE b > 102 AND t2.b < 204; -SELECT '- types -'; +SELECT sum(a) == 2 + 3 + 4 FROM t1 FULL JOIN t2 ON (t1.a + t1.b = t2.a + t2.b - 100) WHERE t1.b < 105 AND t2.b > 201; + +SELECT a > 0, sum(a), sum(b) FROM t1 FULL JOIN t2 USING (a) GROUP BY (a > 0) ORDER BY a > 0; +SELECT a > 0, sum(a), sum(t2.a), sum(b), sum(t2.b) FROM t1 FULL JOIN t2 ON (t1.a == t2.a) GROUP BY (a > 0) ORDER BY a > 0; + +SELECT '= types ='; SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 FULL JOIN t2 USING (a); SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 LEFT JOIN t2 USING (a); SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 RIGHT JOIN t2 USING (a); @@ -152,15 +223,21 @@ SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t SELECT toTypeName(any(a)) == 'Int32' AND toTypeName(any(t2.a)) == 'Int32' FROM t1 FULL JOIN t2 USING (a); SELECT min(toTypeName(a) == 'Int32' AND toTypeName(t2.a) == 'Int32') FROM t1 FULL JOIN t2 USING (a); +SELECT any(toTypeName(a)) == 'UInt16' AND any(toTypeName(t2.a)) == 'Int16' FROM t1 FULL JOIN t2 ON (t1.a == t2.a); +SELECT any(toTypeName(a)) == 'UInt16' AND any(toTypeName(t2.a)) == 'Int16' FROM t1 LEFT JOIN t2 ON (t1.a == t2.a); +SELECT any(toTypeName(a)) == 'UInt16' AND any(toTypeName(t2.a)) == 'Int16' FROM t1 RIGHT JOIN t2 ON (t1.a == 
t2.a); +SELECT any(toTypeName(a)) == 'UInt16' AND any(toTypeName(t2.a)) == 'Int16' FROM t1 INNER JOIN t2 ON (t1.a == t2.a); +SELECT toTypeName(any(a)) == 'UInt16' AND toTypeName(any(t2.a)) == 'Int16' FROM t1 FULL JOIN t2 ON (t1.a == t2.a); + SET max_bytes_in_join = 0; -SELECT '--- join use nulls ---'; +SELECT '=== join use nulls ==='; SET join_use_nulls = 1; -SELECT '- full -'; +SELECT '= full ='; SELECT a, b, t2.b FROM t1 FULL JOIN t2 USING (a) ORDER BY (a); -SELECT '- right -'; +SELECT '= right ='; SELECT a, b, t2.b FROM t1 RIGHT JOIN t2 USING (a) ORDER BY (a); SET join_use_nulls = 0; @@ -168,7 +245,7 @@ SET join_use_nulls = 0; DROP TABLE IF EXISTS t1; DROP TABLE IF EXISTS t2; -SELECT '---'; +SELECT '=========='; DROP TABLE IF EXISTS t_ab1; DROP TABLE IF EXISTS t_ab2; @@ -178,52 +255,80 @@ CREATE TABLE t_ab2 (id Nullable(Int32), a Int16, b Nullable(Int64)) ENGINE = Tin INSERT INTO t_ab1 VALUES (0, 1, 1), (1, 2, 2); INSERT INTO t_ab2 VALUES (2, -1, 1), (3, 1, NULL), (4, 1, 257), (5, 1, -1), (6, 1, 1); -SELECT '--- hash ---'; +SELECT '=== hash ==='; SET join_algorithm = 'hash'; -SELECT '- full -'; +SELECT '= full ='; SELECT a, b FROM t_ab1 FULL JOIN t_ab2 USING (a, b) ORDER BY ifNull(t_ab1.id, t_ab2.id); -SELECT '- left -'; +SELECT '= left ='; SELECT a, b FROM t_ab1 LEFT JOIN t_ab2 USING (a, b) ORDER BY ifNull(t_ab1.id, t_ab2.id); -SELECT '- right -'; +SELECT '= right ='; SELECT a, b FROM t_ab1 RIGHT JOIN t_ab2 USING (a, b) ORDER BY ifNull(t_ab1.id, t_ab2.id); -SELECT '- inner -'; +SELECT '= inner ='; SELECT a, b FROM t_ab1 INNER JOIN t_ab2 USING (a, b) ORDER BY ifNull(t_ab1.id, t_ab2.id); -SELECT '- agg -'; +SELECT '= full ='; +SELECT a, b, t_ab2.a, t_ab2.b FROM t_ab1 FULL JOIN t_ab2 ON (t_ab1.a == t_ab2.a AND t_ab1.b == t_ab2.b) ORDER BY ifNull(t_ab1.id, t_ab2.id); +SELECT '= left ='; +SELECT a, b, t_ab2.a, t_ab2.b FROM t_ab1 LEFT JOIN t_ab2 ON (t_ab1.a == t_ab2.a AND t_ab1.b == t_ab2.b) ORDER BY ifNull(t_ab1.id, t_ab2.id); +SELECT '= right ='; +SELECT a, b, t_ab2.a, t_ab2.b FROM t_ab1 RIGHT JOIN t_ab2 ON (t_ab1.a == t_ab2.a AND t_ab1.b == t_ab2.b) ORDER BY ifNull(t_ab1.id, t_ab2.id); +SELECT '= inner ='; +SELECT a, b, t_ab2.a, t_ab2.b FROM t_ab1 INNER JOIN t_ab2 ON (t_ab1.a == t_ab2.a AND t_ab1.b == t_ab2.b) ORDER BY ifNull(t_ab1.id, t_ab2.id); + +SELECT '= agg ='; SELECT sum(a), sum(b) FROM t_ab1 FULL JOIN t_ab2 USING (a, b); SELECT sum(a), sum(b) FROM t_ab1 LEFT JOIN t_ab2 USING (a, b); SELECT sum(a), sum(b) FROM t_ab1 RIGHT JOIN t_ab2 USING (a, b); SELECT sum(a), sum(b) FROM t_ab1 INNER JOIN t_ab2 USING (a, b); -SELECT '- types -'; +SELECT sum(a) + sum(t_ab2.a) - 1, sum(b) + sum(t_ab2.b) - 1 FROM t_ab1 FULL JOIN t_ab2 ON (t_ab1.a == t_ab2.a AND t_ab1.b == t_ab2.b); +SELECT sum(a) + sum(t_ab2.a) - 1, sum(b) + sum(t_ab2.b) - 1 FROM t_ab1 LEFT JOIN t_ab2 ON (t_ab1.a == t_ab2.a AND t_ab1.b == t_ab2.b); +SELECT sum(a) + sum(t_ab2.a) - 1, sum(b) + sum(t_ab2.b) - 1 FROM t_ab1 RIGHT JOIN t_ab2 ON (t_ab1.a == t_ab2.a AND t_ab1.b == t_ab2.b); +SELECT sum(a) + sum(t_ab2.a) - 1, sum(b) + sum(t_ab2.b) - 1 FROM t_ab1 INNER JOIN t_ab2 ON (t_ab1.a == t_ab2.a AND t_ab1.b == t_ab2.b); + +SELECT '= types ='; SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(b)) == 'Nullable(Int64)' FROM t_ab1 FULL JOIN t_ab2 USING (a, b); SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(b)) == 'Nullable(Int64)' FROM t_ab1 LEFT JOIN t_ab2 USING (a, b); SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(b)) == 'Nullable(Int64)' FROM t_ab1 RIGHT JOIN t_ab2 USING (a, b); SELECT any(toTypeName(a)) == 
'Int32' AND any(toTypeName(b)) == 'Nullable(Int64)' FROM t_ab1 INNER JOIN t_ab2 USING (a, b); -SELECT '--- partial_merge ---'; +SELECT '=== partial_merge ==='; SET join_algorithm = 'partial_merge'; -SELECT '- full -'; +SELECT '= full ='; SELECT a, b FROM t_ab1 FULL JOIN t_ab2 USING (a, b) ORDER BY ifNull(t_ab1.id, t_ab2.id); -SELECT '- left -'; +SELECT '= left ='; SELECT a, b FROM t_ab1 LEFT JOIN t_ab2 USING (a, b) ORDER BY ifNull(t_ab1.id, t_ab2.id); -SELECT '- right -'; +SELECT '= right ='; SELECT a, b FROM t_ab1 RIGHT JOIN t_ab2 USING (a, b) ORDER BY ifNull(t_ab1.id, t_ab2.id); -SELECT '- inner -'; +SELECT '= inner ='; SELECT a, b FROM t_ab1 INNER JOIN t_ab2 USING (a, b) ORDER BY ifNull(t_ab1.id, t_ab2.id); -SELECT '- agg -'; +SELECT '= full ='; +SELECT a, b, t_ab2.a, t_ab2.b FROM t_ab1 FULL JOIN t_ab2 ON (t_ab1.a == t_ab2.a AND t_ab1.b == t_ab2.b) ORDER BY ifNull(t_ab1.id, t_ab2.id); +SELECT '= left ='; +SELECT a, b, t_ab2.a, t_ab2.b FROM t_ab1 LEFT JOIN t_ab2 ON (t_ab1.a == t_ab2.a AND t_ab1.b == t_ab2.b) ORDER BY ifNull(t_ab1.id, t_ab2.id); +SELECT '= right ='; +SELECT a, b, t_ab2.a, t_ab2.b FROM t_ab1 RIGHT JOIN t_ab2 ON (t_ab1.a == t_ab2.a AND t_ab1.b == t_ab2.b) ORDER BY ifNull(t_ab1.id, t_ab2.id); +SELECT '= inner ='; +SELECT a, b, t_ab2.a, t_ab2.b FROM t_ab1 INNER JOIN t_ab2 ON (t_ab1.a == t_ab2.a AND t_ab1.b == t_ab2.b) ORDER BY ifNull(t_ab1.id, t_ab2.id); + +SELECT '= agg ='; SELECT sum(a), sum(b) FROM t_ab1 FULL JOIN t_ab2 USING (a, b); SELECT sum(a), sum(b) FROM t_ab1 LEFT JOIN t_ab2 USING (a, b); SELECT sum(a), sum(b) FROM t_ab1 RIGHT JOIN t_ab2 USING (a, b); SELECT sum(a), sum(b) FROM t_ab1 INNER JOIN t_ab2 USING (a, b); -SELECT '- types -'; +SELECT sum(a) + sum(t_ab2.a) - 1, sum(b) + sum(t_ab2.b) - 1 FROM t_ab1 FULL JOIN t_ab2 ON (t_ab1.a == t_ab2.a AND t_ab1.b == t_ab2.b); +SELECT sum(a) + sum(t_ab2.a) - 1, sum(b) + sum(t_ab2.b) - 1 FROM t_ab1 LEFT JOIN t_ab2 ON (t_ab1.a == t_ab2.a AND t_ab1.b == t_ab2.b); +SELECT sum(a) + sum(t_ab2.a) - 1, sum(b) + sum(t_ab2.b) - 1 FROM t_ab1 RIGHT JOIN t_ab2 ON (t_ab1.a == t_ab2.a AND t_ab1.b == t_ab2.b); +SELECT sum(a) + sum(t_ab2.a) - 1, sum(b) + sum(t_ab2.b) - 1 FROM t_ab1 INNER JOIN t_ab2 ON (t_ab1.a == t_ab2.a AND t_ab1.b == t_ab2.b); + +SELECT '= types ='; SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(b)) == 'Nullable(Int64)' FROM t_ab1 FULL JOIN t_ab2 USING (a, b); SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(b)) == 'Nullable(Int64)' FROM t_ab1 LEFT JOIN t_ab2 USING (a, b); From dc9e660522487db1f82269dd7f53f3dcecb23bd7 Mon Sep 17 00:00:00 2001 From: vdimir Date: Fri, 19 Feb 2021 14:08:23 +0300 Subject: [PATCH 0487/2357] Remove 'error' from comment from 01710_join_use_nulls --- tests/queries/0_stateless/01710_join_use_nulls.sql | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/01710_join_use_nulls.sql b/tests/queries/0_stateless/01710_join_use_nulls.sql index f9fc4f31b6b..980ac48dd13 100644 --- a/tests/queries/0_stateless/01710_join_use_nulls.sql +++ b/tests/queries/0_stateless/01710_join_use_nulls.sql @@ -7,7 +7,8 @@ CREATE TABLE Y (id Int) ENGINE=Memory; SELECT Y.id - 1 FROM X RIGHT JOIN Y ON (X.id + 1) = Y.id SETTINGS join_use_nulls=1; SELECT Y.id - 1 FROM X RIGHT JOIN Y ON (X.id + 1) = toInt64(Y.id) SETTINGS join_use_nulls=1; --- Logical error: 'Arguments of 'plus' have incorrect data types: '2' of type 'UInt8', '1' of type 'UInt8''. +-- Fix issue #20366 +-- Arguments of 'plus' have incorrect data types: '2' of type 'UInt8', '1' of type 'UInt8'. 
-- Because 1 became toNullable(1), i.e.: -- 2 UInt8 Const(size = 1, UInt8(size = 1)) -- 1 UInt8 Const(size = 1, Nullable(size = 1, UInt8(size = 1), UInt8(size = 1))) From 1a26310d27b631ae22a723e31424133d555f9ae6 Mon Sep 17 00:00:00 2001 From: vdimir Date: Fri, 19 Feb 2021 14:17:15 +0300 Subject: [PATCH 0488/2357] Split test join_implicit_cast (due to timeouts in flaky checks) --- .../01720_join_implicit_cast.reference | 629 ------------------ .../0_stateless/01720_join_implicit_cast.sql | 249 ------- .../01721_join_implicit_cast_long.reference | 628 +++++++++++++++++ .../01721_join_implicit_cast_long.sql | 246 +++++++ 4 files changed, 874 insertions(+), 878 deletions(-) create mode 100644 tests/queries/0_stateless/01721_join_implicit_cast_long.reference create mode 100644 tests/queries/0_stateless/01721_join_implicit_cast_long.sql diff --git a/tests/queries/0_stateless/01720_join_implicit_cast.reference b/tests/queries/0_stateless/01720_join_implicit_cast.reference index a2a7dcef980..3cca6a264fa 100644 --- a/tests/queries/0_stateless/01720_join_implicit_cast.reference +++ b/tests/queries/0_stateless/01720_join_implicit_cast.reference @@ -1,634 +1,5 @@ === hash === = full = --4 0 196 --3 0 197 --2 0 198 --1 0 199 -0 0 200 -1 101 201 -2 102 202 -3 103 203 -4 104 204 -5 105 205 -6 106 \N -7 107 \N -8 108 \N -9 109 \N -10 110 \N -= left = -1 101 201 -2 102 202 -3 103 203 -4 104 204 -5 105 205 -6 106 \N -7 107 \N -8 108 \N -9 109 \N -10 110 \N -= right = --4 0 196 --3 0 197 --2 0 198 --1 0 199 -0 0 200 -1 101 201 -2 102 202 -3 103 203 -4 104 204 -5 105 205 -= inner = -1 101 201 -2 102 202 -3 103 203 -4 104 204 -5 105 205 -= full = -0 0 -4 -0 0 -3 -0 0 -2 -0 0 -1 -0 0 0 -1 1 1 -2 2 2 -3 3 3 -4 4 4 -5 5 5 -6 6 0 -7 7 0 -8 8 0 -9 9 0 -10 10 0 -= left = -1 1 1 -2 2 2 -3 3 3 -4 4 4 -5 5 5 -6 6 0 -7 7 0 -8 8 0 -9 9 0 -10 10 0 -= right = -0 0 -4 -0 0 -3 -0 0 -2 -0 0 -1 -0 0 0 -1 1 1 -2 2 2 -3 3 3 -4 4 4 -5 5 5 -= inner = -1 1 1 -2 2 2 -3 3 3 -4 4 4 -5 5 5 -= join on = -= full = -0 0 -4 196 -0 0 -3 197 -0 0 -2 198 -0 0 -1 199 -0 0 0 200 -1 101 1 201 -2 102 2 202 -3 103 3 203 -4 104 4 204 -5 105 5 205 -6 106 0 \N -7 107 0 \N -8 108 0 \N -9 109 0 \N -10 110 0 \N -= left = -1 101 1 201 -2 102 2 202 -3 103 3 203 -4 104 4 204 -5 105 5 205 -6 106 0 \N -7 107 0 \N -8 108 0 \N -9 109 0 \N -10 110 0 \N -= right = -0 0 -4 196 -0 0 -3 197 -0 0 -2 198 -0 0 -1 199 -0 0 0 200 -1 101 1 201 -2 102 2 202 -3 103 3 203 -4 104 4 204 -5 105 5 205 -= inner = -1 101 1 201 -2 102 2 202 -3 103 3 203 -4 104 4 204 -5 105 5 205 -= full = -0 0 -4 196 -0 0 -3 197 -0 0 -2 198 -0 0 -1 199 -0 0 0 200 -1 101 1 201 -2 102 2 202 -3 103 3 203 -4 104 4 204 -5 105 5 205 -6 106 0 \N -7 107 0 \N -8 108 0 \N -9 109 0 \N -10 110 0 \N -= left = -1 101 1 201 -2 102 2 202 -3 103 3 203 -4 104 4 204 -5 105 5 205 -6 106 0 \N -7 107 0 \N -8 108 0 \N -9 109 0 \N -10 110 0 \N -= right = -0 0 -4 196 -0 0 -3 197 -0 0 -2 198 -0 0 -1 199 -0 0 0 200 -1 101 1 201 -2 102 2 202 -3 103 3 203 -4 104 4 204 -5 105 5 205 -= inner = -1 101 1 201 -2 102 2 202 -3 103 3 203 -4 104 4 204 -5 105 5 205 -= agg = -1 -1 -1 -1 -1 -0 -10 0 -1 55 1055 -0 0 -10 0 990 -1 55 15 1055 1015 -= types = -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -=== partial_merge === -= full = --4 0 196 --3 0 197 --2 0 198 --1 0 199 -0 0 200 -1 101 201 -2 102 202 -3 103 203 -4 104 204 -5 105 205 -6 106 \N -7 107 \N -8 108 \N -9 109 \N -10 110 \N -= left = -1 101 201 -2 102 202 -3 103 203 -4 104 204 -5 105 205 -6 106 \N -7 107 \N -8 108 \N -9 109 \N -10 110 \N -= right = --4 0 196 --3 0 197 --2 0 198 --1 0 199 -0 
0 200 -1 101 201 -2 102 202 -3 103 203 -4 104 204 -5 105 205 -= inner = -1 101 201 -2 102 202 -3 103 203 -4 104 204 -5 105 205 -= full = -0 0 -4 -0 0 -3 -0 0 -2 -0 0 -1 -0 0 0 -1 1 1 -2 2 2 -3 3 3 -4 4 4 -5 5 5 -6 6 0 -7 7 0 -8 8 0 -9 9 0 -10 10 0 -= left = -1 1 1 -2 2 2 -3 3 3 -4 4 4 -5 5 5 -6 6 0 -7 7 0 -8 8 0 -9 9 0 -10 10 0 -= right = -0 0 -4 -0 0 -3 -0 0 -2 -0 0 -1 -0 0 0 -1 1 1 -2 2 2 -3 3 3 -4 4 4 -5 5 5 -= inner = -1 1 1 -2 2 2 -3 3 3 -4 4 4 -5 5 5 -= join on = -= full = -0 0 -4 196 -0 0 -3 197 -0 0 -2 198 -0 0 -1 199 -0 0 0 200 -1 101 1 201 -2 102 2 202 -3 103 3 203 -4 104 4 204 -5 105 5 205 -6 106 0 \N -7 107 0 \N -8 108 0 \N -9 109 0 \N -10 110 0 \N -= left = -1 101 1 201 -2 102 2 202 -3 103 3 203 -4 104 4 204 -5 105 5 205 -6 106 0 \N -7 107 0 \N -8 108 0 \N -9 109 0 \N -10 110 0 \N -= right = -0 0 -4 196 -0 0 -3 197 -0 0 -2 198 -0 0 -1 199 -0 0 0 200 -1 101 1 201 -2 102 2 202 -3 103 3 203 -4 104 4 204 -5 105 5 205 -= inner = -1 101 1 201 -2 102 2 202 -3 103 3 203 -4 104 4 204 -5 105 5 205 -= full = -0 0 -4 196 -0 0 -3 197 -0 0 -2 198 -0 0 -1 199 -0 0 0 200 -1 101 1 201 -2 102 2 202 -3 103 3 203 -4 104 4 204 -5 105 5 205 -6 106 0 \N -7 107 0 \N -8 108 0 \N -9 109 0 \N -10 110 0 \N -= left = -1 101 1 201 -2 102 2 202 -3 103 3 203 -4 104 4 204 -5 105 5 205 -6 106 0 \N -7 107 0 \N -8 108 0 \N -9 109 0 \N -10 110 0 \N -= right = -0 0 -4 196 -0 0 -3 197 -0 0 -2 198 -0 0 -1 199 -0 0 0 200 -1 101 1 201 -2 102 2 202 -3 103 3 203 -4 104 4 204 -5 105 5 205 -= inner = -1 101 1 201 -2 102 2 202 -3 103 3 203 -4 104 4 204 -5 105 5 205 -= agg = -1 -1 -1 -1 -1 -0 -10 0 -1 55 1055 -0 0 -10 0 990 -1 55 15 1055 1015 -= types = -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -=== switch === -= full = --4 0 196 --3 0 197 --2 0 198 --1 0 199 -0 0 200 -1 101 201 -2 102 202 -3 103 203 -4 104 204 -5 105 205 -6 106 \N -7 107 \N -8 108 \N -9 109 \N -10 110 \N -= left = -1 101 201 -2 102 202 -3 103 203 -4 104 204 -5 105 205 -6 106 \N -7 107 \N -8 108 \N -9 109 \N -10 110 \N -= right = --4 0 196 --3 0 197 --2 0 198 --1 0 199 -0 0 200 -1 101 201 -2 102 202 -3 103 203 -4 104 204 -5 105 205 -= inner = -1 101 201 -2 102 202 -3 103 203 -4 104 204 -5 105 205 -= full = -0 0 -4 -0 0 -3 -0 0 -2 -0 0 -1 -0 0 0 -1 1 1 -2 2 2 -3 3 3 -4 4 4 -5 5 5 -6 6 0 -7 7 0 -8 8 0 -9 9 0 -10 10 0 -= left = -1 1 1 -2 2 2 -3 3 3 -4 4 4 -5 5 5 -6 6 0 -7 7 0 -8 8 0 -9 9 0 -10 10 0 -= right = -0 0 -4 -0 0 -3 -0 0 -2 -0 0 -1 -0 0 0 -1 1 1 -2 2 2 -3 3 3 -4 4 4 -5 5 5 -= inner = -1 1 1 -2 2 2 -3 3 3 -4 4 4 -5 5 5 -= join on = -= full = -0 0 -4 196 -0 0 -3 197 -0 0 -2 198 -0 0 -1 199 -0 0 0 200 -1 101 1 201 -2 102 2 202 -3 103 3 203 -4 104 4 204 -5 105 5 205 -6 106 0 \N -7 107 0 \N -8 108 0 \N -9 109 0 \N -10 110 0 \N -= left = -1 101 1 201 -2 102 2 202 -3 103 3 203 -4 104 4 204 -5 105 5 205 -6 106 0 \N -7 107 0 \N -8 108 0 \N -9 109 0 \N -10 110 0 \N -= right = -0 0 -4 196 -0 0 -3 197 -0 0 -2 198 -0 0 -1 199 -0 0 0 200 -1 101 1 201 -2 102 2 202 -3 103 3 203 -4 104 4 204 -5 105 5 205 -= inner = -1 101 1 201 -2 102 2 202 -3 103 3 203 -4 104 4 204 -5 105 5 205 -= full = -0 0 -4 196 -0 0 -3 197 -0 0 -2 198 -0 0 -1 199 -0 0 0 200 -1 101 1 201 -2 102 2 202 -3 103 3 203 -4 104 4 204 -5 105 5 205 -6 106 0 \N -7 107 0 \N -8 108 0 \N -9 109 0 \N -10 110 0 \N -= left = -1 101 1 201 -2 102 2 202 -3 103 3 203 -4 104 4 204 -5 105 5 205 -6 106 0 \N -7 107 0 \N -8 108 0 \N -9 109 0 \N -10 110 0 \N -= right = -0 0 -4 196 -0 0 -3 197 -0 0 -2 198 -0 0 -1 199 -0 0 0 200 -1 101 1 201 -2 102 2 202 -3 103 3 203 -4 104 4 204 -5 105 5 205 -= inner = -1 101 1 201 -2 102 2 202 -3 
103 3 203 -4 104 4 204 -5 105 5 205 -= agg = -1 -1 -1 -1 -1 -0 -10 0 -1 55 1055 -0 0 -10 0 990 -1 55 15 1055 1015 -= types = -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -=== join use nulls === -= full = --4 \N 196 --3 \N 197 --2 \N 198 --1 \N 199 -0 \N 200 -1 101 201 -2 102 202 -3 103 203 -4 104 204 -5 105 205 -6 106 \N -7 107 \N -8 108 \N -9 109 \N -10 110 \N -= right = --4 \N 196 --3 \N 197 --2 \N 198 --1 \N 199 -0 \N 200 -1 101 201 -2 102 202 -3 103 203 -4 104 204 -5 105 205 -========== -=== hash === -= full = 1 1 2 2 -1 1 diff --git a/tests/queries/0_stateless/01720_join_implicit_cast.sql b/tests/queries/0_stateless/01720_join_implicit_cast.sql index aff78c14a77..653414e66af 100644 --- a/tests/queries/0_stateless/01720_join_implicit_cast.sql +++ b/tests/queries/0_stateless/01720_join_implicit_cast.sql @@ -1,252 +1,3 @@ -DROP TABLE IF EXISTS t1; -DROP TABLE IF EXISTS t2; - -CREATE TABLE t1 (a UInt16, b UInt16) ENGINE = TinyLog; -CREATE TABLE t2 (a Int16, b Nullable(Int64)) ENGINE = TinyLog; - -INSERT INTO t1 SELECT number as a, 100 + number as b FROM system.numbers LIMIT 1, 10; -INSERT INTO t2 SELECT number - 5 as a, 200 + number - 5 as b FROM system.numbers LIMIT 1, 10; - -SELECT '=== hash ==='; -SET join_algorithm = 'hash'; - -SELECT '= full ='; -SELECT a, b, t2.b FROM t1 FULL JOIN t2 USING (a) ORDER BY (a); -SELECT '= left ='; -SELECT a, b, t2.b FROM t1 LEFT JOIN t2 USING (a) ORDER BY (a); -SELECT '= right ='; -SELECT a, b, t2.b FROM t1 RIGHT JOIN t2 USING (a) ORDER BY (a); -SELECT '= inner ='; -SELECT a, b, t2.b FROM t1 INNER JOIN t2 USING (a) ORDER BY (a); - -SELECT '= full ='; -SELECT a, t1.a, t2.a FROM t1 FULL JOIN t2 USING (a) ORDER BY (t1.a, t2.a); -SELECT '= left ='; -SELECT a, t1.a, t2.a FROM t1 LEFT JOIN t2 USING (a) ORDER BY (t1.a, t2.a); -SELECT '= right ='; -SELECT a, t1.a, t2.a FROM t1 RIGHT JOIN t2 USING (a) ORDER BY (t1.a, t2.a); -SELECT '= inner ='; -SELECT a, t1.a, t2.a FROM t1 INNER JOIN t2 USING (a) ORDER BY (t1.a, t2.a); - -SELECT '= join on ='; -SELECT '= full ='; -SELECT a, b, t2.a, t2.b FROM t1 FULL JOIN t2 ON (t1.a == t2.a) ORDER BY (t1.a, t2.a); -SELECT '= left ='; -SELECT a, b, t2.a, t2.b FROM t1 LEFT JOIN t2 ON (t1.a == t2.a) ORDER BY (t1.a, t2.a); -SELECT '= right ='; -SELECT a, b, t2.a, t2.b FROM t1 RIGHT JOIN t2 ON (t1.a == t2.a) ORDER BY (t1.a, t2.a); -SELECT '= inner ='; -SELECT a, b, t2.a, t2.b FROM t1 INNER JOIN t2 ON (t1.a == t2.a) ORDER BY (t1.a, t2.a); - -SELECT '= full ='; -SELECT * FROM t1 FULL JOIN t2 ON (t1.a + t1.b = t2.a + t2.b - 100) ORDER BY (t1.a, t2.a); -SELECT '= left ='; -SELECT * FROM t1 LEFT JOIN t2 ON (t1.a + t1.b = t2.a + t2.b - 100) ORDER BY (t1.a, t2.a); -SELECT '= right ='; -SELECT * FROM t1 RIGHT JOIN t2 ON (t1.a + t1.b = t2.a + t2.b - 100) ORDER BY (t1.a, t2.a); -SELECT '= inner ='; -SELECT * FROM t1 INNER JOIN t2 ON (t1.a + t1.b = t2.a + t2.b - 100) ORDER BY (t1.a, t2.a); - --- Int64 and UInt64 has no supertype -SELECT * FROM t1 FULL JOIN t2 ON (t1.a + t1.b + 100 = t2.a + t2.b) ORDER BY (t1.a, t2.a); -- { serverError 53 } -SELECT * FROM t1 LEFT JOIN t2 ON (t1.a + t1.b + 100 = t2.a + t2.b) ORDER BY (t1.a, t2.a); -- { serverError 53 } -SELECT * FROM t1 RIGHT JOIN t2 ON (t1.a + t1.b + 100 = t2.a + t2.b) ORDER BY (t1.a, t2.a); -- { serverError 53 } -SELECT * FROM t1 INNER JOIN t2 ON (t1.a + t1.b + 100 = t2.a + t2.b) ORDER BY (t1.a, t2.a); -- { serverError 53 } - -SELECT '= agg ='; -SELECT sum(a) == 7 FROM t1 FULL JOIN t2 USING (a) WHERE b > 102 AND t2.b <= 204; -SELECT sum(a) == 7 FROM t1 INNER JOIN t2 USING (a) WHERE b > 102 AND t2.b <= 
204; - -SELECT sum(b) = 103 FROM t1 LEFT JOIN t2 USING (a) WHERE b > 102 AND t2.b < 204; -SELECT sum(t2.b) = 203 FROM t1 RIGHT JOIN t2 USING (a) WHERE b > 102 AND t2.b < 204; - -SELECT sum(a) == 2 + 3 + 4 FROM t1 FULL JOIN t2 ON (t1.a + t1.b = t2.a + t2.b - 100) WHERE t1.b < 105 AND t2.b > 201; - -SELECT a > 0, sum(a), sum(b) FROM t1 FULL JOIN t2 USING (a) GROUP BY (a > 0) ORDER BY a > 0; -SELECT a > 0, sum(a), sum(t2.a), sum(b), sum(t2.b) FROM t1 FULL JOIN t2 ON (t1.a == t2.a) GROUP BY (a > 0) ORDER BY a > 0; - -SELECT '= types ='; -SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 FULL JOIN t2 USING (a); -SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 LEFT JOIN t2 USING (a); -SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 RIGHT JOIN t2 USING (a); -SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 INNER JOIN t2 USING (a); - -SELECT toTypeName(any(a)) == 'Int32' AND toTypeName(any(t2.a)) == 'Int32' FROM t1 FULL JOIN t2 USING (a); -SELECT min(toTypeName(a) == 'Int32' AND toTypeName(t2.a) == 'Int32') FROM t1 FULL JOIN t2 USING (a); - -SELECT any(toTypeName(a)) == 'UInt16' AND any(toTypeName(t2.a)) == 'Int16' FROM t1 FULL JOIN t2 ON (t1.a == t2.a); -SELECT any(toTypeName(a)) == 'UInt16' AND any(toTypeName(t2.a)) == 'Int16' FROM t1 LEFT JOIN t2 ON (t1.a == t2.a); -SELECT any(toTypeName(a)) == 'UInt16' AND any(toTypeName(t2.a)) == 'Int16' FROM t1 RIGHT JOIN t2 ON (t1.a == t2.a); -SELECT any(toTypeName(a)) == 'UInt16' AND any(toTypeName(t2.a)) == 'Int16' FROM t1 INNER JOIN t2 ON (t1.a == t2.a); -SELECT toTypeName(any(a)) == 'UInt16' AND toTypeName(any(t2.a)) == 'Int16' FROM t1 FULL JOIN t2 ON (t1.a == t2.a); - -SELECT '=== partial_merge ==='; - -SET join_algorithm = 'partial_merge'; - -SELECT '= full ='; -SELECT a, b, t2.b FROM t1 FULL JOIN t2 USING (a) ORDER BY (a); -SELECT '= left ='; -SELECT a, b, t2.b FROM t1 LEFT JOIN t2 USING (a) ORDER BY (a); -SELECT '= right ='; -SELECT a, b, t2.b FROM t1 RIGHT JOIN t2 USING (a) ORDER BY (a); -SELECT '= inner ='; -SELECT a, b, t2.b FROM t1 INNER JOIN t2 USING (a) ORDER BY (a); - -SELECT '= full ='; -SELECT a, t1.a, t2.a FROM t1 FULL JOIN t2 USING (a) ORDER BY (t1.a, t2.a); -SELECT '= left ='; -SELECT a, t1.a, t2.a FROM t1 LEFT JOIN t2 USING (a) ORDER BY (t1.a, t2.a); -SELECT '= right ='; -SELECT a, t1.a, t2.a FROM t1 RIGHT JOIN t2 USING (a) ORDER BY (t1.a, t2.a); -SELECT '= inner ='; -SELECT a, t1.a, t2.a FROM t1 INNER JOIN t2 USING (a) ORDER BY (t1.a, t2.a); - -SELECT '= join on ='; -SELECT '= full ='; -SELECT a, b, t2.a, t2.b FROM t1 FULL JOIN t2 ON (t1.a == t2.a) ORDER BY (t1.a, t2.a); -SELECT '= left ='; -SELECT a, b, t2.a, t2.b FROM t1 LEFT JOIN t2 ON (t1.a == t2.a) ORDER BY (t1.a, t2.a); -SELECT '= right ='; -SELECT a, b, t2.a, t2.b FROM t1 RIGHT JOIN t2 ON (t1.a == t2.a) ORDER BY (t1.a, t2.a); -SELECT '= inner ='; -SELECT a, b, t2.a, t2.b FROM t1 INNER JOIN t2 ON (t1.a == t2.a) ORDER BY (t1.a, t2.a); - -SELECT '= full ='; -SELECT * FROM t1 FULL JOIN t2 ON (t1.a + t1.b = t2.a + t2.b - 100) ORDER BY (t1.a, t2.a); -SELECT '= left ='; -SELECT * FROM t1 LEFT JOIN t2 ON (t1.a + t1.b = t2.a + t2.b - 100) ORDER BY (t1.a, t2.a); -SELECT '= right ='; -SELECT * FROM t1 RIGHT JOIN t2 ON (t1.a + t1.b = t2.a + t2.b - 100) ORDER BY (t1.a, t2.a); -SELECT '= inner ='; -SELECT * FROM t1 INNER JOIN t2 ON (t1.a + t1.b = t2.a + t2.b - 100) ORDER BY (t1.a, t2.a); - --- Int64 and UInt64 has no supertype -SELECT * FROM t1 FULL JOIN t2 ON 
(t1.a + t1.b + 100 = t2.a + t2.b) ORDER BY (t1.a, t2.a); -- { serverError 53 } -SELECT * FROM t1 LEFT JOIN t2 ON (t1.a + t1.b + 100 = t2.a + t2.b) ORDER BY (t1.a, t2.a); -- { serverError 53 } -SELECT * FROM t1 RIGHT JOIN t2 ON (t1.a + t1.b + 100 = t2.a + t2.b) ORDER BY (t1.a, t2.a); -- { serverError 53 } -SELECT * FROM t1 INNER JOIN t2 ON (t1.a + t1.b + 100 = t2.a + t2.b) ORDER BY (t1.a, t2.a); -- { serverError 53 } - -SELECT '= agg ='; -SELECT sum(a) == 7 FROM t1 FULL JOIN t2 USING (a) WHERE b > 102 AND t2.b <= 204; -SELECT sum(a) == 7 FROM t1 INNER JOIN t2 USING (a) WHERE b > 102 AND t2.b <= 204; - -SELECT sum(b) = 103 FROM t1 LEFT JOIN t2 USING (a) WHERE b > 102 AND t2.b < 204; -SELECT sum(t2.b) = 203 FROM t1 RIGHT JOIN t2 USING (a) WHERE b > 102 AND t2.b < 204; - -SELECT sum(a) == 2 + 3 + 4 FROM t1 FULL JOIN t2 ON (t1.a + t1.b = t2.a + t2.b - 100) WHERE t1.b < 105 AND t2.b > 201; - -SELECT a > 0, sum(a), sum(b) FROM t1 FULL JOIN t2 USING (a) GROUP BY (a > 0) ORDER BY a > 0; -SELECT a > 0, sum(a), sum(t2.a), sum(b), sum(t2.b) FROM t1 FULL JOIN t2 ON (t1.a == t2.a) GROUP BY (a > 0) ORDER BY a > 0; - -SELECT '= types ='; -SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 FULL JOIN t2 USING (a); -SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 LEFT JOIN t2 USING (a); -SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 RIGHT JOIN t2 USING (a); -SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 INNER JOIN t2 USING (a); - -SELECT toTypeName(any(a)) == 'Int32' AND toTypeName(any(t2.a)) == 'Int32' FROM t1 FULL JOIN t2 USING (a); -SELECT min(toTypeName(a) == 'Int32' AND toTypeName(t2.a) == 'Int32') FROM t1 FULL JOIN t2 USING (a); - -SELECT any(toTypeName(a)) == 'UInt16' AND any(toTypeName(t2.a)) == 'Int16' FROM t1 FULL JOIN t2 ON (t1.a == t2.a); -SELECT any(toTypeName(a)) == 'UInt16' AND any(toTypeName(t2.a)) == 'Int16' FROM t1 LEFT JOIN t2 ON (t1.a == t2.a); -SELECT any(toTypeName(a)) == 'UInt16' AND any(toTypeName(t2.a)) == 'Int16' FROM t1 RIGHT JOIN t2 ON (t1.a == t2.a); -SELECT any(toTypeName(a)) == 'UInt16' AND any(toTypeName(t2.a)) == 'Int16' FROM t1 INNER JOIN t2 ON (t1.a == t2.a); -SELECT toTypeName(any(a)) == 'UInt16' AND toTypeName(any(t2.a)) == 'Int16' FROM t1 FULL JOIN t2 ON (t1.a == t2.a); - -SELECT '=== switch ==='; - -SET join_algorithm = 'auto'; -SET max_bytes_in_join = 100; - -SELECT '= full ='; -SELECT a, b, t2.b FROM t1 FULL JOIN t2 USING (a) ORDER BY (a); -SELECT '= left ='; -SELECT a, b, t2.b FROM t1 LEFT JOIN t2 USING (a) ORDER BY (a); -SELECT '= right ='; -SELECT a, b, t2.b FROM t1 RIGHT JOIN t2 USING (a) ORDER BY (a); -SELECT '= inner ='; -SELECT a, b, t2.b FROM t1 INNER JOIN t2 USING (a) ORDER BY (a); - -SELECT '= full ='; -SELECT a, t1.a, t2.a FROM t1 FULL JOIN t2 USING (a) ORDER BY (t1.a, t2.a); -SELECT '= left ='; -SELECT a, t1.a, t2.a FROM t1 LEFT JOIN t2 USING (a) ORDER BY (t1.a, t2.a); -SELECT '= right ='; -SELECT a, t1.a, t2.a FROM t1 RIGHT JOIN t2 USING (a) ORDER BY (t1.a, t2.a); -SELECT '= inner ='; -SELECT a, t1.a, t2.a FROM t1 INNER JOIN t2 USING (a) ORDER BY (t1.a, t2.a); - -SELECT '= join on ='; -SELECT '= full ='; -SELECT a, b, t2.a, t2.b FROM t1 FULL JOIN t2 ON (t1.a == t2.a) ORDER BY (t1.a, t2.a); -SELECT '= left ='; -SELECT a, b, t2.a, t2.b FROM t1 LEFT JOIN t2 ON (t1.a == t2.a) ORDER BY (t1.a, t2.a); -SELECT '= right ='; -SELECT a, b, t2.a, t2.b FROM t1 RIGHT JOIN t2 ON (t1.a == t2.a) ORDER BY (t1.a, t2.a); -SELECT '= inner 
='; -SELECT a, b, t2.a, t2.b FROM t1 INNER JOIN t2 ON (t1.a == t2.a) ORDER BY (t1.a, t2.a); - -SELECT '= full ='; -SELECT * FROM t1 FULL JOIN t2 ON (t1.a + t1.b = t2.a + t2.b - 100) ORDER BY (t1.a, t2.a); -SELECT '= left ='; -SELECT * FROM t1 LEFT JOIN t2 ON (t1.a + t1.b = t2.a + t2.b - 100) ORDER BY (t1.a, t2.a); -SELECT '= right ='; -SELECT * FROM t1 RIGHT JOIN t2 ON (t1.a + t1.b = t2.a + t2.b - 100) ORDER BY (t1.a, t2.a); -SELECT '= inner ='; -SELECT * FROM t1 INNER JOIN t2 ON (t1.a + t1.b = t2.a + t2.b - 100) ORDER BY (t1.a, t2.a); - --- Int64 and UInt64 has no supertype -SELECT * FROM t1 FULL JOIN t2 ON (t1.a + t1.b + 100 = t2.a + t2.b) ORDER BY (t1.a, t2.a); -- { serverError 53 } -SELECT * FROM t1 LEFT JOIN t2 ON (t1.a + t1.b + 100 = t2.a + t2.b) ORDER BY (t1.a, t2.a); -- { serverError 53 } -SELECT * FROM t1 RIGHT JOIN t2 ON (t1.a + t1.b + 100 = t2.a + t2.b) ORDER BY (t1.a, t2.a); -- { serverError 53 } -SELECT * FROM t1 INNER JOIN t2 ON (t1.a + t1.b + 100 = t2.a + t2.b) ORDER BY (t1.a, t2.a); -- { serverError 53 } - -SELECT '= agg ='; -SELECT sum(a) == 7 FROM t1 FULL JOIN t2 USING (a) WHERE b > 102 AND t2.b <= 204; -SELECT sum(a) == 7 FROM t1 INNER JOIN t2 USING (a) WHERE b > 102 AND t2.b <= 204; - -SELECT sum(b) = 103 FROM t1 LEFT JOIN t2 USING (a) WHERE b > 102 AND t2.b < 204; -SELECT sum(t2.b) = 203 FROM t1 RIGHT JOIN t2 USING (a) WHERE b > 102 AND t2.b < 204; - -SELECT sum(a) == 2 + 3 + 4 FROM t1 FULL JOIN t2 ON (t1.a + t1.b = t2.a + t2.b - 100) WHERE t1.b < 105 AND t2.b > 201; - -SELECT a > 0, sum(a), sum(b) FROM t1 FULL JOIN t2 USING (a) GROUP BY (a > 0) ORDER BY a > 0; -SELECT a > 0, sum(a), sum(t2.a), sum(b), sum(t2.b) FROM t1 FULL JOIN t2 ON (t1.a == t2.a) GROUP BY (a > 0) ORDER BY a > 0; - -SELECT '= types ='; -SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 FULL JOIN t2 USING (a); -SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 LEFT JOIN t2 USING (a); -SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 RIGHT JOIN t2 USING (a); -SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 INNER JOIN t2 USING (a); - -SELECT toTypeName(any(a)) == 'Int32' AND toTypeName(any(t2.a)) == 'Int32' FROM t1 FULL JOIN t2 USING (a); -SELECT min(toTypeName(a) == 'Int32' AND toTypeName(t2.a) == 'Int32') FROM t1 FULL JOIN t2 USING (a); - -SELECT any(toTypeName(a)) == 'UInt16' AND any(toTypeName(t2.a)) == 'Int16' FROM t1 FULL JOIN t2 ON (t1.a == t2.a); -SELECT any(toTypeName(a)) == 'UInt16' AND any(toTypeName(t2.a)) == 'Int16' FROM t1 LEFT JOIN t2 ON (t1.a == t2.a); -SELECT any(toTypeName(a)) == 'UInt16' AND any(toTypeName(t2.a)) == 'Int16' FROM t1 RIGHT JOIN t2 ON (t1.a == t2.a); -SELECT any(toTypeName(a)) == 'UInt16' AND any(toTypeName(t2.a)) == 'Int16' FROM t1 INNER JOIN t2 ON (t1.a == t2.a); -SELECT toTypeName(any(a)) == 'UInt16' AND toTypeName(any(t2.a)) == 'Int16' FROM t1 FULL JOIN t2 ON (t1.a == t2.a); - -SET max_bytes_in_join = 0; - -SELECT '=== join use nulls ==='; - -SET join_use_nulls = 1; - -SELECT '= full ='; -SELECT a, b, t2.b FROM t1 FULL JOIN t2 USING (a) ORDER BY (a); -SELECT '= right ='; -SELECT a, b, t2.b FROM t1 RIGHT JOIN t2 USING (a) ORDER BY (a); - -SET join_use_nulls = 0; - -DROP TABLE IF EXISTS t1; -DROP TABLE IF EXISTS t2; - -SELECT '=========='; - DROP TABLE IF EXISTS t_ab1; DROP TABLE IF EXISTS t_ab2; diff --git a/tests/queries/0_stateless/01721_join_implicit_cast_long.reference 
b/tests/queries/0_stateless/01721_join_implicit_cast_long.reference new file mode 100644 index 00000000000..785f8c7bacc --- /dev/null +++ b/tests/queries/0_stateless/01721_join_implicit_cast_long.reference @@ -0,0 +1,628 @@ +=== hash === += full = +-4 0 196 +-3 0 197 +-2 0 198 +-1 0 199 +0 0 200 +1 101 201 +2 102 202 +3 103 203 +4 104 204 +5 105 205 +6 106 \N +7 107 \N +8 108 \N +9 109 \N +10 110 \N += left = +1 101 201 +2 102 202 +3 103 203 +4 104 204 +5 105 205 +6 106 \N +7 107 \N +8 108 \N +9 109 \N +10 110 \N += right = +-4 0 196 +-3 0 197 +-2 0 198 +-1 0 199 +0 0 200 +1 101 201 +2 102 202 +3 103 203 +4 104 204 +5 105 205 += inner = +1 101 201 +2 102 202 +3 103 203 +4 104 204 +5 105 205 += full = +0 0 -4 +0 0 -3 +0 0 -2 +0 0 -1 +0 0 0 +1 1 1 +2 2 2 +3 3 3 +4 4 4 +5 5 5 +6 6 0 +7 7 0 +8 8 0 +9 9 0 +10 10 0 += left = +1 1 1 +2 2 2 +3 3 3 +4 4 4 +5 5 5 +6 6 0 +7 7 0 +8 8 0 +9 9 0 +10 10 0 += right = +0 0 -4 +0 0 -3 +0 0 -2 +0 0 -1 +0 0 0 +1 1 1 +2 2 2 +3 3 3 +4 4 4 +5 5 5 += inner = +1 1 1 +2 2 2 +3 3 3 +4 4 4 +5 5 5 += join on = += full = +0 0 -4 196 +0 0 -3 197 +0 0 -2 198 +0 0 -1 199 +0 0 0 200 +1 101 1 201 +2 102 2 202 +3 103 3 203 +4 104 4 204 +5 105 5 205 +6 106 0 \N +7 107 0 \N +8 108 0 \N +9 109 0 \N +10 110 0 \N += left = +1 101 1 201 +2 102 2 202 +3 103 3 203 +4 104 4 204 +5 105 5 205 +6 106 0 \N +7 107 0 \N +8 108 0 \N +9 109 0 \N +10 110 0 \N += right = +0 0 -4 196 +0 0 -3 197 +0 0 -2 198 +0 0 -1 199 +0 0 0 200 +1 101 1 201 +2 102 2 202 +3 103 3 203 +4 104 4 204 +5 105 5 205 += inner = +1 101 1 201 +2 102 2 202 +3 103 3 203 +4 104 4 204 +5 105 5 205 += full = +0 0 -4 196 +0 0 -3 197 +0 0 -2 198 +0 0 -1 199 +0 0 0 200 +1 101 1 201 +2 102 2 202 +3 103 3 203 +4 104 4 204 +5 105 5 205 +6 106 0 \N +7 107 0 \N +8 108 0 \N +9 109 0 \N +10 110 0 \N += left = +1 101 1 201 +2 102 2 202 +3 103 3 203 +4 104 4 204 +5 105 5 205 +6 106 0 \N +7 107 0 \N +8 108 0 \N +9 109 0 \N +10 110 0 \N += right = +0 0 -4 196 +0 0 -3 197 +0 0 -2 198 +0 0 -1 199 +0 0 0 200 +1 101 1 201 +2 102 2 202 +3 103 3 203 +4 104 4 204 +5 105 5 205 += inner = +1 101 1 201 +2 102 2 202 +3 103 3 203 +4 104 4 204 +5 105 5 205 += agg = +1 +1 +1 +1 +1 +0 -10 0 +1 55 1055 +0 0 -10 0 990 +1 55 15 1055 1015 += types = +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +=== partial_merge === += full = +-4 0 196 +-3 0 197 +-2 0 198 +-1 0 199 +0 0 200 +1 101 201 +2 102 202 +3 103 203 +4 104 204 +5 105 205 +6 106 \N +7 107 \N +8 108 \N +9 109 \N +10 110 \N += left = +1 101 201 +2 102 202 +3 103 203 +4 104 204 +5 105 205 +6 106 \N +7 107 \N +8 108 \N +9 109 \N +10 110 \N += right = +-4 0 196 +-3 0 197 +-2 0 198 +-1 0 199 +0 0 200 +1 101 201 +2 102 202 +3 103 203 +4 104 204 +5 105 205 += inner = +1 101 201 +2 102 202 +3 103 203 +4 104 204 +5 105 205 += full = +0 0 -4 +0 0 -3 +0 0 -2 +0 0 -1 +0 0 0 +1 1 1 +2 2 2 +3 3 3 +4 4 4 +5 5 5 +6 6 0 +7 7 0 +8 8 0 +9 9 0 +10 10 0 += left = +1 1 1 +2 2 2 +3 3 3 +4 4 4 +5 5 5 +6 6 0 +7 7 0 +8 8 0 +9 9 0 +10 10 0 += right = +0 0 -4 +0 0 -3 +0 0 -2 +0 0 -1 +0 0 0 +1 1 1 +2 2 2 +3 3 3 +4 4 4 +5 5 5 += inner = +1 1 1 +2 2 2 +3 3 3 +4 4 4 +5 5 5 += join on = += full = +0 0 -4 196 +0 0 -3 197 +0 0 -2 198 +0 0 -1 199 +0 0 0 200 +1 101 1 201 +2 102 2 202 +3 103 3 203 +4 104 4 204 +5 105 5 205 +6 106 0 \N +7 107 0 \N +8 108 0 \N +9 109 0 \N +10 110 0 \N += left = +1 101 1 201 +2 102 2 202 +3 103 3 203 +4 104 4 204 +5 105 5 205 +6 106 0 \N +7 107 0 \N +8 108 0 \N +9 109 0 \N +10 110 0 \N += right = +0 0 -4 196 +0 0 -3 197 +0 0 -2 198 +0 0 -1 199 +0 0 0 200 +1 101 1 201 +2 102 2 202 +3 103 3 203 +4 104 4 204 +5 105 5 205 += 
inner = +1 101 1 201 +2 102 2 202 +3 103 3 203 +4 104 4 204 +5 105 5 205 += full = +0 0 -4 196 +0 0 -3 197 +0 0 -2 198 +0 0 -1 199 +0 0 0 200 +1 101 1 201 +2 102 2 202 +3 103 3 203 +4 104 4 204 +5 105 5 205 +6 106 0 \N +7 107 0 \N +8 108 0 \N +9 109 0 \N +10 110 0 \N += left = +1 101 1 201 +2 102 2 202 +3 103 3 203 +4 104 4 204 +5 105 5 205 +6 106 0 \N +7 107 0 \N +8 108 0 \N +9 109 0 \N +10 110 0 \N += right = +0 0 -4 196 +0 0 -3 197 +0 0 -2 198 +0 0 -1 199 +0 0 0 200 +1 101 1 201 +2 102 2 202 +3 103 3 203 +4 104 4 204 +5 105 5 205 += inner = +1 101 1 201 +2 102 2 202 +3 103 3 203 +4 104 4 204 +5 105 5 205 += agg = +1 +1 +1 +1 +1 +0 -10 0 +1 55 1055 +0 0 -10 0 990 +1 55 15 1055 1015 += types = +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +=== switch === += full = +-4 0 196 +-3 0 197 +-2 0 198 +-1 0 199 +0 0 200 +1 101 201 +2 102 202 +3 103 203 +4 104 204 +5 105 205 +6 106 \N +7 107 \N +8 108 \N +9 109 \N +10 110 \N += left = +1 101 201 +2 102 202 +3 103 203 +4 104 204 +5 105 205 +6 106 \N +7 107 \N +8 108 \N +9 109 \N +10 110 \N += right = +-4 0 196 +-3 0 197 +-2 0 198 +-1 0 199 +0 0 200 +1 101 201 +2 102 202 +3 103 203 +4 104 204 +5 105 205 += inner = +1 101 201 +2 102 202 +3 103 203 +4 104 204 +5 105 205 += full = +0 0 -4 +0 0 -3 +0 0 -2 +0 0 -1 +0 0 0 +1 1 1 +2 2 2 +3 3 3 +4 4 4 +5 5 5 +6 6 0 +7 7 0 +8 8 0 +9 9 0 +10 10 0 += left = +1 1 1 +2 2 2 +3 3 3 +4 4 4 +5 5 5 +6 6 0 +7 7 0 +8 8 0 +9 9 0 +10 10 0 += right = +0 0 -4 +0 0 -3 +0 0 -2 +0 0 -1 +0 0 0 +1 1 1 +2 2 2 +3 3 3 +4 4 4 +5 5 5 += inner = +1 1 1 +2 2 2 +3 3 3 +4 4 4 +5 5 5 += join on = += full = +0 0 -4 196 +0 0 -3 197 +0 0 -2 198 +0 0 -1 199 +0 0 0 200 +1 101 1 201 +2 102 2 202 +3 103 3 203 +4 104 4 204 +5 105 5 205 +6 106 0 \N +7 107 0 \N +8 108 0 \N +9 109 0 \N +10 110 0 \N += left = +1 101 1 201 +2 102 2 202 +3 103 3 203 +4 104 4 204 +5 105 5 205 +6 106 0 \N +7 107 0 \N +8 108 0 \N +9 109 0 \N +10 110 0 \N += right = +0 0 -4 196 +0 0 -3 197 +0 0 -2 198 +0 0 -1 199 +0 0 0 200 +1 101 1 201 +2 102 2 202 +3 103 3 203 +4 104 4 204 +5 105 5 205 += inner = +1 101 1 201 +2 102 2 202 +3 103 3 203 +4 104 4 204 +5 105 5 205 += full = +0 0 -4 196 +0 0 -3 197 +0 0 -2 198 +0 0 -1 199 +0 0 0 200 +1 101 1 201 +2 102 2 202 +3 103 3 203 +4 104 4 204 +5 105 5 205 +6 106 0 \N +7 107 0 \N +8 108 0 \N +9 109 0 \N +10 110 0 \N += left = +1 101 1 201 +2 102 2 202 +3 103 3 203 +4 104 4 204 +5 105 5 205 +6 106 0 \N +7 107 0 \N +8 108 0 \N +9 109 0 \N +10 110 0 \N += right = +0 0 -4 196 +0 0 -3 197 +0 0 -2 198 +0 0 -1 199 +0 0 0 200 +1 101 1 201 +2 102 2 202 +3 103 3 203 +4 104 4 204 +5 105 5 205 += inner = +1 101 1 201 +2 102 2 202 +3 103 3 203 +4 104 4 204 +5 105 5 205 += agg = +1 +1 +1 +1 +1 +0 -10 0 +1 55 1055 +0 0 -10 0 990 +1 55 15 1055 1015 += types = +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +1 +=== join use nulls === += full = +-4 \N 196 +-3 \N 197 +-2 \N 198 +-1 \N 199 +0 \N 200 +1 101 201 +2 102 202 +3 103 203 +4 104 204 +5 105 205 +6 106 \N +7 107 \N +8 108 \N +9 109 \N +10 110 \N += right = +-4 \N 196 +-3 \N 197 +-2 \N 198 +-1 \N 199 +0 \N 200 +1 101 201 +2 102 202 +3 103 203 +4 104 204 +5 105 205 diff --git a/tests/queries/0_stateless/01721_join_implicit_cast_long.sql b/tests/queries/0_stateless/01721_join_implicit_cast_long.sql new file mode 100644 index 00000000000..a8e3c1b25e7 --- /dev/null +++ b/tests/queries/0_stateless/01721_join_implicit_cast_long.sql @@ -0,0 +1,246 @@ +DROP TABLE IF EXISTS t1; +DROP TABLE IF EXISTS t2; + +CREATE TABLE t1 (a UInt16, b UInt16) ENGINE = TinyLog; +CREATE TABLE t2 (a Int16, b Nullable(Int64)) ENGINE = TinyLog; + +INSERT 
INTO t1 SELECT number as a, 100 + number as b FROM system.numbers LIMIT 1, 10; +INSERT INTO t2 SELECT number - 5 as a, 200 + number - 5 as b FROM system.numbers LIMIT 1, 10; + +SELECT '=== hash ==='; +SET join_algorithm = 'hash'; + +SELECT '= full ='; +SELECT a, b, t2.b FROM t1 FULL JOIN t2 USING (a) ORDER BY (a); +SELECT '= left ='; +SELECT a, b, t2.b FROM t1 LEFT JOIN t2 USING (a) ORDER BY (a); +SELECT '= right ='; +SELECT a, b, t2.b FROM t1 RIGHT JOIN t2 USING (a) ORDER BY (a); +SELECT '= inner ='; +SELECT a, b, t2.b FROM t1 INNER JOIN t2 USING (a) ORDER BY (a); + +SELECT '= full ='; +SELECT a, t1.a, t2.a FROM t1 FULL JOIN t2 USING (a) ORDER BY (t1.a, t2.a); +SELECT '= left ='; +SELECT a, t1.a, t2.a FROM t1 LEFT JOIN t2 USING (a) ORDER BY (t1.a, t2.a); +SELECT '= right ='; +SELECT a, t1.a, t2.a FROM t1 RIGHT JOIN t2 USING (a) ORDER BY (t1.a, t2.a); +SELECT '= inner ='; +SELECT a, t1.a, t2.a FROM t1 INNER JOIN t2 USING (a) ORDER BY (t1.a, t2.a); + +SELECT '= join on ='; +SELECT '= full ='; +SELECT a, b, t2.a, t2.b FROM t1 FULL JOIN t2 ON (t1.a == t2.a) ORDER BY (t1.a, t2.a); +SELECT '= left ='; +SELECT a, b, t2.a, t2.b FROM t1 LEFT JOIN t2 ON (t1.a == t2.a) ORDER BY (t1.a, t2.a); +SELECT '= right ='; +SELECT a, b, t2.a, t2.b FROM t1 RIGHT JOIN t2 ON (t1.a == t2.a) ORDER BY (t1.a, t2.a); +SELECT '= inner ='; +SELECT a, b, t2.a, t2.b FROM t1 INNER JOIN t2 ON (t1.a == t2.a) ORDER BY (t1.a, t2.a); + +SELECT '= full ='; +SELECT * FROM t1 FULL JOIN t2 ON (t1.a + t1.b = t2.a + t2.b - 100) ORDER BY (t1.a, t2.a); +SELECT '= left ='; +SELECT * FROM t1 LEFT JOIN t2 ON (t1.a + t1.b = t2.a + t2.b - 100) ORDER BY (t1.a, t2.a); +SELECT '= right ='; +SELECT * FROM t1 RIGHT JOIN t2 ON (t1.a + t1.b = t2.a + t2.b - 100) ORDER BY (t1.a, t2.a); +SELECT '= inner ='; +SELECT * FROM t1 INNER JOIN t2 ON (t1.a + t1.b = t2.a + t2.b - 100) ORDER BY (t1.a, t2.a); + +-- Int64 and UInt64 has no supertype +SELECT * FROM t1 FULL JOIN t2 ON (t1.a + t1.b + 100 = t2.a + t2.b) ORDER BY (t1.a, t2.a); -- { serverError 53 } +SELECT * FROM t1 LEFT JOIN t2 ON (t1.a + t1.b + 100 = t2.a + t2.b) ORDER BY (t1.a, t2.a); -- { serverError 53 } +SELECT * FROM t1 RIGHT JOIN t2 ON (t1.a + t1.b + 100 = t2.a + t2.b) ORDER BY (t1.a, t2.a); -- { serverError 53 } +SELECT * FROM t1 INNER JOIN t2 ON (t1.a + t1.b + 100 = t2.a + t2.b) ORDER BY (t1.a, t2.a); -- { serverError 53 } + +SELECT '= agg ='; +SELECT sum(a) == 7 FROM t1 FULL JOIN t2 USING (a) WHERE b > 102 AND t2.b <= 204; +SELECT sum(a) == 7 FROM t1 INNER JOIN t2 USING (a) WHERE b > 102 AND t2.b <= 204; + +SELECT sum(b) = 103 FROM t1 LEFT JOIN t2 USING (a) WHERE b > 102 AND t2.b < 204; +SELECT sum(t2.b) = 203 FROM t1 RIGHT JOIN t2 USING (a) WHERE b > 102 AND t2.b < 204; + +SELECT sum(a) == 2 + 3 + 4 FROM t1 FULL JOIN t2 ON (t1.a + t1.b = t2.a + t2.b - 100) WHERE t1.b < 105 AND t2.b > 201; + +SELECT a > 0, sum(a), sum(b) FROM t1 FULL JOIN t2 USING (a) GROUP BY (a > 0) ORDER BY a > 0; +SELECT a > 0, sum(a), sum(t2.a), sum(b), sum(t2.b) FROM t1 FULL JOIN t2 ON (t1.a == t2.a) GROUP BY (a > 0) ORDER BY a > 0; + +SELECT '= types ='; +SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 FULL JOIN t2 USING (a); +SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 LEFT JOIN t2 USING (a); +SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 RIGHT JOIN t2 USING (a); +SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 INNER JOIN t2 USING (a); + +SELECT toTypeName(any(a)) == 'Int32' 
AND toTypeName(any(t2.a)) == 'Int32' FROM t1 FULL JOIN t2 USING (a); +SELECT min(toTypeName(a) == 'Int32' AND toTypeName(t2.a) == 'Int32') FROM t1 FULL JOIN t2 USING (a); + +SELECT any(toTypeName(a)) == 'UInt16' AND any(toTypeName(t2.a)) == 'Int16' FROM t1 FULL JOIN t2 ON (t1.a == t2.a); +SELECT any(toTypeName(a)) == 'UInt16' AND any(toTypeName(t2.a)) == 'Int16' FROM t1 LEFT JOIN t2 ON (t1.a == t2.a); +SELECT any(toTypeName(a)) == 'UInt16' AND any(toTypeName(t2.a)) == 'Int16' FROM t1 RIGHT JOIN t2 ON (t1.a == t2.a); +SELECT any(toTypeName(a)) == 'UInt16' AND any(toTypeName(t2.a)) == 'Int16' FROM t1 INNER JOIN t2 ON (t1.a == t2.a); +SELECT toTypeName(any(a)) == 'UInt16' AND toTypeName(any(t2.a)) == 'Int16' FROM t1 FULL JOIN t2 ON (t1.a == t2.a); + +SELECT '=== partial_merge ==='; + +SET join_algorithm = 'partial_merge'; + +SELECT '= full ='; +SELECT a, b, t2.b FROM t1 FULL JOIN t2 USING (a) ORDER BY (a); +SELECT '= left ='; +SELECT a, b, t2.b FROM t1 LEFT JOIN t2 USING (a) ORDER BY (a); +SELECT '= right ='; +SELECT a, b, t2.b FROM t1 RIGHT JOIN t2 USING (a) ORDER BY (a); +SELECT '= inner ='; +SELECT a, b, t2.b FROM t1 INNER JOIN t2 USING (a) ORDER BY (a); + +SELECT '= full ='; +SELECT a, t1.a, t2.a FROM t1 FULL JOIN t2 USING (a) ORDER BY (t1.a, t2.a); +SELECT '= left ='; +SELECT a, t1.a, t2.a FROM t1 LEFT JOIN t2 USING (a) ORDER BY (t1.a, t2.a); +SELECT '= right ='; +SELECT a, t1.a, t2.a FROM t1 RIGHT JOIN t2 USING (a) ORDER BY (t1.a, t2.a); +SELECT '= inner ='; +SELECT a, t1.a, t2.a FROM t1 INNER JOIN t2 USING (a) ORDER BY (t1.a, t2.a); + +SELECT '= join on ='; +SELECT '= full ='; +SELECT a, b, t2.a, t2.b FROM t1 FULL JOIN t2 ON (t1.a == t2.a) ORDER BY (t1.a, t2.a); +SELECT '= left ='; +SELECT a, b, t2.a, t2.b FROM t1 LEFT JOIN t2 ON (t1.a == t2.a) ORDER BY (t1.a, t2.a); +SELECT '= right ='; +SELECT a, b, t2.a, t2.b FROM t1 RIGHT JOIN t2 ON (t1.a == t2.a) ORDER BY (t1.a, t2.a); +SELECT '= inner ='; +SELECT a, b, t2.a, t2.b FROM t1 INNER JOIN t2 ON (t1.a == t2.a) ORDER BY (t1.a, t2.a); + +SELECT '= full ='; +SELECT * FROM t1 FULL JOIN t2 ON (t1.a + t1.b = t2.a + t2.b - 100) ORDER BY (t1.a, t2.a); +SELECT '= left ='; +SELECT * FROM t1 LEFT JOIN t2 ON (t1.a + t1.b = t2.a + t2.b - 100) ORDER BY (t1.a, t2.a); +SELECT '= right ='; +SELECT * FROM t1 RIGHT JOIN t2 ON (t1.a + t1.b = t2.a + t2.b - 100) ORDER BY (t1.a, t2.a); +SELECT '= inner ='; +SELECT * FROM t1 INNER JOIN t2 ON (t1.a + t1.b = t2.a + t2.b - 100) ORDER BY (t1.a, t2.a); + +-- Int64 and UInt64 has no supertype +SELECT * FROM t1 FULL JOIN t2 ON (t1.a + t1.b + 100 = t2.a + t2.b) ORDER BY (t1.a, t2.a); -- { serverError 53 } +SELECT * FROM t1 LEFT JOIN t2 ON (t1.a + t1.b + 100 = t2.a + t2.b) ORDER BY (t1.a, t2.a); -- { serverError 53 } +SELECT * FROM t1 RIGHT JOIN t2 ON (t1.a + t1.b + 100 = t2.a + t2.b) ORDER BY (t1.a, t2.a); -- { serverError 53 } +SELECT * FROM t1 INNER JOIN t2 ON (t1.a + t1.b + 100 = t2.a + t2.b) ORDER BY (t1.a, t2.a); -- { serverError 53 } + +SELECT '= agg ='; +SELECT sum(a) == 7 FROM t1 FULL JOIN t2 USING (a) WHERE b > 102 AND t2.b <= 204; +SELECT sum(a) == 7 FROM t1 INNER JOIN t2 USING (a) WHERE b > 102 AND t2.b <= 204; + +SELECT sum(b) = 103 FROM t1 LEFT JOIN t2 USING (a) WHERE b > 102 AND t2.b < 204; +SELECT sum(t2.b) = 203 FROM t1 RIGHT JOIN t2 USING (a) WHERE b > 102 AND t2.b < 204; + +SELECT sum(a) == 2 + 3 + 4 FROM t1 FULL JOIN t2 ON (t1.a + t1.b = t2.a + t2.b - 100) WHERE t1.b < 105 AND t2.b > 201; + +SELECT a > 0, sum(a), sum(b) FROM t1 FULL JOIN t2 USING (a) GROUP BY (a > 0) ORDER BY a > 0; +SELECT a > 0, 
sum(a), sum(t2.a), sum(b), sum(t2.b) FROM t1 FULL JOIN t2 ON (t1.a == t2.a) GROUP BY (a > 0) ORDER BY a > 0; + +SELECT '= types ='; +SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 FULL JOIN t2 USING (a); +SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 LEFT JOIN t2 USING (a); +SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 RIGHT JOIN t2 USING (a); +SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 INNER JOIN t2 USING (a); + +SELECT toTypeName(any(a)) == 'Int32' AND toTypeName(any(t2.a)) == 'Int32' FROM t1 FULL JOIN t2 USING (a); +SELECT min(toTypeName(a) == 'Int32' AND toTypeName(t2.a) == 'Int32') FROM t1 FULL JOIN t2 USING (a); + +SELECT any(toTypeName(a)) == 'UInt16' AND any(toTypeName(t2.a)) == 'Int16' FROM t1 FULL JOIN t2 ON (t1.a == t2.a); +SELECT any(toTypeName(a)) == 'UInt16' AND any(toTypeName(t2.a)) == 'Int16' FROM t1 LEFT JOIN t2 ON (t1.a == t2.a); +SELECT any(toTypeName(a)) == 'UInt16' AND any(toTypeName(t2.a)) == 'Int16' FROM t1 RIGHT JOIN t2 ON (t1.a == t2.a); +SELECT any(toTypeName(a)) == 'UInt16' AND any(toTypeName(t2.a)) == 'Int16' FROM t1 INNER JOIN t2 ON (t1.a == t2.a); +SELECT toTypeName(any(a)) == 'UInt16' AND toTypeName(any(t2.a)) == 'Int16' FROM t1 FULL JOIN t2 ON (t1.a == t2.a); + +SELECT '=== switch ==='; + +SET join_algorithm = 'auto'; +SET max_bytes_in_join = 100; + +SELECT '= full ='; +SELECT a, b, t2.b FROM t1 FULL JOIN t2 USING (a) ORDER BY (a); +SELECT '= left ='; +SELECT a, b, t2.b FROM t1 LEFT JOIN t2 USING (a) ORDER BY (a); +SELECT '= right ='; +SELECT a, b, t2.b FROM t1 RIGHT JOIN t2 USING (a) ORDER BY (a); +SELECT '= inner ='; +SELECT a, b, t2.b FROM t1 INNER JOIN t2 USING (a) ORDER BY (a); + +SELECT '= full ='; +SELECT a, t1.a, t2.a FROM t1 FULL JOIN t2 USING (a) ORDER BY (t1.a, t2.a); +SELECT '= left ='; +SELECT a, t1.a, t2.a FROM t1 LEFT JOIN t2 USING (a) ORDER BY (t1.a, t2.a); +SELECT '= right ='; +SELECT a, t1.a, t2.a FROM t1 RIGHT JOIN t2 USING (a) ORDER BY (t1.a, t2.a); +SELECT '= inner ='; +SELECT a, t1.a, t2.a FROM t1 INNER JOIN t2 USING (a) ORDER BY (t1.a, t2.a); + +SELECT '= join on ='; +SELECT '= full ='; +SELECT a, b, t2.a, t2.b FROM t1 FULL JOIN t2 ON (t1.a == t2.a) ORDER BY (t1.a, t2.a); +SELECT '= left ='; +SELECT a, b, t2.a, t2.b FROM t1 LEFT JOIN t2 ON (t1.a == t2.a) ORDER BY (t1.a, t2.a); +SELECT '= right ='; +SELECT a, b, t2.a, t2.b FROM t1 RIGHT JOIN t2 ON (t1.a == t2.a) ORDER BY (t1.a, t2.a); +SELECT '= inner ='; +SELECT a, b, t2.a, t2.b FROM t1 INNER JOIN t2 ON (t1.a == t2.a) ORDER BY (t1.a, t2.a); + +SELECT '= full ='; +SELECT * FROM t1 FULL JOIN t2 ON (t1.a + t1.b = t2.a + t2.b - 100) ORDER BY (t1.a, t2.a); +SELECT '= left ='; +SELECT * FROM t1 LEFT JOIN t2 ON (t1.a + t1.b = t2.a + t2.b - 100) ORDER BY (t1.a, t2.a); +SELECT '= right ='; +SELECT * FROM t1 RIGHT JOIN t2 ON (t1.a + t1.b = t2.a + t2.b - 100) ORDER BY (t1.a, t2.a); +SELECT '= inner ='; +SELECT * FROM t1 INNER JOIN t2 ON (t1.a + t1.b = t2.a + t2.b - 100) ORDER BY (t1.a, t2.a); + +-- Int64 and UInt64 has no supertype +SELECT * FROM t1 FULL JOIN t2 ON (t1.a + t1.b + 100 = t2.a + t2.b) ORDER BY (t1.a, t2.a); -- { serverError 53 } +SELECT * FROM t1 LEFT JOIN t2 ON (t1.a + t1.b + 100 = t2.a + t2.b) ORDER BY (t1.a, t2.a); -- { serverError 53 } +SELECT * FROM t1 RIGHT JOIN t2 ON (t1.a + t1.b + 100 = t2.a + t2.b) ORDER BY (t1.a, t2.a); -- { serverError 53 } +SELECT * FROM t1 INNER JOIN t2 ON (t1.a + t1.b + 100 = t2.a + t2.b) ORDER BY (t1.a, 
t2.a); -- { serverError 53 } + +SELECT '= agg ='; +SELECT sum(a) == 7 FROM t1 FULL JOIN t2 USING (a) WHERE b > 102 AND t2.b <= 204; +SELECT sum(a) == 7 FROM t1 INNER JOIN t2 USING (a) WHERE b > 102 AND t2.b <= 204; + +SELECT sum(b) = 103 FROM t1 LEFT JOIN t2 USING (a) WHERE b > 102 AND t2.b < 204; +SELECT sum(t2.b) = 203 FROM t1 RIGHT JOIN t2 USING (a) WHERE b > 102 AND t2.b < 204; + +SELECT sum(a) == 2 + 3 + 4 FROM t1 FULL JOIN t2 ON (t1.a + t1.b = t2.a + t2.b - 100) WHERE t1.b < 105 AND t2.b > 201; + +SELECT a > 0, sum(a), sum(b) FROM t1 FULL JOIN t2 USING (a) GROUP BY (a > 0) ORDER BY a > 0; +SELECT a > 0, sum(a), sum(t2.a), sum(b), sum(t2.b) FROM t1 FULL JOIN t2 ON (t1.a == t2.a) GROUP BY (a > 0) ORDER BY a > 0; + +SELECT '= types ='; +SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 FULL JOIN t2 USING (a); +SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 LEFT JOIN t2 USING (a); +SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 RIGHT JOIN t2 USING (a); +SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(t2.a)) == 'Int32' FROM t1 INNER JOIN t2 USING (a); + +SELECT toTypeName(any(a)) == 'Int32' AND toTypeName(any(t2.a)) == 'Int32' FROM t1 FULL JOIN t2 USING (a); +SELECT min(toTypeName(a) == 'Int32' AND toTypeName(t2.a) == 'Int32') FROM t1 FULL JOIN t2 USING (a); + +SELECT any(toTypeName(a)) == 'UInt16' AND any(toTypeName(t2.a)) == 'Int16' FROM t1 FULL JOIN t2 ON (t1.a == t2.a); +SELECT any(toTypeName(a)) == 'UInt16' AND any(toTypeName(t2.a)) == 'Int16' FROM t1 LEFT JOIN t2 ON (t1.a == t2.a); +SELECT any(toTypeName(a)) == 'UInt16' AND any(toTypeName(t2.a)) == 'Int16' FROM t1 RIGHT JOIN t2 ON (t1.a == t2.a); +SELECT any(toTypeName(a)) == 'UInt16' AND any(toTypeName(t2.a)) == 'Int16' FROM t1 INNER JOIN t2 ON (t1.a == t2.a); +SELECT toTypeName(any(a)) == 'UInt16' AND toTypeName(any(t2.a)) == 'Int16' FROM t1 FULL JOIN t2 ON (t1.a == t2.a); + +SET max_bytes_in_join = 0; + +SELECT '=== join use nulls ==='; + +SET join_use_nulls = 1; + +SELECT '= full ='; +SELECT a, b, t2.b FROM t1 FULL JOIN t2 USING (a) ORDER BY (a); +SELECT '= right ='; +SELECT a, b, t2.b FROM t1 RIGHT JOIN t2 USING (a) ORDER BY (a); + +SET join_use_nulls = 0; + +DROP TABLE IF EXISTS t1; +DROP TABLE IF EXISTS t2; From 7b8b77f258a170758b33b6213a2064fa80525087 Mon Sep 17 00:00:00 2001 From: vdimir Date: Fri, 19 Feb 2021 14:53:06 +0300 Subject: [PATCH 0489/2357] Add tests to join_implicit_cast checks column name clash --- tests/queries/0_stateless/01720_join_implicit_cast.sql | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/queries/0_stateless/01720_join_implicit_cast.sql b/tests/queries/0_stateless/01720_join_implicit_cast.sql index 653414e66af..cf4a3bdcef6 100644 --- a/tests/queries/0_stateless/01720_join_implicit_cast.sql +++ b/tests/queries/0_stateless/01720_join_implicit_cast.sql @@ -46,6 +46,9 @@ SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(b)) == 'Nullable(Int64)' SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(b)) == 'Nullable(Int64)' FROM t_ab1 RIGHT JOIN t_ab2 USING (a, b); SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(b)) == 'Nullable(Int64)' FROM t_ab1 INNER JOIN t_ab2 USING (a, b); +SELECT * FROM ( SELECT a, b as "CAST(a, Int32)" FROM t_ab1 ) t_ab1 FULL JOIN t_ab2 ON (t_ab1.a == t_ab2.a); -- { serverError 44 } +SELECT * FROM ( SELECT a, b as "CAST(a, Int32)" FROM t_ab1 ) t_ab1 FULL JOIN t_ab2 USING (a) FORMAT Null; + SELECT '=== partial_merge ==='; SET join_algorithm = 
'partial_merge'; @@ -86,5 +89,8 @@ SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(b)) == 'Nullable(Int64)' SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(b)) == 'Nullable(Int64)' FROM t_ab1 RIGHT JOIN t_ab2 USING (a, b); SELECT any(toTypeName(a)) == 'Int32' AND any(toTypeName(b)) == 'Nullable(Int64)' FROM t_ab1 INNER JOIN t_ab2 USING (a, b); +SELECT * FROM ( SELECT a, b as "CAST(a, Int32)" FROM t_ab1 ) t_ab1 FULL JOIN t_ab2 ON (t_ab1.a == t_ab2.a); -- { serverError 44 } +SELECT * FROM ( SELECT a, b as "CAST(a, Int32)" FROM t_ab1 ) t_ab1 FULL JOIN t_ab2 USING (a) FORMAT Null; + DROP TABLE IF EXISTS t_ab1; DROP TABLE IF EXISTS t_ab2; From e052a5a05e482317864593d14201a20a2b9ab28e Mon Sep 17 00:00:00 2001 From: vdimir Date: Fri, 19 Feb 2021 15:14:24 +0300 Subject: [PATCH 0490/2357] Revert "Support old cross to inner join rewrite behaviour" This reverts commit 527210b5e48af7d65fa726c49d4062cbf730f697. --- src/Core/Settings.h | 2 +- src/Interpreters/CrossToInnerJoinVisitor.cpp | 73 +++++++------------- src/Interpreters/CrossToInnerJoinVisitor.h | 2 +- 3 files changed, 26 insertions(+), 51 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index d533223852a..2ddd1e003ca 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -503,7 +503,7 @@ class IColumn; M(Bool, output_format_write_statistics, true, "Write statistics about read rows, bytes, time elapsed in suitable output formats.", 0) \ M(Bool, output_format_pretty_row_numbers, false, "Add row numbers before each row for pretty output format", 0) \ M(Bool, insert_distributed_one_random_shard, false, "If setting is enabled, inserting into distributed table will choose a random shard to write when there is no sharding key", 0) \ - M(UInt64, cross_to_inner_join_rewrite, 1, "Use inner join instead of comma/cross join if possible. Possible values: 0 - no rewrite, 1 - conservative mode, move only simple expressions to ON section, 2 - optimistic mode, move as much as possible", 0) \ + M(Bool, cross_to_inner_join_rewrite, true, "Use inner join instead of comma/cross join if possible", 0) \ // End of FORMAT_FACTORY_SETTINGS diff --git a/src/Interpreters/CrossToInnerJoinVisitor.cpp b/src/Interpreters/CrossToInnerJoinVisitor.cpp index 3f3e9adc605..b1e42b23ad5 100644 --- a/src/Interpreters/CrossToInnerJoinVisitor.cpp +++ b/src/Interpreters/CrossToInnerJoinVisitor.cpp @@ -29,8 +29,6 @@ namespace ErrorCodes namespace { -using TablesWithColumnNamesAndTypes = std::vector; - struct JoinedElement { explicit JoinedElement(const ASTTablesInSelectQueryElement & table_element) @@ -126,21 +124,27 @@ void collectConjunctions(const ASTPtr & node, std::vector & members) members.push_back(node); } -std::optional getIdentsMembership(const std::vector idents, - const TablesWithColumnNamesAndTypes & tables, +std::optional getIdentMembership(const ASTIdentifier & ident, const std::vector & tables) +{ + std::optional table_pos = IdentifierSemantic::getMembership(ident); + if (table_pos) + return table_pos; + return IdentifierSemantic::chooseTableColumnMatch(ident, tables); +} + +std::optional getIdentsMembership(const ASTPtr ast, + const std::vector & tables, const Aliases & aliases) { + auto idents = IdentifiersCollector::collect(ast); + std::optional result; for (const auto * ident : idents) { /// Moving expressions that use column aliases is not supported. 
if (ident->isShort() && aliases.count(ident->shortName())) return {}; - - std::optional pos = IdentifierSemantic::getMembership(*ident); - if (!pos) - pos = IdentifierSemantic::chooseTableColumnMatch(*ident, tables); - + const auto pos = getIdentMembership(*ident, tables); if (!pos) return {}; if (result && *pos != *result) @@ -150,33 +154,6 @@ std::optional getIdentsMembership(const std::vector> getArgumentsMembership( - const ASTPtr & left, const ASTPtr & right, const TablesWithColumnNamesAndTypes & tables, const Aliases & aliases, bool recursive) -{ - std::optional left_table_pos, right_table_pos; - if (recursive) - { - /// Collect all nested identifies - left_table_pos = getIdentsMembership(IdentifiersCollector::collect(left), tables, aliases); - right_table_pos = getIdentsMembership(IdentifiersCollector::collect(right), tables, aliases); - } - else - { - /// Use identifier only if it's on the top level - const auto * left_ident = left->as(); - const auto * right_ident = right->as(); - if (left_ident && right_ident) - { - left_table_pos = getIdentsMembership({left_ident}, tables, aliases); - right_table_pos = getIdentsMembership({right_ident}, tables, aliases); - } - } - - if (left_table_pos && right_table_pos) - return std::make_pair(*left_table_pos, *right_table_pos); - return {}; -} - bool isAllowedToRewriteCrossJoin(const ASTPtr & node, const Aliases & aliases) { if (node->as()) @@ -196,7 +173,6 @@ bool canMoveExpressionToJoinOn(const ASTPtr & ast, const std::vector & joined_tables, const std::vector & tables, const Aliases & aliases, - int rewrite_mode, std::map> & asts_to_join_on) { std::vector conjuncts; @@ -208,18 +184,17 @@ bool canMoveExpressionToJoinOn(const ASTPtr & ast, if (!func->arguments || func->arguments->children.size() != 2) return false; - bool optimistic_rewrite = rewrite_mode >= 2; - auto table_pos = getArgumentsMembership(func->arguments->children[0], func->arguments->children[1], - tables, aliases, optimistic_rewrite); - /// Check if the identifiers are from different joined tables. /// If it's a self joint, tables should have aliases. 
- if (table_pos && table_pos->first != table_pos->second) + auto left_table_pos = getIdentsMembership(func->arguments->children[0], tables, aliases); + auto right_table_pos = getIdentsMembership(func->arguments->children[1], tables, aliases); + + /// Identifiers from different table move to JOIN ON + if (left_table_pos && right_table_pos && *left_table_pos != *right_table_pos) { - /// Identifiers from different table move to JOIN ON - size_t max_table_pos = std::max(table_pos->first, table_pos->second); - if (joined_tables[max_table_pos].canAttachOnExpression()) - asts_to_join_on[max_table_pos].push_back(node); + size_t table_pos = std::max(*left_table_pos, *right_table_pos); + if (joined_tables[table_pos].canAttachOnExpression()) + asts_to_join_on[table_pos].push_back(node); else return false; } @@ -351,11 +326,11 @@ void CrossToInnerJoinMatcher::visit(ASTSelectQuery & select, ASTPtr &, Data & da /// CROSS to INNER - if (select.where() && data.cross_to_inner_join_rewrite > 0) + if (select.where() && data.cross_to_inner_join_rewrite) { std::map> asts_to_join_on; - bool can_move_where = canMoveExpressionToJoinOn( - select.where(), joined_tables, data.tables_with_columns, data.aliases, data.cross_to_inner_join_rewrite, asts_to_join_on); + bool can_move_where + = canMoveExpressionToJoinOn(select.where(), joined_tables, data.tables_with_columns, data.aliases, asts_to_join_on); if (can_move_where) { for (size_t i = 1; i < joined_tables.size(); ++i) diff --git a/src/Interpreters/CrossToInnerJoinVisitor.h b/src/Interpreters/CrossToInnerJoinVisitor.h index db9dd7ba79b..885cf8162c1 100644 --- a/src/Interpreters/CrossToInnerJoinVisitor.h +++ b/src/Interpreters/CrossToInnerJoinVisitor.h @@ -19,7 +19,7 @@ public: const Aliases & aliases; const String current_database; bool done = false; - int cross_to_inner_join_rewrite = true; + bool cross_to_inner_join_rewrite = true; }; static bool needChildVisit(ASTPtr &, const ASTPtr &); From 866dfaec793f764dc9ba167d3ac9f6521b9b3381 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Fri, 19 Feb 2021 15:25:22 +0300 Subject: [PATCH 0491/2357] Update 01731_async_task_queue_wait.sh --- tests/queries/0_stateless/01731_async_task_queue_wait.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/01731_async_task_queue_wait.sh b/tests/queries/0_stateless/01731_async_task_queue_wait.sh index 936f850791d..89d8b63d745 100755 --- a/tests/queries/0_stateless/01731_async_task_queue_wait.sh +++ b/tests/queries/0_stateless/01731_async_task_queue_wait.sh @@ -7,4 +7,4 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # regression for 'Empty task was returned from async task queue' during query # cancellation with async_socket_for_remote=1 (that ignores # max_distributed_connections) -$(timeout --signal=SIGINT 1 ${CLICKHOUSE_CLIENT} --max_distributed_connections=1 --max_block_size=2 --interactive_delay=900000 -q "select x from remote('127.{2,3}', view(select number + sleep(0.3) as x from numbers(16))) settings max_block_size = 2") 2>&1 | grep "Empty task was returned from async task queue" +$(timeout --signal=SIGINT 1 ${CLICKHOUSE_CLIENT} --max_distributed_connections=1 --max_block_size=2 --interactive_delay=900000 -q "select x from remote('127.{2,3}', view(select number + sleep(0.3) as x from numbers(16))) settings max_block_size = 2") 2>&1 | grep "Empty task was returned from async task queue" || true From 414f470c79eb22b0ca47b82f11625cf80b0231aa Mon Sep 17 00:00:00 2001 From: Ivan <5627721+abyss7@users.noreply.github.com> Date: 
Fri, 19 Feb 2021 15:51:26 +0300 Subject: [PATCH 0492/2357] Make Poco HTTP Server zero-copy again (#19516) * Refactoring: part 1 * Refactoring: part 2 * Handle request using ReadBuffer interface * Struggles with ReadBuffer's * Fix URI parsing * Implement parsing of multipart/form-data * Check HTTP_LENGTH_REQUIRED before eof() or will hang * Fix HTTPChunkedReadBuffer * Fix build and style * Fix test * Resist double-eof * Fix arcadian build --- base/daemon/BaseDaemon.h | 6 +- programs/odbc-bridge/ColumnInfoHandler.cpp | 12 +- programs/odbc-bridge/ColumnInfoHandler.h | 9 +- programs/odbc-bridge/HandlerFactory.cpp | 15 +- programs/odbc-bridge/HandlerFactory.h | 15 +- .../odbc-bridge/IdentifierQuoteHandler.cpp | 12 +- programs/odbc-bridge/IdentifierQuoteHandler.h | 7 +- programs/odbc-bridge/MainHandler.cpp | 22 +- programs/odbc-bridge/MainHandler.h | 11 +- programs/odbc-bridge/ODBCBridge.cpp | 10 +- programs/odbc-bridge/PingHandler.cpp | 2 +- programs/odbc-bridge/PingHandler.h | 14 +- programs/odbc-bridge/SchemaAllowedHandler.cpp | 12 +- programs/odbc-bridge/SchemaAllowedHandler.h | 11 +- programs/server/Server.cpp | 43 +- programs/server/Server.h | 3 +- src/CMakeLists.txt | 1 + src/Common/HTMLForm.h | 42 -- src/Common/StringUtils/StringUtils.h | 6 + src/Common/formatIPv6.h | 12 +- src/Common/hex.h | 4 +- src/Core/ExternalTable.cpp | 9 +- src/Core/ExternalTable.h | 24 +- src/IO/EmptyReadBuffer.h | 18 + src/IO/HTTPChunkedReadBuffer.cpp | 92 +++++ src/IO/HTTPChunkedReadBuffer.h | 25 ++ src/IO/HTTPCommon.cpp | 4 +- src/IO/HTTPCommon.h | 17 +- src/IO/LimitReadBuffer.cpp | 42 +- src/IO/LimitReadBuffer.h | 15 +- src/IO/PeekableReadBuffer.cpp | 17 +- src/IO/PeekableReadBuffer.h | 2 +- src/IO/ReadBuffer.h | 52 ++- src/IO/ReadBufferFromPocoSocket.cpp | 2 +- src/IO/ReadBufferFromPocoSocket.h | 13 +- src/IO/ReadHelpers.cpp | 19 + src/IO/ReadHelpers.h | 15 +- src/IO/ya.make | 2 +- src/Interpreters/InterserverIOHandler.h | 15 +- src/Server/HTTP/HTMLForm.cpp | 381 ++++++++++++++++++ src/Server/HTTP/HTMLForm.h | 175 ++++++++ src/Server/HTTP/HTTPRequest.h | 10 + src/Server/HTTP/HTTPRequestHandler.h | 19 + src/Server/HTTP/HTTPRequestHandlerFactory.h | 20 + src/Server/HTTP/HTTPResponse.h | 10 + src/Server/HTTP/HTTPServer.cpp | 48 +++ src/Server/HTTP/HTTPServer.h | 46 +++ src/Server/HTTP/HTTPServerConnection.cpp | 128 ++++++ src/Server/HTTP/HTTPServerConnection.h | 36 ++ .../HTTP/HTTPServerConnectionFactory.cpp | 19 + src/Server/HTTP/HTTPServerConnectionFactory.h | 25 ++ src/Server/HTTP/HTTPServerRequest.cpp | 123 ++++++ src/Server/HTTP/HTTPServerRequest.h | 59 +++ src/Server/HTTP/HTTPServerResponse.cpp | 163 ++++++++ src/Server/HTTP/HTTPServerResponse.h | 91 +++++ src/Server/HTTP/ReadHeaders.cpp | 88 ++++ src/Server/HTTP/ReadHeaders.h | 17 + .../WriteBufferFromHTTPServerResponse.cpp | 44 +- .../HTTP}/WriteBufferFromHTTPServerResponse.h | 41 +- src/Server/HTTPHandler.cpp | 194 ++++----- src/Server/HTTPHandler.h | 36 +- src/Server/HTTPHandlerFactory.cpp | 101 +++-- src/Server/HTTPHandlerFactory.h | 112 ++--- src/Server/HTTPHandlerRequestFilter.h | 48 +-- src/Server/InterserverIOHTTPHandler.cpp | 37 +- src/Server/InterserverIOHTTPHandler.h | 16 +- src/Server/NotFoundHandler.cpp | 31 +- src/Server/NotFoundHandler.h | 9 +- src/Server/PrometheusRequestHandler.cpp | 34 +- src/Server/PrometheusRequestHandler.h | 16 +- src/Server/ReplicasStatusHandler.cpp | 27 +- src/Server/ReplicasStatusHandler.h | 10 +- src/Server/StaticRequestHandler.cpp | 31 +- src/Server/StaticRequestHandler.h | 6 +- 
src/Server/WebUIRequestHandler.cpp | 6 +- src/Server/WebUIRequestHandler.h | 6 +- src/Server/ya.make | 8 + src/Storages/MergeTree/DataPartsExchange.cpp | 17 +- src/Storages/MergeTree/DataPartsExchange.h | 15 +- tests/queries/query_test.py | 2 +- 80 files changed, 2303 insertions(+), 654 deletions(-) delete mode 100644 src/Common/HTMLForm.h create mode 100644 src/IO/EmptyReadBuffer.h create mode 100644 src/IO/HTTPChunkedReadBuffer.cpp create mode 100644 src/IO/HTTPChunkedReadBuffer.h create mode 100644 src/Server/HTTP/HTMLForm.cpp create mode 100644 src/Server/HTTP/HTMLForm.h create mode 100644 src/Server/HTTP/HTTPRequest.h create mode 100644 src/Server/HTTP/HTTPRequestHandler.h create mode 100644 src/Server/HTTP/HTTPRequestHandlerFactory.h create mode 100644 src/Server/HTTP/HTTPResponse.h create mode 100644 src/Server/HTTP/HTTPServer.cpp create mode 100644 src/Server/HTTP/HTTPServer.h create mode 100644 src/Server/HTTP/HTTPServerConnection.cpp create mode 100644 src/Server/HTTP/HTTPServerConnection.h create mode 100644 src/Server/HTTP/HTTPServerConnectionFactory.cpp create mode 100644 src/Server/HTTP/HTTPServerConnectionFactory.h create mode 100644 src/Server/HTTP/HTTPServerRequest.cpp create mode 100644 src/Server/HTTP/HTTPServerRequest.h create mode 100644 src/Server/HTTP/HTTPServerResponse.cpp create mode 100644 src/Server/HTTP/HTTPServerResponse.h create mode 100644 src/Server/HTTP/ReadHeaders.cpp create mode 100644 src/Server/HTTP/ReadHeaders.h rename src/{IO => Server/HTTP}/WriteBufferFromHTTPServerResponse.cpp (81%) rename src/{IO => Server/HTTP}/WriteBufferFromHTTPServerResponse.h (86%) diff --git a/base/daemon/BaseDaemon.h b/base/daemon/BaseDaemon.h index 42d94629ae9..8b9d765cf2e 100644 --- a/base/daemon/BaseDaemon.h +++ b/base/daemon/BaseDaemon.h @@ -83,7 +83,7 @@ public: template void writeToGraphite(const std::string & key, const T & value, const std::string & config_name = DEFAULT_GRAPHITE_CONFIG_NAME, time_t timestamp = 0, const std::string & custom_root_path = "") { - auto writer = getGraphiteWriter(config_name); + auto *writer = getGraphiteWriter(config_name); if (writer) writer->write(key, value, timestamp, custom_root_path); } @@ -91,7 +91,7 @@ public: template void writeToGraphite(const GraphiteWriter::KeyValueVector & key_vals, const std::string & config_name = DEFAULT_GRAPHITE_CONFIG_NAME, time_t timestamp = 0, const std::string & custom_root_path = "") { - auto writer = getGraphiteWriter(config_name); + auto *writer = getGraphiteWriter(config_name); if (writer) writer->write(key_vals, timestamp, custom_root_path); } @@ -99,7 +99,7 @@ public: template void writeToGraphite(const GraphiteWriter::KeyValueVector & key_vals, const std::chrono::system_clock::time_point & current_time, const std::string & custom_root_path) { - auto writer = getGraphiteWriter(); + auto *writer = getGraphiteWriter(); if (writer) writer->write(key_vals, std::chrono::system_clock::to_time_t(current_time), custom_root_path); } diff --git a/programs/odbc-bridge/ColumnInfoHandler.cpp b/programs/odbc-bridge/ColumnInfoHandler.cpp index ee4daa3e16d..5aef7f1ac38 100644 --- a/programs/odbc-bridge/ColumnInfoHandler.cpp +++ b/programs/odbc-bridge/ColumnInfoHandler.cpp @@ -4,14 +4,14 @@ # include # include -# include +# include # include # include # include # include # include # include -# include +# include # include # include # include @@ -59,16 +59,16 @@ namespace } } -void ODBCColumnsInfoHandler::handleRequest(Poco::Net::HTTPServerRequest & request, Poco::Net::HTTPServerResponse & response) +void 
ODBCColumnsInfoHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse & response) { - Poco::Net::HTMLForm params(request, request.stream()); + HTMLForm params(request, request.getStream()); LOG_TRACE(log, "Request URI: {}", request.getURI()); auto process_error = [&response, this](const std::string & message) { response.setStatusAndReason(Poco::Net::HTTPResponse::HTTP_INTERNAL_SERVER_ERROR); if (!response.sent()) - response.send() << message << std::endl; + *response.send() << message << std::endl; LOG_WARNING(log, message); }; @@ -159,7 +159,7 @@ void ODBCColumnsInfoHandler::handleRequest(Poco::Net::HTTPServerRequest & reques columns.emplace_back(reinterpret_cast(column_name), std::move(column_type)); } - WriteBufferFromHTTPServerResponse out(request, response, keep_alive_timeout); + WriteBufferFromHTTPServerResponse out(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, keep_alive_timeout); writeStringBinary(columns.toString(), out); } catch (...) diff --git a/programs/odbc-bridge/ColumnInfoHandler.h b/programs/odbc-bridge/ColumnInfoHandler.h index 04b4c06693b..9b5b470b31d 100644 --- a/programs/odbc-bridge/ColumnInfoHandler.h +++ b/programs/odbc-bridge/ColumnInfoHandler.h @@ -3,10 +3,11 @@ #if USE_ODBC # include -# include -# include +# include # include +# include + /** The structure of the table is taken from the query "SELECT * FROM table WHERE 1=0". * TODO: It would be much better to utilize ODBC methods dedicated for columns description. * If there is no such table, an exception is thrown. @@ -14,7 +15,7 @@ namespace DB { -class ODBCColumnsInfoHandler : public Poco::Net::HTTPRequestHandler +class ODBCColumnsInfoHandler : public HTTPRequestHandler { public: ODBCColumnsInfoHandler(size_t keep_alive_timeout_, Context & context_) @@ -22,7 +23,7 @@ public: { } - void handleRequest(Poco::Net::HTTPServerRequest & request, Poco::Net::HTTPServerResponse & response) override; + void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response) override; private: Poco::Logger * log; diff --git a/programs/odbc-bridge/HandlerFactory.cpp b/programs/odbc-bridge/HandlerFactory.cpp index 0cc40480b87..9ac48af4ace 100644 --- a/programs/odbc-bridge/HandlerFactory.cpp +++ b/programs/odbc-bridge/HandlerFactory.cpp @@ -7,39 +7,40 @@ namespace DB { -Poco::Net::HTTPRequestHandler * HandlerFactory::createRequestHandler(const Poco::Net::HTTPServerRequest & request) + +std::unique_ptr HandlerFactory::createRequestHandler(const HTTPServerRequest & request) { Poco::URI uri{request.getURI()}; LOG_TRACE(log, "Request URI: {}", uri.toString()); if (uri.getPath() == "/ping" && request.getMethod() == Poco::Net::HTTPRequest::HTTP_GET) - return new PingHandler(keep_alive_timeout); + return std::make_unique(keep_alive_timeout); if (request.getMethod() == Poco::Net::HTTPRequest::HTTP_POST) { if (uri.getPath() == "/columns_info") #if USE_ODBC - return new ODBCColumnsInfoHandler(keep_alive_timeout, context); + return std::make_unique(keep_alive_timeout, context); #else return nullptr; #endif else if (uri.getPath() == "/identifier_quote") #if USE_ODBC - return new IdentifierQuoteHandler(keep_alive_timeout, context); + return std::make_unique(keep_alive_timeout, context); #else return nullptr; #endif else if (uri.getPath() == "/schema_allowed") #if USE_ODBC - return new SchemaAllowedHandler(keep_alive_timeout, context); + return std::make_unique(keep_alive_timeout, context); #else return nullptr; #endif else if (uri.getPath() == "/write") - return new ODBCHandler(pool_map, 
keep_alive_timeout, context, "write"); + return std::make_unique(pool_map, keep_alive_timeout, context, "write"); else - return new ODBCHandler(pool_map, keep_alive_timeout, context, "read"); + return std::make_unique(pool_map, keep_alive_timeout, context, "read"); } return nullptr; } diff --git a/programs/odbc-bridge/HandlerFactory.h b/programs/odbc-bridge/HandlerFactory.h index 1d4edfc9dd1..5dce6f02ecd 100644 --- a/programs/odbc-bridge/HandlerFactory.h +++ b/programs/odbc-bridge/HandlerFactory.h @@ -1,16 +1,17 @@ #pragma once + #include -#include -#include -#include -#include "MainHandler.h" +#include #include "ColumnInfoHandler.h" #include "IdentifierQuoteHandler.h" +#include "MainHandler.h" #include "SchemaAllowedHandler.h" +#include + #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-parameter" - #include +#include #pragma GCC diagnostic pop @@ -19,7 +20,7 @@ namespace DB /** Factory for '/ping', '/', '/columns_info', '/identifier_quote', '/schema_allowed' handlers. * Also stores Session pools for ODBC connections */ -class HandlerFactory : public Poco::Net::HTTPRequestHandlerFactory +class HandlerFactory : public HTTPRequestHandlerFactory { public: HandlerFactory(const std::string & name_, size_t keep_alive_timeout_, Context & context_) @@ -28,7 +29,7 @@ public: pool_map = std::make_shared(); } - Poco::Net::HTTPRequestHandler * createRequestHandler(const Poco::Net::HTTPServerRequest & request) override; + std::unique_ptr createRequestHandler(const HTTPServerRequest & request) override; private: Poco::Logger * log; diff --git a/programs/odbc-bridge/IdentifierQuoteHandler.cpp b/programs/odbc-bridge/IdentifierQuoteHandler.cpp index 2c3701cfff9..ec4e4493d61 100644 --- a/programs/odbc-bridge/IdentifierQuoteHandler.cpp +++ b/programs/odbc-bridge/IdentifierQuoteHandler.cpp @@ -3,14 +3,14 @@ #if USE_ODBC # include -# include +# include +# include # include # include # include # include # include # include -# include # include # include # include @@ -22,16 +22,16 @@ namespace DB { -void IdentifierQuoteHandler::handleRequest(Poco::Net::HTTPServerRequest & request, Poco::Net::HTTPServerResponse & response) +void IdentifierQuoteHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse & response) { - Poco::Net::HTMLForm params(request, request.stream()); + HTMLForm params(request, request.getStream()); LOG_TRACE(log, "Request URI: {}", request.getURI()); auto process_error = [&response, this](const std::string & message) { response.setStatusAndReason(Poco::Net::HTTPResponse::HTTP_INTERNAL_SERVER_ERROR); if (!response.sent()) - response.send() << message << std::endl; + *response.send() << message << std::endl; LOG_WARNING(log, message); }; @@ -49,7 +49,7 @@ void IdentifierQuoteHandler::handleRequest(Poco::Net::HTTPServerRequest & reques auto identifier = getIdentifierQuote(hdbc); - WriteBufferFromHTTPServerResponse out(request, response, keep_alive_timeout); + WriteBufferFromHTTPServerResponse out(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, keep_alive_timeout); writeStringBinary(identifier, out); } catch (...) 
diff --git a/programs/odbc-bridge/IdentifierQuoteHandler.h b/programs/odbc-bridge/IdentifierQuoteHandler.h index fd357e32786..dad88c72ad8 100644 --- a/programs/odbc-bridge/IdentifierQuoteHandler.h +++ b/programs/odbc-bridge/IdentifierQuoteHandler.h @@ -1,8 +1,9 @@ #pragma once #include +#include + #include -#include #if USE_ODBC @@ -10,7 +11,7 @@ namespace DB { -class IdentifierQuoteHandler : public Poco::Net::HTTPRequestHandler +class IdentifierQuoteHandler : public HTTPRequestHandler { public: IdentifierQuoteHandler(size_t keep_alive_timeout_, Context &) @@ -18,7 +19,7 @@ public: { } - void handleRequest(Poco::Net::HTTPServerRequest & request, Poco::Net::HTTPServerResponse & response) override; + void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response) override; private: Poco::Logger * log; diff --git a/programs/odbc-bridge/MainHandler.cpp b/programs/odbc-bridge/MainHandler.cpp index 64cb7bc0b46..b9670397878 100644 --- a/programs/odbc-bridge/MainHandler.cpp +++ b/programs/odbc-bridge/MainHandler.cpp @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include #include #include @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -73,19 +74,19 @@ ODBCHandler::PoolPtr ODBCHandler::getPool(const std::string & connection_str) return pool_map->at(connection_str); } -void ODBCHandler::processError(Poco::Net::HTTPServerResponse & response, const std::string & message) +void ODBCHandler::processError(HTTPServerResponse & response, const std::string & message) { - response.setStatusAndReason(Poco::Net::HTTPResponse::HTTP_INTERNAL_SERVER_ERROR); + response.setStatusAndReason(HTTPResponse::HTTP_INTERNAL_SERVER_ERROR); if (!response.sent()) - response.send() << message << std::endl; + *response.send() << message << std::endl; LOG_WARNING(log, message); } -void ODBCHandler::handleRequest(Poco::Net::HTTPServerRequest & request, Poco::Net::HTTPServerResponse & response) +void ODBCHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse & response) { - Poco::Net::HTMLForm params(request); + HTMLForm params(request); if (mode == "read") - params.read(request.stream()); + params.read(request.getStream()); LOG_TRACE(log, "Request URI: {}", request.getURI()); if (mode == "read" && !params.has("query")) @@ -136,7 +137,7 @@ void ODBCHandler::handleRequest(Poco::Net::HTTPServerRequest & request, Poco::Ne std::string connection_string = params.get("connection_string"); LOG_TRACE(log, "Connection string: '{}'", connection_string); - WriteBufferFromHTTPServerResponse out(request, response, keep_alive_timeout); + WriteBufferFromHTTPServerResponse out(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, keep_alive_timeout); try { @@ -163,9 +164,8 @@ void ODBCHandler::handleRequest(Poco::Net::HTTPServerRequest & request, Poco::Ne #endif auto pool = getPool(connection_string); - ReadBufferFromIStream read_buf(request.stream()); - auto input_format = FormatFactory::instance().getInput(format, read_buf, *sample_block, - context, max_block_size); + auto & read_buf = request.getStream(); + auto input_format = FormatFactory::instance().getInput(format, read_buf, *sample_block, context, max_block_size); auto input_stream = std::make_shared(input_format); ODBCBlockOutputStream output_stream(pool->get(), db_name, table_name, *sample_block, quoting_style); copyData(*input_stream, output_stream); diff --git a/programs/odbc-bridge/MainHandler.h b/programs/odbc-bridge/MainHandler.h index ec5e6693a60..e237ede5814 100644 --- 
a/programs/odbc-bridge/MainHandler.h +++ b/programs/odbc-bridge/MainHandler.h @@ -1,12 +1,13 @@ #pragma once #include +#include + #include -#include #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-parameter" - #include +#include #pragma GCC diagnostic pop namespace DB @@ -16,7 +17,7 @@ namespace DB * and also query in request body * response in RowBinary format */ -class ODBCHandler : public Poco::Net::HTTPRequestHandler +class ODBCHandler : public HTTPRequestHandler { public: using PoolPtr = std::shared_ptr; @@ -34,7 +35,7 @@ public: { } - void handleRequest(Poco::Net::HTTPServerRequest & request, Poco::Net::HTTPServerResponse & response) override; + void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response) override; private: Poco::Logger * log; @@ -47,7 +48,7 @@ private: static inline std::mutex mutex; PoolPtr getPool(const std::string & connection_str); - void processError(Poco::Net::HTTPServerResponse & response, const std::string & message); + void processError(HTTPServerResponse & response, const std::string & message); }; } diff --git a/programs/odbc-bridge/ODBCBridge.cpp b/programs/odbc-bridge/ODBCBridge.cpp index 9deefaf7895..8869a2639c1 100644 --- a/programs/odbc-bridge/ODBCBridge.cpp +++ b/programs/odbc-bridge/ODBCBridge.cpp @@ -11,7 +11,6 @@ # include #endif -#include #include #include #include @@ -23,6 +22,7 @@ #include #include #include +#include namespace DB @@ -212,8 +212,12 @@ int ODBCBridge::main(const std::vector & /*args*/) SensitiveDataMasker::setInstance(std::make_unique(config(), "query_masking_rules")); } - auto server = Poco::Net::HTTPServer( - new HandlerFactory("ODBCRequestHandlerFactory-factory", keep_alive_timeout, context), server_pool, socket, http_params); + auto server = HTTPServer( + context, + std::make_shared("ODBCRequestHandlerFactory-factory", keep_alive_timeout, context), + server_pool, + socket, + http_params); server.start(); LOG_INFO(log, "Listening http://{}", address.toString()); diff --git a/programs/odbc-bridge/PingHandler.cpp b/programs/odbc-bridge/PingHandler.cpp index b0313e46bf3..e3ab5e5cd00 100644 --- a/programs/odbc-bridge/PingHandler.cpp +++ b/programs/odbc-bridge/PingHandler.cpp @@ -6,7 +6,7 @@ namespace DB { -void PingHandler::handleRequest(Poco::Net::HTTPServerRequest & /*request*/, Poco::Net::HTTPServerResponse & response) +void PingHandler::handleRequest(HTTPServerRequest & /* request */, HTTPServerResponse & response) { try { diff --git a/programs/odbc-bridge/PingHandler.h b/programs/odbc-bridge/PingHandler.h index d8109a50bb6..c969ec55af7 100644 --- a/programs/odbc-bridge/PingHandler.h +++ b/programs/odbc-bridge/PingHandler.h @@ -1,17 +1,19 @@ #pragma once -#include + +#include namespace DB { -/** Simple ping handler, answers "Ok." to GET request - */ -class PingHandler : public Poco::Net::HTTPRequestHandler + +/// Simple ping handler, answers "Ok." 
to GET request +class PingHandler : public HTTPRequestHandler { public: - PingHandler(size_t keep_alive_timeout_) : keep_alive_timeout(keep_alive_timeout_) {} - void handleRequest(Poco::Net::HTTPServerRequest & request, Poco::Net::HTTPServerResponse & response) override; + explicit PingHandler(size_t keep_alive_timeout_) : keep_alive_timeout(keep_alive_timeout_) {} + void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response) override; private: size_t keep_alive_timeout; }; + } diff --git a/programs/odbc-bridge/SchemaAllowedHandler.cpp b/programs/odbc-bridge/SchemaAllowedHandler.cpp index fa08a27da59..48744b6d2ca 100644 --- a/programs/odbc-bridge/SchemaAllowedHandler.cpp +++ b/programs/odbc-bridge/SchemaAllowedHandler.cpp @@ -2,12 +2,12 @@ #if USE_ODBC -# include +# include +# include # include # include # include # include -# include # include # include # include @@ -33,16 +33,16 @@ namespace } -void SchemaAllowedHandler::handleRequest(Poco::Net::HTTPServerRequest & request, Poco::Net::HTTPServerResponse & response) +void SchemaAllowedHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse & response) { - Poco::Net::HTMLForm params(request, request.stream()); + HTMLForm params(request, request.getStream()); LOG_TRACE(log, "Request URI: {}", request.getURI()); auto process_error = [&response, this](const std::string & message) { response.setStatusAndReason(Poco::Net::HTTPResponse::HTTP_INTERNAL_SERVER_ERROR); if (!response.sent()) - response.send() << message << std::endl; + *response.send() << message << std::endl; LOG_WARNING(log, message); }; @@ -60,7 +60,7 @@ void SchemaAllowedHandler::handleRequest(Poco::Net::HTTPServerRequest & request, bool result = isSchemaAllowed(hdbc); - WriteBufferFromHTTPServerResponse out(request, response, keep_alive_timeout); + WriteBufferFromHTTPServerResponse out(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, keep_alive_timeout); writeBoolText(result, out); } catch (...) diff --git a/programs/odbc-bridge/SchemaAllowedHandler.h b/programs/odbc-bridge/SchemaAllowedHandler.h index 76aa23b903c..91eddf67803 100644 --- a/programs/odbc-bridge/SchemaAllowedHandler.h +++ b/programs/odbc-bridge/SchemaAllowedHandler.h @@ -1,17 +1,18 @@ #pragma once +#include + #include -#include #if USE_ODBC namespace DB { + class Context; - -/// This handler establishes connection to database, and retrieve whether schema is allowed. -class SchemaAllowedHandler : public Poco::Net::HTTPRequestHandler +/// This handler establishes connection to database, and retrieves whether schema is allowed. 
+class SchemaAllowedHandler : public HTTPRequestHandler { public: SchemaAllowedHandler(size_t keep_alive_timeout_, Context &) @@ -19,7 +20,7 @@ public: { } - void handleRequest(Poco::Net::HTTPServerRequest & request, Poco::Net::HTTPServerResponse & response) override; + void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response) override; private: Poco::Logger * log; diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index a96cb2b8973..4194bb4a06b 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -69,6 +69,7 @@ #include #include #include +#include #if !defined(ARCADIA_BUILD) @@ -1070,8 +1071,10 @@ int Server::main(const std::vector & /*args*/) socket.setReceiveTimeout(settings.http_receive_timeout); socket.setSendTimeout(settings.http_send_timeout); - servers->emplace_back(port_name, std::make_unique( - createHandlerFactory(*this, async_metrics, "HTTPHandler-factory"), server_pool, socket, http_params)); + servers->emplace_back( + port_name, + std::make_unique( + context(), createHandlerFactory(*this, async_metrics, "HTTPHandler-factory"), server_pool, socket, http_params)); LOG_INFO(log, "Listening for http://{}", address.toString()); }); @@ -1085,8 +1088,10 @@ int Server::main(const std::vector & /*args*/) auto address = socketBindListen(socket, listen_host, port, /* secure = */ true); socket.setReceiveTimeout(settings.http_receive_timeout); socket.setSendTimeout(settings.http_send_timeout); - servers->emplace_back(port_name, std::make_unique( - createHandlerFactory(*this, async_metrics, "HTTPSHandler-factory"), server_pool, socket, http_params)); + servers->emplace_back( + port_name, + std::make_unique( + context(), createHandlerFactory(*this, async_metrics, "HTTPSHandler-factory"), server_pool, socket, http_params)); LOG_INFO(log, "Listening for https://{}", address.toString()); #else @@ -1160,8 +1165,14 @@ int Server::main(const std::vector & /*args*/) auto address = socketBindListen(socket, listen_host, port); socket.setReceiveTimeout(settings.http_receive_timeout); socket.setSendTimeout(settings.http_send_timeout); - servers->emplace_back(port_name, std::make_unique( - createHandlerFactory(*this, async_metrics, "InterserverIOHTTPHandler-factory"), server_pool, socket, http_params)); + servers->emplace_back( + port_name, + std::make_unique( + context(), + createHandlerFactory(*this, async_metrics, "InterserverIOHTTPHandler-factory"), + server_pool, + socket, + http_params)); LOG_INFO(log, "Listening for replica communication (interserver): http://{}", address.toString()); }); @@ -1174,8 +1185,14 @@ int Server::main(const std::vector & /*args*/) auto address = socketBindListen(socket, listen_host, port, /* secure = */ true); socket.setReceiveTimeout(settings.http_receive_timeout); socket.setSendTimeout(settings.http_send_timeout); - servers->emplace_back(port_name, std::make_unique( - createHandlerFactory(*this, async_metrics, "InterserverIOHTTPSHandler-factory"), server_pool, socket, http_params)); + servers->emplace_back( + port_name, + std::make_unique( + context(), + createHandlerFactory(*this, async_metrics, "InterserverIOHTTPSHandler-factory"), + server_pool, + socket, + http_params)); LOG_INFO(log, "Listening for secure replica communication (interserver): https://{}", address.toString()); #else @@ -1235,8 +1252,14 @@ int Server::main(const std::vector & /*args*/) auto address = socketBindListen(socket, listen_host, port); socket.setReceiveTimeout(settings.http_receive_timeout); 
socket.setSendTimeout(settings.http_send_timeout); - servers->emplace_back(port_name, std::make_unique( - createHandlerFactory(*this, async_metrics, "PrometheusHandler-factory"), server_pool, socket, http_params)); + servers->emplace_back( + port_name, + std::make_unique( + context(), + createHandlerFactory(*this, async_metrics, "PrometheusHandler-factory"), + server_pool, + socket, + http_params)); LOG_INFO(log, "Listening for Prometheus: http://{}", address.toString()); }); diff --git a/programs/server/Server.h b/programs/server/Server.h index c582e475308..fbfc26f6ee5 100644 --- a/programs/server/Server.h +++ b/programs/server/Server.h @@ -51,6 +51,7 @@ public: } void defineOptions(Poco::Util::OptionSet & _options) override; + protected: int run() override; @@ -65,8 +66,6 @@ protected: private: Context * global_context_ptr = nullptr; -private: - Poco::Net::SocketAddress socketBindListen(Poco::Net::ServerSocket & socket, const std::string & host, UInt16 port, [[maybe_unused]] bool secure = false) const; using CreateServerFunc = std::function; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index d370016da00..215a13cce1a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -181,6 +181,7 @@ add_object_library(clickhouse_storages_mergetree Storages/MergeTree) add_object_library(clickhouse_storages_liveview Storages/LiveView) add_object_library(clickhouse_client Client) add_object_library(clickhouse_server Server) +add_object_library(clickhouse_server_http Server/HTTP) add_object_library(clickhouse_formats Formats) add_object_library(clickhouse_processors Processors) add_object_library(clickhouse_processors_executors Processors/Executors) diff --git a/src/Common/HTMLForm.h b/src/Common/HTMLForm.h deleted file mode 100644 index 2b62167dce7..00000000000 --- a/src/Common/HTMLForm.h +++ /dev/null @@ -1,42 +0,0 @@ -#pragma once - -#include -#include -#include -#include - -#include - - -/** Somehow, in case of POST, Poco::Net::HTMLForm doesn't read parameters from URL, only from body. - * This helper allows to read parameters just from URL. - */ -struct HTMLForm : public Poco::Net::HTMLForm -{ - HTMLForm(const Poco::Net::HTTPRequest & request) - { - Poco::URI uri(request.getURI()); - std::istringstream istr(uri.getRawQuery()); // STYLE_CHECK_ALLOW_STD_STRING_STREAM - readUrl(istr); - } - - HTMLForm(const Poco::URI & uri) - { - std::istringstream istr(uri.getRawQuery()); // STYLE_CHECK_ALLOW_STD_STRING_STREAM - readUrl(istr); - } - - - template - T getParsed(const std::string & key, T default_value) - { - auto it = find(key); - return (it != end()) ? DB::parse(it->second) : default_value; - } - - template - T getParsed(const std::string & key) - { - return DB::parse(get(key)); - } -}; diff --git a/src/Common/StringUtils/StringUtils.h b/src/Common/StringUtils/StringUtils.h index 904e3035dd8..cb2227f01a8 100644 --- a/src/Common/StringUtils/StringUtils.h +++ b/src/Common/StringUtils/StringUtils.h @@ -120,6 +120,12 @@ inline bool isWhitespaceASCII(char c) return c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == '\v'; } +/// Since |isWhiteSpaceASCII()| is used inside algorithms it's easier to implement another function than add extra argument. 
+inline bool isWhitespaceASCIIOneLine(char c) +{ + return c == ' ' || c == '\t' || c == '\r' || c == '\f' || c == '\v'; +} + inline bool isControlASCII(char c) { return static_cast(c) <= 31; diff --git a/src/Common/formatIPv6.h b/src/Common/formatIPv6.h index 63c064b21f8..bd0c68d70f9 100644 --- a/src/Common/formatIPv6.h +++ b/src/Common/formatIPv6.h @@ -85,9 +85,9 @@ inline bool parseIPv6(const char * src, unsigned char * dst) return clear_dst(); unsigned char tmp[IPV6_BINARY_LENGTH]{}; - auto tp = tmp; - auto endp = tp + IPV6_BINARY_LENGTH; - auto curtok = src; + auto * tp = tmp; + auto * endp = tp + IPV6_BINARY_LENGTH; + const auto * curtok = src; auto saw_xdigit = false; UInt32 val{}; unsigned char * colonp = nullptr; @@ -97,14 +97,14 @@ inline bool parseIPv6(const char * src, unsigned char * dst) { const auto num = unhex(ch); - if (num != '\xff') + if (num != u8'\xff') { val <<= 4; val |= num; if (val > 0xffffu) return clear_dst(); - saw_xdigit = 1; + saw_xdigit = true; continue; } @@ -204,7 +204,7 @@ inline void formatIPv4(const unsigned char * src, char *& dst, uint8_t mask_tail for (size_t octet = 0; octet < limit; ++octet) { const uint8_t value = static_cast(src[IPV4_BINARY_LENGTH - octet - 1]); - auto rep = one_byte_to_string_lookup_table[value]; + const auto * rep = one_byte_to_string_lookup_table[value]; const uint8_t len = rep[0]; const char* str = rep + 1; diff --git a/src/Common/hex.h b/src/Common/hex.h index db094e1dfd1..a1fa7b32465 100644 --- a/src/Common/hex.h +++ b/src/Common/hex.h @@ -90,12 +90,12 @@ std::string getHexUIntLowercase(TUInt uint_) extern const char * const hex_char_to_digit_table; -inline char unhex(char c) +inline UInt8 unhex(char c) { return hex_char_to_digit_table[static_cast(c)]; } -inline char unhex2(const char * data) +inline UInt8 unhex2(const char * data) { return static_cast(unhex(data[0])) * 0x10 diff --git a/src/Core/ExternalTable.cpp b/src/Core/ExternalTable.cpp index 767ed959950..afc9fe00ef5 100644 --- a/src/Core/ExternalTable.cpp +++ b/src/Core/ExternalTable.cpp @@ -125,19 +125,16 @@ ExternalTable::ExternalTable(const boost::program_options::variables_map & exter } -void ExternalTablesHandler::handlePart(const Poco::Net::MessageHeader & header, std::istream & stream) +void ExternalTablesHandler::handlePart(const Poco::Net::MessageHeader & header, ReadBuffer & stream) { const Settings & settings = context.getSettingsRef(); - /// The buffer is initialized here, not in the virtual function initReadBuffer - read_buffer_impl = std::make_unique(stream); - if (settings.http_max_multipart_form_data_size) read_buffer = std::make_unique( - *read_buffer_impl, settings.http_max_multipart_form_data_size, + stream, settings.http_max_multipart_form_data_size, true, "the maximum size of multipart/form-data. 
This limit can be tuned by 'http_max_multipart_form_data_size' setting"); else - read_buffer = std::move(read_buffer_impl); + read_buffer = wrapReadBufferReference(stream); /// Retrieve a collection of parameters from MessageHeader Poco::Net::NameValueCollection content; diff --git a/src/Core/ExternalTable.h b/src/Core/ExternalTable.h index 0d8e0aaf8ac..aa15846d48a 100644 --- a/src/Core/ExternalTable.h +++ b/src/Core/ExternalTable.h @@ -1,15 +1,14 @@ #pragma once +#include +#include +#include +#include + +#include +#include #include #include -#include -#include - -#include - -#include -#include -#include namespace Poco @@ -51,7 +50,7 @@ public: std::unique_ptr read_buffer; Block sample_block; - virtual ~BaseExternalTable() {} + virtual ~BaseExternalTable() = default; /// Initialize read_buffer, depending on the data source. By default, does nothing. virtual void initReadBuffer() {} @@ -82,24 +81,23 @@ public: void initReadBuffer() override; /// Extract parameters from variables_map, which is built on the client command line - ExternalTable(const boost::program_options::variables_map & external_options); + explicit ExternalTable(const boost::program_options::variables_map & external_options); }; /// Parsing of external table used when sending tables via http /// The `handlePart` function will be called for each table passed, /// so it's also necessary to call `clean` at the end of the `handlePart`. -class ExternalTablesHandler : public Poco::Net::PartHandler, BaseExternalTable +class ExternalTablesHandler : public HTMLForm::PartHandler, BaseExternalTable { public: ExternalTablesHandler(Context & context_, const Poco::Net::NameValueCollection & params_) : context(context_), params(params_) {} - void handlePart(const Poco::Net::MessageHeader & header, std::istream & stream) override; + void handlePart(const Poco::Net::MessageHeader & header, ReadBuffer & stream) override; private: Context & context; const Poco::Net::NameValueCollection & params; - std::unique_ptr read_buffer_impl; }; diff --git a/src/IO/EmptyReadBuffer.h b/src/IO/EmptyReadBuffer.h new file mode 100644 index 00000000000..e2189b9943f --- /dev/null +++ b/src/IO/EmptyReadBuffer.h @@ -0,0 +1,18 @@ +#pragma once + +#include + +namespace DB +{ + +/// Just a stub - reads nothing from nowhere. 
+class EmptyReadBuffer : public ReadBuffer +{ +public: + EmptyReadBuffer() : ReadBuffer(nullptr, 0) {} + +private: + bool nextImpl() override { return false; } +}; + +} diff --git a/src/IO/HTTPChunkedReadBuffer.cpp b/src/IO/HTTPChunkedReadBuffer.cpp new file mode 100644 index 00000000000..bd9bbba4c6c --- /dev/null +++ b/src/IO/HTTPChunkedReadBuffer.cpp @@ -0,0 +1,92 @@ +#include + +#include +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int ARGUMENT_OUT_OF_BOUND; + extern const int UNEXPECTED_END_OF_FILE; + extern const int CORRUPTED_DATA; + extern const int TOO_MANY_BYTES; +} + +size_t HTTPChunkedReadBuffer::readChunkHeader() +{ + if (in->eof()) + throw Exception("Unexpected end of file while reading chunk header of HTTP chunked data", ErrorCodes::UNEXPECTED_END_OF_FILE); + + if (!isHexDigit(*in->position())) + throw Exception("Unexpected data instead of HTTP chunk header", ErrorCodes::CORRUPTED_DATA); + + size_t res = 0; + do + { + if (common::mulOverflow(res, 16ul, res) || common::addOverflow(res, unhex(*in->position()), res)) + throw Exception("Chunk size is out of bounds", ErrorCodes::ARGUMENT_OUT_OF_BOUND); + ++in->position(); + } while (!in->eof() && isHexDigit(*in->position())); + + /// NOTE: If we want to read any chunk extensions, it should be done here. + + skipToCarriageReturnOrEOF(*in); + + if (in->eof()) + throw Exception("Unexpected end of file while reading chunk header of HTTP chunked data", ErrorCodes::UNEXPECTED_END_OF_FILE); + + if (res > max_size) + throw Exception("Chunk size is too large", ErrorCodes::TOO_MANY_BYTES); + + assertString("\n", *in); + return res; +} + +void HTTPChunkedReadBuffer::readChunkFooter() +{ + assertString("\r\n", *in); +} + +bool HTTPChunkedReadBuffer::nextImpl() +{ + if (!in) + return false; + + /// The footer of previous chunk. + if (count()) + readChunkFooter(); + + size_t chunk_size = readChunkHeader(); + if (0 == chunk_size) + { + readChunkFooter(); + in.reset(); // prevent double-eof situation. + return false; + } + + if (in->available() >= chunk_size) + { + /// Zero-copy read from input. + working_buffer = Buffer(in->position(), in->position() + chunk_size); + in->position() += chunk_size; + } + else + { + /// Chunk is not completely in buffer, copy it to scratch space. + memory.resize(chunk_size); + in->readStrict(memory.data(), chunk_size); + working_buffer = Buffer(memory.data(), memory.data() + chunk_size); + } + + /// NOTE: We postpone reading the footer to the next iteration, because it may not be completely in buffer, + /// but we need to keep the current data in buffer available. + + return true; +} + +} diff --git a/src/IO/HTTPChunkedReadBuffer.h b/src/IO/HTTPChunkedReadBuffer.h new file mode 100644 index 00000000000..0ccebc69d08 --- /dev/null +++ b/src/IO/HTTPChunkedReadBuffer.h @@ -0,0 +1,25 @@ +#pragma once + +#include +#include + +namespace DB +{ + +/// Reads data with HTTP Chunked Transfer Encoding. 
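+/// A chunked body is a sequence of "<hex size>\r\n<data>\r\n" frames terminated by a zero-size chunk,
+/// e.g. "4\r\nWiki\r\n5\r\npedia\r\n0\r\n\r\n" decodes to "Wikipedia".
+/// readChunkHeader() parses the hex size, nextImpl() exposes exactly one chunk per call,
+/// and the trailing CRLF of a chunk is consumed on the following call (see the NOTE in nextImpl()).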
+class HTTPChunkedReadBuffer : public BufferWithOwnMemory +{ +public: + HTTPChunkedReadBuffer(std::unique_ptr in_, size_t max_chunk_size) : in(std::move(in_)), max_size(max_chunk_size) {} + +private: + std::unique_ptr in; + const size_t max_size; + + size_t readChunkHeader(); + void readChunkFooter(); + + bool nextImpl() override; +}; + +} diff --git a/src/IO/HTTPCommon.cpp b/src/IO/HTTPCommon.cpp index d12aa10fe6a..346bbf0427e 100644 --- a/src/IO/HTTPCommon.cpp +++ b/src/IO/HTTPCommon.cpp @@ -1,5 +1,6 @@ #include +#include #include #include #include @@ -23,7 +24,6 @@ # include #endif -#include #include #include @@ -266,7 +266,7 @@ namespace }; } -void setResponseDefaultHeaders(Poco::Net::HTTPServerResponse & response, unsigned keep_alive_timeout) +void setResponseDefaultHeaders(HTTPServerResponse & response, unsigned keep_alive_timeout) { if (!response.getKeepAlive()) return; diff --git a/src/IO/HTTPCommon.h b/src/IO/HTTPCommon.h index 4a81d23a8a3..18e83abb83b 100644 --- a/src/IO/HTTPCommon.h +++ b/src/IO/HTTPCommon.h @@ -14,20 +14,13 @@ #include -namespace Poco -{ -namespace Net -{ - class HTTPServerResponse; -} -} - - namespace DB { constexpr int HTTP_TOO_MANY_REQUESTS = 429; +class HTTPServerResponse; + class SingleEndpointHTTPSessionPool : public PoolBase { private: @@ -45,7 +38,7 @@ public: using PooledHTTPSessionPtr = SingleEndpointHTTPSessionPool::Entry; using HTTPSessionPtr = std::shared_ptr; -void setResponseDefaultHeaders(Poco::Net::HTTPServerResponse & response, unsigned keep_alive_timeout); +void setResponseDefaultHeaders(HTTPServerResponse & response, unsigned keep_alive_timeout); /// Create session object to perform requests and set required parameters. HTTPSessionPtr makeHTTPSession(const Poco::URI & uri, const ConnectionTimeouts & timeouts, bool resolve_host = true); @@ -54,7 +47,7 @@ HTTPSessionPtr makeHTTPSession(const Poco::URI & uri, const ConnectionTimeouts & PooledHTTPSessionPtr makePooledHTTPSession(const Poco::URI & uri, const ConnectionTimeouts & timeouts, size_t per_endpoint_pool_size, bool resolve_host = true); PooledHTTPSessionPtr makePooledHTTPSession(const Poco::URI & uri, const Poco::URI & proxy_uri, const ConnectionTimeouts & timeouts, size_t per_endpoint_pool_size, bool resolve_host = true); -bool isRedirect(const Poco::Net::HTTPResponse::HTTPStatus status); +bool isRedirect(Poco::Net::HTTPResponse::HTTPStatus status); /** Used to receive response (response headers and possibly body) * after sending data (request headers and possibly body). @@ -65,5 +58,5 @@ std::istream * receiveResponse( Poco::Net::HTTPClientSession & session, const Poco::Net::HTTPRequest & request, Poco::Net::HTTPResponse & response, bool allow_redirects); void assertResponseIsOk( - const Poco::Net::HTTPRequest & request, Poco::Net::HTTPResponse & response, std::istream & istr, const bool allow_redirects = false); + const Poco::Net::HTTPRequest & request, Poco::Net::HTTPResponse & response, std::istream & istr, bool allow_redirects = false); } diff --git a/src/IO/LimitReadBuffer.cpp b/src/IO/LimitReadBuffer.cpp index baa9e487688..9daffa3a1d3 100644 --- a/src/IO/LimitReadBuffer.cpp +++ b/src/IO/LimitReadBuffer.cpp @@ -14,10 +14,10 @@ namespace ErrorCodes bool LimitReadBuffer::nextImpl() { - assert(position() >= in.position()); + assert(position() >= in->position()); /// Let underlying buffer calculate read bytes in `next()` call. 
- in.position() = position(); + in->position() = position(); if (bytes >= limit) { @@ -27,13 +27,13 @@ bool LimitReadBuffer::nextImpl() return false; } - if (!in.next()) + if (!in->next()) { - working_buffer = in.buffer(); + working_buffer = in->buffer(); return false; } - working_buffer = in.buffer(); + working_buffer = in->buffer(); if (limit - bytes < working_buffer.size()) working_buffer.resize(limit - bytes); @@ -42,14 +42,33 @@ bool LimitReadBuffer::nextImpl() } -LimitReadBuffer::LimitReadBuffer(ReadBuffer & in_, UInt64 limit_, bool throw_exception_, std::string exception_message_) - : ReadBuffer(in_.position(), 0), in(in_), limit(limit_), throw_exception(throw_exception_), exception_message(std::move(exception_message_)) +LimitReadBuffer::LimitReadBuffer(ReadBuffer * in_, bool owns, UInt64 limit_, bool throw_exception_, std::string exception_message_) + : ReadBuffer(in_ ? in_->position() : nullptr, 0) + , in(in_) + , owns_in(owns) + , limit(limit_) + , throw_exception(throw_exception_) + , exception_message(std::move(exception_message_)) { - size_t remaining_bytes_in_buffer = in.buffer().end() - in.position(); + assert(in); + + size_t remaining_bytes_in_buffer = in->buffer().end() - in->position(); if (remaining_bytes_in_buffer > limit) remaining_bytes_in_buffer = limit; - working_buffer = Buffer(in.position(), in.position() + remaining_bytes_in_buffer); + working_buffer = Buffer(in->position(), in->position() + remaining_bytes_in_buffer); +} + + +LimitReadBuffer::LimitReadBuffer(ReadBuffer & in_, UInt64 limit_, bool throw_exception_, std::string exception_message_) + : LimitReadBuffer(&in_, false, limit_, throw_exception_, exception_message_) +{ +} + + +LimitReadBuffer::LimitReadBuffer(std::unique_ptr in_, UInt64 limit_, bool throw_exception_, std::string exception_message_) + : LimitReadBuffer(in_.release(), true, limit_, throw_exception_, exception_message_) +{ } @@ -57,7 +76,10 @@ LimitReadBuffer::~LimitReadBuffer() { /// Update underlying buffer's position in case when limit wasn't reached. 
if (!working_buffer.empty()) - in.position() = position(); + in->position() = position(); + + if (owns_in) + delete in; } } diff --git a/src/IO/LimitReadBuffer.h b/src/IO/LimitReadBuffer.h index db3d2684ef7..a5fa0f0d5cc 100644 --- a/src/IO/LimitReadBuffer.h +++ b/src/IO/LimitReadBuffer.h @@ -12,17 +12,22 @@ namespace DB */ class LimitReadBuffer : public ReadBuffer { +public: + LimitReadBuffer(ReadBuffer & in_, UInt64 limit_, bool throw_exception_, std::string exception_message_ = {}); + LimitReadBuffer(std::unique_ptr in_, UInt64 limit_, bool throw_exception_, std::string exception_message_ = {}); + ~LimitReadBuffer() override; + private: - ReadBuffer & in; + ReadBuffer * in; + bool owns_in; + UInt64 limit; bool throw_exception; std::string exception_message; - bool nextImpl() override; + LimitReadBuffer(ReadBuffer * in_, bool owns, UInt64 limit_, bool throw_exception_, std::string exception_message_); -public: - LimitReadBuffer(ReadBuffer & in_, UInt64 limit_, bool throw_exception_, std::string exception_message_ = {}); - ~LimitReadBuffer() override; + bool nextImpl() override; }; } diff --git a/src/IO/PeekableReadBuffer.cpp b/src/IO/PeekableReadBuffer.cpp index e0e99afbfec..1d999d586b2 100644 --- a/src/IO/PeekableReadBuffer.cpp +++ b/src/IO/PeekableReadBuffer.cpp @@ -1,7 +1,9 @@ #include + namespace DB { + namespace ErrorCodes { extern const int LOGICAL_ERROR; @@ -107,22 +109,29 @@ bool PeekableReadBuffer::peekNext() return sub_buf.next(); } -void PeekableReadBuffer::rollbackToCheckpoint() +void PeekableReadBuffer::rollbackToCheckpoint(bool drop) { checkStateCorrect(); + if (!checkpoint) throw DB::Exception("There is no checkpoint", ErrorCodes::LOGICAL_ERROR); else if (checkpointInOwnMemory() == currentlyReadFromOwnMemory()) pos = *checkpoint; else /// Checkpoint is in own memory and pos is not. Switch to reading from own memory BufferBase::set(memory.data(), peeked_size, *checkpoint - memory.data()); + + if (drop) + dropCheckpoint(); + checkStateCorrect(); } bool PeekableReadBuffer::nextImpl() { - /// FIXME wrong bytes count because it can read the same data again after rollbackToCheckpoint() - /// However, changing bytes count on every call of next() (even after rollback) allows to determine if some pointers were invalidated. + /// FIXME: wrong bytes count because it can read the same data again after rollbackToCheckpoint() + /// however, changing bytes count on every call of next() (even after rollback) allows to determine + /// if some pointers were invalidated. + checkStateCorrect(); bool res; @@ -138,7 +147,7 @@ bool PeekableReadBuffer::nextImpl() if (useSubbufferOnly()) { /// Load next data to sub_buf - sub_buf.position() = pos; + sub_buf.position() = position(); res = sub_buf.next(); } else diff --git a/src/IO/PeekableReadBuffer.h b/src/IO/PeekableReadBuffer.h index e425f9bc953..4f6e669b31d 100644 --- a/src/IO/PeekableReadBuffer.h +++ b/src/IO/PeekableReadBuffer.h @@ -58,7 +58,7 @@ public: /// Sets position at checkpoint. /// All pointers (such as this->buffer().end()) may be invalidated - void rollbackToCheckpoint(); + void rollbackToCheckpoint(bool drop = false); /// If checkpoint and current position are in different buffers, appends data from sub-buffer to own memory, /// so data between checkpoint and position will be in continuous memory. 
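A minimal sketch of how the extended checkpoint API is meant to be used (buffer contents and
names here are illustrative, not part of the patch); the new `drop` flag lets a caller rewind
and discard the checkpoint in one call instead of rollbackToCheckpoint() followed by
dropCheckpoint():

    #include <IO/PeekableReadBuffer.h>
    #include <IO/ReadBufferFromString.h>

    void example()
    {
        DB::ReadBufferFromString nested("first line\nsecond line\n");
        DB::PeekableReadBuffer peekable(nested);

        peekable.setCheckpoint();                           /// remember the current position
        char c;
        while (peekable.read(c) && c != '\n')               /// look ahead up to the end of the line
            ;
        peekable.rollbackToCheckpoint(/* drop = */ true);   /// rewind and drop the checkpoint at once
    }
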
diff --git a/src/IO/ReadBuffer.h b/src/IO/ReadBuffer.h index 5cbe04f8348..e3166ba8180 100644 --- a/src/IO/ReadBuffer.h +++ b/src/IO/ReadBuffer.h @@ -134,15 +134,27 @@ public: tryIgnore(std::numeric_limits::max()); } - /** Reads a single byte. */ - bool ALWAYS_INLINE read(char & c) + /// Peeks a single byte. + bool ALWAYS_INLINE peek(char & c) { if (eof()) return false; - c = *pos++; + c = *pos; return true; } + /// Reads a single byte. + bool ALWAYS_INLINE read(char & c) + { + if (peek(c)) + { + ++pos; + return true; + } + + return false; + } + void ALWAYS_INLINE readStrict(char & c) { if (read(c)) @@ -207,5 +219,39 @@ private: using ReadBufferPtr = std::shared_ptr; +/// Due to inconsistencies in ReadBuffer-family interfaces: +/// - some require to fully wrap underlying buffer and own it, +/// - some just wrap the reference without ownership, +/// we need to be able to wrap reference-only buffers with movable transparent proxy-buffer. +/// The uniqueness of such wraps is responsibility of the code author. +inline std::unique_ptr wrapReadBufferReference(ReadBuffer & buf) +{ + class ReadBufferWrapper : public ReadBuffer + { + public: + explicit ReadBufferWrapper(ReadBuffer & buf_) : ReadBuffer(buf_.position(), 0), buf(buf_) + { + working_buffer = Buffer(buf.position(), buf.buffer().end()); + } + + private: + ReadBuffer & buf; + + bool nextImpl() override + { + buf.position() = position(); + + if (!buf.next()) + return false; + + working_buffer = buf.buffer(); + + return true; + } + }; + + return std::make_unique(buf); +} + } diff --git a/src/IO/ReadBufferFromPocoSocket.cpp b/src/IO/ReadBufferFromPocoSocket.cpp index 2c13446e693..59f0dc25667 100644 --- a/src/IO/ReadBufferFromPocoSocket.cpp +++ b/src/IO/ReadBufferFromPocoSocket.cpp @@ -78,7 +78,7 @@ ReadBufferFromPocoSocket::ReadBufferFromPocoSocket(Poco::Net::Socket & socket_, { } -bool ReadBufferFromPocoSocket::poll(size_t timeout_microseconds) +bool ReadBufferFromPocoSocket::poll(size_t timeout_microseconds) const { return available() || socket.poll(timeout_microseconds, Poco::Net::Socket::SELECT_READ | Poco::Net::Socket::SELECT_ERROR); } diff --git a/src/IO/ReadBufferFromPocoSocket.h b/src/IO/ReadBufferFromPocoSocket.h index 8064cd39246..d182d48d1f8 100644 --- a/src/IO/ReadBufferFromPocoSocket.h +++ b/src/IO/ReadBufferFromPocoSocket.h @@ -1,15 +1,14 @@ #pragma once -#include - -#include #include +#include + +#include namespace DB { -/** Works with the ready Poco::Net::Socket. Blocking operations. - */ +/// Works with the ready Poco::Net::Socket. Blocking operations. 
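+/// poll() reports whether data is already buffered or becomes readable on the socket within the
+/// given timeout, without consuming anything.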
class ReadBufferFromPocoSocket : public BufferWithOwnMemory { protected: @@ -24,9 +23,9 @@ protected: bool nextImpl() override; public: - ReadBufferFromPocoSocket(Poco::Net::Socket & socket_, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE); + explicit ReadBufferFromPocoSocket(Poco::Net::Socket & socket_, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE); - bool poll(size_t timeout_microseconds); + bool poll(size_t timeout_microseconds) const; void setAsyncCallback(std::function async_callback_) { async_callback = std::move(async_callback_); } diff --git a/src/IO/ReadHelpers.cpp b/src/IO/ReadHelpers.cpp index baa12297718..fe563021d2e 100644 --- a/src/IO/ReadHelpers.cpp +++ b/src/IO/ReadHelpers.cpp @@ -1050,6 +1050,25 @@ void readAndThrowException(ReadBuffer & buf, const String & additional_message) } +void skipToCarriageReturnOrEOF(ReadBuffer & buf) +{ + while (!buf.eof()) + { + char * next_pos = find_first_symbols<'\r'>(buf.position(), buf.buffer().end()); + buf.position() = next_pos; + + if (!buf.hasPendingData()) + continue; + + if (*buf.position() == '\r') + { + ++buf.position(); + return; + } + } +} + + void skipToNextLineOrEOF(ReadBuffer & buf) { while (!buf.eof()) diff --git a/src/IO/ReadHelpers.h b/src/IO/ReadHelpers.h index 4482667f447..d203bd7bbee 100644 --- a/src/IO/ReadHelpers.h +++ b/src/IO/ReadHelpers.h @@ -536,7 +536,7 @@ void parseUUID(const UInt8 * src36, std::reverse_iterator dst16); void parseUUIDWithoutSeparator(const UInt8 * src36, std::reverse_iterator dst16); template -void formatHex(IteratorSrc src, IteratorDst dst, const size_t num_bytes); +void formatHex(IteratorSrc src, IteratorDst dst, size_t num_bytes); template @@ -1046,10 +1046,14 @@ void readText(std::vector & x, ReadBuffer & buf) /// Skip whitespace characters. -inline void skipWhitespaceIfAny(ReadBuffer & buf) +inline void skipWhitespaceIfAny(ReadBuffer & buf, bool one_line = false) { - while (!buf.eof() && isWhitespaceASCII(*buf.position())) - ++buf.position(); + if (!one_line) + while (!buf.eof() && isWhitespaceASCII(*buf.position())) + ++buf.position(); + else + while (!buf.eof() && isWhitespaceASCIIOneLine(*buf.position())) + ++buf.position(); } /// Skips json value. @@ -1212,6 +1216,9 @@ inline void skipBOMIfExists(ReadBuffer & buf) /// Skip to next character after next \n. If no \n in stream, skip to end. void skipToNextLineOrEOF(ReadBuffer & buf); +/// Skip to next character after next \r. If no \r in stream, skip to end. +void skipToCarriageReturnOrEOF(ReadBuffer & buf); + /// Skip to next character after next unescaped \n. If no \n in stream, skip to end. Does not throw on invalid escape sequences. 
void skipToUnescapedNextLineOrEOF(ReadBuffer & buf); diff --git a/src/IO/ya.make b/src/IO/ya.make index 2ef8bd0a986..980719aa74f 100644 --- a/src/IO/ya.make +++ b/src/IO/ya.make @@ -26,6 +26,7 @@ SRCS( CascadeWriteBuffer.cpp CompressionMethod.cpp DoubleConverter.cpp + HTTPChunkedReadBuffer.cpp HTTPCommon.cpp HashingWriteBuffer.cpp HexWriteBuffer.cpp @@ -56,7 +57,6 @@ SRCS( WriteBufferFromFileDescriptor.cpp WriteBufferFromFileDescriptorDiscardOnFailure.cpp WriteBufferFromHTTP.cpp - WriteBufferFromHTTPServerResponse.cpp WriteBufferFromOStream.cpp WriteBufferFromPocoSocket.cpp WriteBufferFromTemporaryFile.cpp diff --git a/src/Interpreters/InterserverIOHandler.h b/src/Interpreters/InterserverIOHandler.h index 6d62c9651ca..db95a00d0f7 100644 --- a/src/Interpreters/InterserverIOHandler.h +++ b/src/Interpreters/InterserverIOHandler.h @@ -8,13 +8,13 @@ #include #include #include -#include -#include -#include -#include + #include -namespace Poco { namespace Net { class HTTPServerResponse; } } +#include +#include +#include +#include namespace DB { @@ -25,13 +25,16 @@ namespace ErrorCodes extern const int NO_SUCH_INTERSERVER_IO_ENDPOINT; } +class HTMLForm; +class HTTPServerResponse; + /** Query processor from other servers. */ class InterserverIOEndpoint { public: virtual std::string getId(const std::string & path) const = 0; - virtual void processQuery(const Poco::Net::HTMLForm & params, ReadBuffer & body, WriteBuffer & out, Poco::Net::HTTPServerResponse & response) = 0; + virtual void processQuery(const HTMLForm & params, ReadBuffer & body, WriteBuffer & out, HTTPServerResponse & response) = 0; virtual ~InterserverIOEndpoint() = default; /// You need to stop the data transfer if blocker is activated. diff --git a/src/Server/HTTP/HTMLForm.cpp b/src/Server/HTTP/HTMLForm.cpp new file mode 100644 index 00000000000..ca407858c33 --- /dev/null +++ b/src/Server/HTTP/HTMLForm.cpp @@ -0,0 +1,381 @@ +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + + +namespace DB +{ + +namespace +{ + +class NullPartHandler : public HTMLForm::PartHandler +{ +public: + void handlePart(const Poco::Net::MessageHeader &, ReadBuffer &) override {} +}; + +} + +const std::string HTMLForm::ENCODING_URL = "application/x-www-form-urlencoded"; +const std::string HTMLForm::ENCODING_MULTIPART = "multipart/form-data"; +const int HTMLForm::UNKNOWN_CONTENT_LENGTH = -1; + + +HTMLForm::HTMLForm() : field_limit(DFL_FIELD_LIMIT), value_length_limit(DFL_MAX_VALUE_LENGTH), encoding(ENCODING_URL) +{ +} + + +HTMLForm::HTMLForm(const std::string & encoding_) + : field_limit(DFL_FIELD_LIMIT), value_length_limit(DFL_MAX_VALUE_LENGTH), encoding(encoding_) +{ +} + + +HTMLForm::HTMLForm(const Poco::Net::HTTPRequest & request, ReadBuffer & requestBody, PartHandler & handler) + : field_limit(DFL_FIELD_LIMIT), value_length_limit(DFL_MAX_VALUE_LENGTH) +{ + load(request, requestBody, handler); +} + + +HTMLForm::HTMLForm(const Poco::Net::HTTPRequest & request, ReadBuffer & requestBody) + : field_limit(DFL_FIELD_LIMIT), value_length_limit(DFL_MAX_VALUE_LENGTH) +{ + load(request, requestBody); +} + + +HTMLForm::HTMLForm(const Poco::Net::HTTPRequest & request) : HTMLForm(Poco::URI(request.getURI())) +{ +} + +HTMLForm::HTMLForm(const Poco::URI & uri) : field_limit(DFL_FIELD_LIMIT), value_length_limit(DFL_MAX_VALUE_LENGTH) +{ + ReadBufferFromString istr(uri.getRawQuery()); // STYLE_CHECK_ALLOW_STD_STRING_STREAM + readQuery(istr); +} + + +void HTMLForm::setEncoding(const 
std::string & encoding_) +{ + encoding = encoding_; +} + + +void HTMLForm::addPart(const std::string & name, Poco::Net::PartSource * source) +{ + poco_check_ptr(source); + + Part part; + part.name = name; + part.source = std::unique_ptr(source); + parts.push_back(std::move(part)); +} + + +void HTMLForm::load(const Poco::Net::HTTPRequest & request, ReadBuffer & requestBody, PartHandler & handler) +{ + clear(); + + Poco::URI uri(request.getURI()); + const std::string & query = uri.getRawQuery(); + if (!query.empty()) + { + ReadBufferFromString istr(query); + readQuery(istr); + } + + if (request.getMethod() == Poco::Net::HTTPRequest::HTTP_POST || request.getMethod() == Poco::Net::HTTPRequest::HTTP_PUT) + { + std::string media_type; + NameValueCollection params; + Poco::Net::MessageHeader::splitParameters(request.getContentType(), media_type, params); + encoding = media_type; + if (encoding == ENCODING_MULTIPART) + { + boundary = params["boundary"]; + readMultipart(requestBody, handler); + } + else + { + readQuery(requestBody); + } + } +} + + +void HTMLForm::load(const Poco::Net::HTTPRequest & request, ReadBuffer & requestBody) +{ + NullPartHandler nah; + load(request, requestBody, nah); +} + + +void HTMLForm::load(const Poco::Net::HTTPRequest & request) +{ + NullPartHandler nah; + EmptyReadBuffer nis; + load(request, nis, nah); +} + + +void HTMLForm::read(ReadBuffer & in, PartHandler & handler) +{ + if (encoding == ENCODING_URL) + readQuery(in); + else + readMultipart(in, handler); +} + + +void HTMLForm::read(ReadBuffer & in) +{ + readQuery(in); +} + + +void HTMLForm::read(const std::string & queryString) +{ + ReadBufferFromString istr(queryString); + readQuery(istr); +} + + +void HTMLForm::readQuery(ReadBuffer & in) +{ + size_t fields = 0; + char ch = 0; // silence "uninitialized" warning from gcc-* + bool is_first = true; + + while (true) + { + if (field_limit > 0 && fields == field_limit) + throw Poco::Net::HTMLFormException("Too many form fields"); + + std::string name; + std::string value; + + while (in.read(ch) && ch != '=' && ch != '&') + { + if (ch == '+') + ch = ' '; + if (name.size() < MAX_NAME_LENGTH) + name += ch; + else + throw Poco::Net::HTMLFormException("Field name too long"); + } + + if (ch == '=') + { + while (in.read(ch) && ch != '&') + { + if (ch == '+') + ch = ' '; + if (value.size() < value_length_limit) + value += ch; + else + throw Poco::Net::HTMLFormException("Field value too long"); + } + } + + // Remove UTF-8 BOM from first name, if present + if (is_first) + Poco::UTF8::removeBOM(name); + + std::string decoded_name; + std::string decoded_value; + Poco::URI::decode(name, decoded_name); + Poco::URI::decode(value, decoded_value); + add(decoded_name, decoded_value); + ++fields; + + is_first = false; + + if (in.eof()) + break; + } +} + + +void HTMLForm::readMultipart(ReadBuffer & in_, PartHandler & handler) +{ + /// Assume there is always a boundary provided. 
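+    /// For reference, a multipart/form-data body has the general shape (RFC 2046):
+    ///   --<boundary>\r\n
+    ///   Content-Disposition: form-data; name="field"\r\n
+    ///   \r\n
+    ///   <value>\r\n
+    ///   --<boundary>--\r\n
+    /// Parts carrying a "filename" parameter are streamed to the PartHandler, the rest become form fields.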
+ assert(!boundary.empty()); + + size_t fields = 0; + MultipartReadBuffer in(in_, boundary); + + /// Assume there is at least one part + in.skipToNextBoundary(); + + /// Read each part until next boundary (or last boundary) + while (!in.eof()) + { + if (field_limit && fields > field_limit) + throw Poco::Net::HTMLFormException("Too many form fields"); + + Poco::Net::MessageHeader header; + readHeaders(header, in); + skipToNextLineOrEOF(in); + + NameValueCollection params; + if (header.has("Content-Disposition")) + { + std::string unused; + Poco::Net::MessageHeader::splitParameters(header.get("Content-Disposition"), unused, params); + } + + if (params.has("filename")) + handler.handlePart(header, in); + else + { + std::string name = params["name"]; + std::string value; + char ch; + + while (in.read(ch)) + { + if (value.size() > value_length_limit) + throw Poco::Net::HTMLFormException("Field value too long"); + value += ch; + } + + add(name, value); + } + + ++fields; + + /// If we already encountered EOF for the buffer |in|, it's possible that the next symbol is a start of boundary line. + /// In this case reading the boundary line will reset the EOF state, potentially breaking invariant of EOF idempotency - + /// if there is such invariant in the first place. + if (!in.skipToNextBoundary()) + break; + } +} + + +void HTMLForm::setFieldLimit(int limit) +{ + poco_assert(limit >= 0); + + field_limit = limit; +} + + +void HTMLForm::setValueLengthLimit(int limit) +{ + poco_assert(limit >= 0); + + value_length_limit = limit; +} + + +HTMLForm::MultipartReadBuffer::MultipartReadBuffer(ReadBuffer & in_, const std::string & boundary_) + : ReadBuffer(nullptr, 0), in(in_), boundary("--" + boundary_) +{ + /// For consistency with |nextImpl()| + position() = in.position(); +} + +bool HTMLForm::MultipartReadBuffer::skipToNextBoundary() +{ + assert(working_buffer.empty() || eof()); + assert(boundary_hit); + + boundary_hit = false; + + while (!in.eof()) + { + auto line = readLine(); + if (startsWith(line, boundary)) + { + set(in.position(), 0); + next(); /// We need to restrict our buffer to size of next available line. + return !startsWith(line, boundary + "--"); + } + } + + throw Poco::Net::HTMLFormException("No boundary line found"); +} + +std::string HTMLForm::MultipartReadBuffer::readLine(bool strict) +{ + std::string line; + char ch = 0; // silence "uninitialized" warning from gcc-* + + while (in.read(ch) && ch != '\r' && ch != '\n') + line += ch; + + if (in.eof()) + { + if (strict) + throw Poco::Net::HTMLFormException("Unexpected end of message"); + return line; + } + + line += ch; + + if (ch == '\r') + { + if (!in.read(ch) || ch != '\n') + throw Poco::Net::HTMLFormException("No CRLF found"); + else + line += ch; + } + + return line; +} + +bool HTMLForm::MultipartReadBuffer::nextImpl() +{ + if (boundary_hit) + return false; + + assert(position() >= in.position()); + + in.position() = position(); + + /// We expect to start from the first symbol after EOL, so we can put checkpoint + /// and safely try to read til the next EOL and check for boundary. + in.setCheckpoint(); + + /// FIXME: there is an extra copy because we cannot traverse PeekableBuffer from checkpoint to position() + /// since it may store different data parts in different sub-buffers, + /// anyway calling makeContinuousMemoryFromCheckpointToPos() will also make an extra copy. + std::string line = readLine(false); + + /// According to RFC2046 the preceding CRLF is a part of boundary line. 
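+    /// So when a part ends exactly at a line break, the first line read here is a bare CRLF;
+    /// in that case peek one more line to see whether the boundary follows, otherwise the CRLF is data.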
+ if (line == "\r\n") + { + line = readLine(false); + boundary_hit = startsWith(line, boundary); + if (!boundary_hit) line = "\r\n"; + } + else + boundary_hit = startsWith(line, boundary); + + in.rollbackToCheckpoint(true); + + /// Rolling back to checkpoint may change underlying buffers. + /// Limit readable data to a single line. + BufferBase::set(in.position(), line.size(), 0); + + return !boundary_hit && !line.empty(); +} + +} diff --git a/src/Server/HTTP/HTMLForm.h b/src/Server/HTTP/HTMLForm.h new file mode 100644 index 00000000000..27be712e1d5 --- /dev/null +++ b/src/Server/HTTP/HTMLForm.h @@ -0,0 +1,175 @@ +#pragma once + +#include +#include + +#include +#include +#include +#include +#include + +namespace DB +{ + +class HTMLForm : public Poco::Net::NameValueCollection, private boost::noncopyable +{ +public: + class PartHandler; + + enum Options + { + OPT_USE_CONTENT_LENGTH = 0x01 // don't use Chunked Transfer-Encoding for multipart requests. + }; + + /// Creates an empty HTMLForm and sets the + /// encoding to "application/x-www-form-urlencoded". + HTMLForm(); + + /// Creates an empty HTMLForm that uses the given encoding. + /// Encoding must be either "application/x-www-form-urlencoded" (which is the default) or "multipart/form-data". + explicit HTMLForm(const std::string & encoding); + + /// Creates a HTMLForm from the given HTTP request. + /// Uploaded files are passed to the given PartHandler. + HTMLForm(const Poco::Net::HTTPRequest & request, ReadBuffer & requestBody, PartHandler & handler); + + /// Creates a HTMLForm from the given HTTP request. + /// Uploaded files are silently discarded. + HTMLForm(const Poco::Net::HTTPRequest & request, ReadBuffer & requestBody); + + /// Creates a HTMLForm from the given HTTP request. + /// The request must be a GET request and the form data must be in the query string (URL encoded). + /// For POST requests, you must use one of the constructors taking an additional input stream for the request body. + explicit HTMLForm(const Poco::Net::HTTPRequest & request); + + explicit HTMLForm(const Poco::URI & uri); + + template + T getParsed(const std::string & key, T default_value) + { + auto it = find(key); + return (it != end()) ? DB::parse(it->second) : default_value; + } + + template + T getParsed(const std::string & key) + { + return DB::parse(get(key)); + } + + /// Sets the encoding used for posting the form. + /// Encoding must be either "application/x-www-form-urlencoded" (which is the default) or "multipart/form-data". + void setEncoding(const std::string & encoding); + + /// Returns the encoding used for posting the form. + const std::string & getEncoding() const { return encoding; } + + /// Adds an part/attachment (file upload) to the form. + /// The form takes ownership of the PartSource and deletes it when it is no longer needed. + /// The part will only be sent if the encoding set for the form is "multipart/form-data" + void addPart(const std::string & name, Poco::Net::PartSource * pSource); + + /// Reads the form data from the given HTTP request. + /// Uploaded files are passed to the given PartHandler. + void load(const Poco::Net::HTTPRequest & request, ReadBuffer & requestBody, PartHandler & handler); + + /// Reads the form data from the given HTTP request. + /// Uploaded files are silently discarded. + void load(const Poco::Net::HTTPRequest & request, ReadBuffer & requestBody); + + /// Reads the form data from the given HTTP request. + /// The request must be a GET request and the form data must be in the query string (URL encoded). 
+ /// For POST requests, you must use one of the overloads taking an additional input stream for the request body. + void load(const Poco::Net::HTTPRequest & request); + + /// Reads the form data from the given input stream. + /// The form data read from the stream must be in the encoding specified for the form. + /// Note that read() does not clear the form before reading the new values. + void read(ReadBuffer & in, PartHandler & handler); + + /// Reads the URL-encoded form data from the given input stream. + /// Note that read() does not clear the form before reading the new values. + void read(ReadBuffer & in); + + /// Reads the form data from the given HTTP query string. + /// Note that read() does not clear the form before reading the new values. + void read(const std::string & queryString); + + /// Returns the MIME boundary used for writing multipart form data. + const std::string & getBoundary() const { return boundary; } + + /// Returns the maximum number of header fields allowed. + /// See setFieldLimit() for more information. + int getFieldLimit() const { return field_limit; } + + /// Sets the maximum number of header fields allowed. This limit is used to defend certain kinds of denial-of-service attacks. + /// Specify 0 for unlimited (not recommended). The default limit is 100. + void setFieldLimit(int limit); + + /// Sets the maximum size for form field values stored as strings. + void setValueLengthLimit(int limit); + + /// Returns the maximum size for form field values stored as strings. + int getValueLengthLimit() const { return value_length_limit; } + + static const std::string ENCODING_URL; /// "application/x-www-form-urlencoded" + static const std::string ENCODING_MULTIPART; /// "multipart/form-data" + static const int UNKNOWN_CONTENT_LENGTH; + +protected: + void readQuery(ReadBuffer & in); + void readMultipart(ReadBuffer & in, PartHandler & handler); + +private: + /// This buffer provides data line by line to check for boundary line in a convenient way. + class MultipartReadBuffer; + + enum Limits + { + DFL_FIELD_LIMIT = 100, + MAX_NAME_LENGTH = 1024, + DFL_MAX_VALUE_LENGTH = 256 * 1024 + }; + + struct Part + { + std::string name; + std::unique_ptr source; + }; + + using PartVec = std::vector; + + size_t field_limit; + size_t value_length_limit; + std::string encoding; + std::string boundary; + PartVec parts; +}; + +class HTMLForm::PartHandler +{ +public: + virtual ~PartHandler() = default; + virtual void handlePart(const Poco::Net::MessageHeader &, ReadBuffer &) = 0; +}; + +class HTMLForm::MultipartReadBuffer : public ReadBuffer +{ +public: + MultipartReadBuffer(ReadBuffer & in, const std::string & boundary); + + /// Returns false if last boundary found. 
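+    /// (The last boundary is the closing one, "--<boundary>--", which terminates the multipart body.)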
+ bool skipToNextBoundary(); + +private: + PeekableReadBuffer in; + const std::string boundary; + bool boundary_hit = true; + + std::string readLine(bool strict = true); + + bool nextImpl() override; +}; + +} diff --git a/src/Server/HTTP/HTTPRequest.h b/src/Server/HTTP/HTTPRequest.h new file mode 100644 index 00000000000..40839cbcdd2 --- /dev/null +++ b/src/Server/HTTP/HTTPRequest.h @@ -0,0 +1,10 @@ +#pragma once + +#include + +namespace DB +{ + +using HTTPRequest = Poco::Net::HTTPRequest; + +} diff --git a/src/Server/HTTP/HTTPRequestHandler.h b/src/Server/HTTP/HTTPRequestHandler.h new file mode 100644 index 00000000000..19340866bb7 --- /dev/null +++ b/src/Server/HTTP/HTTPRequestHandler.h @@ -0,0 +1,19 @@ +#pragma once + +#include +#include + +#include + +namespace DB +{ + +class HTTPRequestHandler : private boost::noncopyable +{ +public: + virtual ~HTTPRequestHandler() = default; + + virtual void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response) = 0; +}; + +} diff --git a/src/Server/HTTP/HTTPRequestHandlerFactory.h b/src/Server/HTTP/HTTPRequestHandlerFactory.h new file mode 100644 index 00000000000..3d50bf0a2ed --- /dev/null +++ b/src/Server/HTTP/HTTPRequestHandlerFactory.h @@ -0,0 +1,20 @@ +#pragma once + +#include + +#include + +namespace DB +{ + +class HTTPRequestHandlerFactory : private boost::noncopyable +{ +public: + virtual ~HTTPRequestHandlerFactory() = default; + + virtual std::unique_ptr createRequestHandler(const HTTPServerRequest & request) = 0; +}; + +using HTTPRequestHandlerFactoryPtr = std::shared_ptr; + +} diff --git a/src/Server/HTTP/HTTPResponse.h b/src/Server/HTTP/HTTPResponse.h new file mode 100644 index 00000000000..c73bcec6c39 --- /dev/null +++ b/src/Server/HTTP/HTTPResponse.h @@ -0,0 +1,10 @@ +#pragma once + +#include + +namespace DB +{ + +using HTTPResponse = Poco::Net::HTTPResponse; + +} diff --git a/src/Server/HTTP/HTTPServer.cpp b/src/Server/HTTP/HTTPServer.cpp new file mode 100644 index 00000000000..3e050080bdd --- /dev/null +++ b/src/Server/HTTP/HTTPServer.cpp @@ -0,0 +1,48 @@ +#include + +#include + + +namespace DB +{ +HTTPServer::HTTPServer( + const Context & context, + HTTPRequestHandlerFactoryPtr factory_, + UInt16 portNumber, + Poco::Net::HTTPServerParams::Ptr params) + : TCPServer(new HTTPServerConnectionFactory(context, params, factory_), portNumber, params), factory(factory_) +{ +} + +HTTPServer::HTTPServer( + const Context & context, + HTTPRequestHandlerFactoryPtr factory_, + const Poco::Net::ServerSocket & socket, + Poco::Net::HTTPServerParams::Ptr params) + : TCPServer(new HTTPServerConnectionFactory(context, params, factory_), socket, params), factory(factory_) +{ +} + +HTTPServer::HTTPServer( + const Context & context, + HTTPRequestHandlerFactoryPtr factory_, + Poco::ThreadPool & threadPool, + const Poco::Net::ServerSocket & socket, + Poco::Net::HTTPServerParams::Ptr params) + : TCPServer(new HTTPServerConnectionFactory(context, params, factory_), threadPool, socket, params), factory(factory_) +{ +} + +HTTPServer::~HTTPServer() +{ + /// We should call stop and join thread here instead of destructor of parent TCPHandler, + /// because there's possible race on 'vptr' between this virtual destructor and 'run' method. 
+ stop(); +} + +void HTTPServer::stopAll(bool /* abortCurrent */) +{ + stop(); +} + +} diff --git a/src/Server/HTTP/HTTPServer.h b/src/Server/HTTP/HTTPServer.h new file mode 100644 index 00000000000..1ce62c65ca2 --- /dev/null +++ b/src/Server/HTTP/HTTPServer.h @@ -0,0 +1,46 @@ +#pragma once + +#include + +#include +#include + +#include + + +namespace DB +{ + +class Context; + +class HTTPServer : public Poco::Net::TCPServer +{ +public: + explicit HTTPServer( + const Context & context, + HTTPRequestHandlerFactoryPtr factory, + UInt16 portNumber = 80, + Poco::Net::HTTPServerParams::Ptr params = new Poco::Net::HTTPServerParams); + + HTTPServer( + const Context & context, + HTTPRequestHandlerFactoryPtr factory, + const Poco::Net::ServerSocket & socket, + Poco::Net::HTTPServerParams::Ptr params); + + HTTPServer( + const Context & context, + HTTPRequestHandlerFactoryPtr factory, + Poco::ThreadPool & threadPool, + const Poco::Net::ServerSocket & socket, + Poco::Net::HTTPServerParams::Ptr params); + + ~HTTPServer() override; + + void stopAll(bool abortCurrent = false); + +private: + HTTPRequestHandlerFactoryPtr factory; +}; + +} diff --git a/src/Server/HTTP/HTTPServerConnection.cpp b/src/Server/HTTP/HTTPServerConnection.cpp new file mode 100644 index 00000000000..e2ee4c8882b --- /dev/null +++ b/src/Server/HTTP/HTTPServerConnection.cpp @@ -0,0 +1,128 @@ +#include + +#include + +namespace DB +{ + +HTTPServerConnection::HTTPServerConnection( + const Context & context_, + const Poco::Net::StreamSocket & socket, + Poco::Net::HTTPServerParams::Ptr params_, + HTTPRequestHandlerFactoryPtr factory_) + : TCPServerConnection(socket), context(context_), params(params_), factory(factory_), stopped(false) +{ + poco_check_ptr(factory); +} + +void HTTPServerConnection::run() +{ + std::string server = params->getSoftwareVersion(); + Poco::Net::HTTPServerSession session(socket(), params); + + while (!stopped && session.hasMoreRequests()) + { + try + { + std::unique_lock lock(mutex); + if (!stopped) + { + HTTPServerResponse response(session); + HTTPServerRequest request(context, response, session); + + Poco::Timestamp now; + response.setDate(now); + response.setVersion(request.getVersion()); + response.setKeepAlive(params->getKeepAlive() && request.getKeepAlive() && session.canKeepAlive()); + if (!server.empty()) + response.set("Server", server); + try + { + std::unique_ptr handler(factory->createRequestHandler(request)); + + if (handler) + { + if (request.getExpectContinue() && response.getStatus() == Poco::Net::HTTPResponse::HTTP_OK) + response.sendContinue(); + + handler->handleRequest(request, response); + session.setKeepAlive(params->getKeepAlive() && response.getKeepAlive() && session.canKeepAlive()); + } + else + sendErrorResponse(session, Poco::Net::HTTPResponse::HTTP_NOT_IMPLEMENTED); + } + catch (Poco::Exception &) + { + if (!response.sent()) + { + try + { + sendErrorResponse(session, Poco::Net::HTTPResponse::HTTP_INTERNAL_SERVER_ERROR); + } + catch (...) 
+ { + } + } + throw; + } + } + } + catch (Poco::Net::NoMessageException &) + { + break; + } + catch (Poco::Net::MessageException &) + { + sendErrorResponse(session, Poco::Net::HTTPResponse::HTTP_BAD_REQUEST); + } + catch (Poco::Exception &) + { + if (session.networkException()) + { + session.networkException()->rethrow(); + } + else + throw; + } + } +} + +// static +void HTTPServerConnection::sendErrorResponse(Poco::Net::HTTPServerSession & session, Poco::Net::HTTPResponse::HTTPStatus status) +{ + HTTPServerResponse response(session); + response.setVersion(Poco::Net::HTTPMessage::HTTP_1_1); + response.setStatusAndReason(status); + response.setKeepAlive(false); + response.send(); + session.setKeepAlive(false); +} + +void HTTPServerConnection::onServerStopped(const bool & abortCurrent) +{ + stopped = true; + if (abortCurrent) + { + try + { + socket().shutdown(); + } + catch (...) + { + } + } + else + { + std::unique_lock lock(mutex); + + try + { + socket().shutdown(); + } + catch (...) + { + } + } +} + +} diff --git a/src/Server/HTTP/HTTPServerConnection.h b/src/Server/HTTP/HTTPServerConnection.h new file mode 100644 index 00000000000..589c33025bf --- /dev/null +++ b/src/Server/HTTP/HTTPServerConnection.h @@ -0,0 +1,36 @@ +#pragma once + +#include +#include + +#include +#include +#include + +namespace DB +{ + +class HTTPServerConnection : public Poco::Net::TCPServerConnection +{ +public: + HTTPServerConnection( + const Context & context, + const Poco::Net::StreamSocket & socket, + Poco::Net::HTTPServerParams::Ptr params, + HTTPRequestHandlerFactoryPtr factory); + + void run() override; + +protected: + static void sendErrorResponse(Poco::Net::HTTPServerSession & session, Poco::Net::HTTPResponse::HTTPStatus status); + void onServerStopped(const bool & abortCurrent); + +private: + Context context; + Poco::Net::HTTPServerParams::Ptr params; + HTTPRequestHandlerFactoryPtr factory; + bool stopped; + std::mutex mutex; // guards the |factory| with assumption that creating handlers is not thread-safe. 
+}; + +} diff --git a/src/Server/HTTP/HTTPServerConnectionFactory.cpp b/src/Server/HTTP/HTTPServerConnectionFactory.cpp new file mode 100644 index 00000000000..876ccb9096b --- /dev/null +++ b/src/Server/HTTP/HTTPServerConnectionFactory.cpp @@ -0,0 +1,19 @@ +#include + +#include + +namespace DB +{ +HTTPServerConnectionFactory::HTTPServerConnectionFactory( + const Context & context_, Poco::Net::HTTPServerParams::Ptr params_, HTTPRequestHandlerFactoryPtr factory_) + : context(context_), params(params_), factory(factory_) +{ + poco_check_ptr(factory); +} + +Poco::Net::TCPServerConnection * HTTPServerConnectionFactory::createConnection(const Poco::Net::StreamSocket & socket) +{ + return new HTTPServerConnection(context, socket, params, factory); +} + +} diff --git a/src/Server/HTTP/HTTPServerConnectionFactory.h b/src/Server/HTTP/HTTPServerConnectionFactory.h new file mode 100644 index 00000000000..4f8ca43cbfb --- /dev/null +++ b/src/Server/HTTP/HTTPServerConnectionFactory.h @@ -0,0 +1,25 @@ +#pragma once + +#include +#include + +#include +#include + +namespace DB +{ + +class HTTPServerConnectionFactory : public Poco::Net::TCPServerConnectionFactory +{ +public: + HTTPServerConnectionFactory(const Context & context, Poco::Net::HTTPServerParams::Ptr params, HTTPRequestHandlerFactoryPtr factory); + + Poco::Net::TCPServerConnection * createConnection(const Poco::Net::StreamSocket & socket) override; + +private: + Context context; + Poco::Net::HTTPServerParams::Ptr params; + HTTPRequestHandlerFactoryPtr factory; +}; + +} diff --git a/src/Server/HTTP/HTTPServerRequest.cpp b/src/Server/HTTP/HTTPServerRequest.cpp new file mode 100644 index 00000000000..bdba6a51d91 --- /dev/null +++ b/src/Server/HTTP/HTTPServerRequest.cpp @@ -0,0 +1,123 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace DB +{ + +HTTPServerRequest::HTTPServerRequest(const Context & context, HTTPServerResponse & response, Poco::Net::HTTPServerSession & session) +{ + response.attachRequest(this); + + /// Now that we know socket is still connected, obtain addresses + client_address = session.clientAddress(); + server_address = session.serverAddress(); + + auto receive_timeout = context.getSettingsRef().http_receive_timeout; + auto send_timeout = context.getSettingsRef().http_send_timeout; + auto max_query_size = context.getSettingsRef().max_query_size; + + session.socket().setReceiveTimeout(receive_timeout); + session.socket().setSendTimeout(send_timeout); + + auto in = std::make_unique(session.socket()); + socket = session.socket().impl(); + + readRequest(*in); /// Try parse according to RFC7230 + + if (getChunkedTransferEncoding()) + stream = std::make_unique(std::move(in), max_query_size); + else if (hasContentLength()) + stream = std::make_unique(std::move(in), getContentLength(), false); + else if (getMethod() != HTTPRequest::HTTP_GET && getMethod() != HTTPRequest::HTTP_HEAD && getMethod() != HTTPRequest::HTTP_DELETE) + stream = std::move(in); + else + /// We have to distinguish empty buffer and nullptr. + stream = std::make_unique(); +} + +bool HTTPServerRequest::checkPeerConnected() const +{ + try + { + char b; + if (!socket->receiveBytes(&b, 1, MSG_DONTWAIT | MSG_PEEK)) + return false; + } + catch (Poco::TimeoutException &) + { + } + catch (...) 
+ { + return false; + } + + return true; +} + +void HTTPServerRequest::readRequest(ReadBuffer & in) +{ + char ch; + std::string method; + std::string uri; + std::string version; + + method.reserve(16); + uri.reserve(64); + version.reserve(16); + + if (in.eof()) + throw Poco::Net::NoMessageException(); + + skipWhitespaceIfAny(in); + + if (in.eof()) + throw Poco::Net::MessageException("No HTTP request header"); + + while (in.read(ch) && !Poco::Ascii::isSpace(ch) && method.size() <= MAX_METHOD_LENGTH) + method += ch; + + if (method.size() > MAX_METHOD_LENGTH) + throw Poco::Net::MessageException("HTTP request method invalid or too long"); + + skipWhitespaceIfAny(in); + + while (in.read(ch) && !Poco::Ascii::isSpace(ch) && uri.size() <= MAX_URI_LENGTH) + uri += ch; + + if (uri.size() > MAX_URI_LENGTH) + throw Poco::Net::MessageException("HTTP request URI invalid or too long"); + + skipWhitespaceIfAny(in); + + while (in.read(ch) && !Poco::Ascii::isSpace(ch) && version.size() <= MAX_VERSION_LENGTH) + version += ch; + + if (version.size() > MAX_VERSION_LENGTH) + throw Poco::Net::MessageException("Invalid HTTP version string"); + + // since HTTP always use Windows-style EOL '\r\n' we always can safely skip to '\n' + + skipToNextLineOrEOF(in); + + readHeaders(*this, in); + + skipToNextLineOrEOF(in); + + setMethod(method); + setURI(uri); + setVersion(version); +} + +} diff --git a/src/Server/HTTP/HTTPServerRequest.h b/src/Server/HTTP/HTTPServerRequest.h new file mode 100644 index 00000000000..7fd54850212 --- /dev/null +++ b/src/Server/HTTP/HTTPServerRequest.h @@ -0,0 +1,59 @@ +#pragma once + +#include +#include + +#include + +namespace DB +{ + +class Context; +class HTTPServerResponse; +class ReadBufferFromPocoSocket; + +class HTTPServerRequest : public HTTPRequest +{ +public: + HTTPServerRequest(const Context & context, HTTPServerResponse & response, Poco::Net::HTTPServerSession & session); + + /// FIXME: it's a little bit inconvenient interface. The rationale is that all other ReadBuffer's wrap each other + /// via unique_ptr - but we can't inherit HTTPServerRequest from ReadBuffer and pass it around, + /// since we also need it in other places. + + /// Returns the input stream for reading the request body. + ReadBuffer & getStream() + { + poco_check_ptr(stream); + return *stream; + } + + bool checkPeerConnected() const; + + /// Returns the client's address. + const Poco::Net::SocketAddress & clientAddress() const { return client_address; } + + /// Returns the server's address. 
+ const Poco::Net::SocketAddress & serverAddress() const { return server_address; } + +private: + /// Limits for basic sanity checks when reading a header + enum Limits + { + MAX_NAME_LENGTH = 256, + MAX_VALUE_LENGTH = 8192, + MAX_METHOD_LENGTH = 32, + MAX_URI_LENGTH = 16384, + MAX_VERSION_LENGTH = 8, + MAX_FIELDS_NUMBER = 100, + }; + + std::unique_ptr stream; + Poco::Net::SocketImpl * socket; + Poco::Net::SocketAddress client_address; + Poco::Net::SocketAddress server_address; + + void readRequest(ReadBuffer & in); +}; + +} diff --git a/src/Server/HTTP/HTTPServerResponse.cpp b/src/Server/HTTP/HTTPServerResponse.cpp new file mode 100644 index 00000000000..e3d52fffa80 --- /dev/null +++ b/src/Server/HTTP/HTTPServerResponse.cpp @@ -0,0 +1,163 @@ +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +HTTPServerResponse::HTTPServerResponse(Poco::Net::HTTPServerSession & session_) : session(session_) +{ +} + +void HTTPServerResponse::sendContinue() +{ + Poco::Net::HTTPHeaderOutputStream hs(session); + hs << getVersion() << " 100 Continue\r\n\r\n"; +} + +std::shared_ptr HTTPServerResponse::send() +{ + poco_assert(!stream); + + if ((request && request->getMethod() == HTTPRequest::HTTP_HEAD) || getStatus() < 200 || getStatus() == HTTPResponse::HTTP_NO_CONTENT + || getStatus() == HTTPResponse::HTTP_NOT_MODIFIED) + { + Poco::CountingOutputStream cs; + write(cs); + stream = std::make_shared(session, cs.chars()); + write(*stream); + } + else if (getChunkedTransferEncoding()) + { + Poco::Net::HTTPHeaderOutputStream hs(session); + write(hs); + stream = std::make_shared(session); + } + else if (hasContentLength()) + { + Poco::CountingOutputStream cs; + write(cs); + stream = std::make_shared(session, getContentLength64() + cs.chars()); + write(*stream); + } + else + { + stream = std::make_shared(session); + setKeepAlive(false); + write(*stream); + } + + return stream; +} + +std::pair, std::shared_ptr> HTTPServerResponse::beginSend() +{ + poco_assert(!stream); + poco_assert(!header_stream); + + /// NOTE: Code is not exception safe. 
+ + if ((request && request->getMethod() == HTTPRequest::HTTP_HEAD) || getStatus() < 200 || getStatus() == HTTPResponse::HTTP_NO_CONTENT + || getStatus() == HTTPResponse::HTTP_NOT_MODIFIED) + { + throw Poco::Exception("HTTPServerResponse::beginSend is invalid for HEAD request"); + } + else if (getChunkedTransferEncoding()) + { + header_stream = std::make_shared(session); + beginWrite(*header_stream); + stream = std::make_shared(session); + } + else if (hasContentLength()) + { + throw Poco::Exception("HTTPServerResponse::beginSend is invalid for response with Content-Length header"); + } + else + { + stream = std::make_shared(session); + header_stream = stream; + setKeepAlive(false); + beginWrite(*stream); + } + + return std::make_pair(header_stream, stream); +} + +void HTTPServerResponse::sendFile(const std::string & path, const std::string & mediaType) +{ + poco_assert(!stream); + + Poco::File f(path); + Poco::Timestamp date_time = f.getLastModified(); + Poco::File::FileSize length = f.getSize(); + set("Last-Modified", Poco::DateTimeFormatter::format(date_time, Poco::DateTimeFormat::HTTP_FORMAT)); + setContentLength64(length); + setContentType(mediaType); + setChunkedTransferEncoding(false); + + Poco::FileInputStream istr(path); + if (istr.good()) + { + stream = std::make_shared(session); + write(*stream); + if (request && request->getMethod() != HTTPRequest::HTTP_HEAD) + { + Poco::StreamCopier::copyStream(istr, *stream); + } + } + else + throw Poco::OpenFileException(path); +} + +void HTTPServerResponse::sendBuffer(const void * buffer, std::size_t length) +{ + poco_assert(!stream); + + setContentLength(static_cast(length)); + setChunkedTransferEncoding(false); + + stream = std::make_shared(session); + write(*stream); + if (request && request->getMethod() != HTTPRequest::HTTP_HEAD) + { + stream->write(static_cast(buffer), static_cast(length)); + } +} + +void HTTPServerResponse::redirect(const std::string & uri, HTTPStatus status) +{ + poco_assert(!stream); + + setContentLength(0); + setChunkedTransferEncoding(false); + + setStatusAndReason(status); + set("Location", uri); + + stream = std::make_shared(session); + write(*stream); +} + +void HTTPServerResponse::requireAuthentication(const std::string & realm) +{ + poco_assert(!stream); + + setStatusAndReason(HTTPResponse::HTTP_UNAUTHORIZED); + std::string auth("Basic realm=\""); + auth.append(realm); + auth.append("\""); + set("WWW-Authenticate", auth); +} + +} diff --git a/src/Server/HTTP/HTTPServerResponse.h b/src/Server/HTTP/HTTPServerResponse.h new file mode 100644 index 00000000000..82221ce3a83 --- /dev/null +++ b/src/Server/HTTP/HTTPServerResponse.h @@ -0,0 +1,91 @@ +#pragma once + +#include + +#include +#include + +#include +#include + +namespace DB +{ + +class HTTPServerRequest; + +class HTTPServerResponse : public HTTPResponse +{ +public: + explicit HTTPServerResponse(Poco::Net::HTTPServerSession & session); + + void sendContinue(); /// Sends a 100 Continue response to the client. + + /// Sends the response header to the client and + /// returns an output stream for sending the + /// response body. + /// + /// Must not be called after beginSend(), sendFile(), sendBuffer() + /// or redirect() has been called. + std::shared_ptr send(); /// TODO: use some WriteBuffer implementation here. + + /// Sends the response headers to the client + /// but do not finish headers with \r\n, + /// allowing to continue sending additional header fields. 
+ /// + /// Must not be called after send(), sendFile(), sendBuffer() + /// or redirect() has been called. + std::pair, std::shared_ptr> beginSend(); /// TODO: use some WriteBuffer implementation here. + + /// Sends the response header to the client, followed + /// by the content of the given file. + /// + /// Must not be called after send(), sendBuffer() + /// or redirect() has been called. + /// + /// Throws a FileNotFoundException if the file + /// cannot be found, or an OpenFileException if + /// the file cannot be opened. + void sendFile(const std::string & path, const std::string & mediaType); + + /// Sends the response header to the client, followed + /// by the contents of the given buffer. + /// + /// The Content-Length header of the response is set + /// to length and chunked transfer encoding is disabled. + /// + /// If both the HTTP message header and body (from the + /// given buffer) fit into one single network packet, the + /// complete response can be sent in one network packet. + /// + /// Must not be called after send(), sendFile() + /// or redirect() has been called. + void sendBuffer(const void * pBuffer, std::size_t length); /// FIXME: do we need this one? + + /// Sets the status code, which must be one of + /// HTTP_MOVED_PERMANENTLY (301), HTTP_FOUND (302), + /// or HTTP_SEE_OTHER (303), + /// and sets the "Location" header field + /// to the given URI, which according to + /// the HTTP specification, must be absolute. + /// + /// Must not be called after send() has been called. + void redirect(const std::string & uri, Poco::Net::HTTPResponse::HTTPStatus status = Poco::Net::HTTPResponse::HTTP_FOUND); + + void requireAuthentication(const std::string & realm); + /// Sets the status code to 401 (Unauthorized) + /// and sets the "WWW-Authenticate" header field + /// according to the given realm. + + /// Returns true if the response (header) has been sent. 
+ bool sent() const { return !!stream; } + + void attachRequest(HTTPServerRequest * request_) { request = request_; } + +private: + Poco::Net::HTTPServerSession & session; + HTTPServerRequest * request; + std::shared_ptr stream; + std::shared_ptr header_stream; +}; + +} diff --git a/src/Server/HTTP/ReadHeaders.cpp b/src/Server/HTTP/ReadHeaders.cpp new file mode 100644 index 00000000000..77ec48c11b1 --- /dev/null +++ b/src/Server/HTTP/ReadHeaders.cpp @@ -0,0 +1,88 @@ +#include + +#include +#include + +#include + +namespace DB +{ + +void readHeaders( + Poco::Net::MessageHeader & headers, ReadBuffer & in, size_t max_fields_number, size_t max_name_length, size_t max_value_length) +{ + char ch = 0; // silence uninitialized warning from gcc-* + std::string name; + std::string value; + + name.reserve(32); + value.reserve(64); + + size_t fields = 0; + + while (true) + { + if (fields > max_fields_number) + throw Poco::Net::MessageException("Too many header fields"); + + name.clear(); + value.clear(); + + /// Field name + while (in.peek(ch) && ch != ':' && !Poco::Ascii::isSpace(ch) && name.size() <= max_name_length) + { + name += ch; + in.ignore(); + } + + if (in.eof()) + throw Poco::Net::MessageException("Field is invalid"); + + if (name.empty()) + { + if (ch == '\r') + /// Start of the empty-line delimiter + break; + if (ch == ':') + throw Poco::Net::MessageException("Field name is empty"); + } + else + { + if (name.size() > max_name_length) + throw Poco::Net::MessageException("Field name is too long"); + if (ch != ':') + throw Poco::Net::MessageException("Field name is invalid or no colon found"); + } + + in.ignore(); + + skipWhitespaceIfAny(in, true); + + if (in.eof()) + throw Poco::Net::MessageException("Field is invalid"); + + /// Field value - folded values not supported. 
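+        /// ("Folded" means the obsolete RFC 7230 line folding, where a value continues on the next
+        /// line after leading whitespace; such continuation lines are not joined to the previous value.)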
+ while (in.read(ch) && ch != '\r' && ch != '\n' && value.size() <= max_value_length) + value += ch; + + if (in.eof()) + throw Poco::Net::MessageException("Field is invalid"); + + if (value.empty()) + throw Poco::Net::MessageException("Field value is empty"); + + if (ch == '\n') + throw Poco::Net::MessageException("No CRLF found"); + + if (value.size() > max_value_length) + throw Poco::Net::MessageException("Field value is too long"); + + skipToNextLineOrEOF(in); + + Poco::trimRightInPlace(value); + headers.add(name, headers.decodeWord(value)); + ++fields; + } +} + +} diff --git a/src/Server/HTTP/ReadHeaders.h b/src/Server/HTTP/ReadHeaders.h new file mode 100644 index 00000000000..e94cddcf489 --- /dev/null +++ b/src/Server/HTTP/ReadHeaders.h @@ -0,0 +1,17 @@ +#pragma once + +#include + +namespace DB +{ + +class ReadBuffer; + +void readHeaders( + Poco::Net::MessageHeader & headers, + ReadBuffer & in, + size_t max_fields_number = 100, + size_t max_name_length = 256, + size_t max_value_length = 8192); + +} diff --git a/src/IO/WriteBufferFromHTTPServerResponse.cpp b/src/Server/HTTP/WriteBufferFromHTTPServerResponse.cpp similarity index 81% rename from src/IO/WriteBufferFromHTTPServerResponse.cpp rename to src/Server/HTTP/WriteBufferFromHTTPServerResponse.cpp index ac2eeac1652..86133fc2ffe 100644 --- a/src/IO/WriteBufferFromHTTPServerResponse.cpp +++ b/src/Server/HTTP/WriteBufferFromHTTPServerResponse.cpp @@ -1,9 +1,8 @@ -#include -#include -#include -#include +#include + #include #include +#include #include #include #include @@ -13,6 +12,8 @@ # include #endif +#include + namespace DB { @@ -33,16 +34,13 @@ void WriteBufferFromHTTPServerResponse::startSendHeaders() setResponseDefaultHeaders(response, keep_alive_timeout); -#if defined(POCO_CLICKHOUSE_PATCH) - if (request.getMethod() != Poco::Net::HTTPRequest::HTTP_HEAD) + if (!is_http_method_head) std::tie(response_header_ostr, response_body_ostr) = response.beginSend(); -#endif } } void WriteBufferFromHTTPServerResponse::writeHeaderSummary() { -#if defined(POCO_CLICKHOUSE_PATCH) if (headers_finished_sending) return; @@ -51,12 +49,10 @@ void WriteBufferFromHTTPServerResponse::writeHeaderSummary() if (response_header_ostr) *response_header_ostr << "X-ClickHouse-Summary: " << progress_string_writer.str() << "\r\n" << std::flush; -#endif } void WriteBufferFromHTTPServerResponse::writeHeaderProgress() { -#if defined(POCO_CLICKHOUSE_PATCH) if (headers_finished_sending) return; @@ -65,7 +61,6 @@ void WriteBufferFromHTTPServerResponse::writeHeaderProgress() if (response_header_ostr) *response_header_ostr << "X-ClickHouse-Progress: " << progress_string_writer.str() << "\r\n" << std::flush; -#endif } void WriteBufferFromHTTPServerResponse::finishSendHeaders() @@ -75,23 +70,16 @@ void WriteBufferFromHTTPServerResponse::finishSendHeaders() writeHeaderSummary(); headers_finished_sending = true; - if (request.getMethod() != Poco::Net::HTTPRequest::HTTP_HEAD) + if (!is_http_method_head) { -#if defined(POCO_CLICKHOUSE_PATCH) /// Send end of headers delimiter. 
if (response_header_ostr) *response_header_ostr << "\r\n" << std::flush; -#else - /// Newline autosent by response.send() - /// if nothing to send in body: - if (!response_body_ostr) - response_body_ostr = &(response.send()); -#endif } else { if (!response_body_ostr) - response_body_ostr = &(response.send()); + response_body_ostr = response.send(); } } } @@ -104,23 +92,15 @@ void WriteBufferFromHTTPServerResponse::nextImpl() startSendHeaders(); - if (!out && request.getMethod() != Poco::Net::HTTPRequest::HTTP_HEAD) + if (!out && !is_http_method_head) { if (compress) { auto content_encoding_name = toContentEncodingName(compression_method); -#if defined(POCO_CLICKHOUSE_PATCH) *response_header_ostr << "Content-Encoding: " << content_encoding_name << "\r\n"; -#else - response.set("Content-Encoding", content_encoding_name); -#endif } -#if !defined(POCO_CLICKHOUSE_PATCH) - response_body_ostr = &(response.send()); -#endif - /// We reuse our buffer in "out" to avoid extra allocations and copies. if (compress) @@ -150,14 +130,14 @@ void WriteBufferFromHTTPServerResponse::nextImpl() WriteBufferFromHTTPServerResponse::WriteBufferFromHTTPServerResponse( - Poco::Net::HTTPServerRequest & request_, - Poco::Net::HTTPServerResponse & response_, + HTTPServerResponse & response_, + bool is_http_method_head_, unsigned keep_alive_timeout_, bool compress_, CompressionMethod compression_method_) : BufferWithOwnMemory(DBMS_DEFAULT_BUFFER_SIZE) - , request(request_) , response(response_) + , is_http_method_head(is_http_method_head_) , keep_alive_timeout(keep_alive_timeout_) , compress(compress_) , compression_method(compression_method_) diff --git a/src/IO/WriteBufferFromHTTPServerResponse.h b/src/Server/HTTP/WriteBufferFromHTTPServerResponse.h similarity index 86% rename from src/IO/WriteBufferFromHTTPServerResponse.h rename to src/Server/HTTP/WriteBufferFromHTTPServerResponse.h index 85a81c3dda7..b4ff454195f 100644 --- a/src/IO/WriteBufferFromHTTPServerResponse.h +++ b/src/Server/HTTP/WriteBufferFromHTTPServerResponse.h @@ -1,31 +1,17 @@ #pragma once -#include -#include -#include -#include -#include -#include -#include #include -#include +#include #include #include +#include +#include +#include #include #include -#if !defined(ARCADIA_BUILD) -# include -#endif - - -namespace Poco -{ - namespace Net - { - class HTTPServerResponse; - } -} +#include +#include namespace DB @@ -47,20 +33,17 @@ namespace DB class WriteBufferFromHTTPServerResponse final : public BufferWithOwnMemory { private: - Poco::Net::HTTPServerRequest & request; - Poco::Net::HTTPServerResponse & response; + HTTPServerResponse & response; + bool is_http_method_head; bool add_cors_header = false; unsigned keep_alive_timeout = 0; bool compress = false; CompressionMethod compression_method; int compression_level = 1; - std::ostream * response_body_ostr = nullptr; - -#if defined(POCO_CLICKHOUSE_PATCH) - std::ostream * response_header_ostr = nullptr; -#endif + std::shared_ptr response_body_ostr; + std::shared_ptr response_header_ostr; std::unique_ptr out; @@ -91,8 +74,8 @@ private: public: WriteBufferFromHTTPServerResponse( - Poco::Net::HTTPServerRequest & request_, - Poco::Net::HTTPServerResponse & response_, + HTTPServerResponse & response_, + bool is_http_method_head_, unsigned keep_alive_timeout_, bool compress_ = false, /// If true - set Content-Encoding header and compress the result. 
CompressionMethod compression_method_ = CompressionMethod::None); diff --git a/src/Server/HTTPHandler.cpp b/src/Server/HTTPHandler.cpp index e9a77c3b433..d200ee7421f 100644 --- a/src/Server/HTTPHandler.cpp +++ b/src/Server/HTTPHandler.cpp @@ -1,49 +1,47 @@ -#include "HTTPHandler.h" +#include -#include "HTTPHandlerFactory.h" -#include "HTTPHandlerRequestFilter.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include #include +#include +#include +#include +#include +#include +#include #include #include -#include -#include #include +#include +#include +#include #include #include -#include -#include -#include -#include -#include -#include -#include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include -#include +#include +#include #if !defined(ARCADIA_BUILD) # include #endif +#include +#include +#include +#include + +#include +#include + namespace DB { @@ -237,16 +235,14 @@ HTTPHandler::HTTPHandler(IServer & server_, const std::string & name) void HTTPHandler::processQuery( Context & context, - Poco::Net::HTTPServerRequest & request, + HTTPServerRequest & request, HTMLForm & params, - Poco::Net::HTTPServerResponse & response, + HTTPServerResponse & response, Output & used_output, std::optional & query_scope) { LOG_TRACE(log, "Request URI: {}", request.getURI()); - std::istream & istr = request.stream(); - /// The user and password can be passed by headers (similar to X-Auth-*), /// which is used by load balancers to pass authentication information. std::string user = request.get("X-ClickHouse-User", ""); @@ -291,9 +287,9 @@ void HTTPHandler::processQuery( client_info.interface = ClientInfo::Interface::HTTP; ClientInfo::HTTPMethod http_method = ClientInfo::HTTPMethod::UNKNOWN; - if (request.getMethod() == Poco::Net::HTTPServerRequest::HTTP_GET) + if (request.getMethod() == HTTPServerRequest::HTTP_GET) http_method = ClientInfo::HTTPMethod::GET; - else if (request.getMethod() == Poco::Net::HTTPServerRequest::HTTP_POST) + else if (request.getMethod() == HTTPServerRequest::HTTP_POST) http_method = ClientInfo::HTTPMethod::POST; client_info.http_method = http_method; @@ -356,10 +352,8 @@ void HTTPHandler::processQuery( } #endif - // Set the query id supplied by the user, if any, and also update the - // OpenTelemetry fields. - context.setCurrentQueryId(params.get("query_id", - request.get("X-ClickHouse-Query-Id", ""))); + // Set the query id supplied by the user, if any, and also update the OpenTelemetry fields. + context.setCurrentQueryId(params.get("query_id", request.get("X-ClickHouse-Query-Id", ""))); client_info.initial_query_id = client_info.current_query_id; @@ -405,7 +399,11 @@ void HTTPHandler::processQuery( unsigned keep_alive_timeout = config.getUInt("keep_alive_timeout", 10); used_output.out = std::make_shared( - request, response, keep_alive_timeout, client_supports_http_compression, http_response_compression_method); + response, + request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, + keep_alive_timeout, + client_supports_http_compression, + http_response_compression_method); if (internal_compression) used_output.out_maybe_compressed = std::make_shared(*used_output.out); @@ -459,8 +457,8 @@ void HTTPHandler::processQuery( /// Request body can be compressed using algorithm specified in the Content-Encoding header. 
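    /// Editor's note (illustrative usage, not part of the original patch; the exact set of
    /// supported codecs depends on chooseCompressionMethod): a client may send a pre-compressed
    /// body, for example
    ///     echo 'SELECT 1' | gzip | curl --data-binary @- -H 'Content-Encoding: gzip' 'http://localhost:8123/'
    /// and the wrapped read buffer created below decompresses it transparently before the query
    /// text is parsed.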
String http_request_compression_method_str = request.get("Content-Encoding", ""); - std::unique_ptr in_post = wrapReadBufferWithCompressionMethod( - std::make_unique(istr), chooseCompressionMethod({}, http_request_compression_method_str)); + auto in_post = wrapReadBufferWithCompressionMethod( + wrapReadBufferReference(request.getStream()), chooseCompressionMethod({}, http_request_compression_method_str)); /// The data can also be compressed using incompatible internal algorithm. This is indicated by /// 'decompress' query parameter. @@ -513,7 +511,7 @@ void HTTPHandler::processQuery( const auto & settings = context.getSettingsRef(); /// Only readonly queries are allowed for HTTP GET requests. - if (request.getMethod() == Poco::Net::HTTPServerRequest::HTTP_GET) + if (request.getMethod() == HTTPServerRequest::HTTP_GET) { if (settings.readonly == 0) context.setSetting("readonly", 2); @@ -608,26 +606,12 @@ void HTTPHandler::processQuery( if (settings.readonly > 0 && settings.cancel_http_readonly_queries_on_client_close) { - Poco::Net::StreamSocket & socket = dynamic_cast(request).socket(); - - append_callback([&context, &socket](const Progress &) + append_callback([&context, &request](const Progress &) { - /// Assume that at the point this method is called no one is reading data from the socket any more. - /// True for read-only queries. - try - { - char b; - int status = socket.receiveBytes(&b, 1, MSG_DONTWAIT | MSG_PEEK); - if (status == 0) - context.killCurrentQuery(); - } - catch (Poco::TimeoutException &) - { - } - catch (...) - { + /// Assume that at the point this method is called no one is reading data from the socket any more: + /// should be true for read-only queries. + if (!request.checkPeerConnected()) context.killCurrentQuery(); - } }); } @@ -656,22 +640,23 @@ void HTTPHandler::processQuery( used_output.out->finalize(); } -void HTTPHandler::trySendExceptionToClient(const std::string & s, int exception_code, - Poco::Net::HTTPServerRequest & request, Poco::Net::HTTPServerResponse & response, - Output & used_output) +void HTTPHandler::trySendExceptionToClient( + const std::string & s, int exception_code, HTTPServerRequest & request, HTTPServerResponse & response, Output & used_output) { try { response.set("X-ClickHouse-Exception-Code", toString(exception_code)); + /// FIXME: make sure that no one else is reading from the same stream at the moment. + /// If HTTP method is POST and Keep-Alive is turned on, we should read the whole request body /// to avoid reading part of the current request body in the next request. if (request.getMethod() == Poco::Net::HTTPRequest::HTTP_POST && response.getKeepAlive() - && !request.stream().eof() - && exception_code != ErrorCodes::HTTP_LENGTH_REQUIRED) + && exception_code != ErrorCodes::HTTP_LENGTH_REQUIRED + && !request.getStream().eof()) { - request.stream().ignore(std::numeric_limits::max()); + request.getStream().ignoreAll(); } bool auth_fail = exception_code == ErrorCodes::UNKNOWN_USER || @@ -690,7 +675,7 @@ void HTTPHandler::trySendExceptionToClient(const std::string & s, int exception_ if (!response.sent() && !used_output.out_maybe_compressed) { /// If nothing was sent yet and we don't even know if we must compress the response. 
- response.send() << s << std::endl; + *response.send() << s << std::endl; } else if (used_output.out_maybe_compressed) { @@ -717,6 +702,11 @@ void HTTPHandler::trySendExceptionToClient(const std::string & s, int exception_ used_output.out_maybe_compressed->next(); used_output.out->finalize(); } + else + { + assert(false); + __builtin_unreachable(); + } } catch (...) { @@ -725,7 +715,7 @@ void HTTPHandler::trySendExceptionToClient(const std::string & s, int exception_ } -void HTTPHandler::handleRequest(Poco::Net::HTTPServerRequest & request, Poco::Net::HTTPServerResponse & response) +void HTTPHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse & response) { setThreadName("HTTPHandler"); ThreadStatus thread_status; @@ -746,17 +736,18 @@ void HTTPHandler::handleRequest(Poco::Net::HTTPServerRequest & request, Poco::Ne response.setContentType("text/plain; charset=UTF-8"); response.set("X-ClickHouse-Server-Display-Name", server_display_name); /// For keep-alive to work. - if (request.getVersion() == Poco::Net::HTTPServerRequest::HTTP_1_1) + if (request.getVersion() == HTTPServerRequest::HTTP_1_1) response.setChunkedTransferEncoding(true); HTMLForm params(request); with_stacktrace = params.getParsed("stacktrace", false); /// Workaround. Poco does not detect 411 Length Required case. - if (request.getMethod() == Poco::Net::HTTPRequest::HTTP_POST && !request.getChunkedTransferEncoding() && - !request.hasContentLength()) + if (request.getMethod() == HTTPRequest::HTTP_POST && !request.getChunkedTransferEncoding() && !request.hasContentLength()) { - throw Exception("The Transfer-Encoding is not chunked and there is no Content-Length header for POST request", ErrorCodes::HTTP_LENGTH_REQUIRED); + throw Exception( + "The Transfer-Encoding is not chunked and there is no Content-Length header for POST request", + ErrorCodes::HTTP_LENGTH_REQUIRED); } processQuery(context, request, params, response, used_output, query_scope); @@ -800,7 +791,7 @@ bool DynamicQueryHandler::customizeQueryParam(Context & context, const std::stri return false; } -std::string DynamicQueryHandler::getQuery(Poco::Net::HTTPServerRequest & request, HTMLForm & params, Context & context) +std::string DynamicQueryHandler::getQuery(HTTPServerRequest & request, HTMLForm & params, Context & context) { if (likely(!startsWith(request.getContentType(), "multipart/form-data"))) { @@ -814,7 +805,7 @@ std::string DynamicQueryHandler::getQuery(Poco::Net::HTTPServerRequest & request /// Support for "external data for query processing". /// Used in case of POST request with form-data, but it isn't expected to be deleted after that scope. 
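    /// Editor's note (assumption based on the surrounding code): "external data" are temporary
    /// tables shipped inside the multipart/form-data body together with <name>_structure or
    /// <name>_format parameters; ExternalTablesHandler materializes them in the query context
    /// while HTMLForm parses the request, and the query text itself is still assembled from the
    /// remaining parameters below.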
ExternalTablesHandler handler(context, params); - params.load(request, request.stream(), handler); + params.load(request, request.getStream(), handler); std::string full_query; /// Params are of both form params POST and uri (GET params) @@ -844,7 +835,7 @@ bool PredefinedQueryHandler::customizeQueryParam(Context & context, const std::s return false; } -void PredefinedQueryHandler::customizeContext(Poco::Net::HTTPServerRequest & request, DB::Context & context) +void PredefinedQueryHandler::customizeContext(HTTPServerRequest & request, DB::Context & context) { /// If in the configuration file, the handler's header is regex and contains named capture group /// We will extract regex named capture groups as query parameters @@ -880,22 +871,26 @@ void PredefinedQueryHandler::customizeContext(Poco::Net::HTTPServerRequest & req } } -std::string PredefinedQueryHandler::getQuery(Poco::Net::HTTPServerRequest & request, HTMLForm & params, Context & context) +std::string PredefinedQueryHandler::getQuery(HTTPServerRequest & request, HTMLForm & params, Context & context) { if (unlikely(startsWith(request.getContentType(), "multipart/form-data"))) { /// Support for "external data for query processing". ExternalTablesHandler handler(context, params); - params.load(request, request.stream(), handler); + params.load(request, request.getStream(), handler); } return predefined_query; } -Poco::Net::HTTPRequestHandlerFactory * createDynamicHandlerFactory(IServer & server, const std::string & config_prefix) +HTTPRequestHandlerFactoryPtr createDynamicHandlerFactory(IServer & server, const std::string & config_prefix) { - std::string query_param_name = server.config().getString(config_prefix + ".handler.query_param_name", "query"); - return addFiltersFromConfig(new HandlingRuleHTTPHandlerFactory(server, std::move(query_param_name)), server.config(), config_prefix); + const auto & query_param_name = server.config().getString(config_prefix + ".handler.query_param_name", "query"); + auto factory = std::make_shared>(server, std::move(query_param_name)); + + factory->addFiltersFromConfig(server.config(), config_prefix); + + return factory; } static inline bool capturingNamedQueryParam(NameSet receive_params, const CompiledRegexPtr & compiled_regex) @@ -913,18 +908,20 @@ static inline CompiledRegexPtr getCompiledRegex(const std::string & expression) auto compiled_regex = std::make_shared(expression); if (!compiled_regex->ok()) - throw Exception("Cannot compile re2: " + expression + " for http handling rule, error: " + - compiled_regex->error() + ". Look at https://github.com/google/re2/wiki/Syntax for reference.", ErrorCodes::CANNOT_COMPILE_REGEXP); + throw Exception( + "Cannot compile re2: " + expression + " for http handling rule, error: " + compiled_regex->error() + + ". 
Look at https://github.com/google/re2/wiki/Syntax for reference.", + ErrorCodes::CANNOT_COMPILE_REGEXP); return compiled_regex; } -Poco::Net::HTTPRequestHandlerFactory * createPredefinedHandlerFactory(IServer & server, const std::string & config_prefix) +HTTPRequestHandlerFactoryPtr createPredefinedHandlerFactory(IServer & server, const std::string & config_prefix) { Poco::Util::AbstractConfiguration & configuration = server.config(); if (!configuration.has(config_prefix + ".handler.query")) - throw Exception("There is no path '" + config_prefix + ".handler.query" + "' in configuration file.", ErrorCodes::NO_ELEMENTS_IN_CONFIG); + throw Exception("There is no path '" + config_prefix + ".handler.query' in configuration file.", ErrorCodes::NO_ELEMENTS_IN_CONFIG); std::string predefined_query = configuration.getString(config_prefix + ".handler.query"); NameSet analyze_receive_params = analyzeReceiveQueryParams(predefined_query); @@ -946,6 +943,8 @@ Poco::Net::HTTPRequestHandlerFactory * createPredefinedHandlerFactory(IServer & headers_name_with_regex.emplace(std::make_pair(header_name, regex)); } + std::shared_ptr> factory; + if (configuration.has(config_prefix + ".url")) { auto url_expression = configuration.getString(config_prefix + ".url"); @@ -955,14 +954,23 @@ Poco::Net::HTTPRequestHandlerFactory * createPredefinedHandlerFactory(IServer & auto regex = getCompiledRegex(url_expression); if (capturingNamedQueryParam(analyze_receive_params, regex)) - return addFiltersFromConfig(new HandlingRuleHTTPHandlerFactory( - server, std::move(analyze_receive_params), std::move(predefined_query), std::move(regex), - std::move(headers_name_with_regex)), configuration, config_prefix); + { + factory = std::make_shared>( + server, + std::move(analyze_receive_params), + std::move(predefined_query), + std::move(regex), + std::move(headers_name_with_regex)); + factory->addFiltersFromConfig(configuration, config_prefix); + return factory; + } } - return addFiltersFromConfig(new HandlingRuleHTTPHandlerFactory( - server, std::move(analyze_receive_params), std::move(predefined_query), CompiledRegexPtr{} ,std::move(headers_name_with_regex)), - configuration, config_prefix); + factory = std::make_shared>( + server, std::move(analyze_receive_params), std::move(predefined_query), CompiledRegexPtr{}, std::move(headers_name_with_regex)); + factory->addFiltersFromConfig(configuration, config_prefix); + + return factory; } } diff --git a/src/Server/HTTPHandler.h b/src/Server/HTTPHandler.h index 96727df5404..e903fbfbff7 100644 --- a/src/Server/HTTPHandler.h +++ b/src/Server/HTTPHandler.h @@ -1,13 +1,10 @@ #pragma once -#include "IServer.h" - -#include - -#include -#include -#include #include +#include +#include +#include +#include #include @@ -21,23 +18,24 @@ namespace Poco { class Logger; } namespace DB { +class IServer; class WriteBufferFromHTTPServerResponse; using CompiledRegexPtr = std::shared_ptr; -class HTTPHandler : public Poco::Net::HTTPRequestHandler +class HTTPHandler : public HTTPRequestHandler { public: - explicit HTTPHandler(IServer & server_, const std::string & name); + HTTPHandler(IServer & server_, const std::string & name); - void handleRequest(Poco::Net::HTTPServerRequest & request, Poco::Net::HTTPServerResponse & response) override; + void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response) override; /// This method is called right before the query execution. 
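    /// Editor's note (added commentary): PredefinedQueryHandler (declared further below) uses this
    /// hook to copy values captured by the configured URL/header regexes into the query context as
    /// query parameters before the predefined query is executed.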
- virtual void customizeContext(Poco::Net::HTTPServerRequest & /*request*/, Context & /* context */) {} + virtual void customizeContext(HTTPServerRequest & /* request */, Context & /* context */) {} virtual bool customizeQueryParam(Context & context, const std::string & key, const std::string & value) = 0; - virtual std::string getQuery(Poco::Net::HTTPServerRequest & request, HTMLForm & params, Context & context) = 0; + virtual std::string getQuery(HTTPServerRequest & request, HTMLForm & params, Context & context) = 0; private: struct Output @@ -74,17 +72,17 @@ private: /// Also initializes 'used_output'. void processQuery( Context & context, - Poco::Net::HTTPServerRequest & request, + HTTPServerRequest & request, HTMLForm & params, - Poco::Net::HTTPServerResponse & response, + HTTPServerResponse & response, Output & used_output, std::optional & query_scope); void trySendExceptionToClient( const std::string & s, int exception_code, - Poco::Net::HTTPServerRequest & request, - Poco::Net::HTTPServerResponse & response, + HTTPServerRequest & request, + HTTPServerResponse & response, Output & used_output); static void pushDelayedResults(Output & used_output); @@ -97,7 +95,7 @@ private: public: explicit DynamicQueryHandler(IServer & server_, const std::string & param_name_ = "query"); - std::string getQuery(Poco::Net::HTTPServerRequest & request, HTMLForm & params, Context & context) override; + std::string getQuery(HTTPServerRequest & request, HTMLForm & params, Context & context) override; bool customizeQueryParam(Context &context, const std::string &key, const std::string &value) override; }; @@ -114,9 +112,9 @@ public: IServer & server_, const NameSet & receive_params_, const std::string & predefined_query_ , const CompiledRegexPtr & url_regex_, const std::unordered_map & header_name_with_regex_); - virtual void customizeContext(Poco::Net::HTTPServerRequest & request, Context & context) override; + virtual void customizeContext(HTTPServerRequest & request, Context & context) override; - std::string getQuery(Poco::Net::HTTPServerRequest & request, HTMLForm & params, Context & context) override; + std::string getQuery(HTTPServerRequest & request, HTMLForm & params, Context & context) override; bool customizeQueryParam(Context & context, const std::string & key, const std::string & value) override; }; diff --git a/src/Server/HTTPHandlerFactory.cpp b/src/Server/HTTPHandlerFactory.cpp index 9eac60355d2..db80750beb8 100644 --- a/src/Server/HTTPHandlerFactory.cpp +++ b/src/Server/HTTPHandlerFactory.cpp @@ -1,4 +1,7 @@ -#include "HTTPHandlerFactory.h" +#include + +#include +#include #include @@ -29,7 +32,7 @@ HTTPRequestHandlerFactoryMain::HTTPRequestHandlerFactoryMain(const std::string & { } -Poco::Net::HTTPRequestHandler * HTTPRequestHandlerFactoryMain::createRequestHandler(const Poco::Net::HTTPServerRequest & request) +std::unique_ptr HTTPRequestHandlerFactoryMain::createRequestHandler(const HTTPServerRequest & request) { LOG_TRACE(log, "HTTP Request for {}. 
Method: {}, Address: {}, User-Agent: {}{}, Content Type: {}, Transfer Encoding: {}, X-Forwarded-For: {}", name, request.getMethod(), request.clientAddress().toString(), request.get("User-Agent", "(none)"), @@ -38,8 +41,8 @@ Poco::Net::HTTPRequestHandler * HTTPRequestHandlerFactoryMain::createRequestHand for (auto & handler_factory : child_factories) { - auto * handler = handler_factory->createRequestHandler(request); - if (handler != nullptr) + auto handler = handler_factory->createRequestHandler(request); + if (handler) return handler; } @@ -47,31 +50,16 @@ Poco::Net::HTTPRequestHandler * HTTPRequestHandlerFactoryMain::createRequestHand || request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD || request.getMethod() == Poco::Net::HTTPRequest::HTTP_POST) { - return new NotFoundHandler; + return std::unique_ptr(new NotFoundHandler); } return nullptr; } -HTTPRequestHandlerFactoryMain::~HTTPRequestHandlerFactoryMain() -{ - while (!child_factories.empty()) - { - delete child_factories.back(); - child_factories.pop_back(); - } -} - -HTTPRequestHandlerFactoryMain::TThis * HTTPRequestHandlerFactoryMain::addHandler(Poco::Net::HTTPRequestHandlerFactory * child_factory) -{ - child_factories.emplace_back(child_factory); - return this; -} - static inline auto createHandlersFactoryFromConfig( IServer & server, const std::string & name, const String & prefix, AsynchronousMetrics & async_metrics) { - auto main_handler_factory = std::make_unique(name); + auto main_handler_factory = std::make_shared(name); Poco::Util::AbstractConfiguration::Keys keys; server.config().keys(prefix, keys); @@ -109,10 +97,11 @@ static inline auto createHandlersFactoryFromConfig( ErrorCodes::UNKNOWN_ELEMENT_IN_CONFIG); } - return main_handler_factory.release(); + return main_handler_factory; } -static inline Poco::Net::HTTPRequestHandlerFactory * createHTTPHandlerFactory(IServer & server, const std::string & name, AsynchronousMetrics & async_metrics) +static inline HTTPRequestHandlerFactoryPtr +createHTTPHandlerFactory(IServer & server, const std::string & name, AsynchronousMetrics & async_metrics) { if (server.config().has("http_handlers")) { @@ -120,25 +109,25 @@ static inline Poco::Net::HTTPRequestHandlerFactory * createHTTPHandlerFactory(IS } else { - auto factory = std::make_unique(name); + auto factory = std::make_shared(name); addDefaultHandlersFactory(*factory, server, async_metrics); - return factory.release(); + return factory; } } -static inline Poco::Net::HTTPRequestHandlerFactory * createInterserverHTTPHandlerFactory(IServer & server, const std::string & name) +static inline HTTPRequestHandlerFactoryPtr createInterserverHTTPHandlerFactory(IServer & server, const std::string & name) { - auto factory = std::make_unique(name); + auto factory = std::make_shared(name); addCommonDefaultHandlersFactory(*factory, server); - auto main_handler = std::make_unique>(server); + auto main_handler = std::make_shared>(server); main_handler->allowPostAndGetParamsRequest(); - factory->addHandler(main_handler.release()); + factory->addHandler(main_handler); - return factory.release(); + return factory; } -Poco::Net::HTTPRequestHandlerFactory * createHandlerFactory(IServer & server, AsynchronousMetrics & async_metrics, const std::string & name) +HTTPRequestHandlerFactoryPtr createHandlerFactory(IServer & server, AsynchronousMetrics & async_metrics, const std::string & name) { if (name == "HTTPHandler-factory" || name == "HTTPSHandler-factory") return createHTTPHandlerFactory(server, name, async_metrics); @@ -146,12 +135,13 @@ 
Poco::Net::HTTPRequestHandlerFactory * createHandlerFactory(IServer & server, As return createInterserverHTTPHandlerFactory(server, name); else if (name == "PrometheusHandler-factory") { - auto factory = std::make_unique(name); - auto handler = std::make_unique>( + auto factory = std::make_shared(name); + auto handler = std::make_shared>( server, PrometheusMetricsWriter(server.config(), "prometheus", async_metrics)); - handler->attachStrictPath(server.config().getString("prometheus.endpoint", "/metrics"))->allowGetAndHeadRequest(); - factory->addHandler(handler.release()); - return factory.release(); + handler->attachStrictPath(server.config().getString("prometheus.endpoint", "/metrics")); + handler->allowGetAndHeadRequest(); + factory->addHandler(handler); + return factory; } throw Exception("LOGICAL ERROR: Unknown HTTP handler factory name.", ErrorCodes::LOGICAL_ERROR); @@ -162,39 +152,44 @@ static const auto root_response_expression = "config://http_server_default_respo void addCommonDefaultHandlersFactory(HTTPRequestHandlerFactoryMain & factory, IServer & server) { - auto root_handler = std::make_unique>(server, root_response_expression); - root_handler->attachStrictPath("/")->allowGetAndHeadRequest(); - factory.addHandler(root_handler.release()); + auto root_handler = std::make_shared>(server, root_response_expression); + root_handler->attachStrictPath("/"); + root_handler->allowGetAndHeadRequest(); + factory.addHandler(root_handler); - auto ping_handler = std::make_unique>(server, ping_response_expression); - ping_handler->attachStrictPath("/ping")->allowGetAndHeadRequest(); - factory.addHandler(ping_handler.release()); + auto ping_handler = std::make_shared>(server, ping_response_expression); + ping_handler->attachStrictPath("/ping"); + ping_handler->allowGetAndHeadRequest(); + factory.addHandler(ping_handler); - auto replicas_status_handler = std::make_unique>(server); - replicas_status_handler->attachNonStrictPath("/replicas_status")->allowGetAndHeadRequest(); - factory.addHandler(replicas_status_handler.release()); + auto replicas_status_handler = std::make_shared>(server); + replicas_status_handler->attachNonStrictPath("/replicas_status"); + replicas_status_handler->allowGetAndHeadRequest(); + factory.addHandler(replicas_status_handler); - auto web_ui_handler = std::make_unique>(server, "play.html"); - web_ui_handler->attachNonStrictPath("/play")->allowGetAndHeadRequest(); - factory.addHandler(web_ui_handler.release()); + auto web_ui_handler = std::make_shared>(server, "play.html"); + web_ui_handler->attachNonStrictPath("/play"); + web_ui_handler->allowGetAndHeadRequest(); + factory.addHandler(web_ui_handler); } void addDefaultHandlersFactory(HTTPRequestHandlerFactoryMain & factory, IServer & server, AsynchronousMetrics & async_metrics) { addCommonDefaultHandlersFactory(factory, server); - auto query_handler = std::make_unique>(server, "query"); + auto query_handler = std::make_shared>(server, "query"); query_handler->allowPostAndGetParamsRequest(); - factory.addHandler(query_handler.release()); + factory.addHandler(query_handler); /// We check that prometheus handler will be served on current (default) port. /// Otherwise it will be created separately, see createHandlerFactory(...). 
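    /// Editor's note (assumption): prometheus.port == 0 (or unset) is treated as "no dedicated
    /// Prometheus port", so the /metrics endpoint is attached to this default HTTP factory instead
    /// of the separate "PrometheusHandler-factory" built in createHandlerFactory above.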
if (server.config().has("prometheus") && server.config().getInt("prometheus.port", 0) == 0) { - auto prometheus_handler = std::make_unique>( + auto prometheus_handler = std::make_shared>( server, PrometheusMetricsWriter(server.config(), "prometheus", async_metrics)); - prometheus_handler->attachStrictPath(server.config().getString("prometheus.endpoint", "/metrics"))->allowGetAndHeadRequest(); - factory.addHandler(prometheus_handler.release()); + prometheus_handler->attachStrictPath(server.config().getString("prometheus.endpoint", "/metrics")); + prometheus_handler->allowGetAndHeadRequest(); + factory.addHandler(prometheus_handler); } } diff --git a/src/Server/HTTPHandlerFactory.h b/src/Server/HTTPHandlerFactory.h index 3e8313172eb..6297f988eaa 100644 --- a/src/Server/HTTPHandlerFactory.h +++ b/src/Server/HTTPHandlerFactory.h @@ -1,82 +1,102 @@ #pragma once -#include "IServer.h" -#include -#include -#include -#include -#include -#include #include +#include +#include +#include +#include +#include + +#include namespace DB { -/// Handle request using child handlers -class HTTPRequestHandlerFactoryMain : public Poco::Net::HTTPRequestHandlerFactory, boost::noncopyable +namespace ErrorCodes { -private: - using TThis = HTTPRequestHandlerFactoryMain; + extern const int UNKNOWN_ELEMENT_IN_CONFIG; +} +class IServer; + +/// Handle request using child handlers +class HTTPRequestHandlerFactoryMain : public HTTPRequestHandlerFactory +{ +public: + explicit HTTPRequestHandlerFactoryMain(const std::string & name_); + + void addHandler(HTTPRequestHandlerFactoryPtr child_factory) { child_factories.emplace_back(child_factory); } + + std::unique_ptr createRequestHandler(const HTTPServerRequest & request) override; + +private: Poco::Logger * log; std::string name; - std::vector child_factories; -public: - - ~HTTPRequestHandlerFactoryMain() override; - - HTTPRequestHandlerFactoryMain(const std::string & name_); - - TThis * addHandler(Poco::Net::HTTPRequestHandlerFactory * child_factory); - - Poco::Net::HTTPRequestHandler * createRequestHandler(const Poco::Net::HTTPServerRequest & request) override; + std::vector child_factories; }; template -class HandlingRuleHTTPHandlerFactory : public Poco::Net::HTTPRequestHandlerFactory +class HandlingRuleHTTPHandlerFactory : public HTTPRequestHandlerFactory { public: - using TThis = HandlingRuleHTTPHandlerFactory; - using Filter = std::function; + using Filter = std::function; template - HandlingRuleHTTPHandlerFactory(TArgs &&... args) + explicit HandlingRuleHTTPHandlerFactory(TArgs &&... args) { creator = [args = std::tuple(std::forward(args) ...)]() { return std::apply([&](auto && ... endpoint_args) { - return new TEndpoint(std::forward(endpoint_args)...); + return std::make_unique(std::forward(endpoint_args)...); }, std::move(args)); }; } - TThis * addFilter(Filter cur_filter) + void addFilter(Filter cur_filter) { Filter prev_filter = filter; filter = [prev_filter, cur_filter](const auto & request) { return prev_filter ? 
prev_filter(request) && cur_filter(request) : cur_filter(request); }; - - return this; } - TThis * attachStrictPath(const String & strict_path) + void addFiltersFromConfig(Poco::Util::AbstractConfiguration & config, const std::string & prefix) { - return addFilter([strict_path](const auto & request) { return request.getURI() == strict_path; }); + Poco::Util::AbstractConfiguration::Keys filters_type; + config.keys(prefix, filters_type); + + for (const auto & filter_type : filters_type) + { + if (filter_type == "handler") + continue; + else if (filter_type == "url") + addFilter(urlFilter(config, prefix + ".url")); + else if (filter_type == "headers") + addFilter(headersFilter(config, prefix + ".headers")); + else if (filter_type == "methods") + addFilter(methodsFilter(config, prefix + ".methods")); + else + throw Exception("Unknown element in config: " + prefix + "." + filter_type, ErrorCodes::UNKNOWN_ELEMENT_IN_CONFIG); + } } - TThis * attachNonStrictPath(const String & non_strict_path) + void attachStrictPath(const String & strict_path) { - return addFilter([non_strict_path](const auto & request) { return startsWith(request.getURI(), non_strict_path); }); + addFilter([strict_path](const auto & request) { return request.getURI() == strict_path; }); + } + + void attachNonStrictPath(const String & non_strict_path) + { + addFilter([non_strict_path](const auto & request) { return startsWith(request.getURI(), non_strict_path); }); } /// Handle GET or HEAD endpoint on specified path - TThis * allowGetAndHeadRequest() + void allowGetAndHeadRequest() { - return addFilter([](const auto & request) + addFilter([](const auto & request) { return request.getMethod() == Poco::Net::HTTPRequest::HTTP_GET || request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD; @@ -84,35 +104,35 @@ public: } /// Handle POST or GET with params - TThis * allowPostAndGetParamsRequest() + void allowPostAndGetParamsRequest() { - return addFilter([](const auto & request) + addFilter([](const auto & request) { return request.getURI().find('?') != std::string::npos || request.getMethod() == Poco::Net::HTTPRequest::HTTP_POST; }); } - Poco::Net::HTTPRequestHandler * createRequestHandler(const Poco::Net::HTTPServerRequest & request) override + std::unique_ptr createRequestHandler(const HTTPServerRequest & request) override { return filter(request) ? 
creator() : nullptr; } private: Filter filter; - std::function creator; + std::function ()> creator; }; -Poco::Net::HTTPRequestHandlerFactory * createStaticHandlerFactory(IServer & server, const std::string & config_prefix); +HTTPRequestHandlerFactoryPtr createStaticHandlerFactory(IServer & server, const std::string & config_prefix); -Poco::Net::HTTPRequestHandlerFactory * createDynamicHandlerFactory(IServer & server, const std::string & config_prefix); +HTTPRequestHandlerFactoryPtr createDynamicHandlerFactory(IServer & server, const std::string & config_prefix); -Poco::Net::HTTPRequestHandlerFactory * createPredefinedHandlerFactory(IServer & server, const std::string & config_prefix); +HTTPRequestHandlerFactoryPtr createPredefinedHandlerFactory(IServer & server, const std::string & config_prefix); -Poco::Net::HTTPRequestHandlerFactory * createReplicasStatusHandlerFactory(IServer & server, const std::string & config_prefix); +HTTPRequestHandlerFactoryPtr createReplicasStatusHandlerFactory(IServer & server, const std::string & config_prefix); -Poco::Net::HTTPRequestHandlerFactory * createPrometheusHandlerFactory(IServer & server, AsynchronousMetrics & async_metrics, const std::string & config_prefix); - -Poco::Net::HTTPRequestHandlerFactory * createHandlerFactory(IServer & server, AsynchronousMetrics & async_metrics, const std::string & name); +HTTPRequestHandlerFactoryPtr +createPrometheusHandlerFactory(IServer & server, AsynchronousMetrics & async_metrics, const std::string & config_prefix); +HTTPRequestHandlerFactoryPtr createHandlerFactory(IServer & server, AsynchronousMetrics & async_metrics, const std::string & name); } diff --git a/src/Server/HTTPHandlerRequestFilter.h b/src/Server/HTTPHandlerRequestFilter.h index f952efd7653..f0474e8b953 100644 --- a/src/Server/HTTPHandlerRequestFilter.h +++ b/src/Server/HTTPHandlerRequestFilter.h @@ -1,15 +1,17 @@ #pragma once -#include "HTTPHandlerFactory.h" +#include +#include +#include +#include +#include #include #include #include -#include #include -#include - +#include namespace DB { @@ -17,11 +19,9 @@ namespace DB namespace ErrorCodes { extern const int CANNOT_COMPILE_REGEXP; - extern const int UNKNOWN_ELEMENT_IN_CONFIG; } - -typedef std::shared_ptr CompiledRegexPtr; +using CompiledRegexPtr = std::shared_ptr; static inline bool checkRegexExpression(const StringRef & match_str, const CompiledRegexPtr & compiled_regex) { @@ -45,10 +45,10 @@ static inline auto methodsFilter(Poco::Util::AbstractConfiguration & config, con std::vector methods; Poco::StringTokenizer tokenizer(config.getString(config_path), ","); - for (auto iterator = tokenizer.begin(); iterator != tokenizer.end(); ++iterator) - methods.emplace_back(Poco::toUpper(Poco::trim(*iterator))); + for (const auto & iterator : tokenizer) + methods.emplace_back(Poco::toUpper(Poco::trim(iterator))); - return [methods](const Poco::Net::HTTPServerRequest & request) { return std::count(methods.begin(), methods.end(), request.getMethod()); }; + return [methods](const HTTPServerRequest & request) { return std::count(methods.begin(), methods.end(), request.getMethod()); }; } static inline auto getExpression(const std::string & expression) @@ -66,7 +66,7 @@ static inline auto getExpression(const std::string & expression) static inline auto urlFilter(Poco::Util::AbstractConfiguration & config, const std::string & config_path) { - return [expression = getExpression(config.getString(config_path))](const Poco::Net::HTTPServerRequest & request) + return [expression = 
getExpression(config.getString(config_path))](const HTTPServerRequest & request) { const auto & uri = request.getURI(); const auto & end = find_first_symbols<'?'>(uri.data(), uri.data() + uri.size()); @@ -88,7 +88,7 @@ static inline auto headersFilter(Poco::Util::AbstractConfiguration & config, con headers_expression.emplace(std::make_pair(header_name, expression)); } - return [headers_expression](const Poco::Net::HTTPServerRequest & request) + return [headers_expression](const HTTPServerRequest & request) { for (const auto & [header_name, header_expression] : headers_expression) { @@ -101,28 +101,4 @@ static inline auto headersFilter(Poco::Util::AbstractConfiguration & config, con }; } -template -static inline Poco::Net::HTTPRequestHandlerFactory * addFiltersFromConfig( - HandlingRuleHTTPHandlerFactory * factory, Poco::Util::AbstractConfiguration & config, const std::string & prefix) -{ - Poco::Util::AbstractConfiguration::Keys filters_type; - config.keys(prefix, filters_type); - - for (const auto & filter_type : filters_type) - { - if (filter_type == "handler") - continue; - else if (filter_type == "url") - factory->addFilter(urlFilter(config, prefix + ".url")); - else if (filter_type == "headers") - factory->addFilter(headersFilter(config, prefix + ".headers")); - else if (filter_type == "methods") - factory->addFilter(methodsFilter(config, prefix + ".methods")); - else - throw Exception("Unknown element in config: " + prefix + "." + filter_type, ErrorCodes::UNKNOWN_ELEMENT_IN_CONFIG); - } - - return factory; -} - } diff --git a/src/Server/InterserverIOHTTPHandler.cpp b/src/Server/InterserverIOHTTPHandler.cpp index 973759bedd1..3296da94578 100644 --- a/src/Server/InterserverIOHTTPHandler.cpp +++ b/src/Server/InterserverIOHTTPHandler.cpp @@ -1,18 +1,18 @@ -#include "InterserverIOHTTPHandler.h" +#include + +#include -#include -#include -#include -#include -#include -#include -#include #include #include -#include -#include #include -#include "IServer.h" +#include +#include +#include +#include +#include + +#include +#include namespace DB { @@ -23,7 +23,7 @@ namespace ErrorCodes extern const int TOO_MANY_SIMULTANEOUS_QUERIES; } -std::pair InterserverIOHTTPHandler::checkAuthentication(Poco::Net::HTTPServerRequest & request) const +std::pair InterserverIOHTTPHandler::checkAuthentication(HTTPServerRequest & request) const { const auto & config = server.config(); @@ -51,7 +51,7 @@ std::pair InterserverIOHTTPHandler::checkAuthentication(Poco::Net: return {"", true}; } -void InterserverIOHTTPHandler::processQuery(Poco::Net::HTTPServerRequest & request, Poco::Net::HTTPServerResponse & response, Output & used_output) +void InterserverIOHTTPHandler::processQuery(HTTPServerRequest & request, HTTPServerResponse & response, Output & used_output) { HTMLForm params(request); @@ -60,7 +60,7 @@ void InterserverIOHTTPHandler::processQuery(Poco::Net::HTTPServerRequest & reque String endpoint_name = params.get("endpoint"); bool compress = params.get("compress") == "true"; - ReadBufferFromIStream body(request.stream()); + auto & body = request.getStream(); auto endpoint = server.context().getInterserverIOHandler().getEndpoint(endpoint_name); /// Locked for read while query processing @@ -80,18 +80,19 @@ void InterserverIOHTTPHandler::processQuery(Poco::Net::HTTPServerRequest & reque } -void InterserverIOHTTPHandler::handleRequest(Poco::Net::HTTPServerRequest & request, Poco::Net::HTTPServerResponse & response) +void InterserverIOHTTPHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse & 
response) { setThreadName("IntersrvHandler"); /// In order to work keep-alive. - if (request.getVersion() == Poco::Net::HTTPServerRequest::HTTP_1_1) + if (request.getVersion() == HTTPServerRequest::HTTP_1_1) response.setChunkedTransferEncoding(true); Output used_output; const auto & config = server.config(); unsigned keep_alive_timeout = config.getUInt("keep_alive_timeout", 10); - used_output.out = std::make_shared(request, response, keep_alive_timeout); + used_output.out = std::make_shared( + response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, keep_alive_timeout); try { @@ -102,7 +103,7 @@ void InterserverIOHTTPHandler::handleRequest(Poco::Net::HTTPServerRequest & requ } else { - response.setStatusAndReason(Poco::Net::HTTPServerResponse::HTTP_UNAUTHORIZED); + response.setStatusAndReason(HTTPServerResponse::HTTP_UNAUTHORIZED); if (!response.sent()) writeString(message, *used_output.out); LOG_WARNING(log, "Query processing failed request: '{}' authentication failed", request.getURI()); diff --git a/src/Server/InterserverIOHTTPHandler.h b/src/Server/InterserverIOHTTPHandler.h index 8dc1962664c..47892aa678f 100644 --- a/src/Server/InterserverIOHTTPHandler.h +++ b/src/Server/InterserverIOHTTPHandler.h @@ -1,10 +1,12 @@ #pragma once -#include -#include -#include +#include #include +#include + +#include + namespace CurrentMetrics { @@ -17,7 +19,7 @@ namespace DB class IServer; class WriteBufferFromHTTPServerResponse; -class InterserverIOHTTPHandler : public Poco::Net::HTTPRequestHandler +class InterserverIOHTTPHandler : public HTTPRequestHandler { public: explicit InterserverIOHTTPHandler(IServer & server_) @@ -26,7 +28,7 @@ public: { } - void handleRequest(Poco::Net::HTTPServerRequest & request, Poco::Net::HTTPServerResponse & response) override; + void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response) override; private: struct Output @@ -39,9 +41,9 @@ private: CurrentMetrics::Increment metric_increment{CurrentMetrics::InterserverConnection}; - void processQuery(Poco::Net::HTTPServerRequest & request, Poco::Net::HTTPServerResponse & response, Output & used_output); + void processQuery(HTTPServerRequest & request, HTTPServerResponse & response, Output & used_output); - std::pair checkAuthentication(Poco::Net::HTTPServerRequest & request) const; + std::pair checkAuthentication(HTTPServerRequest & request) const; }; } diff --git a/src/Server/NotFoundHandler.cpp b/src/Server/NotFoundHandler.cpp index 766e8895784..3181708b9b7 100644 --- a/src/Server/NotFoundHandler.cpp +++ b/src/Server/NotFoundHandler.cpp @@ -1,32 +1,25 @@ -#include "NotFoundHandler.h" +#include #include - #include -#include -#include - namespace DB { - -void NotFoundHandler::handleRequest( - Poco::Net::HTTPServerRequest & request, - Poco::Net::HTTPServerResponse & response) +void NotFoundHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse & response) { try { response.setStatusAndReason(Poco::Net::HTTPResponse::HTTP_NOT_FOUND); - response.send() << "There is no handle " << request.getURI() << "\n\n" - << "Use / or /ping for health checks.\n" - << "Or /replicas_status for more sophisticated health checks.\n\n" - << "Send queries from your program with POST method or GET /?query=...\n\n" - << "Use clickhouse-client:\n\n" - << "For interactive data analysis:\n" - << " clickhouse-client\n\n" - << "For batch query processing:\n" - << " clickhouse-client --query='SELECT 1' > result\n" - << " clickhouse-client < query > result\n"; + *response.send() << "There is no handle " << 
request.getURI() << "\n\n" + << "Use / or /ping for health checks.\n" + << "Or /replicas_status for more sophisticated health checks.\n\n" + << "Send queries from your program with POST method or GET /?query=...\n\n" + << "Use clickhouse-client:\n\n" + << "For interactive data analysis:\n" + << " clickhouse-client\n\n" + << "For batch query processing:\n" + << " clickhouse-client --query='SELECT 1' > result\n" + << " clickhouse-client < query > result\n"; } catch (...) { diff --git a/src/Server/NotFoundHandler.h b/src/Server/NotFoundHandler.h index 7f758e49d0d..749ac388c4d 100644 --- a/src/Server/NotFoundHandler.h +++ b/src/Server/NotFoundHandler.h @@ -1,18 +1,15 @@ #pragma once -#include - +#include namespace DB { /// Response with 404 and verbose description. -class NotFoundHandler : public Poco::Net::HTTPRequestHandler +class NotFoundHandler : public HTTPRequestHandler { public: - void handleRequest( - Poco::Net::HTTPServerRequest & request, - Poco::Net::HTTPServerResponse & response) override; + void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response) override; }; } diff --git a/src/Server/PrometheusRequestHandler.cpp b/src/Server/PrometheusRequestHandler.cpp index 60deec9b289..83cb8e85a9e 100644 --- a/src/Server/PrometheusRequestHandler.cpp +++ b/src/Server/PrometheusRequestHandler.cpp @@ -1,26 +1,19 @@ -#include "PrometheusRequestHandler.h" +#include #include - -#include - -#include -#include -#include - -#include +#include +#include +#include #include +#include +#include -#include -#include +#include namespace DB { - -void PrometheusRequestHandler::handleRequest( - Poco::Net::HTTPServerRequest & request, - Poco::Net::HTTPServerResponse & response) +void PrometheusRequestHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse & response) { try { @@ -31,7 +24,7 @@ void PrometheusRequestHandler::handleRequest( response.setContentType("text/plain; version=0.0.4; charset=UTF-8"); - auto wb = WriteBufferFromHTTPServerResponse(request, response, keep_alive_timeout); + auto wb = WriteBufferFromHTTPServerResponse(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, keep_alive_timeout); metrics_writer.write(wb); wb.finalize(); } @@ -41,10 +34,13 @@ void PrometheusRequestHandler::handleRequest( } } -Poco::Net::HTTPRequestHandlerFactory * createPrometheusHandlerFactory(IServer & server, AsynchronousMetrics & async_metrics, const std::string & config_prefix) +HTTPRequestHandlerFactoryPtr +createPrometheusHandlerFactory(IServer & server, AsynchronousMetrics & async_metrics, const std::string & config_prefix) { - return addFiltersFromConfig(new HandlingRuleHTTPHandlerFactory( - server, PrometheusMetricsWriter(server.config(), config_prefix + ".handler", async_metrics)), server.config(), config_prefix); + auto factory = std::make_shared>( + server, PrometheusMetricsWriter(server.config(), config_prefix + ".handler", async_metrics)); + factory->addFiltersFromConfig(server.config(), config_prefix); + return factory; } } diff --git a/src/Server/PrometheusRequestHandler.h b/src/Server/PrometheusRequestHandler.h index 47c8adf4774..1fb3d9f0f59 100644 --- a/src/Server/PrometheusRequestHandler.h +++ b/src/Server/PrometheusRequestHandler.h @@ -1,17 +1,15 @@ #pragma once -#include "IServer.h" -#include "PrometheusMetricsWriter.h" +#include -#include -#include -#include -#include +#include "PrometheusMetricsWriter.h" namespace DB { -class PrometheusRequestHandler : public Poco::Net::HTTPRequestHandler +class IServer; + +class PrometheusRequestHandler : 
public HTTPRequestHandler { private: IServer & server; @@ -24,9 +22,7 @@ public: { } - void handleRequest( - Poco::Net::HTTPServerRequest & request, - Poco::Net::HTTPServerResponse & response) override; + void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response) override; }; } diff --git a/src/Server/ReplicasStatusHandler.cpp b/src/Server/ReplicasStatusHandler.cpp index fc79ad9d134..778f9827131 100644 --- a/src/Server/ReplicasStatusHandler.cpp +++ b/src/Server/ReplicasStatusHandler.cpp @@ -1,17 +1,18 @@ -#include "ReplicasStatusHandler.h" +#include -#include -#include -#include -#include #include #include +#include +#include +#include +#include +#include +#include +#include #include #include #include -#include -#include namespace DB @@ -24,7 +25,7 @@ ReplicasStatusHandler::ReplicasStatusHandler(IServer & server) } -void ReplicasStatusHandler::handleRequest(Poco::Net::HTTPServerRequest & request, Poco::Net::HTTPServerResponse & response) +void ReplicasStatusHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse & response) { try { @@ -82,7 +83,7 @@ void ReplicasStatusHandler::handleRequest(Poco::Net::HTTPServerRequest & request } if (verbose) - response.send() << message.str(); + *response.send() << message.str(); else { const char * data = "Ok.\n"; @@ -100,7 +101,7 @@ void ReplicasStatusHandler::handleRequest(Poco::Net::HTTPServerRequest & request if (!response.sent()) { /// We have not sent anything yet and we don't even know if we need to compress response. - response.send() << getCurrentExceptionMessage(false) << std::endl; + *response.send() << getCurrentExceptionMessage(false) << std::endl; } } catch (...) @@ -110,9 +111,11 @@ void ReplicasStatusHandler::handleRequest(Poco::Net::HTTPServerRequest & request } } -Poco::Net::HTTPRequestHandlerFactory * createReplicasStatusHandlerFactory(IServer & server, const std::string & config_prefix) +HTTPRequestHandlerFactoryPtr createReplicasStatusHandlerFactory(IServer & server, const std::string & config_prefix) { - return addFiltersFromConfig(new HandlingRuleHTTPHandlerFactory(server), server.config(), config_prefix); + auto factory = std::make_shared>(server); + factory->addFiltersFromConfig(server.config(), config_prefix); + return factory; } } diff --git a/src/Server/ReplicasStatusHandler.h b/src/Server/ReplicasStatusHandler.h index a32f1ba905f..8a790b13ad6 100644 --- a/src/Server/ReplicasStatusHandler.h +++ b/src/Server/ReplicasStatusHandler.h @@ -1,17 +1,15 @@ #pragma once -#include "IServer.h" - -#include - +#include namespace DB { class Context; +class IServer; /// Replies "Ok.\n" if all replicas on this server don't lag too much. Otherwise output lag information. 
-class ReplicasStatusHandler : public Poco::Net::HTTPRequestHandler +class ReplicasStatusHandler : public HTTPRequestHandler { private: Context & context; @@ -19,7 +17,7 @@ private: public: explicit ReplicasStatusHandler(IServer & server_); - void handleRequest(Poco::Net::HTTPServerRequest & request, Poco::Net::HTTPServerResponse & response) override; + void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response) override; }; diff --git a/src/Server/StaticRequestHandler.cpp b/src/Server/StaticRequestHandler.cpp index ad2c07ab0aa..f3f564c1cf8 100644 --- a/src/Server/StaticRequestHandler.cpp +++ b/src/Server/StaticRequestHandler.cpp @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include @@ -32,7 +32,8 @@ namespace ErrorCodes extern const int INVALID_CONFIG_PARAMETER; } -static inline WriteBufferPtr responseWriteBuffer(Poco::Net::HTTPServerRequest & request, Poco::Net::HTTPServerResponse & response, unsigned int keep_alive_timeout) +static inline WriteBufferPtr +responseWriteBuffer(HTTPServerRequest & request, HTTPServerResponse & response, unsigned int keep_alive_timeout) { /// The client can pass a HTTP header indicating supported compression method (gzip or deflate). String http_response_compression_methods = request.get("Accept-Encoding", ""); @@ -55,12 +56,15 @@ static inline WriteBufferPtr responseWriteBuffer(Poco::Net::HTTPServerRequest & bool client_supports_http_compression = http_response_compression_method != CompressionMethod::None; return std::make_shared( - request, response, keep_alive_timeout, client_supports_http_compression, http_response_compression_method); + response, + request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, + keep_alive_timeout, + client_supports_http_compression, + http_response_compression_method); } static inline void trySendExceptionToClient( - const std::string & s, int exception_code, - Poco::Net::HTTPServerRequest & request, Poco::Net::HTTPServerResponse & response , WriteBuffer & out) + const std::string & s, int exception_code, HTTPServerRequest & request, HTTPServerResponse & response, WriteBuffer & out) { try { @@ -69,13 +73,13 @@ static inline void trySendExceptionToClient( /// If HTTP method is POST and Keep-Alive is turned on, we should read the whole request body /// to avoid reading part of the current request body in the next request. 
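    /// Editor's note (added commentary): with keep-alive the same TCP connection carries the next
    /// request, so any unread bytes of this body would otherwise be misread as the start of that
    /// next request; draining the stream here keeps the connection reusable.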
if (request.getMethod() == Poco::Net::HTTPRequest::HTTP_POST - && response.getKeepAlive() && !request.stream().eof() && exception_code != ErrorCodes::HTTP_LENGTH_REQUIRED) - request.stream().ignore(std::numeric_limits::max()); + && response.getKeepAlive() && !request.getStream().eof() && exception_code != ErrorCodes::HTTP_LENGTH_REQUIRED) + request.getStream().ignore(std::numeric_limits::max()); response.setStatusAndReason(Poco::Net::HTTPResponse::HTTP_INTERNAL_SERVER_ERROR); if (!response.sent()) - response.send() << s << std::endl; + *response.send() << s << std::endl; else { if (out.count() != out.offset()) @@ -94,7 +98,7 @@ static inline void trySendExceptionToClient( } } -void StaticRequestHandler::handleRequest(Poco::Net::HTTPServerRequest & request, Poco::Net::HTTPServerResponse & response) +void StaticRequestHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse & response) { auto keep_alive_timeout = server.config().getUInt("keep_alive_timeout", 10); const auto & out = responseWriteBuffer(request, response, keep_alive_timeout); @@ -159,14 +163,17 @@ StaticRequestHandler::StaticRequestHandler(IServer & server_, const String & exp { } -Poco::Net::HTTPRequestHandlerFactory * createStaticHandlerFactory(IServer & server, const std::string & config_prefix) +HTTPRequestHandlerFactoryPtr createStaticHandlerFactory(IServer & server, const std::string & config_prefix) { int status = server.config().getInt(config_prefix + ".handler.status", 200); std::string response_content = server.config().getRawString(config_prefix + ".handler.response_content", "Ok.\n"); std::string response_content_type = server.config().getString(config_prefix + ".handler.content_type", "text/plain; charset=UTF-8"); + auto factory = std::make_shared>( + server, std::move(response_content), std::move(status), std::move(response_content_type)); - return addFiltersFromConfig(new HandlingRuleHTTPHandlerFactory( - server, std::move(response_content), std::move(status), std::move(response_content_type)), server.config(), config_prefix); + factory->addFiltersFromConfig(server.config(), config_prefix); + + return factory; } } diff --git a/src/Server/StaticRequestHandler.h b/src/Server/StaticRequestHandler.h index 0a29384ad0e..56c7f5a6d44 100644 --- a/src/Server/StaticRequestHandler.h +++ b/src/Server/StaticRequestHandler.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include @@ -11,7 +11,7 @@ class IServer; class WriteBuffer; /// Response with custom string. Can be used for browser. 
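/// Editor's note (illustrative config sketch, inferred from the ".handler.status",
/// ".handler.response_content" and ".handler.content_type" keys read in createStaticHandlerFactory
/// above; element names may differ between versions): a static rule is declared under
/// <http_handlers> in the server config, e.g.
///   <rule><url>/hi</url><methods>GET</methods>
///     <handler><type>static</type><status>200</status>
///       <content_type>text/plain; charset=UTF-8</content_type>
///       <response_content>Hello</response_content></handler></rule>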
-class StaticRequestHandler : public Poco::Net::HTTPRequestHandler +class StaticRequestHandler : public HTTPRequestHandler { private: IServer & server; @@ -29,7 +29,7 @@ public: void writeResponse(WriteBuffer & out); - void handleRequest(Poco::Net::HTTPServerRequest & request, Poco::Net::HTTPServerResponse & response) override; + void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response) override; }; } diff --git a/src/Server/WebUIRequestHandler.cpp b/src/Server/WebUIRequestHandler.cpp index 6159a27971f..fb8ff71611e 100644 --- a/src/Server/WebUIRequestHandler.cpp +++ b/src/Server/WebUIRequestHandler.cpp @@ -18,18 +18,18 @@ WebUIRequestHandler::WebUIRequestHandler(IServer & server_, std::string resource } -void WebUIRequestHandler::handleRequest(Poco::Net::HTTPServerRequest & request, Poco::Net::HTTPServerResponse & response) +void WebUIRequestHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse & response) { auto keep_alive_timeout = server.config().getUInt("keep_alive_timeout", 10); response.setContentType("text/html; charset=UTF-8"); - if (request.getVersion() == Poco::Net::HTTPServerRequest::HTTP_1_1) + if (request.getVersion() == HTTPServerRequest::HTTP_1_1) response.setChunkedTransferEncoding(true); setResponseDefaultHeaders(response, keep_alive_timeout); response.setStatusAndReason(Poco::Net::HTTPResponse::HTTP_OK); - response.send() << getResource(resource_name); + *response.send() << getResource(resource_name); } } diff --git a/src/Server/WebUIRequestHandler.h b/src/Server/WebUIRequestHandler.h index 3066b86b36a..1c52b626091 100644 --- a/src/Server/WebUIRequestHandler.h +++ b/src/Server/WebUIRequestHandler.h @@ -1,6 +1,6 @@ #pragma once -#include +#include namespace DB @@ -9,14 +9,14 @@ namespace DB class IServer; /// Response with HTML page that allows to send queries and show results in browser. 
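/// Editor's note (added commentary): this handler is registered in addCommonDefaultHandlersFactory
/// above for the non-strict path "/play" with the embedded resource "play.html", so GET /play
/// serves the built-in web query UI.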
-class WebUIRequestHandler : public Poco::Net::HTTPRequestHandler +class WebUIRequestHandler : public HTTPRequestHandler { private: IServer & server; std::string resource_name; public: WebUIRequestHandler(IServer & server_, std::string resource_name_); - void handleRequest(Poco::Net::HTTPServerRequest & request, Poco::Net::HTTPServerResponse & response) override; + void handleRequest(HTTPServerRequest & request, HTTPServerResponse & response) override; }; } diff --git a/src/Server/ya.make b/src/Server/ya.make index a0269e9ac84..ef5ef6d5f57 100644 --- a/src/Server/ya.make +++ b/src/Server/ya.make @@ -11,6 +11,14 @@ PEERDIR( SRCS( GRPCServer.cpp + HTTP/HTMLForm.cpp + HTTP/HTTPServer.cpp + HTTP/HTTPServerConnection.cpp + HTTP/HTTPServerConnectionFactory.cpp + HTTP/HTTPServerRequest.cpp + HTTP/HTTPServerResponse.cpp + HTTP/ReadHeaders.cpp + HTTP/WriteBufferFromHTTPServerResponse.cpp HTTPHandler.cpp HTTPHandlerFactory.cpp InterserverIOHTTPHandler.cpp diff --git a/src/Storages/MergeTree/DataPartsExchange.cpp b/src/Storages/MergeTree/DataPartsExchange.cpp index e01e7793dd3..f80020991b0 100644 --- a/src/Storages/MergeTree/DataPartsExchange.cpp +++ b/src/Storages/MergeTree/DataPartsExchange.cpp @@ -1,17 +1,20 @@ #include + +#include +#include +#include +#include +#include +#include #include #include -#include -#include +#include #include #include -#include -#include #include + #include -#include #include -#include namespace CurrentMetrics @@ -83,7 +86,7 @@ std::string Service::getId(const std::string & node_id) const return getEndpointId(node_id); } -void Service::processQuery(const Poco::Net::HTMLForm & params, ReadBuffer & /*body*/, WriteBuffer & out, Poco::Net::HTTPServerResponse & response) +void Service::processQuery(const HTMLForm & params, ReadBuffer & /*body*/, WriteBuffer & out, HTTPServerResponse & response) { int client_protocol_version = parse(params.get("client_protocol_version", "0")); diff --git a/src/Storages/MergeTree/DataPartsExchange.h b/src/Storages/MergeTree/DataPartsExchange.h index 0a359474d2d..834fed1182f 100644 --- a/src/Storages/MergeTree/DataPartsExchange.h +++ b/src/Storages/MergeTree/DataPartsExchange.h @@ -20,21 +20,19 @@ namespace DataPartsExchange class Service final : public InterserverIOEndpoint { public: - Service(MergeTreeData & data_) - : data(data_), log(&Poco::Logger::get(data.getLogName() + " (Replicated PartsService)")) {} + explicit Service(MergeTreeData & data_) : data(data_), log(&Poco::Logger::get(data.getLogName() + " (Replicated PartsService)")) {} Service(const Service &) = delete; Service & operator=(const Service &) = delete; std::string getId(const std::string & node_id) const override; - void processQuery(const Poco::Net::HTMLForm & params, ReadBuffer & body, WriteBuffer & out, Poco::Net::HTTPServerResponse & response) override; + void processQuery(const HTMLForm & params, ReadBuffer & body, WriteBuffer & out, HTTPServerResponse & response) override; private: MergeTreeData::DataPartPtr findPart(const String & name); void sendPartFromMemory(const MergeTreeData::DataPartPtr & part, WriteBuffer & out); void sendPartFromDisk(const MergeTreeData::DataPartPtr & part, WriteBuffer & out, int client_protocol_version); -private: /// StorageReplicatedMergeTree::shutdown() waits for all parts exchange handlers to finish, /// so Service will never access dangling reference to storage MergeTreeData & data; @@ -43,13 +41,10 @@ private: /** Client for getting the parts from the table *MergeTree. 
*/ -class Fetcher final +class Fetcher final : private boost::noncopyable { public: - Fetcher(MergeTreeData & data_) : data(data_), log(&Poco::Logger::get("Fetcher")) {} - - Fetcher(const Fetcher &) = delete; - Fetcher & operator=(const Fetcher &) = delete; + explicit Fetcher(MergeTreeData & data_) : data(data_), log(&Poco::Logger::get("Fetcher")) {} /// Downloads a part to tmp_directory. If to_detached - downloads to the `detached` directory. MergeTreeData::MutableDataPartPtr fetchPart( @@ -75,7 +70,7 @@ private: bool to_detached, const String & tmp_prefix_, bool sync, - const ReservationPtr reservation, + ReservationPtr reservation, PooledReadWriteBufferFromHTTP & in); MergeTreeData::MutableDataPartPtr downloadPartToMemory( diff --git a/tests/queries/query_test.py b/tests/queries/query_test.py index 3dea639187e..417a51fe523 100644 --- a/tests/queries/query_test.py +++ b/tests/queries/query_test.py @@ -33,7 +33,7 @@ SKIP_LIST = [ "01057_http_compression_prefer_brotli", "01080_check_for_error_incorrect_size_of_nested_column", "01083_expressions_in_engine_arguments", - "01086_odbc_roundtrip", + # "01086_odbc_roundtrip", "01088_benchmark_query_id", "01098_temporary_and_external_tables", "01099_parallel_distributed_insert_select", From 5c9420c0779c648db5a42ecbb8f6db43cb98a76d Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 19 Feb 2021 15:56:51 +0300 Subject: [PATCH 0493/2357] More correct epoll usage --- src/Server/NuKeeperTCPHandler.cpp | 46 ++++++++++++++++--------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/src/Server/NuKeeperTCPHandler.cpp b/src/Server/NuKeeperTCPHandler.cpp index 081821504d3..92c7f4b968f 100644 --- a/src/Server/NuKeeperTCPHandler.cpp +++ b/src/Server/NuKeeperTCPHandler.cpp @@ -70,14 +70,14 @@ struct SocketInterruptablePollWrapper if (epollfd < 0) throwFromErrno("Cannot epoll_create", ErrorCodes::SYSTEM_ERROR); - socket_event.events = EPOLLIN | EPOLLERR; + socket_event.events = EPOLLIN | EPOLLERR | EPOLLPRI; socket_event.data.fd = sockfd; if (epoll_ctl(epollfd, EPOLL_CTL_ADD, sockfd, &socket_event) < 0) { ::close(epollfd); throwFromErrno("Cannot insert socket into epoll queue", ErrorCodes::SYSTEM_ERROR); } - pipe_event.events = EPOLLIN | EPOLLERR; + pipe_event.events = EPOLLIN | EPOLLERR | EPOLLPRI; pipe_event.data.fd = pipe.fds_rw[0]; if (epoll_ctl(epollfd, EPOLL_CTL_ADD, pipe.fds_rw[0], &pipe_event) < 0) { @@ -108,11 +108,12 @@ struct SocketInterruptablePollWrapper if (result.has_response) return result; - std::array outputs = {-1, -1}; + bool socket_ready = false; + bool fd_ready = false; #if defined(POCO_HAVE_FD_EPOLL) int rc; epoll_event evout[2]; - memset(evout, 0, sizeof(evout)); + evout[0].data.fd = evout[1].data.fd = -1; do { Poco::Timestamp start; @@ -129,10 +130,13 @@ struct SocketInterruptablePollWrapper } while (rc < 0 && errno == EINTR); - if (rc >= 1 && evout[0].events & EPOLLIN) - outputs[0] = evout[0].data.fd; - if (rc == 2 && evout[1].events & EPOLLIN) - outputs[1] = evout[1].data.fd; + for (int i = 0; i < rc; ++i) + { + if (evout[i].data.fd == sockfd) + socket_ready = true; + if (evout[i].data.fd == pipe.fds_rw[0]) + fd_ready = true; + } #else pollfd poll_buf[2]; poll_buf[0].fd = sockfd; @@ -156,10 +160,11 @@ struct SocketInterruptablePollWrapper } } while (rc < 0 && errno == POCO_EINTR); + if (rc >= 1 && poll_buf[0].revents & POLLIN) - outputs[0] = sockfd; + socket_ready = true; if (rc == 2 && poll_buf[1].revents & POLLIN) - outputs[1] = pipe.fds_rw[0]; + fd_ready = true; #endif if (rc < 0) @@ -173,19 +178,15 @@ struct 
SocketInterruptablePollWrapper } else { - for (auto fd : outputs) + if (socket_ready) { - if (fd != -1) - { - if (fd == sockfd) - result.has_requests = true; - else - { - UInt8 dummy; - readIntBinary(dummy, response_in); - result.has_response = true; - } - } + result.has_requests = true; + } + if (fd_ready) + { + UInt8 dummy; + readIntBinary(dummy, response_in); + result.has_response = true; } } return result; @@ -368,6 +369,7 @@ void NuKeeperTCPHandler::runImpl() if (result.has_response) { Coordination::ZooKeeperResponsePtr response; + if (!responses->tryPop(response)) throw Exception(ErrorCodes::LOGICAL_ERROR, "We must have ready response, but queue is empty. It's a bug."); From ea27c3ca32bdf9a18e90d75bf38bbc725c6db4db Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 19 Feb 2021 16:41:46 +0300 Subject: [PATCH 0494/2357] Add gdb to fasttest image --- docker/test/fasttest/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/test/fasttest/Dockerfile b/docker/test/fasttest/Dockerfile index 03b7b2fc53a..64be52d8e30 100644 --- a/docker/test/fasttest/Dockerfile +++ b/docker/test/fasttest/Dockerfile @@ -47,6 +47,7 @@ RUN apt-get update \ expect \ fakeroot \ git \ + gdb \ gperf \ lld-${LLVM_VERSION} \ llvm-${LLVM_VERSION} \ From 946576017ff9929d57bab403adfe12ca1bdcbd48 Mon Sep 17 00:00:00 2001 From: vdimir Date: Fri, 19 Feb 2021 17:06:57 +0300 Subject: [PATCH 0495/2357] Update CrossToInnerJoinVisitor, add tests to cross_to_inner_join --- src/Interpreters/CrossToInnerJoinVisitor.cpp | 52 ++++----- .../00826_cross_to_inner_join.reference | 109 +++++++++++++++--- .../0_stateless/00826_cross_to_inner_join.sql | 69 ++++++++--- .../01083_cross_to_inner_with_like.reference | 7 -- .../01083_cross_to_inner_with_like.sql | 1 - 5 files changed, 168 insertions(+), 70 deletions(-) diff --git a/src/Interpreters/CrossToInnerJoinVisitor.cpp b/src/Interpreters/CrossToInnerJoinVisitor.cpp index b1e42b23ad5..c4d330831bb 100644 --- a/src/Interpreters/CrossToInnerJoinVisitor.cpp +++ b/src/Interpreters/CrossToInnerJoinVisitor.cpp @@ -124,6 +124,13 @@ void collectConjunctions(const ASTPtr & node, std::vector & members) members.push_back(node); } +std::vector collectConjunctions(const ASTPtr & node) +{ + std::vector members; + collectConjunctions(node, members); + return members; +} + std::optional getIdentMembership(const ASTIdentifier & ident, const std::vector & tables) { std::optional table_pos = IdentifierSemantic::getMembership(ident); @@ -169,20 +176,20 @@ bool isAllowedToRewriteCrossJoin(const ASTPtr & node, const Aliases & aliases) return node->as() || node->as(); } -bool canMoveExpressionToJoinOn(const ASTPtr & ast, - const std::vector & joined_tables, - const std::vector & tables, - const Aliases & aliases, - std::map> & asts_to_join_on) +/// Return mapping table_no -> expression with expression that can be moved into JOIN ON section +std::map> moveExpressionToJoinOn( + const ASTPtr & ast, + const std::vector & joined_tables, + const std::vector & tables, + const Aliases & aliases) { - std::vector conjuncts; - collectConjunctions(ast, conjuncts); - for (const auto & node : conjuncts) + std::map> asts_to_join_on; + for (const auto & node : collectConjunctions(ast)) { if (const auto * func = node->as(); func && func->name == NameEquals::name) { if (!func->arguments || func->arguments->children.size() != 2) - return false; + return {}; /// Check if the identifiers are from different joined tables. /// If it's a self joint, tables should have aliases. 
@@ -196,14 +203,14 @@ bool canMoveExpressionToJoinOn(const ASTPtr & ast, if (joined_tables[table_pos].canAttachOnExpression()) asts_to_join_on[table_pos].push_back(node); else - return false; + return {}; } } if (!isAllowedToRewriteCrossJoin(node, aliases)) - return false; + return {}; } - return true; + return asts_to_join_on; } ASTPtr makeOnExpression(const std::vector & expressions) @@ -317,7 +324,6 @@ void CrossToInnerJoinMatcher::visit(ASTSelectQuery & select, ASTPtr &, Data & da } /// COMMA to CROSS - if (num_comma) { for (auto & table : joined_tables) @@ -325,22 +331,16 @@ void CrossToInnerJoinMatcher::visit(ASTSelectQuery & select, ASTPtr &, Data & da } /// CROSS to INNER - - if (select.where() && data.cross_to_inner_join_rewrite) + if (data.cross_to_inner_join_rewrite && select.where()) { - std::map> asts_to_join_on; - bool can_move_where - = canMoveExpressionToJoinOn(select.where(), joined_tables, data.tables_with_columns, data.aliases, asts_to_join_on); - if (can_move_where) + auto asts_to_join_on = moveExpressionToJoinOn(select.where(), joined_tables, data.tables_with_columns, data.aliases); + for (size_t i = 1; i < joined_tables.size(); ++i) { - for (size_t i = 1; i < joined_tables.size(); ++i) + const auto & expr_it = asts_to_join_on.find(i); + if (expr_it != asts_to_join_on.end()) { - const auto & expr_it = asts_to_join_on.find(i); - if (expr_it != asts_to_join_on.end()) - { - if (joined_tables[i].rewriteCrossToInner(makeOnExpression(expr_it->second))) - data.done = true; - } + if (joined_tables[i].rewriteCrossToInner(makeOnExpression(expr_it->second))) + data.done = true; } } } diff --git a/tests/queries/0_stateless/00826_cross_to_inner_join.reference b/tests/queries/0_stateless/00826_cross_to_inner_join.reference index e7c8d6b1ea9..973c5b078a3 100644 --- a/tests/queries/0_stateless/00826_cross_to_inner_join.reference +++ b/tests/queries/0_stateless/00826_cross_to_inner_join.reference @@ -1,18 +1,18 @@ 0 0 -cross +--- cross --- 1 1 1 1 1 1 1 2 2 2 2 \N -cross nullable +--- cross nullable --- 1 1 1 1 2 2 1 2 -cross nullable vs not nullable +--- cross nullable vs not nullable --- 1 1 1 1 2 2 1 2 -cross self +--- cross self --- 1 1 1 1 2 2 2 2 -cross one table expr +--- cross one table expr --- 1 1 1 1 1 1 1 2 1 1 2 \N @@ -21,20 +21,34 @@ cross one table expr 2 2 1 2 2 2 2 \N 2 2 3 \N -cross multiple ands +--- cross multiple ands --- 1 1 1 1 -cross and inside and +--- cross and inside and --- 1 1 1 1 -cross split conjunction +--- cross split conjunction --- 1 1 1 1 -comma +--- and or --- +1 1 1 1 +--- arithmetic expr --- +2 2 1 2 +--- is null or --- +1 1 1 2 +2 2 2 \N +--- do not rewrite alias --- +1 +1 +2 +--- comma --- 1 1 1 1 1 1 1 2 2 2 2 \N -comma nullable +--- comma nullable --- 1 1 1 1 2 2 1 2 -cross +--- comma and or --- +1 1 1 1 +2 2 2 \N +--- cross --- SELECT a, b, @@ -43,7 +57,7 @@ SELECT FROM t1_00826 ALL INNER JOIN t2_00826 ON a = t2_00826.a WHERE a = t2_00826.a -cross nullable +--- cross nullable --- SELECT a, b, @@ -52,7 +66,7 @@ SELECT FROM t1_00826 ALL INNER JOIN t2_00826 ON a = t2_00826.a WHERE a = t2_00826.a -cross nullable vs not nullable +--- cross nullable vs not nullable --- SELECT a, b, @@ -61,7 +75,7 @@ SELECT FROM t1_00826 ALL INNER JOIN t2_00826 ON a = t2_00826.b WHERE a = t2_00826.b -cross self +--- cross self --- SELECT a, b, @@ -70,7 +84,7 @@ SELECT FROM t1_00826 AS x ALL INNER JOIN t1_00826 AS y ON (a = y.a) AND (b = y.b) WHERE (a = y.a) AND (b = y.b) -cross one table expr +--- cross one table expr --- SELECT a, b, @@ -79,7 +93,7 @@ SELECT FROM 
t1_00826 CROSS JOIN t2_00826 WHERE a = b -cross multiple ands +--- cross multiple ands --- SELECT a, b, @@ -88,7 +102,7 @@ SELECT FROM t1_00826 ALL INNER JOIN t2_00826 ON (a = t2_00826.a) AND (b = t2_00826.b) WHERE (a = t2_00826.a) AND (b = t2_00826.b) -cross and inside and +--- cross and inside and --- SELECT a, b, @@ -97,7 +111,7 @@ SELECT FROM t1_00826 ALL INNER JOIN t2_00826 ON (a = t2_00826.a) AND (a = t2_00826.a) AND (a = t2_00826.a) AND (b = t2_00826.b) WHERE (a = t2_00826.a) AND ((a = t2_00826.a) AND ((a = t2_00826.a) AND (b = t2_00826.b))) -cross split conjunction +--- cross split conjunction --- SELECT a, b, @@ -106,3 +120,62 @@ SELECT FROM t1_00826 ALL INNER JOIN t2_00826 ON (a = t2_00826.a) AND (b = t2_00826.b) WHERE (a = t2_00826.a) AND (b = t2_00826.b) AND (a >= 1) AND (t2_00826.b > 0) +--- and or --- +SELECT + a, + b, + t2_00826.a, + t2_00826.b +FROM t1_00826 +ALL INNER JOIN t2_00826 ON (a = t2_00826.a) AND (b = t2_00826.b) +WHERE (a = t2_00826.a) AND (b = t2_00826.b) AND ((a >= 1) OR (t2_00826.b = 1)) +--- arithmetic expr --- +SELECT + a, + b, + t2_00826.a, + t2_00826.b +FROM t1_00826 +ALL INNER JOIN t2_00826 ON (a + 1) = (t2_00826.a + t2_00826.b) +WHERE ((a + 1) = (t2_00826.a + t2_00826.b)) AND ((((a + b) + t2_00826.a) + t2_00826.b) > 5) +--- is null or --- +SELECT + a, + b, + t2_00826.a, + t2_00826.b +FROM t1_00826 +ALL INNER JOIN t2_00826 ON b = t2_00826.a +WHERE (b = t2_00826.a) AND (isNull(t2_00826.b) OR (t2_00826.b > t2_00826.a)) +--- do not rewrite alias --- +SELECT a AS b +FROM t1_00826 +CROSS JOIN t2_00826 +WHERE (b = t2_00826.a) AND (b > 0) +--- comma --- +SELECT + a, + b, + t2_00826.a, + t2_00826.b +FROM t1_00826 +ALL INNER JOIN t2_00826 ON a = t2_00826.a +WHERE a = t2_00826.a +--- comma nullable --- +SELECT + a, + b, + t2_00826.a, + t2_00826.b +FROM t1_00826 +ALL INNER JOIN t2_00826 ON b = t2_00826.b +WHERE b = t2_00826.b +--- comma and or --- +SELECT + a, + b, + t2_00826.a, + t2_00826.b +FROM t1_00826 +ALL INNER JOIN t2_00826 ON a = t2_00826.a +WHERE (a = t2_00826.a) AND (isNull(t2_00826.b) OR (t2_00826.b < 2)) diff --git a/tests/queries/0_stateless/00826_cross_to_inner_join.sql b/tests/queries/0_stateless/00826_cross_to_inner_join.sql index 67471864686..392ade02ab7 100644 --- a/tests/queries/0_stateless/00826_cross_to_inner_join.sql +++ b/tests/queries/0_stateless/00826_cross_to_inner_join.sql @@ -12,46 +12,79 @@ INSERT INTO t1_00826 values (1,1), (2,2); INSERT INTO t2_00826 values (1,1), (1,2); INSERT INTO t2_00826 (a) values (2), (3); -SELECT 'cross'; +SELECT '--- cross ---'; SELECT * FROM t1_00826 cross join t2_00826 where t1_00826.a = t2_00826.a; -SELECT 'cross nullable'; +SELECT '--- cross nullable ---'; SELECT * FROM t1_00826 cross join t2_00826 where t1_00826.b = t2_00826.b; -SELECT 'cross nullable vs not nullable'; +SELECT '--- cross nullable vs not nullable ---'; SELECT * FROM t1_00826 cross join t2_00826 where t1_00826.a = t2_00826.b; -SELECT 'cross self'; +SELECT '--- cross self ---'; SELECT * FROM t1_00826 x cross join t1_00826 y where x.a = y.a and x.b = y.b; -SELECT 'cross one table expr'; +SELECT '--- cross one table expr ---'; SELECT * FROM t1_00826 cross join t2_00826 where t1_00826.a = t1_00826.b order by (t1_00826.a, t2_00826.a, t2_00826.b); -SELECT 'cross multiple ands'; +SELECT '--- cross multiple ands ---'; SELECT * FROM t1_00826 cross join t2_00826 where t1_00826.a = t2_00826.a and t1_00826.b = t2_00826.b; -SELECT 'cross and inside and'; +SELECT '--- cross and inside and ---'; SELECT * FROM t1_00826 cross join t2_00826 where t1_00826.a = 
t2_00826.a and (t1_00826.b = t2_00826.b and 1); -SELECT 'cross split conjunction'; +SELECT '--- cross split conjunction ---'; SELECT * FROM t1_00826 cross join t2_00826 where t1_00826.a = t2_00826.a and t1_00826.b = t2_00826.b and t1_00826.a >= 1 and t2_00826.b = 1; -SELECT 'comma'; +SELECT '--- and or ---'; +SELECT * FROM t1_00826 cross join t2_00826 where t1_00826.a = t2_00826.a and t1_00826.b = t2_00826.b and (t1_00826.a >= 1 OR t2_00826.b = 1); + +SELECT '--- arithmetic expr ---'; +SELECT * FROM t1_00826 cross join t2_00826 where t1_00826.a + 1 = t2_00826.a + t2_00826.b AND (t1_00826.a + t1_00826.b + t2_00826.a + t2_00826.b > 5); + +SELECT '--- is null or ---'; +SELECT * FROM t1_00826 cross join t2_00826 where t1_00826.b = t2_00826.a AND (t2_00826.b IS NULL OR t2_00826.b > t2_00826.a); + +SELECT '--- do not rewrite alias ---'; +SELECT a as b FROM t1_00826 cross join t2_00826 where t1_00826.b = t2_00826.a AND b > 0; + +SELECT '--- comma ---'; SELECT * FROM t1_00826, t2_00826 where t1_00826.a = t2_00826.a; -SELECT 'comma nullable'; +SELECT '--- comma nullable ---'; SELECT * FROM t1_00826, t2_00826 where t1_00826.b = t2_00826.b; +SELECT '--- comma and or ---'; +SELECT * FROM t1_00826, t2_00826 where t1_00826.a = t2_00826.a AND (t2_00826.b IS NULL OR t2_00826.b < 2); -SELECT 'cross'; +SELECT '--- cross ---'; EXPLAIN SYNTAX SELECT * FROM t1_00826 cross join t2_00826 where t1_00826.a = t2_00826.a; -SELECT 'cross nullable'; +SELECT '--- cross nullable ---'; EXPLAIN SYNTAX SELECT * FROM t1_00826, t2_00826 where t1_00826.a = t2_00826.a; -SELECT 'cross nullable vs not nullable'; +SELECT '--- cross nullable vs not nullable ---'; EXPLAIN SYNTAX SELECT * FROM t1_00826 cross join t2_00826 where t1_00826.a = t2_00826.b; -SELECT 'cross self'; +SELECT '--- cross self ---'; EXPLAIN SYNTAX SELECT * FROM t1_00826 x cross join t1_00826 y where x.a = y.a and x.b = y.b; -SELECT 'cross one table expr'; +SELECT '--- cross one table expr ---'; EXPLAIN SYNTAX SELECT * FROM t1_00826 cross join t2_00826 where t1_00826.a = t1_00826.b; -SELECT 'cross multiple ands'; +SELECT '--- cross multiple ands ---'; EXPLAIN SYNTAX SELECT * FROM t1_00826 cross join t2_00826 where t1_00826.a = t2_00826.a and t1_00826.b = t2_00826.b; -SELECT 'cross and inside and'; +SELECT '--- cross and inside and ---'; EXPLAIN SYNTAX SELECT * FROM t1_00826 cross join t2_00826 where t1_00826.a = t2_00826.a and (t1_00826.a = t2_00826.a and (t1_00826.a = t2_00826.a and t1_00826.b = t2_00826.b)); -SELECT 'cross split conjunction'; +SELECT '--- cross split conjunction ---'; EXPLAIN SYNTAX SELECT * FROM t1_00826 cross join t2_00826 where t1_00826.a = t2_00826.a and t1_00826.b = t2_00826.b and t1_00826.a >= 1 and t2_00826.b > 0; +SELECT '--- and or ---'; +EXPLAIN SYNTAX SELECT * FROM t1_00826 cross join t2_00826 where t1_00826.a = t2_00826.a and t1_00826.b = t2_00826.b and (t1_00826.a >= 1 OR t2_00826.b = 1); + +SELECT '--- arithmetic expr ---'; +EXPLAIN SYNTAX SELECT * FROM t1_00826 cross join t2_00826 where t1_00826.a + 1 = t2_00826.a + t2_00826.b AND (t1_00826.a + t1_00826.b + t2_00826.a + t2_00826.b > 5); + +SELECT '--- is null or ---'; +EXPLAIN SYNTAX SELECT * FROM t1_00826 cross join t2_00826 where t1_00826.b = t2_00826.a AND (t2_00826.b IS NULL OR t2_00826.b > t2_00826.a); + +SELECT '--- do not rewrite alias ---'; +EXPLAIN SYNTAX SELECT a as b FROM t1_00826 cross join t2_00826 where t1_00826.b = t2_00826.a AND b > 0; + +SELECT '--- comma ---'; +EXPLAIN SYNTAX SELECT * FROM t1_00826, t2_00826 where t1_00826.a = t2_00826.a; +SELECT '--- comma 
nullable ---'; +EXPLAIN SYNTAX SELECT * FROM t1_00826, t2_00826 where t1_00826.b = t2_00826.b; +SELECT '--- comma and or ---'; +EXPLAIN SYNTAX SELECT * FROM t1_00826, t2_00826 where t1_00826.a = t2_00826.a AND (t2_00826.b IS NULL OR t2_00826.b < 2); + DROP TABLE t1_00826; DROP TABLE t2_00826; diff --git a/tests/queries/0_stateless/01083_cross_to_inner_with_like.reference b/tests/queries/0_stateless/01083_cross_to_inner_with_like.reference index bf043b4668a..42bbeb05ecb 100644 --- a/tests/queries/0_stateless/01083_cross_to_inner_with_like.reference +++ b/tests/queries/0_stateless/01083_cross_to_inner_with_like.reference @@ -19,10 +19,3 @@ SELECT FROM n ALL INNER JOIN r ON k = r.k WHERE (k = r.k) AND (name NOT LIKE \'A%\') -SELECT - k, - r.k, - name -FROM n -ALL INNER JOIN r ON (k + 1) = (r.k + 1) -WHERE ((k + 1) = (r.k + 1)) AND ((name = \'A\') OR (name = \'AA\')) diff --git a/tests/queries/0_stateless/01083_cross_to_inner_with_like.sql b/tests/queries/0_stateless/01083_cross_to_inner_with_like.sql index c6544553816..644190cbddf 100644 --- a/tests/queries/0_stateless/01083_cross_to_inner_with_like.sql +++ b/tests/queries/0_stateless/01083_cross_to_inner_with_like.sql @@ -9,7 +9,6 @@ SET enable_optimize_predicate_expression = 0; EXPLAIN SYNTAX SELECT * FROM n, r WHERE n.k = r.k AND r.name = 'A'; EXPLAIN SYNTAX SELECT * FROM n, r WHERE n.k = r.k AND r.name LIKE 'A%'; EXPLAIN SYNTAX SELECT * FROM n, r WHERE n.k = r.k AND r.name NOT LIKE 'A%'; -EXPLAIN SYNTAX SELECT * FROM n, r WHERE n.k + 1 = r.k + 1 AND (r.name = 'A' OR r.name = 'AA'); DROP TABLE n; DROP TABLE r; From 068c9cfbf7a58dd7e624b3d1557ccdbaf227bf34 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 19 Feb 2021 17:13:29 +0300 Subject: [PATCH 0496/2357] Fix logs level --- docker/test/fasttest/run.sh | 1 + src/Coordination/LoggerWrapper.h | 32 ++++++++++++++++++++++++-------- 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/docker/test/fasttest/run.sh b/docker/test/fasttest/run.sh index 53a0de21d5b..0ace1cd39da 100755 --- a/docker/test/fasttest/run.sh +++ b/docker/test/fasttest/run.sh @@ -70,6 +70,7 @@ function start_server --path "$FASTTEST_DATA" --user_files_path "$FASTTEST_DATA/user_files" --top_level_domains_path "$FASTTEST_DATA/top_level_domains" + --test_keeper_server.log_storage_path "$FASTTEST_DATA/coordination" ) clickhouse-server "${opts[@]}" &>> "$FASTTEST_OUTPUT/server.log" & server_pid=$! 
diff --git a/src/Coordination/LoggerWrapper.h b/src/Coordination/LoggerWrapper.h index 755b72c06cc..25a1969d2e9 100644 --- a/src/Coordination/LoggerWrapper.h +++ b/src/Coordination/LoggerWrapper.h @@ -9,12 +9,26 @@ namespace DB class LoggerWrapper : public nuraft::logger { +private: + + static inline const std::unordered_map LEVELS = + { + {LogsLevel::trace, Poco::Message::Priority::PRIO_TRACE}, + {LogsLevel::debug, Poco::Message::Priority::PRIO_DEBUG}, + {LogsLevel::information, Poco::Message::PRIO_INFORMATION}, + {LogsLevel::warning, Poco::Message::PRIO_WARNING}, + {LogsLevel::error, Poco::Message::PRIO_ERROR}, + {LogsLevel::fatal, Poco::Message::PRIO_FATAL} + }; + static inline const int LEVEL_MAX = static_cast(LogsLevel::trace); + static inline const int LEVEL_MIN = static_cast(LogsLevel::none); + public: LoggerWrapper(const std::string & name, LogsLevel level_) : log(&Poco::Logger::get(name)) - , level(static_cast(level_)) + , level(level_) { - log->setLevel(level); + log->setLevel(static_cast(LEVELS.at(level))); } void put_details( @@ -24,24 +38,26 @@ public: size_t /* line_number */, const std::string & msg) override { - LOG_IMPL(log, static_cast(level_), static_cast(level_), msg); + LogsLevel db_level = static_cast(level_); + LOG_IMPL(log, db_level, LEVELS.at(db_level), msg); } void set_level(int level_) override { - level_ = std::min(6, std::max(1, level_)); - log->setLevel(level_); - level = level_; + level_ = std::min(LEVEL_MAX, std::max(LEVEL_MIN, level_)); + level = static_cast(level_); + log->setLevel(static_cast(LEVELS.at(level))); } int get_level() override { - return level; + LogsLevel lvl = level; + return static_cast(lvl); } private: Poco::Logger * log; - std::atomic level; + std::atomic level; }; } From 033f55f498ead26bcc0bd5d2efa9332bf6db8482 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Fri, 19 Feb 2021 17:38:20 +0300 Subject: [PATCH 0497/2357] fix --- docker/test/stress/stress | 5 ++- tests/clickhouse-test | 44 +++++++++++-------- .../01079_parallel_alter_modify_zookeeper.sh | 1 - 3 files changed, 28 insertions(+), 22 deletions(-) diff --git a/docker/test/stress/stress b/docker/test/stress/stress index e0189072f7d..666fd4cce50 100755 --- a/docker/test/stress/stress +++ b/docker/test/stress/stress @@ -88,9 +88,10 @@ if __name__ == "__main__": logging.info("Checking if some queries hung") cmd = "{} {} {}".format(args.test_cmd, "--hung-check", "00001_select_1") res = call(cmd, shell=True, stderr=STDOUT) - hung_check_status = "Hung check\t{}\n".format('FAIL' if res else 'OK') - open(os.path.join(args.output_folder, "test_results.tsv"), 'w+').write(hung_check_status) + hung_check_status = "No queries hung\tOK\n" if res != 0: logging.info("Hung check failed with exit code {}".format(res)) + hung_check_status = "Hung check failed\tFAIL\n" + open(os.path.join(args.output_folder, "test_results.tsv"), 'w+').write(hung_check_status) logging.info("Stress test finished") diff --git a/tests/clickhouse-test b/tests/clickhouse-test index fa8d2891224..2aca0504141 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -212,7 +212,8 @@ def get_stacktraces_from_gdb(server_pid): try: return subprocess.check_output(cmd, shell=True).decode('utf-8') except Exception as ex: - return "Error occured while receiving stack traces from gdb: {}".format(str(ex)) + print("Error occured while receiving stack traces from gdb: {}".format(str(ex))) + return None # collect server stacktraces from system.stack_trace table @@ -224,21 +225,24 @@ def 
get_stacktraces_from_clickhouse(client): "arrayMap(x -> demangle(addressToSymbol(x)), trace)), '\n') as trace " "FROM system.stack_trace format Vertical\"".format(client), shell=True).decode('utf-8') except Exception as ex: - return "Error occured while receiving stack traces from client: {}".format(str(ex)) + print("Error occured while receiving stack traces from client: {}".format(str(ex))) + return None def get_server_pid(server_tcp_port): - cmd = "lsof -i tcp:{port} -s tcp:LISTEN -Fp | awk '/^p[0-9]+$/{{print substr($0, 2)}}'".format(port=server_tcp_port) + # lsof does not work in stress tests for some reason + cmd_lsof = "lsof -i tcp:{port} -s tcp:LISTEN -Fp | awk '/^p[0-9]+$/{{print substr($0, 2)}}'".format(port=server_tcp_port) + cmd_pidof = "pidof -s clickhouse-server" + commands = [cmd_lsof, cmd_pidof] output = None - try: - output = subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT) - if output: - return int(output) - else: - return None # server dead - except Exception as e: - print("Cannot get server pid, got {}: {}", output, e) - return None + for cmd in commands: + try: + output = subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT) + if output: + return int(output) + except Exception as e: + print("Cannot get server pid with {}, got {}: {}", cmd, output, e) + return None # most likely server dead def colored(text, args, color=None, on_color=None, attrs=None): @@ -796,21 +800,23 @@ def main(args): clickhouse_tcp_port = os.getenv("CLICKHOUSE_PORT_TCP", '9000') server_pid = get_server_pid(clickhouse_tcp_port) + bt = None if server_pid: print("\nLocated ClickHouse server process {} listening at TCP port {}".format(server_pid, clickhouse_tcp_port)) - - # It does not work in Sandbox - #print("\nCollecting stacktraces from system.stacktraces table:") - #print(get_stacktraces_from_clickhouse(args.client)) - print("\nCollecting stacktraces from all running threads with gdb:") - print(get_stacktraces_from_gdb(server_pid)) - else: + bt = get_stacktraces_from_gdb(server_pid) + if bt is None: + print("\nCollecting stacktraces from system.stacktraces table:") + bt = get_stacktraces_from_clickhouse(args.client) + if bt is None: print( colored( "\nUnable to locate ClickHouse server process listening at TCP port {}. 
" "It must have crashed or exited prematurely!".format(clickhouse_tcp_port), args, "red", attrs=["bold"])) + else: + print(bt) + exit_code = 1 else: diff --git a/tests/queries/0_stateless/01079_parallel_alter_modify_zookeeper.sh b/tests/queries/0_stateless/01079_parallel_alter_modify_zookeeper.sh index 0749dc14dfa..5b14c5a8543 100755 --- a/tests/queries/0_stateless/01079_parallel_alter_modify_zookeeper.sh +++ b/tests/queries/0_stateless/01079_parallel_alter_modify_zookeeper.sh @@ -14,7 +14,6 @@ for i in $(seq $REPLICAS); do $CLICKHOUSE_CLIENT --query "CREATE TABLE concurrent_alter_mt_$i (key UInt64, value1 UInt64, value2 Int32) ENGINE = ReplicatedMergeTree('/clickhouse/tables/test_01079/concurrent_alter_mt', '$i') ORDER BY key SETTINGS max_replicated_mutations_in_queue=1000, number_of_free_entries_in_pool_to_execute_mutation=0,max_replicated_merges_in_queue=1000" done - $CLICKHOUSE_CLIENT --query "INSERT INTO concurrent_alter_mt_1 SELECT number, number + 10, number from numbers(10)" $CLICKHOUSE_CLIENT --query "INSERT INTO concurrent_alter_mt_1 SELECT number, number + 10, number from numbers(10, 40)" From 12d05c27922eb1010eaede6fdf891995240dc644 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 19 Feb 2021 17:47:53 +0300 Subject: [PATCH 0498/2357] Better startup --- contrib/NuRaft | 2 +- src/Coordination/NuKeeperServer.cpp | 9 +-------- 2 files changed, 2 insertions(+), 9 deletions(-) diff --git a/contrib/NuRaft b/contrib/NuRaft index 7adf7ae33e7..c250d5ad58c 160000 --- a/contrib/NuRaft +++ b/contrib/NuRaft @@ -1 +1 @@ -Subproject commit 7adf7ae33e7d5c307342431b577c8ab1025ee793 +Subproject commit c250d5ad58c82e751264df40a94da682a2fc3519 diff --git a/src/Coordination/NuKeeperServer.cpp b/src/Coordination/NuKeeperServer.cpp index 8556fa85231..c2917e3ab76 100644 --- a/src/Coordination/NuKeeperServer.cpp +++ b/src/Coordination/NuKeeperServer.cpp @@ -161,7 +161,7 @@ bool NuKeeperServer::isLeaderAlive() const nuraft::cb_func::ReturnCode NuKeeperServer::callbackFunc(nuraft::cb_func::Type type, nuraft::cb_func::Param * /* param */) { - if (type == nuraft::cb_func::Type::BecomeFresh || type == nuraft::cb_func::Type::BecomeLeader) + if ((type == nuraft::cb_func::InitialBatchCommited && isLeader()) || type == nuraft::cb_func::BecomeFresh) { std::unique_lock lock(initialized_mutex); initialized_flag = true; @@ -176,13 +176,6 @@ void NuKeeperServer::waitInit() int64_t timeout = coordination_settings->startup_timeout.totalMilliseconds(); if (!initialized_cv.wait_for(lock, std::chrono::milliseconds(timeout), [&] { return initialized_flag; })) throw Exception(ErrorCodes::RAFT_ERROR, "Failed to wait RAFT initialization"); - - /// TODO FIXME somehow - while (isLeader() && raft_instance->get_committed_log_idx() != raft_instance->get_last_log_idx()) - { - LOG_WARNING(&Poco::Logger::get("NuKeeperServer"), "Loading from log store {}/{}", raft_instance->get_committed_log_idx(), raft_instance->get_last_log_idx()); - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - } } std::unordered_set NuKeeperServer::getDeadSessions() From bfccfd9eece9a7e1f1c70755034a837d8cee8078 Mon Sep 17 00:00:00 2001 From: Denis Glazachev Date: Fri, 19 Feb 2021 19:32:35 +0400 Subject: [PATCH 0499/2357] Fix tests Add tests WIP: Failures are legit! 
--- .../test_row_policy/normal_filter2_table2.xml | 16 +++++ tests/integration/test_row_policy/test.py | 63 ++++++++++++++----- 2 files changed, 65 insertions(+), 14 deletions(-) create mode 100644 tests/integration/test_row_policy/normal_filter2_table2.xml diff --git a/tests/integration/test_row_policy/normal_filter2_table2.xml b/tests/integration/test_row_policy/normal_filter2_table2.xml new file mode 100644 index 00000000000..aca6bddc334 --- /dev/null +++ b/tests/integration/test_row_policy/normal_filter2_table2.xml @@ -0,0 +1,16 @@ + + + + + + + + + + a > 0 + + + + + + diff --git a/tests/integration/test_row_policy/test.py b/tests/integration/test_row_policy/test.py index c11e1b1e21c..25d90ae25b9 100644 --- a/tests/integration/test_row_policy/test.py +++ b/tests/integration/test_row_policy/test.py @@ -103,11 +103,32 @@ def test_join(): def test_cannot_trick_row_policy_with_keyword_with(): - assert node.query("WITH 0 AS a SELECT * FROM mydb.filtered_table1") == TSV([[1, 0], [1, 1]]) - assert node.query("WITH 0 AS a SELECT a, b FROM mydb.filtered_table1") == TSV([[0, 0], [0, 1]]) assert node.query("WITH 0 AS a SELECT a FROM mydb.filtered_table1") == TSV([[0], [0]]) assert node.query("WITH 0 AS a SELECT b FROM mydb.filtered_table1") == TSV([[0], [1]]) - assert node.query("WITH 0 AS a SELECT * FROM mydb.filtered_table1 PREWHERE a IN(0, 1) WHERE b IN(0, 1)") == TSV([[0], [1]]) + + assert node.query("WITH 0 AS a SELECT * FROM mydb.filtered_table1") == TSV([[1, 0], [1, 1]]) + assert node.query("WITH 0 AS a SELECT * FROM mydb.filtered_table1 WHERE a IN(0, 1) AND b IN(0, 1)") == TSV([[1, 0], [1, 1]]) + assert node.query("WITH 0 AS a SELECT * FROM mydb.filtered_table1 PREWHERE a IN(0, 1) AND b IN(0, 1)") == TSV([[1, 0], [1, 1]]) + assert node.query("WITH 0 AS a SELECT * FROM mydb.filtered_table1 PREWHERE a IN(0, 1) WHERE b IN(0, 1)") == TSV([[1, 0], [1, 1]]) + assert node.query("WITH 0 AS a SELECT * FROM mydb.filtered_table1 PREWHERE b IN(0, 1) WHERE a IN(0, 1)") == TSV([[1, 0], [1, 1]]) + + assert node.query("WITH 0 AS a SELECT a, b FROM mydb.filtered_table1") == TSV([[0, 0], [0, 1]]) + assert node.query("WITH 0 AS a SELECT a, b FROM mydb.filtered_table1 WHERE a IN(0, 1) AND b IN(0, 1)") == TSV([[0, 0], [0, 1]]) + assert node.query("WITH 0 AS a SELECT a, b FROM mydb.filtered_table1 PREWHERE a IN(0, 1) AND b IN(0, 1)") == TSV([[0, 0], [0, 1]]) + assert node.query("WITH 0 AS a SELECT a, b FROM mydb.filtered_table1 PREWHERE a IN(0, 1) WHERE b IN(0, 1)") == TSV([[0, 0], [0, 1]]) + assert node.query("WITH 0 AS a SELECT a, b FROM mydb.filtered_table1 PREWHERE b IN(0, 1) WHERE a IN(0, 1)") == TSV([[0, 0], [0, 1]]) + + assert node.query("WITH 0 AS c SELECT * FROM mydb.filtered_table3") == TSV([[0, 1], [1, 0]]) + assert node.query("WITH 0 AS c SELECT * FROM mydb.filtered_table3 WHERE c >= 0 AND a >= 0") == TSV([[0, 1], [1, 0]]) + assert node.query("WITH 0 AS c SELECT * FROM mydb.filtered_table3 PREWHERE c >= 0 AND a >= 0") == TSV([[0, 1], [1, 0]]) + assert node.query("WITH 0 AS c SELECT * FROM mydb.filtered_table3 PREWHERE c >= 0 WHERE a >= 0") == TSV([[0, 1], [1, 0]]) + assert node.query("WITH 0 AS c SELECT * FROM mydb.filtered_table3 PREWHERE a >= 0 WHERE c >= 0") == TSV([[0, 1], [1, 0]]) + + assert node.query("WITH 0 AS c SELECT a, b, c FROM mydb.filtered_table3") == TSV([[0, 1], [1, 0], [0, 0]]) + assert node.query("WITH 0 AS c SELECT a, b, c FROM mydb.filtered_table3 WHERE c >= 0 AND a >= 0") == TSV([[0, 1], [1, 0], [0, 0]]) + assert node.query("WITH 0 AS c SELECT a, b, c FROM mydb.filtered_table3 
PREWHERE c >= 0 AND a >= 0") == TSV([[0, 1], [1, 0], [0, 0]]) + assert node.query("WITH 0 AS c SELECT a, b, c FROM mydb.filtered_table3 PREWHERE c >= 0 WHERE a >= 0") == TSV([[0, 1], [1, 0], [0, 0]]) + assert node.query("WITH 0 AS c SELECT a, b, c FROM mydb.filtered_table3 PREWHERE a >= 0 WHERE c >= 0") == TSV([[0, 1], [1, 0], [0, 0]]) def test_policy_from_users_xml_affects_only_user_assigned(): @@ -123,17 +144,17 @@ def test_policy_from_users_xml_affects_only_user_assigned(): def test_with_prewhere(): - copy_policy_xml('normal_filters.xml') - assert node.query("SELECT * FROM mydb.filtered_table2 WHERE a > 1") == TSV([[4, 5, 2, 1]]) + copy_policy_xml('normal_filter2_table2.xml') + assert node.query("SELECT * FROM mydb.filtered_table2 WHERE a > 1") == TSV([[4, 3, 2, 1]]) assert node.query("SELECT a FROM mydb.filtered_table2 WHERE a > 1") == TSV([[4]]) - assert node.query("SELECT a, b FROM mydb.filtered_table2 WHERE a > 1") == TSV([[4, 5]]) - assert node.query("SELECT b, c FROM mydb.filtered_table2 WHERE a > 1") == TSV([[5, 2]]) + assert node.query("SELECT a, b FROM mydb.filtered_table2 WHERE a > 1") == TSV([[4, 3]]) + assert node.query("SELECT b, c FROM mydb.filtered_table2 WHERE a > 1") == TSV([[3, 2]]) assert node.query("SELECT d FROM mydb.filtered_table2 WHERE a > 1") == TSV([[1]]) - assert node.query("SELECT * FROM mydb.filtered_table2 PREWHERE a > 1") == TSV([[4, 5, 2, 1]]) + assert node.query("SELECT * FROM mydb.filtered_table2 PREWHERE a > 1") == TSV([[4, 3, 2, 1]]) assert node.query("SELECT a FROM mydb.filtered_table2 PREWHERE a > 1") == TSV([[4]]) - assert node.query("SELECT a, b FROM mydb.filtered_table2 PREWHERE a > 1") == TSV([[4, 5]]) - assert node.query("SELECT b, c FROM mydb.filtered_table2 PREWHERE a > 1") == TSV([[5, 2]]) + assert node.query("SELECT a, b FROM mydb.filtered_table2 PREWHERE a > 1") == TSV([[4, 3]]) + assert node.query("SELECT b, c FROM mydb.filtered_table2 PREWHERE a > 1") == TSV([[3, 2]]) assert node.query("SELECT d FROM mydb.filtered_table2 PREWHERE a > 1") == TSV([[1]]) assert node.query("SELECT * FROM mydb.filtered_table2 PREWHERE a < 4 WHERE b < 10") == TSV([[1, 2, 3, 4]]) @@ -145,13 +166,22 @@ def test_with_prewhere(): assert node.query("SELECT c, d FROM mydb.filtered_table2 PREWHERE a < 4 WHERE b < 10") == TSV([[3, 4]]) +def test_with_throwif_in_where(): + copy_policy_xml('no_filters.xml') + assert 'expected' in node.query_and_get_error("SELECT * FROM mydb.filtered_table2 WHERE throwIf(a = 0, 'expected') = 0") + + copy_policy_xml('normal_filter2_table2.xml') + assert node.query("SELECT * FROM mydb.filtered_table2 WHERE throwIf(a = 0, 'pwned') = 0") == TSV([ + [1, 2, 3, 4], [4, 3, 2, 1]]) + + def test_with_throwif_in_prewhere(): copy_policy_xml('no_filters.xml') - assert 'expected' in node.query_and_get_error("SELECT throwIf(a = 0, 'expected') FROM mydb.filtered_table2 PREWHERE b < 10") + assert 'expected' in node.query_and_get_error("SELECT * FROM mydb.filtered_table2 PREWHERE throwIf(a = 0, 'expected') = 0") - copy_policy_xml('normal_filters.xml') - assert node.query("SELECT throwIf(a = 0, 'pwned') FROM mydb.filtered_table2 PREWHERE b < 10") == TSV([ - [4, 5, 2, 1], [1, 2, 3, 4]]) + copy_policy_xml('normal_filter2_table2.xml') + assert node.query("SELECT * FROM mydb.filtered_table2 PREWHERE throwIf(a = 0, 'pwned') = 0") == TSV([ + [1, 2, 3, 4], [4, 3, 2, 1]]) def test_change_of_users_xml_changes_row_policies(): @@ -176,6 +206,11 @@ def test_change_of_users_xml_changes_row_policies(): assert node.query("SELECT * FROM mydb.filtered_table2") == TSV([[0, 
0, 0, 0], [0, 0, 6, 0]]) assert node.query("SELECT * FROM mydb.filtered_table3") == TSV([[0, 1], [1, 0]]) + copy_policy_xml('normal_filter2_table2.xml') + assert node.query("SELECT * FROM mydb.filtered_table1") == TSV([[0, 0], [0, 1], [1, 0], [1, 1]]) + assert node.query("SELECT * FROM mydb.filtered_table2") == TSV([[1, 2, 3, 4], [4, 3, 2, 1]]) + assert node.query("SELECT * FROM mydb.filtered_table3") == TSV([[0, 0], [0, 1], [1, 0], [1, 1]]) + copy_policy_xml('no_filters.xml') assert node.query("SELECT * FROM mydb.filtered_table1") == TSV([[0, 0], [0, 1], [1, 0], [1, 1]]) assert node.query("SELECT * FROM mydb.filtered_table2") == TSV( From 5e5b3b80cee11b5c3c8e72c1ccc5f4cfb5be585e Mon Sep 17 00:00:00 2001 From: Denis Glazachev Date: Fri, 19 Feb 2021 19:46:52 +0400 Subject: [PATCH 0500/2357] Remove debug printouts --- src/Interpreters/InterpreterSelectQuery.cpp | 12 ------------ .../MergeTree/MergeTreeBaseSelectProcessor.cpp | 9 --------- 2 files changed, 21 deletions(-) diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index 826be1e5143..2d6a22cce2f 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -106,10 +106,6 @@ namespace ErrorCodes /// Assumes `storage` is set and the table filter (row-level security) is not empty. String InterpreterSelectQuery::generateFilterActions(ActionsDAGPtr & actions, const Names & prerequisite_columns) const { - // std::cerr << "----- InterpreterSelectQuery::generateFilterActions\n"; - // for (const auto & name : prerequisite_columns) - // std::cerr << name << std::endl; - const auto & db_name = table_id.getDatabaseName(); const auto & table_name = table_id.getTableName(); @@ -144,7 +140,6 @@ String InterpreterSelectQuery::generateFilterActions(ActionsDAGPtr & actions, co auto syntax_result = TreeRewriter(*context).analyzeSelect(query_ast, TreeRewriterResult({}, storage, metadata_snapshot)); SelectQueryExpressionAnalyzer analyzer(query_ast, syntax_result, *context, metadata_snapshot); actions = analyzer.simpleSelectActions(); - //std::cerr << actions-> return expr_list->children.at(0)->getColumnName(); } @@ -527,10 +522,6 @@ void InterpreterSelectQuery::buildQueryPlan(QueryPlan & query_plan) { executeImpl(query_plan, input, std::move(input_pipe)); - // WriteBufferFromOwnString buf; - // query_plan.explainPlan(buf, {.header = true, .actions = true}); - // std::cerr << buf.str(); - /// We must guarantee that result structure is the same as in getSampleBlock() if (!blocksHaveEqualStructure(query_plan.getCurrentDataStream().header, result_header)) { @@ -826,7 +817,6 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu const bool does_storage_support_prewhere = !input && !input_pipe && storage && storage->supportsPrewhere(); if (does_storage_support_prewhere && settings.optimize_move_to_prewhere) { - // std::cerr << "----- Moving row level filter to prewhere\n"; /// Execute row level filter in prewhere as a part of "move to prewhere" optimization. expressions.prewhere_info = std::make_shared( std::move(expressions.filter_info->actions), @@ -839,7 +829,6 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu else { /// Add row level security actions to prewhere. 
- // std::cerr << expressions.filter_info->actions->dumpDAG() << std::endl; expressions.prewhere_info->row_level_filter_actions = std::move(expressions.filter_info->actions); expressions.prewhere_info->row_level_column_name = std::move(expressions.filter_info->column_name); expressions.prewhere_info->row_level_filter_actions->projectInput(false); @@ -1658,7 +1647,6 @@ void InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum proc if (prewhere_info) { - // std::cerr << "-------- filling prewhere info \n"; query_info.prewhere_info = std::make_shared(); query_info.prewhere_info->prewhere_actions = std::make_shared(prewhere_info->prewhere_actions); diff --git a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp index 90da45cc6d8..4911f9982d5 100644 --- a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp @@ -334,13 +334,9 @@ void MergeTreeBaseSelectProcessor::executePrewhereActions(Block & block, const P { if (prewhere_info) { - // std::cerr << "0: " << block.dumpStructure() << std::endl; - if (prewhere_info->alias_actions) prewhere_info->alias_actions->execute(block); - // std::cerr << "1: " << block.dumpStructure() << std::endl; - if (prewhere_info->row_level_filter) { prewhere_info->row_level_filter->execute(block); @@ -351,13 +347,10 @@ void MergeTreeBaseSelectProcessor::executePrewhereActions(Block & block, const P ErrorCodes::LOGICAL_ERROR); } } - // std::cerr << "2: " << block.dumpStructure() << std::endl; if (prewhere_info->prewhere_actions) prewhere_info->prewhere_actions->execute(block); - // std::cerr << "3: " << block.dumpStructure() << std::endl; - auto & prewhere_column = block.getByName(prewhere_info->prewhere_column_name); if (!prewhere_column.type->canBeUsedInBooleanContext()) { @@ -372,8 +365,6 @@ void MergeTreeBaseSelectProcessor::executePrewhereActions(Block & block, const P auto & ctn = block.getByName(prewhere_info->prewhere_column_name); ctn.column = ctn.type->createColumnConst(block.rows(), 1u)->convertToFullColumnIfConst(); } - - // std::cerr << "4: " << block.dumpStructure() << std::endl; } } From ad374ec0953926af32227aea9744fc9c09da65ca Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 19 Feb 2021 19:05:26 +0300 Subject: [PATCH 0501/2357] Rename file --- src/Coordination/NuKeeperServer.cpp | 4 ++-- src/Coordination/NuKeeperServer.h | 4 ++-- ...ryStateManager.cpp => NuKeeperStateManager.cpp} | 14 +++++++------- ...MemoryStateManager.h => NuKeeperStateManager.h} | 6 +++--- src/Coordination/tests/gtest_for_build.cpp | 6 +++--- 5 files changed, 17 insertions(+), 17 deletions(-) rename src/Coordination/{InMemoryStateManager.cpp => NuKeeperStateManager.cpp} (88%) rename src/Coordination/{InMemoryStateManager.h => NuKeeperStateManager.h} (94%) diff --git a/src/Coordination/NuKeeperServer.cpp b/src/Coordination/NuKeeperServer.cpp index c2917e3ab76..c0dc3f85343 100644 --- a/src/Coordination/NuKeeperServer.cpp +++ b/src/Coordination/NuKeeperServer.cpp @@ -1,7 +1,7 @@ #include #include #include -#include +#include #include #include #include @@ -26,7 +26,7 @@ NuKeeperServer::NuKeeperServer( : server_id(server_id_) , coordination_settings(coordination_settings_) , state_machine(nuraft::cs_new(responses_queue_, coordination_settings)) - , state_manager(nuraft::cs_new(server_id, "test_keeper_server", config, coordination_settings)) + , state_manager(nuraft::cs_new(server_id, "test_keeper_server", config, coordination_settings)) , 
responses_queue(responses_queue_) { } diff --git a/src/Coordination/NuKeeperServer.h b/src/Coordination/NuKeeperServer.h index a8d269eb9eb..40f3efec76a 100644 --- a/src/Coordination/NuKeeperServer.h +++ b/src/Coordination/NuKeeperServer.h @@ -2,7 +2,7 @@ #include // Y_IGNORE #include -#include +#include #include #include #include @@ -20,7 +20,7 @@ private: nuraft::ptr state_machine; - nuraft::ptr state_manager; + nuraft::ptr state_manager; nuraft::raft_launcher launcher; diff --git a/src/Coordination/InMemoryStateManager.cpp b/src/Coordination/NuKeeperStateManager.cpp similarity index 88% rename from src/Coordination/InMemoryStateManager.cpp rename to src/Coordination/NuKeeperStateManager.cpp index 084ab043d12..14e8badd92f 100644 --- a/src/Coordination/InMemoryStateManager.cpp +++ b/src/Coordination/NuKeeperStateManager.cpp @@ -1,4 +1,4 @@ -#include +#include #include namespace DB @@ -9,7 +9,7 @@ namespace ErrorCodes extern const int RAFT_ERROR; } -InMemoryStateManager::InMemoryStateManager(int server_id_, const std::string & host, int port, const std::string & logs_path) +NuKeeperStateManager::NuKeeperStateManager(int server_id_, const std::string & host, int port, const std::string & logs_path) : my_server_id(server_id_) , my_port(port) , log_store(nuraft::cs_new(logs_path, 5000, true)) @@ -19,7 +19,7 @@ InMemoryStateManager::InMemoryStateManager(int server_id_, const std::string & h cluster_config->get_servers().push_back(peer_config); } -InMemoryStateManager::InMemoryStateManager( +NuKeeperStateManager::NuKeeperStateManager( int my_server_id_, const std::string & config_prefix, const Poco::Util::AbstractConfiguration & config, @@ -63,17 +63,17 @@ InMemoryStateManager::InMemoryStateManager( throw Exception(ErrorCodes::RAFT_ERROR, "At least one of servers should be able to start as leader (without )"); } -void InMemoryStateManager::loadLogStore(size_t start_log_index) +void NuKeeperStateManager::loadLogStore(size_t start_log_index) { log_store->init(start_log_index); } -void InMemoryStateManager::flushLogStore() +void NuKeeperStateManager::flushLogStore() { log_store->flush(); } -void InMemoryStateManager::save_config(const nuraft::cluster_config & config) +void NuKeeperStateManager::save_config(const nuraft::cluster_config & config) { // Just keep in memory in this example. // Need to write to disk here, if want to make it durable. @@ -81,7 +81,7 @@ void InMemoryStateManager::save_config(const nuraft::cluster_config & config) cluster_config = nuraft::cluster_config::deserialize(*buf); } -void InMemoryStateManager::save_state(const nuraft::srv_state & state) +void NuKeeperStateManager::save_state(const nuraft::srv_state & state) { // Just keep in memory in this example. // Need to write to disk here, if want to make it durable. 
diff --git a/src/Coordination/InMemoryStateManager.h b/src/Coordination/NuKeeperStateManager.h similarity index 94% rename from src/Coordination/InMemoryStateManager.h rename to src/Coordination/NuKeeperStateManager.h index c53f00702d4..66229a3b8d1 100644 --- a/src/Coordination/InMemoryStateManager.h +++ b/src/Coordination/NuKeeperStateManager.h @@ -10,16 +10,16 @@ namespace DB { -class InMemoryStateManager : public nuraft::state_mgr +class NuKeeperStateManager : public nuraft::state_mgr { public: - InMemoryStateManager( + NuKeeperStateManager( int server_id_, const std::string & config_prefix, const Poco::Util::AbstractConfiguration & config, const CoordinationSettingsPtr & coordination_settings); - InMemoryStateManager( + NuKeeperStateManager( int server_id_, const std::string & host, int port, diff --git a/src/Coordination/tests/gtest_for_build.cpp b/src/Coordination/tests/gtest_for_build.cpp index 457d0dbc52a..f871f39a906 100644 --- a/src/Coordination/tests/gtest_for_build.cpp +++ b/src/Coordination/tests/gtest_for_build.cpp @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include #include @@ -100,7 +100,7 @@ struct SimpliestRaftServer , port(port_) , endpoint(hostname + ":" + std::to_string(port)) , state_machine(nuraft::cs_new()) - , state_manager(nuraft::cs_new(server_id, hostname, port, logs_path)) + , state_manager(nuraft::cs_new(server_id, hostname, port, logs_path)) { state_manager->loadLogStore(1); nuraft::raft_params params; @@ -151,7 +151,7 @@ struct SimpliestRaftServer nuraft::ptr state_machine; // State manager. - nuraft::ptr state_manager; + nuraft::ptr state_manager; // Raft launcher. nuraft::raft_launcher launcher; From 5cfd687dfbd507f33f0042b5dd959c296ab98988 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Fri, 19 Feb 2021 19:22:47 +0300 Subject: [PATCH 0502/2357] fix --- programs/server/Server.cpp | 4 +--- src/Access/UsersConfigAccessStorage.cpp | 2 +- src/Common/Config/ConfigReloader.cpp | 8 ++++---- src/Common/Config/ConfigReloader.h | 6 +++--- .../integration/test_reload_auxiliary_zookeepers/test.py | 3 +-- 5 files changed, 10 insertions(+), 13 deletions(-) diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index 991cd9699f9..0786140b067 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -702,9 +702,8 @@ int Server::main(const std::vector & /*args*/) config().getString("path", ""), std::move(main_config_zk_node_cache), main_config_zk_changed_event, - [&](ConfigurationPtr config) + [&](ConfigurationPtr config, bool initial_loading) { - static bool initial_loading = true; Settings::checkNoSettingNamesAtTopLevel(*config, config_path); /// Limit on total memory usage @@ -761,7 +760,6 @@ int Server::main(const std::vector & /*args*/) global_context->reloadZooKeeperIfChanged(config); global_context->reloadAuxiliaryZooKeepersConfigIfChanged(config); - initial_loading = false; } global_context->updateStorageConfiguration(*config); diff --git a/src/Access/UsersConfigAccessStorage.cpp b/src/Access/UsersConfigAccessStorage.cpp index b3f151c3030..33efd71d0d0 100644 --- a/src/Access/UsersConfigAccessStorage.cpp +++ b/src/Access/UsersConfigAccessStorage.cpp @@ -518,7 +518,7 @@ void UsersConfigAccessStorage::load( preprocessed_dir, zkutil::ZooKeeperNodeCache(get_zookeeper_function), std::make_shared(), - [&](Poco::AutoPtr new_config) + [&](Poco::AutoPtr new_config, bool /*initial_loading*/) { parseFromConfig(*new_config); Settings::checkNoSettingNamesAtTopLevel(*new_config, users_config_path); diff --git 
a/src/Common/Config/ConfigReloader.cpp b/src/Common/Config/ConfigReloader.cpp index 677448e03ae..afff08e82bb 100644 --- a/src/Common/Config/ConfigReloader.cpp +++ b/src/Common/Config/ConfigReloader.cpp @@ -27,7 +27,7 @@ ConfigReloader::ConfigReloader( , updater(std::move(updater_)) { if (!already_loaded) - reloadIfNewer(/* force = */ true, /* throw_on_error = */ true, /* fallback_to_preprocessed = */ true); + reloadIfNewer(/* force = */ true, /* throw_on_error = */ true, /* fallback_to_preprocessed = */ true, /* initial_loading = */ true); } @@ -66,7 +66,7 @@ void ConfigReloader::run() if (quit) return; - reloadIfNewer(zk_changed, /* throw_on_error = */ false, /* fallback_to_preprocessed = */ false); + reloadIfNewer(zk_changed, /* throw_on_error = */ false, /* fallback_to_preprocessed = */ false, /* initial_loading = */ false); } catch (...) { @@ -76,7 +76,7 @@ void ConfigReloader::run() } } -void ConfigReloader::reloadIfNewer(bool force, bool throw_on_error, bool fallback_to_preprocessed) +void ConfigReloader::reloadIfNewer(bool force, bool throw_on_error, bool fallback_to_preprocessed, bool initial_loading) { std::lock_guard lock(reload_mutex); @@ -131,7 +131,7 @@ void ConfigReloader::reloadIfNewer(bool force, bool throw_on_error, bool fallbac try { - updater(loaded_config.configuration); + updater(loaded_config.configuration, initial_loading); } catch (...) { diff --git a/src/Common/Config/ConfigReloader.h b/src/Common/Config/ConfigReloader.h index 489f062e2fe..2e4399d3c4e 100644 --- a/src/Common/Config/ConfigReloader.h +++ b/src/Common/Config/ConfigReloader.h @@ -27,7 +27,7 @@ class Context; class ConfigReloader { public: - using Updater = std::function; + using Updater = std::function; /** include_from_path is usually /etc/metrika.xml (i.e. value of tag) */ @@ -46,12 +46,12 @@ public: void start(); /// Reload immediately. For SYSTEM RELOAD CONFIG query. 
- void reload() { reloadIfNewer(/* force */ true, /* throw_on_error */ true, /* fallback_to_preprocessed */ false); } + void reload() { reloadIfNewer(/* force */ true, /* throw_on_error */ true, /* fallback_to_preprocessed */ false, /* initial_loading = */ false); } private: void run(); - void reloadIfNewer(bool force, bool throw_on_error, bool fallback_to_preprocessed); + void reloadIfNewer(bool force, bool throw_on_error, bool fallback_to_preprocessed, bool initial_loading); struct FileWithTimestamp; diff --git a/tests/integration/test_reload_auxiliary_zookeepers/test.py b/tests/integration/test_reload_auxiliary_zookeepers/test.py index 92c66c890fc..3d4b1848755 100644 --- a/tests/integration/test_reload_auxiliary_zookeepers/test.py +++ b/tests/integration/test_reload_auxiliary_zookeepers/test.py @@ -62,8 +62,7 @@ def test_reload_auxiliary_zookeepers(start_cluster): """ node.replace_config("/etc/clickhouse-server/conf.d/zookeeper.xml", new_config) - # Hopefully it has finished the configuration reload - time.sleep(2) + node.query("SYSTEM RELOAD CONFIG") node.query( "ALTER TABLE simple2 FETCH PARTITION '2020-08-27' FROM 'zookeeper2:/clickhouse/tables/0/simple';" From b84112a6039589c9a5e2399d4b0efc14d4adf1fc Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Fri, 19 Feb 2021 19:25:50 +0300 Subject: [PATCH 0503/2357] Function sumMap decimal fix --- src/AggregateFunctions/AggregateFunctionSumMap.h | 7 ++++++- .../queries/0_stateless/00502_sum_map.reference | 2 ++ tests/queries/0_stateless/00502_sum_map.sql | 16 ++++++++++++++++ 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/src/AggregateFunctions/AggregateFunctionSumMap.h b/src/AggregateFunctions/AggregateFunctionSumMap.h index 3079da36cda..f88a1468732 100644 --- a/src/AggregateFunctions/AggregateFunctionSumMap.h +++ b/src/AggregateFunctions/AggregateFunctionSumMap.h @@ -115,7 +115,12 @@ public: "Values for {} are expected to be Numeric, Float or Decimal, passed type {}", getName(), value_type->getName()}; - result_type = value_type_without_nullable->promoteNumericType(); + WhichDataType value_type_to_check(value_type); + + if (value_type_to_check.isDecimal()) + result_type = value_type_without_nullable; + else + result_type = value_type_without_nullable->promoteNumericType(); } types.emplace_back(std::make_shared(result_type)); diff --git a/tests/queries/0_stateless/00502_sum_map.reference b/tests/queries/0_stateless/00502_sum_map.reference index 0002c43945a..c38fb2ec7d6 100644 --- a/tests/queries/0_stateless/00502_sum_map.reference +++ b/tests/queries/0_stateless/00502_sum_map.reference @@ -22,3 +22,5 @@ ([1.01],[1]) (['a','b'],[1,2]) (['a','ab','abc'],[3,2,1]) +([1,2,3,4,5,6,7,8],[1.00000,2.00000,6.00000,8.00000,10.00000,12.00000,7.00000,8.00000]) +([1,2,3,4,5,6,7,8],[1.00000,2.00000,6.00000,8.00000,10.00000,12.00000,7.00000,8.00000]) diff --git a/tests/queries/0_stateless/00502_sum_map.sql b/tests/queries/0_stateless/00502_sum_map.sql index 021aaf3cd3b..51007a9c78a 100644 --- a/tests/queries/0_stateless/00502_sum_map.sql +++ b/tests/queries/0_stateless/00502_sum_map.sql @@ -38,3 +38,19 @@ select sumMap(val, cnt) from ( SELECT [ CAST(1.01, 'Decimal(10,2)') ] as val, [1 select sumMap(val, cnt) from ( SELECT [ CAST('a', 'FixedString(1)'), CAST('b', 'FixedString(1)' ) ] as val, [1, 2] as cnt ); select sumMap(val, cnt) from ( SELECT [ CAST('abc', 'String'), CAST('ab', 'String'), CAST('a', 'String') ] as val, [1, 2, 3] as cnt ); + +DROP TABLE IF EXISTS sum_map_decimal; + +CREATE TABLE sum_map_decimal( + statusMap Nested( + goal_id 
UInt16, + revenue Decimal32(5) + ) +) ENGINE = Log; + +INSERT INTO sum_map_decimal VALUES ([1, 2, 3], [1.0, 2.0, 3.0]), ([3, 4, 5], [3.0, 4.0, 5.0]), ([4, 5, 6], [4.0, 5.0, 6.0]), ([6, 7, 8], [6.0, 7.0, 8.0]); + +SELECT sumMap(statusMap.goal_id, statusMap.revenue) FROM sum_map_decimal; +SELECT sumMapWithOverflow(statusMap.goal_id, statusMap.revenue) FROM sum_map_decimal; + +DROP TABLE sum_map_decimal; From 0b5213c80d52595eb66ce8a992381073ac290e9a Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Fri, 19 Feb 2021 19:49:19 +0300 Subject: [PATCH 0504/2357] Added comment --- src/AggregateFunctions/AggregateFunctionSumMap.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/AggregateFunctions/AggregateFunctionSumMap.h b/src/AggregateFunctions/AggregateFunctionSumMap.h index f88a1468732..9c2cdb41844 100644 --- a/src/AggregateFunctions/AggregateFunctionSumMap.h +++ b/src/AggregateFunctions/AggregateFunctionSumMap.h @@ -116,7 +116,9 @@ public: getName(), value_type->getName()}; WhichDataType value_type_to_check(value_type); - + + /// Do not promote decimal because of implementation issues of this function design + /// If we decide to make this function more efficient we should promote decimal type during summ if (value_type_to_check.isDecimal()) result_type = value_type_without_nullable; else From fc03c1013cc73094ebb592623c60037acd196410 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Fri, 19 Feb 2021 20:42:51 +0300 Subject: [PATCH 0505/2357] Fixed style check --- src/AggregateFunctions/AggregateFunctionSumMap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/AggregateFunctions/AggregateFunctionSumMap.h b/src/AggregateFunctions/AggregateFunctionSumMap.h index 9c2cdb41844..f6a473546f9 100644 --- a/src/AggregateFunctions/AggregateFunctionSumMap.h +++ b/src/AggregateFunctions/AggregateFunctionSumMap.h @@ -116,7 +116,7 @@ public: getName(), value_type->getName()}; WhichDataType value_type_to_check(value_type); - + /// Do not promote decimal because of implementation issues of this function design /// If we decide to make this function more efficient we should promote decimal type during summ if (value_type_to_check.isDecimal()) From 252bcccddaed5729e2a02fbd610209e0f7de5543 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 19 Feb 2021 21:32:39 +0300 Subject: [PATCH 0506/2357] Just little better --- src/Interpreters/Aggregator.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Interpreters/Aggregator.cpp b/src/Interpreters/Aggregator.cpp index 8040091256c..abff6f21acf 100644 --- a/src/Interpreters/Aggregator.cpp +++ b/src/Interpreters/Aggregator.cpp @@ -558,7 +558,7 @@ void NO_INLINE Aggregator::executeImplBatch( /// Generic case. - PODArray places(rows); + std::unique_ptr places(new AggregateDataPtr[rows]); /// For all rows. 
for (size_t i = 0; i < rows; ++i) @@ -589,9 +589,9 @@ void NO_INLINE Aggregator::executeImplBatch( for (AggregateFunctionInstruction * inst = aggregate_instructions; inst->that; ++inst) { if (inst->offsets) - inst->batch_that->addBatchArray(rows, places.data(), inst->state_offset, inst->batch_arguments, inst->offsets, aggregates_pool); + inst->batch_that->addBatchArray(rows, places.get(), inst->state_offset, inst->batch_arguments, inst->offsets, aggregates_pool); else - inst->batch_that->addBatch(rows, places.data(), inst->state_offset, inst->batch_arguments, aggregates_pool); + inst->batch_that->addBatch(rows, places.get(), inst->state_offset, inst->batch_arguments, aggregates_pool); } } From 66e775ef8811f1d1bba30a4369872b8ae04e0c54 Mon Sep 17 00:00:00 2001 From: Denis Zhuravlev Date: Fri, 19 Feb 2021 14:53:34 -0400 Subject: [PATCH 0507/2357] test for decimal ( p , s) in dictionaries --- .../01721_dictionary_decimal_p_s.reference | 10 +++ .../01721_dictionary_decimal_p_s.sql | 78 +++++++++++++++++++ 2 files changed, 88 insertions(+) create mode 100644 tests/queries/0_stateless/01721_dictionary_decimal_p_s.reference create mode 100644 tests/queries/0_stateless/01721_dictionary_decimal_p_s.sql diff --git a/tests/queries/0_stateless/01721_dictionary_decimal_p_s.reference b/tests/queries/0_stateless/01721_dictionary_decimal_p_s.reference new file mode 100644 index 00000000000..066b4bd1d97 --- /dev/null +++ b/tests/queries/0_stateless/01721_dictionary_decimal_p_s.reference @@ -0,0 +1,10 @@ +-------- 42 -------- +42 14.0000 14.00000000 14.00000000 14.0000000000000000618637523926765281280 +42 14.0000 14.00000000 14.00000000 +14.0000 14.00000000 14.00000000 +-------- 4999 -------- +4999 1666.3333 1666.33333333 1666.33333333 1633.3553612205046244471093725648757194800 +4999 1666.3333 1666.33333333 1666.33333333 +1666.3333 1666.33333333 1666.33333333 +-------- 5000 -------- +0.1100 0.11000000 0.11000000 diff --git a/tests/queries/0_stateless/01721_dictionary_decimal_p_s.sql b/tests/queries/0_stateless/01721_dictionary_decimal_p_s.sql new file mode 100644 index 00000000000..0451d455009 --- /dev/null +++ b/tests/queries/0_stateless/01721_dictionary_decimal_p_s.sql @@ -0,0 +1,78 @@ +set allow_experimental_bigint_types=1; +drop database if exists db_01721; +drop table if exists db_01721.table_decimal_dict; +drop dictionary if exists db_01721.decimal_dict; + + +create database db_01721; + +CREATE TABLE db_01721.table_decimal_dict( +KeyField UInt64, +Decimal32_ Decimal(5,4), +Decimal64_ Decimal(18,8), +Decimal128_ Decimal(25,8), +Decimal256_ Decimal(76,37) +) +ENGINE = Memory; + +insert into db_01721.table_decimal_dict +select number, + number / 3, + number / 3, + number / 3, + number / 3 +from numbers(5000); + + +CREATE DICTIONARY IF NOT EXISTS db_01721.decimal_dict ( + KeyField UInt64 DEFAULT 9999999, + Decimal32_ Decimal(5,4) DEFAULT 0.11, + Decimal64_ Decimal(18,8) DEFAULT 0.11, + Decimal128_ Decimal(25,8) DEFAULT 0.11 +-- ,Decimal256_ Decimal256(37) DEFAULT 0.11 +) +PRIMARY KEY KeyField +SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'table_decimal_dict' DB 'db_01721')) +LIFETIME(0) LAYOUT(SPARSE_HASHED); + +select '-------- 42 --------'; + +SELECT * from db_01721.table_decimal_dict where KeyField = 42; + +SELECT * from db_01721.decimal_dict where KeyField = 42; + +SELECT dictGet('db_01721.decimal_dict', 'Decimal32_', toUInt64(42)), + dictGet('db_01721.decimal_dict', 'Decimal64_', toUInt64(42)), + dictGet('db_01721.decimal_dict', 'Decimal128_', toUInt64(42)) + -- 
,dictGet('db_01721.decimal_dict', 'Decimal256_', toUInt64(42)) +; + + +select '-------- 4999 --------'; + +SELECT * from db_01721.table_decimal_dict where KeyField = 4999; + +SELECT * from db_01721.decimal_dict where KeyField = 4999; + +SELECT dictGet('db_01721.decimal_dict', 'Decimal32_', toUInt64(4999)), + dictGet('db_01721.decimal_dict', 'Decimal64_', toUInt64(4999)), + dictGet('db_01721.decimal_dict', 'Decimal128_', toUInt64(4999)) + --,dictGet('db_01721.decimal_dict', 'Decimal256_', toUInt64(4999)) +; + +select '-------- 5000 --------'; + +SELECT * from db_01721.table_decimal_dict where KeyField = 5000; + +SELECT * from db_01721.decimal_dict where KeyField = 5000; + +SELECT dictGet('db_01721.decimal_dict', 'Decimal32_', toUInt64(5000)), + dictGet('db_01721.decimal_dict', 'Decimal64_', toUInt64(5000)), + dictGet('db_01721.decimal_dict', 'Decimal128_', toUInt64(5000)) + --,dictGet('db_01721.decimal_dict', 'Decimal256_', toUInt64(5000)) +; + +drop table if exists table_decimal_dict; +drop dictionary if exists cache_dict; +drop database if exists db_01721; + From fba1c7fcc165b1d84907a4a1ee37c809307cbf32 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Fri, 19 Feb 2021 21:48:58 +0300 Subject: [PATCH 0508/2357] Fix uncaught exception when HTTP client goes away Even after #20464 it was still possible, for example [1]. 2021.02.19 11:40:21.886191 [ 68373 ] {} DynamicQueryHandler: Request URI: /?database=test_ds2d6y&log_comment=/usr/share/clickhouse-test/queries/0_stateless/01302_aggregate_state_exception_memory_leak.sh&enable_http_compression=1&http_zlib_compression_level=1 2021.02.19 11:41:35.289940 [ 365 ] {} BaseDaemon: (version 21.3.1.6058, build id: 8D46D65205E2C8B7FE408A0B4EC76CA0483F9E92) (from thread 68373) Terminate called for uncaught exception: Code: 24, e.displayText() = DB::Exception: Cannot write to ostream at offset 262568, Stack trace (when copying this message, always include the lines below): 0. ./obj-x86_64-linux-gnu/../contrib/libcxx/include/exception:0: Poco::Exception::Exception(std::__1::basic_string, std::__1::allocator > const&, int) @ 0x15b3c7db in /usr/bin/clickhouse 1. ./obj-x86_64-linux-gnu/../src/Common/Exception.cpp:56: DB::Exception::Exception(std::__1::basic_string, std::__1::allocator > const&, int, bool) @ 0x8aba66e in /usr/bin/clickhouse 2. ./obj-x86_64-linux-gnu/../src/IO/WriteBufferFromOStream.cpp:0: DB::WriteBufferFromOStream::nextImpl() @ 0x8b8c105 in /usr/bin/clickhouse 3. ./obj-x86_64-linux-gnu/../src/IO/BufferBase.h:39: DB::WriteBufferFromOStream::~WriteBufferFromOStream() @ 0x8b8c537 in /usr/bin/clickhouse 4. ./obj-x86_64-linux-gnu/../src/IO/WriteBufferFromOStream.cpp:44: DB::Write [1]: https://clickhouse-test-reports.s3.yandex.net/16481/5d150cce4778dd14f58dcff67435bdec1efa155b/stress_test_(thread).html#fail1 And according to this partial stacktrace it seems that the dtor of WriteBufferFromOStream was called from WriteBufferFromHTTPServerResponse, since the class name starts from DB::Write* The problem is that if first time WriteBufferFromOStream::next() fails, it will reset position to make next write no-op, however WriteBufferFromHTTPServerResponse::next() will set position to available buffer back, and next() will throw again, but this time it can be from dtor. 
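
For reference, the general shape of the fix is: flush explicitly in finalize(), make sure the
destructor never tries to write again, and let the exception propagate from finalize() rather
than from the destructor. A minimal standalone sketch of this pattern follows (illustrative
only — DemoHTTPBuffer and flushPending are hypothetical names, not the actual
WriteBufferFromHTTPServerResponse code; the real change is in the diff below):

    #include <memory>
    #include <ostream>

    class DemoHTTPBuffer
    {
    public:
        explicit DemoHTTPBuffer(std::unique_ptr<std::ostream> out_) : out(std::move(out_)) {}

        void finalize()
        {
            try
            {
                flushPending();   /// may throw, e.g. if the HTTP client has already gone away
                out.reset();      /// nothing left for the destructor to write
            }
            catch (...)
            {
                out.reset();      /// drop the stream so the destructor cannot write (and throw) again
                throw;            /// report the failure from finalize(), not from ~DemoHTTPBuffer()
            }
        }

        ~DemoHTTPBuffer()
        {
            try
            {
                finalize();       /// best effort; after a failed finalize() `out` is already empty
            }
            catch (...)
            {
                /// swallow: an exception escaping a destructor calls std::terminate
            }
        }

    private:
        void flushPending()
        {
            if (out)
                out->flush();
        }

        std::unique_ptr<std::ostream> out;
    };

The key point is that the only place allowed to throw is the explicit finalize() call on the
normal path; once it has failed, the buffer is left in a state where the destructor has nothing
left to do. The patch below applies the same idea to WriteBufferFromHTTPServerResponse::finalize().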
--- .../HTTP/WriteBufferFromHTTPServerResponse.cpp | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/Server/HTTP/WriteBufferFromHTTPServerResponse.cpp b/src/Server/HTTP/WriteBufferFromHTTPServerResponse.cpp index 86133fc2ffe..81f8cc30468 100644 --- a/src/Server/HTTP/WriteBufferFromHTTPServerResponse.cpp +++ b/src/Server/HTTP/WriteBufferFromHTTPServerResponse.cpp @@ -168,12 +168,18 @@ void WriteBufferFromHTTPServerResponse::onProgress(const Progress & progress) void WriteBufferFromHTTPServerResponse::finalize() { - next(); - if (out) + try { - out->next(); + next(); out.reset(); } + catch (...) + { + /// Avoid calling WriteBufferFromOStream::next() from dtor + /// (via WriteBufferFromHTTPServerResponse::next()) + out.reset(); + throw; + } if (!offset()) { From 0f77b6fd9585303162c5386a5b660d5448470d26 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 19 Feb 2021 22:01:45 +0300 Subject: [PATCH 0509/2357] Even more better --- src/Interpreters/AggregationCommon.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/Interpreters/AggregationCommon.h b/src/Interpreters/AggregationCommon.h index aafec9a7929..e896b0e14df 100644 --- a/src/Interpreters/AggregationCommon.h +++ b/src/Interpreters/AggregationCommon.h @@ -271,9 +271,13 @@ static T inline packFixedShuffle( size_t idx, const uint8_t * __restrict masks) { - __m128i res{}; + assert(num_srcs > 0); - for (size_t i = 0; i < num_srcs; ++i) + __m128i res = _mm_shuffle_epi8( + _mm_loadu_si128(reinterpret_cast(srcs[0] + elem_sizes[0] * idx)), + _mm_loadu_si128(reinterpret_cast(masks))); + + for (size_t i = 1; i < num_srcs; ++i) { res = _mm_xor_si128(res, _mm_shuffle_epi8( From 7ee72dfd0c46f0884c446003dfd3676644f6b19e Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 19 Feb 2021 22:24:20 +0300 Subject: [PATCH 0510/2357] Missed tests --- .../configs/use_test_keeper.xml | 8 ++ .../__init__.py | 1 + .../configs/enable_test_keeper1.xml | 39 ++++++++ .../configs/enable_test_keeper2.xml | 39 ++++++++ .../configs/enable_test_keeper3.xml | 39 ++++++++ .../configs/log_conf.xml | 12 +++ .../configs/use_test_keeper.xml | 16 +++ .../test.py | 98 +++++++++++++++++++ 8 files changed, 252 insertions(+) create mode 100644 tests/integration/test_testkeeper_back_to_back/configs/use_test_keeper.xml create mode 100644 tests/integration/test_testkeeper_persistent_log_multinode/__init__.py create mode 100644 tests/integration/test_testkeeper_persistent_log_multinode/configs/enable_test_keeper1.xml create mode 100644 tests/integration/test_testkeeper_persistent_log_multinode/configs/enable_test_keeper2.xml create mode 100644 tests/integration/test_testkeeper_persistent_log_multinode/configs/enable_test_keeper3.xml create mode 100644 tests/integration/test_testkeeper_persistent_log_multinode/configs/log_conf.xml create mode 100644 tests/integration/test_testkeeper_persistent_log_multinode/configs/use_test_keeper.xml create mode 100644 tests/integration/test_testkeeper_persistent_log_multinode/test.py diff --git a/tests/integration/test_testkeeper_back_to_back/configs/use_test_keeper.xml b/tests/integration/test_testkeeper_back_to_back/configs/use_test_keeper.xml new file mode 100644 index 00000000000..12dc7fd9447 --- /dev/null +++ b/tests/integration/test_testkeeper_back_to_back/configs/use_test_keeper.xml @@ -0,0 +1,8 @@ + + + + node1 + 9181 + + + diff --git a/tests/integration/test_testkeeper_persistent_log_multinode/__init__.py b/tests/integration/test_testkeeper_persistent_log_multinode/__init__.py new 
file mode 100644 index 00000000000..e5a0d9b4834 --- /dev/null +++ b/tests/integration/test_testkeeper_persistent_log_multinode/__init__.py @@ -0,0 +1 @@ +#!/usr/bin/env python3 diff --git a/tests/integration/test_testkeeper_persistent_log_multinode/configs/enable_test_keeper1.xml b/tests/integration/test_testkeeper_persistent_log_multinode/configs/enable_test_keeper1.xml new file mode 100644 index 00000000000..a47e5eae09a --- /dev/null +++ b/tests/integration/test_testkeeper_persistent_log_multinode/configs/enable_test_keeper1.xml @@ -0,0 +1,39 @@ + + + 9181 + 1 + /var/lib/clickhouse/coordination/log + + + 5000 + 10000 + trace + + + + + 1 + node1 + 44444 + true + 3 + + + 2 + node2 + 44444 + true + true + 2 + + + 3 + node3 + 44444 + true + true + 1 + + + + diff --git a/tests/integration/test_testkeeper_persistent_log_multinode/configs/enable_test_keeper2.xml b/tests/integration/test_testkeeper_persistent_log_multinode/configs/enable_test_keeper2.xml new file mode 100644 index 00000000000..18681f0dc95 --- /dev/null +++ b/tests/integration/test_testkeeper_persistent_log_multinode/configs/enable_test_keeper2.xml @@ -0,0 +1,39 @@ + + + 9181 + 2 + /var/lib/clickhouse/coordination/log + + + 5000 + 10000 + trace + + + + + 1 + node1 + 44444 + true + 3 + + + 2 + node2 + 44444 + true + true + 2 + + + 3 + node3 + 44444 + true + true + 1 + + + + diff --git a/tests/integration/test_testkeeper_persistent_log_multinode/configs/enable_test_keeper3.xml b/tests/integration/test_testkeeper_persistent_log_multinode/configs/enable_test_keeper3.xml new file mode 100644 index 00000000000..184d3724219 --- /dev/null +++ b/tests/integration/test_testkeeper_persistent_log_multinode/configs/enable_test_keeper3.xml @@ -0,0 +1,39 @@ + + + 9181 + 3 + /var/lib/clickhouse/coordination/log + + + 5000 + 10000 + trace + + + + + 1 + node1 + 44444 + true + 3 + + + 2 + node2 + 44444 + true + true + 2 + + + 3 + node3 + 44444 + true + true + 1 + + + + diff --git a/tests/integration/test_testkeeper_persistent_log_multinode/configs/log_conf.xml b/tests/integration/test_testkeeper_persistent_log_multinode/configs/log_conf.xml new file mode 100644 index 00000000000..318a6bca95d --- /dev/null +++ b/tests/integration/test_testkeeper_persistent_log_multinode/configs/log_conf.xml @@ -0,0 +1,12 @@ + + 3 + + trace + /var/log/clickhouse-server/log.log + /var/log/clickhouse-server/log.err.log + 1000M + 10 + /var/log/clickhouse-server/stderr.log + /var/log/clickhouse-server/stdout.log + + diff --git a/tests/integration/test_testkeeper_persistent_log_multinode/configs/use_test_keeper.xml b/tests/integration/test_testkeeper_persistent_log_multinode/configs/use_test_keeper.xml new file mode 100644 index 00000000000..b6139005d2f --- /dev/null +++ b/tests/integration/test_testkeeper_persistent_log_multinode/configs/use_test_keeper.xml @@ -0,0 +1,16 @@ + + + + node1 + 9181 + + + node2 + 9181 + + + node3 + 9181 + + + diff --git a/tests/integration/test_testkeeper_persistent_log_multinode/test.py b/tests/integration/test_testkeeper_persistent_log_multinode/test.py new file mode 100644 index 00000000000..cb9cf5a59d1 --- /dev/null +++ b/tests/integration/test_testkeeper_persistent_log_multinode/test.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python3 +import pytest +from helpers.cluster import ClickHouseCluster +import random +import string +import os +import time + +cluster = ClickHouseCluster(__file__) +node1 = cluster.add_instance('node1', main_configs=['configs/enable_test_keeper1.xml', 'configs/log_conf.xml', 'configs/use_test_keeper.xml'], 
stay_alive=True) +node2 = cluster.add_instance('node2', main_configs=['configs/enable_test_keeper2.xml', 'configs/log_conf.xml', 'configs/use_test_keeper.xml'], stay_alive=True) +node3 = cluster.add_instance('node3', main_configs=['configs/enable_test_keeper3.xml', 'configs/log_conf.xml', 'configs/use_test_keeper.xml'], stay_alive=True) + +from kazoo.client import KazooClient, KazooState + +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster.start() + + yield cluster + + finally: + cluster.shutdown() + +def get_fake_zk(nodename, timeout=30.0): + _fake_zk_instance = KazooClient(hosts=cluster.get_instance_ip(nodename) + ":9181", timeout=timeout) + def reset_listener(state): + nonlocal _fake_zk_instance + print("Fake zk callback called for state", state) + if state != KazooState.CONNECTED: + _fake_zk_instance._reset() + + _fake_zk_instance.add_listener(reset_listener) + _fake_zk_instance.start() + return _fake_zk_instance + +def stop_zk(zk): + try: + if zk: + zk.stop() + zk.close() + except: + pass + +def test_restart_multinode(started_cluster): + try: + node1_zk = node2_zk = node3_zk = None + + node1_zk = get_fake_zk("node1") + node2_zk = get_fake_zk("node2") + node3_zk = get_fake_zk("node3") + + for i in range(100): + node1_zk.create("/test_read_write_multinode_node" + str(i), ("somedata" + str(i)).encode()) + + for i in range(100): + if i % 10 == 0: + node1_zk.delete("/test_read_write_multinode_node" + str(i)) + + node2_zk.sync("/test_read_write_multinode_node0") + node3_zk.sync("/test_read_write_multinode_node0") + + for i in range(100): + if i % 10 != 0: + assert node2_zk.get("/test_read_write_multinode_node" + str(i))[0] == ("somedata" + str(i)).encode() + assert node3_zk.get("/test_read_write_multinode_node" + str(i))[0] == ("somedata" + str(i)).encode() + else: + assert node2_zk.exists("/test_read_write_multinode_node" + str(i)) is None + assert node3_zk.exists("/test_read_write_multinode_node" + str(i)) is None + + finally: + for zk in [node1_zk, node2_zk, node3_zk]: + stop_zk(zk) + + node1.restart_clickhouse(kill=True) + node2.restart_clickhouse(kill=True) + node3.restart_clickhouse(kill=True) + for i in range(100): + try: + node1_zk = get_fake_zk("node1") + node2_zk = get_fake_zk("node2") + node3_zk = get_fake_zk("node3") + for i in range(100): + if i % 10 != 0: + assert node1_zk.get("/test_read_write_multinode_node" + str(i))[0] == ("somedata" + str(i)).encode() + assert node2_zk.get("/test_read_write_multinode_node" + str(i))[0] == ("somedata" + str(i)).encode() + assert node3_zk.get("/test_read_write_multinode_node" + str(i))[0] == ("somedata" + str(i)).encode() + else: + assert node1_zk.exists("/test_read_write_multinode_node" + str(i)) is None + assert node2_zk.exists("/test_read_write_multinode_node" + str(i)) is None + assert node3_zk.exists("/test_read_write_multinode_node" + str(i)) is None + break + except Exception as ex: + print("Got exception as ex", ex) + finally: + for zk in [node1_zk, node2_zk, node3_zk]: + stop_zk(zk) From 057c87f3225f7437debd9f947e744cda6bf7365e Mon Sep 17 00:00:00 2001 From: tavplubix Date: Fri, 19 Feb 2021 22:32:50 +0300 Subject: [PATCH 0511/2357] Update test.py --- tests/integration/test_reload_auxiliary_zookeepers/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_reload_auxiliary_zookeepers/test.py b/tests/integration/test_reload_auxiliary_zookeepers/test.py index 3d4b1848755..1b14408bc12 100644 --- a/tests/integration/test_reload_auxiliary_zookeepers/test.py +++ 
b/tests/integration/test_reload_auxiliary_zookeepers/test.py @@ -80,7 +80,7 @@ def test_reload_auxiliary_zookeepers(start_cluster):
""" node.replace_config("/etc/clickhouse-server/conf.d/zookeeper.xml", new_config) - time.sleep(2) + node.query("SYSTEM RELOAD CONFIG") with pytest.raises(QueryRuntimeException): node.query( "ALTER TABLE simple2 FETCH PARTITION '2020-08-27' FROM 'zookeeper2:/clickhouse/tables/0/simple';" From f5fc082ed0e4b3d9f412f6ecfb5db0ce49469d3e Mon Sep 17 00:00:00 2001 From: tavplubix Date: Fri, 19 Feb 2021 22:39:42 +0300 Subject: [PATCH 0512/2357] Update run.sh --- docker/test/stress/run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/test/stress/run.sh b/docker/test/stress/run.sh index 963b204c4c0..df58a23794e 100755 --- a/docker/test/stress/run.sh +++ b/docker/test/stress/run.sh @@ -114,4 +114,4 @@ mv /var/log/clickhouse-server/stderr.log /test_output/ # Write check result into check_status.tsv clickhouse-local --structure "test String, res String" -q "SELECT 'failure', test FROM table WHERE res != 'OK' LIMIT 1" < /test_output/test_results.tsv > /test_output/check_status.tsv -[ -s /test_output/check_status.tsv ] || echo -e "success\tNo errors found" +[ -s /test_output/check_status.tsv ] || echo -e "success\tNo errors found" > /test_output/check_status.tsv From 7474a7e3ca139f1a4e88e83af011b304ebdcaf3c Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Fri, 19 Feb 2021 22:42:40 +0300 Subject: [PATCH 0513/2357] Increase buffer for uncaught exception / std::terminate Use PIPE_BUF over some magic number 1024 in terminate_handler, since according to pipe(7): PIPE_BUF POSIX.1 says that write(2)s of less than PIPE_BUF bytes must be atomic Also note that 1024, is too small, especially for C++ stacktraces (and especially for debug builds, that contains lots of non-inlined helpers for various ptrs). --- base/daemon/BaseDaemon.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/base/daemon/BaseDaemon.cpp b/base/daemon/BaseDaemon.cpp index db7019d3572..248ffdd4d10 100644 --- a/base/daemon/BaseDaemon.cpp +++ b/base/daemon/BaseDaemon.cpp @@ -416,7 +416,7 @@ static void sanitizerDeathCallback() else log_message = "Terminate called without an active exception"; - static const size_t buf_size = 1024; + static const size_t buf_size = PIPE_BUF; if (log_message.size() > buf_size - 16) log_message.resize(buf_size - 16); From 2ab643170dada778ed2914d97077207f0caa06bc Mon Sep 17 00:00:00 2001 From: Denis Glazachev Date: Sat, 20 Feb 2021 00:45:19 +0400 Subject: [PATCH 0514/2357] Add 'SETTINGS optimize_move_to_prewhere = 0' in WHERE-only SELECTs --- tests/integration/test_row_policy/test.py | 44 +++++++++---------- .../prewhere_with_row_level_filter.xml | 2 +- 2 files changed, 23 insertions(+), 23 deletions(-) diff --git a/tests/integration/test_row_policy/test.py b/tests/integration/test_row_policy/test.py index 25d90ae25b9..c83e9cb8ec3 100644 --- a/tests/integration/test_row_policy/test.py +++ b/tests/integration/test_row_policy/test.py @@ -106,26 +106,26 @@ def test_cannot_trick_row_policy_with_keyword_with(): assert node.query("WITH 0 AS a SELECT a FROM mydb.filtered_table1") == TSV([[0], [0]]) assert node.query("WITH 0 AS a SELECT b FROM mydb.filtered_table1") == TSV([[0], [1]]) - assert node.query("WITH 0 AS a SELECT * FROM mydb.filtered_table1") == TSV([[1, 0], [1, 1]]) - assert node.query("WITH 0 AS a SELECT * FROM mydb.filtered_table1 WHERE a IN(0, 1) AND b IN(0, 1)") == TSV([[1, 0], [1, 1]]) - assert node.query("WITH 0 AS a SELECT * FROM mydb.filtered_table1 PREWHERE a IN(0, 1) AND b IN(0, 1)") == TSV([[1, 0], [1, 1]]) - assert node.query("WITH 0 AS a SELECT * FROM 
mydb.filtered_table1 PREWHERE a IN(0, 1) WHERE b IN(0, 1)") == TSV([[1, 0], [1, 1]]) - assert node.query("WITH 0 AS a SELECT * FROM mydb.filtered_table1 PREWHERE b IN(0, 1) WHERE a IN(0, 1)") == TSV([[1, 0], [1, 1]]) + assert node.query("WITH 0 AS a SELECT * FROM mydb.filtered_table1") == TSV([[1, 0], [1, 1]]) + assert node.query("WITH 0 AS a SELECT * FROM mydb.filtered_table1 WHERE a >= 0 AND b >= 0 SETTINGS optimize_move_to_prewhere = 0") == TSV([[1, 0], [1, 1]]) + assert node.query("WITH 0 AS a SELECT * FROM mydb.filtered_table1 PREWHERE a >= 0 AND b >= 0") == TSV([[1, 0], [1, 1]]) + assert node.query("WITH 0 AS a SELECT * FROM mydb.filtered_table1 PREWHERE a >= 0 WHERE b >= 0") == TSV([[1, 0], [1, 1]]) + assert node.query("WITH 0 AS a SELECT * FROM mydb.filtered_table1 PREWHERE b >= 0 WHERE a >= 0") == TSV([[1, 0], [1, 1]]) - assert node.query("WITH 0 AS a SELECT a, b FROM mydb.filtered_table1") == TSV([[0, 0], [0, 1]]) - assert node.query("WITH 0 AS a SELECT a, b FROM mydb.filtered_table1 WHERE a IN(0, 1) AND b IN(0, 1)") == TSV([[0, 0], [0, 1]]) - assert node.query("WITH 0 AS a SELECT a, b FROM mydb.filtered_table1 PREWHERE a IN(0, 1) AND b IN(0, 1)") == TSV([[0, 0], [0, 1]]) - assert node.query("WITH 0 AS a SELECT a, b FROM mydb.filtered_table1 PREWHERE a IN(0, 1) WHERE b IN(0, 1)") == TSV([[0, 0], [0, 1]]) - assert node.query("WITH 0 AS a SELECT a, b FROM mydb.filtered_table1 PREWHERE b IN(0, 1) WHERE a IN(0, 1)") == TSV([[0, 0], [0, 1]]) + assert node.query("WITH 0 AS a SELECT a, b FROM mydb.filtered_table1") == TSV([[0, 0], [0, 1]]) + assert node.query("WITH 0 AS a SELECT a, b FROM mydb.filtered_table1 WHERE a >= 0 AND b >= 0 SETTINGS optimize_move_to_prewhere = 0") == TSV([[0, 0], [0, 1]]) + assert node.query("WITH 0 AS a SELECT a, b FROM mydb.filtered_table1 PREWHERE a >= 0 AND b >= 0") == TSV([[0, 0], [0, 1]]) + assert node.query("WITH 0 AS a SELECT a, b FROM mydb.filtered_table1 PREWHERE a >= 0 WHERE b >= 0") == TSV([[0, 0], [0, 1]]) + assert node.query("WITH 0 AS a SELECT a, b FROM mydb.filtered_table1 PREWHERE b >= 0 WHERE a >= 0") == TSV([[0, 0], [0, 1]]) assert node.query("WITH 0 AS c SELECT * FROM mydb.filtered_table3") == TSV([[0, 1], [1, 0]]) - assert node.query("WITH 0 AS c SELECT * FROM mydb.filtered_table3 WHERE c >= 0 AND a >= 0") == TSV([[0, 1], [1, 0]]) + assert node.query("WITH 0 AS c SELECT * FROM mydb.filtered_table3 WHERE c >= 0 AND a >= 0 SETTINGS optimize_move_to_prewhere = 0") == TSV([[0, 1], [1, 0]]) assert node.query("WITH 0 AS c SELECT * FROM mydb.filtered_table3 PREWHERE c >= 0 AND a >= 0") == TSV([[0, 1], [1, 0]]) assert node.query("WITH 0 AS c SELECT * FROM mydb.filtered_table3 PREWHERE c >= 0 WHERE a >= 0") == TSV([[0, 1], [1, 0]]) assert node.query("WITH 0 AS c SELECT * FROM mydb.filtered_table3 PREWHERE a >= 0 WHERE c >= 0") == TSV([[0, 1], [1, 0]]) assert node.query("WITH 0 AS c SELECT a, b, c FROM mydb.filtered_table3") == TSV([[0, 1], [1, 0], [0, 0]]) - assert node.query("WITH 0 AS c SELECT a, b, c FROM mydb.filtered_table3 WHERE c >= 0 AND a >= 0") == TSV([[0, 1], [1, 0], [0, 0]]) + assert node.query("WITH 0 AS c SELECT a, b, c FROM mydb.filtered_table3 WHERE c >= 0 AND a >= 0 SETTINGS optimize_move_to_prewhere = 0") == TSV([[0, 1], [1, 0], [0, 0]]) assert node.query("WITH 0 AS c SELECT a, b, c FROM mydb.filtered_table3 PREWHERE c >= 0 AND a >= 0") == TSV([[0, 1], [1, 0], [0, 0]]) assert node.query("WITH 0 AS c SELECT a, b, c FROM mydb.filtered_table3 PREWHERE c >= 0 WHERE a >= 0") == TSV([[0, 1], [1, 0], [0, 0]]) assert node.query("WITH 0 AS 
c SELECT a, b, c FROM mydb.filtered_table3 PREWHERE a >= 0 WHERE c >= 0") == TSV([[0, 1], [1, 0], [0, 0]]) @@ -145,17 +145,17 @@ def test_policy_from_users_xml_affects_only_user_assigned(): def test_with_prewhere(): copy_policy_xml('normal_filter2_table2.xml') - assert node.query("SELECT * FROM mydb.filtered_table2 WHERE a > 1") == TSV([[4, 3, 2, 1]]) - assert node.query("SELECT a FROM mydb.filtered_table2 WHERE a > 1") == TSV([[4]]) - assert node.query("SELECT a, b FROM mydb.filtered_table2 WHERE a > 1") == TSV([[4, 3]]) - assert node.query("SELECT b, c FROM mydb.filtered_table2 WHERE a > 1") == TSV([[3, 2]]) - assert node.query("SELECT d FROM mydb.filtered_table2 WHERE a > 1") == TSV([[1]]) + assert node.query("SELECT * FROM mydb.filtered_table2 WHERE a > 1 SETTINGS optimize_move_to_prewhere = 0") == TSV([[4, 3, 2, 1]]) + assert node.query("SELECT a FROM mydb.filtered_table2 WHERE a > 1 SETTINGS optimize_move_to_prewhere = 0") == TSV([[4]]) + assert node.query("SELECT a, b FROM mydb.filtered_table2 WHERE a > 1 SETTINGS optimize_move_to_prewhere = 0") == TSV([[4, 3]]) + assert node.query("SELECT b, c FROM mydb.filtered_table2 WHERE a > 1 SETTINGS optimize_move_to_prewhere = 0") == TSV([[3, 2]]) + assert node.query("SELECT d FROM mydb.filtered_table2 WHERE a > 1 SETTINGS optimize_move_to_prewhere = 0") == TSV([[1]]) - assert node.query("SELECT * FROM mydb.filtered_table2 PREWHERE a > 1") == TSV([[4, 3, 2, 1]]) - assert node.query("SELECT a FROM mydb.filtered_table2 PREWHERE a > 1") == TSV([[4]]) + assert node.query("SELECT * FROM mydb.filtered_table2 PREWHERE a > 1") == TSV([[4, 3, 2, 1]]) + assert node.query("SELECT a FROM mydb.filtered_table2 PREWHERE a > 1") == TSV([[4]]) assert node.query("SELECT a, b FROM mydb.filtered_table2 PREWHERE a > 1") == TSV([[4, 3]]) assert node.query("SELECT b, c FROM mydb.filtered_table2 PREWHERE a > 1") == TSV([[3, 2]]) - assert node.query("SELECT d FROM mydb.filtered_table2 PREWHERE a > 1") == TSV([[1]]) + assert node.query("SELECT d FROM mydb.filtered_table2 PREWHERE a > 1") == TSV([[1]]) assert node.query("SELECT * FROM mydb.filtered_table2 PREWHERE a < 4 WHERE b < 10") == TSV([[1, 2, 3, 4]]) assert node.query("SELECT a FROM mydb.filtered_table2 PREWHERE a < 4 WHERE b < 10") == TSV([[1]]) @@ -168,10 +168,10 @@ def test_with_prewhere(): def test_with_throwif_in_where(): copy_policy_xml('no_filters.xml') - assert 'expected' in node.query_and_get_error("SELECT * FROM mydb.filtered_table2 WHERE throwIf(a = 0, 'expected') = 0") + assert 'expected' in node.query_and_get_error("SELECT * FROM mydb.filtered_table2 WHERE throwIf(a = 0, 'expected') = 0 SETTINGS optimize_move_to_prewhere = 0") copy_policy_xml('normal_filter2_table2.xml') - assert node.query("SELECT * FROM mydb.filtered_table2 WHERE throwIf(a = 0, 'pwned') = 0") == TSV([ + assert node.query("SELECT * FROM mydb.filtered_table2 WHERE throwIf(a = 0, 'pwned') = 0 SETTINGS optimize_move_to_prewhere = 0") == TSV([ [1, 2, 3, 4], [4, 3, 2, 1]]) diff --git a/tests/performance/prewhere_with_row_level_filter.xml b/tests/performance/prewhere_with_row_level_filter.xml index d73690ca811..03c73a88f89 100644 --- a/tests/performance/prewhere_with_row_level_filter.xml +++ b/tests/performance/prewhere_with_row_level_filter.xml @@ -6,7 +6,7 @@ INSERT INTO test_prl SELECT number FROM numbers(50000000); SELECT * FROM test_prl; - SELECT * FROM test_prl WHERE n % 3 AND n % 5; + SELECT * FROM test_prl WHERE n % 3 AND n % 5 SETTINGS optimize_move_to_prewhere = 0; SELECT * FROM test_prl PREWHERE n % 3 AND n % 5; SELECT * FROM 
test_prl PREWHERE n % 3 WHERE n % 5; SELECT * FROM test_prl PREWHERE n % 5 WHERE n % 3; From f5893778cbf6544cb1a6b2d92d21248674bc864a Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Sat, 20 Feb 2021 00:01:13 +0300 Subject: [PATCH 0515/2357] Do not use view() in 01731_async_task_queue_wait to fix ANTLR parser --- tests/queries/0_stateless/01731_async_task_queue_wait.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/01731_async_task_queue_wait.sh b/tests/queries/0_stateless/01731_async_task_queue_wait.sh index 89d8b63d745..2f77628fc6d 100755 --- a/tests/queries/0_stateless/01731_async_task_queue_wait.sh +++ b/tests/queries/0_stateless/01731_async_task_queue_wait.sh @@ -7,4 +7,4 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # regression for 'Empty task was returned from async task queue' during query # cancellation with async_socket_for_remote=1 (that ignores # max_distributed_connections) -$(timeout --signal=SIGINT 1 ${CLICKHOUSE_CLIENT} --max_distributed_connections=1 --max_block_size=2 --interactive_delay=900000 -q "select x from remote('127.{2,3}', view(select number + sleep(0.3) as x from numbers(16))) settings max_block_size = 2") 2>&1 | grep "Empty task was returned from async task queue" || true +$(timeout --signal=SIGINT 1 ${CLICKHOUSE_CLIENT} --max_distributed_connections=1 --max_block_size=2 --interactive_delay=900000 -q "select number + sleep(0.3) as x from remote('127.{2,3}', system.numbers) settings max_block_size = 2") 2>&1 | grep "Empty task was returned from async task queue" || true From d0fe8900f980167530a0e1be56dd0cd219c6f08a Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Sat, 20 Feb 2021 00:04:28 +0300 Subject: [PATCH 0516/2357] Fix bash syntax in 01731_async_task_queue_wait --- tests/queries/0_stateless/01731_async_task_queue_wait.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/01731_async_task_queue_wait.sh b/tests/queries/0_stateless/01731_async_task_queue_wait.sh index 2f77628fc6d..e0babf3c6ff 100755 --- a/tests/queries/0_stateless/01731_async_task_queue_wait.sh +++ b/tests/queries/0_stateless/01731_async_task_queue_wait.sh @@ -7,4 +7,4 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # regression for 'Empty task was returned from async task queue' during query # cancellation with async_socket_for_remote=1 (that ignores # max_distributed_connections) -$(timeout --signal=SIGINT 1 ${CLICKHOUSE_CLIENT} --max_distributed_connections=1 --max_block_size=2 --interactive_delay=900000 -q "select number + sleep(0.3) as x from remote('127.{2,3}', system.numbers) settings max_block_size = 2") 2>&1 | grep "Empty task was returned from async task queue" || true +timeout --signal=SIGINT 1 ${CLICKHOUSE_CLIENT} --max_distributed_connections=1 --max_block_size=2 --interactive_delay=900000 -q "select number + sleep(0.3) as x from remote('127.{2,3}', system.numbers) settings max_block_size = 2" 2>&1 | grep "Empty task was returned from async task queue" || true From 9bd9ea9fbcb9ffa9bf606e75228384c175851e69 Mon Sep 17 00:00:00 2001 From: Pavel Kruglov Date: Sat, 20 Feb 2021 00:45:58 +0300 Subject: [PATCH 0517/2357] Try to fix sigsev --- src/Client/Connection.cpp | 3 +-- src/Client/Connection.h | 7 ++++--- src/Client/ConnectionPoolWithFailover.h | 2 +- src/Client/PacketReceiver.h | 6 ++++-- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/Client/Connection.cpp b/src/Client/Connection.cpp index b6903ae6c92..80d44a336a5 100644 --- a/src/Client/Connection.cpp +++ 
b/src/Client/Connection.cpp @@ -109,8 +109,7 @@ void Connection::connect(const ConnectionTimeouts & timeouts) } in = std::make_shared(*socket); - if (async_callback) - in->setAsyncCallback(std::move(async_callback)); + in->setAsyncCallback(std::move(async_callback)); out = std::make_shared(*socket); diff --git a/src/Client/Connection.h b/src/Client/Connection.h index d317ecb56b3..95fbb902c2c 100644 --- a/src/Client/Connection.h +++ b/src/Client/Connection.h @@ -199,9 +199,10 @@ public: /// Each time read from socket blocks and async_callback is set, it will be called. You can poll socket inside it. void setAsyncCallback(AsyncCallback async_callback_) { - async_callback = std::move(async_callback_); if (in) - in->setAsyncCallback(std::move(async_callback)); + in->setAsyncCallback(std::move(async_callback_)); + else + async_callback = std::move(async_callback_); } private: @@ -291,7 +292,7 @@ private: LoggerWrapper log_wrapper; - AsyncCallback async_callback; + AsyncCallback async_callback = {}; void connect(const ConnectionTimeouts & timeouts); void sendHello(); diff --git a/src/Client/ConnectionPoolWithFailover.h b/src/Client/ConnectionPoolWithFailover.h index 023ef863bdf..3bd39977566 100644 --- a/src/Client/ConnectionPoolWithFailover.h +++ b/src/Client/ConnectionPoolWithFailover.h @@ -99,7 +99,7 @@ private: /// Try to get a connection from the pool and check that it is good. /// If table_to_check is not null and the check is enabled in settings, check that replication delay /// for this table is not too large. - TryResult tryGetEntry( + static TryResult tryGetEntry( IConnectionPool & pool, const ConnectionTimeouts & timeouts, std::string & fail_message, diff --git a/src/Client/PacketReceiver.h b/src/Client/PacketReceiver.h index 4d42804d0a2..79d805be877 100644 --- a/src/Client/PacketReceiver.h +++ b/src/Client/PacketReceiver.h @@ -104,10 +104,12 @@ private: { try { - AsyncCallbackSetter async_setter(receiver.connection, ReadCallback{receiver, sink}); while (true) { - receiver.packet = receiver.connection->receivePacket(); + { + AsyncCallbackSetter async_setter(receiver.connection, ReadCallback{receiver, sink}); + receiver.packet = receiver.connection->receivePacket(); + } sink = std::move(sink).resume(); } From 21ee685ef0f1910d42e0e5a47b010b2eb4cc9a71 Mon Sep 17 00:00:00 2001 From: kssenii Date: Fri, 19 Feb 2021 19:38:46 +0000 Subject: [PATCH 0518/2357] Fix brotly --- src/IO/BrotliWriteBuffer.cpp | 2 +- ...7_http_compression_prefer_brotli.reference | 23 +++++++++++++++++++ .../01057_http_compression_prefer_brotli.sh | 2 ++ 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/src/IO/BrotliWriteBuffer.cpp b/src/IO/BrotliWriteBuffer.cpp index d14c94ca43d..e562cc70e61 100644 --- a/src/IO/BrotliWriteBuffer.cpp +++ b/src/IO/BrotliWriteBuffer.cpp @@ -86,7 +86,7 @@ void BrotliWriteBuffer::nextImpl() throw Exception("brotli compress failed", ErrorCodes::BROTLI_WRITE_FAILED); } } - while (in_available > 0 || out_capacity == 0); + while (in_available > 0); } void BrotliWriteBuffer::finish() diff --git a/tests/queries/0_stateless/01057_http_compression_prefer_brotli.reference b/tests/queries/0_stateless/01057_http_compression_prefer_brotli.reference index 5dd396a38c9..c28cbee8485 100644 --- a/tests/queries/0_stateless/01057_http_compression_prefer_brotli.reference +++ b/tests/queries/0_stateless/01057_http_compression_prefer_brotli.reference @@ -9,3 +9,26 @@ 999997 999998 999999 + }, + { + "datetime": "2020-12-12", + "pipeline": "test-pipeline", + "host": 
"clickhouse-test-host-001.clickhouse.com", + "home": "clickhouse", + "detail": "clickhouse", + "row_number": "999998" + }, + { + "datetime": "2020-12-12", + "pipeline": "test-pipeline", + "host": "clickhouse-test-host-001.clickhouse.com", + "home": "clickhouse", + "detail": "clickhouse", + "row_number": "999999" + } + ], + + "rows": 1000000, + + "rows_before_limit_at_least": 1048080, + diff --git a/tests/queries/0_stateless/01057_http_compression_prefer_brotli.sh b/tests/queries/0_stateless/01057_http_compression_prefer_brotli.sh index e5f4d12ee18..f93062d43a7 100755 --- a/tests/queries/0_stateless/01057_http_compression_prefer_brotli.sh +++ b/tests/queries/0_stateless/01057_http_compression_prefer_brotli.sh @@ -11,3 +11,5 @@ ${CLICKHOUSE_CURL} -sS -H 'Accept-Encoding: gzip,deflate,br' "${CLICKHOUSE_URL}& ${CLICKHOUSE_CURL} -sS -H 'Accept-Encoding: gzip,deflate' "${CLICKHOUSE_URL}&enable_http_compression=1" -d 'SELECT 1' | gzip -d ${CLICKHOUSE_CURL} -sS -H 'Accept-Encoding: gzip' "${CLICKHOUSE_URL}&enable_http_compression=1" -d 'SELECT number FROM numbers(1000000)' | gzip -d | tail -n3 ${CLICKHOUSE_CURL} -sS -H 'Accept-Encoding: br' "${CLICKHOUSE_URL}&enable_http_compression=1" -d 'SELECT number FROM numbers(1000000)' | brotli -d | tail -n3 + +${CLICKHOUSE_CURL} -sS -H 'Accept-Encoding: br' "${CLICKHOUSE_URL}&enable_http_compression=1" -d "SELECT toDate('2020-12-12') as datetime, 'test-pipeline' as pipeline, 'clickhouse-test-host-001.clickhouse.com' as host, 'clickhouse' as home, 'clickhouse' as detail, number as row_number FROM numbers(1000000) FORMAT JSON" | brotli -d | tail -n30 | head -n23 From 2f7d0ba92677f595b1d760af2a826cc6fa181802 Mon Sep 17 00:00:00 2001 From: M0r64n Date: Sat, 20 Feb 2021 03:27:23 +0400 Subject: [PATCH 0519/2357] Replace direct truncate with O_TRUNC flag --- src/Storages/StorageFile.cpp | 16 ++++++++++------ .../01721_engine_file_truncate_on_insert.sql | 4 ++-- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index 856d03ea2ce..5524569e1f0 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -475,7 +475,8 @@ public: std::unique_lock && lock_, const CompressionMethod compression_method, const Context & context, - const std::optional & format_settings) + const std::optional & format_settings, + int & flags) : storage(storage_) , metadata_snapshot(metadata_snapshot_) , lock(std::move(lock_)) @@ -491,13 +492,14 @@ public: * INSERT data; SELECT *; last SELECT returns only insert_data */ storage.table_fd_was_used = true; - naked_buffer = std::make_unique(storage.table_fd); + naked_buffer = std::make_unique(storage.table_fd, DBMS_DEFAULT_BUFFER_SIZE); } else { if (storage.paths.size() != 1) throw Exception("Table '" + storage.getStorageID().getNameForLogs() + "' is in readonly mode because of globs in filepath", ErrorCodes::DATABASE_ACCESS_DENIED); - naked_buffer = std::make_unique(storage.paths[0], DBMS_DEFAULT_BUFFER_SIZE, O_WRONLY | O_APPEND | O_CREAT); + flags |= O_WRONLY | O_APPEND | O_CREAT; + naked_buffer = std::make_unique(storage.paths[0], DBMS_DEFAULT_BUFFER_SIZE, flags); } /// In case of CSVWithNames we have already written prefix. 
@@ -552,10 +554,11 @@ BlockOutputStreamPtr StorageFile::write( if (format_name == "Distributed") throw Exception("Method write is not implemented for Distributed format", ErrorCodes::NOT_IMPLEMENTED); + int flags = 0; + std::string path; if (context.getSettingsRef().engine_file_truncate_on_insert) - if (0 != ::truncate(paths[0].c_str(), 0)) - throwFromErrnoWithPath("Cannot truncate file " + paths[0], paths[0], ErrorCodes::CANNOT_TRUNCATE_FILE); + flags |= O_TRUNC; if (!paths.empty()) { @@ -569,7 +572,8 @@ BlockOutputStreamPtr StorageFile::write( std::unique_lock{rwlock, getLockTimeout(context)}, chooseCompressionMethod(path, compression_method), context, - format_settings); + format_settings, + flags); } bool StorageFile::storesDataOnDisk() const diff --git a/tests/queries/0_stateless/01721_engine_file_truncate_on_insert.sql b/tests/queries/0_stateless/01721_engine_file_truncate_on_insert.sql index 42d935cc0dd..079b2546a20 100644 --- a/tests/queries/0_stateless/01721_engine_file_truncate_on_insert.sql +++ b/tests/queries/0_stateless/01721_engine_file_truncate_on_insert.sql @@ -1,7 +1,7 @@ DROP TABLE IF EXISTS test; -INSERT INTO TABLE FUNCTION file('01718_file/test/data.TSV', 'TSV', 'id UInt32') VALUES (1); -ATTACH TABLE test FROM '01718_file/test' (id UInt8) ENGINE=File(TSV); +INSERT INTO TABLE FUNCTION file('01721_file/test/data.TSV', 'TSV', 'id UInt32') VALUES (1); +ATTACH TABLE test FROM '01721_file/test' (id UInt8) ENGINE=File(TSV); INSERT INTO test VALUES (2), (3); INSERT INTO test VALUES (4); From 2a36d6cb55af14b0dcf87c1b806afbf5c7dec8be Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Sat, 20 Feb 2021 02:41:58 +0300 Subject: [PATCH 0520/2357] review suggestions --- src/Common/ZooKeeper/ZooKeeper.h | 2 +- src/Databases/DatabaseAtomic.cpp | 11 ++- src/Databases/DatabaseFactory.cpp | 15 +++- src/Databases/DatabaseReplicated.cpp | 79 ++++++++++---------- src/Databases/DatabaseReplicated.h | 9 ++- src/Databases/DatabaseReplicatedSettings.cpp | 23 ++++++ src/Databases/DatabaseReplicatedSettings.h | 26 +++++++ src/Databases/DatabaseReplicatedWorker.cpp | 13 ++-- src/Databases/DatabaseReplicatedWorker.h | 12 +++ src/Databases/DatabaseWithDictionaries.cpp | 4 +- src/Databases/ya.make | 1 + src/Interpreters/Context.cpp | 4 +- src/Interpreters/Context.h | 12 +-- src/Interpreters/DDLTask.cpp | 26 +++---- src/Interpreters/DDLTask.h | 41 ++++++++-- src/Interpreters/DDLWorker.cpp | 47 ++++++------ src/Interpreters/DDLWorker.h | 4 +- src/Interpreters/InterpreterAlterQuery.cpp | 2 +- src/Interpreters/InterpreterCreateQuery.cpp | 4 +- src/Interpreters/InterpreterDropQuery.cpp | 4 +- src/Interpreters/InterpreterRenameQuery.cpp | 2 +- src/Storages/StorageMaterializedView.cpp | 4 +- src/Storages/StorageReplicatedMergeTree.cpp | 10 +-- tests/queries/skip_list.json | 1 + 24 files changed, 232 insertions(+), 124 deletions(-) create mode 100644 src/Databases/DatabaseReplicatedSettings.cpp create mode 100644 src/Databases/DatabaseReplicatedSettings.h diff --git a/src/Common/ZooKeeper/ZooKeeper.h b/src/Common/ZooKeeper/ZooKeeper.h index fbe1bede91a..5b37e4d6024 100644 --- a/src/Common/ZooKeeper/ZooKeeper.h +++ b/src/Common/ZooKeeper/ZooKeeper.h @@ -315,7 +315,7 @@ public: return std::make_shared(path, zookeeper, false, false, ""); } - void reset() + void setAlreadyRemoved() { need_remove = false; } diff --git a/src/Databases/DatabaseAtomic.cpp b/src/Databases/DatabaseAtomic.cpp index 2065e036863..71e0effb2d2 100644 --- a/src/Databases/DatabaseAtomic.cpp +++ b/src/Databases/DatabaseAtomic.cpp @@ -115,11 
+115,14 @@ void DatabaseAtomic::dropTable(const Context & context, const String & table_nam std::unique_lock lock(mutex); table = getTableUnlocked(table_name, lock); table_metadata_path_drop = DatabaseCatalog::instance().getPathForDroppedMetadata(table->getStorageID()); - auto txn = context.getMetadataTransaction(); + auto txn = context.getZooKeeperMetadataTransaction(); if (txn && !context.isInternalSubquery()) txn->commit(); /// Commit point (a sort of) for Replicated database /// NOTE: replica will be lost if server crashes before the following rename + /// We apply changes in ZooKeeper before applying changes in local metadata file + /// to reduce probability of failures between these operations + /// (it's more likely to lost connection, than to fail before applying local changes). /// TODO better detection and recovery Poco::File(table_metadata_path).renameTo(table_metadata_path_drop); /// Mark table as dropped @@ -241,7 +244,7 @@ void DatabaseAtomic::renameTable(const Context & context, const String & table_n } /// Table renaming actually begins here - auto txn = context.getMetadataTransaction(); + auto txn = context.getZooKeeperMetadataTransaction(); if (txn && !context.isInternalSubquery()) txn->commit(); /// Commit point (a sort of) for Replicated database @@ -302,7 +305,7 @@ void DatabaseAtomic::commitCreateTable(const ASTCreateQuery & query, const Stora DatabaseCatalog::instance().addUUIDMapping(query.uuid); locked_uuid = true; - auto txn = query_context.getMetadataTransaction(); + auto txn = query_context.getZooKeeperMetadataTransaction(); if (txn && !query_context.isInternalSubquery()) txn->commit(); /// Commit point (a sort of) for Replicated database @@ -337,7 +340,7 @@ void DatabaseAtomic::commitAlterTable(const StorageID & table_id, const String & if (table_id.uuid != actual_table_id.uuid) throw Exception("Cannot alter table because it was renamed", ErrorCodes::CANNOT_ASSIGN_ALTER); - auto txn = query_context.getMetadataTransaction(); + auto txn = query_context.getZooKeeperMetadataTransaction(); if (txn && !query_context.isInternalSubquery()) txn->commit(); /// Commit point (a sort of) for Replicated database diff --git a/src/Databases/DatabaseFactory.cpp b/src/Databases/DatabaseFactory.cpp index ca2b9bb083e..cd0143556c9 100644 --- a/src/Databases/DatabaseFactory.cpp +++ b/src/Databases/DatabaseFactory.cpp @@ -103,8 +103,11 @@ DatabasePtr DatabaseFactory::getImpl(const ASTCreateQuery & create, const String if (engine_define->engine->arguments && !engine_may_have_arguments) throw Exception("Database engine " + engine_name + " cannot have arguments", ErrorCodes::BAD_ARGUMENTS); - if (engine_define->engine->parameters || engine_define->partition_by || engine_define->primary_key || engine_define->order_by || - engine_define->sample_by || (!endsWith(engine_name, "MySQL") && engine_define->settings)) + bool has_unexpected_element = engine_define->engine->parameters || engine_define->partition_by || + engine_define->primary_key || engine_define->order_by || + engine_define->sample_by; + bool may_have_settings = endsWith(engine_name, "MySQL") || engine_name == "Replicated"; + if (has_unexpected_element || (!may_have_settings && engine_define->settings)) throw Exception("Database engine " + engine_name + " cannot have parameters, primary_key, order_by, sample_by, settings", ErrorCodes::UNKNOWN_ELEMENT_IN_AST); @@ -205,7 +208,13 @@ DatabasePtr DatabaseFactory::getImpl(const ASTCreateQuery & create, const String shard_name = context.getMacros()->expand(shard_name); replica_name = 
context.getMacros()->expand(replica_name); - return std::make_shared(database_name, metadata_path, uuid, zookeeper_path, shard_name, replica_name, context); + DatabaseReplicatedSettings database_replicated_settings{}; + if (engine_define->settings) + database_replicated_settings.loadFromQuery(*engine_define); + + return std::make_shared(database_name, metadata_path, uuid, + zookeeper_path, shard_name, replica_name, + std::move(database_replicated_settings), context); } #if USE_LIBPQXX diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp index 441880ae616..12cff3407d3 100644 --- a/src/Databases/DatabaseReplicated.cpp +++ b/src/Databases/DatabaseReplicated.cpp @@ -63,11 +63,13 @@ DatabaseReplicated::DatabaseReplicated( const String & zookeeper_path_, const String & shard_name_, const String & replica_name_, + DatabaseReplicatedSettings db_settings_, const Context & context_) : DatabaseAtomic(name_, metadata_path_, uuid, "DatabaseReplicated (" + name_ + ")", context_) , zookeeper_path(zookeeper_path_) , shard_name(shard_name_) , replica_name(replica_name_) + , db_settings(std::move(db_settings_)) { if (zookeeper_path.empty() || shard_name.empty() || replica_name.empty()) throw Exception("ZooKeeper path, shard and replica names must be non-empty", ErrorCodes::BAD_ARGUMENTS); @@ -141,7 +143,8 @@ ClusterPtr DatabaseReplicated::getCluster() const break; } if (!success) - throw Exception(ErrorCodes::ALL_CONNECTION_TRIES_FAILED, "Cannot get consistent cluster snapshot"); + throw Exception(ErrorCodes::ALL_CONNECTION_TRIES_FAILED, "Cannot get consistent cluster snapshot," + "because replicas are created or removed concurrently"); assert(!hosts.empty()); assert(hosts.size() == host_ids.size()); @@ -172,7 +175,7 @@ ClusterPtr DatabaseReplicated::getCluster() const return std::make_shared(global_context.getSettingsRef(), shards, username, password, global_context.getTCPPort(), false); } -void DatabaseReplicated::tryConnectToZooKeeper(bool force_attach) +void DatabaseReplicated::tryConnectToZooKeeperAndInitDatabase(bool force_attach) { try { @@ -228,6 +231,9 @@ bool DatabaseReplicated::createDatabaseNodesInZooKeeper(const zkutil::ZooKeeperP ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/log", "", zkutil::CreateMode::Persistent)); ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/replicas", "", zkutil::CreateMode::Persistent)); ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/counter", "", zkutil::CreateMode::Persistent)); + /// We create and remove counter/cnt- node to increment sequential number of counter/ node and make log entry numbers start from 1. + /// New replicas are created with log pointer equal to 0 and log pointer is a number of the last executed entry. + /// It means that we cannot have log entry with number 0. ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/counter/cnt-", "", zkutil::CreateMode::Persistent)); ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_path + "/counter/cnt-", -1)); ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/metadata", "", zkutil::CreateMode::Persistent)); @@ -253,10 +259,7 @@ void DatabaseReplicated::createReplicaNodesInZooKeeper(const zkutil::ZooKeeperPt auto host_id = getHostID(global_context, db_uuid); /// On replica creation add empty entry to log. Can be used to trigger some actions on other replicas (e.g. update cluster info). 
- DDLLogEntry entry; - entry.hosts = {}; - entry.query = {}; - entry.initiator = {}; + DDLLogEntry entry{}; String query_path_prefix = zookeeper_path + "/log/query-"; String counter_prefix = zookeeper_path + "/counter/cnt-"; @@ -273,7 +276,7 @@ void DatabaseReplicated::createReplicaNodesInZooKeeper(const zkutil::ZooKeeperPt void DatabaseReplicated::loadStoredObjects(Context & context, bool has_force_restore_data_flag, bool force_attach) { - tryConnectToZooKeeper(force_attach); + tryConnectToZooKeeperAndInitDatabase(force_attach); DatabaseAtomic::loadStoredObjects(context, has_force_restore_data_flag, force_attach); @@ -281,7 +284,7 @@ void DatabaseReplicated::loadStoredObjects(Context & context, bool has_force_res ddl_worker->startup(); } -BlockIO DatabaseReplicated::propose(const ASTPtr & query, const Context & query_context) +BlockIO DatabaseReplicated::tryEnqueueReplicatedDDL(const ASTPtr & query, const Context & query_context) { if (is_readonly) throw Exception(ErrorCodes::NO_ZOOKEEPER, "Database is in readonly mode, because it cannot connect to ZooKeeper"); @@ -405,7 +408,7 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep String db_name = getDatabaseName(); String to_db_name = getDatabaseName() + BROKEN_TABLES_SUFFIX; - if (total_tables < tables_to_detach.size() * 2) + if (total_tables * db_settings.max_broken_tables_ratio < tables_to_detach.size()) throw Exception(ErrorCodes::DATABASE_REPLICATION_FAILED, "Too many tables to recreate: {} of {}", tables_to_detach.size(), total_tables); else if (!tables_to_detach.empty()) { @@ -594,12 +597,12 @@ void DatabaseReplicated::shutdown() void DatabaseReplicated::dropTable(const Context & context, const String & table_name, bool no_delay) { - auto txn = context.getMetadataTransaction(); + auto txn = context.getZooKeeperMetadataTransaction(); assert(!ddl_worker->isCurrentlyActive() || txn); - if (txn && txn->is_initial_query) + if (txn && txn->isInitialQuery()) { String metadata_zk_path = zookeeper_path + "/metadata/" + escapeForFileName(table_name); - txn->ops.emplace_back(zkutil::makeRemoveRequest(metadata_zk_path, -1)); + txn->addOp(zkutil::makeRemoveRequest(metadata_zk_path, -1)); } DatabaseAtomic::dropTable(context, table_name, no_delay); } @@ -607,10 +610,10 @@ void DatabaseReplicated::dropTable(const Context & context, const String & table void DatabaseReplicated::renameTable(const Context & context, const String & table_name, IDatabase & to_database, const String & to_table_name, bool exchange, bool dictionary) { - auto txn = context.getMetadataTransaction(); + auto txn = context.getZooKeeperMetadataTransaction(); assert(txn); - if (txn->is_initial_query) + if (txn->isInitialQuery()) { if (this != &to_database) throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Moving tables between databases is not supported for Replicated engine"); @@ -622,16 +625,16 @@ void DatabaseReplicated::renameTable(const Context & context, const String & tab throw Exception(ErrorCodes::UNKNOWN_TABLE, "Table {} does not exist", to_table_name); String statement = readMetadataFile(table_name); - String metadata_zk_path = txn->zookeeper_path + "/metadata/" + escapeForFileName(table_name); - String metadata_zk_path_to = txn->zookeeper_path + "/metadata/" + escapeForFileName(to_table_name); - txn->ops.emplace_back(zkutil::makeRemoveRequest(metadata_zk_path, -1)); + String metadata_zk_path = zookeeper_path + "/metadata/" + escapeForFileName(table_name); + String metadata_zk_path_to = zookeeper_path + "/metadata/" + 
escapeForFileName(to_table_name); + txn->addOp(zkutil::makeRemoveRequest(metadata_zk_path, -1)); if (exchange) { String statement_to = readMetadataFile(to_table_name); - txn->ops.emplace_back(zkutil::makeRemoveRequest(metadata_zk_path_to, -1)); - txn->ops.emplace_back(zkutil::makeCreateRequest(metadata_zk_path, statement_to, zkutil::CreateMode::Persistent)); + txn->addOp(zkutil::makeRemoveRequest(metadata_zk_path_to, -1)); + txn->addOp(zkutil::makeCreateRequest(metadata_zk_path, statement_to, zkutil::CreateMode::Persistent)); } - txn->ops.emplace_back(zkutil::makeCreateRequest(metadata_zk_path_to, statement, zkutil::CreateMode::Persistent)); + txn->addOp(zkutil::makeCreateRequest(metadata_zk_path_to, statement, zkutil::CreateMode::Persistent)); } DatabaseAtomic::renameTable(context, table_name, to_database, to_table_name, exchange, dictionary); @@ -641,14 +644,14 @@ void DatabaseReplicated::commitCreateTable(const ASTCreateQuery & query, const S const String & table_metadata_tmp_path, const String & table_metadata_path, const Context & query_context) { - auto txn = query_context.getMetadataTransaction(); + auto txn = query_context.getZooKeeperMetadataTransaction(); assert(!ddl_worker->isCurrentlyActive() || txn); - if (txn && txn->is_initial_query) + if (txn && txn->isInitialQuery()) { - String metadata_zk_path = txn->zookeeper_path + "/metadata/" + escapeForFileName(query.table); + String metadata_zk_path = zookeeper_path + "/metadata/" + escapeForFileName(query.table); String statement = getObjectDefinitionFromCreateQuery(query.clone()); /// zk::multi(...) will throw if `metadata_zk_path` exists - txn->ops.emplace_back(zkutil::makeCreateRequest(metadata_zk_path, statement, zkutil::CreateMode::Persistent)); + txn->addOp(zkutil::makeCreateRequest(metadata_zk_path, statement, zkutil::CreateMode::Persistent)); } DatabaseAtomic::commitCreateTable(query, table, table_metadata_tmp_path, table_metadata_path, query_context); } @@ -657,11 +660,11 @@ void DatabaseReplicated::commitAlterTable(const StorageID & table_id, const String & table_metadata_tmp_path, const String & table_metadata_path, const String & statement, const Context & query_context) { - auto txn = query_context.getMetadataTransaction(); - if (txn && txn->is_initial_query) + auto txn = query_context.getZooKeeperMetadataTransaction(); + if (txn && txn->isInitialQuery()) { - String metadata_zk_path = txn->zookeeper_path + "/metadata/" + escapeForFileName(table_id.table_name); - txn->ops.emplace_back(zkutil::makeSetRequest(metadata_zk_path, statement, -1)); + String metadata_zk_path = zookeeper_path + "/metadata/" + escapeForFileName(table_id.table_name); + txn->addOp(zkutil::makeSetRequest(metadata_zk_path, statement, -1)); } DatabaseAtomic::commitAlterTable(table_id, table_metadata_tmp_path, table_metadata_path, statement, query_context); } @@ -670,37 +673,37 @@ void DatabaseReplicated::createDictionary(const Context & context, const String & dictionary_name, const ASTPtr & query) { - auto txn = context.getMetadataTransaction(); + auto txn = context.getZooKeeperMetadataTransaction(); assert(!ddl_worker->isCurrentlyActive() || txn); - if (txn && txn->is_initial_query) + if (txn && txn->isInitialQuery()) { - String metadata_zk_path = txn->zookeeper_path + "/metadata/" + escapeForFileName(dictionary_name); + String metadata_zk_path = zookeeper_path + "/metadata/" + escapeForFileName(dictionary_name); String statement = getObjectDefinitionFromCreateQuery(query->clone()); - txn->ops.emplace_back(zkutil::makeCreateRequest(metadata_zk_path, 
statement, zkutil::CreateMode::Persistent)); + txn->addOp(zkutil::makeCreateRequest(metadata_zk_path, statement, zkutil::CreateMode::Persistent)); } DatabaseAtomic::createDictionary(context, dictionary_name, query); } void DatabaseReplicated::removeDictionary(const Context & context, const String & dictionary_name) { - auto txn = context.getMetadataTransaction(); + auto txn = context.getZooKeeperMetadataTransaction(); assert(!ddl_worker->isCurrentlyActive() || txn); - if (txn && txn->is_initial_query) + if (txn && txn->isInitialQuery()) { String metadata_zk_path = zookeeper_path + "/metadata/" + escapeForFileName(dictionary_name); - txn->ops.emplace_back(zkutil::makeRemoveRequest(metadata_zk_path, -1)); + txn->addOp(zkutil::makeRemoveRequest(metadata_zk_path, -1)); } DatabaseAtomic::removeDictionary(context, dictionary_name); } void DatabaseReplicated::detachTablePermanently(const Context & context, const String & table_name) { - auto txn = context.getMetadataTransaction(); + auto txn = context.getZooKeeperMetadataTransaction(); assert(!ddl_worker->isCurrentlyActive() || txn); - if (txn && txn->is_initial_query) + if (txn && txn->isInitialQuery()) { String metadata_zk_path = zookeeper_path + "/metadata/" + escapeForFileName(table_name); - txn->ops.emplace_back(zkutil::makeRemoveRequest(metadata_zk_path, -1)); + txn->addOp(zkutil::makeRemoveRequest(metadata_zk_path, -1)); } DatabaseAtomic::detachTablePermanently(context, table_name); } diff --git a/src/Databases/DatabaseReplicated.h b/src/Databases/DatabaseReplicated.h index a3a53e02ee4..fde53cf2c29 100644 --- a/src/Databases/DatabaseReplicated.h +++ b/src/Databases/DatabaseReplicated.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -22,13 +23,14 @@ class DatabaseReplicated : public DatabaseAtomic public: DatabaseReplicated(const String & name_, const String & metadata_path_, UUID uuid, const String & zookeeper_path_, const String & shard_name_, const String & replica_name_, + DatabaseReplicatedSettings db_settings_, const Context & context); ~DatabaseReplicated() override; String getEngineName() const override { return "Replicated"; } - /// If current query is initial, then the following methods add metadata updating ZooKeeper operations to current MetadataTransaction. + /// If current query is initial, then the following methods add metadata updating ZooKeeper operations to current ZooKeeperMetadataTransaction. void dropTable(const Context &, const String & table_name, bool no_delay) override; void renameTable(const Context & context, const String & table_name, IDatabase & to_database, const String & to_table_name, bool exchange, bool dictionary) override; @@ -46,7 +48,7 @@ public: /// Try to execute DLL query on current host as initial query. If query is succeed, /// then it will be executed on all replicas. 
- BlockIO propose(const ASTPtr & query, const Context & query_context); + BlockIO tryEnqueueReplicatedDDL(const ASTPtr & query, const Context & query_context); void stopReplication(); @@ -64,7 +66,7 @@ public: friend struct DatabaseReplicatedTask; friend class DatabaseReplicatedDDLWorker; private: - void tryConnectToZooKeeper(bool force_attach); + void tryConnectToZooKeeperAndInitDatabase(bool force_attach); bool createDatabaseNodesInZooKeeper(const ZooKeeperPtr & current_zookeeper); void createReplicaNodesInZooKeeper(const ZooKeeperPtr & current_zookeeper); @@ -78,6 +80,7 @@ private: String shard_name; String replica_name; String replica_path; + DatabaseReplicatedSettings db_settings; zkutil::ZooKeeperPtr getZooKeeper() const; diff --git a/src/Databases/DatabaseReplicatedSettings.cpp b/src/Databases/DatabaseReplicatedSettings.cpp new file mode 100644 index 00000000000..61febcf2810 --- /dev/null +++ b/src/Databases/DatabaseReplicatedSettings.cpp @@ -0,0 +1,23 @@ +#include +#include +#include + +namespace DB +{ + +IMPLEMENT_SETTINGS_TRAITS(DatabaseReplicatedSettingsTraits, LIST_OF_DATABASE_REPLICATED_SETTINGS) + +void DatabaseReplicatedSettings::loadFromQuery(ASTStorage & storage_def) +{ + if (storage_def.settings) + { + applyChanges(storage_def.settings->changes); + return; + } + + auto settings_ast = std::make_shared(); + settings_ast->is_standalone = false; + storage_def.set(storage_def.settings, settings_ast); +} + +} diff --git a/src/Databases/DatabaseReplicatedSettings.h b/src/Databases/DatabaseReplicatedSettings.h new file mode 100644 index 00000000000..11d5b3820e4 --- /dev/null +++ b/src/Databases/DatabaseReplicatedSettings.h @@ -0,0 +1,26 @@ +#pragma once +#include +#include + +namespace DB +{ + +class ASTStorage; + +#define LIST_OF_DATABASE_REPLICATED_SETTINGS(M) \ + M(Float, max_broken_tables_ratio, 0.5, "Do not recover replica automatically if the ratio of stale tables to all tables is greater", 0) \ + M(UInt64, max_replication_lag_to_enqueue, 10, "Replica will throw exception on attempt to execute query if its replication lag is greater", 0) \ + M(UInt64, wait_entry_commited_timeout_sec, 3600, "Replicas will try to cancel query if timeout is exceeded, but initiator host has not executed it yet", 0) \ + +DECLARE_SETTINGS_TRAITS(DatabaseReplicatedSettingsTraits, LIST_OF_DATABASE_REPLICATED_SETTINGS) + + +/** Settings for the Replicated database engine. + * Could be loaded from a CREATE DATABASE query (SETTINGS clause). 
+ */ +struct DatabaseReplicatedSettings : public BaseSettings +{ + void loadFromQuery(ASTStorage & storage_def); +}; + +} diff --git a/src/Databases/DatabaseReplicatedWorker.cpp b/src/Databases/DatabaseReplicatedWorker.cpp index ff15878b136..e0c5717711c 100644 --- a/src/Databases/DatabaseReplicatedWorker.cpp +++ b/src/Databases/DatabaseReplicatedWorker.cpp @@ -30,7 +30,7 @@ void DatabaseReplicatedDDLWorker::initializeMainThread() { auto zookeeper = getAndSetZooKeeper(); if (database->is_readonly) - database->tryConnectToZooKeeper(false); + database->tryConnectToZooKeeperAndInitDatabase(false); initializeReplication(); initialized = true; return; @@ -98,8 +98,7 @@ String DatabaseReplicatedDDLWorker::tryEnqueueAndExecuteEntry(DDLLogEntry & entr UInt32 our_log_ptr = parse(zookeeper->get(database->replica_path + "/log_ptr")); UInt32 max_log_ptr = parse(zookeeper->get(database->zookeeper_path + "/max_log_ptr")); assert(our_log_ptr <= max_log_ptr); - constexpr UInt32 max_replication_lag = 16; - if (max_replication_lag < max_log_ptr - our_log_ptr) + if (database->db_settings.max_replication_lag_to_enqueue < max_log_ptr - our_log_ptr) throw Exception(ErrorCodes::NOT_A_LEADER, "Cannot enqueue query on this replica, " "because it has replication lag of {} queries. Try other replica.", max_log_ptr - our_log_ptr); @@ -131,7 +130,7 @@ String DatabaseReplicatedDDLWorker::tryEnqueueAndExecuteEntry(DDLLogEntry & entr if (zookeeper->expired() || stop_flag) throw Exception(ErrorCodes::DATABASE_REPLICATION_FAILED, "ZooKeeper session expired or replication stopped, try again"); - processTask(*task); + processTask(*task, zookeeper); if (!task->was_executed) { @@ -139,7 +138,7 @@ String DatabaseReplicatedDDLWorker::tryEnqueueAndExecuteEntry(DDLLogEntry & entr task->execution_status.code, task->execution_status.message); } - try_node->reset(); + try_node->setAlreadyRemoved(); return entry_path; } @@ -178,7 +177,7 @@ DDLTaskPtr DatabaseReplicatedDDLWorker::initAndCheckTask(const String & entry_na /// Query is not committed yet. We cannot just skip it and execute next one, because reordering may break replication. LOG_TRACE(log, "Waiting for initiator {} to commit or rollback entry {}", initiator_name, entry_path); constexpr size_t wait_time_ms = 1000; - constexpr size_t max_iterations = 3600; + size_t max_iterations = database->db_settings.wait_entry_commited_timeout_sec; size_t iteration = 0; while (!wait_committed_or_failed->tryWait(wait_time_ms)) @@ -194,7 +193,7 @@ DDLTaskPtr DatabaseReplicatedDDLWorker::initAndCheckTask(const String & entry_na if (max_iterations <= ++iteration) { /// What can we do if initiator hangs for some reason? Seems like we can remove /try node. - /// Initiator will fail to commit entry to ZK (including ops for replicated table) if /try does not exist. + /// Initiator will fail to commit ZooKeeperMetadataTransaction (including ops for replicated table) if /try does not exist. /// But it's questionable. /// We use tryRemove(...) because multiple hosts (including initiator) may try to do it concurrently. diff --git a/src/Databases/DatabaseReplicatedWorker.h b/src/Databases/DatabaseReplicatedWorker.h index 1eafe2489e7..6dd8dc408d7 100644 --- a/src/Databases/DatabaseReplicatedWorker.h +++ b/src/Databases/DatabaseReplicatedWorker.h @@ -6,6 +6,18 @@ namespace DB class DatabaseReplicated; +/// It's similar to DDLWorker, but has the following differences: +/// 1. 
The DDL queue in ZooKeeper is not shared between multiple clusters and databases, +/// each DatabaseReplicated has its own queue in ZooKeeper and DatabaseReplicatedDDLWorker object. +/// 2. Shards and replicas are identified by shard_name and replica_name arguments of the database engine, +/// not by address:port pairs. Cluster (of multiple database replicas) is identified by its zookeeper_path. +/// 3. After creating an entry in the DDL queue, the initiator tries to execute the entry locally +/// and other hosts wait for the query to finish on the initiator host. +/// If the query succeeds on the initiator, then all hosts must execute it, so they will retry until the query succeeds. +/// We assume that the cluster is homogeneous, so if replicas are in a consistent state and the query succeeds on one host, +/// then all hosts can execute it (maybe after several retries). +/// 4. Each database replica stores its log pointer in ZooKeeper. The cleanup thread removes an old entry +/// if its number < max_log_ptr - logs_to_keep. class DatabaseReplicatedDDLWorker : public DDLWorker { public: diff --git a/src/Databases/DatabaseWithDictionaries.cpp b/src/Databases/DatabaseWithDictionaries.cpp index 7ce5de56b64..d92f0f1897e 100644 --- a/src/Databases/DatabaseWithDictionaries.cpp +++ b/src/Databases/DatabaseWithDictionaries.cpp @@ -194,7 +194,7 @@ void DatabaseWithDictionaries::createDictionary(const Context & context, const S detachDictionary(dictionary_name); }); - auto txn = context.getMetadataTransaction(); + auto txn = context.getZooKeeperMetadataTransaction(); if (txn && !context.isInternalSubquery()) txn->commit(); /// Commit point (a sort of) for Replicated database @@ -219,7 +219,7 @@ void DatabaseWithDictionaries::removeDictionary(const Context & context, const S { String dictionary_metadata_path = getObjectMetadataPath(dictionary_name); - auto txn = context.getMetadataTransaction(); + auto txn = context.getZooKeeperMetadataTransaction(); if (txn && !context.isInternalSubquery()) txn->commit(); /// Commit point (a sort of) for Replicated database diff --git a/src/Databases/ya.make b/src/Databases/ya.make index 38f79532080..8bd3f291a64 100644 --- a/src/Databases/ya.make +++ b/src/Databases/ya.make @@ -17,6 +17,7 @@ SRCS( DatabaseOnDisk.cpp DatabaseOrdinary.cpp DatabaseReplicated.cpp + DatabaseReplicatedSettings.cpp DatabaseReplicatedWorker.cpp DatabaseWithDictionaries.cpp DatabasesCommon.cpp diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index 766b14dea42..98e4a87fba3 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -2553,14 +2553,14 @@ StorageID Context::resolveStorageIDImpl(StorageID storage_id, StorageNamespace w return StorageID::createEmpty(); } -void Context::initMetadataTransaction(MetadataTransactionPtr txn, [[maybe_unused]] bool attach_existing) +void Context::initZooKeeperMetadataTransaction(ZooKeeperMetadataTransactionPtr txn, [[maybe_unused]] bool attach_existing) { assert(!metadata_transaction); assert(attach_existing || query_context == this); metadata_transaction = std::move(txn); } -MetadataTransactionPtr Context::getMetadataTransaction() const +ZooKeeperMetadataTransactionPtr Context::getZooKeeperMetadataTransaction() const { assert(!metadata_transaction || hasQueryContext()); return metadata_transaction; diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index 24d0eb4b0de..563fb172488 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -117,8 +117,8 @@ using VolumePtr = std::shared_ptr; struct NamedSession; struct 
BackgroundTaskSchedulingSettings; -struct MetadataTransaction; -using MetadataTransactionPtr = std::shared_ptr; +class ZooKeeperMetadataTransaction; +using ZooKeeperMetadataTransactionPtr = std::shared_ptr; #if USE_EMBEDDED_COMPILER class CompiledExpressionCache; @@ -281,7 +281,7 @@ private: /// to be customized in HTTP and TCP servers by overloading the customizeContext(DB::Context&) /// methods. - MetadataTransactionPtr metadata_transaction; /// Distributed DDL context. I'm not sure if it's a suitable place for this, + ZooKeeperMetadataTransactionPtr metadata_transaction; /// Distributed DDL context. I'm not sure if it's a suitable place for this, /// but it's the easiest way to pass this through the whole stack from executeQuery(...) /// to DatabaseOnDisk::commitCreateTable(...) or IStorage::alter(...) without changing /// thousands of signatures. @@ -746,8 +746,10 @@ public: IHostContextPtr & getHostContext(); const IHostContextPtr & getHostContext() const; - void initMetadataTransaction(MetadataTransactionPtr txn, bool attach_existing = false); - MetadataTransactionPtr getMetadataTransaction() const; + /// Initialize context of distributed DDL query with Replicated database. + void initZooKeeperMetadataTransaction(ZooKeeperMetadataTransactionPtr txn, bool attach_existing = false); + /// Returns context of current distributed DDL query or nullptr. + ZooKeeperMetadataTransactionPtr getZooKeeperMetadataTransaction() const; struct MySQLWireContext { diff --git a/src/Interpreters/DDLTask.cpp b/src/Interpreters/DDLTask.cpp index 7f47f0a6659..4be465d3de4 100644 --- a/src/Interpreters/DDLTask.cpp +++ b/src/Interpreters/DDLTask.cpp @@ -96,7 +96,7 @@ void DDLTaskBase::parseQueryFromEntry(const Context & context) query = parseQuery(parser_query, begin, end, description, 0, context.getSettingsRef().max_parser_depth); } -std::unique_ptr DDLTaskBase::makeQueryContext(Context & from_context) +std::unique_ptr DDLTaskBase::makeQueryContext(Context & from_context, const ZooKeeperPtr & /*zookeeper*/) { auto query_context = std::make_unique(from_context); query_context->makeQueryContext(); @@ -293,28 +293,26 @@ String DatabaseReplicatedTask::getShardID() const return database->shard_name; } -std::unique_ptr DatabaseReplicatedTask::makeQueryContext(Context & from_context) +std::unique_ptr DatabaseReplicatedTask::makeQueryContext(Context & from_context, const ZooKeeperPtr & zookeeper) { - auto query_context = DDLTaskBase::makeQueryContext(from_context); + auto query_context = DDLTaskBase::makeQueryContext(from_context, zookeeper); query_context->getClientInfo().query_kind = ClientInfo::QueryKind::SECONDARY_QUERY; query_context->setCurrentDatabase(database->getDatabaseName()); - auto txn = std::make_shared(); - query_context->initMetadataTransaction(txn); - txn->current_zookeeper = from_context.getZooKeeper(); - txn->zookeeper_path = database->zookeeper_path; - txn->is_initial_query = is_initial_query; + auto txn = std::make_shared(zookeeper, database->zookeeper_path, is_initial_query); + query_context->initZooKeeperMetadataTransaction(txn); if (is_initial_query) { - txn->ops.emplace_back(zkutil::makeRemoveRequest(entry_path + "/try", -1)); - txn->ops.emplace_back(zkutil::makeCreateRequest(entry_path + "/committed", host_id_str, zkutil::CreateMode::Persistent)); - txn->ops.emplace_back(zkutil::makeSetRequest(database->zookeeper_path + "/max_log_ptr", toString(getLogEntryNumber(entry_name)), -1)); + txn->addOp(zkutil::makeRemoveRequest(entry_path + "/try", -1)); + 
txn->addOp(zkutil::makeCreateRequest(entry_path + "/committed", host_id_str, zkutil::CreateMode::Persistent)); + txn->addOp(zkutil::makeSetRequest(database->zookeeper_path + "/max_log_ptr", toString(getLogEntryNumber(entry_name)), -1)); } - txn->ops.emplace_back(zkutil::makeSetRequest(database->replica_path + "/log_ptr", toString(getLogEntryNumber(entry_name)), -1)); + txn->addOp(zkutil::makeSetRequest(database->replica_path + "/log_ptr", toString(getLogEntryNumber(entry_name)), -1)); - std::move(ops.begin(), ops.end(), std::back_inserter(txn->ops)); + for (auto & op : ops) + txn->addOp(std::move(op)); ops.clear(); return query_context; @@ -335,7 +333,7 @@ UInt32 DDLTaskBase::getLogEntryNumber(const String & log_entry_name) return parse(log_entry_name.substr(strlen(name))); } -void MetadataTransaction::commit() +void ZooKeeperMetadataTransaction::commit() { assert(state == CREATED); state = FAILED; diff --git a/src/Interpreters/DDLTask.h b/src/Interpreters/DDLTask.h index f02e17103aa..18c1f4c80cd 100644 --- a/src/Interpreters/DDLTask.h +++ b/src/Interpreters/DDLTask.h @@ -20,8 +20,8 @@ class ASTQueryWithOnCluster; using ZooKeeperPtr = std::shared_ptr; class DatabaseReplicated; -struct MetadataTransaction; -using MetadataTransactionPtr = std::shared_ptr; +class ZooKeeperMetadataTransaction; +using ZooKeeperMetadataTransactionPtr = std::shared_ptr; struct HostID { @@ -95,7 +95,7 @@ struct DDLTaskBase virtual String getShardID() const = 0; - virtual std::unique_ptr makeQueryContext(Context & from_context); + virtual std::unique_ptr makeQueryContext(Context & from_context, const ZooKeeperPtr & zookeeper); inline String getActiveNodePath() const { return entry_path + "/active/" + host_id_str; } inline String getFinishedNodePath() const { return entry_path + "/finished/" + host_id_str; } @@ -132,13 +132,19 @@ struct DatabaseReplicatedTask : public DDLTaskBase DatabaseReplicatedTask(const String & name, const String & path, DatabaseReplicated * database_); String getShardID() const override; - std::unique_ptr makeQueryContext(Context & from_context) override; + std::unique_ptr makeQueryContext(Context & from_context, const ZooKeeperPtr & zookeeper) override; DatabaseReplicated * database; }; - -struct MetadataTransaction +/// The main purpose of ZooKeeperMetadataTransaction is to execute all ZooKeeper operations related to a query +/// in a single transaction once we have performed all required checks and are ready to "commit" changes. +/// For example, create an ALTER_METADATA entry in the ReplicatedMergeTree log, +/// create the path/to/entry/finished/host_id node in the distributed DDL queue to mark the query as executed, and +/// update metadata in path/to/replicated_database/metadata/table_name. +/// It's used for DatabaseReplicated. 
+/// TODO we can also use it for ordinary ON CLUSTER queries +class ZooKeeperMetadataTransaction { enum State { @@ -153,8 +159,29 @@ struct MetadataTransaction bool is_initial_query; Coordination::Requests ops; +public: + ZooKeeperMetadataTransaction(const ZooKeeperPtr & current_zookeeper_, const String & zookeeper_path_, bool is_initial_query_) + : current_zookeeper(current_zookeeper_) + , zookeeper_path(zookeeper_path_) + , is_initial_query(is_initial_query_) + { + } + + bool isInitialQuery() const { return is_initial_query; } + + bool isExecuted() const { return state != CREATED; } + + String getDatabaseZooKeeperPath() const { return zookeeper_path; } + + void addOp(Coordination::RequestPtr && op) + { + assert(!isExecuted()); + ops.emplace_back(op); + } + void moveOpsTo(Coordination::Requests & other_ops) { + assert(!isExecuted()); std::move(ops.begin(), ops.end(), std::back_inserter(other_ops)); ops.clear(); state = COMMITTED; @@ -162,7 +189,7 @@ struct MetadataTransaction void commit(); - ~MetadataTransaction() { assert(state != CREATED || std::uncaught_exception()); } + ~ZooKeeperMetadataTransaction() { assert(isExecuted() || std::uncaught_exception()); } }; } diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index 12fd03b3b70..67f716c235c 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -195,16 +195,15 @@ void DDLWorker::startup() void DDLWorker::shutdown() { - stop_flag = true; - queue_updated_event->set(); - cleanup_event->set(); - - if (main_thread.joinable()) + bool prev_stop_flag = stop_flag.exchange(true); + if (!prev_stop_flag) + { + queue_updated_event->set(); + cleanup_event->set(); main_thread.join(); - if (cleanup_thread.joinable()) cleanup_thread.join(); - - worker_pool.reset(); + worker_pool.reset(); + } } DDLWorker::~DDLWorker() @@ -267,6 +266,8 @@ DDLTaskPtr DDLWorker::initAndCheckTask(const String & entry_name, String & out_r } /// Stage 2: resolve host_id and check if we should execute query or not + /// Multiple clusters can use single DDL queue path in ZooKeeper, + /// So we should skip task if we cannot find current host in cluster hosts list. 
if (!task->findCurrentHostID(context, log)) { out_reason = "There is no a local address in host list"; @@ -317,7 +318,7 @@ void DDLWorker::scheduleTasks() bool status_written = zookeeper->exists(task->getFinishedNodePath()); if (task->was_executed && !status_written && task_still_exists) { - processTask(*task); + processTask(*task, zookeeper); } } @@ -364,15 +365,15 @@ void DDLWorker::scheduleTasks() if (worker_pool) { - worker_pool->scheduleOrThrowOnError([this, &saved_task]() + worker_pool->scheduleOrThrowOnError([this, &saved_task, &zookeeper]() { setThreadName("DDLWorkerExec"); - processTask(saved_task); + processTask(saved_task, zookeeper); }); } else { - processTask(saved_task); + processTask(saved_task, zookeeper); } } } @@ -385,7 +386,7 @@ DDLTaskBase & DDLWorker::saveTask(DDLTaskPtr && task) return *current_tasks.back(); } -bool DDLWorker::tryExecuteQuery(const String & query, DDLTaskBase & task) +bool DDLWorker::tryExecuteQuery(const String & query, DDLTaskBase & task, const ZooKeeperPtr & zookeeper) { /// Add special comment at the start of query to easily identify DDL-produced queries in query_log String query_prefix = "/* ddl_entry=" + task.entry_name + " */ "; @@ -398,14 +399,16 @@ bool DDLWorker::tryExecuteQuery(const String & query, DDLTaskBase & task) try { - auto query_context = task.makeQueryContext(context); + auto query_context = task.makeQueryContext(context, zookeeper); if (!task.is_initial_query) query_scope.emplace(*query_context); executeQuery(istr, ostr, !task.is_initial_query, *query_context, {}); - if (auto txn = query_context->getMetadataTransaction()) + if (auto txn = query_context->getZooKeeperMetadataTransaction()) { - if (txn->state == MetadataTransaction::CREATED) + /// Most queries commit changes to ZooKeeper right before applying local changes, + /// but some queries does not support it, so we have to do it here. + if (!txn->isExecuted()) txn->commit(); } } @@ -463,10 +466,8 @@ void DDLWorker::updateMaxDDLEntryID(const String & entry_name) } } -void DDLWorker::processTask(DDLTaskBase & task) +void DDLWorker::processTask(DDLTaskBase & task, const ZooKeeperPtr & zookeeper) { - auto zookeeper = tryGetZooKeeper(); - LOG_DEBUG(log, "Processing task {} ({})", task.entry_name, task.entry.query); String active_node_path = task.getActiveNodePath(); @@ -541,7 +542,7 @@ void DDLWorker::processTask(DDLTaskBase & task) else { storage.reset(); - tryExecuteQuery(rewritten_query, task); + tryExecuteQuery(rewritten_query, task, zookeeper); } } catch (const Coordination::Exception &) @@ -565,7 +566,7 @@ void DDLWorker::processTask(DDLTaskBase & task) } else { - /// task.ops where not executed by table or database engine, se DDLWorker is responsible for + /// task.ops where not executed by table or database engine, so DDLWorker is responsible for /// writing query execution status into ZooKeeper. 
task.ops.emplace_back(zkutil::makeSetRequest(finished_node_path, task.execution_status.serializeText(), -1)); } @@ -589,7 +590,7 @@ void DDLWorker::processTask(DDLTaskBase & task) } /// Active node was removed in multi ops - active_node->reset(); + active_node->setAlreadyRemoved(); task.completely_processed = true; } @@ -712,7 +713,7 @@ bool DDLWorker::tryExecuteQueryOnLeaderReplica( /// If the leader will unexpectedly changed this method will return false /// and on the next iteration new leader will take lock - if (tryExecuteQuery(rewritten_query, task)) + if (tryExecuteQuery(rewritten_query, task, zookeeper)) { executed_by_us = true; break; diff --git a/src/Interpreters/DDLWorker.h b/src/Interpreters/DDLWorker.h index c39a832c098..8b0a8f038a0 100644 --- a/src/Interpreters/DDLWorker.h +++ b/src/Interpreters/DDLWorker.h @@ -77,7 +77,7 @@ protected: /// Returns non-empty DDLTaskPtr if entry parsed and the check is passed virtual DDLTaskPtr initAndCheckTask(const String & entry_name, String & out_reason, const ZooKeeperPtr & zookeeper); - void processTask(DDLTaskBase & task); + void processTask(DDLTaskBase & task, const ZooKeeperPtr & zookeeper); void updateMaxDDLEntryID(const String & entry_name); /// Check that query should be executed on leader replica only @@ -95,7 +95,7 @@ protected: const String & node_path, const ZooKeeperPtr & zookeeper); - bool tryExecuteQuery(const String & query, DDLTaskBase & task); + bool tryExecuteQuery(const String & query, DDLTaskBase & task, const ZooKeeperPtr & zookeeper); /// Checks and cleanups queue's nodes void cleanupQueue(Int64 current_time_seconds, const ZooKeeperPtr & zookeeper); diff --git a/src/Interpreters/InterpreterAlterQuery.cpp b/src/Interpreters/InterpreterAlterQuery.cpp index 402f05895bc..bf624507574 100644 --- a/src/Interpreters/InterpreterAlterQuery.cpp +++ b/src/Interpreters/InterpreterAlterQuery.cpp @@ -54,7 +54,7 @@ BlockIO InterpreterAlterQuery::execute() { auto guard = DatabaseCatalog::instance().getDDLGuard(table_id.database_name, table_id.table_name); guard->releaseTableLock(); - return typeid_cast(database.get())->propose(query_ptr, context); + return typeid_cast(database.get())->tryEnqueueReplicatedDDL(query_ptr, context); } StoragePtr table = DatabaseCatalog::instance().getTable(table_id, context); diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 2021c1f1d60..2b1dddde78c 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -880,7 +880,7 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create) { assertOrSetUUID(create, database); guard->releaseTableLock(); - return typeid_cast(database.get())->propose(query_ptr, context); + return typeid_cast(database.get())->tryEnqueueReplicatedDDL(query_ptr, context); } } @@ -1092,7 +1092,7 @@ BlockIO InterpreterCreateQuery::createDictionary(ASTCreateQuery & create) if (!create.attach) assertOrSetUUID(create, database); guard->releaseTableLock(); - return typeid_cast(database.get())->propose(query_ptr, context); + return typeid_cast(database.get())->tryEnqueueReplicatedDDL(query_ptr, context); } if (database->isDictionaryExist(dictionary_name)) diff --git a/src/Interpreters/InterpreterDropQuery.cpp b/src/Interpreters/InterpreterDropQuery.cpp index 9e63c647f71..33e93a79c41 100644 --- a/src/Interpreters/InterpreterDropQuery.cpp +++ b/src/Interpreters/InterpreterDropQuery.cpp @@ -146,7 +146,7 @@ BlockIO InterpreterDropQuery::executeToTableImpl(const ASTDropQuery & 
query, Dat ddl_guard->releaseTableLock(); table.reset(); - return typeid_cast(database.get())->propose(query.clone(), context); + return typeid_cast(database.get())->tryEnqueueReplicatedDDL(query.clone(), context); } if (query.kind == ASTDropQuery::Kind::Detach) @@ -231,7 +231,7 @@ BlockIO InterpreterDropQuery::executeToDictionary( context.checkAccess(AccessType::DROP_DICTIONARY, database_name, dictionary_name); ddl_guard->releaseTableLock(); - return typeid_cast(database.get())->propose(query_ptr, context); + return typeid_cast(database.get())->tryEnqueueReplicatedDDL(query_ptr, context); } if (!database || !database->isDictionaryExist(dictionary_name)) diff --git a/src/Interpreters/InterpreterRenameQuery.cpp b/src/Interpreters/InterpreterRenameQuery.cpp index b9d7faac73c..923a342d9ea 100644 --- a/src/Interpreters/InterpreterRenameQuery.cpp +++ b/src/Interpreters/InterpreterRenameQuery.cpp @@ -90,7 +90,7 @@ BlockIO InterpreterRenameQuery::executeToTables(const ASTRenameQuery & rename, c UniqueTableName to(elem.to_database_name, elem.to_table_name); ddl_guards[from]->releaseTableLock(); ddl_guards[to]->releaseTableLock(); - return typeid_cast(database.get())->propose(query_ptr, context); + return typeid_cast(database.get())->tryEnqueueReplicatedDDL(query_ptr, context); } else { diff --git a/src/Storages/StorageMaterializedView.cpp b/src/Storages/StorageMaterializedView.cpp index 32317968fe5..325bf3d2f74 100644 --- a/src/Storages/StorageMaterializedView.cpp +++ b/src/Storages/StorageMaterializedView.cpp @@ -212,11 +212,11 @@ static void executeDropQuery(ASTDropQuery::Kind kind, const Context & global_con /// looks like expected behaviour and we have tests for it. auto drop_context = Context(global_context); drop_context.getClientInfo().query_kind = ClientInfo::QueryKind::SECONDARY_QUERY; - if (auto txn = current_context.getMetadataTransaction()) + if (auto txn = current_context.getZooKeeperMetadataTransaction()) { /// For Replicated database drop_context.setQueryContext(const_cast(current_context)); - drop_context.initMetadataTransaction(txn, true); + drop_context.initZooKeeperMetadataTransaction(txn, true); } InterpreterDropQuery drop_interpreter(ast_drop_query, drop_context); drop_interpreter.execute(); diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index ff39bf91fbb..f2c88cdedd9 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -4282,12 +4282,12 @@ void StorageReplicatedMergeTree::alter( zkutil::makeCreateRequest(mutations_path + "/", mutation_entry.toString(), zkutil::CreateMode::PersistentSequential)); } - if (auto txn = query_context.getMetadataTransaction()) + if (auto txn = query_context.getZooKeeperMetadataTransaction()) { txn->moveOpsTo(ops); /// NOTE: IDatabase::alterTable(...) is called when executing ALTER_METADATA queue entry without query context, /// so we have to update metadata of DatabaseReplicated here. 
- String metadata_zk_path = txn->zookeeper_path + "/metadata/" + escapeForFileName(table_id.table_name); + String metadata_zk_path = txn->getDatabaseZooKeeperPath() + "/metadata/" + escapeForFileName(table_id.table_name); auto ast = DatabaseCatalog::instance().getDatabase(table_id.database_name)->getCreateTableQuery(table_id.table_name, query_context); applyMetadataChangesToCreateQuery(ast, future_metadata); ops.emplace_back(zkutil::makeSetRequest(metadata_zk_path, getObjectDefinitionFromCreateQuery(ast), -1)); @@ -5262,7 +5262,7 @@ void StorageReplicatedMergeTree::mutate(const MutationCommands & commands, const requests.emplace_back(zkutil::makeCreateRequest( mutations_path + "/", mutation_entry.toString(), zkutil::CreateMode::PersistentSequential)); - if (auto txn = query_context.getMetadataTransaction()) + if (auto txn = query_context.getZooKeeperMetadataTransaction()) txn->moveOpsTo(requests); Coordination::Responses responses; @@ -5766,7 +5766,7 @@ void StorageReplicatedMergeTree::replacePartitionFrom( } } - if (auto txn = context.getMetadataTransaction()) + if (auto txn = context.getZooKeeperMetadataTransaction()) txn->moveOpsTo(ops); ops.emplace_back(zkutil::makeSetRequest(zookeeper_path + "/log", "", -1)); /// Just update version @@ -6269,7 +6269,7 @@ bool StorageReplicatedMergeTree::dropAllPartsInPartition( Coordination::Requests ops; ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + "/log/log-", entry.toString(), zkutil::CreateMode::PersistentSequential)); ops.emplace_back(zkutil::makeSetRequest(zookeeper_path + "/log", "", -1)); /// Just update version. - if (auto txn = query_context.getMetadataTransaction()) + if (auto txn = query_context.getZooKeeperMetadataTransaction()) txn->moveOpsTo(ops); Coordination::Responses responses = zookeeper.multi(ops); diff --git a/tests/queries/skip_list.json b/tests/queries/skip_list.json index f08a41e32b8..e6bb3747fb0 100644 --- a/tests/queries/skip_list.json +++ b/tests/queries/skip_list.json @@ -108,6 +108,7 @@ "memory_tracking", "memory_usage", "live_view", + "00825_protobuf_format_map", "00152_insert_different_granularity", "01715_background_checker_blather_zookeeper", "01714_alter_drop_version", From 85ea5cafc46dbfa280c0376aa6f96d9b2ae92b13 Mon Sep 17 00:00:00 2001 From: Denis Glazachev Date: Sat, 20 Feb 2021 03:18:16 +0400 Subject: [PATCH 0521/2357] Always apply row_level_filter immediately --- .../MergeTree/MergeTreeRangeReader.cpp | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/MergeTreeRangeReader.cpp b/src/Storages/MergeTree/MergeTreeRangeReader.cpp index b4b8e4309b5..a3e4fbf6e6b 100644 --- a/src/Storages/MergeTree/MergeTreeRangeReader.cpp +++ b/src/Storages/MergeTree/MergeTreeRangeReader.cpp @@ -865,7 +865,7 @@ void MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(ReadResult & r return; const auto & header = merge_tree_reader->getColumns(); - size_t num_columns = header.size(); + const auto num_columns = header.size(); if (result.columns.size() != num_columns) throw Exception("Invalid number of columns passed to MergeTreeRangeReader. 
" @@ -900,8 +900,25 @@ void MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(ReadResult & r if (prewhere_info->row_level_filter) { prewhere_info->row_level_filter->execute(block); + const auto filter_column_pos = block.getPositionByName(prewhere_info->row_level_column_name); result.addFilter(block.getByPosition(filter_column_pos).column); + + result.columns.clear(); + result.columns.reserve(block.columns()); + + for (auto & col : block) + result.columns.emplace_back(std::move(col.column)); + + const auto * result_filter = result.getFilter(); + filterColumns(result.columns, result_filter->getData()); + + auto it = block.begin(); + for (auto & col : result.columns) + it++->column = std::move(col); + + result.columns.clear(); + result.clearFilter(); } prewhere_info->prewhere_actions->execute(block); From e1868d1392d9834d84e4d9f1f0230429e7df2e3c Mon Sep 17 00:00:00 2001 From: kssenii Date: Sat, 20 Feb 2021 05:13:31 +0000 Subject: [PATCH 0522/2357] Move test into separate file --- .../01057_http_compression_prefer_brotli.sh | 1 - ...tli_http_compression_json_format.reference | 23 +++++++++++++++++++ ...ong_brotli_http_compression_json_format.sh | 7 ++++++ 3 files changed, 30 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/01722_long_brotli_http_compression_json_format.reference create mode 100755 tests/queries/0_stateless/01722_long_brotli_http_compression_json_format.sh diff --git a/tests/queries/0_stateless/01057_http_compression_prefer_brotli.sh b/tests/queries/0_stateless/01057_http_compression_prefer_brotli.sh index f93062d43a7..22ab745d7c0 100755 --- a/tests/queries/0_stateless/01057_http_compression_prefer_brotli.sh +++ b/tests/queries/0_stateless/01057_http_compression_prefer_brotli.sh @@ -12,4 +12,3 @@ ${CLICKHOUSE_CURL} -sS -H 'Accept-Encoding: gzip,deflate' "${CLICKHOUSE_URL}& ${CLICKHOUSE_CURL} -sS -H 'Accept-Encoding: gzip' "${CLICKHOUSE_URL}&enable_http_compression=1" -d 'SELECT number FROM numbers(1000000)' | gzip -d | tail -n3 ${CLICKHOUSE_CURL} -sS -H 'Accept-Encoding: br' "${CLICKHOUSE_URL}&enable_http_compression=1" -d 'SELECT number FROM numbers(1000000)' | brotli -d | tail -n3 -${CLICKHOUSE_CURL} -sS -H 'Accept-Encoding: br' "${CLICKHOUSE_URL}&enable_http_compression=1" -d "SELECT toDate('2020-12-12') as datetime, 'test-pipeline' as pipeline, 'clickhouse-test-host-001.clickhouse.com' as host, 'clickhouse' as home, 'clickhouse' as detail, number as row_number FROM numbers(1000000) FORMAT JSON" | brotli -d | tail -n30 | head -n23 diff --git a/tests/queries/0_stateless/01722_long_brotli_http_compression_json_format.reference b/tests/queries/0_stateless/01722_long_brotli_http_compression_json_format.reference new file mode 100644 index 00000000000..7c089a2fd05 --- /dev/null +++ b/tests/queries/0_stateless/01722_long_brotli_http_compression_json_format.reference @@ -0,0 +1,23 @@ + }, + { + "datetime": "2020-12-12", + "pipeline": "test-pipeline", + "host": "clickhouse-test-host-001.clickhouse.com", + "home": "clickhouse", + "detail": "clickhouse", + "row_number": "999998" + }, + { + "datetime": "2020-12-12", + "pipeline": "test-pipeline", + "host": "clickhouse-test-host-001.clickhouse.com", + "home": "clickhouse", + "detail": "clickhouse", + "row_number": "999999" + } + ], + + "rows": 1000000, + + "rows_before_limit_at_least": 1048080, + diff --git a/tests/queries/0_stateless/01722_long_brotli_http_compression_json_format.sh b/tests/queries/0_stateless/01722_long_brotli_http_compression_json_format.sh new file mode 100755 index 
00000000000..a187d778fdb --- /dev/null +++ b/tests/queries/0_stateless/01722_long_brotli_http_compression_json_format.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +${CLICKHOUSE_CURL} -sS -H 'Accept-Encoding: br' "${CLICKHOUSE_URL}&enable_http_compression=1" -d "SELECT toDate('2020-12-12') as datetime, 'test-pipeline' as pipeline, 'clickhouse-test-host-001.clickhouse.com' as host, 'clickhouse' as home, 'clickhouse' as detail, number as row_number FROM numbers(1000000) FORMAT JSON" | brotli -d | tail -n30 | head -n23 From e8583ddfe2f03b20d86e9ce85a8215e7ee46d0f4 Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Sat, 20 Feb 2021 09:10:15 +0300 Subject: [PATCH 0523/2357] Update BaseDaemon.cpp --- base/daemon/BaseDaemon.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/base/daemon/BaseDaemon.cpp b/base/daemon/BaseDaemon.cpp index 248ffdd4d10..83384038b7c 100644 --- a/base/daemon/BaseDaemon.cpp +++ b/base/daemon/BaseDaemon.cpp @@ -416,7 +416,9 @@ static void sanitizerDeathCallback() else log_message = "Terminate called without an active exception"; - static const size_t buf_size = PIPE_BUF; + /// POSIX.1 says that write(2)s of less than PIPE_BUF bytes must be atomic - man 7 pipe + /// And the buffer should not be too small because our exception messages can be large. + static constexpr size_t buf_size = PIPE_BUF; if (log_message.size() > buf_size - 16) log_message.resize(buf_size - 16); From 487fb09ff670a379deddc953b2bd1f52d3c77a39 Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Sat, 20 Feb 2021 14:11:01 +0800 Subject: [PATCH 0524/2357] Suppress signed overflow in AggregateFunctionGroupArrayMoving 2 --- src/AggregateFunctions/AggregateFunctionGroupArrayMoving.h | 6 +++--- tests/queries/0_stateless/01177_group_array_moving.sql | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/AggregateFunctions/AggregateFunctionGroupArrayMoving.h b/src/AggregateFunctions/AggregateFunctionGroupArrayMoving.h index 2a713f3aed2..3bab831d316 100644 --- a/src/AggregateFunctions/AggregateFunctionGroupArrayMoving.h +++ b/src/AggregateFunctions/AggregateFunctionGroupArrayMoving.h @@ -52,7 +52,7 @@ struct MovingSumData : public MovingData { static constexpr auto name = "groupArrayMovingSum"; - T get(size_t idx, UInt64 window_size) const + T NO_SANITIZE_UNDEFINED get(size_t idx, UInt64 window_size) const { if (idx < window_size) return this->value[idx]; @@ -66,7 +66,7 @@ struct MovingAvgData : public MovingData { static constexpr auto name = "groupArrayMovingAvg"; - T get(size_t idx, UInt64 window_size) const + T NO_SANITIZE_UNDEFINED get(size_t idx, UInt64 window_size) const { if (idx < window_size) return this->value[idx] / window_size; @@ -114,7 +114,7 @@ public: return std::make_shared(std::make_shared()); } - void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena * arena) const override + void NO_SANITIZE_UNDEFINED add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena * arena) const override { auto value = static_cast(*columns[0]).getData()[row_num]; this->data(place).add(static_cast(value), arena); diff --git a/tests/queries/0_stateless/01177_group_array_moving.sql b/tests/queries/0_stateless/01177_group_array_moving.sql index b1969e204fc..5689cd95f75 100644 --- a/tests/queries/0_stateless/01177_group_array_moving.sql +++ b/tests/queries/0_stateless/01177_group_array_moving.sql 
@@ -1,2 +1,4 @@ SELECT groupArrayMovingSum(257)(-9223372036854775808), groupArrayMovingSum(1048575)(18446744073709551615), groupArrayMovingSum(9223372036854775807)(number * 9223372036854775807) FROM remote('127.0.0.{1..2}', numbers(3)); SELECT groupArrayMovingAvg(257)(-9223372036854775808), groupArrayMovingAvg(1048575)(18446744073709551615), groupArrayMovingAvg(9223372036854775807)(number * 9223372036854775807) FROM remote('127.0.0.{1..2}', numbers(3)); + +SELECT groupArrayMovingSum(257)(-9223372036854775808), groupArrayMovingSum(1)(10.000100135803223, [NULL, NULL], NULL), groupArrayMovingSum(NULL)(NULL) FROM numbers(1023) FORMAT Null; From 7c04f15c8031a63f20573b9948dd18005f860f26 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Sat, 20 Feb 2021 09:11:42 +0300 Subject: [PATCH 0525/2357] Add log message when stacktrace cannot be obtained for thread This is to provide better diagnostics for 01051_system_stack_trace failure [1]. [1]: https://clickhouse-test-reports.s3.yandex.net/20881/866dfaec793f764dc9ba167d3ac9f6521b9b3381/functional_stateless_tests_(release,_wide_parts_enabled).html#fail1 --- src/Storages/System/StorageSystemStackTrace.cpp | 4 ++++ src/Storages/System/StorageSystemStackTrace.h | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/src/Storages/System/StorageSystemStackTrace.cpp b/src/Storages/System/StorageSystemStackTrace.cpp index abb2fdf54ed..e74d56108ad 100644 --- a/src/Storages/System/StorageSystemStackTrace.cpp +++ b/src/Storages/System/StorageSystemStackTrace.cpp @@ -16,6 +16,7 @@ #include #include #include +#include namespace DB @@ -150,6 +151,7 @@ namespace StorageSystemStackTrace::StorageSystemStackTrace(const StorageID & table_id_) : IStorageSystemOneBlock(table_id_) + , log(&Poco::Logger::get("StorageSystemStackTrace")) { notification_pipe.open(); @@ -229,6 +231,8 @@ void StorageSystemStackTrace::fillData(MutableColumns & res_columns, const Conte } else { + LOG_DEBUG(log, "Cannot obtain a stack trace for thread {}", tid); + /// Cannot obtain a stack trace. But create a record in result nevertheless. 
res_columns[0]->insert(tid); diff --git a/src/Storages/System/StorageSystemStackTrace.h b/src/Storages/System/StorageSystemStackTrace.h index a389f02eb09..582618d2ecd 100644 --- a/src/Storages/System/StorageSystemStackTrace.h +++ b/src/Storages/System/StorageSystemStackTrace.h @@ -6,6 +6,10 @@ #include #include +namespace Poco +{ +class Logger; +} namespace DB { @@ -30,6 +34,8 @@ protected: void fillData(MutableColumns & res_columns, const Context & context, const SelectQueryInfo & query_info) const override; mutable std::mutex mutex; + + Poco::Logger * log; }; } From 8361904b4d7cdbc8b00390de17e5194055cdc38e Mon Sep 17 00:00:00 2001 From: Denis Glazachev Date: Sat, 20 Feb 2021 10:47:29 +0400 Subject: [PATCH 0526/2357] Add cases for throwIf() --- tests/integration/test_row_policy/test.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_row_policy/test.py b/tests/integration/test_row_policy/test.py index c83e9cb8ec3..22da7b10b0b 100644 --- a/tests/integration/test_row_policy/test.py +++ b/tests/integration/test_row_policy/test.py @@ -166,7 +166,17 @@ def test_with_prewhere(): assert node.query("SELECT c, d FROM mydb.filtered_table2 PREWHERE a < 4 WHERE b < 10") == TSV([[3, 4]]) -def test_with_throwif_in_where(): +def test_throwif_error_in_where_with_same_condition_as_filter(): + copy_policy_xml('normal_filter2_table2.xml') + assert 'expected' in node.query_and_get_error("SELECT * FROM mydb.filtered_table2 WHERE throwIf(a > 0, 'expected') = 0 SETTINGS optimize_move_to_prewhere = 0") + + +def test_throwif_error_in_prewhere_with_same_condition_as_filter(): + copy_policy_xml('normal_filter2_table2.xml') + assert 'expected' in node.query_and_get_error("SELECT * FROM mydb.filtered_table2 PREWHERE throwIf(a > 0, 'expected') = 0") + + +def test_throwif_in_where_doesnt_expose_restricted_data(): copy_policy_xml('no_filters.xml') assert 'expected' in node.query_and_get_error("SELECT * FROM mydb.filtered_table2 WHERE throwIf(a = 0, 'expected') = 0 SETTINGS optimize_move_to_prewhere = 0") @@ -175,7 +185,7 @@ def test_with_throwif_in_where(): [1, 2, 3, 4], [4, 3, 2, 1]]) -def test_with_throwif_in_prewhere(): +def test_throwif_in_prewhere_doesnt_expose_restricted_data(): copy_policy_xml('no_filters.xml') assert 'expected' in node.query_and_get_error("SELECT * FROM mydb.filtered_table2 PREWHERE throwIf(a = 0, 'expected') = 0") From 4390cb3d73f8672269fe030a709899ca119909a9 Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Sat, 20 Feb 2021 09:49:02 +0300 Subject: [PATCH 0527/2357] Update config.xml --- programs/server/config.xml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/programs/server/config.xml b/programs/server/config.xml index fe2a068787b..ba9b8b04b05 100644 --- a/programs/server/config.xml +++ b/programs/server/config.xml @@ -285,10 +285,9 @@ Cache is used when 'use_uncompressed_cache' user setting turned on (off by default). Uncompressed cache is advantageous only for very short queries and in rare cases. - Note: uncompressed cache is pointless for lz4, because memory bandwidth is slower than multi-core decompression. - Enabling it will only make queries slower. - If number of CPU cores is in order of 100 and memory bandwidth is in range of 100-200 GB/sec, - there is a chance it is also being pointless for zstd. + Note: uncompressed cache can be pointless for lz4, because memory bandwidth + is slower than multi-core decompression on some server configurations. 
+ Enabling it can sometimes paradoxically make queries slower. --> 8589934592 From f820047cc841fa2b129e3f3d20ebcc0c28d1940c Mon Sep 17 00:00:00 2001 From: kssenii Date: Fri, 19 Feb 2021 15:48:48 +0000 Subject: [PATCH 0528/2357] Fix --- .../PostgreSQL/fetchPostgreSQLTableStructure.cpp | 7 +++++-- tests/integration/test_storage_postgresql/test.py | 8 ++++---- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/Databases/PostgreSQL/fetchPostgreSQLTableStructure.cpp b/src/Databases/PostgreSQL/fetchPostgreSQLTableStructure.cpp index 15ce9a1baed..e065a497115 100644 --- a/src/Databases/PostgreSQL/fetchPostgreSQLTableStructure.cpp +++ b/src/Databases/PostgreSQL/fetchPostgreSQLTableStructure.cpp @@ -56,7 +56,7 @@ static DataTypePtr convertPostgreSQLDataType(std::string & type, bool is_nullabl { /// Numeric and decimal will both end up here as numeric. If it has type and precision, /// there will be Numeric(x, y), otherwise just Numeric - uint32_t precision, scale; + UInt32 precision, scale; if (type.ends_with(")")) { res = DataTypeFactory::instance().get(type); @@ -71,11 +71,14 @@ static DataTypePtr convertPostgreSQLDataType(std::string & type, bool is_nullabl res = std::make_shared>(precision, scale); else if (precision <= DecimalUtils::maxPrecision()) res = std::make_shared>(precision, scale); + else + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Presicion {} and scale {} are too big and not supported", precision, scale); } else { precision = DecimalUtils::maxPrecision(); - res = std::make_shared>(precision, precision); + scale = precision >> 1; + res = std::make_shared>(precision, scale); } } diff --git a/tests/integration/test_storage_postgresql/test.py b/tests/integration/test_storage_postgresql/test.py index 03af32a4803..cee495438a2 100644 --- a/tests/integration/test_storage_postgresql/test.py +++ b/tests/integration/test_storage_postgresql/test.py @@ -63,13 +63,13 @@ def test_postgres_conversions(started_cluster): cursor.execute( '''CREATE TABLE IF NOT EXISTS test_types ( a smallint, b integer, c bigint, d real, e double precision, f serial, g bigserial, - h timestamp, i date, j decimal(5, 5), k numeric)''') + h timestamp, i date, j decimal(5, 3), k numeric)''') node1.query(''' INSERT INTO TABLE FUNCTION postgresql('postgres1:5432', 'clickhouse', 'test_types', 'postgres', 'mysecretpassword') VALUES - (-32768, -2147483648, -9223372036854775808, 1.12345, 1.1234567890, 2147483647, 9223372036854775807, '2000-05-12 12:12:12', '2000-05-12', 0.22222, 0.22222)''') + (-32768, -2147483648, -9223372036854775808, 1.12345, 1.1234567890, 2147483647, 9223372036854775807, '2000-05-12 12:12:12', '2000-05-12', 22.222, 22.222)''') result = node1.query(''' - SELECT a, b, c, d, e, f, g, h, i, j, toDecimal32(k, 5) FROM postgresql('postgres1:5432', 'clickhouse', 'test_types', 'postgres', 'mysecretpassword')''') - assert(result == '-32768\t-2147483648\t-9223372036854775808\t1.12345\t1.123456789\t2147483647\t9223372036854775807\t2000-05-12 12:12:12\t2000-05-12\t0.22222\t0.22222\n') + SELECT a, b, c, d, e, f, g, h, i, j, toDecimal128(k, 3) FROM postgresql('postgres1:5432', 'clickhouse', 'test_types', 'postgres', 'mysecretpassword')''') + assert(result == '-32768\t-2147483648\t-9223372036854775808\t1.12345\t1.123456789\t2147483647\t9223372036854775807\t2000-05-12 12:12:12\t2000-05-12\t22.222\t22.222\n') cursor.execute( '''CREATE TABLE IF NOT EXISTS test_array_dimensions From 0d88366b2775bdcb60ae3eb18bc9fcb2ce7eef01 Mon Sep 17 00:00:00 2001 From: kssenii Date: Sat, 20 Feb 2021 07:07:50 +0000 Subject: 
[PATCH 0529/2357] Add forgotten .reference file update --- ...7_http_compression_prefer_brotli.reference | 23 ------------------- 1 file changed, 23 deletions(-) diff --git a/tests/queries/0_stateless/01057_http_compression_prefer_brotli.reference b/tests/queries/0_stateless/01057_http_compression_prefer_brotli.reference index c28cbee8485..5dd396a38c9 100644 --- a/tests/queries/0_stateless/01057_http_compression_prefer_brotli.reference +++ b/tests/queries/0_stateless/01057_http_compression_prefer_brotli.reference @@ -9,26 +9,3 @@ 999997 999998 999999 - }, - { - "datetime": "2020-12-12", - "pipeline": "test-pipeline", - "host": "clickhouse-test-host-001.clickhouse.com", - "home": "clickhouse", - "detail": "clickhouse", - "row_number": "999998" - }, - { - "datetime": "2020-12-12", - "pipeline": "test-pipeline", - "host": "clickhouse-test-host-001.clickhouse.com", - "home": "clickhouse", - "detail": "clickhouse", - "row_number": "999999" - } - ], - - "rows": 1000000, - - "rows_before_limit_at_least": 1048080, - From 5d36ceaaee50c1442dfef55a3d98c240ee2f7bd6 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Sat, 20 Feb 2021 08:31:05 +0300 Subject: [PATCH 0530/2357] Fix WriteBufferFromHTTPServerResponse usage in odbc-bridge --- programs/odbc-bridge/ColumnInfoHandler.cpp | 10 ++++++++- .../odbc-bridge/IdentifierQuoteHandler.cpp | 10 ++++++++- programs/odbc-bridge/MainHandler.cpp | 22 +++++++++++++++++-- programs/odbc-bridge/SchemaAllowedHandler.cpp | 10 ++++++++- 4 files changed, 47 insertions(+), 5 deletions(-) diff --git a/programs/odbc-bridge/ColumnInfoHandler.cpp b/programs/odbc-bridge/ColumnInfoHandler.cpp index 5aef7f1ac38..14fa734f246 100644 --- a/programs/odbc-bridge/ColumnInfoHandler.cpp +++ b/programs/odbc-bridge/ColumnInfoHandler.cpp @@ -160,7 +160,15 @@ void ODBCColumnsInfoHandler::handleRequest(HTTPServerRequest & request, HTTPServ } WriteBufferFromHTTPServerResponse out(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, keep_alive_timeout); - writeStringBinary(columns.toString(), out); + try + { + writeStringBinary(columns.toString(), out); + out.finalize(); + } + catch (...) + { + out.finalize(); + } } catch (...) { diff --git a/programs/odbc-bridge/IdentifierQuoteHandler.cpp b/programs/odbc-bridge/IdentifierQuoteHandler.cpp index ec4e4493d61..5060d37c479 100644 --- a/programs/odbc-bridge/IdentifierQuoteHandler.cpp +++ b/programs/odbc-bridge/IdentifierQuoteHandler.cpp @@ -50,7 +50,15 @@ void IdentifierQuoteHandler::handleRequest(HTTPServerRequest & request, HTTPServ auto identifier = getIdentifierQuote(hdbc); WriteBufferFromHTTPServerResponse out(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, keep_alive_timeout); - writeStringBinary(identifier, out); + try + { + writeStringBinary(identifier, out); + out.finalize(); + } + catch (...) + { + out.finalize(); + } } catch (...) { diff --git a/programs/odbc-bridge/MainHandler.cpp b/programs/odbc-bridge/MainHandler.cpp index b9670397878..4fcc9deea6a 100644 --- a/programs/odbc-bridge/MainHandler.cpp +++ b/programs/odbc-bridge/MainHandler.cpp @@ -187,9 +187,27 @@ void ODBCHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse auto message = getCurrentExceptionMessage(true); response.setStatusAndReason( Poco::Net::HTTPResponse::HTTP_INTERNAL_SERVER_ERROR); // can't call process_error, because of too soon response sending - writeStringBinary(message, out); - tryLogCurrentException(log); + try + { + writeStringBinary(message, out); + out.finalize(); + } + catch (...) 
+ { + tryLogCurrentException(log); + } + + tryLogCurrentException(log); + } + + try + { + out.finalize(); + } + catch (...) + { + tryLogCurrentException(log); } } diff --git a/programs/odbc-bridge/SchemaAllowedHandler.cpp b/programs/odbc-bridge/SchemaAllowedHandler.cpp index 48744b6d2ca..d4a70db61f4 100644 --- a/programs/odbc-bridge/SchemaAllowedHandler.cpp +++ b/programs/odbc-bridge/SchemaAllowedHandler.cpp @@ -61,7 +61,15 @@ void SchemaAllowedHandler::handleRequest(HTTPServerRequest & request, HTTPServer bool result = isSchemaAllowed(hdbc); WriteBufferFromHTTPServerResponse out(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, keep_alive_timeout); - writeBoolText(result, out); + try + { + writeBoolText(result, out); + out.finalize(); + } + catch (...) + { + out.finalize(); + } } catch (...) { From 1ccb333ac50e1e62d9507e424c3daeee465e14f9 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Sat, 20 Feb 2021 08:28:47 +0300 Subject: [PATCH 0531/2357] Fix WriteBufferFromHTTPServerResponse usage in other places (add missing finalize()) Since I saw the following: 0. DB::WriteBufferFromOStream::nextImpl() 1. DB::WriteBufferFromHTTPServerResponse::nextImpl() 2. DB::WriteBufferFromHTTPServerResponse::finalize() 3. DB::WriteBufferFromHTTPServerResponse::~WriteBufferFromHTTPServerResponse() 4. DB::StaticRequestHandler::handleRequest(Poco::Net::HTTPServerRequest&, Poco::Net::HTTPServerResponse&) 5. Poco::Net::HTTPServerConnection::run() 6. Poco::Net::TCPServerConnection::start() --- src/Server/InterserverIOHTTPHandler.cpp | 26 +++++++++++++++++++------ src/Server/PrometheusRequestHandler.cpp | 13 ++++++++++--- src/Server/StaticRequestHandler.cpp | 2 ++ 3 files changed, 32 insertions(+), 9 deletions(-) diff --git a/src/Server/InterserverIOHTTPHandler.cpp b/src/Server/InterserverIOHTTPHandler.cpp index 3296da94578..740072e8e9f 100644 --- a/src/Server/InterserverIOHTTPHandler.cpp +++ b/src/Server/InterserverIOHTTPHandler.cpp @@ -94,6 +94,23 @@ void InterserverIOHTTPHandler::handleRequest(HTTPServerRequest & request, HTTPSe used_output.out = std::make_shared( response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, keep_alive_timeout); + auto write_response = [&](const std::string & message) + { + if (response.sent()) + return; + + auto & out = *used_output.out; + try + { + writeString(message, out); + out.finalize(); + } + catch (...) 
+ { + out.finalize(); + } + }; + try { if (auto [message, success] = checkAuthentication(request); success) @@ -104,8 +121,7 @@ void InterserverIOHTTPHandler::handleRequest(HTTPServerRequest & request, HTTPSe else { response.setStatusAndReason(HTTPServerResponse::HTTP_UNAUTHORIZED); - if (!response.sent()) - writeString(message, *used_output.out); + write_response(message); LOG_WARNING(log, "Query processing failed request: '{}' authentication failed", request.getURI()); } } @@ -120,8 +136,7 @@ void InterserverIOHTTPHandler::handleRequest(HTTPServerRequest & request, HTTPSe bool is_real_error = e.code() != ErrorCodes::ABORTED; std::string message = getCurrentExceptionMessage(is_real_error); - if (!response.sent()) - writeString(message, *used_output.out); + write_response(message); if (is_real_error) LOG_ERROR(log, message); @@ -132,8 +147,7 @@ void InterserverIOHTTPHandler::handleRequest(HTTPServerRequest & request, HTTPSe { response.setStatusAndReason(Poco::Net::HTTPResponse::HTTP_INTERNAL_SERVER_ERROR); std::string message = getCurrentExceptionMessage(false); - if (!response.sent()) - writeString(message, *used_output.out); + write_response(message); LOG_ERROR(log, message); } diff --git a/src/Server/PrometheusRequestHandler.cpp b/src/Server/PrometheusRequestHandler.cpp index 83cb8e85a9e..bf78a37166a 100644 --- a/src/Server/PrometheusRequestHandler.cpp +++ b/src/Server/PrometheusRequestHandler.cpp @@ -24,9 +24,16 @@ void PrometheusRequestHandler::handleRequest(HTTPServerRequest & request, HTTPSe response.setContentType("text/plain; version=0.0.4; charset=UTF-8"); - auto wb = WriteBufferFromHTTPServerResponse(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, keep_alive_timeout); - metrics_writer.write(wb); - wb.finalize(); + WriteBufferFromHTTPServerResponse wb(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, keep_alive_timeout); + try + { + metrics_writer.write(wb); + wb.finalize(); + } + catch (...) + { + wb.finalize(); + } } catch (...) 
{ diff --git a/src/Server/StaticRequestHandler.cpp b/src/Server/StaticRequestHandler.cpp index f3f564c1cf8..9f959239be9 100644 --- a/src/Server/StaticRequestHandler.cpp +++ b/src/Server/StaticRequestHandler.cpp @@ -126,6 +126,8 @@ void StaticRequestHandler::handleRequest(HTTPServerRequest & request, HTTPServer std::string exception_message = getCurrentExceptionMessage(false, true); trySendExceptionToClient(exception_message, exception_code, request, response, *out); } + + out->finalize(); } void StaticRequestHandler::writeResponse(WriteBuffer & out) From 2ab37d025a62f650d4b90f5fafa23f4076ab3844 Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Sat, 20 Feb 2021 16:14:38 +0800 Subject: [PATCH 0532/2357] Skip non-parallel tests --- tests/queries/skip_list.json | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/queries/skip_list.json b/tests/queries/skip_list.json index fdb845b7e72..1164d7b0004 100644 --- a/tests/queries/skip_list.json +++ b/tests/queries/skip_list.json @@ -364,6 +364,7 @@ "00626_replace_partition_from_table", "00626_replace_partition_from_table_zookeeper", "00633_materialized_view_and_too_many_parts_zookeeper", + "00643_cast_zookeeper", "00652_mergetree_mutations", "00652_replicated_mutations_zookeeper", "00682_empty_parts_merge", @@ -577,10 +578,11 @@ "01602_show_create_view", "01603_rename_overwrite_bug", "01646_system_restart_replicas_smoke", // system restart replicas is a global query - "01676_dictget_in_default_expression", - "01715_background_checker_blather_zookeeper", - "01700_system_zookeeper_path_in", + "01656_test_query_log_factories_info", "01669_columns_declaration_serde", + "01676_dictget_in_default_expression", + "01700_system_zookeeper_path_in", + "01715_background_checker_blather_zookeeper", "attach", "ddl_dictionaries", "dictionary", From d947dbc185beee7a78bf73ba2aceeb81e664e013 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 20 Feb 2021 11:44:35 +0300 Subject: [PATCH 0533/2357] Add test to skip list --- tests/queries/0_stateless/arcadia_skip_list.txt | 1 + tests/queries/skip_list.json | 9 ++++++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/queries/0_stateless/arcadia_skip_list.txt b/tests/queries/0_stateless/arcadia_skip_list.txt index 5466fb4bfb8..4e523545938 100644 --- a/tests/queries/0_stateless/arcadia_skip_list.txt +++ b/tests/queries/0_stateless/arcadia_skip_list.txt @@ -206,3 +206,4 @@ 01683_dist_INSERT_block_structure_mismatch 01702_bitmap_native_integers 01686_event_time_microseconds_part_log +01017_uniqCombined_memory_usage diff --git a/tests/queries/skip_list.json b/tests/queries/skip_list.json index fdb845b7e72..70963190125 100644 --- a/tests/queries/skip_list.json +++ b/tests/queries/skip_list.json @@ -17,7 +17,8 @@ "functions_bad_arguments", /// Too long for TSan "01603_read_with_backoff_bug", /// Too long for TSan "01646_system_restart_replicas_smoke", /// RESTART REPLICAS can acquire too much locks, while only 64 is possible from one thread under TSan - "01641_memory_tracking_insert_optimize" /// INSERT lots of rows is too heavy for TSan + "01641_memory_tracking_insert_optimize", /// INSERT lots of rows is too heavy for TSan + "01017_uniqCombined_memory_usage" /// Fine thresholds on memory usage ], "address-sanitizer": [ "00877", @@ -27,7 +28,8 @@ "01103_check_cpu_instructions_at_startup", "01473_event_time_microseconds", "01526_max_untracked_memory", /// requires TraceCollector, does not available under sanitizers - "01193_metadata_loading" + "01193_metadata_loading", + 
"01017_uniqCombined_memory_usage" /// Fine thresholds on memory usage ], "ub-sanitizer": [ "capnproto", @@ -48,7 +50,8 @@ "00877_memory_limit_for_new_delete", /// memory limits don't work correctly under msan because it replaces malloc/free "01473_event_time_microseconds", "01526_max_untracked_memory", /// requires TraceCollector, does not available under sanitizers - "01193_metadata_loading" + "01193_metadata_loading", + "01017_uniqCombined_memory_usage" /// Fine thresholds on memory usage ], "debug-build": [ "query_profiler", From f37631830f8139a68c42111c11584956f992630a Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Sat, 20 Feb 2021 16:45:25 +0800 Subject: [PATCH 0534/2357] Comments --- src/Interpreters/FunctionNameNormalizer.cpp | 4 ++++ src/Interpreters/ya.make | 1 + src/Server/TCPHandler.cpp | 4 +++- 3 files changed, 8 insertions(+), 1 deletion(-) diff --git a/src/Interpreters/FunctionNameNormalizer.cpp b/src/Interpreters/FunctionNameNormalizer.cpp index 36ccc9340ea..255f4d8c6bb 100644 --- a/src/Interpreters/FunctionNameNormalizer.cpp +++ b/src/Interpreters/FunctionNameNormalizer.cpp @@ -14,6 +14,8 @@ void FunctionNameNormalizer::visit(IAST * ast) if (!ast) return; + // Normalize only selected children. Avoid normalizing engine clause because some engine might + // have the same name as function, e.g. Log. if (auto * node_storage = ast->as()) { visit(node_storage->partition_by); @@ -24,6 +26,8 @@ void FunctionNameNormalizer::visit(IAST * ast) return; } + // Normalize only selected children. Avoid normalizing type clause because some type might + // have the same name as function, e.g. Date. if (auto * node_decl = ast->as()) { visit(node_decl->default_expression.get()); diff --git a/src/Interpreters/ya.make b/src/Interpreters/ya.make index cd4980927e4..e7882ec8d98 100644 --- a/src/Interpreters/ya.make +++ b/src/Interpreters/ya.make @@ -58,6 +58,7 @@ SRCS( ExternalModelsLoader.cpp ExtractExpressionInfoVisitor.cpp FillingRow.cpp + FunctionNameNormalizer.cpp HashJoin.cpp IExternalLoadable.cpp IInterpreter.cpp diff --git a/src/Server/TCPHandler.cpp b/src/Server/TCPHandler.cpp index 9794a86d3e3..d2ce2a409a9 100644 --- a/src/Server/TCPHandler.cpp +++ b/src/Server/TCPHandler.cpp @@ -1133,7 +1133,9 @@ void TCPHandler::receiveQuery() } query_context->applySettingsChanges(settings_changes); - /// Disable function name normalization it's a secondary query. + /// Disable function name normalization when it's a secondary query, because queries are either + /// already normalized on initiator node, or not normalized and should remain unnormalized for + /// compatibility. 
if (client_info.query_kind == ClientInfo::QueryKind::SECONDARY_QUERY) { query_context->setSetting("normalize_function_names", Field(0)); From a38a31c954aa03251767f769f8c6b5584165b2dd Mon Sep 17 00:00:00 2001 From: kssenii Date: Sat, 20 Feb 2021 09:58:24 +0000 Subject: [PATCH 0535/2357] Fix typos check --- src/Databases/PostgreSQL/fetchPostgreSQLTableStructure.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Databases/PostgreSQL/fetchPostgreSQLTableStructure.cpp b/src/Databases/PostgreSQL/fetchPostgreSQLTableStructure.cpp index e065a497115..d3a42ead3f6 100644 --- a/src/Databases/PostgreSQL/fetchPostgreSQLTableStructure.cpp +++ b/src/Databases/PostgreSQL/fetchPostgreSQLTableStructure.cpp @@ -72,7 +72,7 @@ static DataTypePtr convertPostgreSQLDataType(std::string & type, bool is_nullabl else if (precision <= DecimalUtils::maxPrecision()) res = std::make_shared>(precision, scale); else - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Presicion {} and scale {} are too big and not supported", precision, scale); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Precision {} and scale {} are too big and not supported", precision, scale); } else { From 673e24d7ef8ea1f181cc79e4df53a09564dc8332 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Sat, 20 Feb 2021 14:00:16 +0300 Subject: [PATCH 0536/2357] Refactor --- src/Interpreters/ActionsDAG.h | 2 + src/Interpreters/ExpressionAnalyzer.cpp | 17 +- src/Interpreters/InterpreterSelectQuery.cpp | 51 +++-- .../MergeTreeBaseSelectProcessor.cpp | 2 + .../MergeTree/MergeTreeBlockReadUtils.cpp | 24 +- .../MergeTree/MergeTreeRangeReader.cpp | 213 +++++++++++------- src/Storages/MergeTree/MergeTreeRangeReader.h | 6 +- 7 files changed, 196 insertions(+), 119 deletions(-) diff --git a/src/Interpreters/ActionsDAG.h b/src/Interpreters/ActionsDAG.h index d3f1d65d454..18f6f9a89ee 100644 --- a/src/Interpreters/ActionsDAG.h +++ b/src/Interpreters/ActionsDAG.h @@ -208,6 +208,8 @@ public: const Context & context, bool can_replace = false); + void addNodeToIndex(const Node * node) { index.insert(const_cast(node)); } + /// Call addAlias several times. void addAliases(const NamesWithAliases & aliases); /// Add alias actions and remove unused columns from index. Also specify result columns order in index. 
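A minimal sketch of how the new addNodeToIndex() helper is driven by the row-policy filter construction (the actual call site is generateFilterActions() in the InterpreterSelectQuery.cpp diff below; the variable names here are only illustrative):

    /// Keep just the filter column as the DAG's named output, but do not project
    /// the inputs away, then put every input node back into the index so that
    /// executing the DAG yields {filter_column, original input columns}.
    actions->removeUnusedActions({filter_column_name});
    actions->projectInput(false);
    for (const auto * input : actions->getInputs())
        actions->addNodeToIndex(input);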
diff --git a/src/Interpreters/ExpressionAnalyzer.cpp b/src/Interpreters/ExpressionAnalyzer.cpp index 3145df23b95..2dc8d137abe 100644 --- a/src/Interpreters/ExpressionAnalyzer.cpp +++ b/src/Interpreters/ExpressionAnalyzer.cpp @@ -1405,7 +1405,8 @@ ExpressionAnalysisResult::ExpressionAnalysisResult( if (storage && filter_info_) { filter_info = filter_info_; - query_analyzer.appendPreliminaryFilter(chain, filter_info->actions, filter_info->column_name); + filter_info->do_remove_column = true; + //query_analyzer.appendPreliminaryFilter(chain, filter_info->actions, filter_info->column_name); } if (auto actions = query_analyzer.appendPrewhere(chain, !first_stage, additional_required_columns_after_prewhere)) @@ -1574,11 +1575,11 @@ void ExpressionAnalysisResult::finalize(const ExpressionActionsChain & chain, si { size_t next_step_i = 0; - if (hasFilter()) - { - const ExpressionActionsChain::Step & step = *chain.steps.at(next_step_i++); - filter_info->do_remove_column = step.can_remove_required_output.at(0); - } + // if (hasFilter()) + // { + // const ExpressionActionsChain::Step & step = *chain.steps.at(next_step_i++); + // filter_info->do_remove_column = step.can_remove_required_output.at(0); + // } if (hasPrewhere()) { @@ -1605,8 +1606,8 @@ void ExpressionAnalysisResult::finalize(const ExpressionActionsChain & chain, si void ExpressionAnalysisResult::removeExtraColumns() const { - if (hasFilter()) - filter_info->actions->projectInput(); + // if (hasFilter()) + // filter_info->actions->projectInput(); if (hasWhere()) before_where->projectInput(); if (hasHaving()) diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index 95b42bede8b..54481cbe873 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -141,7 +141,15 @@ String InterpreterSelectQuery::generateFilterActions(ActionsDAGPtr & actions, co SelectQueryExpressionAnalyzer analyzer(query_ast, syntax_result, *context, metadata_snapshot); actions = analyzer.simpleSelectActions(); - return expr_list->children.at(0)->getColumnName(); + auto column_name = expr_list->children.at(0)->getColumnName(); + actions->removeUnusedActions({column_name}); + actions->projectInput(false); + + ActionsDAG::Index index; + for (const auto * node : actions->getInputs()) + actions->addNodeToIndex(node); + + return column_name; } InterpreterSelectQuery::InterpreterSelectQuery( @@ -444,16 +452,22 @@ InterpreterSelectQuery::InterpreterSelectQuery( if (storage) { - source_header = metadata_snapshot->getSampleBlockForColumns(required_columns, storage->getVirtuals(), storage->getStorageID()); - /// Fix source_header for filter actions. if (row_policy_filter) { filter_info = std::make_shared(); filter_info->column_name = generateFilterActions(filter_info->actions, required_columns); - source_header = metadata_snapshot->getSampleBlockForColumns( - filter_info->actions->getRequiredColumns().getNames(), storage->getVirtuals(), storage->getStorageID()); + + auto required_columns_from_filter = filter_info->actions->getRequiredColumns(); + + for (const auto & column : required_columns_from_filter) + { + if (required_columns.end() == std::find(required_columns.begin(), required_columns.end(), column.name)) + required_columns.push_back(column.name); + } } + + source_header = metadata_snapshot->getSampleBlockForColumns(required_columns, storage->getVirtuals(), storage->getStorageID()); } /// Calculate structure of the result. 
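To make the constructor hunk above concrete, a hedged illustration (the table, columns and policy condition are invented for the example):

    // table t(x, y), row policy condition `y = 1`, query `SELECT x FROM t`
    //   filter DAG required columns : { y }
    //   required_columns            : { x } -> { x, y }   (y appended by the loop above)
    //   source_header               : x, y
    // The filter column produced by the DAG exists only for the row-level filtering
    // step and is erased again once that step has run (see the later hunks that erase
    // row_level_column_name after executing the row-level filter).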
@@ -834,6 +848,7 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu expressions.prewhere_info = std::make_shared( std::move(expressions.filter_info->actions), std::move(expressions.filter_info->column_name)); + expressions.prewhere_info->prewhere_actions->projectInput(false); expressions.prewhere_info->remove_prewhere_column = expressions.filter_info->do_remove_column; expressions.prewhere_info->need_filter = true; expressions.filter_info = nullptr; @@ -845,19 +860,19 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu expressions.prewhere_info->row_level_filter_actions = std::move(expressions.filter_info->actions); expressions.prewhere_info->row_level_column_name = std::move(expressions.filter_info->column_name); expressions.prewhere_info->row_level_filter_actions->projectInput(false); - if (expressions.filter_info->do_remove_column) - { - /// Instead of removing column, add it to prewhere_actions input (but not in index). - /// It will be removed at prewhere_actions execution. - const auto & index = expressions.prewhere_info->row_level_filter_actions->getIndex(); - auto it = index.find(expressions.prewhere_info->row_level_column_name); - if (it == index.end()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Not found column {} in row level security filter {}", - expressions.prewhere_info->row_level_column_name, expressions.prewhere_info->row_level_filter_actions->dumpDAG()); - const auto & node = *it; + // if (expressions.filter_info->do_remove_column) + // { + // /// Instead of removing column, add it to prewhere_actions input (but not in index). + // /// It will be removed at prewhere_actions execution. + // const auto & index = expressions.prewhere_info->row_level_filter_actions->getIndex(); + // auto it = index.find(expressions.prewhere_info->row_level_column_name); + // if (it == index.end()) + // throw Exception(ErrorCodes::LOGICAL_ERROR, "Not found column {} in row level security filter {}", + // expressions.prewhere_info->row_level_column_name, expressions.prewhere_info->row_level_filter_actions->dumpDAG()); + // const auto & node = *it; - expressions.prewhere_info->prewhere_actions->addInput(node->result_name, node->result_type, true, false); - } + // expressions.prewhere_info->prewhere_actions->addInput(node->result_name, node->result_type, true, false); + // } expressions.filter_info = nullptr; } @@ -1285,7 +1300,7 @@ void InterpreterSelectQuery::addEmptySourceToQueryPlan(QueryPlan & query_plan, c header, prewhere_info.row_level_filter, prewhere_info.row_level_column_name, - false); + true); }); } diff --git a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp index 4911f9982d5..6bf164dd824 100644 --- a/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp +++ b/src/Storages/MergeTree/MergeTreeBaseSelectProcessor.cpp @@ -346,6 +346,8 @@ void MergeTreeBaseSelectProcessor::executePrewhereActions(Block & block, const P throw Exception("Invalid type for filter in PREWHERE: " + row_level_column.type->getName(), ErrorCodes::LOGICAL_ERROR); } + + block.erase(prewhere_info->row_level_column_name); } if (prewhere_info->prewhere_actions) diff --git a/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp b/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp index ed5fc48dad1..bf5fd307b1d 100644 --- a/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp +++ b/src/Storages/MergeTree/MergeTreeBlockReadUtils.cpp @@ -271,11 +271,22 @@ MergeTreeReadTaskColumns 
getReadTaskColumns( { if (prewhere_info->alias_actions) pre_column_names = prewhere_info->alias_actions->getRequiredColumns(); - else if (prewhere_info->row_level_filter) - pre_column_names = prewhere_info->row_level_filter->getRequiredColumns(); - else if (prewhere_info->prewhere_actions) + else + { pre_column_names = prewhere_info->prewhere_actions->getRequiredColumns(); + if (prewhere_info->row_level_filter) + { + NameSet names(pre_column_names.begin(), pre_column_names.end()); + + for (auto & name : prewhere_info->row_level_filter->getRequiredColumns()) + { + if (names.count(name) == 0) + pre_column_names.push_back(name); + } + } + } + if (pre_column_names.empty()) pre_column_names.push_back(column_names[0]); @@ -293,6 +304,13 @@ MergeTreeReadTaskColumns getReadTaskColumns( column_names = post_column_names; } + // std::cerr << "---------- Pre column names\n"; + // for (const auto & col : pre_column_names) + // std::cerr << col << std::endl; + // std::cerr << "----------- Post column names\n"; + // for (const auto & col : column_names) + // std::cerr << col << std::endl; + MergeTreeReadTaskColumns result; if (check_columns) diff --git a/src/Storages/MergeTree/MergeTreeRangeReader.cpp b/src/Storages/MergeTree/MergeTreeRangeReader.cpp index a3e4fbf6e6b..e72039f7172 100644 --- a/src/Storages/MergeTree/MergeTreeRangeReader.cpp +++ b/src/Storages/MergeTree/MergeTreeRangeReader.cpp @@ -33,6 +33,25 @@ static void filterColumns(Columns & columns, const IColumn::Filter & filter) } } +static void filterColumns(Columns & columns, const ColumnPtr & filter) +{ + ConstantFilterDescription const_descr(*filter); + if (const_descr.always_true) + return; + + if (const_descr.always_false) + { + for (auto & col : columns) + if (col) + col = col->cloneEmpty(); + + return; + } + + FilterDescription descr(*filter); + filterColumns(columns, *descr.data); +} + MergeTreeRangeReader::DelayedStream::DelayedStream( size_t from_mark, IMergeTreeReader * merge_tree_reader_) @@ -315,7 +334,7 @@ void MergeTreeRangeReader::ReadResult::setFilterConstFalse() num_rows = 0; } -void MergeTreeRangeReader::ReadResult::optimize(bool can_read_incomplete_granules) +void MergeTreeRangeReader::ReadResult::optimize(bool can_read_incomplete_granules, bool allow_filter_columns) { if (total_rows_per_granule == 0 || filter == nullptr) return; @@ -347,7 +366,7 @@ void MergeTreeRangeReader::ReadResult::optimize(bool can_read_incomplete_granule filter_holder_original = std::move(filter_holder); /// Check if const 1 after shrink - if (countBytesInResultFilter(filter->getData()) + total_zero_rows_in_tails == total_rows_per_granule) + if (allow_filter_columns && countBytesInResultFilter(filter->getData()) + total_zero_rows_in_tails == total_rows_per_granule) { total_rows_per_granule = total_rows_per_granule - total_zero_rows_in_tails; num_rows = total_rows_per_granule; @@ -451,79 +470,32 @@ size_t MergeTreeRangeReader::ReadResult::numZerosInTail(const UInt8 * begin, con return count; } -void MergeTreeRangeReader::ReadResult::addFilter(const ColumnPtr & new_filter) +void MergeTreeRangeReader::ReadResult::setFilter(const ColumnPtr & new_filter) { + if (!new_filter && filter) + throw Exception("Can't replace existing filter with empty.", ErrorCodes::LOGICAL_ERROR); + if (filter) { - if (!new_filter) - throw Exception("Can't add an empty filter to the existing one.", ErrorCodes::LOGICAL_ERROR); + size_t new_size = new_filter->size(); - const auto new_size = new_filter->size(); if (new_size != total_rows_per_granule) - throw Exception("Can't add 
the new filter because it's size is " + toString(new_size) + " but " + throw Exception("Can't set filter because it's size is " + toString(new_size) + " but " + toString(total_rows_per_granule) + " rows was read.", ErrorCodes::LOGICAL_ERROR); } ConstantFilterDescription const_description(*new_filter); if (const_description.always_true) - { - if (!filter) - setFilterConstTrue(); - } + setFilterConstTrue(); else if (const_description.always_false) - { clear(); - } else { - FilterDescription description(*new_filter); - auto new_holder = (description.data_holder ? description.data_holder : new_filter); - const auto * new_holder_cast = typeid_cast(new_holder.get()); - - if (!new_holder_cast) - throw Exception("addFilter function expected ColumnUInt8.", ErrorCodes::LOGICAL_ERROR); - - if (filter) - { - MutableColumnPtr new_mutable_holder = IColumn::mutate(std::move(new_holder)); - auto * new_mutable_holder_cast = typeid_cast(new_mutable_holder.get()); - - if (!new_mutable_holder_cast) - throw Exception("addFilter function expected ColumnUInt8.", ErrorCodes::LOGICAL_ERROR); - - const auto & data = filter->getData(); - const auto * it = data.begin(); - - auto & new_data = new_mutable_holder_cast->getData(); - auto * n_it = new_data.begin(); - - while (it != data.end() && n_it != new_data.end()) - { - *n_it = (*n_it && *it); - ++it; - ++n_it; - } - - ConstantFilterDescription new_const_description(*new_mutable_holder); - if (new_const_description.always_true) - { - setFilterConstTrue(); - } - else if (new_const_description.always_false) - { - clear(); - } - else - { - filter_holder = std::move(new_mutable_holder); - filter = new_mutable_holder_cast; - } - } - else - { - filter_holder = std::move(new_holder); - filter = new_holder_cast; - } + FilterDescription filter_description(*new_filter); + filter_holder = filter_description.data_holder ? 
filter_description.data_holder : new_filter; + filter = typeid_cast(filter_holder.get()); + if (!filter) + throw Exception("setFilter function expected ColumnUInt8.", ErrorCodes::LOGICAL_ERROR); } } @@ -565,7 +537,10 @@ MergeTreeRangeReader::MergeTreeRangeReader( prewhere_info->alias_actions->execute(sample_block, true); if (prewhere_info->row_level_filter) + { prewhere_info->row_level_filter->execute(sample_block, true); + sample_block.erase(prewhere_info->row_level_column_name); + } if (prewhere_info->prewhere_actions) prewhere_info->prewhere_actions->execute(sample_block, true); @@ -859,20 +834,76 @@ Columns MergeTreeRangeReader::continueReadingChain(ReadResult & result, size_t & return columns; } +static void checkCombindeFiltersSize(size_t bytes_in_first_filter, size_t second_filter_size) +{ + if (bytes_in_first_filter != second_filter_size) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Cannot combine filters because number of bytes in a first filter ({}) " + "does not match second filter size ({})", bytes_in_first_filter, second_filter_size); +} + +static ColumnPtr combineFilters(ColumnPtr first, ColumnPtr second) +{ + ConstantFilterDescription firsrt_const_descr(*first); + + if (firsrt_const_descr.always_true) + { + checkCombindeFiltersSize(first->size(), second->size()); + return second; + } + + if (firsrt_const_descr.always_false) + { + checkCombindeFiltersSize(0, second->size()); + return first; + } + + auto mut_first = IColumn::mutate(std::move(first)); + FilterDescription firsrt_descr(*mut_first); + + size_t bytes_in_first_filter = countBytesInFilter(*firsrt_descr.data); + checkCombindeFiltersSize(bytes_in_first_filter, second->size()); + + ConstantFilterDescription second_const_descr(*second); + + if (second_const_descr.always_true) + return mut_first; + + if (second_const_descr.always_false) + return second->cloneResized(mut_first->size()); + + FilterDescription second_descr(*second); + auto & first_data = const_cast(*firsrt_descr.data); + const auto * second_data = second_descr.data->data(); + + for (auto & val : first_data) + { + if (val) + { + val = *second_data; + ++second_data; + } + } + + return mut_first; +} + void MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(ReadResult & result) { if (!prewhere_info) return; const auto & header = merge_tree_reader->getColumns(); - const auto num_columns = header.size(); + size_t num_columns = header.size(); if (result.columns.size() != num_columns) throw Exception("Invalid number of columns passed to MergeTreeRangeReader. " "Expected " + toString(num_columns) + ", " "got " + toString(result.columns.size()), ErrorCodes::LOGICAL_ERROR); - size_t prewhere_column_pos = 0; + ColumnPtr filter; + ColumnPtr row_level_filter; + size_t prewhere_column_pos; { /// Restore block from columns list. 
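The semantics of the combineFilters() helper added above can be checked in isolation. The sketch below is a simplified, self-contained restatement on plain byte vectors (ColumnUInt8 and the constant/filter descriptions are replaced by std::vector<uint8_t>; it is an illustration, not ClickHouse code): the second filter has exactly one entry per row that survived the first, and each 1 in the first filter is overwritten by the next value of the second.

    #include <cassert>
    #include <cstdint>
    #include <vector>

    // Simplified model of combineFilters(): `second` is positional over the rows
    // that passed `first`; ones in `first` are replaced by consecutive values of `second`.
    std::vector<uint8_t> combineFiltersSketch(std::vector<uint8_t> first, const std::vector<uint8_t> & second)
    {
        size_t pos = 0;
        for (auto & val : first)
            if (val)
                val = second[pos++];
        return first;
    }

    int main()
    {
        /// The first filter keeps rows 0, 2 and 3; the second then drops the middle one of those.
        assert((combineFiltersSketch({1, 0, 1, 1, 0}, {1, 0, 1}) == std::vector<uint8_t>{1, 0, 0, 1, 0}));
    }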
@@ -900,44 +931,47 @@ void MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(ReadResult & r if (prewhere_info->row_level_filter) { prewhere_info->row_level_filter->execute(block); + auto row_level_filter_pos = block.getPositionByName(prewhere_info->row_level_column_name); + row_level_filter = block.getByPosition(row_level_filter_pos).column; + block.erase(row_level_filter_pos); - const auto filter_column_pos = block.getPositionByName(prewhere_info->row_level_column_name); - result.addFilter(block.getByPosition(filter_column_pos).column); - - result.columns.clear(); - result.columns.reserve(block.columns()); - - for (auto & col : block) - result.columns.emplace_back(std::move(col.column)); - - const auto * result_filter = result.getFilter(); - filterColumns(result.columns, result_filter->getData()); - - auto it = block.begin(); - for (auto & col : result.columns) - it++->column = std::move(col); - - result.columns.clear(); - result.clearFilter(); + auto columns = block.getColumns(); + filterColumns(columns, row_level_filter); + block.setColumns(columns); } prewhere_info->prewhere_actions->execute(block); prewhere_column_pos = block.getPositionByName(prewhere_info->prewhere_column_name); - result.addFilter(block.getByPosition(prewhere_column_pos).column); - - block.getByPosition(prewhere_column_pos).column.reset(); result.columns.clear(); result.columns.reserve(block.columns()); - for (auto & col : block) result.columns.emplace_back(std::move(col.column)); + + filter.swap(result.columns[prewhere_column_pos]); } + if (result.getFilter()) + { + /// TODO: implement for prewhere chain. + /// In order to do it we need combine filter and result.filter, where filter filters only '1' in result.filter. + throw Exception("MergeTreeRangeReader chain with several prewhere actions in not implemented.", + ErrorCodes::LOGICAL_ERROR); + } + + if (filter && row_level_filter) + { + row_level_filter = combineFilters(std::move(row_level_filter), filter); + result.setFilter(row_level_filter); + + } + else + result.setFilter(filter); + /// If there is a WHERE, we filter in there, and only optimize IO and shrink columns here if (!last_reader_in_chain) - result.optimize(merge_tree_reader->canReadIncompleteGranules()); + result.optimize(merge_tree_reader->canReadIncompleteGranules(), prewhere_info->row_level_filter == nullptr); /// If we read nothing or filter gets optimized to nothing if (result.totalRowsPerGranule() == 0) @@ -962,7 +996,12 @@ void MergeTreeRangeReader::executePrewhereActionsAndFilterColumns(ReadResult & r { /// filter might be shrunk while columns not const auto * result_filter = result.getFilterOriginal(); - filterColumns(result.columns, result_filter->getData()); + + if (row_level_filter) + filterColumns(result.columns, filter); + else + filterColumns(result.columns, result_filter->getData()); + result.need_filter = true; bool has_column = false; diff --git a/src/Storages/MergeTree/MergeTreeRangeReader.h b/src/Storages/MergeTree/MergeTreeRangeReader.h index 884d2dbafd1..18075e52bdd 100644 --- a/src/Storages/MergeTree/MergeTreeRangeReader.h +++ b/src/Storages/MergeTree/MergeTreeRangeReader.h @@ -153,10 +153,10 @@ public: void addRows(size_t rows) { num_read_rows += rows; } void addRange(const MarkRange & range) { started_ranges.push_back({rows_per_granule.size(), range}); } - /// Apply a filter on top of the existing one (AND'ed) or set it if there isn't any. - void addFilter(const ColumnPtr & new_filter); + /// Set filter or replace old one. Filter must have more zeroes than previous. 
+ void setFilter(const ColumnPtr & new_filter); /// For each granule calculate the number of filtered rows at the end. Remove them and update filter. - void optimize(bool can_read_incomplete_granules); + void optimize(bool can_read_incomplete_granules, bool allow_filter_columns); /// Remove all rows from granules. void clear(); From a8bc2722f43a98bf9bee8cae777d490af62b5f0d Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Sat, 20 Feb 2021 14:04:32 +0300 Subject: [PATCH 0537/2357] Fix getHeaderForProcessingStage --- src/Interpreters/getHeaderForProcessingStage.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/Interpreters/getHeaderForProcessingStage.cpp b/src/Interpreters/getHeaderForProcessingStage.cpp index 3adbab8413f..b56b90cdf3f 100644 --- a/src/Interpreters/getHeaderForProcessingStage.cpp +++ b/src/Interpreters/getHeaderForProcessingStage.cpp @@ -47,7 +47,10 @@ Block getHeaderForProcessingStage( auto & prewhere_info = *query_info.prewhere_info; if (prewhere_info.row_level_filter) + { prewhere_info.row_level_filter->execute(header); + header.erase(prewhere_info.row_level_column_name); + } if (prewhere_info.prewhere_actions) prewhere_info.prewhere_actions->execute(header); From 89dd15a91df89a3975e68ad3f6d4651f517e33ba Mon Sep 17 00:00:00 2001 From: alesapin Date: Sat, 20 Feb 2021 14:04:38 +0300 Subject: [PATCH 0538/2357] Checksum for header and logging, better names --- src/Coordination/Changelog.cpp | 239 +++++++++++++------------- src/Coordination/Changelog.h | 61 ++++--- src/Coordination/NuKeeperLogStore.cpp | 3 +- src/Coordination/NuKeeperLogStore.h | 2 + 4 files changed, 168 insertions(+), 137 deletions(-) diff --git a/src/Coordination/Changelog.cpp b/src/Coordination/Changelog.cpp index 4a3955e23ab..3d3c1ad230d 100644 --- a/src/Coordination/Changelog.cpp +++ b/src/Coordination/Changelog.cpp @@ -20,22 +20,6 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -std::string toString(const ChangelogVersion & version) -{ - if (version == ChangelogVersion::V0) - return "V0"; - - throw Exception(ErrorCodes::UNKNOWN_FORMAT_VERSION, "Unknown chagelog version {}", static_cast(version)); -} - -ChangelogVersion fromString(const std::string & version_str) -{ - if (version_str == "V0") - return ChangelogVersion::V0; - - throw Exception(ErrorCodes::UNKNOWN_FORMAT_VERSION, "Unknown chagelog version {}", version_str); -} - namespace { @@ -44,11 +28,10 @@ constexpr auto DEFAULT_PREFIX = "changelog"; std::string formatChangelogPath(const std::string & prefix, const ChangelogFileDescription & name) { std::filesystem::path path(prefix); - path /= std::filesystem::path(name.prefix + "_" + std::to_string(name.from_log_idx) + "_" + std::to_string(name.to_log_idx) + ".bin"); + path /= std::filesystem::path(name.prefix + "_" + std::to_string(name.from_log_index) + "_" + std::to_string(name.to_log_index) + ".bin"); return path; } - ChangelogFileDescription getChangelogFileDescription(const std::string & path_str) { std::filesystem::path path(path_str); @@ -60,8 +43,8 @@ ChangelogFileDescription getChangelogFileDescription(const std::string & path_st ChangelogFileDescription result; result.prefix = filename_parts[0]; - result.from_log_idx = parse(filename_parts[1]); - result.to_log_idx = parse(filename_parts[2]); + result.from_log_index = parse(filename_parts[1]); + result.to_log_index = parse(filename_parts[2]); result.path = path_str; return result; } @@ -71,6 +54,17 @@ LogEntryPtr makeClone(const LogEntryPtr & entry) return cs_new(entry->get_term(), 
nuraft::buffer::clone(entry->get_buf()), entry->get_val_type()); } +Checksum computeRecordChecksum(const ChangelogRecord & record) +{ + const auto * header_start = reinterpret_cast(&record.header); + auto sum = CityHash_v1_0_2::CityHash128(header_start, sizeof(record.header)); + + if (record.header.blob_size != 0) + sum = CityHash_v1_0_2::CityHash128WithSeed(reinterpret_cast(record.blob->data_begin()), record.header.blob_size, sum); + + return sum; +} + } class ChangelogWriter @@ -86,12 +80,9 @@ public: off_t appendRecord(ChangelogRecord && record, bool sync) { off_t result = plain_buf.count(); - writeIntBinary(record.header.version, plain_buf); - writeIntBinary(record.header.index, plain_buf); - writeIntBinary(record.header.term, plain_buf); - writeIntBinary(record.header.value_type, plain_buf); - writeIntBinary(record.header.blob_size, plain_buf); - writeIntBinary(record.header.blob_checksum, plain_buf); + writeIntBinary(computeRecordChecksum(record), plain_buf); + + writePODBinary(record.header, plain_buf); if (record.header.blob_size != 0) plain_buf.write(reinterpret_cast(record.blob->data_begin()), record.blob->size()); @@ -157,7 +148,7 @@ public: , read_buf(filepath) {} - ChangelogReadResult readChangelog(IndexToLogEntry & logs, size_t start_log_idx, IndexToOffset & index_to_offset) + ChangelogReadResult readChangelog(IndexToLogEntry & logs, size_t start_log_index, IndexToOffset & index_to_offset, Poco::Logger * log) { size_t previous_index = 0; ChangelogReadResult result{}; @@ -166,24 +157,31 @@ public: while (!read_buf.eof()) { result.last_position = read_buf.count(); + Checksum record_checksum; + readIntBinary(record_checksum, read_buf); + ChangelogRecord record; - readIntBinary(record.header.version, read_buf); - readIntBinary(record.header.index, read_buf); - readIntBinary(record.header.term, read_buf); - readIntBinary(record.header.value_type, read_buf); - readIntBinary(record.header.blob_size, read_buf); - readIntBinary(record.header.blob_checksum, read_buf); - auto buffer = nuraft::buffer::alloc(record.header.blob_size); - auto * buffer_begin = reinterpret_cast(buffer->data_begin()); - read_buf.readStrict(buffer_begin, record.header.blob_size); + readPODBinary(record.header, read_buf); + if (record.header.version > CURRENT_CHANGELOG_VERSION) + throw Exception(ErrorCodes::UNKNOWN_FORMAT_VERSION, "Unsupported changelog version {} on path {}", record.header.version, filepath); + + if (record.header.blob_size != 0) + { + auto buffer = nuraft::buffer::alloc(record.header.blob_size); + auto * buffer_begin = reinterpret_cast(buffer->data_begin()); + read_buf.readStrict(buffer_begin, record.header.blob_size); + record.blob = buffer; + } + else + record.blob = nullptr; if (previous_index != 0 && previous_index + 1 != record.header.index) throw Exception(ErrorCodes::CORRUPTED_DATA, "Previous log entry {}, next log entry {}, seems like some entries skipped", previous_index, record.header.index); previous_index = record.header.index; - Checksum checksum = CityHash_v1_0_2::CityHash128(buffer_begin, record.header.blob_size); - if (checksum != record.header.blob_checksum) + Checksum checksum = computeRecordChecksum(record); + if (checksum != record_checksum) { throw Exception(ErrorCodes::CHECKSUM_DOESNT_MATCH, "Checksums doesn't match for log {} (version {}), index {}, blob_size {}", @@ -195,10 +193,10 @@ public: result.entries_read += 1; - if (record.header.index < start_log_idx) + if (record.header.index < start_log_index) continue; - auto log_entry = nuraft::cs_new(record.header.term, 
buffer, record.header.value_type); + auto log_entry = nuraft::cs_new(record.header.term, record.blob, record.header.value_type); logs.emplace(record.header.index, log_entry); index_to_offset[record.header.index] = result.last_position; @@ -206,13 +204,16 @@ public: } catch (const Exception & ex) { + if (ex.code() == ErrorCodes::UNKNOWN_FORMAT_VERSION) + throw ex; + result.error = true; - LOG_WARNING(&Poco::Logger::get("RaftChangelog"), "Cannot completely read changelog on path {}, error: {}", filepath, ex.message()); + LOG_WARNING(log, "Cannot completely read changelog on path {}, error: {}", filepath, ex.message()); } catch (...) { result.error = true; - tryLogCurrentException(&Poco::Logger::get("RaftChangelog")); + tryLogCurrentException(log); } return result; @@ -223,9 +224,10 @@ private: ReadBufferFromFile read_buf; }; -Changelog::Changelog(const std::string & changelogs_dir_, size_t rotate_interval_) +Changelog::Changelog(const std::string & changelogs_dir_, size_t rotate_interval_, Poco::Logger * log_) : changelogs_dir(changelogs_dir_) , rotate_interval(rotate_interval_) + , log(log_) { namespace fs = std::filesystem; if (!fs::exists(changelogs_dir)) @@ -234,96 +236,104 @@ Changelog::Changelog(const std::string & changelogs_dir_, size_t rotate_interval for (const auto & p : fs::directory_iterator(changelogs_dir)) { auto file_description = getChangelogFileDescription(p.path()); - existing_changelogs[file_description.from_log_idx] = file_description; + existing_changelogs[file_description.from_log_index] = file_description; } } -void Changelog::readChangelogAndInitWriter(size_t from_log_idx) +void Changelog::readChangelogAndInitWriter(size_t from_log_index) { - start_index = from_log_idx == 0 ? 1 : from_log_idx; + start_index = from_log_index == 0 ? 1 : from_log_index; size_t total_read = 0; size_t entries_in_last = 0; - size_t incomplete_log_idx = 0; + size_t incomplete_log_index = 0; ChangelogReadResult result{}; - for (const auto & [start_idx, changelog_description] : existing_changelogs) - { - entries_in_last = changelog_description.to_log_idx - changelog_description.from_log_idx + 1; - if (changelog_description.to_log_idx >= from_log_idx) + for (const auto & [start_index, changelog_description] : existing_changelogs) + { + entries_in_last = changelog_description.to_log_index - changelog_description.from_log_index + 1; + + if (changelog_description.to_log_index >= from_log_index) { ChangelogReader reader(changelog_description.path); - result = reader.readChangelog(logs, from_log_idx, index_to_start_pos); + result = reader.readChangelog(logs, from_log_index, index_to_start_pos, log); total_read += result.entries_read; - /// May happen after truncate and crash + /// May happen after truncate, crash or simply unfinished log if (result.entries_read < entries_in_last) { - incomplete_log_idx = start_idx; + incomplete_log_index = start_index; break; } } } - if (incomplete_log_idx != 0) + if (incomplete_log_index != 0) { - for (auto itr = existing_changelogs.upper_bound(incomplete_log_idx); itr != existing_changelogs.end();) + /// All subsequent logs shouldn't exist. But they may exist if we crashed after writeAt started. Remove them. 
+ for (auto itr = existing_changelogs.upper_bound(incomplete_log_index); itr != existing_changelogs.end();) { + LOG_WARNING(log, "Removing changelog {}, beacuse it's goes after broken changelog entry", itr->second.path); std::filesystem::remove(itr->second.path); itr = existing_changelogs.erase(itr); } + + /// Continue to write into existing log + if (!existing_changelogs.empty()) + { + auto description = existing_changelogs.rbegin()->second; + LOG_TRACE(log, "Continue to write into {}", description.path); + current_writer = std::make_unique(description.path, WriteMode::Append, description.from_log_index); + current_writer->setEntriesWritten(result.entries_read); + + /// Truncate all broken entries from log + if (result.error) + { + LOG_WARNING(log, "Read finished with error, truncating all broken log entries"); + current_writer->truncateToLength(result.last_position); + } + } } - if (!existing_changelogs.empty() && result.entries_read < entries_in_last) - { - auto description = existing_changelogs.rbegin()->second; - current_writer = std::make_unique(description.path, WriteMode::Append, description.from_log_idx); - current_writer->setEntriesWritten(result.entries_read); - if (result.error) - current_writer->truncateToLength(result.last_position); - } - else - { + /// Start new log if we don't initialize writer from previous log + if (!current_writer) rotate(start_index + total_read); - } } -void Changelog::rotate(size_t new_start_log_idx) +void Changelog::rotate(size_t new_start_log_index) { + //// doesn't exist on init if (current_writer) current_writer->flush(); ChangelogFileDescription new_description; new_description.prefix = DEFAULT_PREFIX; - new_description.from_log_idx = new_start_log_idx; - new_description.to_log_idx = new_start_log_idx + rotate_interval - 1; + new_description.from_log_index = new_start_log_index; + new_description.to_log_index = new_start_log_index + rotate_interval - 1; new_description.path = formatChangelogPath(changelogs_dir, new_description); - existing_changelogs[new_start_log_idx] = new_description; - current_writer = std::make_unique(new_description.path, WriteMode::Rewrite, new_start_log_idx); + + LOG_TRACE(log, "Starting new changelog {}", new_description.path); + existing_changelogs[new_start_log_index] = new_description; + current_writer = std::make_unique(new_description.path, WriteMode::Rewrite, new_start_log_index); } -ChangelogRecord Changelog::buildRecord(size_t index, nuraft::ptr log_entry) +ChangelogRecord Changelog::buildRecord(size_t index, const LogEntryPtr & log_entry) { ChangelogRecordHeader header; + header.version = ChangelogVersion::V0; header.index = index; header.term = log_entry->get_term(); header.value_type = log_entry->get_val_type(); auto buffer = log_entry->get_buf_ptr(); if (buffer) - { header.blob_size = buffer->size(); - header.blob_checksum = CityHash_v1_0_2::CityHash128(reinterpret_cast(buffer->data_begin()), buffer->size()); - } else - { header.blob_size = 0; - header.blob_checksum = std::make_pair(0, 0); - } return ChangelogRecord{header, buffer}; } -void Changelog::appendEntry(size_t index, nuraft::ptr log_entry, bool force_sync) +void Changelog::appendEntry(size_t index, const LogEntryPtr & log_entry, bool force_sync) { if (!current_writer) throw Exception(ErrorCodes::LOGICAL_ERROR, "Changelog must be initialized before appending records"); @@ -341,13 +351,13 @@ void Changelog::appendEntry(size_t index, nuraft::ptr log_ent logs[index] = makeClone(log_entry); } -void Changelog::writeAt(size_t index, nuraft::ptr log_entry, 
bool force_sync) +void Changelog::writeAt(size_t index, const LogEntryPtr & log_entry, bool force_sync) { if (index_to_start_pos.count(index) == 0) throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot write at index {} because changelog doesn't contain it", index); - bool need_rollback = index < current_writer->getStartIndex(); - if (need_rollback) + bool go_to_previous_file = index < current_writer->getStartIndex(); + if (go_to_previous_file) { auto index_changelog = existing_changelogs.lower_bound(index); ChangelogFileDescription description; @@ -357,14 +367,15 @@ void Changelog::writeAt(size_t index, nuraft::ptr log_entry, description = std::prev(index_changelog)->second; current_writer = std::make_unique(description.path, WriteMode::Append, index_changelog->first); - current_writer->setEntriesWritten(description.to_log_idx - description.from_log_idx + 1); + current_writer->setEntriesWritten(description.to_log_index - description.from_log_index + 1); } auto entries_written = current_writer->getEntriesWritten(); current_writer->truncateToLength(index_to_start_pos[index]); - if (need_rollback) + if (go_to_previous_file) { + /// Remove all subsequent files auto to_remove_itr = existing_changelogs.upper_bound(index); for (auto itr = to_remove_itr; itr != existing_changelogs.end();) { @@ -373,11 +384,14 @@ void Changelog::writeAt(size_t index, nuraft::ptr log_entry, } } - /// Rollback in memory state - for (auto itr = logs.lower_bound(index); itr != logs.end();) + /// Remove redundant logs from memory + for (size_t i = index; ; ++i) { - index_to_start_pos.erase(itr->first); - itr = logs.erase(itr); + auto log_itr = logs.find(i); + if (log_itr == logs.end()) + break; + logs.erase(log_itr); + index_to_start_pos.erase(i); entries_written--; } @@ -386,37 +400,32 @@ void Changelog::writeAt(size_t index, nuraft::ptr log_entry, appendEntry(index, log_entry, force_sync); } -void Changelog::compact(size_t up_to_log_idx) +void Changelog::compact(size_t up_to_log_index) { for (auto itr = existing_changelogs.begin(); itr != existing_changelogs.end();) { - if (itr->second.to_log_idx <= up_to_log_idx) + /// Remove all completely outdated changelog files + if (itr->second.to_log_index <= up_to_log_index) { - for (size_t idx = itr->second.from_log_idx; idx <= itr->second.to_log_idx; ++idx) - { - auto index_pos = index_to_start_pos.find(idx); - if (index_pos == index_to_start_pos.end()) - break; - index_to_start_pos.erase(index_pos); - } + + LOG_INFO(log, "Removing changelog {} because of compaction", itr->second.path); + std::erase_if(index_to_start_pos, [right_index = itr->second.to_log_index] (const auto & item) { return item.first <= right_index; }); std::filesystem::remove(itr->second.path); itr = existing_changelogs.erase(itr); } - else + else /// Files are ordered, so all subsequent should exist break; } - auto start = logs.begin(); - auto end = logs.upper_bound(up_to_log_idx); - logs.erase(start, end); - start_index = up_to_log_idx + 1; + start_index = up_to_log_index + 1; + std::erase_if(logs, [up_to_log_index] (const auto & item) { return item.first <= up_to_log_index; }); } LogEntryPtr Changelog::getLastEntry() const { static LogEntryPtr fake_entry = nuraft::cs_new(0, nuraft::buffer::alloc(sizeof(size_t))); - size_t next_idx = getNextEntryIndex() - 1; - auto entry = logs.find(next_idx); + size_t next_index = getNextEntryIndex() - 1; + auto entry = logs.find(next_index); if (entry == logs.end()) return fake_entry; @@ -437,10 +446,10 @@ LogEntriesPtr Changelog::getLogEntriesBetween(size_t start, 
size_t end) return ret; } -LogEntryPtr Changelog::entryAt(size_t idx) +LogEntryPtr Changelog::entryAt(size_t index) { nuraft::ptr src = nullptr; - auto entry = logs.find(idx); + auto entry = logs.find(index); if (entry == logs.end()) return nullptr; @@ -448,12 +457,12 @@ LogEntryPtr Changelog::entryAt(size_t idx) return makeClone(src); } -nuraft::ptr Changelog::serializeEntriesToBuffer(size_t index, int32_t cnt) +nuraft::ptr Changelog::serializeEntriesToBuffer(size_t index, int32_t count) { std::vector> returned_logs; size_t size_total = 0; - for (size_t i = index; i < index + cnt; ++i) + for (size_t i = index; i < index + count; ++i) { auto entry = logs.find(i); if (entry == logs.end()) @@ -464,9 +473,9 @@ nuraft::ptr Changelog::serializeEntriesToBuffer(size_t index, in returned_logs.push_back(buf); } - nuraft::ptr buf_out = nuraft::buffer::alloc(sizeof(int32_t) + cnt * sizeof(int32_t) + size_total); + nuraft::ptr buf_out = nuraft::buffer::alloc(sizeof(int32_t) + count * sizeof(int32_t) + size_total); buf_out->pos(0); - buf_out->put(static_cast(cnt)); + buf_out->put(static_cast(count)); for (auto & entry : returned_logs) { @@ -484,17 +493,17 @@ void Changelog::applyEntriesFromBuffer(size_t index, nuraft::buffer & buffer, bo for (int i = 0; i < num_logs; ++i) { - size_t cur_idx = index + i; + size_t cur_index = index + i; int buf_size = buffer.get_int(); nuraft::ptr buf_local = nuraft::buffer::alloc(buf_size); buffer.get(buf_local); LogEntryPtr log_entry = nuraft::log_entry::deserialize(*buf_local); - if (i == 0 && logs.count(cur_idx)) - writeAt(cur_idx, log_entry, force_sync); + if (i == 0 && logs.count(cur_index)) + writeAt(cur_index, log_entry, force_sync); else - appendEntry(cur_idx, log_entry, force_sync); + appendEntry(cur_index, log_entry, force_sync); } } diff --git a/src/Coordination/Changelog.h b/src/Coordination/Changelog.h index 779d057d285..38679d604de 100644 --- a/src/Coordination/Changelog.h +++ b/src/Coordination/Changelog.h @@ -15,60 +15,68 @@ using Checksum = CityHash_v1_0_2::uint128; using LogEntryPtr = nuraft::ptr; using LogEntries = std::vector; using LogEntriesPtr = nuraft::ptr; +using BufferPtr = nuraft::ptr; using IndexToOffset = std::unordered_map; -using IndexToLogEntry = std::map; +using IndexToLogEntry = std::unordered_map; enum class ChangelogVersion : uint8_t { V0 = 0, }; -std::string toString(const ChangelogVersion & version); -ChangelogVersion fromString(const std::string & version_str); - static constexpr auto CURRENT_CHANGELOG_VERSION = ChangelogVersion::V0; -struct ChangelogRecordHeader +struct __attribute__((__packed__)) ChangelogRecordHeader { ChangelogVersion version = CURRENT_CHANGELOG_VERSION; - size_t index; + size_t index; /// entry log number size_t term; nuraft::log_val_type value_type; size_t blob_size; - Checksum blob_checksum; }; +/// Changelog record on disk struct ChangelogRecord { ChangelogRecordHeader header; nuraft::ptr blob; }; +/// changelog_fromindex_toindex.bin +/// [fromindex, toindex] <- inclusive struct ChangelogFileDescription { std::string prefix; - size_t from_log_idx; - size_t to_log_idx; + size_t from_log_index; + size_t to_log_index; std::string path; }; class ChangelogWriter; +/// Simpliest changelog with files rotation. +/// No compression, no metadata, just entries with headers one by one +/// Able to read broken files/entries and discard them. 
class Changelog { public: - Changelog(const std::string & changelogs_dir_, size_t rotate_interval_); + Changelog(const std::string & changelogs_dir_, size_t rotate_interval_, Poco::Logger * log_); - void readChangelogAndInitWriter(size_t from_log_idx); + /// Read changelog from files on changelogs_dir_ skipping all entries before from_log_index + /// Truncate broken entries, remove files after broken entries. + void readChangelogAndInitWriter(size_t from_log_index); - void appendEntry(size_t index, LogEntryPtr log_entry, bool force_sync); + /// Add entry to log with index. Call fsync if force_sync true. + void appendEntry(size_t index, const LogEntryPtr & log_entry, bool force_sync); - void writeAt(size_t index, LogEntryPtr log_entry, bool force_sync); + /// Write entry at index and truncate all subsequent entries. + void writeAt(size_t index, const LogEntryPtr & log_entry, bool force_sync); - void compact(size_t up_to_log_idx); + /// Remove log files with to_log_index <= up_to_log_index. + void compact(size_t up_to_log_index); size_t getNextEntryIndex() const { @@ -80,16 +88,22 @@ public: return start_index; } + /// Last entry in log, or fake entry with term 0 if log is empty LogEntryPtr getLastEntry() const; - LogEntriesPtr getLogEntriesBetween(size_t start_index, size_t end_idx); + /// Return log entries between [start, end) + LogEntriesPtr getLogEntriesBetween(size_t start_index, size_t end_index); - LogEntryPtr entryAt(size_t idx); + /// Return entry at position index + LogEntryPtr entryAt(size_t index); - nuraft::ptr serializeEntriesToBuffer(size_t index, int32_t cnt); + /// Serialize entries from index into buffer + BufferPtr serializeEntriesToBuffer(size_t index, int32_t count); + /// Apply entries from buffer overriding existing entries void applyEntriesFromBuffer(size_t index, nuraft::buffer & buffer, bool force_sync); + /// Fsync log to disk void flush(); size_t size() const @@ -97,20 +111,25 @@ public: return logs.size(); } + /// Fsync log to disk ~Changelog(); private: - void rotate(size_t new_start_log_idx); + /// Starts new file [new_start_log_index, new_start_log_index + rotate_interval] + void rotate(size_t new_start_log_index); - static ChangelogRecord buildRecord(size_t index, nuraft::ptr log_entry); + /// Pack log_entry into changelog record + static ChangelogRecord buildRecord(size_t index, const LogEntryPtr & log_entry); private: - std::string changelogs_dir; + const std::string changelogs_dir; + const size_t rotate_interval; + Poco::Logger * log; + std::map existing_changelogs; std::unique_ptr current_writer; IndexToOffset index_to_start_pos; - const size_t rotate_interval; IndexToLogEntry logs; size_t start_index = 0; }; diff --git a/src/Coordination/NuKeeperLogStore.cpp b/src/Coordination/NuKeeperLogStore.cpp index 8834bdc4d69..6aba078bb80 100644 --- a/src/Coordination/NuKeeperLogStore.cpp +++ b/src/Coordination/NuKeeperLogStore.cpp @@ -4,7 +4,8 @@ namespace DB { NuKeeperLogStore::NuKeeperLogStore(const std::string & changelogs_path, size_t rotate_interval_, bool force_sync_) - : changelog(changelogs_path, rotate_interval_) + : log(&Poco::Logger::get("NuKeeperLogStore")) + , changelog(changelogs_path, rotate_interval_, log) , force_sync(force_sync_) { } diff --git a/src/Coordination/NuKeeperLogStore.h b/src/Coordination/NuKeeperLogStore.h index 0ff92220316..a94b662fda4 100644 --- a/src/Coordination/NuKeeperLogStore.h +++ b/src/Coordination/NuKeeperLogStore.h @@ -4,6 +4,7 @@ #include #include #include +#include namespace DB { @@ -43,6 +44,7 @@ public: private: 
mutable std::mutex changelog_lock; + Poco::Logger * log; Changelog changelog; bool force_sync; }; From e7f792c94d2835676f82fd7942f6f8a591fe7e4d Mon Sep 17 00:00:00 2001 From: alesapin Date: Sat, 20 Feb 2021 14:28:39 +0300 Subject: [PATCH 0539/2357] Fix typos --- src/Coordination/Changelog.cpp | 2 +- src/Coordination/Changelog.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Coordination/Changelog.cpp b/src/Coordination/Changelog.cpp index 3d3c1ad230d..efb0f2798e2 100644 --- a/src/Coordination/Changelog.cpp +++ b/src/Coordination/Changelog.cpp @@ -272,7 +272,7 @@ void Changelog::readChangelogAndInitWriter(size_t from_log_index) /// All subsequent logs shouldn't exist. But they may exist if we crashed after writeAt started. Remove them. for (auto itr = existing_changelogs.upper_bound(incomplete_log_index); itr != existing_changelogs.end();) { - LOG_WARNING(log, "Removing changelog {}, beacuse it's goes after broken changelog entry", itr->second.path); + LOG_WARNING(log, "Removing changelog {}, because it's goes after broken changelog entry", itr->second.path); std::filesystem::remove(itr->second.path); itr = existing_changelogs.erase(itr); } diff --git a/src/Coordination/Changelog.h b/src/Coordination/Changelog.h index 38679d604de..f758edc27ed 100644 --- a/src/Coordination/Changelog.h +++ b/src/Coordination/Changelog.h @@ -56,7 +56,7 @@ struct ChangelogFileDescription class ChangelogWriter; -/// Simpliest changelog with files rotation. +/// Simplest changelog with files rotation. /// No compression, no metadata, just entries with headers one by one /// Able to read broken files/entries and discard them. class Changelog From 863c0992540c68b781b393a35d8c8f47dddbdd20 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Sat, 20 Feb 2021 15:56:28 +0300 Subject: [PATCH 0540/2357] fix --- docker/test/fasttest/run.sh | 2 +- src/Databases/DatabaseReplicatedWorker.h | 2 +- ...ference => 01541_max_memory_usage_for_user_long.reference} | 0 ...ge_for_user.sh => 01541_max_memory_usage_for_user_long.sh} | 0 tests/queries/skip_list.json | 4 ++-- 5 files changed, 4 insertions(+), 4 deletions(-) rename tests/queries/0_stateless/{01541_max_memory_usage_for_user.reference => 01541_max_memory_usage_for_user_long.reference} (100%) rename tests/queries/0_stateless/{01541_max_memory_usage_for_user.sh => 01541_max_memory_usage_for_user_long.sh} (100%) diff --git a/docker/test/fasttest/run.sh b/docker/test/fasttest/run.sh index 1c5f62a9e46..c9c8cb1382d 100755 --- a/docker/test/fasttest/run.sh +++ b/docker/test/fasttest/run.sh @@ -326,7 +326,7 @@ function run_tests # Look at DistributedFilesToInsert, so cannot run in parallel. 01460_DistributedFilesToInsert - 01541_max_memory_usage_for_user + 01541_max_memory_usage_for_user_long # Require python libraries like scipy, pandas and numpy 01322_ttest_scipy diff --git a/src/Databases/DatabaseReplicatedWorker.h b/src/Databases/DatabaseReplicatedWorker.h index 6dd8dc408d7..6ba46a98bca 100644 --- a/src/Databases/DatabaseReplicatedWorker.h +++ b/src/Databases/DatabaseReplicatedWorker.h @@ -14,7 +14,7 @@ class DatabaseReplicated; /// 3. After creation of an entry in DDL queue initiator tries to execute the entry locally /// and other hosts wait for query to finish on initiator host. /// If query succeed on initiator, then all hosts must execute it, so they will retry until query succeed. 
-/// We assume that cluster is homogenous, so if replicas are in consistent state and query succeed on one host, +/// We assume that cluster is homogeneous, so if replicas are in consistent state and query succeed on one host, /// then all hosts can execute it (maybe after several retries). /// 4. Each database replica stores its log pointer in ZooKeeper. Cleanup thread removes old entry /// if its number < max_log_ptr - logs_to_keep. diff --git a/tests/queries/0_stateless/01541_max_memory_usage_for_user.reference b/tests/queries/0_stateless/01541_max_memory_usage_for_user_long.reference similarity index 100% rename from tests/queries/0_stateless/01541_max_memory_usage_for_user.reference rename to tests/queries/0_stateless/01541_max_memory_usage_for_user_long.reference diff --git a/tests/queries/0_stateless/01541_max_memory_usage_for_user.sh b/tests/queries/0_stateless/01541_max_memory_usage_for_user_long.sh similarity index 100% rename from tests/queries/0_stateless/01541_max_memory_usage_for_user.sh rename to tests/queries/0_stateless/01541_max_memory_usage_for_user_long.sh diff --git a/tests/queries/skip_list.json b/tests/queries/skip_list.json index e6bb3747fb0..77c4d487082 100644 --- a/tests/queries/skip_list.json +++ b/tests/queries/skip_list.json @@ -440,7 +440,7 @@ "01530_drop_database_atomic_sync", "01532_execute_merges_on_single_replica", "01532_primary_key_without_order_by_zookeeper", - "01541_max_memory_usage_for_user", + "01541_max_memory_usage_for_user_long", "01551_mergetree_read_in_order_spread", "01552_dict_fixedstring", "01554_bloom_filter_index_big_integer_uuid", @@ -717,7 +717,7 @@ "01527_clickhouse_local_optimize", "01527_dist_sharding_key_dictGet_reload", "01530_drop_database_atomic_sync", - "01541_max_memory_usage_for_user", + "01541_max_memory_usage_for_user_long", "01542_dictionary_load_exception_race", "01575_disable_detach_table_of_dictionary", "01593_concurrent_alter_mutations_kill", From d723f25fbd6474675d4e846c8a187418a540153a Mon Sep 17 00:00:00 2001 From: Evgeniia Sudarikova Date: Sat, 20 Feb 2021 16:51:31 +0300 Subject: [PATCH 0541/2357] delete extra text --- docs/en/sql-reference/functions/array-functions.md | 1 - 1 file changed, 1 deletion(-) diff --git a/docs/en/sql-reference/functions/array-functions.md b/docs/en/sql-reference/functions/array-functions.md index 531c5e5be49..c9c418d57a4 100644 --- a/docs/en/sql-reference/functions/array-functions.md +++ b/docs/en/sql-reference/functions/array-functions.md @@ -1378,7 +1378,6 @@ SELECT arrayMax([1, 2, 4]) AS res; Result: -``` text ```text ┌─res─┐ │ 4 │ From 8a876b9510da3fcd23dd3f3efa308d7cb52a6410 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Sat, 20 Feb 2021 17:19:11 +0300 Subject: [PATCH 0542/2357] Fix aliases for row level actions. 
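As far as the hunks below show, the intent is that required columns are now collected from both prewhere_actions and row_level_filter_actions rather than from only one of them. A minimal sketch of the resulting collection, restating the diff under the assumption that prewhere_info carries both DAGs:

    NameSet required_columns_from_prewhere;
    for (const auto & name : prewhere_info->prewhere_actions->getRequiredColumns().getNames())
        required_columns_from_prewhere.insert(name);
    if (prewhere_info->row_level_filter_actions)
        for (const auto & name : prewhere_info->row_level_filter_actions->getRequiredColumns().getNames())
            required_columns_from_prewhere.insert(name);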
--- src/Interpreters/InterpreterSelectQuery.cpp | 55 +++++++++++---------- 1 file changed, 29 insertions(+), 26 deletions(-) diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index 54481cbe873..835c0c0e50f 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -1410,30 +1410,30 @@ void InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum proc if (storage) { /// Append columns from the table filter to required - if (row_policy_filter) - { - ActionsDAG * row_policy_dag = nullptr; - if (expressions.filter_info) - row_policy_dag = expressions.filter_info->actions.get(); - else if (expressions.prewhere_info) - { - if (expressions.prewhere_info->row_level_filter_actions) - row_policy_dag = expressions.prewhere_info->row_level_filter_actions.get(); - else if (expressions.prewhere_info->prewhere_actions) - row_policy_dag = expressions.prewhere_info->prewhere_actions.get(); - } + // if (row_policy_filter) + // { + // ActionsDAG * row_policy_dag = nullptr; + // if (expressions.filter_info) + // row_policy_dag = expressions.filter_info->actions.get(); + // else if (expressions.prewhere_info) + // { + // if (expressions.prewhere_info->row_level_filter_actions) + // row_policy_dag = expressions.prewhere_info->row_level_filter_actions.get(); + // else if (expressions.prewhere_info->prewhere_actions) + // row_policy_dag = expressions.prewhere_info->prewhere_actions.get(); + // } - if (row_policy_dag) - { - auto required_columns_from_filter = row_policy_dag->getRequiredColumns(); + // if (row_policy_dag) + // { + // auto required_columns_from_filter = row_policy_dag->getRequiredColumns(); - for (const auto & column : required_columns_from_filter) - { - if (required_columns.end() == std::find(required_columns.begin(), required_columns.end(), column.name)) - required_columns.push_back(column.name); - } - } - } + // for (const auto & column : required_columns_from_filter) + // { + // if (required_columns.end() == std::find(required_columns.begin(), required_columns.end(), column.name)) + // required_columns.push_back(column.name); + // } + // } + // } /// Detect, if ALIAS columns are required for query execution auto alias_columns_required = false; @@ -1463,11 +1463,14 @@ void InterpreterSelectQuery::executeFetchColumns(QueryProcessingStage::Enum proc if (prewhere_info) { /// Get some columns directly from PREWHERE expression actions - auto prewhere_required_columns = ( - prewhere_info->row_level_filter_actions ? 
- prewhere_info->row_level_filter_actions : - prewhere_info->prewhere_actions)->getRequiredColumns().getNames(); + auto prewhere_required_columns = prewhere_info->prewhere_actions->getRequiredColumns().getNames(); required_columns_from_prewhere.insert(prewhere_required_columns.begin(), prewhere_required_columns.end()); + + if (prewhere_info->row_level_filter_actions) + { + auto row_level_required_columns = prewhere_info->row_level_filter_actions->getRequiredColumns().getNames(); + required_columns_from_prewhere.insert(row_level_required_columns.begin(), row_level_required_columns.end()); + } } /// Expression, that contains all raw required columns From 6cc2fb5e9f32517ebd29104c56f3ee07517f462d Mon Sep 17 00:00:00 2001 From: vdimir Date: Sat, 20 Feb 2021 18:00:59 +0300 Subject: [PATCH 0543/2357] Try to fix race in storage join: block parralel inserts --- src/Functions/FunctionJoinGet.cpp | 8 ++++---- src/Functions/FunctionJoinGet.h | 15 +++++++-------- src/Interpreters/HashJoin.cpp | 11 +---------- src/Interpreters/HashJoin.h | 9 ++++++++- src/Storages/StorageJoin.cpp | 10 +++++++--- src/Storages/StorageJoin.h | 18 ++++++++++++++---- 6 files changed, 41 insertions(+), 30 deletions(-) diff --git a/src/Functions/FunctionJoinGet.cpp b/src/Functions/FunctionJoinGet.cpp index 6b15bf821b2..3a2649c11a8 100644 --- a/src/Functions/FunctionJoinGet.cpp +++ b/src/Functions/FunctionJoinGet.cpp @@ -25,7 +25,7 @@ ColumnPtr ExecutableFunctionJoinGet::execute(const ColumnsWithTypeAndNa auto key = arguments[i]; keys.emplace_back(std::move(key)); } - return join->joinGet(keys, result_columns).column; + return join->join->joinGet(keys, result_columns).column; } template @@ -87,13 +87,13 @@ FunctionBaseImplPtr JoinGetOverloadResolver::build(const ColumnsWithTyp + ", should be greater or equal to 3", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); auto [storage_join, attr_name] = getJoin(arguments, context); - auto join = storage_join->getJoin(); + auto join_holder = storage_join->getJoin(); DataTypes data_types(arguments.size() - 2); for (size_t i = 2; i < arguments.size(); ++i) data_types[i - 2] = arguments[i].type; - auto return_type = join->joinGetCheckAndGetReturnType(data_types, attr_name, or_null); + auto return_type = join_holder->join->joinGetCheckAndGetReturnType(data_types, attr_name, or_null); auto table_lock = storage_join->lockForShare(context.getInitialQueryId(), context.getSettingsRef().lock_acquire_timeout); - return std::make_unique>(table_lock, storage_join, join, attr_name, data_types, return_type); + return std::make_unique>(table_lock, join_holder, attr_name, data_types, return_type); } void registerFunctionJoinGet(FunctionFactory & factory) diff --git a/src/Functions/FunctionJoinGet.h b/src/Functions/FunctionJoinGet.h index 27f348e9698..820c6cd3fa2 100644 --- a/src/Functions/FunctionJoinGet.h +++ b/src/Functions/FunctionJoinGet.h @@ -9,13 +9,14 @@ namespace DB class Context; class HashJoin; +class HashJoinHolder; using HashJoinPtr = std::shared_ptr; template class ExecutableFunctionJoinGet final : public IExecutableFunctionImpl { public: - ExecutableFunctionJoinGet(HashJoinPtr join_, const DB::Block & result_columns_) + ExecutableFunctionJoinGet(std::shared_ptr join_, const DB::Block & result_columns_) : join(std::move(join_)), result_columns(result_columns_) {} static constexpr auto name = or_null ? 
"joinGetOrNull" : "joinGet"; @@ -29,7 +30,7 @@ public: String getName() const override { return name; } private: - HashJoinPtr join; + std::shared_ptr join; DB::Block result_columns; }; @@ -39,12 +40,11 @@ class FunctionJoinGet final : public IFunctionBaseImpl public: static constexpr auto name = or_null ? "joinGetOrNull" : "joinGet"; - FunctionJoinGet(TableLockHolder table_lock_, StoragePtr storage_join_, - HashJoinPtr join_, String attr_name_, + FunctionJoinGet(TableLockHolder table_lock_, + std::shared_ptr join_, String attr_name_, DataTypes argument_types_, DataTypePtr return_type_) : table_lock(std::move(table_lock_)) - , storage_join(std::move(storage_join_)) - , join(std::move(join_)) + , join(join_) , attr_name(std::move(attr_name_)) , argument_types(std::move(argument_types_)) , return_type(std::move(return_type_)) @@ -60,8 +60,7 @@ public: private: TableLockHolder table_lock; - StoragePtr storage_join; - HashJoinPtr join; + std::shared_ptr join; const String attr_name; DataTypes argument_types; DataTypePtr return_type; diff --git a/src/Interpreters/HashJoin.cpp b/src/Interpreters/HashJoin.cpp index 5c50b53e2ca..cd158241860 100644 --- a/src/Interpreters/HashJoin.cpp +++ b/src/Interpreters/HashJoin.cpp @@ -423,19 +423,16 @@ bool HashJoin::empty() const size_t HashJoin::getTotalByteCount() const { - std::shared_lock lock(data->rwlock); return getTotalByteCountLocked(); } size_t HashJoin::getTotalRowCount() const { - std::shared_lock lock(data->rwlock); return getTotalRowCountLocked(); } bool HashJoin::alwaysReturnsEmptySet() const { - std::shared_lock lock(data->rwlock); return isInnerOrRight(getKind()) && data->empty && !overDictionary(); } @@ -652,7 +649,7 @@ bool HashJoin::addJoinedBlock(const Block & source_block, bool check_limits) size_t total_bytes = 0; { - std::unique_lock lock(data->rwlock); + assert(storage_join_lock.mutex() == nullptr); data->blocks.emplace_back(std::move(structured_block)); Block * stored_block = &data->blocks.back(); @@ -1219,8 +1216,6 @@ void HashJoin::joinBlockImplCross(Block & block, ExtraBlockPtr & not_processed) DataTypePtr HashJoin::joinGetCheckAndGetReturnType(const DataTypes & data_types, const String & column_name, bool or_null) const { - std::shared_lock lock(data->rwlock); - size_t num_keys = data_types.size(); if (right_table_keys.columns() != num_keys) throw Exception( @@ -1273,8 +1268,6 @@ ColumnWithTypeAndName HashJoin::joinGetImpl(const Block & block, const Block & b // TODO: return array of values when strictness == ASTTableJoin::Strictness::All ColumnWithTypeAndName HashJoin::joinGet(const Block & block, const Block & block_with_columns_to_add) const { - std::shared_lock lock(data->rwlock); - if ((strictness == ASTTableJoin::Strictness::Any || strictness == ASTTableJoin::Strictness::RightAny) && kind == ASTTableJoin::Kind::Left) { @@ -1287,8 +1280,6 @@ ColumnWithTypeAndName HashJoin::joinGet(const Block & block, const Block & block void HashJoin::joinBlock(Block & block, ExtraBlockPtr & not_processed) { - std::shared_lock lock(data->rwlock); - const Names & key_names_left = table_join->keyNamesLeft(); JoinCommon::checkTypesOfKeys(block, key_names_left, right_table_keys, key_names_right); diff --git a/src/Interpreters/HashJoin.h b/src/Interpreters/HashJoin.h index 06ce7559f31..06e07dc10dd 100644 --- a/src/Interpreters/HashJoin.h +++ b/src/Interpreters/HashJoin.h @@ -308,7 +308,7 @@ public: { /// Protect state for concurrent use in insertFromBlock and joinBlock. 
/// @note that these methods could be called simultaneously only while use of StorageJoin. - mutable std::shared_mutex rwlock; +// mutable std::shared_mutex rwlock; Type type = Type::EMPTY; bool empty = true; @@ -322,6 +322,11 @@ public: Arena pool; }; + void setLock(std::shared_mutex & rwlock) + { + storage_join_lock = std::shared_lock(rwlock); + } + void reuseJoinedData(const HashJoin & join); std::shared_ptr getJoinedData() const @@ -371,6 +376,8 @@ private: Block totals; + std::shared_lock storage_join_lock; + void init(Type type_); const Block & savedBlockSample() const { return data->sample_block; } diff --git a/src/Storages/StorageJoin.cpp b/src/Storages/StorageJoin.cpp index 8d4f0b3b3be..f130316566f 100644 --- a/src/Storages/StorageJoin.cpp +++ b/src/Storages/StorageJoin.cpp @@ -97,11 +97,17 @@ HashJoinPtr StorageJoin::getJoin(std::shared_ptr analyzed_join) const HashJoinPtr join_clone = std::make_shared(analyzed_join, metadata_snapshot->getSampleBlock().sortColumns()); join_clone->reuseJoinedData(*join); + join_clone->setLock(rwlock); + return join_clone; } -void StorageJoin::insertBlock(const Block & block) { join->addJoinedBlock(block, true); } +void StorageJoin::insertBlock(const Block & block) +{ + std::unique_lock lock(rwlock); + join->addJoinedBlock(block, true); +} size_t StorageJoin::getSize() const { return join->getTotalRowCount(); } std::optional StorageJoin::totalRows(const Settings &) const { return join->getTotalRowCount(); } @@ -267,7 +273,6 @@ public: JoinSource(const HashJoin & parent_, UInt64 max_block_size_, Block sample_block_) : SourceWithProgress(sample_block_) , parent(parent_) - , lock(parent.data->rwlock) , max_block_size(max_block_size_) , sample_block(std::move(sample_block_)) { @@ -312,7 +317,6 @@ protected: private: const HashJoin & parent; - std::shared_lock lock; UInt64 max_block_size; Block sample_block; Block restored_block; /// sample_block with parent column types diff --git a/src/Storages/StorageJoin.h b/src/Storages/StorageJoin.h index c453c036b65..6d3ec2710c9 100644 --- a/src/Storages/StorageJoin.h +++ b/src/Storages/StorageJoin.h @@ -14,6 +14,18 @@ class TableJoin; class HashJoin; using HashJoinPtr = std::shared_ptr; +class HashJoinHolder +{ + std::shared_lock lock; +public: + HashJoinPtr join; + + HashJoinHolder(std::shared_mutex & rwlock, HashJoinPtr join_) + : lock(rwlock) + , join(join_) + { + } +}; /** Allows you save the state for later use on the right side of the JOIN. * When inserted into a table, the data will be inserted into the state, @@ -31,12 +43,9 @@ public: void truncate(const ASTPtr &, const StorageMetadataPtr & metadata_snapshot, const Context &, TableExclusiveLockHolder &) override; /// Access the innards. - HashJoinPtr & getJoin() { return join; } + std::shared_ptr getJoin() { return std::make_shared(rwlock, join); } HashJoinPtr getJoin(std::shared_ptr analyzed_join) const; - /// Verify that the data structure is suitable for implementing this type of JOIN. 
- void assertCompatible(ASTTableJoin::Kind kind_, ASTTableJoin::Strictness strictness_) const; - Pipe read( const Names & column_names, const StorageMetadataPtr & /*metadata_snapshot*/, @@ -60,6 +69,7 @@ private: std::shared_ptr table_join; HashJoinPtr join; + mutable std::shared_mutex rwlock; void insertBlock(const Block & block) override; void finishInsert() override {} From 0c2cf3cf30b707fdf46c88760c931c194a086d2d Mon Sep 17 00:00:00 2001 From: alesapin Date: Sat, 20 Feb 2021 18:36:56 +0300 Subject: [PATCH 0544/2357] Calculate checksum with siphash --- src/Coordination/Changelog.cpp | 51 ++++++++++++++++++++++------------ src/Coordination/Changelog.h | 9 +++--- 2 files changed, 38 insertions(+), 22 deletions(-) diff --git a/src/Coordination/Changelog.cpp b/src/Coordination/Changelog.cpp index efb0f2798e2..adf367c565d 100644 --- a/src/Coordination/Changelog.cpp +++ b/src/Coordination/Changelog.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #include namespace DB @@ -56,13 +57,15 @@ LogEntryPtr makeClone(const LogEntryPtr & entry) Checksum computeRecordChecksum(const ChangelogRecord & record) { - const auto * header_start = reinterpret_cast(&record.header); - auto sum = CityHash_v1_0_2::CityHash128(header_start, sizeof(record.header)); - + SipHash hash; + hash.update(record.header.version); + hash.update(record.header.index); + hash.update(record.header.term); + hash.update(record.header.value_type); + hash.update(record.header.blob_size); if (record.header.blob_size != 0) - sum = CityHash_v1_0_2::CityHash128WithSeed(reinterpret_cast(record.blob->data_begin()), record.header.blob_size, sum); - - return sum; + hash.update(reinterpret_cast(record.blob->data_begin()), record.blob->size()); + return hash.get64(); } } @@ -82,7 +85,11 @@ public: off_t result = plain_buf.count(); writeIntBinary(computeRecordChecksum(record), plain_buf); - writePODBinary(record.header, plain_buf); + writeIntBinary(record.header.version, plain_buf); + writeIntBinary(record.header.index, plain_buf); + writeIntBinary(record.header.term, plain_buf); + writeIntBinary(record.header.value_type, plain_buf); + writeIntBinary(record.header.blob_size, plain_buf); if (record.header.blob_size != 0) plain_buf.write(reinterpret_cast(record.blob->data_begin()), record.blob->size()); @@ -160,8 +167,14 @@ public: Checksum record_checksum; readIntBinary(record_checksum, read_buf); + /// Initialization is required, otherwise checksums may fail ChangelogRecord record; - readPODBinary(record.header, read_buf); + readIntBinary(record.header.version, read_buf); + readIntBinary(record.header.index, read_buf); + readIntBinary(record.header.term, read_buf); + readIntBinary(record.header.value_type, read_buf); + readIntBinary(record.header.blob_size, read_buf); + if (record.header.version > CURRENT_CHANGELOG_VERSION) throw Exception(ErrorCodes::UNKNOWN_FORMAT_VERSION, "Unsupported changelog version {} on path {}", record.header.version, filepath); @@ -248,7 +261,7 @@ void Changelog::readChangelogAndInitWriter(size_t from_log_index) size_t incomplete_log_index = 0; ChangelogReadResult result{}; - for (const auto & [start_index, changelog_description] : existing_changelogs) + for (const auto & [changelog_start_index, changelog_description] : existing_changelogs) { entries_in_last = changelog_description.to_log_index - changelog_description.from_log_index + 1; @@ -261,7 +274,7 @@ void Changelog::readChangelogAndInitWriter(size_t from_log_index) /// May happen after truncate, crash or simply unfinished log if (result.entries_read < 
entries_in_last) { - incomplete_log_index = start_index; + incomplete_log_index = changelog_start_index; break; } } @@ -319,18 +332,20 @@ void Changelog::rotate(size_t new_start_log_index) ChangelogRecord Changelog::buildRecord(size_t index, const LogEntryPtr & log_entry) { - ChangelogRecordHeader header; - header.version = ChangelogVersion::V0; - header.index = index; - header.term = log_entry->get_term(); - header.value_type = log_entry->get_val_type(); + ChangelogRecord record; + record.header.version = ChangelogVersion::V0; + record.header.index = index; + record.header.term = log_entry->get_term(); + record.header.value_type = log_entry->get_val_type(); auto buffer = log_entry->get_buf_ptr(); if (buffer) - header.blob_size = buffer->size(); + record.header.blob_size = buffer->size(); else - header.blob_size = 0; + record.header.blob_size = 0; - return ChangelogRecord{header, buffer}; + record.blob = buffer; + + return record; } void Changelog::appendEntry(size_t index, const LogEntryPtr & log_entry, bool force_sync) diff --git a/src/Coordination/Changelog.h b/src/Coordination/Changelog.h index f758edc27ed..0f67c2a9a7d 100644 --- a/src/Coordination/Changelog.h +++ b/src/Coordination/Changelog.h @@ -10,7 +10,7 @@ namespace DB { -using Checksum = CityHash_v1_0_2::uint128; +using Checksum = UInt64; using LogEntryPtr = nuraft::ptr; using LogEntries = std::vector; @@ -27,7 +27,7 @@ enum class ChangelogVersion : uint8_t static constexpr auto CURRENT_CHANGELOG_VERSION = ChangelogVersion::V0; -struct __attribute__((__packed__)) ChangelogRecordHeader +struct ChangelogRecordHeader { ChangelogVersion version = CURRENT_CHANGELOG_VERSION; size_t index; /// entry log number @@ -115,12 +115,13 @@ public: ~Changelog(); private: + /// Pack log_entry into changelog record + static ChangelogRecord buildRecord(size_t index, const LogEntryPtr & log_entry); /// Starts new file [new_start_log_index, new_start_log_index + rotate_interval] void rotate(size_t new_start_log_index); - /// Pack log_entry into changelog record - static ChangelogRecord buildRecord(size_t index, const LogEntryPtr & log_entry); + private: const std::string changelogs_dir; From 9f520f42c117e33ad107f9ea33465e11e2cf26e3 Mon Sep 17 00:00:00 2001 From: alesapin Date: Sat, 20 Feb 2021 18:56:55 +0300 Subject: [PATCH 0545/2357] Fix style --- src/Coordination/Changelog.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/Coordination/Changelog.h b/src/Coordination/Changelog.h index 0f67c2a9a7d..be38915066d 100644 --- a/src/Coordination/Changelog.h +++ b/src/Coordination/Changelog.h @@ -121,8 +121,6 @@ private: /// Starts new file [new_start_log_index, new_start_log_index + rotate_interval] void rotate(size_t new_start_log_index); - - private: const std::string changelogs_dir; const size_t rotate_interval; From 48e188681c88b88c11924f98976993d500fbb1d4 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Sat, 20 Feb 2021 16:05:33 +0300 Subject: [PATCH 0546/2357] do not start mutation for alters with wrong type conversion --- src/Storages/MergeTree/MergeTreeData.cpp | 23 ++++++++++++++----- .../01732_alters_bad_conversions.reference | 4 ++++ .../01732_alters_bad_conversions.sql | 17 ++++++++++++++ 3 files changed, 38 insertions(+), 6 deletions(-) create mode 100644 tests/queries/0_stateless/01732_alters_bad_conversions.reference create mode 100644 tests/queries/0_stateless/01732_alters_bad_conversions.sql diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index a0d23b8ab22..b09f068f509 100644 --- 
a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -1482,6 +1483,7 @@ void MergeTreeData::checkAlterIsPossible(const AlterCommands & commands, const S for (const auto & column : old_metadata.getColumns().getAllPhysical()) old_types.emplace(column.name, column.type.get()); + NamesAndTypesList columns_to_check_conversion; for (const AlterCommand & command : commands) { /// Just validate partition expression @@ -1571,9 +1573,9 @@ void MergeTreeData::checkAlterIsPossible(const AlterCommands & commands, const S throw Exception("ALTER of key column " + backQuoteIfNeed(command.column_name) + " is forbidden", ErrorCodes::ALTER_OF_COLUMN_IS_FORBIDDEN); - if (columns_alter_type_check_safe_for_partition.count(command.column_name)) + if (command.type == AlterCommand::MODIFY_COLUMN) { - if (command.type == AlterCommand::MODIFY_COLUMN) + if (columns_alter_type_check_safe_for_partition.count(command.column_name)) { auto it = old_types.find(command.column_name); @@ -1584,11 +1586,8 @@ void MergeTreeData::checkAlterIsPossible(const AlterCommands & commands, const S + " is not safe because it can change the representation of partition key", ErrorCodes::ALTER_OF_COLUMN_IS_FORBIDDEN); } - } - if (columns_alter_type_metadata_only.count(command.column_name)) - { - if (command.type == AlterCommand::MODIFY_COLUMN) + if (columns_alter_type_metadata_only.count(command.column_name)) { auto it = old_types.find(command.column_name); assert(it != old_types.end()); @@ -1598,6 +1597,12 @@ void MergeTreeData::checkAlterIsPossible(const AlterCommands & commands, const S + " is not safe because it can change the representation of primary key", ErrorCodes::ALTER_OF_COLUMN_IS_FORBIDDEN); } + + if (old_metadata.getColumns().has(command.column_name)) + { + columns_to_check_conversion.push_back( + new_metadata.getColumns().getPhysical(command.column_name)); + } } } } @@ -1605,6 +1610,12 @@ void MergeTreeData::checkAlterIsPossible(const AlterCommands & commands, const S checkProperties(new_metadata, old_metadata); checkTTLExpressions(new_metadata, old_metadata); + if (!columns_to_check_conversion.empty()) + { + auto old_header = old_metadata.getSampleBlock(); + performRequiredConversions(old_header, columns_to_check_conversion, global_context); + } + if (old_metadata.hasSettingsChanges()) { const auto current_changes = old_metadata.getSettingsChanges()->as().changes; diff --git a/tests/queries/0_stateless/01732_alters_bad_conversions.reference b/tests/queries/0_stateless/01732_alters_bad_conversions.reference new file mode 100644 index 00000000000..5f570c78579 --- /dev/null +++ b/tests/queries/0_stateless/01732_alters_bad_conversions.reference @@ -0,0 +1,4 @@ +CREATE TABLE default.bad_conversions\n(\n `a` UInt32\n)\nENGINE = MergeTree\nORDER BY tuple()\nSETTINGS index_granularity = 8192 +0 +CREATE TABLE default.bad_conversions_2\n(\n `e` Enum8(\'foo\' = 1, \'bar\' = 2)\n)\nENGINE = MergeTree\nORDER BY tuple()\nSETTINGS index_granularity = 8192 +0 diff --git a/tests/queries/0_stateless/01732_alters_bad_conversions.sql b/tests/queries/0_stateless/01732_alters_bad_conversions.sql new file mode 100644 index 00000000000..27da5242368 --- /dev/null +++ b/tests/queries/0_stateless/01732_alters_bad_conversions.sql @@ -0,0 +1,17 @@ +DROP TABLE IF EXISTS bad_conversions; +DROP TABLE IF EXISTS bad_conversions_2; + +CREATE TABLE bad_conversions (a UInt32) ENGINE = MergeTree ORDER BY tuple(); +INSERT INTO bad_conversions 
VALUES (1); +ALTER TABLE bad_conversions MODIFY COLUMN a Array(String); -- { serverError 53 } +SHOW CREATE TABLE bad_conversions; +SELECT count() FROM system.mutations WHERE table = 'bad_conversions' AND database = currentDatabase(); + +CREATE TABLE bad_conversions_2 (e Enum('foo' = 1, 'bar' = 2)) ENGINE = MergeTree ORDER BY tuple(); +INSERT INTO bad_conversions_2 VALUES (1); +ALTER TABLE bad_conversions_2 MODIFY COLUMN e Enum('bar' = 1, 'foo' = 2); -- { serverError 70 } +SHOW CREATE TABLE bad_conversions_2; +SELECT count() FROM system.mutations WHERE table = 'bad_conversions_2' AND database = currentDatabase(); + +DROP TABLE IF EXISTS bad_conversions; +DROP TABLE IF EXISTS bad_conversions_2; From f0396661b3cf74b98ea2b562d96edb18949e9df8 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Sat, 20 Feb 2021 19:13:36 +0300 Subject: [PATCH 0547/2357] Refactor ActionsDAG::splitActionsForFilter --- src/Interpreters/ActionsDAG.cpp | 411 ++++++++++++++++++-------------- src/Interpreters/ActionsDAG.h | 2 + 2 files changed, 228 insertions(+), 185 deletions(-) diff --git a/src/Interpreters/ActionsDAG.cpp b/src/Interpreters/ActionsDAG.cpp index 8b6013a4365..b3f86313a1c 100644 --- a/src/Interpreters/ActionsDAG.cpp +++ b/src/Interpreters/ActionsDAG.cpp @@ -1212,112 +1212,120 @@ ActionsDAG::SplitResult ActionsDAG::splitActionsForFilter(const std::string & co return split(split_nodes); } -ActionsDAGPtr ActionsDAG::splitActionsForFilter(const std::string & filter_name, bool can_remove_filter, const Names & available_inputs) +namespace { - std::unordered_map> inputs_map; - for (const auto & input : inputs) - inputs_map[input->result_name].emplace_back(input); - std::unordered_set allowed_nodes; - for (const auto & name : available_inputs) +struct ConjinctionNodes +{ + std::unordered_set allowed; + std::unordered_set rejected; +}; + +/// Take a node which result is predicate. +/// Assuming predicate is a conjunction (probably, trivial). +/// Find separate conjunctions nodes. Split nodes into allowed and rejected sets. +/// Allowed predicate is a predicate which can be calculated using only nodes from allowed_nodes set. +ConjinctionNodes getConjinctionNodes(ActionsDAG::Node * predicate, std::unordered_set allowed_nodes) +{ + ConjinctionNodes conjunction; + + struct Frame { - auto & inputs_list = inputs_map[name]; - if (inputs_list.empty()) - continue; + ActionsDAG::Node * node; + bool is_predicate = false; + size_t next_child_to_visit = 0; + size_t num_allowed_children = 0; + }; - allowed_nodes.emplace(inputs_list.front()); - inputs_list.pop_front(); - } - - auto it = index.begin(); - for (; it != index.end(); ++it) - if ((*it)->result_name == filter_name) - break; - - if (it == index.end()) - throw Exception(ErrorCodes::LOGICAL_ERROR, - "Index for ActionsDAG does not contain filter column name {}. DAG:\n{}", - filter_name, dumpDAG()); - - std::unordered_set selected_predicates; - std::unordered_set other_predicates; + std::stack stack; + std::unordered_set visited_nodes; + stack.push(Frame{.node = predicate, .is_predicate = true}); + visited_nodes.insert(predicate); + while (!stack.empty()) { - struct Frame + auto & cur = stack.top(); + bool is_conjunction = cur.is_predicate + && cur.node->type == ActionsDAG::ActionType::FUNCTION + && cur.node->function_base->getName() == "and"; + + /// At first, visit all children. 
+ while (cur.next_child_to_visit < cur.node->children.size()) { - Node * node; - bool is_predicate = false; - size_t next_child_to_visit = 0; - size_t num_allowed_children = 0; - }; + auto * child = cur.node->children[cur.next_child_to_visit]; - std::stack stack; - std::unordered_set visited_nodes; - - stack.push(Frame{.node = *it, .is_predicate = true}); - visited_nodes.insert(*it); - while (!stack.empty()) - { - auto & cur = stack.top(); - bool is_conjunction = cur.is_predicate - && cur.node->type == ActionType::FUNCTION - && cur.node->function_base->getName() == "and"; - - /// At first, visit all children. - while (cur.next_child_to_visit < cur.node->children.size()) + if (visited_nodes.count(child) == 0) { - auto * child = cur.node->children[cur.next_child_to_visit]; - - if (visited_nodes.count(child) == 0) - { - visited_nodes.insert(child); - stack.push({.node = child, .is_predicate = is_conjunction}); - break; - } - - if (allowed_nodes.contains(child)) - ++cur.num_allowed_children; - ++cur.next_child_to_visit; + visited_nodes.insert(child); + stack.push({.node = child, .is_predicate = is_conjunction}); + break; } - if (cur.next_child_to_visit == cur.node->children.size()) - { - if (cur.num_allowed_children == cur.node->children.size()) - { - if (cur.node->type != ActionType::ARRAY_JOIN && cur.node->type != ActionType::INPUT) - allowed_nodes.emplace(cur.node); - } - else if (is_conjunction) - { - for (auto * child : cur.node->children) - if (allowed_nodes.count(child)) - selected_predicates.insert(child); - } - else if (cur.is_predicate) - { - other_predicates.insert(cur.node); - } + if (allowed_nodes.contains(child)) + ++cur.num_allowed_children; + ++cur.next_child_to_visit; + } - stack.pop(); + if (cur.next_child_to_visit == cur.node->children.size()) + { + if (cur.num_allowed_children == cur.node->children.size()) + { + if (cur.node->type != ActionsDAG::ActionType::ARRAY_JOIN && cur.node->type != ActionsDAG::ActionType::INPUT) + allowed_nodes.emplace(cur.node); } + else if (is_conjunction) + { + for (auto * child : cur.node->children) + if (allowed_nodes.count(child)) + conjunction.allowed.insert(child); + } + else if (cur.is_predicate) + { + conjunction.rejected.insert(cur.node); + } + + stack.pop(); } } - if (selected_predicates.empty()) + if (conjunction.allowed.empty()) { - if (allowed_nodes.count(*it)) - selected_predicates.insert(*it); - else - return nullptr; + if (allowed_nodes.count(predicate)) + conjunction.allowed.insert(predicate); } - // std::cerr << "************* Selectecd predicates\n"; - // for (const auto * p : selected_predicates) - // std::cerr << p->result_name << std::endl; + return conjunction; +} - // std::cerr << "............. Other predicates\n"; - // for (const auto * p : other_predicates) - // std::cerr << p->result_name << std::endl; +ColumnsWithTypeAndName prepareFunctionArguments(const std::vector nodes) +{ + ColumnsWithTypeAndName arguments; + arguments.reserve(nodes.size()); + + for (const auto * child : nodes) + { + ColumnWithTypeAndName argument; + argument.column = child->column; + argument.type = child->result_type; + argument.name = child->result_name; + + arguments.emplace_back(std::move(argument)); + } + + return arguments; +} + +} + +/// Create actions which calculate conjunction of selected nodes. +/// Assume conjunction nodes are predicates (and may be used as arguments of function AND). +/// +/// Result actions add single column with conjunction result (it is always last in index). +/// No other columns are added or removed. 
+ActionsDAGPtr ActionsDAG::cloneActionsForConjunction(std::unordered_set conjunction) +{ + if (conjunction.empty()) + return nullptr; auto actions = cloneEmpty(); actions->settings.project_input = false; @@ -1327,82 +1335,128 @@ ActionsDAGPtr ActionsDAG::splitActionsForFilter(const std::string & filter_name, std::make_unique( std::make_shared())); - std::unordered_map nodes_mapping; + std::unordered_map nodes_mapping; + struct Frame { - struct Frame + const ActionsDAG::Node * node; + size_t next_child_to_visit = 0; + }; + + std::stack stack; + + /// DFS. Clone actions. + for (const auto * predicate : conjunction) + { + if (nodes_mapping.count(predicate)) + continue; + + stack.push({.node = predicate}); + while (!stack.empty()) { - const Node * node; - size_t next_child_to_visit = 0; - }; - - std::stack stack; - - for (const auto * predicate : selected_predicates) - { - if (nodes_mapping.count(predicate)) - continue; - - stack.push({.node = predicate}); - while (!stack.empty()) + auto & cur = stack.top(); + /// At first, visit all children. + while (cur.next_child_to_visit < cur.node->children.size()) { - auto & cur = stack.top(); - /// At first, visit all children. - while (cur.next_child_to_visit < cur.node->children.size()) + auto * child = cur.node->children[cur.next_child_to_visit]; + + if (nodes_mapping.count(child) == 0) { - auto * child = cur.node->children[cur.next_child_to_visit]; - - if (nodes_mapping.count(child) == 0) - { - stack.push({.node = child}); - break; - } - - ++cur.next_child_to_visit; + stack.push({.node = child}); + break; } - if (cur.next_child_to_visit == cur.node->children.size()) + ++cur.next_child_to_visit; + } + + if (cur.next_child_to_visit == cur.node->children.size()) + { + auto & node = actions->nodes.emplace_back(*cur.node); + nodes_mapping[cur.node] = &node; + + for (auto & child : node.children) + child = nodes_mapping[child]; + + if (node.type == ActionType::INPUT) { - auto & node = actions->nodes.emplace_back(*cur.node); - nodes_mapping[cur.node] = &node; - - for (auto & child : node.children) - child = nodes_mapping[child]; - - if (node.type == ActionType::INPUT) - { - actions->inputs.emplace_back(&node); - actions->index.insert(&node); - } - - stack.pop(); + actions->inputs.emplace_back(&node); + actions->index.insert(&node); } + + stack.pop(); } } - - Node * result_predicate = nodes_mapping[*selected_predicates.begin()]; - - if (selected_predicates.size() > 1) - { - std::vector args; - args.reserve(selected_predicates.size()); - for (const auto * predicate : selected_predicates) - args.emplace_back(nodes_mapping[predicate]); - - result_predicate = &actions->addFunction(func_builder_and, args, {}, true, false); - } - - actions->index.insert(result_predicate); } - if (selected_predicates.count(*it)) + Node * result_predicate = nodes_mapping[*conjunction.begin()]; + + if (conjunction.size() > 1) + { + std::vector args; + args.reserve(conjunction.size()); + for (const auto * predicate : conjunction) + args.emplace_back(nodes_mapping[predicate]); + + result_predicate = &actions->addFunction(func_builder_and, args, {}, true, false); + } + + actions->index.insert(result_predicate); + return actions; +} + +ActionsDAGPtr ActionsDAG::splitActionsForFilter(const std::string & filter_name, bool can_remove_filter, const Names & available_inputs) +{ + Node * predicate; + + { + auto it = index.begin(); + for (; it != index.end(); ++it) + if ((*it)->result_name == filter_name) + break; + + if (it == index.end()) + throw Exception(ErrorCodes::LOGICAL_ERROR, + 
"Index for ActionsDAG does not contain filter column name {}. DAG:\n{}", + filter_name, dumpDAG()); + + predicate = *it; + } + + std::unordered_set allowed_nodes; + + /// Get input nodes from available_inputs names. + { + std::unordered_map> inputs_map; + for (const auto & input : inputs) + inputs_map[input->result_name].emplace_back(input); + + for (const auto & name : available_inputs) + { + auto & inputs_list = inputs_map[name]; + if (inputs_list.empty()) + continue; + + allowed_nodes.emplace(inputs_list.front()); + inputs_list.pop_front(); + } + } + + auto conjunction = getConjinctionNodes(predicate, allowed_nodes); + auto actions = cloneActionsForConjunction(conjunction.allowed); + if (!actions) + return nullptr; + + /// Now, when actions are created, update current DAG. + + if (conjunction.allowed.count(predicate)) { /// The whole predicate was split. if (can_remove_filter) { + /// If filter column is not needed, remove it from index. for (auto i = index.begin(); i != index.end(); ++i) { - if (*i == *it) + if (*i == predicate) { index.remove(i); break; @@ -1411,84 +1465,71 @@ ActionsDAGPtr ActionsDAG::splitActionsForFilter(const std::string & filter_name, } else { + /// Replace predicate result to constant 1. Node node; node.type = ActionType::COLUMN; - node.result_name = std::move((*it)->result_name); - node.result_type = std::move((*it)->result_type); + node.result_name = std::move(predicate->result_name); + node.result_type = std::move(predicate->result_type); node.column = node.result_type->createColumnConst(0, 1); - *(*it) = std::move(node); + *predicate = std::move(node); } removeUnusedActions(false); } - else if ((*it)->type == ActionType::FUNCTION && (*it)->function_base->getName() == "and") + else { - std::vector new_children(other_predicates.begin(), other_predicates.end()); + /// Predicate is conjunction, where both allowed and rejected sets are not empty. + /// Replace this node to conjunction of rejected predicates. + + std::vector new_children(conjunction.rejected.begin(), conjunction.rejected.end()); if (new_children.size() == 1) { - if (new_children.front()->result_type->equals(*((*it)->result_type))) + /// Rejected set has only one predicate. + if (new_children.front()->result_type->equals(*predicate->result_type)) { + /// If it's type is same, just add alias. Node node; node.type = ActionType::ALIAS; - node.result_name = (*it)->result_name; - node.result_type = (*it)->result_type; + node.result_name = predicate->result_name; + node.result_type = predicate->result_type; node.children.swap(new_children); - *(*it) = std::move(node); + *predicate = std::move(node); } else { + /// If type is different, cast column. + /// This case is possible, cause AND can use any numeric type as argument. 
Node node; node.type = ActionType::COLUMN; - node.result_name = (*it)->result_type->getName(); + node.result_name = predicate->result_type->getName(); node.column = DataTypeString().createColumnConst(0, node.result_name); node.result_type = std::make_shared(); auto * right_arg = &nodes.emplace_back(std::move(node)); auto * left_arg = new_children.front(); - - (*it)->children = {left_arg, right_arg}; - ColumnsWithTypeAndName arguments; - arguments.reserve((*it)->children.size()); - - for (const auto * child : (*it)->children) - { - ColumnWithTypeAndName argument; - argument.column = child->column; - argument.type = child->result_type; - argument.name = child->result_name; - - arguments.emplace_back(std::move(argument)); - } + predicate->children = {left_arg, right_arg}; + auto arguments = prepareFunctionArguments(predicate->children); FunctionOverloadResolverPtr func_builder_cast = std::make_shared( CastOverloadResolver::createImpl(false)); - (*it)->function_builder = func_builder_cast; - (*it)->function_base = (*it)->function_builder->build(arguments); - (*it)->function = (*it)->function_base->prepare(arguments); + predicate->function_builder = func_builder_cast; + predicate->function_base = predicate->function_builder->build(arguments); + predicate->function = predicate->function_base->prepare(arguments); } } else { - (*it)->children.swap(new_children); - ColumnsWithTypeAndName arguments; - arguments.reserve((*it)->children.size()); + /// Predicate is function AND, which still have more then one argument. + /// Just update children and rebuild it. + predicate->children.swap(new_children); + auto arguments = prepareFunctionArguments(predicate->children); - for (const auto * child : (*it)->children) - { - ColumnWithTypeAndName argument; - argument.column = child->column; - argument.type = child->result_type; - argument.name = child->result_name; - - arguments.emplace_back(std::move(argument)); - } - - (*it)->function_base = (*it)->function_builder->build(arguments); - (*it)->function = (*it)->function_base->prepare(arguments); + predicate->function_base = predicate->function_builder->build(arguments); + predicate->function = predicate->function_base->prepare(arguments); } removeUnusedActions(false); diff --git a/src/Interpreters/ActionsDAG.h b/src/Interpreters/ActionsDAG.h index bd1dcd347df..87cf03f6edd 100644 --- a/src/Interpreters/ActionsDAG.h +++ b/src/Interpreters/ActionsDAG.h @@ -313,6 +313,8 @@ private: void addAliases(const NamesWithAliases & aliases, std::vector & result_nodes); void compileFunctions(); + + ActionsDAGPtr cloneActionsForConjunction(std::unordered_set conjunction); }; From 2ae0b47edbf1b01d45461e64c1c8df59ed2a7361 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Sat, 20 Feb 2021 19:25:47 +0300 Subject: [PATCH 0548/2357] Refactor tryPushDownFilter optimization. 
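tryAddNewFilterStep handled "filter column removed after the split" and "filter column replaced by a constant" in two separate branches. After the refactoring both cases are detected up front and handled by a single replacement of the FilterStep with an ExpressionStep. In sketch form (names as in tryAddNewFilterStep, illustrative only):

    const bool found_filter_column = it != expression->getIndex().end();
    const bool filter_is_constant = found_filter_column && (*it)->column && isColumnConst(*(*it)->column);

    if (!found_filter_column || filter_is_constant)
        /// All predicates of the filter were pushed down, or the filter became constant:
        /// nothing is left to filter here, so a plain ExpressionStep is enough.
        parent = std::make_unique<ExpressionStep>(child->getOutputStream(), expression);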
--- .../Optimizations/filterPushDown.cpp | 25 +++++++++++++------ 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp b/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp index 1b84fee4857..01e38e81092 100644 --- a/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp +++ b/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp @@ -56,19 +56,30 @@ static size_t tryAddNewFilterStep( if ((*it)->result_name == filter_column_name) break; + const bool found_filter_column = it != expression->getIndex().end(); + + if (!found_filter_column && removes_filter) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Filter column {} was removed from ActionsDAG but it is needed in result. DAG:\n{}", + filter_column_name, expression->dumpDAG()); + + const bool filter_is_constant = found_filter_column && (*it)->column && isColumnConst(*(*it)->column); + + if (!found_filter_column || filter_is_constant) + /// This means that all predicates of filter were pused down. + /// Replace current actions to expression, as we don't need to filter anything. + parent = std::make_unique(child->getOutputStream(), expression); + if (it == expression->getIndex().end()) { - if (!removes_filter) - throw Exception(ErrorCodes::LOGICAL_ERROR, - "Filter column {} was removed from ActionsDAG but it is needed in result. DAG:\n{}", - filter_column_name, expression->dumpDAG()); + /// Filter was removed after split. + + - // std::cerr << "replacing to expr because filter " << filter_column_name << " was removed\n"; - parent = std::make_unique(child->getOutputStream(), expression); } else if ((*it)->column && isColumnConst(*(*it)->column)) { - // std::cerr << "replacing to expr because filter is const\n"; + /// Filter column was replaced to constant. 
parent = std::make_unique(child->getOutputStream(), expression); } From 694d89ad81bcb4a551903097a0df042dd48639c7 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Sat, 20 Feb 2021 19:27:04 +0300 Subject: [PATCH 0549/2357] fix --- docker/test/stress/run.sh | 5 +++-- docker/test/stress/stress | 2 ++ src/Interpreters/DDLWorker.cpp | 2 +- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/docker/test/stress/run.sh b/docker/test/stress/run.sh index 03c140d8a83..ee291e5b04d 100755 --- a/docker/test/stress/run.sh +++ b/docker/test/stress/run.sh @@ -53,10 +53,11 @@ handle SIGBUS stop print handle SIGABRT stop print continue thread apply all backtrace -continue +detach +quit " > script.gdb - gdb -batch -command script.gdb -p "$(cat /var/run/clickhouse-server/clickhouse-server.pid)" & + gdb -batch -command script.gdb -p "$(cat /var/run/clickhouse-server/clickhouse-server.pid)" >> /test_output/gdb.log & } configure diff --git a/docker/test/stress/stress b/docker/test/stress/stress index 666fd4cce50..c62692f8683 100755 --- a/docker/test/stress/stress +++ b/docker/test/stress/stress @@ -85,6 +85,8 @@ if __name__ == "__main__": logging.info("All processes finished") if args.hung_check: + logging.info("Will terminate gdb (if any)") + res = call("killall -TERM gdb", shell=True, stderr=STDOUT) logging.info("Checking if some queries hung") cmd = "{} {} {}".format(args.test_cmd, "--hung-check", "00001_select_1") res = call(cmd, shell=True, stderr=STDOUT) diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index fc460a5584c..63df919de22 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -719,7 +719,7 @@ void DDLWorker::processTask(DDLTask & task) String dummy; if (zookeeper->tryGet(active_node_path, dummy, nullptr, eph_node_disappeared)) { - constexpr int timeout_ms = 5000; + constexpr int timeout_ms = 30 * 1000; if (!eph_node_disappeared->tryWait(timeout_ms)) throw Exception(ErrorCodes::LOGICAL_ERROR, "Ephemeral node {} still exists, " "probably it's owned by someone else", active_node_path); From fe159de141bd47ae1915fea24ad520d71ae6a9a3 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Sat, 20 Feb 2021 19:30:27 +0300 Subject: [PATCH 0550/2357] Update version_date.tsv after release 21.2.4.6 --- utils/list-versions/version_date.tsv | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/list-versions/version_date.tsv b/utils/list-versions/version_date.tsv index d0d782e77ec..f7035ebb506 100644 --- a/utils/list-versions/version_date.tsv +++ b/utils/list-versions/version_date.tsv @@ -1,3 +1,4 @@ +v21.2.4.6-stable 2021-02-20 v21.2.3.15-stable 2021-02-14 v21.2.2.8-stable 2021-02-07 v21.1.4.46-stable 2021-02-14 From 4fa822dd287cb699e170da2941effb3c89c7f0ea Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Sat, 20 Feb 2021 20:21:55 +0300 Subject: [PATCH 0551/2357] Update version_date.tsv after release 21.1.5.4 --- utils/list-versions/version_date.tsv | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/list-versions/version_date.tsv b/utils/list-versions/version_date.tsv index f7035ebb506..1ccf3c66580 100644 --- a/utils/list-versions/version_date.tsv +++ b/utils/list-versions/version_date.tsv @@ -1,6 +1,7 @@ v21.2.4.6-stable 2021-02-20 v21.2.3.15-stable 2021-02-14 v21.2.2.8-stable 2021-02-07 +v21.1.5.4-stable 2021-02-20 v21.1.4.46-stable 2021-02-14 v21.1.3.32-stable 2021-02-03 v21.1.2.15-stable 2021-01-18 From e49d90405cac621c35698443d69b8a2de887a9da Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Sat, 20 Feb 2021 20:39:18 
+0300 Subject: [PATCH 0552/2357] Update version_date.tsv after release 20.12.7.3 --- utils/list-versions/version_date.tsv | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/list-versions/version_date.tsv b/utils/list-versions/version_date.tsv index 1ccf3c66580..b0abdaab087 100644 --- a/utils/list-versions/version_date.tsv +++ b/utils/list-versions/version_date.tsv @@ -5,6 +5,7 @@ v21.1.5.4-stable 2021-02-20 v21.1.4.46-stable 2021-02-14 v21.1.3.32-stable 2021-02-03 v21.1.2.15-stable 2021-01-18 +v20.12.7.3-stable 2021-02-20 v20.12.6.29-stable 2021-02-14 v20.12.5.18-stable 2021-02-03 v20.12.5.14-stable 2020-12-28 From 00e0dbc3e5d39bb8bd0ff79b5001d69866c3a9cf Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Sat, 20 Feb 2021 20:42:06 +0300 Subject: [PATCH 0553/2357] Fix test. --- src/Interpreters/ActionsDAG.cpp | 23 +++++++++----- src/Interpreters/ActionsDAG.h | 2 +- .../Optimizations/filterPushDown.cpp | 30 ++----------------- .../01655_plan_optimizations.reference | 4 +-- .../0_stateless/01655_plan_optimizations.sh | 4 +-- 5 files changed, 23 insertions(+), 40 deletions(-) diff --git a/src/Interpreters/ActionsDAG.cpp b/src/Interpreters/ActionsDAG.cpp index b3f86313a1c..1406eecc5c0 100644 --- a/src/Interpreters/ActionsDAG.cpp +++ b/src/Interpreters/ActionsDAG.cpp @@ -1217,8 +1217,8 @@ namespace struct ConjinctionNodes { - std::unordered_set allowed; - std::unordered_set rejected; + std::vector allowed; + std::vector rejected; }; /// Take a node which result is predicate. @@ -1228,6 +1228,8 @@ struct ConjinctionNodes ConjinctionNodes getConjinctionNodes(ActionsDAG::Node * predicate, std::unordered_set allowed_nodes) { ConjinctionNodes conjunction; + std::unordered_set allowed; + std::unordered_set rejected; struct Frame { @@ -1276,12 +1278,19 @@ ConjinctionNodes getConjinctionNodes(ActionsDAG::Node * predicate, std::unordere else if (is_conjunction) { for (auto * child : cur.node->children) + { if (allowed_nodes.count(child)) - conjunction.allowed.insert(child); + { + if (allowed.insert(child).second) + conjunction.allowed.push_back(child); + + } + } } else if (cur.is_predicate) { - conjunction.rejected.insert(cur.node); + if (rejected.insert(cur.node).second) + conjunction.rejected.push_back(cur.node); } stack.pop(); @@ -1291,7 +1300,7 @@ ConjinctionNodes getConjinctionNodes(ActionsDAG::Node * predicate, std::unordere if (conjunction.allowed.empty()) { if (allowed_nodes.count(predicate)) - conjunction.allowed.insert(predicate); + conjunction.allowed.push_back(predicate); } return conjunction; @@ -1322,7 +1331,7 @@ ColumnsWithTypeAndName prepareFunctionArguments(const std::vector conjunction) +ActionsDAGPtr ActionsDAG::cloneActionsForConjunction(std::vector conjunction) { if (conjunction.empty()) return nullptr; @@ -1448,7 +1457,7 @@ ActionsDAGPtr ActionsDAG::splitActionsForFilter(const std::string & filter_name, /// Now, when actions are created, update current DAG. - if (conjunction.allowed.count(predicate)) + if (conjunction.rejected.empty()) { /// The whole predicate was split. 
if (can_remove_filter) diff --git a/src/Interpreters/ActionsDAG.h b/src/Interpreters/ActionsDAG.h index 87cf03f6edd..2e3baa181fd 100644 --- a/src/Interpreters/ActionsDAG.h +++ b/src/Interpreters/ActionsDAG.h @@ -314,7 +314,7 @@ private: void compileFunctions(); - ActionsDAGPtr cloneActionsForConjunction(std::unordered_set conjunction); + ActionsDAGPtr cloneActionsForConjunction(std::vector conjunction); }; diff --git a/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp b/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp index 01e38e81092..d64f082b7ee 100644 --- a/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp +++ b/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp @@ -58,11 +58,12 @@ static size_t tryAddNewFilterStep( const bool found_filter_column = it != expression->getIndex().end(); - if (!found_filter_column && removes_filter) + if (!found_filter_column && !removes_filter) throw Exception(ErrorCodes::LOGICAL_ERROR, "Filter column {} was removed from ActionsDAG but it is needed in result. DAG:\n{}", filter_column_name, expression->dumpDAG()); + /// Filter column was replaced to constant. const bool filter_is_constant = found_filter_column && (*it)->column && isColumnConst(*(*it)->column); if (!found_filter_column || filter_is_constant) @@ -70,19 +71,6 @@ static size_t tryAddNewFilterStep( /// Replace current actions to expression, as we don't need to filter anything. parent = std::make_unique(child->getOutputStream(), expression); - if (it == expression->getIndex().end()) - { - /// Filter was removed after split. - - - - } - else if ((*it)->column && isColumnConst(*(*it)->column)) - { - /// Filter column was replaced to constant. - parent = std::make_unique(child->getOutputStream(), expression); - } - /// Add new Filter step before Aggregating. 
/// Expression/Filter -> Aggregating -> Something auto & node = nodes.emplace_back(); @@ -109,20 +97,6 @@ static Names getAggregatinKeys(const Aggregator::Params & params) return keys; } -// static NameSet getColumnNamesFromSortDescription(const SortDescription & sort_desc, const Block & header) -// { -// NameSet names; -// for (const auto & column : sort_desc) -// { -// if (!column.column_name.empty()) -// names.insert(column.column_name); -// else -// names.insert(header.safeGetByPosition(column.column_number).name); -// } - -// return names; -// } - size_t tryPushDownFilter(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes) { if (parent_node->children.size() != 1) diff --git a/tests/queries/0_stateless/01655_plan_optimizations.reference b/tests/queries/0_stateless/01655_plan_optimizations.reference index fa83c098412..f261e134494 100644 --- a/tests/queries/0_stateless/01655_plan_optimizations.reference +++ b/tests/queries/0_stateless/01655_plan_optimizations.reference @@ -68,7 +68,7 @@ Filter column: notEquals(y, 0) 9 10 > one condition of filter should be pushed down after aggregating, other two conditions are ANDed Filter column -FUNCTION and(minus(s, 4) :: 2, minus(s, 8) :: 1) -> and(notEquals(y, 0), minus(s, 8), minus(s, 4)) +FUNCTION and(minus(s, 8) :: 1, minus(s, 4) :: 2) -> and(notEquals(y, 0), minus(s, 8), minus(s, 4)) Aggregating Filter column: notEquals(y, 0) 0 1 @@ -83,7 +83,7 @@ Filter column: notEquals(y, 0) Filter column ALIAS notEquals(s, 8) :: 1 -> and(notEquals(y, 0), notEquals(s, 8), minus(y, 4)) Aggregating -Filter column: and(minus(y, 4), notEquals(y, 0)) +Filter column: and(notEquals(y, 0), minus(y, 4)) 0 1 1 2 2 3 diff --git a/tests/queries/0_stateless/01655_plan_optimizations.sh b/tests/queries/0_stateless/01655_plan_optimizations.sh index e47b03661e4..84452fe651f 100755 --- a/tests/queries/0_stateless/01655_plan_optimizations.sh +++ b/tests/queries/0_stateless/01655_plan_optimizations.sh @@ -66,7 +66,7 @@ $CLICKHOUSE_CLIENT -q " select sum(x) as s, y from (select number as x, number + 1 as y from numbers(10)) group by y ) where y != 0 and s - 8 and s - 4 settings enable_optimize_predicate_expression=0" | - grep -o "Aggregating\|Filter column\|Filter column: notEquals(y, 0)\|FUNCTION and(minus(s, 4) :: 2, minus(s, 8) :: 1) -> and(notEquals(y, 0), minus(s, 8), minus(s, 4))" + grep -o "Aggregating\|Filter column\|Filter column: notEquals(y, 0)\|FUNCTION and(minus(s, 8) :: 1, minus(s, 4) :: 2) -> and(notEquals(y, 0), minus(s, 8), minus(s, 4))" $CLICKHOUSE_CLIENT -q " select s, y from ( select sum(x) as s, y from (select number as x, number + 1 as y from numbers(10)) group by y @@ -79,7 +79,7 @@ $CLICKHOUSE_CLIENT -q " select sum(x) as s, y from (select number as x, number + 1 as y from numbers(10)) group by y ) where y != 0 and s != 8 and y - 4 settings enable_optimize_predicate_expression=0" | - grep -o "Aggregating\|Filter column\|Filter column: and(minus(y, 4), notEquals(y, 0))\|ALIAS notEquals(s, 8) :: 1 -> and(notEquals(y, 0), notEquals(s, 8), minus(y, 4))" + grep -o "Aggregating\|Filter column\|Filter column: and(notEquals(y, 0), minus(y, 4))\|ALIAS notEquals(s, 8) :: 1 -> and(notEquals(y, 0), notEquals(s, 8), minus(y, 4))" $CLICKHOUSE_CLIENT -q " select s, y from ( select sum(x) as s, y from (select number as x, number + 1 as y from numbers(10)) group by y From 64e76a4a8da87adb374ffeb571fe76eac4850ae8 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 20 Feb 2021 21:13:36 +0300 Subject: [PATCH 0554/2357] Minor changes in Decimal --- 
src/Core/DecimalComparison.h | 2 +- src/Core/DecimalFunctions.h | 24 +++++++++---------- src/Core/MySQL/MySQLReplication.cpp | 6 ++--- src/DataTypes/DataTypeDateTime64.cpp | 4 ++-- src/DataTypes/DataTypeDecimalBase.h | 10 ++++---- src/DataTypes/DataTypesDecimal.cpp | 2 +- src/DataTypes/DataTypesDecimal.h | 2 +- src/DataTypes/convertMySQLDataType.cpp | 6 ++--- .../fetchPostgreSQLTableStructure.cpp | 8 +++---- src/Functions/array/arrayAggregation.cpp | 2 +- src/Functions/array/arrayCumSum.cpp | 2 +- .../array/arrayCumSumNonNegative.cpp | 2 +- src/Functions/isDecimalOverflow.cpp | 2 +- src/IO/WriteHelpers.h | 20 ++++++++-------- 14 files changed, 46 insertions(+), 46 deletions(-) diff --git a/src/Core/DecimalComparison.h b/src/Core/DecimalComparison.h index 8279d01d35a..486c2c1f8f4 100644 --- a/src/Core/DecimalComparison.h +++ b/src/Core/DecimalComparison.h @@ -78,7 +78,7 @@ public: static bool compare(A a, B b, UInt32 scale_a, UInt32 scale_b) { - static const UInt32 max_scale = DecimalUtils::maxPrecision(); + static const UInt32 max_scale = DecimalUtils::max_precision; if (scale_a > max_scale || scale_b > max_scale) throw Exception("Bad scale of decimal field", ErrorCodes::DECIMAL_OVERFLOW); diff --git a/src/Core/DecimalFunctions.h b/src/Core/DecimalFunctions.h index 2b916cbf538..355cf1d378a 100644 --- a/src/Core/DecimalFunctions.h +++ b/src/Core/DecimalFunctions.h @@ -24,13 +24,13 @@ namespace ErrorCodes namespace DecimalUtils { -static constexpr size_t minPrecision() { return 1; } -template static constexpr size_t maxPrecision() { return 0; } -template <> constexpr size_t maxPrecision() { return 9; } -template <> constexpr size_t maxPrecision() { return 18; } -template <> constexpr size_t maxPrecision() { return 18; } -template <> constexpr size_t maxPrecision() { return 38; } -template <> constexpr size_t maxPrecision() { return 76; } +inline constexpr size_t min_precision = 1; +template inline constexpr size_t max_precision = 0; +template <> inline constexpr size_t max_precision = 9; +template <> inline constexpr size_t max_precision = 18; +template <> inline constexpr size_t max_precision = 18; +template <> inline constexpr size_t max_precision = 38; +template <> inline constexpr size_t max_precision = 76; template inline auto scaleMultiplier(UInt32 scale) @@ -87,7 +87,7 @@ struct DataTypeDecimalTrait * * Sign of `whole` controls sign of result: negative whole => negative result, positive whole => positive result. * Sign of `fractional` is expected to be positive, otherwise result is undefined. - * If `scale` is to big (scale > maxPrecision), result is undefined. + * If `scale` is to big (scale > max_precision), result is undefined. */ template inline DecimalType decimalFromComponentsWithMultiplier( @@ -287,21 +287,21 @@ inline auto binaryOpResult(const DecimalType & tx, const DecimalType & ty) scale = (tx.getScale() > ty.getScale() ? 
tx.getScale() : ty.getScale()); if constexpr (sizeof(T) < sizeof(U)) - return DataTypeDecimalTrait(DecimalUtils::maxPrecision(), scale); + return DataTypeDecimalTrait(DecimalUtils::max_precision, scale); else - return DataTypeDecimalTrait(DecimalUtils::maxPrecision(), scale); + return DataTypeDecimalTrait(DecimalUtils::max_precision, scale); } template typename DecimalType> inline const DataTypeDecimalTrait binaryOpResult(const DecimalType & tx, const DataTypeNumber &) { - return DataTypeDecimalTrait(DecimalUtils::maxPrecision(), tx.getScale()); + return DataTypeDecimalTrait(DecimalUtils::max_precision, tx.getScale()); } template typename DecimalType> inline const DataTypeDecimalTrait binaryOpResult(const DataTypeNumber &, const DecimalType & ty) { - return DataTypeDecimalTrait(DecimalUtils::maxPrecision(), ty.getScale()); + return DataTypeDecimalTrait(DecimalUtils::max_precision, ty.getScale()); } } diff --git a/src/Core/MySQL/MySQLReplication.cpp b/src/Core/MySQL/MySQLReplication.cpp index 8fdf337c849..1b202c4edb4 100644 --- a/src/Core/MySQL/MySQLReplication.cpp +++ b/src/Core/MySQL/MySQLReplication.cpp @@ -475,11 +475,11 @@ namespace MySQLReplication { const auto & dispatch = [](const size_t & precision, const size_t & scale, const auto & function) -> Field { - if (precision <= DecimalUtils::maxPrecision()) + if (precision <= DecimalUtils::max_precision) return Field(function(precision, scale, Decimal32())); - else if (precision <= DecimalUtils::maxPrecision()) + else if (precision <= DecimalUtils::max_precision) return Field(function(precision, scale, Decimal64())); - else if (precision <= DecimalUtils::maxPrecision()) + else if (precision <= DecimalUtils::max_precision) return Field(function(precision, scale, Decimal128())); return Field(function(precision, scale, Decimal256())); diff --git a/src/DataTypes/DataTypeDateTime64.cpp b/src/DataTypes/DataTypeDateTime64.cpp index 09e39c2de1a..17b94e871bf 100644 --- a/src/DataTypes/DataTypeDateTime64.cpp +++ b/src/DataTypes/DataTypeDateTime64.cpp @@ -28,7 +28,7 @@ namespace ErrorCodes static constexpr UInt32 max_scale = 9; DataTypeDateTime64::DataTypeDateTime64(UInt32 scale_, const std::string & time_zone_name) - : DataTypeDecimalBase(DecimalUtils::maxPrecision(), scale_), + : DataTypeDecimalBase(DecimalUtils::max_precision, scale_), TimezoneMixin(time_zone_name) { if (scale > max_scale) @@ -37,7 +37,7 @@ DataTypeDateTime64::DataTypeDateTime64(UInt32 scale_, const std::string & time_z } DataTypeDateTime64::DataTypeDateTime64(UInt32 scale_, const TimezoneMixin & time_zone_info) - : DataTypeDecimalBase(DecimalUtils::maxPrecision(), scale_), + : DataTypeDecimalBase(DecimalUtils::max_precision, scale_), TimezoneMixin(time_zone_info) { if (scale > max_scale) diff --git a/src/DataTypes/DataTypeDecimalBase.h b/src/DataTypes/DataTypeDecimalBase.h index c861b3bcac0..d9079166fa7 100644 --- a/src/DataTypes/DataTypeDecimalBase.h +++ b/src/DataTypes/DataTypeDecimalBase.h @@ -65,7 +65,7 @@ public: static constexpr bool is_parametric = true; - static constexpr size_t maxPrecision() { return DecimalUtils::maxPrecision(); } + static constexpr size_t maxPrecision() { return DecimalUtils::max_precision; } DataTypeDecimalBase(UInt32 precision_, UInt32 scale_) : precision(precision_), @@ -197,17 +197,17 @@ inline const DecimalType decimalResultType(const DataTypeNumber & tx, cons template