Alexey Milovidov 2016-01-28 04:00:27 +03:00
parent b7d3f0e4d6
commit 67a07205b7
58 changed files with 3453 additions and 474 deletions

View File

@ -0,0 +1,26 @@
#pragma once
#include <DB/Interpreters/ClusterProxy/IQueryConstructor.h>
namespace DB
{
namespace ClusterProxy
{
class AlterQueryConstructor final : public IQueryConstructor
{
public:
AlterQueryConstructor() = default;
BlockInputStreamPtr createLocal(ASTPtr query_ast, const Context & context, const Cluster::Address & address) override;
BlockInputStreamPtr createRemote(IConnectionPool * pool, const std::string & query,
const Settings & settings, ThrottlerPtr throttler, const Context & context) override;
BlockInputStreamPtr createRemote(ConnectionPoolsPtr & pools, const std::string & query,
const Settings & settings, ThrottlerPtr throttler, const Context & context) override;
bool isInclusive() const override;
};
}
}

View File

@ -0,0 +1,26 @@
#pragma once
#include <DB/Interpreters/ClusterProxy/IQueryConstructor.h>
namespace DB
{
namespace ClusterProxy
{
class DescribeQueryConstructor final : public IQueryConstructor
{
public:
DescribeQueryConstructor() = default;
BlockInputStreamPtr createLocal(ASTPtr query_ast, const Context & context, const Cluster::Address & address) override;
BlockInputStreamPtr createRemote(IConnectionPool * pool, const std::string & query,
const Settings & settings, ThrottlerPtr throttler, const Context & context) override;
BlockInputStreamPtr createRemote(ConnectionPoolsPtr & pools, const std::string & query,
const Settings & settings, ThrottlerPtr throttler, const Context & context) override;
bool isInclusive() const override;
};
}
}

View File

@ -0,0 +1,35 @@
#pragma once
#include <DB/Interpreters/Cluster.h>
#include <DB/Parsers/IAST.h>
#include <DB/Storages/IStorage.h>
#include <DB/Client/ConnectionPool.h>
namespace DB
{
class Settings;
class Context;
class Cluster;
class IInterpreter;
class RemoteBlockInputStream;
class Throttler;
namespace ClusterProxy
{
class IQueryConstructor
{
public:
virtual ~IQueryConstructor() {}
virtual BlockInputStreamPtr createLocal(ASTPtr query_ast, const Context & context, const Cluster::Address & address) = 0;
virtual BlockInputStreamPtr createRemote(IConnectionPool * pool, const std::string & query,
const Settings & settings, ThrottlerPtr throttler, const Context & context) = 0;
virtual BlockInputStreamPtr createRemote(ConnectionPoolsPtr & pools, const std::string & query,
const Settings & new_settings, ThrottlerPtr throttler, const Context & context) = 0;
virtual bool isInclusive() const = 0;
};
}
}

View File

@ -0,0 +1,37 @@
#pragma once
#include <DB/Parsers/IAST.h>
#include <DB/Storages/IStorage.h>
#include <DB/Client/ConnectionPool.h>
namespace DB
{
class Settings;
class Context;
class Cluster;
namespace ClusterProxy
{
class IQueryConstructor;
class Query
{
public:
Query(IQueryConstructor & query_constructor_, Cluster & cluster_,
ASTPtr query_ast_, const Context & context_, const Settings & settings_, bool enable_shard_multiplexing_);
BlockInputStreams execute();
private:
IQueryConstructor & query_constructor;
Cluster & cluster;
ASTPtr query_ast;
const Context & context;
const Settings & settings;
bool enable_shard_multiplexing;
};
}
}
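The two headers above split the cluster fan-out into a strategy (IQueryConstructor and its per-query-kind implementations) and a driver (ClusterProxy::Query) that walks the cluster and asks the strategy for a local or a remote stream per shard. The following standalone sketch models only that split, with simplified types (plain strings instead of block input streams, a made-up ShardInfo); it is not the ClickHouse API.

// Simplified standalone model of the constructor/driver split sketched above.
#include <iostream>
#include <string>
#include <vector>

struct ShardInfo { bool is_local; std::string address; };

// Per-query-kind strategy: decides how to run on a local or a remote shard.
struct QueryConstructor
{
    virtual ~QueryConstructor() {}
    virtual std::string createLocal(const std::string & query, const ShardInfo & shard) = 0;
    virtual std::string createRemote(const std::string & query, const ShardInfo & shard) = 0;
};

struct EchoConstructor : QueryConstructor   // hypothetical example strategy
{
    std::string createLocal(const std::string & q, const ShardInfo & s) override { return "local(" + s.address + "): " + q; }
    std::string createRemote(const std::string & q, const ShardInfo & s) override { return "remote(" + s.address + "): " + q; }
};

// The driver walks the cluster and delegates to the strategy, like ClusterProxy::Query::execute.
std::vector<std::string> execute(QueryConstructor & ctor, const std::vector<ShardInfo> & shards, const std::string & query)
{
    std::vector<std::string> streams;
    for (const auto & shard : shards)
        streams.push_back(shard.is_local ? ctor.createLocal(query, shard) : ctor.createRemote(query, shard));
    return streams;
}

int main()
{
    EchoConstructor ctor;
    std::vector<ShardInfo> shards{{true, "localhost"}, {false, "replica-1:9000"}};
    for (const auto & s : execute(ctor, shards, "DESCRIBE TABLE hits"))
        std::cout << s << '\n';
}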

View File

@ -0,0 +1,32 @@
#pragma once
#include <DB/Interpreters/ClusterProxy/IQueryConstructor.h>
#include <DB/Core/QueryProcessingStage.h>
#include <DB/Storages/IStorage.h>
namespace DB
{
namespace ClusterProxy
{
class SelectQueryConstructor final : public IQueryConstructor
{
public:
SelectQueryConstructor(const QueryProcessingStage::Enum & processed_stage, const Tables & external_tables);
BlockInputStreamPtr createLocal(ASTPtr query_ast, const Context & context, const Cluster::Address & address) override;
BlockInputStreamPtr createRemote(IConnectionPool * pool, const std::string & query,
const Settings & settings, ThrottlerPtr throttler, const Context & context) override;
BlockInputStreamPtr createRemote(ConnectionPoolsPtr & pools, const std::string & query,
const Settings & settings, ThrottlerPtr throttler, const Context & context) override;
bool isInclusive() const override;
private:
const QueryProcessingStage::Enum & processed_stage;
const Tables & external_tables;
};
}
}

View File

@ -30,6 +30,7 @@ class Dictionaries;
class ExternalDictionaries;
class InterserverIOHandler;
class BackgroundProcessingPool;
class ReshardingWorker;
class MergeList;
class Cluster;
class Compiler;
@ -250,6 +251,8 @@ public:
BackgroundProcessingPool & getBackgroundPool();
ReshardingWorker & getReshardingWorker();
/** Clear the caches of uncompressed blocks and marks.
* This is usually done when renaming tables, changing column types, or dropping a table,
* because the caches are tied to file names and become invalid.

View File

@ -16,7 +16,7 @@ namespace DB
class InterpreterAlterQuery : public IInterpreter
{
public:
InterpreterAlterQuery(ASTPtr query_ptr_, Context & context_);
InterpreterAlterQuery(ASTPtr query_ptr_, const Context & context_);
BlockIO execute() override;
@ -28,7 +28,8 @@ public:
const NamesAndTypesList & materialized_columns,
const NamesAndTypesList & alias_columns,
const ColumnDefaults & column_defaults,
Context & context);
const Context & context);
private:
struct PartitionCommand
{
@ -38,6 +39,7 @@ private:
ATTACH_PARTITION,
FETCH_PARTITION,
FREEZE_PARTITION,
RESHARD_PARTITION
};
Type type;
@ -50,6 +52,11 @@ private:
String from; /// For FETCH PARTITION - the path in ZK to the shard from which to download the partition.
/// For RESHARD PARTITION.
Field last_partition;
WeightedZooKeeperPaths weighted_zookeeper_paths;
String sharding_key;
static PartitionCommand dropPartition(const Field & partition, bool detach, bool unreplicated)
{
return {DROP_PARTITION, partition, detach, unreplicated};
@ -69,6 +76,12 @@ private:
{
return {FREEZE_PARTITION, partition};
}
static PartitionCommand reshardPartitions(const Field & first_partition_, const Field & last_partition_,
const WeightedZooKeeperPaths & weighted_zookeeper_paths_, const String & sharding_key_)
{
return {RESHARD_PARTITION, first_partition_, false, false, false, {}, last_partition_, weighted_zookeeper_paths_, sharding_key_};
}
};
typedef std::vector<PartitionCommand> PartitionCommands;

View File

@ -23,7 +23,7 @@ namespace DB
class InterpreterDescribeQuery : public IInterpreter
{
public:
InterpreterDescribeQuery(ASTPtr query_ptr_, Context & context_)
InterpreterDescribeQuery(ASTPtr query_ptr_, const Context & context_)
: query_ptr(query_ptr_), context(context_) {}
BlockIO execute() override

View File

@ -2,6 +2,10 @@
#include <DB/IO/ReadBuffer.h>
#include <DB/IO/WriteBuffer.h>
#include <DB/IO/ReadBufferFromString.h>
#include <DB/IO/ReadHelpers.h>
#include <DB/IO/WriteBufferFromString.h>
#include <DB/IO/WriteHelpers.h>
#include <DB/Core/Types.h>
#include <map>
#include <atomic>
@ -16,12 +20,50 @@ namespace ErrorCodes
extern const int NO_SUCH_INTERSERVER_IO_ENDPOINT;
}
/** Location of a service.
*/
struct InterserverIOEndpointLocation
{
public:
InterserverIOEndpointLocation(const std::string & name_, const std::string & host_, UInt16 port_)
: name(name_), host(host_), port(port_)
{
}
/// Creates a location from its serialized representation.
InterserverIOEndpointLocation(const std::string & serialized_location)
{
ReadBufferFromString buf(serialized_location);
readBinary(name, buf);
readBinary(host, buf);
readBinary(port, buf);
assertEOF(buf);
}
/// Serializes the location.
std::string toString() const
{
std::string serialized_location;
WriteBufferFromString buf(serialized_location);
writeBinary(name, buf);
writeBinary(host, buf);
writeBinary(port, buf);
buf.next();
return serialized_location;
}
public:
std::string name;
std::string host;
UInt16 port;
};
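InterserverIOEndpointLocation above round-trips (name, host, port) through writeBinary/readBinary over a string buffer. The standalone sketch below shows the same serialize-then-parse round trip, but with simple length-prefixed text fields instead of the DB binary helpers; the field layout and the example host/port are illustrative only.

// Standalone sketch of the (name, host, port) round trip, not ClickHouse's wire format.
#include <cassert>
#include <cstdint>
#include <sstream>
#include <string>

struct Location { std::string name, host; uint16_t port = 0; };

std::string serialize(const Location & loc)
{
    std::ostringstream out;
    out << loc.name.size() << ' ' << loc.name << ' '
        << loc.host.size() << ' ' << loc.host << ' ' << loc.port;
    return out.str();
}

Location deserialize(const std::string & s)
{
    std::istringstream in(s);
    Location loc;
    size_t len = 0;
    char space = 0;
    in >> len; in.get(space); loc.name.resize(len); in.read(&loc.name[0], len);
    in >> len; in.get(space); loc.host.resize(len); in.read(&loc.host[0], len);
    in >> loc.port;
    return loc;
}

int main()
{
    Location original{"DataPartsExchange", "example-host", 9009};   // hypothetical endpoint
    Location restored = deserialize(serialize(original));
    assert(restored.name == original.name && restored.host == original.host && restored.port == original.port);
}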
/** Handler for requests from other servers.
*/
class InterserverIOEndpoint
{
public:
virtual std::string getId(const std::string & path) const = 0;
virtual void processQuery(const Poco::Net::HTMLForm & params, WriteBuffer & out) = 0;
virtual ~InterserverIOEndpoint() {}

View File

@ -1,7 +1,6 @@
#pragma once
#include <DB/Parsers/IAST.h>
#include <mysqlxx/Manip.h>
namespace DB
@ -18,6 +17,7 @@ namespace ErrorCodes
* DROP COLUMN col_drop,
* MODIFY COLUMN col_name type,
* DROP PARTITION partition
* RESHARD PARTITION partition TO /path/to/zookeeper/table WEIGHT w, ... USING column
* ...
*/
@ -33,12 +33,14 @@ public:
ATTACH_PARTITION,
FETCH_PARTITION,
FREEZE_PARTITION,
RESHARD_PARTITION,
NO_TYPE
};
struct Parameters
{
Parameters() : type(NO_TYPE) {}
Parameters();
int type = NO_TYPE;
/** For an ADD COLUMN query, the name and type of the column being added are stored here
@ -52,7 +54,7 @@ public:
*/
ASTPtr column;
/** For a DROP PARTITION query, the partition name is stored here.
/** For DROP PARTITION and RESHARD PARTITION queries, the partition name is stored here.
*/
ASTPtr partition;
bool detach = false; /// true for DETACH PARTITION.
@ -64,126 +66,33 @@ public:
*/
String from;
/** For RESHARD PARTITION.
*/
ASTPtr last_partition;
ASTPtr weighted_zookeeper_paths;
String sharding_key;
/// deep copy
void clone(Parameters & p) const
{
p = *this;
if (col_decl) p.col_decl = col_decl->clone();
if (column) p.column = column->clone();
if (partition) p.partition = partition->clone();
}
void clone(Parameters & p) const;
};
typedef std::vector<Parameters> ParameterContainer;
ParameterContainer parameters;
String database;
String table;
void addParameters(const Parameters & params)
{
parameters.push_back(params);
if (params.col_decl)
children.push_back(params.col_decl);
if (params.column)
children.push_back(params.column);
if (params.partition)
children.push_back(params.partition);
}
void addParameters(const Parameters & params);
ASTAlterQuery(StringRange range_ = StringRange()) : IAST(range_) {};
ASTAlterQuery(StringRange range_ = StringRange());
/** Get the text that identifies this element. */
String getID() const override { return ("AlterQuery_" + database + "_" + table); };
String getID() const override;
ASTPtr clone() const override
{
ASTAlterQuery * res = new ASTAlterQuery(*this);
for (ParameterContainer::size_type i = 0; i < parameters.size(); ++i)
parameters[i].clone(res->parameters[i]);
return res;
}
ASTPtr clone() const override;
protected:
void formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const override
{
frame.need_parens = false;
std::string indent_str = settings.one_line ? "" : std::string(4 * frame.indent, ' ');
settings.ostr << (settings.hilite ? hilite_keyword : "") << indent_str << "ALTER TABLE " << (settings.hilite ? hilite_none : "");
if (!table.empty())
{
if (!database.empty())
{
settings.ostr << indent_str << database;
settings.ostr << ".";
}
settings.ostr << indent_str << table;
}
settings.ostr << settings.nl_or_ws;
for (size_t i = 0; i < parameters.size(); ++i)
{
const ASTAlterQuery::Parameters & p = parameters[i];
if (p.type == ASTAlterQuery::ADD_COLUMN)
{
settings.ostr << (settings.hilite ? hilite_keyword : "") << indent_str << "ADD COLUMN " << (settings.hilite ? hilite_none : "");
p.col_decl->formatImpl(settings, state, frame);
/// AFTER
if (p.column)
{
settings.ostr << (settings.hilite ? hilite_keyword : "") << indent_str << " AFTER " << (settings.hilite ? hilite_none : "");
p.column->formatImpl(settings, state, frame);
}
}
else if (p.type == ASTAlterQuery::DROP_COLUMN)
{
settings.ostr << (settings.hilite ? hilite_keyword : "") << indent_str << "DROP COLUMN " << (settings.hilite ? hilite_none : "");
p.column->formatImpl(settings, state, frame);
}
else if (p.type == ASTAlterQuery::MODIFY_COLUMN)
{
settings.ostr << (settings.hilite ? hilite_keyword : "") << indent_str << "MODIFY COLUMN " << (settings.hilite ? hilite_none : "");
p.col_decl->formatImpl(settings, state, frame);
}
else if (p.type == ASTAlterQuery::DROP_PARTITION)
{
settings.ostr << (settings.hilite ? hilite_keyword : "") << indent_str << (p.detach ? "DETACH" : "DROP") << " PARTITION "
<< (settings.hilite ? hilite_none : "");
p.partition->formatImpl(settings, state, frame);
}
else if (p.type == ASTAlterQuery::ATTACH_PARTITION)
{
settings.ostr << (settings.hilite ? hilite_keyword : "") << indent_str << "ATTACH " << (p.unreplicated ? "UNREPLICATED " : "")
<< (p.part ? "PART " : "PARTITION ") << (settings.hilite ? hilite_none : "");
p.partition->formatImpl(settings, state, frame);
}
else if (p.type == ASTAlterQuery::FETCH_PARTITION)
{
settings.ostr << (settings.hilite ? hilite_keyword : "") << indent_str << "FETCH " << (p.unreplicated ? "UNREPLICATED " : "")
<< "PARTITION " << (settings.hilite ? hilite_none : "");
p.partition->formatImpl(settings, state, frame);
settings.ostr << (settings.hilite ? hilite_keyword : "")
<< " FROM " << (settings.hilite ? hilite_none : "") << mysqlxx::quote << p.from;
}
else if (p.type == ASTAlterQuery::FREEZE_PARTITION)
{
settings.ostr << (settings.hilite ? hilite_keyword : "") << indent_str << "FREEZE PARTITION " << (settings.hilite ? hilite_none : "");
p.partition->formatImpl(settings, state, frame);
}
else
throw Exception("Unexpected type of ALTER", ErrorCodes::UNEXPECTED_AST_STRUCTURE);
std::string comma = (i < (parameters.size() -1) ) ? "," : "";
settings.ostr << (settings.hilite ? hilite_keyword : "") << indent_str << comma << (settings.hilite ? hilite_none : "");
settings.ostr << settings.nl_or_ws;
}
}
void formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const override;
};
}

View File

@ -0,0 +1,30 @@
#pragma once
#include <DB/Parsers/IAST.h>
#include <DB/Core/Types.h>
#include <mysqlxx/Manip.h>
namespace DB
{
class ASTWeightedZooKeeperPath : public IAST
{
public:
ASTWeightedZooKeeperPath() = default;
ASTWeightedZooKeeperPath(StringRange range_) : IAST(range_) {}
String getID() const override { return "Weighted_ZooKeeper_Path"; }
ASTPtr clone() const override { return new ASTWeightedZooKeeperPath(*this); }
public:
String path;
UInt64 weight;
protected:
void formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const override
{
std::string indent_str = settings.one_line ? "" : std::string(4 * frame.indent, ' ');
settings.ostr << settings.nl_or_ws << indent_str << mysqlxx::quote << path << " WEIGHT " << weight;
}
};
}
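For reference, a weighted path renders as a quoted ZooKeeper path followed by WEIGHT and a number, which is the form expected in RESHARD PARTITION ... TO. A minimal sketch of that rendering, with hypothetical paths (the real code quotes via mysqlxx::quote, which also escapes special characters):

// Minimal rendering sketch; quoting is simplified compared to mysqlxx::quote.
#include <cstdint>
#include <iostream>
#include <string>

std::string formatWeightedPath(const std::string & path, uint64_t weight)
{
    return "'" + path + "' WEIGHT " + std::to_string(weight);
}

int main()
{
    // Hypothetical destination shards, just to show the shape of the output.
    std::cout << formatWeightedPath("/clickhouse/tables/01/hits", 2) << ",\n"
              << formatWeightedPath("/clickhouse/tables/02/hits", 1) << '\n';
}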

View File

@ -216,5 +216,13 @@ protected:
bool parseImpl(Pos & pos, Pos end, ASTPtr & node, Pos & max_parsed_pos, Expected & expected);
};
/** A shard path in ZooKeeper together with its weight.
*/
class ParserWeightedZooKeeperPath : public IParserBase
{
protected:
const char * getName() const { return "weighted ZooKeeper path"; }
bool parseImpl(Pos & pos, Pos end, ASTPtr & node, Pos & max_parsed_pos, Expected & expected);
};
}

View File

@ -13,6 +13,7 @@ namespace DB
* [DROP|DETACH|ATTACH [UNREPLICATED] PARTITION|PART partition, ...]
* [FETCH PARTITION partition FROM ...]
* [FREEZE PARTITION]
* [RESHARD PARTITION partition TO zookeeper/path/to/partition [WEIGHT w] [, ...] USING sharding_key]
*/
class ParserAlterQuery : public IParserBase
{

View File

@ -6,6 +6,10 @@
namespace DB
{
/// For RESHARD PARTITION.
using WeightedZooKeeperPath = std::pair<String, UInt64>;
using WeightedZooKeeperPaths = std::vector<WeightedZooKeeperPath>;
/// An operation from an ALTER query (other than PART/PARTITION manipulation). Adding Nested-type columns is not expanded into adding the individual columns.
struct AlterCommand
{

View File

@ -206,7 +206,7 @@ public:
* This method must fully execute the ALTER query, taking care of the locks itself.
* To update the table metadata on disk, this method must call InterpreterAlterQuery::updateMetadata.
*/
virtual void alter(const AlterCommands & params, const String & database_name, const String & table_name, Context & context)
virtual void alter(const AlterCommands & params, const String & database_name, const String & table_name, const Context & context)
{
throw Exception("Method alter is not supported by storage " + getName(), ErrorCodes::NOT_IMPLEMENTED);
}
@ -239,6 +239,15 @@ public:
throw Exception("Method freezePartition is not supported by storage " + getName(), ErrorCodes::NOT_IMPLEMENTED);
}
/** Execute the RESHARD PARTITION query.
*/
virtual void reshardPartitions(const String & database_name, const Field & first_partition, const Field & last_partition,
const WeightedZooKeeperPaths & weighted_zookeeper_paths, const String & sharding_key,
const Settings & settings)
{
throw Exception("Method reshardPartition is not supported by storage " + getName(), ErrorCodes::NOT_IMPLEMENTED);
}
/** Perform some background work. For example, merging parts in a MergeTree-type table.
* Returns whether any work was done.
*/

View File

@ -0,0 +1,87 @@
#pragma once
#include <DB/Interpreters/InterserverIOHandler.h>
#include <DB/Storages/MergeTree/MergeTreeData.h>
#include <DB/IO/ReadBufferFromHTTP.h>
#include <DB/IO/HashingWriteBuffer.h>
#include <DB/IO/copyData.h>
namespace DB
{
class StorageReplicatedMergeTree;
namespace DataPartsExchange
{
/** Service for sending parts of a *MergeTree table.
*/
class Service final : public InterserverIOEndpoint
{
public:
Service(MergeTreeData & data_, StorageReplicatedMergeTree & storage_) : data(data_),
storage(storage_), log(&Logger::get(data.getLogName() + " (Replicated PartsService)")) {}
Service(const Service &) = delete;
Service & operator=(const Service &) = delete;
std::string getId(const std::string & node_id) const override;
void processQuery(const Poco::Net::HTMLForm & params, WriteBuffer & out) override;
private:
MergeTreeData::DataPartPtr findPart(const String & name);
MergeTreeData::DataPartPtr findShardedPart(const String & name, size_t shard_no);
private:
MergeTreeData & data;
StorageReplicatedMergeTree & storage;
Logger * log;
};
/** Client for fetching parts of a *MergeTree table.
*/
class Fetcher final
{
public:
Fetcher(MergeTreeData & data_) : data(data_), log(&Logger::get("Fetcher")) {}
Fetcher(const Fetcher &) = delete;
Fetcher & operator=(const Fetcher &) = delete;
/// Downloads a part into the tmp_ directory. If to_detached is set, downloads into the detached directory.
MergeTreeData::MutableDataPartPtr fetchPart(
const String & part_name,
const String & replica_path,
const String & host,
int port,
bool to_detached = false);
/// Method for resharding. Downloads a sharded part
/// from the given shard into the detached folder.
MergeTreeData::MutableDataPartPtr fetchShardedPart(
const InterserverIOEndpointLocation & location,
const String & part_name,
size_t shard_no);
void cancel() { is_cancelled = true; }
private:
MergeTreeData::MutableDataPartPtr fetchPartImpl(
const String & part_name,
const String & replica_path,
const String & host,
int port,
const String & shard_no,
bool to_detached);
private:
MergeTreeData & data;
/// The data transfer needs to be stopped.
std::atomic<bool> is_cancelled {false};
Logger * log;
};
}
}

View File

@ -90,6 +90,8 @@ namespace ErrorCodes
class MergeTreeData : public ITableDeclaration
{
friend class ReshardingWorker;
public:
/// A function that can be called if there is a suspicion that a part's data is corrupted.
typedef std::function<void (const String &)> BrokenPartCallback;
@ -246,6 +248,10 @@ public:
/// If true, the destructor will remove the directory with the part.
bool is_temp = false;
/// For resharding.
bool is_sharded = false;
size_t shard_no = 0;
/// Primary key. Always loaded into memory.
typedef std::vector<Field> Index;
Index index;
@ -280,13 +286,15 @@ public:
{
try
{
Poco::File dir(storage.full_path + name);
std::string path = storage.full_path + (is_sharded ? ("reshard/" + toString(shard_no) + "/") : "") + name;
Poco::File dir(path);
if (!dir.exists())
return;
if (name.substr(0, strlen("tmp")) != "tmp")
{
LOG_ERROR(storage.log, "~DataPart() should remove part " << storage.full_path + name
LOG_ERROR(storage.log, "~DataPart() should remove part " << path
<< " but its name doesn't start with tmp. Too suspicious, keeping the part.");
return;
}
@ -541,9 +549,10 @@ public:
bool hasColumnFiles(const String & column) const
{
String prefix = storage.full_path + (is_sharded ? ("reshard/" + toString(shard_no) + "/") : "") + name + "/";
String escaped_column = escapeForFileName(column);
return Poco::File(storage.full_path + name + "/" + escaped_column + ".bin").exists() &&
Poco::File(storage.full_path + name + "/" + escaped_column + ".mrk").exists();
return Poco::File(prefix + escaped_column + ".bin").exists() &&
Poco::File(prefix + escaped_column + ".mrk").exists();
}
};
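The prefix logic above places a resharded part under full_path + "reshard/<shard_no>/" while ordinary parts stay directly under full_path. A tiny standalone sketch of the same expression, with illustrative paths:

// Sharded vs. regular part directory, mirroring the prefix built in the destructor and hasColumnFiles.
#include <cstddef>
#include <iostream>
#include <string>

std::string partPath(const std::string & full_path, const std::string & name, bool is_sharded, size_t shard_no)
{
    return full_path + (is_sharded ? ("reshard/" + std::to_string(shard_no) + "/") : "") + name;
}

int main()
{
    // Illustrative data path and part name.
    std::cout << partPath("/var/lib/clickhouse/data/default/hits/", "20160101_20160131_1_10_2", false, 0) << '\n';
    std::cout << partPath("/var/lib/clickhouse/data/default/hits/", "20160101_20160131_1_10_2", true, 3) << '\n';
}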
@ -554,6 +563,9 @@ public:
typedef std::set<DataPartPtr, DataPartPtrLess> DataParts;
typedef std::vector<DataPartPtr> DataPartsVector;
/// For resharding.
using MutableDataParts = std::set<MutableDataPartPtr, DataPartPtrLess>;
using PerShardDataParts = std::unordered_map<size_t, MutableDataParts>;
/// Some operations on the set of parts may return such an object.
/// If neither commit nor rollback was called, the destructor rolls back the operation.
@ -667,7 +679,7 @@ public:
const NamesAndTypesList & materialized_columns_,
const NamesAndTypesList & alias_columns_,
const ColumnDefaults & column_defaults_,
const Context & context_,
Context & context_,
ASTPtr & primary_expr_ast_,
const String & date_column_name_,
const ASTPtr & sampling_expression_, /// nullptr, если семплирование не поддерживается.
@ -752,6 +764,7 @@ public:
/** Returns the part with this name (active or inactive). If there is none, nullptr.
*/
DataPartPtr getPartIfExists(const String & part_name);
DataPartPtr getShardedPartIfExists(const String & part_name, size_t shard_no);
/** Renames a temporary part into a permanent one and adds it to the working set.
* If increment != nullptr, the part index is taken from the increment. Otherwise the part index does not change.
@ -841,6 +854,10 @@ public:
*/
void freezePartition(const std::string & prefix);
/** Returns the size of the given partition in bytes.
*/
size_t getPartitionSize(const std::string & partition_name) const;
size_t getColumnSize(const std::string & name) const
{
Poco::ScopedLock<Poco::FastMutex> lock{data_parts_mutex};
@ -856,11 +873,13 @@ public:
return column_sizes;
}
/// For ATTACH/DETACH/DROP PARTITION.
/// For ATTACH/DETACH/DROP/RESHARD PARTITION.
static String getMonthName(const Field & partition);
static String getMonthName(DayNum_t month);
static DayNum_t getMonthDayNum(const Field & partition);
static DayNum_t getMonthFromName(const String & month_name);
const Context & context;
Context & context;
const String date_column_name;
const ASTPtr sampling_expression;
const size_t index_granularity;
@ -906,6 +925,10 @@ private:
DataParts all_data_parts;
mutable Poco::FastMutex all_data_parts_mutex;
/** For each shard, the set of sharded parts.
*/
PerShardDataParts per_shard_data_parts;
/** An expression that converts column types.
* If there are no type conversions, out_expression=nullptr.
* out_rename_map maps the column files at the expression's output to new table files.

View File

@ -8,6 +8,7 @@ namespace DB
{
class MergeListEntry;
class ReshardingJob;
/** Can select parts for merging and merge them.
@ -39,11 +40,15 @@ public:
bool only_small,
const AllowedMergingPredicate & can_merge);
/** Select all parts belonging to a single partition.
*/
MergeTreeData::DataPartsVector selectAllPartsFromPartition(DayNum_t partition);
/** Merges parts.
* If reservation != nullptr, periodically reduces the size of the reserved space,
* roughly in proportion to the amount of data already written out.
*/
MergeTreeData::DataPartPtr mergeParts(
MergeTreeData::MutableDataPartPtr mergeParts(
const MergeTreeData::DataPartsVector & parts, const String & merged_name, MergeListEntry & merge_entry,
size_t aio_threshold, MergeTreeData::Transaction * out_transaction = nullptr,
DiskSpaceMonitor::Reservation * disk_reservation = nullptr);

View File

@ -0,0 +1,61 @@
#pragma once
#include <DB/IO/WriteBufferFromFile.h>
#include <DB/IO/CompressedWriteBuffer.h>
#include <DB/Columns/ColumnsNumber.h>
#include <DB/Interpreters/sortBlock.h>
#include <DB/Storages/MergeTree/MergeTreeData.h>
#include <DB/Core/Block.h>
namespace DB
{
struct ShardedBlockWithDateInterval final
{
ShardedBlockWithDateInterval(const Block & block_, size_t shard_no_, UInt16 min_date_, UInt16 max_date_);
ShardedBlockWithDateInterval(const ShardedBlockWithDateInterval &) = delete;
ShardedBlockWithDateInterval & operator=(const ShardedBlockWithDateInterval &) = delete;
Block block;
size_t shard_no;
UInt16 min_date;
UInt16 max_date;
};
using ShardedBlocksWithDateIntervals = std::list<ShardedBlockWithDateInterval>;
class ReshardingJob;
/** Creates new sharded parts with data.
*/
class MergeTreeSharder final
{
public:
MergeTreeSharder(MergeTreeData & data_, const ReshardingJob & job_);
MergeTreeSharder(const MergeTreeSharder &) = delete;
MergeTreeSharder & operator=(const MergeTreeSharder &) = delete;
/** Splits a block into blocks by the sharding key, each of which
* must be written to a separate part. Works deterministically: given
* the same input block, it produces the same blocks in the same order.
*/
ShardedBlocksWithDateIntervals shardBlock(const Block & block);
/** All rows must belong to the same month.
* temp_index is the left and right value for the new part. It can be changed later on rename.
* Returns a temporary part with a name starting with tmp_.
*/
MergeTreeData::MutableDataPartPtr writeTempPart(ShardedBlockWithDateInterval & sharded_block_with_dates, Int64 temp_index);
private:
std::vector<IColumn::Filter> createFilters(Block block);
private:
MergeTreeData & data;
const ReshardingJob & job;
Logger * log;
std::vector<size_t> slots;
};
}
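MergeTreeSharder keeps a slots vector; assuming it is used the way weighted sharding normally works in the Distributed engine (each destination shard occupies weight-many slots, and a row's sharding key picks slots[key % slots.size()]), the standalone sketch below shows how the weights from the RESHARD query translate into a deterministic shard choice. The paths and weights are hypothetical.

// Hedged sketch of deterministic, weight-aware shard selection from a slot table.
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

using WeightedZooKeeperPath = std::pair<std::string, uint64_t>;

std::vector<size_t> buildSlots(const std::vector<WeightedZooKeeperPath> & paths)
{
    std::vector<size_t> slots;
    for (size_t shard_no = 0; shard_no < paths.size(); ++shard_no)
        slots.insert(slots.end(), paths[shard_no].second, shard_no);   // weight-many slots per shard
    return slots;
}

size_t shardForKey(uint64_t sharding_key_value, const std::vector<size_t> & slots)
{
    return slots[sharding_key_value % slots.size()];
}

int main()
{
    // Hypothetical destination shards with weights 2 and 1.
    std::vector<WeightedZooKeeperPath> paths{{"/clickhouse/tables/01/hits", 2}, {"/clickhouse/tables/02/hits", 1}};
    auto slots = buildSlots(paths);
    for (uint64_t key : {0, 1, 2, 3, 4, 5})
        std::cout << "key " << key << " -> shard " << shardForKey(key, slots) << '\n';
}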

View File

@ -0,0 +1,45 @@
#pragma once
#include <DB/Core/Types.h>
#include <DB/Interpreters/InterserverIOHandler.h>
#include <DB/IO/WriteBuffer.h>
namespace DB
{
namespace RemoteDiskSpaceMonitor
{
/** Service for obtaining information about free disk space.
*/
class Service final : public InterserverIOEndpoint
{
public:
Service(const std::string & path_);
Service(const Service &) = delete;
Service & operator=(const Service &) = delete;
std::string getId(const std::string & node_id) const override;
void processQuery(const Poco::Net::HTMLForm & params, WriteBuffer & out) override;
private:
const std::string path;
};
/** Client for obtaining information about free space on a remote disk.
*/
class Client final
{
public:
Client() = default;
Client(const Client &) = delete;
Client & operator=(const Client &) = delete;
size_t getFreeDiskSpace(const InterserverIOEndpointLocation & location) const;
void cancel() { is_cancelled = true; }
private:
std::atomic<bool> is_cancelled{false};
};
}
}

View File

@ -0,0 +1,46 @@
#pragma once
#include <DB/Interpreters/InterserverIOHandler.h>
#include <DB/IO/WriteBuffer.h>
namespace DB
{
class Context;
namespace RemoteQueryExecutor
{
/** Service for executing SQL queries.
*/
class Service final : public InterserverIOEndpoint
{
public:
Service(Context & context_);
Service(const Service &) = delete;
Service & operator=(const Service &) = delete;
std::string getId(const std::string & node_id) const override;
void processQuery(const Poco::Net::HTMLForm & params, WriteBuffer & out) override;
private:
Context & context;
};
/** Client for remote execution of SQL queries.
*/
class Client final
{
public:
Client() = default;
Client(const Client &) = delete;
Client & operator=(const Client &) = delete;
bool executeQuery(const InterserverIOEndpointLocation & location, const std::string & query);
void cancel() { is_cancelled = true; }
private:
std::atomic<bool> is_cancelled{false};
};
}
}

View File

@ -1,62 +0,0 @@
#pragma once
#include <DB/Interpreters/InterserverIOHandler.h>
#include <DB/Storages/MergeTree/MergeTreeData.h>
#include <DB/IO/ReadBufferFromHTTP.h>
#include <DB/IO/HashingWriteBuffer.h>
#include <DB/IO/copyData.h>
namespace DB
{
class StorageReplicatedMergeTree;
class ReplicatedMergeTreePartsServer : public InterserverIOEndpoint
{
public:
ReplicatedMergeTreePartsServer(MergeTreeData & data_, StorageReplicatedMergeTree & storage_) : data(data_),
storage(storage_), log(&Logger::get(data.getLogName() + " (Replicated PartsServer)")) {}
void processQuery(const Poco::Net::HTMLForm & params, WriteBuffer & out) override;
private:
MergeTreeData & data;
StorageReplicatedMergeTree & storage;
Logger * log;
MergeTreeData::DataPartPtr findPart(const String & name)
{
MergeTreeData::DataPartPtr part = data.getPartIfExists(name);
if (part)
return part;
throw Exception("No part " + name + " in table");
}
};
class ReplicatedMergeTreePartsFetcher
{
public:
ReplicatedMergeTreePartsFetcher(MergeTreeData & data_) : data(data_), log(&Logger::get("ReplicatedMergeTreePartsFetcher")) {}
/// Downloads a part into the tmp_ directory. If to_detached is set, downloads into the detached directory.
MergeTreeData::MutableDataPartPtr fetchPart(
const String & part_name,
const String & replica_path,
const String & host,
int port,
bool to_detached = false);
void cancel() { is_cancelled = true; }
private:
MergeTreeData & data;
/// The data transfer needs to be stopped.
std::atomic<bool> is_cancelled {false};
Logger * log;
};
}

View File

@ -0,0 +1,35 @@
#pragma once
#include <DB/Storages/AlterCommands.h>
#include <string>
namespace DB
{
/** Description of a resharding job.
*/
struct ReshardingJob final
{
public:
/// Creates a description from its serialized representation.
ReshardingJob(const std::string & serialized_job);
ReshardingJob(const std::string & database_name_, const std::string & table_name_,
const std::string & partition_, const WeightedZooKeeperPaths & paths_,
const std::string & sharding_key_);
ReshardingJob(const ReshardingJob &) = delete;
ReshardingJob & operator=(const ReshardingJob &) = delete;
/// Serializes the job description.
std::string toString() const;
public:
std::string database_name;
std::string table_name;
std::string partition;
WeightedZooKeeperPaths paths;
std::string sharding_key;
};
}

View File

@ -0,0 +1,98 @@
#pragma once
#include <DB/Storages/AlterCommands.h>
#include <common/logger_useful.h>
#include <Poco/SharedPtr.h>
#include <string>
#include <thread>
#include <atomic>
namespace DB
{
class Context;
class StorageReplicatedMergeTree;
class ReshardingJob;
/** Executor of resharding jobs.
* Works in the background inside a single thread.
* Watches for jobs to appear and schedules them for execution.
* Jobs are executed sequentially.
*/
class ReshardingWorker final
{
public:
ReshardingWorker(Context & context_);
ReshardingWorker(const ReshardingWorker &) = delete;
ReshardingWorker & operator=(const ReshardingWorker &) = delete;
~ReshardingWorker();
/// Start the thread that executes resharding jobs.
void start();
/// Submit a resharding request.
void submitJob(const std::string & database_name,
const std::string & table_name,
const std::string & partition,
const WeightedZooKeeperPaths & weighted_zookeeper_paths,
const std::string & sharding_key);
/// Submit a resharding request.
void submitJob(const ReshardingJob & job);
/// Has the thread been started?
bool isStarted() const;
private:
/// Submit a resharding request (internal version).
void submitJobImpl(const std::string & serialized_job);
/// Watch for new jobs to appear. Execute them sequentially.
void pollAndExecute();
/// Execute the jobs that were already in the execution queue when the node started.
void performPendingJobs();
/// Execute the jobs specified by the given paths in ZooKeeper.
void perform(const Strings & job_nodes);
/// Execute a single job.
void perform(const ReshardingJob & job);
/// Split the parts belonging to the partition into several, according to the sharding key.
/// At the same time, regroup these parts by shard and merge the parts within each group.
/// When this process completes, a new partition is created for each shard.
void createShardedPartitions(StorageReplicatedMergeTree & storage, const ReshardingJob & job);
/// Copy all partitions obtained by resharding to every replica
/// of the corresponding shards.
void publishShardedPartitions(StorageReplicatedMergeTree & storage, const ReshardingJob & job);
/// For each shard, add the data from that shard's new partition to the table on all
/// replicas belonging to the same shard. On the local node that executes the resharding
/// job, delete the data from the original partition.
void applyChanges(StorageReplicatedMergeTree & storage, const ReshardingJob & job);
/// Delete temporary data from the local node and from ZooKeeper.
void cleanup(StorageReplicatedMergeTree & storage, const ReshardingJob & job);
/// Forcibly terminate the thread if a stop was requested.
void abortIfRequested() const;
/// Was the thread terminated?
bool hasAborted(const Exception & ex) const;
private:
Context & context;
Logger * log;
std::thread polling_thread;
std::string host_task_queue_path;
std::atomic<bool> is_started{false};
std::atomic<bool> must_stop{false};
};
using ReshardingWorkerPtr = Poco::SharedPtr<ReshardingWorker>;
}
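The worker described above is a single background thread that watches for jobs and runs them strictly one after another. The standalone sketch below models only that lifecycle (start, submit, sequential execution, stop on destruction) with a local in-memory queue; the real ReshardingWorker instead polls a per-host job queue stored in ZooKeeper.

// Standalone model of a single-threaded, sequential background job executor.
#include <atomic>
#include <chrono>
#include <condition_variable>
#include <iostream>
#include <mutex>
#include <queue>
#include <string>
#include <thread>

class Worker
{
public:
    ~Worker() { must_stop = true; cv.notify_all(); if (polling_thread.joinable()) polling_thread.join(); }

    void start() { polling_thread = std::thread([this] { pollAndExecute(); }); }

    void submitJob(const std::string & serialized_job)
    {
        { std::lock_guard<std::mutex> lock(mutex); queue.push(serialized_job); }
        cv.notify_one();
    }

private:
    void pollAndExecute()
    {
        while (!must_stop)
        {
            std::string job;
            {
                std::unique_lock<std::mutex> lock(mutex);
                cv.wait(lock, [this] { return must_stop || !queue.empty(); });
                if (must_stop) break;
                job = queue.front(); queue.pop();
            }
            perform(job);   // jobs run strictly one after another
        }
    }

    void perform(const std::string & job) { std::cout << "resharding: " << job << '\n'; }

    std::thread polling_thread;
    std::mutex mutex;
    std::condition_variable cv;
    std::queue<std::string> queue;
    std::atomic<bool> must_stop{false};
};

int main()
{
    Worker worker;
    worker.start();
    worker.submitJob("db.table, partition 201601, 2 destination shards");   // hypothetical job text
    std::this_thread::sleep_for(std::chrono::milliseconds(100));
}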

View File

@ -0,0 +1,47 @@
#pragma once
#include <DB/Interpreters/InterserverIOHandler.h>
#include <DB/IO/WriteBuffer.h>
namespace DB
{
class StorageReplicatedMergeTree;
namespace ShardedPartitionSender
{
/** Service for receiving parts of a partition of a *MergeTree table.
*/
class Service final : public InterserverIOEndpoint
{
public:
Service(StorageReplicatedMergeTree & storage_);
Service(const Service &) = delete;
Service & operator=(const Service &) = delete;
std::string getId(const std::string & node_id) const override;
void processQuery(const Poco::Net::HTMLForm & params, WriteBuffer & out) override;
private:
StorageReplicatedMergeTree & storage;
};
/** Client for sending parts of a partition of a *MergeTree table.
*/
class Client final
{
public:
Client() = default;
Client(const Client &) = delete;
Client & operator=(const Client &) = delete;
bool send(const InterserverIOEndpointLocation & to_location, const InterserverIOEndpointLocation & from_location,
const std::vector<std::string> & parts, size_t shard_no);
void cancel() { is_cancelled = true; }
private:
std::atomic<bool> is_cancelled{false};
};
}
}

View File

@ -85,7 +85,7 @@ public:
bool supportsParallelReplicas() const override { return true; }
/// The structure of the underlying table is neither checked nor modified.
void alter(const AlterCommands & params, const String & database_name, const String & table_name, Context & context) override;
void alter(const AlterCommands & params, const String & database_name, const String & table_name, const Context & context) override;
private:
String name;

View File

@ -72,10 +72,14 @@ public:
void rename(const String & new_path_to_db, const String & new_database_name, const String & new_table_name) override { name = new_table_name; }
/// columns in the underlying tables have to be added and removed manually
/// the structure of the underlying tables is not checked
void alter(const AlterCommands & params, const String & database_name, const String & table_name, Context & context) override;
void alter(const AlterCommands & params, const String & database_name, const String & table_name, const Context & context) override;
void shutdown() override;
void reshardPartitions(const String & database_name, const Field & first_partition, const Field & last_partition,
const WeightedZooKeeperPaths & weighted_zookeeper_paths, const String & sharding_key,
const Settings & settings) override;
/// Get the description of the corresponding local table from each replica.
BlockInputStreams describe(const Context & context, const Settings & settings);

View File

@ -62,7 +62,7 @@ public:
/// columns in the underlying tables have to be added and removed manually
/// the structure of the underlying tables is not checked
void alter(const AlterCommands & params, const String & database_name, const String & table_name, Context & context) override;
void alter(const AlterCommands & params, const String & database_name, const String & table_name, const Context & context) override;
private:
String name;

View File

@ -97,7 +97,7 @@ public:
void rename(const String & new_path_to_db, const String & new_database_name, const String & new_table_name) override;
void alter(const AlterCommands & params, const String & database_name, const String & table_name, Context & context) override;
void alter(const AlterCommands & params, const String & database_name, const String & table_name, const Context & context) override;
bool supportsIndexForIn() const override { return true; }

View File

@ -7,11 +7,14 @@
#include <DB/Storages/MergeTree/MergeTreeDataSelectExecutor.h>
#include <DB/Storages/MergeTree/ReplicatedMergeTreeLogEntry.h>
#include <DB/Storages/MergeTree/ReplicatedMergeTreeQueue.h>
#include <DB/Storages/MergeTree/ReplicatedMergeTreePartsExchange.h>
#include <DB/Storages/MergeTree/ReplicatedMergeTreeCleanupThread.h>
#include <DB/Storages/MergeTree/ReplicatedMergeTreeRestartingThread.h>
#include <DB/Storages/MergeTree/AbandonableLockInZooKeeper.h>
#include <DB/Storages/MergeTree/BackgroundProcessingPool.h>
#include <DB/Storages/MergeTree/DataPartsExchange.h>
#include <DB/Storages/MergeTree/RemoteDiskSpaceMonitor.h>
#include <DB/Storages/MergeTree/ShardedPartitionSender.h>
#include <DB/Storages/MergeTree/RemoteQueryExecutor.h>
#include <DB/DataTypes/DataTypesNumberFixed.h>
#include <zkutil/ZooKeeper.h>
#include <zkutil/LeaderElection.h>
@ -126,12 +129,15 @@ public:
bool optimize(const Settings & settings) override;
void alter(const AlterCommands & params, const String & database_name, const String & table_name, Context & context) override;
void alter(const AlterCommands & params, const String & database_name, const String & table_name, const Context & context) override;
void dropPartition(ASTPtr query, const Field & partition, bool detach, bool unreplicated, const Settings & settings) override;
void attachPartition(ASTPtr query, const Field & partition, bool unreplicated, bool part, const Settings & settings) override;
void fetchPartition(const Field & partition, const String & from, const Settings & settings) override;
void freezePartition(const Field & partition, const Settings & settings) override;
void reshardPartitions(const String & database_name, const Field & first_partition, const Field & last_partition,
const WeightedZooKeeperPaths & weighted_zookeeper_paths, const String & sharding_key,
const Settings & settings) override;
/** Removes the replica from ZooKeeper. If there are no other replicas, removes the entire table from ZooKeeper.
*/
@ -181,6 +187,11 @@ private:
friend class ReplicatedMergeTreeRestartingThread;
friend class ReplicatedMergeTreeCleanupThread;
friend struct ReplicatedMergeTreeLogEntry;
friend class ScopedPartitionMergeLock;
friend class ReshardingWorker;
friend class ShardedPartitionSender::Client;
friend class ShardedPartitionSender::Service;
using LogEntry = ReplicatedMergeTreeLogEntry;
using LogEntryPtr = LogEntry::Ptr;
@ -236,12 +247,20 @@ private:
bool is_leader_node = false;
InterserverIOEndpointHolderPtr endpoint_holder;
InterserverIOEndpointHolderPtr disk_space_monitor_endpoint_holder;
InterserverIOEndpointHolderPtr sharded_partition_sender_endpoint_holder;
InterserverIOEndpointHolderPtr remote_query_executor_endpoint_holder;
MergeTreeData data;
MergeTreeDataSelectExecutor reader;
MergeTreeDataWriter writer;
MergeTreeDataMerger merger;
ReplicatedMergeTreePartsFetcher fetcher;
DataPartsExchange::Fetcher fetcher;
RemoteDiskSpaceMonitor::Client free_disk_space_checker;
ShardedPartitionSender::Client sharded_partition_sender_client;
RemoteQueryExecutor::Client remote_query_executor_client;
zkutil::LeaderElectionPtr leader_election;
/// For reading data from the unreplicated directory.
@ -423,12 +442,91 @@ private:
/// Throw an exception if the table is readonly.
void assertNotReadonly() const;
/** Acquire a lock that protects the given partition from the merge task.
* The lock is recursive.
*/
std::string acquirePartitionMergeLock(const std::string & partition_name);
/** Declare that we no longer reference the lock corresponding to the given
* partition. If there are no more references, the lock is destroyed.
*/
void releasePartitionMergeLock(const std::string & partition_name);
/// Check whether a node exists in ZK. If it does, remember that fact and from then on immediately answer true.
std::unordered_set<std::string> existing_nodes_cache;
std::mutex existing_nodes_cache_mutex;
bool existsNodeCached(const std::string & path);
/// Resharding.
struct ReplicaSpaceInfo
{
long double factor = 0.0;
size_t available_size = 0;
};
using ReplicaToSpaceInfo = std::map<std::string, ReplicaSpaceInfo>;
struct PartitionMergeLockInfo
{
PartitionMergeLockInfo(const std::string & fake_part_name_)
: fake_part_name(fake_part_name_), ref_count(1)
{
}
std::string fake_part_name;
unsigned int ref_count;
};
using PartitionToMergeLock = std::map<std::string, PartitionMergeLockInfo>;
/** Checks that the structures of the local and replicated tables match.
*/
void enforceShardsConsistency(const WeightedZooKeeperPaths & weighted_zookeeper_paths);
/** Obtain information about free space on the replicas, plus additional information
* for the checkSpaceForResharding function.
*/
ReplicaToSpaceInfo gatherReplicaSpaceInfo(const WeightedZooKeeperPaths & weighted_zookeeper_paths);
/** Checks that there is enough free space locally and on all replicas.
*/
bool checkSpaceForResharding(const ReplicaToSpaceInfo & replica_to_space_info, size_t partition_size) const;
std::mutex mutex_partition_to_merge_lock;
PartitionToMergeLock partition_to_merge_lock;
};
/** A recursive lock that protects the given partition from the merge task.
*/
class ScopedPartitionMergeLock final
{
public:
ScopedPartitionMergeLock(StorageReplicatedMergeTree & storage_, const std::string & partition_name_)
: storage(storage_), partition_name(partition_name_)
{
fake_part_name = storage.acquirePartitionMergeLock(partition_name);
}
ScopedPartitionMergeLock(const ScopedPartitionMergeLock &) = delete;
ScopedPartitionMergeLock & operator=(const ScopedPartitionMergeLock &) = delete;
/// Get the unique name of the lock.
std::string getId() const
{
return fake_part_name;
}
~ScopedPartitionMergeLock()
{
storage.releasePartitionMergeLock(partition_name);
}
private:
StorageReplicatedMergeTree & storage;
const std::string partition_name;
std::string fake_part_name;
};
}
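The ScopedPartitionMergeLock / acquirePartitionMergeLock pair above implements a recursive, reference-counted lock keyed by partition name. A simplified standalone model of that ref-counting and the RAII guard (the fake part name that actually shields the partition from the merge selector is reduced to a placeholder string):

// Ref-counted, named "partition merge lock" with an RAII guard.
#include <cassert>
#include <map>
#include <string>

class PartitionMergeLocks
{
public:
    std::string acquire(const std::string & partition)
    {
        auto it = locks.find(partition);
        if (it != locks.end())
            ++it->second;                       // recursive acquisition: just bump the count
        else
            locks.emplace(partition, 1u);
        return "tmp_fake_" + partition;         // stand-in for the fake part name
    }

    void release(const std::string & partition)
    {
        auto it = locks.find(partition);
        if (it != locks.end() && --it->second == 0)
            locks.erase(it);                    // last reference gone: lock is destroyed
    }

    bool isLocked(const std::string & partition) const { return locks.count(partition) != 0; }

private:
    std::map<std::string, unsigned int> locks;
};

class ScopedPartitionMergeLock
{
public:
    ScopedPartitionMergeLock(PartitionMergeLocks & locks_, const std::string & partition_)
        : locks(locks_), partition(partition_), fake_part_name(locks.acquire(partition)) {}
    ~ScopedPartitionMergeLock() { locks.release(partition); }

    std::string getId() const { return fake_part_name; }

private:
    PartitionMergeLocks & locks;
    const std::string partition;
    std::string fake_part_name;
};

int main()
{
    PartitionMergeLocks locks;
    {
        ScopedPartitionMergeLock outer(locks, "201601");
        ScopedPartitionMergeLock inner(locks, "201601");   // recursive: same partition, ref count 2
        assert(locks.isLocked("201601"));
    }
    assert(!locks.isLocked("201601"));
}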

View File

@ -313,6 +313,16 @@ namespace ErrorCodes
extern const int TOO_MUCH_BYTES = 307;
extern const int UNEXPECTED_NODE_IN_ZOOKEEPER = 308;
extern const int FUNCTION_CANNOT_HAVE_PARAMETERS = 309;
extern const int INCONSISTENT_TABLE_ACCROSS_SHARDS = 310;
extern const int INSUFFICIENT_SPACE_FOR_RESHARDING = 311;
extern const int PARTITION_COPY_FAILED = 312;
extern const int PARTITION_ATTACH_FAILED = 313;
extern const int RESHARDING_CLEANUP_FAILED = 314;
extern const int RESHARDING_NO_WORKER = 315;
extern const int INVALID_PARTITIONS_INTERVAL = 316;
extern const int RESHARDING_INVALID_PARAMETERS = 317;
extern const int INVALID_SHARD_WEIGHT = 318;
extern const int SHARD_DOESNT_REFERENCE_TABLE = 319;
extern const int UNKNOWN_STATUS_OF_INSERT = 320;
extern const int KEEPER_EXCEPTION = 999;

View File

@ -0,0 +1,38 @@
#include <DB/Interpreters/ClusterProxy/AlterQueryConstructor.h>
#include <DB/Interpreters/InterpreterAlterQuery.h>
#include <DB/DataStreams/MaterializingBlockInputStream.h>
#include <DB/DataStreams/BlockExtraInfoInputStream.h>
#include <DB/DataStreams/RemoteBlockInputStream.h>
namespace DB
{
namespace ClusterProxy
{
BlockInputStreamPtr AlterQueryConstructor::createLocal(ASTPtr query_ast, const Context & context, const Cluster::Address & address)
{
InterpreterAlterQuery interpreter(query_ast, context);
return interpreter.execute().in;
}
BlockInputStreamPtr AlterQueryConstructor::createRemote(IConnectionPool * pool, const std::string & query,
const Settings & settings, ThrottlerPtr throttler, const Context & context)
{
return new RemoteBlockInputStream{pool, query, &settings, throttler};
}
BlockInputStreamPtr AlterQueryConstructor::createRemote(ConnectionPoolsPtr & pools, const std::string & query,
const Settings & settings, ThrottlerPtr throttler, const Context & context)
{
return new RemoteBlockInputStream{pools, query, &settings, throttler};
}
bool AlterQueryConstructor::isInclusive() const
{
return false;
}
}
}

View File

@ -0,0 +1,64 @@
#include <DB/Interpreters/ClusterProxy/DescribeQueryConstructor.h>
#include <DB/Interpreters/InterpreterDescribeQuery.h>
#include <DB/DataStreams/MaterializingBlockInputStream.h>
#include <DB/DataStreams/BlockExtraInfoInputStream.h>
#include <DB/DataStreams/RemoteBlockInputStream.h>
namespace DB
{
namespace
{
BlockExtraInfo toBlockExtraInfo(const Cluster::Address & address)
{
BlockExtraInfo block_extra_info;
block_extra_info.host = address.host_name;
block_extra_info.resolved_address = address.resolved_address.toString();
block_extra_info.port = address.port;
block_extra_info.user = address.user;
block_extra_info.is_valid = true;
return block_extra_info;
}
}
namespace ClusterProxy
{
BlockInputStreamPtr DescribeQueryConstructor::createLocal(ASTPtr query_ast, const Context & context, const Cluster::Address & address)
{
InterpreterDescribeQuery interpreter(query_ast, context);
/** Materialization is needed because constants arrive from remote servers already materialized.
* If this is not done, different streams end up with columns of different types (Const and non-Const),
* which is not allowed, since all the code assumes that within a block stream all types are the same.
*/
BlockInputStreamPtr stream = new MaterializingBlockInputStream(interpreter.execute().in);
return new BlockExtraInfoInputStream(stream, toBlockExtraInfo(address));
}
BlockInputStreamPtr DescribeQueryConstructor::createRemote(IConnectionPool * pool, const std::string & query,
const Settings & settings, ThrottlerPtr throttler, const Context & context)
{
auto stream = new RemoteBlockInputStream{pool, query, &settings, throttler};
stream->doBroadcast();
stream->appendExtraInfo();
return stream;
}
BlockInputStreamPtr DescribeQueryConstructor::createRemote(ConnectionPoolsPtr & pools, const std::string & query,
const Settings & settings, ThrottlerPtr throttler, const Context & context)
{
auto stream = new RemoteBlockInputStream{pools, query, &settings, throttler};
stream->doBroadcast();
stream->appendExtraInfo();
return stream;
}
bool DescribeQueryConstructor::isInclusive() const
{
return true;
}
}
}

View File

@ -0,0 +1,130 @@
#include <DB/Interpreters/ClusterProxy/Query.h>
#include <DB/Interpreters/ClusterProxy/IQueryConstructor.h>
#include <DB/Interpreters/Settings.h>
#include <DB/Interpreters/Context.h>
#include <DB/Interpreters/Cluster.h>
#include <DB/Interpreters/IInterpreter.h>
#include <DB/DataStreams/RemoteBlockInputStream.h>
namespace DB
{
namespace ClusterProxy
{
Query::Query(IQueryConstructor & query_constructor_, Cluster & cluster_,
ASTPtr query_ast_, const Context & context_, const Settings & settings_, bool enable_shard_multiplexing_)
: query_constructor(query_constructor_), cluster(cluster_), query_ast(query_ast_),
context(context_), settings(settings_), enable_shard_multiplexing(enable_shard_multiplexing_)
{
}
BlockInputStreams Query::execute()
{
BlockInputStreams res;
const std::string query = queryToString(query_ast);
Settings new_settings = settings;
new_settings.queue_max_wait_ms = Cluster::saturate(new_settings.queue_max_wait_ms, settings.limits.max_execution_time);
/// Makes no sense on remote servers, since the query is usually sent under a different user.
new_settings.max_concurrent_queries_for_user = 0;
/// Network traffic limit, if needed.
ThrottlerPtr throttler;
if (settings.limits.max_network_bandwidth || settings.limits.max_network_bytes)
throttler.reset(new Throttler(
settings.limits.max_network_bandwidth,
settings.limits.max_network_bytes,
"Limit for bytes to send or receive over network exceeded."));
/// Distribute the shards evenly across threads.
size_t remote_count = 0;
if (query_constructor.isInclusive())
{
for (const auto & shard_info : cluster.getShardsInfo())
{
if (shard_info.hasRemoteConnections())
++remote_count;
}
}
else
remote_count = cluster.getRemoteShardCount();
size_t thread_count;
if (!enable_shard_multiplexing)
thread_count = remote_count;
else if (remote_count == 0)
thread_count = 0;
else if (settings.max_distributed_processing_threads == 0)
thread_count = 1;
else
thread_count = std::min(remote_count, static_cast<size_t>(settings.max_distributed_processing_threads));
size_t pools_per_thread = (thread_count > 0) ? (remote_count / thread_count) : 0;
size_t remainder = (thread_count > 0) ? (remote_count % thread_count) : 0;
ConnectionPoolsPtr pools;
bool do_init = true;
/// Loop over the shards.
size_t current_thread = 0;
for (const auto & shard_info : cluster.getShardsInfo())
{
bool create_local_queries = shard_info.isLocal();
bool create_remote_queries = query_constructor.isInclusive() ? shard_info.hasRemoteConnections() : !create_local_queries;
if (create_local_queries)
{
/// Add queries to the local ClickHouse.
DB::Context new_context = context;
new_context.setSettings(new_settings);
for (const auto & address : shard_info.local_addresses)
{
BlockInputStreamPtr stream = query_constructor.createLocal(query_ast, new_context, address);
if (stream)
res.emplace_back(stream);
}
}
if (create_remote_queries)
{
size_t excess = (current_thread < remainder) ? 1 : 0;
size_t actual_pools_per_thread = pools_per_thread + excess;
if (actual_pools_per_thread == 1)
{
res.emplace_back(query_constructor.createRemote(shard_info.pool, query, new_settings, throttler, context));
++current_thread;
}
else
{
if (do_init)
{
pools = new ConnectionPools;
do_init = false;
}
pools->push_back(shard_info.pool);
if (pools->size() == actual_pools_per_thread)
{
res.emplace_back(query_constructor.createRemote(pools, query, new_settings, throttler, context));
do_init = true;
++current_thread;
}
}
}
}
return res;
}
}
}
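Query::execute above distributes remote shards over threads: thread_count is capped by max_distributed_processing_threads, each thread gets pools_per_thread connection pools, and the first remainder threads take one extra. A small worked example of that arithmetic, assuming shard multiplexing is enabled and using illustrative numbers:

// Worked example of the pools-per-thread distribution used in Query::execute.
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

int main()
{
    size_t remote_count = 7;                          // remote shards to query
    size_t max_distributed_processing_threads = 3;    // setting from the query

    size_t thread_count = std::min(remote_count, max_distributed_processing_threads);
    size_t pools_per_thread = remote_count / thread_count;   // 2
    size_t remainder = remote_count % thread_count;          // 1: the first thread takes one extra pool

    std::vector<size_t> pools_for_thread(thread_count);
    for (size_t t = 0; t < thread_count; ++t)
        pools_for_thread[t] = pools_per_thread + (t < remainder ? 1 : 0);

    for (size_t t = 0; t < thread_count; ++t)
        std::cout << "thread " << t << " handles " << pools_for_thread[t] << " shard pool(s)\n";
    // Output: 3, 2, 2 -- all 7 remote shards covered.
}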

View File

@ -0,0 +1,48 @@
#include <DB/Interpreters/ClusterProxy/SelectQueryConstructor.h>
#include <DB/Interpreters/InterpreterSelectQuery.h>
#include <DB/DataStreams/RemoteBlockInputStream.h>
#include <DB/DataStreams/MaterializingBlockInputStream.h>
namespace DB
{
namespace ClusterProxy
{
SelectQueryConstructor::SelectQueryConstructor(const QueryProcessingStage::Enum & processed_stage_,
const Tables & external_tables_)
: processed_stage(processed_stage_), external_tables(external_tables_)
{
}
BlockInputStreamPtr SelectQueryConstructor::createLocal(ASTPtr query_ast, const Context & context, const Cluster::Address & address)
{
InterpreterSelectQuery interpreter(query_ast, context, processed_stage);
/** Materialization is needed because constants arrive from remote servers already materialized.
* If this is not done, different streams end up with columns of different types (Const and non-Const),
* which is not allowed, since all the code assumes that within a block stream all types are the same.
*/
return new MaterializingBlockInputStream(interpreter.execute().in);
}
BlockInputStreamPtr SelectQueryConstructor::createRemote(IConnectionPool * pool, const std::string & query,
const Settings & settings, ThrottlerPtr throttler, const Context & context)
{
return new RemoteBlockInputStream{pool, query, &settings, throttler, external_tables, processed_stage, context};
}
BlockInputStreamPtr SelectQueryConstructor::createRemote(ConnectionPoolsPtr & pools, const std::string & query,
const Settings & settings, ThrottlerPtr throttler, const Context & context)
{
return new RemoteBlockInputStream{pools, query, &settings, throttler, external_tables, processed_stage, context};
}
bool SelectQueryConstructor::isInclusive() const
{
return false;
}
}
}

View File

@ -17,6 +17,7 @@
#include <DB/Storages/IStorage.h>
#include <DB/Storages/MarkCache.h>
#include <DB/Storages/MergeTree/BackgroundProcessingPool.h>
#include <DB/Storages/MergeTree/ReshardingWorker.h>
#include <DB/Storages/MergeTree/MergeList.h>
#include <DB/Storages/MergeTree/MergeTreeSettings.h>
#include <DB/Storages/CompressionMethodSelector.h>
@ -102,6 +103,7 @@ struct ContextShared
ConfigurationPtr users_config; /// Config with the users, profiles and quotas sections.
InterserverIOHandler interserver_io_handler; /// Handler for inter-server data transfer.
BackgroundProcessingPoolPtr background_pool; /// Thread pool for background work performed by tables.
ReshardingWorkerPtr resharding_worker;
Macros macros; /// Substitutions from the config.
std::unique_ptr<Compiler> compiler; /// For dynamic compilation of parts of a query, when needed.
std::unique_ptr<QueryLog> query_log; /// For query logging.
@ -820,6 +822,19 @@ BackgroundProcessingPool & Context::getBackgroundPool()
return *shared->background_pool;
}
ReshardingWorker & Context::getReshardingWorker()
{
Poco::ScopedLock<Poco::Mutex> lock(shared->mutex);
if (!shared->zookeeper)
throw Exception("Resharding background processing requires ZooKeeper", ErrorCodes::LOGICAL_ERROR);
if (!shared->resharding_worker)
shared->resharding_worker = new ReshardingWorker(*this);
return *shared->resharding_worker;
}
void Context::resetCaches() const
{
Poco::ScopedLock<Poco::Mutex> lock(shared->mutex);

View File

@ -6,6 +6,7 @@
#include <DB/Parsers/ASTNameTypePair.h>
#include <DB/Parsers/ASTIdentifier.h>
#include <DB/Parsers/ASTLiteral.h>
#include <DB/Parsers/ASTWeightedZooKeeperPath.h>
#include <DB/Parsers/ParserCreateQuery.h>
#include <DB/IO/copyData.h>
@ -29,7 +30,7 @@ namespace ErrorCodes
}
InterpreterAlterQuery::InterpreterAlterQuery(ASTPtr query_ptr_, Context & context_)
InterpreterAlterQuery::InterpreterAlterQuery(ASTPtr query_ptr_, const Context & context_)
: query_ptr(query_ptr_), context(context_)
{
}
@ -65,6 +66,10 @@ BlockIO InterpreterAlterQuery::execute()
table->freezePartition(command.partition, context.getSettingsRef());
break;
case PartitionCommand::RESHARD_PARTITION:
table->reshardPartitions(database_name, command.partition, command.last_partition, command.weighted_zookeeper_paths, command.sharding_key, context.getSettingsRef());
break;
default:
throw Exception("Bad PartitionCommand::Type: " + toString(command.type), ErrorCodes::ARGUMENT_OUT_OF_BOUND);
}
@ -164,6 +169,31 @@ void InterpreterAlterQuery::parseAlter(
const Field & partition = dynamic_cast<const ASTLiteral &>(*params.partition).value;
out_partition_commands.push_back(PartitionCommand::freezePartition(partition));
}
else if (params.type == ASTAlterQuery::RESHARD_PARTITION)
{
Field first_partition;
if (params.partition)
first_partition = dynamic_cast<const ASTLiteral &>(*params.partition).value;
Field last_partition;
if (params.last_partition)
last_partition = dynamic_cast<const ASTLiteral &>(*params.last_partition).value;
else
last_partition = first_partition;
WeightedZooKeeperPaths weighted_zookeeper_paths;
const ASTs & ast_weighted_zookeeper_paths = typeid_cast<const ASTExpressionList &>(*params.weighted_zookeeper_paths).children;
for (size_t i = 0; i < ast_weighted_zookeeper_paths.size(); ++i)
{
const auto & weighted_zookeeper_path = typeid_cast<const ASTWeightedZooKeeperPath &>(*ast_weighted_zookeeper_paths[i]);
weighted_zookeeper_paths.emplace_back(weighted_zookeeper_path.path, weighted_zookeeper_path.weight);
}
const auto & sharding_key = params.sharding_key;
out_partition_commands.push_back(PartitionCommand::reshardPartitions(first_partition, last_partition, weighted_zookeeper_paths, sharding_key));
}
else
throw Exception("Wrong parameter type in ALTER query", ErrorCodes::LOGICAL_ERROR);
}
@ -176,7 +206,7 @@ void InterpreterAlterQuery::updateMetadata(
const NamesAndTypesList & materialized_columns,
const NamesAndTypesList & alias_columns,
const ColumnDefaults & column_defaults,
Context & context)
const Context & context)
{
String path = context.getPath();

View File

@ -0,0 +1,169 @@
#include <DB/Parsers/ASTAlterQuery.h>
#include <mysqlxx/Manip.h>
namespace DB
{
namespace ErrorCodes
{
extern const int UNEXPECTED_AST_STRUCTURE;
}
ASTAlterQuery::Parameters::Parameters() : type(NO_TYPE) {}
void ASTAlterQuery::Parameters::clone(Parameters & p) const
{
p = *this;
if (col_decl) p.col_decl = col_decl->clone();
if (column) p.column = column->clone();
if (partition) p.partition = partition->clone();
if (last_partition) p.last_partition = last_partition->clone();
if (weighted_zookeeper_paths) p.weighted_zookeeper_paths = weighted_zookeeper_paths->clone();
}
void ASTAlterQuery::addParameters(const Parameters & params)
{
parameters.push_back(params);
if (params.col_decl)
children.push_back(params.col_decl);
if (params.column)
children.push_back(params.column);
if (params.partition)
children.push_back(params.partition);
if (params.last_partition)
children.push_back(params.last_partition);
if (params.weighted_zookeeper_paths)
children.push_back(params.weighted_zookeeper_paths);
}
ASTAlterQuery::ASTAlterQuery(StringRange range_) : IAST(range_)
{
}
/** Get the text that identifies this element. */
String ASTAlterQuery::getID() const
{
return ("AlterQuery_" + database + "_" + table);
}
ASTPtr ASTAlterQuery::clone() const
{
ASTAlterQuery * res = new ASTAlterQuery(*this);
for (ParameterContainer::size_type i = 0; i < parameters.size(); ++i)
parameters[i].clone(res->parameters[i]);
return res;
}
void ASTAlterQuery::formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const
{
frame.need_parens = false;
std::string indent_str = settings.one_line ? "" : std::string(4 * frame.indent, ' ');
settings.ostr << (settings.hilite ? hilite_keyword : "") << indent_str << "ALTER TABLE " << (settings.hilite ? hilite_none : "");
if (!table.empty())
{
if (!database.empty())
{
settings.ostr << indent_str << database;
settings.ostr << ".";
}
settings.ostr << indent_str << table;
}
settings.ostr << settings.nl_or_ws;
for (size_t i = 0; i < parameters.size(); ++i)
{
const ASTAlterQuery::Parameters & p = parameters[i];
if (p.type == ASTAlterQuery::ADD_COLUMN)
{
settings.ostr << (settings.hilite ? hilite_keyword : "") << indent_str << "ADD COLUMN " << (settings.hilite ? hilite_none : "");
p.col_decl->formatImpl(settings, state, frame);
/// AFTER
if (p.column)
{
settings.ostr << (settings.hilite ? hilite_keyword : "") << indent_str << " AFTER " << (settings.hilite ? hilite_none : "");
p.column->formatImpl(settings, state, frame);
}
}
else if (p.type == ASTAlterQuery::DROP_COLUMN)
{
settings.ostr << (settings.hilite ? hilite_keyword : "") << indent_str << "DROP COLUMN " << (settings.hilite ? hilite_none : "");
p.column->formatImpl(settings, state, frame);
}
else if (p.type == ASTAlterQuery::MODIFY_COLUMN)
{
settings.ostr << (settings.hilite ? hilite_keyword : "") << indent_str << "MODIFY COLUMN " << (settings.hilite ? hilite_none : "");
p.col_decl->formatImpl(settings, state, frame);
}
else if (p.type == ASTAlterQuery::DROP_PARTITION)
{
settings.ostr << (settings.hilite ? hilite_keyword : "") << indent_str << (p.detach ? "DETACH" : "DROP") << " PARTITION "
<< (settings.hilite ? hilite_none : "");
p.partition->formatImpl(settings, state, frame);
}
else if (p.type == ASTAlterQuery::ATTACH_PARTITION)
{
settings.ostr << (settings.hilite ? hilite_keyword : "") << indent_str << "ATTACH " << (p.unreplicated ? "UNREPLICATED " : "")
<< (p.part ? "PART " : "PARTITION ") << (settings.hilite ? hilite_none : "");
p.partition->formatImpl(settings, state, frame);
}
else if (p.type == ASTAlterQuery::FETCH_PARTITION)
{
settings.ostr << (settings.hilite ? hilite_keyword : "") << indent_str << "FETCH " << (p.unreplicated ? "UNREPLICATED " : "")
<< "PARTITION " << (settings.hilite ? hilite_none : "");
p.partition->formatImpl(settings, state, frame);
settings.ostr << (settings.hilite ? hilite_keyword : "")
<< " FROM " << (settings.hilite ? hilite_none : "") << mysqlxx::quote << p.from;
}
else if (p.type == ASTAlterQuery::FREEZE_PARTITION)
{
settings.ostr << (settings.hilite ? hilite_keyword : "") << indent_str << "FREEZE PARTITION " << (settings.hilite ? hilite_none : "");
p.partition->formatImpl(settings, state, frame);
}
else if (p.type == ASTAlterQuery::RESHARD_PARTITION)
{
settings.ostr << (settings.hilite ? hilite_keyword : "") << indent_str << "RESHARD ";
if (p.partition)
settings.ostr << "PARTITION ";
settings.ostr << (settings.hilite ? hilite_none : "");
if (p.partition)
p.partition->formatImpl(settings, state, frame);
if (p.partition && p.last_partition)
settings.ostr << "..";
if (p.last_partition)
p.last_partition->formatImpl(settings, state, frame);
std::string ws = p.partition ? " " : "";
settings.ostr << (settings.hilite ? hilite_keyword : "") << ws
<< "TO " << (settings.hilite ? hilite_none : "");
FormatStateStacked frame_with_indent = frame;
++frame_with_indent.indent;
p.weighted_zookeeper_paths->formatImpl(settings, state, frame_with_indent);
settings.ostr << settings.nl_or_ws;
settings.ostr << (settings.hilite ? hilite_keyword : "") << indent_str
<< "USING " << (settings.hilite ? hilite_none : "")
<< p.sharding_key;
}
else
throw Exception("Unexpected type of ALTER", ErrorCodes::UNEXPECTED_AST_STRUCTURE);
std::string comma = (i < (parameters.size() -1) ) ? "," : "";
settings.ostr << (settings.hilite ? hilite_keyword : "") << indent_str << comma << (settings.hilite ? hilite_none : "");
settings.ostr << settings.nl_or_ws;
}
}
}

View File

@ -12,6 +12,7 @@
#include <DB/Parsers/ASTOrderByElement.h>
#include <DB/Parsers/ASTSelectQuery.h>
#include <DB/Parsers/ASTSubquery.h>
#include <DB/Parsers/ASTWeightedZooKeeperPath.h>
#include <DB/Parsers/CommonParsers.h>
#include <DB/Parsers/ExpressionListParsers.h>
@ -791,6 +792,45 @@ bool ParserOrderByElement::parseImpl(Pos & pos, Pos end, ASTPtr & node, Pos & ma
return true;
}
bool ParserWeightedZooKeeperPath::parseImpl(Pos & pos, Pos end, ASTPtr & node, Pos & max_parsed_pos, Expected & expected)
{
ParserString s_weight("WEIGHT", true, true);
ParserStringLiteral path_p;
ParserUnsignedInteger weight_p;
ParserWhiteSpaceOrComments ws;
auto weighted_zookeeper_path = new ASTWeightedZooKeeperPath;
node = weighted_zookeeper_path;
ws.ignore(pos, end);
ASTPtr path_node;
if (!path_p.parse(pos, end, path_node, max_parsed_pos, expected))
return false;
weighted_zookeeper_path->path = typeid_cast<const ASTLiteral &>(*path_node).value.get<const String &>();
ws.ignore(pos, end);
bool is_weight_set = false;
if (s_weight.ignore(pos, end, max_parsed_pos, expected))
{
ws.ignore(pos, end);
ASTPtr weight_node;
if (weight_p.parse(pos, end, weight_node, max_parsed_pos, expected))
{
is_weight_set = true;
weighted_zookeeper_path->weight = typeid_cast<const ASTLiteral &>(*weight_node).value.get<const UInt64 &>();
}
}
if (!is_weight_set)
weighted_zookeeper_path->weight = 1;
return true;
}
}

View File

@ -9,6 +9,7 @@
namespace DB
{
bool ParserAlterQuery::parseImpl(Pos & pos, Pos end, ASTPtr & node, Pos & max_parsed_pos, Expected & expected)
{
Pos begin = pos;
@ -22,7 +23,7 @@ bool ParserAlterQuery::parseImpl(Pos & pos, Pos end, ASTPtr & node, Pos & max_pa
ParserString s_column("COLUMN", true, true);
ParserString s_after("AFTER", true, true);
ParserString s_modify("MODIFY", true, true);
ParserString s_reshard("RESHARD", true, true);
ParserString s_drop("DROP", true, true);
ParserString s_detach("DETACH", true, true);
ParserString s_attach("ATTACH", true, true);
@ -32,12 +33,17 @@ bool ParserAlterQuery::parseImpl(Pos & pos, Pos end, ASTPtr & node, Pos & max_pa
ParserString s_part("PART", true, true);
ParserString s_partition("PARTITION", true, true);
ParserString s_from("FROM", true, true);
ParserString s_to("TO", true, true);
ParserString s_using("USING", true, true);
ParserString s_key("KEY", true, true);
ParserString s_comma(",");
ParserString s_doubledot("..");
ParserIdentifier table_parser;
ParserCompoundIdentifier parser_name;
ParserCompoundColumnDeclaration parser_col_decl;
ParserLiteral parser_literal;
ParserUnsignedInteger parser_uint;
ParserStringLiteral parser_string_literal;
ASTPtr table;
@ -246,6 +252,58 @@ bool ParserAlterQuery::parseImpl(Pos & pos, Pos end, ASTPtr & node, Pos & max_pa
params.type = ASTAlterQuery::MODIFY_COLUMN;
}
else if (s_reshard.ignore(pos, end, max_parsed_pos, expected))
{
ParserList weighted_zookeeper_paths_p(ParserPtr(new ParserWeightedZooKeeperPath), ParserPtr(new ParserString(",")), false);
ParserIdentifier sharding_key_parser;
ws.ignore(pos, end);
if (s_partition.ignore(pos, end, max_parsed_pos, expected))
{
ws.ignore(pos, end);
if (!parser_uint.parse(pos, end, params.partition, max_parsed_pos, expected))
return false;
ws.ignore(pos, end);
if (s_doubledot.ignore(pos, end, max_parsed_pos, expected))
{
ws.ignore(pos, end);
if (!parser_uint.parse(pos, end, params.last_partition, max_parsed_pos, expected))
return false;
}
}
ws.ignore(pos, end);
if (!s_to.ignore(pos, end, max_parsed_pos, expected))
return false;
ws.ignore(pos, end);
if (!weighted_zookeeper_paths_p.parse(pos, end, params.weighted_zookeeper_paths, max_parsed_pos, expected))
return false;
ws.ignore(pos, end);
if (!s_using.ignore(pos, end, max_parsed_pos, expected))
return false;
ws.ignore(pos, end);
ASTPtr ast_sharding_key;
if (!sharding_key_parser.parse(pos, end, ast_sharding_key, max_parsed_pos, expected))
return false;
params.sharding_key = typeid_cast<const ASTIdentifier &>(*ast_sharding_key).name;
ws.ignore(pos, end);
params.type = ASTAlterQuery::RESHARD_PARTITION;
}
else
return false;
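
For reference, a query of the following shape exercises this RESHARD branch end to end; the table, partition range, ZooKeeper paths, weights and sharding key below are illustrative and not taken from the commit.

// Minimal sketch, not part of the commit: the shape of a query this RESHARD branch parses.
#include <iostream>
#include <string>

int main()
{
    const std::string query =
        "ALTER TABLE hits RESHARD PARTITION 201601..201602 "
        "TO '/clickhouse/tables/01/hits' WEIGHT 2, '/clickhouse/tables/02/hits' "
        "USING UserID";

    // The branch above fills params.partition and params.last_partition from the optional
    // "PARTITION x..y" range, params.weighted_zookeeper_paths from the TO list
    // (WEIGHT defaults to 1 when omitted), and params.sharding_key from USING.
    std::cout << query << std::endl;
}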

View File

@ -35,6 +35,8 @@
#include <DB/Storages/System/StorageSystemFunctions.h>
#include <DB/Storages/System/StorageSystemClusters.h>
#include <DB/Storages/System/StorageSystemMetrics.h>
#include <DB/Storages/StorageReplicatedMergeTree.h>
#include <DB/Storages/MergeTree/ReshardingWorker.h>
#include <zkutil/ZooKeeper.h>
@ -318,6 +320,16 @@ int Server::main(const std::vector<std::string> & args)
global_context->setCurrentDatabase(config().getString("default_database", "default"));
if (has_zookeeper)
{
zkutil::ZooKeeperPtr zookeeper = global_context->getZooKeeper();
if (!zookeeper->getTaskQueuePath().empty())
{
auto & resharding_worker = global_context->getReshardingWorker();
resharding_worker.start();
}
}
SCOPE_EXIT(
LOG_DEBUG(log, "Closed all connections.");

View File

@ -1,4 +1,4 @@
#include <DB/Storages/MergeTree/ReplicatedMergeTreePartsExchange.h>
#include <DB/Storages/MergeTree/DataPartsExchange.h>
#include <DB/Storages/StorageReplicatedMergeTree.h>
#include <DB/Common/CurrentMetrics.h>
@ -9,22 +9,52 @@ namespace DB
namespace ErrorCodes
{
extern const int ABORTED;
extern const int BAD_SIZE_OF_FILE_IN_DATA_PART;
}
namespace DataPartsExchange
{
void ReplicatedMergeTreePartsServer::processQuery(const Poco::Net::HTMLForm & params, WriteBuffer & out)
namespace
{
std::string getEndpointId(const std::string & node_id)
{
return "DataPartsExchange:" + node_id;
}
}
std::string Service::getId(const std::string & node_id) const
{
return getEndpointId(node_id);
}
void Service::processQuery(const Poco::Net::HTMLForm & params, WriteBuffer & out)
{
if (is_cancelled)
throw Exception("Transferring part to replica was cancelled", ErrorCodes::ABORTED);
String part_name = params.get("part");
String shard_str = params.get("shard");
bool send_sharded_part = !shard_str.empty();
LOG_TRACE(log, "Sending part " << part_name);
try
{
auto storage_lock = storage.lockStructure(false);
MergeTreeData::DataPartPtr part = findPart(part_name);
MergeTreeData::DataPartPtr part;
if (send_sharded_part)
{
size_t shard_no = std::stoul(shard_str);
part = findShardedPart(part_name, shard_no);
}
else
part = findPart(part_name);
Poco::ScopedReadRWLock part_lock(part->columns_lock);
@ -43,7 +73,13 @@ void ReplicatedMergeTreePartsServer::processQuery(const Poco::Net::HTMLForm & pa
{
String file_name = it.first;
String path = data.getFullPath() + part_name + "/" + file_name;
String path;
if (send_sharded_part)
path = data.getFullPath() + "reshard/" + shard_str + "/" + part_name + "/" + file_name;
else
path = data.getFullPath() + part_name + "/" + file_name;
UInt64 size = Poco::File(path).getSize();
writeStringBinary(it.first, out);
@ -75,17 +111,53 @@ void ReplicatedMergeTreePartsServer::processQuery(const Poco::Net::HTMLForm & pa
}
}
MergeTreeData::MutableDataPartPtr ReplicatedMergeTreePartsFetcher::fetchPart(
MergeTreeData::DataPartPtr Service::findPart(const String & name)
{
MergeTreeData::DataPartPtr part = data.getPartIfExists(name);
if (part)
return part;
throw Exception("No part " + name + " in table");
}
MergeTreeData::DataPartPtr Service::findShardedPart(const String & name, size_t shard_no)
{
MergeTreeData::DataPartPtr part = data.getShardedPartIfExists(name, shard_no);
if (part)
return part;
throw Exception("No part " + name + " in table");
}
MergeTreeData::MutableDataPartPtr Fetcher::fetchPart(
const String & part_name,
const String & replica_path,
const String & host,
int port,
bool to_detached)
{
return fetchPartImpl(part_name, replica_path, host, port, "", to_detached);
}
MergeTreeData::MutableDataPartPtr Fetcher::fetchShardedPart(
const InterserverIOEndpointLocation & location,
const String & part_name,
size_t shard_no)
{
return fetchPartImpl(part_name, location.name, location.host, location.port, toString(shard_no), true);
}
MergeTreeData::MutableDataPartPtr Fetcher::fetchPartImpl(
const String & part_name,
const String & replica_path,
const String & host,
int port,
const String & shard_no,
bool to_detached)
{
ReadBufferFromHTTP::Params params =
{
{"endpoint", "ReplicatedMergeTree:" + replica_path},
{"endpoint", getEndpointId(replica_path)},
{"part", part_name},
{"shard", shard_no},
{"compress", "false"}
};
@ -150,10 +222,12 @@ MergeTreeData::MutableDataPartPtr ReplicatedMergeTreePartsFetcher::fetchPart(
new_data_part->loadColumns(true);
new_data_part->loadChecksums(true);
new_data_part->loadIndex();
new_data_part->is_sharded = false;
new_data_part->checksums.checkEqual(checksums, false);
return new_data_part;
}
}
}
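
To make a sharded fetch concrete, here is a standalone sketch of the request parameters fetchPartImpl() assembles; the replica path, part name and shard number are hypothetical, and a regular fetch simply sends an empty "shard" value.

// Illustrative only: the interserver request parameters built by fetchPartImpl() above.
#include <iostream>
#include <string>
#include <utility>
#include <vector>

int main()
{
    std::vector<std::pair<std::string, std::string>> params =
    {
        {"endpoint", "DataPartsExchange:/clickhouse/tables/01/hits/replicas/r1"},
        {"part", "20160101_20160131_1_2_1"},
        {"shard", "3"},            // empty string for a regular (non-sharded) fetch
        {"compress", "false"}
    };

    for (const auto & kv : params)
        std::cout << kv.first << "=" << kv.second << std::endl;
}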

View File

@ -33,7 +33,7 @@ MergeTreeData::MergeTreeData(
const NamesAndTypesList & materialized_columns_,
const NamesAndTypesList & alias_columns_,
const ColumnDefaults & column_defaults_,
const Context & context_,
Context & context_,
ASTPtr & primary_expr_ast_,
const String & date_column_name_, const ASTPtr & sampling_expression_,
size_t index_granularity_,
@ -1039,6 +1039,19 @@ MergeTreeData::DataPartPtr MergeTreeData::getPartIfExists(const String & part_na
return nullptr;
}
MergeTreeData::DataPartPtr MergeTreeData::getShardedPartIfExists(const String & part_name, size_t shard_no)
{
MutableDataPartPtr tmp_part(new DataPart(*this));
ActiveDataPartSet::parsePartName(part_name, *tmp_part);
const MutableDataParts & sharded_parts = per_shard_data_parts.at(shard_no);
MutableDataParts::const_iterator it = sharded_parts.lower_bound(tmp_part);
if ((it != sharded_parts.end()) && ((*it)->name == part_name))
return *it;
return nullptr;
}
MergeTreeData::MutableDataPartPtr MergeTreeData::loadPartAndFixMetadata(const String & relative_path)
{
MutableDataPartPtr part = std::make_shared<DataPart>(*this);
@ -1339,6 +1352,31 @@ void MergeTreeData::freezePartition(const std::string & prefix)
LOG_DEBUG(log, "Freezed " << parts_processed << " parts");
}
size_t MergeTreeData::getPartitionSize(const std::string & partition_name) const
{
size_t size = 0;
Poco::DirectoryIterator end;
Poco::DirectoryIterator end2;
for (Poco::DirectoryIterator it(full_path); it != end; ++it)
{
const auto filename = it.name();
if (!ActiveDataPartSet::isPartDirectory(filename))
continue;
if (0 != filename.compare(0, partition_name.size(), partition_name))
continue;
const auto part_path = it.path().absolute().toString();
for (Poco::DirectoryIterator it2(part_path); it2 != end2; ++it2)
{
const auto part_file_path = it2.path().absolute().toString();
size += Poco::File(part_file_path).getSize();
}
}
return size;
}
static std::pair<String, DayNum_t> getMonthNameAndDayNum(const Field & partition)
{
@ -1366,9 +1404,26 @@ String MergeTreeData::getMonthName(const Field & partition)
return getMonthNameAndDayNum(partition).first;
}
String MergeTreeData::getMonthName(DayNum_t month)
{
return toString(DateLUT::instance().toNumYYYYMMDD(month) / 100);
}
DayNum_t MergeTreeData::getMonthDayNum(const Field & partition)
{
return getMonthNameAndDayNum(partition).second;
}
DayNum_t MergeTreeData::getMonthFromName(const String & month_name)
{
DayNum_t date = DateLUT::instance().YYYYMMDDToDayNum(parse<UInt32>(month_name + "01"));
/// We cannot simply compare date with zero, because 0 is also a valid DayNum.
if (month_name != toString(DateLUT::instance().toNumYYYYMMDD(date) / 100))
throw Exception("Invalid partition format: " + month_name + " doesn't look like month.",
ErrorCodes::INVALID_PARTITION_NAME);
return date;
}
}

View File

@ -3,6 +3,7 @@
#include <DB/Storages/MergeTree/MergedBlockOutputStream.h>
#include <DB/Storages/MergeTree/DiskSpaceMonitor.h>
#include <DB/Storages/MergeTree/MergeList.h>
#include <DB/Storages/MergeTree/ReshardingJob.h>
#include <DB/DataStreams/ExpressionBlockInputStream.h>
#include <DB/DataStreams/MergingSortedBlockInputStream.h>
#include <DB/DataStreams/CollapsingSortedBlockInputStream.h>
@ -278,18 +279,59 @@ bool MergeTreeDataMerger::selectPartsToMerge(MergeTreeData::DataPartsVector & pa
return found;
}
MergeTreeData::DataPartsVector MergeTreeDataMerger::selectAllPartsFromPartition(DayNum_t partition)
{
MergeTreeData::DataPartsVector parts_from_partition;
MergeTreeData::DataParts data_parts = data.getDataParts();
for (MergeTreeData::DataParts::iterator it = data_parts.cbegin(); it != data_parts.cend(); ++it)
{
const MergeTreeData::DataPartPtr & current_part = *it;
DayNum_t month = current_part->month;
if (month != partition)
continue;
parts_from_partition.push_back(*it);
}
return parts_from_partition;
}
/// The parts must be sorted.
MergeTreeData::DataPartPtr MergeTreeDataMerger::mergeParts(
MergeTreeData::MutableDataPartPtr MergeTreeDataMerger::mergeParts(
const MergeTreeData::DataPartsVector & parts, const String & merged_name, MergeList::Entry & merge_entry,
size_t aio_threshold, MergeTreeData::Transaction * out_transaction,
DiskSpaceMonitor::Reservation * disk_reservation)
{
bool is_sharded = parts[0]->is_sharded;
for (size_t i = 1; i < parts.size(); ++i)
{
if (parts[i]->is_sharded != is_sharded)
throw Exception("Inconsistent set of parts provided for merging", ErrorCodes::LOGICAL_ERROR);
}
size_t shard_no = 0;
if (is_sharded)
{
shard_no = parts[0]->shard_no;
for (size_t i = 1; i < parts.size(); ++i)
{
if (parts[i]->shard_no != shard_no)
throw Exception("Inconsistent set of parts provided for merging", ErrorCodes::LOGICAL_ERROR);
}
}
merge_entry->num_parts = parts.size();
LOG_DEBUG(log, "Merging " << parts.size() << " parts: from " << parts.front()->name << " to " << parts.back()->name << " into " << merged_name);
String merged_dir = data.getFullPath() + merged_name;
String merged_dir;
if (is_sharded)
merged_dir = data.getFullPath() + "reshard/" + toString(shard_no) + merged_name;
else
merged_dir = data.getFullPath() + merged_name;
if (Poco::File(merged_dir).exists())
throw Exception("Directory " + merged_dir + " already exists", ErrorCodes::DIRECTORY_ALREADY_EXISTS);
@ -333,8 +375,14 @@ MergeTreeData::DataPartPtr MergeTreeDataMerger::mergeParts(
{
MarkRanges ranges(1, MarkRange(0, parts[i]->size));
String part_path;
if (is_sharded)
part_path = data.getFullPath() + "reshard/" + toString(shard_no) + "/" + parts[i]->name + '/';
else
part_path = data.getFullPath() + parts[i]->name + '/';
auto input = std::make_unique<MergeTreeBlockInputStream>(
data.getFullPath() + parts[i]->name + '/', DEFAULT_MERGE_BLOCK_SIZE, union_column_names, data,
part_path, DEFAULT_MERGE_BLOCK_SIZE, union_column_names, data,
parts[i], ranges, false, nullptr, "", true, aio_threshold, DBMS_DEFAULT_BUFFER_SIZE, false);
input->setProgressCallback([&merge_entry, rows_total] (const Progress & value)
@ -388,7 +436,12 @@ MergeTreeData::DataPartPtr MergeTreeDataMerger::mergeParts(
throw Exception("Unknown mode of operation for MergeTreeData: " + toString(data.mode), ErrorCodes::LOGICAL_ERROR);
}
const String new_part_tmp_path = data.getFullPath() + "tmp_" + merged_name + "/";
String new_part_tmp_path;
if (is_sharded)
new_part_tmp_path = data.getFullPath() + "reshard/" + toString(shard_no) + "/tmp_" + merged_name + "/";
else
new_part_tmp_path = data.getFullPath() + "tmp_" + merged_name + "/";
auto compression_method = data.context.chooseCompressionMethod(
merge_entry->total_size_bytes_compressed,
@ -430,40 +483,45 @@ MergeTreeData::DataPartPtr MergeTreeDataMerger::mergeParts(
new_data_part->size = to.marksCount();
new_data_part->modification_time = time(0);
new_data_part->size_in_bytes = MergeTreeData::DataPart::calcTotalSize(new_part_tmp_path);
new_data_part->is_sharded = is_sharded;
new_data_part->shard_no = shard_no;
/// Rename the new part, add it to the set, and remove the source parts.
auto replaced_parts = data.renameTempPartAndReplace(new_data_part, nullptr, out_transaction);
if (new_data_part->name != merged_name)
throw Exception("Unexpected part name: " + new_data_part->name + " instead of " + merged_name, ErrorCodes::LOGICAL_ERROR);
/// Check that all the source parts, and only they, were removed.
if (replaced_parts.size() != parts.size())
if (!is_sharded)
{
/** This is normal, although it happens rarely.
* The situation where 0 parts were replaced instead of N can occur, for example, in the following case:
* - we had part A, but not parts B and C;
* - the queue contained the merge A, B -> AB, but it was not done because part B was missing;
* - the queue contained the merge AB, C -> ABC, but it was not done because parts AB and C were missing;
* - we executed the task of downloading part B;
* - we started the merge A, B -> AB, since all its parts had appeared;
* - we decided to download part ABC from another replica, since the merge AB, C -> ABC could not be done;
* - part ABC appeared; when it was added, the old parts A, B, C were removed;
* - the merge AB finished. Part AB was added. But it is an obsolete part. The log will contain the message Obsolete part added,
* and then we end up here.
* The situation where M > N parts were replaced is also normal.
*
* Although this should be prevented by the check in the method ReplicatedMergeTreeQueue::shouldExecuteLogEntry.
*/
LOG_WARNING(log, "Unexpected number of parts removed when adding " << new_data_part->name << ": " << replaced_parts.size()
<< " instead of " << parts.size());
}
else
{
for (size_t i = 0; i < parts.size(); ++i)
if (parts[i]->name != replaced_parts[i]->name)
throw Exception("Unexpected part removed when adding " + new_data_part->name + ": " + replaced_parts[i]->name
+ " instead of " + parts[i]->name, ErrorCodes::LOGICAL_ERROR);
/// Rename the new part, add it to the set, and remove the source parts.
auto replaced_parts = data.renameTempPartAndReplace(new_data_part, nullptr, out_transaction);
if (new_data_part->name != merged_name)
throw Exception("Unexpected part name: " + new_data_part->name + " instead of " + merged_name, ErrorCodes::LOGICAL_ERROR);
/// Check that all the source parts, and only they, were removed.
if (replaced_parts.size() != parts.size())
{
/** This is normal, although it happens rarely.
* The situation where 0 parts were replaced instead of N can occur, for example, in the following case:
* - we had part A, but not parts B and C;
* - the queue contained the merge A, B -> AB, but it was not done because part B was missing;
* - the queue contained the merge AB, C -> ABC, but it was not done because parts AB and C were missing;
* - we executed the task of downloading part B;
* - we started the merge A, B -> AB, since all its parts had appeared;
* - we decided to download part ABC from another replica, since the merge AB, C -> ABC could not be done;
* - part ABC appeared; when it was added, the old parts A, B, C were removed;
* - the merge AB finished. Part AB was added. But it is an obsolete part. The log will contain the message Obsolete part added,
* and then we end up here.
* The situation where M > N parts were replaced is also normal.
*
* Although this should be prevented by the check in the method StorageReplicatedMergeTree::shouldExecuteLogEntry.
*/
LOG_WARNING(log, "Unexpected number of parts removed when adding " << new_data_part->name << ": " << replaced_parts.size()
<< " instead of " << parts.size());
}
else
{
for (size_t i = 0; i < parts.size(); ++i)
if (parts[i]->name != replaced_parts[i]->name)
throw Exception("Unexpected part removed when adding " + new_data_part->name + ": " + replaced_parts[i]->name
+ " instead of " + parts[i]->name, ErrorCodes::LOGICAL_ERROR);
}
}
LOG_TRACE(log, "Merged " << parts.size() << " parts: from " << parts.front()->name << " to " << parts.back()->name);

View File

@ -0,0 +1,229 @@
#include <DB/Storages/MergeTree/MergeTreeSharder.h>
#include <DB/Storages/MergeTree/ReshardingJob.h>
#include <DB/Storages/MergeTree/MergedBlockOutputStream.h>
#include <DB/Common/escapeForFileName.h>
#include <DB/DataTypes/DataTypeArray.h>
#include <DB/IO/HashingWriteBuffer.h>
#include <ctime>
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
extern const int TYPE_MISMATCH;
}
namespace
{
template <typename T>
std::vector<IColumn::Filter> createFiltersImpl(const size_t num_rows, const IColumn * column, size_t num_shards, const std::vector<size_t> & slots)
{
const auto total_weight = slots.size();
std::vector<IColumn::Filter> filters(num_shards);
/** In C++, dividing a negative number with remainder by a positive one gives a negative remainder.
* That does not work for this task. Therefore, we treat signed types as unsigned.
* The result no longer resembles division with remainder, but it is suitable for this task.
*/
using UnsignedT = typename std::make_unsigned<T>::type;
/// const columns contain only one value, therefore we do not need to read it at every iteration
if (column->isConst())
{
const auto data = typeid_cast<const ColumnConst<T> *>(column)->getData();
const auto shard_num = slots[static_cast<UnsignedT>(data) % total_weight];
for (size_t i = 0; i < num_shards; ++i)
filters[i].assign(num_rows, static_cast<UInt8>(shard_num == i));
}
else
{
const auto & data = typeid_cast<const ColumnVector<T> *>(column)->getData();
for (size_t i = 0; i < num_shards; ++i)
{
filters[i].resize(num_rows);
for (size_t j = 0; j < num_rows; ++j)
filters[i][j] = slots[static_cast<UnsignedT>(data[j]) % total_weight] == i;
}
}
return filters;
}
}
ShardedBlockWithDateInterval::ShardedBlockWithDateInterval(const Block & block_,
size_t shard_no_, UInt16 min_date_, UInt16 max_date_)
: block(block_), shard_no(shard_no_), min_date(min_date_), max_date(max_date_)
{
}
MergeTreeSharder::MergeTreeSharder(MergeTreeData & data_, const ReshardingJob & job_)
: data(data_), job(job_), log(&Logger::get(data.getLogName() + " (Sharder)"))
{
for (size_t shard_no = 0; shard_no < job.paths.size(); ++shard_no)
{
const WeightedZooKeeperPath & weighted_path = job.paths[shard_no];
slots.insert(slots.end(), weighted_path.second, shard_no);
}
}
ShardedBlocksWithDateIntervals MergeTreeSharder::shardBlock(const Block & block)
{
ShardedBlocksWithDateIntervals res;
const auto num_cols = block.columns();
/// cache column pointers for later reuse
std::vector<const IColumn*> columns(num_cols);
for (size_t i = 0; i < columns.size(); ++i)
columns[i] = block.getByPosition(i).column;
auto filters = createFilters(block);
const auto num_shards = job.paths.size();
ssize_t size_hint = ((block.rowsInFirstColumn() + num_shards - 1) / num_shards) * 1.1; /// The factor 1.1 is chosen arbitrarily.
for (size_t shard_no = 0; shard_no < num_shards; ++shard_no)
{
auto target_block = block.cloneEmpty();
for (size_t col = 0; col < num_cols; ++col)
target_block.getByPosition(col).column = columns[col]->filter(filters[shard_no], size_hint);
if (target_block.rowsInFirstColumn())
{
/// Extract the date column.
const ColumnUInt16::Container_t & dates =
typeid_cast<const ColumnUInt16 &>(*target_block.getByName(data.date_column_name).column).getData();
/// Minimum and maximum dates.
UInt16 min_date = std::numeric_limits<UInt16>::max();
UInt16 max_date = std::numeric_limits<UInt16>::min();
for (ColumnUInt16::Container_t::const_iterator it = dates.begin(); it != dates.end(); ++it)
{
if (*it < min_date)
min_date = *it;
if (*it > max_date)
max_date = *it;
}
res.emplace_back(target_block, shard_no, min_date, max_date);
}
}
return res;
}
MergeTreeData::MutableDataPartPtr MergeTreeSharder::writeTempPart(
ShardedBlockWithDateInterval & sharded_block_with_dates, Int64 temp_index)
{
Block & block = sharded_block_with_dates.block;
UInt16 min_date = sharded_block_with_dates.min_date;
UInt16 max_date = sharded_block_with_dates.max_date;
size_t shard_no = sharded_block_with_dates.shard_no;
const auto & date_lut = DateLUT::instance();
DayNum_t min_month = date_lut.toFirstDayNumOfMonth(DayNum_t(min_date));
DayNum_t max_month = date_lut.toFirstDayNumOfMonth(DayNum_t(max_date));
if (min_month != max_month)
throw Exception("Logical error: part spans more than one month.", ErrorCodes::LOGICAL_ERROR);
size_t part_size = (block.rows() + data.index_granularity - 1) / data.index_granularity;
String tmp_part_name = "tmp_" + ActiveDataPartSet::getPartName(
DayNum_t(min_date), DayNum_t(max_date),
temp_index, temp_index, 0);
String part_tmp_path = data.getFullPath() + "reshard/" + toString(shard_no) + "/" + tmp_part_name + "/";
Poco::File(part_tmp_path).createDirectories();
MergeTreeData::MutableDataPartPtr new_data_part = std::make_shared<MergeTreeData::DataPart>(data);
new_data_part->name = tmp_part_name;
new_data_part->is_temp = true;
/// If some columns need to be calculated for sorting, do it.
if (data.mode != MergeTreeData::Unsorted)
data.getPrimaryExpression()->execute(block);
SortDescription sort_descr = data.getSortDescription();
/// Sort.
IColumn::Permutation * perm_ptr = nullptr;
IColumn::Permutation perm;
if (data.mode != MergeTreeData::Unsorted)
{
if (!isAlreadySorted(block, sort_descr))
{
stableGetPermutation(block, sort_descr, perm);
perm_ptr = &perm;
}
}
NamesAndTypesList columns = data.getColumnsList().filter(block.getColumnsList().getNames());
MergedBlockOutputStream out(data, part_tmp_path, columns, CompressionMethod::LZ4);
out.getIndex().reserve(part_size * sort_descr.size());
out.writePrefix();
out.writeWithPermutation(block, perm_ptr);
MergeTreeData::DataPart::Checksums checksums = out.writeSuffixAndGetChecksums();
new_data_part->left_date = DayNum_t(min_date);
new_data_part->right_date = DayNum_t(max_date);
new_data_part->left = temp_index;
new_data_part->right = temp_index;
new_data_part->level = 0;
new_data_part->size = part_size;
new_data_part->modification_time = std::time(0);
new_data_part->month = min_month;
new_data_part->columns = columns;
new_data_part->checksums = checksums;
new_data_part->index.swap(out.getIndex());
new_data_part->size_in_bytes = MergeTreeData::DataPart::calcTotalSize(part_tmp_path);
new_data_part->is_sharded = true;
new_data_part->shard_no = sharded_block_with_dates.shard_no;
return new_data_part;
}
std::vector<IColumn::Filter> MergeTreeSharder::createFilters(Block block)
{
using create_filters_sig = std::vector<IColumn::Filter>(size_t, const IColumn *, size_t num_shards, const std::vector<size_t> & slots);
/// hashmap of pointers to functions corresponding to each integral type
static std::unordered_map<std::string, create_filters_sig *> creators{
{ TypeName<UInt8>::get(), &createFiltersImpl<UInt8> },
{ TypeName<UInt16>::get(), &createFiltersImpl<UInt16> },
{ TypeName<UInt32>::get(), &createFiltersImpl<UInt32> },
{ TypeName<UInt64>::get(), &createFiltersImpl<UInt64> },
{ TypeName<Int8>::get(), &createFiltersImpl<Int8> },
{ TypeName<Int16>::get(), &createFiltersImpl<Int16> },
{ TypeName<Int32>::get(), &createFiltersImpl<Int32> },
{ TypeName<Int64>::get(), &createFiltersImpl<Int64> },
};
data.getPrimaryExpression()->execute(block);
const auto & key_column = block.getByName(job.sharding_key);
/// check that key column has valid type
const auto it = creators.find(key_column.type->getName());
return it != std::end(creators)
? (*it->second)(block.rowsInFirstColumn(), key_column.column.get(), job.paths.size(), slots)
: throw Exception{
"Sharding key expression does not evaluate to an integer type",
ErrorCodes::TYPE_MISMATCH
};
}
}
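
The weight-to-slot mapping above can be illustrated with a standalone sketch; the weights and key values are hypothetical.

// Standalone sketch, not part of the commit: how the slots vector built in the
// MergeTreeSharder constructor maps a sharding-key value to a shard.
#include <cstdint>
#include <iostream>
#include <vector>

int main()
{
    // Mirrors the constructor: shard i is repeated weight(i) times.
    std::vector<std::size_t> weights = {2, 1};   // hypothetical WEIGHTs from the TO clause
    std::vector<std::size_t> slots;
    for (std::size_t shard_no = 0; shard_no < weights.size(); ++shard_no)
        slots.insert(slots.end(), weights[shard_no], shard_no);

    // Mirrors createFiltersImpl: signed keys are reinterpreted as unsigned, then taken
    // modulo the total weight to pick a slot, i.e. a shard.
    const std::size_t total_weight = slots.size();
    for (std::int64_t key : {-3, 0, 1, 2, 5})
    {
        std::size_t shard_no = slots[static_cast<std::uint64_t>(key) % total_weight];
        std::cout << "key " << key << " -> shard " << shard_no << std::endl;
    }
}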

View File

@ -0,0 +1,67 @@
#include <DB/Storages/MergeTree/RemoteDiskSpaceMonitor.h>
#include <DB/Storages/MergeTree/DiskSpaceMonitor.h>
#include <DB/IO/ReadBufferFromHTTP.h>
#include <DB/IO/WriteHelpers.h>
#include <DB/IO/ReadHelpers.h>
namespace DB
{
namespace ErrorCodes
{
extern const int ABORTED;
}
namespace RemoteDiskSpaceMonitor
{
namespace
{
std::string getEndpointId(const std::string & node_id)
{
return "RemoteDiskSpaceMonitor:" + node_id;
}
}
Service::Service(const String & path_)
: path(path_)
{
}
std::string Service::getId(const std::string & node_id) const
{
return getEndpointId(node_id);
}
void Service::processQuery(const Poco::Net::HTMLForm & params, WriteBuffer & out)
{
if (is_cancelled)
throw Exception("RemoteDiskSpaceMonitor service terminated", ErrorCodes::ABORTED);
size_t free_space = DiskSpaceMonitor::getUnreservedFreeSpace(path);
writeBinary(free_space, out);
out.next();
}
size_t Client::getFreeDiskSpace(const InterserverIOEndpointLocation & location) const
{
ReadBufferFromHTTP::Params params =
{
{"endpoint", getEndpointId(location.name) },
{"compress", "false"}
};
ReadBufferFromHTTP in(location.host, location.port, params);
size_t free_disk_space;
readBinary(free_disk_space, in);
assertEOF(in);
return free_disk_space;
}
}
}

View File

@ -0,0 +1,80 @@
#include <DB/Storages/MergeTree/RemoteQueryExecutor.h>
#include <DB/Interpreters/executeQuery.h>
#include <DB/IO/ReadBufferFromHTTP.h>
#include <DB/IO/ReadHelpers.h>
#include <DB/IO/WriteHelpers.h>
namespace DB
{
namespace ErrorCodes
{
extern const int ABORTED;
}
namespace RemoteQueryExecutor
{
namespace
{
std::string getEndpointId(const std::string & node_id)
{
return "RemoteQueryExecutor:" + node_id;
}
}
Service::Service(Context & context_)
: context(context_)
{
}
std::string Service::getId(const std::string & node_id) const
{
return getEndpointId(node_id);
}
void Service::processQuery(const Poco::Net::HTMLForm & params, WriteBuffer & out)
{
if (is_cancelled)
throw Exception("RemoteQueryExecutor service terminated", ErrorCodes::ABORTED);
std::string query = params.get("query");
bool flag = true;
try
{
(void) executeQuery(query, context, true);
}
catch (...)
{
flag = false;
}
writeBinary(flag, out);
out.next();
}
bool Client::executeQuery(const InterserverIOEndpointLocation & location, const std::string & query)
{
ReadBufferFromHTTP::Params params =
{
{"endpoint", getEndpointId(location.name)},
{"compress", "false"},
{"query", query}
};
ReadBufferFromHTTP in(location.host, location.port, params);
bool flag;
readBinary(flag, in);
assertEOF(in);
return flag;
}
}
}

View File

@ -161,6 +161,16 @@ void ReplicatedMergeTreeRestartingThread::run()
{
storage.endpoint_holder->cancel();
storage.endpoint_holder = nullptr;
storage.disk_space_monitor_endpoint_holder->cancel();
storage.disk_space_monitor_endpoint_holder = nullptr;
storage.sharded_partition_sender_endpoint_holder->cancel();
storage.sharded_partition_sender_endpoint_holder = nullptr;
storage.remote_query_executor_endpoint_holder->cancel();
storage.remote_query_executor_endpoint_holder = nullptr;
partialShutdown();
}
catch (...)

View File

@ -0,0 +1,60 @@
#include <DB/Storages/MergeTree/ReshardingJob.h>
#include <DB/IO/ReadBufferFromString.h>
#include <DB/IO/ReadHelpers.h>
#include <DB/IO/WriteBufferFromString.h>
#include <DB/IO/WriteHelpers.h>
namespace DB
{
ReshardingJob::ReshardingJob(const std::string & serialized_job)
{
ReadBufferFromString buf(serialized_job);
readBinary(database_name, buf);
readBinary(table_name, buf);
readBinary(partition, buf);
readBinary(sharding_key, buf);
while (!buf.eof())
{
std::string path;
readBinary(path, buf);
UInt64 weight;
readBinary(weight, buf);
paths.emplace_back(path, weight);
}
}
ReshardingJob::ReshardingJob(const std::string & database_name_, const std::string & table_name_,
const std::string & partition_, const WeightedZooKeeperPaths & paths_,
const std::string & sharding_key_)
: database_name(database_name_),
table_name(table_name_),
partition(partition_),
paths(paths_),
sharding_key(sharding_key_)
{
}
std::string ReshardingJob::toString() const
{
std::string serialized_job;
WriteBufferFromString buf(serialized_job);
writeBinary(database_name, buf);
writeBinary(table_name, buf);
writeBinary(partition, buf);
writeBinary(sharding_key, buf);
for (const auto & path : paths)
{
writeBinary(path.first, buf);
writeBinary(path.second, buf);
}
buf.next();
return serialized_job;
}
}
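
A hedged usage sketch of the round trip above, assuming ReshardingJob.h is on the include path and that WeightedZooKeeperPaths is the vector of (path, weight) pairs its use here suggests; all values are illustrative.

// Sketch only: serialize a job with toString() and read it back with the
// deserializing constructor, which consumes (path, weight) pairs until end of buffer.
#include <DB/Storages/MergeTree/ReshardingJob.h>
#include <iostream>

int main()
{
    DB::WeightedZooKeeperPaths paths;                       // assumed: vector of (path, weight)
    paths.emplace_back("/clickhouse/tables/01/hits", 2);    // hypothetical paths and weights
    paths.emplace_back("/clickhouse/tables/02/hits", 1);

    DB::ReshardingJob job("default", "hits", "201601", paths, "UserID");
    DB::ReshardingJob restored(job.toString());

    std::cout << restored.database_name << "." << restored.table_name
        << " partition " << restored.partition
        << " key " << restored.sharding_key << std::endl;
}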

View File

@ -0,0 +1,726 @@
#include <DB/Storages/MergeTree/ReshardingWorker.h>
#include <DB/Storages/MergeTree/ReshardingJob.h>
#include <DB/Storages/MergeTree/MergeTreeData.h>
#include <DB/Storages/MergeTree/MergeList.h>
#include <DB/Storages/MergeTree/ReplicatedMergeTreeAddress.h>
#include <DB/Storages/MergeTree/MergeTreeSharder.h>
#include <DB/Storages/MergeTree/MergeTreeBlockInputStream.h>
#include <DB/Storages/StorageReplicatedMergeTree.h>
#include <DB/IO/ReadBufferFromString.h>
#include <DB/IO/ReadHelpers.h>
#include <DB/IO/WriteBufferFromString.h>
#include <DB/IO/WriteHelpers.h>
#include <DB/Common/getFQDNOrHostName.h>
#include <DB/Interpreters/executeQuery.h>
#include <DB/Interpreters/Context.h>
#include <common/threadpool.hpp>
#include <zkutil/ZooKeeper.h>
#include <Poco/Event.h>
#include <Poco/DirectoryIterator.h>
#include <Poco/File.h>
#include <future>
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
extern const int ABORTED;
extern const int UNEXPECTED_ZOOKEEPER_ERROR;
extern const int PARTITION_COPY_FAILED;
extern const int PARTITION_ATTACH_FAILED;
extern const int RESHARDING_CLEANUP_FAILED;
}
namespace
{
std::string createMergedPartName(const MergeTreeData::DataPartsVector & parts)
{
DayNum_t left_date = DayNum_t(std::numeric_limits<UInt16>::max());
DayNum_t right_date = DayNum_t(std::numeric_limits<UInt16>::min());
UInt32 level = 0;
for (const MergeTreeData::DataPartPtr & part : parts)
{
level = std::max(level, part->level);
left_date = std::min(left_date, part->left_date);
right_date = std::max(right_date, part->right_date);
}
return ActiveDataPartSet::getPartName(left_date, right_date, parts.front()->left, parts.back()->right, level + 1);
}
}
ReshardingWorker::ReshardingWorker(Context & context_)
: context(context_), log(&Logger::get("ReshardingWorker"))
{
auto zookeeper = context.getZooKeeper();
host_task_queue_path = "/clickhouse";
zookeeper->createIfNotExists(host_task_queue_path, "");
host_task_queue_path += "/" + zookeeper->getTaskQueuePath();
zookeeper->createIfNotExists(host_task_queue_path, "");
host_task_queue_path += "/resharding";
zookeeper->createIfNotExists(host_task_queue_path, "");
host_task_queue_path += "/" + getFQDNOrHostName();
zookeeper->createIfNotExists(host_task_queue_path, "");
}
ReshardingWorker::~ReshardingWorker()
{
must_stop = true;
if (polling_thread.joinable())
polling_thread.join();
}
void ReshardingWorker::start()
{
polling_thread = std::thread(&ReshardingWorker::pollAndExecute, this);
}
void ReshardingWorker::submitJob(const std::string & database_name,
const std::string & table_name,
const std::string & partition,
const WeightedZooKeeperPaths & weighted_zookeeper_paths,
const std::string & sharding_key)
{
auto str = ReshardingJob(database_name, table_name, partition, weighted_zookeeper_paths, sharding_key).toString();
submitJobImpl(str);
}
void ReshardingWorker::submitJob(const ReshardingJob & job)
{
auto str = job.toString();
submitJobImpl(str);
}
bool ReshardingWorker::isStarted() const
{
return is_started;
}
void ReshardingWorker::submitJobImpl(const std::string & serialized_job)
{
auto zookeeper = context.getZooKeeper();
(void) zookeeper->create(host_task_queue_path + "/task-", serialized_job,
zkutil::CreateMode::PersistentSequential);
}
void ReshardingWorker::pollAndExecute()
{
try
{
bool old_val = false;
if (!is_started.compare_exchange_strong(old_val, true, std::memory_order_seq_cst,
std::memory_order_relaxed))
throw Exception("Resharding worker thread already started", ErrorCodes::LOGICAL_ERROR);
LOG_DEBUG(log, "Started resharding thread.");
try
{
performPendingJobs();
}
catch (const Exception & ex)
{
if ((ex.code() == ErrorCodes::RESHARDING_CLEANUP_FAILED) || hasAborted(ex))
throw;
else
LOG_INFO(log, ex.message());
}
catch (...)
{
tryLogCurrentException(__PRETTY_FUNCTION__);
}
while (true)
{
try
{
Strings children;
while (true)
{
zkutil::EventPtr event = new Poco::Event;
auto zookeeper = context.getZooKeeper();
children = zookeeper->getChildren(host_task_queue_path, nullptr, event);
if (!children.empty())
break;
do
{
abortIfRequested();
}
while (!event->tryWait(1000));
}
std::sort(children.begin(), children.end());
perform(children);
}
catch (const Exception & ex)
{
if ((ex.code() == ErrorCodes::RESHARDING_CLEANUP_FAILED) || hasAborted(ex))
throw;
else
LOG_INFO(log, ex.message());
}
catch (...)
{
tryLogCurrentException(__PRETTY_FUNCTION__);
}
}
}
catch (const Exception & ex)
{
if (!hasAborted(ex))
throw;
}
LOG_DEBUG(log, "Resharding thread terminated.");
}
void ReshardingWorker::performPendingJobs()
{
auto zookeeper = context.getZooKeeper();
Strings children = zookeeper->getChildren(host_task_queue_path);
std::sort(children.begin(), children.end());
perform(children);
}
void ReshardingWorker::perform(const Strings & job_nodes)
{
auto zookeeper = context.getZooKeeper();
for (const auto & child : job_nodes)
{
std::string child_full_path = host_task_queue_path + "/" + child;
auto job_descriptor = zookeeper->get(child_full_path);
ReshardingJob job(job_descriptor);
zookeeper->remove(child_full_path);
perform(job);
}
}
void ReshardingWorker::perform(const ReshardingJob & job)
{
LOG_DEBUG(log, "Performing resharding job.");
StoragePtr generic_storage = context.getTable(job.database_name, job.table_name);
auto & storage = typeid_cast<StorageReplicatedMergeTree &>(*(generic_storage.get()));
/// Protect the partition being resharded from merge tasks.
ScopedPartitionMergeLock partition_merge_lock(storage, job.partition);
try
{
createShardedPartitions(storage, job);
publishShardedPartitions(storage, job);
applyChanges(storage, job);
}
catch (const Exception & ex)
{
cleanup(storage, job);
if (hasAborted(ex))
{
/// The thread is shutting down. Save the details of the interrupted job.
submitJob(job);
LOG_DEBUG(log, "Resharding job cancelled then re-submitted for later processing.");
}
throw;
}
catch (...)
{
cleanup(storage, job);
throw;
}
cleanup(storage, job);
LOG_DEBUG(log, "Resharding job successfully completed.");
}
void ReshardingWorker::createShardedPartitions(StorageReplicatedMergeTree & storage, const ReshardingJob & job)
{
abortIfRequested();
LOG_DEBUG(log, "Splitting partition shard-wise.");
/// Parts of a single shard that must be merged.
struct PartsToBeMerged
{
void add(MergeTreeData::MutableDataPartPtr & part)
{
parts.insert(part);
total_size += part->size_in_bytes;
}
void clear()
{
parts.clear();
total_size = 0;
}
MergeTreeData::MutableDataParts parts;
size_t total_size = 0;
};
/// For each shard, the parts that must be merged.
std::unordered_map<size_t, PartsToBeMerged> to_merge;
MergeTreeData::PerShardDataParts & per_shard_data_parts = storage.data.per_shard_data_parts;
auto zookeeper = storage.getZooKeeper();
const auto & settings = context.getSettingsRef();
(void) settings;
DayNum_t month = MergeTreeData::getMonthFromName(job.partition);
auto parts_from_partition = storage.merger.selectAllPartsFromPartition(month);
for (const auto & part : parts_from_partition)
{
MarkRanges ranges(1, MarkRange(0, part->size));
MergeTreeBlockInputStream source(
storage.data.getFullPath() + part->name + '/',
DEFAULT_MERGE_BLOCK_SIZE,
part->columns.getNames(),
storage.data,
part,
ranges,
false,
nullptr,
"",
true,
settings.min_bytes_to_use_direct_io,
DBMS_DEFAULT_BUFFER_SIZE,
true);
MergeTreeSharder sharder(storage.data, job);
Block block;
while (block = source.read())
{
/// Split the block into several, according to the sharding key.
ShardedBlocksWithDateIntervals blocks = sharder.shardBlock(block);
for (ShardedBlockWithDateInterval & block_with_dates : blocks)
{
abortIfRequested();
/// Create a new part corresponding to the new block.
std::string month_name = toString(DateLUT::instance().toNumYYYYMMDD(DayNum_t(block_with_dates.min_date)) / 100);
AbandonableLockInZooKeeper block_number_lock = storage.allocateBlockNumber(month_name);
Int64 part_number = block_number_lock.getNumber();
MergeTreeData::MutableDataPartPtr block_part = sharder.writeTempPart(block_with_dates, part_number);
/// Add information about the new block to ZooKeeper.
SipHash hash;
block_part->checksums.summaryDataChecksum(hash);
union
{
char bytes[16];
struct
{
UInt64 lo;
UInt64 hi;
};
} hash_value;
hash.get128(hash_value.bytes);
std::string checksum(hash_value.bytes, 16);
std::string block_id = toString(hash_value.lo) + "_" + toString(hash_value.hi);
zkutil::Ops ops;
auto acl = zookeeper->getDefaultACL();
std::string to_path = job.paths[block_with_dates.shard_no].first;
ops.push_back(
new zkutil::Op::Create(
to_path + "/detached_sharded_blocks/" + block_id,
"",
acl,
zkutil::CreateMode::Persistent));
ops.push_back(
new zkutil::Op::Create(
to_path + "/detached_sharded_blocks/" + block_id + "/checksum",
checksum,
acl,
zkutil::CreateMode::Persistent));
ops.push_back(
new zkutil::Op::Create(
to_path + "/detached_sharded_blocks/" + block_id + "/number",
toString(part_number),
acl,
zkutil::CreateMode::Persistent));
block_number_lock.getUnlockOps(ops);
auto code = zookeeper->tryMulti(ops);
if (code != ZOK)
throw Exception("Unexpected error while adding block " + toString(part_number)
+ " with ID " + block_id + ": " + zkutil::ZooKeeper::error2string(code),
ErrorCodes::UNEXPECTED_ZOOKEEPER_ERROR);
abortIfRequested();
/// Add the new part to the list of parts of the corresponding shard that must be merged.
/// If inserting this part would make the total size of the parts exceed a certain limit,
/// first merge all the accumulated parts, then move the result into the list of parts
/// of the new partition.
PartsToBeMerged & parts_to_be_merged = to_merge[block_with_dates.shard_no];
if ((parts_to_be_merged.total_size + block_part->size_in_bytes) > storage.data.settings.max_bytes_to_merge_parts)
{
MergeTreeData::MutableDataParts & sharded_parts = per_shard_data_parts[block_with_dates.shard_no];
if (parts_to_be_merged.parts.size() >= 2)
{
MergeTreeData::DataPartsVector parts(parts_to_be_merged.parts.begin(), parts_to_be_merged.parts.end());
std::string merged_name = createMergedPartName(parts);
const auto & merge_entry = storage.data.context.getMergeList().insert(job.database_name,
job.table_name, merged_name);
MergeTreeData::MutableDataPartPtr new_part = storage.merger.mergeParts(parts, merged_name, *merge_entry,
storage.data.context.getSettings().min_bytes_to_use_direct_io);
sharded_parts.insert(new_part);
}
else
sharded_parts.insert(block_part);
/// Remove the source parts.
parts_to_be_merged.clear();
}
parts_to_be_merged.add(block_part);
}
}
/// Process all remaining parts.
for (auto & entry : to_merge)
{
abortIfRequested();
size_t shard_no = entry.first;
PartsToBeMerged & parts_to_be_merged = entry.second;
MergeTreeData::MutableDataParts & sharded_parts = per_shard_data_parts[shard_no];
if (parts_to_be_merged.parts.size() >= 2)
{
MergeTreeData::DataPartsVector parts(parts_to_be_merged.parts.begin(), parts_to_be_merged.parts.end());
std::string merged_name = createMergedPartName(parts);
const auto & merge_entry = storage.data.context.getMergeList().insert(job.database_name,
job.table_name, merged_name);
MergeTreeData::MutableDataPartPtr new_part = storage.merger.mergeParts(parts, merged_name, *merge_entry,
storage.data.context.getSettings().min_bytes_to_use_direct_io);
sharded_parts.insert(new_part);
}
else
{
auto single_part = *(parts_to_be_merged.parts.begin());
sharded_parts.insert(single_part);
}
/// Remove the source parts.
parts_to_be_merged.clear();
}
}
/// Until now, all parts of the new partitions have been temporary.
for (auto & entry : per_shard_data_parts)
{
size_t shard_no = entry.first;
MergeTreeData::MutableDataParts & sharded_parts = entry.second;
for (auto & sharded_part : sharded_parts)
{
sharded_part->is_temp = false;
std::string prefix = storage.full_path + "reshard/" + toString(shard_no) + "/";
std::string old_name = sharded_part->name;
std::string new_name = ActiveDataPartSet::getPartName(sharded_part->left_date,
sharded_part->right_date, sharded_part->left, sharded_part->right, sharded_part->level);
sharded_part->name = new_name;
Poco::File(prefix + old_name).renameTo(prefix + new_name);
}
}
}
void ReshardingWorker::publishShardedPartitions(StorageReplicatedMergeTree & storage, const ReshardingJob & job)
{
abortIfRequested();
LOG_DEBUG(log, "Sending newly created partitions to their respective shards.");
auto zookeeper = storage.getZooKeeper();
MergeTreeData::PerShardDataParts & per_shard_data_parts = storage.data.per_shard_data_parts;
struct TaskInfo
{
TaskInfo(const std::string & replica_path_,
const std::vector<std::string> & parts_,
const ReplicatedMergeTreeAddress & dest_,
size_t shard_no_)
: replica_path(replica_path_), dest(dest_), parts(parts_),
shard_no(shard_no_)
{
}
std::string replica_path;
ReplicatedMergeTreeAddress dest;
std::vector<std::string> parts;
size_t shard_no;
};
using TaskInfoList = std::vector<TaskInfo>;
TaskInfoList task_info_list;
/// Copy the new partitions to the replicas of the corresponding shards.
/// Number of participating local replicas. Must be <= 1.
size_t local_count = 0;
for (size_t shard_no = 0; shard_no < job.paths.size(); ++shard_no)
{
const WeightedZooKeeperPath & weighted_path = job.paths[shard_no];
const std::string & zookeeper_path = weighted_path.first;
std::vector<std::string> part_names;
const MergeTreeData::MutableDataParts & sharded_parts = per_shard_data_parts.at(shard_no);
for (const MergeTreeData::DataPartPtr & sharded_part : sharded_parts)
part_names.push_back(sharded_part->name);
auto children = zookeeper->getChildren(zookeeper_path + "/replicas");
for (const auto & child : children)
{
const std::string replica_path = zookeeper_path + "/replicas/" + child;
auto host = zookeeper->get(replica_path + "/host");
ReplicatedMergeTreeAddress host_desc(host);
task_info_list.emplace_back(replica_path, part_names, host_desc, shard_no);
if (replica_path == storage.replica_path)
{
++local_count;
if (local_count > 1)
throw Exception("Detected more than one local replica", ErrorCodes::LOGICAL_ERROR);
std::swap(task_info_list[0], task_info_list[task_info_list.size() - 1]);
}
}
}
abortIfRequested();
size_t remote_count = task_info_list.size() - local_count;
boost::threadpool::pool pool(remote_count);
using Tasks = std::vector<std::packaged_task<bool()> >;
Tasks tasks(remote_count);
ReplicatedMergeTreeAddress local_address(zookeeper->get(storage.replica_path + "/host"));
InterserverIOEndpointLocation from_location(storage.replica_path, local_address.host, local_address.replication_port);
try
{
for (size_t i = local_count; i < task_info_list.size(); ++i)
{
const TaskInfo & entry = task_info_list[i];
const auto & replica_path = entry.replica_path;
const auto & dest = entry.dest;
const auto & parts = entry.parts;
size_t shard_no = entry.shard_no;
InterserverIOEndpointLocation to_location(replica_path, dest.host, dest.replication_port);
size_t j = i - local_count;
tasks[j] = Tasks::value_type(std::bind(&ShardedPartitionSender::Client::send,
&storage.sharded_partition_sender_client, to_location, from_location, parts, shard_no));
pool.schedule([j, &tasks]{ tasks[j](); });
}
}
catch (...)
{
pool.wait();
throw;
}
pool.wait();
for (auto & task : tasks)
{
bool res = task.get_future().get();
if (!res)
throw Exception("Failed to copy partition", ErrorCodes::PARTITION_COPY_FAILED);
}
abortIfRequested();
if (local_count == 1)
{
/// On the local replica, simply move the sharded partition to the detached/ directory.
const TaskInfo & entry = task_info_list[0];
const auto & parts = entry.parts;
size_t shard_no = entry.shard_no;
for (const auto & part : parts)
{
std::string from_path = storage.full_path + "reshard/" + toString(shard_no) + "/" + part + "/";
std::string to_path = storage.full_path + "detached/";
Poco::File(from_path).moveTo(to_path);
}
}
}
void ReshardingWorker::applyChanges(StorageReplicatedMergeTree & storage, const ReshardingJob & job)
{
abortIfRequested();
LOG_DEBUG(log, "Attaching new partitions.");
auto zookeeper = storage.getZooKeeper();
/// On the local node, drop the original partition.
std::string query_str = "ALTER TABLE " + job.database_name + "." + job.table_name + " DROP PARTITION " + job.partition;
(void) executeQuery(query_str, context, true);
/// On all participating replicas, attach the corresponding sharded partitions to the table.
struct TaskInfo
{
TaskInfo(const std::string & replica_path_, const ReplicatedMergeTreeAddress & dest_)
: replica_path(replica_path_), dest(dest_)
{
}
std::string replica_path;
ReplicatedMergeTreeAddress dest;
};
using TaskInfoList = std::vector<TaskInfo>;
TaskInfoList task_info_list;
for (size_t i = 0; i < job.paths.size(); ++i)
{
const WeightedZooKeeperPath & weighted_path = job.paths[i];
const std::string & zookeeper_path = weighted_path.first;
auto children = zookeeper->getChildren(zookeeper_path + "/replicas");
for (const auto & child : children)
{
const std::string replica_path = zookeeper_path + "/replicas/" + child;
auto host = zookeeper->get(replica_path + "/host");
ReplicatedMergeTreeAddress host_desc(host);
task_info_list.emplace_back(replica_path, host_desc);
}
}
boost::threadpool::pool pool(task_info_list.size());
using Tasks = std::vector<std::packaged_task<bool()> >;
Tasks tasks(task_info_list.size());
try
{
for (size_t i = 0; i < task_info_list.size(); ++i)
{
const auto & entry = task_info_list[i];
const auto & replica_path = entry.replica_path;
const auto & dest = entry.dest;
InterserverIOEndpointLocation location(replica_path, dest.host, dest.replication_port);
std::string query_str = "ALTER TABLE " + dest.database + "." + dest.table + " ATTACH PARTITION " + job.partition;
tasks[i] = Tasks::value_type(std::bind(&RemoteQueryExecutor::Client::executeQuery,
&storage.remote_query_executor_client, location, query_str));
pool.schedule([i, &tasks]{ tasks[i](); });
}
}
catch (...)
{
pool.wait();
throw;
}
pool.wait();
for (auto & task : tasks)
{
bool res = task.get_future().get();
if (!res)
throw Exception("Failed to attach partition on replica", ErrorCodes::PARTITION_ATTACH_FAILED);
}
}
void ReshardingWorker::cleanup(StorageReplicatedMergeTree & storage, const ReshardingJob & job)
{
LOG_DEBUG(log, "Performing cleanup.");
try
{
storage.data.per_shard_data_parts.clear();
Poco::DirectoryIterator end;
for (Poco::DirectoryIterator it(storage.full_path + "/reshard"); it != end; ++it)
{
auto absolute_path = it.path().absolute().toString();
Poco::File(absolute_path).remove(true);
}
auto zookeeper = storage.getZooKeeper();
zkutil::Ops ops;
for (size_t i = 0; i < job.paths.size(); ++i)
{
const WeightedZooKeeperPath & weighted_path = job.paths[i];
const std::string & zookeeper_path = weighted_path.first;
auto children = zookeeper->getChildren(zookeeper_path + "/detached_sharded_blocks");
if (!children.empty())
{
for (const auto & child : children)
{
ops.push_back(
new zkutil::Op::Remove(
zookeeper_path + "/detached_sharded_blocks/" + child + "/number", -1));
ops.push_back(
new zkutil::Op::Remove(
zookeeper_path + "/detached_sharded_blocks/" + child + "/checksum", -1));
ops.push_back(
new zkutil::Op::Remove(
zookeeper_path + "/detached_sharded_blocks/" + child, -1));
}
}
}
zookeeper->multi(ops);
}
catch (...)
{
throw Exception("Failed to perform cleanup during resharding operation",
ErrorCodes::RESHARDING_CLEANUP_FAILED);
}
}
void ReshardingWorker::abortIfRequested() const
{
if (must_stop)
throw Exception("Cancelled resharding", ErrorCodes::ABORTED);
}
bool ReshardingWorker::hasAborted(const Exception & ex) const
{
return must_stop && (ex.code() == ErrorCodes::ABORTED);
}
}
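
A rough sketch of the per-host task queue path the constructor above assembles; the task queue path and host name below are placeholders for the configured value and getFQDNOrHostName().

// Illustrative only: the ZooKeeper node under which resharding tasks are queued per host.
#include <iostream>
#include <string>

int main()
{
    const std::string task_queue_path = "task_queue";   // placeholder for zookeeper->getTaskQueuePath()
    const std::string host = "host1.example.com";       // placeholder for getFQDNOrHostName()

    const std::string host_task_queue_path =
        "/clickhouse/" + task_queue_path + "/resharding/" + host;

    // submitJobImpl() creates persistent sequential children under this node
    // (e.g. .../task-0000000000), each holding one serialized ReshardingJob.
    std::cout << host_task_queue_path << std::endl;
}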

View File

@ -0,0 +1,116 @@
#include <DB/Storages/MergeTree/ShardedPartitionSender.h>
#include <DB/Storages/MergeTree/ReplicatedMergeTreeAddress.h>
#include <DB/Storages/StorageReplicatedMergeTree.h>
#include <DB/IO/ReadBufferFromHTTP.h>
#include <DB/IO/ReadHelpers.h>
#include <DB/IO/WriteHelpers.h>
#include <boost/algorithm/string.hpp>
namespace DB
{
namespace ErrorCodes
{
extern const int ABORTED;
}
namespace
{
std::string glue(const std::vector<std::string> & names, char delim)
{
std::string res;
bool is_first = true;
for (const auto & name : names)
{
if (is_first)
is_first = false;
else
res.append(1, delim);
res.append(name);
}
return res;
}
}
namespace ShardedPartitionSender
{
namespace
{
std::string getEndpointId(const std::string & node_id)
{
return "ShardedPartitionSender:" + node_id;
}
}
Service::Service(StorageReplicatedMergeTree & storage_)
: storage(storage_)
{
}
std::string Service::getId(const std::string & node_id) const
{
return getEndpointId(node_id);
}
void Service::processQuery(const Poco::Net::HTMLForm & params, WriteBuffer & out)
{
if (is_cancelled)
throw Exception("ShardedPartitionSender service terminated", ErrorCodes::ABORTED);
InterserverIOEndpointLocation from_location(params.get("from_location"));
std::string glued_parts = params.get("parts");
size_t shard_no = std::stoul(params.get("shard"));
std::vector<std::string> parts;
boost::split(parts, glued_parts, boost::is_any_of(","));
for (const auto & part_name : parts)
{
if (is_cancelled)
throw Exception("ShardedPartitionSender service terminated", ErrorCodes::ABORTED);
MergeTreeData::MutableDataPartPtr part = storage.fetcher.fetchShardedPart(from_location, part_name, shard_no);
part->is_temp = false;
const std::string new_name = "detached/" + part_name;
Poco::File(storage.full_path + part->name).renameTo(storage.full_path + new_name);
}
bool flag = true;
writeBinary(flag, out);
out.next();
}
bool Client::send(const InterserverIOEndpointLocation & to_location, const InterserverIOEndpointLocation & from_location,
const std::vector<std::string> & parts, size_t shard_no)
{
std::string glued_parts = glue(parts, ',');
ReadBufferFromHTTP::Params params =
{
{"endpoint", getEndpointId(to_location.name)},
{"from_location", from_location.toString()},
{"compress", "false"},
{"parts", glued_parts},
{"shard", toString(shard_no)}
};
ReadBufferFromHTTP in(to_location.host, to_location.port, params);
bool flag;
readBinary(flag, in);
assertEOF(in);
return flag;
}
}
}
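The sender and receiver above agree on a flat parameter encoding: part names are comma-joined by glue() on the client and split again with boost::split in Service::processQuery. A minimal standalone sketch of that round trip, using only the standard library (helper names here are illustrative, not part of the patch):

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Join part names with a delimiter, mirroring glue() above.
static std::string join_parts(const std::vector<std::string> & names, char delim)
{
    std::string res;
    for (size_t i = 0; i < names.size(); ++i)
    {
        if (i != 0)
            res += delim;
        res += names[i];
    }
    return res;
}

// Split them back, mirroring what boost::split does in Service::processQuery.
static std::vector<std::string> split_parts(const std::string & glued, char delim)
{
    std::vector<std::string> res;
    std::istringstream stream(glued);
    std::string item;
    while (std::getline(stream, item, delim))
        res.push_back(item);
    return res;
}

int main()
{
    std::vector<std::string> parts{"20160101_20160131_0_5_1", "20160201_20160229_0_3_1"};
    std::string glued = join_parts(parts, ',');
    std::cout << glued << "\n";
    for (const auto & name : split_parts(glued, ','))
        std::cout << name << "\n";
    return 0;
}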

View File

@ -490,7 +490,7 @@ void StorageBuffer::flushThread()
}
void StorageBuffer::alter(const AlterCommands & params, const String & database_name, const String & table_name, Context & context)
void StorageBuffer::alter(const AlterCommands & params, const String & database_name, const String & table_name, const Context & context)
{
auto lock = lockStructureForAlter();

View File

@ -1,6 +1,7 @@
#include <DB/DataStreams/RemoteBlockInputStream.h>
#include <DB/DataStreams/MaterializingBlockInputStream.h>
#include <DB/DataStreams/BlockExtraInfoInputStream.h>
#include <DB/DataStreams/UnionBlockInputStream.h>
#include <DB/Storages/StorageDistributed.h>
#include <DB/Storages/VirtualColumnFactory.h>
@ -11,11 +12,19 @@
#include <DB/Parsers/ASTSelectQuery.h>
#include <DB/Parsers/ASTIdentifier.h>
#include <DB/Parsers/TablePropertiesQueriesASTs.h>
#include <DB/Parsers/ParserAlterQuery.h>
#include <DB/Parsers/parseQuery.h>
#include <DB/Parsers/ASTWeightedZooKeeperPath.h>
#include <DB/Parsers/ASTLiteral.h>
#include <DB/Interpreters/InterpreterSelectQuery.h>
#include <DB/Interpreters/InterpreterAlterQuery.h>
#include <DB/Interpreters/InterpreterDescribeQuery.h>
#include <DB/Interpreters/ExpressionAnalyzer.h>
#include <DB/Interpreters/ClusterProxy/Query.h>
#include <DB/Interpreters/ClusterProxy/SelectQueryConstructor.h>
#include <DB/Interpreters/ClusterProxy/DescribeQueryConstructor.h>
#include <DB/Interpreters/ClusterProxy/AlterQueryConstructor.h>
#include <DB/Core/Field.h>
@ -59,17 +68,6 @@ namespace
return modified_query_ast;
}
BlockExtraInfo toBlockExtraInfo(const Cluster::Address & address)
{
BlockExtraInfo block_extra_info;
block_extra_info.host = address.host_name;
block_extra_info.resolved_address = address.resolved_address.toString();
block_extra_info.port = address.port;
block_extra_info.user = address.user;
block_extra_info.is_valid = true;
return block_extra_info;
}
}
@ -170,39 +168,20 @@ BlockInputStreams StorageDistributed::read(
const size_t max_block_size,
const unsigned threads)
{
Settings new_settings = settings;
new_settings.queue_max_wait_ms = Cluster::saturate(new_settings.queue_max_wait_ms, settings.limits.max_execution_time);
/// Makes no sense on remote servers, since the query is usually sent under a different user.
new_settings.max_concurrent_queries_for_user = 0;
size_t result_size = (cluster.getRemoteShardCount() * settings.max_parallel_replicas) + cluster.getLocalShardCount();
processed_stage = result_size == 1 || settings.distributed_group_by_no_merge
? QueryProcessingStage::Complete
: QueryProcessingStage::WithMergeableState;
BlockInputStreams res;
const auto & modified_query_ast = rewriteSelectQuery(
query, remote_database, remote_table);
const auto & modified_query = queryToString(modified_query_ast);
/// Network traffic throttling, if needed.
ThrottlerPtr throttler;
if (settings.limits.max_network_bandwidth || settings.limits.max_network_bytes)
throttler.reset(new Throttler(
settings.limits.max_network_bandwidth,
settings.limits.max_network_bytes,
"Limit for bytes to send or receive over network exceeded."));
Tables external_tables;
if (settings.global_subqueries_method == GlobalSubqueriesMethod::PUSH)
external_tables = context.getExternalTables();
/// Distribute the shards evenly among the threads.
size_t remote_count = cluster.getRemoteShardCount();
/// Disable shard multiplexing if there is an ORDER BY without a GROUP BY.
//const ASTSelectQuery & ast = *(static_cast<const ASTSelectQuery *>(modified_query_ast.get()));
@ -213,79 +192,10 @@ BlockInputStreams StorageDistributed::read(
//bool enable_shard_multiplexing = !(ast.order_expression_list && !ast.group_expression_list);
bool enable_shard_multiplexing = false;
size_t thread_count;
ClusterProxy::SelectQueryConstructor select_query_constructor(processed_stage, external_tables);
if (!enable_shard_multiplexing)
thread_count = remote_count;
else if (remote_count == 0)
thread_count = 0;
else if (settings.max_distributed_processing_threads == 0)
thread_count = 1;
else
thread_count = std::min(remote_count, static_cast<size_t>(settings.max_distributed_processing_threads));
size_t pools_per_thread = (thread_count > 0) ? (remote_count / thread_count) : 0;
size_t remainder = (thread_count > 0) ? (remote_count % thread_count) : 0;
ConnectionPoolsPtr pools;
bool do_init = true;
/// Loop over the shards.
size_t current_thread = 0;
for (const auto & shard_info : cluster.getShardsInfo())
{
if (shard_info.isLocal())
{
/// Add queries to the local ClickHouse.
DB::Context new_context = context;
new_context.setSettings(new_settings);
for (size_t i = 0; i < shard_info.local_addresses.size(); ++i)
{
InterpreterSelectQuery interpreter(modified_query_ast, new_context, processed_stage);
/** Materialization is needed because constants arrive from remote servers already materialized.
* If this is not done, different streams would produce different column types (Const and non-Const),
* which is not allowed, since all code assumes that every block in a stream has the same types.
*/
res.emplace_back(new MaterializingBlockInputStream(interpreter.execute().in));
}
}
else
{
size_t excess = (current_thread < remainder) ? 1 : 0;
size_t actual_pools_per_thread = pools_per_thread + excess;
if (actual_pools_per_thread == 1)
{
res.emplace_back(new RemoteBlockInputStream{
shard_info.pool, modified_query, &new_settings, throttler,
external_tables, processed_stage, context});
++current_thread;
}
else
{
if (do_init)
{
pools = new ConnectionPools;
do_init = false;
}
pools->push_back(shard_info.pool);
if (pools->size() == actual_pools_per_thread)
{
res.emplace_back(new RemoteBlockInputStream{
pools, modified_query, &new_settings, throttler,
external_tables, processed_stage, context});
do_init = true;
++current_thread;
}
}
}
}
return res;
return ClusterProxy::Query(select_query_constructor, cluster, modified_query_ast,
context, settings, enable_shard_multiplexing).execute();
}
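The shard-to-thread grouping that the removed block above performed inline (now encapsulated by ClusterProxy::Query) is plain integer division with the remainder spread over the first threads. A small self-contained sketch of that arithmetic, with hypothetical counts:

#include <iostream>
#include <vector>

int main()
{
    const size_t remote_count = 7;   // hypothetical number of remote shards
    const size_t thread_count = 3;   // hypothetical max_distributed_processing_threads

    const size_t pools_per_thread = remote_count / thread_count;   // 2
    const size_t remainder = remote_count % thread_count;          // 1

    // The first `remainder` threads handle one extra pool, so every shard is covered.
    std::vector<size_t> per_thread;
    for (size_t t = 0; t < thread_count; ++t)
        per_thread.push_back(pools_per_thread + ((t < remainder) ? 1 : 0));

    size_t total = 0;
    for (size_t t = 0; t < thread_count; ++t)
    {
        std::cout << "thread " << t << " handles " << per_thread[t] << " shard pools\n";
        total += per_thread[t];
    }
    std::cout << "total = " << total << " (matches remote_count)\n";
    return 0;
}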
BlockOutputStreamPtr StorageDistributed::write(ASTPtr query, const Settings & settings)
@ -303,7 +213,7 @@ BlockOutputStreamPtr StorageDistributed::write(ASTPtr query, const Settings & se
};
}
void StorageDistributed::alter(const AlterCommands & params, const String & database_name, const String & table_name, Context & context)
void StorageDistributed::alter(const AlterCommands & params, const String & database_name, const String & table_name, const Context & context)
{
auto lock = lockStructureForAlter();
params.apply(*columns, materialized_columns, alias_columns, column_defaults);
@ -316,120 +226,83 @@ void StorageDistributed::shutdown()
directory_monitors.clear();
}
BlockInputStreams StorageDistributed::describe(const Context & context, const Settings & settings)
void StorageDistributed::reshardPartitions(const String & database_name, const Field & first_partition,
const Field & last_partition, const WeightedZooKeeperPaths & weighted_zookeeper_paths,
const String & sharding_key, const Settings & settings)
{
Settings new_settings = settings;
new_settings.queue_max_wait_ms = Cluster::saturate(new_settings.queue_max_wait_ms, settings.limits.max_execution_time);
/// Makes no sense on remote servers, since the query is usually sent under a different user.
new_settings.max_concurrent_queries_for_user = 0;
/// Create the query ALTER TABLE xxx.yyy RESHARD PARTITION zzz TO ttt USING uuu.
/// Create a DESCRIBE TABLE query.
ASTPtr alter_query_ptr = new ASTAlterQuery;
auto & alter_query = static_cast<ASTAlterQuery &>(*alter_query_ptr);
auto describe_query = new ASTDescribeQuery;
describe_query->database = remote_database;
describe_query->table = remote_table;
alter_query.database = remote_database;
alter_query.table = remote_table;
ASTPtr ast = describe_query;
const auto query = queryToString(ast);
alter_query.parameters.emplace_back();
ASTAlterQuery::Parameters & parameters = alter_query.parameters.back();
/// Network traffic throttling, if needed.
ThrottlerPtr throttler;
if (settings.limits.max_network_bandwidth || settings.limits.max_network_bytes)
throttler.reset(new Throttler(
settings.limits.max_network_bandwidth,
settings.limits.max_network_bytes,
"Limit for bytes to send or receive over network exceeded."));
parameters.type = ASTAlterQuery::RESHARD_PARTITION;
if (!first_partition.isNull())
parameters.partition = new ASTLiteral({}, first_partition);
if (!last_partition.isNull())
parameters.last_partition = new ASTLiteral({}, last_partition);
BlockInputStreams res;
/// Distribute the shards evenly among the threads.
size_t remote_count = 0;
for (const auto & shard_info : cluster.getShardsInfo())
ASTPtr expr_list = new ASTExpressionList;
for (const auto & entry : weighted_zookeeper_paths)
{
if (shard_info.hasRemoteConnections())
++remote_count;
ASTPtr weighted_path_ptr = new ASTWeightedZooKeeperPath;
auto & weighted_path = static_cast<ASTWeightedZooKeeperPath &>(*weighted_path_ptr);
weighted_path.path = entry.first;
weighted_path.weight = entry.second;
expr_list->children.push_back(weighted_path_ptr);
}
size_t thread_count;
parameters.weighted_zookeeper_paths = expr_list;
parameters.sharding_key = sharding_key;
/** The shard_multiplexing functionality is not finished, so we disable it.
* (Because connections to different shards within a single thread are not established in parallel.)
* For details see https://███████████.yandex-team.ru/METR-18300
*/
bool enable_shard_multiplexing = false;
/* if (remote_count == 0)
thread_count = 0;
else if (settings.max_distributed_processing_threads == 0)
thread_count = 1;
else
thread_count = std::min(remote_count, static_cast<size_t>(settings.max_distributed_processing_threads));
*/
thread_count = remote_count;
ClusterProxy::AlterQueryConstructor alter_query_constructor;
size_t pools_per_thread = (thread_count > 0) ? (remote_count / thread_count) : 0;
size_t remainder = (thread_count > 0) ? (remote_count % thread_count) : 0;
BlockInputStreams streams = ClusterProxy::Query(alter_query_constructor, cluster, alter_query_ptr,
context, settings, enable_shard_multiplexing).execute();
ConnectionPoolsPtr pools;
bool do_init = true;
streams[0] = new UnionBlockInputStream<>(streams, nullptr, settings.max_distributed_connections);
streams.resize(1);
/// Loop over the shards.
size_t current_thread = 0;
for (const auto & shard_info : cluster.getShardsInfo())
{
if (shard_info.isLocal())
{
/// Add queries to the local ClickHouse.
auto stream_ptr = dynamic_cast<IProfilingBlockInputStream *>(&*streams[0]);
if (stream_ptr == nullptr)
throw Exception("StorageDistributed: Internal error", ErrorCodes::LOGICAL_ERROR);
auto & stream = *stream_ptr;
DB::Context new_context = context;
new_context.setSettings(new_settings);
while (!stream.isCancelled() && stream.read())
;
}
for (const auto & address : shard_info.local_addresses)
{
InterpreterDescribeQuery interpreter(ast, new_context);
BlockInputStreamPtr stream = new MaterializingBlockInputStream(interpreter.execute().in);
stream = new BlockExtraInfoInputStream(stream, toBlockExtraInfo(address));
res.emplace_back(stream);
}
}
BlockInputStreams StorageDistributed::describe(const Context & context, const Settings & settings)
{
/// Create a DESCRIBE TABLE query.
if (shard_info.hasRemoteConnections())
{
size_t excess = (current_thread < remainder) ? 1 : 0;
size_t actual_pools_per_thread = pools_per_thread + excess;
ASTPtr describe_query_ptr = new ASTDescribeQuery;
auto & describe_query = static_cast<ASTDescribeQuery &>(*describe_query_ptr);
if (actual_pools_per_thread == 1)
{
auto stream = new RemoteBlockInputStream{shard_info.pool, query, &new_settings, throttler};
stream->doBroadcast();
stream->appendExtraInfo();
res.emplace_back(stream);
++current_thread;
}
else
{
if (do_init)
{
pools = new ConnectionPools;
do_init = false;
}
describe_query.database = remote_database;
describe_query.table = remote_table;
pools->push_back(shard_info.pool);
if (pools->size() == actual_pools_per_thread)
{
auto stream = new RemoteBlockInputStream{pools, query, &new_settings, throttler};
stream->doBroadcast();
stream->appendExtraInfo();
res.emplace_back(stream);
/** The shard_multiplexing functionality is not finished, so we disable it.
* (Because connections to different shards within a single thread are not established in parallel.)
* For details see https://███████████.yandex-team.ru/METR-18300
*/
bool enable_shard_multiplexing = false;
do_init = true;
++current_thread;
}
}
}
}
ClusterProxy::DescribeQueryConstructor describe_query_constructor;
return res;
return ClusterProxy::Query(describe_query_constructor, cluster, describe_query_ptr,
context, settings, enable_shard_multiplexing).execute();
}
NameAndTypePair StorageDistributed::getColumn(const String & column_name) const

View File

@ -215,7 +215,7 @@ void StorageMerge::getSelectedTables(StorageVector & selected_tables) const
}
void StorageMerge::alter(const AlterCommands & params, const String & database_name, const String & table_name, Context & context)
void StorageMerge::alter(const AlterCommands & params, const String & database_name, const String & table_name, const Context & context)
{
auto lock = lockStructureForAlter();
params.apply(*columns, materialized_columns, alias_columns, column_defaults);

View File

@ -161,7 +161,7 @@ void StorageMergeTree::rename(const String & new_path_to_db, const String & new_
/// TODO: The logger names of this, data, reader, writer and merger could be updated.
}
void StorageMergeTree::alter(const AlterCommands & params, const String & database_name, const String & table_name, Context & context)
void StorageMergeTree::alter(const AlterCommands & params, const String & database_name, const String & table_name, const Context & context)
{
/// NOTE: Here, as in ReplicatedMergeTree, ALTER could be implemented so that it does not block data writes for long.
const MergeTreeMergeBlocker merge_blocker{merger};

View File

@ -1,6 +1,3 @@
#include <time.h>
#include <ext/range.hpp>
#include <zkutil/Types.h>
#include <zkutil/KeeperException.h>
@ -9,12 +6,12 @@
#include <DB/Storages/ColumnsDescription.h>
#include <DB/Storages/StorageReplicatedMergeTree.h>
#include <DB/Storages/MergeTree/ReplicatedMergeTreeBlockOutputStream.h>
#include <DB/Storages/MergeTree/ReplicatedMergeTreePartsExchange.h>
#include <DB/Storages/MergeTree/ReplicatedMergeTreeQuorumEntry.h>
#include <DB/Storages/MergeTree/MergeTreePartChecker.h>
#include <DB/Storages/MergeTree/MergeList.h>
#include <DB/Storages/MergeTree/MergeTreeWhereOptimizer.h>
#include <DB/Storages/MergeTree/ReplicatedMergeTreeAddress.h>
#include <DB/Storages/MergeTree/ReshardingWorker.h>
#include <DB/Parsers/formatAST.h>
#include <DB/Parsers/ASTInsertQuery.h>
@ -37,6 +34,15 @@
#include <Poco/DirectoryIterator.h>
#include <common/threadpool.hpp>
#include <ext/range.hpp>
#include <cfenv>
#include <ctime>
#include <thread>
#include <future>
namespace DB
{
@ -63,6 +69,13 @@ namespace ErrorCodes
extern const int TOO_MUCH_RETRIES_TO_FETCH_PARTS;
extern const int RECEIVED_ERROR_FROM_REMOTE_IO_SERVER;
extern const int PARTITION_DOESNT_EXIST;
extern const int INCONSISTENT_TABLE_ACCROSS_SHARDS;
extern const int INSUFFICIENT_SPACE_FOR_RESHARDING;
extern const int RESHARDING_NO_WORKER;
extern const int INVALID_PARTITIONS_INTERVAL;
extern const int RESHARDING_INVALID_PARAMETERS;
extern const int INVALID_SHARD_WEIGHT;
extern const int SHARD_DOESNT_REFERENCE_TABLE;
}
@ -307,11 +320,34 @@ StoragePtr StorageReplicatedMergeTree::create(
StoragePtr res_ptr = res->thisPtr();
auto get_endpoint_holder = [&res](InterserverIOEndpointPtr endpoint)
{
return new InterserverIOEndpointHolder(endpoint->getId(res->replica_path), endpoint, res->context.getInterserverIOHandler());
};
if (res->tryGetZooKeeper())
{
String endpoint_name = "ReplicatedMergeTree:" + res->replica_path;
InterserverIOEndpointPtr endpoint = new ReplicatedMergeTreePartsServer(res->data, *res);
res->endpoint_holder = new InterserverIOEndpointHolder(endpoint_name, endpoint, res->context.getInterserverIOHandler());
{
InterserverIOEndpointPtr endpoint = new DataPartsExchange::Service(res->data, *res);
res->endpoint_holder = get_endpoint_holder(endpoint);
}
/// Services for resharding.
{
InterserverIOEndpointPtr endpoint = new RemoteDiskSpaceMonitor::Service(res->full_path);
res->disk_space_monitor_endpoint_holder = get_endpoint_holder(endpoint);
}
{
InterserverIOEndpointPtr endpoint = new ShardedPartitionSender::Service(*res);
res->sharded_partition_sender_endpoint_holder = get_endpoint_holder(endpoint);
}
{
InterserverIOEndpointPtr endpoint = new RemoteQueryExecutor::Service(res->context);
res->remote_query_executor_endpoint_holder = get_endpoint_holder(endpoint);
}
}
return res_ptr;
@ -367,6 +403,8 @@ void StorageReplicatedMergeTree::createTableIfNotExists()
acl, zkutil::CreateMode::Persistent));
ops.push_back(new zkutil::Op::Create(zookeeper_path + "/blocks", "",
acl, zkutil::CreateMode::Persistent));
ops.push_back(new zkutil::Op::Create(zookeeper_path + "/detached_sharded_blocks", "",
acl, zkutil::CreateMode::Persistent));
ops.push_back(new zkutil::Op::Create(zookeeper_path + "/block_numbers", "",
acl, zkutil::CreateMode::Persistent));
ops.push_back(new zkutil::Op::Create(zookeeper_path + "/nonincrement_block_numbers", "",
@ -2277,6 +2315,15 @@ void StorageReplicatedMergeTree::shutdown()
endpoint_holder = nullptr;
fetcher.cancel();
disk_space_monitor_endpoint_holder = nullptr;
free_disk_space_checker.cancel();
sharded_partition_sender_endpoint_holder = nullptr;
sharded_partition_sender_client.cancel();
remote_query_executor_endpoint_holder = nullptr;
remote_query_executor_client.cancel();
}
@ -2469,7 +2516,7 @@ bool StorageReplicatedMergeTree::optimize(const Settings & settings)
void StorageReplicatedMergeTree::alter(const AlterCommands & params,
const String & database_name, const String & table_name, Context & context)
const String & database_name, const String & table_name, const Context & context)
{
assertNotReadonly();
@ -2676,35 +2723,8 @@ void StorageReplicatedMergeTree::dropPartition(ASTPtr query, const Field & field
return;
}
/** Skip one number in block_numbers for the month being deleted, and delete only parts up to that number.
* This forbids merges of the deleted parts with newly inserted data.
* Invariant: no merges of the deleted parts with other parts will appear in the log.
* NOTE: If DROP PART ever needs similar support, a new mechanism will have to be devised for it
* in order to guarantee this invariant.
*/
Int64 right;
{
AbandonableLockInZooKeeper block_number_lock = allocateBlockNumber(month_name);
right = block_number_lock.getNumber();
block_number_lock.unlock();
}
/// This should never happen.
if (right == 0)
throw Exception("Logical error: just allocated block number is zero", ErrorCodes::LOGICAL_ERROR);
--right;
String fake_part_name = getFakePartNameForDrop(month_name, 0, right);
/** Forbid selecting the deleted parts for merging.
* Invariant: after a DROP_RANGE entry appears in the log, no merges of the deleted parts will appear in the log.
*/
{
std::lock_guard<std::mutex> merge_selecting_lock(merge_selecting_mutex);
queue.disableMergesInRange(fake_part_name);
}
ScopedPartitionMergeLock partition_merge_lock(*this, month_name);
std::string fake_part_name = partition_merge_lock.getId();
/// Finally, having established the required invariants, we can put the entry into the log.
LogEntry entry;
@ -2726,6 +2746,66 @@ void StorageReplicatedMergeTree::dropPartition(ASTPtr query, const Field & field
}
}
std::string StorageReplicatedMergeTree::acquirePartitionMergeLock(const std::string & partition_name)
{
std::lock_guard<std::mutex> guard(mutex_partition_to_merge_lock);
auto it = partition_to_merge_lock.find(partition_name);
if (it != partition_to_merge_lock.end())
{
auto & info = it->second;
++info.ref_count;
return info.fake_part_name;
}
/** Skip one number in block_numbers for the month being deleted, and delete only parts up to that number.
* This forbids merges of the deleted parts with newly inserted data.
* Invariant: no merges of the deleted parts with other parts will appear in the log.
* NOTE: If DROP PART ever needs similar support, a new mechanism will have to be devised for it
* in order to guarantee this invariant.
*/
Int64 right;
{
AbandonableLockInZooKeeper block_number_lock = allocateBlockNumber(partition_name);
right = block_number_lock.getNumber();
block_number_lock.unlock();
}
/// This should never happen.
if (right == 0)
throw Exception("Logical error: just allocated block number is zero", ErrorCodes::LOGICAL_ERROR);
--right;
std::string fake_part_name = getFakePartNameForDrop(partition_name, 0, right);
partition_to_merge_lock.emplace(partition_name, PartitionMergeLockInfo(fake_part_name));
/** Forbid selecting the deleted parts for merging.
* Invariant: after a DROP_RANGE entry appears in the log, no merges of the deleted parts will appear in the log.
*/
{
std::lock_guard<std::mutex> merge_selecting_lock(merge_selecting_mutex);
queue.disableMergesInRange(fake_part_name);
}
return fake_part_name;
}
void StorageReplicatedMergeTree::releasePartitionMergeLock(const std::string & partition_name)
{
std::lock_guard<std::mutex> guard(mutex_partition_to_merge_lock);
auto it = partition_to_merge_lock.find(partition_name);
if (it == partition_to_merge_lock.end())
throw Exception("StorageReplicatedMergeTree: trying to release a non-existent partition merge lock",
ErrorCodes::LOGICAL_ERROR);
auto & info = it->second;
--info.ref_count;
if (info.ref_count == 0)
partition_to_merge_lock.erase(it);
}
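dropPartition above now obtains its fake part name through ScopedPartitionMergeLock, whose definition is not part of this hunk; the following is only a sketch, under the assumption that it is an RAII wrapper around the reference-counted acquire/release pair implemented here (all names in the sketch are illustrative):

#include <iostream>
#include <map>
#include <mutex>
#include <string>

// Illustrative registry with the same ref-counting behaviour as
// acquirePartitionMergeLock / releasePartitionMergeLock above.
class PartitionMergeLockRegistry
{
public:
    std::string acquire(const std::string & partition)
    {
        std::lock_guard<std::mutex> guard(mutex);
        auto & info = locks[partition];
        ++info.ref_count;
        if (info.fake_part_name.empty())
            info.fake_part_name = partition + "_0_42_999";   // placeholder fake part name
        return info.fake_part_name;
    }

    void release(const std::string & partition)
    {
        std::lock_guard<std::mutex> guard(mutex);
        auto it = locks.find(partition);
        if (it != locks.end() && --it->second.ref_count == 0)
            locks.erase(it);
    }

private:
    struct Info { size_t ref_count = 0; std::string fake_part_name; };
    std::mutex mutex;
    std::map<std::string, Info> locks;
};

// RAII holder analogous in spirit to ScopedPartitionMergeLock.
class ScopedLock
{
public:
    ScopedLock(PartitionMergeLockRegistry & registry_, const std::string & partition_)
        : registry(registry_), partition(partition_), id(registry.acquire(partition)) {}
    ~ScopedLock() { registry.release(partition); }
    const std::string & getId() const { return id; }

private:
    PartitionMergeLockRegistry & registry;
    std::string partition;
    std::string id;
};

int main()
{
    PartitionMergeLockRegistry registry;
    {
        ScopedLock lock(registry, "201601");
        std::cout << "fake part name: " << lock.getId() << "\n";
    }   // lock released here
    return 0;
}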
void StorageReplicatedMergeTree::attachPartition(ASTPtr query, const Field & field, bool unreplicated, bool attach_part, const Settings & settings)
{
@ -2811,7 +2891,57 @@ void StorageReplicatedMergeTree::attachPartition(ASTPtr query, const Field & fie
zookeeper_path + "/log/log-", entry.toString(), zookeeper->getDefaultACL(), zkutil::CreateMode::PersistentSequential));
}
LOG_DEBUG(log, "Adding attaches to log");
std::string log_msg = "Adding attaches to log";
if (is_leader_node)
{
/// If ATTACH PART is executed as part of resharding, update the block information on the shard.
auto children = zookeeper->getChildren(zookeeper_path + "/detached_sharded_blocks");
if (!children.empty())
{
log_msg += ". Updating information about blocks in the context of the resharding operation.";
auto acl = zookeeper->getDefaultACL();
for (const auto & child : children)
{
std::string checksum = zookeeper->get(zookeeper_path + "/detached_sharded_blocks/" + child + "/checksum");
std::string number = zookeeper->get(zookeeper_path + "/detached_sharded_blocks/" + child + "/number");
ops.push_back(
new zkutil::Op::Create(
zookeeper_path + "/blocks/" + child,
"",
acl,
zkutil::CreateMode::Persistent));
ops.push_back(
new zkutil::Op::Create(
zookeeper_path + "/blocks/" + child + "/checksum",
checksum,
acl,
zkutil::CreateMode::Persistent));
ops.push_back(
new zkutil::Op::Create(
zookeeper_path + "/blocks/" + child + "/number",
number,
acl,
zkutil::CreateMode::Persistent));
ops.push_back(
new zkutil::Op::Remove(
zookeeper_path + "/detached_sharded_blocks/" + child + "/number", -1));
ops.push_back(
new zkutil::Op::Remove(
zookeeper_path + "/detached_sharded_blocks/" + child + "/checksum", -1));
ops.push_back(
new zkutil::Op::Remove(
zookeeper_path + "/detached_sharded_blocks/" + child, -1));
}
}
}
LOG_DEBUG(log, log_msg);
zookeeper->multi(ops);
/// If necessary, wait for the operation to complete on ourselves or on all replicas.
@ -2906,7 +3036,7 @@ AbandonableLockInZooKeeper StorageReplicatedMergeTree::allocateBlockNumber(const
String month_path = zookeeper_path + "/block_numbers/" + month_name;
if (!existsNodeCached(month_path))
{
/// Create a node for the month in block_numbers and skip 200 increment values in it.
/// Create a node for the month in block_numbers and skip N=RESERVED_BLOCK_NUMBERS increment values in it.
/// This is needed so that data can be added to the beginning later, if necessary.
zkutil::Ops ops;
auto acl = zookeeper->getDefaultACL();
@ -3350,5 +3480,218 @@ void StorageReplicatedMergeTree::freezePartition(const Field & partition, const
unreplicated_data->freezePartition(prefix);
}
void StorageReplicatedMergeTree::reshardPartitions(const String & database_name, const Field & first_partition, const Field & last_partition,
const WeightedZooKeeperPaths & weighted_zookeeper_paths, const String & sharding_key,
const Settings & settings)
{
auto & resharding_worker = context.getReshardingWorker();
if (!resharding_worker.isStarted())
throw Exception("Resharding worker is not running.", ErrorCodes::RESHARDING_NO_WORKER);
for (const auto & weighted_path : weighted_zookeeper_paths)
{
UInt64 weight = weighted_path.second;
if (weight == 0)
throw Exception("Shard has invalid weight", ErrorCodes::INVALID_SHARD_WEIGHT);
}
for (const auto & weighted_path : weighted_zookeeper_paths)
{
const std::string & path = weighted_path.first;
if ((path.length() <= getTableName().length()) ||
(path.substr(path.length() - getTableName().length()) != getTableName()))
throw Exception("Shard does not reference table", ErrorCodes::SHARD_DOESNT_REFERENCE_TABLE);
}
DayNum_t first_partition_num = !first_partition.isNull() ? MergeTreeData::getMonthDayNum(first_partition) : DayNum_t();
DayNum_t last_partition_num = !last_partition.isNull() ? MergeTreeData::getMonthDayNum(last_partition) : DayNum_t();
if (first_partition_num && last_partition_num)
{
if (first_partition_num > last_partition_num)
throw Exception("Invalid interval of partitions", ErrorCodes::INVALID_PARTITIONS_INTERVAL);
}
if (!first_partition_num && last_partition_num)
throw Exception("Received invalid parameters for resharding", ErrorCodes::RESHARDING_INVALID_PARAMETERS);
bool include_all = !first_partition_num;
/// Build the list of local partitions that need to be resharded.
using PartitionList = std::set<std::string>;
PartitionList partition_list;
const MergeTreeData::DataParts & data_parts = data.getDataParts();
for (MergeTreeData::DataParts::iterator it = data_parts.cbegin(); it != data_parts.cend(); ++it)
{
const MergeTreeData::DataPartPtr & current_part = *it;
DayNum_t month = current_part->month;
if (include_all || ((month >= first_partition_num) && (month <= last_partition_num)))
partition_list.insert(MergeTreeData::getMonthName(month));
}
if (partition_list.empty())
throw Exception("No existing partition found", ErrorCodes::PARTITION_DOESNT_EXIST);
/// Make sure that the structures of the local table and the replicated tables match.
enforceShardsConsistency(weighted_zookeeper_paths);
/// Check that there is enough free space for all jobs, both locally and on all replicas.
auto replica_to_space_info = gatherReplicaSpaceInfo(weighted_zookeeper_paths);
for (const auto & partition : partition_list)
{
size_t partition_size = data.getPartitionSize(partition);
if (!checkSpaceForResharding(replica_to_space_info, partition_size))
throw Exception("Insufficient space available for resharding operation "
"on partition " + partition, ErrorCodes::INSUFFICIENT_SPACE_FOR_RESHARDING);
}
/// Register the background resharding jobs.
for (const auto & partition : partition_list)
resharding_worker.submitJob(database_name, getTableName(), partition, weighted_zookeeper_paths, sharding_key);
}
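The partition selection above keeps only the parts whose month falls inside the requested interval (or all of them when no first partition is given). A minimal sketch of that inclusion check, with hypothetical YYYYMM values standing in for DayNum_t and MergeTreeData::getMonthDayNum:

#include <iostream>
#include <set>
#include <vector>

int main()
{
    // Hypothetical months of the local data parts.
    std::vector<unsigned> part_months{201512, 201601, 201602, 201604};

    unsigned first = 201601;   // illustrative first partition
    unsigned last  = 201603;   // illustrative last partition
    bool include_all = (first == 0);

    std::set<unsigned> partitions;
    for (unsigned month : part_months)
        if (include_all || (month >= first && month <= last))
            partitions.insert(month);

    for (unsigned month : partitions)
        std::cout << month << "\n";   // prints 201601 and 201602
    return 0;
}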
void StorageReplicatedMergeTree::enforceShardsConsistency(const WeightedZooKeeperPaths & weighted_zookeeper_paths)
{
const auto & columns = getColumnsList();
auto zookeeper = getZooKeeper();
for (const auto & weighted_path : weighted_zookeeper_paths)
{
auto columns_str = zookeeper->get(weighted_path.first + "/columns");
auto columns_desc = ColumnsDescription<true>::parse(columns_str);
if (!std::equal(columns.begin(), columns.end(), columns_desc.columns.begin()) ||
!std::equal(materialized_columns.begin(), materialized_columns.end(), columns_desc.materialized.begin()) ||
!std::equal(alias_columns.begin(), alias_columns.end(), columns_desc.alias.begin()) ||
!std::equal(column_defaults.begin(), column_defaults.end(), columns_desc.defaults.begin()))
throw Exception("Table is inconsistent accross shards", ErrorCodes::INCONSISTENT_TABLE_ACCROSS_SHARDS);
}
}
StorageReplicatedMergeTree::ReplicaToSpaceInfo
StorageReplicatedMergeTree::gatherReplicaSpaceInfo(const WeightedZooKeeperPaths & weighted_zookeeper_paths)
{
struct TaskInfo
{
TaskInfo(const std::string & replica_path_,
const ReplicatedMergeTreeAddress & address_)
: replica_path(replica_path_), address(address_)
{
}
std::string replica_path;
ReplicatedMergeTreeAddress address;
};
using TaskInfoList = std::vector<TaskInfo>;
TaskInfoList task_info_list;
ReplicaToSpaceInfo replica_to_space_info;
/// Now check the free space available on the remote replicas.
UInt64 total_weight = 0;
for (const auto & weighted_path : weighted_zookeeper_paths)
{
UInt64 weight = weighted_path.second;
total_weight += weight;
}
auto & local_space_info = replica_to_space_info[replica_path];
local_space_info.factor = 1.1;
local_space_info.available_size = DiskSpaceMonitor::getUnreservedFreeSpace(full_path);
auto zookeeper = getZooKeeper();
for (const auto & weighted_path : weighted_zookeeper_paths)
{
const auto & path = weighted_path.first;
UInt64 weight = weighted_path.second;
long double factor = (weight / static_cast<long double>(total_weight)) * 1.1;
auto children = zookeeper->getChildren(path + "/replicas");
for (const auto & child : children)
{
const std::string child_replica_path = path + "/replicas/" + child;
if (child_replica_path != replica_path)
{
replica_to_space_info[child_replica_path].factor = factor;
auto host = zookeeper->get(child_replica_path + "/host");
ReplicatedMergeTreeAddress host_desc(host);
task_info_list.emplace_back(child_replica_path, host_desc);
}
}
}
boost::threadpool::pool pool(task_info_list.size());
using Tasks = std::vector<std::packaged_task<size_t()> >;
Tasks tasks(task_info_list.size());
try
{
for (size_t i = 0; i < task_info_list.size(); ++i)
{
const auto & entry = task_info_list[i];
const auto & replica_path = entry.replica_path;
const auto & address = entry.address;
InterserverIOEndpointLocation location(replica_path, address.host, address.replication_port);
tasks[i] = Tasks::value_type(std::bind(&RemoteDiskSpaceMonitor::Client::getFreeDiskSpace,
&free_disk_space_checker, location));
pool.schedule([i, &tasks]{ tasks[i](); });
}
}
catch (...)
{
pool.wait();
throw;
}
pool.wait();
for (size_t i = 0; i < task_info_list.size(); ++i)
{
size_t remote_available_size = tasks[i].get_future().get();
const auto & remote_replica_path = task_info_list[i].replica_path;
replica_to_space_info.at(remote_replica_path).available_size = remote_available_size;
}
return replica_to_space_info;
}
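The per-replica factor computed in gatherReplicaSpaceInfo above is the shard's share of the total weight plus a 10% margin, while the local replica is always checked with factor 1.1. A small worked example with hypothetical weights 1, 2 and 1 (total weight 4), giving factors 0.275, 0.55 and 0.275:

#include <iostream>
#include <vector>

int main()
{
    // Hypothetical weights of the destination ZooKeeper paths.
    std::vector<unsigned> weights{1, 2, 1};

    unsigned total_weight = 0;
    for (unsigned w : weights)
        total_weight += w;

    // Each destination must hold its share of the partition plus 10% headroom.
    for (size_t i = 0; i < weights.size(); ++i)
    {
        long double factor = (weights[i] / static_cast<long double>(total_weight)) * 1.1;
        std::cout << "shard " << i << ": factor = " << static_cast<double>(factor) << "\n";
    }
    return 0;
}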
bool StorageReplicatedMergeTree::checkSpaceForResharding(const ReplicaToSpaceInfo & replica_to_space_info,
size_t partition_size) const
{
/// Safe multiplication.
auto scale_size = [](size_t size, long double factor)
{
feclearexcept(FE_OVERFLOW);
feclearexcept(FE_UNDERFLOW);
long double result = static_cast<long double>(size) * factor;
if ((fetestexcept(FE_OVERFLOW) != 0) || (fetestexcept(FE_UNDERFLOW) != 0))
throw Exception("StorageReplicatedMergeTree: floating point exception raised", ErrorCodes::LOGICAL_ERROR);
if (result > static_cast<long double>(std::numeric_limits<size_t>::max()))
throw Exception("StorageReplicatedMergeTree: integer overflow", ErrorCodes::LOGICAL_ERROR);
return static_cast<size_t>(result);
};
for (const auto & entry : replica_to_space_info)
{
const auto & info = entry.second;
size_t required_size = scale_size(partition_size, info.factor);
if (info.available_size < required_size)
return false;
}
return true;
}
}
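The scale_size lambda in checkSpaceForResharding above guards the size-by-factor multiplication with the <cfenv> exception flags. A standalone sketch of the same guard, with illustrative values:

#include <cfenv>
#include <cstddef>
#include <iostream>
#include <limits>
#include <stdexcept>

// Multiply a byte count by a long double factor, refusing to return a value
// that overflowed either the floating-point range or size_t.
static size_t scale_size(size_t size, long double factor)
{
    std::feclearexcept(FE_OVERFLOW);
    std::feclearexcept(FE_UNDERFLOW);
    long double result = static_cast<long double>(size) * factor;
    if (std::fetestexcept(FE_OVERFLOW) || std::fetestexcept(FE_UNDERFLOW))
        throw std::runtime_error("floating point exception raised");
    if (result > static_cast<long double>(std::numeric_limits<size_t>::max()))
        throw std::runtime_error("integer overflow");
    return static_cast<size_t>(result);
}

int main()
{
    const size_t partition_size = 10ULL * 1024 * 1024 * 1024;   // 10 GiB, illustrative
    const long double factor = 0.5L * 1.1L;                     // half the weight plus 10% headroom
    std::cout << scale_size(partition_size, factor) << " bytes required\n";
    return 0;
}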

View File

@ -173,6 +173,7 @@ public:
*/
void waitForDisappear(const std::string & path);
std::string getTaskQueuePath() const;
/** Asynchronous interface (only a small subset of operations is implemented).
*
@ -299,7 +300,7 @@ private:
friend struct WatchWithEvent;
friend class EphemeralNodeHolder;
void init(const std::string & hosts, int32_t session_timeout_ms);
void init(const std::string & hosts, int32_t session_timeout_ms, const std::string & task_queue_path_ = "");
void removeChildrenRecursive(const std::string & path);
void tryRemoveChildrenRecursive(const std::string & path);
void * watchForEvent(EventPtr event);
@ -342,6 +343,7 @@ private:
int32_t existsImpl(const std::string & path, Stat * stat_, EventPtr watch = nullptr);
std::string hosts;
std::string task_queue_path;
int32_t session_timeout_ms;
std::mutex mutex;

View File

@ -61,12 +61,13 @@ void ZooKeeper::processEvent(zhandle_t * zh, int type, int state, const char * p
}
}
void ZooKeeper::init(const std::string & hosts_, int32_t session_timeout_ms_)
void ZooKeeper::init(const std::string & hosts_, int32_t session_timeout_ms_, const std::string & task_queue_path_)
{
log = &Logger::get("ZooKeeper");
zoo_set_debug_level(ZOO_LOG_LEVEL_ERROR);
hosts = hosts_;
session_timeout_ms = session_timeout_ms_;
task_queue_path = task_queue_path_;
impl = zookeeper_init(hosts.c_str(), nullptr, session_timeout_ms, nullptr, nullptr, 0);
ProfileEvents::increment(ProfileEvents::ZooKeeperInit);
@ -104,6 +105,10 @@ struct ZooKeeperArgs
{
session_timeout_ms = config.getInt(config_name + "." + key);
}
else if (key == "task_queue_path")
{
task_queue_path = config.getString(config_name + "." + key);
}
else throw KeeperException(std::string("Unknown key ") + key + " in config file");
}
@ -120,12 +125,13 @@ struct ZooKeeperArgs
std::string hosts;
size_t session_timeout_ms;
std::string task_queue_path;
};
ZooKeeper::ZooKeeper(const Poco::Util::AbstractConfiguration & config, const std::string & config_name)
{
ZooKeeperArgs args(config, config_name);
init(args.hosts, args.session_timeout_ms);
init(args.hosts, args.session_timeout_ms, args.task_queue_path);
}
void * ZooKeeper::watchForEvent(EventPtr event)
@ -578,6 +584,10 @@ void ZooKeeper::waitForDisappear(const std::string & path)
}
}
std::string ZooKeeper::getTaskQueuePath() const
{
return task_queue_path;
}
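The new task_queue_path key is read from the same configuration section as session_timeout_ms. A minimal sketch of reading such a section through Poco's configuration API; the surrounding element layout and the path value are illustrative, only the two keys come from the hunk above:

#include <Poco/AutoPtr.h>
#include <Poco/Util/XMLConfiguration.h>
#include <iostream>
#include <sstream>

int main()
{
    // Illustrative config fragment: only session_timeout_ms and the new
    // task_queue_path key are taken from the parser shown above.
    std::istringstream xml(
        "<yandex>"
        "  <zookeeper>"
        "    <session_timeout_ms>30000</session_timeout_ms>"
        "    <task_queue_path>/clickhouse/task_queue</task_queue_path>"
        "  </zookeeper>"
        "</yandex>");

    Poco::AutoPtr<Poco::Util::XMLConfiguration> config(new Poco::Util::XMLConfiguration(xml));

    // ZooKeeperArgs iterates the keys under the "zookeeper" section and stores these values.
    std::cout << config->getInt("zookeeper.session_timeout_ms") << "\n";
    std::cout << config->getString("zookeeper.task_queue_path") << "\n";
    return 0;
}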
ZooKeeper::~ZooKeeper()
{