#include <DB/Storages/StorageReplicatedMergeTree.h>
#include <DB/Storages/MergeTree/ReplicatedMergeTreeBlockOutputStream.h>
#include <DB/Storages/MergeTree/ReplicatedMergeTreePartsExchange.h>
#include <DB/Parsers/formatAST.h>
#include <DB/IO/WriteBufferFromOStream.h>
#include <DB/IO/ReadBufferFromString.h>

namespace DB
{
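
/// How long the background threads sleep between iterations (see queueUpdatingThread() and queueThread() below).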
const auto QUEUE_UPDATE_SLEEP = std::chrono::seconds(5);
const auto QUEUE_NO_WORK_SLEEP = std::chrono::seconds(5);
const auto QUEUE_ERROR_SLEEP = std::chrono::seconds(1);
const auto QUEUE_AFTER_WORK_SLEEP = std::chrono::seconds(0);


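/// If attach is false, creates the table (if it doesn't exist yet) and a new replica in ZooKeeper;
/// otherwise checks the local data against ZooKeeper. Then loads the replication queue,
/// activates the replica and starts the background threads.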
StorageReplicatedMergeTree::StorageReplicatedMergeTree(
	const String & zookeeper_path_,
	const String & replica_name_,
	bool attach,
	const String & path_, const String & name_, NamesAndTypesListPtr columns_,
	Context & context_,
	ASTPtr & primary_expr_ast_,
	const String & date_column_name_,
	const ASTPtr & sampling_expression_,
	size_t index_granularity_,
	MergeTreeData::Mode mode_,
	const String & sign_column_,
	const MergeTreeSettings & settings_)
	:
	context(context_), zookeeper(context.getZooKeeper()),
	path(path_), name(name_), full_path(path + escapeForFileName(name) + '/'), zookeeper_path(zookeeper_path_),
	replica_name(replica_name_),
	data(full_path, columns_, context_, primary_expr_ast_, date_column_name_, sampling_expression_,
		index_granularity_, mode_, sign_column_, settings_),
	reader(data), writer(data), fetcher(data),
	log(&Logger::get("StorageReplicatedMergeTree")),
	shutdown_called(false)
{
	if (!zookeeper_path.empty() && *zookeeper_path.rbegin() == '/')
		zookeeper_path.erase(zookeeper_path.end() - 1);
	replica_path = zookeeper_path + "/replicas/" + replica_name;

	if (!attach)
	{
		if (!zookeeper.exists(zookeeper_path))
			createTable();

		if (!isTableEmpty())
			throw Exception("Can't add new replica to non-empty table", ErrorCodes::ADDING_REPLICA_TO_NON_EMPTY_TABLE);

		checkTableStructure();
		createReplica();
	}
	else
	{
		checkTableStructure();
		checkParts();
	}

	loadQueue();
	activateReplica();

	queue_updating_thread = std::thread(&StorageReplicatedMergeTree::queueUpdatingThread, this);
	for (size_t i = 0; i < settings_.replication_threads; ++i)
		queue_threads.push_back(std::thread(&StorageReplicatedMergeTree::queueThread, this));
}


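/// Wraps the constructor and registers the interserver endpoint through which other replicas fetch parts.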
StoragePtr StorageReplicatedMergeTree::create(
	const String & zookeeper_path_,
	const String & replica_name_,
	bool attach,
	const String & path_, const String & name_, NamesAndTypesListPtr columns_,
	Context & context_,
	ASTPtr & primary_expr_ast_,
	const String & date_column_name_,
	const ASTPtr & sampling_expression_,
	size_t index_granularity_,
	MergeTreeData::Mode mode_,
	const String & sign_column_,
	const MergeTreeSettings & settings_)
{
	StorageReplicatedMergeTree * res = new StorageReplicatedMergeTree(zookeeper_path_, replica_name_, attach,
		path_, name_, columns_, context_, primary_expr_ast_, date_column_name_, sampling_expression_,
		index_granularity_, mode_, sign_column_, settings_);
	StoragePtr res_ptr = res->thisPtr();

	String endpoint_name = "ReplicatedMergeTree:" + res->replica_path;
	InterserverIOEndpointPtr endpoint = new ReplicatedMergeTreePartsServer(res->data, res_ptr);
	res->endpoint_holder = new InterserverIOEndpointHolder(endpoint_name, endpoint, res->context.getInterserverIOHandler());

	return res_ptr;
}


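/// Format an expression AST as a string (empty if there is no AST); used for the metadata stored in ZooKeeper.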
static String formattedAST(const ASTPtr & ast)
{
	if (!ast)
		return "";
	std::stringstream ss;
	formatAST(*ast, ss, 0, false, true);
	return ss.str();
}


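/// Create the table's root znode in ZooKeeper, store its metadata there and create the shared
/// "directories": /replicas, /blocks, /block_numbers and /temp.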
void StorageReplicatedMergeTree::createTable()
{
	zookeeper.create(zookeeper_path, "", zkutil::CreateMode::Persistent);

	/// Write the table metadata so that replicas can check their local table structure against it.
	std::stringstream metadata;
	metadata << "metadata format version: 1" << std::endl;
	metadata << "date column: " << data.date_column_name << std::endl;
	metadata << "sampling expression: " << formattedAST(data.sampling_expression) << std::endl;
	metadata << "index granularity: " << data.index_granularity << std::endl;
	metadata << "mode: " << static_cast<int>(data.mode) << std::endl;
	metadata << "sign column: " << data.sign_column << std::endl;
	metadata << "primary key: " << formattedAST(data.primary_expr_ast) << std::endl;
	metadata << "columns:" << std::endl;
	WriteBufferFromOStream buf(metadata);
	for (auto & it : data.getColumnsList())
	{
		writeBackQuotedString(it.first, buf);
		writeChar(' ', buf);
		writeString(it.second->getName(), buf);
		writeChar('\n', buf);
	}
	buf.next();

	zookeeper.create(zookeeper_path + "/metadata", metadata.str(), zkutil::CreateMode::Persistent);

	/// Create the necessary "directories".
	zookeeper.create(zookeeper_path + "/replicas", "", zkutil::CreateMode::Persistent);
	zookeeper.create(zookeeper_path + "/blocks", "", zkutil::CreateMode::Persistent);
	zookeeper.create(zookeeper_path + "/block_numbers", "", zkutil::CreateMode::Persistent);
	zookeeper.create(zookeeper_path + "/temp", "", zkutil::CreateMode::Persistent);
}


/** Verify that the list of columns and the table settings match those stored in ZooKeeper (/metadata).
  * If not, throw an exception.
  */
void StorageReplicatedMergeTree::checkTableStructure()
{
	String metadata_str = zookeeper.get(zookeeper_path + "/metadata");
	ReadBufferFromString buf(metadata_str);
	assertString("metadata format version: 1", buf);
	assertString("\ndate column: ", buf);
	assertString(data.date_column_name, buf);
	assertString("\nsampling expression: ", buf);
	assertString(formattedAST(data.sampling_expression), buf);
	assertString("\nindex granularity: ", buf);
	assertString(toString(data.index_granularity), buf);
	assertString("\nmode: ", buf);
	assertString(toString(static_cast<int>(data.mode)), buf);
	assertString("\nsign column: ", buf);
	assertString(data.sign_column, buf);
	assertString("\nprimary key: ", buf);
	assertString(formattedAST(data.primary_expr_ast), buf);
	assertString("\ncolumns:\n", buf);
	for (auto & it : data.getColumnsList())
	{
		String name;
		readBackQuotedString(name, buf);
		if (name != it.first)
			throw Exception("Unexpected column name in ZooKeeper: expected " + it.first + ", found " + name,
				ErrorCodes::UNKNOWN_IDENTIFIER);
		assertString(" ", buf);
		assertString(it.second->getName(), buf);
		assertString("\n", buf);
	}
	assertEOF(buf);
}


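/// Create this replica's znodes: /host, /log, /log_pointers, /queue and /parts.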
void StorageReplicatedMergeTree::createReplica()
{
	zookeeper.create(replica_path, "", zkutil::CreateMode::Persistent);
	zookeeper.create(replica_path + "/host", "", zkutil::CreateMode::Persistent);
	zookeeper.create(replica_path + "/log", "", zkutil::CreateMode::Persistent);
	zookeeper.create(replica_path + "/log_pointers", "", zkutil::CreateMode::Persistent);
	zookeeper.create(replica_path + "/queue", "", zkutil::CreateMode::Persistent);
	zookeeper.create(replica_path + "/parts", "", zkutil::CreateMode::Persistent);
}


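/// Mark this replica as active (ephemeral /is_active node) and publish its interserver host and port.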
void StorageReplicatedMergeTree::activateReplica()
{
	std::stringstream host;
	host << "host: " << context.getInterserverIOHost() << std::endl;
	host << "port: " << context.getInterserverIOPort() << std::endl;

	/// Simultaneously declare that this replica is active and update its host.
	zkutil::Ops ops;
	ops.push_back(new zkutil::Op::Create(replica_path + "/is_active", "", zookeeper.getDefaultACL(), zkutil::CreateMode::Ephemeral));
	ops.push_back(new zkutil::Op::SetData(replica_path + "/host", host.str(), -1));

	try
	{
		zookeeper.multi(ops);
	}
	catch (zkutil::KeeperException & e)
	{
		if (e.code == zkutil::ReturnCode::NodeExists)
			throw Exception("Replica " + replica_path + " appears to be already active. If you're sure it's not, "
				"try again in a minute or remove znode " + replica_path + "/is_active manually", ErrorCodes::REPLICA_IS_ALREADY_ACTIVE);

		throw;
	}

	replica_is_active_node = zkutil::EphemeralNodeHolder::existing(replica_path + "/is_active", zookeeper);
}


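/// The table is considered empty if no replica has any parts registered in ZooKeeper.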
bool StorageReplicatedMergeTree::isTableEmpty()
{
	Strings replicas = zookeeper.getChildren(zookeeper_path + "/replicas");
	for (const auto & replica : replicas)
	{
		if (!zookeeper.getChildren(zookeeper_path + "/replicas/" + replica + "/parts").empty())
			return false;
	}
	return true;
}


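/// Compare the local parts with the set registered for this replica in ZooKeeper.
/// Missing expected parts are an error; at most one unexpected local part is tolerated and is detached with the "ignored_" prefix.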
void StorageReplicatedMergeTree::checkParts()
{
	Strings expected_parts_vec = zookeeper.getChildren(replica_path + "/parts");
	NameSet expected_parts(expected_parts_vec.begin(), expected_parts_vec.end());

	MergeTreeData::DataParts parts = data.getDataParts();

	MergeTreeData::DataPartsVector unexpected_parts;
	for (const auto & part : parts)
	{
		if (expected_parts.count(part->name))
		{
			expected_parts.erase(part->name);
		}
		else
		{
			unexpected_parts.push_back(part);
		}
	}

	if (!expected_parts.empty())
		throw Exception("Not found " + toString(expected_parts.size())
			+ " parts (including " + *expected_parts.begin() + ") in table " + data.getTableName(),
			ErrorCodes::NOT_FOUND_EXPECTED_DATA_PART);

	if (unexpected_parts.size() > 1)
		throw Exception("More than one unexpected part (including " + unexpected_parts[0]->name
			+ ") in table " + data.getTableName(),
			ErrorCodes::TOO_MANY_UNEXPECTED_DATA_PARTS);

	for (MergeTreeData::DataPartPtr part : unexpected_parts)
	{
		LOG_ERROR(log, "Unexpected part " << part->name << ". Renaming it to ignored_" + part->name);
		data.renameAndDetachPart(part, "ignored_");
	}
}


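/// Load this replica's queue (/queue) from ZooKeeper into memory, in znode name order.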
void StorageReplicatedMergeTree::loadQueue()
{
	Poco::ScopedLock<Poco::FastMutex> lock(queue_mutex);

	Strings children = zookeeper.getChildren(replica_path + "/queue");
	std::sort(children.begin(), children.end());
	for (const String & child : children)
	{
		String s = zookeeper.get(replica_path + "/queue/" + child);
		LogEntry entry = LogEntry::parse(s);
		entry.znode_name = child;
		queue.push_back(entry);
	}
}


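/// Copy new entries from all replicas' logs into this replica's queue and advance the corresponding log pointers.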
void StorageReplicatedMergeTree::pullLogsToQueue()
{
	Poco::ScopedLock<Poco::FastMutex> lock(queue_mutex);

	/// Merge all the logs in chronological order.

	struct LogIterator
	{
		String replica;     /// Replica name.
		UInt64 index;       /// Log entry number (the suffix of the node name).

		Int64 timestamp;    /// Time (czxid) when the log entry was created.
		String entry_str;   /// The entry itself.

		bool operator<(const LogIterator & rhs) const
		{
			/// Inverted, so that the std::priority_queue (a max-heap) yields the oldest entry first.
			return timestamp > rhs.timestamp;
		}

		bool readEntry(zkutil::ZooKeeper & zookeeper, const String & zookeeper_path)
		{
			String index_str = toString(index);
			while (index_str.size() < 10)
				index_str = '0' + index_str;
			zkutil::Stat stat;
			if (!zookeeper.tryGet(zookeeper_path + "/replicas/" + replica + "/log/log-" + index_str, entry_str, &stat))
				return false;
			timestamp = stat.getczxid();
			return true;
		}
	};

	typedef std::priority_queue<LogIterator> PriorityQueue;
	PriorityQueue priority_queue;

	Strings replicas = zookeeper.getChildren(zookeeper_path + "/replicas");

	for (const String & replica : replicas)
	{
		String index_str;
		UInt64 index;

		if (zookeeper.tryGet(replica_path + "/log_pointers/" + replica, index_str))
		{
			index = Poco::NumberParser::parseUnsigned64(index_str);
		}
		else
		{
			/// If we don't have a pointer into this replica's log yet, point it at the first entry.
			Strings entries = zookeeper.getChildren(zookeeper_path + "/replicas/" + replica + "/log");
			std::sort(entries.begin(), entries.end());
			index = entries.empty() ? 0 : Poco::NumberParser::parseUnsigned64(entries[0].substr(strlen("log-")));

			zookeeper.create(replica_path + "/log_pointers/" + replica, toString(index), zkutil::CreateMode::Persistent);
		}

		LogIterator iterator;
		iterator.replica = replica;
		iterator.index = index;

		if (iterator.readEntry(zookeeper, zookeeper_path))
			priority_queue.push(iterator);
	}

	while (!priority_queue.empty())
	{
		LogIterator iterator = priority_queue.top();
		priority_queue.pop();

		LogEntry entry = LogEntry::parse(iterator.entry_str);

		/// Simultaneously add the entry to the queue and advance the log pointer.
		zkutil::Ops ops;
		ops.push_back(new zkutil::Op::Create(
			replica_path + "/queue/queue-", iterator.entry_str, zookeeper.getDefaultACL(), zkutil::CreateMode::PersistentSequential));
		ops.push_back(new zkutil::Op::SetData(
			replica_path + "/log_pointers/" + iterator.replica, toString(iterator.index + 1), -1));
		auto results = zookeeper.multi(ops);

		String path_created = dynamic_cast<zkutil::OpResult::Create &>((*results)[0]).getPathCreated();
		entry.znode_name = path_created.substr(path_created.find_last_of('/') + 1);
		queue.push_back(entry);

		++iterator.index;
		if (iterator.readEntry(zookeeper, zookeeper_path))
			priority_queue.push(iterator);
	}
}


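/// Queue optimization is not implemented yet.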
void StorageReplicatedMergeTree::optimizeQueue()
{
}


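/// Execute one queue entry. Only GET_PART is supported so far: find an active replica that has the part and fetch it.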
void StorageReplicatedMergeTree::executeLogEntry(const LogEntry & entry)
{
	if (entry.type == LogEntry::GET_PART ||
		entry.type == LogEntry::MERGE_PARTS)
	{
		/// If we already have this part or a part covering it, there is nothing to do.
		MergeTreeData::DataPartPtr containing_part = data.getContainingPart(entry.new_part_name);

		/// Even if the part exists locally, in exceptional cases it may be missing from ZooKeeper.
		if (containing_part && zookeeper.exists(replica_path + "/parts/" + containing_part->name))
			return;
	}

	if (entry.type != LogEntry::GET_PART)
		throw Exception("Merging is not implemented.", ErrorCodes::NOT_IMPLEMENTED);

	String replica = findActiveReplicaHavingPart(entry.new_part_name);
	fetchPart(entry.new_part_name, replica);
}


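/// Background thread that periodically pulls new log entries into the queue.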
void StorageReplicatedMergeTree::queueUpdatingThread()
{
	while (!shutdown_called)
	{
		pullLogsToQueue();
		optimizeQueue();
		std::this_thread::sleep_for(QUEUE_UPDATE_SLEEP);
	}
}


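/// Worker thread: takes entries from the queue and executes them; a failed entry is put back at the end of the queue.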
void StorageReplicatedMergeTree::queueThread()
{
	while (!shutdown_called)
	{
		LogEntry entry;
		bool empty;

		{
			Poco::ScopedLock<Poco::FastMutex> lock(queue_mutex);
			empty = queue.empty();
			if (!empty)
			{
				entry = queue.front();
				queue.pop_front();
			}
		}

		if (empty)
		{
			std::this_thread::sleep_for(QUEUE_NO_WORK_SLEEP);
			continue;
		}

		bool success = false;

		try
		{
			executeLogEntry(entry);
			zookeeper.remove(replica_path + "/queue/" + entry.znode_name);

			success = true;
		}
		catch (const Exception & e)
		{
			LOG_ERROR(log, "Code: " << e.code() << ". " << e.displayText() << std::endl
				<< std::endl
				<< "Stack trace:" << std::endl
				<< e.getStackTrace().toString());
		}
		catch (const Poco::Exception & e)
		{
			LOG_ERROR(log, "Poco::Exception: " << e.code() << ". " << e.displayText());
		}
		catch (const std::exception & e)
		{
			LOG_ERROR(log, "std::exception: " << e.what());
		}
		catch (...)
		{
			LOG_ERROR(log, "Unknown exception");
		}

		if (shutdown_called)
			break;

		if (success)
		{
			std::this_thread::sleep_for(QUEUE_AFTER_WORK_SLEEP);
		}
		else
		{
			{
				/// Put the action that we failed to execute back at the end of the queue.
				Poco::ScopedLock<Poco::FastMutex> lock(queue_mutex);
				queue.push_back(entry);
			}
			std::this_thread::sleep_for(QUEUE_ERROR_SLEEP);
		}
	}
}


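/// Pick, uniformly at random, an active replica that has the given part; throw if there is none.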
String StorageReplicatedMergeTree::findActiveReplicaHavingPart(const String & part_name)
{
	Strings replicas = zookeeper.getChildren(zookeeper_path + "/replicas");

	/// Among the replicas that have the part, pick one uniformly at random.
	std::random_shuffle(replicas.begin(), replicas.end());

	for (const String & replica : replicas)
	{
		if (zookeeper.exists(zookeeper_path + "/replicas/" + replica + "/parts/" + part_name) &&
			zookeeper.exists(zookeeper_path + "/replicas/" + replica + "/is_active"))
			return replica;
	}

	throw Exception("No active replica has part " + part_name, ErrorCodes::NO_REPLICA_HAS_PART);
}


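/// Download a part from the given replica over the interserver endpoint, add it to the working set
/// and register it (with its checksums) in ZooKeeper.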
void StorageReplicatedMergeTree::fetchPart(const String & part_name, const String & replica_name)
{
	auto table_lock = lockStructure(true);

	String host;
	int port;

	String host_port_str = zookeeper.get(zookeeper_path + "/replicas/" + replica_name + "/host");
	ReadBufferFromString buf(host_port_str);
	assertString("host: ", buf);
	readString(host, buf);
	assertString("\nport: ", buf);
	readText(port, buf);
	assertString("\n", buf);
	assertEOF(buf);

	MergeTreeData::MutableDataPartPtr part = fetcher.fetchPart(part_name, zookeeper_path + "/replicas/" + replica_name, host, port);
	data.renameTempPartAndAdd(part, nullptr);

	zkutil::Ops ops;
	ops.push_back(new zkutil::Op::Create(
		replica_path + "/parts/" + part->name,
		"",
		zookeeper.getDefaultACL(),
		zkutil::CreateMode::Persistent));
	ops.push_back(new zkutil::Op::Create(
		replica_path + "/parts/" + part->name + "/checksums",
		part->checksums.toString(),
		zookeeper.getDefaultACL(),
		zkutil::CreateMode::Persistent));
	zookeeper.multi(ops);
}


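/// Stop the background threads and release the is_active node and the interserver endpoint.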
void StorageReplicatedMergeTree::shutdown()
{
	if (shutdown_called)
		return;
	shutdown_called = true;
	replica_is_active_node = nullptr;
	endpoint_holder = nullptr;

	LOG_TRACE(log, "Waiting for threads to finish");
	queue_updating_thread.join();
	for (auto & thread : queue_threads)
		thread.join();
	LOG_TRACE(log, "Threads finished");
}


StorageReplicatedMergeTree::~StorageReplicatedMergeTree()
{
	try
	{
		shutdown();
	}
	catch (...)
	{
		tryLogCurrentException("~StorageReplicatedMergeTree");
	}
}


BlockInputStreams StorageReplicatedMergeTree::read(
	const Names & column_names,
	ASTPtr query,
	const Settings & settings,
	QueryProcessingStage::Enum & processed_stage,
	size_t max_block_size,
	unsigned threads)
{
	return reader.read(column_names, query, settings, processed_stage, max_block_size, threads);
}


BlockOutputStreamPtr StorageReplicatedMergeTree::write(ASTPtr query)
{
	String insert_id;
	if (ASTInsertQuery * insert = dynamic_cast<ASTInsertQuery *>(&*query))
		insert_id = insert->insert_id;

	return new ReplicatedMergeTreeBlockOutputStream(*this, insert_id);
}


void StorageReplicatedMergeTree::drop()
{
	shutdown();

	replica_is_active_node = nullptr;
	zookeeper.removeRecursive(replica_path);
	if (zookeeper.getChildren(zookeeper_path + "/replicas").empty())
		zookeeper.removeRecursive(zookeeper_path);
}


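/// Text format of a log/queue entry, as written by writeText() and parsed by readText(). For GET_PART:
///   format version: 1
///   get
///   <part_name>
/// For MERGE_PARTS the "get" line is replaced by "merge", the names of the parts to merge,
/// "into" and the name of the resulting part.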
void StorageReplicatedMergeTree::LogEntry::writeText(WriteBuffer & out) const
{
	writeString("format version: 1\n", out);
	switch (type)
	{
		case GET_PART:
			writeString("get\n", out);
			writeString(new_part_name, out);
			break;
		case MERGE_PARTS:
			writeString("merge\n", out);
			for (const String & s : parts_to_merge)
			{
				writeString(s, out);
				writeString("\n", out);
			}
			writeString("into\n", out);
			writeString(new_part_name, out);
			break;
	}
	writeString("\n", out);
}


void StorageReplicatedMergeTree::LogEntry::readText(ReadBuffer & in)
{
	String type_str;

	assertString("format version: 1\n", in);
	readString(type_str, in);
	assertString("\n", in);

	if (type_str == "get")
	{
		type = GET_PART;
		readString(new_part_name, in);
	}
	else if (type_str == "merge")
	{
		type = MERGE_PARTS;
		while (true)
		{
			String s;
			readString(s, in);
			assertString("\n", in);
			if (s == "into")
				break;
			parts_to_merge.push_back(s);
		}
		readString(new_part_name, in);
	}
	assertString("\n", in);
}

}