mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-10-19 06:50:50 +00:00
457 lines
14 KiB
C++
457 lines
14 KiB
C++
#include <Storages/Distributed/DistributedBlockOutputStream.h>
|
|
#include <Storages/Distributed/DirectoryMonitor.h>
|
|
#include <Storages/StorageDistributed.h>
|
|
|
|
#include <Parsers/formatAST.h>
|
|
#include <Parsers/queryToString.h>
|
|
|
|
#include <IO/WriteBufferFromFile.h>
|
|
#include <IO/CompressedWriteBuffer.h>
|
|
#include <IO/Operators.h>
|
|
#include <IO/WriteBufferFromString.h>
|
|
#include <DataStreams/NativeBlockOutputStream.h>
|
|
#include <DataStreams/RemoteBlockOutputStream.h>
|
|
#include <Interpreters/InterpreterInsertQuery.h>
|
|
#include <Interpreters/createBlockSelector.h>
|
|
|
|
#include <DataTypes/DataTypesNumber.h>
|
|
#include <Common/setThreadName.h>
|
|
#include <Common/ClickHouseRevision.h>
|
|
#include <Common/CurrentMetrics.h>
|
|
#include <Common/typeid_cast.h>
|
|
#include <Common/Exception.h>
|
|
#include <Common/ProfileEvents.h>
|
|
#include <Common/MemoryTracker.h>
|
|
#include <Common/escapeForFileName.h>
|
|
#include <common/logger_useful.h>
|
|
#include <ext/range.h>
|
|
|
|
#include <Poco/DirectoryIterator.h>
|
|
|
|
#include <future>
|
|
#include <condition_variable>
|
|
#include <mutex>
|
|
|
|
namespace CurrentMetrics
|
|
{
|
|
extern const Metric DistributedSend;
|
|
}
|
|
|
|
namespace ProfileEvents
|
|
{
|
|
extern const Event DistributedSyncInsertionTimeoutExceeded;
|
|
}
|
|
|
|
namespace DB
|
|
{
|
|
|
|
|
|
/// Error codes used in this file (defined elsewhere).
namespace ErrorCodes
{
    extern const int TIMEOUT_EXCEEDED;
    /// Thrown by createSelector when the sharding key is not of an integer type.
    extern const int TYPE_MISMATCH;
}
|
|
|
|
|
|
/// Remembers the target storage, the query AST to forward, the cluster to write to,
/// and the synchronous-insert settings (mode flag and timeout in seconds, see writePrefix).
DistributedBlockOutputStream::DistributedBlockOutputStream(
    StorageDistributed & storage,
    const ASTPtr & query_ast,
    const ClusterPtr & cluster_,
    bool insert_sync_,
    UInt64 insert_timeout_)
    : storage(storage)
    , query_ast(query_ast)
    , cluster(cluster_)
    , insert_sync(insert_sync_)
    , insert_timeout(insert_timeout_)
{
}
|
|
|
|
|
|
void DistributedBlockOutputStream::writePrefix()
|
|
{
|
|
deadline = std::chrono::steady_clock::now() + std::chrono::seconds(insert_timeout);
|
|
}
|
|
|
|
|
|
void DistributedBlockOutputStream::write(const Block & block)
|
|
{
|
|
if (insert_sync)
|
|
writeSync(block);
|
|
else
|
|
writeAsync(block);
|
|
}
|
|
|
|
|
|
void DistributedBlockOutputStream::writeAsync(const Block & block)
|
|
{
|
|
if (storage.getShardingKeyExpr() && (cluster->getShardsInfo().size() > 1))
|
|
return writeSplitAsync(block);
|
|
|
|
writeAsyncImpl(block);
|
|
++blocks_inserted;
|
|
}
|
|
|
|
|
|
/** Builds a thread pool job that writes `block` to a single replica of shard `shard_id`.
  *
  * A local address is written through writeToLocal, any other one through writeToShardSync.
  * On success the job sets its flag in context.done_local_jobs / context.done_remote_jobs
  * at index `job_id`. In every case (success or exception) it increments
  * context.finished_jobs_count and notifies context.cond_var; an exception is rethrown
  * after that.
  *
  * `context`, `block` and `address` are captured by reference, so the caller must keep them
  * alive until the job has finished.
  */
ThreadPool::Job DistributedBlockOutputStream::createWritingJob(
    WritingJobContext & context, const Block & block, const Cluster::Address & address, size_t shard_id, size_t job_id)
{
    auto memory_tracker = current_memory_tracker;
    return [this, memory_tracker, & context, & block, & address, shard_id, job_id]()
    {
        /// If the executing thread has no memory tracker, adopt the one of the thread that created the job.
        if (!current_memory_tracker)
        {
            current_memory_tracker = memory_tracker;
            /// Thread names are limited to 15 characters plus the terminating NUL on Linux;
            /// the previous 32-character name did not fit.
            setThreadName("DistrOutStrProc");
        }

        /// NOTE(review): finished_jobs_count is updated here without holding context.mutex, while
        /// waitForUnfinishedJobs evaluates its predicate under that mutex. Verify that a notification
        /// cannot be missed between the predicate check and the wait.
        try
        {
            const auto & shard_info = cluster->getShardsInfo()[shard_id];
            if (address.is_local)
            {
                writeToLocal(block, shard_info.getLocalNodeCount());
                context.done_local_jobs[job_id] = true;
            }
            else
            {
                writeToShardSync(block, shard_info.hasInternalReplication()
                    ? shard_info.dir_name_for_internal_replication
                    : address.toStringFull());
                context.done_remote_jobs[job_id] = true;
            }

            ++context.finished_jobs_count;
            context.cond_var.notify_one();
        }
        catch (...)
        {
            /// A failed job still counts as finished, otherwise the waiter would never wake up.
            ++context.finished_jobs_count;
            context.cond_var.notify_one();
            throw;
        }
    };
}
|
|
|
|
|
|
std::string DistributedBlockOutputStream::getCurrentStateDescription(const WritingJobContext & context)
|
|
{
|
|
const Cluster::ShardsInfo & shards_info = cluster->getShardsInfo();
|
|
String description;
|
|
WriteBufferFromString buffer(description);
|
|
|
|
buffer << "Insertion status:\n";
|
|
|
|
auto writeDescription = [&buffer](const Cluster::Address & address, size_t shard_id, size_t blocks_wrote)
|
|
{
|
|
buffer << "Wrote " << blocks_wrote << " blocks on shard " << shard_id << " replica ";
|
|
buffer << address.toString() << '\n';
|
|
};
|
|
|
|
const auto addresses_with_failovers = cluster->getShardsAddresses();
|
|
|
|
size_t remote_job_id = 0;
|
|
size_t local_job_id = 0;
|
|
for (size_t shard_id : ext::range(0, shards_info.size()))
|
|
{
|
|
const auto & shard_info = shards_info[shard_id];
|
|
/// If hasInternalReplication, than prefer local replica
|
|
if (!shard_info.hasInternalReplication() || !shard_info.isLocal())
|
|
{
|
|
for (const auto & address : addresses_with_failovers[shard_id])
|
|
if (!address.is_local)
|
|
{
|
|
writeDescription(address, shard_id, blocks_inserted + (context.done_remote_jobs[remote_job_id++] ? 1 : 0));
|
|
if (shard_info.hasInternalReplication())
|
|
break;
|
|
}
|
|
}
|
|
|
|
if (shard_info.isLocal())
|
|
{
|
|
const auto & address = shard_info.local_addresses.front();
|
|
writeDescription(address, shard_id, blocks_inserted + (context.done_local_jobs[local_job_id++] ? 1 : 0));
|
|
}
|
|
}
|
|
|
|
return description;
|
|
}
|
|
|
|
|
|
/** Schedules one writing job per target replica, using blocks[i] for shard i.
  *
  * For a shard with internal replication that has a local replica, only the local job is scheduled.
  * Otherwise a job is scheduled for each non-local replica (only the first one when the shard has
  * internal replication), plus one local job if the shard is local.
  * Remote and local jobs are numbered by separate running counters.
  */
void DistributedBlockOutputStream::createWritingJobs(WritingJobContext & context, const Blocks & blocks)
{
    const Cluster::ShardsInfo & shards = cluster->getShardsInfo();
    const auto & replicas_per_shard = cluster->getShardsAddresses();

    size_t next_remote_job = 0;
    size_t next_local_job = 0;

    for (size_t shard_id = 0; shard_id < blocks.size(); ++shard_id)
    {
        const auto & shard = shards[shard_id];
        const bool internal_replication = shard.hasInternalReplication();
        const bool has_local_replica = shard.isLocal();

        /// If the shard replicates internally, the local replica is preferred and remotes are skipped.
        if (!(internal_replication && has_local_replica))
        {
            for (const auto & address : replicas_per_shard[shard_id])
            {
                if (address.is_local)
                    continue;

                pool->schedule(createWritingJob(context, blocks[shard_id], address, shard_id, next_remote_job++));

                if (internal_replication)
                    break;
            }
        }

        if (has_local_replica)
            pool->schedule(createWritingJob(
                context, blocks[shard_id], shard.local_addresses.front(), shard_id, next_local_job++));
    }
}
|
|
|
|
|
|
void DistributedBlockOutputStream::calculateJobsCount()
|
|
{
|
|
remote_jobs_count = 0;
|
|
local_jobs_count = 0;
|
|
|
|
const auto & addresses_with_failovers = cluster->getShardsAddresses();
|
|
|
|
const auto & shards_info = cluster->getShardsInfo();
|
|
for (size_t shard_id : ext::range(0, shards_info.size()))
|
|
{
|
|
const auto & shard_info = shards_info[shard_id];
|
|
/// If hasInternalReplication, than prefer local replica
|
|
if (!shard_info.hasInternalReplication() || !shard_info.isLocal())
|
|
{
|
|
for (const auto & address : addresses_with_failovers[shard_id])
|
|
if (!address.is_local)
|
|
{
|
|
++remote_jobs_count;
|
|
if (shard_info.hasInternalReplication())
|
|
break;
|
|
}
|
|
}
|
|
|
|
local_jobs_count += shard_info.isLocal() ? 1 : 0;
|
|
}
|
|
}
|
|
|
|
|
|
/** Blocks until every scheduled job has reported completion through context.finished_jobs_count.
  *
  * With a non-zero insert_timeout the wait is bounded by `deadline`; if the deadline passes first,
  * the pool is still drained, the timeout profile event is incremented and TIMEOUT_EXCEEDED is thrown.
  * In every non-throwing path the pool is drained with pool->wait() before returning.
  */
void DistributedBlockOutputStream::waitForUnfinishedJobs(WritingJobContext & context)
{
    std::unique_lock<std::mutex> lock(context.mutex);

    size_t expected_jobs = remote_jobs_count + local_jobs_count;
    auto all_finished = [& context, expected_jobs] { return context.finished_jobs_count == expected_jobs; };

    if (!insert_timeout)
    {
        /// No timeout configured: wait for as long as it takes.
        context.cond_var.wait(lock, all_finished);
    }
    else
    {
        const bool finished_in_time = context.cond_var.wait_until(lock, deadline, all_finished);
        if (!finished_in_time)
        {
            pool->wait();
            ProfileEvents::increment(ProfileEvents::DistributedSyncInsertionTimeoutExceeded);
            throw Exception("Timeout exceeded.", ErrorCodes::TIMEOUT_EXCEEDED);
        }
    }

    pool->wait();
}
|
|
|
|
|
|
void DistributedBlockOutputStream::writeSync(const Block & block)
|
|
{
|
|
if (!pool)
|
|
{
|
|
/// Deferred initialization. Only for sync insertion.
|
|
calculateJobsCount();
|
|
pool.emplace(remote_jobs_count + local_jobs_count);
|
|
}
|
|
|
|
WritingJobContext context;
|
|
context.done_remote_jobs.assign(remote_jobs_count, false);
|
|
context.done_local_jobs.assign(local_jobs_count, false);
|
|
context.finished_jobs_count = 0;
|
|
|
|
const Cluster::ShardsInfo & shards_info = cluster->getShardsInfo();
|
|
Blocks blocks = shards_info.size() > 1 ? splitBlock(block) : Blocks({block});
|
|
createWritingJobs(context, blocks);
|
|
|
|
try
|
|
{
|
|
waitForUnfinishedJobs(context);
|
|
}
|
|
catch(Exception & exception)
|
|
{
|
|
exception.addMessage(getCurrentStateDescription(context));
|
|
throw;
|
|
}
|
|
|
|
++blocks_inserted;
|
|
}
|
|
|
|
|
|
/** Evaluates the sharding key on a copy of the block and builds the selector
  * for it from the resulting key column.
  *
  * The block is taken by value because executing the sharding key expression modifies it.
  * Only integer key types (UInt8..UInt64, Int8..Int64) are supported; anything else
  * results in a TYPE_MISMATCH exception.
  */
IColumn::Selector DistributedBlockOutputStream::createSelector(Block block)
{
    storage.getShardingKeyExpr()->execute(block);

    const auto & key_column = block.getByName(storage.getShardingKeyColumnName());
    size_t num_shards = cluster->getShardsInfo().size();
    const auto & slot_to_shard = cluster->getSlotToShard();

/// Returns the selector if the key column has the data type matching TYPE.
#define RETURN_SELECTOR_IF_KEY_IS(TYPE) \
    if (typeid_cast<const DataType ## TYPE *>(key_column.type.get())) \
        return createBlockSelector<TYPE>(*key_column.column, num_shards, slot_to_shard);

    RETURN_SELECTOR_IF_KEY_IS(UInt8)
    RETURN_SELECTOR_IF_KEY_IS(UInt16)
    RETURN_SELECTOR_IF_KEY_IS(UInt32)
    RETURN_SELECTOR_IF_KEY_IS(UInt64)
    RETURN_SELECTOR_IF_KEY_IS(Int8)
    RETURN_SELECTOR_IF_KEY_IS(Int16)
    RETURN_SELECTOR_IF_KEY_IS(Int32)
    RETURN_SELECTOR_IF_KEY_IS(Int64)

#undef RETURN_SELECTOR_IF_KEY_IS

    throw Exception{"Sharding key expression does not evaluate to an integer type", ErrorCodes::TYPE_MISMATCH};
}
|
|
|
|
|
|
/** Splits the block into one block per shard.
  *
  * The selector built by createSelector is used to scatter every column.
  * The result always has exactly as many blocks as the cluster has shards.
  */
Blocks DistributedBlockOutputStream::splitBlock(const Block & block)
{
    auto selector = createSelector(block);

    /// Split block to num_shard smaller block, using 'selector'.
    const size_t num_shards = cluster->getShardsInfo().size();

    /// Start from empty clones so that every resulting block has the structure of the source block.
    Blocks splitted_blocks(num_shards);
    for (size_t shard_idx = 0; shard_idx < num_shards; ++shard_idx)
        splitted_blocks[shard_idx] = block.cloneEmpty();

    const size_t columns_in_block = block.columns();
    for (size_t col_idx_in_block = 0; col_idx_in_block < columns_in_block; ++col_idx_in_block)
    {
        Columns splitted_columns = block.getByPosition(col_idx_in_block).column->scatter(num_shards, selector);
        for (size_t shard_idx = 0; shard_idx < num_shards; ++shard_idx)
            splitted_blocks[shard_idx].getByPosition(col_idx_in_block).column = std::move(splitted_columns[shard_idx]);
    }

    return splitted_blocks;
}
|
|
|
|
|
|
void DistributedBlockOutputStream::writeSplitAsync(const Block & block)
|
|
{
|
|
Blocks splitted_blocks = splitBlock(block);
|
|
const size_t num_shards = splitted_blocks.size();
|
|
|
|
for (size_t shard_idx = 0; shard_idx < num_shards; ++shard_idx)
|
|
if (splitted_blocks[shard_idx].rows())
|
|
writeAsyncImpl(splitted_blocks[shard_idx], shard_idx);
|
|
|
|
++blocks_inserted;
|
|
}
|
|
|
|
|
|
void DistributedBlockOutputStream::writeAsyncImpl(const Block & block, const size_t shard_id)
|
|
{
|
|
const auto & shard_info = cluster->getShardsInfo()[shard_id];
|
|
if (shard_info.getLocalNodeCount() > 0)
|
|
writeToLocal(block, shard_info.getLocalNodeCount());
|
|
|
|
if (shard_info.hasInternalReplication())
|
|
writeToShard(block, {shard_info.dir_name_for_internal_replication});
|
|
else
|
|
{
|
|
std::vector<std::string> dir_names;
|
|
for (const auto & address : cluster->getShardsAddresses()[shard_id])
|
|
if (!address.is_local)
|
|
dir_names.push_back(address.toStringFull());
|
|
writeToShard(block, dir_names);
|
|
}
|
|
}
|
|
|
|
|
|
void DistributedBlockOutputStream::writeToLocal(const Block & block, const size_t repeats)
|
|
{
|
|
InterpreterInsertQuery interp{query_ast, storage.context};
|
|
|
|
auto block_io = interp.execute();
|
|
block_io.out->writePrefix();
|
|
|
|
for (size_t i = 0; i < repeats; ++i)
|
|
block_io.out->write(block);
|
|
|
|
block_io.out->writeSuffix();
|
|
}
|
|
|
|
|
|
void DistributedBlockOutputStream::writeToShardSync(const Block & block, const std::string & connection_pool_name)
|
|
{
|
|
auto pool = storage.requireConnectionPool(connection_pool_name);
|
|
auto connection = pool->get();
|
|
|
|
const auto & query_string = queryToString(query_ast);
|
|
RemoteBlockOutputStream remote{*connection, query_string};
|
|
|
|
CurrentMetrics::Increment metric_increment{CurrentMetrics::DistributedSend};
|
|
|
|
remote.writePrefix();
|
|
remote.write(block);
|
|
remote.writeSuffix();
|
|
}
|
|
|
|
|
|
void DistributedBlockOutputStream::writeToShard(const Block & block, const std::vector<std::string> & dir_names)
|
|
{
|
|
/** tmp directory is used to ensure atomicity of transactions
|
|
* and keep monitor thread out from reading incomplete data
|
|
*/
|
|
std::string first_file_tmp_path{};
|
|
|
|
auto first = true;
|
|
const auto & query_string = queryToString(query_ast);
|
|
|
|
/// write first file, hardlink the others
|
|
for (const auto & dir_name : dir_names)
|
|
{
|
|
const auto & path = storage.getPath() + dir_name + '/';
|
|
|
|
/// ensure shard subdirectory creation and notify storage
|
|
if (Poco::File(path).createDirectory())
|
|
storage.requireDirectoryMonitor(dir_name);
|
|
|
|
const auto & file_name = toString(storage.file_names_increment.get()) + ".bin";
|
|
const auto & block_file_path = path + file_name;
|
|
|
|
/** on first iteration write block to a temporary directory for subsequent hardlinking to ensure
|
|
* the inode is not freed until we're done */
|
|
if (first)
|
|
{
|
|
first = false;
|
|
|
|
const auto & tmp_path = path + "tmp/";
|
|
Poco::File(tmp_path).createDirectory();
|
|
const auto & block_file_tmp_path = tmp_path + file_name;
|
|
|
|
first_file_tmp_path = block_file_tmp_path;
|
|
|
|
WriteBufferFromFile out{block_file_tmp_path};
|
|
CompressedWriteBuffer compress{out};
|
|
NativeBlockOutputStream stream{compress, ClickHouseRevision::get()};
|
|
|
|
writeStringBinary(query_string, out);
|
|
|
|
stream.writePrefix();
|
|
stream.write(block);
|
|
stream.writeSuffix();
|
|
}
|
|
|
|
if (link(first_file_tmp_path.data(), block_file_path.data()))
|
|
throwFromErrno("Could not link " + block_file_path + " to " + first_file_tmp_path);
|
|
}
|
|
|
|
/** remove the temporary file, enabling the OS to reclaim inode after all threads
|
|
* have removed their corresponding files */
|
|
Poco::File(first_file_tmp_path).remove();
|
|
}
|
|
|
|
|
|
}
|