#pragma once

#include <DB/Storages/StorageDistributed.h>

#include <DB/Parsers/formatAST.h>

#include <DB/IO/WriteBufferFromFile.h>
#include <DB/IO/CompressedWriteBuffer.h>
#include <DB/DataStreams/NativeBlockOutputStream.h>

#include <DB/Interpreters/InterpreterInsertQuery.h>
#include <DB/Interpreters/Cluster.h>

#include <DB/Common/Increment.h>

#include <common/Revision.h>

#include <Poco/File.h>

#include <iostream>
#include <memory>
#include <string>
#include <type_traits>
#include <unordered_map>
#include <vector>

namespace DB
{

/** Writes are asynchronous: data is first written to the local filesystem
  * and then sent to the remote servers.
  * If a Distributed table uses more than one shard, then in order for writes to be supported,
  * an additional ENGINE parameter must be specified when creating the table: the sharding key.
  * The sharding key is an arbitrary expression over the columns, e.g. rand() or UserID.
  * On write, a block of data is split by the remainder of dividing the sharding key by the
  * total weight of the shards, and the resulting sub-blocks are written in compressed Native
  * format into separate directories for sending.
  * For each destination address (each directory with data to send), StorageDistributed creates
  * a separate thread that watches the directory and sends the data. */
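/// For example: with two shards of weights 2 and 1, Cluster::slot_to_shard would hold {0, 0, 1}
/// and the total weight would be 3. A row whose sharding key evaluates to 7 falls into
/// slot 7 % 3 == 1, i.e. shard 0; a key of 5 falls into slot 5 % 3 == 2, i.e. shard 1.
/// Uniformly distributed keys are thus split between shards roughly in proportion to the weights.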
class DistributedBlockOutputStream : public IBlockOutputStream
{
public:
    DistributedBlockOutputStream(StorageDistributed & storage, const ASTPtr & query_ast)
        : storage(storage), query_ast(query_ast)
    {
    }

    void write(const Block & block) override
    {
        if (storage.getShardingKeyExpr() && (storage.cluster.getShardsInfo().size() > 1))
            return writeSplit(block);

        writeImpl(block);
    }

private:
    template <typename T>
    static std::vector<IColumn::Filter> createFiltersImpl(const size_t num_rows, const IColumn * column, const Cluster & cluster)
    {
        const auto total_weight = cluster.slot_to_shard.size();
        const auto num_shards = cluster.getShardsInfo().size();
        std::vector<IColumn::Filter> filters(num_shards);

        /** In C++, dividing a negative number by a positive one yields a negative remainder,
          * which does not suit this task. Therefore, signed types are treated as unsigned here.
          * The result no longer resembles division with remainder, but it is suitable for this task.
          */
        using UnsignedT = typename std::make_unsigned<T>::type;
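
        /// For example, with T = Int8 and total_weight = 3: in C++, -1 % 3 == -1, which would be
        /// an invalid slot index; but static_cast<UInt8>(-1) == 255, and 255 % 3 == 0 is a valid
        /// slot. The mapping is not a true mathematical modulo, but it is deterministic and
        /// spreads keys evenly enough for sharding.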

        /// const columns contain only one value, therefore we do not need to read it at every iteration
        if (column->isConst())
        {
            const auto data = typeid_cast<const ColumnConst<T> *>(column)->getData();
            const auto shard_num = cluster.slot_to_shard[static_cast<UnsignedT>(data) % total_weight];

            for (size_t i = 0; i < num_shards; ++i)
                filters[i].assign(num_rows, static_cast<UInt8>(shard_num == i));
        }
        else
        {
            const auto & data = typeid_cast<const ColumnVector<T> *>(column)->getData();

            for (size_t i = 0; i < num_shards; ++i)
            {
                filters[i].resize(num_rows);
                for (size_t j = 0; j < num_rows; ++j)
                    filters[i][j] = cluster.slot_to_shard[static_cast<UnsignedT>(data[j]) % total_weight] == i;
            }
        }

        return filters;
    }

    std::vector<IColumn::Filter> createFilters(Block block)
    {
        using create_filters_sig = std::vector<IColumn::Filter>(size_t, const IColumn *, const Cluster &);
        /// a map from the name of each integral type to the instantiation of createFiltersImpl for that type
        static std::unordered_map<std::string, create_filters_sig *> creators{
            { TypeName<UInt8>::get(), &createFiltersImpl<UInt8> },
            { TypeName<UInt16>::get(), &createFiltersImpl<UInt16> },
            { TypeName<UInt32>::get(), &createFiltersImpl<UInt32> },
            { TypeName<UInt64>::get(), &createFiltersImpl<UInt64> },
            { TypeName<Int8>::get(), &createFiltersImpl<Int8> },
            { TypeName<Int16>::get(), &createFiltersImpl<Int16> },
            { TypeName<Int32>::get(), &createFiltersImpl<Int32> },
            { TypeName<Int64>::get(), &createFiltersImpl<Int64> },
        };
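
        /// Dispatch is by the textual name of the key column's type; any type missing from the map
        /// (e.g. String or Float64) falls through to the TYPE_MISMATCH exception below.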

        storage.getShardingKeyExpr()->execute(block);

        const auto & key_column = block.getByName(storage.getShardingKeyColumnName());

        /// check that the key column has a valid type
        const auto it = creators.find(key_column.type->getName());

        return it != std::end(creators)
            ? (*it->second)(block.rowsInFirstColumn(), key_column.column.get(), storage.cluster)
            : throw Exception{
                "Sharding key expression does not evaluate to an integer type",
                ErrorCodes::TYPE_MISMATCH
            };
    }

    void writeSplit(const Block & block)
    {
        const auto num_cols = block.columns();
        /// cache column pointers for later reuse
        std::vector<const IColumn *> columns(num_cols);
        for (size_t i = 0; i < columns.size(); ++i)
            columns[i] = block.getByPosition(i).column;

        auto filters = createFilters(block);

        const auto num_shards = storage.cluster.getShardsInfo().size();

        ssize_t size_hint = ((block.rowsInFirstColumn() + num_shards - 1) / num_shards) * 1.1;    /// The factor 1.1 was chosen arbitrarily.
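        /// size_hint is passed to IColumn::filter as a reservation hint: each shard is expected to
        /// receive roughly rows / num_shards rows, padded by 10% to reduce reallocations.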

        for (size_t i = 0; i < num_shards; ++i)
        {
            auto target_block = block.cloneEmpty();

            for (size_t col = 0; col < num_cols; ++col)
                target_block.getByPosition(col).column = columns[col]->filter(filters[i], size_hint);

            if (target_block.rowsInFirstColumn())
                writeImpl(target_block, i);
        }
    }

    void writeImpl(const Block & block, const size_t shard_id = 0)
    {
        const auto & shard_info = storage.cluster.getShardsInfo()[shard_id];
        if (shard_info.getLocalNodeCount() > 0)
            writeToLocal(block, shard_info.getLocalNodeCount());

        /// dir_names is empty if the shard has only local addresses
        if (!shard_info.dir_names.empty())
            writeToShard(block, shard_info.dir_names);
    }
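
    /// The block is written to the local table once per local address of the shard
    /// (getLocalNodeCount() repeats), so a shard listed with several local addresses
    /// receives that many copies of the data.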
    void writeToLocal(const Block & block, const size_t repeats)
    {
        InterpreterInsertQuery interp{query_ast, storage.context};

        auto block_io = interp.execute();
        block_io.out->writePrefix();

        for (size_t i = 0; i < repeats; ++i)
            block_io.out->write(block);

        block_io.out->writeSuffix();
    }

    void writeToShard(const Block & block, const std::vector<std::string> & dir_names)
    {
        /** the tmp directory is used to ensure atomicity of transactions
          * and to keep the monitor thread from reading incomplete data
          */
        std::string first_file_tmp_path{};

        auto first = true;
        const auto & query_string = queryToString(query_ast);

        /// write first file, hardlink the others
        for (const auto & dir_name : dir_names)
        {
            const auto & path = storage.getPath() + dir_name + '/';

            /// ensure the shard subdirectory exists; if it was just created, notify the storage
            if (Poco::File(path).createDirectory())
                storage.requireDirectoryMonitor(dir_name);

            const auto & file_name = toString(Increment{path + "increment.txt"}.get(true)) + ".bin";
            const auto & block_file_path = path + file_name;
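
            /// For illustration: with an increment value of 42, block_file_path is
            /// <storage path>/<dir_name>/42.bin, and the file is first materialized
            /// at <storage path>/<dir_name>/tmp/42.bin below.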

            /** on first iteration write block to a temporary directory for subsequent hardlinking
              * to ensure the inode is not freed until we're done */
            if (first)
            {
                first = false;

                const auto & tmp_path = path + "tmp/";
                Poco::File(tmp_path).createDirectory();
                const auto & block_file_tmp_path = tmp_path + file_name;

                first_file_tmp_path = block_file_tmp_path;

                WriteBufferFromFile out{block_file_tmp_path};
                CompressedWriteBuffer compress{out};
                NativeBlockOutputStream stream{compress, Revision::get()};

                writeStringBinary(query_string, out);

                stream.writePrefix();
                stream.write(block);
                stream.writeSuffix();
            }
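
            /// The resulting .bin file thus contains the INSERT query string followed by the block
            /// in compressed Native format; the directory monitor presumably reads it back in the
            /// same order when sending.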

            if (link(first_file_tmp_path.data(), block_file_path.data()))
                throwFromErrno("Could not link " + block_file_path + " to " + first_file_tmp_path);
        }

        /** remove the temporary file, enabling the OS to reclaim the inode after all threads
          * have removed their corresponding files */
        Poco::File(first_file_tmp_path).remove();
    }
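
    /// Because the data file is hardlinked into every destination directory, the inode (and the
    /// data) stays alive until the last directory monitor unlinks its copy; removing the tmp file
    /// above only drops the initial reference.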

    StorageDistributed & storage;
    ASTPtr query_ast;
};

}