ClickHouse/src/Storages/MergeTree/MergeTreePartition.cpp

181 lines
6.4 KiB
C++
Raw Normal View History

#include <Storages/MergeTree/MergeTreePartition.h>
#include <Storages/MergeTree/MergeTreeData.h>
2019-10-10 16:30:30 +00:00
#include <Storages/MergeTree/IMergeTreeDataPart.h>
#include <IO/HashingWriteBuffer.h>
#include <Common/FieldVisitors.h>
#include <DataTypes/DataTypeDate.h>
#include <DataTypes/DataTypeTuple.h>
#include <Columns/ColumnTuple.h>
#include <Common/SipHash.h>
#include <Common/typeid_cast.h>
#include <Common/hex.h>
#include <Core/Block.h>
namespace DB
{
2020-02-25 18:10:48 +00:00
/// Error codes referenced in this file. They are only declared here (`extern`);
/// the definitions live elsewhere.
namespace ErrorCodes
{
/// Thrown by MergeTreePartition::getID when the partition value size does not match the key.
extern const int LOGICAL_ERROR;
}
/// Opens the file `path` on `disk` for reading.
/// The second argument passed to readFile (presumably the buffer size) is the file size,
/// capped at DBMS_DEFAULT_BUFFER_SIZE.
static std::unique_ptr<ReadBufferFromFileBase> openForReading(const DiskPtr & disk, const String & path)
{
    const auto file_size = disk->getFileSize(path);
    const auto capped_size = std::min(size_t(DBMS_DEFAULT_BUFFER_SIZE), file_size);
    return disk->readFile(path, capped_size);
}
/// Convenience overload: takes the partition key sample block from the storage's
/// current in-memory metadata and delegates to getID(const Block &).
String MergeTreePartition::getID(const MergeTreeData & storage) const
{
    const auto metadata_snapshot = storage.getInMemoryMetadataPtr();
    const auto & key_sample = metadata_snapshot->getPartitionKey().sample_block;
    return getID(key_sample);
}
2017-09-13 16:22:04 +00:00
/// Builds the textual partition ID for the stored partition value.
///
/// NOTE: The ID becomes part of part names, which are persisted in ZK and used as directory
/// names on the file system. Any change to this method must stay compatible with existing table data.
///
/// Throws LOGICAL_ERROR if the number of stored fields differs from the number of key columns.
String MergeTreePartition::getID(const Block & partition_key_sample) const
{
    const size_t num_fields = value.size();

    if (num_fields != partition_key_sample.columns())
        throw Exception("Invalid partition key size: " + toString(value.size()), ErrorCodes::LOGICAL_ERROR);

    /// An empty string looks tempting for an empty key, but it would break the directory structure in ZK.
    if (num_fields == 0)
        return "all";

    /// When every partition field is an integer, produce a human-readable ID;
    /// otherwise fall back to a hex-encoded hash of the fields.
    const auto is_integer_field = [](const Field & field)
    {
        const auto field_type = field.getType();
        return field_type == Field::Types::UInt64 || field_type == Field::Types::Int64;
    };

    size_t integer_prefix = 0;
    while (integer_prefix < num_fields && is_integer_field(value[integer_prefix]))
        ++integer_prefix;

    if (integer_prefix == num_fields)
    {
        String readable_id;
        FieldVisitorToString to_string_visitor;

        for (size_t pos = 0; pos < num_fields; ++pos)
        {
            if (pos != 0)
                readable_id += '-';

            /// Date columns are printed as YYYYMMDD. DateTime is deliberately NOT printed as
            /// YYYYMMDDhhmmss, because that would make the partition ID timezone-dependent.
            const auto * column_type = partition_key_sample.getByPosition(pos).type.get();
            if (typeid_cast<const DataTypeDate *>(column_type))
                readable_id += toString(DateLUT::instance().toNumYYYYMMDD(DayNum(value[pos].safeGet<UInt64>())));
            else
                readable_id += applyVisitor(to_string_visitor, value[pos]);
        }

        return readable_id;
    }

    /// Non-integer key: hash all fields and encode the 128-bit result as 32 lowercase hex digits.
    SipHash hash;
    FieldVisitorHash hashing_visitor(hash);
    for (const Field & field : value)
        applyVisitor(hashing_visitor, field);

    char hash_data[16];
    hash.get128(hash_data);

    String hashed_id;
    hashed_id.resize(32);
    for (size_t byte_idx = 0; byte_idx < 16; ++byte_idx)
        writeHexByteLowercase(hash_data[byte_idx], &hashed_id[2 * byte_idx]);

    return hashed_id;
}
void MergeTreePartition::serializeText(const MergeTreeData & storage, WriteBuffer & out, const FormatSettings & format_settings) const
{
2020-06-24 14:12:44 +00:00
auto metadata_snapshot = storage.getInMemoryMetadataPtr();
const auto & partition_key_sample = metadata_snapshot->getPartitionKey().sample_block;
2020-05-20 12:16:55 +00:00
size_t key_size = partition_key_sample.columns();
if (key_size == 0)
{
writeCString("tuple()", out);
}
else if (key_size == 1)
{
2020-05-20 12:16:55 +00:00
const DataTypePtr & type = partition_key_sample.getByPosition(0).type;
auto column = type->createColumn();
column->insert(value[0]);
type->serializeAsText(*column, 0, out, format_settings);
}
else
{
DataTypes types;
Columns columns;
for (size_t i = 0; i < key_size; ++i)
{
2020-05-20 12:16:55 +00:00
const auto & type = partition_key_sample.getByPosition(i).type;
types.push_back(type);
auto column = type->createColumn();
column->insert(value[i]);
columns.push_back(std::move(column));
}
DataTypeTuple tuple_type(types);
auto tuple_column = ColumnTuple::create(columns);
tuple_type.serializeText(*tuple_column, 0, out, format_settings);
}
}
/// Reads the partition value from "partition.dat" under `part_path` on `disk`.
/// Does nothing if the storage has no partition key. Each field is deserialized in binary
/// form by the type of the corresponding partition key column.
void MergeTreePartition::load(const MergeTreeData & storage, const DiskPtr & disk, const String & part_path)
{
    const auto metadata_snapshot = storage.getInMemoryMetadataPtr();
    if (!metadata_snapshot->hasPartitionKey())
        return;

    const auto & key_sample = metadata_snapshot->getPartitionKey().sample_block;
    auto file = openForReading(disk, part_path + "partition.dat");

    const size_t num_key_columns = key_sample.columns();
    value.resize(num_key_columns);
    for (size_t pos = 0; pos < num_key_columns; ++pos)
    {
        const auto & column_type = key_sample.getByPosition(pos).type;
        column_type->deserializeBinary(value[pos], *file);
    }
}
void MergeTreePartition::store(const MergeTreeData & storage, const DiskPtr & disk, const String & part_path, MergeTreeDataPartChecksums & checksums) const
{
2020-06-24 16:06:01 +00:00
auto metadata_snapshot = storage.getInMemoryMetadataPtr();
const auto & partition_key_sample = metadata_snapshot->getPartitionKey().sample_block;
store(partition_key_sample, disk, part_path, checksums);
}
/// Writes the partition value to "partition.dat" under `part_path` on `disk` and records the
/// file's size and hash in `checksums`. Does nothing when `partition_key_sample` evaluates to false.
void MergeTreePartition::store(const Block & partition_key_sample, const DiskPtr & disk, const String & part_path, MergeTreeDataPartChecksums & checksums) const
{
    if (!partition_key_sample)
        return;

    auto out = disk->writeFile(part_path + "partition.dat");
    HashingWriteBuffer out_hashing(*out);

    /// Each field is serialized in binary form by the type of the matching key column.
    size_t pos = 0;
    for (const auto & field : value)
    {
        partition_key_sample.getByPosition(pos).type->serializeBinary(field, out_hashing);
        ++pos;
    }

    /// next() is called before count()/getHash() are read, as in the original statement order.
    out_hashing.next();

    auto & file_checksum = checksums.files["partition.dat"];
    file_checksum.file_size = out_hashing.count();
    file_checksum.file_hash = out_hashing.getHash();

    out->finalize();
}
2020-06-26 11:30:23 +00:00
void MergeTreePartition::create(const StorageMetadataPtr & metadata_snapshot, Block block, size_t row)
{
2020-06-26 11:30:23 +00:00
if (!metadata_snapshot->hasPartitionKey())
return;
2020-06-26 11:30:23 +00:00
const auto & partition_key = metadata_snapshot->getPartitionKey();
partition_key.expression->execute(block);
size_t partition_columns_num = partition_key.sample_block.columns();
value.resize(partition_columns_num);
for (size_t i = 0; i < partition_columns_num; ++i)
{
const auto & column_name = partition_key.sample_block.getByPosition(i).name;
const auto & partition_column = block.getByName(column_name).column;
partition_column->get(row, value[i]);
}
}
}