ClickHouse/src/Storages/MergeTree/ReplicatedMergeTreeLogEntry.cpp
Nicolae Vartolomei 0381c634d4
Add support for user defined identifier on log entries
Sometimes we want to push a log entry once and only once. Because it is
not possible to create a sequential node in ZooKeeper and store its name
to a well known location in the same transaction we'll do it in the
other order. First somehow generate a unique identifier, then submit a
log entry with that identifier. Later, we can search through log entries
using the identifier we provided to find the node.

Required for part movement between shards.
2021-09-17 15:32:35 +01:00

508 lines
16 KiB
C++

#include <Common/ZooKeeper/Types.h>
#include "Access/IAccessEntity.h"
#include <Storages/MergeTree/ReplicatedMergeTreeLogEntry.h>
#include <Storages/MergeTree/ReplicatedMergeTreeTableMetadata.h>
#include <Storages/MergeTree/MergeTreePartInfo.h>
#include <IO/Operators.h>
#include <IO/ReadBufferFromString.h>
#include <IO/WriteBufferFromString.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
namespace DB
{
namespace ErrorCodes
{
extern const int UNKNOWN_FORMAT_VERSION;
extern const int LOGICAL_ERROR;
}
enum FormatVersion : UInt8
{
FORMAT_WITH_CREATE_TIME = 2,
FORMAT_WITH_BLOCK_ID = 3,
FORMAT_WITH_DEDUPLICATE = 4,
FORMAT_WITH_UUID = 5,
FORMAT_WITH_DEDUPLICATE_BY_COLUMNS = 6,
FORMAT_WITH_LOG_ENTRY_ID = 7,
FORMAT_LAST
};
void ReplicatedMergeTreeLogEntryData::writeText(WriteBuffer & out) const
{
UInt8 format_version = FORMAT_WITH_DEDUPLICATE;
if (!deduplicate_by_columns.empty())
format_version = std::max<UInt8>(format_version, FORMAT_WITH_DEDUPLICATE_BY_COLUMNS);
/// Conditionally bump format_version only when uuid has been assigned.
/// If some other feature requires bumping format_version to >= 5 then this code becomes no-op.
if (new_part_uuid != UUIDHelpers::Nil)
format_version = std::max<UInt8>(format_version, FORMAT_WITH_UUID);
if (!log_entry_id.empty())
format_version = std::max<UInt8>(format_version, FORMAT_WITH_LOG_ENTRY_ID);
out << "format version: " << format_version << "\n"
<< "create_time: " << LocalDateTime(create_time ? create_time : time(nullptr)) << "\n"
<< "source replica: " << source_replica << '\n'
<< "block_id: " << escape << block_id << '\n';
if (format_version >= FORMAT_WITH_LOG_ENTRY_ID)
out << "log_entry_id: " << escape << log_entry_id << '\n';
switch (type)
{
case GET_PART:
out << "get\n" << new_part_name;
break;
case CLONE_PART_FROM_SHARD:
out << "clone_part_from_shard\n"
<< new_part_name << "\n"
<< "source_shard: " << source_shard;
break;
case ATTACH_PART:
out << "attach\n" << new_part_name << "\n"
<< "part_checksum: " << part_checksum;
break;
case MERGE_PARTS:
out << "merge\n";
for (const String & s : source_parts)
out << s << '\n';
out << "into\n" << new_part_name;
out << "\ndeduplicate: " << deduplicate;
if (merge_type != MergeType::REGULAR)
out <<"\nmerge_type: " << static_cast<UInt64>(merge_type);
if (new_part_uuid != UUIDHelpers::Nil)
out << "\ninto_uuid: " << new_part_uuid;
if (!deduplicate_by_columns.empty())
{
out << "\ndeduplicate_by_columns: ";
for (size_t i = 0; i < deduplicate_by_columns.size(); ++i)
{
out << quote << deduplicate_by_columns[i];
if (i != deduplicate_by_columns.size() - 1)
out << ",";
}
}
break;
case DROP_RANGE:
if (detach)
out << "detach\n";
else
out << "drop\n";
out << new_part_name;
break;
/// NOTE: Deprecated.
case CLEAR_COLUMN:
out << "clear_column\n"
<< escape << column_name
<< "\nfrom\n"
<< new_part_name;
break;
/// NOTE: Deprecated.
case CLEAR_INDEX:
out << "clear_index\n"
<< escape << index_name
<< "\nfrom\n"
<< new_part_name;
break;
case REPLACE_RANGE:
out << typeToString(REPLACE_RANGE) << "\n";
replace_range_entry->writeText(out);
break;
case MUTATE_PART:
out << "mutate\n"
<< source_parts.at(0) << "\n"
<< "to\n"
<< new_part_name;
if (new_part_uuid != UUIDHelpers::Nil)
out << "\nto_uuid\n"
<< new_part_uuid;
if (isAlterMutation())
out << "\nalter_version\n" << alter_version;
break;
case ALTER_METADATA: /// Just make local /metadata and /columns consistent with global
out << "alter\n";
out << "alter_version\n";
out << alter_version<< "\n";
out << "have_mutation\n";
out << have_mutation << "\n";
out << "columns_str_size:\n";
out << columns_str.size() << "\n";
out << columns_str << "\n";
out << "metadata_str_size:\n";
out << metadata_str.size() << "\n";
out << metadata_str;
break;
case SYNC_PINNED_PART_UUIDS:
out << "sync_pinned_part_uuids\n";
break;
default:
throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown log entry type: {}", static_cast<int>(type));
}
out << '\n';
if (new_part_type != MergeTreeDataPartType::WIDE && new_part_type != MergeTreeDataPartType::UNKNOWN)
out << "part_type: " << new_part_type.toString() << "\n";
if (quorum)
out << "quorum: " << quorum << '\n';
}
void ReplicatedMergeTreeLogEntryData::readText(ReadBuffer & in)
{
UInt8 format_version = 0;
String type_str;
in >> "format version: " >> format_version >> "\n";
if (format_version < 1 || format_version >= FORMAT_LAST)
throw Exception(ErrorCodes::UNKNOWN_FORMAT_VERSION, "Unknown ReplicatedMergeTreeLogEntry format version: {}",
DB::toString(format_version));
if (format_version >= FORMAT_WITH_CREATE_TIME)
{
LocalDateTime create_time_dt;
in >> "create_time: " >> create_time_dt >> "\n";
create_time = DateLUT::instance().makeDateTime(
create_time_dt.year(), create_time_dt.month(), create_time_dt.day(),
create_time_dt.hour(), create_time_dt.minute(), create_time_dt.second());
}
in >> "source replica: " >> source_replica >> "\n";
if (format_version >= FORMAT_WITH_BLOCK_ID)
{
in >> "block_id: " >> escape >> block_id >> "\n";
}
if (format_version >= FORMAT_WITH_LOG_ENTRY_ID)
in >> "log_entry_id: " >> escape >> log_entry_id >> "\n";
in >> type_str >> "\n";
bool trailing_newline_found = false;
if (type_str == "get")
{
type = GET_PART;
in >> new_part_name;
}
else if (type_str == "attach")
{
type = ATTACH_PART;
in >> new_part_name >> "\npart_checksum: " >> part_checksum;
}
else if (type_str == "merge")
{
type = MERGE_PARTS;
while (true)
{
String s;
in >> s >> "\n";
if (s == "into")
break;
source_parts.push_back(s);
}
in >> new_part_name;
if (format_version >= FORMAT_WITH_DEDUPLICATE)
{
in >> "\ndeduplicate: " >> deduplicate;
/// Trying to be more backward compatible
while (!trailing_newline_found)
{
in >> "\n";
if (checkString("merge_type: ", in))
{
UInt64 value;
in >> value;
merge_type = checkAndGetMergeType(value);
}
else if (checkString("into_uuid: ", in))
in >> new_part_uuid;
else if (checkString("deduplicate_by_columns: ", in))
{
Strings new_deduplicate_by_columns;
for (;;)
{
String tmp_column_name;
in >> quote >> tmp_column_name;
new_deduplicate_by_columns.emplace_back(std::move(tmp_column_name));
if (!checkString(",", in))
break;
}
deduplicate_by_columns = std::move(new_deduplicate_by_columns);
}
else
trailing_newline_found = true;
}
}
}
else if (type_str == "drop" || type_str == "detach")
{
type = DROP_RANGE;
detach = type_str == "detach";
in >> new_part_name;
}
else if (type_str == "clear_column") /// NOTE: Deprecated.
{
type = CLEAR_COLUMN;
in >> escape >> column_name >> "\nfrom\n" >> new_part_name;
}
else if (type_str == "clear_index") /// NOTE: Deprecated.
{
type = CLEAR_INDEX;
in >> escape >> index_name >> "\nfrom\n" >> new_part_name;
}
else if (type_str == typeToString(REPLACE_RANGE))
{
type = REPLACE_RANGE;
replace_range_entry = std::make_shared<ReplaceRangeEntry>();
replace_range_entry->readText(in);
}
else if (type_str == "mutate")
{
type = MUTATE_PART;
String source_part;
in >> source_part >> "\n"
>> "to\n"
>> new_part_name;
source_parts.push_back(source_part);
while (!trailing_newline_found)
{
in >> "\n";
if (checkString("alter_version\n", in))
in >> alter_version;
else if (checkString("to_uuid\n", in))
in >> new_part_uuid;
else
trailing_newline_found = true;
}
}
else if (type_str == "alter")
{
type = ALTER_METADATA;
in >> "alter_version\n";
in >> alter_version;
in >> "\nhave_mutation\n";
in >> have_mutation;
in >> "\ncolumns_str_size:\n";
size_t columns_size;
in >> columns_size >> "\n";
columns_str.resize(columns_size);
in.readStrict(&columns_str[0], columns_size);
in >> "\nmetadata_str_size:\n";
size_t metadata_size;
in >> metadata_size >> "\n";
metadata_str.resize(metadata_size);
in.readStrict(&metadata_str[0], metadata_size);
}
else if (type_str == "sync_pinned_part_uuids")
{
type = SYNC_PINNED_PART_UUIDS;
}
else if (type_str == "clone_part_from_shard")
{
type = CLONE_PART_FROM_SHARD;
in >> new_part_name;
in >> "\nsource_shard: " >> source_shard;
}
if (!trailing_newline_found)
in >> "\n";
if (checkString("part_type: ", in))
{
String part_type_str;
in >> type_str;
new_part_type.fromString(type_str);
in >> "\n";
}
else
new_part_type = MergeTreeDataPartType::WIDE;
/// Optional field.
if (!in.eof())
in >> "quorum: " >> quorum >> "\n";
}
void ReplicatedMergeTreeLogEntryData::ReplaceRangeEntry::writeText(WriteBuffer & out) const
{
out << "drop_range_name: " << drop_range_part_name << "\n";
out << "from_database: " << escape << from_database << "\n";
out << "from_table: " << escape << from_table << "\n";
out << "source_parts: ";
writeQuoted(src_part_names, out);
out << "\n";
out << "new_parts: ";
writeQuoted(new_part_names, out);
out << "\n";
out << "part_checksums: ";
writeQuoted(part_names_checksums, out);
out << "\n";
out << "columns_version: " << columns_version;
}
void ReplicatedMergeTreeLogEntryData::ReplaceRangeEntry::readText(ReadBuffer & in)
{
in >> "drop_range_name: " >> drop_range_part_name >> "\n";
in >> "from_database: " >> escape >> from_database >> "\n";
in >> "from_table: " >> escape >> from_table >> "\n";
in >> "source_parts: ";
readQuoted(src_part_names, in);
in >> "\n";
in >> "new_parts: ";
readQuoted(new_part_names, in);
in >> "\n";
in >> "part_checksums: ";
readQuoted(part_names_checksums, in);
in >> "\n";
in >> "columns_version: " >> columns_version;
}
bool ReplicatedMergeTreeLogEntryData::ReplaceRangeEntry::isMovePartitionOrAttachFrom(const MergeTreePartInfo & drop_range_info)
{
assert(drop_range_info.getBlocksCount() != 0);
return drop_range_info.getBlocksCount() == 1;
}
String ReplicatedMergeTreeLogEntryData::toString() const
{
WriteBufferFromOwnString out;
writeText(out);
return out.str();
}
ReplicatedMergeTreeLogEntry::Ptr ReplicatedMergeTreeLogEntry::parse(const String & s, const Coordination::Stat & stat)
{
ReadBufferFromString in(s);
Ptr res = std::make_shared<ReplicatedMergeTreeLogEntry>();
res->readText(in);
assertEOF(in);
if (!res->create_time)
res->create_time = stat.ctime / 1000;
return res;
}
std::optional<String> ReplicatedMergeTreeLogEntryData::getDropRange(MergeTreeDataFormatVersion format_version) const
{
if (type == DROP_RANGE)
return new_part_name;
if (type == REPLACE_RANGE)
{
auto drop_range_info = MergeTreePartInfo::fromPartName(replace_range_entry->drop_range_part_name, format_version);
if (!ReplaceRangeEntry::isMovePartitionOrAttachFrom(drop_range_info))
{
/// It's REPLACE, not MOVE or ATTACH, so drop range is real
return replace_range_entry->drop_range_part_name;
}
}
return {};
}
bool ReplicatedMergeTreeLogEntryData::isDropPart(MergeTreeDataFormatVersion format_version) const
{
if (type == DROP_RANGE)
{
auto drop_range_info = MergeTreePartInfo::fromPartName(new_part_name, format_version);
return !drop_range_info.isFakeDropRangePart();
}
return false;
}
Strings ReplicatedMergeTreeLogEntryData::getVirtualPartNames(MergeTreeDataFormatVersion format_version) const
{
/// Doesn't produce any part
if (type == ALTER_METADATA)
return {};
/// DROP_RANGE does not add a real part, but we must disable merges in that range
if (type == DROP_RANGE)
{
auto drop_range_part_info = MergeTreePartInfo::fromPartName(new_part_name, format_version);
/// It's DROP PART and we don't want to add it into virtual parts
/// because it can lead to intersecting parts on stale replicas and this
/// problem is fundamental. So we have very weak guarantees for DROP
/// PART. If any concurrent merge will be assigned then DROP PART will
/// delete nothing and part will be successfully merged into bigger part.
///
/// dropPart used in the following cases:
/// 1) Remove empty parts after TTL.
/// 2) Remove parts after move between shards.
/// 3) User queries: ALTER TABLE DROP PART 'part_name'.
///
/// In the first case merge of empty part is even better than DROP. In
/// the second case part UUIDs used to forbid merges for moding parts so
/// there is no problem with concurrent merges. The third case is quite
/// rare and we give very weak guarantee: there will be no active part
/// with this name, but possibly it was merged to some other part.
if (!drop_range_part_info.isFakeDropRangePart())
return {};
return {new_part_name};
}
if (type == REPLACE_RANGE)
{
Strings res = replace_range_entry->new_part_names;
auto drop_range_info = MergeTreePartInfo::fromPartName(replace_range_entry->drop_range_part_name, format_version);
if (auto drop_range = getDropRange(format_version))
res.emplace_back(*drop_range);
return res;
}
/// Doesn't produce any part.
if (type == SYNC_PINNED_PART_UUIDS)
return {};
/// Doesn't produce any part by itself.
if (type == CLONE_PART_FROM_SHARD)
return {};
return {new_part_name};
}
}