diff --git a/dbms/src/Storages/MergeTree/MergeTreeData.h b/dbms/src/Storages/MergeTree/MergeTreeData.h index 189c1b4618d..a01740e62a8 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeData.h +++ b/dbms/src/Storages/MergeTree/MergeTreeData.h @@ -46,10 +46,10 @@ namespace ErrorCodes /// The date column is specified. For each part min and max dates are remembered. /// Essentially it is an index too. /// -/// Data is partitioned by month. Parts belonging to different months are not merged - for the ease of -/// administration (data sync and backup). +/// Data is partitioned by the value of the partitioning expression. +/// Parts belonging to different partitions are not merged - for the ease of administration (data sync and backup). /// -/// File structure: +/// File structure of old-style month-partitioned tables (format_version = 0): /// Part directory - / min-date _ max-date _ min-id _ max-id _ level / /// Inside the part directory: /// checksums.txt - contains the list of all files along with their sizes and checksums. @@ -58,6 +58,13 @@ namespace ErrorCodes /// [Column].bin - contains compressed column data. /// [Column].mrk - marks, pointing to seek positions allowing to skip n * k rows. /// +/// File structure of tables with custom partitioning (format_version >= 1): +/// Part directory - / partition-id _ min-id _ max-id _ level / +/// Inside the part directory: +/// The same files as for month-partitioned tables, plus +/// partition.dat - contains the value of the partitioning expression +/// minmax_[Column].idx - MinMax indexes (see MergeTreeDataPart::MinMaxIndex class) for the columns required by the partitioning expression. +/// /// Several modes are implemented. 
Modes determine additional actions during merge: /// - Ordinary - don't do anything special /// - Collapsing - collapse pairs of rows with the opposite values of sign_columns for the same values diff --git a/dbms/src/Storages/MergeTree/MergeTreeDataFormatVersion.h b/dbms/src/Storages/MergeTree/MergeTreeDataFormatVersion.h index 74f26fcdf45..e08bfd44656 100644 --- a/dbms/src/Storages/MergeTree/MergeTreeDataFormatVersion.h +++ b/dbms/src/Storages/MergeTree/MergeTreeDataFormatVersion.h @@ -1,5 +1,6 @@ #pragma once +#include #include namespace DB diff --git a/dbms/src/Storages/MergeTree/MergeTreePartition.cpp b/dbms/src/Storages/MergeTree/MergeTreePartition.cpp index 4bf7bb352de..e33896a3cdf 100644 --- a/dbms/src/Storages/MergeTree/MergeTreePartition.cpp +++ b/dbms/src/Storages/MergeTree/MergeTreePartition.cpp @@ -19,17 +19,18 @@ static ReadBufferFromFile openForReading(const String & path) return ReadBufferFromFile(path, std::min(static_cast(DBMS_DEFAULT_BUFFER_SIZE), Poco::File(path).getSize())); } +/// NOTE: This ID is used to create part names which are then persisted in ZK and as directory names on the file system. +/// So if you want to change this method, be sure to guarantee compatibility with existing table data. String MergeTreePartition::getID(const MergeTreeData & storage) const { if (value.size() != storage.partition_expr_columns.size()) throw Exception("Invalid partition key size: " + toString(value.size()), ErrorCodes::LOGICAL_ERROR); if (value.empty()) - return "all"; + return "all"; /// It is tempting to use an empty string here. But that would break directory structure in ZK. - /// In case all partition fields are represented by integral types, try to produce a human-readable partition id. + /// In case all partition fields are represented by integral types, try to produce a human-readable ID. /// Otherwise use a hex-encoded hash. 
- bool are_all_integral = true; for (const Field & field : value) { @@ -51,9 +52,12 @@ String MergeTreePartition::getID(const MergeTreeData & storage) const result += '-'; if (typeid_cast(storage.partition_expr_column_types[i].get())) - result += toString(DateLUT::instance().toNumYYYYMMDD(DayNum_t(value[i].get()))); + result += toString(DateLUT::instance().toNumYYYYMMDD(DayNum_t(value[i].safeGet()))); else result += applyVisitor(to_string_visitor, value[i]); + + /// It is tempting to output DateTime as YYYYMMDDhhmmss, but that would make partition ID + /// timezone-dependent. } return result; diff --git a/dbms/src/Storages/MergeTree/MergeTreePartition.h b/dbms/src/Storages/MergeTree/MergeTreePartition.h index 65263520e83..d32b5f4401a 100644 --- a/dbms/src/Storages/MergeTree/MergeTreePartition.h +++ b/dbms/src/Storages/MergeTree/MergeTreePartition.h @@ -10,6 +10,7 @@ namespace DB class MergeTreeData; struct MergeTreeDataPartChecksums; +/// This class represents a partition value of a single part and encapsulates its loading/storing logic. struct MergeTreePartition { Row value;