2019-10-10 16:30:30 +00:00
# include "IMergeTreeDataPart.h"
2022-10-23 03:29:26 +00:00
# include "Storages/MergeTree/IDataPartStorage.h"
2019-10-10 16:30:30 +00:00
# include <optional>
2021-12-08 02:40:59 +00:00
# include <boost/algorithm/string/join.hpp>
2022-01-03 23:04:56 +00:00
# include <string_view>
2020-02-27 16:47:40 +00:00
# include <Core/Defines.h>
# include <IO/HashingWriteBuffer.h>
2021-12-08 02:40:59 +00:00
# include <IO/HashingReadBuffer.h>
2020-02-27 16:47:40 +00:00
# include <IO/ReadBufferFromString.h>
2019-10-10 16:30:30 +00:00
# include <IO/ReadHelpers.h>
# include <IO/WriteHelpers.h>
# include <Storages/MergeTree/MergeTreeData.h>
2020-03-19 16:37:55 +00:00
# include <Storages/MergeTree/localBackup.h>
2020-07-16 10:54:49 +00:00
# include <Storages/MergeTree/checkDataPart.h>
2021-01-14 16:26:56 +00:00
# include <Storages/StorageReplicatedMergeTree.h>
2022-01-05 11:51:50 +00:00
# include <Storages/MergeTree/PartMetadataManagerOrdinary.h>
# include <Storages/MergeTree/PartMetadataManagerWithCache.h>
2022-07-21 14:47:19 +00:00
# include <Core/NamesAndTypes.h>
# include <Storages/ColumnsDescription.h>
2020-02-27 16:47:40 +00:00
# include <Common/StringUtils/StringUtils.h>
# include <Common/escapeForFileName.h>
2021-01-15 12:28:53 +00:00
# include <Common/CurrentMetrics.h>
2021-01-02 09:47:38 +00:00
# include <Common/FieldVisitorsAccurateComparison.h>
2022-01-10 19:39:10 +00:00
# include <Common/MemoryTrackerBlockerInThread.h>
2021-10-02 07:13:14 +00:00
# include <base/JSON.h>
2022-04-27 15:05:45 +00:00
# include <Common/logger_useful.h>
2020-08-28 09:07:20 +00:00
# include <Compression/getCompressionCodecForFile.h>
2022-01-10 19:01:41 +00:00
# include <Parsers/parseQuery.h>
2020-08-28 09:07:20 +00:00
# include <Parsers/queryToString.h>
2022-01-10 19:01:41 +00:00
# include <Parsers/ExpressionElementParsers.h>
2020-11-27 11:00:33 +00:00
# include <DataTypes/NestedUtils.h>
2021-05-30 13:57:30 +00:00
# include <DataTypes/DataTypeAggregateFunction.h>
2022-02-14 19:50:08 +00:00
# include <Interpreters/MergeTreeTransaction.h>
2022-02-17 21:26:37 +00:00
# include <Interpreters/TransactionLog.h>
2019-10-10 16:30:30 +00:00
2023-04-07 20:54:49 +00:00
# include <Disks/IO/CachedOnDiskReadBufferFromFile.h>
2021-01-15 12:28:53 +00:00
/// Metrics that are only declared here; their definitions live elsewhere.
namespace CurrentMetrics
{
    /// Metrics counting data parts by state.
    extern const Metric PartsTemporary;
    extern const Metric PartsPreCommitted;
    extern const Metric PartsCommitted;
    extern const Metric PartsPreActive;
    extern const Metric PartsActive;
    extern const Metric PartsOutdated;
    extern const Metric PartsDeleting;
    extern const Metric PartsDeleteOnDestroy;

    /// Metrics counting data parts by part type.
    extern const Metric PartsWide;
    extern const Metric PartsCompact;
    extern const Metric PartsInMemory;
}
2019-10-10 16:30:30 +00:00
namespace DB
{
2021-01-14 16:26:56 +00:00
2019-10-10 16:30:30 +00:00
/// Error codes referenced in this file; they are only declared here.
namespace ErrorCodes
{
    extern const int CANNOT_READ_ALL_DATA;
    extern const int LOGICAL_ERROR;
    extern const int NO_FILE_IN_DATA_PART;
    extern const int EXPECTED_END_OF_FILE;
    extern const int CORRUPTED_DATA;
    extern const int NOT_FOUND_EXPECTED_DATA_PART;
    extern const int BAD_SIZE_OF_FILE_IN_DATA_PART;
    extern const int BAD_TTL_FILE;
    extern const int NOT_IMPLEMENTED;
    extern const int NO_SUCH_COLUMN_IN_TABLE;
}
2022-09-05 04:31:19 +00:00
2022-01-07 10:37:08 +00:00
void IMergeTreeDataPart : : MinMaxIndex : : load ( const MergeTreeData & data , const PartMetadataManagerPtr & manager )
2019-10-10 16:30:30 +00:00
{
2021-03-02 10:33:54 +00:00
auto metadata_snapshot = data . getInMemoryMetadataPtr ( ) ;
const auto & partition_key = metadata_snapshot - > getPartitionKey ( ) ;
auto minmax_column_names = data . getMinMaxColumnsNames ( partition_key ) ;
auto minmax_column_types = data . getMinMaxColumnsTypes ( partition_key ) ;
size_t minmax_idx_size = minmax_column_types . size ( ) ;
2021-09-16 21:19:58 +00:00
2020-03-10 14:56:55 +00:00
hyperrectangle . reserve ( minmax_idx_size ) ;
2019-10-10 16:30:30 +00:00
for ( size_t i = 0 ; i < minmax_idx_size ; + + i )
{
2022-01-05 11:51:50 +00:00
String file_name = " minmax_ " + escapeForFileName ( minmax_column_names [ i ] ) + " .idx " ;
auto file = manager - > read ( file_name ) ;
2021-03-09 14:46:52 +00:00
auto serialization = minmax_column_types [ i ] - > getDefaultSerialization ( ) ;
2019-10-10 16:30:30 +00:00
Field min_val ;
2022-12-02 12:57:11 +00:00
serialization - > deserializeBinary ( min_val , * file , { } ) ;
2019-10-10 16:30:30 +00:00
Field max_val ;
2022-12-02 12:57:11 +00:00
serialization - > deserializeBinary ( max_val , * file , { } ) ;
2019-10-10 16:30:30 +00:00
2021-01-02 09:47:38 +00:00
// NULL_LAST
if ( min_val . isNull ( ) )
2021-08-27 14:09:15 +00:00
min_val = POSITIVE_INFINITY ;
2021-01-02 09:47:38 +00:00
if ( max_val . isNull ( ) )
2021-08-27 14:09:15 +00:00
max_val = POSITIVE_INFINITY ;
2021-01-02 09:47:38 +00:00
2020-03-10 14:56:55 +00:00
hyperrectangle . emplace_back ( min_val , true , max_val , true ) ;
2019-10-10 16:30:30 +00:00
}
initialized = true ;
}
2022-02-01 10:36:51 +00:00
/// Convenience overload: resolves the min-max column names and types from the partition key
/// of the table's in-memory metadata and forwards to the overload that takes them explicitly.
IMergeTreeDataPart::MinMaxIndex::WrittenFiles IMergeTreeDataPart::MinMaxIndex::store(
    const MergeTreeData & data, IDataPartStorage & part_storage, Checksums & out_checksums) const
{
    /// The snapshot is held in a local because the partition key is taken from it by reference.
    const auto snapshot = data.getInMemoryMetadataPtr();
    const auto & key_description = snapshot->getPartitionKey();

    const auto column_names = data.getMinMaxColumnsNames(key_description);
    const auto column_types = data.getMinMaxColumnsTypes(key_description);

    return store(column_names, column_types, part_storage, out_checksums);
}
2022-02-01 10:36:51 +00:00
/// Writes one file per column holding the left and right bound of the corresponding range.
///
/// Every file is written through a hashing buffer; its size and hash are recorded in
/// `out_checksums`. The underlying buffers are pre-finalized and returned to the caller.
/// Throws LOGICAL_ERROR if the index has not been initialized.
IMergeTreeDataPart::MinMaxIndex::WrittenFiles IMergeTreeDataPart::MinMaxIndex::store(
    const Names & column_names,
    const DataTypes & data_types,
    IDataPartStorage & part_storage,
    Checksums & out_checksums) const
{
    if (!initialized)
        throw Exception(
            ErrorCodes::LOGICAL_ERROR,
            " Attempt to store uninitialized MinMax index for part {}. This is a bug ",
            part_storage.getFullPath());

    WrittenFiles result;
    const size_t columns_count = column_names.size();
    for (size_t idx = 0; idx != columns_count; ++idx)
    {
        const String file_name = " minmax_ " + escapeForFileName(column_names[idx]) + " .idx ";
        const auto serialization = data_types.at(idx)->getDefaultSerialization();
        const auto & range = hyperrectangle[idx];

        auto out = part_storage.writeFile(file_name, DBMS_DEFAULT_BUFFER_SIZE, {});
        HashingWriteBuffer hashing_out(*out);
        serialization->serializeBinary(range.left, hashing_out, {});
        serialization->serializeBinary(range.right, hashing_out, {});
        hashing_out.next();

        auto & checksum = out_checksums.files[file_name];
        checksum.file_size = hashing_out.count();
        checksum.file_hash = hashing_out.getHash();

        out->preFinalize();
        result.emplace_back(std::move(out));
    }

    return result;
}
void IMergeTreeDataPart : : MinMaxIndex : : update ( const Block & block , const Names & column_names )
{
if ( ! initialized )
2020-03-10 14:56:55 +00:00
hyperrectangle . reserve ( column_names . size ( ) ) ;
2019-10-10 16:30:30 +00:00
for ( size_t i = 0 ; i < column_names . size ( ) ; + + i )
{
2020-04-02 17:27:07 +00:00
FieldRef min_value ;
FieldRef max_value ;
2019-10-10 16:30:30 +00:00
const ColumnWithTypeAndName & column = block . getByName ( column_names [ i ] ) ;
2021-01-02 09:47:38 +00:00
if ( const auto * column_nullable = typeid_cast < const ColumnNullable * > ( column . column . get ( ) ) )
column_nullable - > getExtremesNullLast ( min_value , max_value ) ;
else
column . column - > getExtremes ( min_value , max_value ) ;
2019-10-10 16:30:30 +00:00
if ( ! initialized )
2020-03-10 14:56:55 +00:00
hyperrectangle . emplace_back ( min_value , true , max_value , true ) ;
2019-10-10 16:30:30 +00:00
else
{
2021-01-02 09:47:38 +00:00
hyperrectangle [ i ] . left
= applyVisitor ( FieldVisitorAccurateLess ( ) , hyperrectangle [ i ] . left , min_value ) ? hyperrectangle [ i ] . left : min_value ;
hyperrectangle [ i ] . right
= applyVisitor ( FieldVisitorAccurateLess ( ) , hyperrectangle [ i ] . right , max_value ) ? max_value : hyperrectangle [ i ] . right ;
2019-10-10 16:30:30 +00:00
}
}
initialized = true ;
}
void IMergeTreeDataPart : : MinMaxIndex : : merge ( const MinMaxIndex & other )
{
if ( ! other . initialized )
return ;
if ( ! initialized )
{
2020-03-10 14:56:55 +00:00
hyperrectangle = other . hyperrectangle ;
2019-10-10 16:30:30 +00:00
initialized = true ;
}
else
{
2020-03-10 14:56:55 +00:00
for ( size_t i = 0 ; i < hyperrectangle . size ( ) ; + + i )
2019-10-10 16:30:30 +00:00
{
2020-03-10 14:56:55 +00:00
hyperrectangle [ i ] . left = std : : min ( hyperrectangle [ i ] . left , other . hyperrectangle [ i ] . left ) ;
hyperrectangle [ i ] . right = std : : max ( hyperrectangle [ i ] . right , other . hyperrectangle [ i ] . right ) ;
2019-10-10 16:30:30 +00:00
}
}
}
2021-12-08 02:40:59 +00:00
void IMergeTreeDataPart : : MinMaxIndex : : appendFiles ( const MergeTreeData & data , Strings & files )
{
auto metadata_snapshot = data . getInMemoryMetadataPtr ( ) ;
const auto & partition_key = metadata_snapshot - > getPartitionKey ( ) ;
auto minmax_column_names = data . getMinMaxColumnsNames ( partition_key ) ;
size_t minmax_idx_size = minmax_column_names . size ( ) ;
for ( size_t i = 0 ; i < minmax_idx_size ; + + i )
{
String file_name = " minmax_ " + escapeForFileName ( minmax_column_names [ i ] ) + " .idx " ;
files . push_back ( file_name ) ;
}
}
2019-10-10 16:30:30 +00:00
2022-08-12 11:03:57 +00:00
/// Increments the metric(s) associated with the given part state.
/// PreActive and Active each bump two metrics (the current name and PartsPreCommitted / PartsCommitted).
static void incrementStateMetric(MergeTreeDataPartState state)
{
    switch (state)
    {
        case MergeTreeDataPartState::Temporary:
            CurrentMetrics::add(CurrentMetrics::PartsTemporary);
            break;

        case MergeTreeDataPartState::PreActive:
            CurrentMetrics::add(CurrentMetrics::PartsPreActive);
            CurrentMetrics::add(CurrentMetrics::PartsPreCommitted);
            break;

        case MergeTreeDataPartState::Active:
            CurrentMetrics::add(CurrentMetrics::PartsActive);
            CurrentMetrics::add(CurrentMetrics::PartsCommitted);
            break;

        case MergeTreeDataPartState::Outdated:
            CurrentMetrics::add(CurrentMetrics::PartsOutdated);
            break;

        case MergeTreeDataPartState::Deleting:
            CurrentMetrics::add(CurrentMetrics::PartsDeleting);
            break;

        case MergeTreeDataPartState::DeleteOnDestroy:
            CurrentMetrics::add(CurrentMetrics::PartsDeleteOnDestroy);
            break;
    }
}
2022-08-12 11:03:57 +00:00
/// Decrements the metric(s) associated with the given part state.
/// Mirrors incrementStateMetric: PreActive and Active each touch two metrics.
static void decrementStateMetric(MergeTreeDataPartState state)
{
    switch (state)
    {
        case MergeTreeDataPartState::Temporary:
            CurrentMetrics::sub(CurrentMetrics::PartsTemporary);
            break;

        case MergeTreeDataPartState::PreActive:
            CurrentMetrics::sub(CurrentMetrics::PartsPreActive);
            CurrentMetrics::sub(CurrentMetrics::PartsPreCommitted);
            break;

        case MergeTreeDataPartState::Active:
            CurrentMetrics::sub(CurrentMetrics::PartsActive);
            CurrentMetrics::sub(CurrentMetrics::PartsCommitted);
            break;

        case MergeTreeDataPartState::Outdated:
            CurrentMetrics::sub(CurrentMetrics::PartsOutdated);
            break;

        case MergeTreeDataPartState::Deleting:
            CurrentMetrics::sub(CurrentMetrics::PartsDeleting);
            break;

        case MergeTreeDataPartState::DeleteOnDestroy:
            CurrentMetrics::sub(CurrentMetrics::PartsDeleteOnDestroy);
            break;
    }
}
2021-01-15 12:28:53 +00:00
2021-01-21 18:17:00 +00:00
/// Increments the metric associated with the given part type. Unknown has no metric.
static void incrementTypeMetric(MergeTreeDataPartType type)
{
    switch (type.getValue())
    {
        case MergeTreeDataPartType::Wide:
            CurrentMetrics::add(CurrentMetrics::PartsWide);
            break;

        case MergeTreeDataPartType::Compact:
            CurrentMetrics::add(CurrentMetrics::PartsCompact);
            break;

        case MergeTreeDataPartType::InMemory:
            CurrentMetrics::add(CurrentMetrics::PartsInMemory);
            break;

        case MergeTreeDataPartType::Unknown:
            break;
    }
}
/// Decrements the metric associated with the given part type. Unknown has no metric.
static void decrementTypeMetric(MergeTreeDataPartType type)
{
    switch (type.getValue())
    {
        case MergeTreeDataPartType::Wide:
            CurrentMetrics::sub(CurrentMetrics::PartsWide);
            break;

        case MergeTreeDataPartType::Compact:
            CurrentMetrics::sub(CurrentMetrics::PartsCompact);
            break;

        case MergeTreeDataPartType::InMemory:
            CurrentMetrics::sub(CurrentMetrics::PartsInMemory);
            break;

        case MergeTreeDataPartType::Unknown:
            break;
    }
}
2019-10-10 16:30:30 +00:00
/// Constructs a data part bound to `storage_` and backed by `data_part_storage_`.
///
/// A part that has a parent part starts in the Active state. The state and type metrics are
/// incremented for the initial state/type, an empty min-max index is created, and the index
/// granularity info and the part metadata manager are initialized.
IMergeTreeDataPart::IMergeTreeDataPart(
    const MergeTreeData & storage_,
    const String & name_,
    const MergeTreePartInfo & info_,
    const MutableDataPartStoragePtr & data_part_storage_,
    Type part_type_,
    const IMergeTreeDataPart * parent_part_)
    : DataPartStorageHolder(data_part_storage_)
    , storage(storage_)
    , name(name_)
    , info(info_)
    , index_granularity_info(storage_, part_type_)
    , part_type(part_type_)
    , parent_part(parent_part_)
    , use_metadata_cache(storage.use_metadata_cache)
{
    if (parent_part != nullptr)
        state = MergeTreeDataPartState::Active;

    /// Must run after the state has been settled above.
    incrementStateMetric(state);
    incrementTypeMetric(part_type);

    minmax_idx = std::make_shared<MinMaxIndex>();

    initializeIndexGranularityInfo();
    initializePartMetadataManager();
}
/// Releases the metric slots taken for the part's current state and type.
IMergeTreeDataPart::~IMergeTreeDataPart()
{
    decrementStateMetric(state);
    decrementTypeMetric(part_type);
}
2019-10-10 16:30:30 +00:00
/// Returns the part name that corresponds to `new_part_info`, in the naming scheme
/// selected by the storage format version.
String IMergeTreeDataPart::getNewName(const MergeTreePartInfo & new_part_info) const
{
    const bool old_format = storage.format_version < MERGE_TREE_DATA_MIN_FORMAT_VERSION_WITH_CUSTOM_PARTITIONING;
    if (!old_format)
        return new_part_info.getPartNameV1();

    /// NOTE: the min and max dates are taken from the current part name rather than from the part
    /// data, so that the name of a merged part is determined only by the names of its source parts.
    /// That is simpler when the real min and max dates for the block range can change
    /// (e.g. after an ALTER DELETE command).
    DayNum min_date;
    DayNum max_date;
    MergeTreePartInfo::parseMinMaxDatesFromPartName(name, min_date, max_date);
    return new_part_info.getPartNameV0(min_date, max_date);
}
2019-12-03 00:23:11 +00:00
std : : optional < size_t > IMergeTreeDataPart : : getColumnPosition ( const String & column_name ) const
2019-10-31 14:44:17 +00:00
{
2019-12-25 20:06:16 +00:00
auto it = column_name_to_position . find ( column_name ) ;
if ( it = = column_name_to_position . end ( ) )
2019-12-03 00:23:11 +00:00
return { } ;
2019-12-25 20:06:16 +00:00
return it - > second ;
2019-12-18 16:41:11 +00:00
}
2019-10-31 14:44:17 +00:00
2021-01-15 12:15:13 +00:00
2022-08-12 11:03:57 +00:00
void IMergeTreeDataPart : : setState ( MergeTreeDataPartState new_state ) const
2021-01-15 12:15:13 +00:00
{
2021-01-21 18:17:00 +00:00
decrementStateMetric ( state ) ;
2021-01-15 12:15:13 +00:00
state = new_state ;
2021-01-21 18:17:00 +00:00
incrementStateMetric ( state ) ;
2021-01-15 12:15:13 +00:00
}
2022-08-12 11:03:57 +00:00
/// Returns the current state of the part.
MergeTreeDataPartState IMergeTreeDataPart::getState() const
{
    return state;
}
2021-03-22 22:16:41 +00:00
std : : pair < DayNum , DayNum > IMergeTreeDataPart : : getMinMaxDate ( ) const
2019-10-10 16:30:30 +00:00
{
2021-09-16 21:19:58 +00:00
if ( storage . minmax_idx_date_column_pos ! = - 1 & & minmax_idx - > initialized )
2021-03-22 22:16:41 +00:00
{
2021-09-16 21:19:58 +00:00
const auto & hyperrectangle = minmax_idx - > hyperrectangle [ storage . minmax_idx_date_column_pos ] ;
2021-03-22 22:16:41 +00:00
return { DayNum ( hyperrectangle . left . get < UInt64 > ( ) ) , DayNum ( hyperrectangle . right . get < UInt64 > ( ) ) } ;
}
2019-10-10 16:30:30 +00:00
else
2021-03-22 22:16:41 +00:00
return { } ;
2019-10-10 16:30:30 +00:00
}
2021-03-22 22:16:41 +00:00
std : : pair < time_t , time_t > IMergeTreeDataPart : : getMinMaxTime ( ) const
2019-10-10 16:30:30 +00:00
{
2021-09-16 21:19:58 +00:00
if ( storage . minmax_idx_time_column_pos ! = - 1 & & minmax_idx - > initialized )
2021-03-22 22:16:41 +00:00
{
2021-09-16 21:19:58 +00:00
const auto & hyperrectangle = minmax_idx - > hyperrectangle [ storage . minmax_idx_time_column_pos ] ;
2019-10-10 16:30:30 +00:00
2021-03-22 22:16:41 +00:00
/// The case of DateTime
if ( hyperrectangle . left . getType ( ) = = Field : : Types : : UInt64 )
{
assert ( hyperrectangle . right . getType ( ) = = Field : : Types : : UInt64 ) ;
return { hyperrectangle . left . get < UInt64 > ( ) , hyperrectangle . right . get < UInt64 > ( ) } ;
}
/// The case of DateTime64
else if ( hyperrectangle . left . getType ( ) = = Field : : Types : : Decimal64 )
{
2021-03-23 23:03:14 +00:00
assert ( hyperrectangle . right . getType ( ) = = Field : : Types : : Decimal64 ) ;
2019-10-10 16:30:30 +00:00
2021-03-22 22:16:41 +00:00
auto left = hyperrectangle . left . get < DecimalField < Decimal64 > > ( ) ;
auto right = hyperrectangle . right . get < DecimalField < Decimal64 > > ( ) ;
assert ( left . getScale ( ) = = right . getScale ( ) ) ;
return { left . getValue ( ) / left . getScaleMultiplier ( ) , right . getValue ( ) / right . getScaleMultiplier ( ) } ;
}
else
throw Exception ( ErrorCodes : : LOGICAL_ERROR , " Part minmax index by time is neither DateTime or DateTime64 " ) ;
}
2019-10-10 16:30:30 +00:00
else
2021-03-22 22:16:41 +00:00
return { } ;
2019-10-10 16:30:30 +00:00
}
2021-03-22 22:16:41 +00:00
2023-02-27 11:27:57 +00:00
void IMergeTreeDataPart : : setColumns ( const NamesAndTypesList & new_columns , const SerializationInfoByName & new_infos , int32_t metadata_version_ )
2019-11-18 12:22:27 +00:00
{
2019-12-25 20:06:16 +00:00
columns = new_columns ;
2022-07-27 14:05:16 +00:00
serialization_infos = new_infos ;
2023-02-27 11:27:57 +00:00
metadata_version = metadata_version_ ;
2021-10-29 17:21:02 +00:00
2019-12-25 20:06:16 +00:00
column_name_to_position . clear ( ) ;
column_name_to_position . reserve ( new_columns . size ( ) ) ;
size_t pos = 0 ;
2022-07-21 14:47:19 +00:00
2022-07-27 14:05:16 +00:00
for ( const auto & column : columns )
{
2021-11-02 03:03:52 +00:00
column_name_to_position . emplace ( column . name , pos + + ) ;
2022-09-02 12:48:27 +00:00
2022-07-27 14:05:16 +00:00
auto it = serialization_infos . find ( column . name ) ;
auto serialization = it = = serialization_infos . end ( )
? IDataType : : getSerialization ( column )
: IDataType : : getSerialization ( column , * it - > second ) ;
serializations . emplace ( column . name , serialization ) ;
IDataType : : forEachSubcolumn ( [ & ] ( const auto & , const auto & subname , const auto & subdata )
{
auto full_name = Nested : : concatenateName ( column . name , subname ) ;
serializations . emplace ( full_name , subdata . serialization ) ;
2022-09-02 12:48:27 +00:00
} , ISerialization : : SubstreamData ( serialization ) ) ;
2022-07-27 14:05:16 +00:00
}
2022-07-26 17:31:56 +00:00
columns_description = ColumnsDescription ( columns ) ;
2023-01-25 22:19:05 +00:00
columns_description_with_collected_nested = ColumnsDescription ( Nested : : collect ( columns ) ) ;
2022-01-21 00:20:41 +00:00
}
2021-10-29 17:21:02 +00:00
2022-07-21 14:47:19 +00:00
/// Looks up `column_name` among the physical columns and subcolumns of the part's
/// column description and returns it.
NameAndTypePair IMergeTreeDataPart::getColumn(const String & column_name) const
{
    const auto kind = GetColumnsOptions::AllPhysical;
    return columns_description.getColumnOrSubcolumn(kind, column_name);
}
2022-07-21 14:47:19 +00:00
std : : optional < NameAndTypePair > IMergeTreeDataPart : : tryGetColumn ( const String & column_name ) const
2021-11-01 02:40:43 +00:00
{
2022-07-21 14:47:19 +00:00
return columns_description . tryGetColumnOrSubcolumn ( GetColumnsOptions : : AllPhysical , column_name ) ;
2022-01-21 00:20:41 +00:00
}
2021-10-29 17:21:02 +00:00
2022-07-27 14:05:16 +00:00
/// Returns the serialization registered for `column_name`.
/// Throws NO_SUCH_COLUMN_IN_TABLE if none is registered.
SerializationPtr IMergeTreeDataPart::getSerialization(const String & column_name) const
{
    if (auto found = tryGetSerialization(column_name))
        return found;

    throw Exception(ErrorCodes::NO_SUCH_COLUMN_IN_TABLE,
        " There is no column or subcolumn {} in part {} ", column_name, name);
}
2022-07-27 14:05:16 +00:00
/// Returns the serialization registered for `column_name`, or nullptr if there is none.
SerializationPtr IMergeTreeDataPart::tryGetSerialization(const String & column_name) const
{
    const auto found = serializations.find(column_name);
    if (found == serializations.end())
        return nullptr;

    return found->second;
}
2019-11-18 12:22:27 +00:00
void IMergeTreeDataPart : : removeIfNeeded ( )
2019-10-10 16:30:30 +00:00
{
2022-02-17 21:26:37 +00:00
assert ( assertHasValidVersionMetadata ( ) ) ;
2022-08-12 11:03:57 +00:00
if ( ! is_temp & & state ! = MergeTreeDataPartState : : DeleteOnDestroy )
2021-12-10 13:29:51 +00:00
return ;
2019-10-31 14:44:17 +00:00
2022-08-09 11:02:52 +00:00
std : : string path ;
2021-12-10 13:29:51 +00:00
try
{
2022-10-23 03:29:26 +00:00
path = getDataPartStorage ( ) . getRelativePath ( ) ;
2019-10-31 14:44:17 +00:00
2022-10-23 03:29:26 +00:00
if ( ! getDataPartStorage ( ) . exists ( ) ) // path
2021-12-10 13:29:51 +00:00
return ;
2019-10-10 16:30:30 +00:00
2021-12-10 13:29:51 +00:00
if ( is_temp )
{
2023-06-05 18:22:41 +00:00
const auto & part_directory = getDataPartStorage ( ) . getPartDirectory ( ) ;
String file_name = fileName ( part_directory ) ;
2019-10-10 16:30:30 +00:00
2021-12-10 13:29:51 +00:00
if ( file_name . empty ( ) )
2023-01-23 21:13:58 +00:00
throw Exception ( ErrorCodes : : LOGICAL_ERROR , " relative_path {} of part {} is invalid or not set " ,
getDataPartStorage ( ) . getPartDirectory ( ) , name ) ;
2019-10-10 16:30:30 +00:00
2023-06-05 18:22:41 +00:00
const auto part_parent_directory = directoryPath ( part_directory ) ;
bool is_moving_part = part_parent_directory . ends_with ( " moving/ " ) ;
if ( ! startsWith ( file_name , " tmp " ) & & ! endsWith ( file_name , " .tmp_proj " ) & & ! is_moving_part )
2021-06-09 12:36:47 +00:00
{
2021-12-10 13:29:51 +00:00
LOG_ERROR (
storage . log ,
" ~DataPart() should remove part {} but its name doesn't start with \" tmp \" or end with \" .tmp_proj \" . Too "
" suspicious, keeping the part. " ,
path ) ;
return ;
2021-06-09 12:36:47 +00:00
}
2023-06-05 18:22:41 +00:00
if ( is_moving_part )
{
LOG_TRACE ( storage . log , " Removing unneeded moved part from {} " , path ) ;
}
2021-12-10 13:29:51 +00:00
}
2019-11-19 09:38:17 +00:00
2022-04-19 19:34:41 +00:00
remove ( ) ;
2021-12-10 13:29:51 +00:00
2022-08-12 11:03:57 +00:00
if ( state = = MergeTreeDataPartState : : DeleteOnDestroy )
2019-10-31 14:44:17 +00:00
{
2021-12-10 13:29:51 +00:00
LOG_TRACE ( storage . log , " Removed part from old location {} " , path ) ;
2019-10-31 14:44:17 +00:00
}
}
2021-12-10 13:29:51 +00:00
catch ( . . . )
{
2022-09-02 11:54:41 +00:00
tryLogCurrentException ( __PRETTY_FUNCTION__ , fmt : : format ( " while removing part {} with path {} " , name , path ) ) ;
2021-12-10 13:29:51 +00:00
/// FIXME If part it temporary, then directory will not be removed for 1 day (temporary_directories_lifetime).
/// If it's tmp_merge_<part_name> or tmp_fetch_<part_name>,
/// then all future attempts to execute part producing operation will fail with "directory already exists".
}
2019-10-10 16:30:30 +00:00
}
/// Returns the sum of byteSize() over all columns of `index`.
UInt64 IMergeTreeDataPart::getIndexSizeInBytes() const
{
    UInt64 total_bytes = 0;
    for (const auto & index_column : index)
        total_bytes += index_column->byteSize();

    return total_bytes;
}
/// Returns the sum of allocatedBytes() over all columns of `index`.
UInt64 IMergeTreeDataPart::getIndexSizeInAllocatedBytes() const
{
    UInt64 total_allocated = 0;
    for (const auto & index_column : index)
        total_allocated += index_column->allocatedBytes();

    return total_allocated;
}
2022-08-12 11:03:57 +00:00
void IMergeTreeDataPart : : assertState ( const std : : initializer_list < MergeTreeDataPartState > & affordable_states ) const
2019-10-10 16:30:30 +00:00
{
if ( ! checkState ( affordable_states ) )
{
String states_str ;
for ( auto affordable_state : affordable_states )
2021-09-06 14:24:03 +00:00
{
states_str + = stateString ( affordable_state ) ;
states_str + = ' ' ;
}
2019-10-10 16:30:30 +00:00
2022-12-28 17:59:24 +00:00
if ( ! states_str . empty ( ) )
states_str . pop_back ( ) ;
2023-01-23 21:13:58 +00:00
throw Exception ( ErrorCodes : : NOT_FOUND_EXPECTED_DATA_PART , " Unexpected state of part {}. Expected: {} " , getNameWithState ( ) , states_str ) ;
2019-10-10 16:30:30 +00:00
}
}
void IMergeTreeDataPart : : assertOnDisk ( ) const
{
if ( ! isStoredOnDisk ( ) )
2023-01-23 21:13:58 +00:00
throw Exception ( ErrorCodes : : LOGICAL_ERROR , " Data part '{}' with type '{}' is not stored on disk " ,
name , getType ( ) . toString ( ) ) ;
2019-10-10 16:30:30 +00:00
}
/// Returns the number of marks reported by the part's index granularity.
UInt64 IMergeTreeDataPart::getMarksCount() const
{
    return index_granularity.getMarksCount();
}
size_t IMergeTreeDataPart : : getFileSizeOrZero ( const String & file_name ) const
{
auto checksum = checksums . files . find ( file_name ) ;
if ( checksum = = checksums . files . end ( ) )
return 0 ;
return checksum - > second . file_size ;
}
2022-07-21 14:47:19 +00:00
/// Returns the name of the physical column (optionally including subcolumns) that has files in
/// this part and the smallest compressed data size. On ties the first such column wins.
/// Throws LOGICAL_ERROR if no column qualifies.
String IMergeTreeDataPart::getColumnNameWithMinimumCompressedSize(bool with_subcolumns) const
{
    const auto options = GetColumnsOptions(GetColumnsOptions::AllPhysical).withSubcolumns(with_subcolumns);
    const auto candidates = columns_description.get(options);

    std::optional<std::string> best_name;
    UInt64 best_size = std::numeric_limits<UInt64>::max();

    for (const auto & candidate : candidates)
    {
        if (hasColumnFiles(candidate))
        {
            const auto compressed_size = getColumnSize(candidate.name).data_compressed;

            /// Strict comparison: an earlier column with the same size is kept.
            if (compressed_size < best_size)
            {
                best_size = compressed_size;
                best_name = candidate.name;
            }
        }
    }

    if (best_name)
        return *best_name;

    throw Exception(ErrorCodes::LOGICAL_ERROR, " Could not find a column of minimum size in MergeTree, part {} ",
        getDataPartStorage().getFullPath());
}
2019-11-18 15:18:50 +00:00
void IMergeTreeDataPart : : loadColumnsChecksumsIndexes ( bool require_columns_checksums , bool check_consistency )
2019-10-31 14:44:17 +00:00
{
assertOnDisk ( ) ;
/// Memory should not be limited during ATTACH TABLE query.
/// This is already true at the server startup but must be also ensured for manual table ATTACH.
/// Motivation: memory for index is shared between queries - not belong to the query itself.
2022-07-26 15:22:00 +00:00
MemoryTrackerBlockerInThread temporarily_disable_memory_tracker ;
2019-10-31 14:44:17 +00:00
2022-03-31 02:10:05 +00:00
try
{
loadUUID ( ) ;
loadColumns ( require_columns_checksums ) ;
loadChecksums ( require_columns_checksums ) ;
loadIndexGranularity ( ) ;
calculateColumnsAndSecondaryIndicesSizesOnDisk ( ) ;
loadIndex ( ) ; /// Must be called after loadIndexGranularity as it uses the value of `index_granularity`
loadRowsCount ( ) ; /// Must be called after loadIndexGranularity() as it uses the value of `index_granularity`.
loadPartitionAndMinMaxIndex ( ) ;
if ( ! parent_part )
{
loadTTLInfos ( ) ;
2023-05-22 15:25:59 +00:00
loadProjections ( require_columns_checksums , check_consistency , false /* if_not_loaded */ ) ;
2022-03-31 02:10:05 +00:00
}
if ( check_consistency )
checkConsistency ( require_columns_checksums ) ;
loadDefaultCompressionCodec ( ) ;
}
catch ( . . . )
{
// There could be conditions that data part to be loaded is broken, but some of meta infos are already written
// into meta data before exception, need to clean them all.
metadata_manager - > deleteAll ( /*include_projection*/ true ) ;
metadata_manager - > assertAllDeleted ( /*include_projection*/ true ) ;
throw ;
2021-02-10 14:12:49 +00:00
}
2019-10-31 14:44:17 +00:00
}
2021-12-08 02:40:59 +00:00
void IMergeTreeDataPart : : appendFilesOfColumnsChecksumsIndexes ( Strings & files , bool include_projection ) const
{
if ( isStoredOnDisk ( ) )
{
appendFilesOfUUID ( files ) ;
appendFilesOfColumns ( files ) ;
appendFilesOfChecksums ( files ) ;
appendFilesOfIndexGranularity ( files ) ;
2022-01-04 05:41:11 +00:00
appendFilesOfIndex ( files ) ;
2021-12-08 02:40:59 +00:00
appendFilesOfRowsCount ( files ) ;
appendFilesOfPartitionAndMinMaxIndex ( files ) ;
appendFilesOfTTLInfos ( files ) ;
appendFilesOfDefaultCompressionCodec ( files ) ;
2023-02-27 11:27:57 +00:00
appendFilesOfMetadataVersion ( files ) ;
2021-12-08 02:40:59 +00:00
}
if ( ! parent_part & & include_projection )
{
for ( const auto & [ projection_name , projection_part ] : projection_parts )
{
Strings projection_files ;
projection_part - > appendFilesOfColumnsChecksumsIndexes ( projection_files , true ) ;
for ( const auto & projection_file : projection_files )
2022-04-07 17:44:49 +00:00
files . push_back ( fs : : path ( projection_part - > name + " .proj " ) / projection_file ) ;
2021-12-08 02:40:59 +00:00
}
}
}
2023-01-25 17:34:09 +00:00
/// Creates a builder for a projection part of this part.
///
/// The projection storage is obtained from this part's storage under the projection name plus an
/// extension chosen by `is_temp_projection`. The builder gets a fixed part info and this part
/// as parent.
MergeTreeDataPartBuilder IMergeTreeDataPart::getProjectionPartBuilder(const String & projection_name, bool is_temp_projection)
{
    const char * projection_extension = " .proj ";
    if (is_temp_projection)
        projection_extension = " .tmp_proj ";

    auto projection_storage = getDataPartStorage().getProjection(projection_name + projection_extension, !is_temp_projection);

    MergeTreeDataPartBuilder builder(storage, projection_name, projection_storage);
    return builder.withPartInfo({" all ", 0, 0, 0}).withParentPart(this);
}
void IMergeTreeDataPart : : addProjectionPart (
const String & projection_name ,
std : : shared_ptr < IMergeTreeDataPart > & & projection_part )
{
2023-05-22 15:25:59 +00:00
if ( hasProjection ( projection_name ) )
throw Exception ( ErrorCodes : : LOGICAL_ERROR , " Projection part {} in part {} is already loaded. This is a bug " , projection_name , name ) ;
2023-01-26 13:09:35 +00:00
projection_parts [ projection_name ] = std : : move ( projection_part ) ;
2023-01-25 17:34:09 +00:00
}
2023-05-22 15:25:59 +00:00
void IMergeTreeDataPart : : loadProjections ( bool require_columns_checksums , bool check_consistency , bool if_not_loaded )
2021-02-10 14:12:49 +00:00
{
auto metadata_snapshot = storage . getInMemoryMetadataPtr ( ) ;
for ( const auto & projection : metadata_snapshot - > projections )
{
2023-01-25 17:34:09 +00:00
auto path = projection . name + " .proj " ;
2022-10-23 03:29:26 +00:00
if ( getDataPartStorage ( ) . exists ( path ) )
2021-02-10 14:12:49 +00:00
{
2023-05-22 15:25:59 +00:00
if ( hasProjection ( projection . name ) )
{
if ( ! if_not_loaded )
throw Exception (
ErrorCodes : : LOGICAL_ERROR , " Projection part {} in part {} is already loaded. This is a bug " , projection . name , name ) ;
}
else
{
auto part = getProjectionPartBuilder ( projection . name ) . withPartFormatFromDisk ( ) . build ( ) ;
part - > loadColumnsChecksumsIndexes ( require_columns_checksums , check_consistency ) ;
addProjectionPart ( projection . name , std : : move ( part ) ) ;
}
2021-02-10 14:12:49 +00:00
}
}
}
2019-10-31 14:44:17 +00:00
void IMergeTreeDataPart : : loadIndexGranularity ( )
{
2023-01-23 21:13:58 +00:00
throw Exception ( ErrorCodes : : NOT_IMPLEMENTED ,
" Method 'loadIndexGranularity' is not implemented for part with type {} " , getType ( ) . toString ( ) ) ;
2019-10-31 14:44:17 +00:00
}
2022-03-23 04:13:42 +00:00
/// Currently we don't cache mark files of part, because cache other meta files is enough to speed up loading.
2021-12-08 02:40:59 +00:00
void IMergeTreeDataPart : : appendFilesOfIndexGranularity ( Strings & /* files */ ) const
{
}
2019-10-31 14:44:17 +00:00
void IMergeTreeDataPart : : loadIndex ( )
{
/// It can be empty in case of mutations
if ( ! index_granularity . isInitialized ( ) )
2023-01-23 21:13:58 +00:00
throw Exception ( ErrorCodes : : LOGICAL_ERROR , " Index granularity is not loaded before index loading " ) ;
2019-10-31 14:44:17 +00:00
2020-06-17 12:39:20 +00:00
auto metadata_snapshot = storage . getInMemoryMetadataPtr ( ) ;
2021-02-10 14:12:49 +00:00
if ( parent_part )
metadata_snapshot = metadata_snapshot - > projections . get ( name ) . metadata ;
2020-06-17 12:39:20 +00:00
const auto & primary_key = metadata_snapshot - > getPrimaryKey ( ) ;
2020-05-21 19:46:03 +00:00
size_t key_size = primary_key . column_names . size ( ) ;
2019-10-31 14:44:17 +00:00
if ( key_size )
{
MutableColumns loaded_index ;
loaded_index . resize ( key_size ) ;
for ( size_t i = 0 ; i < key_size ; + + i )
{
2020-05-20 18:11:38 +00:00
loaded_index [ i ] = primary_key . data_types [ i ] - > createColumn ( ) ;
2019-10-31 14:44:17 +00:00
loaded_index [ i ] - > reserve ( index_granularity . getMarksCount ( ) ) ;
}
2022-10-23 03:29:26 +00:00
String index_name = " primary " + getIndexExtensionFromFilesystem ( getDataPartStorage ( ) ) . value ( ) ;
String index_path = fs : : path ( getDataPartStorage ( ) . getRelativePath ( ) ) / index_name ;
2022-01-05 11:51:50 +00:00
auto index_file = metadata_manager - > read ( index_name ) ;
2020-05-25 23:47:11 +00:00
size_t marks_count = index_granularity . getMarksCount ( ) ;
2021-10-29 17:21:02 +00:00
Serializations key_serializations ( key_size ) ;
2021-04-04 09:17:54 +00:00
for ( size_t j = 0 ; j < key_size ; + + j )
2021-10-29 17:21:02 +00:00
key_serializations [ j ] = primary_key . data_types [ j ] - > getDefaultSerialization ( ) ;
2021-04-04 09:17:54 +00:00
2023-02-19 22:15:09 +00:00
for ( size_t i = 0 ; i < marks_count ; + + i )
2019-10-31 14:44:17 +00:00
for ( size_t j = 0 ; j < key_size ; + + j )
2022-12-02 12:57:11 +00:00
key_serializations [ j ] - > deserializeBinary ( * loaded_index [ j ] , * index_file , { } ) ;
2019-10-31 14:44:17 +00:00
for ( size_t i = 0 ; i < key_size ; + + i )
{
loaded_index [ i ] - > protect ( ) ;
2020-05-25 23:47:11 +00:00
if ( loaded_index [ i ] - > size ( ) ! = marks_count )
2023-01-23 21:13:58 +00:00
throw Exception ( ErrorCodes : : CANNOT_READ_ALL_DATA , " Cannot read all data from index file {}(expected size: "
" {}, read: {}) " , index_path , marks_count , loaded_index [ i ] - > size ( ) ) ;
2019-10-31 14:44:17 +00:00
}
2021-12-28 03:57:43 +00:00
if ( ! index_file - > eof ( ) )
2023-01-23 21:13:58 +00:00
throw Exception ( ErrorCodes : : EXPECTED_END_OF_FILE , " Index file {} is unexpectedly long " , index_path ) ;
2019-10-31 14:44:17 +00:00
index . assign ( std : : make_move_iterator ( loaded_index . begin ( ) ) , std : : make_move_iterator ( loaded_index . end ( ) ) ) ;
}
}
2022-01-04 05:41:11 +00:00
void IMergeTreeDataPart : : appendFilesOfIndex ( Strings & files ) const
2021-12-08 02:40:59 +00:00
{
auto metadata_snapshot = storage . getInMemoryMetadataPtr ( ) ;
if ( parent_part )
metadata_snapshot = metadata_snapshot - > projections . has ( name ) ? metadata_snapshot - > projections . get ( name ) . metadata : nullptr ;
if ( ! metadata_snapshot )
return ;
2021-12-29 04:31:54 +00:00
if ( metadata_snapshot - > hasPrimaryKey ( ) )
2022-05-29 07:28:02 +00:00
{
2022-10-23 03:29:26 +00:00
String index_name = " primary " + getIndexExtensionFromFilesystem ( getDataPartStorage ( ) ) . value ( ) ;
2022-05-29 07:28:02 +00:00
files . push_back ( index_name ) ;
}
2021-12-08 02:40:59 +00:00
}
2020-08-26 15:29:46 +00:00
/// Returns the set of file names collected by this method: two fixed metadata files plus
/// the default compression codec, transaction version and metadata version files
/// when they exist in the part storage. Returns an empty set for parts not stored on disk.
NameSet IMergeTreeDataPart::getFileNamesWithoutChecksums() const
{
    if (!isStoredOnDisk())
        return {};

    NameSet result = {" checksums.txt ", " columns.txt "};

    /// Adds an optional file only if it is actually present in the part.
    auto add_if_exists = [&](const auto & file_name)
    {
        if (getDataPartStorage().exists(file_name))
            result.emplace(file_name);
    };

    add_if_exists(DEFAULT_COMPRESSION_CODEC_FILE_NAME);
    add_if_exists(TXN_VERSION_METADATA_FILE_NAME);
    add_if_exists(METADATA_VERSION_FILE_NAME);

    return result;
}
2020-08-28 09:07:20 +00:00
void IMergeTreeDataPart : : loadDefaultCompressionCodec ( )
2020-08-26 15:29:46 +00:00
{
2020-08-27 08:35:55 +00:00
/// In memory parts doesn't have any compression
2020-08-26 15:29:46 +00:00
if ( ! isStoredOnDisk ( ) )
2020-08-27 13:32:23 +00:00
{
default_codec = CompressionCodecFactory : : instance ( ) . get ( " NONE " , { } ) ;
2020-08-28 09:07:20 +00:00
return ;
2020-08-27 13:32:23 +00:00
}
2020-08-26 15:29:46 +00:00
2022-10-23 03:29:26 +00:00
String path = fs : : path ( getDataPartStorage ( ) . getRelativePath ( ) ) / DEFAULT_COMPRESSION_CODEC_FILE_NAME ;
2022-01-05 11:51:50 +00:00
bool exists = metadata_manager - > exists ( DEFAULT_COMPRESSION_CODEC_FILE_NAME ) ;
2021-12-28 10:06:13 +00:00
if ( ! exists )
2020-08-28 09:07:20 +00:00
{
default_codec = detectDefaultCompressionCodec ( ) ;
}
else
{
2022-01-05 11:51:50 +00:00
auto file_buf = metadata_manager - > read ( DEFAULT_COMPRESSION_CODEC_FILE_NAME ) ;
2020-08-28 09:07:20 +00:00
String codec_line ;
2021-12-28 03:57:43 +00:00
readEscapedStringUntilEOL ( codec_line , * file_buf ) ;
2020-08-26 15:29:46 +00:00
2020-08-28 09:07:20 +00:00
ReadBufferFromString buf ( codec_line ) ;
2020-08-26 15:29:46 +00:00
2020-08-28 09:07:20 +00:00
if ( ! checkString ( " CODEC " , buf ) )
{
2021-12-08 02:40:59 +00:00
LOG_WARNING (
storage . log ,
" Cannot parse default codec for part {} from file {}, content '{}'. Default compression codec will be deduced "
" automatically, from data on disk " ,
name ,
path ,
codec_line ) ;
2020-08-28 09:07:20 +00:00
default_codec = detectDefaultCompressionCodec ( ) ;
}
2020-08-26 15:29:46 +00:00
2020-08-28 09:07:20 +00:00
try
{
ParserCodec codec_parser ;
auto codec_ast = parseQuery ( codec_parser , codec_line . data ( ) + buf . getPosition ( ) , codec_line . data ( ) + codec_line . length ( ) , " codec parser " , 0 , DBMS_DEFAULT_MAX_PARSER_DEPTH ) ;
default_codec = CompressionCodecFactory : : instance ( ) . get ( codec_ast , { } ) ;
}
catch ( const DB : : Exception & ex )
{
LOG_WARNING ( storage . log , " Cannot parse default codec for part {} from file {}, content '{}', error '{}'. Default compression codec will be deduced automatically, from data on disk. " , name , path , codec_line , ex . what ( ) ) ;
default_codec = detectDefaultCompressionCodec ( ) ;
}
2020-08-26 15:29:46 +00:00
}
2020-08-28 09:07:20 +00:00
}
2020-08-26 15:29:46 +00:00
2022-10-23 22:29:24 +00:00
template < typename Writer >
void IMergeTreeDataPart : : writeMetadata ( const String & filename , const WriteSettings & settings , Writer & & writer )
{
auto & data_part_storage = getDataPartStorage ( ) ;
auto tmp_filename = filename + " .tmp " ;
2023-01-25 17:34:09 +00:00
data_part_storage . beginTransaction ( ) ;
2022-10-23 22:29:24 +00:00
try
{
{
auto out = data_part_storage . writeFile ( tmp_filename , 4096 , settings ) ;
writer ( * out ) ;
out - > finalize ( ) ;
}
data_part_storage . moveFile ( tmp_filename , filename ) ;
}
catch ( . . . )
{
try
{
if ( data_part_storage . exists ( tmp_filename ) )
2023-01-25 17:34:09 +00:00
{
2022-10-23 22:29:24 +00:00
data_part_storage . removeFile ( tmp_filename ) ;
2023-01-25 17:34:09 +00:00
data_part_storage . commitTransaction ( ) ;
}
2022-10-23 22:29:24 +00:00
}
catch ( . . . )
{
2023-02-26 02:45:54 +00:00
tryLogCurrentException ( " IMergeTreeDataPart " ) ;
2022-10-23 22:29:24 +00:00
}
throw ;
}
2023-01-25 17:34:09 +00:00
data_part_storage . commitTransaction ( ) ;
2022-10-23 22:29:24 +00:00
}
void IMergeTreeDataPart : : writeChecksums ( const MergeTreeDataPartChecksums & checksums_ , const WriteSettings & settings )
{
writeMetadata ( " checksums.txt " , settings , [ & checksums_ ] ( auto & buffer )
{
checksums_ . write ( buffer ) ;
} ) ;
}
void IMergeTreeDataPart : : writeColumns ( const NamesAndTypesList & columns_ , const WriteSettings & settings )
{
writeMetadata ( " columns.txt " , settings , [ & columns_ ] ( auto & buffer )
{
columns_ . writeText ( buffer ) ;
} ) ;
}
void IMergeTreeDataPart : : writeVersionMetadata ( const VersionMetadata & version_ , bool fsync_part_dir ) const
{
static constexpr auto filename = " txn_version.txt " ;
static constexpr auto tmp_filename = " txn_version.txt.tmp " ;
2022-10-24 14:44:22 +00:00
auto & data_part_storage = const_cast < IDataPartStorage & > ( getDataPartStorage ( ) ) ;
2022-10-23 22:29:24 +00:00
try
{
{
/// TODO IDisk interface does not allow to open file with O_EXCL flag (for DiskLocal),
/// so we create empty file at first (expecting that createFile throws if file already exists)
/// and then overwrite it.
data_part_storage . createFile ( tmp_filename ) ;
auto write_settings = storage . getContext ( ) - > getWriteSettings ( ) ;
auto buf = data_part_storage . writeFile ( tmp_filename , 256 , write_settings ) ;
version_ . write ( * buf ) ;
buf - > finalize ( ) ;
buf - > sync ( ) ;
}
SyncGuardPtr sync_guard ;
if ( fsync_part_dir )
sync_guard = data_part_storage . getDirectorySyncGuard ( ) ;
data_part_storage . replaceFile ( tmp_filename , filename ) ;
}
catch ( . . . )
{
try
{
if ( data_part_storage . exists ( tmp_filename ) )
data_part_storage . removeFile ( tmp_filename ) ;
}
catch ( . . . )
{
2023-01-25 17:34:09 +00:00
tryLogCurrentException ( " DataPartStorageOnDiskFull " ) ;
2022-10-23 22:29:24 +00:00
}
throw ;
}
}
void IMergeTreeDataPart : : writeDeleteOnDestroyMarker ( )
{
static constexpr auto marker_path = " delete-on-destroy.txt " ;
try
{
getDataPartStorage ( ) . createFile ( marker_path ) ;
}
catch ( Poco : : Exception & e )
{
LOG_ERROR ( storage . log , " {} (while creating DeleteOnDestroy marker: {}) " ,
e . what ( ) , ( fs : : path ( getDataPartStorage ( ) . getFullPath ( ) ) / marker_path ) . string ( ) ) ;
}
}
void IMergeTreeDataPart : : removeDeleteOnDestroyMarker ( )
{
getDataPartStorage ( ) . removeFileIfExists ( " delete-on-destroy.txt " ) ;
}
void IMergeTreeDataPart : : removeVersionMetadata ( )
{
getDataPartStorage ( ) . removeFileIfExists ( " txn_version.txt " ) ;
}
2023-02-27 11:27:57 +00:00
void IMergeTreeDataPart : : removeMetadataVersion ( )
{
getDataPartStorage ( ) . removeFileIfExists ( METADATA_VERSION_FILE_NAME ) ;
}
2021-12-31 03:13:38 +00:00
void IMergeTreeDataPart : : appendFilesOfDefaultCompressionCodec ( Strings & files )
2021-12-08 02:40:59 +00:00
{
files . push_back ( DEFAULT_COMPRESSION_CODEC_FILE_NAME ) ;
}
2023-02-27 11:27:57 +00:00
void IMergeTreeDataPart : : appendFilesOfMetadataVersion ( Strings & files )
{
files . push_back ( METADATA_VERSION_FILE_NAME ) ;
}
2020-08-28 09:07:20 +00:00
/// Deduces the default compression codec of the part from data on disk.
/// Takes the first column of the part that has non-zero compressed size and no explicit codec
/// in the table metadata, finds its first existing non-empty data file and reads the codec from it.
/// If no codec was obtained this way, the default codec of the codec factory is returned.
/// Parts that are not stored on disk get the NONE codec.
CompressionCodecPtr IMergeTreeDataPart::detectDefaultCompressionCodec() const
{
    /// In memory parts doesn't have any compression
    if (!isStoredOnDisk())
        return CompressionCodecFactory::instance().get(" NONE ", {});

    auto metadata_snapshot = storage.getInMemoryMetadataPtr();
    const auto & storage_columns = metadata_snapshot->getColumns();

    CompressionCodecPtr detected_codec = nullptr;

    for (const auto & part_column : columns)
    {
        /// We need a column that is not empty and was compressed with the default codec.
        auto column_size = getColumnSize(part_column.name);
        if (column_size.data_compressed == 0 || storage_columns.hasCompressionCodec(part_column.name))
            continue;

        String path_to_data_file;

        auto pick_first_non_empty_stream = [&](const ISerialization::SubstreamPath & substream_path)
        {
            /// A suitable file is already found.
            if (!path_to_data_file.empty())
                return;

            String candidate_path = ISerialization::getFileNameForStream(part_column, substream_path) + " .bin ";

            /// We can have existing, but empty .bin files. Example: LowCardinality(Nullable(...)) columns and column_name.dict.null.bin file.
            if (getDataPartStorage().exists(candidate_path) && getDataPartStorage().getFileSize(candidate_path) != 0)
                path_to_data_file = candidate_path;
        };

        getSerialization(part_column.name)->enumerateStreams(pick_first_non_empty_stream);

        if (path_to_data_file.empty())
        {
            LOG_WARNING(storage.log, " Part's {} column {} has non zero data compressed size, but all data files don't exist or empty ", name, backQuoteIfNeed(part_column.name));
            continue;
        }

        /// The first suitable column is enough.
        detected_codec = getCompressionCodecForFile(getDataPartStorage(), path_to_data_file);
        break;
    }

    if (!detected_codec)
        detected_codec = CompressionCodecFactory::instance().getDefaultCodec();

    return detected_codec;
}
2019-10-31 14:44:17 +00:00
void IMergeTreeDataPart : : loadPartitionAndMinMaxIndex ( )
{
2021-02-10 14:12:49 +00:00
if ( storage . format_version < MERGE_TREE_DATA_MIN_FORMAT_VERSION_WITH_CUSTOM_PARTITIONING & & ! parent_part )
2019-10-31 14:44:17 +00:00
{
DayNum min_date ;
DayNum max_date ;
MergeTreePartInfo : : parseMinMaxDatesFromPartName ( name , min_date , max_date ) ;
const auto & date_lut = DateLUT : : instance ( ) ;
partition = MergeTreePartition ( date_lut . toNumYYYYMM ( min_date ) ) ;
2021-09-16 21:19:58 +00:00
minmax_idx = std : : make_shared < MinMaxIndex > ( min_date , max_date ) ;
2019-10-31 14:44:17 +00:00
}
else
{
2022-06-20 18:18:17 +00:00
//String path = getRelativePath();
2021-02-10 14:12:49 +00:00
if ( ! parent_part )
2022-01-05 11:51:50 +00:00
partition . load ( storage , metadata_manager ) ;
2021-02-10 14:12:49 +00:00
2019-10-31 14:44:17 +00:00
if ( ! isEmpty ( ) )
2021-02-10 14:12:49 +00:00
{
if ( parent_part )
// projection parts don't have minmax_idx, and it's always initialized
2021-09-16 21:19:58 +00:00
minmax_idx - > initialized = true ;
2021-02-10 14:12:49 +00:00
else
2022-01-05 11:51:50 +00:00
minmax_idx - > load ( storage , metadata_manager ) ;
2021-02-10 14:12:49 +00:00
}
if ( parent_part )
return ;
2019-10-31 14:44:17 +00:00
}
2020-06-17 10:34:23 +00:00
auto metadata_snapshot = storage . getInMemoryMetadataPtr ( ) ;
String calculated_partition_id = partition . getID ( metadata_snapshot - > getPartitionKey ( ) . sample_block ) ;
2019-10-31 14:44:17 +00:00
if ( calculated_partition_id ! = info . partition_id )
2023-01-23 21:13:58 +00:00
throw Exception ( ErrorCodes : : CORRUPTED_DATA , " While loading part {}: "
" calculated partition ID: {} differs from partition ID in part name: {} " ,
getDataPartStorage ( ) . getFullPath ( ) , calculated_partition_id , info . partition_id ) ;
2019-10-31 14:44:17 +00:00
}
2021-12-08 02:40:59 +00:00
void IMergeTreeDataPart : : appendFilesOfPartitionAndMinMaxIndex ( Strings & files ) const
{
if ( storage . format_version < MERGE_TREE_DATA_MIN_FORMAT_VERSION_WITH_CUSTOM_PARTITIONING & & ! parent_part )
return ;
if ( ! parent_part )
partition . appendFiles ( storage , files ) ;
2022-05-05 06:48:33 +00:00
if ( ! parent_part )
minmax_idx - > appendFiles ( storage , files ) ;
2021-12-08 02:40:59 +00:00
}
2019-10-31 14:44:17 +00:00
void IMergeTreeDataPart : : loadChecksums ( bool require )
{
2022-01-05 11:51:50 +00:00
bool exists = metadata_manager - > exists ( " checksums.txt " ) ;
2021-12-28 10:06:13 +00:00
if ( exists )
{
2022-01-05 11:51:50 +00:00
auto buf = metadata_manager - > read ( " checksums.txt " ) ;
2020-02-27 16:47:40 +00:00
if ( checksums . read ( * buf ) )
2019-10-31 14:44:17 +00:00
{
2020-02-27 16:47:40 +00:00
assertEOF ( * buf ) ;
2019-10-31 14:44:17 +00:00
bytes_on_disk = checksums . getTotalSizeOnDisk ( ) ;
}
else
2022-10-23 03:29:26 +00:00
bytes_on_disk = getDataPartStorage ( ) . calculateTotalSizeOnDisk ( ) ;
2019-10-31 14:44:17 +00:00
}
else
{
if ( require )
2021-03-22 13:27:35 +00:00
throw Exception ( ErrorCodes : : NO_FILE_IN_DATA_PART , " No checksums.txt in part {} " , name ) ;
2019-10-31 14:44:17 +00:00
2020-07-16 10:54:49 +00:00
/// If the checksums file is not present, calculate the checksums and write them to disk.
/// Check the data while we are at it.
LOG_WARNING ( storage . log , " Checksums for part {} not found. Will calculate them from data on disk. " , name ) ;
2021-03-22 13:27:35 +00:00
2020-07-16 10:54:49 +00:00
checksums = checkDataPart ( shared_from_this ( ) , false ) ;
2022-10-23 22:29:24 +00:00
writeChecksums ( checksums , { } ) ;
2020-07-16 10:54:49 +00:00
bytes_on_disk = checksums . getTotalSizeOnDisk ( ) ;
2019-10-31 14:44:17 +00:00
}
}
2021-12-31 03:13:38 +00:00
void IMergeTreeDataPart : : appendFilesOfChecksums ( Strings & files )
2021-12-08 02:40:59 +00:00
{
files . push_back ( " checksums.txt " ) ;
}
2019-10-31 14:44:17 +00:00
void IMergeTreeDataPart : : loadRowsCount ( )
{
2021-04-17 01:06:59 +00:00
auto read_rows_count = [ & ] ( )
{
2022-01-05 11:51:50 +00:00
auto buf = metadata_manager - > read ( " count.txt " ) ;
2021-04-17 01:06:59 +00:00
readIntText ( rows_count , * buf ) ;
assertEOF ( * buf ) ;
} ;
2019-10-31 14:44:17 +00:00
if ( index_granularity . empty ( ) )
{
rows_count = 0 ;
}
2022-03-14 14:42:09 +00:00
else if ( storage . format_version > = MERGE_TREE_DATA_MIN_FORMAT_VERSION_WITH_CUSTOM_PARTITIONING | | part_type = = Type : : Compact | | parent_part )
2019-10-31 14:44:17 +00:00
{
2022-01-05 11:51:50 +00:00
bool exists = metadata_manager - > exists ( " count.txt " ) ;
if ( ! exists )
2023-01-23 21:13:58 +00:00
throw Exception ( ErrorCodes : : NO_FILE_IN_DATA_PART , " No count.txt in part {} " , name ) ;
2021-12-28 03:57:43 +00:00
read_rows_count ( ) ;
2020-07-25 14:42:20 +00:00
# ifndef NDEBUG
/// columns have to be loaded
for ( const auto & column : getColumns ( ) )
{
2020-07-27 09:42:37 +00:00
/// Most trivial types
2021-04-11 23:30:04 +00:00
if ( column . type - > isValueRepresentedByNumber ( )
& & ! column . type - > haveSubtypes ( )
2022-07-29 11:41:53 +00:00
& & getSerialization ( column . name ) - > getKind ( ) = = ISerialization : : Kind : : DEFAULT )
2020-07-25 14:42:20 +00:00
{
2021-12-09 10:39:28 +00:00
auto size = getColumnSize ( column . name ) ;
2020-07-25 14:42:20 +00:00
if ( size . data_uncompressed = = 0 )
continue ;
size_t rows_in_column = size . data_uncompressed / column . type - > getSizeOfValueInMemory ( ) ;
if ( rows_in_column ! = rows_count )
{
throw Exception (
2023-01-23 21:13:58 +00:00
ErrorCodes : : LOGICAL_ERROR ,
" Column {} has rows count {} according to size in memory "
" and size of single value, but data part {} has {} rows " ,
backQuote ( column . name ) , rows_in_column , name , rows_count ) ;
2020-07-25 14:42:20 +00:00
}
2020-12-09 11:23:37 +00:00
2020-12-09 11:46:04 +00:00
size_t last_possibly_incomplete_mark_rows = index_granularity . getLastNonFinalMarkRows ( ) ;
/// All this rows have to be written in column
size_t index_granularity_without_last_mark = index_granularity . getTotalRows ( ) - last_possibly_incomplete_mark_rows ;
/// We have more rows in column than in index granularity without last possibly incomplete mark
if ( rows_in_column < index_granularity_without_last_mark )
2020-12-09 11:23:37 +00:00
{
throw Exception (
2023-01-23 21:13:58 +00:00
ErrorCodes : : LOGICAL_ERROR ,
" Column {} has rows count {} according to size in memory "
" and size of single value, "
" but index granularity in part {} without last mark has {} rows, which "
" is more than in column " ,
backQuote ( column . name ) , rows_in_column , name , index_granularity . getTotalRows ( ) ) ;
2020-12-09 11:46:04 +00:00
}
/// In last mark we actually written less or equal rows than stored in last mark of index granularity
if ( rows_in_column - index_granularity_without_last_mark > last_possibly_incomplete_mark_rows )
{
throw Exception (
2023-01-23 21:13:58 +00:00
ErrorCodes : : LOGICAL_ERROR ,
" Column {} has rows count {} in last mark according to size in memory "
" and size of single value, "
" but index granularity in part {} "
" in last mark has {} rows which is less than in column " ,
backQuote ( column . name ) , rows_in_column - index_granularity_without_last_mark ,
name , last_possibly_incomplete_mark_rows ) ;
2020-12-09 11:23:37 +00:00
}
2020-07-25 14:42:20 +00:00
}
}
# endif
2019-10-31 14:44:17 +00:00
}
else
{
2022-10-23 03:29:26 +00:00
if ( getDataPartStorage ( ) . exists ( " count.txt " ) )
2021-04-17 01:06:59 +00:00
{
read_rows_count ( ) ;
return ;
}
2019-10-31 14:44:17 +00:00
for ( const NameAndTypePair & column : columns )
{
2022-07-27 14:05:16 +00:00
ColumnPtr column_col = column . type - > createColumn ( * getSerialization ( column . name ) ) ;
2019-10-31 14:44:17 +00:00
if ( ! column_col - > isFixedAndContiguous ( ) | | column_col - > lowCardinality ( ) )
continue ;
2021-12-09 10:39:28 +00:00
size_t column_size = getColumnSize ( column . name ) . data_uncompressed ;
2019-10-31 14:44:17 +00:00
if ( ! column_size )
continue ;
size_t sizeof_field = column_col - > sizeOfValueIfFixed ( ) ;
rows_count = column_size / sizeof_field ;
if ( column_size % sizeof_field ! = 0 )
{
2023-01-23 21:13:58 +00:00
throw Exception ( ErrorCodes : : LOGICAL_ERROR ,
" Uncompressed size of column {}({}) is not divisible by the size of value ({}) " ,
column . name , column_size , sizeof_field ) ;
2019-10-31 14:44:17 +00:00
}
size_t last_mark_index_granularity = index_granularity . getLastNonFinalMarkRows ( ) ;
size_t rows_approx = index_granularity . getTotalRows ( ) ;
if ( ! ( rows_count < = rows_approx & & rows_approx < rows_count + last_mark_index_granularity ) )
2023-01-23 21:13:58 +00:00
throw Exception ( ErrorCodes : : LOGICAL_ERROR , " Unexpected size of column {}: "
" {} rows, expected {}+-{} rows according to the index " ,
column . name , rows_count , rows_approx , toString ( last_mark_index_granularity ) ) ;
2019-10-31 14:44:17 +00:00
return ;
}
2023-01-23 21:13:58 +00:00
throw Exception ( ErrorCodes : : LOGICAL_ERROR , " Data part doesn't contain fixed size column (even Date column) " ) ;
2019-10-31 14:44:17 +00:00
}
}
2021-12-31 03:13:38 +00:00
void IMergeTreeDataPart : : appendFilesOfRowsCount ( Strings & files )
2021-12-08 02:40:59 +00:00
{
files . push_back ( " count.txt " ) ;
}
2019-10-31 14:44:17 +00:00
void IMergeTreeDataPart : : loadTTLInfos ( )
{
2022-01-05 11:51:50 +00:00
bool exists = metadata_manager - > exists ( " ttl.txt " ) ;
2021-12-28 10:06:13 +00:00
if ( exists )
{
2022-01-05 11:51:50 +00:00
auto in = metadata_manager - > read ( " ttl.txt " ) ;
2020-02-27 16:47:40 +00:00
assertString ( " ttl format version: " , * in ) ;
2019-10-31 14:44:17 +00:00
size_t format_version ;
2020-02-27 16:47:40 +00:00
readText ( format_version , * in ) ;
assertChar ( ' \n ' , * in ) ;
2019-10-31 14:44:17 +00:00
if ( format_version = = 1 )
{
try
{
2020-02-27 16:47:40 +00:00
ttl_infos . read ( * in ) ;
2019-10-31 14:44:17 +00:00
}
catch ( const JSONException & )
{
2023-01-23 21:13:58 +00:00
throw Exception ( ErrorCodes : : BAD_TTL_FILE , " Error while parsing file ttl.txt in part: {} " , name ) ;
2019-10-31 14:44:17 +00:00
}
}
else
2023-01-23 21:13:58 +00:00
throw Exception ( ErrorCodes : : BAD_TTL_FILE , " Unknown ttl format version: {} " , toString ( format_version ) ) ;
2019-10-31 14:44:17 +00:00
}
}
2021-12-08 02:40:59 +00:00
2021-12-31 03:13:38 +00:00
void IMergeTreeDataPart : : appendFilesOfTTLInfos ( Strings & files )
2021-12-08 02:40:59 +00:00
{
files . push_back ( " ttl.txt " ) ;
}
2020-10-15 16:17:16 +00:00
void IMergeTreeDataPart : : loadUUID ( )
{
2022-01-05 11:51:50 +00:00
bool exists = metadata_manager - > exists ( UUID_FILE_NAME ) ;
2021-12-28 10:06:13 +00:00
if ( exists )
2020-10-15 16:17:16 +00:00
{
2022-01-05 11:51:50 +00:00
auto in = metadata_manager - > read ( UUID_FILE_NAME ) ;
2020-10-15 16:17:16 +00:00
readText ( uuid , * in ) ;
if ( uuid = = UUIDHelpers : : Nil )
2023-01-23 21:13:58 +00:00
throw Exception ( ErrorCodes : : LOGICAL_ERROR , " Unexpected empty {} in part: {} " , String ( UUID_FILE_NAME ) , name ) ;
2020-10-15 16:17:16 +00:00
}
}
2021-12-31 03:13:38 +00:00
void IMergeTreeDataPart : : appendFilesOfUUID ( Strings & files )
2021-12-08 02:40:59 +00:00
{
files . push_back ( UUID_FILE_NAME ) ;
}
2019-10-31 14:44:17 +00:00
void IMergeTreeDataPart : : loadColumns ( bool require )
{
2022-10-23 03:29:26 +00:00
String path = fs : : path ( getDataPartStorage ( ) . getRelativePath ( ) ) / " columns.txt " ;
2020-06-17 16:39:58 +00:00
auto metadata_snapshot = storage . getInMemoryMetadataPtr ( ) ;
2021-02-10 14:12:49 +00:00
if ( parent_part )
metadata_snapshot = metadata_snapshot - > projections . get ( name ) . metadata ;
2021-01-21 12:34:11 +00:00
NamesAndTypesList loaded_columns ;
2023-02-28 11:17:43 +00:00
bool is_readonly_storage = getDataPartStorage ( ) . isReadonly ( ) ;
2023-02-27 11:27:57 +00:00
if ( ! metadata_manager - > exists ( " columns.txt " ) )
2019-10-31 14:44:17 +00:00
{
2020-01-15 18:24:10 +00:00
/// We can get list of columns only from columns.txt in compact parts.
2022-03-14 14:42:09 +00:00
if ( require | | part_type = = Type : : Compact )
2023-01-23 21:13:58 +00:00
throw Exception ( ErrorCodes : : NO_FILE_IN_DATA_PART , " No columns.txt in part {}, expected path {} on drive {} " ,
name , path , getDataPartStorage ( ) . getDiskName ( ) ) ;
2019-10-31 14:44:17 +00:00
/// If there is no file with a list of columns, write it down.
2020-06-17 16:39:58 +00:00
for ( const NameAndTypePair & column : metadata_snapshot - > getColumns ( ) . getAllPhysical ( ) )
2022-10-23 03:29:26 +00:00
if ( getDataPartStorage ( ) . exists ( getFileNameForColumn ( column ) + " .bin " ) )
2021-01-21 12:34:11 +00:00
loaded_columns . push_back ( column ) ;
2019-10-31 14:44:17 +00:00
if ( columns . empty ( ) )
2023-01-23 21:13:58 +00:00
throw Exception ( ErrorCodes : : NO_FILE_IN_DATA_PART , " No columns in part {} " , name ) ;
2019-10-31 14:44:17 +00:00
2023-02-28 11:17:43 +00:00
if ( ! is_readonly_storage )
writeColumns ( loaded_columns , { } ) ;
2019-12-09 21:21:17 +00:00
}
else
{
2022-01-05 11:51:50 +00:00
auto in = metadata_manager - > read ( " columns.txt " ) ;
2021-12-08 02:40:59 +00:00
loaded_columns . readText ( * in ) ;
2021-11-27 09:40:46 +00:00
2022-09-16 19:07:36 +00:00
for ( auto & column : loaded_columns )
2022-10-14 18:07:02 +00:00
setVersionToAggregateFunctions ( column . type , true ) ;
2019-10-31 14:44:17 +00:00
}
2021-10-29 17:21:02 +00:00
SerializationInfo : : Settings settings =
{
2021-12-08 15:29:00 +00:00
. ratio_of_defaults_for_sparse = storage . getSettings ( ) - > ratio_of_defaults_for_sparse_serialization ,
2021-10-29 17:21:02 +00:00
. choose_kind = false ,
} ;
2023-05-08 11:50:21 +00:00
SerializationInfoByName infos ;
2023-02-27 11:27:57 +00:00
if ( metadata_manager - > exists ( SERIALIZATION_FILE_NAME ) )
2022-03-23 04:13:42 +00:00
{
auto in = metadata_manager - > read ( SERIALIZATION_FILE_NAME ) ;
2023-05-08 11:50:21 +00:00
infos = SerializationInfoByName : : readJSON ( loaded_columns , settings , * in ) ;
2022-03-23 04:13:42 +00:00
}
2021-10-29 17:21:02 +00:00
2023-02-27 11:27:57 +00:00
int32_t loaded_metadata_version ;
if ( metadata_manager - > exists ( METADATA_VERSION_FILE_NAME ) )
{
auto in = metadata_manager - > read ( METADATA_VERSION_FILE_NAME ) ;
readIntText ( loaded_metadata_version , * in ) ;
}
else
{
loaded_metadata_version = metadata_snapshot - > getMetadataVersion ( ) ;
}
setColumns ( loaded_columns , infos , loaded_metadata_version ) ;
2019-10-31 14:44:17 +00:00
}
2023-02-27 11:27:57 +00:00
2022-07-08 08:12:59 +00:00
/// Project part / part with project parts / compact part doesn't support LWD.
bool IMergeTreeDataPart : : supportLightweightDeleteMutate ( ) const
{
2022-07-23 11:55:43 +00:00
return ( part_type = = MergeTreeDataPartType : : Wide | | part_type = = MergeTreeDataPartType : : Compact ) & &
parent_part = = nullptr & & projection_parts . empty ( ) ;
2022-07-08 08:12:59 +00:00
}
2022-02-14 19:50:08 +00:00
void IMergeTreeDataPart : : assertHasVersionMetadata ( MergeTreeTransaction * txn ) const
{
TransactionID expected_tid = txn ? txn - > tid : Tx : : PrehistoricTID ;
if ( version . creation_tid ! = expected_tid )
throw Exception ( ErrorCodes : : LOGICAL_ERROR ,
" CreationTID of part {} (table {}) is set to unexpected value {}, it's a bug. Current transaction: {} " ,
name , storage . getStorageID ( ) . getNameForLogs ( ) , version . creation_tid , txn ? txn - > dumpDescription ( ) : " <none> " ) ;
2022-02-15 15:00:45 +00:00
assert ( ! txn | | storage . supportsTransactions ( ) ) ;
2022-10-23 03:29:26 +00:00
assert ( ! txn | | getDataPartStorage ( ) . exists ( TXN_VERSION_METADATA_FILE_NAME ) ) ;
2022-02-14 19:50:08 +00:00
}
2021-12-30 13:15:28 +00:00
2022-08-25 13:10:14 +00:00
void IMergeTreeDataPart : : storeVersionMetadata ( bool force ) const
2021-12-30 13:15:28 +00:00
{
2022-08-25 13:10:14 +00:00
if ( ! wasInvolvedInTransaction ( ) & & ! force )
2021-12-30 13:15:28 +00:00
return ;
2022-09-26 22:01:00 +00:00
LOG_TEST ( storage . log , " Writing version for {} (creation: {}, removal {}, creation csn {}) " , name , version . creation_tid , version . removal_tid , version . creation_csn ) ;
2022-02-15 15:00:45 +00:00
assert ( storage . supportsTransactions ( ) ) ;
2022-02-14 19:50:08 +00:00
if ( ! isStoredOnDisk ( ) )
throw Exception ( ErrorCodes : : NOT_IMPLEMENTED , " Transactions are not supported for in-memory parts (table: {}, part: {}) " ,
storage . getStorageID ( ) . getNameForLogs ( ) , name ) ;
2022-10-23 22:29:24 +00:00
writeVersionMetadata ( version , storage . getSettings ( ) - > fsync_part_directory ) ;
2021-12-30 13:15:28 +00:00
}
2022-02-17 21:26:37 +00:00
void IMergeTreeDataPart : : appendCSNToVersionMetadata ( VersionMetadata : : WhichCSN which_csn ) const
{
2022-05-20 10:41:44 +00:00
chassert ( ! version . creation_tid . isEmpty ( ) ) ;
2022-11-18 15:33:43 +00:00
chassert ( ! ( which_csn = = VersionMetadata : : WhichCSN : : CREATION & & version . creation_tid . isPrehistoric ( ) ) ) ;
2022-05-20 10:41:44 +00:00
chassert ( ! ( which_csn = = VersionMetadata : : WhichCSN : : CREATION & & version . creation_csn = = 0 ) ) ;
chassert ( ! ( which_csn = = VersionMetadata : : WhichCSN : : REMOVAL & & ( version . removal_tid . isPrehistoric ( ) | | version . removal_tid . isEmpty ( ) ) ) ) ;
chassert ( ! ( which_csn = = VersionMetadata : : WhichCSN : : REMOVAL & & version . removal_csn = = 0 ) ) ;
chassert ( isStoredOnDisk ( ) ) ;
2022-02-17 21:26:37 +00:00
2022-10-23 22:29:24 +00:00
/// Small enough appends to file are usually atomic,
/// so we append new metadata instead of rewriting file to reduce number of fsyncs.
/// We don't need to do fsync when writing CSN, because in case of hard restart
/// we will be able to restore CSN from transaction log in Keeper.
auto out = getDataPartStorage ( ) . writeTransactionFile ( WriteMode : : Append ) ;
version . writeCSN ( * out , which_csn ) ;
out - > finalize ( ) ;
2022-02-17 21:26:37 +00:00
}
2022-03-08 19:11:47 +00:00
void IMergeTreeDataPart : : appendRemovalTIDToVersionMetadata ( bool clear ) const
{
2022-05-20 10:41:44 +00:00
chassert ( ! version . creation_tid . isEmpty ( ) ) ;
2022-11-18 16:51:10 +00:00
chassert ( version . removal_csn = = 0 | | ( version . removal_csn = = Tx : : PrehistoricCSN & & version . removal_tid . isPrehistoric ( ) ) ) ;
2022-05-20 10:41:44 +00:00
chassert ( ! version . removal_tid . isEmpty ( ) ) ;
chassert ( isStoredOnDisk ( ) ) ;
2022-03-08 19:11:47 +00:00
if ( version . creation_tid . isPrehistoric ( ) & & ! clear )
{
/// Metadata file probably does not exist, because it was not written on part creation, because it was created without a transaction.
/// Let's create it (if needed). Concurrent writes are not possible, because creation_csn is prehistoric and we own removal_tid_lock.
2022-11-21 16:58:03 +00:00
/// It can happen that VersionMetadata::isVisible sets creation_csn to PrehistoricCSN when creation_tid is Prehistoric
/// In order to avoid a race always write creation_csn as PrehistoricCSN for Prehistoric creation_tid
assert ( version . creation_csn = = Tx : : UnknownCSN | | version . creation_csn = = Tx : : PrehistoricCSN ) ;
version . creation_csn . store ( Tx : : PrehistoricCSN ) ;
2022-03-08 19:11:47 +00:00
storeVersionMetadata ( ) ;
return ;
}
2022-03-16 19:16:26 +00:00
if ( clear )
LOG_TEST ( storage . log , " Clearing removal TID for {} (creation: {}, removal {}) " , name , version . creation_tid , version . removal_tid ) ;
else
LOG_TEST ( storage . log , " Appending removal TID for {} (creation: {}, removal {}) " , name , version . creation_tid , version . removal_tid ) ;
2022-03-08 19:11:47 +00:00
2022-10-23 22:29:24 +00:00
auto out = getDataPartStorage ( ) . writeTransactionFile ( WriteMode : : Append ) ;
version . writeRemovalTID ( * out , clear ) ;
out - > finalize ( ) ;
/// fsync is not required when we clearing removal TID, because after hard restart we will fix metadata
if ( ! clear )
out - > sync ( ) ;
2022-03-08 19:11:47 +00:00
}
2022-10-24 14:44:22 +00:00
static std : : unique_ptr < ReadBufferFromFileBase > openForReading ( const IDataPartStorage & part_storage , const String & filename )
{
size_t file_size = part_storage . getFileSize ( filename ) ;
return part_storage . readFile ( filename , ReadSettings ( ) . adjustBufferSize ( file_size ) , file_size , file_size ) ;
}
2021-12-30 13:15:28 +00:00
void IMergeTreeDataPart : : loadVersionMetadata ( ) const
2022-01-19 18:29:31 +00:00
try
2021-12-30 13:15:28 +00:00
{
2022-10-24 14:44:22 +00:00
static constexpr auto version_file_name = " txn_version.txt " ;
static constexpr auto tmp_version_file_name = " txn_version.txt.tmp " ;
auto & data_part_storage = const_cast < IDataPartStorage & > ( getDataPartStorage ( ) ) ;
auto remove_tmp_file = [ & ] ( )
{
auto last_modified = data_part_storage . getLastModified ( ) ;
auto buf = openForReading ( data_part_storage , tmp_version_file_name ) ;
String content ;
readStringUntilEOF ( content , * buf ) ;
LOG_WARNING ( storage . log , " Found file {} that was last modified on {}, has size {} and the following content: {} " ,
tmp_version_file_name , last_modified . epochTime ( ) , content . size ( ) , content ) ;
data_part_storage . removeFile ( tmp_version_file_name ) ;
} ;
if ( data_part_storage . exists ( version_file_name ) )
{
auto buf = openForReading ( data_part_storage , version_file_name ) ;
version . read ( * buf ) ;
if ( data_part_storage . exists ( tmp_version_file_name ) )
remove_tmp_file ( ) ;
return ;
}
/// Four (?) cases are possible:
/// 1. Part was created without transactions.
/// 2. Version metadata file was not renamed from *.tmp on part creation.
/// 3. Version metadata were written to *.tmp file, but hard restart happened before fsync.
/// 4. Fsyncs in storeVersionMetadata() work incorrectly.
if ( ! data_part_storage . exists ( tmp_version_file_name ) )
{
/// Case 1.
/// We do not have version metadata and transactions history for old parts,
/// so let's consider that such parts were created by some ancient transaction
/// and were committed with some prehistoric CSN.
/// NOTE It might be Case 3, but version metadata file is written on part creation before other files,
/// so it's not Case 3 if part is not broken.
version . setCreationTID ( Tx : : PrehistoricTID , nullptr ) ;
version . creation_csn = Tx : : PrehistoricCSN ;
return ;
}
/// Case 2.
/// Content of *.tmp file may be broken, just use fake TID.
/// Transaction was not committed if *.tmp file was not renamed, so we should complete rollback by removing part.
version . setCreationTID ( Tx : : DummyTID , nullptr ) ;
version . creation_csn = Tx : : RolledBackCSN ;
remove_tmp_file ( ) ;
2021-12-30 13:15:28 +00:00
}
2022-01-19 18:29:31 +00:00
catch ( Exception & e )
{
e . addMessage ( " While loading version metadata from table {} part {} " , storage . getStorageID ( ) . getNameForLogs ( ) , name ) ;
throw ;
}
2021-12-30 13:15:28 +00:00
2022-02-17 21:26:37 +00:00
bool IMergeTreeDataPart : : wasInvolvedInTransaction ( ) const
{
2022-09-15 15:35:28 +00:00
assert ( ! storage . data_parts_loading_finished | | ! version . creation_tid . isEmpty ( ) | | ( state = = MergeTreeDataPartState : : Temporary /* && std::uncaught_exceptions() */ ) ) ;
2022-02-17 21:26:37 +00:00
bool created_by_transaction = ! version . creation_tid . isPrehistoric ( ) ;
bool removed_by_transaction = version . isRemovalTIDLocked ( ) & & version . removal_tid_lock ! = Tx : : PrehistoricTID . getHash ( ) ;
return created_by_transaction | | removed_by_transaction ;
}
bool IMergeTreeDataPart : : assertHasValidVersionMetadata ( ) const
{
/// We don't have many tests with server restarts and it's really inconvenient to write such tests.
/// So we use debug assertions to ensure that part version is written correctly.
2022-03-18 11:01:26 +00:00
/// This method is not supposed to be called in release builds.
2022-02-17 21:26:37 +00:00
2022-02-24 21:51:21 +00:00
if ( isProjectionPart ( ) )
return true ;
2022-02-17 21:26:37 +00:00
if ( ! wasInvolvedInTransaction ( ) )
return true ;
if ( ! isStoredOnDisk ( ) )
return false ;
2022-03-09 20:38:18 +00:00
if ( part_is_probably_removed_from_disk )
return true ;
2022-08-12 11:03:57 +00:00
if ( state = = MergeTreeDataPartState : : Temporary )
2022-04-25 20:41:46 +00:00
return true ;
2022-10-23 03:29:26 +00:00
if ( ! getDataPartStorage ( ) . exists ( ) )
2022-02-17 21:26:37 +00:00
return true ;
String content ;
2022-05-03 15:48:05 +00:00
String version_file_name = TXN_VERSION_METADATA_FILE_NAME ;
2022-02-17 21:26:37 +00:00
try
{
2022-10-23 03:29:26 +00:00
size_t file_size = getDataPartStorage ( ) . getFileSize ( TXN_VERSION_METADATA_FILE_NAME ) ;
auto buf = getDataPartStorage ( ) . readFile ( TXN_VERSION_METADATA_FILE_NAME , ReadSettings ( ) . adjustBufferSize ( file_size ) , file_size , std : : nullopt ) ;
2022-05-03 15:48:05 +00:00
2023-04-07 20:54:49 +00:00
/// FIXME https://github.com/ClickHouse/ClickHouse/issues/48465
if ( dynamic_cast < CachedOnDiskReadBufferFromFile * > ( buf . get ( ) ) )
return true ;
2022-02-17 21:26:37 +00:00
readStringUntilEOF ( content , * buf ) ;
ReadBufferFromString str_buf { content } ;
VersionMetadata file ;
file . read ( str_buf ) ;
bool valid_creation_tid = version . creation_tid = = file . creation_tid ;
2022-03-08 19:11:47 +00:00
bool valid_removal_tid = version . removal_tid = = file . removal_tid | | version . removal_tid = = Tx : : PrehistoricTID ;
2022-02-17 21:26:37 +00:00
bool valid_creation_csn = version . creation_csn = = file . creation_csn | | version . creation_csn = = Tx : : RolledBackCSN ;
2022-03-08 19:11:47 +00:00
bool valid_removal_csn = version . removal_csn = = file . removal_csn | | version . removal_csn = = Tx : : PrehistoricCSN ;
2022-05-20 20:08:46 +00:00
bool valid_removal_tid_lock = ( version . removal_tid . isEmpty ( ) & & version . removal_tid_lock = = 0 )
| | ( version . removal_tid_lock = = version . removal_tid . getHash ( ) ) ;
if ( ! valid_creation_tid | | ! valid_removal_tid | | ! valid_creation_csn | | ! valid_removal_csn | | ! valid_removal_tid_lock )
2022-02-17 21:26:37 +00:00
throw Exception ( ErrorCodes : : CORRUPTED_DATA , " Invalid version metadata file " ) ;
return true ;
}
catch ( . . . )
{
WriteBufferFromOwnString expected ;
version . write ( expected ) ;
2022-09-26 22:01:00 +00:00
tryLogCurrentException ( storage . log , fmt : : format ( " File {} contains: \n {} \n expected: \n {} \n lock: {} \n name: {} " ,
version_file_name , content , expected . str ( ) , version . removal_tid_lock , name ) ) ;
2022-02-17 21:26:37 +00:00
return false ;
}
}
2021-12-31 03:13:38 +00:00
void IMergeTreeDataPart : : appendFilesOfColumns ( Strings & files )
2021-12-08 02:40:59 +00:00
{
files . push_back ( " columns.txt " ) ;
2022-03-23 04:13:42 +00:00
files . push_back ( SERIALIZATION_FILE_NAME ) ;
2021-12-08 02:40:59 +00:00
}
2020-10-20 15:10:24 +00:00
bool IMergeTreeDataPart : : shallParticipateInMerges ( const StoragePolicyPtr & storage_policy ) const
{
2022-10-25 22:14:06 +00:00
auto disk_name = getDataPartStorage ( ) . getDiskName ( ) ;
return ! storage_policy - > getVolumeByDiskName ( disk_name ) - > areMergesAvoided ( ) ;
2020-10-20 15:10:24 +00:00
}
2022-10-23 03:29:26 +00:00
void IMergeTreeDataPart : : renameTo ( const String & new_relative_path , bool remove_new_dir_if_exists )
2022-01-31 20:47:04 +00:00
try
2019-10-10 16:30:30 +00:00
{
2019-10-31 14:44:17 +00:00
assertOnDisk ( ) ;
2022-05-05 09:19:12 +00:00
std : : string relative_path = storage . relative_data_path ;
bool fsync_dir = storage . getSettings ( ) - > fsync_part_directory ;
2022-04-07 11:58:38 +00:00
if ( parent_part )
2022-05-05 09:19:12 +00:00
{
/// For projections, move is only possible inside parent part dir.
2022-10-23 03:29:26 +00:00
relative_path = parent_part - > getDataPartStorage ( ) . getRelativePath ( ) ;
2022-05-05 09:19:12 +00:00
}
2022-04-07 11:58:38 +00:00
2022-10-24 22:38:53 +00:00
auto old_projection_root_path = getDataPartStorage ( ) . getRelativePath ( ) ;
2022-06-15 11:41:08 +00:00
auto to = fs : : path ( relative_path ) / new_relative_path ;
2021-01-07 16:26:53 +00:00
2022-06-21 13:05:49 +00:00
metadata_manager - > deleteAll ( true ) ;
metadata_manager - > assertAllDeleted ( true ) ;
2022-10-23 03:29:26 +00:00
getDataPartStorage ( ) . rename ( to . parent_path ( ) , to . filename ( ) , storage . log , remove_new_dir_if_exists , fsync_dir ) ;
2022-06-21 13:05:49 +00:00
metadata_manager - > updateAll ( true ) ;
2022-05-03 19:32:24 +00:00
2022-10-23 03:29:26 +00:00
auto new_projection_root_path = to . string ( ) ;
for ( const auto & [ _ , part ] : projection_parts )
part - > getDataPartStorage ( ) . changeRootPath ( old_projection_root_path , new_projection_root_path ) ;
2019-10-10 16:30:30 +00:00
}
2022-01-31 20:47:04 +00:00
catch ( . . . )
{
if ( startsWith ( new_relative_path , " detached/ " ) )
{
// Don't throw when the destination is to the detached folder. It might be able to
// recover in some cases, such as fetching parts into multi-disks while some of the
// disks are broken.
tryLogCurrentException ( __PRETTY_FUNCTION__ ) ;
}
else
throw ;
}
2019-10-10 16:30:30 +00:00
2022-04-18 23:09:09 +00:00
std : : pair < bool , NameSet > IMergeTreeDataPart : : canRemovePart ( ) const
2021-06-08 19:11:22 +00:00
{
2021-07-12 02:56:49 +00:00
/// NOTE: It's needed for zero-copy replication
2021-06-09 12:36:47 +00:00
if ( force_keep_shared_data )
2022-09-06 17:25:58 +00:00
{
LOG_DEBUG ( storage . log , " Blobs for part {} cannot be removed because it's forced to be keeped " , name ) ;
2022-04-18 23:09:09 +00:00
return std : : make_pair ( false , NameSet { } ) ;
2022-09-06 17:25:58 +00:00
}
2019-12-03 14:33:56 +00:00
2022-04-15 16:36:23 +00:00
return storage . unlockSharedData ( * this ) ;
2021-06-08 19:11:22 +00:00
}
2022-01-05 11:51:50 +00:00
void IMergeTreeDataPart : : initializePartMetadataManager ( )
{
# if USE_ROCKSDB
if ( use_metadata_cache )
metadata_manager = std : : make_shared < PartMetadataManagerWithCache > ( this , storage . getContext ( ) - > getMergeTreeMetadataCache ( ) ) ;
else
metadata_manager = std : : make_shared < PartMetadataManagerOrdinary > ( this ) ;
# else
metadata_manager = std : : make_shared < PartMetadataManagerOrdinary > ( this ) ;
# endif
}
2022-09-13 03:18:25 +00:00
void IMergeTreeDataPart : : initializeIndexGranularityInfo ( )
{
2023-01-25 17:34:09 +00:00
auto mrk_type = MergeTreeIndexGranularityInfo : : getMarksTypeFromFilesystem ( getDataPartStorage ( ) ) ;
if ( mrk_type )
index_granularity_info = MergeTreeIndexGranularityInfo ( storage , * mrk_type ) ;
2022-09-13 03:18:25 +00:00
else
index_granularity_info = MergeTreeIndexGranularityInfo ( storage , part_type ) ;
}
2022-10-23 03:29:26 +00:00
void IMergeTreeDataPart : : remove ( )
2019-10-31 14:44:17 +00:00
{
2022-02-17 21:26:37 +00:00
assert ( assertHasValidVersionMetadata ( ) ) ;
2022-03-09 20:38:18 +00:00
part_is_probably_removed_from_disk = true ;
2022-09-16 11:49:39 +00:00
auto can_remove_callback = [ this ] ( )
2022-09-15 12:49:31 +00:00
{
2022-11-08 15:27:26 +00:00
/// Temporary projections are "subparts" which are generated during projections materialization
/// We can always remove them without any additional checks.
2022-11-08 14:33:23 +00:00
if ( isProjectionPart ( ) & & is_temp )
{
LOG_TRACE ( storage . log , " Temporary projection part {} can be removed " , name ) ;
return CanRemoveDescription { . can_remove_anything = true , . files_not_to_remove = { } } ;
}
2022-09-15 12:49:31 +00:00
auto [ can_remove , files_not_to_remove ] = canRemovePart ( ) ;
if ( ! can_remove )
LOG_TRACE ( storage . log , " Blobs of part {} cannot be removed " , name ) ;
if ( ! files_not_to_remove . empty ( ) )
LOG_TRACE ( storage . log , " Some blobs ({}) of part {} cannot be removed " , fmt : : join ( files_not_to_remove , " , " ) , name ) ;
2021-06-09 12:36:47 +00:00
2022-09-15 12:49:31 +00:00
return CanRemoveDescription { . can_remove_anything = can_remove , . files_not_to_remove = files_not_to_remove } ;
} ;
2022-09-06 17:25:58 +00:00
2019-10-31 14:44:17 +00:00
if ( ! isStoredOnDisk ( ) )
return ;
2022-11-08 15:27:26 +00:00
/// Projections should be never removed by themselves, they will be removed
/// with by parent part.
2022-11-08 14:33:23 +00:00
if ( isProjectionPart ( ) & & ! is_temp )
throw Exception ( ErrorCodes : : LOGICAL_ERROR , " Projection part {} should be removed by its parent {}. " , name , parent_part - > name ) ;
2021-02-10 14:12:49 +00:00
2022-01-05 11:51:50 +00:00
metadata_manager - > deleteAll ( false ) ;
metadata_manager - > assertAllDeleted ( false ) ;
2022-06-30 17:48:07 +00:00
2022-12-29 16:00:17 +00:00
GinIndexStoreFactory : : instance ( ) . remove ( getDataPartStoragePtr ( ) - > getRelativePath ( ) ) ;
2021-12-08 02:40:59 +00:00
2022-04-19 19:34:41 +00:00
std : : list < IDataPartStorage : : ProjectionChecksums > projection_checksums ;
2019-10-31 14:44:17 +00:00
2021-02-10 14:12:49 +00:00
for ( const auto & [ p_name , projection_part ] : projection_parts )
{
2022-04-19 19:34:41 +00:00
projection_part - > metadata_manager - > deleteAll ( false ) ;
projection_part - > metadata_manager - > assertAllDeleted ( false ) ;
2022-04-22 16:58:09 +00:00
projection_checksums . emplace_back ( IDataPartStorage : : ProjectionChecksums { . name = p_name , . checksums = projection_part - > checksums } ) ;
2021-02-10 14:12:49 +00:00
}
2023-01-25 17:34:09 +00:00
bool is_temporary_part = is_temp | | state = = MergeTreeDataPartState : : Temporary ;
getDataPartStorage ( ) . remove ( std : : move ( can_remove_callback ) , checksums , projection_checksums , is_temporary_part , storage . log ) ;
2019-10-31 14:44:17 +00:00
}
2022-09-30 18:09:18 +00:00
std : : optional < String > IMergeTreeDataPart : : getRelativePathForPrefix ( const String & prefix , bool detached , bool broken ) const
2019-10-10 16:30:30 +00:00
{
2022-10-18 16:13:18 +00:00
assert ( ! broken | | detached ) ;
2019-10-10 16:30:30 +00:00
String res ;
/** If you need to detach a part, and directory into which we want to rename it already exists,
* we will rename to the directory with the name to which the suffix is added in the form of " _tryN " .
* This is done only in the case of ` to_detached ` , because it is assumed that in this case the exact name does not matter .
* No more than 10 attempts are made so that there are not too many junk directories left .
*/
2021-12-01 14:24:26 +00:00
if ( detached & & parent_part )
2021-12-01 15:00:40 +00:00
throw Exception ( ErrorCodes : : LOGICAL_ERROR , " Cannot detach projection " ) ;
2019-10-10 16:30:30 +00:00
2022-10-23 03:29:26 +00:00
return getDataPartStorage ( ) . getRelativePathForPrefix ( storage . log , prefix , detached , broken ) ;
2019-10-10 16:30:30 +00:00
}
2022-09-30 18:09:18 +00:00
std : : optional < String > IMergeTreeDataPart : : getRelativePathForDetachedPart ( const String & prefix , bool broken ) const
2019-10-10 16:30:30 +00:00
{
2020-06-03 22:00:02 +00:00
/// Do not allow underscores in the prefix because they are used as separators.
assert ( prefix . find_first_of ( ' _ ' ) = = String : : npos ) ;
2021-08-04 14:42:48 +00:00
assert ( prefix . empty ( ) | | std : : find ( DetachedPartInfo : : DETACH_REASONS . begin ( ) ,
DetachedPartInfo : : DETACH_REASONS . end ( ) ,
prefix ) ! = DetachedPartInfo : : DETACH_REASONS . end ( ) ) ;
2022-09-30 18:09:18 +00:00
if ( auto path = getRelativePathForPrefix ( prefix , /* detached */ true , broken ) )
return " detached/ " + * path ;
return { } ;
2019-10-10 16:30:30 +00:00
}
2022-10-23 03:29:26 +00:00
void IMergeTreeDataPart : : renameToDetached ( const String & prefix )
2019-10-10 16:30:30 +00:00
{
2022-10-18 16:13:18 +00:00
auto path_to_detach = getRelativePathForDetachedPart ( prefix , /* broken */ false ) ;
assert ( path_to_detach ) ;
2022-10-22 22:51:59 +00:00
renameTo ( path_to_detach . value ( ) , true ) ;
2022-04-25 20:41:46 +00:00
part_is_probably_removed_from_disk = true ;
2019-10-10 16:30:30 +00:00
}
2022-12-22 13:31:42 +00:00
/// Creates a copy of the part in the detached directory via IDataPartStorage::freeze.
/// Returns nullptr when no destination path could be chosen for the detached copy.
DataPartStoragePtr IMergeTreeDataPart::makeCloneInDetached(const String & prefix, const StorageMetadataPtr & /*metadata_snapshot*/) const
{
    auto settings = storage.getSettings();

    /// In case of zero-copy replication the directory is copied instead of hardlinked,
    /// because hardlinks tracking doesn't work for detached parts.
    const bool copy_instead_of_hardlink = isStoredOnRemoteDiskWithZeroCopySupport()
        && storage.supportsReplication()
        && settings->allow_remote_fs_zero_copy_replication;

    /// Avoid unneeded duplicates of broken parts if the same broken part is detached multiple times.
    /// Otherwise detached/ may get polluted with dirs with _tryN suffix
    /// and removing the broken part would fail after 10 attempts.
    const bool broken = !prefix.empty();

    auto detached_path = getRelativePathForDetachedPart(prefix, broken);
    if (!detached_path)
        return nullptr;

    return getDataPartStorage().freeze(
        storage.relative_data_path,
        *detached_path,
        /*make_source_readonly=*/ true,
        /*save_metadata_callback=*/ {},
        copy_instead_of_hardlink,
        /*files_to_copy_instead_of_hardlinks=*/ {});
}
2022-10-22 22:51:59 +00:00
/// Clones the part to another disk into `directory_name` (relative to the table data path).
/// Throws LOGICAL_ERROR if the target disk is the disk of the part or if `directory_name` is empty.
MutableDataPartStoragePtr IMergeTreeDataPart::makeCloneOnDisk(const DiskPtr & disk, const String & directory_name) const
{
    assertOnDisk();

    const auto current_disk_name = getDataPartStorage().getDiskName();
    if (disk->getName() == current_disk_name)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Can not clone data part {} to same disk {}", name, current_disk_name);

    if (directory_name.empty())
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Can not clone data part {} to empty directory.", name);

    /// The trailing empty component makes the path end with a separator.
    String clone_destination = fs::path(storage.relative_data_path) / directory_name / "";
    return getDataPartStorage().clonePart(clone_destination, getDataPartStorage().getPartDirectory(), disk, storage.log);
}
2020-01-15 19:16:56 +00:00
void IMergeTreeDataPart : : checkConsistencyBase ( ) const
{
2020-06-17 10:34:23 +00:00
auto metadata_snapshot = storage . getInMemoryMetadataPtr ( ) ;
2021-02-10 14:12:49 +00:00
if ( parent_part )
metadata_snapshot = metadata_snapshot - > projections . get ( name ) . metadata ;
else
{
// No need to check projections here because we already did consistent checking when loading projections if necessary.
}
2020-06-17 12:39:20 +00:00
const auto & pk = metadata_snapshot - > getPrimaryKey ( ) ;
2021-03-02 10:33:54 +00:00
const auto & partition_key = metadata_snapshot - > getPartitionKey ( ) ;
2020-01-15 19:16:56 +00:00
if ( ! checksums . empty ( ) )
{
2022-05-29 07:28:02 +00:00
if ( ! pk . column_names . empty ( )
& & ( ! checksums . files . contains ( " primary " + getIndexExtension ( false ) )
& & ! checksums . files . contains ( " primary " + getIndexExtension ( true ) ) ) )
2023-01-23 21:13:58 +00:00
throw Exception ( ErrorCodes : : NO_FILE_IN_DATA_PART , " No checksum for {} or {} " ,
toString ( " primary " + getIndexExtension ( false ) ) , toString ( " primary " + getIndexExtension ( true ) ) ) ;
2020-01-15 19:16:56 +00:00
if ( storage . format_version > = MERGE_TREE_DATA_MIN_FORMAT_VERSION_WITH_CUSTOM_PARTITIONING )
{
2022-04-18 10:18:43 +00:00
if ( ! checksums . files . contains ( " count.txt " ) )
2023-01-23 21:13:58 +00:00
throw Exception ( ErrorCodes : : NO_FILE_IN_DATA_PART , " No checksum for count.txt " ) ;
2020-01-15 19:16:56 +00:00
2022-04-18 10:18:43 +00:00
if ( metadata_snapshot - > hasPartitionKey ( ) & & ! checksums . files . contains ( " partition.dat " ) )
2023-01-23 21:13:58 +00:00
throw Exception ( ErrorCodes : : NO_FILE_IN_DATA_PART , " No checksum for partition.dat " ) ;
2020-01-15 19:16:56 +00:00
2021-02-10 14:12:49 +00:00
if ( ! isEmpty ( ) & & ! parent_part )
2020-01-15 19:16:56 +00:00
{
2021-03-02 10:33:54 +00:00
for ( const String & col_name : storage . getMinMaxColumnsNames ( partition_key ) )
2020-01-15 19:16:56 +00:00
{
2022-04-18 10:18:43 +00:00
if ( ! checksums . files . contains ( " minmax_ " + escapeForFileName ( col_name ) + " .idx " ) )
2023-01-23 21:13:58 +00:00
throw Exception ( ErrorCodes : : NO_FILE_IN_DATA_PART , " No minmax idx file checksum for column {} " , col_name ) ;
2020-01-15 19:16:56 +00:00
}
}
}
2023-01-25 17:34:09 +00:00
checksums . checkSizes ( getDataPartStorage ( ) ) ;
2020-01-15 19:16:56 +00:00
}
else
{
2022-04-19 19:34:41 +00:00
auto check_file_not_empty = [ this ] ( const String & file_path )
2020-02-27 17:57:49 +00:00
{
2020-02-27 16:47:40 +00:00
UInt64 file_size ;
2022-10-23 03:29:26 +00:00
if ( ! getDataPartStorage ( ) . exists ( file_path ) | | ( file_size = getDataPartStorage ( ) . getFileSize ( file_path ) ) = = 0 )
2022-04-19 19:34:41 +00:00
throw Exception (
ErrorCodes : : BAD_SIZE_OF_FILE_IN_DATA_PART ,
" Part {} is broken: {} is empty " ,
2022-10-23 03:29:26 +00:00
getDataPartStorage ( ) . getFullPath ( ) ,
std : : string ( fs : : path ( getDataPartStorage ( ) . getFullPath ( ) ) / file_path ) ) ;
2020-02-27 16:47:40 +00:00
return file_size ;
2020-01-15 19:16:56 +00:00
} ;
/// Check that the primary key index is not empty.
2020-05-26 13:46:19 +00:00
if ( ! pk . column_names . empty ( ) )
2022-05-29 07:28:02 +00:00
{
2022-10-23 03:29:26 +00:00
String index_name = " primary " + getIndexExtensionFromFilesystem ( getDataPartStorage ( ) ) . value ( ) ;
2022-05-29 07:28:02 +00:00
check_file_not_empty ( index_name ) ;
}
2020-01-15 19:16:56 +00:00
if ( storage . format_version > = MERGE_TREE_DATA_MIN_FORMAT_VERSION_WITH_CUSTOM_PARTITIONING )
{
2022-04-19 19:34:41 +00:00
check_file_not_empty ( " count.txt " ) ;
2020-01-15 19:16:56 +00:00
2020-06-17 10:34:23 +00:00
if ( metadata_snapshot - > hasPartitionKey ( ) )
2022-04-19 19:34:41 +00:00
check_file_not_empty ( " partition.dat " ) ;
2020-01-15 19:16:56 +00:00
2021-02-10 14:12:49 +00:00
if ( ! parent_part )
{
for ( const String & col_name : storage . getMinMaxColumnsNames ( partition_key ) )
2022-04-19 19:34:41 +00:00
check_file_not_empty ( " minmax_ " + escapeForFileName ( col_name ) + " .idx " ) ;
2021-02-10 14:12:49 +00:00
}
2020-01-15 19:16:56 +00:00
}
}
}
2020-06-03 18:59:18 +00:00
void IMergeTreeDataPart : : checkConsistency ( bool /* require_part_metadata */ ) const
{
2023-01-23 21:13:58 +00:00
throw Exception ( ErrorCodes : : NOT_IMPLEMENTED , " Method 'checkConsistency' is not implemented for part with type {} " , getType ( ) . toString ( ) ) ;
2020-06-03 18:59:18 +00:00
}
2021-10-08 13:13:56 +00:00
void IMergeTreeDataPart : : calculateColumnsAndSecondaryIndicesSizesOnDisk ( )
{
calculateColumnsSizesOnDisk ( ) ;
calculateSecondaryIndicesSizesOnDisk ( ) ;
}
2020-03-23 12:19:43 +00:00
void IMergeTreeDataPart : : calculateColumnsSizesOnDisk ( )
{
if ( getColumns ( ) . empty ( ) | | checksums . empty ( ) )
2023-01-23 21:13:58 +00:00
throw Exception ( ErrorCodes : : LOGICAL_ERROR , " Cannot calculate columns sizes when columns or checksums are not initialized " ) ;
2020-03-23 12:19:43 +00:00
2020-06-29 20:36:18 +00:00
calculateEachColumnSizes ( columns_sizes , total_columns_size ) ;
2020-03-23 12:19:43 +00:00
}
2021-10-08 13:13:56 +00:00
void IMergeTreeDataPart : : calculateSecondaryIndicesSizesOnDisk ( )
{
if ( checksums . empty ( ) )
2023-01-23 21:13:58 +00:00
throw Exception ( ErrorCodes : : LOGICAL_ERROR , " Cannot calculate secondary indexes sizes when columns or checksums are not initialized " ) ;
2021-10-08 13:13:56 +00:00
auto secondary_indices_descriptions = storage . getInMemoryMetadataPtr ( ) - > secondary_indices ;
for ( auto & index_description : secondary_indices_descriptions )
{
ColumnSize index_size ;
auto index_ptr = MergeTreeIndexFactory : : instance ( ) . get ( index_description ) ;
auto index_name = index_ptr - > getFileName ( ) ;
auto index_name_escaped = escapeForFileName ( index_name ) ;
auto index_file_name = index_name_escaped + index_ptr - > getSerializedFileExtension ( ) ;
2022-09-05 05:26:58 +00:00
auto index_marks_file_name = index_name_escaped + getMarksFileExtension ( ) ;
2021-10-08 13:13:56 +00:00
2021-10-11 11:00:10 +00:00
/// If part does not contain index
2021-10-08 13:13:56 +00:00
auto bin_checksum = checksums . files . find ( index_file_name ) ;
if ( bin_checksum ! = checksums . files . end ( ) )
{
index_size . data_compressed = bin_checksum - > second . file_size ;
index_size . data_uncompressed = bin_checksum - > second . uncompressed_size ;
}
auto mrk_checksum = checksums . files . find ( index_marks_file_name ) ;
if ( mrk_checksum ! = checksums . files . end ( ) )
index_size . marks = mrk_checksum - > second . file_size ;
total_secondary_indices_size . add ( index_size ) ;
secondary_index_sizes [ index_description . name ] = index_size ;
}
}
2021-12-09 10:39:28 +00:00
/// Returns the stored size of the column, or a default-constructed ColumnSize when it is unknown
/// (for some types of parts columns sizes may be not calculated).
ColumnSize IMergeTreeDataPart::getColumnSize(const String & column_name) const
{
    if (const auto found = columns_sizes.find(column_name); found != columns_sizes.end())
        return found->second;

    return ColumnSize{};
}
2021-10-08 13:13:56 +00:00
/// Returns the stored size of the secondary index, or a default-constructed size when it is unknown.
IndexSize IMergeTreeDataPart::getSecondaryIndexSize(const String & secondary_index_name) const
{
    if (const auto found = secondary_index_sizes.find(secondary_index_name); found != secondary_index_sizes.end())
        return found->second;

    return ColumnSize{};
}
2020-03-23 12:19:43 +00:00
void IMergeTreeDataPart : : accumulateColumnSizes ( ColumnToSize & column_to_size ) const
{
2020-03-23 15:43:20 +00:00
for ( const auto & [ column_name , size ] : columns_sizes )
column_to_size [ column_name ] = size . data_compressed ;
2020-03-23 12:19:43 +00:00
}
2020-09-03 08:59:41 +00:00
bool IMergeTreeDataPart : : checkAllTTLCalculated ( const StorageMetadataPtr & metadata_snapshot ) const
{
if ( ! metadata_snapshot - > hasAnyTTL ( ) )
return false ;
if ( metadata_snapshot - > hasRowsTTL ( ) )
{
if ( isEmpty ( ) ) /// All rows were finally deleted and we don't store TTL
return true ;
else if ( ttl_infos . table_ttl . min = = 0 )
return false ;
}
for ( const auto & [ column , desc ] : metadata_snapshot - > getColumnTTLs ( ) )
{
/// Part has this column, but we don't calculated TTL for it
2022-04-18 10:18:43 +00:00
if ( ! ttl_infos . columns_ttl . contains ( column ) & & getColumns ( ) . contains ( column ) )
2020-09-03 08:59:41 +00:00
return false ;
}
for ( const auto & move_desc : metadata_snapshot - > getMoveTTLs ( ) )
{
/// Move TTL is not calculated
2022-04-18 10:18:43 +00:00
if ( ! ttl_infos . moves_ttl . contains ( move_desc . result_column ) )
2020-09-03 08:59:41 +00:00
return false ;
}
2020-12-25 14:52:46 +00:00
for ( const auto & group_by_desc : metadata_snapshot - > getGroupByTTLs ( ) )
{
2022-04-18 10:18:43 +00:00
if ( ! ttl_infos . group_by_ttl . contains ( group_by_desc . result_column ) )
2020-12-25 14:52:46 +00:00
return false ;
}
2021-01-13 14:04:27 +00:00
for ( const auto & rows_where_desc : metadata_snapshot - > getRowsWhereTTLs ( ) )
2021-01-11 23:07:21 +00:00
{
2022-04-18 10:18:43 +00:00
if ( ! ttl_infos . rows_where_ttl . contains ( rows_where_desc . result_column ) )
2021-01-11 23:07:21 +00:00
return false ;
}
2020-09-03 08:59:41 +00:00
return true ;
}
2022-06-30 20:51:27 +00:00
/// Returns the unique id reported by the part's data part storage.
String IMergeTreeDataPart::getUniqueId() const
{
    auto && part_storage = getDataPartStorage();
    return part_storage.getUniqueId();
}
2022-02-02 16:44:29 +00:00
/// Builds the block ID of a zero-level part: the partition id followed by the two
/// 64-bit words of a 128-bit SipHash, joined with the separator literal used below.
///
/// The hashed input depends on `token`:
///  - empty token: the hash is fed by checksums.computeTotalChecksumDataOnly();
///  - non-empty token: only the bytes of the token are hashed, i.e. the token is
///    used instead of the data digest.
///
/// Per the change description attached to this code (Issue #7461), the token
/// presumably comes from the insert_deduplication_token setting of an INSERT:
/// each block of the same INSERT gets an ordinal number appended to the token
/// (<token>_0, <token>_1, ...), and deduplication is done per block, so two
/// INSERTs deduplicate only when both the token and the number of blocks match.
///
/// Throws Exception with ErrorCodes::LOGICAL_ERROR when the part level is not zero.
String IMergeTreeDataPart::getZeroLevelPartBlockID(std::string_view token) const
{
    if (info.level != 0)
        throw Exception(ErrorCodes::LOGICAL_ERROR, " Trying to get block id for non zero level part {} ", name);

    SipHash block_hash;
    if (!token.empty())
        block_hash.update(token.data(), token.size());
    else
        checksums.computeTotalChecksumDataOnly(block_hash);

    /// The 16 hash bytes are written through `bytes` and read back as two 64-bit words.
    /// NOTE(review): this reads a union member other than the one last written - confirm
    /// the toolchain guarantees this behaviour.
    union
    {
        char bytes[16];
        UInt64 words[2];
    } digest;
    block_hash.get128(digest.bytes);

    return info.partition_id + " _ " + toString(digest.words[0]) + " _ " + toString(digest.words[1]);
}
2022-06-03 18:49:12 +00:00
/// Returns the hash of `file_name` within this part.
///  - If the file has an entry in `checksums.files` and is not listed by
///    getFileNamesWithoutChecksums(), the recorded `file_hash` is returned.
///  - Otherwise, if the file does not exist in the part storage, a
///    value-initialized result is returned.
///  - Otherwise the file is read to EOF through a HashingReadBuffer and the
///    resulting hash is returned.
/// Asserts that `use_metadata_cache` is set.
IMergeTreeDataPart::uint128 IMergeTreeDataPart::getActualChecksumByFile(const String & file_name) const
{
    assert(use_metadata_cache);

    const auto files_without_checksums = getFileNamesWithoutChecksums();
    const auto recorded = checksums.files.find(file_name);

    const bool has_recorded_hash = !files_without_checksums.contains(file_name) && recorded != checksums.files.end();
    if (has_recorded_hash)
        return recorded->second.file_hash;

    if (!getDataPartStorage().exists(file_name))
        return {};

    std::unique_ptr<ReadBufferFromFileBase> file_reader
        = getDataPartStorage().readFile(file_name, {}, std::nullopt, std::nullopt);
    HashingReadBuffer hashing_reader(*file_reader);

    /// The contents are only consumed to drive the hashing buffer; the string itself is discarded.
    String contents;
    readStringUntilEOF(contents, hashing_reader);

    return hashing_reader.getHash();
}
2022-01-07 10:37:08 +00:00
std : : unordered_map < String , IMergeTreeDataPart : : uint128 > IMergeTreeDataPart : : checkMetadata ( ) const
2021-12-08 02:40:59 +00:00
{
2022-01-07 10:37:08 +00:00
return metadata_manager - > check ( ) ;
2021-12-08 02:40:59 +00:00
}
2019-12-18 13:09:58 +00:00
bool isCompactPart ( const MergeTreeDataPartPtr & data_part )
{
2022-03-14 14:42:09 +00:00
return ( data_part & & data_part - > getType ( ) = = MergeTreeDataPartType : : Compact ) ;
2019-12-18 13:09:58 +00:00
}
bool isWidePart ( const MergeTreeDataPartPtr & data_part )
{
2022-03-14 14:42:09 +00:00
return ( data_part & & data_part - > getType ( ) = = MergeTreeDataPartType : : Wide ) ;
2019-12-18 13:09:58 +00:00
}
2020-04-20 01:38:38 +00:00
bool isInMemoryPart ( const MergeTreeDataPartPtr & data_part )
{
2022-03-14 14:42:09 +00:00
return ( data_part & & data_part - > getType ( ) = = MergeTreeDataPartType : : InMemory ) ;
2020-04-20 01:38:38 +00:00
}
2022-10-23 03:29:26 +00:00
std : : optional < std : : string > getIndexExtensionFromFilesystem ( const IDataPartStorage & data_part_storage )
2022-05-29 07:28:02 +00:00
{
2022-10-23 03:29:26 +00:00
if ( data_part_storage . exists ( ) )
2022-05-29 07:28:02 +00:00
{
2022-10-23 03:29:26 +00:00
for ( auto it = data_part_storage . iterate ( ) ; it - > isValid ( ) ; it - > next ( ) )
2022-05-29 07:28:02 +00:00
{
const auto & extension = fs : : path ( it - > name ( ) ) . extension ( ) ;
2022-11-16 13:23:58 +00:00
if ( extension = = getIndexExtension ( true ) )
2022-05-29 07:28:02 +00:00
return extension ;
}
}
return { " .idx " } ;
}
2022-08-28 02:19:14 +00:00
bool isCompressedFromIndexExtension ( const String & index_extension )
2022-05-29 07:28:02 +00:00
{
return index_extension = = getIndexExtension ( true ) ;
}
2022-10-13 16:07:25 +00:00
/// Collects the `name` of every part in `parts`, in the same order as the input.
Strings getPartsNames(const MergeTreeDataPartsVector & parts)
{
    Strings part_names;

    /// The final element count is known up front, so allocate once instead of
    /// letting the vector grow on every push_back.
    part_names.reserve(parts.size());

    for (const auto & part : parts)
        part_names.push_back(part->name);

    return part_names;
}
2019-10-10 16:30:30 +00:00
}