#include <Storages/MergeTree/ParallelReplicasReadingCoordinator.h>

#include <algorithm>
#include <atomic>
#include <memory>
#include <mutex>
#include <numeric>
#include <vector>
#include <map>
#include <set>

#include <consistent_hashing.h>

#include <Common/logger_useful.h>
#include <Common/Exception.h>
#include <Common/SipHash.h>
#include <Common/thread_local_rng.h>
#include <base/types.h>
#include "IO/WriteBufferFromString.h"
#include "Storages/MergeTree/RangesInDataPart.h"
#include "Storages/MergeTree/RequestResponse.h"
#include <Storages/MergeTree/MarkRange.h>
#include <Storages/MergeTree/IntersectionsIndexes.h>
#include <fmt/format.h>
#include <magic_enum.hpp>

namespace DB
{

namespace ErrorCodes
{
    extern const int LOGICAL_ERROR;
}
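
/// Common interface for the concrete coordinator implementations. It owns the
/// per-replica request statistics and the mutex that guards the shared state.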
class ParallelReplicasReadingCoordinator::ImplInterface
{
public:
    struct Stat
    {
        size_t number_of_requests{0};
        size_t sum_marks{0};
    };
    using Stats = std::vector<Stat>;

    static String toString(Stats stats)
    {
        String result = "Statistics: ";
        for (size_t i = 0; i < stats.size(); ++i)
            result += fmt::format("-- replica {}, requests: {} marks: {} ", i, stats[i].number_of_requests, stats[i].sum_marks);
        return result;
    }

    Stats stats;
    std::mutex mutex;
    size_t replicas_count;

    explicit ImplInterface(size_t replicas_count_)
        : stats{replicas_count_}
        , replicas_count(replicas_count_)
    {}

    virtual ~ImplInterface() = default;
    virtual ParallelReadResponse handleRequest(ParallelReadRequest request) = 0;
    virtual void handleInitialAllRangesAnnouncement(InitialAllRangesAnnouncement announcement) = 0;
};
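
/// A part known to the coordinator together with the set of replicas that have
/// announced it. Both fields are mutable on purpose: elements of a std::set can
/// only be reached through iterators, and the coordinator shrinks the range
/// list and extends the replica set in place (see the FIXME below).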
struct Part
{
    mutable RangesInDataPartDescription description;
    // FIXME: This is needed to put this struct in set
    // and modify through iterator
    mutable std::set<size_t> replicas;

    bool operator<(const Part & rhs) const { return description.info < rhs.description.info; }
};

using Parts = std::set<Part>;
using PartRefs = std::deque<Parts::iterator>;
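
/// Coordinator for reading without a required order (CoordinationMode::Default).
/// Every announced part gets a preferred replica chosen by a consistent hash of
/// its name; replicas that run out of their own work take parts from the delayed
/// queue and then steal from the queues of other replicas.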
class DefaultCoordinator : public ParallelReplicasReadingCoordinator::ImplInterface
{
public:
    using ParallelReadRequestPtr = std::unique_ptr<ParallelReadRequest>;
    using PartToMarkRanges = std::map<PartToRead::PartAndProjectionNames, HalfIntervals>;

    explicit DefaultCoordinator(size_t replicas_count_)
        : ParallelReplicasReadingCoordinator::ImplInterface(replicas_count_)
        , announcements(replicas_count_)
        , reading_state(replicas_count_)
    {
    }

    ~DefaultCoordinator() override;

    struct PartitionReading
    {
        PartSegments part_ranges;
        PartToMarkRanges mark_ranges_in_part;
    };

    using PartitionToBlockRanges = std::map<String, PartitionReading>;
    PartitionToBlockRanges partitions;

    size_t sent_initial_requests{0};
    std::vector<InitialAllRangesAnnouncement> announcements;
    Parts all_parts_to_read;
    /// Contains only parts which we haven't started to read from
    PartRefs delayed_parts;
    /// Per-replica preferred parts split by consistent hash.
    /// Once a replica has finished all its tasks, it can steal tasks from the others.
    std::vector<PartRefs> reading_state;

    Poco::Logger * log = &Poco::Logger::get("DefaultCoordinator");

    std::atomic<bool> state_initialized{false};

    ParallelReadResponse handleRequest(ParallelReadRequest request) override;
    void handleInitialAllRangesAnnouncement(InitialAllRangesAnnouncement announcement) override;

    void updateReadingState(const InitialAllRangesAnnouncement & announcement);
    void finalizeReadingState();

    size_t computeConsistentHash(const MergeTreePartInfo & info) const
    {
        auto hash = SipHash();
        hash.update(info.getPartNameV1());
        return ConsistentHashing(hash.get64(), replicas_count);
    }

    void selectPartsAndRanges(
        const PartRefs & container,
        size_t replica_num,
        size_t min_number_of_marks,
        size_t & current_mark_size,
        ParallelReadResponse & response) const;
};

DefaultCoordinator::~DefaultCoordinator()
{
    LOG_INFO(log, "Coordination done: {}", toString(stats));
}
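
/// Merge a replica's announcement into the global set of parts. Exact duplicates
/// only extend the replica set of the already-known part; parts intersecting a
/// known one are skipped. Genuinely new parts go either straight to the queue of
/// their consistent-hash replica (if that replica has the part) or to the
/// delayed queue.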
void DefaultCoordinator::updateReadingState(const InitialAllRangesAnnouncement & announcement)
{
    PartRefs parts_diff;

    /// To get rid of duplicates
    for (const auto & part : announcement.description)
    {
        auto the_same_it = std::find_if(all_parts_to_read.begin(), all_parts_to_read.end(),
            [&part] (const Part & other) { return other.description.info.getPartNameV1() == part.info.getPartNameV1(); });

        /// We have the same part - add the info about its presence on the current replica
        if (the_same_it != all_parts_to_read.end())
        {
            the_same_it->replicas.insert(announcement.replica_num);
            continue;
        }

        auto covering_or_the_same_it = std::find_if(all_parts_to_read.begin(), all_parts_to_read.end(),
            [&part] (const Part & other) { return !other.description.info.isDisjoint(part.info); });

        /// It is a covering part, or we already have a covering one - skip it
        if (covering_or_the_same_it != all_parts_to_read.end())
            continue;

        auto new_part = Part{
            .description = part,
            .replicas = {announcement.replica_num}
        };

        auto [insert_it, _] = all_parts_to_read.insert(new_part);
        parts_diff.push_back(insert_it);
    }

    /// Split all new parts by consistent hash
    while (!parts_diff.empty())
    {
        auto current_part_it = parts_diff.front();
        parts_diff.pop_front();
        auto consistent_hash = computeConsistentHash(current_part_it->description.info);

        /// Check whether the new part can go straight to the queue of its preferred replica
        if (current_part_it->replicas.contains(consistent_hash))
        {
            reading_state[consistent_hash].emplace_back(current_part_it);
            continue;
        }

        /// The preferred replica doesn't have this part - keep it in the delayed queue for now
        delayed_parts.emplace_back(current_part_it);
    }
}
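
/// Called once the announcements from all replicas have arrived. At this point
/// the replica sets of all parts are final, so every part still sitting in the
/// delayed queue can be assigned: to its consistent-hash replica if possible,
/// otherwise to a random replica that has the part.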
void DefaultCoordinator::finalizeReadingState()
{
    /// Drain the whole delayed queue
    while (!delayed_parts.empty())
    {
        auto current_part_it = delayed_parts.front();
        auto consistent_hash = computeConsistentHash(current_part_it->description.info);

        if (current_part_it->replicas.contains(consistent_hash))
        {
            reading_state[consistent_hash].emplace_back(current_part_it);
            delayed_parts.pop_front();
            continue;
        }

        /// In this situation just assign the part to a random replica which has it
        auto replica = *(std::next(current_part_it->replicas.begin(), thread_local_rng() % current_part_it->replicas.size()));
        reading_state[replica].emplace_back(current_part_it);
        delayed_parts.pop_front();
    }

    String description;
    for (const auto & part : all_parts_to_read)
    {
        description += part.description.describe();
        description += fmt::format("Replicas: ({}) --- ", fmt::join(part.replicas, ","));
    }

    LOG_INFO(log, "Reading state is fully initialized: {}", description);
}

void DefaultCoordinator::handleInitialAllRangesAnnouncement(InitialAllRangesAnnouncement announcement)
{
    std::lock_guard lock(mutex);

    updateReadingState(announcement);
    stats[announcement.replica_num].number_of_requests += 1;

    ++sent_initial_requests;
    LOG_INFO(log, "Sent initial requests: {} Replicas count: {}", sent_initial_requests, replicas_count);
    if (sent_initial_requests == replicas_count)
        finalizeReadingState();
}
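
/// Move ranges from the given queue of parts into the response until at least
/// min_number_of_marks marks have been collected. Only parts present on the
/// requesting replica are considered; an oversized range is split so that a
/// single request never takes much more than it asked for.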
void DefaultCoordinator::selectPartsAndRanges(
    const PartRefs & container,
    size_t replica_num,
    size_t min_number_of_marks,
    size_t & current_mark_size,
    ParallelReadResponse & response) const
{
    for (const auto & part : container)
    {
        if (current_mark_size >= min_number_of_marks)
        {
            LOG_TEST(log, "Current mark size {} is bigger than min_number_marks {}", current_mark_size, min_number_of_marks);
            break;
        }

        if (part->description.ranges.empty())
        {
            LOG_TEST(log, "Part {} is already empty in reading state", part->description.info.getPartNameV1());
            continue;
        }

        if (std::find(part->replicas.begin(), part->replicas.end(), replica_num) == part->replicas.end())
        {
            LOG_TEST(log, "Not found part {} on replica {}", part->description.info.getPartNameV1(), replica_num);
            continue;
        }

        response.description.push_back({
            .info = part->description.info,
            .ranges = {},
        });

        while (!part->description.ranges.empty() && current_mark_size < min_number_of_marks)
        {
            auto & range = part->description.ranges.front();

            if (range.getNumberOfMarks() > min_number_of_marks)
            {
                /// Split a big range: the first min_number_of_marks marks go into the
                /// response, the rest stays at the front of the queue.
                auto new_range = range;
                range.begin += min_number_of_marks;
                new_range.end = new_range.begin + min_number_of_marks;

                response.description.back().ranges.emplace_back(new_range);
                current_mark_size += new_range.getNumberOfMarks();
                continue;
            }

            /// A small range is taken as a whole
            current_mark_size += part->description.ranges.front().getNumberOfMarks();
            response.description.back().ranges.emplace_back(part->description.ranges.front());
            part->description.ranges.pop_front();
        }
    }
}
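
/// Produce the next portion of work for a replica in three steps: first from
/// the replica's own preferred queue, then from the delayed queue, and finally
/// by stealing from the queues of other replicas.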
ParallelReadResponse DefaultCoordinator::handleRequest(ParallelReadRequest request)
{
    std::lock_guard lock(mutex);

    LOG_TRACE(log, "Handling request from replica {}, minimal marks size is {}", request.replica_num, request.min_number_of_marks);

    size_t current_mark_size = 0;
    ParallelReadResponse response;

    /// 1. Try to select from the preferred set of parts for the current replica
    selectPartsAndRanges(reading_state[request.replica_num], request.replica_num, request.min_number_of_marks, current_mark_size, response);

    /// 2. Try to use parts from the delayed queue
    while (!delayed_parts.empty() && current_mark_size < request.min_number_of_marks)
    {
        auto part = delayed_parts.front();
        delayed_parts.pop_front();
        reading_state[request.replica_num].emplace_back(part);
        selectPartsAndRanges(reading_state[request.replica_num], request.replica_num, request.min_number_of_marks, current_mark_size, response);
    }

    /// 3. Try to steal tasks from other replicas
    if (current_mark_size < request.min_number_of_marks)
    {
        for (size_t i = 0; i < replicas_count; ++i)
        {
            if (i != request.replica_num)
                selectPartsAndRanges(reading_state[i], request.replica_num, request.min_number_of_marks, current_mark_size, response);

            if (current_mark_size >= request.min_number_of_marks)
                break;
        }
    }

    stats[request.replica_num].number_of_requests += 1;
    stats[request.replica_num].sum_marks += current_mark_size;

    if (response.description.empty())
        response.finish = true;

    LOG_TRACE(log, "Going to respond to replica {} with {}", request.replica_num, response.describe());
    return response;
}
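
/// Coordinator for reading in a fixed mark order (WithOrder or ReverseOrder).
/// Here every replica proposes the parts it wants to read itself, and the
/// coordinator only decides which ranges of those parts it may take next.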
template <CoordinationMode mode>
class InOrderCoordinator : public ParallelReplicasReadingCoordinator::ImplInterface
{
public:
    explicit InOrderCoordinator([[maybe_unused]] size_t replicas_count_)
        : ParallelReplicasReadingCoordinator::ImplInterface(replicas_count_)
    {}

    ~InOrderCoordinator() override
    {
        LOG_INFO(log, "Coordination done: {}", toString(stats));
    }

    ParallelReadResponse handleRequest([[maybe_unused]] ParallelReadRequest request) override;
    void handleInitialAllRangesAnnouncement([[maybe_unused]] InitialAllRangesAnnouncement announcement) override;

    Parts all_parts_to_read;

    Poco::Logger * log = &Poco::Logger::get(fmt::format("{}{}", magic_enum::enum_name(mode), "Coordinator"));
};
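
/// Register the parts announced by a replica, deduplicating exact matches and
/// dropping parts covered by (or covering) already-known ones. The ranges of a
/// newly inserted part are sorted so that handleRequest can consume them from
/// the front (WithOrder) or from the back (ReverseOrder).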
template <CoordinationMode mode>
void InOrderCoordinator<mode>::handleInitialAllRangesAnnouncement(InitialAllRangesAnnouncement announcement)
{
    std::lock_guard lock(mutex);
    LOG_TRACE(log, "Received an announcement {}", announcement.describe());

    /// To get rid of duplicates
    for (const auto & part : announcement.description)
    {
        auto the_same_it = std::find_if(all_parts_to_read.begin(), all_parts_to_read.end(),
            [&part] (const Part & other) { return other.description.info == part.info; });

        /// We have the same part - add the info about its presence on the current replica
        if (the_same_it != all_parts_to_read.end())
        {
            the_same_it->replicas.insert(announcement.replica_num);
            continue;
        }

        auto covering_or_the_same_it = std::find_if(all_parts_to_read.begin(), all_parts_to_read.end(),
            [&part] (const Part & other) { return other.description.info.contains(part.info) || part.info.contains(other.description.info); });

        /// It is a covering part, or we already have a covering one - skip it
        if (covering_or_the_same_it != all_parts_to_read.end())
            continue;

        auto new_part = Part{
            .description = part,
            .replicas = {announcement.replica_num}
        };

        auto insert_it = all_parts_to_read.insert(new_part);
        auto & ranges = insert_it.first->description.ranges;
        std::sort(ranges.begin(), ranges.end());
    }
}
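
/// Answer a request by echoing the requested parts back with the concrete
/// ranges the replica should read next, taken from the matching end of each
/// part's remaining range list according to the coordination mode.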
template <CoordinationMode mode>
ParallelReadResponse InOrderCoordinator<mode>::handleRequest(ParallelReadRequest request)
{
    std::lock_guard lock(mutex);

    if (request.mode != mode)
        throw Exception(ErrorCodes::LOGICAL_ERROR,
            "Replica {} decided to read in {} mode, not in {}. This is a bug",
            request.replica_num, magic_enum::enum_name(request.mode), magic_enum::enum_name(mode));

    LOG_TRACE(log, "Got request from replica {}, data {}", request.replica_num, request.describe());

    ParallelReadResponse response;
    response.description = request.description;
    size_t overall_number_of_marks = 0;

    for (auto & part : response.description)
    {
        auto global_part_it = std::find_if(all_parts_to_read.begin(), all_parts_to_read.end(),
            [&part] (const Part & other) { return other.description.info == part.info; });

        if (global_part_it == all_parts_to_read.end())
            continue;

        if (!global_part_it->replicas.contains(request.replica_num))
            throw Exception(ErrorCodes::LOGICAL_ERROR, "Part {} doesn't exist on replica {} according to the global state",
                part.info.getPartNameV1(), request.replica_num);

        size_t current_mark_size = 0;

        /// Now we can recommend to read more intervals
        if constexpr (mode == CoordinationMode::ReverseOrder)
        {
            while (!global_part_it->description.ranges.empty() && current_mark_size < request.min_number_of_marks)
            {
                auto range = global_part_it->description.ranges.back();

                if (range.getNumberOfMarks() > request.min_number_of_marks)
                {
                    /// Split a big range: hand out its tail and keep the head in the global state
                    auto new_range = range;
                    range.end -= request.min_number_of_marks;
                    new_range.begin = new_range.end - request.min_number_of_marks;

                    global_part_it->description.ranges.back() = range;

                    part.ranges.emplace_front(new_range);
                    current_mark_size += new_range.getNumberOfMarks();
                    continue;
                }

                current_mark_size += global_part_it->description.ranges.back().getNumberOfMarks();
                part.ranges.emplace_front(global_part_it->description.ranges.back());
                global_part_it->description.ranges.pop_back();
            }
        }
        else if constexpr (mode == CoordinationMode::WithOrder)
        {
            while (!global_part_it->description.ranges.empty() && current_mark_size < request.min_number_of_marks)
            {
                auto range = global_part_it->description.ranges.front();

                if (range.getNumberOfMarks() > request.min_number_of_marks)
                {
                    /// Split a big range: hand out its head and keep the tail in the global state
                    auto new_range = range;
                    range.begin += request.min_number_of_marks;
                    new_range.end = new_range.begin + request.min_number_of_marks;

                    global_part_it->description.ranges.front() = range;

                    part.ranges.emplace_back(new_range);
                    current_mark_size += new_range.getNumberOfMarks();
                    continue;
                }

                current_mark_size += global_part_it->description.ranges.front().getNumberOfMarks();
                part.ranges.emplace_back(global_part_it->description.ranges.front());
                global_part_it->description.ranges.pop_front();
            }
        }

        overall_number_of_marks += current_mark_size;
    }

    if (!overall_number_of_marks)
        response.finish = true;

    stats[request.replica_num].number_of_requests += 1;
    stats[request.replica_num].sum_marks += overall_number_of_marks;

    LOG_TRACE(log, "Going to respond to replica {} with {}", request.replica_num, response.describe());
    return response;
}
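
/// Public interface. The concrete implementation is created lazily on first use,
/// so that setMode() can pick the coordination mode beforehand.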
void ParallelReplicasReadingCoordinator::handleInitialAllRangesAnnouncement(InitialAllRangesAnnouncement announcement)
{
    if (!pimpl)
        initialize();

    return pimpl->handleInitialAllRangesAnnouncement(announcement);
}

ParallelReadResponse ParallelReplicasReadingCoordinator::handleRequest(ParallelReadRequest request)
{
    if (!pimpl)
        initialize();

    return pimpl->handleRequest(std::move(request));
}

void ParallelReplicasReadingCoordinator::setMode(CoordinationMode mode_)
{
    mode = mode_;
}

void ParallelReplicasReadingCoordinator::initialize()
{
    switch (mode)
    {
        case CoordinationMode::Default:
            pimpl = std::make_unique<DefaultCoordinator>(replicas_count);
            return;
        case CoordinationMode::WithOrder:
            pimpl = std::make_unique<InOrderCoordinator<CoordinationMode::WithOrder>>(replicas_count);
            return;
        case CoordinationMode::ReverseOrder:
            pimpl = std::make_unique<InOrderCoordinator<CoordinationMode::ReverseOrder>>(replicas_count);
            return;
    }
}

ParallelReplicasReadingCoordinator::ParallelReplicasReadingCoordinator(size_t replicas_count_) : replicas_count(replicas_count_) {}

ParallelReplicasReadingCoordinator::~ParallelReplicasReadingCoordinator() = default;

}