2014-03-21 13:42:14 +00:00
|
|
|
#pragma once
|
|
|
|
|
2017-06-06 17:18:32 +00:00
|
|
|
#include <ext/shared_ptr_helper.h>
|
2017-03-31 14:46:48 +00:00
|
|
|
#include <atomic>
|
2017-09-09 23:17:38 +00:00
|
|
|
#include <pcg_random.hpp>
|
2017-04-01 09:19:00 +00:00
|
|
|
#include <Storages/IStorage.h>
|
|
|
|
#include <Storages/MergeTree/MergeTreeData.h>
|
2018-04-20 16:18:16 +00:00
|
|
|
#include <Storages/MergeTree/MergeTreeDataMergerMutator.h>
|
2017-04-01 09:19:00 +00:00
|
|
|
#include <Storages/MergeTree/MergeTreeDataWriter.h>
|
|
|
|
#include <Storages/MergeTree/MergeTreeDataSelectExecutor.h>
|
|
|
|
#include <Storages/MergeTree/ReplicatedMergeTreeLogEntry.h>
|
|
|
|
#include <Storages/MergeTree/ReplicatedMergeTreeQueue.h>
|
|
|
|
#include <Storages/MergeTree/ReplicatedMergeTreeCleanupThread.h>
|
|
|
|
#include <Storages/MergeTree/ReplicatedMergeTreeRestartingThread.h>
|
|
|
|
#include <Storages/MergeTree/ReplicatedMergeTreePartCheckThread.h>
|
|
|
|
#include <Storages/MergeTree/ReplicatedMergeTreeAlterThread.h>
|
2018-11-02 15:39:19 +00:00
|
|
|
#include <Storages/MergeTree/ReplicatedMergeTreeTableMetadata.h>
|
2018-07-04 16:31:21 +00:00
|
|
|
#include <Storages/MergeTree/EphemeralLockInZooKeeper.h>
|
2017-04-01 09:19:00 +00:00
|
|
|
#include <Storages/MergeTree/BackgroundProcessingPool.h>
|
|
|
|
#include <Storages/MergeTree/DataPartsExchange.h>
|
2018-04-17 17:59:42 +00:00
|
|
|
#include <Storages/MergeTree/ReplicatedMergeTreeAddress.h>
|
2017-04-01 09:19:00 +00:00
|
|
|
#include <DataTypes/DataTypesNumber.h>
|
2018-12-28 17:11:52 +00:00
|
|
|
#include <Interpreters/Cluster.h>
|
2018-05-25 19:44:14 +00:00
|
|
|
#include <Interpreters/PartLog.h>
|
2017-09-08 23:31:18 +00:00
|
|
|
#include <Common/randomSeed.h>
|
2017-06-19 20:06:35 +00:00
|
|
|
#include <Common/ZooKeeper/ZooKeeper.h>
|
|
|
|
#include <Common/ZooKeeper/LeaderElection.h>
|
2018-08-20 15:34:37 +00:00
|
|
|
#include <Core/BackgroundSchedulePool.h>
|
2015-03-16 19:24:57 +00:00
|
|
|
|
2014-03-21 13:42:14 +00:00
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
2017-04-16 15:00:33 +00:00
|
|
|
/** The engine that uses the merge tree (see MergeTreeData) and replicated through ZooKeeper.
|
2015-09-20 05:21:43 +00:00
|
|
|
*
|
2017-04-16 15:00:33 +00:00
|
|
|
* ZooKeeper is used for the following things:
|
|
|
|
* - the structure of the table (/metadata, /columns)
|
|
|
|
* - action log with data (/log/log-...,/replicas/replica_name/queue/queue-...);
|
|
|
|
* - a replica list (/replicas), and replica activity tag (/replicas/replica_name/is_active), replica addresses (/replicas/replica_name/host);
|
|
|
|
* - select the leader replica (/leader_election) - this is the replica that assigns the merge;
|
|
|
|
* - a set of parts of data on each replica (/replicas/replica_name/parts);
|
|
|
|
* - list of the last N blocks of data with checksum, for deduplication (/blocks);
|
|
|
|
* - the list of incremental block numbers (/block_numbers) that we are about to insert,
|
|
|
|
* to ensure the linear order of data insertion and data merge only on the intervals in this sequence;
|
|
|
|
* - coordinates writes with quorum (/quorum).
|
2018-06-05 14:55:35 +00:00
|
|
|
* - Storage of mutation entries (ALTER DELETE, ALTER UPDATE etc.) to execute (/mutations).
|
|
|
|
* See comments in StorageReplicatedMergeTree::mutate() for details.
|
2014-03-21 19:17:59 +00:00
|
|
|
*/
|
2015-09-20 05:21:43 +00:00
|
|
|
|
2017-04-16 15:00:33 +00:00
|
|
|
/** The replicated tables have a common log (/log/log-...).
|
|
|
|
* Log - a sequence of entries (LogEntry) about what to do.
|
|
|
|
* Each entry is one of:
|
|
|
|
* - normal data insertion (GET),
|
|
|
|
* - merge (MERGE),
|
|
|
|
* - delete the partition (DROP).
|
2015-09-20 05:21:43 +00:00
|
|
|
*
|
2018-06-05 14:55:35 +00:00
|
|
|
* Each replica copies (queueUpdatingTask, pullLogsToQueue) entries from the log to its queue (/replicas/replica_name/queue/queue-...)
|
2017-04-16 15:00:33 +00:00
|
|
|
* and then executes them (queueTask).
|
|
|
|
* Despite the name of the "queue", execution can be reordered, if necessary (shouldExecuteLogEntry, executeLogEntry).
|
|
|
|
* In addition, the records in the queue can be generated independently (not from the log), in the following cases:
|
|
|
|
* - when creating a new replica, actions are put on GET from other replicas (createReplica);
|
|
|
|
* - if the part is corrupt (removePartAndEnqueueFetch) or absent during the check (at start - checkParts, while running - searchForMissingPart),
|
|
|
|
* entries to GET it from other replicas are added to the queue;
|
2015-09-20 05:21:43 +00:00
|
|
|
*
|
2017-04-16 15:00:33 +00:00
|
|
|
* The replica on which the INSERT was made will also have a GET entry for this data in its queue.
|
|
|
|
* Such an entry is considered to be executed as soon as the queue handler sees it.
|
2015-09-20 05:21:43 +00:00
|
|
|
*
|
2017-04-16 15:00:33 +00:00
|
|
|
* The log entry has a creation time. This time is generated by the clock of server that created entry
|
|
|
|
* - the one on which the corresponding INSERT or ALTER query came.
|
2015-09-20 05:21:43 +00:00
|
|
|
*
|
2017-04-16 15:00:33 +00:00
|
|
|
* For the entries in the queue that the replica made for itself,
|
|
|
|
* the creation time of the corresponding part on any of the replicas is used as the entry time.
|
2015-09-20 05:21:43 +00:00
|
|
|
*/
|
|
|
|
|
2019-05-03 02:00:57 +00:00
|
|
|
class StorageReplicatedMergeTree : public ext::shared_ptr_helper<StorageReplicatedMergeTree>, public MergeTreeData
|
2014-03-21 13:42:14 +00:00
|
|
|
{
|
|
|
|
public:
|
2017-06-06 17:06:14 +00:00
|
|
|
void startup() override;
|
2017-04-01 07:20:54 +00:00
|
|
|
void shutdown() override;
|
|
|
|
~StorageReplicatedMergeTree() override;
|
|
|
|
|
2019-05-03 02:00:57 +00:00
|
|
|
/// Engine name: "Replicated" + <merging mode name> + "MergeTree".
std::string getName() const override
{
    return "Replicated" + merging_params.getModeName() + "MergeTree";
}
|
2017-04-01 07:20:54 +00:00
|
|
|
/// Returns a copy of the stored table name.
std::string getTableName() const override
{
    return table_name;
}
|
2019-03-29 20:31:06 +00:00
|
|
|
/// Returns a copy of the stored database name.
std::string getDatabaseName() const override
{
    return database_name;
}
|
|
|
|
|
2017-04-25 15:21:03 +00:00
|
|
|
/// This engine always reports replication support.
bool supportsReplication() const override
{
    return true;
}
|
2018-05-21 23:17:57 +00:00
|
|
|
/// This engine always reports deduplication support.
bool supportsDeduplication() const override
{
    return true;
}
|
2017-04-01 07:20:54 +00:00
|
|
|
|
|
|
|
BlockInputStreams read(
|
|
|
|
const Names & column_names,
|
2017-07-15 03:48:36 +00:00
|
|
|
const SelectQueryInfo & query_info,
|
2017-04-01 07:20:54 +00:00
|
|
|
const Context & context,
|
2018-04-19 14:47:09 +00:00
|
|
|
QueryProcessingStage::Enum processed_stage,
|
2019-02-18 23:38:44 +00:00
|
|
|
size_t max_block_size,
|
2017-06-02 15:54:39 +00:00
|
|
|
unsigned num_streams) override;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2019-02-27 18:26:24 +00:00
|
|
|
BlockOutputStreamPtr write(const ASTPtr & query, const Context & context) override;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2018-12-28 13:39:44 +00:00
|
|
|
bool optimize(const ASTPtr & query, const ASTPtr & partition, bool final, bool deduplicate, const Context & query_context) override;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2019-03-05 10:12:20 +00:00
|
|
|
void alter(
|
|
|
|
const AlterCommands & params, const String & database_name, const String & table_name,
|
2019-03-07 20:52:25 +00:00
|
|
|
const Context & query_context, TableStructureWriteLockHolder & table_lock_holder) override;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2018-12-28 13:39:44 +00:00
|
|
|
void alterPartition(const ASTPtr & query, const PartitionCommands & commands, const Context & query_context) override;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2018-04-19 10:33:16 +00:00
|
|
|
void mutate(const MutationCommands & commands, const Context & context) override;
|
2019-05-03 02:00:57 +00:00
|
|
|
std::vector<MergeTreeMutationStatus> getMutationsStatus() const override;
|
2019-02-04 13:04:02 +00:00
|
|
|
CancellationCode killMutation(const String & mutation_id) override;
|
2018-06-07 13:28:39 +00:00
|
|
|
|
2017-04-16 15:00:33 +00:00
|
|
|
/** Removes a replica from ZooKeeper. If there are no other replicas, it deletes the entire table from ZooKeeper.
|
2017-04-01 07:20:54 +00:00
|
|
|
*/
|
|
|
|
void drop() override;
|
|
|
|
|
2018-12-28 13:39:44 +00:00
|
|
|
void truncate(const ASTPtr &, const Context &) override;
|
2018-04-21 00:35:20 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
void rename(const String & new_path_to_db, const String & new_database_name, const String & new_table_name) override;
|
|
|
|
|
|
|
|
/// Always answers true for this engine.
bool supportsIndexForIn() const override
{
    return true;
}
|
|
|
|
|
2018-08-03 09:39:01 +00:00
|
|
|
void checkTableCanBeDropped() const override;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2018-08-03 09:39:01 +00:00
|
|
|
void checkPartitionCanBeDropped(const ASTPtr & partition) override;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2018-05-28 15:37:30 +00:00
|
|
|
ActionLock getActionLock(StorageActionBlockType action_type) override;
|
2018-05-21 13:49:54 +00:00
|
|
|
|
|
|
|
/// Wait until the replication queue size becomes less than or equal to queue_size.
|
|
|
|
/// If timeout is exceeded returns false
|
|
|
|
bool waitForShrinkingQueueSize(size_t queue_size = 0, UInt64 max_wait_milliseconds = 0);
|
|
|
|
|
2017-04-16 15:00:33 +00:00
|
|
|
/** For the system table replicas. */
|
2017-04-01 07:20:54 +00:00
|
|
|
struct Status
|
|
|
|
{
|
|
|
|
bool is_leader;
|
2019-02-13 13:05:58 +00:00
|
|
|
bool can_become_leader;
|
2017-04-01 07:20:54 +00:00
|
|
|
bool is_readonly;
|
|
|
|
bool is_session_expired;
|
|
|
|
ReplicatedMergeTreeQueue::Status queue;
|
|
|
|
UInt32 parts_to_check;
|
|
|
|
String zookeeper_path;
|
|
|
|
String replica_name;
|
|
|
|
String replica_path;
|
|
|
|
Int32 columns_version;
|
|
|
|
UInt64 log_max_index;
|
|
|
|
UInt64 log_pointer;
|
2017-04-17 15:06:12 +00:00
|
|
|
UInt64 absolute_delay;
|
2017-04-01 07:20:54 +00:00
|
|
|
UInt8 total_replicas;
|
|
|
|
UInt8 active_replicas;
|
|
|
|
};
|
|
|
|
|
2017-04-16 15:00:33 +00:00
|
|
|
/// Get the status of the table. If with_zk_fields = false - do not fill in the fields that require queries to ZK.
|
2017-04-01 07:20:54 +00:00
|
|
|
void getStatus(Status & res, bool with_zk_fields = true);
|
|
|
|
|
|
|
|
using LogEntriesData = std::vector<ReplicatedMergeTreeLogEntryData>;
|
|
|
|
void getQueue(LogEntriesData & res, String & replica_name);
|
|
|
|
|
2017-04-17 15:06:12 +00:00
|
|
|
/// Get replica delay relative to current time.
|
|
|
|
time_t getAbsoluteDelay() const;
|
|
|
|
|
|
|
|
/// If the absolute delay is greater than min_relative_delay_to_yield_leadership,
|
|
|
|
/// will also calculate the difference from the unprocessed time of the best replica.
|
|
|
|
/// NOTE: Will communicate to ZooKeeper to calculate relative delay.
|
2017-04-01 07:20:54 +00:00
|
|
|
void getReplicaDelays(time_t & out_absolute_delay, time_t & out_relative_delay);
|
|
|
|
|
2017-04-16 15:00:33 +00:00
|
|
|
/// Add a part to the queue of parts whose data you want to check in the background thread.
|
2017-04-01 07:20:54 +00:00
|
|
|
void enqueuePartForCheck(const String & part_name, time_t delay_to_check_seconds = 0)
|
|
|
|
{
|
|
|
|
part_check_thread.enqueuePart(part_name, delay_to_check_seconds);
|
|
|
|
}
|
2016-04-09 03:50:02 +00:00
|
|
|
|
2018-02-21 19:26:59 +00:00
|
|
|
/// Returns a copy of full_path.
String getDataPath() const override
{
    return full_path;
}
|
|
|
|
|
2019-07-03 13:17:19 +00:00
|
|
|
CheckResults checkData(const ASTPtr & query, const Context & context) override;
|
|
|
|
|
2019-08-12 13:30:29 +00:00
|
|
|
/// Checks ability to use granularity
|
|
|
|
bool canUseAdaptiveGranularity() const override;
|
|
|
|
|
2014-03-21 13:42:14 +00:00
|
|
|
private:
|
2017-12-15 18:23:05 +00:00
|
|
|
/// Delete old parts from disk and from ZooKeeper.
|
2017-10-06 11:30:57 +00:00
|
|
|
void clearOldPartsAndRemoveFromZK();
|
2017-05-24 20:19:29 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
friend class ReplicatedMergeTreeBlockOutputStream;
|
|
|
|
friend class ReplicatedMergeTreePartCheckThread;
|
|
|
|
friend class ReplicatedMergeTreeCleanupThread;
|
|
|
|
friend class ReplicatedMergeTreeAlterThread;
|
|
|
|
friend class ReplicatedMergeTreeRestartingThread;
|
|
|
|
friend struct ReplicatedMergeTreeLogEntry;
|
|
|
|
friend class ScopedPartitionMergeLock;
|
2018-05-20 19:56:03 +00:00
|
|
|
friend class ReplicatedMergeTreeQueue;
|
2018-05-21 13:49:54 +00:00
|
|
|
friend class MergeTreeData;
|
2016-01-28 01:00:27 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
using LogEntry = ReplicatedMergeTreeLogEntry;
|
|
|
|
using LogEntryPtr = LogEntry::Ptr;
|
2014-08-05 13:49:44 +00:00
|
|
|
|
2017-04-16 15:00:33 +00:00
|
|
|
zkutil::ZooKeeperPtr current_zookeeper; /// Use only the methods below.
|
|
|
|
std::mutex current_zookeeper_mutex; /// To recreate the session in the background thread.
|
2014-12-12 20:50:32 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
zkutil::ZooKeeperPtr tryGetZooKeeper();
|
|
|
|
zkutil::ZooKeeperPtr getZooKeeper();
|
|
|
|
void setZooKeeper(zkutil::ZooKeeperPtr zookeeper);
|
2014-03-22 14:44:44 +00:00
|
|
|
|
2017-04-16 15:00:33 +00:00
|
|
|
/// If true, the table is offline and cannot be written to.
|
2017-10-31 19:19:36 +00:00
|
|
|
std::atomic_bool is_readonly {false};
|
2017-04-01 07:20:54 +00:00
|
|
|
|
|
|
|
String zookeeper_path;
|
|
|
|
String replica_name;
|
|
|
|
String replica_path;
|
|
|
|
|
|
|
|
/** /replicas/me/is_active.
|
|
|
|
*/
|
|
|
|
zkutil::EphemeralNodeHolderPtr replica_is_active_node;
|
|
|
|
|
2018-11-28 16:05:29 +00:00
|
|
|
/** Version of the /columns node in ZooKeeper corresponding to the current data.columns.
|
2017-04-16 15:00:33 +00:00
|
|
|
* Read and modify along with the data.columns - under TableStructureLock.
|
2017-04-01 07:20:54 +00:00
|
|
|
*/
|
|
|
|
int columns_version = -1;
|
|
|
|
|
2018-11-28 16:05:29 +00:00
|
|
|
/// Version of the /metadata node in ZooKeeper.
|
2018-11-01 13:30:38 +00:00
|
|
|
int metadata_version = -1;
|
|
|
|
|
2018-11-28 16:05:29 +00:00
|
|
|
/// Used to delay setting table structure till startup() in case of an offline ALTER.
|
|
|
|
std::function<void()> set_table_structure_at_startup;
|
|
|
|
|
2017-07-11 13:48:26 +00:00
|
|
|
/** Is this replica "leading". The leader replica selects the parts to merge.
|
2017-04-01 07:20:54 +00:00
|
|
|
*/
|
2018-04-06 16:06:07 +00:00
|
|
|
std::atomic<bool> is_leader {false};
|
|
|
|
zkutil::LeaderElectionPtr leader_election;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-10-06 16:53:55 +00:00
|
|
|
InterserverIOEndpointHolderPtr data_parts_exchange_endpoint_holder;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
|
|
|
MergeTreeDataSelectExecutor reader;
|
|
|
|
MergeTreeDataWriter writer;
|
2018-04-20 16:18:16 +00:00
|
|
|
MergeTreeDataMergerMutator merger_mutator;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-08-25 20:41:45 +00:00
|
|
|
/** The queue of what needs to be done on this replica to catch up with everyone. It is taken from ZooKeeper (/replicas/me/queue/).
|
|
|
|
* In ZK the entries are in chronological order. Here that is not necessarily so.
|
|
|
|
*/
|
|
|
|
ReplicatedMergeTreeQueue queue;
|
|
|
|
std::atomic<time_t> last_queue_update_start_time{0};
|
|
|
|
std::atomic<time_t> last_queue_update_finish_time{0};
|
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
DataPartsExchange::Fetcher fetcher;
|
|
|
|
|
|
|
|
|
2018-03-21 23:30:20 +00:00
|
|
|
/// When activated, replica is initialized and startup() method could exit
|
|
|
|
Poco::Event startup_event;
|
|
|
|
|
2017-04-16 15:00:33 +00:00
|
|
|
/// Do I need to complete background threads (except restarting_thread)?
|
2018-07-30 18:30:33 +00:00
|
|
|
std::atomic<bool> partial_shutdown_called {false};
|
2018-07-30 20:21:45 +00:00
|
|
|
|
2018-07-30 18:30:33 +00:00
|
|
|
/// Event that is signalled (and is reset) by the restarting_thread when the ZooKeeper session expires.
|
2018-07-30 20:21:45 +00:00
|
|
|
Poco::Event partial_shutdown_event {false}; /// Poco::Event::EVENT_MANUALRESET
|
2017-04-01 07:20:54 +00:00
|
|
|
|
|
|
|
/// Limiting parallel fetches per one table
|
|
|
|
std::atomic_uint current_table_fetches {0};
|
|
|
|
|
2017-11-15 16:32:47 +00:00
|
|
|
/// Threads.
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-12-29 22:32:04 +00:00
|
|
|
/// A task that keeps track of the updates in the logs of all replicas and loads them into the queue.
|
|
|
|
bool queue_update_in_progress = false;
|
2018-05-31 13:05:05 +00:00
|
|
|
BackgroundSchedulePool::TaskHolder queue_updating_task;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2018-05-31 13:05:05 +00:00
|
|
|
BackgroundSchedulePool::TaskHolder mutations_updating_task;
|
2018-04-19 14:20:18 +00:00
|
|
|
|
2017-04-16 15:00:33 +00:00
|
|
|
/// A task that performs actions from the queue.
|
2017-04-01 07:20:54 +00:00
|
|
|
BackgroundProcessingPool::TaskHandle queue_task_handle;
|
|
|
|
|
2017-12-29 22:32:04 +00:00
|
|
|
/// A task that selects parts to merge.
|
2018-05-31 13:05:05 +00:00
|
|
|
BackgroundSchedulePool::TaskHolder merge_selecting_task;
|
2018-07-30 17:34:55 +00:00
|
|
|
/// It is acquired for each iteration of the selection of parts to merge or each OPTIMIZE query.
|
|
|
|
std::mutex merge_selecting_mutex;
|
2018-04-02 12:45:55 +00:00
|
|
|
|
2018-06-21 13:27:36 +00:00
|
|
|
/// A task that marks finished mutations as done.
|
|
|
|
BackgroundSchedulePool::TaskHolder mutations_finalizing_task;
|
|
|
|
|
2017-04-16 15:00:33 +00:00
|
|
|
/// A thread that removes old parts, log entries, and blocks.
|
2018-07-30 17:34:55 +00:00
|
|
|
ReplicatedMergeTreeCleanupThread cleanup_thread;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-04-16 15:00:33 +00:00
|
|
|
/// A thread monitoring changes to the column list in ZooKeeper and updating the parts in accordance with these changes.
|
2018-07-30 17:34:55 +00:00
|
|
|
ReplicatedMergeTreeAlterThread alter_thread;
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-04-16 15:00:33 +00:00
|
|
|
/// A thread that checks the data of the parts, as well as the queue of the parts to be checked.
|
2017-04-01 07:20:54 +00:00
|
|
|
ReplicatedMergeTreePartCheckThread part_check_thread;
|
|
|
|
|
2018-07-30 17:34:55 +00:00
|
|
|
/// A thread that processes reconnection to ZooKeeper when the session expires.
|
2018-08-21 14:03:06 +00:00
|
|
|
ReplicatedMergeTreeRestartingThread restarting_thread;
|
2018-07-30 17:34:55 +00:00
|
|
|
|
2017-04-16 15:00:33 +00:00
|
|
|
/// An event that awakens `alter` method from waiting for the completion of the ALTER query.
|
2017-04-01 07:20:54 +00:00
|
|
|
zkutil::EventPtr alter_query_event = std::make_shared<Poco::Event>();
|
|
|
|
|
2019-08-12 13:30:29 +00:00
|
|
|
/// True if replica was created for existing table with fixed granularity
|
|
|
|
bool other_replicas_fixed_granularity = false;
|
|
|
|
|
2017-04-16 15:00:33 +00:00
|
|
|
/** Creates the minimum set of nodes in ZooKeeper.
|
2017-04-01 07:20:54 +00:00
|
|
|
*/
|
|
|
|
void createTableIfNotExists();
|
|
|
|
|
2017-04-16 15:00:33 +00:00
|
|
|
/** Creates a replica in ZooKeeper and adds to the queue all that it takes to catch up with the rest of the replicas.
|
2017-04-01 07:20:54 +00:00
|
|
|
*/
|
|
|
|
void createReplica();
|
|
|
|
|
2017-04-16 15:00:33 +00:00
|
|
|
/** Create nodes in the ZK, which must always be, but which might not exist when older versions of the server are running.
|
2017-04-01 07:20:54 +00:00
|
|
|
*/
|
|
|
|
void createNewZooKeeperNodes();
|
|
|
|
|
2017-04-16 15:00:33 +00:00
|
|
|
/** Verify that the list of columns and table settings match those specified in ZK (/metadata).
|
|
|
|
* If not, throw an exception.
|
2018-11-28 16:05:29 +00:00
|
|
|
* Must be called before startup().
|
2017-04-01 07:20:54 +00:00
|
|
|
*/
|
|
|
|
void checkTableStructure(bool skip_sanity_checks, bool allow_alter);
|
|
|
|
|
2018-11-02 11:53:05 +00:00
|
|
|
/// A part of ALTER: apply metadata changes only (data parts are altered separately).
|
|
|
|
/// Must be called under IStorage::lockStructureForAlter() lock.
|
2018-11-02 15:39:19 +00:00
|
|
|
void setTableStructure(ColumnsDescription new_columns, const ReplicatedMergeTreeTableMetadata::Diff & metadata_diff);
|
2018-11-02 11:53:05 +00:00
|
|
|
|
2017-04-16 15:00:33 +00:00
|
|
|
/** Check that the set of parts corresponds to that in ZK (/replicas/me/parts/).
|
|
|
|
* If any parts described in ZK are not present locally, throw an exception.
|
|
|
|
* If any local parts are not mentioned in ZK, remove them.
|
|
|
|
* But if there are too many, throw an exception just in case - it's probably a configuration error.
|
2017-04-01 07:20:54 +00:00
|
|
|
*/
|
|
|
|
void checkParts(bool skip_sanity_checks);
|
|
|
|
|
2017-04-16 15:00:33 +00:00
|
|
|
/** Check that the part's checksum is the same as the checksum of the same part on some other replica.
|
|
|
|
* If no one has such a part, nothing is checked.
|
|
|
|
* Not very reliable: if two replicas add a part almost at the same time, no checks will occur.
|
|
|
|
* Adds actions to `ops` that add data about the part into ZooKeeper.
|
|
|
|
* Call under TableStructureLock.
|
2017-04-01 07:20:54 +00:00
|
|
|
*/
|
2019-05-03 02:00:57 +00:00
|
|
|
void checkPartChecksumsAndAddCommitOps(const zkutil::ZooKeeperPtr & zookeeper, const DataPartPtr & part,
|
2018-08-25 01:58:14 +00:00
|
|
|
Coordination::Requests & ops, String part_name = "", NameSet * absent_replicas_paths = nullptr);
|
2018-03-21 23:30:20 +00:00
|
|
|
|
2018-05-21 13:49:54 +00:00
|
|
|
String getChecksumsForZooKeeper(const MergeTreeDataPartChecksums & checksums) const;
|
2018-03-21 23:30:20 +00:00
|
|
|
|
|
|
|
/// Accepts a PreCommitted part, atomically checks its checksums against the ones on other replicas and commits the part.
|
2019-05-03 02:00:57 +00:00
|
|
|
DataPartsVector checkPartChecksumsAndCommit(Transaction & transaction,
|
|
|
|
const DataPartPtr & part);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2018-05-21 13:49:54 +00:00
|
|
|
void getCommitPartOps(
|
2018-08-25 01:58:14 +00:00
|
|
|
Coordination::Requests & ops,
|
2019-05-03 02:00:57 +00:00
|
|
|
MutableDataPartPtr & part,
|
2018-05-21 13:49:54 +00:00
|
|
|
const String & block_id_path = "") const;
|
|
|
|
|
2018-12-11 13:30:20 +00:00
|
|
|
/// Updates info about part columns and checksums in ZooKeeper and commits transaction if successful.
|
|
|
|
void updatePartHeaderInZooKeeperAndCommit(
|
|
|
|
const zkutil::ZooKeeperPtr & zookeeper,
|
2019-05-03 02:00:57 +00:00
|
|
|
AlterDataPartTransaction & transaction);
|
2018-12-11 13:30:20 +00:00
|
|
|
|
2017-04-16 15:00:33 +00:00
|
|
|
/// Adds actions to `ops` that remove a part from ZooKeeper.
|
2018-12-11 13:30:20 +00:00
|
|
|
/// Set has_children to true for "old-style" parts (those with /columns and /checksums child znodes).
|
|
|
|
void removePartFromZooKeeper(const String & part_name, Coordination::Requests & ops, bool has_children);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-08-09 21:09:44 +00:00
|
|
|
/// Quickly removes big set of parts from ZooKeeper (using async multi queries)
|
2017-10-03 14:44:10 +00:00
|
|
|
void removePartsFromZooKeeper(zkutil::ZooKeeperPtr & zookeeper, const Strings & part_names,
|
2018-04-06 19:48:54 +00:00
|
|
|
NameSet * parts_should_be_retried = nullptr);
|
2017-06-01 12:53:54 +00:00
|
|
|
|
2018-05-21 13:49:54 +00:00
|
|
|
bool tryRemovePartsFromZooKeeperWithRetries(const Strings & part_names, size_t max_retries = 5);
|
2019-05-03 02:00:57 +00:00
|
|
|
bool tryRemovePartsFromZooKeeperWithRetries(DataPartsVector & parts, size_t max_retries = 5);
|
2018-05-21 13:49:54 +00:00
|
|
|
|
2017-04-16 15:00:33 +00:00
|
|
|
/// Removes a part from ZooKeeper and adds a task to the queue to download it. It is supposed to do this with broken parts.
|
2017-04-01 07:20:54 +00:00
|
|
|
void removePartAndEnqueueFetch(const String & part_name);
|
|
|
|
|
2017-04-16 15:00:33 +00:00
|
|
|
/// Running jobs from the queue.
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-04-16 15:00:33 +00:00
|
|
|
/** Execute the action from the queue. Throws an exception if something is wrong.
|
|
|
|
* Returns whether or not it succeeds. If it did not work, write it to the end of the queue.
|
2017-04-01 07:20:54 +00:00
|
|
|
*/
|
2018-05-23 14:33:55 +00:00
|
|
|
bool executeLogEntry(LogEntry & entry);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2018-05-25 19:44:14 +00:00
|
|
|
void writePartLog(
|
|
|
|
PartLogElement::Type type, const ExecutionStatus & execution_status, UInt64 elapsed_ns,
|
|
|
|
const String & new_part_name,
|
2019-05-03 02:00:57 +00:00
|
|
|
const DataPartPtr & result_part,
|
|
|
|
const DataPartsVector & source_parts,
|
2019-01-04 12:10:00 +00:00
|
|
|
const MergeListEntry * merge_entry);
|
2018-05-25 19:44:14 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
void executeDropRange(const LogEntry & entry);
|
|
|
|
|
2018-01-23 22:56:46 +00:00
|
|
|
/// Do the merge or recommend to make the fetch instead of the merge
|
2018-04-20 19:11:20 +00:00
|
|
|
bool tryExecuteMerge(const LogEntry & entry);
|
|
|
|
|
|
|
|
bool tryExecutePartMutation(const LogEntry & entry);
|
2018-01-23 22:56:46 +00:00
|
|
|
|
2018-05-23 14:33:55 +00:00
|
|
|
bool executeFetch(LogEntry & entry);
|
2018-01-23 22:56:46 +00:00
|
|
|
|
2019-05-09 14:25:18 +00:00
|
|
|
void executeClearColumnOrIndexInPartition(const LogEntry & entry);
|
2017-06-16 16:47:09 +00:00
|
|
|
|
2018-05-21 13:49:54 +00:00
|
|
|
bool executeReplaceRange(const LogEntry & entry);
|
|
|
|
|
2017-04-16 15:00:33 +00:00
|
|
|
/** Updates the queue.
|
2017-04-01 07:20:54 +00:00
|
|
|
*/
|
2018-05-31 13:05:05 +00:00
|
|
|
void queueUpdatingTask();
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2018-05-31 13:05:05 +00:00
|
|
|
void mutationsUpdatingTask();
|
2018-04-19 14:20:18 +00:00
|
|
|
|
2018-08-20 17:15:04 +00:00
|
|
|
/** Clone data from another replica.
|
2018-08-27 23:59:49 +00:00
|
|
|
* If replica can not be cloned throw Exception.
|
|
|
|
*/
|
2018-08-27 13:51:22 +00:00
|
|
|
void cloneReplica(const String & source_replica, Coordination::Stat source_is_lost_stat, zkutil::ZooKeeperPtr & zookeeper);
|
2018-08-07 15:21:42 +00:00
|
|
|
|
|
|
|
/// Clone replica if it is lost.
|
2018-08-22 14:01:54 +00:00
|
|
|
void cloneReplicaIfNeeded(zkutil::ZooKeeperPtr zookeeper);
|
2018-08-07 15:21:42 +00:00
|
|
|
|
2017-04-16 15:00:33 +00:00
|
|
|
/** Performs actions from the queue.
|
2017-04-01 07:20:54 +00:00
|
|
|
*/
|
2018-12-26 17:03:29 +00:00
|
|
|
BackgroundProcessingPoolTaskResult queueTask();
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2018-04-06 16:06:07 +00:00
|
|
|
/// Postcondition:
|
|
|
|
/// either leader_election is fully initialized (node in ZK is created and the watching thread is launched)
|
|
|
|
/// or an exception is thrown and leader_election is destroyed.
|
|
|
|
void enterLeaderElection();
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2018-04-06 16:06:07 +00:00
|
|
|
/// Postcondition:
|
|
|
|
/// is_leader is false, merge_selecting_task is stopped, leader_election is nullptr.
|
|
|
|
/// leader_election node in ZK is either deleted, or the session is marked expired.
|
|
|
|
void exitLeaderElection();
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-04-16 15:00:33 +00:00
|
|
|
/** Selects the parts to merge and writes to the log.
|
2017-04-01 07:20:54 +00:00
|
|
|
*/
|
2018-05-31 13:05:05 +00:00
|
|
|
void mergeSelectingTask();
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2018-06-21 13:27:36 +00:00
|
|
|
/// Checks if some mutations are done and marks them as done.
|
|
|
|
void mutationsFinalizingTask();
|
|
|
|
|
2017-04-16 15:00:33 +00:00
|
|
|
/** Write the selected parts to merge into the log,
|
|
|
|
* Call when merge_selecting_mutex is locked.
|
|
|
|
* Returns false if any part is not in ZK.
|
2017-04-01 07:20:54 +00:00
|
|
|
*/
|
|
|
|
bool createLogEntryToMergeParts(
|
2018-05-10 15:01:10 +00:00
|
|
|
zkutil::ZooKeeperPtr & zookeeper,
|
2019-05-03 02:00:57 +00:00
|
|
|
const DataPartsVector & parts,
|
2017-04-01 07:20:54 +00:00
|
|
|
const String & merged_name,
|
2017-04-17 15:14:56 +00:00
|
|
|
bool deduplicate,
|
2019-08-01 11:10:42 +00:00
|
|
|
bool force_ttl,
|
2017-04-01 07:20:54 +00:00
|
|
|
ReplicatedMergeTreeLogEntryData * out_log_entry = nullptr);
|
|
|
|
|
2018-04-20 16:18:16 +00:00
|
|
|
bool createLogEntryToMutatePart(const MergeTreeDataPart & part, Int64 mutation_version);
|
|
|
|
|
2017-04-16 15:00:33 +00:00
|
|
|
/// Exchange parts.
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-04-16 15:00:33 +00:00
|
|
|
/** Returns an empty string if no one has a part.
|
2017-04-01 07:20:54 +00:00
|
|
|
*/
|
|
|
|
String findReplicaHavingPart(const String & part_name, bool active);
|
|
|
|
|
|
|
|
/** Find replica having specified part or any part that covers it.
|
|
|
|
* If active = true, consider only active replicas.
|
2017-05-12 13:47:42 +00:00
|
|
|
* If found, returns replica name and set 'entry->actual_new_part_name' to name of found largest covering part.
|
2017-04-01 07:20:54 +00:00
|
|
|
* If not found, returns empty string.
|
|
|
|
*/
|
2018-05-23 14:33:55 +00:00
|
|
|
String findReplicaHavingCoveringPart(LogEntry & entry, bool active);
|
2018-05-21 13:49:54 +00:00
|
|
|
String findReplicaHavingCoveringPart(const String & part_name, bool active, String & found_part_name);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-04-16 15:00:33 +00:00
|
|
|
/** Download the specified part from the specified replica.
|
|
|
|
* If `to_detached`, the part is placed in the `detached` directory.
|
|
|
|
* If quorum != 0, then the node for tracking the quorum is updated.
|
2017-04-01 07:20:54 +00:00
|
|
|
* Returns false if part is already fetching right now.
|
|
|
|
*/
|
|
|
|
bool fetchPart(const String & part_name, const String & replica_path, bool to_detached, size_t quorum);
|
|
|
|
|
2017-05-12 13:47:42 +00:00
|
|
|
/// Required only to avoid races between executeLogEntry and fetchPartition
|
2017-04-01 07:20:54 +00:00
|
|
|
std::unordered_set<String> currently_fetching_parts;
|
|
|
|
std::mutex currently_fetching_parts_mutex;
|
|
|
|
|
2017-05-12 13:47:42 +00:00
|
|
|
/// With the quorum being tracked, add a replica to the quorum for the part.
|
2017-04-01 07:20:54 +00:00
|
|
|
void updateQuorum(const String & part_name);
|
|
|
|
|
2018-05-21 13:49:54 +00:00
|
|
|
/// Creates new block number if block with such block_id does not exist
|
2018-07-04 16:31:21 +00:00
|
|
|
std::optional<EphemeralLockInZooKeeper> allocateBlockNumber(
|
|
|
|
const String & partition_id, zkutil::ZooKeeperPtr & zookeeper,
|
|
|
|
const String & zookeeper_block_id_path = "");
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-04-16 15:00:33 +00:00
|
|
|
/** Wait until all replicas, including this, execute the specified action from the log.
|
|
|
|
* If replicas are added at the same time, it may not wait for the added replica.
|
2019-08-20 01:46:48 +00:00
|
|
|
*
|
|
|
|
* NOTE: This method must be called without table lock held.
|
|
|
|
* Because it effectively waits for other thread that usually has to also acquire a lock to proceed and this yields deadlock.
|
|
|
|
* TODO: There are wrong usages of this method that are not fixed yet.
|
2017-04-01 07:20:54 +00:00
|
|
|
*/
|
|
|
|
void waitForAllReplicasToProcessLogEntry(const ReplicatedMergeTreeLogEntryData & entry);
|
|
|
|
|
2017-04-16 15:00:33 +00:00
|
|
|
/** Wait until the specified replica executes the specified action from the log.
|
2019-08-20 01:46:48 +00:00
|
|
|
* NOTE: See comment about locks above.
|
2017-04-01 07:20:54 +00:00
|
|
|
*/
|
|
|
|
void waitForReplicaToProcessLogEntry(const String & replica_name, const ReplicatedMergeTreeLogEntryData & entry);
|
|
|
|
|
2017-06-16 16:47:09 +00:00
|
|
|
/// Choose leader replica, send request to it and wait.
|
2018-12-28 13:39:44 +00:00
|
|
|
void sendRequestToLeaderReplica(const ASTPtr & query, const Context & query_context);
|
2017-06-16 16:47:09 +00:00
|
|
|
|
2017-04-16 15:00:33 +00:00
|
|
|
/// Throw an exception if the table is readonly.
|
2017-04-01 07:20:54 +00:00
|
|
|
void assertNotReadonly() const;
|
2016-01-21 16:30:05 +00:00
|
|
|
|
2018-05-21 13:49:54 +00:00
|
|
|
/// Produce an imaginary part info covering all parts in the specified partition (at the call moment).
|
|
|
|
/// Returns false if the partition doesn't exist yet.
|
|
|
|
bool getFakePartCoveringAllPartsInPartition(const String & partition_id, MergeTreePartInfo & part_info);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2017-04-16 15:00:33 +00:00
|
|
|
/// Check for a node in ZK. If it is, remember this information, and then immediately answer true.
|
2017-04-01 07:20:54 +00:00
|
|
|
std::unordered_set<std::string> existing_nodes_cache;
|
|
|
|
std::mutex existing_nodes_cache_mutex;
|
|
|
|
bool existsNodeCached(const std::string & path);
|
|
|
|
|
2017-11-15 16:32:47 +00:00
|
|
|
/// Remove block IDs from `blocks/` in ZooKeeper for the given partition ID in the given block number range.
|
|
|
|
void clearBlocksInPartition(
|
|
|
|
zkutil::ZooKeeper & zookeeper, const String & partition_id, Int64 min_block_num, Int64 max_block_num);
|
2017-04-01 07:20:54 +00:00
|
|
|
|
2018-04-17 17:59:42 +00:00
|
|
|
/// Info about how other replicas can access this one.
|
|
|
|
ReplicatedMergeTreeAddress getReplicatedMergeTreeAddress() const;
|
2018-06-09 15:48:22 +00:00
|
|
|
|
|
|
|
bool dropPartsInPartition(zkutil::ZooKeeper & zookeeper, String & partition_id,
|
|
|
|
StorageReplicatedMergeTree::LogEntry & entry, bool detach);
|
2018-04-17 17:59:42 +00:00
|
|
|
|
2018-12-28 16:28:24 +00:00
|
|
|
/// Find cluster address for host
|
|
|
|
std::optional<Cluster::Address> findClusterAddress(const ReplicatedMergeTreeAddress & leader_address) const;
|
2018-11-22 22:08:00 +00:00
|
|
|
|
2018-11-13 13:48:53 +00:00
|
|
|
// Partition helpers
|
2019-05-09 14:25:18 +00:00
|
|
|
void clearColumnOrIndexInPartition(const ASTPtr & partition, LogEntry && entry, const Context & query_context);
|
2018-12-28 13:39:44 +00:00
|
|
|
void dropPartition(const ASTPtr & query, const ASTPtr & partition, bool detach, const Context & query_context);
|
|
|
|
void attachPartition(const ASTPtr & partition, bool part, const Context & query_context);
|
|
|
|
void replacePartitionFrom(const StoragePtr & source_table, const ASTPtr & partition, bool replace, const Context & query_context);
|
|
|
|
void fetchPartition(const ASTPtr & partition, const String & from, const Context & query_context);
|
2018-11-13 13:48:53 +00:00
|
|
|
|
2019-08-12 13:30:29 +00:00
|
|
|
/// Check granularity of already existing replicated table in zookeeper if it exists
|
|
|
|
/// return true if it's fixed
|
|
|
|
bool checkFixedGranualrityInZookeeper();
|
|
|
|
|
2017-11-04 03:20:18 +00:00
|
|
|
protected:
|
2017-12-25 14:56:32 +00:00
|
|
|
/** If not 'attach', either creates a new table in ZK, or adds a replica to an existing table.
|
|
|
|
*/
|
2017-11-04 03:20:18 +00:00
|
|
|
StorageReplicatedMergeTree(
|
|
|
|
const String & zookeeper_path_,
|
|
|
|
const String & replica_name_,
|
|
|
|
bool attach,
|
|
|
|
const String & path_, const String & database_name_, const String & name_,
|
2018-03-06 20:18:34 +00:00
|
|
|
const ColumnsDescription & columns_,
|
2019-02-05 14:50:25 +00:00
|
|
|
const IndicesDescription & indices_,
|
2019-05-18 08:05:52 +00:00
|
|
|
const ConstraintsDescription & constraints_,
|
2017-11-04 03:20:18 +00:00
|
|
|
Context & context_,
|
|
|
|
const String & date_column_name,
|
2018-11-06 18:25:36 +00:00
|
|
|
const ASTPtr & partition_by_ast_,
|
|
|
|
const ASTPtr & order_by_ast_,
|
|
|
|
const ASTPtr & primary_key_ast_,
|
2018-11-09 19:01:39 +00:00
|
|
|
const ASTPtr & sample_by_ast_,
|
2019-04-15 09:30:45 +00:00
|
|
|
const ASTPtr & table_ttl_ast_,
|
2019-05-03 02:00:57 +00:00
|
|
|
const MergingParams & merging_params_,
|
2019-08-13 10:29:31 +00:00
|
|
|
MergeTreeSettingsPtr settings_,
|
2017-11-04 03:20:18 +00:00
|
|
|
bool has_force_restore_data_flag);
|
2016-01-21 16:30:05 +00:00
|
|
|
};
|
|
|
|
|
2016-04-07 21:35:01 +00:00
|
|
|
|
|
|
|
extern const int MAX_AGE_OF_LOCAL_PART_THAT_WASNT_ADDED_TO_ZOOKEEPER;
|
|
|
|
|
2014-03-21 13:42:14 +00:00
|
|
|
}
|