ClickHouse/src/Storages/IStorage.h

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

759 lines
33 KiB
C++
Raw Normal View History

2011-08-15 01:12:57 +00:00
#pragma once
2010-03-04 19:20:28 +00:00
#include <Core/Names.h>
#include <Core/QueryProcessingStage.h>
#include <Databases/IDatabase.h>
#include <Interpreters/CancellationCode.h>
#include <Interpreters/Context_fwd.h>
2020-03-13 10:30:55 +00:00
#include <Interpreters/StorageID.h>
2019-07-03 13:17:19 +00:00
#include <Storages/CheckResults.h>
#include <Storages/ColumnDependency.h>
#include <Storages/IStorage_fwd.h>
#include <Storages/SelectQueryDescription.h>
#include <Storages/StorageInMemoryMetadata.h>
#include <Storages/TableLockHolder.h>
#include <Storages/StorageSnapshot.h>
#include <Common/ActionLock.h>
#include <Common/Exception.h>
#include <Common/RWLock.h>
#include <Common/TypePromotion.h>
#include <optional>
#include <compare>
2010-03-01 16:59:51 +00:00
namespace DB
{
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
}
using StorageActionBlockType = size_t;
class ASTCreateQuery;
class ASTInsertQuery;
2010-03-01 16:59:51 +00:00
struct Settings;
class AlterCommands;
2018-06-13 20:02:27 +00:00
class MutationCommands;
2020-03-17 13:49:50 +00:00
struct PartitionCommand;
using PartitionCommands = std::vector<PartitionCommand>;
2019-09-13 12:59:48 +00:00
class IProcessor;
using ProcessorPtr = std::shared_ptr<IProcessor>;
using Processors = std::vector<ProcessorPtr>;
class Pipe;
2020-09-14 14:13:58 +00:00
class QueryPlan;
using QueryPlanPtr = std::unique_ptr<QueryPlan>;
2019-09-13 12:59:48 +00:00
2021-07-23 14:25:35 +00:00
class SinkToStorage;
using SinkToStoragePtr = std::shared_ptr<SinkToStorage>;
2022-05-20 19:49:31 +00:00
class QueryPipeline;
2021-01-23 15:20:15 +00:00
class IStoragePolicy;
using StoragePolicyPtr = std::shared_ptr<const IStoragePolicy>;
2020-09-15 17:23:49 +00:00
struct StreamLocalLimits;
class EnabledQuota;
struct SelectQueryInfo;
2020-09-15 17:23:49 +00:00
using NameDependencies = std::unordered_map<String, std::vector<String>>;
2021-08-18 22:19:14 +00:00
using DatabaseAndTableName = std::pair<String, String>;
class BackupEntriesCollector;
class RestorerFromBackup;
2023-09-26 17:16:01 +00:00
class ConditionEstimator;
2019-07-17 01:24:37 +00:00
struct ColumnSize
{
size_t marks = 0;
size_t data_compressed = 0;
size_t data_uncompressed = 0;
void add(const ColumnSize & other)
{
marks += other.marks;
data_compressed += other.data_compressed;
data_uncompressed += other.data_uncompressed;
}
};
using IndexSize = ColumnSize;
/** Storage. Describes the table. Responsible for
2017-04-16 15:00:33 +00:00
* - storage of the table data;
* - the definition in which files (or not in files) the data is stored;
* - data lookups and appends;
2017-04-16 15:00:33 +00:00
* - data storage structure (compression, etc.)
* - concurrent access to data (locks, etc.)
2010-03-01 16:59:51 +00:00
*/
2023-09-10 03:48:12 +00:00
class IStorage : public std::enable_shared_from_this<IStorage>, public TypePromotion<IStorage>, public IHints<>
2010-03-01 16:59:51 +00:00
{
public:
2019-12-03 16:25:32 +00:00
IStorage() = delete;
2020-06-19 15:39:41 +00:00
/// Storage metadata can be set separately in setInMemoryMetadata method
2020-06-19 17:17:13 +00:00
explicit IStorage(StorageID storage_id_)
: storage_id(std::move(storage_id_))
2023-02-19 22:15:09 +00:00
, metadata(std::make_unique<StorageInMemoryMetadata>()) {}
IStorage(const IStorage &) = delete;
IStorage & operator=(const IStorage &) = delete;
2017-04-16 15:00:33 +00:00
/// The main name of the table type (for example, StorageMergeTree).
2010-03-04 19:20:28 +00:00
virtual std::string getName() const = 0;
/// The name of the table.
2019-12-27 19:30:22 +00:00
StorageID getStorageID() const;
virtual bool isMergeTree() const { return false; }
/// Returns true if the storage receives data from a remote server or servers.
virtual bool isRemote() const { return false; }
2020-01-24 16:20:36 +00:00
/// Returns true if the storage is a view of a table or another view.
virtual bool isView() const { return false; }
2021-04-21 13:45:13 +00:00
/// Returns true if the storage is dictionary
virtual bool isDictionary() const { return false; }
/// Returns true if the storage supports queries with the SAMPLE section.
virtual bool supportsSampling() const { return getInMemoryMetadataPtr()->hasSamplingKey(); }
/// Returns true if the storage supports queries with the FINAL section.
2013-04-23 11:08:41 +00:00
virtual bool supportsFinal() const { return false; }
2021-07-14 08:49:05 +00:00
/// Returns true if the storage supports insert queries with the PARTITION BY section.
virtual bool supportsPartitionBy() const { return false; }
/// Returns true if the storage supports queries with the TTL section.
virtual bool supportsTTL() const { return false; }
/// Returns true if the storage supports queries with the PREWHERE section.
virtual bool supportsPrewhere() const { return false; }
2023-10-30 23:53:46 +00:00
virtual ConditionEstimator getConditionEstimatorByPredicate(const SelectQueryInfo &, const StorageSnapshotPtr &, ContextPtr) const;
/// Returns which columns supports PREWHERE, or empty std::nullopt if all columns is supported.
/// This is needed for engines whose aggregates data from multiple tables, like Merge.
virtual std::optional<NameSet> supportedPrewhereColumns() const { return std::nullopt; }
/// Returns true if the storage supports optimization of moving conditions to PREWHERE section.
virtual bool canMoveConditionsToPrewhere() const { return supportsPrewhere(); }
/// Returns true if the storage replicates SELECT, INSERT and ALTER commands among replicas.
2017-04-25 15:21:03 +00:00
virtual bool supportsReplication() const { return false; }
2019-12-12 10:49:15 +00:00
/// Returns true if the storage supports parallel insert.
/// If false, each INSERT query will call write() only once.
/// Different INSERT queries may write in parallel regardless of this value.
2019-12-12 10:49:15 +00:00
virtual bool supportsParallelInsert() const { return false; }
/// Returns true if the storage supports deduplication of inserted data blocks.
virtual bool supportsDeduplication() const { return false; }
2017-04-25 15:21:03 +00:00
/// Returns true if the blocks shouldn't be pushed to associated views on insert.
virtual bool noPushingToViews() const { return false; }
2020-01-24 15:10:24 +00:00
/// Read query returns streams which automatically distribute data between themselves.
/// So, it's impossible for one stream run out of data when there is data in other streams.
/// Example is StorageSystemNumbers.
virtual bool hasEvenlyDistributedRead() const { return false; }
2020-12-22 16:40:53 +00:00
/// Returns true if the storage supports reading of subcolumns of complex types.
virtual bool supportsSubcolumns() const { return false; }
/// Returns true if the storage supports transactions for SELECT, INSERT and ALTER queries.
/// Storage may throw an exception later if some query kind is not fully supported.
2022-02-14 19:47:17 +00:00
/// This method can return true for readonly engines that return the same rows for reading (such as SystemNumbers)
virtual bool supportsTransactions() const { return false; }
2022-03-01 16:32:55 +00:00
/// Returns true if the storage supports storing of dynamic subcolumns.
/// For now it makes sense only for data type Object.
virtual bool supportsDynamicSubcolumns() const { return false; }
/// Requires squashing small blocks to large for optimal storage.
/// This is true for most storages that store data on disk.
virtual bool prefersLargeBlocks() const { return true; }
/// Returns true if the storage is for system, which cannot be target of SHOW CREATE TABLE.
virtual bool isSystemStorage() const { return false; }
2023-04-25 01:11:58 +00:00
/// Returns true if asynchronous inserts are enabled for table.
virtual bool areAsynchronousInsertsEnabled() const { return false; }
2020-02-18 19:03:40 +00:00
2019-07-17 01:24:37 +00:00
/// Optional size information of each physical column.
/// Currently it's only used by the MergeTree family for query optimizations.
using ColumnSizeByName = std::unordered_map<std::string, ColumnSize>;
virtual ColumnSizeByName getColumnSizes() const { return {}; }
2019-03-05 10:12:20 +00:00
/// Optional size information of each secondary index.
/// Valid only for MergeTree family.
using IndexSizeByName = std::unordered_map<std::string, IndexSize>;
virtual IndexSizeByName getSecondaryIndexSizes() const { return {}; }
2020-06-22 09:49:21 +00:00
/// Get mutable version (snapshot) of storage metadata. Metadata object is
2020-08-08 00:47:03 +00:00
/// multiversion, so it can be concurrently changed, but returned copy can be
2020-06-22 09:49:21 +00:00
/// used without any locks.
2020-06-18 11:02:31 +00:00
StorageInMemoryMetadata getInMemoryMetadata() const { return *metadata.get(); }
2020-06-22 09:49:21 +00:00
/// Get immutable version (snapshot) of storage metadata. Metadata object is
2020-08-08 00:47:03 +00:00
/// multiversion, so it can be concurrently changed, but returned copy can be
2020-06-22 09:49:21 +00:00
/// used without any locks.
2020-06-18 11:02:31 +00:00
StorageMetadataPtr getInMemoryMetadataPtr() const { return metadata.get(); }
2020-06-22 09:49:21 +00:00
/// Update storage metadata. Used in ALTER or initialization of Storage.
/// Metadata object is multiversion, so this method can be called without
/// any locks.
2020-06-18 11:02:31 +00:00
void setInMemoryMetadata(const StorageInMemoryMetadata & metadata_)
{
metadata.set(std::make_unique<StorageInMemoryMetadata>(metadata_));
}
2019-12-26 18:17:05 +00:00
2020-04-27 17:46:51 +00:00
/// Return list of virtual columns (like _part, _table, etc). In the vast
/// majority of cases virtual columns are static constant part of Storage
/// class and don't depend on Storage object. But sometimes we have fake
/// storages, like Merge, which works as proxy for other storages and it's
/// virtual columns must contain virtual columns from underlying table.
///
/// User can create columns with the same name as virtual column. After that
2020-08-08 00:47:03 +00:00
/// virtual column will be overridden and inaccessible.
2020-04-27 17:46:51 +00:00
///
/// By default return empty list of columns.
virtual NamesAndTypesList getVirtuals() const;
2020-05-20 15:16:39 +00:00
2020-12-07 09:30:47 +00:00
Names getAllRegisteredNames() const override;
2019-05-21 11:24:32 +00:00
NameDependencies getDependentViewsByColumn(ContextPtr context) const;
/// Returns whether the column is virtual - by default all columns are real.
/// Initially reserved virtual column name may be shadowed by real column.
bool isVirtualColumn(const String & column_name, const StorageMetadataPtr & metadata_snapshot) const;
2022-06-22 22:56:41 +00:00
/// Modify a CREATE TABLE query to make a variant which must be written to a backup.
virtual void adjustCreateQueryForBackup(ASTPtr & create_query) const;
2021-08-18 22:19:14 +00:00
/// Makes backup entries to backup the data of this storage.
virtual void backupData(BackupEntriesCollector & backup_entries_collector, const String & data_path_in_backup, const std::optional<ASTs> & partitions);
/// Extracts data from the backup and put it to the storage.
virtual void restoreDataFromBackup(RestorerFromBackup & restorer, const String & data_path_in_backup, const std::optional<ASTs> & partitions);
/// Returns true if the storage supports backup/restore for specific partitions.
virtual bool supportsBackupPartition() const { return false; }
2022-07-21 19:50:19 +00:00
/// Return true if there is at least one part containing lightweight deleted mask.
virtual bool hasLightweightDeletedMask() const { return false; }
/// Return true if storage can execute lightweight delete mutations.
virtual bool supportsLightweightDelete() const { return false; }
/// Return true if storage can execute 'DELETE FROM' mutations. This is different from lightweight delete
/// because those are internally translated into 'ALTER UDPATE' mutations.
virtual bool supportsDelete() const { return false; }
2023-08-23 13:19:15 +00:00
/// Return true if the trivial count query could be optimized without reading the data at all
/// in totalRows() or totalRowsByPartitionPredicate() methods or with optimized reading in read() method.
virtual bool supportsTrivialCountOptimization() const { return false; }
private:
SYSTEM RESTORE REPLICA replica [ON CLUSTER cluster] (#13652) * initial commit: add setting and stub * typo * added test stub * fix * wip merging new integration test and code proto * adding steps interpreters * adding firstly proposed solution (moving parts etc) * added checking zookeeper path existence * fixing the include * fixing and sorting includes * fixing outdated struct * fix the name * added ast ptr as level of indirection * fix ref * updating the changes * working on test stub * fix iterator -> reference * revert rocksdb submodule update * fixed show privileges test * updated the test stub * replaced rand() with thread_local_rng(), updated the tests updated the test fixed test config path test fix removed error messages fixed the test updated the test fixed string literal fixed literal typo: = * fixed the empty replica error message * updated the test and the code with logs * updated the possible test cases, updated * added the code/test milestone comments * updated the test (added more testcases) * replaced native assert with CH one * individual replicas recursive delete fix * updated the AS db.name AST * two small logging fixes * manually generated AST fixes * Updated the test, added the possible algo change * Some thoughts about optimizing the solution: ALTER MOVE PARTITION .. TO TABLE -> move to detached/ + ALTER ... ATTACH * fix * Removed the replica sync in test as it's invalid * Some test tweaks * tmp * Rewrote the algo by using the executeQuery instead of hand-crafting the ASTPtr. Two questions still active. * tr: logging active parts * Extracted the parts moving algo into a separate helper function * Fixed the test data and the queries slightly * Replaced query to system.parts to direct invocation, started building the test that breaks on various parts. * Added the case for tables when at least one replica is alive * Updated the test to test replicas restoration by detaching/attaching * Altered the test to check restoration without replica restart * Added the tables swap in the start if the server failed last time * Hotfix when only /replicas/replica... path was deleted * Restore ZK paths while creating a replicated MergeTree table * Updated the docs, fixed the algo for individual replicas restoration case * Initial parts table storage fix, tests sync fix * Reverted individual replica restoration to general algo * Slightly optimised getDataParts * Trying another solution with parts detaching * Rewrote algo without any steps, added ON CLUSTER support * Attaching parts from other replica on restoration * Getting part checksums from ZK * Removed ON CLUSTER, finished working solution * Multiple small changes after review * Fixing parallel test * Supporting rewritten form on cluster * Test fix * Moar logging * Using source replica as checksum provider * improve test, remove some code from parser * Trying solution with move to detached + forget * Moving all parts (not only Committed) to detached * Edited docs for RESTORE REPLICA * Re-merging * minor fixes Co-authored-by: Alexander Tokmakov <avtokmakov@yandex-team.ru>
2021-06-20 08:24:43 +00:00
2019-12-12 12:30:31 +00:00
StorageID storage_id;
SYSTEM RESTORE REPLICA replica [ON CLUSTER cluster] (#13652) * initial commit: add setting and stub * typo * added test stub * fix * wip merging new integration test and code proto * adding steps interpreters * adding firstly proposed solution (moving parts etc) * added checking zookeeper path existence * fixing the include * fixing and sorting includes * fixing outdated struct * fix the name * added ast ptr as level of indirection * fix ref * updating the changes * working on test stub * fix iterator -> reference * revert rocksdb submodule update * fixed show privileges test * updated the test stub * replaced rand() with thread_local_rng(), updated the tests updated the test fixed test config path test fix removed error messages fixed the test updated the test fixed string literal fixed literal typo: = * fixed the empty replica error message * updated the test and the code with logs * updated the possible test cases, updated * added the code/test milestone comments * updated the test (added more testcases) * replaced native assert with CH one * individual replicas recursive delete fix * updated the AS db.name AST * two small logging fixes * manually generated AST fixes * Updated the test, added the possible algo change * Some thoughts about optimizing the solution: ALTER MOVE PARTITION .. TO TABLE -> move to detached/ + ALTER ... ATTACH * fix * Removed the replica sync in test as it's invalid * Some test tweaks * tmp * Rewrote the algo by using the executeQuery instead of hand-crafting the ASTPtr. Two questions still active. * tr: logging active parts * Extracted the parts moving algo into a separate helper function * Fixed the test data and the queries slightly * Replaced query to system.parts to direct invocation, started building the test that breaks on various parts. * Added the case for tables when at least one replica is alive * Updated the test to test replicas restoration by detaching/attaching * Altered the test to check restoration without replica restart * Added the tables swap in the start if the server failed last time * Hotfix when only /replicas/replica... path was deleted * Restore ZK paths while creating a replicated MergeTree table * Updated the docs, fixed the algo for individual replicas restoration case * Initial parts table storage fix, tests sync fix * Reverted individual replica restoration to general algo * Slightly optimised getDataParts * Trying another solution with parts detaching * Rewrote algo without any steps, added ON CLUSTER support * Attaching parts from other replica on restoration * Getting part checksums from ZK * Removed ON CLUSTER, finished working solution * Multiple small changes after review * Fixing parallel test * Supporting rewritten form on cluster * Test fix * Moar logging * Using source replica as checksum provider * improve test, remove some code from parser * Trying solution with move to detached + forget * Moving all parts (not only Committed) to detached * Edited docs for RESTORE REPLICA * Re-merging * minor fixes Co-authored-by: Alexander Tokmakov <avtokmakov@yandex-team.ru>
2021-06-20 08:24:43 +00:00
2019-12-03 16:25:32 +00:00
mutable std::mutex id_mutex;
2020-05-20 12:16:55 +00:00
2020-06-22 09:49:21 +00:00
/// Multiversion storage metadata. Allows to read/write storage metadata
/// without locks.
2020-06-18 11:02:31 +00:00
MultiVersionStorageMetadataPtr metadata;
2020-08-03 11:33:11 +00:00
protected:
2020-04-27 15:21:37 +00:00
RWLockImpl::LockHolder tryLockTimed(
const RWLock & rwlock, RWLockImpl::Type type, const String & query_id, const std::chrono::milliseconds & acquire_timeout) const;
public:
2022-09-29 06:44:10 +00:00
/// Lock table for share. This lock must be acquired if you want to be sure,
2020-06-22 09:49:21 +00:00
/// that table will be not dropped while you holding this lock. It's used in
/// variety of cases starting from SELECT queries to background merges in
/// MergeTree.
TableLockHolder lockForShare(const String & query_id, const std::chrono::milliseconds & acquire_timeout);
2022-09-29 06:44:10 +00:00
/// Similar to lockForShare, but returns a nullptr if the table is dropped while
/// acquiring the lock instead of raising a TABLE_IS_DROPPED exception
TableLockHolder tryLockForShare(const String & query_id, const std::chrono::milliseconds & acquire_timeout);
/// Lock table for alter. This lock must be acquired in ALTER queries to be
2020-06-22 09:49:21 +00:00
/// sure, that we execute only one simultaneous alter. Doesn't affect share lock.
2021-10-25 17:49:49 +00:00
using AlterLockHolder = std::unique_lock<std::timed_mutex>;
AlterLockHolder lockForAlter(const std::chrono::milliseconds & acquire_timeout);
2023-07-17 17:02:29 +00:00
std::optional<AlterLockHolder> tryLockForAlter(const std::chrono::milliseconds & acquire_timeout);
2019-03-05 10:12:20 +00:00
2020-08-08 00:47:03 +00:00
/// Lock table exclusively. This lock must be acquired if you want to be
2020-06-22 09:49:21 +00:00
/// sure, that no other thread (SELECT, merge, ALTER, etc.) doing something
/// with table. For example it allows to wait all threads before DROP or
/// truncate query.
///
/// NOTE: You have to be 100% sure that you need this lock. It's extremely
/// heavyweight and makes table irresponsive.
TableExclusiveLockHolder lockExclusively(const String & query_id, const std::chrono::milliseconds & acquire_timeout);
/** Returns stage to which query is going to be processed in read() function.
* (Normally, the function only reads the columns from the list, but in other cases,
* for example, the request can be partially processed on a remote server, or an aggregate projection.)
*
* SelectQueryInfo is required since the stage can depends on the query
* (see Distributed() engine and optimize_skip_unused_shards,
* see also MergeTree engine and projection optimization).
* And to store optimized cluster (after optimize_skip_unused_shards).
* It will also store needed stuff for projection query pipeline.
*
* QueryProcessingStage::Enum required for Distributed over Distributed,
* since it cannot return Complete for intermediate queries never.
*/
virtual QueryProcessingStage::Enum getQueryProcessingStage(ContextPtr, QueryProcessingStage::Enum, const StorageSnapshotPtr &, SelectQueryInfo &) const
{
return QueryProcessingStage::FetchColumns;
}
/** Watch live changes to the table.
* Accepts a list of columns to read, as well as a description of the query,
* from which information can be extracted about how to retrieve data
* (indexes, locks, etc.)
* Returns a stream with which you can read data sequentially
* or multiple streams for parallel data reading.
* The `processed_stage` info is also written to what stage the request was processed.
* (Normally, the function only reads the columns from the list, but in other cases,
* for example, the request can be partially processed on a remote server.)
*
* context contains settings for one query.
* Usually Storage does not care about these settings, since they are used in the interpreter.
* But, for example, for distributed query processing, the settings are passed to the remote server.
*
* num_streams - a recommendation, how many streams to return,
* if the storage can return a different number of streams.
*
* It is guaranteed that the structure of the table will not change over the lifetime of the returned streams (that is, there will not be ALTER, RENAME and DROP).
*/
2021-08-11 17:28:54 +00:00
virtual Pipe watch(
const Names & /*column_names*/,
const SelectQueryInfo & /*query_info*/,
ContextPtr /*context*/,
QueryProcessingStage::Enum & /*processed_stage*/,
size_t /*max_block_size*/,
size_t /*num_streams*/);
/// Returns true if FINAL modifier must be added to SELECT query depending on required columns.
/// It's needed for ReplacingMergeTree wrappers such as MaterializedMySQL and MaterializedPostrgeSQL
virtual bool needRewriteQueryWithFinal(const Names & /*column_names*/) const { return false; }
2022-05-23 19:47:32 +00:00
private:
2017-04-16 15:00:33 +00:00
/** Read a set of columns from the table.
* Accepts a list of columns to read, as well as a description of the query,
* from which information can be extracted about how to retrieve data
* (indexes, locks, etc.)
* Returns a stream with which you can read data sequentially
* or multiple streams for parallel data reading.
* The `processed_stage` must be the result of getQueryProcessingStage() function.
2012-01-09 19:20:48 +00:00
*
* context contains settings for one query.
2017-04-16 15:00:33 +00:00
* Usually Storage does not care about these settings, since they are used in the interpreter.
* But, for example, for distributed query processing, the settings are passed to the remote server.
*
2017-06-02 15:54:39 +00:00
* num_streams - a recommendation, how many streams to return,
* if the storage can return a different number of streams.
*
2020-06-22 09:49:21 +00:00
* metadata_snapshot is consistent snapshot of table metadata, it should be
* passed in all parts of the returned pipeline. Storage metadata can be
* changed during lifetime of the returned pipeline, but the snapshot is
* guaranteed to be immutable.
2010-03-01 16:59:51 +00:00
*/
2020-08-03 11:33:11 +00:00
virtual Pipe read(
2019-09-13 12:59:48 +00:00
const Names & /*column_names*/,
const StorageSnapshotPtr & /*storage_snapshot*/,
SelectQueryInfo & /*query_info*/,
ContextPtr /*context*/,
QueryProcessingStage::Enum /*processed_stage*/,
size_t /*max_block_size*/,
size_t /*num_streams*/);
2023-04-21 08:50:07 +00:00
/// Should we process blocks of data returned by the storage in parallel
/// even when the storage returned only one stream of data for reading?
/// It is beneficial, for example, when you read from a file quickly,
/// but then do heavy computations on returned blocks.
///
/// This is enabled by default, but in some cases shouldn't be done (for
/// example it is disabled for all system tables, since it is pretty
/// useless).
virtual bool parallelizeOutputAfterReading(ContextPtr) const { return !isSystemStorage(); }
2022-05-23 19:47:32 +00:00
public:
2020-09-15 09:22:45 +00:00
/// Other version of read which adds reading step to query plan.
/// Default implementation creates ReadFromStorageStep and uses usual read.
2020-09-14 14:13:58 +00:00
virtual void read(
QueryPlan & query_plan,
const Names & /*column_names*/,
const StorageSnapshotPtr & /*storage_snapshot*/,
SelectQueryInfo & /*query_info*/,
ContextPtr /*context*/,
QueryProcessingStage::Enum /*processed_stage*/,
size_t /*max_block_size*/,
size_t /*num_streams*/);
2020-09-14 14:13:58 +00:00
2017-04-16 15:00:33 +00:00
/** Writes the data to a table.
* Receives a description of the query, which can contain information about the data write method.
* Returns an object by which you can write data sequentially.
*
2020-06-22 09:49:21 +00:00
* metadata_snapshot is consistent snapshot of table metadata, it should be
* passed in all parts of the returned streams. Storage metadata can be
* changed during lifetime of the returned streams, but the snapshot is
* guaranteed to be immutable.
*
* async_insert - set to true if the write is part of async insert flushing
2010-03-18 19:32:14 +00:00
*/
2021-07-23 14:25:35 +00:00
virtual SinkToStoragePtr write(
2017-12-01 19:34:51 +00:00
const ASTPtr & /*query*/,
const StorageMetadataPtr & /*metadata_snapshot*/,
ContextPtr /*context*/,
bool /*async_insert*/)
2010-03-18 19:32:14 +00:00
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method write is not supported by storage {}", getName());
2011-08-19 18:31:14 +00:00
}
/** Writes the data to a table in distributed manner.
* It is supposed that implementation looks into SELECT part of the query and executes distributed
* INSERT SELECT if it is possible with current storage as a receiver and query SELECT part as a producer.
*
* Returns query pipeline if distributed writing is possible, and nullptr otherwise.
*/
2022-05-20 19:49:31 +00:00
virtual std::optional<QueryPipeline> distributedWrite(
const ASTInsertQuery & /*query*/,
2022-05-20 19:49:31 +00:00
ContextPtr /*context*/);
2017-04-16 15:00:33 +00:00
/** Delete the table data. Called before deleting the directory with the data.
2018-06-09 15:48:22 +00:00
* The method can be called only after detaching table from Context (when no queries are performed with table).
* The table is not usable during and after call to this method.
2020-01-22 11:30:11 +00:00
* If some queries may still use the table, then it must be called under exclusive lock.
2017-04-16 15:00:33 +00:00
* If you do not need any action other than deleting the directory with data, you can leave this method blank.
*/
2020-01-22 11:30:11 +00:00
virtual void drop() {}
2022-06-23 07:59:13 +00:00
virtual void dropInnerTableIfAny(bool /* sync */, ContextPtr /* context */) {}
2018-06-09 15:48:22 +00:00
/** Clear the table data and leave it empty.
2020-06-22 09:49:21 +00:00
* Must be called under exclusive lock (lockExclusively).
2018-04-21 00:35:20 +00:00
*/
2020-06-18 10:29:13 +00:00
virtual void truncate(
const ASTPtr & /*query*/,
const StorageMetadataPtr & /* metadata_snapshot */,
ContextPtr /* context */,
2020-06-18 16:10:47 +00:00
TableExclusiveLockHolder &)
2018-04-21 00:35:20 +00:00
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Truncate is not supported by storage {}", getName());
2018-04-21 00:35:20 +00:00
}
2022-04-13 14:51:59 +00:00
virtual void checkTableCanBeRenamed(const StorageID & /*new_name*/) const {}
2020-09-26 19:18:28 +00:00
2017-04-16 15:00:33 +00:00
/** Rename the table.
* Renaming a name in a file with metadata, the name in the list of tables in the RAM, is done separately.
* In this function, you need to rename the directory with the data, if any.
* Called when the table structure is locked for write.
2020-04-07 14:05:51 +00:00
* Table UUID must remain unchanged, unless table moved between Ordinary and Atomic databases.
2012-06-18 06:19:13 +00:00
*/
2020-04-07 14:05:51 +00:00
virtual void rename(const String & /*new_path_to_table_data*/, const StorageID & new_table_id)
2012-06-18 06:19:13 +00:00
{
2020-04-07 14:05:51 +00:00
renameInMemory(new_table_id);
2012-06-18 06:19:13 +00:00
}
2019-12-03 16:25:32 +00:00
/**
* Just updates names of database and table without moving any data on disk
2019-12-12 12:30:31 +00:00
* Can be called directly only from DatabaseAtomic.
2019-12-03 16:25:32 +00:00
*/
2020-04-07 14:05:51 +00:00
virtual void renameInMemory(const StorageID & new_table_id);
2019-12-03 16:25:32 +00:00
2020-06-22 09:49:21 +00:00
/** ALTER tables in the form of column changes that do not affect the change
* to Storage or its parameters. Executes under alter lock (lockForAlter).
2011-08-19 18:31:14 +00:00
*/
2021-10-25 17:49:49 +00:00
virtual void alter(const AlterCommands & params, ContextPtr context, AlterLockHolder & alter_lock_holder);
2019-12-27 15:07:53 +00:00
/** Checks that alter commands can be applied to storage. For example, columns can be modified,
* or primary key can be changes, etc.
*/
virtual void checkAlterIsPossible(const AlterCommands & commands, ContextPtr context) const;
2019-12-26 18:17:05 +00:00
/**
* Checks that mutation commands can be applied to storage.
*/
virtual void checkMutationIsPossible(const MutationCommands & commands, const Settings & settings) const;
/** ALTER tables with regard to its partitions.
* Should handle locks for each command on its own.
*/
2020-08-03 11:33:11 +00:00
virtual Pipe alterPartition(
const StorageMetadataPtr & /* metadata_snapshot */,
const PartitionCommands & /* commands */,
ContextPtr /* context */);
2020-07-13 16:19:08 +00:00
/// Checks that partition commands can be applied to storage.
2023-10-13 14:22:18 +00:00
virtual void checkAlterPartitionIsPossible(
const PartitionCommands & commands,
const StorageMetadataPtr & metadata_snapshot,
const Settings & settings,
ContextPtr context) const;
2020-07-13 16:19:08 +00:00
2017-04-16 15:00:33 +00:00
/** Perform any background work. For example, combining parts in a MergeTree type table.
* Returns whether any work has been done.
2012-07-16 20:25:19 +00:00
*/
2020-06-17 13:39:26 +00:00
virtual bool optimize(
const ASTPtr & /*query*/,
const StorageMetadataPtr & /*metadata_snapshot*/,
const ASTPtr & /*partition*/,
bool /*final*/,
bool /*deduplicate*/,
const Names & /* deduplicate_by_columns */,
[RFC] Replacing merge tree new engine (#41005) * Add new engine to ReplacingMergeTree corresponding to the ReplacingCollapsingMergeTree * Add new test for the new ReplacingMergeTree engine * Limit sign value to -1/1 * Add new engine to ReplacingMergeTree corresponding to the ReplacingCollapsingMergeTree * Add new test for the new ReplacingMergeTree engine * Limit sign value to -1/1 * Replace sign column(Int8) by is_deleted(UInt8) * Add new engine to ReplacingMergeTree corresponding to the ReplacingCollapsingMergeTree * Add new test for the new ReplacingMergeTree engine * Limit sign value to -1/1 * Replace sign column(Int8) by is_deleted(UInt8) * Add new engine to ReplacingMergeTree corresponding to the ReplacingCollapsingMergeTree * Add new test for the new ReplacingMergeTree engine * Limit sign value to -1/1 * Replace sign column(Int8) by is_deleted(UInt8) * Add keyword 'CLEANUP' when OPTIMIZE * Cleanup uniquely when it's a replacingMergeTree * Propagate CLEANUP information and change from 'with_cleanup' to 'cleanup' * Cleanup data flagged as 'is_deleted' * Fix merge when optimize and add a test * Fix OPTIMIZE and INSERT + add tests * New fix for cleanup at the merge * Cleanup debug logs * Add the SETTINGS option 'clean_deleted_rows' that can be 'never' or 'always' * Fix regression bug; Now REplicatedMergeTree can be called as before without 'is_deleted' * Add Replicated tests * Disable tag 'long' for our test and cleanup some white spaces * Update tests * Fix tests and remove additional useless whitespace * Fix replica test * Style clean && add condition check for is_deleted values * clean_deleted_rows settings is nom an enum * Add valid default value to the clean_deleted_rows settings * Update cleanup checkers to use the enum and fix typos in the test * Fix submodule contrib/AMQP-CPP pointer * Add missing messages in test reference and remove a print with non derterministic order * fix replica test reference * Fix edge case * Fix a typo for the spell checker * Fix reference * Fix a condition to raise an error if is_deleted differ from 0/1 and cleanup * Change tests file name and update number * This should fix the ReplacingMergeTree parameter set * Fix replicated parameters * Disable allow_deprecated_syntax_for_merge_tree for our new column * Fix a test * Remove non deterministic order print in the test * Test on replicas * Remove a condition, when checking optional parameters, that should not be sueful since we disabled the deprected_syntaxe * Revert "Remove a condition, when checking optional parameters, that should not be useful since we disabled the deprected_syntaxe" This reverts commit b65d64c05e482945ac20fcfcf0311e1b028ea137. * Fix replica management and limit the number of argument to two maximum, due to the possiblity of deprecated table create/attach failing otherwise * Test a fix for replicated log information error * Try to add sync to have consistent results * Change path of replicas that should cause one issue and add few prints in case it's not that * Get cleanup info on replicas only if information found * Fix style issues * Try to avoid replication error 'cannot select parts...' and and replica read/write field order * Cleanup according to PR reviews and add tests on error raised. * Update src/Storages/MergeTree/registerStorageMergeTree.cpp Co-authored-by: Alexander Tokmakov <tavplubix@gmail.com> * Select ... FINAL don't show rows with is_deleted = true * Update and fix SELECT ... FINAL merge parameter * Remove is_deleted rows only on the version inserted when merge * Fix (master) updates issues * Revert changes that should not be commited * Add changes according to review * Revert changes that should not be commited - part 2 --------- Co-authored-by: Alexander Tokmakov <tavplubix@gmail.com>
2023-02-16 13:03:16 +00:00
bool /*cleanup*/,
ContextPtr /*context*/)
2012-07-16 20:25:19 +00:00
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method optimize is not supported by storage {}", getName());
2012-07-16 20:25:19 +00:00
}
/// Mutate the table contents
2023-01-30 17:38:28 +00:00
virtual void mutate(const MutationCommands &, ContextPtr)
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Mutations are not supported by storage {}", getName());
}
/// Cancel a mutation.
virtual CancellationCode killMutation(const String & /*mutation_id*/)
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Mutations are not supported by storage {}", getName());
}
2023-04-14 17:42:17 +00:00
virtual void waitForMutation(const String & /*mutation_id*/, bool /*wait_for_another_mutation*/)
2021-12-14 20:06:34 +00:00
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Mutations are not supported by storage {}", getName());
2021-12-14 20:06:34 +00:00
}
virtual void setMutationCSN(const String & /*mutation_id*/, UInt64 /*csn*/)
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Mutations are not supported by storage {}", getName());
}
/// Cancel a part move to shard.
virtual CancellationCode killPartMoveToShard(const UUID & /*task_uuid*/)
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Part moves between shards are not supported by storage {}", getName());
}
/** If the table have to do some complicated work on startup,
* that must be postponed after creation of table object
* (like launching some background threads),
* do it in this method.
* You should call this method after creation of object.
* By default, does nothing.
* Cannot be called simultaneously by multiple threads.
*/
virtual void startup() {}
2021-05-13 07:03:00 +00:00
/**
* If the storage requires some complicated work on destroying,
* then you have two virtual methods:
* - flushAndPrepareForShutdown()
2021-05-13 07:03:00 +00:00
* - shutdown()
*
* @see shutdown()
* @see flushAndPrepareForShutdown()
2021-05-13 07:03:00 +00:00
*/
2023-11-06 14:40:01 +00:00
void flushAndShutdown(bool is_drop = false)
2021-05-13 07:03:00 +00:00
{
flushAndPrepareForShutdown();
2023-11-06 14:40:01 +00:00
shutdown(is_drop);
2021-05-13 07:03:00 +00:00
}
/** If the table have to do some complicated work when destroying an object - do it in advance.
2017-04-16 15:00:33 +00:00
* For example, if the table contains any threads for background work - ask them to complete and wait for completion.
* By default, does nothing.
2017-04-16 15:00:33 +00:00
* Can be called simultaneously from different threads, even after a call to drop().
2013-09-30 01:29:19 +00:00
*/
2023-11-06 14:40:01 +00:00
virtual void shutdown(bool is_drop = false) { UNUSED(is_drop); } // NOLINT
/// Called before shutdown() to flush data to underlying storage
/// Data in memory need to be persistent
virtual void flushAndPrepareForShutdown() {}
/// Asks table to stop executing some action identified by action_type
/// If table does not support such type of lock, and empty lock is returned
virtual ActionLock getActionLock(StorageActionBlockType /* action_type */)
{
return {};
}
2020-10-16 10:12:31 +00:00
/// Call when lock from previous method removed
2020-10-15 16:10:22 +00:00
virtual void onActionLockRemove(StorageActionBlockType /* action_type */) {}
std::atomic<bool> is_dropped{false};
std::atomic<bool> is_detached{false};
2023-11-02 17:30:32 +00:00
std::atomic<bool> is_being_restarted{false};
/// Does table support index for IN sections
virtual bool supportsIndexForIn() const { return false; }
2018-01-23 08:18:12 +00:00
/// Provides a hint that the storage engine may evaluate the IN-condition by using an index.
virtual bool mayBenefitFromIndexForIn(const ASTPtr & /* left_in_operand */, ContextPtr /* query_context */, const StorageMetadataPtr & /* metadata_snapshot */) const { return false; }
/** A list of tasks to check a validity of data.
* Each IStorage implementation may interpret this task in its own way.
* E.g. for some storages it's a list of files in filesystem, for others it can be a list of parts.
* Also it may hold resources (e.g. locks) required during check.
*/
struct DataValidationTasksBase
{
/// Number of entries left to check.
/// It decreases after each call to checkDataNext().
virtual size_t size() const = 0;
virtual ~DataValidationTasksBase() = default;
};
using DataValidationTasksPtr = std::shared_ptr<DataValidationTasksBase>;
2023-10-24 12:50:24 +00:00
/// Specifies to check all data / partition / part
using CheckTaskFilter = std::variant<std::monostate, ASTPtr, String>;
virtual DataValidationTasksPtr getCheckTaskList(const CheckTaskFilter & /* check_task_filter */, ContextPtr /* context */);
/** Executes one task from the list.
* If no tasks left - returns nullopt.
* Note: Function `checkDataNext` is accessing `check_task_list` thread-safely,
* and can be called simultaneously for the same `getCheckTaskList` result
* to process different tasks in parallel.
* Usage:
*
2023-10-24 12:50:24 +00:00
* auto check_task_list = storage.getCheckTaskList({}, context);
* size_t total_tasks = check_task_list->size();
* while (true)
* {
* size_t tasks_left = check_task_list->size();
* std::cout << "Checking data: " << (total_tasks - tasks_left) << " / " << total_tasks << " tasks done." << std::endl;
* auto result = storage.checkDataNext(check_task_list);
* if (!result)
* break;
* doSomething(*result);
* }
*/
virtual std::optional<CheckResult> checkDataNext(DataValidationTasksPtr & check_task_list);
/// Checks that table could be dropped right now
2018-08-03 09:54:46 +00:00
/// Otherwise - throws an exception with detailed information.
2018-08-03 13:19:53 +00:00
/// We do not use mutex because it is not very important that the size could change during the operation.
2023-08-29 14:26:48 +00:00
virtual void checkTableCanBeDropped([[ maybe_unused ]] ContextPtr query_context) const {}
/// Similar to above but checks for DETACH. It's only used for DICTIONARIES.
virtual void checkTableCanBeDetached() const {}
2020-11-01 17:38:43 +00:00
/// Returns true if Storage may store some data on disk.
/// NOTE: may not be equivalent to !getDataPaths().empty()
virtual bool storesDataOnDisk() const { return false; }
2019-08-01 10:29:14 +00:00
/// Returns data paths if storage supports it, empty vector otherwise.
2019-04-04 13:13:59 +00:00
virtual Strings getDataPaths() const { return {}; }
2020-05-25 17:57:08 +00:00
/// Returns storage policy if storage supports it.
2019-11-27 09:39:44 +00:00
virtual StoragePolicyPtr getStoragePolicy() const { return {}; }
2014-03-09 17:36:01 +00:00
/// Returns true if all disks of storage are read-only or write-once.
/// NOTE: write-once also does not support INSERTs/merges/... for MergeTree
2021-09-04 09:02:07 +00:00
virtual bool isStaticStorage() const;
2021-08-23 11:26:54 +00:00
2020-03-29 08:38:38 +00:00
/// If it is possible to quickly determine exact number of rows in the table at this moment of time, then return it.
/// Used for:
/// - Simple count() optimization
2020-03-29 08:38:38 +00:00
/// - For total_rows column in system.tables
///
/// Does takes underlying Storage (if any) into account.
2020-11-25 13:47:32 +00:00
virtual std::optional<UInt64> totalRows(const Settings &) const { return {}; }
2020-09-21 10:13:01 +00:00
/// Same as above but also take partition predicate into account.
virtual std::optional<UInt64> totalRowsByPartitionPredicate(const SelectQueryInfo &, ContextPtr) const { return {}; }
2020-09-21 10:13:01 +00:00
2020-03-29 08:38:38 +00:00
/// If it is possible to quickly determine exact number of bytes for the table on storage:
/// - memory (approximated, resident)
2020-03-29 08:38:38 +00:00
/// - disk (compressed)
///
/// Used for:
/// - For total_bytes column in system.tables
//
/// Does not takes underlying Storage (if any) into account
/// (since for Buffer we still need to know how much bytes it uses).
///
/// Memory part should be estimated as a resident memory size.
/// In particular, alloctedBytes() is preferable over bytes()
/// when considering in-memory blocks.
2020-11-25 13:47:32 +00:00
virtual std::optional<UInt64> totalBytes(const Settings &) const { return {}; }
2020-03-29 08:38:38 +00:00
/// If it is possible to quickly determine exact number of uncompressed bytes for the table on storage:
/// - disk (uncompressed)
///
/// Used for:
/// - For total_bytes_uncompressed column in system.tables
2023-11-24 08:20:11 +00:00
///
2023-12-09 12:09:41 +00:00
/// Does not take underlying Storage (if any) into account
2023-11-29 09:09:56 +00:00
virtual std::optional<UInt64> totalBytesUncompressed(const Settings &) const { return {}; }
/// Number of rows INSERTed since server start.
///
2022-12-18 00:57:06 +00:00
/// Does not take the underlying Storage (if any) into account.
virtual std::optional<UInt64> lifetimeRows() const { return {}; }
/// Number of bytes INSERTed since server start.
///
2022-12-18 00:57:06 +00:00
/// Does not take the underlying Storage (if any) into account.
virtual std::optional<UInt64> lifetimeBytes() const { return {}; }
2022-03-01 16:32:55 +00:00
/// Creates a storage snapshot from given metadata.
virtual StorageSnapshotPtr getStorageSnapshot(const StorageMetadataPtr & metadata_snapshot, ContextPtr /*query_context*/) const
{
return std::make_shared<StorageSnapshot>(*this, metadata_snapshot);
}
2022-03-01 16:32:55 +00:00
/// Creates a storage snapshot from given metadata and columns, which are used in query.
virtual StorageSnapshotPtr getStorageSnapshotForQuery(const StorageMetadataPtr & metadata_snapshot, const ASTPtr & /*query*/, ContextPtr query_context) const
{
return getStorageSnapshot(metadata_snapshot, query_context);
}
2021-10-06 10:37:58 +00:00
2023-05-25 22:54:54 +00:00
/// Creates a storage snapshot but without holding a data specific to storage.
virtual StorageSnapshotPtr getStorageSnapshotWithoutData(const StorageMetadataPtr & metadata_snapshot, ContextPtr query_context) const
{
return getStorageSnapshot(metadata_snapshot, query_context);
}
/// Re initialize disks in case the underlying storage policy changed
virtual bool initializeDiskOnConfigChange(const std::set<String> & /*new_added_disks*/) { return true; }
2022-05-20 19:49:31 +00:00
/// A helper to implement read()
static void readFromPipe(
QueryPlan & query_plan,
Pipe pipe,
const Names & column_names,
const StorageSnapshotPtr & storage_snapshot,
SelectQueryInfo & query_info,
ContextPtr context,
std::string storage_name);
private:
2021-10-25 17:49:49 +00:00
/// Lock required for alter queries (lockForAlter).
/// Allows to execute only one simultaneous alter query.
mutable std::timed_mutex alter_lock;
2020-06-18 16:10:47 +00:00
2020-06-22 09:49:21 +00:00
/// Lock required for drop queries. Every thread that want to ensure, that
2020-08-08 00:47:03 +00:00
/// table is not dropped have to table this lock for read (lockForShare).
2020-06-22 09:49:21 +00:00
/// DROP-like queries take this lock for write (lockExclusively), to be sure
/// that all table threads finished.
2020-06-18 16:10:47 +00:00
mutable RWLock drop_lock = RWLockImpl::create();
};
2010-03-01 16:59:51 +00:00
2011-08-15 01:12:57 +00:00
}