// ClickHouse/src/Storages/StorageBuffer.h

#pragma once
#include <Core/BackgroundSchedulePool.h>
#include <Core/NamesAndTypes.h>
#include <Storages/IStorage.h>
#include <Poco/Event.h>
#include <atomic>
#include <mutex>
#include <thread>
/// Forward declaration only: this header stores a `Poco::Logger *` and does not need the full definition.
namespace Poco { class Logger; }
namespace DB
{
/** During insertion, buffers the data in RAM until certain thresholds are exceeded.
  * When thresholds are exceeded, flushes the data to another table.
  * When reading, it reads both from its buffers and from the subordinate table.
  *
  * The buffer is a set of `num_shards` blocks.
  * When writing, the block number is selected as the remainder of dividing `ThreadNumber` by `num_shards` (or one of the others),
  *  and the rows are added to the corresponding block.
  * While a block is in use, it is locked by a mutex. If, during a write, the corresponding block is already occupied,
  *  try to lock the next block in a round-robin fashion, no more than `num_shards` times (then wait for the lock).
  *
  * Thresholds are checked on insertion and, periodically, in the background thread (to implement time thresholds).
  * Thresholds act independently for each shard. Each shard can be flushed independently of the others.
  * If a block that by itself exceeds the max-thresholds is inserted into the table, it is written directly to the subordinate table without buffering.
  * Thresholds can be exceeded. For example, if max_rows = 1 000 000, the buffer already had 500 000 rows,
  *  and a part of 800 000 rows is added, then there will be 1 300 000 rows in the buffer, and then such a block will be written to the subordinate table.
  *
  * There are also separate thresholds for flush; those thresholds are checked only for non-direct flush.
  * This may be useful if you do not want to add extra latency to INSERT queries:
  *  you can set max_rows=1e6 and flush_rows=500e3, and then each buffer of 500e3 rows will be flushed in the background only.
  *
  * When you destroy a Buffer table, all remaining data is flushed to the subordinate table.
  * The data in the buffer is not replicated, not logged to disk, not indexed. With a rough restart of the server, the data is lost.
  */
class StorageBuffer final : public IStorage, WithContext
{
2020-01-29 18:14:40 +00:00
friend class BufferSource;
2021-07-23 14:25:35 +00:00
friend class BufferSink;
public:
struct Thresholds
{
time_t time = 0; /// The number of seconds from the insertion of the first row into the block.
size_t rows = 0; /// The number of rows in the block.
size_t bytes = 0; /// The number of (uncompressed) bytes in the block.
};
/** num_shards - the level of internal parallelism (the number of independent buffers)
* The buffer is flushed if all minimum thresholds or at least one of the maximum thresholds are exceeded.
*/
StorageBuffer(
const StorageID & table_id_,
const ColumnsDescription & columns_,
const ConstraintsDescription & constraints_,
const String & comment,
ContextPtr context_,
size_t num_shards_,
const Thresholds & min_thresholds_,
const Thresholds & max_thresholds_,
const Thresholds & flush_thresholds_,
const StorageID & destination_id,
bool allow_materialized_);
std::string getName() const override { return "Buffer"; }
QueryProcessingStage::Enum
getQueryProcessingStage(ContextPtr, QueryProcessingStage::Enum, const StorageSnapshotPtr &, SelectQueryInfo &) const override;
void read(
QueryPlan & query_plan,
const Names & column_names,
const StorageSnapshotPtr & storage_snapshot,
SelectQueryInfo & query_info,
ContextPtr context,
QueryProcessingStage::Enum processed_stage,
size_t max_block_size,
size_t num_streams) override;
bool supportsParallelInsert() const override { return true; }
2020-12-22 16:40:53 +00:00
bool supportsSubcolumns() const override { return true; }
SinkToStoragePtr write(const ASTPtr & query, const StorageMetadataPtr & /*metadata_snapshot*/, ContextPtr context, bool /*async_insert*/) override;
void startup() override;
/// Flush all buffers into the subordinate table and stop background thread.
void flush() override;
2020-11-06 14:07:56 +00:00
bool optimize(
const ASTPtr & query,
const StorageMetadataPtr & metadata_snapshot,
const ASTPtr & partition,
bool final,
bool deduplicate,
const Names & deduplicate_by_columns,
[RFC] Replacing merge tree new engine (#41005) * Add new engine to ReplacingMergeTree corresponding to the ReplacingCollapsingMergeTree * Add new test for the new ReplacingMergeTree engine * Limit sign value to -1/1 * Add new engine to ReplacingMergeTree corresponding to the ReplacingCollapsingMergeTree * Add new test for the new ReplacingMergeTree engine * Limit sign value to -1/1 * Replace sign column(Int8) by is_deleted(UInt8) * Add new engine to ReplacingMergeTree corresponding to the ReplacingCollapsingMergeTree * Add new test for the new ReplacingMergeTree engine * Limit sign value to -1/1 * Replace sign column(Int8) by is_deleted(UInt8) * Add new engine to ReplacingMergeTree corresponding to the ReplacingCollapsingMergeTree * Add new test for the new ReplacingMergeTree engine * Limit sign value to -1/1 * Replace sign column(Int8) by is_deleted(UInt8) * Add keyword 'CLEANUP' when OPTIMIZE * Cleanup uniquely when it's a replacingMergeTree * Propagate CLEANUP information and change from 'with_cleanup' to 'cleanup' * Cleanup data flagged as 'is_deleted' * Fix merge when optimize and add a test * Fix OPTIMIZE and INSERT + add tests * New fix for cleanup at the merge * Cleanup debug logs * Add the SETTINGS option 'clean_deleted_rows' that can be 'never' or 'always' * Fix regression bug; Now REplicatedMergeTree can be called as before without 'is_deleted' * Add Replicated tests * Disable tag 'long' for our test and cleanup some white spaces * Update tests * Fix tests and remove additional useless whitespace * Fix replica test * Style clean && add condition check for is_deleted values * clean_deleted_rows settings is nom an enum * Add valid default value to the clean_deleted_rows settings * Update cleanup checkers to use the enum and fix typos in the test * Fix submodule contrib/AMQP-CPP pointer * Add missing messages in test reference and remove a print with non derterministic order * fix replica test reference * Fix edge case * Fix a typo for the spell checker * 
Fix reference * Fix a condition to raise an error if is_deleted differ from 0/1 and cleanup * Change tests file name and update number * This should fix the ReplacingMergeTree parameter set * Fix replicated parameters * Disable allow_deprecated_syntax_for_merge_tree for our new column * Fix a test * Remove non deterministic order print in the test * Test on replicas * Remove a condition, when checking optional parameters, that should not be sueful since we disabled the deprected_syntaxe * Revert "Remove a condition, when checking optional parameters, that should not be useful since we disabled the deprected_syntaxe" This reverts commit b65d64c05e482945ac20fcfcf0311e1b028ea137. * Fix replica management and limit the number of argument to two maximum, due to the possiblity of deprecated table create/attach failing otherwise * Test a fix for replicated log information error * Try to add sync to have consistent results * Change path of replicas that should cause one issue and add few prints in case it's not that * Get cleanup info on replicas only if information found * Fix style issues * Try to avoid replication error 'cannot select parts...' and and replica read/write field order * Cleanup according to PR reviews and add tests on error raised. * Update src/Storages/MergeTree/registerStorageMergeTree.cpp Co-authored-by: Alexander Tokmakov <tavplubix@gmail.com> * Select ... FINAL don't show rows with is_deleted = true * Update and fix SELECT ... FINAL merge parameter * Remove is_deleted rows only on the version inserted when merge * Fix (master) updates issues * Revert changes that should not be commited * Add changes according to review * Revert changes that should not be commited - part 2 --------- Co-authored-by: Alexander Tokmakov <tavplubix@gmail.com>
2023-02-16 13:03:16 +00:00
bool cleanup,
ContextPtr context) override;
bool supportsSampling() const override { return true; }
bool supportsPrewhere() const override;
2015-05-18 21:20:43 +00:00
bool supportsFinal() const override { return true; }
bool supportsIndexForIn() const override { return true; }
bool mayBenefitFromIndexForIn(const ASTPtr & left_in_operand, ContextPtr query_context, const StorageMetadataPtr & metadata_snapshot) const override;
2018-03-16 09:00:04 +00:00
void checkAlterIsPossible(const AlterCommands & commands, ContextPtr context) const override;
2019-12-26 18:17:05 +00:00
/// The structure of the subordinate table is not checked and does not change.
2021-10-25 17:49:49 +00:00
void alter(const AlterCommands & params, ContextPtr context, AlterLockHolder & table_lock_holder) override;
2020-11-25 13:47:32 +00:00
std::optional<UInt64> totalRows(const Settings & settings) const override;
std::optional<UInt64> totalBytes(const Settings & settings) const override;
std::optional<UInt64> lifetimeRows() const override { return lifetime_writes.rows; }
std::optional<UInt64> lifetimeBytes() const override { return lifetime_writes.bytes; }
private:
struct Buffer
{
time_t first_write_time = 0;
Block data;
std::unique_lock<std::mutex> lockForReading() const;
std::unique_lock<std::mutex> lockForWriting() const;
std::unique_lock<std::mutex> tryLock() const;
private:
mutable std::mutex mutex;
std::unique_lock<std::mutex> lockImpl(bool read) const;
};
2017-04-16 15:00:33 +00:00
/// There are `num_shards` of independent buffers.
const size_t num_shards;
std::vector<Buffer> buffers;
const Thresholds min_thresholds;
const Thresholds max_thresholds;
const Thresholds flush_thresholds;
2020-02-17 19:28:25 +00:00
StorageID destination_id;
2018-01-12 13:03:19 +00:00
bool allow_materialized;
struct Writes
{
std::atomic<size_t> rows = 0;
std::atomic<size_t> bytes = 0;
};
Writes lifetime_writes;
Writes total_writes;
Poco::Logger * log;
void flushAllBuffers(bool check_thresholds = true);
bool flushBuffer(Buffer & buffer, bool check_thresholds, bool locked = false);
bool checkThresholds(const Buffer & buffer, bool direct, time_t current_time, size_t additional_rows = 0, size_t additional_bytes = 0) const;
bool checkThresholdsImpl(bool direct, size_t rows, size_t bytes, time_t time_passed) const;
2017-04-16 15:00:33 +00:00
/// `table` argument is passed, as it is sometimes evaluated beforehand. It must match the `destination`.
void writeBlockToDestination(const Block & block, StoragePtr table);
void backgroundFlush();
void reschedule();
2022-08-26 01:00:56 +00:00
StoragePtr getDestinationTable() const;
BackgroundSchedulePool & bg_pool;
BackgroundSchedulePoolTaskHolder flush_handle;
};
}