mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-29 02:52:13 +00:00
in-memory parts: sync insert to replicated
This commit is contained in:
parent
66e31d4311
commit
a3ac224ae4
@ -23,12 +23,14 @@ void MergeTreeBlockOutputStream::write(const Block & block)
|
||||
storage.delayInsertOrThrowIfNeeded();
|
||||
|
||||
auto part_blocks = storage.writer.splitBlockIntoParts(block, max_parts_per_block);
|
||||
MergeTreeData::DataPartsVector inserted_parts;
|
||||
for (auto & current_block : part_blocks)
|
||||
{
|
||||
Stopwatch watch;
|
||||
|
||||
MergeTreeData::MutableDataPartPtr part = storage.writer.writeTempPart(current_block);
|
||||
storage.renameTempPartAndAdd(part, &storage.increment);
|
||||
inserted_parts.push_back(part);
|
||||
|
||||
PartLog::addNewPart(storage.global_context, part, watch.elapsed());
|
||||
|
||||
@ -36,20 +38,11 @@ void MergeTreeBlockOutputStream::write(const Block & block)
|
||||
{
|
||||
storage.in_memory_merges_throttler.add(part_in_memory->block.bytes(), part_in_memory->rows_count);
|
||||
|
||||
auto settings = storage.getSettings();
|
||||
if (settings->in_memory_parts_insert_sync)
|
||||
{
|
||||
if (!part_in_memory->waitUntilMerged(in_memory_parts_timeout))
|
||||
throw Exception("Timeout exceeded while waiting to write part "
|
||||
+ part->name + " on disk", ErrorCodes::TIMEOUT_EXCEEDED);
|
||||
}
|
||||
else if (storage.merging_mutating_task_handle && !storage.in_memory_merges_throttler.needDelayMerge())
|
||||
if (storage.merging_mutating_task_handle && !storage.in_memory_merges_throttler.needDelayMerge())
|
||||
{
|
||||
storage.in_memory_merges_throttler.reset();
|
||||
storage.merging_mutating_task_handle->wake();
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
else if (storage.merging_mutating_task_handle)
|
||||
{
|
||||
@ -57,6 +50,17 @@ void MergeTreeBlockOutputStream::write(const Block & block)
|
||||
storage.merging_mutating_task_handle->wake();
|
||||
}
|
||||
}
|
||||
|
||||
if (storage.getSettings()->in_memory_parts_insert_sync)
|
||||
{
|
||||
for (const auto & part : inserted_parts)
|
||||
{
|
||||
auto part_in_memory = asInMemoryPart(part);
|
||||
if (!part_in_memory->waitUntilMerged(in_memory_parts_timeout))
|
||||
throw Exception("Timeout exceeded while waiting to write part "
|
||||
+ part->name + " on disk", ErrorCodes::TIMEOUT_EXCEEDED);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -245,7 +245,8 @@ MergeTreeData::MergeTreeData(
|
||||
|
||||
String reason;
|
||||
if (!canUsePolymorphicParts(*settings, &reason) && !reason.empty())
|
||||
LOG_WARNING(log, "{} Settings 'min_bytes_for_wide_part' and 'min_bytes_for_wide_part' will be ignored.", reason);
|
||||
LOG_WARNING(log, "{} Settings 'min_rows_for_wide_part', 'min_bytes_for_wide_part', "
|
||||
"'min_rows_for_compact_part' and 'min_bytes_for_compact_part' will be ignored.", reason);
|
||||
}
|
||||
|
||||
|
||||
@ -1592,10 +1593,10 @@ void MergeTreeData::checkAlterIsPossible(const AlterCommands & commands, const S
|
||||
|
||||
MergeTreeDataPartType MergeTreeData::choosePartType(size_t bytes_uncompressed, size_t rows_count) const
|
||||
{
|
||||
if (!canUseAdaptiveGranularity())
|
||||
const auto settings = getSettings();
|
||||
if (!canUsePolymorphicParts(*settings))
|
||||
return MergeTreeDataPartType::WIDE;
|
||||
|
||||
const auto settings = getSettings();
|
||||
if (bytes_uncompressed < settings->min_bytes_for_compact_part || rows_count < settings->min_rows_for_compact_part)
|
||||
return MergeTreeDataPartType::IN_MEMORY;
|
||||
|
||||
@ -1607,10 +1608,10 @@ MergeTreeDataPartType MergeTreeData::choosePartType(size_t bytes_uncompressed, s
|
||||
|
||||
MergeTreeDataPartType MergeTreeData::choosePartTypeOnDisk(size_t bytes_uncompressed, size_t rows_count) const
|
||||
{
|
||||
if (!canUseAdaptiveGranularity())
|
||||
const auto settings = getSettings();
|
||||
if (!canUsePolymorphicParts(*settings))
|
||||
return MergeTreeDataPartType::WIDE;
|
||||
|
||||
const auto settings = getSettings();
|
||||
if (bytes_uncompressed < settings->min_bytes_for_wide_part || rows_count < settings->min_rows_for_wide_part)
|
||||
return MergeTreeDataPartType::COMPACT;
|
||||
|
||||
@ -3605,11 +3606,15 @@ bool MergeTreeData::canUsePolymorphicParts(const MergeTreeSettings & settings, S
|
||||
{
|
||||
if (!canUseAdaptiveGranularity())
|
||||
{
|
||||
if ((settings.min_rows_for_wide_part != 0 || settings.min_bytes_for_wide_part != 0) && out_reason)
|
||||
if (out_reason && (settings.min_rows_for_wide_part != 0 || settings.min_bytes_for_wide_part != 0
|
||||
|| settings.min_rows_for_compact_part != 0 || settings.min_bytes_for_compact_part != 0))
|
||||
{
|
||||
std::ostringstream message;
|
||||
message << "Table can't create parts with adaptive granularity, but settings min_rows_for_wide_part = "
|
||||
<< settings.min_rows_for_wide_part << ", min_bytes_for_wide_part = " << settings.min_bytes_for_wide_part
|
||||
message << "Table can't create parts with adaptive granularity, but settings"
|
||||
<< "min_rows_for_wide_part = " << settings.min_rows_for_wide_part
|
||||
<< ", min_bytes_for_wide_part = " << settings.min_bytes_for_wide_part
|
||||
<< ", min_rows_for_compact_part = " << settings.min_rows_for_compact_part
|
||||
<< ", min_bytes_for_compact_part = " << settings.min_bytes_for_compact_part
|
||||
<< ". Parts with non-adaptive granularity can be stored only in Wide (default) format.";
|
||||
*out_reason = message.str();
|
||||
}
|
||||
|
@ -901,7 +901,7 @@ private:
|
||||
/// Check selected parts for movements. Used by ALTER ... MOVE queries.
|
||||
CurrentlyMovingPartsTagger checkPartsForMove(const DataPartsVector & parts, SpacePtr space);
|
||||
|
||||
bool canUsePolymorphicParts(const MergeTreeSettings & settings, String * out_reason) const;
|
||||
bool canUsePolymorphicParts(const MergeTreeSettings & settings, String * out_reason = nullptr) const;
|
||||
|
||||
WriteAheadLogPtr write_ahead_log;
|
||||
};
|
||||
|
@ -95,16 +95,19 @@ void MergeTreeDataPartInMemory::makeCloneInDetached(const String & prefix) const
|
||||
flushToDisk(storage.getRelativeDataPath(), detached_path);
|
||||
}
|
||||
|
||||
bool MergeTreeDataPartInMemory::waitUntilMerged(size_t timeout) const
|
||||
bool MergeTreeDataPartInMemory::waitUntilMerged(size_t timeout_ms) const
|
||||
{
|
||||
auto lock = storage.lockParts();
|
||||
return is_merged.wait_for(lock, std::chrono::milliseconds(timeout),
|
||||
[this]() { return state == State::Outdated; });
|
||||
return is_merged.wait_for(lock, std::chrono::milliseconds(timeout_ms),
|
||||
[this]() { return state != State::Committed; });
|
||||
}
|
||||
|
||||
void MergeTreeDataPartInMemory::notifyMerged() const
|
||||
{
|
||||
is_merged.notify_one();
|
||||
LOG_DEBUG(&Poco::Logger::get("InMemPart"), "notifiedMerged");
|
||||
LOG_DEBUG(&Poco::Logger::get("InMemPart"), "state {}", stateString());
|
||||
|
||||
is_merged.notify_all();
|
||||
}
|
||||
|
||||
void MergeTreeDataPartInMemory::renameTo(const String & new_relative_path, bool /* remove_new_dir_if_exists */) const
|
||||
|
@ -45,7 +45,7 @@ public:
|
||||
|
||||
void flushToDisk(const String & base_path, const String & new_relative_path) const;
|
||||
|
||||
bool waitUntilMerged(size_t timeout) const;
|
||||
bool waitUntilMerged(size_t timeout_ms) const;
|
||||
void notifyMerged() const;
|
||||
|
||||
mutable Block block;
|
||||
|
@ -121,7 +121,8 @@ struct MergeTreeSettings : public SettingsCollection<MergeTreeSettings>
|
||||
|
||||
static bool isPartFormatSetting(const String & name)
|
||||
{
|
||||
return name == "min_bytes_for_wide_part" || name == "min_rows_for_wide_part";
|
||||
return name == "min_bytes_for_wide_part" || name == "min_rows_for_wide_part"
|
||||
|| name == "min_bytes_for_compact_part" || name == "min_rows_for_compact_part";
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -31,9 +31,14 @@ namespace ErrorCodes
|
||||
|
||||
|
||||
ReplicatedMergeTreeBlockOutputStream::ReplicatedMergeTreeBlockOutputStream(
|
||||
StorageReplicatedMergeTree & storage_, size_t quorum_, size_t quorum_timeout_ms_, size_t max_parts_per_block_, bool deduplicate_)
|
||||
: storage(storage_), quorum(quorum_), quorum_timeout_ms(quorum_timeout_ms_), max_parts_per_block(max_parts_per_block_), deduplicate(deduplicate_),
|
||||
log(&Poco::Logger::get(storage.getLogName() + " (Replicated OutputStream)"))
|
||||
StorageReplicatedMergeTree & storage_, size_t quorum_, size_t quorum_timeout_ms_,
|
||||
size_t max_parts_per_block_, size_t insert_in_memory_parts_timeout_ms_, bool deduplicate_)
|
||||
: storage(storage_), quorum(quorum_)
|
||||
, quorum_timeout_ms(quorum_timeout_ms_)
|
||||
, max_parts_per_block(max_parts_per_block_)
|
||||
, insert_in_memory_parts_timeout_ms(insert_in_memory_parts_timeout_ms_)
|
||||
, deduplicate(deduplicate_)
|
||||
, log(&Poco::Logger::get(storage.getLogName() + " (Replicated OutputStream)"))
|
||||
{
|
||||
/// The quorum value `1` has the same meaning as if it is disabled.
|
||||
if (quorum == 1)
|
||||
@ -365,6 +370,14 @@ void ReplicatedMergeTreeBlockOutputStream::commitPart(zkutil::ZooKeeperPtr & zoo
|
||||
+ zkutil::ZooKeeper::error2string(multi_code), ErrorCodes::UNEXPECTED_ZOOKEEPER_ERROR);
|
||||
}
|
||||
|
||||
auto part_in_memory = asInMemoryPart(part);
|
||||
if (part_in_memory && storage.getSettings()->in_memory_parts_insert_sync)
|
||||
{
|
||||
if (!part_in_memory->waitUntilMerged(insert_in_memory_parts_timeout_ms))
|
||||
throw Exception("Timeout exceeded while waiting to write part "
|
||||
+ part->name + " on disk", ErrorCodes::TIMEOUT_EXCEEDED);
|
||||
}
|
||||
|
||||
if (quorum)
|
||||
{
|
||||
/// We are waiting for quorum to be satisfied.
|
||||
|
@ -24,7 +24,7 @@ class ReplicatedMergeTreeBlockOutputStream : public IBlockOutputStream
|
||||
public:
|
||||
ReplicatedMergeTreeBlockOutputStream(StorageReplicatedMergeTree & storage_,
|
||||
size_t quorum_, size_t quorum_timeout_ms_, size_t max_parts_per_block_,
|
||||
bool deduplicate_);
|
||||
size_t insert_in_memory_parts_timeout_ms_, bool deduplicate_);
|
||||
|
||||
Block getHeader() const override;
|
||||
void writePrefix() override;
|
||||
@ -58,6 +58,7 @@ private:
|
||||
size_t quorum;
|
||||
size_t quorum_timeout_ms;
|
||||
size_t max_parts_per_block;
|
||||
size_t insert_in_memory_parts_timeout_ms;
|
||||
|
||||
bool deduplicate = true;
|
||||
bool last_block_is_duplicate = false;
|
||||
|
@ -1085,20 +1085,6 @@ bool StorageReplicatedMergeTree::tryExecuteMerge(const LogEntry & entry)
|
||||
try
|
||||
{
|
||||
checkPartChecksumsAndCommit(transaction, part);
|
||||
|
||||
DataPartsVector parts_to_remove_immediatly;
|
||||
for (const auto & part_ptr : parts)
|
||||
{
|
||||
if (auto part_in_memory = asInMemoryPart(part_ptr))
|
||||
{
|
||||
part_in_memory->notifyMerged();
|
||||
modifyPartState(part_in_memory, DataPartState::Deleting);
|
||||
parts_to_remove_immediatly.push_back(part_in_memory);
|
||||
}
|
||||
}
|
||||
|
||||
tryRemovePartsFromZooKeeperWithRetries(parts_to_remove_immediatly);
|
||||
removePartsFinally(parts_to_remove_immediatly);
|
||||
}
|
||||
catch (const Exception & e)
|
||||
{
|
||||
@ -1122,6 +1108,20 @@ bool StorageReplicatedMergeTree::tryExecuteMerge(const LogEntry & entry)
|
||||
throw;
|
||||
}
|
||||
|
||||
DataPartsVector parts_to_remove_immediatly;
|
||||
for (const auto & part_ptr : parts)
|
||||
{
|
||||
if (auto part_in_memory = asInMemoryPart(part_ptr))
|
||||
{
|
||||
modifyPartState(part_in_memory, DataPartState::Deleting);
|
||||
part_in_memory->notifyMerged();
|
||||
parts_to_remove_immediatly.push_back(part_in_memory);
|
||||
}
|
||||
}
|
||||
|
||||
tryRemovePartsFromZooKeeperWithRetries(parts_to_remove_immediatly);
|
||||
removePartsFinally(parts_to_remove_immediatly);
|
||||
|
||||
/** Removing old parts from ZK and from the disk is delayed - see ReplicatedMergeTreeCleanupThread, clearOldParts.
|
||||
*/
|
||||
|
||||
@ -3138,7 +3138,11 @@ BlockOutputStreamPtr StorageReplicatedMergeTree::write(const ASTPtr & /*query*/,
|
||||
bool deduplicate = storage_settings_ptr->replicated_deduplication_window != 0 && query_settings.insert_deduplicate;
|
||||
|
||||
return std::make_shared<ReplicatedMergeTreeBlockOutputStream>(*this,
|
||||
query_settings.insert_quorum, query_settings.insert_quorum_timeout.totalMilliseconds(), query_settings.max_partitions_per_insert_block, deduplicate);
|
||||
query_settings.insert_quorum,
|
||||
query_settings.insert_quorum_timeout.totalMilliseconds(),
|
||||
query_settings.max_partitions_per_insert_block,
|
||||
query_settings.insert_in_memory_parts_timeout.totalMilliseconds(),
|
||||
deduplicate);
|
||||
}
|
||||
|
||||
|
||||
@ -3662,7 +3666,7 @@ void StorageReplicatedMergeTree::attachPartition(const ASTPtr & partition, bool
|
||||
PartsTemporaryRename renamed_parts(*this, "detached/");
|
||||
MutableDataPartsVector loaded_parts = tryLoadPartsToAttach(partition, attach_part, query_context, renamed_parts);
|
||||
|
||||
ReplicatedMergeTreeBlockOutputStream output(*this, 0, 0, 0, false); /// TODO Allow to use quorum here.
|
||||
ReplicatedMergeTreeBlockOutputStream output(*this, 0, 0, 0, 0, false); /// TODO Allow to use quorum here.
|
||||
for (size_t i = 0; i < loaded_parts.size(); ++i)
|
||||
{
|
||||
String old_name = loaded_parts[i]->name;
|
||||
|
@ -9,6 +9,7 @@ from helpers.test_tools import TSV
|
||||
from helpers.test_tools import assert_eq_with_retry
|
||||
from helpers.cluster import ClickHouseCluster
|
||||
from helpers.network import PartitionManager
|
||||
from multiprocessing.dummy import Pool
|
||||
|
||||
cluster = ClickHouseCluster(__file__)
|
||||
|
||||
@ -90,6 +91,8 @@ def start_cluster():
|
||||
create_tables('in_memory_table', [node9, node10], [settings_in_memory, settings_in_memory], "shard4")
|
||||
create_tables('wal_table', [node11, node12], [settings_in_memory, settings_in_memory], "shard4")
|
||||
create_tables('restore_table', [node11, node12], [settings_in_memory, settings_in_memory], "shard5")
|
||||
create_tables('deduplication_table', [node9, node10], [settings_in_memory, settings_in_memory], "shard5")
|
||||
create_tables('sync_table', [node9, node10], [settings_in_memory, settings_in_memory], "shard5")
|
||||
|
||||
yield cluster
|
||||
|
||||
@ -422,6 +425,45 @@ def test_in_memory_wal_rotate(start_cluster):
|
||||
assert os.path.exists(wal_file)
|
||||
assert os.path.getsize(wal_file) == 0
|
||||
|
||||
def test_in_memory_deduplication(start_cluster):
|
||||
for i in range(3):
|
||||
node9.query("INSERT INTO deduplication_table (date, id, s) VALUES (toDate('2020-03-03'), 1, 'foo')")
|
||||
node10.query("INSERT INTO deduplication_table (date, id, s) VALUES (toDate('2020-03-03'), 1, 'foo')")
|
||||
|
||||
node9.query("SYSTEM SYNC REPLICA deduplication_table", timeout=20)
|
||||
node10.query("SYSTEM SYNC REPLICA deduplication_table", timeout=20)
|
||||
|
||||
assert node9.query("SELECT date, id, s FROM deduplication_table") == "2020-03-03\t1\tfoo\n"
|
||||
assert node10.query("SELECT date, id, s FROM deduplication_table") == "2020-03-03\t1\tfoo\n"
|
||||
|
||||
def test_in_memory_sync_insert(start_cluster):
|
||||
node9.query("ALTER TABLE sync_table MODIFY SETTING in_memory_parts_insert_sync = 1")
|
||||
node10.query("ALTER TABLE sync_table MODIFY SETTING in_memory_parts_insert_sync = 1")
|
||||
node9.query("SYSTEM STOP MERGES sync_table")
|
||||
node10.query("SYSTEM STOP MERGES sync_table")
|
||||
|
||||
pool = Pool(5)
|
||||
tasks = []
|
||||
for i in range(5):
|
||||
tasks.append(pool.apply_async(insert_random_data, ('sync_table', node9, 50)))
|
||||
|
||||
time.sleep(5)
|
||||
assert node9.query("SELECT count() FROM sync_table") == "250\n"
|
||||
assert node9.query("SELECT part_type, count() FROM system.parts WHERE table = 'sync_table' AND active GROUP BY part_type") == "InMemory\t5\n"
|
||||
|
||||
for task in tasks:
|
||||
assert not task.ready()
|
||||
|
||||
node9.query("SYSTEM START MERGES sync_table")
|
||||
node10.query("SYSTEM START MERGES sync_table")
|
||||
assert_eq_with_retry(node9, "OPTIMIZE TABLE sync_table FINAL SETTINGS optimize_throw_if_noop = 1", "")
|
||||
|
||||
for task in tasks:
|
||||
task.get()
|
||||
|
||||
assert node9.query("SELECT count() FROM sync_table") == "250\n"
|
||||
assert node9.query("SELECT part_type, count() FROM system.parts WHERE table = 'sync_table' AND active GROUP BY part_type") == "Compact\t1\n"
|
||||
|
||||
def test_polymorphic_parts_index(start_cluster):
|
||||
node1.query('''
|
||||
CREATE TABLE index_compact(a UInt32, s String)
|
||||
|
Loading…
Reference in New Issue
Block a user