Merge pull request #68837 from CurtizJ/fix-async-insert-alter

Fix async inserts with `ALTER ADD/MODIFY COLUMN`
Anton Popov 2024-09-03 12:37:27 +00:00 committed by GitHub
commit 34c14a6493
8 changed files with 242 additions and 102 deletions
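
Data queued by an asynchronous insert is buffered together with the table structure that existed at queue time. If an ALTER TABLE ... ADD COLUMN or MODIFY COLUMN lands before the queue is flushed, the header of the flush pipeline no longer matches the buffered blocks and the flush could fail. This commit pins the column list of the queued INSERT at queue time and converts buffered blocks to the current header at flush time. A minimal sketch of the scenario, following the tests added here (table name shortened from the test's t_async_insert_alter):

SET wait_for_async_insert = 0, async_insert_use_adaptive_busy_timeout = 0, async_insert_busy_timeout_min_ms = 300000, async_insert_busy_timeout_max_ms = 300000;
CREATE TABLE t (id Int64, v1 Int64) ENGINE = MergeTree ORDER BY id SETTINGS async_insert = 1;
INSERT INTO t VALUES (42, 24);           -- queued in memory, not yet flushed
ALTER TABLE t ADD COLUMN value2 Int64;   -- table structure changes while data is queued
SYSTEM FLUSH ASYNC INSERT QUEUE;         -- flushes into (id, v1); value2 receives its default
SELECT * FROM t;                         -- 42  24  0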

View File

@@ -33,6 +33,8 @@
#include <Common/SensitiveDataMasker.h>
#include <Common/SipHash.h>
#include <Common/logger_useful.h>
#include <Parsers/ASTExpressionList.h>
#include <Parsers/ASTIdentifier.h>
namespace CurrentMetrics
{
@@ -308,6 +310,7 @@ void AsynchronousInsertQueue::preprocessInsertQuery(const ASTPtr & query, const
/* no_squash */ false,
/* no_destination */ false,
/* async_insert */ false);
auto table = interpreter.getTable(insert_query);
auto sample_block = InterpreterInsertQuery::getSampleBlock(insert_query, table, table->getInMemoryMetadataPtr(), query_context);
@@ -318,6 +321,10 @@ void AsynchronousInsertQueue::preprocessInsertQuery(const ASTPtr & query, const
/// InterpreterInsertQuery::getTable() -> ITableFunction::execute().
if (insert_query.table_id)
query_context->checkAccess(AccessType::INSERT, insert_query.table_id, sample_block.getNames());
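/// Pin the current column list in the query AST, so that a later flush
/// inserts into the columns that existed when the data was queued,
/// even if the table is ALTERed in between.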
insert_query.columns = std::make_shared<ASTExpressionList>();
for (const auto & column : sample_block)
insert_query.columns->children.push_back(std::make_shared<ASTIdentifier>(column.name));
}
AsynchronousInsertQueue::PushResult
@@ -696,6 +703,17 @@ catch (...)
tryLogCurrentException("AsynchronousInsertQueue", "Failed to add elements to AsynchronousInsertLog");
}
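/// Converts the block to the structure of the header:
/// columns are matched by name and cast to the header's types.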
void convertBlockToHeader(Block & block, const Block & header)
{
auto converting_dag = ActionsDAG::makeConvertingActions(
block.getColumnsWithTypeAndName(),
header.getColumnsWithTypeAndName(),
ActionsDAG::MatchColumnsMode::Name);
auto converting_actions = std::make_shared<ExpressionActions>(std::move(converting_dag));
converting_actions->execute(block);
}
String serializeQuery(const IAST & query, size_t max_length)
{
return query.hasSecretParts()
@@ -791,6 +809,61 @@ try
if (async_insert_log)
log_elements.reserve(data->entries.size());
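/// The logging helper is defined before the INSERT pipeline is built,
/// so that entries can still be logged if building the pipeline throws
/// (e.g. after an incompatible ALTER).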
auto add_entry_to_asynchronous_insert_log = [&, query_by_format = NameToNameMap{}](
const InsertData::EntryPtr & entry,
const String & parsing_exception,
size_t num_rows,
size_t num_bytes) mutable
{
if (!async_insert_log)
return;
AsynchronousInsertLogElement elem;
elem.event_time = timeInSeconds(entry->create_time);
elem.event_time_microseconds = timeInMicroseconds(entry->create_time);
elem.database = query_database;
elem.table = query_table;
elem.format = entry->format;
elem.query_id = entry->query_id;
elem.bytes = num_bytes;
elem.rows = num_rows;
elem.exception = parsing_exception;
elem.data_kind = entry->chunk.getDataKind();
elem.timeout_milliseconds = data->timeout_ms.count();
elem.flush_query_id = insert_query_id;
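/// Preprocessed entries may carry different formats; serialize the
/// query text once per distinct format and cache it.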
auto get_query_by_format = [&](const String & format) -> const String &
{
auto [it, inserted] = query_by_format.try_emplace(format);
if (!inserted)
return it->second;
auto query = key.query->clone();
assert_cast<ASTInsertQuery &>(*query).format = format;
it->second = serializeQuery(*query, insert_context->getSettingsRef().log_queries_cut_to_length);
return it->second;
};
if (entry->chunk.getDataKind() == DataKind::Parsed)
elem.query_for_logging = key.query_str;
else
elem.query_for_logging = get_query_by_format(entry->format);
/// If there was a parsing error,
/// the entry won't be flushed anyway,
/// so add the log element immediately.
if (!elem.exception.empty())
{
elem.status = AsynchronousInsertLogElement::ParsingError;
async_insert_log->add(std::move(elem));
}
else
{
elem.status = AsynchronousInsertLogElement::Ok;
log_elements.push_back(std::move(elem));
}
};
try
{
interpreter = std::make_unique<InterpreterInsertQuery>(
@@ -819,49 +892,20 @@ try
catch (...)
{
logExceptionBeforeStart(query_for_logging, insert_context, key.query, query_span, start_watch.elapsedMilliseconds());
if (async_insert_log)
{
for (const auto & entry : data->entries)
add_entry_to_asynchronous_insert_log(entry, /*parsing_exception=*/ "", /*num_rows=*/ 0, entry->chunk.byteSize());
auto exception = getCurrentExceptionMessage(false);
auto flush_time = std::chrono::system_clock::now();
appendElementsToLogSafe(*async_insert_log, std::move(log_elements), flush_time, exception);
}
throw;
}
auto add_entry_to_asynchronous_insert_log = [&](const auto & entry,
const auto & entry_query_for_logging,
const auto & exception,
size_t num_rows,
size_t num_bytes,
Milliseconds timeout_ms)
{
if (!async_insert_log)
return;
AsynchronousInsertLogElement elem;
elem.event_time = timeInSeconds(entry->create_time);
elem.event_time_microseconds = timeInMicroseconds(entry->create_time);
elem.query_for_logging = entry_query_for_logging;
elem.database = query_database;
elem.table = query_table;
elem.format = entry->format;
elem.query_id = entry->query_id;
elem.bytes = num_bytes;
elem.rows = num_rows;
elem.exception = exception;
elem.data_kind = entry->chunk.getDataKind();
elem.timeout_milliseconds = timeout_ms.count();
elem.flush_query_id = insert_query_id;
/// If there was a parsing error,
/// the entry won't be flushed anyway,
/// so add the log element immediately.
if (!elem.exception.empty())
{
elem.status = AsynchronousInsertLogElement::ParsingError;
async_insert_log->add(std::move(elem));
}
else
{
log_elements.push_back(elem);
}
};
auto finish_entries = [&]
auto finish_entries = [&](size_t num_rows, size_t num_bytes)
{
for (const auto & entry : data->entries)
{
@@ -874,20 +918,7 @@ try
auto flush_time = std::chrono::system_clock::now();
appendElementsToLogSafe(*async_insert_log, std::move(log_elements), flush_time, "");
}
};
Chunk chunk;
auto header = pipeline.getHeader();
if (key.data_kind == DataKind::Parsed)
chunk = processEntriesWithParsing(key, data, header, insert_context, log, add_entry_to_asynchronous_insert_log);
else
chunk = processPreprocessedEntries(key, data, header, insert_context, add_entry_to_asynchronous_insert_log);
ProfileEvents::increment(ProfileEvents::AsyncInsertRows, chunk.getNumRows());
auto log_and_add_finish_to_query_log = [&](size_t num_rows, size_t num_bytes)
{
LOG_DEBUG(log, "Flushed {} rows, {} bytes for query '{}'", num_rows, num_bytes, key.query_str);
queue_shard_flush_time_history.updateWithCurrentTime();
@@ -896,16 +927,24 @@ try
query_log_elem, insert_context, key.query, pipeline, pulling_pipeline, query_span, QueryCache::Usage::None, internal);
};
if (chunk.getNumRows() == 0)
{
finish_entries();
log_and_add_finish_to_query_log(0, 0);
return;
}
try
{
Chunk chunk;
auto header = pipeline.getHeader();
if (key.data_kind == DataKind::Parsed)
chunk = processEntriesWithParsing(key, data, header, insert_context, log, add_entry_to_asynchronous_insert_log);
else
chunk = processPreprocessedEntries(data, header, add_entry_to_asynchronous_insert_log);
ProfileEvents::increment(ProfileEvents::AsyncInsertRows, chunk.getNumRows());
if (chunk.getNumRows() == 0)
{
finish_entries(/*num_rows=*/ 0, /*num_bytes=*/ 0);
return;
}
size_t num_rows = chunk.getNumRows();
size_t num_bytes = chunk.bytes();
@@ -915,7 +954,7 @@ try
CompletedPipelineExecutor completed_executor(pipeline);
completed_executor.execute();
log_and_add_finish_to_query_log(num_rows, num_bytes);
finish_entries(num_rows, num_bytes);
}
catch (...)
{
@@ -929,8 +968,6 @@ try
}
throw;
}
finish_entries();
}
catch (const Exception & e)
{
@@ -991,7 +1028,6 @@ Chunk AsynchronousInsertQueue::processEntriesWithParsing(
StreamingFormatExecutor executor(header, format, std::move(on_error), std::move(adding_defaults_transform));
auto chunk_info = std::make_shared<AsyncInsertInfo>();
auto query_for_logging = serializeQuery(*key.query, insert_context->getSettingsRef().log_queries_cut_to_length);
for (const auto & entry : data->entries)
{
@@ -1009,7 +1045,8 @@ Chunk AsynchronousInsertQueue::processEntriesWithParsing(
size_t num_rows = executor.execute(*buffer);
total_rows += num_rows;
/// for some reason, client can pass zero rows and bytes to server.
/// For some reason, client can pass zero rows and bytes to server.
/// We don't update offsets in this case, because we assume every insert has some rows during dedup
/// but we have nothing to deduplicate for this insert.
if (num_rows > 0)
@@ -1018,8 +1055,7 @@ Chunk AsynchronousInsertQueue::processEntriesWithParsing(
chunk_info->tokens.push_back(entry->async_dedup_token);
}
add_to_async_insert_log(entry, query_for_logging, current_exception, num_rows, num_bytes, data->timeout_ms);
add_to_async_insert_log(entry, current_exception, num_rows, num_bytes);
current_exception.clear();
entry->resetChunk();
}
@@ -1031,30 +1067,14 @@ Chunk AsynchronousInsertQueue::processEntriesWithParsing(
template <typename LogFunc>
Chunk AsynchronousInsertQueue::processPreprocessedEntries(
const InsertQuery & key,
const InsertDataPtr & data,
const Block & header,
const ContextPtr & insert_context,
LogFunc && add_to_async_insert_log)
{
size_t total_rows = 0;
auto chunk_info = std::make_shared<AsyncInsertInfo>();
auto result_columns = header.cloneEmptyColumns();
std::unordered_map<String, String> format_to_query;
auto get_query_by_format = [&](const String & format) -> const String &
{
auto [it, inserted] = format_to_query.try_emplace(format);
if (!inserted)
return it->second;
auto query = key.query->clone();
assert_cast<ASTInsertQuery &>(*query).format = format;
it->second = serializeQuery(*query, insert_context->getSettingsRef().log_queries_cut_to_length);
return it->second;
};
for (const auto & entry : data->entries)
{
const auto * block = entry->chunk.asBlock();
@@ -1062,23 +1082,26 @@ Chunk AsynchronousInsertQueue::processPreprocessedEntries(
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Expected entry with data kind Preprocessed. Got: {}", entry->chunk.getDataKind());
auto columns = block->getColumns();
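/// The entry may have been queued before an ALTER: its block can differ
/// from the current pipeline header, so convert it by column name if needed.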
Block block_to_insert = *block;
if (!isCompatibleHeader(block_to_insert, header))
convertBlockToHeader(block_to_insert, header);
auto columns = block_to_insert.getColumns();
for (size_t i = 0, s = columns.size(); i < s; ++i)
result_columns[i]->insertRangeFrom(*columns[i], 0, columns[i]->size());
total_rows += block->rows();
/// for some reason, client can pass zero rows and bytes to server.
total_rows += block_to_insert.rows();
/// For some reason, client can pass zero rows and bytes to server.
/// We don't update offsets in this case, because we assume every insert has some rows during dedup,
/// but we have nothing to deduplicate for this insert.
if (block->rows())
if (block_to_insert.rows() > 0)
{
chunk_info->offsets.push_back(total_rows);
chunk_info->tokens.push_back(entry->async_dedup_token);
}
const auto & query_for_logging = get_query_by_format(entry->format);
add_to_async_insert_log(entry, query_for_logging, "", block->rows(), block->bytes(), data->timeout_ms);
add_to_async_insert_log(entry, /*parsing_exception=*/ "", block_to_insert.rows(), block_to_insert.bytes());
entry->resetChunk();
}

View File

@@ -288,10 +288,8 @@ private:
template <typename LogFunc>
static Chunk processPreprocessedEntries(
const InsertQuery & key,
const InsertDataPtr & data,
const Block & header,
const ContextPtr & insert_context,
LogFunc && add_to_async_insert_log);
template <typename E>

View File

@@ -9,7 +9,7 @@ written_rows: 0
written_bytes: 0
result_rows: 0
result_bytes: 0
query: INSERT INTO default.async_insert_landing SETTINGS wait_for_async_insert = 1, async_insert = 1 FORMAT Values
query: INSERT INTO default.async_insert_landing (id) SETTINGS wait_for_async_insert = 1, async_insert = 1 FORMAT Values
query_kind: AsyncInsertFlush
databases: ['default']
tables: ['default.async_insert_landing']
@@ -26,7 +26,7 @@ written_rows: 4
written_bytes: 16
result_rows: 4
result_bytes: 16
query: INSERT INTO default.async_insert_landing SETTINGS wait_for_async_insert = 1, async_insert = 1 FORMAT Values
query: INSERT INTO default.async_insert_landing (id) SETTINGS wait_for_async_insert = 1, async_insert = 1 FORMAT Values
query_kind: AsyncInsertFlush
databases: ['default']
tables: ['default.async_insert_landing']
@@ -54,7 +54,7 @@ written_rows: 0
written_bytes: 0
result_rows: 0
result_bytes: 0
query: INSERT INTO default.async_insert_landing SETTINGS wait_for_async_insert = 1, async_insert = 1 FORMAT Values
query: INSERT INTO default.async_insert_landing (id) SETTINGS wait_for_async_insert = 1, async_insert = 1 FORMAT Values
query_kind: AsyncInsertFlush
databases: ['default']
tables: ['default.async_insert_landing','default.async_insert_target']
@@ -71,7 +71,7 @@ written_rows: 6
written_bytes: 24
result_rows: 6
result_bytes: 24
query: INSERT INTO default.async_insert_landing SETTINGS wait_for_async_insert = 1, async_insert = 1 FORMAT Values
query: INSERT INTO default.async_insert_landing (id) SETTINGS wait_for_async_insert = 1, async_insert = 1 FORMAT Values
query_kind: AsyncInsertFlush
databases: ['default']
tables: ['default.async_insert_landing','default.async_insert_target']
@@ -118,7 +118,7 @@ written_rows: 0
written_bytes: 0
result_rows: 0
result_bytes: 0
query: INSERT INTO default.async_insert_landing SETTINGS wait_for_async_insert = 1, async_insert = 1 FORMAT Values
query: INSERT INTO default.async_insert_landing (id) SETTINGS wait_for_async_insert = 1, async_insert = 1 FORMAT Values
query_kind: AsyncInsertFlush
databases: ['default']
tables: ['default.async_insert_landing','default.async_insert_target']
@@ -135,7 +135,7 @@ written_rows: 3
written_bytes: 12
result_rows: 0
result_bytes: 0
query: INSERT INTO default.async_insert_landing SETTINGS wait_for_async_insert = 1, async_insert = 1 FORMAT Values
query: INSERT INTO default.async_insert_landing (id) SETTINGS wait_for_async_insert = 1, async_insert = 1 FORMAT Values
query_kind: AsyncInsertFlush
databases: ['default']
tables: ['default.async_insert_landing','default.async_insert_target']

View File

@@ -4,7 +4,7 @@ Row 1:
──────
database: default
table: async_insert_landing
query: INSERT INTO default.async_insert_landing SETTINGS wait_for_async_insert = 0, async_insert = 1 FORMAT Values
query: INSERT INTO default.async_insert_landing (id) SETTINGS wait_for_async_insert = 0, async_insert = 1 FORMAT Values
format: Values
error: DB::Exc*****on: Cannot parse string 'Invalid' as UInt32:
populated_flush_query_id: 1
@@ -18,7 +18,7 @@ written_rows: 0
written_bytes: 0
result_rows: 0
result_bytes: 0
query: INSERT INTO default.async_insert_landing SETTINGS wait_for_async_insert = 0, async_insert = 1 FORMAT Values
query: INSERT INTO default.async_insert_landing (id) SETTINGS wait_for_async_insert = 0, async_insert = 1 FORMAT Values
query_kind: AsyncInsertFlush
databases: ['default']
tables: ['default.async_insert_landing']
@@ -35,7 +35,7 @@ written_rows: 0
written_bytes: 0
result_rows: 0
result_bytes: 0
query: INSERT INTO default.async_insert_landing SETTINGS wait_for_async_insert = 0, async_insert = 1 FORMAT Values
query: INSERT INTO default.async_insert_landing (id) SETTINGS wait_for_async_insert = 0, async_insert = 1 FORMAT Values
query_kind: AsyncInsertFlush
databases: ['default']
tables: ['default.async_insert_landing']

View File

@@ -0,0 +1,8 @@
42 24 0
42 24 0
43 34 55
42 24
43 34
INSERT INTO default.t_async_insert_alter (id, v1) FORMAT Values Preprocessed Ok
INSERT INTO default.t_async_insert_alter (id, v1, value2) FORMAT Values Preprocessed Ok
INSERT INTO default.t_async_insert_alter (id, v1, value2) FORMAT Values Preprocessed FlushError

View File

@@ -0,0 +1,47 @@
-- Tags: no-parallel
-- no-parallel because the test uses FLUSH ASYNC INSERT QUEUE
SET wait_for_async_insert = 0;
SET async_insert_busy_timeout_max_ms = 300000;
SET async_insert_busy_timeout_min_ms = 300000;
SET async_insert_use_adaptive_busy_timeout = 0;
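-- With the adaptive timeout disabled and large fixed timeouts, entries stay
-- queued until the explicit SYSTEM FLUSH ASYNC INSERT QUEUE below.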
DROP TABLE IF EXISTS t_async_insert_alter;
CREATE TABLE t_async_insert_alter (id Int64, v1 Int64) ENGINE = MergeTree ORDER BY id SETTINGS async_insert = 1;
-- ADD COLUMN
INSERT INTO t_async_insert_alter VALUES (42, 24);
ALTER TABLE t_async_insert_alter ADD COLUMN value2 Int64;
SYSTEM FLUSH ASYNC INSERT QUEUE;
SYSTEM FLUSH LOGS;
SELECT * FROM t_async_insert_alter ORDER BY id;
-- MODIFY COLUMN
INSERT INTO t_async_insert_alter VALUES (43, 34, 55);
ALTER TABLE t_async_insert_alter MODIFY COLUMN value2 String;
SYSTEM FLUSH ASYNC INSERT QUEUE;
SYSTEM FLUSH LOGS;
SELECT * FROM t_async_insert_alter ORDER BY id;
-- DROP COLUMN
INSERT INTO t_async_insert_alter VALUES ('100', '200', '300');
ALTER TABLE t_async_insert_alter DROP COLUMN value2;
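-- The queued insert was pinned to columns (id, v1, value2), so after DROP COLUMN
-- the flush is expected to fail and be logged with status FlushError.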
SYSTEM FLUSH ASYNC INSERT QUEUE;
SYSTEM FLUSH LOGS;
SELECT * FROM t_async_insert_alter ORDER BY id;
SELECT query, data_kind, status FROM system.asynchronous_insert_log WHERE database = currentDatabase() AND table = 't_async_insert_alter' ORDER BY event_time_microseconds;
DROP TABLE t_async_insert_alter;

View File

@@ -0,0 +1,8 @@
42 24 0
42 24 0
43 34 55
42 24
43 34
INSERT INTO default.t_async_insert_alter (id, v1) FORMAT Values Parsed Ok
INSERT INTO default.t_async_insert_alter (id, v1, value2) FORMAT Values Parsed Ok
INSERT INTO default.t_async_insert_alter (id, v1, value2) FORMAT Values Parsed FlushError

View File

@@ -0,0 +1,56 @@
#!/usr/bin/env bash
# Tags: no-parallel
# no-parallel because the test uses FLUSH ASYNC INSERT QUEUE
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh
$CLICKHOUSE_CLIENT -q "
DROP TABLE IF EXISTS t_async_insert_alter;
CREATE TABLE t_async_insert_alter (id Int64, v1 Int64) ENGINE = MergeTree ORDER BY id SETTINGS async_insert = 1;
"
url="${CLICKHOUSE_URL}&async_insert=1&async_insert_busy_timeout_max_ms=300000&async_insert_busy_timeout_min_ms=300000&wait_for_async_insert=0&async_insert_use_adaptive_busy_timeout=0"
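# Large fixed busy timeouts with the adaptive timeout disabled keep entries
# queued until the explicit SYSTEM FLUSH ASYNC INSERT QUEUE.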
# ADD COLUMN
${CLICKHOUSE_CURL} -sS "$url" -d "INSERT INTO t_async_insert_alter VALUES (42, 24)"
$CLICKHOUSE_CLIENT -q "
ALTER TABLE t_async_insert_alter ADD COLUMN value2 Int64;
SYSTEM FLUSH ASYNC INSERT QUEUE;
SYSTEM FLUSH LOGS;
SELECT * FROM t_async_insert_alter ORDER BY id;
"
# MODIFY COLUMN
${CLICKHOUSE_CURL} -sS "$url" -d "INSERT INTO t_async_insert_alter VALUES (43, 34, 55)"
$CLICKHOUSE_CLIENT -q "
ALTER TABLE t_async_insert_alter MODIFY COLUMN value2 String;
SYSTEM FLUSH ASYNC INSERT QUEUE;
SYSTEM FLUSH LOGS;
SELECT * FROM t_async_insert_alter ORDER BY id;
"
# DROP COLUMN
${CLICKHOUSE_CURL} -sS "$url" -d "INSERT INTO t_async_insert_alter VALUES ('100', '200', '300')"
$CLICKHOUSE_CLIENT -q "
ALTER TABLE t_async_insert_alter DROP COLUMN value2;
SYSTEM FLUSH ASYNC INSERT QUEUE;
SYSTEM FLUSH LOGS;
SELECT * FROM t_async_insert_alter ORDER BY id;
SELECT query, data_kind, status FROM system.asynchronous_insert_log WHERE database = currentDatabase() AND table = 't_async_insert_alter' ORDER BY event_time_microseconds;
DROP TABLE t_async_insert_alter;
"