Merge branch 'master' into fix-sparse-columns-crash

This commit is contained in:
Anton Popov 2023-02-03 21:00:02 +01:00 committed by GitHub
commit 20ebd38242
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
24 changed files with 106 additions and 47 deletions

View File

@ -13,15 +13,28 @@ sysctl kernel.core_pattern='core.%e.%p-%P'
OK="\tOK\t\\N\t"
FAIL="\tFAIL\t\\N\t"
FAILURE_CONTEXT_LINES=50
FAILURE_CONTEXT_MAX_LINE_WIDTH=400
function escaped()
{
# That's the simplest way I found to escape a string in bash. Yep, bash is the most convenient programming language.
clickhouse local -S 's String' --input-format=LineAsString -q "select * from table format CustomSeparated settings format_custom_row_after_delimiter='\\\\\\\\n'"
# Also limit lines width just in case (too long lines are not really useful usually)
clickhouse local -S 's String' --input-format=LineAsString -q "select substr(s, 1, $FAILURE_CONTEXT_MAX_LINE_WIDTH)
from table format CustomSeparated settings format_custom_row_after_delimiter='\\\\\\\\n'"
}
function head_escaped()
{
head -50 $1 | escaped
head -n $FAILURE_CONTEXT_LINES $1 | escaped
}
function unts()
{
grep -Po "[0-9][0-9]:[0-9][0-9] \K.*"
}
function trim_server_logs()
{
head -n $FAILURE_CONTEXT_LINES "/test_output/$1" | grep -Eo " \[ [0-9]+ \] \{.*" | escaped
}
function install_packages()
@ -167,7 +180,7 @@ function start()
then
echo "Cannot start clickhouse-server"
rg --text "<Error>.*Application" /var/log/clickhouse-server/clickhouse-server.log > /test_output/application_errors.txt ||:
echo -e "Cannot start clickhouse-server$FAIL$(head_escaped /test_output/application_errors.txt)" >> /test_output/test_results.tsv
echo -e "Cannot start clickhouse-server$FAIL$(trim_server_logs application_errors.txt)" >> /test_output/test_results.tsv
cat /var/log/clickhouse-server/stdout.log
tail -n100 /var/log/clickhouse-server/stderr.log
tail -n100000 /var/log/clickhouse-server/clickhouse-server.log | rg -F -v -e '<Warning> RaftInstance:' -e '<Information> RaftInstance' | tail -n100
@ -389,7 +402,7 @@ start
# NOTE Hung check is implemented in docker/tests/stress/stress
rg -Fa "No queries hung" /test_output/test_results.tsv | grep -Fa "OK" \
|| echo -e "Hung check failed, possible deadlock found (see hung_check.log)$FAIL$(head_escaped /test_output/hung_check.log)"
|| echo -e "Hung check failed, possible deadlock found (see hung_check.log)$FAIL$(head_escaped /test_output/hung_check.log | unts)"
stop
mv /var/log/clickhouse-server/clickhouse-server.log /var/log/clickhouse-server/clickhouse-server.stress.log
@ -402,7 +415,7 @@ start
clickhouse-client --query "SELECT 'Server successfully started', 'OK', NULL, ''" >> /test_output/test_results.tsv \
|| (rg --text "<Error>.*Application" /var/log/clickhouse-server/clickhouse-server.log > /test_output/application_errors.txt \
&& echo -e "Server failed to start (see application_errors.txt and clickhouse-server.clean.log)$FAIL$(head_escaped /test_output/application_errors.txt)" \
&& echo -e "Server failed to start (see application_errors.txt and clickhouse-server.clean.log)$FAIL$(trim_server_logs application_errors.txt)" \
>> /test_output/test_results.tsv)
stop
@ -435,7 +448,7 @@ rg -Fa "Code: 49. DB::Exception: " /var/log/clickhouse-server/clickhouse-server*
# No such key errors
rg --text "Code: 499.*The specified key does not exist" /var/log/clickhouse-server/clickhouse-server*.log > /test_output/no_such_key_errors.txt \
&& echo -e "S3_ERROR No such key thrown (see clickhouse-server.log or no_such_key_errors.txt)$FAIL$(head_escaped /test_output/no_such_key_errors.txt)" >> /test_output/test_results.tsv \
&& echo -e "S3_ERROR No such key thrown (see clickhouse-server.log or no_such_key_errors.txt)$FAIL$(trim_server_logs no_such_key_errors.txt)" >> /test_output/test_results.tsv \
|| echo -e "No lost s3 keys$OK" >> /test_output/test_results.tsv
# Remove file no_such_key_errors.txt if it's empty
@ -448,7 +461,7 @@ rg -Fa "########################################" /var/log/clickhouse-server/cli
# It also checks for crash without stacktrace (printed by watchdog)
rg -Fa " <Fatal> " /var/log/clickhouse-server/clickhouse-server*.log > /test_output/fatal_messages.txt \
&& echo -e "Fatal message in clickhouse-server.log (see fatal_messages.txt)$FAIL$(head_escaped /test_output/fatal_messages.txt)" >> /test_output/test_results.tsv \
&& echo -e "Fatal message in clickhouse-server.log (see fatal_messages.txt)$FAIL$(trim_server_logs fatal_messages.txt)" >> /test_output/test_results.tsv \
|| echo -e "No fatal messages in clickhouse-server.log$OK" >> /test_output/test_results.tsv
# Remove file fatal_messages.txt if it's empty
@ -457,8 +470,13 @@ rg -Fa " <Fatal> " /var/log/clickhouse-server/clickhouse-server*.log > /test_out
rg -Fa "########################################" /test_output/* > /dev/null \
&& echo -e "Killed by signal (output files)$FAIL" >> /test_output/test_results.tsv
function get_gdb_log_context()
{
rg -A50 -Fa " received signal " /test_output/gdb.log | head_escaped
}
rg -Fa " received signal " /test_output/gdb.log > /dev/null \
&& echo -e "Found signal in gdb.log$FAIL$(rg -A50 -Fa " received signal " /test_output/gdb.log | escaped)" >> /test_output/test_results.tsv
&& echo -e "Found signal in gdb.log$FAIL$(get_gdb_log_context)" >> /test_output/test_results.tsv
if [ "$DISABLE_BC_CHECK" -ne "1" ]; then
echo -e "Backward compatibility check\n"
@ -579,7 +597,7 @@ if [ "$DISABLE_BC_CHECK" -ne "1" ]; then
start 500
clickhouse-client --query "SELECT 'Backward compatibility check: Server successfully started', 'OK', NULL, ''" >> /test_output/test_results.tsv \
|| (rg --text "<Error>.*Application" /var/log/clickhouse-server/clickhouse-server.log >> /test_output/bc_check_application_errors.txt \
&& echo -e "Backward compatibility check: Server failed to start$FAIL$(head_escaped /test_output/bc_check_application_errors.txt)" >> /test_output/test_results.tsv)
&& echo -e "Backward compatibility check: Server failed to start$FAIL$(trim_server_logs bc_check_application_errors.txt)" >> /test_output/test_results.tsv)
clickhouse-client --query="SELECT 'Server version: ', version()"
@ -634,7 +652,7 @@ if [ "$DISABLE_BC_CHECK" -ne "1" ]; then
-e "Session expired" \
-e "TOO_MANY_PARTS" \
/var/log/clickhouse-server/clickhouse-server.backward.dirty.log | rg -Fa "<Error>" > /test_output/bc_check_error_messages.txt \
&& echo -e "Backward compatibility check: Error message in clickhouse-server.log (see bc_check_error_messages.txt)$FAIL$(head_escaped /test_output/bc_check_error_messages.txt)" \
&& echo -e "Backward compatibility check: Error message in clickhouse-server.log (see bc_check_error_messages.txt)$FAIL$(trim_server_logs bc_check_error_messages.txt)" \
>> /test_output/test_results.tsv \
|| echo -e "Backward compatibility check: No Error messages in clickhouse-server.log$OK" >> /test_output/test_results.tsv
@ -657,7 +675,7 @@ if [ "$DISABLE_BC_CHECK" -ne "1" ]; then
# Logical errors
echo "Check for Logical errors in server log:"
rg -Fa -A20 "Code: 49. DB::Exception:" /var/log/clickhouse-server/clickhouse-server.backward.*.log > /test_output/bc_check_logical_errors.txt \
&& echo -e "Backward compatibility check: Logical error thrown (see clickhouse-server.log or bc_check_logical_errors.txt)$FAIL$(head_escaped /test_output/bc_check_logical_errors.txt)" \
&& echo -e "Backward compatibility check: Logical error thrown (see clickhouse-server.log or bc_check_logical_errors.txt)$FAIL$(trim_server_logs bc_check_logical_errors.txt)" \
>> /test_output/test_results.tsv \
|| echo -e "Backward compatibility check: No logical errors$OK" >> /test_output/test_results.tsv
@ -672,7 +690,7 @@ if [ "$DISABLE_BC_CHECK" -ne "1" ]; then
# It also checks for crash without stacktrace (printed by watchdog)
echo "Check for Fatal message in server log:"
rg -Fa " <Fatal> " /var/log/clickhouse-server/clickhouse-server.backward.*.log > /test_output/bc_check_fatal_messages.txt \
&& echo -e "Backward compatibility check: Fatal message in clickhouse-server.log (see bc_check_fatal_messages.txt)$FAIL$(head_escaped /test_output/bc_check_fatal_messages.txt)" \
&& echo -e "Backward compatibility check: Fatal message in clickhouse-server.log (see bc_check_fatal_messages.txt)$FAIL$(trim_server_logs bc_check_fatal_messages.txt)" \
>> /test_output/test_results.tsv \
|| echo -e "Backward compatibility check: No fatal messages in clickhouse-server.log$OK" >> /test_output/test_results.tsv

View File

@ -85,8 +85,16 @@ def process_test_log(log_path):
if DATABASE_SIGN in line:
test_end = True
# Python does not support TSV, so we have to escape '\t' and '\n' manually
# and hope that complex escape sequences will not break anything
test_results = [
(test[0], test[1], test[2], "".join(test[3])[:4096]) for test in test_results
(
test[0],
test[1],
test[2],
"".join(test[3])[:4096].replace("\t", "\\t").replace("\n", "\\n"),
)
for test in test_results
]
return (

View File

@ -21,14 +21,14 @@ For example, you cant compare a date with a string. You have to use a functio
Strings are compared by bytes. A shorter string is smaller than all strings that start with it and that contain at least one more character.
## equals, a = b and a == b operator
### equals, a `=` b and a `==` b operator
## notEquals, a != b and a \<\> b operator
### notEquals, a `!=` b and a `<>` b operator
## less, \< operator
### less, `<` operator
## greater, \> operator
### greater, `>` operator
## lessOrEquals, \<= operator
### lessOrEquals, `<=` operator
## greaterOrEquals, \>= operator
### greaterOrEquals, `>=` operator

View File

@ -559,9 +559,9 @@ bool ExecutionStatus::tryDeserializeText(const std::string & data)
return true;
}
ExecutionStatus ExecutionStatus::fromCurrentException(const std::string & start_of_message)
ExecutionStatus ExecutionStatus::fromCurrentException(const std::string & start_of_message, bool with_stacktrace)
{
String msg = (start_of_message.empty() ? "" : (start_of_message + ": ")) + getCurrentExceptionMessage(false, true);
String msg = (start_of_message.empty() ? "" : (start_of_message + ": ")) + getCurrentExceptionMessage(with_stacktrace, true);
return ExecutionStatus(getCurrentExceptionCode(), msg);
}

View File

@ -242,7 +242,7 @@ struct ExecutionStatus
explicit ExecutionStatus(int return_code, const std::string & exception_message = "")
: code(return_code), message(exception_message) {}
static ExecutionStatus fromCurrentException(const std::string & start_of_message = "");
static ExecutionStatus fromCurrentException(const std::string & start_of_message = "", bool with_stacktrace = false);
static ExecutionStatus fromText(const std::string & data);

View File

@ -56,10 +56,9 @@ public:
return true;
}
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override
{
return {1};
}
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; }
bool useDefaultImplementationForNulls() const override { return false; }
bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; }

View File

@ -335,7 +335,7 @@ bool MergeFromLogEntryTask::finalize(ReplicatedMergeMutateTaskBase::PartLogWrite
" We will download merged part from replica to force byte-identical result.",
getCurrentExceptionMessage(false));
write_part_log(ExecutionStatus::fromCurrentException());
write_part_log(ExecutionStatus::fromCurrentException("", true));
if (storage.getSettings()->detach_not_byte_identical_parts)
storage.forcefullyMovePartToDetachedAndRemoveFromMemory(std::move(part), "merge-not-byte-identical");

View File

@ -52,7 +52,7 @@ bool MergePlainMergeTreeTask::executeStep()
}
catch (...)
{
write_part_log(ExecutionStatus::fromCurrentException());
write_part_log(ExecutionStatus::fromCurrentException("", true));
throw;
}
}

View File

@ -7504,7 +7504,7 @@ bool MergeTreeData::moveParts(const CurrentlyMovingPartsTaggerPtr & moving_tagge
}
catch (...)
{
write_part_log(ExecutionStatus::fromCurrentException());
write_part_log(ExecutionStatus::fromCurrentException("", true));
if (cloned_part)
cloned_part->remove();

View File

@ -215,7 +215,7 @@ bool MutateFromLogEntryTask::finalize(ReplicatedMergeMutateTaskBase::PartLogWrit
LOG_ERROR(log, "{}. Data after mutation is not byte-identical to data on another replicas. "
"We will download merged part from replica to force byte-identical result.", getCurrentExceptionMessage(false));
write_part_log(ExecutionStatus::fromCurrentException());
write_part_log(ExecutionStatus::fromCurrentException("", true));
if (storage.getSettings()->detach_not_byte_identical_parts)
storage.forcefullyMovePartToDetachedAndRemoveFromMemory(std::move(new_part), "mutate-not-byte-identical");

View File

@ -103,7 +103,7 @@ bool MutatePlainMergeTreeTask::executeStep()
PreformattedMessage exception_message = getCurrentExceptionMessageAndPattern(/* with_stacktrace */ false);
LOG_ERROR(&Poco::Logger::get("MutatePlainMergeTreeTask"), exception_message);
storage.updateMutationEntriesErrors(future_part, false, exception_message.text);
write_part_log(ExecutionStatus::fromCurrentException());
write_part_log(ExecutionStatus::fromCurrentException("", true));
tryLogCurrentException(__PRETTY_FUNCTION__);
return false;
}

View File

@ -91,6 +91,7 @@ bool ReplicatedMergeMutateTaskBase::executeStep()
auto & log_entry = selected_entry->log_entry;
log_entry->exception = saved_exception;
log_entry->last_exception_time = time(nullptr);
if (log_entry->type == ReplicatedMergeTreeLogEntryData::MUTATE_PART)
{
@ -188,7 +189,7 @@ bool ReplicatedMergeMutateTaskBase::executeImpl()
catch (...)
{
if (part_log_writer)
part_log_writer(ExecutionStatus::fromCurrentException());
part_log_writer(ExecutionStatus::fromCurrentException("", true));
throw;
}
@ -204,7 +205,7 @@ bool ReplicatedMergeMutateTaskBase::executeImpl()
catch (...)
{
if (part_log_writer)
part_log_writer(ExecutionStatus::fromCurrentException());
part_log_writer(ExecutionStatus::fromCurrentException("", true));
throw;
}

View File

@ -157,6 +157,7 @@ struct ReplicatedMergeTreeLogEntryData
/// Access under queue_mutex, see ReplicatedMergeTreeQueue.
size_t num_tries = 0; /// The number of attempts to perform the action (since the server started, including the running one).
std::exception_ptr exception; /// The last exception, in the case of an unsuccessful attempt to perform the action.
time_t last_exception_time = 0; /// The time at which the last exception occurred.
time_t last_attempt_time = 0; /// The time at which the last attempt was attempted to complete the action.
size_t num_postponed = 0; /// The number of times the action was postponed.
String postpone_reason; /// The reason why the action was postponed, if it was postponed.

View File

@ -1692,6 +1692,7 @@ bool ReplicatedMergeTreeQueue::processEntry(
{
std::lock_guard lock(state_mutex);
entry->exception = saved_exception;
entry->last_exception_time = time(nullptr);
return false;
}

View File

@ -520,7 +520,7 @@ void ReplicatedMergeTreeSinkImpl<false>::finishDelayedChunk(const ZooKeeperWithF
}
catch (...)
{
PartLog::addNewPart(storage.getContext(), part, partition.elapsed_ns, ExecutionStatus::fromCurrentException(__PRETTY_FUNCTION__));
PartLog::addNewPart(storage.getContext(), part, partition.elapsed_ns, ExecutionStatus::fromCurrentException("", true));
throw;
}
}
@ -588,7 +588,7 @@ void ReplicatedMergeTreeSinkImpl<async_insert>::writeExistingPart(MergeTreeData:
}
catch (...)
{
PartLog::addNewPart(storage.getContext(), part, watch.elapsed(), ExecutionStatus::fromCurrentException(__PRETTY_FUNCTION__));
PartLog::addNewPart(storage.getContext(), part, watch.elapsed(), ExecutionStatus::fromCurrentException("", true));
throw;
}
}

View File

@ -1876,7 +1876,7 @@ void StorageMergeTree::replacePartitionFrom(const StoragePtr & source_table, con
}
catch (...)
{
PartLog::addNewParts(getContext(), dst_parts, watch.elapsed(), ExecutionStatus::fromCurrentException());
PartLog::addNewParts(getContext(), dst_parts, watch.elapsed(), ExecutionStatus::fromCurrentException("", true));
throw;
}
}
@ -1959,7 +1959,7 @@ void StorageMergeTree::movePartitionToTable(const StoragePtr & dest_table, const
}
catch (...)
{
PartLog::addNewParts(getContext(), dst_parts, watch.elapsed(), ExecutionStatus::fromCurrentException());
PartLog::addNewParts(getContext(), dst_parts, watch.elapsed(), ExecutionStatus::fromCurrentException("", true));
throw;
}
}

View File

@ -2340,7 +2340,7 @@ bool StorageReplicatedMergeTree::executeReplaceRange(const LogEntry & entry)
}
catch (...)
{
PartLog::addNewParts(getContext(), res_parts, watch.elapsed(), ExecutionStatus::fromCurrentException());
PartLog::addNewParts(getContext(), res_parts, watch.elapsed(), ExecutionStatus::fromCurrentException("", true));
for (const auto & res_part : res_parts)
unlockSharedData(*res_part);
@ -4193,7 +4193,7 @@ bool StorageReplicatedMergeTree::fetchPart(
catch (...)
{
if (!to_detached)
write_part_log(ExecutionStatus::fromCurrentException());
write_part_log(ExecutionStatus::fromCurrentException("", true));
throw;
}
@ -4303,7 +4303,7 @@ MutableDataPartStoragePtr StorageReplicatedMergeTree::fetchExistsPart(
}
catch (...)
{
write_part_log(ExecutionStatus::fromCurrentException());
write_part_log(ExecutionStatus::fromCurrentException("", true));
throw;
}
@ -7068,7 +7068,7 @@ void StorageReplicatedMergeTree::replacePartitionFrom(
}
catch (...)
{
PartLog::addNewParts(getContext(), dst_parts, watch.elapsed(), ExecutionStatus::fromCurrentException());
PartLog::addNewParts(getContext(), dst_parts, watch.elapsed(), ExecutionStatus::fromCurrentException("", true));
for (const auto & dst_part : dst_parts)
unlockSharedData(*dst_part);
@ -7297,7 +7297,7 @@ void StorageReplicatedMergeTree::movePartitionToTable(const StoragePtr & dest_ta
}
catch (...)
{
PartLog::addNewParts(getContext(), dst_parts, watch.elapsed(), ExecutionStatus::fromCurrentException());
PartLog::addNewParts(getContext(), dst_parts, watch.elapsed(), ExecutionStatus::fromCurrentException("", true));
for (const auto & dst_part : dst_parts)
dest_table_storage->unlockSharedData(*dst_part);

View File

@ -38,6 +38,7 @@ NamesAndTypesList StorageSystemReplicationQueue::getNamesAndTypes()
{ "is_currently_executing", std::make_shared<DataTypeUInt8>() },
{ "num_tries", std::make_shared<DataTypeUInt32>() },
{ "last_exception", std::make_shared<DataTypeString>() },
{ "last_exception_time", std::make_shared<DataTypeDateTime>() },
{ "last_attempt_time", std::make_shared<DataTypeDateTime>() },
{ "num_postponed", std::make_shared<DataTypeUInt32>() },
{ "postpone_reason", std::make_shared<DataTypeString>() },
@ -141,7 +142,8 @@ void StorageSystemReplicationQueue::fillData(MutableColumns & res_columns, Conte
res_columns[col_num++]->insert(entry.detach);
res_columns[col_num++]->insert(entry.currently_executing);
res_columns[col_num++]->insert(entry.num_tries);
res_columns[col_num++]->insert(entry.exception ? getExceptionMessage(entry.exception, false) : "");
res_columns[col_num++]->insert(entry.exception ? getExceptionMessage(entry.exception, true) : "");
res_columns[col_num++]->insert(UInt64(entry.last_exception_time));
res_columns[col_num++]->insert(UInt64(entry.last_attempt_time));
res_columns[col_num++]->insert(entry.num_postponed);
res_columns[col_num++]->insert(entry.postpone_reason);

View File

@ -183,7 +183,11 @@ def prepare_tests_results_for_clickhouse(
current_row["test_duration_ms"] = int(test_time * 1000)
current_row["test_name"] = test_name
current_row["test_status"] = test_status
current_row["test_context_raw"] = test_result.raw_logs or ""
if test_result.raw_logs:
# Protect from too big blobs that contain garbage
current_row["test_context_raw"] = test_result.raw_logs[: 32 * 1024]
else:
current_row["test_context_raw"] = ""
result.append(current_row)
return result

View File

@ -224,7 +224,10 @@ def read_test_results(results_path: Path, with_raw_logs: bool = True) -> TestRes
# The value can be emtpy, but when it's not,
# the 4th value is a pythonic list, e.g. ['file1', 'file2']
if with_raw_logs:
result.set_raw_logs(line[3])
# Python does not support TSV, so we unescape manually
result.set_raw_logs(
line[3].replace("\\t", "\t").replace("\\n", "\n")
)
else:
result.set_log_files(line[3])

View File

@ -58,7 +58,7 @@ select min(dt), max(dt), count(toDate(dt) >= '2021-10-25') from d where toDate(d
select count() from d group by toDate(dt);
-- fuzz crash
SELECT min(dt), count(ignore(ignore(ignore(tupleElement(_partition_value, NULL) = NULL), NULL, NULL, NULL), 0, '10485.76', NULL)), max(dt), count(toDate(dt) >= '2021-10-25') FROM d WHERE toDate(dt) >= '2021-10-25';
SELECT min(dt), count(ignore(ignore(ignore(tupleElement(_partition_value, 'xxxx', NULL) = NULL), NULL, NULL, NULL), 0, '10485.76', NULL)), max(dt), count(toDate(dt) >= '2021-10-25') FROM d WHERE toDate(dt) >= '2021-10-25';
-- fuzz crash
SELECT pointInEllipses(min(j), NULL), max(dt), count('0.0000000007') FROM d WHERE toDate(dt) >= '2021-10-25';

View File

@ -923,6 +923,7 @@ CREATE TABLE system.replication_queue
`is_currently_executing` UInt8,
`num_tries` UInt32,
`last_exception` String,
`last_exception_time` DateTime,
`last_attempt_time` DateTime,
`num_postponed` UInt32,
`postpone_reason` String,

View File

@ -0,0 +1,2 @@
1 1 2 2
\N \N 3 3

View File

@ -0,0 +1,19 @@
DROP TABLE IF EXISTS test_tuple_element;
CREATE TABLE test_tuple_element
(
tuple Tuple(k1 Nullable(UInt64), k2 UInt64)
)
ENGINE = MergeTree
ORDER BY tuple()
SETTINGS index_granularity = 8192;
INSERT INTO test_tuple_element VALUES (tuple(1,2)), (tuple(NULL, 3));
SELECT
tupleElement(tuple, 'k1', 0) fine_k1_with_0,
tupleElement(tuple, 'k1', NULL) k1_with_null,
tupleElement(tuple, 'k2', 0) k2_with_0,
tupleElement(tuple, 'k2', NULL) k2_with_null
FROM test_tuple_element;
DROP TABLE test_tuple_element;