ClickHouse/tests/queries/0_stateless/replication.lib
Alexander Tokmakov 697f2bcedb fix
2021-08-24 11:35:45 +03:00

83 lines
4.0 KiB
Bash
Executable File

#!/usr/bin/env bash
# shellcheck source=./mergetree_mutations.lib
. "$CURDIR"/mergetree_mutations.lib
#######################################
# Best-effort "SYSTEM SYNC REPLICA" for every table of the current database
# whose name starts with the given prefix and whose engine contains
# "Replicated".
#
# Before syncing, partitions that appear in system.replication_queue with a
# "No active replica has part" exception and that have no parts at all in
# system.parts are dropped on every matched table (errors are ignored).
#
# Globals:   CLICKHOUSE_CLIENT (read) - client command used for every query
# Arguments: $1 - table name prefix (interpolated into "LIKE '$1%'")
# Outputs:   stdout of the client calls (including the replication queue dump
#            of a table whose sync failed), followed by the marker line
#            "Replication did not hang: synced all replicas of $1"
# Notes:     all working variables are local, and only the background jobs
#            started here are waited for, so the caller's variables and
#            unrelated background jobs are left alone.
#######################################
function try_sync_replicas()
{
    local -a empty_partitions_arr=() tables_arr=() sync_pids=()
    local t p pid

    # Partition ids mentioned by "lost part" queue entries that have no parts left.
    readarray -t empty_partitions_arr < <(${CLICKHOUSE_CLIENT} -q \
"SELECT DISTINCT substr(new_part_name, 1, position(new_part_name, '_') - 1) AS partition_id
FROM system.replication_queue
WHERE (database = currentDatabase()) AND (table LIKE '$1%') AND (last_exception LIKE '%No active replica has part%') AND (partition_id NOT IN (
SELECT partition_id
FROM system.parts
WHERE (database = currentDatabase()) AND (table LIKE '$1%')
))")

    readarray -t tables_arr < <(${CLICKHOUSE_CLIENT} -q "SELECT name FROM system.tables WHERE database=currentDatabase() AND name like '$1%' AND engine like '%Replicated%'")

    for t in "${tables_arr[@]}"
    do
        for p in "${empty_partitions_arr[@]}"
        do
            # Avoid "Empty part ... is not created instead of lost part because there are no parts in partition"
            # Errors are deliberately discarded: the drop is only a best-effort cleanup.
            $CLICKHOUSE_CLIENT -q "ALTER TABLE $t DROP PARTITION ID '$p'" 2>/dev/null
        done
    done

    # Sync all tables in parallel; on failure dump the table's replication queue.
    for t in "${tables_arr[@]}"
    do
        # The size of log may be big, so increase timeout.
        $CLICKHOUSE_CLIENT --receive_timeout 400 -q "SYSTEM SYNC REPLICA $t" || $CLICKHOUSE_CLIENT -q \
            "select 'sync failed, queue:', * from system.replication_queue where database=currentDatabase() and table='$t' order by database, table, node_name" &
        sync_pids+=("$!")
    done

    # Wait only for the jobs started above (a bare `wait` would also block on
    # background jobs owned by the caller). The exit status is ignored on
    # purpose: a failed sync has already printed its diagnostics, and the
    # marker line below is printed unconditionally.
    for pid in "${sync_pids[@]}"
    do
        wait "$pid" || :
    done

    echo "Replication did not hang: synced all replicas of $1"
}
#######################################
# Checks that all tables matching a name pattern hold the same data and prints
# "Consistency: 1" when they do; otherwise prints the number of distinct
# results followed by diagnostic dumps.
#
# Globals:   CLICKHOUSE_CLIENT (read) - client command used for every query
# Arguments: $1 - table name prefix (used as "LIKE '$1%'" and as the pattern
#                 passed to merge(currentDatabase(), '$1'))
#            $2 - SQL expression evaluated per table; its values are compared
#                 across tables
# Outputs:   the marker line printed by try_sync_replicas (or an identical
#            dummy line), "Consistency: <n>", and on mismatch the data, queue,
#            mutations and parts dumps
# Returns:   0 when the check is skipped because no replica is writable;
#            otherwise the status of the last command executed
# Depends:   wait_for_all_mutations and try_sync_replicas must be defined.
#######################################
function check_replication_consistency()
{
    local res some_table attempt

    # Do not check anything if all replicas are readonly,
    # because in this case all replicas are probably lost (it may happen and it's not a bug)
    res=$($CLICKHOUSE_CLIENT -q "SELECT count() - sum(is_readonly) FROM system.replicas WHERE database=currentDatabase() AND table LIKE '$1%'")
    # String comparison: an empty result (failed query) is simply "not 0" and
    # does not make the test builtin fail with a syntax error.
    if [[ "$res" == "0" ]]; then
        # Print dummy lines
        echo "Replication did not hang: synced all replicas of $1"
        echo "Consistency: 1"
        return 0
    fi

    # Trigger pullLogsToQueue(...) and updateMutations(...) on some replica to make it pull all mutations, so it will be possible to kill them
    # Done twice, each time on a randomly chosen table; short timeout, all output and failures ignored.
    for attempt in 1 2
    do
        some_table=$($CLICKHOUSE_CLIENT -q "SELECT name FROM system.tables WHERE database=currentDatabase() AND name like '$1%' ORDER BY rand() LIMIT 1")
        $CLICKHOUSE_CLIENT --receive_timeout 3 -q "SYSTEM SYNC REPLICA $some_table" 1>/dev/null 2>/dev/null ||:
    done

    # Forcefully cancel mutations to avoid waiting for them to finish
    ${CLICKHOUSE_CLIENT} -q "KILL MUTATION WHERE database=currentDatabase() AND table like '$1%'" > /dev/null

    # SYNC REPLICA is not enough if some MUTATE_PARTs are not assigned yet
    wait_for_all_mutations "$1%"

    try_sync_replicas "$1"

    # Number of distinct per-table values of ($2); zero distinct values is reported as 1.
    res=$($CLICKHOUSE_CLIENT -q \
"SELECT
if((countDistinct(data) as c) == 0, 1, c)
FROM
(
SELECT _table, ($2) AS data
FROM merge(currentDatabase(), '$1') GROUP BY _table
)")

    echo "Consistency: $res"
    # Anything other than exactly "1" is treated as a failure, including an
    # empty result from a failed query, so the diagnostics below are never
    # silently skipped.
    if [[ "$res" != "1" ]]; then
        echo "Replicas have diverged:"
        $CLICKHOUSE_CLIENT -q "select 'data', _table, $2, arraySort(groupArrayDistinct(_part)) from merge(currentDatabase(), '$1') group by _table order by _table"
        $CLICKHOUSE_CLIENT -q "select 'queue', * from system.replication_queue where database=currentDatabase() and table like '$1%' order by database, table, node_name"
        $CLICKHOUSE_CLIENT -q "select 'mutations', * from system.mutations where database=currentDatabase() and table like '$1%' order by database, table, mutation_id"
        $CLICKHOUSE_CLIENT -q "select 'parts', * from system.parts where database=currentDatabase() and table like '$1%' order by database, table, name"
        echo "Good luck with debugging..."
    fi
}