ClickHouse/tests/queries/0_stateless/02903_rmt_retriable_merge_exception.sh
Azat Khuzhin 81da52bdf4 Fix 02903_rmt_retriable_merge_exception flakiness for replicated database
In the case of a replicated database, SYSTEM STOP PULLING REPLICATION LOG for
rmt2 should be executed on all replicas, otherwise some replica may merge the
part and all the other replicas may fetch it.

Also, since SYSTEM STOP PULLING REPLICATION LOG does not wait for the current
pull, trigger the log pull explicitly to provide at least some guarantee that
replication log pulling has been stopped; otherwise a race is possible [1].

  [1]: https://s3.amazonaws.com/clickhouse-test-reports/57155/f68717ccd0a07a499911c9b0db7537ae8205e41b/stateless_tests_flaky_check__asan_.html
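
In short, the test now does roughly the following before triggering the merge
(the cluster and database names here are illustrative; the exact statements are
in the test below):

    -- flush the in-flight log pull first, since STOP does not wait for it
    SYSTEM SYNC REPLICA rmt2;
    -- stop pulling on every replica of the replicated database, not only the local one
    SYSTEM STOP PULLING REPLICATION LOG ON CLUSTER test_cluster_database_replicated db.rmt2;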

Signed-off-by: Azat Khuzhin <a.khuzhin@semrush.com>
2023-11-24 15:01:37 +01:00

#!/usr/bin/env bash
# Tags: no-ordinary-database
# Tag no-ordinary-database: requires UUID
CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh
# Test that retriable errors during merges/mutations
# (i.e. "No active replica has part X or covering part")
# do not appear as errors (level=Error), only as info messages (level=Information).
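
# If the replicated-database cluster is configured, the ON CLUSTER statement
# below has to target it; otherwise fall back to the 'default' cluster.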
cluster=default
if [[ $($CLICKHOUSE_CLIENT -q "select count()>0 from system.clusters where cluster = 'test_cluster_database_replicated'") = 1 ]]; then
    cluster=test_cluster_database_replicated
fi
$CLICKHOUSE_CLIENT -nm --distributed_ddl_output_mode=none -q "
    drop table if exists rmt1;
    drop table if exists rmt2;
    create table rmt1 (key Int) engine=ReplicatedMergeTree('/clickhouse/{database}', '1') order by key settings always_fetch_merged_part=1;
    create table rmt2 (key Int) engine=ReplicatedMergeTree('/clickhouse/{database}', '2') order by key settings always_fetch_merged_part=0;
    insert into rmt1 values (1);
    insert into rmt1 values (2);
    system sync replica rmt1;
    -- SYSTEM STOP PULLING REPLICATION LOG does not wait for the current pull,
    -- so trigger it explicitly to avoid a race (though the proper way would be
    -- to wait for the current pull in StorageReplicatedMergeTree::getActionLock())
    system sync replica rmt2;
    -- NOTE: CLICKHOUSE_DATABASE is required
    system stop pulling replication log on cluster $cluster $CLICKHOUSE_DATABASE.rmt2;
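    -- rmt1 has always_fetch_merged_part=1, so it will not merge locally and will try
    -- to fetch the merged part instead; rmt2 (the replica that would do the merge) no
    -- longer pulls the replication log, so rmt1 keeps hitting the retriable
    -- 'No active replica has part ... or covering part' error.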
    optimize table rmt1 final settings alter_sync=0, optimize_throw_if_noop=1;
" || exit 1
table_uuid=$($CLICKHOUSE_CLIENT -q "select uuid from system.tables where database = currentDatabase() and table = 'rmt1'")
if [[ -z $table_uuid ]]; then
echo "Table does not have UUID" >&2
exit 1
fi
# NOTE: the part name can be different from all_0_1_1 in case of ZooKeeper retries
part_name='%'
# wait until there is at least one 'No active replica has part all_0_1_1 or covering part' message in the logs
for _ in {0..50}; do
    no_active_replica_messages=$($CLICKHOUSE_CLIENT -nm -q "
        system flush logs;
        select count()
        from system.text_log
        where
            event_date >= yesterday() and event_time >= now() - 600 and
            (
                (logger_name = 'MergeTreeBackgroundExecutor' and message like '%{$table_uuid::$part_name}%No active replica has part $part_name or covering part%') or
                (logger_name like '$table_uuid::$part_name (MergeFromLogEntryTask)' and message like '%No active replica has part $part_name or covering part%')
            );
    ")
    if [[ $no_active_replica_messages -gt 0 ]]; then
        break
    fi
    # too frequent "system flush logs" causes trouble
    sleep 1
done
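
# Resume pulling the replication log and check at which level the retriable error
# was logged: the expected output contains only level=Information, no level=Error.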
$CLICKHOUSE_CLIENT -nm -q "
    system start pulling replication log rmt2;
    system flush logs;
    select
        level, count() > 0
    from system.text_log
    where
        event_date >= yesterday() and event_time >= now() - 600 and
        (
            (logger_name = 'MergeTreeBackgroundExecutor' and message like '%{$table_uuid::$part_name}%No active replica has part $part_name or covering part%') or
            (logger_name like '$table_uuid::$part_name (MergeFromLogEntryTask)' and message like '%No active replica has part $part_name or covering part%')
        )
    group by level;
"