ClickHouse/tests/integration/test_distributed_queries_stress/test.py

# pylint: disable=redefined-outer-name
# pylint: disable=unused-argument
# pylint: disable=line-too-long

import shlex
import itertools
import pytest
from helpers.cluster import ClickHouseCluster

# Test cluster: four instances that all load the same remote_servers.xml.
# NOTE(review): the instance names look like <node>_<replica>; the actual
# shard/replica layout lives in configs/remote_servers.xml -- confirm there.
cluster = ClickHouseCluster(__file__)
node1_r1, node2_r1, node1_r2, node2_r2 = [
    # A fresh config list per instance, registered in the order listed below.
    cluster.add_instance(instance_name, main_configs=['configs/remote_servers.xml'])
    for instance_name in ('node1_r1', 'node2_r1', 'node1_r2', 'node2_r2')
]

def run_benchmark(payload, settings):
    """Pipe ``payload`` into ``clickhouse benchmark`` inside the node1_r1 container.

    payload  -- text with the queries to run; it is stripped and shell-quoted
                before being echoed into the benchmark's stdin.
    settings -- iterable of extra command-line arguments appended verbatim
                (they are NOT shell-quoted) after the built-in options.

    Nothing is returned; the result of ``exec_in_container`` is discarded.
    """
    quoted_payload = shlex.quote(payload.strip())

    benchmark_argv = [
        'clickhouse', 'benchmark',
        '--concurrency=100',
        '--cumulative',
        '--delay=0',
        # NOTE: with the current test matrix even 3 seconds is huge...
        '--timelimit=3',
        # tune some basic timeouts
        '--hedged_connection_timeout_ms=200',
        '--connect_timeout_with_failover_ms=200',
        '--connections_with_failover_max_tries=5',
    ]
    benchmark_argv.extend(settings)

    # The pipeline is a single shell string, hence `bash -c`.
    shell_command = 'echo {} | '.format(quoted_payload) + ' '.join(benchmark_argv)
    node1_r1.exec_in_container(['bash', '-c', shell_command])

@pytest.fixture(scope='module')
def started_cluster():
    """Module-scoped fixture: start the cluster, create the test tables, yield it.

    The same script is sent to every instance. It creates the local MergeTree
    table ``data`` (filled with keys 0..9) and four Distributed tables:
    ``dist_one`` / ``dist_two`` over ``data``, and ``dist_one_over_dist`` /
    ``dist_two_over_dist`` layered over those Distributed tables.

    The cluster is shut down in ``finally``, so teardown also runs when
    startup or table creation fails.
    """
    # The literal keeps its original layout; only its location has moved.
    setup_script = """
            create table if not exists data (
                key Int,
                /* just to increase block size */
                v1 UInt64,
                v2 UInt64,
                v3 UInt64,
                v4 UInt64,
                v5 UInt64,
                v6 UInt64,
                v7 UInt64,
                v8 UInt64,
                v9 UInt64,
                v10 UInt64,
                v11 UInt64,
                v12 UInt64
            ) Engine=MergeTree() order by key partition by key%5;
            insert into data (key) select * from numbers(10);

            create table if not exists dist_one           as data engine=Distributed(one_shard, currentDatabase(), data, key);
            create table if not exists dist_one_over_dist as data engine=Distributed(one_shard, currentDatabase(), dist_one, yandexConsistentHash(key, 2));

            create table if not exists dist_two as data           engine=Distributed(two_shards, currentDatabase(), data, key);
            create table if not exists dist_two_over_dist as data engine=Distributed(two_shards, currentDatabase(), dist_two, yandexConsistentHash(key, 2));
            """
    try:
        cluster.start()

        # Only the instance objects are needed, not their names.
        for instance in cluster.instances.values():
            instance.query(setup_script)
        yield cluster
    finally:
        cluster.shutdown()

# Distributed tables the benchmark is pointed at.
_STRESS_TABLES = (
    'dist_one',
    'dist_one_over_dist',
    'dist_two',
    'dist_two_over_dist',
)

# Every unordered pair of the options below: 7 options -> 21 pairs.
# The empty string stands for "defaults", so a pair containing it effectively
# exercises the other option on its own.
# TODO: more combinations than just 2
_STRESS_SETTING_PAIRS = list(itertools.combinations((
    '', # defaults
    '--prefer_localhost_replica=0',
    '--async_socket_for_remote=0',
    '--use_hedged_requests=0',
    '--optimize_skip_unused_shards=1',
    '--distributed_group_by_no_merge=2',
    '--optimize_distributed_group_by_sharding_key=1',

    # TODO: enlarge test matrix (but first those values need to accept ms):
    #
    # - sleep_in_send_tables_status
    # - sleep_in_send_data
), 2))

# Full matrix: 4 tables x 21 setting pairs = 84 test cases.
@pytest.mark.parametrize('table,settings', itertools.product(_STRESS_TABLES, _STRESS_SETTING_PAIRS))
def test_stress_distributed(table, settings, started_cluster):
    """Run a short ``clickhouse benchmark`` against ``table`` with a pair of settings.

    table           -- name of the Distributed table to query.
    settings        -- 2-tuple of command-line options passed to the benchmark.
    started_cluster -- fixture requested only for its setup/teardown.

    The test has no explicit assertions; it passes if ``run_benchmark``
    completes without raising.
    """
    payload = f'''
    select * from {table} where key = 0;
    select * from {table} where key = 1;
    select * from {table} where key = 2;
    select * from {table} where key = 3;
    select * from {table};
    '''
    run_benchmark(payload, settings)
Add stress test for distributed queries. It may find issues like the one in [1]: 2021.03.18 19:05:38.783328 [ 245 ] {4b1f5ec0-bf2d-478c-a2e1-d312531db206} <Debug> executeQuery: (from 127.0.0.1:40918, using production parser) select * from dist where key = 0; 2021.03.18 19:05:38.783760 [ 245 ] {4b1f5ec0-bf2d-478c-a2e1-d312531db206} <Debug> StorageDistributed (dist): Skipping irrelevant shards - the query will be sent to the following shards of the cluster (shard numbers): [1] 2021.03.18 19:05:38.784012 [ 245 ] {4b1f5ec0-bf2d-478c-a2e1-d312531db206} <Trace> ContextAccess (default): Access granted: SELECT(key) ON default.dist 2021.03.18 19:05:38.784410 [ 245 ] {4b1f5ec0-bf2d-478c-a2e1-d312531db206} <Trace> ContextAccess (default): Access granted: SELECT(key) ON default.dist 2021.03.18 19:05:38.784488 [ 245 ] {4b1f5ec0-bf2d-478c-a2e1-d312531db206} <Trace> StorageDistributed (dist): Disabling force_optimize_skip_unused_shards for nested queries (force_optimize_skip_unused_shards_nesting exceeded) 2021.03.18 19:05:38.784572 [ 245 ] {4b1f5ec0-bf2d-478c-a2e1-d312531db206} <Trace> InterpreterSelectQuery: Complete -> Complete 2021.03.18 19:05:38.819063 [ 245 ] {4b1f5ec0-bf2d-478c-a2e1-d312531db206} <Information> executeQuery: Read 20 rows, 80.00 B in 0.035687783 sec., 560 rows/sec., 2.19 KiB/sec. 2021.03.18 19:05:38.827842 [ 245 ] {4b1f5ec0-bf2d-478c-a2e1-d312531db206} <Debug> MemoryTracker: Peak memory usage (for query): 0.00 B. 
2021.03.18 19:05:38.867752 [ 547 ] {} <Fatal> BaseDaemon: ######################################## 2021.03.18 19:05:38.867959 [ 547 ] {} <Fatal> BaseDaemon: (version 21.4.1.1, build id: A0ADEC175BD65E58EA012C47C265E661C32D23B5) (from thread 245) (query_id: 4b1f5ec0-bf2d-478c-a2e1-d312531db206) Received signal Aborted (6) 2021.03.18 19:05:38.868733 [ 547 ] {} <Fatal> BaseDaemon: 2021.03.18 19:05:38.868958 [ 547 ] {} <Fatal> BaseDaemon: Stack trace: 0x7fd1394be18b 0x7fd13949d859 0x10c4c99b 0xd434ee1 0xd434f1a 2021.03.18 19:05:38.870135 [ 547 ] {} <Fatal> BaseDaemon: 3. gsignal @ 0x4618b in /usr/lib/x86_64-linux-gnu/libc-2.31.so 2021.03.18 19:05:38.870383 [ 547 ] {} <Fatal> BaseDaemon: 4. abort @ 0x25859 in /usr/lib/x86_64-linux-gnu/libc-2.31.so 2021.03.18 19:05:38.886783 [ 547 ] {} <Fatal> BaseDaemon: 5. /work3/azat/ch/clickhouse/.cmake/../contrib/libunwind/src/UnwindLevel1.c:396: _Unwind_Resume @ 0x10c4c99b in /usr/bin/clickhouse 2021.03.18 19:05:47.200208 [ 547 ] {} <Fatal> BaseDaemon: 6. ? @ 0xd434ee1 in /usr/bin/clickhouse 2021.03.18 19:05:47.348738 [ 547 ] {} <Fatal> BaseDaemon: 7.1. inlined from /work3/azat/ch/clickhouse/.cmake/../contrib/boost/boost/context/fiber_fcontext.hpp:253: boost::context::fiber::~fiber() 2021.03.18 19:05:47.349118 [ 547 ] {} <Fatal> BaseDaemon: 7.2. inlined from ../contrib/boost/boost/context/fiber_fcontext.hpp:252: boost::context::detail::fiber_record<boost::context::fiber, FiberStack&, DB::RemoteQueryExecutorRoutine>::run(void*) 2021.03.18 19:05:47.349163 [ 547 ] {} <Fatal> BaseDaemon: 7. ../contrib/boost/boost/context/fiber_fcontext.hpp:80: void boost::context::detail::fiber_entry<boost::context::detail::fiber_record<boost::context::fiber, FiberStack&, DB::RemoteQueryExecutorRoutine> >(boost::context::detail::transfer_t) @ 0xd434f1a in /usr/bin/clickhouse 2021.03.18 19:05:47.618174 [ 547 ] {} <Fatal> BaseDaemon: Calculated checksum of the binary: FF3BA83D0CD648741EEEC242CB1966D9. There is no information about the reference checksum. 
[1]: https://clickhouse-test-reports.s3.yandex.net/0/1b2ed51ff5e4a3dc45567d4967108f43f680c884/stress_test_(debug).html#fail1 2021-03-20 19:40:24 +00:00			`# pylint: disable=redefined-outer-name`
			`# pylint: disable=unused-argument`
			`# pylint: disable=line-too-long`

			`import shlex`
			`import itertools`
			`import pytest`
			`from helpers.cluster import ClickHouseCluster`

			`cluster = ClickHouseCluster(__file__)`
			`node1_r1 = cluster.add_instance('node1_r1', main_configs=['configs/remote_servers.xml'])`
			`node2_r1 = cluster.add_instance('node2_r1', main_configs=['configs/remote_servers.xml'])`
			`node1_r2 = cluster.add_instance('node1_r2', main_configs=['configs/remote_servers.xml'])`
			`node2_r2 = cluster.add_instance('node2_r2', main_configs=['configs/remote_servers.xml'])`

			`def run_benchmark(payload, settings):`
			`node1_r1.exec_in_container([`
			`'bash', '-c', 'echo {} \| '.format(shlex.quote(payload.strip())) + ' '.join([`
			`'clickhouse', 'benchmark',`
			`'--concurrency=100',`
			`'--cumulative',`
			`'--delay=0',`
			`# NOTE: with the current matrix even 3 seconds is huge...`
			`'--timelimit=3',`
			`# tune some basic timeouts`
			`'--hedged_connection_timeout_ms=200',`
			`'--connect_timeout_with_failover_ms=200',`
			`'--connections_with_failover_max_tries=5',`
			`*settings,`
			`])`
			`])`

			`@pytest.fixture(scope='module')`
			`def started_cluster():`
			`try:`
			`cluster.start()`

			`for _, instance in cluster.instances.items():`
			`instance.query("""`
			`create table if not exists data (`
			`key Int,`
			`/* just to increase block size */`
			`v1 UInt64,`
			`v2 UInt64,`
			`v3 UInt64,`
			`v4 UInt64,`
			`v5 UInt64,`
			`v6 UInt64,`
			`v7 UInt64,`
			`v8 UInt64,`
			`v9 UInt64,`
			`v10 UInt64,`
			`v11 UInt64,`
			`v12 UInt64`
			`) Engine=MergeTree() order by key partition by key%5;`
			`insert into data (key) select * from numbers(10);`

			`create table if not exists dist_one as data engine=Distributed(one_shard, currentDatabase(), data, key);`
			`create table if not exists dist_one_over_dist as data engine=Distributed(one_shard, currentDatabase(), dist_one, yandexConsistentHash(key, 2));`

			`create table if not exists dist_two as data engine=Distributed(two_shards, currentDatabase(), data, key);`
			`create table if not exists dist_two_over_dist as data engine=Distributed(two_shards, currentDatabase(), dist_two, yandexConsistentHash(key, 2));`
			`""")`
			`yield cluster`
			`finally:`
			`cluster.shutdown()`

			`@pytest.mark.parametrize('table,settings', itertools.product(`
			`[ # tables`
			`'dist_one',`
			`'dist_one_over_dist',`
			`'dist_two',`
			`'dist_two_over_dist',`
			`],`
			`[ # settings`
			`*list(itertools.combinations([`
			`'', # defaults`
			`'--prefer_localhost_replica=0',`
			`'--async_socket_for_remote=0',`
			`'--use_hedged_requests=0',`
			`'--optimize_skip_unused_shards=1',`
			`'--distributed_group_by_no_merge=2',`
			`'--optimize_distributed_group_by_sharding_key=1',`

			`# TODO: enlarge test matrix (but first those values need to accept ms):`
			`#`
			`# - sleep_in_send_tables_status`
			`# - sleep_in_send_data`
			`], 2))`
			`# TODO: more combinations than just 2`
			`],`
			`))`
			`def test_stress_distributed(table, settings, started_cluster):`
			`payload = f'''`
			`select * from {table} where key = 0;`
			`select * from {table} where key = 1;`
			`select * from {table} where key = 2;`
			`select * from {table} where key = 3;`
			`select * from {table};`
			`'''`
			`run_benchmark(payload, settings)`