ClickHouse/tests/integration/test_merge_tree_s3_with_cache/test.py

import logging

import pytest

from helpers.cluster import ClickHouseCluster


@pytest.fixture(scope="module")
def cluster():
    """Start a single-node ClickHouse cluster with MinIO for this test module.

    The instance named ``"node"`` is configured from ``storage_conf.xml``,
    ``ssl_conf.xml``, ``query_log.xml`` and ``users.xml``.

    Yields:
        The started ``ClickHouseCluster``. It is shut down when the fixture
        is torn down, including when setup fails after construction.
    """
    # Construct the cluster outside the ``try``: if the constructor itself
    # raises there is nothing to shut down, and referencing the unbound local
    # in ``finally`` would raise UnboundLocalError on top of the real error.
    ch_cluster = ClickHouseCluster(__file__)
    try:
        ch_cluster.add_instance(
            "node",
            main_configs=[
                "configs/config.d/storage_conf.xml",
                "configs/config.d/ssl_conf.xml",
                "configs/config.d/query_log.xml",
            ],
            user_configs=["configs/config.d/users.xml"],
            with_minio=True,
        )
        logging.info("Starting cluster...")
        ch_cluster.start()
        logging.info("Cluster started")

        yield ch_cluster
    finally:
        ch_cluster.shutdown()


def get_query_stat(instance, hint):
    """Collect S3 profile-event counters for logged queries matching *hint*.

    Args:
        instance: object exposing ``query(sql)`` that returns the result as
            tab-separated text, one row per line.
        hint: substring searched for in ``system.query_log.query``; single
            quotes in it are backslash-escaped before being put into the SQL.

    Returns:
        dict mapping each profile-event name that starts with ``"S3"`` to its
        integer value. If the same name appears in several rows, the last
        row wins.
    """
    # Flush before reading so that recently executed queries are visible in
    # system.query_log.
    instance.query("SYSTEM FLUSH LOGS")

    escaped_hint = hint.replace("'", "\\'")
    sql = """
        SELECT ProfileEvents.keys, ProfileEvents.values
        FROM system.query_log
        ARRAY JOIN ProfileEvents
        WHERE type != 1 AND query LIKE '%{}%'
        """.format(
        escaped_hint
    )
    raw_output = instance.query(sql)

    stats = {}
    for line in raw_output.split("\n"):
        fields = line.split("\t")
        # Only well-formed "<name>\t<value>" rows are of interest; this also
        # skips the empty string left after the trailing newline.
        if len(fields) != 2:
            continue
        name, value = fields
        if not name.startswith("S3"):
            continue
        stats[name] = int(value)
    return stats


@pytest.mark.parametrize("min_rows_for_wide_part,read_requests", [(0, 2), (8192, 1)])
def test_write_is_cached(cluster, min_rows_for_wide_part, read_requests):
    """Insert rows into a table on the ``s3`` storage policy and read them back.

    Args:
        cluster: cluster fixture providing the ``"node"`` instance.
        min_rows_for_wide_part: value passed to the MergeTree setting of the
            same name when creating the table.
        read_requests: expected ``S3ReadRequestsCount`` for the SELECT.
            Currently unused because the profile-event check is disabled
            (see the comment in the body).
    """
    node = cluster.instances["node"]

    node.query(
        """
        CREATE TABLE s3_test (
            id Int64,
            data String
        ) ENGINE=MergeTree()
        ORDER BY id
        SETTINGS storage_policy='s3', min_rows_for_wide_part={}
        """.format(
            min_rows_for_wide_part
        )
    )

    # Always drop the table, even when a query or assertion below fails:
    # the cluster is shared between parametrized runs, and a leftover
    # ``s3_test`` would make the next CREATE TABLE fail as well.
    try:
        node.query("SYSTEM FLUSH LOGS")
        node.query("TRUNCATE TABLE system.query_log")

        node.query("INSERT INTO s3_test VALUES (0,'data'),(1,'data')")

        select_query = "SELECT * FROM s3_test order by id FORMAT Values"
        assert node.query(select_query) == "(0,'data'),(1,'data')"

        # With async reads profile events are not updated because reads are done in a separate thread.
        # stat = get_query_stat(node, select_query)
        # assert stat["S3ReadRequestsCount"] == read_requests  # Only .bin files should be accessed from S3.
    finally:
        node.query("DROP TABLE IF EXISTS s3_test SYNC")


@pytest.mark.parametrize(
    "min_rows_for_wide_part,all_files,bin_files", [(0, 4, 2), (8192, 2, 1)]
)
def test_read_after_cache_is_wiped(
    cluster, min_rows_for_wide_part, all_files, bin_files
):
    """Check that data is still readable after the on-disk cache directory is removed.

    Args:
        cluster: cluster fixture providing the ``"node"`` instance.
        min_rows_for_wide_part: value passed to the MergeTree setting of the
            same name when creating the table.
        all_files: expected ``S3ReadRequestsCount`` for the first SELECT after
            the cache is wiped. Currently unused (check disabled, see body).
        bin_files: expected ``S3ReadRequestsCount`` for the second SELECT.
            Currently unused (check disabled, see body).
    """
    node = cluster.instances["node"]

    node.query(
        """
        CREATE TABLE s3_test (
            id Int64,
            data String
        ) ENGINE=MergeTree()
        ORDER BY id
        SETTINGS storage_policy='s3', min_rows_for_wide_part={}
        """.format(
            min_rows_for_wide_part
        )
    )

    # Always drop the table, even when a query or assertion below fails:
    # the cluster is shared between parametrized runs, and a leftover
    # ``s3_test`` would make the next CREATE TABLE fail as well.
    try:
        node.query("SYSTEM FLUSH LOGS")
        node.query("TRUNCATE TABLE system.query_log")

        node.query("INSERT INTO s3_test VALUES (0,'data'),(1,'data')")

        # Wipe cache
        cluster.exec_in_container(
            cluster.get_container_id("node"),
            ["rm", "-rf", "/var/lib/clickhouse/disks/s3/cache/"],
        )

        select_query = "SELECT * FROM s3_test"
        node.query(select_query)
        # With async reads profile events are not updated because reads are done in a separate thread.
        # stat = get_query_stat(node, select_query)
        # assert stat["S3ReadRequestsCount"] == all_files  # .mrk and .bin files should be accessed from S3.

        # After cache is populated again, only .bin files should be accessed from S3.
        select_query = "SELECT * FROM s3_test order by id FORMAT Values"
        assert node.query(select_query) == "(0,'data'),(1,'data')"

        # With async reads profile events are not updated because reads are done in a separate thread.
        # stat = get_query_stat(node, select_query)
        # assert stat["S3ReadRequestsCount"] == bin_files
    finally:
        node.query("DROP TABLE IF EXISTS s3_test SYNC")
DiskS3 with cache tests. 2020-07-30 14:49:56 +00:00			`import logging`

			`import pytest`
Automatic style fix 2024-09-27 10:19:39 +00:00
DiskS3 with cache tests. 2020-07-30 14:49:56 +00:00			`from helpers.cluster import ClickHouseCluster`


			`@pytest.fixture(scope="module")`
			`def cluster():`
			`try:`
			`cluster = ClickHouseCluster(__file__)`
Collect stderr.log and stdout.log in all integration tests by default. 2021-06-29 13:01:15 +00:00			`cluster.add_instance(`
			`"node",`
			`main_configs=[`
			`"configs/config.d/storage_conf.xml",`
			`"configs/config.d/ssl_conf.xml",`
			`"configs/config.d/query_log.xml",`
			`],`
Add explicit main_configs, user_configs and dictionaries in integration tests. 2020-08-12 08:55:04 +00:00			`user_configs=["configs/config.d/users.xml"],`
			`with_minio=True,`
			`)`
DiskS3 with cache tests. 2020-07-30 14:49:56 +00:00			`logging.info("Starting cluster...")`
			`cluster.start()`
			`logging.info("Cluster started")`

			`yield cluster`
			`finally:`
			`cluster.shutdown()`


			`def get_query_stat(instance, hint):`
			`result = {}`
			`instance.query("SYSTEM FLUSH LOGS")`
			`events = instance.query(`
Apply black formatter to all *.py files in the repo 2022-03-22 16:39:58 +00:00			`"""`
Update more integration tests 2021-01-21 06:55:13 +00:00			`SELECT ProfileEvents.keys, ProfileEvents.values`
DiskS3 with cache tests. 2020-07-30 14:49:56 +00:00			`FROM system.query_log`
			`ARRAY JOIN ProfileEvents`
			`WHERE type != 1 AND query LIKE '%{}%'`
			`""".format(`
			`hint.replace("'", "\\'")`
Apply black formatter to all *.py files in the repo 2022-03-22 16:39:58 +00:00			`)`
DiskS3 with cache tests. 2020-07-30 14:49:56 +00:00			`).split("\n")`
			`for event in events:`
			`ev = event.split("\t")`
			`if len(ev) == 2:`
			`if ev[0].startswith("S3"):`
			`result[ev[0]] = int(ev[1])`
			`return result`

Apply black formatter to all *.py files in the repo 2022-03-22 16:39:58 +00:00
fix more tests 2020-09-14 12:49:04 +00:00			`@pytest.mark.parametrize("min_rows_for_wide_part,read_requests", [(0, 2), (8192, 1)])`
			`def test_write_is_cached(cluster, min_rows_for_wide_part, read_requests):`
DiskS3 with cache tests. 2020-07-30 14:49:56 +00:00			`node = cluster.instances["node"]`

			`node.query(`
			`"""`
			`CREATE TABLE s3_test (`
			`id Int64,`
			`data String`
			`) ENGINE=MergeTree()`
			`ORDER BY id`
fix more tests 2020-09-14 12:49:04 +00:00			`SETTINGS storage_policy='s3', min_rows_for_wide_part={}`
			`""".format(`
			`min_rows_for_wide_part`
			`)`
DiskS3 with cache tests. 2020-07-30 14:49:56 +00:00			`)`

DiskS3 with cache test fix. 2020-07-31 09:24:57 +00:00			`node.query("SYSTEM FLUSH LOGS")`
DiskS3 with cache tests. 2020-07-30 14:49:56 +00:00			`node.query("TRUNCATE TABLE system.query_log")`

			`node.query("INSERT INTO s3_test VALUES (0,'data'),(1,'data')")`

			`select_query = "SELECT * FROM s3_test order by id FORMAT Values"`
			`assert node.query(select_query) == "(0,'data'),(1,'data')"`

Remove checks for profile events because they are not updated the same way with async reads 2021-11-13 10:37:00 +00:00			`# With async reads profile events are not updated because reads are done in a separate thread.`
			`# stat = get_query_stat(node, select_query)`
			`# assert stat["S3ReadRequestsCount"] == read_requests # Only .bin files should be accessed from S3.`
DiskS3 with cache tests. 2020-07-30 14:49:56 +00:00
replace NO DELAY with SYNC in tests 2023-05-03 18:06:46 +00:00			`node.query("DROP TABLE IF EXISTS s3_test SYNC")`
DiskS3 with cache tests. 2020-07-30 14:49:56 +00:00
Apply black formatter to all *.py files in the repo 2022-03-22 16:39:58 +00:00
Remove checks for profile events because they are not updated the same way with async reads 2021-11-13 10:37:00 +00:00			`@pytest.mark.parametrize(`
			`"min_rows_for_wide_part,all_files,bin_files", [(0, 4, 2), (8192, 2, 1)]`
			`)`
fix more tests 2020-09-14 12:49:04 +00:00			`def test_read_after_cache_is_wiped(`
			`cluster, min_rows_for_wide_part, all_files, bin_files`
			`):`
DiskS3 with cache tests. 2020-07-30 14:49:56 +00:00			`node = cluster.instances["node"]`

			`node.query(`
			`"""`
			`CREATE TABLE s3_test (`
			`id Int64,`
			`data String`
			`) ENGINE=MergeTree()`
			`ORDER BY id`
fix more tests 2020-09-14 12:49:04 +00:00			`SETTINGS storage_policy='s3', min_rows_for_wide_part={}`
			`""".format(`
			`min_rows_for_wide_part`
			`)`
DiskS3 with cache tests. 2020-07-30 14:49:56 +00:00			`)`

DiskS3 with cache test fix. 2020-07-31 09:24:57 +00:00			`node.query("SYSTEM FLUSH LOGS")`
DiskS3 with cache tests. 2020-07-30 14:49:56 +00:00			`node.query("TRUNCATE TABLE system.query_log")`

			`node.query("INSERT INTO s3_test VALUES (0,'data'),(1,'data')")`

			`# Wipe cache`
			`cluster.exec_in_container(`
			`cluster.get_container_id("node"),`
			`["rm", "-rf", "/var/lib/clickhouse/disks/s3/cache/"],`
			`)`

			`select_query = "SELECT * FROM s3_test"`
			`node.query(select_query)`
Remove checks for profile events because they are not updated the same way with async reads 2021-11-13 10:37:00 +00:00			`# With async reads profile events are not updated because reads are done in a separate thread.`
			`# stat = get_query_stat(node, select_query)`
			`# assert stat["S3ReadRequestsCount"] == all_files # .mrk and .bin files should be accessed from S3.`
DiskS3 with cache tests. 2020-07-30 14:49:56 +00:00
			`# After cache is populated again, only .bin files should be accessed from S3.`
			`select_query = "SELECT * FROM s3_test order by id FORMAT Values"`
			`assert node.query(select_query) == "(0,'data'),(1,'data')"`
Remove checks for profile events because they are not updated the same way with async reads 2021-11-13 10:37:00 +00:00
			`# With async reads profile events are not updated because reads are done in a separate thread.`
			`# stat = get_query_stat(node, select_query)`
			`# assert stat["S3ReadRequestsCount"] == bin_files`
DiskS3 with cache tests. 2020-07-30 14:49:56 +00:00
replace NO DELAY with SYNC in tests 2023-05-03 18:06:46 +00:00			`node.query("DROP TABLE IF EXISTS s3_test SYNC")`