Merge branch 'master' into tighten-limits-functional-tests

Alexey Milovidov 2024-07-31 22:44:44 +02:00
commit ec666aa113
6 changed files with 113 additions and 77 deletions

View File

@@ -361,18 +361,10 @@ BlockIO InterpreterCreateQuery::createDatabase(ASTCreateQuery & create)
TablesLoader loader{getContext()->getGlobalContext(), {{database_name, database}}, mode};
auto load_tasks = loader.loadTablesAsync();
auto startup_tasks = loader.startupTablesAsync();
if (getContext()->getGlobalContext()->getServerSettings().async_load_databases)
{
scheduleLoad(load_tasks);
scheduleLoad(startup_tasks);
}
else
{
/// First prioritize, schedule and wait all the load table tasks
waitLoad(currentPoolOr(TablesLoaderForegroundPoolId), load_tasks);
/// Only then prioritize, schedule and wait all the startup tasks
waitLoad(currentPoolOr(TablesLoaderForegroundPoolId), startup_tasks);
}
/// First prioritize, schedule and wait all the load table tasks
waitLoad(currentPoolOr(TablesLoaderForegroundPoolId), load_tasks);
/// Only then prioritize, schedule and wait all the startup tasks
waitLoad(currentPoolOr(TablesLoaderForegroundPoolId), startup_tasks);
}
}
catch (...)

View File

@@ -16,7 +16,7 @@ Don't use Docker from your system repository.
* [py.test](https://docs.pytest.org/) testing framework. To install: `sudo -H pip install pytest`
* [docker-compose](https://docs.docker.com/compose/) and additional python libraries. To install:
```
```bash
sudo -H pip install \
PyMySQL \
avro \
@@ -78,7 +78,7 @@ Notes:
* Some tests may require a lot of resources (CPU, RAM, etc.). It is better not to run large tests like `test_distributed_ddl*` on your laptop.
You can run tests via the `./runner` script and pass pytest arguments as the last argument:
```
```bash
$ ./runner --binary $HOME/ClickHouse/programs/clickhouse --odbc-bridge-binary $HOME/ClickHouse/programs/clickhouse-odbc-bridge --base-configs-dir $HOME/ClickHouse/programs/server/ 'test_ssl_cert_authentication -ss'
Start tests
====================================================================================================== test session starts ======================================================================================================
@@ -102,7 +102,7 @@ test_ssl_cert_authentication/test.py::test_create_user PASSED
```
Paths to the binary and configs may be specified via env variables:
```
```bash
$ export CLICKHOUSE_TESTS_BASE_CONFIG_DIR=$HOME/ClickHouse/programs/server/
$ export CLICKHOUSE_TESTS_SERVER_BIN_PATH=$HOME/ClickHouse/programs/clickhouse
$ export CLICKHOUSE_TESTS_ODBC_BRIDGE_BIN_PATH=$HOME/ClickHouse/programs/clickhouse-odbc-bridge
@@ -121,6 +121,64 @@ test_odbc_interaction/test.py ...... [100%]
You can just open a shell inside a container by overwriting the command:
./runner --command=bash
### Parallel test execution
On the CI, we run a number of parallel runners (5 at the time of this writing), each in its own
Docker container. These runner containers spawn additional containers for each test for the
services it needs, such as ZooKeeper, MySQL, PostgreSQL and MinIO, among others. This means that
tests do not share any services among them. Within each runner, tests are parallelized using
[pytest-xdist](https://pytest-xdist.readthedocs.io/en/stable/). We use `--dist=loadfile` to
[distribute the load](https://pytest-xdist.readthedocs.io/en/stable/distribution.html). In the
documentation's words: this guarantees that all tests in a file run in the same worker. This means
that tests within the same file never run in parallel; they are executed on the same worker one
after the other.
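As a purely illustrative sketch (the test file names below are hypothetical), this is what
`--dist=loadfile` implies when two files are selected and two workers are available: the files may
run concurrently, but the tests inside each file run one after the other on the same worker.
```bash
# Hypothetical selection of two test files with two xdist workers:
# all tests from test_storage_foo/test.py land on one worker and all tests from
# test_storage_bar/test.py on the other, so the two files run in parallel while
# the tests inside each file stay sequential.
$ ./runner 'test_storage_foo/test.py test_storage_bar/test.py -- -n 2'
```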
If a test supports parallel and repeated execution, you can run many instances of it in parallel to
look for flakiness. We use [pytest-repeat](https://pypi.org/project/pytest-repeat/) to set the
number of times a test should be executed through the `--count` argument, while `-n` sets the number
of parallel workers for `pytest-xdist`:
```bash
$ export CLICKHOUSE_TESTS_BASE_CONFIG_DIR=$HOME/ClickHouse/programs/server/
$ export CLICKHOUSE_TESTS_SERVER_BIN_PATH=$HOME/ClickHouse/programs/clickhouse
$ export CLICKHOUSE_TESTS_ODBC_BRIDGE_BIN_PATH=$HOME/ClickHouse/programs/clickhouse-odbc-bridge
$ ./runner 'test_storage_s3_queue/test.py::test_max_set_age -- --count 10 -n 5'
Start tests
=============================================================================== test session starts ================================================================================
platform linux -- Python 3.10.12, pytest-7.4.4, pluggy-1.5.0 -- /usr/bin/python3
cachedir: .pytest_cache
rootdir: /ClickHouse/tests/integration
configfile: pytest.ini
plugins: reportlog-0.4.0, xdist-3.5.0, random-0.2, repeat-0.9.3, order-1.0.0, timeout-2.2.0
timeout: 900.0s
timeout method: signal
timeout func_only: False
5 workers [10 items]
scheduling tests via LoadScheduling
test_storage_s3_queue/test.py::test_max_set_age[9-10]
test_storage_s3_queue/test.py::test_max_set_age[7-10]
test_storage_s3_queue/test.py::test_max_set_age[5-10]
test_storage_s3_queue/test.py::test_max_set_age[1-10]
test_storage_s3_queue/test.py::test_max_set_age[3-10]
[gw3] [ 10%] PASSED test_storage_s3_queue/test.py::test_max_set_age[7-10]
test_storage_s3_queue/test.py::test_max_set_age[8-10]
[gw4] [ 20%] PASSED test_storage_s3_queue/test.py::test_max_set_age[9-10]
test_storage_s3_queue/test.py::test_max_set_age[10-10]
[gw0] [ 30%] PASSED test_storage_s3_queue/test.py::test_max_set_age[1-10]
test_storage_s3_queue/test.py::test_max_set_age[2-10]
[gw1] [ 40%] PASSED test_storage_s3_queue/test.py::test_max_set_age[3-10]
test_storage_s3_queue/test.py::test_max_set_age[4-10]
[gw2] [ 50%] PASSED test_storage_s3_queue/test.py::test_max_set_age[5-10]
test_storage_s3_queue/test.py::test_max_set_age[6-10]
[gw3] [ 60%] PASSED test_storage_s3_queue/test.py::test_max_set_age[8-10]
[gw4] [ 70%] PASSED test_storage_s3_queue/test.py::test_max_set_age[10-10]
[gw0] [ 80%] PASSED test_storage_s3_queue/test.py::test_max_set_age[2-10]
[gw1] [ 90%] PASSED test_storage_s3_queue/test.py::test_max_set_age[4-10]
[gw2] [100%] PASSED test_storage_s3_queue/test.py::test_max_set_age[6-10]
========================================================================== 10 passed in 120.65s (0:02:00) ==========================================================================
```
### Rebuilding the Docker containers
The main container used for integration tests lives in `docker/test/integration/base/Dockerfile`. Rebuild it with
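a command along these lines (the `clickhouse/integration-test` tag is an assumption here; check how
CI builds this Dockerfile for the exact tag it uses):
```bash
# Assumed image tag; verify against the CI configuration before relying on it.
docker build -t clickhouse/integration-test docker/test/integration/base
```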
@@ -149,7 +207,7 @@ will automagically detect the types of variables and only the small diff of two files.
If tests are failing for mysterious reasons, this may help:
```
```bash
sudo service docker stop
sudo bash -c 'rm -rf /var/lib/docker/*'
sudo service docker start
@@ -159,6 +217,6 @@ sudo service docker start
On Ubuntu 20.10 and later in host network mode (the default), one may encounter a problem with nested containers not seeing each other. This happens because legacy iptables and nftables rules are out of sync. The problem can be solved by:
```
```bash
sudo iptables -P FORWARD ACCEPT
```

View File

@@ -435,6 +435,11 @@ class ClickHouseCluster:
# docker-compose removes everything non-alphanumeric from project names so we do it too.
self.project_name = re.sub(r"[^a-z0-9]", "", project_name.lower())
self.instances_dir_name = get_instances_dir(self.name)
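# Make the docker-compose project name and the instances dir unique per
# pytest-xdist worker so that parallel workers do not clash with each other.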
xdist_worker = os.getenv("PYTEST_XDIST_WORKER")
if xdist_worker:
self.project_name += f"_{xdist_worker}"
self.instances_dir_name += f"_{xdist_worker}"
self.instances_dir = p.join(self.base_dir, self.instances_dir_name)
self.docker_logs_path = p.join(self.instances_dir, "docker.log")
self.env_file = p.join(self.instances_dir, DEFAULT_ENV_NAME)

View File

@@ -7,6 +7,7 @@ import pytest
from helpers.client import QueryRuntimeException
from helpers.cluster import ClickHouseCluster, ClickHouseInstance
import json
from uuid import uuid4
AVAILABLE_MODES = ["unordered", "ordered"]
@@ -822,11 +823,11 @@ def test_multiple_tables_streaming_sync_distributed(started_cluster, mode):
def test_max_set_age(started_cluster):
node = started_cluster.instances["instance"]
table_name = f"max_set_age"
table_name = "max_set_age"
dst_table_name = f"{table_name}_dst"
keeper_path = f"/clickhouse/test_{table_name}"
files_path = f"{table_name}_data"
max_age = 10
max_age = 20
files_to_generate = 10
create_table(
@@ -847,11 +848,9 @@ )
)
create_mv(node, table_name, dst_table_name)
total_values = generate_random_files(
started_cluster, files_path, files_to_generate, row_num=1
)
_ = generate_random_files(started_cluster, files_path, files_to_generate, row_num=1)
expected_rows = 10
expected_rows = files_to_generate
node.wait_for_log_line("Checking node limits")
node.wait_for_log_line("Node limits check finished")
@@ -859,25 +858,24 @@
def get_count():
return int(node.query(f"SELECT count() FROM {dst_table_name}"))
for _ in range(20):
if expected_rows == get_count():
break
time.sleep(1)
def wait_for_condition(check_function, max_wait_time=1.5 * max_age):
before = time.time()
while time.time() - before < max_wait_time:
if check_function():
return
time.sleep(0.25)
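# Give up: the condition was not satisfied within max_wait_time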
assert False
assert expected_rows == get_count()
assert 10 == int(node.query(f"SELECT uniq(_path) from {dst_table_name}"))
wait_for_condition(lambda: get_count() == expected_rows)
assert files_to_generate == int(
node.query(f"SELECT uniq(_path) from {dst_table_name}")
)
time.sleep(max_age + 5)
expected_rows = 20
for _ in range(20):
if expected_rows == get_count():
break
time.sleep(1)
assert expected_rows == get_count()
assert 10 == int(node.query(f"SELECT uniq(_path) from {dst_table_name}"))
expected_rows *= 2
wait_for_condition(lambda: get_count() == expected_rows)
assert files_to_generate == int(
node.query(f"SELECT uniq(_path) from {dst_table_name}")
)
paths_count = [
int(x)
@@ -885,15 +883,18 @@ def test_max_set_age(started_cluster):
f"SELECT count() from {dst_table_name} GROUP BY _path"
).splitlines()
]
assert 10 == len(paths_count)
assert files_to_generate == len(paths_count)
for path_count in paths_count:
assert 2 == path_count
failed_count = int(
node.query(
"SELECT value FROM system.events WHERE name = 'ObjectStorageQueueFailedFiles' SETTINGS system_events_show_zero_values=1"
def get_object_storage_failures():
return int(
node.query(
"SELECT value FROM system.events WHERE name = 'ObjectStorageQueueFailedFiles' SETTINGS system_events_show_zero_values=1"
)
)
)
failed_count = get_object_storage_failures()
values = [
["failed", 1, 1],
@@ -901,53 +902,33 @@
values_csv = (
"\n".join((",".join(map(str, row)) for row in values)) + "\n"
).encode()
put_s3_file_content(started_cluster, f"{files_path}/fff.csv", values_csv)
for _ in range(30):
if failed_count + 1 == int(
node.query(
"SELECT value FROM system.events WHERE name = 'ObjectStorageQueueFailedFiles' SETTINGS system_events_show_zero_values=1"
)
):
break
time.sleep(1)
# use a different filename for each test to allow running a bunch of them sequentially with --count
file_with_error = f"max_set_age_fail_{uuid4().hex[:8]}.csv"
put_s3_file_content(started_cluster, f"{files_path}/{file_with_error}", values_csv)
assert failed_count + 1 == int(
node.query(
"SELECT value FROM system.events WHERE name = 'ObjectStorageQueueFailedFiles' SETTINGS system_events_show_zero_values=1"
)
)
wait_for_condition(lambda: failed_count + 1 == get_object_storage_failures())
node.query("SYSTEM FLUSH LOGS")
assert "Cannot parse input" in node.query(
"SELECT exception FROM system.s3queue WHERE file_name ilike '%fff.csv'"
f"SELECT exception FROM system.s3queue WHERE file_name ilike '%{file_with_error}'"
)
assert 1 == int(
node.query(
"SELECT count() FROM system.s3queue_log WHERE file_name ilike '%fff.csv'"
)
)
assert 1 == int(
node.query(
"SELECT count() FROM system.s3queue_log WHERE file_name ilike '%fff.csv' AND notEmpty(exception)"
f"SELECT count() FROM system.s3queue_log WHERE file_name ilike '%{file_with_error}' AND notEmpty(exception)"
)
)
time.sleep(max_age + 1)
assert failed_count + 2 == int(
node.query(
"SELECT value FROM system.events WHERE name = 'ObjectStorageQueueFailedFiles'"
)
)
wait_for_condition(lambda: failed_count + 2 == get_object_storage_failures())
node.query("SYSTEM FLUSH LOGS")
assert "Cannot parse input" in node.query(
"SELECT exception FROM system.s3queue WHERE file_name ilike '%fff.csv' ORDER BY processing_end_time DESC LIMIT 1"
f"SELECT exception FROM system.s3queue WHERE file_name ilike '%{file_with_error}' ORDER BY processing_end_time DESC LIMIT 1"
)
assert 1 < int(
node.query(
"SELECT count() FROM system.s3queue_log WHERE file_name ilike '%fff.csv' AND notEmpty(exception)"
f"SELECT count() FROM system.s3queue_log WHERE file_name ilike '%{file_with_error}' AND notEmpty(exception)"
)
)

View File

@@ -1,4 +1,4 @@
NOT_LOADED
0 NOT_LOADED
0 LOADED
10
1 LOADED

View File

@@ -12,7 +12,7 @@ LAYOUT(FLAT());
DETACH DATABASE {CLICKHOUSE_DATABASE:Identifier};
ATTACH DATABASE {CLICKHOUSE_DATABASE:Identifier};
SELECT COALESCE((SELECT status FROM system.dictionaries WHERE database = currentDatabase() AND name = 'dict')::Nullable(String), 'NOT_LOADED');
SELECT query_count, status FROM system.dictionaries WHERE database = currentDatabase() AND name = 'dict';
SYSTEM RELOAD DICTIONARY dict;
SELECT query_count, status FROM system.dictionaries WHERE database = currentDatabase() AND name = 'dict';
SELECT dictGetUInt64('dict', 'val', toUInt64(0));