Merge branch 'master' into tighten-limits-functional-tests

Alexey Milovidov 2024-07-31 22:44:44 +02:00
commit ec666aa113
6 changed files with 113 additions and 77 deletions

View File

@@ -361,18 +361,10 @@ BlockIO InterpreterCreateQuery::createDatabase(ASTCreateQuery & create)
TablesLoader loader{getContext()->getGlobalContext(), {{database_name, database}}, mode};
auto load_tasks = loader.loadTablesAsync();
auto startup_tasks = loader.startupTablesAsync();
if (getContext()->getGlobalContext()->getServerSettings().async_load_databases)
{
scheduleLoad(load_tasks);
scheduleLoad(startup_tasks);
}
else
{
/// First prioritize, schedule and wait all the load table tasks
waitLoad(currentPoolOr(TablesLoaderForegroundPoolId), load_tasks);
/// Only then prioritize, schedule and wait all the startup tasks
waitLoad(currentPoolOr(TablesLoaderForegroundPoolId), startup_tasks);
}
/// First prioritize, schedule and wait all the load table tasks
waitLoad(currentPoolOr(TablesLoaderForegroundPoolId), load_tasks);
/// Only then prioritize, schedule and wait all the startup tasks
waitLoad(currentPoolOr(TablesLoaderForegroundPoolId), startup_tasks);
}
}
catch (...)

View File

@@ -16,7 +16,7 @@ Don't use Docker from your system repository.
* [py.test](https://docs.pytest.org/) testing framework. To install: `sudo -H pip install pytest`
* [docker-compose](https://docs.docker.com/compose/) and additional python libraries. To install:
```
```bash
sudo -H pip install \
PyMySQL \
avro \
@@ -78,7 +78,7 @@ Notes:
* Some tests may require a lot of resources (CPU, RAM, etc.). It is better not to run large tests like `test_distributed_ddl*` on your laptop.
You can run tests via the `./runner` script and pass pytest arguments as the last argument:
```
```bash
$ ./runner --binary $HOME/ClickHouse/programs/clickhouse --odbc-bridge-binary $HOME/ClickHouse/programs/clickhouse-odbc-bridge --base-configs-dir $HOME/ClickHouse/programs/server/ 'test_ssl_cert_authentication -ss'
Start tests
====================================================================================================== test session starts ======================================================================================================
@@ -102,7 +102,7 @@ test_ssl_cert_authentication/test.py::test_create_user PASSED
```
Paths to the binary and configs may be specified via env variables:
```
```bash
$ export CLICKHOUSE_TESTS_BASE_CONFIG_DIR=$HOME/ClickHouse/programs/server/
$ export CLICKHOUSE_TESTS_SERVER_BIN_PATH=$HOME/ClickHouse/programs/clickhouse
$ export CLICKHOUSE_TESTS_ODBC_BRIDGE_BIN_PATH=$HOME/ClickHouse/programs/clickhouse-odbc-bridge
@@ -121,6 +121,64 @@ test_odbc_interaction/test.py ...... [100%]
You can just open a shell inside a container by overwriting the command:
./runner --command=bash
### Parallel test execution
On the CI, we run a number of parallel runners (5 at the time of this writing), each in its own
Docker container. These runner containers spawn additional containers for each test for the
services it needs, such as ZooKeeper, MySQL, PostgreSQL and MinIO, among others. This means that
tests do not share any services among them. Within each runner, tests are parallelized using
[pytest-xdist](https://pytest-xdist.readthedocs.io/en/stable/). We use `--dist=loadfile` to
[distribute the load](https://pytest-xdist.readthedocs.io/en/stable/distribution.html). In the
documentation's words: this guarantees that all tests in a file run in the same worker. This means
that tests within the same file never run in parallel; they are executed on the same worker one
after the other.
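As a purely illustrative sketch (the test file names below are hypothetical), this is what
`--dist=loadfile` implies when two files are selected and two workers are available: the files may
run concurrently, but the tests inside each file run one after the other on the same worker.
```bash
# Hypothetical selection of two test files with two xdist workers:
# all tests from test_storage_foo/test.py land on one worker and all tests from
# test_storage_bar/test.py on the other, so the two files run in parallel while
# the tests inside each file stay sequential.
$ ./runner 'test_storage_foo/test.py test_storage_bar/test.py -- -n 2'
```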
If a test supports parallel and repeated execution, you can run many instances of it in parallel to
look for flakiness. We use [pytest-repeat](https://pypi.org/project/pytest-repeat/) to set the
number of times a test should be executed through the `--count` argument, while `-n` sets the number
of parallel workers for `pytest-xdist`:
```bash
$ export CLICKHOUSE_TESTS_BASE_CONFIG_DIR=$HOME/ClickHouse/programs/server/
$ export CLICKHOUSE_TESTS_SERVER_BIN_PATH=$HOME/ClickHouse/programs/clickhouse
$ export CLICKHOUSE_TESTS_ODBC_BRIDGE_BIN_PATH=$HOME/ClickHouse/programs/clickhouse-odbc-bridge
$ ./runner 'test_storage_s3_queue/test.py::test_max_set_age -- --count 10 -n 5'
Start tests
=============================================================================== test session starts ================================================================================
platform linux -- Python 3.10.12, pytest-7.4.4, pluggy-1.5.0 -- /usr/bin/python3
cachedir: .pytest_cache
rootdir: /ClickHouse/tests/integration
configfile: pytest.ini
plugins: reportlog-0.4.0, xdist-3.5.0, random-0.2, repeat-0.9.3, order-1.0.0, timeout-2.2.0
timeout: 900.0s
timeout method: signal
timeout func_only: False
5 workers [10 items]
scheduling tests via LoadScheduling
test_storage_s3_queue/test.py::test_max_set_age[9-10]
test_storage_s3_queue/test.py::test_max_set_age[7-10]
test_storage_s3_queue/test.py::test_max_set_age[5-10]
test_storage_s3_queue/test.py::test_max_set_age[1-10]
test_storage_s3_queue/test.py::test_max_set_age[3-10]
[gw3] [ 10%] PASSED test_storage_s3_queue/test.py::test_max_set_age[7-10]
test_storage_s3_queue/test.py::test_max_set_age[8-10]
[gw4] [ 20%] PASSED test_storage_s3_queue/test.py::test_max_set_age[9-10]
test_storage_s3_queue/test.py::test_max_set_age[10-10]
[gw0] [ 30%] PASSED test_storage_s3_queue/test.py::test_max_set_age[1-10]
test_storage_s3_queue/test.py::test_max_set_age[2-10]
[gw1] [ 40%] PASSED test_storage_s3_queue/test.py::test_max_set_age[3-10]
test_storage_s3_queue/test.py::test_max_set_age[4-10]
[gw2] [ 50%] PASSED test_storage_s3_queue/test.py::test_max_set_age[5-10]
test_storage_s3_queue/test.py::test_max_set_age[6-10]
[gw3] [ 60%] PASSED test_storage_s3_queue/test.py::test_max_set_age[8-10]
[gw4] [ 70%] PASSED test_storage_s3_queue/test.py::test_max_set_age[10-10]
[gw0] [ 80%] PASSED test_storage_s3_queue/test.py::test_max_set_age[2-10]
[gw1] [ 90%] PASSED test_storage_s3_queue/test.py::test_max_set_age[4-10]
[gw2] [100%] PASSED test_storage_s3_queue/test.py::test_max_set_age[6-10]
========================================================================== 10 passed in 120.65s (0:02:00) ==========================================================================
```
### Rebuilding the Docker containers
The main container used for integration tests lives in `docker/test/integration/base/Dockerfile`. Rebuild it with
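a command along these lines (the `clickhouse/integration-test` tag is an assumption here; check how
CI builds this Dockerfile for the exact tag it uses):
```bash
# Assumed image tag; verify against the CI configuration before relying on it.
docker build -t clickhouse/integration-test docker/test/integration/base
```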
@@ -149,7 +207,7 @@ will automagically detect the types of variables and only the small diff of two files.
If tests are failing for mysterious reasons, this may help:
```
```bash
sudo service docker stop
sudo bash -c 'rm -rf /var/lib/docker/*'
sudo service docker start
@@ -159,6 +217,6 @@ sudo service docker start
On Ubuntu 20.10 and later in host network mode (the default), one may encounter a problem with nested containers not seeing each other. This happens because legacy iptables and nftables rules are out of sync. The problem can be solved by:
```
```bash
sudo iptables -P FORWARD ACCEPT
```

View File

@@ -435,6 +435,11 @@ class ClickHouseCluster:
# docker-compose removes everything non-alphanumeric from project names so we do it too.
self.project_name = re.sub(r"[^a-z0-9]", "", project_name.lower())
self.instances_dir_name = get_instances_dir(self.name)
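# Make the docker-compose project name and the instances dir unique per
# pytest-xdist worker so that parallel workers do not clash with each other.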
xdist_worker = os.getenv("PYTEST_XDIST_WORKER")
if xdist_worker:
self.project_name += f"_{xdist_worker}"
self.instances_dir_name += f"_{xdist_worker}"
self.instances_dir = p.join(self.base_dir, self.instances_dir_name)
self.docker_logs_path = p.join(self.instances_dir, "docker.log")
self.env_file = p.join(self.instances_dir, DEFAULT_ENV_NAME)

View File

@@ -7,6 +7,7 @@ import pytest
from helpers.client import QueryRuntimeException
from helpers.cluster import ClickHouseCluster, ClickHouseInstance
import json
from uuid import uuid4
AVAILABLE_MODES = ["unordered", "ordered"]
@@ -822,11 +823,11 @@ def test_multiple_tables_streaming_sync_distributed(started_cluster, mode):
def test_max_set_age(started_cluster):
node = started_cluster.instances["instance"]
table_name = f"max_set_age"
table_name = "max_set_age"
dst_table_name = f"{table_name}_dst"
keeper_path = f"/clickhouse/test_{table_name}"
files_path = f"{table_name}_data"
max_age = 10
max_age = 20
files_to_generate = 10
create_table(
@@ -847,11 +848,9 @@ )
)
create_mv(node, table_name, dst_table_name)
total_values = generate_random_files(
started_cluster, files_path, files_to_generate, row_num=1
)
_ = generate_random_files(started_cluster, files_path, files_to_generate, row_num=1)
expected_rows = 10
expected_rows = files_to_generate
node.wait_for_log_line("Checking node limits")
node.wait_for_log_line("Node limits check finished")
@@ -859,25 +858,24 @@
def get_count():
return int(node.query(f"SELECT count() FROM {dst_table_name}"))
for _ in range(20):
if expected_rows == get_count():
break
time.sleep(1)
def wait_for_condition(check_function, max_wait_time=1.5 * max_age):
before = time.time()
while time.time() - before < max_wait_time:
if check_function():
return
time.sleep(0.25)
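# Give up: the condition was not satisfied within max_wait_time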
assert False
assert expected_rows == get_count()
assert 10 == int(node.query(f"SELECT uniq(_path) from {dst_table_name}"))
wait_for_condition(lambda: get_count() == expected_rows)
assert files_to_generate == int(
node.query(f"SELECT uniq(_path) from {dst_table_name}")
)
time.sleep(max_age + 5)
expected_rows = 20
for _ in range(20):
if expected_rows == get_count():
break
time.sleep(1)
assert expected_rows == get_count()
assert 10 == int(node.query(f"SELECT uniq(_path) from {dst_table_name}"))
expected_rows *= 2
wait_for_condition(lambda: get_count() == expected_rows)
assert files_to_generate == int(
node.query(f"SELECT uniq(_path) from {dst_table_name}")
)
paths_count = [
int(x)
@@ -885,15 +883,18 @@ def test_max_set_age(started_cluster):
f"SELECT count() from {dst_table_name} GROUP BY _path"
).splitlines()
]
assert 10 == len(paths_count)
assert files_to_generate == len(paths_count)
for path_count in paths_count:
assert 2 == path_count
failed_count = int(
node.query(
"SELECT value FROM system.events WHERE name = 'ObjectStorageQueueFailedFiles' SETTINGS system_events_show_zero_values=1"
def get_object_storage_failures():
return int(
node.query(
"SELECT value FROM system.events WHERE name = 'ObjectStorageQueueFailedFiles' SETTINGS system_events_show_zero_values=1"
)
)
)
failed_count = get_object_storage_failures()
values = [
["failed", 1, 1],
@@ -901,53 +902,33 @@
values_csv = (
"\n".join((",".join(map(str, row)) for row in values)) + "\n"
).encode()
put_s3_file_content(started_cluster, f"{files_path}/fff.csv", values_csv)
for _ in range(30):
if failed_count + 1 == int(
node.query(
"SELECT value FROM system.events WHERE name = 'ObjectStorageQueueFailedFiles' SETTINGS system_events_show_zero_values=1"
)
):
break
time.sleep(1)
# use a different filename for each test to allow running a bunch of them sequentially with --count
file_with_error = f"max_set_age_fail_{uuid4().hex[:8]}.csv"
put_s3_file_content(started_cluster, f"{files_path}/{file_with_error}", values_csv)
assert failed_count + 1 == int(
node.query(
"SELECT value FROM system.events WHERE name = 'ObjectStorageQueueFailedFiles' SETTINGS system_events_show_zero_values=1"
)
)
wait_for_condition(lambda: failed_count + 1 == get_object_storage_failures())
node.query("SYSTEM FLUSH LOGS")
assert "Cannot parse input" in node.query(
"SELECT exception FROM system.s3queue WHERE file_name ilike '%fff.csv'"
f"SELECT exception FROM system.s3queue WHERE file_name ilike '%{file_with_error}'"
)
assert 1 == int(
node.query(
"SELECT count() FROM system.s3queue_log WHERE file_name ilike '%fff.csv'"
)
)
assert 1 == int(
node.query(
"SELECT count() FROM system.s3queue_log WHERE file_name ilike '%fff.csv' AND notEmpty(exception)"
f"SELECT count() FROM system.s3queue_log WHERE file_name ilike '%{file_with_error}' AND notEmpty(exception)"
)
)
time.sleep(max_age + 1)
assert failed_count + 2 == int(
node.query(
"SELECT value FROM system.events WHERE name = 'ObjectStorageQueueFailedFiles'"
)
)
wait_for_condition(lambda: failed_count + 2 == get_object_storage_failures())
node.query("SYSTEM FLUSH LOGS")
assert "Cannot parse input" in node.query(
"SELECT exception FROM system.s3queue WHERE file_name ilike '%fff.csv' ORDER BY processing_end_time DESC LIMIT 1"
f"SELECT exception FROM system.s3queue WHERE file_name ilike '%{file_with_error}' ORDER BY processing_end_time DESC LIMIT 1"
)
assert 1 < int(
node.query(
"SELECT count() FROM system.s3queue_log WHERE file_name ilike '%fff.csv' AND notEmpty(exception)"
f"SELECT count() FROM system.s3queue_log WHERE file_name ilike '%{file_with_error}' AND notEmpty(exception)"
)
)

View File

@@ -1,4 +1,4 @@
NOT_LOADED
0 NOT_LOADED
0 LOADED
10
1 LOADED

View File

@@ -12,7 +12,7 @@ LAYOUT(FLAT());
DETACH DATABASE {CLICKHOUSE_DATABASE:Identifier};
ATTACH DATABASE {CLICKHOUSE_DATABASE:Identifier};
SELECT COALESCE((SELECT status FROM system.dictionaries WHERE database = currentDatabase() AND name = 'dict')::Nullable(String), 'NOT_LOADED');
SELECT query_count, status FROM system.dictionaries WHERE database = currentDatabase() AND name = 'dict';
SYSTEM RELOAD DICTIONARY dict;
SELECT query_count, status FROM system.dictionaries WHERE database = currentDatabase() AND name = 'dict';
SELECT dictGetUInt64('dict', 'val', toUInt64(0));