add docs and test

serxa 2024-10-24 15:57:14 +00:00
parent 3ee2b186c6
commit 52091f4ff8
3 changed files with 75 additions and 19 deletions

View File

@@ -1975,6 +1975,22 @@ The default is `false`.
 <async_load_databases>true</async_load_databases>
 ```
 
+## async_load_system_database {#async_load_system_database}
+
+Asynchronous loading of system tables. Helpful if there is a high number of log tables and parts in the `system` database. Independent of the `async_load_databases` setting.
+
+If `true`, all system databases with the `Ordinary`, `Atomic`, and `Replicated` engines will be loaded asynchronously after the ClickHouse server starts up. See the `system.asynchronous_loader` table and the `tables_loader_background_pool_size` and `tables_loader_foreground_pool_size` server settings. Any query that tries to access a system table that is not yet loaded will wait for exactly this table to be started up; a table that at least one query is waiting for is loaded with a higher priority. Consider also setting `max_waiting_queries` to limit the total number of waiting queries.
+
+If `false`, the system database is loaded before the server starts.
+
+The default is `false`.
+
+**Example**
+
+``` xml
+<async_load_system_database>true</async_load_system_database>
+```
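The waiting and priority behavior described above can be observed through the `system.asynchronous_loader` table. A minimal sketch in the style of the integration test added later in this commit, assuming a test-harness `node` object (column names follow the `system.asynchronous_loader` documentation; none of this is part of the commit itself):

``` python
# Sketch: after a restart with async_load_system_database enabled, list the
# load jobs that are still queued; 'PENDING' jobs are tables not yet loaded.
pending_jobs = node.query(
    "SELECT job, pool, status FROM system.asynchronous_loader WHERE status = 'PENDING'"
)
print(pending_jobs)
```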
 ## tables_loader_foreground_pool_size {#tables_loader_foreground_pool_size}
 
 Sets the number of threads performing load jobs in the foreground pool. The foreground pool is used for loading tables synchronously before the server starts listening on a port, and for loading tables that are being waited for. The foreground pool has a higher priority than the background pool: no job starts in the background pool while there are jobs running in the foreground pool.
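The configured pool sizes can be read back at runtime; a hedged sketch, assuming the `system.server_settings` table is available and reusing the test-harness `node` object from the sketch above:

``` python
# Sketch: inspect the effective loader pool sizes on a running server.
pool_sizes = node.query(
    "SELECT name, value FROM system.server_settings WHERE name LIKE 'tables_loader%pool_size'"
)
print(pool_sizes)
```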
@@ -3109,7 +3125,7 @@ By default, tunneling (i.e. `HTTP CONNECT`) is used to make `HTTPS` requests over
 ### no_proxy
 
 By default, all requests will go through the proxy. In order to disable it for specific hosts, the `no_proxy` variable must be set.
 It can be set inside the `<proxy>` clause for list and remote resolvers, and as an environment variable for the environment resolver.
 It supports IP addresses, domains, subdomains, and the `'*'` wildcard for a full bypass. Leading dots are stripped, just as curl does.
 
 Example:
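The example itself lies outside this hunk. As a hedged illustration of the environment-variable form described above (host names and paths are placeholders):

``` python
import os
import subprocess

# Sketch: the environment proxy resolver reads the standard proxy variables
# from the server process environment; leading dots in no_proxy entries are
# stripped, curl-style.
env = dict(os.environ)
env["https_proxy"] = "http://proxy.example.com:3128"
env["no_proxy"] = "clickhouse.cloud,.net,example.com"
subprocess.run(
    ["clickhouse-server", "--config-file=/etc/clickhouse-server/config.xml"], env=env
)
```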

View File

@@ -0,0 +1,3 @@
+<clickhouse>
+    <async_load_system_database>true</async_load_system_database>
+</clickhouse>

View File

@@ -1,4 +1,5 @@
 import random
+import time
 
 import pytest
@@ -13,25 +14,35 @@ DICTIONARY_FILES = [
 ]
 
 cluster = ClickHouseCluster(__file__)
-instance = cluster.add_instance(
-    "instance",
+node1 = cluster.add_instance(
+    "node1",
     main_configs=["configs/config.xml"],
     dictionaries=DICTIONARY_FILES,
     stay_alive=True,
 )
+node2 = cluster.add_instance(
+    "node2",
+    main_configs=[
+        "configs/async_load_system_database.xml",
+    ],
+    dictionaries=DICTIONARY_FILES,
+    stay_alive=True,
+)
 @pytest.fixture(scope="module")
 def started_cluster():
     try:
         cluster.start()
-        instance.query(
-            """
-            CREATE DATABASE IF NOT EXISTS dict ENGINE=Dictionary;
-            CREATE DATABASE IF NOT EXISTS test;
-            """
-        )
+        for node in [node1, node2]:
+            node.query(
+                """
+                CREATE DATABASE IF NOT EXISTS dict ENGINE=Dictionary;
+                CREATE DATABASE IF NOT EXISTS test;
+                """
+            )
 
         yield cluster
@@ -40,13 +51,13 @@ def started_cluster():
 
 def get_status(dictionary_name):
-    return instance.query(
+    return node1.query(
         "SELECT status FROM system.dictionaries WHERE name='" + dictionary_name + "'"
     ).rstrip("\n")
 
 
 def test_dict_get_data(started_cluster):
-    query = instance.query
+    query = node1.query
     query(
         "CREATE TABLE test.elements (id UInt64, a String, b Int32, c Float64) ENGINE=Log;"
@@ -80,7 +91,7 @@ def test_dict_get_data(started_cluster):
     # Wait for dictionaries to be reloaded.
     assert_eq_with_retry(
-        instance,
+        node1,
         "SELECT dictHas('dep_x', toUInt64(3))",
         "1",
         sleep_time=2,
@@ -94,7 +105,7 @@ def test_dict_get_data(started_cluster):
     # so dep_x and dep_z are not going to be updated after the following INSERT.
     query("INSERT INTO test.elements VALUES (4, 'ether', 404, 0.001)")
     assert_eq_with_retry(
-        instance,
+        node1,
         "SELECT dictHas('dep_y', toUInt64(4))",
         "1",
         sleep_time=2,
@@ -104,11 +115,11 @@ def test_dict_get_data(started_cluster):
     assert query("SELECT dictGetString('dep_y', 'a', toUInt64(4))") == "ether\n"
     assert query("SELECT dictGetString('dep_z', 'a', toUInt64(4))") == "ZZ\n"
     query("DROP TABLE IF EXISTS test.elements;")
-    instance.restart_clickhouse()
+    node1.restart_clickhouse()
 
 
 def dependent_tables_assert():
-    res = instance.query("select database || '.' || name from system.tables")
+    res = node1.query("select database || '.' || name from system.tables")
     assert "system.join" in res
     assert "default.src" in res
     assert "dict.dep_y" in res
@@ -119,7 +130,7 @@ def dependent_tables_assert():
 
 def test_dependent_tables(started_cluster):
-    query = instance.query
+    query = node1.query
     query("create database lazy engine=Lazy(10)")
     query("create database a")
     query("create table lazy.src (n int, m int) engine=Log")
@@ -157,7 +168,7 @@ def test_dependent_tables(started_cluster):
     )
     dependent_tables_assert()
-    instance.restart_clickhouse()
+    node1.restart_clickhouse()
     dependent_tables_assert()
     query("drop table a.t")
     query("drop table lazy.log")
@@ -170,14 +181,14 @@ def test_dependent_tables(started_cluster):
 
 def test_multiple_tables(started_cluster):
-    query = instance.query
+    query = node1.query
     tables_count = 20
     for i in range(tables_count):
         query(
             f"create table test.table_{i} (n UInt64, s String) engine=MergeTree order by n as select number, randomString(100) from numbers(100)"
         )
 
-    instance.restart_clickhouse()
+    node1.restart_clickhouse()
 
     order = [i for i in range(tables_count)]
     random.shuffle(order)
@@ -185,3 +196,29 @@ def test_multiple_tables(started_cluster):
         assert query(f"select count() from test.table_{i}") == "100\n"
     for i in range(tables_count):
         query(f"drop table test.table_{i} sync")
+
+
+def test_async_load_system_database(started_cluster):
+    id = 1
+    for i in range(4):
+        # Access some system tables that might still be loading
+        if id > 1:
+            for j in range(3):
+                node2.query(f"select count() from system.text_log_{random.randint(1, id - 1)}")
+                node2.query(f"select count() from system.query_log_{random.randint(1, id - 1)}")
+
+        # Generate more system tables
+        for j in range(30):
+            while True:
+                count = int(
+                    node2.query("select count() from system.tables where database = 'system' and name in ['query_log', 'text_log']")
+                )
+                if count == 2:
+                    break
+                time.sleep(0.1)
+            node2.query(f"rename table system.text_log to system.text_log_{id}")
+            node2.query(f"rename table system.query_log to system.query_log_{id}")
+            id += 1
+
+        # Trigger async load of the system database
+        node2.restart_clickhouse()
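A possible follow-up check, not part of this commit: right after the final restart, the async loader queue could be inspected to confirm that system tables are being loaded in the background (a sketch; whether unfinished jobs are still listed in `system.asynchronous_loader` at that exact moment is an assumption to verify):

``` python
# Sketch: count load jobs still queued immediately after the restart.
pending = int(
    node2.query(
        "select count() from system.asynchronous_loader where status = 'PENDING'"
    )
)
print(f"{pending} load jobs still pending after restart")
```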