add docs and test

serxa 2024-10-24 15:57:14 +00:00
parent 3ee2b186c6
commit 52091f4ff8
3 changed files with 75 additions and 19 deletions

View File

@@ -1975,6 +1975,22 @@ The default is `false`.
 <async_load_databases>true</async_load_databases>
 ```
 
+## async_load_system_database {#async_load_system_database}
+
+Asynchronous loading of system tables. Helpful if there is a high number of log tables and parts in the `system` database. Independent of the `async_load_databases` setting.
+
+If `true`, all system databases with the `Ordinary`, `Atomic`, and `Replicated` engines will be loaded asynchronously after the ClickHouse server starts up. See the `system.asynchronous_loader` table and the `tables_loader_background_pool_size` and `tables_loader_foreground_pool_size` server settings. Any query that tries to access a system table that is not yet loaded will wait for exactly this table to be started up; a table that at least one query is waiting for is loaded with a higher priority. Consider also setting `max_waiting_queries` to limit the total number of waiting queries.
+
+If `false`, the system database is loaded before the server starts.
+
+The default is `false`.
+
+**Example**
+
+``` xml
+<async_load_system_database>true</async_load_system_database>
+```
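The waiting and priority behavior described above can be observed through the `system.asynchronous_loader` table. A minimal sketch in the style of the integration test added later in this commit, assuming a test-harness `node` object (column names follow the `system.asynchronous_loader` documentation; none of this is part of the commit itself):

``` python
# Sketch: after a restart with async_load_system_database enabled, list the
# load jobs that are still queued; 'PENDING' jobs are tables not yet loaded.
pending_jobs = node.query(
    "SELECT job, pool, status FROM system.asynchronous_loader WHERE status = 'PENDING'"
)
print(pending_jobs)
```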
 ## tables_loader_foreground_pool_size {#tables_loader_foreground_pool_size}
 
 Sets the number of threads performing load jobs in the foreground pool. The foreground pool is used for loading tables synchronously before the server starts listening on a port, and for loading tables that are being waited for. The foreground pool has a higher priority than the background pool: no job starts in the background pool while there are jobs running in the foreground pool.
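The configured pool sizes can be read back at runtime; a hedged sketch, assuming the `system.server_settings` table is available and reusing the test-harness `node` object from the sketch above:

``` python
# Sketch: inspect the effective loader pool sizes on a running server.
pool_sizes = node.query(
    "SELECT name, value FROM system.server_settings WHERE name LIKE 'tables_loader%pool_size'"
)
print(pool_sizes)
```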
@@ -3109,7 +3125,7 @@ By default, tunneling (i.e. `HTTP CONNECT`) is used to make `HTTPS` requests over
 ### no_proxy
 
 By default, all requests will go through the proxy. In order to disable it for specific hosts, the `no_proxy` variable must be set.
 It can be set inside the `<proxy>` clause for list and remote resolvers, and as an environment variable for the environment resolver.
 It supports IP addresses, domains, subdomains, and the `'*'` wildcard for a full bypass. Leading dots are stripped, just as curl does.
 
 Example:
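The example itself lies outside this hunk. As a hedged illustration of the environment-variable form described above (host names and paths are placeholders):

``` python
import os
import subprocess

# Sketch: the environment proxy resolver reads the standard proxy variables
# from the server process environment; leading dots in no_proxy entries are
# stripped, curl-style.
env = dict(os.environ)
env["https_proxy"] = "http://proxy.example.com:3128"
env["no_proxy"] = "clickhouse.cloud,.net,example.com"
subprocess.run(
    ["clickhouse-server", "--config-file=/etc/clickhouse-server/config.xml"], env=env
)
```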

View File

@@ -0,0 +1,3 @@
+<clickhouse>
+    <async_load_system_database>true</async_load_system_database>
+</clickhouse>

View File

@@ -1,4 +1,5 @@
 import random
+import time
 
 import pytest
@@ -13,25 +14,35 @@ DICTIONARY_FILES = [
 ]
 
 cluster = ClickHouseCluster(__file__)
-instance = cluster.add_instance(
-    "instance",
+node1 = cluster.add_instance(
+    "node1",
     main_configs=["configs/config.xml"],
     dictionaries=DICTIONARY_FILES,
     stay_alive=True,
 )
+node2 = cluster.add_instance(
+    "node2",
+    main_configs=[
+        "configs/async_load_system_database.xml",
+    ],
+    dictionaries=DICTIONARY_FILES,
+    stay_alive=True,
+)
 @pytest.fixture(scope="module")
 def started_cluster():
     try:
         cluster.start()
-        instance.query(
-            """
-            CREATE DATABASE IF NOT EXISTS dict ENGINE=Dictionary;
-            CREATE DATABASE IF NOT EXISTS test;
-            """
-        )
+        for node in [node1, node2]:
+            node.query(
+                """
+                CREATE DATABASE IF NOT EXISTS dict ENGINE=Dictionary;
+                CREATE DATABASE IF NOT EXISTS test;
+                """
+            )
 
         yield cluster
@@ -40,13 +51,13 @@ def started_cluster():
 
 def get_status(dictionary_name):
-    return instance.query(
+    return node1.query(
         "SELECT status FROM system.dictionaries WHERE name='" + dictionary_name + "'"
     ).rstrip("\n")
 
 
 def test_dict_get_data(started_cluster):
-    query = instance.query
+    query = node1.query
     query(
         "CREATE TABLE test.elements (id UInt64, a String, b Int32, c Float64) ENGINE=Log;"
@@ -80,7 +91,7 @@ def test_dict_get_data(started_cluster):
     # Wait for dictionaries to be reloaded.
     assert_eq_with_retry(
-        instance,
+        node1,
         "SELECT dictHas('dep_x', toUInt64(3))",
         "1",
         sleep_time=2,
@@ -94,7 +105,7 @@ def test_dict_get_data(started_cluster):
     # so dep_x and dep_z are not going to be updated after the following INSERT.
     query("INSERT INTO test.elements VALUES (4, 'ether', 404, 0.001)")
     assert_eq_with_retry(
-        instance,
+        node1,
         "SELECT dictHas('dep_y', toUInt64(4))",
         "1",
         sleep_time=2,
@@ -104,11 +115,11 @@ def test_dict_get_data(started_cluster):
     assert query("SELECT dictGetString('dep_y', 'a', toUInt64(4))") == "ether\n"
     assert query("SELECT dictGetString('dep_z', 'a', toUInt64(4))") == "ZZ\n"
     query("DROP TABLE IF EXISTS test.elements;")
-    instance.restart_clickhouse()
+    node1.restart_clickhouse()
 
 
 def dependent_tables_assert():
-    res = instance.query("select database || '.' || name from system.tables")
+    res = node1.query("select database || '.' || name from system.tables")
     assert "system.join" in res
     assert "default.src" in res
     assert "dict.dep_y" in res
@@ -119,7 +130,7 @@ def dependent_tables_assert():
 
 def test_dependent_tables(started_cluster):
-    query = instance.query
+    query = node1.query
     query("create database lazy engine=Lazy(10)")
     query("create database a")
     query("create table lazy.src (n int, m int) engine=Log")
@@ -157,7 +168,7 @@ def test_dependent_tables(started_cluster):
     )
     dependent_tables_assert()
-    instance.restart_clickhouse()
+    node1.restart_clickhouse()
     dependent_tables_assert()
     query("drop table a.t")
     query("drop table lazy.log")
@@ -170,14 +181,14 @@ def test_dependent_tables(started_cluster):
 
 def test_multiple_tables(started_cluster):
-    query = instance.query
+    query = node1.query
     tables_count = 20
     for i in range(tables_count):
         query(
             f"create table test.table_{i} (n UInt64, s String) engine=MergeTree order by n as select number, randomString(100) from numbers(100)"
         )
 
-    instance.restart_clickhouse()
+    node1.restart_clickhouse()
 
     order = [i for i in range(tables_count)]
     random.shuffle(order)
@@ -185,3 +196,29 @@ def test_multiple_tables(started_cluster):
         assert query(f"select count() from test.table_{i}") == "100\n"
     for i in range(tables_count):
         query(f"drop table test.table_{i} sync")
+
+
+def test_async_load_system_database(started_cluster):
+    id = 1
+    for i in range(4):
+        # Access some system tables that might still be loading
+        if id > 1:
+            for j in range(3):
+                node2.query(f"select count() from system.text_log_{random.randint(1, id - 1)}")
+                node2.query(f"select count() from system.query_log_{random.randint(1, id - 1)}")
+
+        # Generate more system tables
+        for j in range(30):
+            while True:
+                count = int(
+                    node2.query("select count() from system.tables where database = 'system' and name in ['query_log', 'text_log']")
+                )
+                if count == 2:
+                    break
+                time.sleep(0.1)
+            node2.query(f"rename table system.text_log to system.text_log_{id}")
+            node2.query(f"rename table system.query_log to system.query_log_{id}")
+            id += 1
+
+        # Trigger async load of the system database
+        node2.restart_clickhouse()
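A possible follow-up check, not part of this commit: right after the final restart, the async loader queue could be inspected to confirm that system tables are being loaded in the background (a sketch; whether unfinished jobs are still listed in `system.asynchronous_loader` at that exact moment is an assumption to verify):

``` python
# Sketch: count load jobs still queued immediately after the restart.
pending = int(
    node2.query(
        "select count() from system.asynchronous_loader where status = 'PENDING'"
    )
)
print(f"{pending} load jobs still pending after restart")
```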