From eef7d8c0bc6ed04597881ccfa83b012025c0341b Mon Sep 17 00:00:00 2001
From: Pavel Kruglov
Date: Wed, 3 Mar 2021 16:10:15 +0300
Subject: [PATCH 001/155] Fix blocking mode and timeouts in SecureStreamSocket

---
 src/IO/ReadBufferFromPocoSocket.cpp  | 23 ++++++++++++-----------
 src/IO/WriteBufferFromPocoSocket.cpp |  8 ++++++++
 2 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/src/IO/ReadBufferFromPocoSocket.cpp b/src/IO/ReadBufferFromPocoSocket.cpp
index 37896a387bb..df8739904ec 100644
--- a/src/IO/ReadBufferFromPocoSocket.cpp
+++ b/src/IO/ReadBufferFromPocoSocket.cpp
@@ -4,6 +4,7 @@
 #include
 #include
 #include
+#include

 namespace ProfileEvents
 {
@@ -27,23 +28,23 @@ bool ReadBufferFromPocoSocket::nextImpl()
     ssize_t bytes_read = 0;
     Stopwatch watch;

-    int flags = 0;
-    if (async_callback)
-        flags |= MSG_DONTWAIT;
-
     /// Add more details to exceptions.
     try
     {
-        bytes_read = socket.impl()->receiveBytes(internal_buffer.begin(), internal_buffer.size(), flags);
-
-        /// If async_callback is specified, and read is blocking, run async_callback and try again later.
+        /// If async_callback is specified, and read will block, run async_callback and try again later.
         /// It is expected that file descriptor may be polled externally.
        /// Note that receive timeout is not checked here. External code should check it while polling.
-        while (bytes_read < 0 && async_callback && errno == EAGAIN)
-        {
+        while (async_callback && !socket.poll(0, Poco::Net::Socket::SELECT_READ))
             async_callback(socket.impl()->sockfd(), socket.getReceiveTimeout(), socket_description);
-            bytes_read = socket.impl()->receiveBytes(internal_buffer.begin(), internal_buffer.size(), flags);
-        }
+
+        /// receiveBytes in SecureStreamSocket throws TimeoutException after max(receive_timeout, send_timeout),
+        /// but we want to get this exception exactly after receive_timeout. So, set send_timeout = receive_timeout
+        /// before receiveBytes.
+        std::unique_ptr<TimeoutSetter> timeout_setter = nullptr;
+        if (socket.secure())
+            timeout_setter = std::make_unique<TimeoutSetter>(dynamic_cast<Poco::Net::StreamSocket &>(socket), socket.getReceiveTimeout(), socket.getReceiveTimeout());
+
+        bytes_read = socket.impl()->receiveBytes(internal_buffer.begin(), internal_buffer.size());
     }
     catch (const Poco::Net::NetException & e)
     {
diff --git a/src/IO/WriteBufferFromPocoSocket.cpp b/src/IO/WriteBufferFromPocoSocket.cpp
index 284fa5dbd97..c666586770e 100644
--- a/src/IO/WriteBufferFromPocoSocket.cpp
+++ b/src/IO/WriteBufferFromPocoSocket.cpp
@@ -6,6 +6,7 @@
 #include
 #include
 #include
+#include

 namespace ProfileEvents
 {
@@ -40,6 +41,13 @@ void WriteBufferFromPocoSocket::nextImpl()
     /// Add more details to exceptions.
     try
     {
+        /// sendBytes in SecureStreamSocket throws TimeoutException after max(receive_timeout, send_timeout),
+        /// but we want to get this exception exactly after send_timeout. So, set receive_timeout = send_timeout
+        /// before sendBytes.
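+        /// (TimeoutSetter is assumed here to be an RAII guard: it overrides both socket timeouts on
+        /// construction and restores the previous values in its destructor, so the override below
+        /// only lasts for the duration of this sendBytes call.)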
+        std::unique_ptr<TimeoutSetter> timeout_setter = nullptr;
+        if (socket.secure())
+            timeout_setter = std::make_unique<TimeoutSetter>(dynamic_cast<Poco::Net::StreamSocket &>(socket), socket.getSendTimeout(), socket.getSendTimeout());
+
         res = socket.impl()->sendBytes(working_buffer.begin() + bytes_written, offset() - bytes_written);
     }
     catch (const Poco::Net::NetException & e)

From 1c7f16e0ff48a5de3cc302880b4888262fe1876c Mon Sep 17 00:00:00 2001
From: Pavel Kruglov
Date: Wed, 3 Mar 2021 20:47:27 +0300
Subject: [PATCH 002/155] Add test and move TimeoutSetter in IO/

---
 src/Client/Connection.cpp                     |  2 +-
 src/Client/ya.make                            |  1 -
 src/IO/ReadBufferFromPocoSocket.cpp           |  2 +-
 src/{Client => IO}/TimeoutSetter.cpp          |  2 +-
 src/{Client => IO}/TimeoutSetter.h            |  0
 src/IO/WriteBufferFromPocoSocket.cpp          |  2 +-
 src/IO/ya.make                                |  1 +
 src/Server/TCPHandler.h                       |  2 +-
 .../config.d/remote_servers.xml               | 14 ++++
 .../configs_secure/config.d/ssl_conf.xml      | 18 ++++
 .../configs_secure/dhparam.pem                |  8 ++
 .../configs_secure/server.crt                 | 19 +++++
 .../configs_secure/server.key                 | 28 +++++++
 .../configs_secure/users.d/users.xml          |  6 ++
 tests/integration/test_secure_socket/test.py  | 83 +++++++++++++++++++
 15 files changed, 182 insertions(+), 6 deletions(-)
 rename src/{Client => IO}/TimeoutSetter.cpp (97%)
 rename src/{Client => IO}/TimeoutSetter.h (100%)
 create mode 100644 tests/integration/test_secure_socket/configs_secure/config.d/remote_servers.xml
 create mode 100644 tests/integration/test_secure_socket/configs_secure/config.d/ssl_conf.xml
 create mode 100644 tests/integration/test_secure_socket/configs_secure/dhparam.pem
 create mode 100644 tests/integration/test_secure_socket/configs_secure/server.crt
 create mode 100644 tests/integration/test_secure_socket/configs_secure/server.key
 create mode 100644 tests/integration/test_secure_socket/configs_secure/users.d/users.xml
 create mode 100644 tests/integration/test_secure_socket/test.py

diff --git a/src/Client/Connection.cpp b/src/Client/Connection.cpp
index 80d44a336a5..939a48d949f 100644
--- a/src/Client/Connection.cpp
+++ b/src/Client/Connection.cpp
@@ -8,10 +8,10 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
-#include
 #include
 #include
 #include
diff --git a/src/Client/ya.make b/src/Client/ya.make
index af1dd05f1d4..4201203a8e9 100644
--- a/src/Client/ya.make
+++ b/src/Client/ya.make
@@ -16,7 +16,6 @@ SRCS(
     HedgedConnections.cpp
     HedgedConnectionsFactory.cpp
     MultiplexedConnections.cpp
-    TimeoutSetter.cpp
 )

diff --git a/src/IO/ReadBufferFromPocoSocket.cpp b/src/IO/ReadBufferFromPocoSocket.cpp
index df8739904ec..c70993c5c3a 100644
--- a/src/IO/ReadBufferFromPocoSocket.cpp
+++ b/src/IO/ReadBufferFromPocoSocket.cpp
@@ -1,10 +1,10 @@
 #include

 #include
+#include
 #include
 #include
 #include
-#include


 namespace ProfileEvents
diff --git a/src/Client/TimeoutSetter.cpp b/src/IO/TimeoutSetter.cpp
similarity index 97%
rename from src/Client/TimeoutSetter.cpp
rename to src/IO/TimeoutSetter.cpp
index 87368f93ba3..f06cafecff8 100644
--- a/src/Client/TimeoutSetter.cpp
+++ b/src/IO/TimeoutSetter.cpp
@@ -1,4 +1,4 @@
-#include "TimeoutSetter.h"
+#include

 #include

diff --git a/src/Client/TimeoutSetter.h b/src/IO/TimeoutSetter.h
similarity index 100%
rename from src/Client/TimeoutSetter.h
rename to src/IO/TimeoutSetter.h
diff --git a/src/IO/WriteBufferFromPocoSocket.cpp b/src/IO/WriteBufferFromPocoSocket.cpp
index c666586770e..4edfc8a2795 100644
--- a/src/IO/WriteBufferFromPocoSocket.cpp
+++ b/src/IO/WriteBufferFromPocoSocket.cpp
@@ -1,12 +1,12 @@
 #include

 #include
+#include
 #include
 #include
 #include
 #include
-#include


 namespace
ProfileEvents diff --git a/src/IO/ya.make b/src/IO/ya.make index 6605cf64277..58df027c561 100644 --- a/src/IO/ya.make +++ b/src/IO/ya.make @@ -50,6 +50,7 @@ SRCS( ReadBufferFromPocoSocket.cpp ReadHelpers.cpp SeekAvoidingReadBuffer.cpp + TimeoutSetter.cpp UseSSL.cpp WriteBufferFromFile.cpp WriteBufferFromFileBase.cpp diff --git a/src/Server/TCPHandler.h b/src/Server/TCPHandler.h index ee2f7c96b5a..c3dd8346c8e 100644 --- a/src/Server/TCPHandler.h +++ b/src/Server/TCPHandler.h @@ -8,10 +8,10 @@ #include #include #include +#include #include #include #include -#include #include "IServer.h" diff --git a/tests/integration/test_secure_socket/configs_secure/config.d/remote_servers.xml b/tests/integration/test_secure_socket/configs_secure/config.d/remote_servers.xml new file mode 100644 index 00000000000..0c109d6d768 --- /dev/null +++ b/tests/integration/test_secure_socket/configs_secure/config.d/remote_servers.xml @@ -0,0 +1,14 @@ + + 9440 + + + + + node2 + 9440 + 1 + + + + + diff --git a/tests/integration/test_secure_socket/configs_secure/config.d/ssl_conf.xml b/tests/integration/test_secure_socket/configs_secure/config.d/ssl_conf.xml new file mode 100644 index 00000000000..fe39e3712b8 --- /dev/null +++ b/tests/integration/test_secure_socket/configs_secure/config.d/ssl_conf.xml @@ -0,0 +1,18 @@ + + + + /etc/clickhouse-server/config.d/server.crt + /etc/clickhouse-server/config.d/server.key + /etc/clickhouse-server/config.d/dhparam.pem + none + true + + + true + none + + AcceptCertificateHandler + + + + diff --git a/tests/integration/test_secure_socket/configs_secure/dhparam.pem b/tests/integration/test_secure_socket/configs_secure/dhparam.pem new file mode 100644 index 00000000000..2e6cee0798d --- /dev/null +++ b/tests/integration/test_secure_socket/configs_secure/dhparam.pem @@ -0,0 +1,8 @@ +-----BEGIN DH PARAMETERS----- +MIIBCAKCAQEAua92DDli13gJ+//ZXyGaggjIuidqB0crXfhUlsrBk9BV1hH3i7fR +XGP9rUdk2ubnB3k2ejBStL5oBrkHm9SzUFSQHqfDjLZjKoUpOEmuDc4cHvX1XTR5 +Pr1vf5cd0yEncJWG5W4zyUB8k++SUdL2qaeslSs+f491HBLDYn/h8zCgRbBvxhxb +9qeho1xcbnWeqkN6Kc9bgGozA16P9NLuuLttNnOblkH+lMBf42BSne/TWt3AlGZf +slKmmZcySUhF8aKfJnLKbkBCFqOtFRh8zBA9a7g+BT/lSANATCDPaAk1YVih2EKb +dpc3briTDbRsiqg2JKMI7+VdULY9bh3EawIBAg== +-----END DH PARAMETERS----- diff --git a/tests/integration/test_secure_socket/configs_secure/server.crt b/tests/integration/test_secure_socket/configs_secure/server.crt new file mode 100644 index 00000000000..7ade2d96273 --- /dev/null +++ b/tests/integration/test_secure_socket/configs_secure/server.crt @@ -0,0 +1,19 @@ +-----BEGIN CERTIFICATE----- +MIIC/TCCAeWgAwIBAgIJANjx1QSR77HBMA0GCSqGSIb3DQEBCwUAMBQxEjAQBgNV +BAMMCWxvY2FsaG9zdDAgFw0xODA3MzAxODE2MDhaGA8yMjkyMDUxNDE4MTYwOFow +FDESMBAGA1UEAwwJbG9jYWxob3N0MIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIB +CgKCAQEAs9uSo6lJG8o8pw0fbVGVu0tPOljSWcVSXH9uiJBwlZLQnhN4SFSFohfI +4K8U1tBDTnxPLUo/V1K9yzoLiRDGMkwVj6+4+hE2udS2ePTQv5oaMeJ9wrs+5c9T +4pOtlq3pLAdm04ZMB1nbrEysceVudHRkQbGHzHp6VG29Fw7Ga6YpqyHQihRmEkTU +7UCYNA+Vk7aDPdMS/khweyTpXYZimaK9f0ECU3/VOeG3fH6Sp2X6FN4tUj/aFXEj +sRmU5G2TlYiSIUMF2JPdhSihfk1hJVALrHPTU38SOL+GyyBRWdNcrIwVwbpvsvPg +pryMSNxnpr0AK0dFhjwnupIv5hJIOQIDAQABo1AwTjAdBgNVHQ4EFgQUjPLb3uYC +kcamyZHK4/EV8jAP0wQwHwYDVR0jBBgwFoAUjPLb3uYCkcamyZHK4/EV8jAP0wQw +DAYDVR0TBAUwAwEB/zANBgkqhkiG9w0BAQsFAAOCAQEAM/ocuDvfPus/KpMVD51j +4IdlU8R0vmnYLQ+ygzOAo7+hUWP5j0yvq4ILWNmQX6HNvUggCgFv9bjwDFhb/5Vr +85ieWfTd9+LTjrOzTw4avdGwpX9G+6jJJSSq15tw5ElOIFb/qNA9O4dBiu8vn03C +L/zRSXrARhSqTW5w/tZkUcSTT+M5h28+Lgn9ysx4Ff5vi44LJ1NnrbJbEAIYsAAD 
++UA+4MBFKx1r6hHINULev8+lCfkpwIaeS8RL+op4fr6kQPxnULw8wT8gkuc8I4+L +P9gg/xDHB44T3ADGZ5Ib6O0DJaNiToO6rnoaaxs0KkotbvDWvRoxEytSbXKoYjYp +0g== +-----END CERTIFICATE----- diff --git a/tests/integration/test_secure_socket/configs_secure/server.key b/tests/integration/test_secure_socket/configs_secure/server.key new file mode 100644 index 00000000000..f0fb61ac443 --- /dev/null +++ b/tests/integration/test_secure_socket/configs_secure/server.key @@ -0,0 +1,28 @@ +-----BEGIN PRIVATE KEY----- +MIIEvQIBADANBgkqhkiG9w0BAQEFAASCBKcwggSjAgEAAoIBAQCz25KjqUkbyjyn +DR9tUZW7S086WNJZxVJcf26IkHCVktCeE3hIVIWiF8jgrxTW0ENOfE8tSj9XUr3L +OguJEMYyTBWPr7j6ETa51LZ49NC/mhox4n3Cuz7lz1Pik62WreksB2bThkwHWdus +TKxx5W50dGRBsYfMenpUbb0XDsZrpimrIdCKFGYSRNTtQJg0D5WTtoM90xL+SHB7 +JOldhmKZor1/QQJTf9U54bd8fpKnZfoU3i1SP9oVcSOxGZTkbZOViJIhQwXYk92F +KKF+TWElUAusc9NTfxI4v4bLIFFZ01ysjBXBum+y8+CmvIxI3GemvQArR0WGPCe6 +ki/mEkg5AgMBAAECggEATrbIBIxwDJOD2/BoUqWkDCY3dGevF8697vFuZKIiQ7PP +TX9j4vPq0DfsmDjHvAPFkTHiTQXzlroFik3LAp+uvhCCVzImmHq0IrwvZ9xtB43f +7Pkc5P6h1l3Ybo8HJ6zRIY3TuLtLxuPSuiOMTQSGRL0zq3SQ5DKuGwkz+kVjHXUN +MR2TECFwMHKQ5VLrC+7PMpsJYyOMlDAWhRfUalxC55xOXTpaN8TxNnwQ8K2ISVY5 +212Jz/a4hn4LdwxSz3Tiu95PN072K87HLWx3EdT6vW4Ge5P/A3y+smIuNAlanMnu +plHBRtpATLiTxZt/n6npyrfQVbYjSH7KWhB8hBHtaQKBgQDh9Cq1c/KtqDtE0Ccr +/r9tZNTUwBE6VP+3OJeKdEdtsfuxjOCkS1oAjgBJiSDOiWPh1DdoDeVZjPKq6pIu +Mq12OE3Doa8znfCXGbkSzEKOb2unKZMJxzrz99kXt40W5DtrqKPNb24CNqTiY8Aa +CjtcX+3weat82VRXvph6U8ltMwKBgQDLxjiQQzNoY7qvg7CwJCjf9qq8jmLK766g +1FHXopqS+dTxDLM8eJSRrpmxGWJvNeNc1uPhsKsKgotqAMdBUQTf7rSTbt4MyoH5 +bUcRLtr+0QTK9hDWMOOvleqNXha68vATkohWYfCueNsC60qD44o8RZAS6UNy3ENq +cM1cxqe84wKBgQDKkHutWnooJtajlTxY27O/nZKT/HA1bDgniMuKaz4R4Gr1PIez +on3YW3V0d0P7BP6PWRIm7bY79vkiMtLEKdiKUGWeyZdo3eHvhDb/3DCawtau8L2K +GZsHVp2//mS1Lfz7Qh8/L/NedqCQ+L4iWiPnZ3THjjwn3CoZ05ucpvrAMwKBgB54 +nay039MUVq44Owub3KDg+dcIU62U+cAC/9oG7qZbxYPmKkc4oL7IJSNecGHA5SbU +2268RFdl/gLz6tfRjbEOuOHzCjFPdvAdbysanpTMHLNc6FefJ+zxtgk9sJh0C4Jh +vxFrw9nTKKzfEl12gQ1SOaEaUIO0fEBGbe8ZpauRAoGAMAlGV+2/K4ebvAJKOVTa +dKAzQ+TD2SJmeR1HZmKDYddNqwtZlzg3v4ZhCk4eaUmGeC1Bdh8MDuB3QQvXz4Dr +vOIP4UVaOr+uM+7TgAgVnP4/K6IeJGzUDhX93pmpWhODfdu/oojEKVcpCojmEmS1 +KCBtmIrQLqzMpnBpLNuSY+Q= +-----END PRIVATE KEY----- diff --git a/tests/integration/test_secure_socket/configs_secure/users.d/users.xml b/tests/integration/test_secure_socket/configs_secure/users.d/users.xml new file mode 100644 index 00000000000..479017f6370 --- /dev/null +++ b/tests/integration/test_secure_socket/configs_secure/users.d/users.xml @@ -0,0 +1,6 @@ + + + + + + diff --git a/tests/integration/test_secure_socket/test.py b/tests/integration/test_secure_socket/test.py new file mode 100644 index 00000000000..6932c4a5bc9 --- /dev/null +++ b/tests/integration/test_secure_socket/test.py @@ -0,0 +1,83 @@ +import os.path +import time + +import pytest +from helpers.cluster import ClickHouseCluster +from helpers.test_tools import TSV + +cluster = ClickHouseCluster(__file__) + +NODES = {'node' + str(i): None for i in (1, 2)} + +config = ''' + + + {sleep_in_send_data} + + +''' + + +@pytest.fixture(scope="module") +def started_cluster(): + cluster.__with_ssl_config = True + main_configs = [ + "configs_secure/config.d/remote_servers.xml", + "configs_secure/server.crt", + "configs_secure/server.key", + "configs_secure/dhparam.pem", + "configs_secure/config.d/ssl_conf.xml", + ] + + NODES['node1'] = cluster.add_instance('node1', main_configs=main_configs) + NODES['node2'] = cluster.add_instance('node2', main_configs=main_configs, user_configs=["configs_secure/users.d/users.xml"]) + try: + 
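+        # Start the cluster, then create a table on node2 and a Distributed table on node1 that
+        # reads from node2 over the secure port, so the queries below exercise the TLS socket path.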
cluster.start()
+        NODES['node2'].query("CREATE TABLE base_table (x UInt64) ENGINE = MergeTree ORDER BY x;")
+        NODES['node2'].query("INSERT INTO base_table VALUES (5);")
+        NODES['node1'].query("CREATE TABLE distributed_table (x UInt64) ENGINE = Distributed(test_cluster, default, base_table);")
+
+        yield cluster
+
+    finally:
+        cluster.shutdown()
+
+
+def test(started_cluster):
+    NODES['node2'].replace_config('/etc/clickhouse-server/users.d/users.xml', config.format(sleep_in_send_data=1000))
+
+    attempts = 0
+    while attempts < 1000:
+        setting = NODES['node2'].http_query("SELECT value FROM system.settings WHERE name='sleep_in_send_data'")
+        if int(setting) == 1000:
+            break
+        time.sleep(0.1)
+        attempts += 1
+
+    assert attempts < 1000
+
+
+    start = time.time()
+    NODES['node1'].query_and_get_error('SELECT * FROM distributed_table settings receive_timeout=5, use_hedged_requests=0, async_socket_for_remote=0;')
+    end = time.time()
+    assert end - start < 6
+
+    start = time.time()
+    error = NODES['node1'].query_and_get_error('SELECT * FROM distributed_table settings receive_timeout=5, use_hedged_requests=0;')
+    end = time.time()
+
+    assert end - start < 6
+
+    # Check that exception about timeout wasn't thrown from DB::ReadBufferFromPocoSocket::nextImpl().
+    assert error.find('DB::ReadBufferFromPocoSocket::nextImpl()') == -1
+
+    start = time.time()
+    error = NODES['node1'].query_and_get_error('SELECT * FROM distributed_table settings receive_timeout=5;')
+    end = time.time()
+
+    assert end - start < 6
+
+    # Check that exception about timeout wasn't thrown from DB::ReadBufferFromPocoSocket::nextImpl().
+    assert error.find('DB::ReadBufferFromPocoSocket::nextImpl()') == -1
+
+

From 271398be61e88e4f20f327210ba53595273715d8 Mon Sep 17 00:00:00 2001
From: Pavel Kruglov
Date: Thu, 4 Mar 2021 23:15:33 +0300
Subject: [PATCH 003/155] add __init__.py

---
 tests/integration/test_secure_socket/__init__.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 tests/integration/test_secure_socket/__init__.py

diff --git a/tests/integration/test_secure_socket/__init__.py b/tests/integration/test_secure_socket/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d

From ed6363b88bd1b436c706b639c0a9697037c3a5b7 Mon Sep 17 00:00:00 2001
From: Pavel Kruglov
Date: Fri, 5 Mar 2021 16:18:12 +0300
Subject: [PATCH 004/155] Increase allowed query time

---
 tests/integration/test_secure_socket/test.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/integration/test_secure_socket/test.py b/tests/integration/test_secure_socket/test.py
index 6932c4a5bc9..8c94b4c35ad 100644
--- a/tests/integration/test_secure_socket/test.py
+++ b/tests/integration/test_secure_socket/test.py
@@ -60,13 +60,13 @@ def test(started_cluster):
     start = time.time()
     NODES['node1'].query_and_get_error('SELECT * FROM distributed_table settings receive_timeout=5, use_hedged_requests=0, async_socket_for_remote=0;')
     end = time.time()
-    assert end - start < 6
+    assert end - start < 10

     start = time.time()
     error = NODES['node1'].query_and_get_error('SELECT * FROM distributed_table settings receive_timeout=5, use_hedged_requests=0;')
     end = time.time()

-    assert end - start < 6
+    assert end - start < 10

     # Check that exception about timeout wasn't thrown from DB::ReadBufferFromPocoSocket::nextImpl().
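     # (The fix moves receive-timeout detection into the code that polls the socket, so a timeout
     # error should name that caller rather than the low-level socket read.)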
assert error.find('DB::ReadBufferFromPocoSocket::nextImpl()') == -1 @@ -75,7 +75,7 @@ def test(started_cluster): error = NODES['node1'].query_and_get_error('SELECT * FROM distributed_table settings receive_timeout=5;') end = time.time() - assert end - start < 6 + assert end - start < 10 # Check that exception about timeout wasn't thrown from DB::ReadBufferFromPocoSocket::nextImpl(). assert error.find('DB::ReadBufferFromPocoSocket::nextImpl()') == -1 From 19af94bad97a7f0f6316249f29b69b6a6f64ea01 Mon Sep 17 00:00:00 2001 From: Pavel Kruglov Date: Fri, 5 Mar 2021 19:08:49 +0300 Subject: [PATCH 005/155] restart tests --- tests/integration/test_secure_socket/test.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/test_secure_socket/test.py b/tests/integration/test_secure_socket/test.py index 8c94b4c35ad..337b6b05bd7 100644 --- a/tests/integration/test_secure_socket/test.py +++ b/tests/integration/test_secure_socket/test.py @@ -31,6 +31,7 @@ def started_cluster(): NODES['node1'] = cluster.add_instance('node1', main_configs=main_configs) NODES['node2'] = cluster.add_instance('node2', main_configs=main_configs, user_configs=["configs_secure/users.d/users.xml"]) + try: cluster.start() NODES['node2'].query("CREATE TABLE base_table (x UInt64) ENGINE = MergeTree ORDER BY x;") From 9c35e4987899c4c52c6d4c33dce602ac9a8629f1 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Sat, 6 Mar 2021 10:34:39 +0300 Subject: [PATCH 006/155] Fix heap-buffer-overflow in highlighting multi-line comments Not closed multi-line comment returns the whole query, so it should not be processed further with the lexer. ASan report: :) /*================================================================= ==14889==ERROR: AddressSanitizer: heap-buffer-overflow on address 0x60400006ebc0 at pc 0x00000a8148ea bp 0x7fffffff8610 sp 0x7fffffff7dd8 WRITE of size 16 at 0x60400006ebc0 thread T0 0 0xa8148e9 in __asan_memcpy (/src/ch/tmp/upstream/clickhouse-asan+0xa8148e9) 1 0xaa8a3a4 in DB::Client::highlight(std::__1::basic_string, std::__1::allocator > const&, std::__1::vector >&) obj-x86_64-linux-gnu/../programs/client/Client.cpp:464:52 2 0x25f7b6d8 in std::__1::__function::__policy_func, std::__1::allocator > const&, std::__1::vector >&)>::operator()(std::__1::basic_string, std::__1::allocator > const&, std::__1::vector >&) const obj-x86_64-linux-gnu/../contrib/libcxx/include/functional:2221:16 3 0x25f7b6d8 in std::__1::function, std::__1::allocator > const&, std::__1::vector >&)>::operator()(std::__1::basic_string, std::__1::allocator > const&, std::__1::vector >&) const obj-x86_64-linux-gnu/../contrib/libcxx/include/functional:2560:12 4 0x25f7b6d8 in replxx::Replxx::ReplxxImpl::render(replxx::Replxx::ReplxxImpl::HINT_ACTION) obj-x86_64-linux-gnu/../contrib/replxx/src/replxx_impl.cxx:546:3 5 0x25f74059 in replxx::Replxx::ReplxxImpl::refresh_line(replxx::Replxx::ReplxxImpl::HINT_ACTION) obj-x86_64-linux-gnu/../contrib/replxx/src/replxx_impl.cxx:729:2 6 0x25f6bc8f in replxx::Replxx::ReplxxImpl::insert_character(char32_t) obj-x86_64-linux-gnu/../contrib/replxx/src/replxx_impl.cxx:1197:3 7 0x25f79347 in replxx::Replxx::ReplxxImpl::action(unsigned long long, replxx::Replxx::ACTION_RESULT (replxx::Replxx::ReplxxImpl::* const&)(char32_t), char32_t) obj-x86_64-linux-gnu/../contrib/replxx/src/replxx_impl.cxx:1130:29 8 0x25f79347 in replxx::Replxx::ReplxxImpl::get_input_line() obj-x86_64-linux-gnu/../contrib/replxx/src/replxx_impl.cxx:1123:11 9 0x25f7844c in replxx::Replxx::ReplxxImpl::input(std::__1::basic_string, 
std::__1::allocator > const&) obj-x86_64-linux-gnu/../contrib/replxx/src/replxx_impl.cxx:455:8 10 0x25af5693 in ReplxxLineReader::readOneLine(std::__1::basic_string, std::__1::allocator > const&) obj-x86_64-linux-gnu/../base/common/ReplxxLineReader.cpp:108:29 11 0x25aed149 in LineReader::readLine(std::__1::basic_string, std::__1::allocator > const&, std::__1::basic_string, std::__1::allocator > const&) obj-x86_64-linux-gnu/../base/common/LineReader.cpp:81:26 12 0xaa80ba2 in DB::Client::mainImpl() obj-x86_64-linux-gnu/../programs/client/Client.cpp:654:33 13 0xaa756f5 in DB::Client::main(std::__1::vector, std::__1::allocator >, std::__1::allocator, std::__1::allocator > > > const&) obj-x86_64-linux-gnu/../programs/client/Client.cpp:280:20 14 0x25c0c8b5 in Poco::Util::Application::run() obj-x86_64-linux-gnu/../contrib/poco/Util/src/Application.cpp:334:8 15 0xaa4d050 in mainEntryClickHouseClient(int, char**) obj-x86_64-linux-gnu/../programs/client/Client.cpp:2724:23 16 0xa848c3a in main obj-x86_64-linux-gnu/../programs/main.cpp:368:12 17 0x7ffff7dcab24 in __libc_start_main (/usr/lib/libc.so.6+0x27b24) 18 0xa79b36d in _start (/src/ch/tmp/upstream/clickhouse-asan+0xa79b36d) 0x60400006ebc0 is located 0 bytes to the right of 48-byte region [0x60400006eb90,0x60400006ebc0) allocated by thread T0 here: 0 0xa84509d in operator new(unsigned long) (/src/ch/tmp/upstream/clickhouse-asan+0xa84509d) 1 0x25f7af76 in void* std::__1::__libcpp_operator_new(unsigned long) obj-x86_64-linux-gnu/../contrib/libcxx/include/new:235:10 2 0x25f7af76 in std::__1::__libcpp_allocate(unsigned long, unsigned long) obj-x86_64-linux-gnu/../contrib/libcxx/include/new:261:10 3 0x25f7af76 in std::__1::allocator::allocate(unsigned long) obj-x86_64-linux-gnu/../contrib/libcxx/include/memory:840:38 4 0x25f7af76 in std::__1::allocator_traits >::allocate(std::__1::allocator&, unsigned long) obj-x86_64-linux-gnu/../contrib/libcxx/include/__memory/allocator_traits.h:468:21 5 0x25f7af76 in std::__1::vector >::__vallocate(unsigned long) obj-x86_64-linux-gnu/../contrib/libcxx/include/vector:993:37 6 0x25f7af76 in std::__1::vector >::vector(unsigned long, replxx::Replxx::Color const&) obj-x86_64-linux-gnu/../contrib/libcxx/include/vector:1155:9 7 0x25f7af76 in replxx::Replxx::ReplxxImpl::render(replxx::Replxx::ReplxxImpl::HINT_ACTION) obj-x86_64-linux-gnu/../contrib/replxx/src/replxx_impl.cxx:543:19 8 0x25f74059 in replxx::Replxx::ReplxxImpl::refresh_line(replxx::Replxx::ReplxxImpl::HINT_ACTION) obj-x86_64-linux-gnu/../contrib/replxx/src/replxx_impl.cxx:729:2 9 0x25f6bc8f in replxx::Replxx::ReplxxImpl::insert_character(char32_t) obj-x86_64-linux-gnu/../contrib/replxx/src/replxx_impl.cxx:1197:3 10 0x25f79347 in replxx::Replxx::ReplxxImpl::action(unsigned long long, replxx::Replxx::ACTION_RESULT (replxx::Replxx::ReplxxImpl::* const&)(char32_t), char32_t) obj-x86_64-linux-gnu/../contrib/replxx/src/replxx_impl.cxx:1130:29 11 0x25f79347 in replxx::Replxx::ReplxxImpl::get_input_line() obj-x86_64-linux-gnu/../contrib/replxx/src/replxx_impl.cxx:1123:11 12 0x25f7844c in replxx::Replxx::ReplxxImpl::input(std::__1::basic_string, std::__1::allocator > const&) obj-x86_64-linux-gnu/../contrib/replxx/src/replxx_impl.cxx:455:8 13 0x25af5693 in ReplxxLineReader::readOneLine(std::__1::basic_string, std::__1::allocator > const&) obj-x86_64-linux-gnu/../base/common/ReplxxLineReader.cpp:108:29 14 0x25aed149 in LineReader::readLine(std::__1::basic_string, std::__1::allocator > const&, std::__1::basic_string, std::__1::allocator > const&) 
obj-x86_64-linux-gnu/../base/common/LineReader.cpp:81:26 15 0xaa80ba2 in DB::Client::mainImpl() obj-x86_64-linux-gnu/../programs/client/Client.cpp:654:33 16 0xaa756f5 in DB::Client::main(std::__1::vector, std::__1::allocator >, std::__1::allocator, std::__1::allocator > > > const&) obj-x86_64-linux-gnu/../programs/client/Client.cpp:280:20 17 0x25c0c8b5 in Poco::Util::Application::run() obj-x86_64-linux-gnu/../contrib/poco/Util/src/Application.cpp:334:8 18 0xaa4d050 in mainEntryClickHouseClient(int, char**) obj-x86_64-linux-gnu/../programs/client/Client.cpp:2724:23 19 0xa848c3a in main obj-x86_64-linux-gnu/../programs/main.cpp:368:12 20 0x7ffff7dcab24 in __libc_start_main (/usr/lib/libc.so.6+0x27b24) SUMMARY: AddressSanitizer: heap-buffer-overflow (/src/ch/tmp/upstream/clickhouse-asan+0xa8148e9) in __asan_memcpy v2: fix lexer instead of client quirk --- src/Parsers/Lexer.cpp | 3 ++- ...light_multi_line_comment_regression.expect | 25 +++++++++++++++++++ ...ht_multi_line_comment_regression.reference | 0 .../queries/0_stateless/arcadia_skip_list.txt | 1 + tests/queries/skip_list.json | 3 ++- 5 files changed, 30 insertions(+), 2 deletions(-) create mode 100755 tests/queries/0_stateless/01755_client_highlight_multi_line_comment_regression.expect create mode 100644 tests/queries/0_stateless/01755_client_highlight_multi_line_comment_regression.reference diff --git a/src/Parsers/Lexer.cpp b/src/Parsers/Lexer.cpp index ffa8250a3f3..1fa4d396113 100644 --- a/src/Parsers/Lexer.cpp +++ b/src/Parsers/Lexer.cpp @@ -275,7 +275,8 @@ Token Lexer::nextTokenImpl() else ++pos; } - return Token(TokenType::ErrorMultilineCommentIsNotClosed, token_begin, end); + pos = end; + return Token(TokenType::ErrorMultilineCommentIsNotClosed, token_begin, pos); } } return Token(TokenType::Slash, token_begin, pos); diff --git a/tests/queries/0_stateless/01755_client_highlight_multi_line_comment_regression.expect b/tests/queries/0_stateless/01755_client_highlight_multi_line_comment_regression.expect new file mode 100755 index 00000000000..65b9bde235b --- /dev/null +++ b/tests/queries/0_stateless/01755_client_highlight_multi_line_comment_regression.expect @@ -0,0 +1,25 @@ +#!/usr/bin/expect -f + +log_user 0 +set timeout 5 +match_max 100000 +# A default timeout action is to do nothing, change it to fail +expect_after { + timeout { + exit 2 + } +} + +set basedir [file dirname $argv0] +spawn bash -c "source $basedir/../shell_config.sh ; \$CLICKHOUSE_CLIENT_BINARY \$CLICKHOUSE_CLIENT_OPT" +expect ":) " + +# regression for heap-buffer-overflow issue (under ASAN) +send -- "/**" +expect "/**" +# just in case few more bytes +send -- "foobar" +expect "/**foobar" + +send -- "\3\4" +expect eof diff --git a/tests/queries/0_stateless/01755_client_highlight_multi_line_comment_regression.reference b/tests/queries/0_stateless/01755_client_highlight_multi_line_comment_regression.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/arcadia_skip_list.txt b/tests/queries/0_stateless/arcadia_skip_list.txt index c1e991ff6b2..a9cca053d3c 100644 --- a/tests/queries/0_stateless/arcadia_skip_list.txt +++ b/tests/queries/0_stateless/arcadia_skip_list.txt @@ -212,3 +212,4 @@ 01017_uniqCombined_memory_usage 01747_join_view_filter_dictionary 01748_dictionary_table_dot +01755_client_highlight_multi_line_comment_regression diff --git a/tests/queries/skip_list.json b/tests/queries/skip_list.json index 1200d8f5436..b829423f846 100644 --- a/tests/queries/skip_list.json +++ b/tests/queries/skip_list.json @@ -95,7 
+95,8 @@
         "01370_client_autocomplete_word_break_characters",
         "01676_clickhouse_client_autocomplete",
         "01193_metadata_loading",
-        "01455_time_zones"
+        "01455_time_zones",
+        "01755_client_highlight_multi_line_comment_regression"
     ],
     "release-build": [
     ],

From ba9e1e5a8d23e7fbf6bff83e6493d46f1d49ef75 Mon Sep 17 00:00:00 2001
From: alesapin
Date: Wed, 10 Mar 2021 19:12:32 +0300
Subject: [PATCH 007/155] Some initial code

Add some java magic

Allow to connect with old session id

More angry nemesis and fixes

Angry

Fix style

Split to files

Better wrappers

Better structure

Add set test and split to separate files (I think something is broken now)

Better

Missed files
---
 src/Coordination/CoordinationSettings.h       |   1 +
 src/Coordination/NuKeeperServer.cpp           |   4 +-
 src/Server/NuKeeperTCPHandler.cpp             |  10 +-
 tests/jepsen.nukeeper/.gitignore              |  13 +
 tests/jepsen.nukeeper/CHANGELOG.md            |  24 ++
 tests/jepsen.nukeeper/LICENSE                 | 280 ++++++++++++++++++
 tests/jepsen.nukeeper/README.md               |  22 ++
 tests/jepsen.nukeeper/doc/intro.md            |   3 +
 tests/jepsen.nukeeper/project.clj             |  13 +
 tests/jepsen.nukeeper/resources/config.xml    |   1 +
 tests/jepsen.nukeeper/resources/listen.xml    |   3 +
 .../resources/test_keeper_config.xml          |  33 +++
 tests/jepsen.nukeeper/resources/users.xml     |   1 +
 .../src/jepsen/nukeeper/main.clj              | 143 +++++++++
 .../src/jepsen/nukeeper/register.clj          |  64 ++++
 .../src/jepsen/nukeeper/set.clj               |  43 +++
 .../src/jepsen/nukeeper/utils.clj             |  56 ++++
 .../test/jepsen/nukeeper_test.clj             |  28 ++
 18 files changed, 733 insertions(+), 6 deletions(-)
 create mode 100644 tests/jepsen.nukeeper/.gitignore
 create mode 100644 tests/jepsen.nukeeper/CHANGELOG.md
 create mode 100644 tests/jepsen.nukeeper/LICENSE
 create mode 100644 tests/jepsen.nukeeper/README.md
 create mode 100644 tests/jepsen.nukeeper/doc/intro.md
 create mode 100644 tests/jepsen.nukeeper/project.clj
 create mode 120000 tests/jepsen.nukeeper/resources/config.xml
 create mode 100644 tests/jepsen.nukeeper/resources/listen.xml
 create mode 100644 tests/jepsen.nukeeper/resources/test_keeper_config.xml
 create mode 120000 tests/jepsen.nukeeper/resources/users.xml
 create mode 100644 tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj
 create mode 100644 tests/jepsen.nukeeper/src/jepsen/nukeeper/register.clj
 create mode 100644 tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj
 create mode 100644 tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj
 create mode 100644 tests/jepsen.nukeeper/test/jepsen/nukeeper_test.clj

diff --git a/src/Coordination/CoordinationSettings.h b/src/Coordination/CoordinationSettings.h
index dcfb13c359e..c816f8089d5 100644
--- a/src/Coordination/CoordinationSettings.h
+++ b/src/Coordination/CoordinationSettings.h
@@ -31,6 +31,7 @@ struct Settings;
     M(UInt64, rotate_log_storage_interval, 10000, "How many records will be stored in one log storage file", 0) \
     M(UInt64, snapshots_to_keep, 3, "How many compressed snapshots to keep on disk", 0) \
     M(UInt64, stale_log_gap, 10000, "When node became stale and should receive snapshots from leader", 0) \
+    M(Bool, quorum_reads, false, "Execute read requests as writes through the whole RAFT consensus with similar speed", 0) \
     M(Bool, force_sync, true, "Call fsync on each change in RAFT changelog", 0)

 DECLARE_SETTINGS_TRAITS(CoordinationSettingsTraits, LIST_OF_COORDINATION_SETTINGS)
diff --git a/src/Coordination/NuKeeperServer.cpp b/src/Coordination/NuKeeperServer.cpp
index edda26613dd..2081c969523 100644
--- a/src/Coordination/NuKeeperServer.cpp
+++ b/src/Coordination/NuKeeperServer.cpp
@@ -30,6 +30,8 @@ NuKeeperServer::NuKeeperServer(
     ,
state_manager(nuraft::cs_new(server_id, "test_keeper_server", config, coordination_settings)) , responses_queue(responses_queue_) { + if (coordination_settings->quorum_reads) + LOG_WARNING(&Poco::Logger::get("NuKeeperServer"), "Quorum reads enabled, NuKeeper will work slower."); } void NuKeeperServer::startup() @@ -106,7 +108,7 @@ nuraft::ptr getZooKeeperLogEntry(int64_t session_id, const Coord void NuKeeperServer::putRequest(const NuKeeperStorage::RequestForSession & request_for_session) { auto [session_id, request] = request_for_session; - if (isLeaderAlive() && request->isReadRequest()) + if (!coordination_settings->quorum_reads && isLeaderAlive() && request->isReadRequest()) { state_machine->processReadRequest(request_for_session); } diff --git a/src/Server/NuKeeperTCPHandler.cpp b/src/Server/NuKeeperTCPHandler.cpp index b283356d27d..b676331f6c0 100644 --- a/src/Server/NuKeeperTCPHandler.cpp +++ b/src/Server/NuKeeperTCPHandler.cpp @@ -240,16 +240,10 @@ Poco::Timespan NuKeeperTCPHandler::receiveHandshake() throw Exception("Unexpected protocol version: " + toString(protocol_version), ErrorCodes::UNEXPECTED_PACKET_FROM_CLIENT); Coordination::read(last_zxid_seen, *in); - - if (last_zxid_seen != 0) - throw Exception("Non zero last_zxid_seen is not supported", ErrorCodes::UNEXPECTED_PACKET_FROM_CLIENT); - Coordination::read(timeout_ms, *in); + + /// TODO Stop ignoring this value Coordination::read(previous_session_id, *in); - - if (previous_session_id != 0) - throw Exception("Non zero previous session id is not supported", ErrorCodes::UNEXPECTED_PACKET_FROM_CLIENT); - Coordination::read(passwd, *in); int8_t readonly; diff --git a/tests/jepsen.nukeeper/.gitignore b/tests/jepsen.nukeeper/.gitignore new file mode 100644 index 00000000000..d956ab0a125 --- /dev/null +++ b/tests/jepsen.nukeeper/.gitignore @@ -0,0 +1,13 @@ +/target +/classes +/checkouts +profiles.clj +pom.xml +pom.xml.asc +*.jar +*.class +/.lein-* +/.nrepl-port +/.prepl-port +.hgignore +.hg/ diff --git a/tests/jepsen.nukeeper/CHANGELOG.md b/tests/jepsen.nukeeper/CHANGELOG.md new file mode 100644 index 00000000000..6c7cb4f7c8a --- /dev/null +++ b/tests/jepsen.nukeeper/CHANGELOG.md @@ -0,0 +1,24 @@ +# Change Log +All notable changes to this project will be documented in this file. This change log follows the conventions of [keepachangelog.com](http://keepachangelog.com/). + +## [Unreleased] +### Changed +- Add a new arity to `make-widget-async` to provide a different widget shape. + +## [0.1.1] - 2021-03-10 +### Changed +- Documentation on how to make the widgets. + +### Removed +- `make-widget-sync` - we're all async, all the time. + +### Fixed +- Fixed widget maker to keep working when daylight savings switches over. + +## 0.1.0 - 2021-03-10 +### Added +- Files from the new template. +- Widget maker public API - `make-widget-sync`. + +[Unreleased]: https://github.com/your-name/jepsen.nukeeper/compare/0.1.1...HEAD +[0.1.1]: https://github.com/your-name/jepsen.nukeeper/compare/0.1.0...0.1.1 diff --git a/tests/jepsen.nukeeper/LICENSE b/tests/jepsen.nukeeper/LICENSE new file mode 100644 index 00000000000..231512650b9 --- /dev/null +++ b/tests/jepsen.nukeeper/LICENSE @@ -0,0 +1,280 @@ +Eclipse Public License - v 2.0 + + THE ACCOMPANYING PROGRAM IS PROVIDED UNDER THE TERMS OF THIS ECLIPSE + PUBLIC LICENSE ("AGREEMENT"). ANY USE, REPRODUCTION OR DISTRIBUTION + OF THE PROGRAM CONSTITUTES RECIPIENT'S ACCEPTANCE OF THIS AGREEMENT. + +1. 
DEFINITIONS + +"Contribution" means: + + a) in the case of the initial Contributor, the initial content + Distributed under this Agreement, and + + b) in the case of each subsequent Contributor: + i) changes to the Program, and + ii) additions to the Program; + where such changes and/or additions to the Program originate from + and are Distributed by that particular Contributor. A Contribution + "originates" from a Contributor if it was added to the Program by + such Contributor itself or anyone acting on such Contributor's behalf. + Contributions do not include changes or additions to the Program that + are not Modified Works. + +"Contributor" means any person or entity that Distributes the Program. + +"Licensed Patents" mean patent claims licensable by a Contributor which +are necessarily infringed by the use or sale of its Contribution alone +or when combined with the Program. + +"Program" means the Contributions Distributed in accordance with this +Agreement. + +"Recipient" means anyone who receives the Program under this Agreement +or any Secondary License (as applicable), including Contributors. + +"Derivative Works" shall mean any work, whether in Source Code or other +form, that is based on (or derived from) the Program and for which the +editorial revisions, annotations, elaborations, or other modifications +represent, as a whole, an original work of authorship. + +"Modified Works" shall mean any work in Source Code or other form that +results from an addition to, deletion from, or modification of the +contents of the Program, including, for purposes of clarity any new file +in Source Code form that contains any contents of the Program. Modified +Works shall not include works that contain only declarations, +interfaces, types, classes, structures, or files of the Program solely +in each case in order to link to, bind by name, or subclass the Program +or Modified Works thereof. + +"Distribute" means the acts of a) distributing or b) making available +in any manner that enables the transfer of a copy. + +"Source Code" means the form of a Program preferred for making +modifications, including but not limited to software source code, +documentation source, and configuration files. + +"Secondary License" means either the GNU General Public License, +Version 2.0, or any later versions of that license, including any +exceptions or additional permissions as identified by the initial +Contributor. + +2. GRANT OF RIGHTS + + a) Subject to the terms of this Agreement, each Contributor hereby + grants Recipient a non-exclusive, worldwide, royalty-free copyright + license to reproduce, prepare Derivative Works of, publicly display, + publicly perform, Distribute and sublicense the Contribution of such + Contributor, if any, and such Derivative Works. + + b) Subject to the terms of this Agreement, each Contributor hereby + grants Recipient a non-exclusive, worldwide, royalty-free patent + license under Licensed Patents to make, use, sell, offer to sell, + import and otherwise transfer the Contribution of such Contributor, + if any, in Source Code or other form. This patent license shall + apply to the combination of the Contribution and the Program if, at + the time the Contribution is added by the Contributor, such addition + of the Contribution causes such combination to be covered by the + Licensed Patents. The patent license shall not apply to any other + combinations which include the Contribution. No hardware per se is + licensed hereunder. 
+ + c) Recipient understands that although each Contributor grants the + licenses to its Contributions set forth herein, no assurances are + provided by any Contributor that the Program does not infringe the + patent or other intellectual property rights of any other entity. + Each Contributor disclaims any liability to Recipient for claims + brought by any other entity based on infringement of intellectual + property rights or otherwise. As a condition to exercising the + rights and licenses granted hereunder, each Recipient hereby + assumes sole responsibility to secure any other intellectual + property rights needed, if any. For example, if a third party + patent license is required to allow Recipient to Distribute the + Program, it is Recipient's responsibility to acquire that license + before distributing the Program. + + d) Each Contributor represents that to its knowledge it has + sufficient copyright rights in its Contribution, if any, to grant + the copyright license set forth in this Agreement. + + e) Notwithstanding the terms of any Secondary License, no + Contributor makes additional grants to any Recipient (other than + those set forth in this Agreement) as a result of such Recipient's + receipt of the Program under the terms of a Secondary License + (if permitted under the terms of Section 3). + +3. REQUIREMENTS + +3.1 If a Contributor Distributes the Program in any form, then: + + a) the Program must also be made available as Source Code, in + accordance with section 3.2, and the Contributor must accompany + the Program with a statement that the Source Code for the Program + is available under this Agreement, and informs Recipients how to + obtain it in a reasonable manner on or through a medium customarily + used for software exchange; and + + b) the Contributor may Distribute the Program under a license + different than this Agreement, provided that such license: + i) effectively disclaims on behalf of all other Contributors all + warranties and conditions, express and implied, including + warranties or conditions of title and non-infringement, and + implied warranties or conditions of merchantability and fitness + for a particular purpose; + + ii) effectively excludes on behalf of all other Contributors all + liability for damages, including direct, indirect, special, + incidental and consequential damages, such as lost profits; + + iii) does not attempt to limit or alter the recipients' rights + in the Source Code under section 3.2; and + + iv) requires any subsequent distribution of the Program by any + party to be under a license that satisfies the requirements + of this section 3. + +3.2 When the Program is Distributed as Source Code: + + a) it must be made available under this Agreement, or if the + Program (i) is combined with other material in a separate file or + files made available under a Secondary License, and (ii) the initial + Contributor attached to the Source Code the notice described in + Exhibit A of this Agreement, then the Program may be made available + under the terms of such Secondary Licenses, and + + b) a copy of this Agreement must be included with each copy of + the Program. + +3.3 Contributors may not remove or alter any copyright, patent, +trademark, attribution notices, disclaimers of warranty, or limitations +of liability ("notices") contained within the Program from any copy of +the Program which they Distribute, provided that Contributors may add +their own appropriate notices. + +4. 
COMMERCIAL DISTRIBUTION + +Commercial distributors of software may accept certain responsibilities +with respect to end users, business partners and the like. While this +license is intended to facilitate the commercial use of the Program, +the Contributor who includes the Program in a commercial product +offering should do so in a manner which does not create potential +liability for other Contributors. Therefore, if a Contributor includes +the Program in a commercial product offering, such Contributor +("Commercial Contributor") hereby agrees to defend and indemnify every +other Contributor ("Indemnified Contributor") against any losses, +damages and costs (collectively "Losses") arising from claims, lawsuits +and other legal actions brought by a third party against the Indemnified +Contributor to the extent caused by the acts or omissions of such +Commercial Contributor in connection with its distribution of the Program +in a commercial product offering. The obligations in this section do not +apply to any claims or Losses relating to any actual or alleged +intellectual property infringement. In order to qualify, an Indemnified +Contributor must: a) promptly notify the Commercial Contributor in +writing of such claim, and b) allow the Commercial Contributor to control, +and cooperate with the Commercial Contributor in, the defense and any +related settlement negotiations. The Indemnified Contributor may +participate in any such claim at its own expense. + +For example, a Contributor might include the Program in a commercial +product offering, Product X. That Contributor is then a Commercial +Contributor. If that Commercial Contributor then makes performance +claims, or offers warranties related to Product X, those performance +claims and warranties are such Commercial Contributor's responsibility +alone. Under this section, the Commercial Contributor would have to +defend claims against the other Contributors related to those performance +claims and warranties, and if a court requires any other Contributor to +pay any damages as a result, the Commercial Contributor must pay +those damages. + +5. NO WARRANTY + +EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, AND TO THE EXTENT +PERMITTED BY APPLICABLE LAW, THE PROGRAM IS PROVIDED ON AN "AS IS" +BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESS OR +IMPLIED INCLUDING, WITHOUT LIMITATION, ANY WARRANTIES OR CONDITIONS OF +TITLE, NON-INFRINGEMENT, MERCHANTABILITY OR FITNESS FOR A PARTICULAR +PURPOSE. Each Recipient is solely responsible for determining the +appropriateness of using and distributing the Program and assumes all +risks associated with its exercise of rights under this Agreement, +including but not limited to the risks and costs of program errors, +compliance with applicable laws, damage to or loss of data, programs +or equipment, and unavailability or interruption of operations. + +6. DISCLAIMER OF LIABILITY + +EXCEPT AS EXPRESSLY SET FORTH IN THIS AGREEMENT, AND TO THE EXTENT +PERMITTED BY APPLICABLE LAW, NEITHER RECIPIENT NOR ANY CONTRIBUTORS +SHALL HAVE ANY LIABILITY FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING WITHOUT LIMITATION LOST +PROFITS), HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +ARISING IN ANY WAY OUT OF THE USE OR DISTRIBUTION OF THE PROGRAM OR THE +EXERCISE OF ANY RIGHTS GRANTED HEREUNDER, EVEN IF ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + +7. 
GENERAL + +If any provision of this Agreement is invalid or unenforceable under +applicable law, it shall not affect the validity or enforceability of +the remainder of the terms of this Agreement, and without further +action by the parties hereto, such provision shall be reformed to the +minimum extent necessary to make such provision valid and enforceable. + +If Recipient institutes patent litigation against any entity +(including a cross-claim or counterclaim in a lawsuit) alleging that the +Program itself (excluding combinations of the Program with other software +or hardware) infringes such Recipient's patent(s), then such Recipient's +rights granted under Section 2(b) shall terminate as of the date such +litigation is filed. + +All Recipient's rights under this Agreement shall terminate if it +fails to comply with any of the material terms or conditions of this +Agreement and does not cure such failure in a reasonable period of +time after becoming aware of such noncompliance. If all Recipient's +rights under this Agreement terminate, Recipient agrees to cease use +and distribution of the Program as soon as reasonably practicable. +However, Recipient's obligations under this Agreement and any licenses +granted by Recipient relating to the Program shall continue and survive. + +Everyone is permitted to copy and distribute copies of this Agreement, +but in order to avoid inconsistency the Agreement is copyrighted and +may only be modified in the following manner. The Agreement Steward +reserves the right to publish new versions (including revisions) of +this Agreement from time to time. No one other than the Agreement +Steward has the right to modify this Agreement. The Eclipse Foundation +is the initial Agreement Steward. The Eclipse Foundation may assign the +responsibility to serve as the Agreement Steward to a suitable separate +entity. Each new version of the Agreement will be given a distinguishing +version number. The Program (including Contributions) may always be +Distributed subject to the version of the Agreement under which it was +received. In addition, after a new version of the Agreement is published, +Contributor may elect to Distribute the Program (including its +Contributions) under the new version. + +Except as expressly stated in Sections 2(a) and 2(b) above, Recipient +receives no rights or licenses to the intellectual property of any +Contributor under this Agreement, whether expressly, by implication, +estoppel or otherwise. All rights in the Program not expressly granted +under this Agreement are reserved. Nothing in this Agreement is intended +to be enforceable by any entity that is not a Contributor or Recipient. +No third-party beneficiary rights are created under this Agreement. + +Exhibit A - Form of Secondary Licenses Notice + +"This Source Code may also be made available under the following +Secondary Licenses when the conditions for such availability set forth +in the Eclipse Public License, v. 2.0 are satisfied: GNU General Public +License as published by the Free Software Foundation, either version 2 +of the License, or (at your option) any later version, with the GNU +Classpath Exception which is available at +https://www.gnu.org/software/classpath/license.html." + + Simply including a copy of this Agreement, including this Exhibit A + is not sufficient to license the Source Code under Secondary Licenses. 
+ + If it is not possible or desirable to put the notice in a particular + file, then You may include the notice in a location (such as a LICENSE + file in a relevant directory) where a recipient would be likely to + look for such a notice. + + You may add additional accurate notices of copyright ownership. diff --git a/tests/jepsen.nukeeper/README.md b/tests/jepsen.nukeeper/README.md new file mode 100644 index 00000000000..f72409e080f --- /dev/null +++ b/tests/jepsen.nukeeper/README.md @@ -0,0 +1,22 @@ +# jepsen.nukeeper + +A Clojure library designed to ... well, that part is up to you. + +## Usage + +FIXME + +## License + +Copyright © 2021 FIXME + +This program and the accompanying materials are made available under the +terms of the Eclipse Public License 2.0 which is available at +http://www.eclipse.org/legal/epl-2.0. + +This Source Code may also be made available under the following Secondary +Licenses when the conditions for such availability set forth in the Eclipse +Public License, v. 2.0 are satisfied: GNU General Public License as published by +the Free Software Foundation, either version 2 of the License, or (at your +option) any later version, with the GNU Classpath Exception which is available +at https://www.gnu.org/software/classpath/license.html. diff --git a/tests/jepsen.nukeeper/doc/intro.md b/tests/jepsen.nukeeper/doc/intro.md new file mode 100644 index 00000000000..c6e5ccbd04a --- /dev/null +++ b/tests/jepsen.nukeeper/doc/intro.md @@ -0,0 +1,3 @@ +# Introduction to jepsen.nukeeper + +TODO: write [great documentation](http://jacobian.org/writing/what-to-write/) diff --git a/tests/jepsen.nukeeper/project.clj b/tests/jepsen.nukeeper/project.clj new file mode 100644 index 00000000000..e7150c9e5d4 --- /dev/null +++ b/tests/jepsen.nukeeper/project.clj @@ -0,0 +1,13 @@ +(defproject jepsen.nukeeper "0.1.0-SNAPSHOT" + :injections [(.. 
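   ;; evaluated inside the test JVM before project code runs; the system property below is the
   ;; ZooKeeper client's request timeout in milliseconds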
System (setProperty "zookeeper.request.timeout" "10000"))] + :description "A jepsen tests for ClickHouse NuKeeper" + :url "https://clickhouse.tech/" + :license {:name "EPL-2.0" + :url "https://www.eclipse.org/legal/epl-2.0/"} + :main jepsen.nukeeper.main + :plugins [[lein-cljfmt "0.7.0"]] + :dependencies [[org.clojure/clojure "1.10.1"] + [jepsen "0.2.3"] + [zookeeper-clj "0.9.4"] + [org.apache.zookeeper/zookeeper "3.6.1" :exclusions [org.slf4j/slf4j-log4j12]]] + :repl-options {:init-ns jepsen.nukeeper.main}) diff --git a/tests/jepsen.nukeeper/resources/config.xml b/tests/jepsen.nukeeper/resources/config.xml new file mode 120000 index 00000000000..c7596baa075 --- /dev/null +++ b/tests/jepsen.nukeeper/resources/config.xml @@ -0,0 +1 @@ +../../../programs/server/config.xml \ No newline at end of file diff --git a/tests/jepsen.nukeeper/resources/listen.xml b/tests/jepsen.nukeeper/resources/listen.xml new file mode 100644 index 00000000000..de8c737ff75 --- /dev/null +++ b/tests/jepsen.nukeeper/resources/listen.xml @@ -0,0 +1,3 @@ + + :: + diff --git a/tests/jepsen.nukeeper/resources/test_keeper_config.xml b/tests/jepsen.nukeeper/resources/test_keeper_config.xml new file mode 100644 index 00000000000..0e2a688ea0b --- /dev/null +++ b/tests/jepsen.nukeeper/resources/test_keeper_config.xml @@ -0,0 +1,33 @@ + + + 9181 + {id} + + + 10000 + 30000 + false + 60000 + trace + {quorum_reads} + + + + + 1 + {srv1} + 9444 + + + 2 + {srv2} + 9444 + + + 3 + {srv3} + 9444 + + + + diff --git a/tests/jepsen.nukeeper/resources/users.xml b/tests/jepsen.nukeeper/resources/users.xml new file mode 120000 index 00000000000..41b137a130f --- /dev/null +++ b/tests/jepsen.nukeeper/resources/users.xml @@ -0,0 +1 @@ +../../../programs/server/users.xml \ No newline at end of file diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj new file mode 100644 index 00000000000..8aa157bc16e --- /dev/null +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -0,0 +1,143 @@ +(ns jepsen.nukeeper.main + (:require [clojure.tools.logging :refer :all] + [jepsen.nukeeper.utils :refer :all] + [jepsen.nukeeper.set :as set] + [jepsen.nukeeper.register :as register] + [clojure.string :as str] + [jepsen + [checker :as checker] + [cli :as cli] + [client :as client] + [control :as c] + [db :as db] + [nemesis :as nemesis] + [generator :as gen] + [independent :as independent] + [tests :as tests]] + [jepsen.control.util :as cu] + [jepsen.os.ubuntu :as ubuntu] + [jepsen.checker.timeline :as timeline] + [clojure.java.io :as io] + [knossos.model :as model] + [zookeeper.data :as data] + [zookeeper :as zk]) + (:import (org.apache.zookeeper ZooKeeper KeeperException KeeperException$BadVersionException))) + +(def dir "/var/lib/clickhouse") +(def binary "clickhouse") +(def logdir "/var/log/clickhouse-server") +(def logfile "/var/log/clickhouse-server/stderr.log") +(def serverlog "/var/log/clickhouse-server/clickhouse-server.log") +(def pidfile (str dir "/clickhouse.pid")) +(def binary-path "/tmp") + +(defn cluster-config + [test node config-template] + (let [nodes (:nodes test)] + (clojure.string/replace + (clojure.string/replace + (clojure.string/replace + (clojure.string/replace + (clojure.string/replace config-template #"\{quorum_reads\}" (str (boolean (:quorum test)))) + #"\{srv1\}" (get nodes 0)) + #"\{srv2\}" (get nodes 1)) + #"\{srv3\}" (get nodes 2)) + #"\{id\}" (str (inc (.indexOf nodes node)))))) + +(defn db + [version] + (reify db/DB + (setup! 
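      ;; runs once per node before the test: installs the ClickHouse binary, renders this
      ;; node's keeper config from the cluster layout, and starts the server as a daemon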
[_ test node] + (info node "installing clickhouse" version) + (c/su + (if-not (cu/exists? (str binary-path "/clickhouse")) + (c/exec :sky :get :-d binary-path :-N :Backbone version)) + (c/exec :mkdir :-p logdir) + (c/exec :touch logfile) + (c/exec (str binary-path "/clickhouse") :install) + (c/exec :chown :-R :root dir) + (c/exec :chown :-R :root logdir) + (c/exec :echo (slurp (io/resource "listen.xml")) :> "/etc/clickhouse-server/config.d/listen.xml") + (c/exec :echo (cluster-config test node (slurp (io/resource "test_keeper_config.xml"))) :> "/etc/clickhouse-server/config.d/test_keeper_config.xml") + (cu/start-daemon! + {:pidfile pidfile + :logfile logfile + :chdir dir} + (str binary-path "/clickhouse") + :server + :--config "/etc/clickhouse-server/config.xml") + (Thread/sleep 10000))) + + (teardown! [_ test node] + (info node "tearing down clickhouse") + (cu/stop-daemon! (str binary-path "/clickhouse") pidfile) + (c/su + (c/exec :rm :-f (str binary-path "/clickhouse")) + (c/exec :rm :-rf dir) + (c/exec :rm :-rf logdir) + (c/exec :rm :-rf "/etc/clickhouse-server"))) + + db/LogFiles + (log-files [_ test node] + [logfile serverlog]))) + +(def workloads + "A map of workload names to functions that construct workloads, given opts." + {"set" set/workload + "register" register/workload}) + +(def cli-opts + "Additional command line options." + [["-w" "--workload NAME" "What workload should we run?" + :missing (str "--workload " (cli/one-of workloads)) + :validate [workloads (cli/one-of workloads)]] + ["-q" "--quorum" "Use quorum reads, instead of reading from any primary."] + ["-r" "--rate HZ" "Approximate number of requests per second, per thread." + :default 10 + :parse-fn read-string + :validate [#(and (number? %) (pos? %)) "Must be a positive number"]] + [nil "--ops-per-key NUM" "Maximum number of operations on any given key." + :default 100 + :parse-fn parse-long + :validate [pos? "Must be a positive integer."]]]) + +(defn nukeeper-test + "Given an options map from the command line runner (e.g. :nodes, :ssh, + :concurrency, ...), constructs a test map." + [opts] + (let [quorum (boolean (:quorum opts)) + workload ((get workloads (:workload opts)) opts)] + (merge tests/noop-test + opts + {:name (str "clickhouse-keeper quorum=" quorum " " (name (:workload opts))) + :os ubuntu/os + :db (db "rbtorrent:8831b5baa571abc28340cf66a9279a4ce45fac64") + :pure-generators true + :client (:client workload) + :nemesis (nemesis/partition-random-halves) + :checker (checker/compose + {:perf (checker/perf) + :workload (:checker workload)}) + :generator (gen/phases + (->> (:generator workload) + (gen/stagger (/ (:rate opts))) + (gen/nemesis + (cycle [(gen/sleep 5) + {:type :info, :f :start} + (gen/sleep 5) + {:type :info, :f :stop}])) + (gen/time-limit (:time-limit opts))) + (gen/log "Healing cluster") + (gen/nemesis (gen/once {:type :info, :f :stop})) + (gen/log "Waiting for recovery") + (gen/sleep 10) + (gen/clients (:final-generator workload)))}))) + +(defn -main + "Handles command line arguments. Can either run a test, or a web server for + browsing results." + [& args] + (cli/run! 
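   ;; single-test-cmd wires nukeeper-test and cli-opts into jepsen's "test" command;
   ;; serve-cmd adds the "serve" command that hosts the web UI for browsing results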
(merge (cli/single-test-cmd {:test-fn nukeeper-test + :opt-spec cli-opts}) + (cli/serve-cmd)) + args)) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/register.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/register.clj new file mode 100644 index 00000000000..98322845346 --- /dev/null +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/register.clj @@ -0,0 +1,64 @@ +(ns jepsen.nukeeper.register + (:require [jepsen + [checker :as checker] + [client :as client] + [independent :as independent] + [generator :as gen]] + [jepsen.checker.timeline :as timeline] + [knossos.model :as model] + [jepsen.nukeeper.utils :refer :all] + [zookeeper :as zk]) + (:import (org.apache.zookeeper ZooKeeper KeeperException KeeperException$BadVersionException))) + +(defn r [_ _] {:type :invoke, :f :read, :value nil}) +(defn w [_ _] {:type :invoke, :f :write, :value (rand-int 5)}) +(defn cas [_ _] {:type :invoke, :f :cas, :value [(rand-int 5) (rand-int 5)]}) + +(defrecord RegisterClient [conn] + client/Client + (open! [this test node] + (assoc this :conn (zk-connect node 9181 30000))) + + (setup! [this test] + (zk-create-range conn 300)) ; 300 nodes to be sure + + (invoke! [_ test op] + (let [[k v] (:value op) + zk-k (zk-path k)] + (case (:f op) + :read (try + (assoc op :type :ok, :value (independent/tuple k (parse-long (:data (zk-get-str conn zk-k))))) + (catch Exception _ (assoc op :type :fail, :error :connect-error))) + :write (try + (do (zk-set conn zk-k v) + (assoc op :type :ok)) + (catch Exception _ (assoc op :type :info, :error :connect-error))) + :cas (try + (let [[old new] v] + (assoc op :type (if (zk-cas conn zk-k old new) + :ok + :fail))) + (catch KeeperException$BadVersionException _ (assoc op :type :fail, :error :bad-version)) + (catch Exception _ (assoc op :type :info, :error :connect-error)))))) + + (teardown! [this test]) + + (close! [_ test] + (zk/close conn))) + +(defn workload + "Tests linearizable reads, writes, and compare-and-set operations on + independent keys." + [opts] + {:client (RegisterClient. nil) + :checker (independent/checker + (checker/compose + {:linear (checker/linearizable {:model (model/cas-register) + :algorithm :linear}) + :timeline (timeline/html)})) + :generator (independent/concurrent-generator + 10 + (range) + (fn [k] + (->> (gen/mix [r w cas]) + (gen/limit (:ops-per-key opts)))))}) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj new file mode 100644 index 00000000000..7e196fab4c7 --- /dev/null +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj @@ -0,0 +1,43 @@ +(ns jepsen.nukeeper.set + (:require [jepsen + [checker :as checker] + [client :as client] + [generator :as gen]] + [jepsen.nukeeper.utils :refer :all] + [zookeeper :as zk]) + (:import (org.apache.zookeeper ZooKeeper KeeperException KeeperException$BadVersionException))) + +(defrecord SetClient [k conn] + client/Client + (open! [this test node] + (assoc this :conn (zk-connect node 9181 30000))) + + (setup! [this test] + (zk-create-if-not-exists conn k "#{}")) + + (invoke! [_ test op] + (case (:f op) + :read ;(try + (assoc op + :type :ok + :value (read-string (:data (zk-get-str conn k)))) + ;(catch Exception _ (assoc op :type :fail, :error :connect-error))) + :add (try + (do + (zk-add-to-set conn k (:value op)) + (assoc op :type :ok)) + (catch KeeperException$BadVersionException _ (assoc op :type :fail, :error :bad-version)) + (catch Exception _ (assoc op :type :info, :error :connect-error))))) + + (teardown! [_ test]) + + (close! 
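    ;; note: unlike RegisterClient, this client does not zk/close its connection on close!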
[_ test])) + +(defn workload + "A generator, client, and checker for a set test." + [opts] + {:client (SetClient. "/a-set" nil) + :checker (checker/set) + :generator (->> (range) + (map (fn [x] {:type :invoke, :f :add, :value x}))) + :final-generator (gen/once {:type :invoke, :f :read, :value nil})}) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj new file mode 100644 index 00000000000..3caec8e5f62 --- /dev/null +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj @@ -0,0 +1,56 @@ +(ns jepsen.nukeeper.utils + (:require [clojure.string :as str] + [zookeeper.data :as data] + [zookeeper :as zk])) + +(defn parse-long + "Parses a string to a Long. Passes through `nil` and empty strings." + [s] + (if (and s (> (count s) 0)) + (Long/parseLong s))) + +(defn zk-range + [] + (map (fn [v] (str "/" v)) (range))) + +(defn zk-path + [n] + (str "/" n)) + +(defn zk-connect + [host port timeout] + (zk/connect (str host ":" port) :timeout-msec timeout)) + +(defn zk-create-range + [conn n] + (dorun (map (fn [v] (zk/create-all conn v :persistent? true)) (take n (zk-range))))) + +(defn zk-set + ([conn path value] + (zk/set-data conn path (data/to-bytes (str value)) -1)) + ([conn path value version] + (zk/set-data conn path (data/to-bytes (str value)) version))) + +(defn zk-get-str + [conn path] + (let [zk-result (zk/data conn path)] + {:data (data/to-string (:data zk-result)) + :stat (:stat zk-result)})) + +(defn zk-cas + [conn path old-value new-value] + (let [current-value (zk-get-str conn path)] + (if (= (parse-long (:data current-value)) old-value) + (do (zk-set conn path new-value (:version (:stat current-value))) + true)))) + +(defn zk-add-to-set + [conn path elem] + (let [current-value (zk-get-str conn path) + current-set (read-string (:data current-value)) + new-set (conj current-set elem)] + (zk-set conn path (pr-str new-set) (:version (:stat current-value))))) + +(defn zk-create-if-not-exists + [conn path data] + (zk/create conn path :data (data/to-bytes (str data)))) diff --git a/tests/jepsen.nukeeper/test/jepsen/nukeeper_test.clj b/tests/jepsen.nukeeper/test/jepsen/nukeeper_test.clj new file mode 100644 index 00000000000..824aa40d2c8 --- /dev/null +++ b/tests/jepsen.nukeeper/test/jepsen/nukeeper_test.clj @@ -0,0 +1,28 @@ +(ns jepsen.nukeeper-test + (:require [clojure.test :refer :all] + [jepsen.nukeeper.utils :refer :all] + [zookeeper :as zk] + [zookeeper.data :as data])) + +(defn multicreate + [conn] + (dorun (map (fn [v] (zk/create conn v :persistent? 
true)) (take 10 (zk-range))))) + +(defn multidelete + [conn] + (dorun (map (fn [v] (zk/delete conn v)) (take 10 (zk-range))))) + +(deftest a-test + (testing "nukeeper connection" + (let [conn (zk/connect "localhost:9181" :timeout-msec 5000)] + (println (take 10 (zk-range))) + (multidelete conn) + (multicreate conn) + (zk/create-all conn "/0") + (zk/create conn "/0") + (println (zk/children conn "/")) + (zk/set-data conn "/0" (data/to-bytes "777") -1) + (Thread/sleep 5000) + (println "VALUE" (data/to-string (:data (zk/data conn "/0")))) + (is (= (data/to-string (:data (zk/data conn "/0"))) "777")) + (zk/close conn)))) From ce20eae2a3efd4e649bcb598c71d403f83463deb Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Wed, 10 Mar 2021 01:58:19 +0300 Subject: [PATCH 008/155] Added specialized CacheDictionaryStorage --- src/Common/PODArray.h | 24 + src/Common/tests/gtest_pod_array.cpp | 54 ++ src/Dictionaries/CacheDictionaryStorage.h | 474 ++++++++++++++---- src/Dictionaries/SSDCacheDictionaryStorage.h | 3 - .../SerializedCacheDictionaryStorage.h | 412 +++++++++++++++ src/Dictionaries/benchmark | 154 ------ .../registerCacheDictionaries.cpp | 62 ++- 7 files changed, 908 insertions(+), 275 deletions(-) create mode 100644 src/Dictionaries/SerializedCacheDictionaryStorage.h delete mode 100644 src/Dictionaries/benchmark diff --git a/src/Common/PODArray.h b/src/Common/PODArray.h index 163a6503d2e..57ad3d46177 100644 --- a/src/Common/PODArray.h +++ b/src/Common/PODArray.h @@ -692,6 +692,30 @@ public: assign(from.begin(), from.end()); } + void erase(const_iterator first, const_iterator last) + { + iterator first_no_const = const_cast(first); + iterator last_no_const = const_cast(last); + + size_t items_to_move = end() - last; + + while (items_to_move != 0) + { + *first_no_const = *last_no_const; + + ++first_no_const; + ++last_no_const; + + --items_to_move; + } + + this->c_end = reinterpret_cast(first_no_const); + } + + void erase(const_iterator pos) + { + this->erase(pos, pos + 1); + } bool operator== (const PODArray & rhs) const { diff --git a/src/Common/tests/gtest_pod_array.cpp b/src/Common/tests/gtest_pod_array.cpp index 53b3e207a22..63cf7026757 100644 --- a/src/Common/tests/gtest_pod_array.cpp +++ b/src/Common/tests/gtest_pod_array.cpp @@ -92,3 +92,57 @@ TEST(Common, PODInsertElementSizeNotMultipleOfLeftPadding) EXPECT_EQ(arr1_initially_empty.size(), items_to_insert_size); } + +TEST(Common, PODErase) +{ + { + PaddedPODArray items {0,1,2,3,4,5,6,7,8,9}; + PaddedPODArray expected; + expected = {0,1,2,3,4,5,6,7,8,9}; + + items.erase(items.begin(), items.begin()); + EXPECT_EQ(items, expected); + + items.erase(items.end(), items.end()); + EXPECT_EQ(items, expected); + } + { + PaddedPODArray actual {0,1,2,3,4,5,6,7,8,9}; + PaddedPODArray expected; + + expected = {0,1,4,5,6,7,8,9}; + actual.erase(actual.begin() + 2, actual.begin() + 4); + EXPECT_EQ(actual, expected); + + expected = {0,1,4}; + actual.erase(actual.begin() + 3, actual.end()); + EXPECT_EQ(actual, expected); + + expected = {}; + actual.erase(actual.begin(), actual.end()); + EXPECT_EQ(actual, expected); + + for (size_t i = 0; i < 10; ++i) + actual.emplace_back(static_cast(i)); + + expected = {0,1,4,5,6,7,8,9}; + actual.erase(actual.begin() + 2, actual.begin() + 4); + EXPECT_EQ(actual, expected); + + expected = {0,1,4}; + actual.erase(actual.begin() + 3, actual.end()); + EXPECT_EQ(actual, expected); + + expected = {}; + actual.erase(actual.begin(), actual.end()); + EXPECT_EQ(actual, expected); + } + { + PaddedPODArray actual {0,1,2,3,4,5,6,7,8,9}; 
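+
+        // Single-element erase below relies on the erase(first, last) overload
+        // added above: erase(pos) is defined as erase(pos, pos + 1), so erasing
+        // begin() shifts every remaining element left by one and shrinks the
+        // array by one.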
+ PaddedPODArray expected; + + expected = {1,2,3,4,5,6,7,8,9}; + actual.erase(actual.begin()); + EXPECT_EQ(actual, expected); + } +} diff --git a/src/Dictionaries/CacheDictionaryStorage.h b/src/Dictionaries/CacheDictionaryStorage.h index cf0b74e8bd2..2b34b13fa6f 100644 --- a/src/Dictionaries/CacheDictionaryStorage.h +++ b/src/Dictionaries/CacheDictionaryStorage.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include @@ -30,16 +31,7 @@ struct CacheDictionaryStorageConfiguration const DictionaryLifetime lifetime; }; -/** Keys are stored in LRUCache and column values are serialized into arena. - - Cell in LRUCache consists of allocated size and place in arena were columns serialized data is stored. - - Columns are serialized by rows. - - When cell is removed from LRUCache data associated with it is also removed from arena. - - In case of complex key we also store key data in arena and it is removed from arena. -*/ +/// TODO: Add documentation template class CacheDictionaryStorage final : public ICacheDictionaryStorage { @@ -47,11 +39,36 @@ public: using KeyType = std::conditional_t; static_assert(dictionary_key_type != DictionaryKeyType::range, "Range key type is not supported by CacheDictionaryStorage"); - explicit CacheDictionaryStorage(CacheDictionaryStorageConfiguration & configuration_) + explicit CacheDictionaryStorage( + const DictionaryStructure & dictionary_structure, + CacheDictionaryStorageConfiguration & configuration_) : configuration(configuration_) , rnd_engine(randomSeed()) - , cache(configuration.max_size_in_cells, false, { arena }) + , cache(configuration.max_size_in_cells, false, { *this }) { + for (const auto & dictionary_attribute : dictionary_structure.attributes) + { + auto attribute_type = dictionary_attribute.underlying_type; + + auto type_call = [&](const auto & dictionary_attribute_type) + { + using Type = std::decay_t; + using AttributeType = typename Type::AttributeType; + using ValueType = DictionaryValueType; + + attributes.emplace_back(); + auto & last_attribute = attributes.back(); + last_attribute.type = attribute_type; + last_attribute.is_complex_type = dictionary_attribute.is_nullable || dictionary_attribute.is_array; + + if (dictionary_attribute.is_nullable) + last_attribute.attribute_container = std::vector(); + else + last_attribute.attribute_container = PaddedPODArray(); + }; + + callOnDictionaryAttributeType(attribute_type, type_call); + } } bool returnsFetchedColumnsInOrderOfRequestedKeys() const override { return true; } @@ -144,10 +161,36 @@ public: size_t getMaxSize() const override { return cache.getMaxSize(); } - size_t getBytesAllocated() const override { return arena.size() + cache.getSizeInBytes(); } + size_t getBytesAllocated() const override + { + size_t attributes_size_in_bytes = 0; + size_t attributes_size = attributes.size(); + + for (size_t attribute_index = 0; attribute_index < attributes_size; ++attribute_index) + { + getAttributeContainer(attribute_index, [&](const auto & container) + { + attributes_size_in_bytes += container.capacity() * sizeof(container[0]); + }); + } + + return arena.size() + cache.getSizeInBytes(); + } private: + struct FetchedKey + { + FetchedKey(size_t element_index_, bool is_default_) + : element_index(element_index_) + , is_default(is_default_) + {} + + const size_t element_index; + const bool is_default; + }; + + template ALWAYS_INLINE KeysStorageFetchResult fetchColumnsForKeysImpl( const PaddedPODArray & keys, @@ -161,10 +204,12 @@ private: const auto now = std::chrono::system_clock::now(); size_t 
fetched_columns_index = 0; + size_t keys_size = keys.size(); std::chrono::seconds max_lifetime_seconds(configuration.strict_max_lifetime_seconds); - size_t keys_size = keys.size(); + PaddedPODArray fetched_keys; + fetched_keys.reserve(keys_size); for (size_t key_index = 0; key_index < keys_size; ++key_index) { @@ -195,19 +240,14 @@ private: ++result.found_keys_size; } - ++fetched_columns_index; - - if (cell.isDefault()) + if (cell.is_default) { result.key_index_to_state[key_index].setDefault(); ++result.default_keys_size; - insertDefaultValuesIntoColumns(result.fetched_columns, fetch_request, key_index); - } - else - { - const char * place_for_serialized_columns = cell.place_for_serialized_columns; - deserializeAndInsertIntoColumns(result.fetched_columns, fetch_request, place_for_serialized_columns); } + + fetched_keys.emplace_back(cell.element_index, cell.is_default); + ++fetched_columns_index; } else { @@ -216,64 +256,166 @@ private: } } + for (size_t attribute_index = 0; attribute_index < fetch_request.attributesSize(); ++attribute_index) + { + if (!fetch_request.shouldFillResultColumnWithIndex(attribute_index)) + continue; + + size_t fetched_keys_size = fetched_keys.size(); + auto & attribute = attributes[attribute_index]; + const auto & default_value_provider = fetch_request.defaultValueProviderAtIndex(attribute_index); + auto & fetched_column = *result.fetched_columns[attribute_index]; + fetched_column.reserve(fetched_keys_size); + + if (unlikely(attribute.is_complex_type)) + { + auto & container = std::get>(attribute.attribute_container); + + for (size_t fetched_key_index = 0; fetched_key_index < fetched_keys.size(); ++fetched_key_index) + { + auto fetched_key = fetched_keys[fetched_key_index]; + + if (fetched_key.is_default) + fetched_column.insert(default_value_provider.getDefaultValue(fetched_key_index)); + else + fetched_column.insert(container[fetched_key.element_index]); + } + } + else + { + auto type_call = [&](const auto & dictionary_attribute_type) + { + using Type = std::decay_t; + using AttributeType = typename Type::AttributeType; + using ValueType = DictionaryValueType; + using ColumnType = + std::conditional_t, ColumnString, + std::conditional_t, ColumnDecimal, + ColumnVector>>; + + auto & container = std::get>(attribute.attribute_container); + ColumnType & column_typed = static_cast(fetched_column); + + if constexpr (std::is_same_v) + { + for (size_t fetched_key_index = 0; fetched_key_index < fetched_keys.size(); ++fetched_key_index) + { + auto fetched_key = fetched_keys[fetched_key_index]; + + if (fetched_key.is_default) + column_typed.insert(default_value_provider.getDefaultValue(fetched_key_index)); + else + { + auto item = container[fetched_key.element_index]; + column_typed.insertData(item.data, item.size); + } + } + } + else + { + for (size_t fetched_key_index = 0; fetched_key_index < fetched_keys.size(); ++fetched_key_index) + { + auto fetched_key = fetched_keys[fetched_key_index]; + auto & data = column_typed.getData(); + + if (fetched_key.is_default) + column_typed.insert(default_value_provider.getDefaultValue(fetched_key_index)); + else + { + auto item = container[fetched_key.element_index]; + data.push_back(item); + } + } + } + }; + + callOnDictionaryAttributeType(attribute.type, type_call); + } + } + return result; } void insertColumnsForKeysImpl(const PaddedPODArray & keys, Columns columns) { - Arena temporary_values_pool; - - size_t columns_to_serialize_size = columns.size(); - PaddedPODArray temporary_column_data(columns_to_serialize_size); - const auto 
now = std::chrono::system_clock::now(); - size_t keys_size = keys.size(); for (size_t key_index = 0; key_index < keys_size; ++key_index) { - size_t allocated_size_for_columns = 0; - const char * block_start = nullptr; - auto key = keys[key_index]; - auto * it = cache.find(key); + cache.erase(key); - for (size_t column_index = 0; column_index < columns_to_serialize_size; ++column_index) + Cell cell; + + setCellDeadline(cell, now); + cell.element_index = insert_index; + cell.is_default = false; + + ++insert_index; + + insertCellInCache(key, cell); + } + + Field complex_column_value; + + for (size_t column_index = 0; column_index < columns.size(); ++column_index) + { + auto & attribute = attributes[column_index]; + const auto & column = columns[column_index]; + size_t column_size = column->size(); + + if (unlikely(attribute.is_complex_type)) { - auto & column = columns[column_index]; - temporary_column_data[column_index] = column->serializeValueIntoArena(key_index, temporary_values_pool, block_start); - allocated_size_for_columns += temporary_column_data[column_index].size; - } + auto & container = std::get>(attribute.attribute_container); + container.reserve(column_size); - char * place_for_serialized_columns = arena.alloc(allocated_size_for_columns); - memcpy(reinterpret_cast(place_for_serialized_columns), reinterpret_cast(block_start), allocated_size_for_columns); - - if (it) - { - /// Cell exists need to free previous serialized place and update deadline - auto & cell = it->getMapped(); - - if (cell.place_for_serialized_columns) - arena.free(cell.place_for_serialized_columns, cell.allocated_size_for_columns); - - setCellDeadline(cell, now); - cell.allocated_size_for_columns = allocated_size_for_columns; - cell.place_for_serialized_columns = place_for_serialized_columns; + for (size_t item_index = 0; item_index < column_size; ++item_index) + { + column->get(item_index, complex_column_value); + container.emplace_back(complex_column_value); + } } else { - /// No cell exists so create and put in cache - Cell cell; + auto type_call = [&](const auto & dictionary_attribute_type) + { + using Type = std::decay_t; + using AttributeType = typename Type::AttributeType; + using ValueType = DictionaryValueType; + using ColumnType = + std::conditional_t, ColumnString, + std::conditional_t, ColumnDecimal, + ColumnVector>>; - setCellDeadline(cell, now); - cell.allocated_size_for_columns = allocated_size_for_columns; - cell.place_for_serialized_columns = place_for_serialized_columns; + const ColumnType & column_typed = static_cast(*column); - insertCellInCache(key, cell); + auto & container = std::get>(attribute.attribute_container); + container.reserve(column_size); + + if constexpr (std::is_same_v) + { + /// TODO: Serialize while column string in arena then just insert offsets in container + for (size_t item_index = 0; item_index < column_size; ++item_index) + { + StringRef value = column->getDataAt(item_index); + StringRef updated_data = copyStringInArena(value); + + container.emplace_back(updated_data); + } + } + else + { + const auto & data = column_typed.getData(); + container.insert(data.begin(), data.end()); + } + }; + + callOnDictionaryAttributeType(attribute.type, type_call); } - - temporary_values_pool.rollback(allocated_size_for_columns); } + + deleteUnusedKeysIfNecessary(); } void insertDefaultKeysImpl(const PaddedPODArray & keys) @@ -282,31 +424,18 @@ private: for (auto key : keys) { - auto * it = cache.find(key); + cache.erase(key); - if (it) - { - auto & cell = it->getMapped(); + Cell cell; 
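+
+            // A default cell is a negative cache entry: the key is known to be
+            // absent from the dictionary source, so it is stored with
+            // is_default = true and takes no slot in the attribute containers
+            // (element_index stays 0 and is never read for such cells).
+            // Repeated misses then hit the cache instead of the source until
+            // the deadline expires.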
- setCellDeadline(cell, now); + setCellDeadline(cell, now); + cell.element_index = 0; + cell.is_default = true; - if (cell.place_for_serialized_columns) - arena.free(cell.place_for_serialized_columns, cell.allocated_size_for_columns); - - cell.allocated_size_for_columns = 0; - cell.place_for_serialized_columns = nullptr; - } - else - { - Cell cell; - - setCellDeadline(cell, now); - cell.allocated_size_for_columns = 0; - cell.place_for_serialized_columns = nullptr; - - insertCellInCache(key, cell); - } + insertCellInCache(key, cell); } + + deleteUnusedKeysIfNecessary(); } PaddedPODArray getCachedKeysImpl() const @@ -318,7 +447,7 @@ private: { auto & cell = node.getMapped(); - if (cell.isDefault()) + if (cell.is_default) continue; result.emplace_back(node.getKey()); @@ -327,37 +456,138 @@ private: return result; } + void deleteUnusedKeysIfNecessary() + { + size_t cache_max_size = cache.getMaxSize(); + + if (unlikely(attributes.empty()) || insert_index * 2 < cache_max_size) + return; + + std::unordered_map element_index_to_cache_iterator; + + for (auto begin = cache.begin(); begin != cache.end(); ++begin) + { + auto & node = *begin; + auto & cell = node.getMapped(); + size_t element_index = cell.element_index; + element_index_to_cache_iterator.insert(std::make_pair(element_index, begin)); + } + + size_t last_remove_index = 0; + + getAttributeContainer(0, [&, this](auto & container) + { + size_t container_size = container.size(); + size_t remove_index = 0; + + for (size_t i = 0; i < container_size; ++i) + { + if (indexes_to_delete.contains(i)) + continue; + + std::swap(container[remove_index], container[i]); + + auto it = element_index_to_cache_iterator.find(remove_index); + if (it != element_index_to_cache_iterator.end()) + { + auto & cell = it->second->getMapped(); + cell.element_index = remove_index; + } + + ++remove_index; + } + + container.erase(container.begin() + remove_index, container.end()); + last_remove_index = remove_index; + }); + + insert_index = last_remove_index; + + for (size_t attribute_index = 1; attribute_index < attributes.size(); ++attribute_index) + { + getAttributeContainer(attribute_index, [this](auto & container) + { + size_t container_size = container.size(); + size_t remove_index = 0; + + for (size_t i = 0; i < container_size; ++i) + { + if (indexes_to_delete.contains(i)) + continue; + + std::swap(container[remove_index], container[i]); + ++remove_index; + } + + container.erase(container.begin() + remove_index, container.end()); + }); + } + + indexes_to_delete.clear(); + } + + template + void getAttributeContainer(size_t attribute_index, GetContainerFunc && func) + { + auto & attribute = attributes[attribute_index]; + auto & attribute_type = attribute.type; + + if (unlikely(attribute.is_complex_type)) + { + auto & container = std::get>(attribute.attribute_container); + std::forward(func)(container); + } + else + { + auto type_call = [&](const auto & dictionary_attribute_type) + { + using Type = std::decay_t; + using AttributeType = typename Type::AttributeType; + using ValueType = DictionaryValueType; + + auto & container = std::get>(attribute.attribute_container); + std::forward(func)(container); + }; + + callOnDictionaryAttributeType(attribute_type, type_call); + } + } + + template + void getAttributeContainer(size_t attribute_index, GetContainerFunc && func) const + { + return const_cast *>(this)->template getAttributeContainer(attribute_index, std::forward(func)); + } + + using TimePoint = std::chrono::system_clock::time_point; struct Cell { TimePoint deadline; 
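 
         // With the columnar rework a cell carries only the expiration deadline,
         // a row index into the per-attribute containers (element_index), and a
         // flag for negatively cached keys (is_default); the per-cell serialized
         // blob and its size go away.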
- size_t allocated_size_for_columns; - char * place_for_serialized_columns; - - inline bool isDefault() const { return place_for_serialized_columns == nullptr; } - inline void setDefault() - { - place_for_serialized_columns = nullptr; - allocated_size_for_columns = 0; - } + size_t element_index; + bool is_default; }; void insertCellInCache(KeyType & key, const Cell & cell) { + /// Copy complex key into arena and put in cache if constexpr (dictionary_key_type == DictionaryKeyType::complex) - { - /// Copy complex key into arena and put in cache - size_t key_size = key.size; - char * place_for_key = arena.alloc(key_size); - memcpy(reinterpret_cast(place_for_key), reinterpret_cast(key.data), key_size); - KeyType updated_key{place_for_key, key_size}; - key = updated_key; - } + key = copyStringInArena(key); cache.insert(key, cell); } + StringRef copyStringInArena(StringRef value_to_copy) + { + size_t value_to_copy_size = value_to_copy.size; + char * place_for_key = arena.alloc(value_to_copy_size); + memcpy(reinterpret_cast(place_for_key), reinterpret_cast(value_to_copy.data), value_to_copy_size); + StringRef updated_value{place_for_key, value_to_copy_size}; + + return updated_value; + } + inline static bool cellHasDeadline(const Cell & cell) { return cell.deadline != std::chrono::system_clock::from_time_t(0); @@ -378,34 +608,58 @@ private: cell.deadline = now + std::chrono::seconds(distribution(rnd_engine)); } - template - friend class ArenaCellDisposer; - CacheDictionaryStorageConfiguration configuration; ArenaWithFreeLists arena; pcg64 rnd_engine; - class ArenaCellDisposer + struct Attribute + { + AttributeUnderlyingType type; + bool is_complex_type; + + std::variant< + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + PaddedPODArray, + std::vector> attribute_container; + }; + + std::vector attributes; + size_t insert_index = 0; + std::unordered_set> indexes_to_delete; + + class CacheStorageCellDisposer { public: - ArenaWithFreeLists & arena; + CacheDictionaryStorage & storage; template - void operator()(const Key & key, const Value & value) const + void operator()(const Key & key, const Value & cell) const { /// In case of complex key we keep it in arena if constexpr (std::is_same_v) - arena.free(const_cast(key.data), key.size); + storage.arena.free(const_cast(key.data), key.size); - if (value.place_for_serialized_columns) - arena.free(value.place_for_serialized_columns, value.allocated_size_for_columns); + storage.indexes_to_delete.insert(cell.element_index); } }; - using SimpleKeyLRUHashMap = LRUHashMap; - using ComplexKeyLRUHashMap = LRUHashMapWithSavedHash; + using SimpleKeyLRUHashMap = LRUHashMap; + using ComplexKeyLRUHashMap = LRUHashMapWithSavedHash; using CacheLRUHashMap = std::conditional_t< dictionary_key_type == DictionaryKeyType::simple, diff --git a/src/Dictionaries/SSDCacheDictionaryStorage.h b/src/Dictionaries/SSDCacheDictionaryStorage.h index 16a8954de58..e061b783ee4 100644 --- a/src/Dictionaries/SSDCacheDictionaryStorage.h +++ b/src/Dictionaries/SSDCacheDictionaryStorage.h @@ -1316,9 +1316,6 @@ private: cell.deadline = now + std::chrono::seconds{distribution(rnd_engine)}; } - template - friend class ArenaCellKeyDisposer; - SSDCacheDictionaryStorageConfiguration configuration; SSDCacheFileBuffer file_buffer; diff --git a/src/Dictionaries/SerializedCacheDictionaryStorage.h 
b/src/Dictionaries/SerializedCacheDictionaryStorage.h new file mode 100644 index 00000000000..2616e03763c --- /dev/null +++ b/src/Dictionaries/SerializedCacheDictionaryStorage.h @@ -0,0 +1,412 @@ +#pragma once + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; +} + +struct SerializedCacheDictionaryStorageConfiguration +{ + /// Max size of storage in cells + const size_t max_size_in_cells; + /// Needed to perform check if cell is expired or not found. Default value is dictionary max lifetime. + const size_t strict_max_lifetime_seconds; + /// Lifetime of dictionary. Cell deadline is random value between lifetime min and max seconds. + const DictionaryLifetime lifetime; +}; + +/** Keys are stored in LRUCache and column values are serialized into arena. + + Cell in LRUCache consists of allocated size and place in arena were columns serialized data is stored. + + Columns are serialized by rows. + + When cell is removed from LRUCache data associated with it is also removed from arena. + + In case of complex key we also store key data in arena and it is removed from arena. +*/ +/// TODO: Remove +template +class SerializedCacheDictionaryStorage final : public ICacheDictionaryStorage +{ +public: + using KeyType = std::conditional_t; + static_assert(dictionary_key_type != DictionaryKeyType::range, "Range key type is not supported by CacheDictionaryStorage"); + + explicit SerializedCacheDictionaryStorage(SerializedCacheDictionaryStorageConfiguration & configuration_) + : configuration(configuration_) + , rnd_engine(randomSeed()) + , cache(configuration.max_size_in_cells, false, { arena }) + { + } + + bool returnsFetchedColumnsInOrderOfRequestedKeys() const override { return true; } + + String getName() const override + { + if (dictionary_key_type == DictionaryKeyType::simple) + return "SerializedCache"; + else + return "ComplexKeySerializedCache"; + } + + bool supportsSimpleKeys() const override { return dictionary_key_type == DictionaryKeyType::simple; } + + SimpleKeysStorageFetchResult fetchColumnsForKeys( + const PaddedPODArray & keys, + const DictionaryStorageFetchRequest & fetch_request) override + { + if constexpr (dictionary_key_type == DictionaryKeyType::simple) + return fetchColumnsForKeysImpl(keys, fetch_request); + else + throw Exception("Method fetchColumnsForKeys is not supported for complex key storage", ErrorCodes::NOT_IMPLEMENTED); + } + + void insertColumnsForKeys(const PaddedPODArray & keys, Columns columns) override + { + if constexpr (dictionary_key_type == DictionaryKeyType::simple) + insertColumnsForKeysImpl(keys, columns); + else + throw Exception("Method insertColumnsForKeys is not supported for complex key storage", ErrorCodes::NOT_IMPLEMENTED); + } + + void insertDefaultKeys(const PaddedPODArray & keys) override + { + if constexpr (dictionary_key_type == DictionaryKeyType::simple) + insertDefaultKeysImpl(keys); + else + throw Exception("Method insertDefaultKeysImpl is not supported for complex key storage", ErrorCodes::NOT_IMPLEMENTED); + } + + PaddedPODArray getCachedSimpleKeys() const override + { + if constexpr (dictionary_key_type == DictionaryKeyType::simple) + return getCachedKeysImpl(); + else + throw Exception("Method getCachedSimpleKeys is not supported for complex key storage", ErrorCodes::NOT_IMPLEMENTED); + } + + bool supportsComplexKeys() const override { return dictionary_key_type == DictionaryKeyType::complex; } + + 
ComplexKeysStorageFetchResult fetchColumnsForKeys( + const PaddedPODArray & keys, + const DictionaryStorageFetchRequest & column_fetch_requests) override + { + if constexpr (dictionary_key_type == DictionaryKeyType::complex) + return fetchColumnsForKeysImpl(keys, column_fetch_requests); + else + throw Exception("Method fetchColumnsForKeys is not supported for simple key storage", ErrorCodes::NOT_IMPLEMENTED); + } + + void insertColumnsForKeys(const PaddedPODArray & keys, Columns columns) override + { + if constexpr (dictionary_key_type == DictionaryKeyType::complex) + insertColumnsForKeysImpl(keys, columns); + else + throw Exception("Method insertColumnsForKeys is not supported for simple key storage", ErrorCodes::NOT_IMPLEMENTED); + } + + void insertDefaultKeys(const PaddedPODArray & keys) override + { + if constexpr (dictionary_key_type == DictionaryKeyType::complex) + insertDefaultKeysImpl(keys); + else + throw Exception("Method insertDefaultKeysImpl is not supported for simple key storage", ErrorCodes::NOT_IMPLEMENTED); + } + + PaddedPODArray getCachedComplexKeys() const override + { + if constexpr (dictionary_key_type == DictionaryKeyType::complex) + return getCachedKeysImpl(); + else + throw Exception("Method getCachedComplexKeys is not supported for simple key storage", ErrorCodes::NOT_IMPLEMENTED); + } + + size_t getSize() const override { return cache.size(); } + + size_t getMaxSize() const override { return cache.getMaxSize(); } + + size_t getBytesAllocated() const override { return arena.size() + cache.getSizeInBytes(); } + +private: + + template + ALWAYS_INLINE KeysStorageFetchResult fetchColumnsForKeysImpl( + const PaddedPODArray & keys, + const DictionaryStorageFetchRequest & fetch_request) + { + KeysStorageFetchResult result; + + result.fetched_columns = fetch_request.makeAttributesResultColumns(); + result.key_index_to_state.resize_fill(keys.size(), {KeyState::not_found}); + + const auto now = std::chrono::system_clock::now(); + + size_t fetched_columns_index = 0; + + std::chrono::seconds max_lifetime_seconds(configuration.strict_max_lifetime_seconds); + + size_t keys_size = keys.size(); + + for (size_t key_index = 0; key_index < keys_size; ++key_index) + { + auto key = keys[key_index]; + auto * it = cache.find(key); + + if (it) + { + /// Columns values for key are serialized in cache now deserialize them + const auto & cell = it->getMapped(); + + bool has_deadline = cellHasDeadline(cell); + + if (has_deadline && now > cell.deadline + max_lifetime_seconds) + { + result.key_index_to_state[key_index] = {KeyState::not_found}; + ++result.not_found_keys_size; + continue; + } + else if (has_deadline && now > cell.deadline) + { + result.key_index_to_state[key_index] = {KeyState::expired, fetched_columns_index}; + ++result.expired_keys_size; + } + else + { + result.key_index_to_state[key_index] = {KeyState::found, fetched_columns_index}; + ++result.found_keys_size; + } + + ++fetched_columns_index; + + if (cell.isDefault()) + { + result.key_index_to_state[key_index].setDefault(); + ++result.default_keys_size; + insertDefaultValuesIntoColumns(result.fetched_columns, fetch_request, key_index); + } + else + { + const char * place_for_serialized_columns = cell.place_for_serialized_columns; + deserializeAndInsertIntoColumns(result.fetched_columns, fetch_request, place_for_serialized_columns); + } + } + else + { + result.key_index_to_state[key_index] = {KeyState::not_found}; + ++result.not_found_keys_size; + } + } + + return result; + } + + void insertColumnsForKeysImpl(const 
PaddedPODArray & keys, Columns columns) + { + Arena temporary_values_pool; + + size_t columns_to_serialize_size = columns.size(); + PaddedPODArray temporary_column_data(columns_to_serialize_size); + + const auto now = std::chrono::system_clock::now(); + + size_t keys_size = keys.size(); + + for (size_t key_index = 0; key_index < keys_size; ++key_index) + { + size_t allocated_size_for_columns = 0; + const char * block_start = nullptr; + + auto key = keys[key_index]; + auto * it = cache.find(key); + + for (size_t column_index = 0; column_index < columns_to_serialize_size; ++column_index) + { + auto & column = columns[column_index]; + temporary_column_data[column_index] = column->serializeValueIntoArena(key_index, temporary_values_pool, block_start); + allocated_size_for_columns += temporary_column_data[column_index].size; + } + + char * place_for_serialized_columns = arena.alloc(allocated_size_for_columns); + memcpy(reinterpret_cast(place_for_serialized_columns), reinterpret_cast(block_start), allocated_size_for_columns); + + if (it) + { + /// Cell exists need to free previous serialized place and update deadline + auto & cell = it->getMapped(); + + if (cell.place_for_serialized_columns) + arena.free(cell.place_for_serialized_columns, cell.allocated_size_for_columns); + + setCellDeadline(cell, now); + cell.allocated_size_for_columns = allocated_size_for_columns; + cell.place_for_serialized_columns = place_for_serialized_columns; + } + else + { + /// No cell exists so create and put in cache + Cell cell; + + setCellDeadline(cell, now); + cell.allocated_size_for_columns = allocated_size_for_columns; + cell.place_for_serialized_columns = place_for_serialized_columns; + + insertCellInCache(key, cell); + } + + temporary_values_pool.rollback(allocated_size_for_columns); + } + } + + void insertDefaultKeysImpl(const PaddedPODArray & keys) + { + const auto now = std::chrono::system_clock::now(); + + for (auto key : keys) + { + auto * it = cache.find(key); + + if (it) + { + auto & cell = it->getMapped(); + + setCellDeadline(cell, now); + + if (cell.place_for_serialized_columns) + arena.free(cell.place_for_serialized_columns, cell.allocated_size_for_columns); + + cell.allocated_size_for_columns = 0; + cell.place_for_serialized_columns = nullptr; + } + else + { + Cell cell; + + setCellDeadline(cell, now); + cell.allocated_size_for_columns = 0; + cell.place_for_serialized_columns = nullptr; + + insertCellInCache(key, cell); + } + } + } + + PaddedPODArray getCachedKeysImpl() const + { + PaddedPODArray result; + result.reserve(cache.size()); + + for (auto & node : cache) + { + auto & cell = node.getMapped(); + + if (cell.isDefault()) + continue; + + result.emplace_back(node.getKey()); + } + + return result; + } + + using TimePoint = std::chrono::system_clock::time_point; + + struct Cell + { + TimePoint deadline; + size_t allocated_size_for_columns; + char * place_for_serialized_columns; + + inline bool isDefault() const { return place_for_serialized_columns == nullptr; } + inline void setDefault() + { + place_for_serialized_columns = nullptr; + allocated_size_for_columns = 0; + } + }; + + void insertCellInCache(KeyType & key, const Cell & cell) + { + if constexpr (dictionary_key_type == DictionaryKeyType::complex) + { + /// Copy complex key into arena and put in cache + size_t key_size = key.size; + char * place_for_key = arena.alloc(key_size); + memcpy(reinterpret_cast(place_for_key), reinterpret_cast(key.data), key_size); + KeyType updated_key{place_for_key, key_size}; + key = updated_key; + } + + 
cache.insert(key, cell); + } + + inline static bool cellHasDeadline(const Cell & cell) + { + return cell.deadline != std::chrono::system_clock::from_time_t(0); + } + + inline void setCellDeadline(Cell & cell, TimePoint now) + { + if (configuration.lifetime.min_sec == 0 && configuration.lifetime.max_sec == 0) + { + cell.deadline = std::chrono::system_clock::from_time_t(0); + return; + } + + size_t min_sec_lifetime = configuration.lifetime.min_sec; + size_t max_sec_lifetime = configuration.lifetime.max_sec; + + std::uniform_int_distribution distribution{min_sec_lifetime, max_sec_lifetime}; + cell.deadline = now + std::chrono::seconds(distribution(rnd_engine)); + } + + SerializedCacheDictionaryStorageConfiguration configuration; + + ArenaWithFreeLists arena; + + pcg64 rnd_engine; + + class ArenaCellDisposer + { + public: + ArenaWithFreeLists & arena; + + template + void operator()(const Key & key, const Value & value) const + { + /// In case of complex key we keep it in arena + if constexpr (std::is_same_v) + arena.free(const_cast(key.data), key.size); + + if (value.place_for_serialized_columns) + arena.free(value.place_for_serialized_columns, value.allocated_size_for_columns); + } + }; + + using SimpleKeyLRUHashMap = LRUHashMap; + using ComplexKeyLRUHashMap = LRUHashMapWithSavedHash; + + using CacheLRUHashMap = std::conditional_t< + dictionary_key_type == DictionaryKeyType::simple, + SimpleKeyLRUHashMap, + ComplexKeyLRUHashMap>; + + CacheLRUHashMap cache; +}; + +} diff --git a/src/Dictionaries/benchmark b/src/Dictionaries/benchmark deleted file mode 100644 index 37d0d92ac14..00000000000 --- a/src/Dictionaries/benchmark +++ /dev/null @@ -1,154 +0,0 @@ -clickhouse-client --query="DROP TABLE IF EXISTS simple_cache_dictionary_table_source"; -clickhouse-client --query="CREATE TABLE simple_cache_dictionary_table_source (id UInt64, value1 String, value2 UInt64, value3 String, value4 Float64, value5 Decimal64(4)) ENGINE=TinyLog;" -clickhouse-client --query="INSERT INTO simple_cache_dictionary_table_source SELECT number, concat('Value1 ', toString(number)), number, concat('Value3 ', toString(number)), toFloat64(number), cast(number, 'Decimal64(4)') FROM system.numbers LIMIT 1000000;" - -clickhouse-client --multiquery --query="CREATE DICTIONARY clickhouse_simple_cache_dictionary ( - id UInt64, - value1 String, - value2 UInt64, - value3 String, - value4 Float64, - value5 Decimal64(4) -) -PRIMARY KEY id -SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'simple_cache_dictionary_table_source' PASSWORD '' DB 'default')) -LIFETIME(MIN 300 MAX 300) -LAYOUT(CACHE(SIZE_IN_CELLS 100000));" - -clickhouse-client --multiquery --query="CREATE DICTIONARY clickhouse_ssd_simple_cache_dictionary ( - id UInt64, - value1 String, - value2 UInt64, - value3 String, - value4 Float64, - value5 Decimal64(4) -) -PRIMARY KEY id -SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'simple_cache_dictionary_table_source' PASSWORD '' DB 'default')) -LIFETIME(MIN 300 MAX 300) -LAYOUT(SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 16777216 READ_BUFFER_SIZE 1048576 WRITE_BUFFER_SIZE 327680 MAX_STORED_KEYS 1048576 PATH '/opt/mkita/ClickHouse/build_release/programs/ssd_cache'));" - -clickhouse-client --multiquery --query="CREATE DICTIONARY clickhouse_dummy_simple_cache_dictionary ( - id UInt64, - value1 String, - value2 UInt64, - value3 String, - value4 Float64, - value5 Decimal64(4) -) -PRIMARY KEY id -SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 
'simple_cache_dictionary_table_source' PASSWORD '' DB 'default')) -LIFETIME(MIN 300 MAX 300) -LAYOUT(DUMMY_SIMPLE());" - -./clickhouse-benchmark --query="SELECT - dictGet('default.clickhouse_dummy_simple_cache_dictionary', 'value1', number), - dictGet('default.clickhouse_dummy_simple_cache_dictionary', 'value2', number), - dictGet('default.clickhouse_dummy_simple_cache_dictionary', 'value3', number), - dictGet('default.clickhouse_dummy_simple_cache_dictionary', 'value4', number), - dictGet('default.clickhouse_dummy_simple_cache_dictionary', 'value5', number) -FROM system.numbers -LIMIT 10000 -FORMAT Null" - -./clickhouse-benchmark --query="SELECT - dictGet('default.clickhouse_simple_cache_dictionary', ('value1', 'value2', 'value3', 'value4', 'value5'), number) -FROM system.numbers -LIMIT 10000 -FORMAT Null" - -./clickhouse-benchmark --query="SELECT dictGet('default.clickhouse_ssd_simple_cache_dictionary', 'value1', number) FROM system.numbers_mt LIMIT 10000 FORMAT Null" - -./clickhouse-benchmark --query="SELECT - dictGet('default.clickhouse_simple_cache_dictionary', 'value1', number), - dictGet('default.clickhouse_simple_cache_dictionary', 'value2', number), - dictGet('default.clickhouse_simple_cache_dictionary', 'value3', number), - dictGet('default.clickhouse_simple_cache_dictionary', 'value4', number), - dictGet('default.clickhouse_simple_cache_dictionary', 'value5', number) -FROM system.numbers -LIMIT 10000 -FORMAT Null" - -./clickhouse-benchmark --query="SELECT dictGet('default.clickhouse_ssd_simple_cache_dictionary', 'value1', number) FROM system.numbers_mt LIMIT 10000 FORMAT Null" - -SELECT - dictGet('default.clickhouse_ssd_simple_cache_dictionary', 'value1', number), - dictGet('default.clickhouse_ssd_simple_cache_dictionary', 'value2', number), - dictGet('default.clickhouse_ssd_simple_cache_dictionary', 'value3', number), - dictGet('default.clickhouse_ssd_simple_cache_dictionary', 'value4', number), - dictGet('default.clickhouse_ssd_simple_cache_dictionary', 'value5', number) -FROM system.numbers - LIMIT 10000 -FORMAT Null - -SELECT dictGet('default.clickhouse_simple_cache_dictionary', ('value1', 'value2', 'value3', 'value4', 'value5'), number) FROM system.numbers LIMIT 10000 FORMAT Null - -SELECT dictGet('default.clickhouse_ssd_simple_cache_dictionary', ('value1', 'value2', 'value3', 'value4', 'value5'), number) FROM system.numbers LIMIT 10000 -FORMAT Null - -SELECT - dictGet('default.clickhouse_simple_cache_dictionary', ('value1', 'value2', 'value3', 'value4', 'value5'), number) -FROM system.numbers - LIMIT 10000 -FORMAT - Null - -SELECT - dictGet('default.clickhouse_simple_cache_dictionary', 'value1', number), - dictGet('default.clickhouse_simple_cache_dictionary', 'value2', number), - dictGet('default.clickhouse_simple_cache_dictionary', 'value3', number), - dictGet('default.clickhouse_simple_cache_dictionary', 'value4', number), - dictGet('default.clickhouse_simple_cache_dictionary', 'value5', number) -FROM system.numbers - LIMIT 10000 -FORMAT - Null - -SELECT - dictGet('default.clickhouse_simple_cache_dictionary', 'value1', number), - dictGet('default.clickhouse_simple_cache_dictionary', 'value2', number) -FROM system.numbers -LIMIT 10000 -FORMAT Null - -SELECT - dictGet('clickhouse_simple_cache_dictionary', 'value1', number) -FROM system.numbers -LIMIT 100000 -FORMAT Null - -SELECT - dictGet('clickhouse_simple_cache_dictionary', 'value2', number) -FROM system.numbers -LIMIT 100000 -FORMAT Null - -SELECT - dictGet('clickhouse_simple_cache_dictionary', 'value3', number) -FROM 
system.numbers -LIMIT 100000 -FORMAT Null - -SELECT - dictGet('clickhouse_simple_cache_dictionary', 'value4', number) -FROM system.numbers -LIMIT 100000 -FORMAT Null - -SELECT - dictGet('clickhouse_simple_cache_dictionary', 'value5', number) -FROM system.numbers -LIMIT 100000 -FORMAT Null - -SELECT - dictGet('clickhouse_simple_cache_dictionary', 'value1', number), - dictGet('clickhouse_simple_cache_dictionary', 'value2', number), - dictGet('clickhouse_simple_cache_dictionary', 'value3', number), - dictGet('clickhouse_simple_cache_dictionary', 'value4', number), - dictGet('clickhouse_simple_cache_dictionary', 'value5', number) -FROM system.numbers -LIMIT 100000 -FORMAT Null - -SELECT * FROM clickhouse_simple_cache_dictionary_table; \ No newline at end of file diff --git a/src/Dictionaries/registerCacheDictionaries.cpp b/src/Dictionaries/registerCacheDictionaries.cpp index 92e6eb97b63..23eea6e7e21 100644 --- a/src/Dictionaries/registerCacheDictionaries.cpp +++ b/src/Dictionaries/registerCacheDictionaries.cpp @@ -1,6 +1,7 @@ #include "CacheDictionary.h" -#include "SSDCacheDictionaryStorage.h" #include "CacheDictionaryStorage.h" +#include "SerializedCacheDictionaryStorage.h" +#include "SSDCacheDictionaryStorage.h" #include namespace DB @@ -18,9 +19,16 @@ CacheDictionaryStorageConfiguration parseCacheStorageConfiguration( const Poco::Util::AbstractConfiguration & config, const String & layout_prefix, const DictionaryLifetime & dict_lifetime, - DictionaryKeyType dictionary_key_type) + DictionaryKeyType dictionary_key_type, + bool serialized_storage) { - String dictionary_type_prefix = dictionary_key_type == DictionaryKeyType::complex ? ".complex_key_cache." : ".cache."; + String dictionary_type_prefix; + + if (!serialized_storage) + dictionary_type_prefix = dictionary_key_type == DictionaryKeyType::complex ? ".complex_key_cache." : ".cache."; + else + dictionary_type_prefix = dictionary_key_type == DictionaryKeyType::complex ? ".serialized_complex_key_cache." 
: ".serialized_cache."; + String dictionary_configuration_prefix = layout_prefix + dictionary_type_prefix; const size_t size = config.getUInt64(dictionary_configuration_prefix + "size_in_cells"); @@ -158,7 +166,8 @@ DictionaryPtr createCacheDictionaryLayout( const DictionaryStructure & dict_struct, const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, - DictionarySourcePtr source_ptr) + DictionarySourcePtr source_ptr, + bool serialized_storage) { static_assert(dictionary_key_type != DictionaryKeyType::range, "Range key type is not supported by CacheDictionary"); @@ -193,8 +202,23 @@ DictionaryPtr createCacheDictionaryLayout( const bool allow_read_expired_keys = config.getBool(layout_prefix + ".cache.allow_read_expired_keys", false); - auto storage_configuration = parseCacheStorageConfiguration(full_name, config, layout_prefix, dict_lifetime, dictionary_key_type); - auto storage = std::make_shared>(storage_configuration); + auto storage_configuration = parseCacheStorageConfiguration(full_name, config, layout_prefix, dict_lifetime, dictionary_key_type, serialized_storage); + + std::shared_ptr storage; + + if (serialized_storage) + { + SerializedCacheDictionaryStorageConfiguration serialized_configuration + { + .max_size_in_cells = storage_configuration.max_size_in_cells, + .strict_max_lifetime_seconds = storage_configuration.strict_max_lifetime_seconds, + .lifetime = storage_configuration.lifetime, + }; + + storage = std::make_shared>(serialized_configuration); + } + else + storage = std::make_shared>(dict_struct, storage_configuration); auto update_queue_configuration = parseCacheDictionaryUpdateQueueConfiguration(full_name, config, layout_prefix, dictionary_key_type); @@ -265,7 +289,7 @@ void registerDictionaryCache(DictionaryFactory & factory) const std::string & config_prefix, DictionarySourcePtr source_ptr) -> DictionaryPtr { - return createCacheDictionaryLayout(full_name, dict_struct, config, config_prefix, std::move(source_ptr)); + return createCacheDictionaryLayout(full_name, dict_struct, config, config_prefix, std::move(source_ptr), false); }; factory.registerLayout("cache", create_simple_cache_layout, false); @@ -276,11 +300,33 @@ void registerDictionaryCache(DictionaryFactory & factory) const std::string & config_prefix, DictionarySourcePtr source_ptr) -> DictionaryPtr { - return createCacheDictionaryLayout(full_name, dict_struct, config, config_prefix, std::move(source_ptr)); + return createCacheDictionaryLayout(full_name, dict_struct, config, config_prefix, std::move(source_ptr), false); }; factory.registerLayout("complex_key_cache", create_complex_key_cache_layout, true); + auto create_simple_serialized_cache_layout = [=](const String & full_name, + const DictionaryStructure & dict_struct, + const Poco::Util::AbstractConfiguration & config, + const std::string & config_prefix, + DictionarySourcePtr source_ptr) -> DictionaryPtr + { + return createCacheDictionaryLayout(full_name, dict_struct, config, config_prefix, std::move(source_ptr), true); + }; + + factory.registerLayout("serialized_cache", create_simple_serialized_cache_layout, false); + + auto create_complex_key_serialzied_cache_layout = [=](const std::string & full_name, + const DictionaryStructure & dict_struct, + const Poco::Util::AbstractConfiguration & config, + const std::string & config_prefix, + DictionarySourcePtr source_ptr) -> DictionaryPtr + { + return createCacheDictionaryLayout(full_name, dict_struct, config, config_prefix, std::move(source_ptr), true); + }; + + 
factory.registerLayout("complex_key_serialized_cache", create_complex_key_serialzied_cache_layout, true); + #if defined(OS_LINUX) || defined(__FreeBSD__) auto create_simple_ssd_cache_layout = [=](const std::string & full_name, From 0783882fcfd5d372b1631b41c4145f7a2808425c Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Sun, 14 Mar 2021 00:49:45 +0300 Subject: [PATCH 009/155] Updated cache implementation --- src/Dictionaries/CacheDictionaryStorage.h | 195 +++++---- src/Dictionaries/ICacheDictionaryStorage.h | 1 + .../SerializedCacheDictionaryStorage.h | 412 ------------------ .../registerCacheDictionaries.cpp | 59 +-- 4 files changed, 115 insertions(+), 552 deletions(-) delete mode 100644 src/Dictionaries/SerializedCacheDictionaryStorage.h diff --git a/src/Dictionaries/CacheDictionaryStorage.h b/src/Dictionaries/CacheDictionaryStorage.h index 2b34b13fa6f..bbf1325c8a3 100644 --- a/src/Dictionaries/CacheDictionaryStorage.h +++ b/src/Dictionaries/CacheDictionaryStorage.h @@ -4,6 +4,8 @@ #include #include +#include +#include #include #include @@ -31,6 +33,8 @@ struct CacheDictionaryStorageConfiguration const DictionaryLifetime lifetime; }; + + /// TODO: Add documentation template class CacheDictionaryStorage final : public ICacheDictionaryStorage @@ -46,29 +50,7 @@ public: , rnd_engine(randomSeed()) , cache(configuration.max_size_in_cells, false, { *this }) { - for (const auto & dictionary_attribute : dictionary_structure.attributes) - { - auto attribute_type = dictionary_attribute.underlying_type; - - auto type_call = [&](const auto & dictionary_attribute_type) - { - using Type = std::decay_t; - using AttributeType = typename Type::AttributeType; - using ValueType = DictionaryValueType; - - attributes.emplace_back(); - auto & last_attribute = attributes.back(); - last_attribute.type = attribute_type; - last_attribute.is_complex_type = dictionary_attribute.is_nullable || dictionary_attribute.is_array; - - if (dictionary_attribute.is_nullable) - last_attribute.attribute_container = std::vector(); - else - last_attribute.attribute_container = PaddedPODArray(); - }; - - callOnDictionaryAttributeType(attribute_type, type_call); - } + setup(dictionary_structure); } bool returnsFetchedColumnsInOrderOfRequestedKeys() const override { return true; } @@ -88,9 +70,7 @@ public: const DictionaryStorageFetchRequest & fetch_request) override { if constexpr (dictionary_key_type == DictionaryKeyType::simple) - { return fetchColumnsForKeysImpl(keys, fetch_request); - } else throw Exception("Method fetchColumnsForKeys is not supported for complex key storage", ErrorCodes::NOT_IMPLEMENTED); } @@ -126,9 +106,7 @@ public: const DictionaryStorageFetchRequest & column_fetch_requests) override { if constexpr (dictionary_key_type == DictionaryKeyType::complex) - { return fetchColumnsForKeysImpl(keys, column_fetch_requests); - } else throw Exception("Method fetchColumnsForKeys is not supported for simple key storage", ErrorCodes::NOT_IMPLEMENTED); } @@ -174,7 +152,7 @@ public: }); } - return arena.size() + cache.getSizeInBytes(); + return arena.size() + cache.getSizeInBytes() + attributes_size_in_bytes; } private: @@ -192,7 +170,7 @@ private: template - ALWAYS_INLINE KeysStorageFetchResult fetchColumnsForKeysImpl( + KeysStorageFetchResult fetchColumnsForKeysImpl( const PaddedPODArray & keys, const DictionaryStorageFetchRequest & fetch_request) { @@ -216,44 +194,41 @@ private: auto key = keys[key_index]; auto * it = cache.find(key); - if (it) - { - /// Columns values for key are serialized in cache now deserialize them 
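 
             // The rework below splits fetching into two passes: the first pass
             // resolves each cache hit to an (element_index, is_default) pair in
             // fetched_keys, and the second pass materializes the requested
             // attributes column by column from the typed containers instead of
             // deserializing row by row.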
- const auto & cell = it->getMapped(); - - bool has_deadline = cellHasDeadline(cell); - - if (has_deadline && now > cell.deadline + max_lifetime_seconds) - { - result.key_index_to_state[key_index] = {KeyState::not_found}; - ++result.not_found_keys_size; - continue; - } - else if (has_deadline && now > cell.deadline) - { - result.key_index_to_state[key_index] = {KeyState::expired, fetched_columns_index}; - ++result.expired_keys_size; - } - else - { - result.key_index_to_state[key_index] = {KeyState::found, fetched_columns_index}; - ++result.found_keys_size; - } - - if (cell.is_default) - { - result.key_index_to_state[key_index].setDefault(); - ++result.default_keys_size; - } - - fetched_keys.emplace_back(cell.element_index, cell.is_default); - ++fetched_columns_index; - } - else + if (!it) { result.key_index_to_state[key_index] = {KeyState::not_found}; ++result.not_found_keys_size; + continue; } + + const auto & cell = it->getMapped(); + + if (now > cell.deadline + max_lifetime_seconds) + { + result.key_index_to_state[key_index] = {KeyState::not_found}; + ++result.not_found_keys_size; + continue; + } + + bool cell_is_expired = false; + KeyState::State key_state = KeyState::found; + + if (now > cell.deadline) + { + cell_is_expired = true; + key_state = KeyState::expired; + } + + result.key_index_to_state[key_index] = {key_state, fetched_columns_index}; + ++fetched_columns_index; + + result.expired_keys_size += cell_is_expired; + result.found_keys_size += !cell_is_expired; + + result.key_index_to_state[key_index].setDefaultValue(cell.is_default); + result.default_keys_size += cell.is_default; + + fetched_keys.emplace_back(cell.element_index, cell.is_default); } for (size_t attribute_index = 0; attribute_index < fetch_request.attributesSize(); ++attribute_index) @@ -275,7 +250,7 @@ private: { auto fetched_key = fetched_keys[fetched_key_index]; - if (fetched_key.is_default) + if (unlikely(fetched_key.is_default)) fetched_column.insert(default_value_provider.getDefaultValue(fetched_key_index)); else fetched_column.insert(container[fetched_key.element_index]); @@ -302,7 +277,7 @@ private: { auto fetched_key = fetched_keys[fetched_key_index]; - if (fetched_key.is_default) + if (unlikely(fetched_key.is_default)) column_typed.insert(default_value_provider.getDefaultValue(fetched_key_index)); else { @@ -318,7 +293,7 @@ private: auto fetched_key = fetched_keys[fetched_key_index]; auto & data = column_typed.getData(); - if (fetched_key.is_default) + if (unlikely(fetched_key.is_default)) column_typed.insert(default_value_provider.getDefaultValue(fetched_key_index)); else { @@ -460,10 +435,10 @@ private: { size_t cache_max_size = cache.getMaxSize(); - if (unlikely(attributes.empty()) || insert_index * 2 < cache_max_size) + if (unlikely(attributes.empty()) || insert_index < cache_max_size * 2) return; - std::unordered_map element_index_to_cache_iterator; + absl::flat_hash_map> element_index_to_cache_iterator; for (auto begin = cache.begin(); begin != cache.end(); ++begin) { @@ -483,7 +458,15 @@ private: for (size_t i = 0; i < container_size; ++i) { if (indexes_to_delete.contains(i)) + { + if constexpr (std::is_same_v) + { + StringRef data = container[i]; + arena.free(const_cast(data.data), data.size); + } + continue; + } std::swap(container[remove_index], container[i]); @@ -513,7 +496,15 @@ private: for (size_t i = 0; i < container_size; ++i) { if (indexes_to_delete.contains(i)) + { + if constexpr (std::is_same_v) + { + StringRef data = container[i]; + arena.free(const_cast(data.data), data.size); + } + 
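+                    // Freeing the string payload above is what keeps the arena
+                    // bounded: compaction only drops row slots, so without that
+                    // free the bytes of evicted string values would stay
+                    // allocated until the storage itself is destroyed.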
continue; + } std::swap(container[remove_index], container[i]); ++remove_index; @@ -559,6 +550,47 @@ private: return const_cast *>(this)->template getAttributeContainer(attribute_index, std::forward(func)); } + StringRef copyStringInArena(StringRef value_to_copy) + { + size_t value_to_copy_size = value_to_copy.size; + char * place_for_key = arena.alloc(value_to_copy_size); + memcpy(reinterpret_cast(place_for_key), reinterpret_cast(value_to_copy.data), value_to_copy_size); + StringRef updated_value{place_for_key, value_to_copy_size}; + + return updated_value; + } + + void setup(const DictionaryStructure & dictionary_structure) + { + /// For each dictionary attribute create storage attribute + /// For simple attributes create PODArray, for complex vector of Fields + + attributes.reserve(dictionary_structure.attributes.size()); + + for (const auto & dictionary_attribute : dictionary_structure.attributes) + { + auto attribute_type = dictionary_attribute.underlying_type; + + auto type_call = [&](const auto & dictionary_attribute_type) + { + using Type = std::decay_t; + using AttributeType = typename Type::AttributeType; + using ValueType = DictionaryValueType; + + attributes.emplace_back(); + auto & last_attribute = attributes.back(); + last_attribute.type = attribute_type; + last_attribute.is_complex_type = dictionary_attribute.is_nullable || dictionary_attribute.is_array; + + if (dictionary_attribute.is_nullable) + last_attribute.attribute_container = std::vector(); + else + last_attribute.attribute_container = PaddedPODArray(); + }; + + callOnDictionaryAttributeType(attribute_type, type_call); + } + } using TimePoint = std::chrono::system_clock::time_point; @@ -578,26 +610,13 @@ private: cache.insert(key, cell); } - StringRef copyStringInArena(StringRef value_to_copy) - { - size_t value_to_copy_size = value_to_copy.size; - char * place_for_key = arena.alloc(value_to_copy_size); - memcpy(reinterpret_cast(place_for_key), reinterpret_cast(value_to_copy.data), value_to_copy_size); - StringRef updated_value{place_for_key, value_to_copy_size}; - - return updated_value; - } - - inline static bool cellHasDeadline(const Cell & cell) - { - return cell.deadline != std::chrono::system_clock::from_time_t(0); - } - inline void setCellDeadline(Cell & cell, TimePoint now) { if (configuration.lifetime.min_sec == 0 && configuration.lifetime.max_sec == 0) { - cell.deadline = std::chrono::system_clock::from_time_t(0); + /// This maybe not obvious, but when we define is this cell is expired or expired permanently, we add strict_max_lifetime_seconds + /// to the expiration time. And it overflows pretty well. 
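+        /// Concretely: with deadline = max() - 2 * strict_max_lifetime_seconds,
+        /// the check `now > deadline + strict_max_lifetime_seconds` still has a
+        /// full strict_max_lifetime_seconds of headroom below time_point::max(),
+        /// so the addition cannot wrap and such an "eternal" cell is never
+        /// reported expired or not found.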
+ cell.deadline = std::chrono::time_point::max() - 2 * std::chrono::seconds(configuration.strict_max_lifetime_seconds); return; } @@ -638,10 +657,6 @@ private: std::vector> attribute_container; }; - std::vector attributes; - size_t insert_index = 0; - std::unordered_set> indexes_to_delete; - class CacheStorageCellDisposer { public: @@ -667,6 +682,10 @@ private: ComplexKeyLRUHashMap>; CacheLRUHashMap cache; + + std::vector attributes; + size_t insert_index = 0; + absl::flat_hash_set> indexes_to_delete; }; } diff --git a/src/Dictionaries/ICacheDictionaryStorage.h b/src/Dictionaries/ICacheDictionaryStorage.h index 8db2dab536c..a428cebdfe7 100644 --- a/src/Dictionaries/ICacheDictionaryStorage.h +++ b/src/Dictionaries/ICacheDictionaryStorage.h @@ -31,6 +31,7 @@ struct KeyState inline bool isNotFound() const { return state == State::not_found; } inline bool isDefault() const { return is_default; } inline void setDefault() { is_default = true; } + inline void setDefaultValue(bool is_default_value) { is_default = is_default_value; } /// Valid only if keyState is found or expired inline size_t getFetchedColumnIndex() const { return fetched_column_index; } diff --git a/src/Dictionaries/SerializedCacheDictionaryStorage.h b/src/Dictionaries/SerializedCacheDictionaryStorage.h deleted file mode 100644 index 2616e03763c..00000000000 --- a/src/Dictionaries/SerializedCacheDictionaryStorage.h +++ /dev/null @@ -1,412 +0,0 @@ -#pragma once - -#include - -#include - -#include -#include -#include -#include -#include -#include -#include - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int NOT_IMPLEMENTED; -} - -struct SerializedCacheDictionaryStorageConfiguration -{ - /// Max size of storage in cells - const size_t max_size_in_cells; - /// Needed to perform check if cell is expired or not found. Default value is dictionary max lifetime. - const size_t strict_max_lifetime_seconds; - /// Lifetime of dictionary. Cell deadline is random value between lifetime min and max seconds. - const DictionaryLifetime lifetime; -}; - -/** Keys are stored in LRUCache and column values are serialized into arena. - - Cell in LRUCache consists of allocated size and place in arena were columns serialized data is stored. - - Columns are serialized by rows. - - When cell is removed from LRUCache data associated with it is also removed from arena. - - In case of complex key we also store key data in arena and it is removed from arena. 
-*/ -/// TODO: Remove -template -class SerializedCacheDictionaryStorage final : public ICacheDictionaryStorage -{ -public: - using KeyType = std::conditional_t; - static_assert(dictionary_key_type != DictionaryKeyType::range, "Range key type is not supported by CacheDictionaryStorage"); - - explicit SerializedCacheDictionaryStorage(SerializedCacheDictionaryStorageConfiguration & configuration_) - : configuration(configuration_) - , rnd_engine(randomSeed()) - , cache(configuration.max_size_in_cells, false, { arena }) - { - } - - bool returnsFetchedColumnsInOrderOfRequestedKeys() const override { return true; } - - String getName() const override - { - if (dictionary_key_type == DictionaryKeyType::simple) - return "SerializedCache"; - else - return "ComplexKeySerializedCache"; - } - - bool supportsSimpleKeys() const override { return dictionary_key_type == DictionaryKeyType::simple; } - - SimpleKeysStorageFetchResult fetchColumnsForKeys( - const PaddedPODArray & keys, - const DictionaryStorageFetchRequest & fetch_request) override - { - if constexpr (dictionary_key_type == DictionaryKeyType::simple) - return fetchColumnsForKeysImpl(keys, fetch_request); - else - throw Exception("Method fetchColumnsForKeys is not supported for complex key storage", ErrorCodes::NOT_IMPLEMENTED); - } - - void insertColumnsForKeys(const PaddedPODArray & keys, Columns columns) override - { - if constexpr (dictionary_key_type == DictionaryKeyType::simple) - insertColumnsForKeysImpl(keys, columns); - else - throw Exception("Method insertColumnsForKeys is not supported for complex key storage", ErrorCodes::NOT_IMPLEMENTED); - } - - void insertDefaultKeys(const PaddedPODArray & keys) override - { - if constexpr (dictionary_key_type == DictionaryKeyType::simple) - insertDefaultKeysImpl(keys); - else - throw Exception("Method insertDefaultKeysImpl is not supported for complex key storage", ErrorCodes::NOT_IMPLEMENTED); - } - - PaddedPODArray getCachedSimpleKeys() const override - { - if constexpr (dictionary_key_type == DictionaryKeyType::simple) - return getCachedKeysImpl(); - else - throw Exception("Method getCachedSimpleKeys is not supported for complex key storage", ErrorCodes::NOT_IMPLEMENTED); - } - - bool supportsComplexKeys() const override { return dictionary_key_type == DictionaryKeyType::complex; } - - ComplexKeysStorageFetchResult fetchColumnsForKeys( - const PaddedPODArray & keys, - const DictionaryStorageFetchRequest & column_fetch_requests) override - { - if constexpr (dictionary_key_type == DictionaryKeyType::complex) - return fetchColumnsForKeysImpl(keys, column_fetch_requests); - else - throw Exception("Method fetchColumnsForKeys is not supported for simple key storage", ErrorCodes::NOT_IMPLEMENTED); - } - - void insertColumnsForKeys(const PaddedPODArray & keys, Columns columns) override - { - if constexpr (dictionary_key_type == DictionaryKeyType::complex) - insertColumnsForKeysImpl(keys, columns); - else - throw Exception("Method insertColumnsForKeys is not supported for simple key storage", ErrorCodes::NOT_IMPLEMENTED); - } - - void insertDefaultKeys(const PaddedPODArray & keys) override - { - if constexpr (dictionary_key_type == DictionaryKeyType::complex) - insertDefaultKeysImpl(keys); - else - throw Exception("Method insertDefaultKeysImpl is not supported for simple key storage", ErrorCodes::NOT_IMPLEMENTED); - } - - PaddedPODArray getCachedComplexKeys() const override - { - if constexpr (dictionary_key_type == DictionaryKeyType::complex) - return getCachedKeysImpl(); - else - throw 
Exception("Method getCachedComplexKeys is not supported for simple key storage", ErrorCodes::NOT_IMPLEMENTED); - } - - size_t getSize() const override { return cache.size(); } - - size_t getMaxSize() const override { return cache.getMaxSize(); } - - size_t getBytesAllocated() const override { return arena.size() + cache.getSizeInBytes(); } - -private: - - template - ALWAYS_INLINE KeysStorageFetchResult fetchColumnsForKeysImpl( - const PaddedPODArray & keys, - const DictionaryStorageFetchRequest & fetch_request) - { - KeysStorageFetchResult result; - - result.fetched_columns = fetch_request.makeAttributesResultColumns(); - result.key_index_to_state.resize_fill(keys.size(), {KeyState::not_found}); - - const auto now = std::chrono::system_clock::now(); - - size_t fetched_columns_index = 0; - - std::chrono::seconds max_lifetime_seconds(configuration.strict_max_lifetime_seconds); - - size_t keys_size = keys.size(); - - for (size_t key_index = 0; key_index < keys_size; ++key_index) - { - auto key = keys[key_index]; - auto * it = cache.find(key); - - if (it) - { - /// Columns values for key are serialized in cache now deserialize them - const auto & cell = it->getMapped(); - - bool has_deadline = cellHasDeadline(cell); - - if (has_deadline && now > cell.deadline + max_lifetime_seconds) - { - result.key_index_to_state[key_index] = {KeyState::not_found}; - ++result.not_found_keys_size; - continue; - } - else if (has_deadline && now > cell.deadline) - { - result.key_index_to_state[key_index] = {KeyState::expired, fetched_columns_index}; - ++result.expired_keys_size; - } - else - { - result.key_index_to_state[key_index] = {KeyState::found, fetched_columns_index}; - ++result.found_keys_size; - } - - ++fetched_columns_index; - - if (cell.isDefault()) - { - result.key_index_to_state[key_index].setDefault(); - ++result.default_keys_size; - insertDefaultValuesIntoColumns(result.fetched_columns, fetch_request, key_index); - } - else - { - const char * place_for_serialized_columns = cell.place_for_serialized_columns; - deserializeAndInsertIntoColumns(result.fetched_columns, fetch_request, place_for_serialized_columns); - } - } - else - { - result.key_index_to_state[key_index] = {KeyState::not_found}; - ++result.not_found_keys_size; - } - } - - return result; - } - - void insertColumnsForKeysImpl(const PaddedPODArray & keys, Columns columns) - { - Arena temporary_values_pool; - - size_t columns_to_serialize_size = columns.size(); - PaddedPODArray temporary_column_data(columns_to_serialize_size); - - const auto now = std::chrono::system_clock::now(); - - size_t keys_size = keys.size(); - - for (size_t key_index = 0; key_index < keys_size; ++key_index) - { - size_t allocated_size_for_columns = 0; - const char * block_start = nullptr; - - auto key = keys[key_index]; - auto * it = cache.find(key); - - for (size_t column_index = 0; column_index < columns_to_serialize_size; ++column_index) - { - auto & column = columns[column_index]; - temporary_column_data[column_index] = column->serializeValueIntoArena(key_index, temporary_values_pool, block_start); - allocated_size_for_columns += temporary_column_data[column_index].size; - } - - char * place_for_serialized_columns = arena.alloc(allocated_size_for_columns); - memcpy(reinterpret_cast(place_for_serialized_columns), reinterpret_cast(block_start), allocated_size_for_columns); - - if (it) - { - /// Cell exists need to free previous serialized place and update deadline - auto & cell = it->getMapped(); - - if (cell.place_for_serialized_columns) - 
arena.free(cell.place_for_serialized_columns, cell.allocated_size_for_columns); - - setCellDeadline(cell, now); - cell.allocated_size_for_columns = allocated_size_for_columns; - cell.place_for_serialized_columns = place_for_serialized_columns; - } - else - { - /// No cell exists so create and put in cache - Cell cell; - - setCellDeadline(cell, now); - cell.allocated_size_for_columns = allocated_size_for_columns; - cell.place_for_serialized_columns = place_for_serialized_columns; - - insertCellInCache(key, cell); - } - - temporary_values_pool.rollback(allocated_size_for_columns); - } - } - - void insertDefaultKeysImpl(const PaddedPODArray & keys) - { - const auto now = std::chrono::system_clock::now(); - - for (auto key : keys) - { - auto * it = cache.find(key); - - if (it) - { - auto & cell = it->getMapped(); - - setCellDeadline(cell, now); - - if (cell.place_for_serialized_columns) - arena.free(cell.place_for_serialized_columns, cell.allocated_size_for_columns); - - cell.allocated_size_for_columns = 0; - cell.place_for_serialized_columns = nullptr; - } - else - { - Cell cell; - - setCellDeadline(cell, now); - cell.allocated_size_for_columns = 0; - cell.place_for_serialized_columns = nullptr; - - insertCellInCache(key, cell); - } - } - } - - PaddedPODArray getCachedKeysImpl() const - { - PaddedPODArray result; - result.reserve(cache.size()); - - for (auto & node : cache) - { - auto & cell = node.getMapped(); - - if (cell.isDefault()) - continue; - - result.emplace_back(node.getKey()); - } - - return result; - } - - using TimePoint = std::chrono::system_clock::time_point; - - struct Cell - { - TimePoint deadline; - size_t allocated_size_for_columns; - char * place_for_serialized_columns; - - inline bool isDefault() const { return place_for_serialized_columns == nullptr; } - inline void setDefault() - { - place_for_serialized_columns = nullptr; - allocated_size_for_columns = 0; - } - }; - - void insertCellInCache(KeyType & key, const Cell & cell) - { - if constexpr (dictionary_key_type == DictionaryKeyType::complex) - { - /// Copy complex key into arena and put in cache - size_t key_size = key.size; - char * place_for_key = arena.alloc(key_size); - memcpy(reinterpret_cast(place_for_key), reinterpret_cast(key.data), key_size); - KeyType updated_key{place_for_key, key_size}; - key = updated_key; - } - - cache.insert(key, cell); - } - - inline static bool cellHasDeadline(const Cell & cell) - { - return cell.deadline != std::chrono::system_clock::from_time_t(0); - } - - inline void setCellDeadline(Cell & cell, TimePoint now) - { - if (configuration.lifetime.min_sec == 0 && configuration.lifetime.max_sec == 0) - { - cell.deadline = std::chrono::system_clock::from_time_t(0); - return; - } - - size_t min_sec_lifetime = configuration.lifetime.min_sec; - size_t max_sec_lifetime = configuration.lifetime.max_sec; - - std::uniform_int_distribution distribution{min_sec_lifetime, max_sec_lifetime}; - cell.deadline = now + std::chrono::seconds(distribution(rnd_engine)); - } - - SerializedCacheDictionaryStorageConfiguration configuration; - - ArenaWithFreeLists arena; - - pcg64 rnd_engine; - - class ArenaCellDisposer - { - public: - ArenaWithFreeLists & arena; - - template - void operator()(const Key & key, const Value & value) const - { - /// In case of complex key we keep it in arena - if constexpr (std::is_same_v) - arena.free(const_cast(key.data), key.size); - - if (value.place_for_serialized_columns) - arena.free(value.place_for_serialized_columns, value.allocated_size_for_columns); - } - }; - - using 
SimpleKeyLRUHashMap = LRUHashMap; - using ComplexKeyLRUHashMap = LRUHashMapWithSavedHash; - - using CacheLRUHashMap = std::conditional_t< - dictionary_key_type == DictionaryKeyType::simple, - SimpleKeyLRUHashMap, - ComplexKeyLRUHashMap>; - - CacheLRUHashMap cache; -}; - -} diff --git a/src/Dictionaries/registerCacheDictionaries.cpp b/src/Dictionaries/registerCacheDictionaries.cpp index 23eea6e7e21..9f0f214e79b 100644 --- a/src/Dictionaries/registerCacheDictionaries.cpp +++ b/src/Dictionaries/registerCacheDictionaries.cpp @@ -1,6 +1,5 @@ #include "CacheDictionary.h" #include "CacheDictionaryStorage.h" -#include "SerializedCacheDictionaryStorage.h" #include "SSDCacheDictionaryStorage.h" #include @@ -19,16 +18,9 @@ CacheDictionaryStorageConfiguration parseCacheStorageConfiguration( const Poco::Util::AbstractConfiguration & config, const String & layout_prefix, const DictionaryLifetime & dict_lifetime, - DictionaryKeyType dictionary_key_type, - bool serialized_storage) + DictionaryKeyType dictionary_key_type) { - String dictionary_type_prefix; - - if (!serialized_storage) - dictionary_type_prefix = dictionary_key_type == DictionaryKeyType::complex ? ".complex_key_cache." : ".cache."; - else - dictionary_type_prefix = dictionary_key_type == DictionaryKeyType::complex ? ".serialized_complex_key_cache." : ".serialized_cache."; - + String dictionary_type_prefix = (dictionary_key_type == DictionaryKeyType::complex) ? ".complex_key_cache." : ".cache."; String dictionary_configuration_prefix = layout_prefix + dictionary_type_prefix; const size_t size = config.getUInt64(dictionary_configuration_prefix + "size_in_cells"); @@ -166,8 +158,7 @@ DictionaryPtr createCacheDictionaryLayout( const DictionaryStructure & dict_struct, const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, - DictionarySourcePtr source_ptr, - bool serialized_storage) + DictionarySourcePtr source_ptr) { static_assert(dictionary_key_type != DictionaryKeyType::range, "Range key type is not supported by CacheDictionary"); @@ -202,23 +193,9 @@ DictionaryPtr createCacheDictionaryLayout( const bool allow_read_expired_keys = config.getBool(layout_prefix + ".cache.allow_read_expired_keys", false); - auto storage_configuration = parseCacheStorageConfiguration(full_name, config, layout_prefix, dict_lifetime, dictionary_key_type, serialized_storage); + auto storage_configuration = parseCacheStorageConfiguration(full_name, config, layout_prefix, dict_lifetime, dictionary_key_type); - std::shared_ptr storage; - - if (serialized_storage) - { - SerializedCacheDictionaryStorageConfiguration serialized_configuration - { - .max_size_in_cells = storage_configuration.max_size_in_cells, - .strict_max_lifetime_seconds = storage_configuration.strict_max_lifetime_seconds, - .lifetime = storage_configuration.lifetime, - }; - - storage = std::make_shared>(serialized_configuration); - } - else - storage = std::make_shared>(dict_struct, storage_configuration); + std::shared_ptr storage = std::make_shared>(dict_struct, storage_configuration); auto update_queue_configuration = parseCacheDictionaryUpdateQueueConfiguration(full_name, config, layout_prefix, dictionary_key_type); @@ -289,7 +266,7 @@ void registerDictionaryCache(DictionaryFactory & factory) const std::string & config_prefix, DictionarySourcePtr source_ptr) -> DictionaryPtr { - return createCacheDictionaryLayout(full_name, dict_struct, config, config_prefix, std::move(source_ptr), false); + return createCacheDictionaryLayout(full_name, dict_struct, config, config_prefix, 
std::move(source_ptr)); }; factory.registerLayout("cache", create_simple_cache_layout, false); @@ -300,33 +277,11 @@ void registerDictionaryCache(DictionaryFactory & factory) const std::string & config_prefix, DictionarySourcePtr source_ptr) -> DictionaryPtr { - return createCacheDictionaryLayout(full_name, dict_struct, config, config_prefix, std::move(source_ptr), false); + return createCacheDictionaryLayout(full_name, dict_struct, config, config_prefix, std::move(source_ptr)); }; factory.registerLayout("complex_key_cache", create_complex_key_cache_layout, true); - auto create_simple_serialized_cache_layout = [=](const String & full_name, - const DictionaryStructure & dict_struct, - const Poco::Util::AbstractConfiguration & config, - const std::string & config_prefix, - DictionarySourcePtr source_ptr) -> DictionaryPtr - { - return createCacheDictionaryLayout(full_name, dict_struct, config, config_prefix, std::move(source_ptr), true); - }; - - factory.registerLayout("serialized_cache", create_simple_serialized_cache_layout, false); - - auto create_complex_key_serialzied_cache_layout = [=](const std::string & full_name, - const DictionaryStructure & dict_struct, - const Poco::Util::AbstractConfiguration & config, - const std::string & config_prefix, - DictionarySourcePtr source_ptr) -> DictionaryPtr - { - return createCacheDictionaryLayout(full_name, dict_struct, config, config_prefix, std::move(source_ptr), true); - }; - - factory.registerLayout("complex_key_serialized_cache", create_complex_key_serialzied_cache_layout, true); - #if defined(OS_LINUX) || defined(__FreeBSD__) auto create_simple_ssd_cache_layout = [=](const std::string & full_name, From ee898d6d47a01a5daa5baee1d73e63acb6b122e4 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Sun, 14 Mar 2021 15:51:55 +0300 Subject: [PATCH 010/155] Fixed style check --- src/Dictionaries/CacheDictionaryStorage.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/Dictionaries/CacheDictionaryStorage.h b/src/Dictionaries/CacheDictionaryStorage.h index bbf1325c8a3..d27c6512244 100644 --- a/src/Dictionaries/CacheDictionaryStorage.h +++ b/src/Dictionaries/CacheDictionaryStorage.h @@ -33,8 +33,6 @@ struct CacheDictionaryStorageConfiguration const DictionaryLifetime lifetime; }; - - /// TODO: Add documentation template class CacheDictionaryStorage final : public ICacheDictionaryStorage @@ -168,7 +166,6 @@ private: const bool is_default; }; - template KeysStorageFetchResult fetchColumnsForKeysImpl( const PaddedPODArray & keys, From 3d1c42827b01b08ffb8f60aa2cf4685fb759a1d3 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Tue, 16 Mar 2021 01:59:04 +0300 Subject: [PATCH 011/155] Added FixedDeadlineHashMap --- src/Common/HashTable/FixedDeadlineHashMap.h | 253 +++++++++++++++ src/Common/HashTable/LRUHashMap.h | 6 +- src/Dictionaries/CacheDictionaryStorage.h | 339 ++++++++------------ 3 files changed, 398 insertions(+), 200 deletions(-) create mode 100644 src/Common/HashTable/FixedDeadlineHashMap.h diff --git a/src/Common/HashTable/FixedDeadlineHashMap.h b/src/Common/HashTable/FixedDeadlineHashMap.h new file mode 100644 index 00000000000..0f7819f4020 --- /dev/null +++ b/src/Common/HashTable/FixedDeadlineHashMap.h @@ -0,0 +1,253 @@ +#pragma once + +#include +#include +#include +#include + +using TimePoint = std::chrono::system_clock::time_point; + +template +struct DeadlineCell : + public std::conditional_t, + HashMapCell> +{ + using Key = TKey; + + using Base = std::conditional_t, + HashMapCell>; + + using Mapped = typename Base::Mapped; + using State = 
typename Base::State; + + using mapped_type = Mapped; + using key_type = Key; + + using Base::Base; + + inline TimePoint getDeadline() const { return deadline; } + + void setDeadline(TimePoint & deadline_value) { deadline = deadline_value; } + +private: + TimePoint deadline; +}; + +template +class FixedDeadlineHashMapImpl : + private HashMapTable< + TKey, + DeadlineCell, + Hash, + HashTableGrower<>, + HashTableAllocator> +{ + /// TODO: Make custom grower + using Base = HashMapTable< + TKey, + DeadlineCell, + Hash, + HashTableGrower<>, + HashTableAllocator>; + + static size_t calculateMaxSize(size_t max_size, size_t max_collision_resolution_chain) + { + return roundUpToPowerOfTwoOrZero(std::max(max_size, max_collision_resolution_chain)); + } +public: + using Cell = DeadlineCell; + using Key = TKey; + using Value = TValue; + using Mapped = typename Cell::Mapped; + + explicit FixedDeadlineHashMapImpl(size_t max_size_, size_t max_collision_resolution_chain_, Disposer disposer_ = Disposer()) + : Base(calculateMaxSize(max_size_, max_collision_resolution_chain_)) + , max_collision_resolution_chain(max_collision_resolution_chain_) + , max_size(max_size_) + , disposer(std::move(disposer_)) + { + assert(max_size > 0); + assert(max_collision_resolution_chain > 0); + } + + ~FixedDeadlineHashMapImpl() + { + clear(); + } + + Cell * get(const Key & key) + { + if (Cell::isZero(key, *this)) + return this->hasZero() ? this->zeroValue() : nullptr; + + /// TODO: Optimize + + size_t hash_value = Base::hash(key); + size_t place_value = Base::grower.place(hash_value); + size_t resolution_chain = max_collision_resolution_chain; + + while (resolution_chain != 0) + { + auto & cell = Base::buf[place_value]; + + if (cell.isZero(*this)) + return nullptr; + + if (cell.keyEquals(key, hash_value, *this)) + return &cell; + + place_value = Base::grower.next(place_value); + --resolution_chain; + } + + return nullptr; + } + + const Cell * get(const Key & key) const + { + return const_cast *>(this)->get(key); + } + + std::pair ALWAYS_INLINE insert(const Key & key, const Value & value) + { + return emplace(key, value); + } + + std::pair ALWAYS_INLINE insert(const Key & key, Value && value) + { + return emplace(key, std::move(value)); + } + + template + std::pair ALWAYS_INLINE emplace(const Key & key, Args && ... args) + { + size_t hash_value = Base::hash(key); + std::pair result; + + if (!emplaceIfZero(key, hash_value, result)) + result = emplaceNonZeroImpl(key, hash_value); + + bool was_inserted = result.second; + + if (was_inserted) + new (&result.first->getMapped()) Value(std::forward(args)...); + + return result; + } + + template + void reinsert(Cell * place_to_use, const Key & key, Args && ... 
args) + { + size_t hash_value = Base::hash(key); + + new (place_to_use) Cell(key, *this); + new (&place_to_use->getMapped()) Value(std::forward(args)...); + place_to_use->setHash(hash_value); + } + + using Base::size; + + using iterator = typename Base::iterator; + using const_iterator = typename Base::const_iterator; + + using Base::begin; + using Base::end; + + size_t getMaxSize() const { return max_size; } + + size_t getSizeInBytes() const { return Base::getBufferSizeInBytes(); } + + void clear() + { + for (auto & cell : *this) + disposer(cell.getKey(), cell.getMapped()); + } + +private: + size_t max_collision_resolution_chain; + size_t max_size; + Disposer disposer; + + bool emplaceIfZero(const Key & key, size_t hash_value, std::pair & result) + { + if (!Cell::isZero(key, *this)) + return false; + + if (this->hasZero()) + { + result = {this->zeroValue(), false}; + return true; + } + + ++Base::m_size; + + this->setHasZero(); + this->zeroValue()->setHash(hash_value); + result = {this->zeroValue(), true}; + + return true; + } + + std::pair emplaceNonZeroImpl(const Key & key, size_t hash_value) + { + TimePoint oldest_time = TimePoint::max(); + size_t place_value = Base::grower.place(hash_value); + size_t resolution_chain = max_collision_resolution_chain; + + bool use_old_value_place = false; + Cell * place_to_insert = nullptr; + + while (resolution_chain != 0) + { + auto & cell = Base::buf[place_value]; + + if (cell.isZero(*this)) + { + use_old_value_place = false; + place_to_insert = &cell; + break; + } + + if (cell.keyEquals(key, hash_value, *this)) + return std::make_pair(&cell, false); + + if (cell.getDeadline() < oldest_time) + { + use_old_value_place = true; + place_to_insert = &cell; + } + + place_value = Base::grower.next(place_value); + --resolution_chain; + } + + if (!place_to_insert) + place_to_insert = &Base::buf[place_value]; + + if (use_old_value_place) + return std::make_pair(place_to_insert, false); + else + { + ++Base::m_size; + + new (place_to_insert) Cell(key, *this); + place_to_insert->setHash(hash_value); + + return std::make_pair(place_to_insert, true); + } + } +}; + +template +struct DefaultFixedHashMapCellDisposer +{ + void operator()(const Key &, const Mapped &) const {} +}; + +template , typename Hash = DefaultHash> +using FixedDeadlineHashMap = FixedDeadlineHashMapImpl; + +template , typename Hash = DefaultHash> +using FixedDeadlineHashMapWithSavedHash = FixedDeadlineHashMapImpl; diff --git a/src/Common/HashTable/LRUHashMap.h b/src/Common/HashTable/LRUHashMap.h index df9766c5ee8..870fb219523 100644 --- a/src/Common/HashTable/LRUHashMap.h +++ b/src/Common/HashTable/LRUHashMap.h @@ -271,13 +271,13 @@ private: }; template -struct DefaultCellDisposer +struct DefaultLRUHashMapCellDisposer { void operator()(const Key &, const Mapped &) const {} }; -template , typename Hash = DefaultHash> +template , typename Hash = DefaultHash> using LRUHashMap = LRUHashMapImpl; -template , typename Hash = DefaultHash> +template , typename Hash = DefaultHash> using LRUHashMapWithSavedHash = LRUHashMapImpl; diff --git a/src/Dictionaries/CacheDictionaryStorage.h b/src/Dictionaries/CacheDictionaryStorage.h index d27c6512244..a98f92e5da9 100644 --- a/src/Dictionaries/CacheDictionaryStorage.h +++ b/src/Dictionaries/CacheDictionaryStorage.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -46,7 +47,7 @@ public: CacheDictionaryStorageConfiguration & configuration_) : configuration(configuration_) , rnd_engine(randomSeed()) - , 
cache(configuration.max_size_in_cells, false, { *this }) + , cache(configuration.max_size_in_cells, 10, { *this }) { setup(dictionary_structure); } @@ -162,8 +163,8 @@ private: , is_default(is_default_) {} - const size_t element_index; - const bool is_default; + size_t element_index; + bool is_default; }; template @@ -184,12 +185,12 @@ private: std::chrono::seconds max_lifetime_seconds(configuration.strict_max_lifetime_seconds); PaddedPODArray fetched_keys; - fetched_keys.reserve(keys_size); + fetched_keys.resize_fill(keys_size); for (size_t key_index = 0; key_index < keys_size; ++key_index) { auto key = keys[key_index]; - auto * it = cache.find(key); + auto * it = cache.get(key); if (!it) { @@ -198,9 +199,10 @@ private: continue; } + auto deadline = it->getDeadline(); const auto & cell = it->getMapped(); - if (now > cell.deadline + max_lifetime_seconds) + if (now > deadline + max_lifetime_seconds) { result.key_index_to_state[key_index] = {KeyState::not_found}; ++result.not_found_keys_size; @@ -210,7 +212,7 @@ private: bool cell_is_expired = false; KeyState::State key_state = KeyState::found; - if (now > cell.deadline) + if (now > deadline) { cell_is_expired = true; key_state = KeyState::expired; @@ -225,7 +227,7 @@ private: result.key_index_to_state[key_index].setDefaultValue(cell.is_default); result.default_keys_size += cell.is_default; - fetched_keys.emplace_back(cell.element_index, cell.is_default); + fetched_keys[key_index] = FetchedKey{cell.element_index, cell.is_default}; } for (size_t attribute_index = 0; attribute_index < fetch_request.attributesSize(); ++attribute_index) @@ -311,103 +313,143 @@ private: void insertColumnsForKeysImpl(const PaddedPODArray & keys, Columns columns) { const auto now = std::chrono::system_clock::now(); + size_t keys_size = keys.size(); + size_t columns_size = columns.size(); + Field column_value; + for (size_t key_index = 0; key_index < keys_size; ++key_index) { auto key = keys[key_index]; - cache.erase(key); - Cell cell; + auto [it, was_inserted] = cache.insert(key, {}); - setCellDeadline(cell, now); - cell.element_index = insert_index; - cell.is_default = false; - - ++insert_index; - - insertCellInCache(key, cell); - } - - Field complex_column_value; - - for (size_t column_index = 0; column_index < columns.size(); ++column_index) - { - auto & attribute = attributes[column_index]; - const auto & column = columns[column_index]; - size_t column_size = column->size(); - - if (unlikely(attribute.is_complex_type)) + if (was_inserted) { - auto & container = std::get>(attribute.attribute_container); - container.reserve(column_size); + auto & cell = it->getMapped(); + cell.is_default = false; - for (size_t item_index = 0; item_index < column_size; ++item_index) + for (size_t attribute_index = 0; attribute_index < columns_size; ++attribute_index) { - column->get(item_index, complex_column_value); - container.emplace_back(complex_column_value); + auto & column = columns[attribute_index]; + + getAttributeContainer(attribute_index, [&](auto & container) + { + container.emplace_back(); + cell.element_index = container.size() - 1; + + using ElementType = std::decay_t; + + column->get(key_index, column_value); + + if constexpr (std::is_same_v) + container.back() = column_value; + else if constexpr (std::is_same_v) + { + const String & value = column_value.get(); + StringRef inserted_value = copyStringInArena(StringRef { value.data(), value.size() }); + container.back() = inserted_value; + } + else + container.back() = column_value.get(); + }); } } else { - auto 
type_call = [&](const auto & dictionary_attribute_type) + auto & cell_key = it->getKey(); + + Cell cell; + + size_t existing_index = it->getMapped().element_index; + + cell.element_index = existing_index; + cell.is_default = false; + + if (cell_key != key) { - using Type = std::decay_t; - using AttributeType = typename Type::AttributeType; - using ValueType = DictionaryValueType; - using ColumnType = - std::conditional_t, ColumnString, - std::conditional_t, ColumnDecimal, - ColumnVector>>; + /// In case of complex key we keep it in arena + if constexpr (std::is_same_v) + arena.free(const_cast(key.data), key.size); + } - const ColumnType & column_typed = static_cast(*column); + cache.reinsert(it, key, cell); - auto & container = std::get>(attribute.attribute_container); - container.reserve(column_size); + /// Put values into index - if constexpr (std::is_same_v) + for (size_t attribute_index = 0; attribute_index < columns_size; ++attribute_index) + { + auto & column = columns[attribute_index]; + + getAttributeContainer(attribute_index, [&](auto & container) { - /// TODO: Serialize while column string in arena then just insert offsets in container - for (size_t item_index = 0; item_index < column_size; ++item_index) + using ElementType = std::decay_t; + + column->get(key_index, column_value); + + if constexpr (std::is_same_v) + container[existing_index] = column_value; + else if constexpr (std::is_same_v) { - StringRef value = column->getDataAt(item_index); - StringRef updated_data = copyStringInArena(value); - - container.emplace_back(updated_data); + const String & value = column_value.get(); + StringRef inserted_value = copyStringInArena(StringRef { value.data(), value.size() }); + container[existing_index] = inserted_value; } - } - else - { - const auto & data = column_typed.getData(); - container.insert(data.begin(), data.end()); - } - }; - - callOnDictionaryAttributeType(attribute.type, type_call); + else + container[existing_index] = column_value.get(); + }); + } } - } - deleteUnusedKeysIfNecessary(); + setCellDeadline(*it, now); + } } void insertDefaultKeysImpl(const PaddedPODArray & keys) { const auto now = std::chrono::system_clock::now(); - for (auto key : keys) + size_t keys_size = keys.size(); + + for (size_t key_index = 0; key_index < keys_size; ++key_index) { - cache.erase(key); + auto key = keys[key_index]; - Cell cell; + Cell value; + value.is_default = true; - setCellDeadline(cell, now); - cell.element_index = 0; - cell.is_default = true; + auto [it, was_inserted] = cache.insert(key, value); - insertCellInCache(key, cell); + if (was_inserted) + { + auto & cell = it->getMapped(); + + for (size_t attribute_index = 0; attribute_index < attributes.size(); ++attribute_index) + { + getAttributeContainer(attribute_index, [&](auto & container) + { + container.emplace_back(); + cell.element_index = container.size(); + }); + } + } + else + { + value.element_index = it->getMapped().element_index; + + if (it->getKey() != key) + { + /// In case of complex key we keep it in arena + if constexpr (std::is_same_v) + arena.free(const_cast(key.data), key.size); + } + + cache.reinsert(it, key, value); + } + + setCellDeadline(*it, now); } - - deleteUnusedKeysIfNecessary(); } PaddedPODArray getCachedKeysImpl() const @@ -428,92 +470,6 @@ private: return result; } - void deleteUnusedKeysIfNecessary() - { - size_t cache_max_size = cache.getMaxSize(); - - if (unlikely(attributes.empty()) || insert_index < cache_max_size * 2) - return; - - absl::flat_hash_map> element_index_to_cache_iterator; - - for 
(auto begin = cache.begin(); begin != cache.end(); ++begin) - { - auto & node = *begin; - auto & cell = node.getMapped(); - size_t element_index = cell.element_index; - element_index_to_cache_iterator.insert(std::make_pair(element_index, begin)); - } - - size_t last_remove_index = 0; - - getAttributeContainer(0, [&, this](auto & container) - { - size_t container_size = container.size(); - size_t remove_index = 0; - - for (size_t i = 0; i < container_size; ++i) - { - if (indexes_to_delete.contains(i)) - { - if constexpr (std::is_same_v) - { - StringRef data = container[i]; - arena.free(const_cast(data.data), data.size); - } - - continue; - } - - std::swap(container[remove_index], container[i]); - - auto it = element_index_to_cache_iterator.find(remove_index); - if (it != element_index_to_cache_iterator.end()) - { - auto & cell = it->second->getMapped(); - cell.element_index = remove_index; - } - - ++remove_index; - } - - container.erase(container.begin() + remove_index, container.end()); - last_remove_index = remove_index; - }); - - insert_index = last_remove_index; - - for (size_t attribute_index = 1; attribute_index < attributes.size(); ++attribute_index) - { - getAttributeContainer(attribute_index, [this](auto & container) - { - size_t container_size = container.size(); - size_t remove_index = 0; - - for (size_t i = 0; i < container_size; ++i) - { - if (indexes_to_delete.contains(i)) - { - if constexpr (std::is_same_v) - { - StringRef data = container[i]; - arena.free(const_cast(data.data), data.size); - } - - continue; - } - - std::swap(container[remove_index], container[i]); - ++remove_index; - } - - container.erase(container.begin() + remove_index, container.end()); - }); - } - - indexes_to_delete.clear(); - } - template void getAttributeContainer(size_t attribute_index, GetContainerFunc && func) { @@ -589,41 +545,12 @@ private: } } - using TimePoint = std::chrono::system_clock::time_point; - struct Cell { - TimePoint deadline; size_t element_index; bool is_default; }; - void insertCellInCache(KeyType & key, const Cell & cell) - { - /// Copy complex key into arena and put in cache - if constexpr (dictionary_key_type == DictionaryKeyType::complex) - key = copyStringInArena(key); - - cache.insert(key, cell); - } - - inline void setCellDeadline(Cell & cell, TimePoint now) - { - if (configuration.lifetime.min_sec == 0 && configuration.lifetime.max_sec == 0) - { - /// This maybe not obvious, but when we define is this cell is expired or expired permanently, we add strict_max_lifetime_seconds - /// to the expiration time. And it overflows pretty well. 
- cell.deadline = std::chrono::time_point::max() - 2 * std::chrono::seconds(configuration.strict_max_lifetime_seconds); - return; - } - - size_t min_sec_lifetime = configuration.lifetime.min_sec; - size_t max_sec_lifetime = configuration.lifetime.max_sec; - - std::uniform_int_distribution distribution{min_sec_lifetime, max_sec_lifetime}; - cell.deadline = now + std::chrono::seconds(distribution(rnd_engine)); - } - CacheDictionaryStorageConfiguration configuration; ArenaWithFreeLists arena; @@ -660,29 +587,47 @@ private: CacheDictionaryStorage & storage; template - void operator()(const Key & key, const Value & cell) const + void operator()(const Key & key, const Value &) const { /// In case of complex key we keep it in arena if constexpr (std::is_same_v) storage.arena.free(const_cast(key.data), key.size); - - storage.indexes_to_delete.insert(cell.element_index); } }; - using SimpleKeyLRUHashMap = LRUHashMap; - using ComplexKeyLRUHashMap = LRUHashMapWithSavedHash; + using SimpleFixedDeadlineHashMap = FixedDeadlineHashMap; + using ComplexFixedDeadlineHashMap = FixedDeadlineHashMap; - using CacheLRUHashMap = std::conditional_t< + using FixedDeadlineHashMap = std::conditional_t< dictionary_key_type == DictionaryKeyType::simple, - SimpleKeyLRUHashMap, - ComplexKeyLRUHashMap>; + SimpleFixedDeadlineHashMap, + ComplexFixedDeadlineHashMap>; - CacheLRUHashMap cache; + using FixedDeadlineHashMapCell = typename FixedDeadlineHashMap::Cell; + + inline void setCellDeadline(FixedDeadlineHashMapCell & cell, TimePoint now) + { + if (configuration.lifetime.min_sec == 0 && configuration.lifetime.max_sec == 0) + { + /// This maybe not obvious, but when we define is this cell is expired or expired permanently, we add strict_max_lifetime_seconds + /// to the expiration time. And it overflows pretty well. 
+ auto deadline = std::chrono::time_point::max() - 2 * std::chrono::seconds(configuration.strict_max_lifetime_seconds); + cell.setDeadline(deadline); + return; + } + + size_t min_sec_lifetime = configuration.lifetime.min_sec; + size_t max_sec_lifetime = configuration.lifetime.max_sec; + + std::uniform_int_distribution distribution{min_sec_lifetime, max_sec_lifetime}; + + auto deadline = now + std::chrono::seconds(distribution(rnd_engine)); + cell.setDeadline(deadline); + } + + FixedDeadlineHashMap cache; std::vector attributes; - size_t insert_index = 0; - absl::flat_hash_set> indexes_to_delete; }; } From f00e1084107f5fe3e9262a45116fbabd945f508d Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Tue, 16 Mar 2021 14:07:30 +0800 Subject: [PATCH 012/155] Fix scalar subquery index analysis --- src/Storages/MergeTree/KeyCondition.cpp | 11 ++++++++--- .../0_stateless/01649_with_alias_key_condition.sql | 2 +- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/Storages/MergeTree/KeyCondition.cpp b/src/Storages/MergeTree/KeyCondition.cpp index 8f5dec8077d..6833d2e2fd4 100644 --- a/src/Storages/MergeTree/KeyCondition.cpp +++ b/src/Storages/MergeTree/KeyCondition.cpp @@ -444,7 +444,8 @@ bool KeyCondition::addCondition(const String & column, const Range & range) */ bool KeyCondition::getConstant(const ASTPtr & expr, Block & block_with_constants, Field & out_value, DataTypePtr & out_type) { - String column_name = expr->getColumnNameWithoutAlias(); + // Constant expr should use alias names if any + String column_name = expr->getColumnName(); if (const auto * lit = expr->as()) { @@ -607,7 +608,8 @@ bool KeyCondition::canConstantBeWrappedByMonotonicFunctions( if (strict) return false; - String expr_name = node->getColumnNameWithoutAlias(); + // Constant expr should use alias names if any + String expr_name = node->getColumnName(); const auto & sample_block = key_expr->getSampleBlock(); if (!sample_block.has(expr_name)) return false; @@ -675,7 +677,8 @@ bool KeyCondition::canConstantBeWrappedByFunctions( if (strict) return false; - String expr_name = ast->getColumnNameWithoutAlias(); + // Constant expr should use alias names if any + String expr_name = ast->getColumnName(); const auto & sample_block = key_expr->getSampleBlock(); if (!sample_block.has(expr_name)) return false; @@ -1011,6 +1014,8 @@ bool KeyCondition::isKeyPossiblyWrappedByMonotonicFunctionsImpl( * Therefore, use the full name of the expression for search. 
*/ const auto & sample_block = key_expr->getSampleBlock(); + + // Key columns should use canonical names for index analysis String name = node->getColumnNameWithoutAlias(); auto it = key_columns.find(name); diff --git a/tests/queries/0_stateless/01649_with_alias_key_condition.sql b/tests/queries/0_stateless/01649_with_alias_key_condition.sql index b813e6ee84f..0a796f8512e 100644 --- a/tests/queries/0_stateless/01649_with_alias_key_condition.sql +++ b/tests/queries/0_stateless/01649_with_alias_key_condition.sql @@ -6,6 +6,6 @@ insert into alias_key_condition values (1, 2), (3, 4); set force_primary_key = 1; -with i as k select * from alias_key_condition where k = 3; +with i as k select * from alias_key_condition where k = (select i from alias_key_condition where i = 3); drop table if exists alias_key_condition; From f49d6404f39807850ef8fca116dd180261cf3be2 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 16 Mar 2021 11:03:47 +0300 Subject: [PATCH 013/155] Trying to add new nemesis --- .../src/jepsen/nukeeper/constants.clj | 9 ++++ .../src/jepsen/nukeeper/main.clj | 18 +++----- .../src/jepsen/nukeeper/nemesis.clj | 13 ++++++ .../src/jepsen/nukeeper/set.clj | 10 +++-- .../src/jepsen/nukeeper/utils.clj | 44 ++++++++++++++++++- 5 files changed, 78 insertions(+), 16 deletions(-) create mode 100644 tests/jepsen.nukeeper/src/jepsen/nukeeper/constants.clj create mode 100644 tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/constants.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/constants.clj new file mode 100644 index 00000000000..0a20adea086 --- /dev/null +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/constants.clj @@ -0,0 +1,9 @@ +(ns jepsen.nukeeper.constants) + +(def dir "/var/lib/clickhouse") +(def binary "clickhouse") +(def logdir "/var/log/clickhouse-server") +(def logfile "/var/log/clickhouse-server/stderr.log") +(def serverlog "/var/log/clickhouse-server/clickhouse-server.log") +(def pidfile (str dir "/clickhouse.pid")) +(def binary-path "/tmp") diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj index 8aa157bc16e..2b244c924bd 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -2,7 +2,9 @@ (:require [clojure.tools.logging :refer :all] [jepsen.nukeeper.utils :refer :all] [jepsen.nukeeper.set :as set] + [jepsen.nukeeper.nemesis :as custom-nemesis] [jepsen.nukeeper.register :as register] + [jepsen.nukeeper.constants :refer :all] [clojure.string :as str] [jepsen [checker :as checker] @@ -23,14 +25,6 @@ [zookeeper :as zk]) (:import (org.apache.zookeeper ZooKeeper KeeperException KeeperException$BadVersionException))) -(def dir "/var/lib/clickhouse") -(def binary "clickhouse") -(def logdir "/var/log/clickhouse-server") -(def logfile "/var/log/clickhouse-server/stderr.log") -(def serverlog "/var/log/clickhouse-server/clickhouse-server.log") -(def pidfile (str dir "/clickhouse.pid")) -(def binary-path "/tmp") - (defn cluster-config [test node config-template] (let [nodes (:nodes test)] @@ -66,13 +60,13 @@ (str binary-path "/clickhouse") :server :--config "/etc/clickhouse-server/config.xml") - (Thread/sleep 10000))) + (wait-clickhouse-alive! node test))) (teardown! [_ test node] (info node "tearing down clickhouse") (cu/stop-daemon! 
(str binary-path "/clickhouse") pidfile) (c/su - (c/exec :rm :-f (str binary-path "/clickhouse")) + ;(c/exec :rm :-f (str binary-path "/clickhouse")) (c/exec :rm :-rf dir) (c/exec :rm :-rf logdir) (c/exec :rm :-rf "/etc/clickhouse-server"))) @@ -111,10 +105,10 @@ opts {:name (str "clickhouse-keeper quorum=" quorum " " (name (:workload opts))) :os ubuntu/os - :db (db "rbtorrent:8831b5baa571abc28340cf66a9279a4ce45fac64") + :db (db "rbtorrent:46832e8fa975b094a5591184b3c854700ed770f4") :pure-generators true :client (:client workload) - :nemesis (nemesis/partition-random-halves) + :nemesis (custom-nemesis/random-single-node-killer-nemesis) :checker (checker/compose {:perf (checker/perf) :workload (:checker workload)}) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj new file mode 100644 index 00000000000..2f359bc5cba --- /dev/null +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj @@ -0,0 +1,13 @@ +(ns jepsen.nukeeper.nemesis + (:require [jepsen + [nemesis :as nemesis]] + [jepsen.nukeeper.utils :refer :all])) + + + +(defn random-single-node-killer-nemesis + [] + (nemesis/node-start-stopper + rand-nth + (fn start [test node] (kill-clickhouse! node test)) + (fn stop [test node] (start-clickhouse! node test)))) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj index 7e196fab4c7..6a33350673d 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj @@ -1,5 +1,7 @@ (ns jepsen.nukeeper.set - (:require [jepsen + (:require + [clojure.tools.logging :refer :all] + [jepsen [checker :as checker] [client :as client] [generator :as gen]] @@ -18,9 +20,11 @@ (invoke! [_ test op] (case (:f op) :read ;(try - (assoc op + (do (info "LIST ON NODE" (zk-list conn "/")) + (info "EXISTS NODE" (zk/exists conn "/a-set")) + (assoc op :type :ok - :value (read-string (:data (zk-get-str conn k)))) + :value (read-string (:data (zk-get-str conn k))))) ;(catch Exception _ (assoc op :type :fail, :error :connect-error))) :add (try (do diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj index 3caec8e5f62..e398039a329 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj @@ -1,7 +1,11 @@ (ns jepsen.nukeeper.utils (:require [clojure.string :as str] [zookeeper.data :as data] - [zookeeper :as zk])) + [zookeeper :as zk] + [jepsen.control.util :as cu] + [jepsen.nukeeper.constants :refer :all] + [jepsen.control :as c] + [clojure.tools.logging :refer :all])) (defn parse-long "Parses a string to a Long. Passes through `nil` and empty strings." @@ -37,6 +41,10 @@ {:data (data/to-string (:data zk-result)) :stat (:stat zk-result)})) +(defn zk-list + [conn path] + (zk/children conn path)) + (defn zk-cas [conn path old-value new-value] (let [current-value (zk-get-str conn path)] @@ -54,3 +62,37 @@ (defn zk-create-if-not-exists [conn path data] (zk/create conn path :data (data/to-bytes (str data)))) + + +(defn clickhouse-alive? + [node test] + (info "Checking server alive on" node) + (try + (c/exec (str binary-path "/clickhouse") :client :--query "SELECT 1") + (catch Exception _ false))) + +(defn wait-clickhouse-alive! + [node test & {:keys [maxtries] :or {maxtries 30}}] + (loop [i 0] + (cond (> i maxtries) false + (clickhouse-alive? 
node test) true + :else (do (Thread/sleep 1000) (recur (inc i)))))) + +(defn kill-clickhouse! + [node test] + (info "Killing server on node" node) + (c/su + (cu/stop-daemon! (str binary-path "/clickhouse") pidfile))) + +(defn start-clickhouse! + [node test] + (info "Starting server on node" node) + (c/su + (cu/start-daemon! + {:pidfile pidfile + :logfile logfile + :chdir dir} + (str binary-path "/clickhouse") + :server + :--config "/etc/clickhouse-server/config.xml")) + (wait-clickhouse-alive! node test)) From 6454479edda94ed7df6b00e25f77388139ae0fb8 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 16 Mar 2021 14:44:43 +0300 Subject: [PATCH 014/155] Add useful util for state dump --- .../src/jepsen/nukeeper/constants.clj | 3 + .../src/jepsen/nukeeper/main.clj | 9 +- .../src/jepsen/nukeeper/nemesis.clj | 1 - .../src/jepsen/nukeeper/set.clj | 13 +-- utils/CMakeLists.txt | 1 + utils/nukeeper-data-dumper/CMakeLists.txt | 2 + utils/nukeeper-data-dumper/main.cpp | 87 +++++++++++++++++++ 7 files changed, 108 insertions(+), 8 deletions(-) create mode 100644 utils/nukeeper-data-dumper/CMakeLists.txt create mode 100644 utils/nukeeper-data-dumper/main.cpp diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/constants.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/constants.clj index 0a20adea086..511ff8e3bf3 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/constants.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/constants.clj @@ -5,5 +5,8 @@ (def logdir "/var/log/clickhouse-server") (def logfile "/var/log/clickhouse-server/stderr.log") (def serverlog "/var/log/clickhouse-server/clickhouse-server.log") +(def snapshotsdir "/var/lib/clickhouse/coordination/snapshots") +(def coordinationdir "/var/lib/clickhouse/coordination") +(def logsdir "/var/lib/clickhouse/coordination/logs") (def pidfile (str dir "/clickhouse.pid")) (def binary-path "/tmp") diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj index 2b244c924bd..1153f6f1389 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -15,7 +15,8 @@ [nemesis :as nemesis] [generator :as gen] [independent :as independent] - [tests :as tests]] + [tests :as tests] + [util :as util :refer [meh]]] [jepsen.control.util :as cu] [jepsen.os.ubuntu :as ubuntu] [jepsen.checker.timeline :as timeline] @@ -73,7 +74,11 @@ db/LogFiles (log-files [_ test node] - [logfile serverlog]))) + (c/su + (cu/stop-daemon! (str binary-path "/clickhouse") pidfile) + (c/cd dir + (c/exec :tar :czf "coordination.tar.gz" "coordination"))) + [logfile serverlog (str dir "/coordination.tar.gz")]))) (def workloads "A map of workload names to functions that construct workloads, given opts." 
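A note on the log-collection flow in this commit: `log-files` stops the ClickHouse daemon before archiving the `coordination` directory, so the Raft changelog and snapshots are captured in a quiescent state, and the resulting tarball can be replayed offline by the nukeeper-data-dumper tool added later in this same commit. The `meh` helper newly added to the `:require` list (`jepsen.util/meh` returns exceptions instead of throwing them) is a natural fit for this hook, since a nemesis may already have killed the server on some node. The following is a sketch only, not what the patch actually does, showing how `meh` could make the hook tolerant of an already-stopped node:

(log-files [_ test node]
  ;; Sketch, assuming the requires shown above. meh swallows the
  ;; exception if the daemon is already stopped or the coordination
  ;; directory is missing, so log collection still succeeds.
  (c/su
   (meh (cu/stop-daemon! (str binary-path "/clickhouse") pidfile))
   (meh (c/cd dir
              (c/exec :tar :czf "coordination.tar.gz" "coordination"))))
  [logfile serverlog (str dir "/coordination.tar.gz")])
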
diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj index 2f359bc5cba..84253dd6d42 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj @@ -4,7 +4,6 @@ [jepsen.nukeeper.utils :refer :all])) - (defn random-single-node-killer-nemesis [] (nemesis/node-start-stopper diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj index 6a33350673d..fcdfa138c4c 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj @@ -9,18 +9,21 @@ [zookeeper :as zk]) (:import (org.apache.zookeeper ZooKeeper KeeperException KeeperException$BadVersionException))) -(defrecord SetClient [k conn] +(defrecord SetClient [k conn nodename] client/Client (open! [this test node] - (assoc this :conn (zk-connect node 9181 30000))) + (assoc + (assoc this + :conn (zk-connect node 9181 30000)) + :nodename node)) (setup! [this test] (zk-create-if-not-exists conn k "#{}")) - (invoke! [_ test op] + (invoke! [this test op] (case (:f op) :read ;(try - (do (info "LIST ON NODE" (zk-list conn "/")) + (do (info "LIST ON NODE" nodename (zk-list conn "/")) (info "EXISTS NODE" (zk/exists conn "/a-set")) (assoc op :type :ok @@ -40,7 +43,7 @@ (defn workload "A generator, client, and checker for a set test." [opts] - {:client (SetClient. "/a-set" nil) + {:client (SetClient. "/a-set" nil nil) :checker (checker/set) :generator (->> (range) (map (fn [x] {:type :invoke, :f :add, :value x}))) diff --git a/utils/CMakeLists.txt b/utils/CMakeLists.txt index d38b34f3419..dc077f0e49a 100644 --- a/utils/CMakeLists.txt +++ b/utils/CMakeLists.txt @@ -21,6 +21,7 @@ if (NOT DEFINED ENABLE_UTILS OR ENABLE_UTILS) add_subdirectory (corrector_utf8) add_subdirectory (zookeeper-cli) add_subdirectory (zookeeper-test) + add_subdirectory (nukeeper-data-dumper) add_subdirectory (zookeeper-dump-tree) add_subdirectory (zookeeper-remove-by-list) add_subdirectory (zookeeper-create-entry-to-download-part) diff --git a/utils/nukeeper-data-dumper/CMakeLists.txt b/utils/nukeeper-data-dumper/CMakeLists.txt new file mode 100644 index 00000000000..bab1137bf4d --- /dev/null +++ b/utils/nukeeper-data-dumper/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(nukeeper-data-dumper main.cpp) +target_link_libraries(nukeeper-data-dumper PRIVATE dbms) diff --git a/utils/nukeeper-data-dumper/main.cpp b/utils/nukeeper-data-dumper/main.cpp new file mode 100644 index 00000000000..20682bdb366 --- /dev/null +++ b/utils/nukeeper-data-dumper/main.cpp @@ -0,0 +1,87 @@ +#include +#include +#include +#include +#include +#include +#include // Y_IGNORE +#include +#include +#include + +using namespace Coordination; +using namespace DB; + +void dumpMachine(std::shared_ptr machine) +{ + auto & storage = machine->getStorage(); + std::queue keys; + keys.push("/"); + + while (!keys.empty()) + { + auto key = keys.front(); + keys.pop(); + auto value = storage.container.getValue(key); + std::cout << key << "\n"; + std::cout << "\tStat: {version: " << value.stat.version << + ", mtime: " << value.stat.mtime << + ", emphemeralOwner: " << value.stat.ephemeralOwner << + ", czxid: " << value.stat.czxid << + ", mzxid: " << value.stat.mzxid << + ", numChildren: " << value.stat.numChildren << + ", dataLength: " << value.stat.dataLength << + "}" << std::endl; + std::cout << "\tData: " << storage.container.getValue(key).data << std::endl; + + for (const auto & child 
: value.children) + { + if (key == "/") + keys.push(key + child); + else + keys.push(key + "/" + child); + } + } + std::cout << std::flush; +} + +int main(int argc, char *argv[]) +{ + if (argc != 3) + { + std::cerr << "usage: " << argv[0] << " snapshotpath logpath" << std::endl; + return 3; + } + else + { + Poco::AutoPtr channel(new Poco::ConsoleChannel(std::cerr)); + Poco::Logger::root().setChannel(channel); + Poco::Logger::root().setLevel("trace"); + } + auto * logger = &Poco::Logger::get("nukeeper-dumper"); + ResponsesQueue queue; + SnapshotsQueue snapshots_queue{1}; + CoordinationSettingsPtr settings = std::make_shared(); + auto state_machine = std::make_shared(queue, snapshots_queue, argv[1], settings); + state_machine->init(); + size_t last_commited_index = state_machine->last_commit_index(); + + LOG_INFO(logger, "Last commited index: {}", last_commited_index); + + DB::NuKeeperLogStore changelog(argv[2], 10000000, true); + changelog.init(last_commited_index, 10000000000UL); /// collect all logs + if (changelog.size() == 0) + LOG_INFO(logger, "Changelog empty"); + else + LOG_INFO(logger, "Last changelog entry {}", changelog.next_slot() - 1); + + for (size_t i = last_commited_index + 1; i < changelog.next_slot(); ++i) + { + if (changelog.entry_at(i)->get_val_type() == nuraft::log_val_type::app_log) + state_machine->commit(i, changelog.entry_at(i)->get_buf()); + } + + dumpMachine(state_machine); + + return 0; +} From 077a2019b6e577b530c7edd116b16dbe35168692 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 16 Mar 2021 15:36:54 +0300 Subject: [PATCH 015/155] Found first real bug with jepsen --- src/Coordination/NuKeeperStorage.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/Coordination/NuKeeperStorage.cpp b/src/Coordination/NuKeeperStorage.cpp index fff44163b71..2440d6f6613 100644 --- a/src/Coordination/NuKeeperStorage.cpp +++ b/src/Coordination/NuKeeperStorage.cpp @@ -641,6 +641,13 @@ NuKeeperStorage::ResponsesForSessions NuKeeperStorage::processRequest(const Coor for (const auto & ephemeral_path : it->second) { container.erase(ephemeral_path); + container.updateValue(parentPath(ephemeral_path), [&ephemeral_path] (NuKeeperStorage::Node & parent) + { + --parent.stat.numChildren; + ++parent.stat.cversion; + parent.children.erase(getBaseName(ephemeral_path)); + }); + auto responses = processWatchesImpl(ephemeral_path, watches, list_watches, Coordination::Event::DELETED); results.insert(results.end(), responses.begin(), responses.end()); } From 8cf8265d474b038c60ffdb5a855451cadb24520c Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 16 Mar 2021 15:37:46 +0300 Subject: [PATCH 016/155] Style --- .../src/jepsen/nukeeper/main.clj | 9 ++++--- .../src/jepsen/nukeeper/nemesis.clj | 26 +++++++++++++++---- .../src/jepsen/nukeeper/set.clj | 18 ++++++------- .../src/jepsen/nukeeper/utils.clj | 21 +++++++-------- utils/nukeeper-data-dumper/main.cpp | 2 +- 5 files changed, 46 insertions(+), 30 deletions(-) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj index 1153f6f1389..dd40b7e399b 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -75,9 +75,9 @@ db/LogFiles (log-files [_ test node] (c/su - (cu/stop-daemon! (str binary-path "/clickhouse") pidfile) - (c/cd dir - (c/exec :tar :czf "coordination.tar.gz" "coordination"))) + (cu/stop-daemon! 
(str binary-path "/clickhouse") pidfile) + (c/cd dir + (c/exec :tar :czf "coordination.tar.gz" "coordination"))) [logfile serverlog (str dir "/coordination.tar.gz")]))) (def workloads @@ -105,7 +105,8 @@ :concurrency, ...), constructs a test map." [opts] (let [quorum (boolean (:quorum opts)) - workload ((get workloads (:workload opts)) opts)] + workload ((get workloads (:workload opts)) opts) + current-nemesis (get custom-nemesis/custom-nemesises "killer")] (merge tests/noop-test opts {:name (str "clickhouse-keeper quorum=" quorum " " (name (:workload opts))) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj index 84253dd6d42..620ad1bd3d3 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj @@ -1,12 +1,28 @@ (ns jepsen.nukeeper.nemesis (:require [jepsen - [nemesis :as nemesis]] + [nemesis :as nemesis] + [generator :as gen]] [jepsen.nukeeper.utils :refer :all])) - (defn random-single-node-killer-nemesis [] (nemesis/node-start-stopper - rand-nth - (fn start [test node] (kill-clickhouse! node test)) - (fn stop [test node] (start-clickhouse! node test)))) + rand-nth + (fn start [test node] (kill-clickhouse! node test)) + (fn stop [test node] (start-clickhouse! node test)))) + +(def custom-nemesises + {"killer" {:nemesis (random-single-node-killer-nemesis) + :generator + (gen/nemesis + (cycle [(gen/sleep 5) + {:type :info, :f :start} + (gen/sleep 5) + {:type :info, :f :stop}]))} + "simple-partitioner" {:nemesis (nemesis/partition-random-halves) + :generator + (gen/nemesis + (cycle [(gen/sleep 5) + {:type :info, :f :start} + (gen/sleep 5) + {:type :info, :f :stop}]))}}) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj index fcdfa138c4c..f2f614b2d17 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj @@ -1,12 +1,12 @@ (ns jepsen.nukeeper.set (:require - [clojure.tools.logging :refer :all] - [jepsen - [checker :as checker] - [client :as client] - [generator :as gen]] - [jepsen.nukeeper.utils :refer :all] - [zookeeper :as zk]) + [clojure.tools.logging :refer :all] + [jepsen + [checker :as checker] + [client :as client] + [generator :as gen]] + [jepsen.nukeeper.utils :refer :all] + [zookeeper :as zk]) (:import (org.apache.zookeeper ZooKeeper KeeperException KeeperException$BadVersionException))) (defrecord SetClient [k conn nodename] @@ -26,8 +26,8 @@ (do (info "LIST ON NODE" nodename (zk-list conn "/")) (info "EXISTS NODE" (zk/exists conn "/a-set")) (assoc op - :type :ok - :value (read-string (:data (zk-get-str conn k))))) + :type :ok + :value (read-string (:data (zk-get-str conn k))))) ;(catch Exception _ (assoc op :type :fail, :error :connect-error))) :add (try (do diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj index e398039a329..19b4959d742 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj @@ -63,13 +63,12 @@ [conn path data] (zk/create conn path :data (data/to-bytes (str data)))) - (defn clickhouse-alive? 
[node test] (info "Checking server alive on" node) (try - (c/exec (str binary-path "/clickhouse") :client :--query "SELECT 1") - (catch Exception _ false))) + (c/exec (str binary-path "/clickhouse") :client :--query "SELECT 1") + (catch Exception _ false))) (defn wait-clickhouse-alive! [node test & {:keys [maxtries] :or {maxtries 30}}] @@ -82,17 +81,17 @@ [node test] (info "Killing server on node" node) (c/su - (cu/stop-daemon! (str binary-path "/clickhouse") pidfile))) + (cu/stop-daemon! (str binary-path "/clickhouse") pidfile))) (defn start-clickhouse! [node test] (info "Starting server on node" node) (c/su - (cu/start-daemon! - {:pidfile pidfile - :logfile logfile - :chdir dir} - (str binary-path "/clickhouse") - :server - :--config "/etc/clickhouse-server/config.xml")) + (cu/start-daemon! + {:pidfile pidfile + :logfile logfile + :chdir dir} + (str binary-path "/clickhouse") + :server + :--config "/etc/clickhouse-server/config.xml")) (wait-clickhouse-alive! node test)) diff --git a/utils/nukeeper-data-dumper/main.cpp b/utils/nukeeper-data-dumper/main.cpp index 20682bdb366..0340c94c5a0 100644 --- a/utils/nukeeper-data-dumper/main.cpp +++ b/utils/nukeeper-data-dumper/main.cpp @@ -22,8 +22,8 @@ void dumpMachine(std::shared_ptr machine) { auto key = keys.front(); keys.pop(); - auto value = storage.container.getValue(key); std::cout << key << "\n"; + auto value = storage.container.getValue(key); std::cout << "\tStat: {version: " << value.stat.version << ", mtime: " << value.stat.mtime << ", emphemeralOwner: " << value.stat.ephemeralOwner << From 63873f46bbb791cc9f5f094af3a7b35a64e4f04a Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 16 Mar 2021 15:40:28 +0300 Subject: [PATCH 017/155] Create persistent nodes in tests --- tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj | 6 ++---- tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj | 2 +- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj index f2f614b2d17..deb69c3ced4 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj @@ -22,13 +22,11 @@ (invoke! [this test op] (case (:f op) - :read ;(try - (do (info "LIST ON NODE" nodename (zk-list conn "/")) - (info "EXISTS NODE" (zk/exists conn "/a-set")) + :read + (do (assoc op :type :ok :value (read-string (:data (zk-get-str conn k))))) - ;(catch Exception _ (assoc op :type :fail, :error :connect-error))) :add (try (do (zk-add-to-set conn k (:value op)) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj index 19b4959d742..9912b34cd46 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj @@ -61,7 +61,7 @@ (defn zk-create-if-not-exists [conn path data] - (zk/create conn path :data (data/to-bytes (str data)))) + (zk/create conn path :data (data/to-bytes (str data)) :persistent? true)) (defn clickhouse-alive? 
[node test] From 54fbea68a194cccfd286cc76b9224684667ec5f8 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 16 Mar 2021 17:53:49 +0300 Subject: [PATCH 018/155] Add hammer-time nemesis --- tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj | 4 ++-- tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj index dd40b7e399b..6e3777d3141 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -111,10 +111,10 @@ opts {:name (str "clickhouse-keeper quorum=" quorum " " (name (:workload opts))) :os ubuntu/os - :db (db "rbtorrent:46832e8fa975b094a5591184b3c854700ed770f4") + :db (db "rbtorrent:a122093aee0bdcb70ca42d5e5fb4ba5544372f5f") :pure-generators true :client (:client workload) - :nemesis (custom-nemesis/random-single-node-killer-nemesis) + :nemesis (custom-nemesis/hammer-time-nemesis) :checker (checker/compose {:perf (checker/perf) :workload (:checker workload)}) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj index 620ad1bd3d3..f3e01714128 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj @@ -11,6 +11,10 @@ (fn start [test node] (kill-clickhouse! node test)) (fn stop [test node] (start-clickhouse! node test)))) +(defn hammer-time-nemesis + [] + (nemesis/hammer-time "clickhouse")) + (def custom-nemesises {"killer" {:nemesis (random-single-node-killer-nemesis) :generator From 82b2c34c4029ab0dd80ba0bf97974d2ffb1285d2 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 16 Mar 2021 23:27:09 +0300 Subject: [PATCH 019/155] Remove strange file --- tests/jepsen.nukeeper/CHANGELOG.md | 24 ------------------------ 1 file changed, 24 deletions(-) delete mode 100644 tests/jepsen.nukeeper/CHANGELOG.md diff --git a/tests/jepsen.nukeeper/CHANGELOG.md b/tests/jepsen.nukeeper/CHANGELOG.md deleted file mode 100644 index 6c7cb4f7c8a..00000000000 --- a/tests/jepsen.nukeeper/CHANGELOG.md +++ /dev/null @@ -1,24 +0,0 @@ -# Change Log -All notable changes to this project will be documented in this file. This change log follows the conventions of [keepachangelog.com](http://keepachangelog.com/). - -## [Unreleased] -### Changed -- Add a new arity to `make-widget-async` to provide a different widget shape. - -## [0.1.1] - 2021-03-10 -### Changed -- Documentation on how to make the widgets. - -### Removed -- `make-widget-sync` - we're all async, all the time. - -### Fixed -- Fixed widget maker to keep working when daylight savings switches over. - -## 0.1.0 - 2021-03-10 -### Added -- Files from the new template. -- Widget maker public API - `make-widget-sync`. 
- -[Unreleased]: https://github.com/your-name/jepsen.nukeeper/compare/0.1.1...HEAD -[0.1.1]: https://github.com/your-name/jepsen.nukeeper/compare/0.1.0...0.1.1 From 46af999f3aea5db1963ed241062ed3048af8f103 Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 17 Mar 2021 10:11:55 +0300 Subject: [PATCH 020/155] Trying to add corruption nemesis --- .../src/jepsen/nukeeper/main.clj | 6 +-- .../src/jepsen/nukeeper/nemesis.clj | 44 ++++++++++++++++++- .../src/jepsen/nukeeper/set.clj | 6 +-- 3 files changed, 48 insertions(+), 8 deletions(-) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj index 6e3777d3141..d62cbabd56f 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -114,7 +114,7 @@ :db (db "rbtorrent:a122093aee0bdcb70ca42d5e5fb4ba5544372f5f") :pure-generators true :client (:client workload) - :nemesis (custom-nemesis/hammer-time-nemesis) + :nemesis (custom-nemesis/logs-corruption-nemesis) :checker (checker/compose {:perf (checker/perf) :workload (:checker workload)}) @@ -123,9 +123,7 @@ (gen/stagger (/ (:rate opts))) (gen/nemesis (cycle [(gen/sleep 5) - {:type :info, :f :start} - (gen/sleep 5) - {:type :info, :f :stop}])) + {:type :info, :f :corrupt}])) (gen/time-limit (:time-limit opts))) (gen/log "Healing cluster") (gen/nemesis (gen/once {:type :info, :f :stop})) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj index f3e01714128..6b0497cd0af 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj @@ -1,7 +1,11 @@ (ns jepsen.nukeeper.nemesis - (:require [jepsen + (:require + [clojure.tools.logging :refer :all] + [jepsen [nemesis :as nemesis] + [control :as c] [generator :as gen]] + [jepsen.nukeeper.constants :refer :all] [jepsen.nukeeper.utils :refer :all])) (defn random-single-node-killer-nemesis @@ -15,6 +19,44 @@ [] (nemesis/hammer-time "clickhouse")) +(defn select-last-file + [path] + (info "EXECUTE ON PATH" path) + (last (clojure.string/split (c/exec :find path :-type :f :-printf "%T+ $PWD%p\n" :| :sort :| :awk "'{print $2}'")) #"\n")) + +(defn corrupt-file + [fname] + (c/exec :dd "if=/dev/zero" ("str of=" fname) "bs=1" "count=1" "seek=N" "conv=notrunc")) + +(defn corruptor-nemesis + [path corruption-op] + (reify nemesis/Nemesis + (setup! [this test] this) + + (invoke! [this test op] + (let [nodes (list (rand-nth (:nodes test)))] + (info "Corruption on node" nodes) + (c/on-nodes test nodes + (fn [node] + (let [file-to-corrupt (select-last-file path)] + (info "Corrupting file" file-to-corrupt) + (c/su + (corruption-op (select-last-file path)) + (kill-clickhouse! node test) + (start-clickhouse! node test))))) + {:f (:f op) + :value :corrupted})) + + (teardown! 
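;; For orientation, the standard Jepsen nemesis contract this reify satisfies
;; (summarized from Jepsen's API, not from this commit): setup! prepares any
;; state, invoke! receives each op emitted by the nemesis generator and its
;; return value is recorded in the history, and teardown! cleans up after the
;; test.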
[this test]))) + +(defn logs-corruption-nemesis + [] + (corruptor-nemesis logsdir corrupt-file)) + +(defn snapshots-corruption-nemesis + [] + (corruptor-nemesis snapshotsdir corrupt-file)) + (def custom-nemesises {"killer" {:nemesis (random-single-node-killer-nemesis) :generator diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj index deb69c3ced4..d50253aa174 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj @@ -24,9 +24,9 @@ (case (:f op) :read (do - (assoc op - :type :ok - :value (read-string (:data (zk-get-str conn k))))) + (assoc op + :type :ok + :value (read-string (:data (zk-get-str conn k))))) :add (try (do (zk-add-to-set conn k (:value op)) From d9f835a242743116332604fde39db3c74aa0afc9 Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 17 Mar 2021 11:13:52 +0300 Subject: [PATCH 021/155] Finally corrupted logs --- .../src/jepsen/nukeeper/nemesis.clj | 38 ++++++++++++------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj index 6b0497cd0af..bf2348f1860 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj @@ -22,30 +22,40 @@ (defn select-last-file [path] (info "EXECUTE ON PATH" path) - (last (clojure.string/split (c/exec :find path :-type :f :-printf "%T+ $PWD%p\n" :| :sort :| :awk "'{print $2}'")) #"\n")) + (last (clojure.string/split + (c/exec :find path :-type :f :-printf "%T+ %p\n" :| :sort :| :awk "{print $2}") + #"\n"))) + +(defn random-file-pos + [fname] + (let [fsize (Integer/parseInt (c/exec :du :-b fname :| :cut :-f1))] + (rand-int fsize))) (defn corrupt-file [fname] - (c/exec :dd "if=/dev/zero" ("str of=" fname) "bs=1" "count=1" "seek=N" "conv=notrunc")) + (info "Corrupting" fname) + (c/exec :dd "if=/dev/zero" (str "of=" fname) "bs=1" "count=1" (str "seek=" (random-file-pos fname)) "conv=notrunc")) (defn corruptor-nemesis [path corruption-op] (reify nemesis/Nemesis + (setup! [this test] this) (invoke! [this test op] - (let [nodes (list (rand-nth (:nodes test)))] - (info "Corruption on node" nodes) - (c/on-nodes test nodes - (fn [node] - (let [file-to-corrupt (select-last-file path)] - (info "Corrupting file" file-to-corrupt) - (c/su - (corruption-op (select-last-file path)) - (kill-clickhouse! node test) - (start-clickhouse! node test))))) - {:f (:f op) - :value :corrupted})) + (cond (= (:f op) :corrupt) + (let [nodes (list (rand-nth (:nodes test)))] + (info "Corruption on node" nodes) + (c/on-nodes test nodes + (fn [test node] + (let [file-to-corrupt (select-last-file path)] + (info "Corrupting file" file-to-corrupt) + (c/su + (corruption-op (select-last-file path)) + (kill-clickhouse! node test) + (start-clickhouse! node test))))) + (assoc op :type :info, :value :corrupted)) + :else (assoc op :type :info, :value :not-started))) (teardown! 
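;; The dd command assembled by corrupt-file above expands to something like
;; (file name invented for illustration):
;;   dd if=/dev/zero of=.../logs/changelog_1_100000.bin bs=1 count=1 \
;;      seek=<random offset> conv=notrunc
;; that is, overwrite exactly one byte in place without truncating the file,
;; so the restarted server must detect the damage through its own checksums.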
[this test]))) From 5e20ea2c33a19cdc053a86c94295dba3671a7ba2 Mon Sep 17 00:00:00 2001 From: fuqi Date: Wed, 17 Mar 2021 18:49:24 +0800 Subject: [PATCH 022/155] optimize select final with prewhere primary key --- src/Interpreters/InterpreterSelectQuery.cpp | 2 +- .../MergeTree/MergeTreeWhereOptimizer.cpp | 23 +++++++++++-------- .../MergeTree/MergeTreeWhereOptimizer.h | 9 +++++--- 3 files changed, 21 insertions(+), 13 deletions(-) diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index d0c8966cf07..45ded5223e9 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -393,7 +393,7 @@ InterpreterSelectQuery::InterpreterSelectQuery( view = nullptr; } - if (try_move_to_prewhere && storage && query.where() && !query.prewhere() && !query.final()) + if (try_move_to_prewhere && storage && query.where() && !query.prewhere()) { /// PREWHERE optimization: transfer some condition from WHERE to PREWHERE if enabled and viable if (const auto & column_sizes = storage->getColumnSizes(); !column_sizes.empty()) diff --git a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp index 34cac56d74c..b80c0700602 100644 --- a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp +++ b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp @@ -36,7 +36,8 @@ MergeTreeWhereOptimizer::MergeTreeWhereOptimizer( Poco::Logger * log_) : table_columns{ext::map( metadata_snapshot->getColumns().getAllPhysical(), [](const NameAndTypePair & col) { return col.name; })} - , queried_columns{queried_columns_} + , queried_columns{queried_columns_}, + , primary_key_columns{metadata_snapshot->getPrimaryKey().column_names} , block_with_constants{KeyCondition::getBlockWithConstants(query_info.query, query_info.syntax_analyzer_result, context)} , log{log_} , column_sizes{std::move(column_sizes_)} @@ -114,7 +115,7 @@ static bool isConditionGood(const ASTPtr & condition) } -void MergeTreeWhereOptimizer::analyzeImpl(Conditions & res, const ASTPtr & node) const +void MergeTreeWhereOptimizer::analyzeImpl(Conditions & res, const ASTPtr & node, bool final) const { if (const auto * func_and = node->as(); func_and && func_and->name == "and") { @@ -133,7 +134,7 @@ void MergeTreeWhereOptimizer::analyzeImpl(Conditions & res, const ASTPtr & node) cond.viable = /// Condition depend on some column. Constant expressions are not moved. !cond.identifiers.empty() - && !cannotBeMoved(node) + && !cannotBeMoved(node, final) /// Do not take into consideration the conditions consisting only of the first primary key column && !hasPrimaryKeyAtoms(node) /// Only table columns are considered. Not array joined columns. NOTE We're assuming that aliases was expanded. @@ -149,10 +150,10 @@ void MergeTreeWhereOptimizer::analyzeImpl(Conditions & res, const ASTPtr & node) } /// Transform conjunctions chain in WHERE expression to Conditions list. 
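/// An illustration of the FINAL restriction introduced here (example is ours,
/// not from the patch): for a ReplacingMergeTree table t ORDER BY (k1, k2),
///     SELECT * FROM t FINAL WHERE k2 = 1 AND v = 2
/// only `k2 = 1` is now movable to PREWHERE (conditions on the first key
/// column are kept in WHERE anyway, see hasPrimaryKeyAtoms). PREWHERE filters
/// before FINAL collapses row versions: a predicate on a value column like
/// `v = 2` could discard the newest version of a row while an older one
/// survives collapsing, whereas a predicate on a sorting-key column evaluates
/// identically for all versions of the same row.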
-MergeTreeWhereOptimizer::Conditions MergeTreeWhereOptimizer::analyze(const ASTPtr & expression) const +MergeTreeWhereOptimizer::Conditions MergeTreeWhereOptimizer::analyze(const ASTPtr & expression, bool final) const { Conditions res; - analyzeImpl(res, expression); + analyzeImpl(res, expression, final); return res; } @@ -183,7 +184,7 @@ void MergeTreeWhereOptimizer::optimize(ASTSelectQuery & select) const if (!select.where() || select.prewhere()) return; - Conditions where_conditions = analyze(select.where()); + Conditions where_conditions = analyze(select.where(), select.final()); Conditions prewhere_conditions; UInt64 total_size_of_moved_conditions = 0; @@ -299,6 +300,9 @@ bool MergeTreeWhereOptimizer::isPrimaryKeyAtom(const ASTPtr & ast) const return false; } +bool MergeTreeWhereOptimizer::isPrimaryKey(const String & columnName) const { + return std::find(primary_key_columns.begin(), primary_key_columns.end(), columnName) != primary_key_columns.end(); +} bool MergeTreeWhereOptimizer::isConstant(const ASTPtr & expr) const { @@ -319,7 +323,7 @@ bool MergeTreeWhereOptimizer::isSubsetOfTableColumns(const NameSet & identifiers } -bool MergeTreeWhereOptimizer::cannotBeMoved(const ASTPtr & ptr) const +bool MergeTreeWhereOptimizer::cannotBeMoved(const ASTPtr & ptr, bool final) const { if (const auto * function_ptr = ptr->as()) { @@ -336,12 +340,13 @@ bool MergeTreeWhereOptimizer::cannotBeMoved(const ASTPtr & ptr) const { /// disallow moving result of ARRAY JOIN to PREWHERE if (array_joined_names.count(*opt_name) || - array_joined_names.count(Nested::extractTableName(*opt_name))) + array_joined_names.count(Nested::extractTableName(*opt_name)) || + (final && !isPrimaryKey(*opt_name))) return true; } for (const auto & child : ptr->children) - if (cannotBeMoved(child)) + if (cannotBeMoved(child, final)) return true; return false; diff --git a/src/Storages/MergeTree/MergeTreeWhereOptimizer.h b/src/Storages/MergeTree/MergeTreeWhereOptimizer.h index cad77fb9eed..83c45efef74 100644 --- a/src/Storages/MergeTree/MergeTreeWhereOptimizer.h +++ b/src/Storages/MergeTree/MergeTreeWhereOptimizer.h @@ -67,10 +67,10 @@ private: using Conditions = std::list; - void analyzeImpl(Conditions & res, const ASTPtr & node) const; + void analyzeImpl(Conditions & res, const ASTPtr & node, bool final) const; /// Transform conjunctions chain in WHERE expression to Conditions list. - Conditions analyze(const ASTPtr & expression) const; + Conditions analyze(const ASTPtr & expression, bool final) const; /// Transform Conditions list to WHERE or PREWHERE expression. static ASTPtr reconstruct(const Conditions & conditions); @@ -85,6 +85,8 @@ private: bool isPrimaryKeyAtom(const ASTPtr & ast) const; + bool isPrimaryKey(const String & columnName) const; + bool isConstant(const ASTPtr & expr) const; bool isSubsetOfTableColumns(const NameSet & identifiers) const; @@ -95,7 +97,7 @@ private: * * Also, disallow moving expressions with GLOBAL [NOT] IN. 
*/ - bool cannotBeMoved(const ASTPtr & ptr) const; + bool cannotBeMoved(const ASTPtr & ptr, bool final) const; void determineArrayJoinedNames(ASTSelectQuery & select); @@ -104,6 +106,7 @@ private: String first_primary_key_column; const StringSet table_columns; const Names queried_columns; + const Names primary_key_columns; const Block block_with_constants; Poco::Logger * log; std::unordered_map column_sizes; From 9ab713c2e17b3b055ad76224c095636c5d4b5663 Mon Sep 17 00:00:00 2001 From: fuqi Date: Wed, 17 Mar 2021 18:57:58 +0800 Subject: [PATCH 023/155] optimize select final with prewhere primary key --- src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp | 14 +++++++------- src/Storages/MergeTree/MergeTreeWhereOptimizer.h | 6 +++--- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp index b80c0700602..792884689d5 100644 --- a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp +++ b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp @@ -115,7 +115,7 @@ static bool isConditionGood(const ASTPtr & condition) } -void MergeTreeWhereOptimizer::analyzeImpl(Conditions & res, const ASTPtr & node, bool final) const +void MergeTreeWhereOptimizer::analyzeImpl(Conditions & res, const ASTPtr & node, bool isFinal) const { if (const auto * func_and = node->as(); func_and && func_and->name == "and") { @@ -134,7 +134,7 @@ void MergeTreeWhereOptimizer::analyzeImpl(Conditions & res, const ASTPtr & node, cond.viable = /// Condition depend on some column. Constant expressions are not moved. !cond.identifiers.empty() - && !cannotBeMoved(node, final) + && !cannotBeMoved(node, isFinal) /// Do not take into consideration the conditions consisting only of the first primary key column && !hasPrimaryKeyAtoms(node) /// Only table columns are considered. Not array joined columns. NOTE We're assuming that aliases was expanded. @@ -150,10 +150,10 @@ void MergeTreeWhereOptimizer::analyzeImpl(Conditions & res, const ASTPtr & node, } /// Transform conjunctions chain in WHERE expression to Conditions list. 
-MergeTreeWhereOptimizer::Conditions MergeTreeWhereOptimizer::analyze(const ASTPtr & expression, bool final) const +MergeTreeWhereOptimizer::Conditions MergeTreeWhereOptimizer::analyze(const ASTPtr & expression, bool isFinal) const { Conditions res; - analyzeImpl(res, expression, final); + analyzeImpl(res, expression, isFinal); return res; } @@ -323,7 +323,7 @@ bool MergeTreeWhereOptimizer::isSubsetOfTableColumns(const NameSet & identifiers } -bool MergeTreeWhereOptimizer::cannotBeMoved(const ASTPtr & ptr, bool final) const +bool MergeTreeWhereOptimizer::cannotBeMoved(const ASTPtr & ptr, bool isFinal) const { if (const auto * function_ptr = ptr->as()) { @@ -341,12 +341,12 @@ bool MergeTreeWhereOptimizer::cannotBeMoved(const ASTPtr & ptr, bool final) cons /// disallow moving result of ARRAY JOIN to PREWHERE if (array_joined_names.count(*opt_name) || array_joined_names.count(Nested::extractTableName(*opt_name)) || - (final && !isPrimaryKey(*opt_name))) + (isFinal && !isPrimaryKey(*opt_name))) return true; } for (const auto & child : ptr->children) - if (cannotBeMoved(child, final)) + if (cannotBeMoved(child, isFinal)) return true; return false; diff --git a/src/Storages/MergeTree/MergeTreeWhereOptimizer.h b/src/Storages/MergeTree/MergeTreeWhereOptimizer.h index 83c45efef74..45eb077ed96 100644 --- a/src/Storages/MergeTree/MergeTreeWhereOptimizer.h +++ b/src/Storages/MergeTree/MergeTreeWhereOptimizer.h @@ -67,10 +67,10 @@ private: using Conditions = std::list; - void analyzeImpl(Conditions & res, const ASTPtr & node, bool final) const; + void analyzeImpl(Conditions & res, const ASTPtr & node, bool isFinal) const; /// Transform conjunctions chain in WHERE expression to Conditions list. - Conditions analyze(const ASTPtr & expression, bool final) const; + Conditions analyze(const ASTPtr & expression, bool isFinal) const; /// Transform Conditions list to WHERE or PREWHERE expression. static ASTPtr reconstruct(const Conditions & conditions); @@ -97,7 +97,7 @@ private: * * Also, disallow moving expressions with GLOBAL [NOT] IN. 
*/ - bool cannotBeMoved(const ASTPtr & ptr, bool final) const; + bool cannotBeMoved(const ASTPtr & ptr, bool isFinal) const; void determineArrayJoinedNames(ASTSelectQuery & select); From 341e22341944a405af306e5dd75631f81228c8e6 Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 17 Mar 2021 14:35:37 +0300 Subject: [PATCH 024/155] Better corruption nemesises, options --- .../resources/test_keeper_config.xml | 3 + .../src/jepsen/nukeeper/main.clj | 45 +++++--- .../src/jepsen/nukeeper/nemesis.clj | 103 +++++++++++------- .../src/jepsen/nukeeper/utils.clj | 3 +- 4 files changed, 99 insertions(+), 55 deletions(-) diff --git a/tests/jepsen.nukeeper/resources/test_keeper_config.xml b/tests/jepsen.nukeeper/resources/test_keeper_config.xml index 0e2a688ea0b..7ef34d4bea1 100644 --- a/tests/jepsen.nukeeper/resources/test_keeper_config.xml +++ b/tests/jepsen.nukeeper/resources/test_keeper_config.xml @@ -10,6 +10,9 @@ 60000 trace {quorum_reads} + {snapshot_distance} + {stale_log_gap} + {reserved_log_items} diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj index d62cbabd56f..a5ceae5d5ae 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -28,16 +28,16 @@ (defn cluster-config [test node config-template] - (let [nodes (:nodes test)] - (clojure.string/replace - (clojure.string/replace - (clojure.string/replace - (clojure.string/replace - (clojure.string/replace config-template #"\{quorum_reads\}" (str (boolean (:quorum test)))) - #"\{srv1\}" (get nodes 0)) - #"\{srv2\}" (get nodes 1)) - #"\{srv3\}" (get nodes 2)) - #"\{id\}" (str (inc (.indexOf nodes node)))))) + (let [nodes (:nodes test) + replacement-map {#"\{srv1\}" (get nodes 0) + #"\{srv2\}" (get nodes 1) + #"\{srv3\}" (get nodes 2) + #"\{id\}" (str (inc (.indexOf nodes node))) + #"\{quorum_reads\}" (str (boolean (:quorum test))) + #"\{snapshot_distance\}" (str (:snapshot-distance test)) + #"\{stale_log_gap\}" (str (:stale-log-gap test)) + #"\{reserved_log_items\}" (str (:reserved-log-items test))}] + (reduce #(clojure.string/replace %1 (get %2 0) (get %2 1)) config-template replacement-map))) (defn db [version] @@ -90,11 +90,26 @@ [["-w" "--workload NAME" "What workload should we run?" :missing (str "--workload " (cli/one-of workloads)) :validate [workloads (cli/one-of workloads)]] + [nil "--nemesis NAME" "Which nemesis will poison our lives?" + :missing (str "--nemesis " (cli/one-of custom-nemesis/custom-nemesises)) + :validate [custom-nemesis/custom-nemesises (cli/one-of custom-nemesis/custom-nemesises)]] ["-q" "--quorum" "Use quorum reads, instead of reading from any primary."] ["-r" "--rate HZ" "Approximate number of requests per second, per thread." :default 10 :parse-fn read-string :validate [#(and (number? %) (pos? %)) "Must be a positive number"]] + ["-s" "--snapshot-distance NUM" "Number of log entries to create snapshot" + :default 10000 + :parse-fn read-string + :validate [#(and (number? %) (pos? %)) "Must be a positive number"]] + [nil "--stale-log-gap NUM" "Number of log entries to send snapshot instead of separate logs" + :default 1000 + :parse-fn read-string + :validate [#(and (number? %) (pos? %)) "Must be a positive number"]] + [nil "--reserved-log-items NUM" "Number of log entries to keep after snapshot" + :default 1000 + :parse-fn read-string + :validate [#(and (number? %) (pos? %)) "Must be a positive number"]] [nil "--ops-per-key NUM" "Maximum number of operations on any given key." 
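;; Worked example of the cluster-config substitution defined above, using the
;; defaults from this options table (the XML element name around the
;; placeholder is assumed): with :snapshot-distance 10000 the template line
;;   <snapshot_distance>{snapshot_distance}</snapshot_distance>
;; becomes
;;   <snapshot_distance>10000</snapshot_distance>
;; reduce simply threads the template through every [pattern replacement]
;; pair of replacement-map.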
:default 100 :parse-fn parse-long @@ -106,24 +121,22 @@ [opts] (let [quorum (boolean (:quorum opts)) workload ((get workloads (:workload opts)) opts) - current-nemesis (get custom-nemesis/custom-nemesises "killer")] + current-nemesis (get custom-nemesis/custom-nemesises (:nemesis opts))] (merge tests/noop-test opts - {:name (str "clickhouse-keeper quorum=" quorum " " (name (:workload opts))) + {:name (str "clickhouse-keeper quorum=" quorum " " (name (:workload opts)) (name (:nemesis opts))) :os ubuntu/os :db (db "rbtorrent:a122093aee0bdcb70ca42d5e5fb4ba5544372f5f") :pure-generators true :client (:client workload) - :nemesis (custom-nemesis/logs-corruption-nemesis) + :nemesis (:nemesis current-nemesis) :checker (checker/compose {:perf (checker/perf) :workload (:checker workload)}) :generator (gen/phases (->> (:generator workload) (gen/stagger (/ (:rate opts))) - (gen/nemesis - (cycle [(gen/sleep 5) - {:type :info, :f :corrupt}])) + (gen/nemesis (:generator current-nemesis)) (gen/time-limit (:time-limit opts))) (gen/log "Healing cluster") (gen/nemesis (gen/once {:type :info, :f :stop})) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj index bf2348f1860..93026a7d64c 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj @@ -1,12 +1,12 @@ (ns jepsen.nukeeper.nemesis (:require - [clojure.tools.logging :refer :all] - [jepsen - [nemesis :as nemesis] - [control :as c] - [generator :as gen]] - [jepsen.nukeeper.constants :refer :all] - [jepsen.nukeeper.utils :refer :all])) + [clojure.tools.logging :refer :all] + [jepsen + [nemesis :as nemesis] + [control :as c] + [generator :as gen]] + [jepsen.nukeeper.constants :refer :all] + [jepsen.nukeeper.utils :refer :all])) (defn random-single-node-killer-nemesis [] @@ -21,9 +21,8 @@ (defn select-last-file [path] - (info "EXECUTE ON PATH" path) (last (clojure.string/split - (c/exec :find path :-type :f :-printf "%T+ %p\n" :| :sort :| :awk "{print $2}") + (c/exec :find path :-type :f :-printf "%T+ %p\n" :| :grep :-v :tmp_ :| :sort :| :awk "{print $2}") #"\n"))) (defn random-file-pos @@ -33,8 +32,11 @@ (defn corrupt-file [fname] - (info "Corrupting" fname) - (c/exec :dd "if=/dev/zero" (str "of=" fname) "bs=1" "count=1" (str "seek=" (random-file-pos fname)) "conv=notrunc")) + (if (not (empty? fname)) + (do + (info "Corrupting" fname) + (c/exec :dd "if=/dev/zero" (str "of=" fname) "bs=1" "count=1" (str "seek=" (random-file-pos fname)) "conv=notrunc")) + (info "Nothing to corrupt"))) (defn corruptor-nemesis [path corruption-op] @@ -44,41 +46,66 @@ (invoke! [this test op] (cond (= (:f op) :corrupt) - (let [nodes (list (rand-nth (:nodes test)))] - (info "Corruption on node" nodes) - (c/on-nodes test nodes - (fn [test node] - (let [file-to-corrupt (select-last-file path)] - (info "Corrupting file" file-to-corrupt) - (c/su - (corruption-op (select-last-file path)) - (kill-clickhouse! node test) - (start-clickhouse! node test))))) - (assoc op :type :info, :value :corrupted)) - :else (assoc op :type :info, :value :not-started))) + (let [nodes (list (rand-nth (:nodes test)))] + (info "Corruption on node" nodes) + (c/on-nodes test nodes + (fn [test node] + (c/su + (kill-clickhouse! node test) + (corruption-op path) + (start-clickhouse! node test)))) + (assoc op :type :info, :value :corrupted)) + :else (do (c/on-nodes test (:nodes test) + (fn [test node] + (c/su + (start-clickhouse! 
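;; Sequencing note, inferred from the calls above: the node is killed first so
;; dd does not race concurrent appends to the file being corrupted, and only
;; then restarted, which forces recovery to run against the damaged state.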
node test)))) + (assoc op :type :info, :value :done)))) (teardown! [this test]))) (defn logs-corruption-nemesis [] - (corruptor-nemesis logsdir corrupt-file)) + (corruptor-nemesis logsdir #(corrupt-file (select-last-file %1)))) (defn snapshots-corruption-nemesis [] - (corruptor-nemesis snapshotsdir corrupt-file)) + (corruptor-nemesis snapshotsdir #(corrupt-file (select-last-file %1)))) + +(defn logs-and-snapshots-corruption-nemesis + [] + (corruptor-nemesis coordinationdir (fn [path] + (do + (corrupt-file (select-last-file (str path "/snapshots"))) + (corrupt-file (select-last-file (str path "/logs"))))))) +(defn drop-all-corruption-nemesis + [] + (corruptor-nemesis coordinationdir (fn [path] + (c/exec :rm :-fr path)))) + +(defn start-stop-generator + [] + (->> + (cycle [(gen/sleep 5) + {:type :info, :f :start} + (gen/sleep 5) + {:type :info, :f :stop}]))) + +(defn corruption-generator + [] + (->> + (cycle [(gen/sleep 5) + {:type :info, :f :corrupt}]))) (def custom-nemesises - {"killer" {:nemesis (random-single-node-killer-nemesis) - :generator - (gen/nemesis - (cycle [(gen/sleep 5) - {:type :info, :f :start} - (gen/sleep 5) - {:type :info, :f :stop}]))} + {"single-node-killer" {:nemesis (random-single-node-killer-nemesis) + :generator (start-stop-generator)} "simple-partitioner" {:nemesis (nemesis/partition-random-halves) - :generator - (gen/nemesis - (cycle [(gen/sleep 5) - {:type :info, :f :start} - (gen/sleep 5) - {:type :info, :f :stop}]))}}) + :generator (start-stop-generator)} + "logs-corruptor" {:nemesis (logs-corruption-nemesis) + :generator (corruption-generator)} + "snapshots-corruptor" {:nemesis (snapshots-corruption-nemesis) + :generator (corruption-generator)} + "logs-and-snapshots-corruptor" {:nemesis (logs-and-snapshots-corruption-nemesis) + :generator (corruption-generator)} + "drop-data-corruptor" {:nemesis (drop-all-corruption-nemesis) + :generator (corruption-generator)}}) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj index 9912b34cd46..e9658e9d6d5 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj @@ -81,7 +81,8 @@ [node test] (info "Killing server on node" node) (c/su - (cu/stop-daemon! (str binary-path "/clickhouse") pidfile))) + (cu/stop-daemon! (str binary-path "/clickhouse") pidfile) + (c/exec :rm :-fr (str dir "/status")))) (defn start-clickhouse! 
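;; Removing the status file in kill-clickhouse! above is deliberate: after a
;; hard kill the file survives, and a server start can refuse to proceed while
;; a stale status file claims the directory is in use (server behaviour as we
;; understand it; the commit itself does not explain the line).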
[node test] From ecd081144c6a1db08b4952b0be19548b54d0f873 Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 17 Mar 2021 14:54:26 +0300 Subject: [PATCH 025/155] Add missing hammer-time nemesis --- tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj index 93026a7d64c..d1dc0d55e5f 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj @@ -101,6 +101,8 @@ :generator (start-stop-generator)} "simple-partitioner" {:nemesis (nemesis/partition-random-halves) :generator (start-stop-generator)} + "hammer-time" {:nemesis (hammer-time-nemesis) + :generator (start-stop-generator)} "logs-corruptor" {:nemesis (logs-corruption-nemesis) :generator (corruption-generator)} "snapshots-corruptor" {:nemesis (snapshots-corruption-nemesis) From 7c4fdd79cfa0461156c6dae6015d64ff5e8d66ca Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 17 Mar 2021 15:58:01 +0300 Subject: [PATCH 026/155] Add unique-ids workload --- .../src/jepsen/nukeeper/main.clj | 4 +- .../src/jepsen/nukeeper/unique.clj | 45 +++++++++++++++++++ .../src/jepsen/nukeeper/utils.clj | 4 ++ 3 files changed, 52 insertions(+), 1 deletion(-) create mode 100644 tests/jepsen.nukeeper/src/jepsen/nukeeper/unique.clj diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj index a5ceae5d5ae..8b7c1a6caac 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -4,6 +4,7 @@ [jepsen.nukeeper.set :as set] [jepsen.nukeeper.nemesis :as custom-nemesis] [jepsen.nukeeper.register :as register] + [jepsen.nukeeper.unique :as unique] [jepsen.nukeeper.constants :refer :all] [clojure.string :as str] [jepsen @@ -83,7 +84,8 @@ (def workloads "A map of workload names to functions that construct workloads, given opts." {"set" set/workload - "register" register/workload}) + "register" register/workload + "unique-ids" unique/workload}) (def cli-opts "Additional command line options." diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/unique.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/unique.clj new file mode 100644 index 00000000000..fc8370005aa --- /dev/null +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/unique.clj @@ -0,0 +1,45 @@ +(ns jepsen.nukeeper.unique + (:require + [clojure.tools.logging :refer :all] + [jepsen + [checker :as checker] + [client :as client] + [generator :as gen]] + [jepsen.nukeeper.utils :refer :all] + [zookeeper :as zk]) + (:import (org.apache.zookeeper ZooKeeper KeeperException KeeperException$BadVersionException))) + +(defn parse-and-get-counter + [path] + (Integer/parseInt (apply str (take-last 10 (seq (str path)))))) + +(defrecord UniqueClient [conn nodename] + client/Client + (open! [this test node] + (assoc + (assoc this + :conn (zk-connect node 9181 30000)) + :nodename node)) + + (setup! [this test]) + + (invoke! [this test op] + (case + :generate + (try + (let [result-path (zk-create-sequential conn "/seq-" "")] + (assoc op :type :ok :value (parse-and-get-counter result-path))) + (catch Exception _ (assoc op :type :info, :error :connect-error))))) + + (teardown! [_ test]) + + (close! [_ test])) + +(defn workload + "A generator, client, and checker for a set test." + [opts] + {:client (UniqueClient. 
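;; checker/unique-ids is Jepsen's stock checker for this workload: it passes
;; only if every :ok :generate op in the history returned a distinct value,
;; i.e. the sequential-node counter never hands out the same id twice, even
;; across crashes and leader changes.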
nil nil) + :checker (checker/unique-ids) + :generator (->> + (range) + (map (fn [_] {:type :invoke, :f :generate})))}) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj index e9658e9d6d5..10851a2adc7 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj @@ -63,6 +63,10 @@ [conn path data] (zk/create conn path :data (data/to-bytes (str data)) :persistent? true)) +(defn zk-create-sequential + [conn path-prefix data] + (zk/create conn path-prefix :data (data/to-bytes (str data)) :persistent? true :sequential? true)) + (defn clickhouse-alive? [node test] (info "Checking server alive on" node) From 2ee58ed82fc6fe98d67b5f5cf9469c17e60602d6 Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 17 Mar 2021 16:00:08 +0300 Subject: [PATCH 027/155] Fix style --- tests/jepsen.nukeeper/src/jepsen/nukeeper/unique.clj | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/unique.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/unique.clj index fc8370005aa..9c753dfe0ab 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/unique.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/unique.clj @@ -25,11 +25,11 @@ (invoke! [this test op] (case - :generate - (try - (let [result-path (zk-create-sequential conn "/seq-" "")] - (assoc op :type :ok :value (parse-and-get-counter result-path))) - (catch Exception _ (assoc op :type :info, :error :connect-error))))) + :generate + (try + (let [result-path (zk-create-sequential conn "/seq-" "")] + (assoc op :type :ok :value (parse-and-get-counter result-path))) + (catch Exception _ (assoc op :type :info, :error :connect-error))))) (teardown! 
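;; Background for parse-and-get-counter above: ZooKeeper-style sequential
;; creation appends a zero-padded 10-digit counter to the requested prefix,
;; e.g. /seq-0000000042, which is exactly why the parser takes the last 10
;; characters of the returned path.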
[_ test]) From e116e8d5e84c0a3fef9e2c5d55157d8284c5d95a Mon Sep 17 00:00:00 2001 From: fuqi Date: Wed, 17 Mar 2021 22:19:10 +0800 Subject: [PATCH 028/155] fix bug --- src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp index 792884689d5..256ea69c4e9 100644 --- a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp +++ b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp @@ -36,7 +36,7 @@ MergeTreeWhereOptimizer::MergeTreeWhereOptimizer( Poco::Logger * log_) : table_columns{ext::map( metadata_snapshot->getColumns().getAllPhysical(), [](const NameAndTypePair & col) { return col.name; })} - , queried_columns{queried_columns_}, + , queried_columns{queried_columns_} , primary_key_columns{metadata_snapshot->getPrimaryKey().column_names} , block_with_constants{KeyCondition::getBlockWithConstants(query_info.query, query_info.syntax_analyzer_result, context)} , log{log_} @@ -120,7 +120,7 @@ void MergeTreeWhereOptimizer::analyzeImpl(Conditions & res, const ASTPtr & node, if (const auto * func_and = node->as(); func_and && func_and->name == "and") { for (const auto & elem : func_and->arguments->children) - analyzeImpl(res, elem); + analyzeImpl(res, elem, isFinal); } else { From b8edc12812e0718e065110edc0ac621069f38c4f Mon Sep 17 00:00:00 2001 From: fuqi Date: Wed, 17 Mar 2021 23:56:55 +0800 Subject: [PATCH 029/155] fix code style --- .../MergeTree/MergeTreeWhereOptimizer.cpp | 20 +++++++++---------- .../MergeTree/MergeTreeWhereOptimizer.h | 8 ++++---- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp index 256ea69c4e9..2effcbb6c75 100644 --- a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp +++ b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp @@ -115,12 +115,12 @@ static bool isConditionGood(const ASTPtr & condition) } -void MergeTreeWhereOptimizer::analyzeImpl(Conditions & res, const ASTPtr & node, bool isFinal) const +void MergeTreeWhereOptimizer::analyzeImpl(Conditions & res, const ASTPtr & node, bool is_final) const { if (const auto * func_and = node->as(); func_and && func_and->name == "and") { for (const auto & elem : func_and->arguments->children) - analyzeImpl(res, elem, isFinal); + analyzeImpl(res, elem, is_final); } else { @@ -134,7 +134,7 @@ void MergeTreeWhereOptimizer::analyzeImpl(Conditions & res, const ASTPtr & node, cond.viable = /// Condition depend on some column. Constant expressions are not moved. !cond.identifiers.empty() - && !cannotBeMoved(node, isFinal) + && !cannotBeMoved(node, is_final) /// Do not take into consideration the conditions consisting only of the first primary key column && !hasPrimaryKeyAtoms(node) /// Only table columns are considered. Not array joined columns. NOTE We're assuming that aliases was expanded. @@ -150,10 +150,10 @@ void MergeTreeWhereOptimizer::analyzeImpl(Conditions & res, const ASTPtr & node, } /// Transform conjunctions chain in WHERE expression to Conditions list. 
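/// Worked through on an invented example: WHERE a = 1 AND b = 2 AND slow(c)
/// becomes three Condition entries; viability then excludes constant-only
/// expressions, array-joined columns, atoms on the first primary key column,
/// and (since the previous patches) non-primary-key columns when FINAL is
/// used. Surviving conditions are ranked by estimated column size so the
/// cheapest ones move to PREWHERE.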
-MergeTreeWhereOptimizer::Conditions MergeTreeWhereOptimizer::analyze(const ASTPtr & expression, bool isFinal) const +MergeTreeWhereOptimizer::Conditions MergeTreeWhereOptimizer::analyze(const ASTPtr & expression, bool is_final) const { Conditions res; - analyzeImpl(res, expression, isFinal); + analyzeImpl(res, expression, is_final); return res; } @@ -300,8 +300,8 @@ bool MergeTreeWhereOptimizer::isPrimaryKeyAtom(const ASTPtr & ast) const return false; } -bool MergeTreeWhereOptimizer::isPrimaryKey(const String & columnName) const { - return std::find(primary_key_columns.begin(), primary_key_columns.end(), columnName) != primary_key_columns.end(); +bool MergeTreeWhereOptimizer::isPrimaryKey(const String & column_name) const { + return std::find(primary_key_columns.begin(), primary_key_columns.end(), column_name) != primary_key_columns.end(); } bool MergeTreeWhereOptimizer::isConstant(const ASTPtr & expr) const @@ -323,7 +323,7 @@ bool MergeTreeWhereOptimizer::isSubsetOfTableColumns(const NameSet & identifiers } -bool MergeTreeWhereOptimizer::cannotBeMoved(const ASTPtr & ptr, bool isFinal) const +bool MergeTreeWhereOptimizer::cannotBeMoved(const ASTPtr & ptr, bool is_final) const { if (const auto * function_ptr = ptr->as()) { @@ -341,12 +341,12 @@ bool MergeTreeWhereOptimizer::cannotBeMoved(const ASTPtr & ptr, bool isFinal) co /// disallow moving result of ARRAY JOIN to PREWHERE if (array_joined_names.count(*opt_name) || array_joined_names.count(Nested::extractTableName(*opt_name)) || - (isFinal && !isPrimaryKey(*opt_name))) + (is_final && !isPrimaryKey(*opt_name))) return true; } for (const auto & child : ptr->children) - if (cannotBeMoved(child, isFinal)) + if (cannotBeMoved(child, is_final)) return true; return false; diff --git a/src/Storages/MergeTree/MergeTreeWhereOptimizer.h b/src/Storages/MergeTree/MergeTreeWhereOptimizer.h index 45eb077ed96..85d1df583fa 100644 --- a/src/Storages/MergeTree/MergeTreeWhereOptimizer.h +++ b/src/Storages/MergeTree/MergeTreeWhereOptimizer.h @@ -67,10 +67,10 @@ private: using Conditions = std::list; - void analyzeImpl(Conditions & res, const ASTPtr & node, bool isFinal) const; + void analyzeImpl(Conditions & res, const ASTPtr & node, bool is_final) const; /// Transform conjunctions chain in WHERE expression to Conditions list. - Conditions analyze(const ASTPtr & expression, bool isFinal) const; + Conditions analyze(const ASTPtr & expression, bool is_final) const; /// Transform Conditions list to WHERE or PREWHERE expression. static ASTPtr reconstruct(const Conditions & conditions); @@ -85,7 +85,7 @@ private: bool isPrimaryKeyAtom(const ASTPtr & ast) const; - bool isPrimaryKey(const String & columnName) const; + bool isPrimaryKey(const String & column_name) const; bool isConstant(const ASTPtr & expr) const; @@ -97,7 +97,7 @@ private: * * Also, disallow moving expressions with GLOBAL [NOT] IN. 
*/ - bool cannotBeMoved(const ASTPtr & ptr, bool isFinal) const; + bool cannotBeMoved(const ASTPtr & ptr, bool is_final) const; void determineArrayJoinedNames(ASTSelectQuery & select); From cfa92f0045436f60265086d4fb8e5434b75b92c4 Mon Sep 17 00:00:00 2001 From: fuqi Date: Thu, 18 Mar 2021 00:25:43 +0800 Subject: [PATCH 030/155] fix code style --- src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp index 2effcbb6c75..98e40bf394d 100644 --- a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp +++ b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp @@ -300,10 +300,13 @@ bool MergeTreeWhereOptimizer::isPrimaryKeyAtom(const ASTPtr & ast) const return false; } -bool MergeTreeWhereOptimizer::isPrimaryKey(const String & column_name) const { + +bool MergeTreeWhereOptimizer::isPrimaryKey(const String & column_name) const +{ return std::find(primary_key_columns.begin(), primary_key_columns.end(), column_name) != primary_key_columns.end(); } + bool MergeTreeWhereOptimizer::isConstant(const ASTPtr & expr) const { const auto column_name = expr->getColumnName(); From bc22f4f6ebd2f18b5a0bc99b756aea9a3cb6e0b7 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Wed, 17 Mar 2021 20:14:36 +0300 Subject: [PATCH 031/155] Updated implementation --- src/Common/HashTable/FixedDeadlineHashMap.h | 253 --------------- src/Dictionaries/CacheDictionary.cpp | 12 +- src/Dictionaries/CacheDictionaryStorage.h | 291 +++++++++++------- src/Dictionaries/ICacheDictionaryStorage.h | 8 +- src/Dictionaries/SSDCacheDictionaryStorage.h | 2 + .../01681_cache_dictionary_simple_key.sql | 4 +- .../01682_cache_dictionary_complex_key.sql | 4 +- .../01684_ssd_cache_dictionary_simple_key.sql | 4 +- ...01685_ssd_cache_dictionary_complex_key.sql | 4 +- 9 files changed, 198 insertions(+), 384 deletions(-) delete mode 100644 src/Common/HashTable/FixedDeadlineHashMap.h diff --git a/src/Common/HashTable/FixedDeadlineHashMap.h b/src/Common/HashTable/FixedDeadlineHashMap.h deleted file mode 100644 index 0f7819f4020..00000000000 --- a/src/Common/HashTable/FixedDeadlineHashMap.h +++ /dev/null @@ -1,253 +0,0 @@ -#pragma once - -#include -#include -#include -#include - -using TimePoint = std::chrono::system_clock::time_point; - -template -struct DeadlineCell : - public std::conditional_t, - HashMapCell> -{ - using Key = TKey; - - using Base = std::conditional_t, - HashMapCell>; - - using Mapped = typename Base::Mapped; - using State = typename Base::State; - - using mapped_type = Mapped; - using key_type = Key; - - using Base::Base; - - inline TimePoint getDeadline() const { return deadline; } - - void setDeadline(TimePoint & deadline_value) { deadline = deadline_value; } - -private: - TimePoint deadline; -}; - -template -class FixedDeadlineHashMapImpl : - private HashMapTable< - TKey, - DeadlineCell, - Hash, - HashTableGrower<>, - HashTableAllocator> -{ - /// TODO: Make custom grower - using Base = HashMapTable< - TKey, - DeadlineCell, - Hash, - HashTableGrower<>, - HashTableAllocator>; - - static size_t calculateMaxSize(size_t max_size, size_t max_collision_resolution_chain) - { - return roundUpToPowerOfTwoOrZero(std::max(max_size, max_collision_resolution_chain)); - } -public: - using Cell = DeadlineCell; - using Key = TKey; - using Value = TValue; - using Mapped = typename Cell::Mapped; - - explicit FixedDeadlineHashMapImpl(size_t max_size_, size_t 
max_collision_resolution_chain_, Disposer disposer_ = Disposer()) - : Base(calculateMaxSize(max_size_, max_collision_resolution_chain_)) - , max_collision_resolution_chain(max_collision_resolution_chain_) - , max_size(max_size_) - , disposer(std::move(disposer_)) - { - assert(max_size > 0); - assert(max_collision_resolution_chain > 0); - } - - ~FixedDeadlineHashMapImpl() - { - clear(); - } - - Cell * get(const Key & key) - { - if (Cell::isZero(key, *this)) - return this->hasZero() ? this->zeroValue() : nullptr; - - /// TODO: Optimize - - size_t hash_value = Base::hash(key); - size_t place_value = Base::grower.place(hash_value); - size_t resolution_chain = max_collision_resolution_chain; - - while (resolution_chain != 0) - { - auto & cell = Base::buf[place_value]; - - if (cell.isZero(*this)) - return nullptr; - - if (cell.keyEquals(key, hash_value, *this)) - return &cell; - - place_value = Base::grower.next(place_value); - --resolution_chain; - } - - return nullptr; - } - - const Cell * get(const Key & key) const - { - return const_cast *>(this)->get(key); - } - - std::pair ALWAYS_INLINE insert(const Key & key, const Value & value) - { - return emplace(key, value); - } - - std::pair ALWAYS_INLINE insert(const Key & key, Value && value) - { - return emplace(key, std::move(value)); - } - - template - std::pair ALWAYS_INLINE emplace(const Key & key, Args && ... args) - { - size_t hash_value = Base::hash(key); - std::pair result; - - if (!emplaceIfZero(key, hash_value, result)) - result = emplaceNonZeroImpl(key, hash_value); - - bool was_inserted = result.second; - - if (was_inserted) - new (&result.first->getMapped()) Value(std::forward(args)...); - - return result; - } - - template - void reinsert(Cell * place_to_use, const Key & key, Args && ... args) - { - size_t hash_value = Base::hash(key); - - new (place_to_use) Cell(key, *this); - new (&place_to_use->getMapped()) Value(std::forward(args)...); - place_to_use->setHash(hash_value); - } - - using Base::size; - - using iterator = typename Base::iterator; - using const_iterator = typename Base::const_iterator; - - using Base::begin; - using Base::end; - - size_t getMaxSize() const { return max_size; } - - size_t getSizeInBytes() const { return Base::getBufferSizeInBytes(); } - - void clear() - { - for (auto & cell : *this) - disposer(cell.getKey(), cell.getMapped()); - } - -private: - size_t max_collision_resolution_chain; - size_t max_size; - Disposer disposer; - - bool emplaceIfZero(const Key & key, size_t hash_value, std::pair & result) - { - if (!Cell::isZero(key, *this)) - return false; - - if (this->hasZero()) - { - result = {this->zeroValue(), false}; - return true; - } - - ++Base::m_size; - - this->setHasZero(); - this->zeroValue()->setHash(hash_value); - result = {this->zeroValue(), true}; - - return true; - } - - std::pair emplaceNonZeroImpl(const Key & key, size_t hash_value) - { - TimePoint oldest_time = TimePoint::max(); - size_t place_value = Base::grower.place(hash_value); - size_t resolution_chain = max_collision_resolution_chain; - - bool use_old_value_place = false; - Cell * place_to_insert = nullptr; - - while (resolution_chain != 0) - { - auto & cell = Base::buf[place_value]; - - if (cell.isZero(*this)) - { - use_old_value_place = false; - place_to_insert = &cell; - break; - } - - if (cell.keyEquals(key, hash_value, *this)) - return std::make_pair(&cell, false); - - if (cell.getDeadline() < oldest_time) - { - use_old_value_place = true; - place_to_insert = &cell; - } - - place_value = Base::grower.next(place_value); - 
--resolution_chain; - } - - if (!place_to_insert) - place_to_insert = &Base::buf[place_value]; - - if (use_old_value_place) - return std::make_pair(place_to_insert, false); - else - { - ++Base::m_size; - - new (place_to_insert) Cell(key, *this); - place_to_insert->setHash(hash_value); - - return std::make_pair(place_to_insert, true); - } - } -}; - -template -struct DefaultFixedHashMapCellDisposer -{ - void operator()(const Key &, const Mapped &) const {} -}; - -template , typename Hash = DefaultHash> -using FixedDeadlineHashMap = FixedDeadlineHashMapImpl; - -template , typename Hash = DefaultHash> -using FixedDeadlineHashMapWithSavedHash = FixedDeadlineHashMapImpl; diff --git a/src/Dictionaries/CacheDictionary.cpp b/src/Dictionaries/CacheDictionary.cpp index fe777355ca1..bef391c4222 100644 --- a/src/Dictionaries/CacheDictionary.cpp +++ b/src/Dictionaries/CacheDictionary.cpp @@ -332,10 +332,16 @@ Columns CacheDictionary::getColumnsImpl( FetchResult result_of_fetch_from_storage; - { - /// Write lock on storage - const ProfilingScopedWriteRWLock write_lock{rw_lock, ProfileEvents::DictCacheLockWriteNs}; + bool protect_get_with_write_lock = cache_storage_ptr->canPerformFetchByMultipleThreadsWithoutLock(); + if (protect_get_with_write_lock) + { + const ProfilingScopedReadRWLock write_lock{rw_lock, ProfileEvents::DictCacheLockWriteNs}; + result_of_fetch_from_storage = cache_storage_ptr->fetchColumnsForKeys(keys, request); + } + else + { + const ProfilingScopedWriteRWLock write_lock{rw_lock, ProfileEvents::DictCacheLockWriteNs}; result_of_fetch_from_storage = cache_storage_ptr->fetchColumnsForKeys(keys, request); } diff --git a/src/Dictionaries/CacheDictionaryStorage.h b/src/Dictionaries/CacheDictionaryStorage.h index a98f92e5da9..6b1200dd474 100644 --- a/src/Dictionaries/CacheDictionaryStorage.h +++ b/src/Dictionaries/CacheDictionaryStorage.h @@ -4,14 +4,11 @@ #include #include -#include -#include #include #include #include #include -#include #include #include #include @@ -38,6 +35,9 @@ struct CacheDictionaryStorageConfiguration template class CacheDictionaryStorage final : public ICacheDictionaryStorage { + + static constexpr size_t max_collision_length = 10; + public: using KeyType = std::conditional_t; static_assert(dictionary_key_type != DictionaryKeyType::range, "Range key type is not supported by CacheDictionaryStorage"); @@ -47,13 +47,19 @@ public: CacheDictionaryStorageConfiguration & configuration_) : configuration(configuration_) , rnd_engine(randomSeed()) - , cache(configuration.max_size_in_cells, 10, { *this }) { + size_t cells_size = roundUpToPowerOfTwoOrZero(std::max(configuration.max_size_in_cells, max_collision_length)); + + cells.resize_fill(cells_size); + size_overlap_mask = cells_size - 1; + setup(dictionary_structure); } bool returnsFetchedColumnsInOrderOfRequestedKeys() const override { return true; } + bool canPerformFetchByMultipleThreadsWithoutLock() const override { return true; } + String getName() const override { if (dictionary_key_type == DictionaryKeyType::simple) @@ -134,9 +140,9 @@ public: throw Exception("Method getCachedComplexKeys is not supported for simple key storage", ErrorCodes::NOT_IMPLEMENTED); } - size_t getSize() const override { return cache.size(); } + size_t getSize() const override { return size; } - size_t getMaxSize() const override { return cache.getMaxSize(); } + size_t getMaxSize() const override { return configuration.max_size_in_cells; } size_t getBytesAllocated() const override { @@ -151,7 +157,7 @@ public: }); } - return arena.size() + 
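// Two consequences of this rewrite worth spelling out (our summary, not from
// the commit message): the open-addressing table is preallocated, so its
// memory cost becomes the flat sizeof(Cell) * max_size_in_cells term below
// rather than a hash map's variable footprint; and since this storage's fetch
// path only reads the preallocated cells, CacheDictionary can now take a
// shared (read) lock for fetches whenever
// canPerformFetchByMultipleThreadsWithoutLock() returns true.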
cache.getSizeInBytes() + attributes_size_in_bytes; + return arena.size() + sizeof(Cell) * configuration.max_size_in_cells + attributes_size_in_bytes; } private: @@ -175,9 +181,9 @@ private: KeysStorageFetchResult result; result.fetched_columns = fetch_request.makeAttributesResultColumns(); - result.key_index_to_state.resize_fill(keys.size(), {KeyState::not_found}); + result.key_index_to_state.resize_fill(keys.size()); - const auto now = std::chrono::system_clock::now(); + const time_t now = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); size_t fetched_columns_index = 0; size_t keys_size = keys.size(); @@ -190,54 +196,39 @@ private: for (size_t key_index = 0; key_index < keys_size; ++key_index) { auto key = keys[key_index]; - auto * it = cache.get(key); + auto [key_state, cell_index] = getKeyStateAndCellIndex(key, now); - if (!it) + if (unlikely(key_state == KeyState::not_found)) { result.key_index_to_state[key_index] = {KeyState::not_found}; ++result.not_found_keys_size; continue; } - auto deadline = it->getDeadline(); - const auto & cell = it->getMapped(); + auto & cell = cells[cell_index]; - if (now > deadline + max_lifetime_seconds) - { - result.key_index_to_state[key_index] = {KeyState::not_found}; - ++result.not_found_keys_size; - continue; - } - - bool cell_is_expired = false; - KeyState::State key_state = KeyState::found; - - if (now > deadline) - { - cell_is_expired = true; - key_state = KeyState::expired; - } + result.expired_keys_size += static_cast(key_state == KeyState::expired); result.key_index_to_state[key_index] = {key_state, fetched_columns_index}; - ++fetched_columns_index; + fetched_keys[fetched_columns_index] = FetchedKey(cell.element_index, cell.is_default); - result.expired_keys_size += cell_is_expired; - result.found_keys_size += !cell_is_expired; + ++fetched_columns_index; result.key_index_to_state[key_index].setDefaultValue(cell.is_default); result.default_keys_size += cell.is_default; - - fetched_keys[key_index] = FetchedKey{cell.element_index, cell.is_default}; } + result.found_keys_size = keys_size - (result.expired_keys_size + result.not_found_keys_size); + for (size_t attribute_index = 0; attribute_index < fetch_request.attributesSize(); ++attribute_index) { if (!fetch_request.shouldFillResultColumnWithIndex(attribute_index)) continue; - size_t fetched_keys_size = fetched_keys.size(); auto & attribute = attributes[attribute_index]; const auto & default_value_provider = fetch_request.defaultValueProviderAtIndex(attribute_index); + + size_t fetched_keys_size = fetched_keys.size(); auto & fetched_column = *result.fetched_columns[attribute_index]; fetched_column.reserve(fetched_keys_size); @@ -245,7 +236,7 @@ private: { auto & container = std::get>(attribute.attribute_container); - for (size_t fetched_key_index = 0; fetched_key_index < fetched_keys.size(); ++fetched_key_index) + for (size_t fetched_key_index = 0; fetched_key_index < fetched_columns_index; ++fetched_key_index) { auto fetched_key = fetched_keys[fetched_key_index]; @@ -272,7 +263,7 @@ private: if constexpr (std::is_same_v) { - for (size_t fetched_key_index = 0; fetched_key_index < fetched_keys.size(); ++fetched_key_index) + for (size_t fetched_key_index = 0; fetched_key_index < fetched_columns_index; ++fetched_key_index) { auto fetched_key = fetched_keys[fetched_key_index]; @@ -287,7 +278,7 @@ private: } else { - for (size_t fetched_key_index = 0; fetched_key_index < fetched_keys.size(); ++fetched_key_index) + for (size_t fetched_key_index = 0; fetched_key_index < 
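// A hedged sketch of the lookup this loop relies on (getKeyStateAndCellIndex
// is defined further down in this file, outside this excerpt): starting from
// the key's hash masked with size_overlap_mask, probe at most
// max_collision_length consecutive cells, comparing stored keys and
// classifying a match against `now` and the cell's deadline as
// found / expired / not_found.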
fetched_columns_index; ++fetched_key_index) { auto fetched_key = fetched_keys[fetched_key_index]; auto & data = column_typed.getData(); @@ -314,23 +305,27 @@ private: { const auto now = std::chrono::system_clock::now(); - size_t keys_size = keys.size(); - - size_t columns_size = columns.size(); Field column_value; - for (size_t key_index = 0; key_index < keys_size; ++key_index) + for (size_t key_index = 0; key_index < keys.size(); ++key_index) { auto key = keys[key_index]; - auto [it, was_inserted] = cache.insert(key, {}); + size_t cell_index = getCellIndexForInsert(key); + auto & cell = cells[cell_index]; + + cell.is_default = false; + + bool was_inserted = cell.deadline == 0; if (was_inserted) { - auto & cell = it->getMapped(); - cell.is_default = false; + if constexpr (std::is_same_v) + cell.key = copyStringInArena(key); + else + cell.key = key; - for (size_t attribute_index = 0; attribute_index < columns_size; ++attribute_index) + for (size_t attribute_index = 0; attribute_index < columns.size(); ++attribute_index) { auto & column = columns[attribute_index]; @@ -347,38 +342,36 @@ private: container.back() = column_value; else if constexpr (std::is_same_v) { - const String & value = column_value.get(); - StringRef inserted_value = copyStringInArena(StringRef { value.data(), value.size() }); + const String & string_value = column_value.get(); + StringRef string_value_ref = StringRef {string_value.data(), string_value.size()}; + StringRef inserted_value = copyStringInArena(string_value_ref); container.back() = inserted_value; } else - container.back() = column_value.get(); + container.back() = column_value.get>(); }); } + + ++size; } else { - auto & cell_key = it->getKey(); - - Cell cell; - - size_t existing_index = it->getMapped().element_index; - - cell.element_index = existing_index; - cell.is_default = false; - - if (cell_key != key) + if (cell.key != key) { - /// In case of complex key we keep it in arena if constexpr (std::is_same_v) - arena.free(const_cast(key.data), key.size); + { + char * data = const_cast(cell.key.data); + arena.free(data, cell.key.size); + cell.key = copyStringInArena(key); + } + else + cell.key = key; } - cache.reinsert(it, key, cell); + /// Put values into existing index + size_t index_to_use = cell.element_index; - /// Put values into index - - for (size_t attribute_index = 0; attribute_index < columns_size; ++attribute_index) + for (size_t attribute_index = 0; attribute_index < columns.size(); ++attribute_index) { auto & column = columns[attribute_index]; @@ -389,20 +382,26 @@ private: column->get(key_index, column_value); if constexpr (std::is_same_v) - container[existing_index] = column_value; + container[index_to_use] = column_value; else if constexpr (std::is_same_v) { - const String & value = column_value.get(); - StringRef inserted_value = copyStringInArena(StringRef { value.data(), value.size() }); - container[existing_index] = inserted_value; + const String & string_value = column_value.get(); + StringRef string_ref_value = StringRef {string_value.data(), string_value.size()}; + StringRef inserted_value = copyStringInArena(string_ref_value); + + StringRef previous_value = container[index_to_use]; + char * data = const_cast(previous_value.data); + arena.free(data, previous_value.size); + + container[index_to_use] = inserted_value; } else - container[existing_index] = column_value.get(); + container[index_to_use] = column_value.get>(); }); } } - setCellDeadline(*it, now); + setCellDeadline(cell, now); } } @@ -416,55 +415,64 @@ private: { auto key = 
keys[key_index]; - Cell value; - value.is_default = true; + size_t cell_index = getCellIndexForInsert(key); + auto & cell = cells[cell_index]; - auto [it, was_inserted] = cache.insert(key, value); + bool was_inserted = cell.deadline == 0; + + cell.is_default = true; if (was_inserted) { - auto & cell = it->getMapped(); + if constexpr (std::is_same_v) + cell.key = copyStringInArena(key); + else + cell.key = key; for (size_t attribute_index = 0; attribute_index < attributes.size(); ++attribute_index) { getAttributeContainer(attribute_index, [&](auto & container) { container.emplace_back(); - cell.element_index = container.size(); + cell.element_index = container.size() - 1; }); } + + ++size; } else { - value.element_index = it->getMapped().element_index; - - if (it->getKey() != key) + if (cell.key != key) { - /// In case of complex key we keep it in arena if constexpr (std::is_same_v) - arena.free(const_cast(key.data), key.size); + { + char * data = const_cast(cell.key.data); + arena.free(data, cell.key.size); + cell.key = copyStringInArena(key); + } + else + cell.key = key; } - - cache.reinsert(it, key, value); } - setCellDeadline(*it, now); + setCellDeadline(cell, now); } } PaddedPODArray getCachedKeysImpl() const { PaddedPODArray result; - result.reserve(cache.size()); + result.reserve(size); - for (auto & node : cache) + for (auto cell : cells) { - auto & cell = node.getMapped(); + if (cell.deadline == 0) + continue; if (cell.is_default) continue; - result.emplace_back(node.getKey()); + result.emplace_back(cell.key); } return result; @@ -545,18 +553,16 @@ private: } } + using TimePoint = std::chrono::system_clock::time_point; + struct Cell { + KeyType key; size_t element_index; bool is_default; + time_t deadline; }; - CacheDictionaryStorageConfiguration configuration; - - ArenaWithFreeLists arena; - - pcg64 rnd_engine; - struct Attribute { AttributeUnderlyingType type; @@ -581,38 +587,28 @@ private: std::vector> attribute_container; }; - class CacheStorageCellDisposer - { - public: - CacheDictionaryStorage & storage; + CacheDictionaryStorageConfiguration configuration; - template - void operator()(const Key & key, const Value &) const - { - /// In case of complex key we keep it in arena - if constexpr (std::is_same_v) - storage.arena.free(const_cast(key.data), key.size); - } - }; + pcg64 rnd_engine; - using SimpleFixedDeadlineHashMap = FixedDeadlineHashMap; - using ComplexFixedDeadlineHashMap = FixedDeadlineHashMap; + size_t size_overlap_mask = 0; - using FixedDeadlineHashMap = std::conditional_t< - dictionary_key_type == DictionaryKeyType::simple, - SimpleFixedDeadlineHashMap, - ComplexFixedDeadlineHashMap>; + size_t size = 0; - using FixedDeadlineHashMapCell = typename FixedDeadlineHashMap::Cell; + PaddedPODArray cells; - inline void setCellDeadline(FixedDeadlineHashMapCell & cell, TimePoint now) + ArenaWithFreeLists arena; + + std::vector attributes; + + inline void setCellDeadline(Cell & cell, TimePoint now) { if (configuration.lifetime.min_sec == 0 && configuration.lifetime.max_sec == 0) { /// This maybe not obvious, but when we define is this cell is expired or expired permanently, we add strict_max_lifetime_seconds /// to the expiration time. And it overflows pretty well. 
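    /// Illustration, assuming time_t is a signed integer type: if the deadline were simply the
    /// maximum representable time_t, then `now > cell.deadline + max_lifetime_seconds` in
    /// getKeyStateAndCellIndex below would overflow and wrap negative, and a freshly written cell
    /// would look permanently expired. Keeping the deadline 2 * strict_max_lifetime_seconds below
    /// the maximum keeps that sum representable.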
auto deadline = std::chrono::time_point::max() - 2 * std::chrono::seconds(configuration.strict_max_lifetime_seconds); - cell.setDeadline(deadline); + cell.deadline = std::chrono::system_clock::to_time_t(deadline); return; } @@ -622,12 +618,73 @@ private: std::uniform_int_distribution distribution{min_sec_lifetime, max_sec_lifetime}; auto deadline = now + std::chrono::seconds(distribution(rnd_engine)); - cell.setDeadline(deadline); + cell.deadline = std::chrono::system_clock::to_time_t(deadline); } - FixedDeadlineHashMap cache; + inline size_t getCellIndex(const KeyType key) const + { + const size_t hash = DefaultHash()(key); + const size_t index = hash & size_overlap_mask; + return index; + } - std::vector attributes; + using KeyStateAndCellIndex = std::pair; + + inline KeyStateAndCellIndex getKeyStateAndCellIndex(const KeyType key, const time_t now) const + { + size_t place_value = getCellIndex(key); + const size_t place_value_end = place_value + max_collision_length; + + time_t max_lifetime_seconds = static_cast(configuration.strict_max_lifetime_seconds); + + for (; place_value < place_value_end; ++place_value) + { + const auto cell_place_value = place_value & size_overlap_mask; + const auto & cell = cells[cell_place_value]; + + if (cell.key != key) + continue; + + if (unlikely(now > cell.deadline + max_lifetime_seconds)) + return std::make_pair(KeyState::not_found, cell_place_value); + + if (unlikely(now > cell.deadline)) + return std::make_pair(KeyState::expired, cell_place_value); + + return std::make_pair(KeyState::found, cell_place_value); + } + + return std::make_pair(KeyState::not_found, place_value); + } + + inline size_t getCellIndexForInsert(const KeyType & key) const + { + size_t place_value = getCellIndex(key); + const size_t place_value_end = place_value + max_collision_length; + size_t oldest_place_value = place_value; + + time_t oldest_time = std::numeric_limits::max(); + + for (; place_value < place_value_end; ++place_value) + { + const size_t cell_place_value = place_value & size_overlap_mask; + const Cell cell = cells[cell_place_value]; + + if (cell.deadline == 0) + return cell_place_value; + + if (cell.key == key) + return place_value; + + if (cell.deadline < oldest_time) + { + oldest_time = cell.deadline; + oldest_place_value = cell_place_value; + } + } + + return oldest_place_value; + } }; } diff --git a/src/Dictionaries/ICacheDictionaryStorage.h b/src/Dictionaries/ICacheDictionaryStorage.h index a428cebdfe7..8a3202b5590 100644 --- a/src/Dictionaries/ICacheDictionaryStorage.h +++ b/src/Dictionaries/ICacheDictionaryStorage.h @@ -12,9 +12,9 @@ struct KeyState { enum State: uint8_t { - not_found = 2, - expired = 4, - found = 8, + not_found = 0, + expired = 1, + found = 2, }; KeyState(State state_, size_t fetched_column_index_) @@ -72,6 +72,8 @@ public: /// Necessary if all keys are found we can return result to client without additional aggregation virtual bool returnsFetchedColumnsInOrderOfRequestedKeys() const = 0; + virtual bool canPerformFetchByMultipleThreadsWithoutLock() const = 0; + /// Name of storage virtual String getName() const = 0; diff --git a/src/Dictionaries/SSDCacheDictionaryStorage.h b/src/Dictionaries/SSDCacheDictionaryStorage.h index e061b783ee4..32d521db103 100644 --- a/src/Dictionaries/SSDCacheDictionaryStorage.h +++ b/src/Dictionaries/SSDCacheDictionaryStorage.h @@ -815,6 +815,8 @@ public: bool returnsFetchedColumnsInOrderOfRequestedKeys() const override { return false; } + bool canPerformFetchByMultipleThreadsWithoutLock() const override { return 
false; } + String getName() const override { if (dictionary_key_type == DictionaryKeyType::simple) diff --git a/tests/queries/0_stateless/01681_cache_dictionary_simple_key.sql b/tests/queries/0_stateless/01681_cache_dictionary_simple_key.sql index ee2cde963d7..f200ead341b 100644 --- a/tests/queries/0_stateless/01681_cache_dictionary_simple_key.sql +++ b/tests/queries/0_stateless/01681_cache_dictionary_simple_key.sql @@ -40,7 +40,7 @@ SELECT dictGetOrDefault('01681_database_for_cache_dictionary.cache_dictionary_si SELECT 'dictHas'; SELECT dictHas('01681_database_for_cache_dictionary.cache_dictionary_simple_key_simple_attributes', number) FROM system.numbers LIMIT 4; SELECT 'select all values as input stream'; -SELECT * FROM 01681_database_for_cache_dictionary.cache_dictionary_simple_key_simple_attributes; +SELECT * FROM 01681_database_for_cache_dictionary.cache_dictionary_simple_key_simple_attributes ORDER BY id; DROP DICTIONARY 01681_database_for_cache_dictionary.cache_dictionary_simple_key_simple_attributes; DROP TABLE 01681_database_for_cache_dictionary.simple_key_simple_attributes_source_table; @@ -84,7 +84,7 @@ SELECT dictGetOrDefault('01681_database_for_cache_dictionary.cache_dictionary_si SELECT 'dictHas'; SELECT dictHas('01681_database_for_cache_dictionary.cache_dictionary_simple_key_complex_attributes', number) FROM system.numbers LIMIT 4; SELECT 'select all values as input stream'; -SELECT * FROM 01681_database_for_cache_dictionary.cache_dictionary_simple_key_complex_attributes; +SELECT * FROM 01681_database_for_cache_dictionary.cache_dictionary_simple_key_complex_attributes ORDER BY id; DROP DICTIONARY 01681_database_for_cache_dictionary.cache_dictionary_simple_key_complex_attributes; DROP TABLE 01681_database_for_cache_dictionary.simple_key_complex_attributes_source_table; diff --git a/tests/queries/0_stateless/01682_cache_dictionary_complex_key.sql b/tests/queries/0_stateless/01682_cache_dictionary_complex_key.sql index 65c56090c47..4cc83412457 100644 --- a/tests/queries/0_stateless/01682_cache_dictionary_complex_key.sql +++ b/tests/queries/0_stateless/01682_cache_dictionary_complex_key.sql @@ -42,7 +42,7 @@ SELECT dictGetOrDefault('01682_database_for_cache_dictionary.cache_dictionary_co SELECT 'dictHas'; SELECT dictHas('01682_database_for_cache_dictionary.cache_dictionary_complex_key_simple_attributes', (number, concat('id_key_', toString(number)))) FROM system.numbers LIMIT 4; SELECT 'select all values as input stream'; -SELECT * FROM 01682_database_for_cache_dictionary.cache_dictionary_complex_key_simple_attributes; +SELECT * FROM 01682_database_for_cache_dictionary.cache_dictionary_complex_key_simple_attributes ORDER BY id; DROP DICTIONARY 01682_database_for_cache_dictionary.cache_dictionary_complex_key_simple_attributes; DROP TABLE 01682_database_for_cache_dictionary.complex_key_simple_attributes_source_table; @@ -89,7 +89,7 @@ SELECT dictGetOrDefault('01682_database_for_cache_dictionary.cache_dictionary_co SELECT 'dictHas'; SELECT dictHas('01682_database_for_cache_dictionary.cache_dictionary_complex_key_complex_attributes', (number, concat('id_key_', toString(number)))) FROM system.numbers LIMIT 4; SELECT 'select all values as input stream'; -SELECT * FROM 01682_database_for_cache_dictionary.cache_dictionary_complex_key_complex_attributes; +SELECT * FROM 01682_database_for_cache_dictionary.cache_dictionary_complex_key_complex_attributes ORDER BY id; DROP DICTIONARY 01682_database_for_cache_dictionary.cache_dictionary_complex_key_complex_attributes; DROP TABLE 
01682_database_for_cache_dictionary.complex_key_complex_attributes_source_table; diff --git a/tests/queries/0_stateless/01684_ssd_cache_dictionary_simple_key.sql b/tests/queries/0_stateless/01684_ssd_cache_dictionary_simple_key.sql index 3b327257fc4..9dbad1289f1 100644 --- a/tests/queries/0_stateless/01684_ssd_cache_dictionary_simple_key.sql +++ b/tests/queries/0_stateless/01684_ssd_cache_dictionary_simple_key.sql @@ -40,7 +40,7 @@ SELECT dictGetOrDefault('01684_database_for_cache_dictionary.cache_dictionary_si SELECT 'dictHas'; SELECT dictHas('01684_database_for_cache_dictionary.cache_dictionary_simple_key_simple_attributes', number) FROM system.numbers LIMIT 4; SELECT 'select all values as input stream'; -SELECT * FROM 01684_database_for_cache_dictionary.cache_dictionary_simple_key_simple_attributes; +SELECT * FROM 01684_database_for_cache_dictionary.cache_dictionary_simple_key_simple_attributes ORDER BY id; DROP DICTIONARY 01684_database_for_cache_dictionary.cache_dictionary_simple_key_simple_attributes; DROP TABLE 01684_database_for_cache_dictionary.simple_key_simple_attributes_source_table; @@ -84,7 +84,7 @@ SELECT dictGetOrDefault('01684_database_for_cache_dictionary.cache_dictionary_si SELECT 'dictHas'; SELECT dictHas('01684_database_for_cache_dictionary.cache_dictionary_simple_key_complex_attributes', number) FROM system.numbers LIMIT 4; SELECT 'select all values as input stream'; -SELECT * FROM 01684_database_for_cache_dictionary.cache_dictionary_simple_key_complex_attributes; +SELECT * FROM 01684_database_for_cache_dictionary.cache_dictionary_simple_key_complex_attributes ORDER BY id; DROP DICTIONARY 01684_database_for_cache_dictionary.cache_dictionary_simple_key_complex_attributes; DROP TABLE 01684_database_for_cache_dictionary.simple_key_complex_attributes_source_table; diff --git a/tests/queries/0_stateless/01685_ssd_cache_dictionary_complex_key.sql b/tests/queries/0_stateless/01685_ssd_cache_dictionary_complex_key.sql index 1757b136d3e..8ec5a4a2c24 100644 --- a/tests/queries/0_stateless/01685_ssd_cache_dictionary_complex_key.sql +++ b/tests/queries/0_stateless/01685_ssd_cache_dictionary_complex_key.sql @@ -42,7 +42,7 @@ SELECT dictGetOrDefault('01685_database_for_cache_dictionary.cache_dictionary_co SELECT 'dictHas'; SELECT dictHas('01685_database_for_cache_dictionary.cache_dictionary_complex_key_simple_attributes', (number, concat('id_key_', toString(number)))) FROM system.numbers LIMIT 4; SELECT 'select all values as input stream'; -SELECT * FROM 01685_database_for_cache_dictionary.cache_dictionary_complex_key_simple_attributes; +SELECT * FROM 01685_database_for_cache_dictionary.cache_dictionary_complex_key_simple_attributes ORDER BY id; DROP DICTIONARY 01685_database_for_cache_dictionary.cache_dictionary_complex_key_simple_attributes; DROP TABLE 01685_database_for_cache_dictionary.complex_key_simple_attributes_source_table; @@ -89,7 +89,7 @@ SELECT dictGetOrDefault('01685_database_for_cache_dictionary.cache_dictionary_co SELECT 'dictHas'; SELECT dictHas('01685_database_for_cache_dictionary.cache_dictionary_complex_key_complex_attributes', (number, concat('id_key_', toString(number)))) FROM system.numbers LIMIT 4; SELECT 'select all values as input stream'; -SELECT * FROM 01685_database_for_cache_dictionary.cache_dictionary_complex_key_complex_attributes; +SELECT * FROM 01685_database_for_cache_dictionary.cache_dictionary_complex_key_complex_attributes ORDER BY id; DROP DICTIONARY 01685_database_for_cache_dictionary.cache_dictionary_complex_key_complex_attributes; DROP 
TABLE 01685_database_for_cache_dictionary.complex_key_complex_attributes_source_table; From f14020427989af5920b6e96e5f74985f0dc1f174 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Wed, 17 Mar 2021 22:01:45 +0300 Subject: [PATCH 032/155] Improved performance of SSDCache dictionary --- src/Common/HashTable/LRUHashMap.h | 10 ++++ src/Dictionaries/SSDCacheDictionaryStorage.h | 57 ++++++++++---------- 2 files changed, 37 insertions(+), 30 deletions(-) diff --git a/src/Common/HashTable/LRUHashMap.h b/src/Common/HashTable/LRUHashMap.h index 870fb219523..bc5fd51d0e2 100644 --- a/src/Common/HashTable/LRUHashMap.h +++ b/src/Common/HashTable/LRUHashMap.h @@ -202,6 +202,16 @@ public: return const_cast *>(this)->find(key); } + LookupResult ALWAYS_INLINE findNoLRU(const Key & key) + { + return Base::find(key); + } + + ConstLookupResult ALWAYS_INLINE findNoLRU(const Key & key) const + { + return const_cast *>(this)->findNoLRU(key); + } + Value & ALWAYS_INLINE get(const Key & key) { auto it = find(key); diff --git a/src/Dictionaries/SSDCacheDictionaryStorage.h b/src/Dictionaries/SSDCacheDictionaryStorage.h index 32d521db103..5396846e383 100644 --- a/src/Dictionaries/SSDCacheDictionaryStorage.h +++ b/src/Dictionaries/SSDCacheDictionaryStorage.h @@ -614,11 +614,13 @@ public: } template - ALWAYS_INLINE void fetchBlocks(char * read_buffer, size_t read_from_file_buffer_blocks_size, const PaddedPODArray & blocks_to_fetch, FetchBlockFunc && func) const + void fetchBlocks(size_t read_from_file_buffer_blocks_size, const PaddedPODArray & blocks_to_fetch, FetchBlockFunc && func) const { if (blocks_to_fetch.empty()) return; + Memory> read_buffer(read_from_file_buffer_blocks_size * block_size, 4096); + size_t blocks_to_fetch_size = blocks_to_fetch.size(); PaddedPODArray requests; @@ -631,7 +633,7 @@ public: { iocb request{}; - char * buffer_place = read_buffer + block_size * (block_to_fetch_index % read_from_file_buffer_blocks_size); + char * buffer_place = read_buffer.data() + block_size * (block_to_fetch_index % read_from_file_buffer_blocks_size); #if defined(__FreeBSD__) request.aio.aio_lio_opcode = LIO_READ; @@ -806,7 +808,6 @@ public: explicit SSDCacheDictionaryStorage(const SSDCacheDictionaryStorageConfiguration & configuration_) : configuration(configuration_) , file_buffer(configuration_.file_path, configuration.block_size, configuration.file_blocks_size) - , read_from_file_buffer(configuration_.block_size * configuration_.read_buffer_blocks_size, 4096) , rnd_engine(randomSeed()) , index(configuration.max_stored_keys, false, { complex_key_arena }) { @@ -815,7 +816,7 @@ public: bool returnsFetchedColumnsInOrderOfRequestedKeys() const override { return false; } - bool canPerformFetchByMultipleThreadsWithoutLock() const override { return false; } + bool canPerformFetchByMultipleThreadsWithoutLock() const override { return true; } String getName() const override { @@ -922,8 +923,7 @@ private: default_value }; - TimePoint deadline; - + time_t deadline; SSDCacheIndex index; size_t in_memory_partition_index; CellState state; @@ -954,23 +954,27 @@ private: result.fetched_columns = fetch_request.makeAttributesResultColumns(); result.key_index_to_state.resize_fill(keys.size(), {KeyState::not_found}); - const auto now = std::chrono::system_clock::now(); + const time_t now = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); size_t fetched_columns_index = 0; - using BlockIndexToKeysMap = std::unordered_map, DefaultHash>; + using BlockIndexToKeysMap = absl::flat_hash_map, DefaultHash>; 
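        /// absl::flat_hash_map keeps its slots in a single contiguous array (open addressing), so this
        /// grouping avoids the per-node allocation and pointer chasing of the std::unordered_map it
        /// replaces, which is a plausible motivation for the switch on this hot fetch path.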
BlockIndexToKeysMap block_to_keys_map; absl::flat_hash_set> unique_blocks_to_request; PaddedPODArray blocks_to_request; - std::chrono::seconds strict_max_lifetime_seconds(configuration.strict_max_lifetime_seconds); + time_t strict_max_lifetime_seconds = static_cast(configuration.strict_max_lifetime_seconds); size_t keys_size = keys.size(); + for (size_t attribute_size = 0; attribute_size < fetch_request.attributesSize(); ++attribute_size) + if (fetch_request.shouldFillResultColumnWithIndex(attribute_size)) + result.fetched_columns[attribute_size]->reserve(keys_size); + for (size_t key_index = 0; key_index < keys_size; ++key_index) { auto key = keys[key_index]; - const auto * it = index.find(key); + const auto * it = index.findNoLRU(key); if (!it) { @@ -980,9 +984,7 @@ private: const auto & cell = it->getMapped(); - bool has_deadline = cellHasDeadline(cell); - - if (has_deadline && now > cell.deadline + strict_max_lifetime_seconds) + if (now > cell.deadline + strict_max_lifetime_seconds) { ++result.not_found_keys_size; continue; @@ -991,14 +993,13 @@ private: bool cell_is_expired = false; KeyState::State key_state = KeyState::found; - if (has_deadline && now > cell.deadline) + if (now > cell.deadline) { cell_is_expired = true; key_state = KeyState::expired; } result.expired_keys_size += cell_is_expired; - result.found_keys_size += !cell_is_expired; switch (cell.state) { @@ -1014,7 +1015,8 @@ private: } case Cell::on_disk: { - block_to_keys_map[cell.index.block_index].emplace_back(key_index, cell.index.offset_in_block, cell_is_expired); + PaddedPODArray & keys_block = block_to_keys_map[cell.index.block_index]; + keys_block.emplace_back(key_index, cell.index.offset_in_block, cell_is_expired); if (!unique_blocks_to_request.contains(cell.index.block_index)) { @@ -1036,10 +1038,12 @@ private: } } + result.found_keys_size = keys_size - (result.not_found_keys_size + result.expired_keys_size); + /// Sort blocks by offset before start async io requests std::sort(blocks_to_request.begin(), blocks_to_request.end()); - file_buffer.fetchBlocks(read_from_file_buffer.m_data, configuration.read_buffer_blocks_size, blocks_to_request, [&](size_t block_index, char * block_data) + file_buffer.fetchBlocks(configuration.read_buffer_blocks_size, blocks_to_request, [&](size_t block_index, char * block_data) { auto & keys_in_block = block_to_keys_map[block_index]; @@ -1048,10 +1052,8 @@ private: char * key_data = block_data + key_in_block.offset_in_block; deserializeAndInsertIntoColumns(result.fetched_columns, fetch_request, key_data); - if (key_in_block.is_expired) - result.key_index_to_state[key_in_block.key_index] = {KeyState::expired, fetched_columns_index}; - else - result.key_index_to_state[key_in_block.key_index] = {KeyState::found, fetched_columns_index}; + KeyState::State state = key_in_block.is_expired ? 
KeyState::expired : KeyState::found; + result.key_index_to_state[key_in_block.key_index] = {state, fetched_columns_index}; ++fetched_columns_index; } @@ -1298,16 +1300,12 @@ private: } } - inline static bool cellHasDeadline(const Cell & cell) - { - return cell.deadline != std::chrono::system_clock::from_time_t(0); - } - inline void setCellDeadline(Cell & cell, TimePoint now) { if (configuration.lifetime.min_sec == 0 && configuration.lifetime.max_sec == 0) { - cell.deadline = std::chrono::system_clock::from_time_t(0); + auto deadline = std::chrono::time_point::max() - 2 * std::chrono::seconds(configuration.strict_max_lifetime_seconds); + cell.deadline = std::chrono::system_clock::to_time_t(deadline); return; } @@ -1315,15 +1313,14 @@ private: size_t max_sec_lifetime = configuration.lifetime.max_sec; std::uniform_int_distribution distribution{min_sec_lifetime, max_sec_lifetime}; - cell.deadline = now + std::chrono::seconds{distribution(rnd_engine)}; + auto deadline = now + std::chrono::seconds{distribution(rnd_engine)}; + cell.deadline = std::chrono::system_clock::to_time_t(deadline); } SSDCacheDictionaryStorageConfiguration configuration; SSDCacheFileBuffer file_buffer; - Memory> read_from_file_buffer; - std::vector> memory_buffer_partitions; pcg64 rnd_engine; From d5a1b50fd22a91ced9c5839f7c573f3aef870bfc Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Wed, 17 Mar 2021 22:06:46 +0300 Subject: [PATCH 033/155] Updated naming --- src/Dictionaries/CacheDictionary.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Dictionaries/CacheDictionary.cpp b/src/Dictionaries/CacheDictionary.cpp index bef391c4222..6c13f76132b 100644 --- a/src/Dictionaries/CacheDictionary.cpp +++ b/src/Dictionaries/CacheDictionary.cpp @@ -332,9 +332,9 @@ Columns CacheDictionary::getColumnsImpl( FetchResult result_of_fetch_from_storage; - bool protect_get_with_write_lock = cache_storage_ptr->canPerformFetchByMultipleThreadsWithoutLock(); + bool can_perform_fetch_without_write_lock = cache_storage_ptr->canPerformFetchByMultipleThreadsWithoutLock(); - if (protect_get_with_write_lock) + if (can_perform_fetch_without_write_lock) { const ProfilingScopedReadRWLock write_lock{rw_lock, ProfileEvents::DictCacheLockWriteNs}; result_of_fetch_from_storage = cache_storage_ptr->fetchColumnsForKeys(keys, request); From 18ed9c5c67f1aefed757484ebe42cf2120f1be74 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Thu, 18 Mar 2021 12:55:17 +0300 Subject: [PATCH 034/155] SSDCacheDictionary remove max_stored_keys option --- .../external-dicts-dict-layout.md | 6 +- .../external-dicts-dict-layout.md | 6 +- src/Common/HashTable/LRUHashMap.h | 10 -- src/Dictionaries/CacheDictionary.cpp | 12 +- src/Dictionaries/CacheDictionaryStorage.h | 8 +- src/Dictionaries/ICacheDictionaryStorage.h | 8 +- src/Dictionaries/SSDCacheDictionaryStorage.h | 148 ++++++++++-------- .../registerCacheDictionaries.cpp | 8 +- tests/integration/helpers/dictionary.py | 4 +- .../ssd_complex_key_cache_string.xml | 1 - .../0_stateless/01053_ssd_dictionary.sql | 4 +- .../01280_ssd_complex_key_dictionary.sql | 2 +- 12 files changed, 99 insertions(+), 118 deletions(-) diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md index efef91b4b09..6af22eb27dc 100644 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md +++ 
b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md @@ -320,8 +320,6 @@ Similar to `cache`, but stores data on SSD and index in RAM. 1048576 /var/lib/clickhouse/clickhouse_dictionaries/test_dict - - 1048576 ``` @@ -329,8 +327,8 @@ Similar to `cache`, but stores data on SSD and index in RAM. or ``` sql -LAYOUT(CACHE(BLOCK_SIZE 4096 FILE_SIZE 16777216 READ_BUFFER_SIZE 1048576 - PATH /var/lib/clickhouse/clickhouse_dictionaries/test_dict MAX_STORED_KEYS 1048576)) +LAYOUT(SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 16777216 READ_BUFFER_SIZE 1048576 + PATH /var/lib/clickhouse/clickhouse_dictionaries/test_dict)) ``` ### complex_key_ssd_cache {#complex-key-ssd-cache} diff --git a/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md b/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md index 0fd4a85c46f..9b33a801973 100644 --- a/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md +++ b/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md @@ -318,8 +318,6 @@ LAYOUT(CACHE(SIZE_IN_CELLS 1000000000)) 1048576 /var/lib/clickhouse/clickhouse_dictionaries/test_dict - - 1048576 ``` @@ -327,8 +325,8 @@ LAYOUT(CACHE(SIZE_IN_CELLS 1000000000)) или ``` sql -LAYOUT(CACHE(BLOCK_SIZE 4096 FILE_SIZE 16777216 READ_BUFFER_SIZE 1048576 - PATH /var/lib/clickhouse/clickhouse_dictionaries/test_dict MAX_STORED_KEYS 1048576)) +LAYOUT(SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 16777216 READ_BUFFER_SIZE 1048576 + PATH /var/lib/clickhouse/clickhouse_dictionaries/test_dict)) ``` ### complex_key_ssd_cache {#complex-key-ssd-cache} diff --git a/src/Common/HashTable/LRUHashMap.h b/src/Common/HashTable/LRUHashMap.h index bc5fd51d0e2..870fb219523 100644 --- a/src/Common/HashTable/LRUHashMap.h +++ b/src/Common/HashTable/LRUHashMap.h @@ -202,16 +202,6 @@ public: return const_cast *>(this)->find(key); } - LookupResult ALWAYS_INLINE findNoLRU(const Key & key) - { - return Base::find(key); - } - - ConstLookupResult ALWAYS_INLINE findNoLRU(const Key & key) const - { - return const_cast *>(this)->findNoLRU(key); - } - Value & ALWAYS_INLINE get(const Key & key) { auto it = find(key); diff --git a/src/Dictionaries/CacheDictionary.cpp b/src/Dictionaries/CacheDictionary.cpp index 6c13f76132b..eedf4dd3d87 100644 --- a/src/Dictionaries/CacheDictionary.cpp +++ b/src/Dictionaries/CacheDictionary.cpp @@ -101,7 +101,7 @@ template double CacheDictionary::getLoadFactor() const { const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockReadNs}; - return static_cast(cache_storage_ptr->getSize()) / cache_storage_ptr->getMaxSize(); + return cache_storage_ptr->getLoadFactor(); } template @@ -332,16 +332,8 @@ Columns CacheDictionary::getColumnsImpl( FetchResult result_of_fetch_from_storage; - bool can_perform_fetch_without_write_lock = cache_storage_ptr->canPerformFetchByMultipleThreadsWithoutLock(); - - if (can_perform_fetch_without_write_lock) { - const ProfilingScopedReadRWLock write_lock{rw_lock, ProfileEvents::DictCacheLockWriteNs}; - result_of_fetch_from_storage = cache_storage_ptr->fetchColumnsForKeys(keys, request); - } - else - { - const ProfilingScopedWriteRWLock write_lock{rw_lock, ProfileEvents::DictCacheLockWriteNs}; + const ProfilingScopedReadRWLock read_lock{rw_lock, ProfileEvents::DictCacheLockWriteNs}; result_of_fetch_from_storage = cache_storage_ptr->fetchColumnsForKeys(keys, request); } diff --git a/src/Dictionaries/CacheDictionaryStorage.h 
b/src/Dictionaries/CacheDictionaryStorage.h index 6b1200dd474..874796d879b 100644 --- a/src/Dictionaries/CacheDictionaryStorage.h +++ b/src/Dictionaries/CacheDictionaryStorage.h @@ -58,8 +58,6 @@ public: bool returnsFetchedColumnsInOrderOfRequestedKeys() const override { return true; } - bool canPerformFetchByMultipleThreadsWithoutLock() const override { return true; } - String getName() const override { if (dictionary_key_type == DictionaryKeyType::simple) @@ -142,7 +140,7 @@ public: size_t getSize() const override { return size; } - size_t getMaxSize() const override { return configuration.max_size_in_cells; } + double getLoadFactor() const override { return static_cast(size) / configuration.max_size_in_cells; } size_t getBytesAllocated() const override { @@ -654,7 +652,7 @@ private: return std::make_pair(KeyState::found, cell_place_value); } - return std::make_pair(KeyState::not_found, place_value); + return std::make_pair(KeyState::not_found, place_value & size_overlap_mask); } inline size_t getCellIndexForInsert(const KeyType & key) const @@ -674,7 +672,7 @@ private: return cell_place_value; if (cell.key == key) - return place_value; + return cell_place_value; if (cell.deadline < oldest_time) { diff --git a/src/Dictionaries/ICacheDictionaryStorage.h b/src/Dictionaries/ICacheDictionaryStorage.h index 8a3202b5590..72b3ef76f11 100644 --- a/src/Dictionaries/ICacheDictionaryStorage.h +++ b/src/Dictionaries/ICacheDictionaryStorage.h @@ -34,7 +34,7 @@ struct KeyState inline void setDefaultValue(bool is_default_value) { is_default = is_default_value; } /// Valid only if keyState is found or expired inline size_t getFetchedColumnIndex() const { return fetched_column_index; } - + inline void setFetchedColumnIndex(size_t fetched_column_index_value) { fetched_column_index = fetched_column_index_value; } private: State state = not_found; size_t fetched_column_index = 0; @@ -72,8 +72,6 @@ public: /// Necessary if all keys are found we can return result to client without additional aggregation virtual bool returnsFetchedColumnsInOrderOfRequestedKeys() const = 0; - virtual bool canPerformFetchByMultipleThreadsWithoutLock() const = 0; - /// Name of storage virtual String getName() const = 0; @@ -114,8 +112,8 @@ public: /// Return size of keys in storage virtual size_t getSize() const = 0; - /// Return maximum size of keys in storage - virtual size_t getMaxSize() const = 0; + /// Returns storage load factor + virtual double getLoadFactor() const = 0; /// Return bytes allocated in storage virtual size_t getBytesAllocated() const = 0; diff --git a/src/Dictionaries/SSDCacheDictionaryStorage.h b/src/Dictionaries/SSDCacheDictionaryStorage.h index 5396846e383..f28f9ab37cd 100644 --- a/src/Dictionaries/SSDCacheDictionaryStorage.h +++ b/src/Dictionaries/SSDCacheDictionaryStorage.h @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include @@ -56,7 +56,6 @@ struct SSDCacheDictionaryStorageConfiguration const std::string file_path; const size_t max_partitions_count; - const size_t max_stored_keys; const size_t block_size; const size_t file_blocks_size; const size_t read_buffer_blocks_size; @@ -127,7 +126,7 @@ public: /// Reset block with new block_data /// block_data must be filled with zeroes if it is new block - ALWAYS_INLINE inline void reset(char * new_block_data) + inline void reset(char * new_block_data) { block_data = new_block_data; current_block_offset = block_header_size; @@ -135,13 +134,13 @@ public: } /// Check if it is enough place to write key in block - 
ALWAYS_INLINE inline bool enoughtPlaceToWriteKey(const SSDCacheSimpleKey & cache_key) const + inline bool enoughtPlaceToWriteKey(const SSDCacheSimpleKey & cache_key) const { return (current_block_offset + (sizeof(cache_key.key) + sizeof(cache_key.size) + cache_key.size)) <= block_size; } /// Check if it is enough place to write key in block - ALWAYS_INLINE inline bool enoughtPlaceToWriteKey(const SSDCacheComplexKey & cache_key) const + inline bool enoughtPlaceToWriteKey(const SSDCacheComplexKey & cache_key) const { const StringRef & key = cache_key.key; size_t complex_key_size = sizeof(key.size) + key.size; @@ -152,7 +151,7 @@ public: /// Write key and returns offset in ssd cache block where data is written /// It is client responsibility to check if there is enough place in block to write key /// Returns true if key was written and false if there was not enough place to write key - ALWAYS_INLINE inline bool writeKey(const SSDCacheSimpleKey & cache_key, size_t & offset_in_block) + inline bool writeKey(const SSDCacheSimpleKey & cache_key, size_t & offset_in_block) { assert(cache_key.size > 0); @@ -181,7 +180,7 @@ public: return true; } - ALWAYS_INLINE inline bool writeKey(const SSDCacheComplexKey & cache_key, size_t & offset_in_block) + inline bool writeKey(const SSDCacheComplexKey & cache_key, size_t & offset_in_block) { assert(cache_key.size > 0); @@ -216,20 +215,20 @@ public: return true; } - ALWAYS_INLINE inline size_t getKeysSize() const { return keys_size; } + inline size_t getKeysSize() const { return keys_size; } /// Write keys size into block header - ALWAYS_INLINE inline void writeKeysSize() + inline void writeKeysSize() { char * keys_size_offset_data = block_data + block_header_check_sum_size; std::memcpy(keys_size_offset_data, &keys_size, sizeof(size_t)); } /// Get check sum from block header - ALWAYS_INLINE inline size_t getCheckSum() const { return unalignedLoad(block_data); } + inline size_t getCheckSum() const { return unalignedLoad(block_data); } /// Calculate check sum in block - ALWAYS_INLINE inline size_t calculateCheckSum() const + inline size_t calculateCheckSum() const { size_t calculated_check_sum = static_cast(CityHash_v1_0_2::CityHash64(block_data + block_header_check_sum_size, block_size - block_header_check_sum_size)); @@ -237,7 +236,7 @@ public: } /// Check if check sum from block header matched calculated check sum in block - ALWAYS_INLINE inline bool checkCheckSum() const + inline bool checkCheckSum() const { size_t calculated_check_sum = calculateCheckSum(); size_t check_sum = getCheckSum(); @@ -246,16 +245,16 @@ public: } /// Write check sum in block header - ALWAYS_INLINE inline void writeCheckSum() + inline void writeCheckSum() { size_t check_sum = static_cast(CityHash_v1_0_2::CityHash64(block_data + block_header_check_sum_size, block_size - block_header_check_sum_size)); std::memcpy(block_data, &check_sum, sizeof(size_t)); } - ALWAYS_INLINE inline size_t getBlockSize() const { return block_size; } + inline size_t getBlockSize() const { return block_size; } /// Returns block data - ALWAYS_INLINE inline char * getBlockData() const { return block_data; } + inline char * getBlockData() const { return block_data; } /// Read keys that were serialized in block /// It is client responsibility to ensure that simple or complex keys were written in block @@ -753,7 +752,7 @@ private: int fd = -1; }; - ALWAYS_INLINE inline static int preallocateDiskSpace(int fd, size_t offset, size_t len) + inline static int preallocateDiskSpace(int fd, size_t offset, size_t len) { 
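        /// fallocate(2) is Linux-specific, so the FreeBSD build falls back to the portable
        /// posix_fallocate; either call reserves the byte range up front, so subsequent block
        /// writes into the file cannot fail with ENOSPC.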
#if defined(__FreeBSD__) return posix_fallocate(fd, offset, len); @@ -762,7 +761,7 @@ private: #endif } - ALWAYS_INLINE inline static char * getRequestBuffer(const iocb & request) + inline static char * getRequestBuffer(const iocb & request) { char * result = nullptr; @@ -775,7 +774,7 @@ private: return result; } - ALWAYS_INLINE inline static ssize_t eventResult(io_event & event) + inline static ssize_t eventResult(io_event & event) { ssize_t bytes_written; @@ -809,15 +808,12 @@ public: : configuration(configuration_) , file_buffer(configuration_.file_path, configuration.block_size, configuration.file_blocks_size) , rnd_engine(randomSeed()) - , index(configuration.max_stored_keys, false, { complex_key_arena }) { memory_buffer_partitions.emplace_back(configuration.block_size, configuration.write_buffer_blocks_size); } bool returnsFetchedColumnsInOrderOfRequestedKeys() const override { return false; } - bool canPerformFetchByMultipleThreadsWithoutLock() const override { return true; } - String getName() const override { if (dictionary_key_type == DictionaryKeyType::simple) @@ -900,14 +896,31 @@ public: size_t getSize() const override { return index.size(); } - size_t getMaxSize() const override {return index.getMaxSize(); } + double getLoadFactor() const override + { + size_t partitions_size = memory_buffer_partitions.size(); + + if (partitions_size == configuration.max_partitions_count) + return 1.0; + + auto & current_memory_partition = memory_buffer_partitions[current_partition_index]; + + size_t full_partitions = partitions_size - 1; + size_t blocks_in_memory = (full_partitions * configuration.write_buffer_blocks_size) + current_memory_partition.getCurrentBlockIndex(); + size_t blocks_on_disk = file_buffer.getCurrentBlockIndex(); + + size_t max_blocks_size = (configuration.file_blocks_size + configuration.write_buffer_blocks_size) * configuration.max_partitions_count; + + double load_factor = static_cast(blocks_in_memory + blocks_on_disk) / max_blocks_size; + return load_factor; + } size_t getBytesAllocated() const override { size_t memory_partitions_bytes_size = memory_buffer_partitions.size() * configuration.write_buffer_blocks_size * configuration.block_size; size_t file_partitions_bytes_size = memory_buffer_partitions.size() * configuration.file_blocks_size * configuration.block_size; - return index.getSizeInBytes() + memory_partitions_bytes_size + file_partitions_bytes_size; + return index.getBufferSizeInBytes() + memory_partitions_bytes_size + file_partitions_bytes_size; } private: @@ -935,13 +948,12 @@ private: struct KeyToBlockOffset { - KeyToBlockOffset(size_t key_index_, size_t offset_in_block_, bool is_expired_) - : key_index(key_index_), offset_in_block(offset_in_block_), is_expired(is_expired_) + KeyToBlockOffset(size_t key_index_, size_t offset_in_block_) + : key_index(key_index_), offset_in_block(offset_in_block_) {} size_t key_index = 0; size_t offset_in_block = 0; - bool is_expired = false; }; template @@ -952,7 +964,7 @@ private: Result result; result.fetched_columns = fetch_request.makeAttributesResultColumns(); - result.key_index_to_state.resize_fill(keys.size(), {KeyState::not_found}); + result.key_index_to_state.resize_fill(keys.size()); const time_t now = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); @@ -974,7 +986,7 @@ private: { auto key = keys[key_index]; - const auto * it = index.findNoLRU(key); + const auto * it = index.find(key); if (!it) { @@ -984,7 +996,7 @@ private: const auto & cell = it->getMapped(); - if (now > cell.deadline + 
strict_max_lifetime_seconds) + if (unlikely(now > cell.deadline + strict_max_lifetime_seconds)) { ++result.not_found_keys_size; continue; @@ -999,7 +1011,8 @@ private: key_state = KeyState::expired; } - result.expired_keys_size += cell_is_expired; + result.expired_keys_size += static_cast(cell_is_expired); + result.found_keys_size += static_cast(!cell_is_expired); switch (cell.state) { @@ -1016,13 +1029,19 @@ private: case Cell::on_disk: { PaddedPODArray & keys_block = block_to_keys_map[cell.index.block_index]; - keys_block.emplace_back(key_index, cell.index.offset_in_block, cell_is_expired); + keys_block.emplace_back(key_index, cell.index.offset_in_block); - if (!unique_blocks_to_request.contains(cell.index.block_index)) - { + KeyState::State state = cell_is_expired ? KeyState::expired : KeyState::found; + + /// Fetched column index will be set later during fetch blocks + result.key_index_to_state[key_index] = {state, 0}; + + auto insert_result = unique_blocks_to_request.insert(cell.index.block_index); + bool was_inserted = insert_result.second; + + if (was_inserted) blocks_to_request.emplace_back(cell.index.block_index); - unique_blocks_to_request.insert(cell.index.block_index); - } + break; } case Cell::default_value: @@ -1038,8 +1057,6 @@ private: } } - result.found_keys_size = keys_size - (result.not_found_keys_size + result.expired_keys_size); - /// Sort blocks by offset before start async io requests std::sort(blocks_to_request.begin(), blocks_to_request.end()); @@ -1052,8 +1069,7 @@ private: char * key_data = block_data + key_in_block.offset_in_block; deserializeAndInsertIntoColumns(result.fetched_columns, fetch_request, key_data); - KeyState::State state = key_in_block.is_expired ? KeyState::expired : KeyState::found; - result.key_index_to_state[key_in_block.key_index] = {state, fetched_columns_index}; + result.key_index_to_state[key_in_block.key_index].setFetchedColumnIndex(fetched_columns_index); ++fetched_columns_index; } @@ -1091,7 +1107,7 @@ private: throw Exception("Serialized columns size is greater than allowed block size and metadata", ErrorCodes::UNSUPPORTED_METHOD); /// We cannot reuse place that is already allocated in file or memory cache so we erase key from index - index.erase(key); + eraseKeyFromIndex(key); Cell cell; setCellDeadline(cell, now); @@ -1118,8 +1134,7 @@ private: for (auto key : keys) { - /// We cannot reuse place that is already allocated in file or memory cache so we erase key from index - index.erase(key); + eraseKeyFromIndex(key); Cell cell; @@ -1139,7 +1154,7 @@ private: key = updated_key; } - index.insert(key, cell); + index[key] = cell; } } @@ -1192,7 +1207,7 @@ private: cell.index = cache_index; cell.in_memory_partition_index = current_partition_index; - index.insert(ssd_cache_key.key, cell); + index[ssd_cache_key.key] = cell; break; } else @@ -1222,7 +1237,7 @@ private: if (old_key_cell.isOnDisk() && old_key_block >= block_index_in_file_before_write && old_key_block < file_read_end_block_index) - index.erase(old_key); + eraseKeyFromIndex(old_key); } } } @@ -1275,7 +1290,7 @@ private: cell.index = cache_index; cell.in_memory_partition_index = current_partition_index; - index.insert(ssd_cache_key.key, cell); + index[ssd_cache_key.key] = cell; break; } else @@ -1313,10 +1328,23 @@ private: size_t max_sec_lifetime = configuration.lifetime.max_sec; std::uniform_int_distribution distribution{min_sec_lifetime, max_sec_lifetime}; - auto deadline = now + std::chrono::seconds{distribution(rnd_engine)}; + auto deadline = now + 
std::chrono::seconds(distribution(rnd_engine)); cell.deadline = std::chrono::system_clock::to_time_t(deadline); } + inline void eraseKeyFromIndex(KeyType key) + { + auto it = index.find(key); + + if (it == nullptr) + return; + + index.erase(key); + + if constexpr(std::is_same_v) + complex_key_arena.free(const_cast(key.data), key.size); + } + SSDCacheDictionaryStorageConfiguration configuration; SSDCacheFileBuffer file_buffer; @@ -1325,31 +1353,17 @@ private: pcg64 rnd_engine; - class ArenaCellKeyDisposer - { - public: - ArenaWithFreeLists & arena; + using SimpleKeyHashMap = HashMap; + using ComplexKeyHashMap = HashMapWithSavedHash; - template - void operator()(const Key & key, const Value &) const - { - /// In case of complex key we keep it in arena - if constexpr (std::is_same_v) - arena.free(const_cast(key.data), key.size); - } - }; - - using SimpleKeyLRUHashMap = LRUHashMap; - using ComplexKeyLRUHashMap = LRUHashMapWithSavedHash; - - using CacheLRUHashMap = std::conditional_t< + using CacheMap = std::conditional_t< dictionary_key_type == DictionaryKeyType::simple, - SimpleKeyLRUHashMap, - ComplexKeyLRUHashMap>; + SimpleKeyHashMap, + ComplexKeyHashMap>; ArenaWithFreeLists complex_key_arena; - CacheLRUHashMap index; + CacheMap index; size_t current_partition_index = 0; diff --git a/src/Dictionaries/registerCacheDictionaries.cpp b/src/Dictionaries/registerCacheDictionaries.cpp index 9f0f214e79b..b93a08acb76 100644 --- a/src/Dictionaries/registerCacheDictionaries.cpp +++ b/src/Dictionaries/registerCacheDictionaries.cpp @@ -26,7 +26,7 @@ CacheDictionaryStorageConfiguration parseCacheStorageConfiguration( const size_t size = config.getUInt64(dictionary_configuration_prefix + "size_in_cells"); if (size == 0) throw Exception(ErrorCodes::TOO_SMALL_BUFFER_SIZE, - "({}: cache dictionary cannot have 0 cells", + "({}): cache dictionary cannot have 0 cells", full_name); size_t dict_lifetime_seconds = static_cast(dict_lifetime.max_sec); @@ -59,7 +59,6 @@ SSDCacheDictionaryStorageConfiguration parseSSDCacheStorageConfiguration( static constexpr size_t DEFAULT_READ_BUFFER_SIZE_BYTES = 16 * DEFAULT_SSD_BLOCK_SIZE_BYTES; static constexpr size_t DEFAULT_WRITE_BUFFER_SIZE_BYTES = DEFAULT_SSD_BLOCK_SIZE_BYTES; - static constexpr size_t DEFAULT_MAX_STORED_KEYS = 100000; static constexpr size_t DEFAULT_PARTITIONS_COUNT = 16; const size_t max_partitions_count @@ -94,16 +93,11 @@ SSDCacheDictionaryStorageConfiguration parseSSDCacheStorageConfiguration( if (directory_path.at(0) != '/') directory_path = std::filesystem::path{config.getString("path")}.concat(directory_path).string(); - const size_t max_stored_keys_in_partition - = config.getInt64(dictionary_configuration_prefix + "max_stored_keys", DEFAULT_MAX_STORED_KEYS); - const size_t rounded_size = roundUpToPowerOfTwoOrZero(max_stored_keys_in_partition); - SSDCacheDictionaryStorageConfiguration configuration{ strict_max_lifetime_seconds, dict_lifetime, directory_path, max_partitions_count, - rounded_size, block_size, file_size / block_size, read_buffer_size / block_size, diff --git a/tests/integration/helpers/dictionary.py b/tests/integration/helpers/dictionary.py index b3f7a729777..41d87180c8a 100644 --- a/tests/integration/helpers/dictionary.py +++ b/tests/integration/helpers/dictionary.py @@ -7,12 +7,12 @@ class Layout(object): 'flat': '', 'hashed': '', 'cache': '128', - 'ssd_cache': '/etc/clickhouse/dictionaries/all128', + 'ssd_cache': '/etc/clickhouse/dictionaries/all', 'complex_key_hashed': '', 'complex_key_hashed_one_key': '', 
'complex_key_hashed_two_keys': '', 'complex_key_cache': '128', - 'complex_key_ssd_cache': '/etc/clickhouse/dictionaries/all128', + 'complex_key_ssd_cache': '/etc/clickhouse/dictionaries/all', 'range_hashed': '', 'direct': '', 'complex_key_direct': '' diff --git a/tests/integration/test_dictionaries_complex_key_cache_string/configs/dictionaries/ssd_complex_key_cache_string.xml b/tests/integration/test_dictionaries_complex_key_cache_string/configs/dictionaries/ssd_complex_key_cache_string.xml index 85f811d2d85..c8fdbcbe0ef 100644 --- a/tests/integration/test_dictionaries_complex_key_cache_string/configs/dictionaries/ssd_complex_key_cache_string.xml +++ b/tests/integration/test_dictionaries_complex_key_cache_string/configs/dictionaries/ssd_complex_key_cache_string.xml @@ -42,7 +42,6 @@ 131072 1048576 /etc/clickhouse/dictionaries/radars - 1048576 1 diff --git a/tests/queries/0_stateless/01053_ssd_dictionary.sql b/tests/queries/0_stateless/01053_ssd_dictionary.sql index a23ae7e5e96..23a369cc8a6 100644 --- a/tests/queries/0_stateless/01053_ssd_dictionary.sql +++ b/tests/queries/0_stateless/01053_ssd_dictionary.sql @@ -76,7 +76,7 @@ CREATE DICTIONARY 01053_db.ssd_dict PRIMARY KEY id SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'table_for_dict' PASSWORD '' DB '01053_db')) LIFETIME(MIN 1000 MAX 2000) -LAYOUT(SSD_CACHE(FILE_SIZE 8192 PATH '/var/lib/clickhouse/clickhouse_dicts/1d' BLOCK_SIZE 512 WRITE_BUFFER_SIZE 4096 MAX_STORED_KEYS 1000000)); +LAYOUT(SSD_CACHE(FILE_SIZE 8192 PATH '/var/lib/clickhouse/clickhouse_dicts/1d' BLOCK_SIZE 512 WRITE_BUFFER_SIZE 4096)); SELECT 'UPDATE DICTIONARY'; -- 118 @@ -142,7 +142,7 @@ CREATE DICTIONARY 01053_db.ssd_dict PRIMARY KEY id SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'table_for_dict' PASSWORD '' DB '01053_db')) LIFETIME(MIN 1000 MAX 2000) -LAYOUT(SSD_CACHE(FILE_SIZE 8192 PATH '/var/lib/clickhouse/clickhouse_dicts/2d' BLOCK_SIZE 512 WRITE_BUFFER_SIZE 1024 MAX_STORED_KEYS 10)); +LAYOUT(SSD_CACHE(FILE_SIZE 8192 PATH '/var/lib/clickhouse/clickhouse_dicts/2d' BLOCK_SIZE 512 WRITE_BUFFER_SIZE 1024)); SELECT 'UPDATE DICTIONARY (MT)'; -- 118 diff --git a/tests/queries/0_stateless/01280_ssd_complex_key_dictionary.sql b/tests/queries/0_stateless/01280_ssd_complex_key_dictionary.sql index 50b34c4b18f..cd3e52c9691 100644 --- a/tests/queries/0_stateless/01280_ssd_complex_key_dictionary.sql +++ b/tests/queries/0_stateless/01280_ssd_complex_key_dictionary.sql @@ -98,7 +98,7 @@ CREATE DICTIONARY 01280_db.ssd_dict PRIMARY KEY k1, k2 SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'table_for_dict' PASSWORD '' DB '01280_db')) LIFETIME(MIN 1000 MAX 2000) -LAYOUT(COMPLEX_KEY_SSD_CACHE(FILE_SIZE 8192 PATH '/var/lib/clickhouse/clickhouse_dicts/1d' BLOCK_SIZE 512 WRITE_BUFFER_SIZE 4096 MAX_STORED_KEYS 1000000)); +LAYOUT(COMPLEX_KEY_SSD_CACHE(FILE_SIZE 8192 PATH '/var/lib/clickhouse/clickhouse_dicts/1d' BLOCK_SIZE 512 WRITE_BUFFER_SIZE 4096)); SELECT 'UPDATE DICTIONARY'; -- 118 From cf985a86c48e25f86e27591fba1da801f3b3ecf2 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Thu, 18 Mar 2021 14:25:13 +0300 Subject: [PATCH 035/155] test --- .../0_stateless/01684_ssd_cache_dictionary_simple_key.sql | 6 +++--- .../0_stateless/01685_ssd_cache_dictionary_complex_key.sql | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/queries/0_stateless/01684_ssd_cache_dictionary_simple_key.sql b/tests/queries/0_stateless/01684_ssd_cache_dictionary_simple_key.sql index 9dbad1289f1..2fe1e54fe6c 100644 
--- a/tests/queries/0_stateless/01684_ssd_cache_dictionary_simple_key.sql +++ b/tests/queries/0_stateless/01684_ssd_cache_dictionary_simple_key.sql @@ -22,7 +22,7 @@ CREATE DICTIONARY 01684_database_for_cache_dictionary.cache_dictionary_simple_ke PRIMARY KEY id SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'simple_key_simple_attributes_source_table')) LIFETIME(MIN 1 MAX 1000) -LAYOUT(SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 8192 PATH '/var/lib/clickhouse/clickhouse_dicts/0d')); +LAYOUT(SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 8192 PATH '/home/yetti/Documents/ClickHouse/build_address/programs/0d')); SELECT 'Dictionary cache_dictionary_simple_key_simple_attributes'; SELECT 'dictGet existing value'; @@ -66,7 +66,7 @@ CREATE DICTIONARY 01684_database_for_cache_dictionary.cache_dictionary_simple_ke PRIMARY KEY id SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'simple_key_complex_attributes_source_table')) LIFETIME(MIN 1 MAX 1000) -LAYOUT(SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 8192 PATH '/var/lib/clickhouse/clickhouse_dicts/1d')); +LAYOUT(SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 8192 PATH '/home/yetti/Documents/ClickHouse/build_address/programs/1d')); SELECT 'Dictionary cache_dictionary_simple_key_complex_attributes'; SELECT 'dictGet existing value'; @@ -108,7 +108,7 @@ CREATE DICTIONARY 01684_database_for_cache_dictionary.cache_dictionary_simple_ke PRIMARY KEY id SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'simple_key_hierarchy_table')) LIFETIME(MIN 1 MAX 1000) -LAYOUT(SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 8192 PATH '/var/lib/clickhouse/clickhouse_dicts/2d')); +LAYOUT(SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 8192 PATH '/home/yetti/Documents/ClickHouse/build_address/programs/2d')); SELECT 'Dictionary cache_dictionary_simple_key_hierarchy'; SELECT 'dictGet'; diff --git a/tests/queries/0_stateless/01685_ssd_cache_dictionary_complex_key.sql b/tests/queries/0_stateless/01685_ssd_cache_dictionary_complex_key.sql index 8ec5a4a2c24..f65aa445284 100644 --- a/tests/queries/0_stateless/01685_ssd_cache_dictionary_complex_key.sql +++ b/tests/queries/0_stateless/01685_ssd_cache_dictionary_complex_key.sql @@ -24,7 +24,7 @@ CREATE DICTIONARY 01685_database_for_cache_dictionary.cache_dictionary_complex_k PRIMARY KEY id, id_key SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'complex_key_simple_attributes_source_table' DB '01685_database_for_cache_dictionary')) LIFETIME(MIN 1 MAX 1000) -LAYOUT(COMPLEX_KEY_SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 8192 PATH '/var/lib/clickhouse/clickhouse_dicts/0d')); +LAYOUT(COMPLEX_KEY_SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 8192 PATH '/home/yetti/Documents/ClickHouse/build_address/programs/0d')); SELECT 'Dictionary cache_dictionary_complex_key_simple_attributes'; SELECT 'dictGet existing value'; @@ -71,7 +71,7 @@ CREATE DICTIONARY 01685_database_for_cache_dictionary.cache_dictionary_complex_k PRIMARY KEY id, id_key SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'complex_key_complex_attributes_source_table' DB '01685_database_for_cache_dictionary')) LIFETIME(MIN 1 MAX 1000) -LAYOUT(COMPLEX_KEY_SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 8192 PATH '/var/lib/clickhouse/clickhouse_dicts/1d')); +LAYOUT(COMPLEX_KEY_SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 8192 PATH '/home/yetti/Documents/ClickHouse/build_address/programs/1d')); SELECT 'Dictionary cache_dictionary_complex_key_complex_attributes'; SELECT 'dictGet existing value'; @@ -95,4 +95,4 @@ DROP DICTIONARY 
01685_database_for_cache_dictionary.cache_dictionary_complex_key DROP TABLE 01685_database_for_cache_dictionary.complex_key_complex_attributes_source_table; DROP DATABASE 01685_database_for_cache_dictionary; - + From f3ff437a3997e399b013f7634b4f7dd7a5184e96 Mon Sep 17 00:00:00 2001 From: alesapin Date: Thu, 18 Mar 2021 14:32:45 +0300 Subject: [PATCH 036/155] Add all nodes killer/stop, one multitransaction request and counter test --- .../src/jepsen/nukeeper/counter.clj | 52 +++++++++++++++++++ .../src/jepsen/nukeeper/main.clj | 6 ++- .../src/jepsen/nukeeper/nemesis.clj | 35 +++++++++---- .../src/jepsen/nukeeper/set.clj | 6 +-- .../src/jepsen/nukeeper/utils.clj | 16 +++++- .../test/jepsen/nukeeper_test.clj | 2 + 6 files changed, 100 insertions(+), 17 deletions(-) create mode 100644 tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj new file mode 100644 index 00000000000..1bdf3f89186 --- /dev/null +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj @@ -0,0 +1,52 @@ +(ns jepsen.nukeeper.counter + (:require + [clojure.tools.logging :refer :all] + [jepsen + [checker :as checker] + [client :as client] + [generator :as gen]] + [jepsen.nukeeper.utils :refer :all] + [zookeeper :as zk]) + (:import (org.apache.zookeeper ZooKeeper KeeperException KeeperException$BadVersionException))) + + +(defn r [_ _] {:type :invoke, :f :read}) +(defn add [_ _] {:type :invoke, :f :add, :value (rand-int 5)}) + + +(defrecord CounterClient [conn nodename] + client/Client + (open! [this test node] + (assoc + (assoc this + :conn (zk-connect node 9181 30000)) + :nodename node)) + + (setup! [this test]) + + (invoke! [this test op] + (case (:f op) + :read (try + (assoc op + :type :ok + :value (count (zk-list conn "/"))) + (catch Exception _ (assoc op :type :fail, :error :connect-error))) + :add (try + (do + (zk-multi-create-many-seq-nodes conn "/seq-" (:value op)) + (assoc op :type :ok)) + (catch Exception _ (assoc op :type :info, :error :connect-error))))) + + (teardown! [_ test]) + + (close! [_ test])) + +(defn workload + "A generator, client, and checker for a set test." + [opts] + {:client (CounterClient. nil nil) + :checker (checker/counter) + :generator (->> (range) + (map (fn [x] + (->> (gen/mix [r add]))))) + :final-generator (gen/once {:type :invoke, :f :read, :value nil})}) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj index 8b7c1a6caac..0f9619a7653 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -5,6 +5,7 @@ [jepsen.nukeeper.nemesis :as custom-nemesis] [jepsen.nukeeper.register :as register] [jepsen.nukeeper.unique :as unique] + [jepsen.nukeeper.counter :as counter] [jepsen.nukeeper.constants :refer :all] [clojure.string :as str] [jepsen @@ -85,7 +86,8 @@ "A map of workload names to functions that construct workloads, given opts." {"set" set/workload "register" register/workload - "unique-ids" unique/workload}) + "unique-ids" unique/workload + "counter" counter/workload}) (def cli-opts "Additional command line options." 
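;; Sketch of how the registered workload is reached from the CLI (flag names follow the usual
;; jepsen.cli convention and are an assumption; the actual flags live in cli-opts):
;;   lein run test --workload counter --nemesis random-node-killer
;; The workloads map is the dispatch table, so adding the counter/workload entry above is all
;; that is needed to make the new counter test selectable.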
@@ -126,7 +128,7 @@ current-nemesis (get custom-nemesis/custom-nemesises (:nemesis opts))] (merge tests/noop-test opts - {:name (str "clickhouse-keeper quorum=" quorum " " (name (:workload opts)) (name (:nemesis opts))) + {:name (str "clickhouse-keeper quorum=" quorum " " (name (:workload opts)) " " (name (:nemesis opts))) :os ubuntu/os :db (db "rbtorrent:a122093aee0bdcb70ca42d5e5fb4ba5544372f5f") :pure-generators true diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj index d1dc0d55e5f..bf22f9ad1f6 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj @@ -8,17 +8,28 @@ [jepsen.nukeeper.constants :refer :all] [jepsen.nukeeper.utils :refer :all])) -(defn random-single-node-killer-nemesis +(defn random-node-killer-nemesis [] (nemesis/node-start-stopper rand-nth (fn start [test node] (kill-clickhouse! node test)) (fn stop [test node] (start-clickhouse! node test)))) -(defn hammer-time-nemesis +(defn all-nodes-killer-nemesis + [] + (nemesis/node-start-stopper + identity + (fn start [test node] (kill-clickhouse! node test)) + (fn stop [test node] (start-clickhouse! node test)))) + +(defn random-node-hammer-time-nemesis [] (nemesis/hammer-time "clickhouse")) +(defn all-nodes-hammer-time-nemesis + [] + (nemesis/hammer-time identity "clickhouse")) + (defn select-last-file [path] (last (clojure.string/split @@ -83,11 +94,11 @@ (c/exec :rm :-fr path)))) (defn start-stop-generator - [] + [time-corrupt time-ok] (->> - (cycle [(gen/sleep 5) + (cycle [(gen/sleep time-ok) {:type :info, :f :start} - (gen/sleep 5) + (gen/sleep time-corrupt) {:type :info, :f :stop}]))) (defn corruption-generator @@ -97,12 +108,16 @@ {:type :info, :f :corrupt}]))) (def custom-nemesises - {"single-node-killer" {:nemesis (random-single-node-killer-nemesis) - :generator (start-stop-generator)} + {"random-node-killer" {:nemesis (random-node-killer-nemesis) + :generator (start-stop-generator 5 5)} + "all-nodes-killer" {:nemesis (all-nodes-killer-nemesis) + :generator (start-stop-generator 1 10)} "simple-partitioner" {:nemesis (nemesis/partition-random-halves) - :generator (start-stop-generator)} - "hammer-time" {:nemesis (hammer-time-nemesis) - :generator (start-stop-generator)} + :generator (start-stop-generator 5 5)} + "random-node-hammer-time" {:nemesis (random-node-hammer-time-nemesis) + :generator (start-stop-generator 5 5)} + "all-nodes-hammer-time" {:nemesis (all-nodes-hammer-time-nemesis) + :generator (start-stop-generator 1 10)} "logs-corruptor" {:nemesis (logs-corruption-nemesis) :generator (corruption-generator)} "snapshots-corruptor" {:nemesis (snapshots-corruption-nemesis) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj index d50253aa174..c30ec9635a1 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj @@ -22,11 +22,9 @@ (invoke! 
[this test op] (case (:f op) - :read - (do - (assoc op + :read (assoc op :type :ok - :value (read-string (:data (zk-get-str conn k))))) + :value (read-string (:data (zk-get-str conn k)))) :add (try (do (zk-add-to-set conn k (:value op)) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj index 10851a2adc7..6fd2f3c87f4 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj @@ -2,10 +2,13 @@ (:require [clojure.string :as str] [zookeeper.data :as data] [zookeeper :as zk] + [zookeeper.internal :as zi] [jepsen.control.util :as cu] [jepsen.nukeeper.constants :refer :all] [jepsen.control :as c] - [clojure.tools.logging :refer :all])) + [clojure.tools.logging :refer :all]) + (:import (org.apache.zookeeper CreateMode + ZooKeeper))) (defn parse-long "Parses a string to a Long. Passes through `nil` and empty strings." @@ -67,6 +70,17 @@ [conn path-prefix data] (zk/create conn path-prefix :data (data/to-bytes (str data)) :persistent? true :sequential? true)) +(defn zk-multi-create-many-seq-nodes + [conn path-prefix num] + (let [txn (.transaction conn)] + (loop [i 0] + (cond (>= i num) (.commit txn) + :else (do (.create txn path-prefix + (data/to-bytes "") + (zi/acls :open-acl-unsafe) + CreateMode/PERSISTENT_SEQUENTIAL) + (recur (inc i))))))) + (defn clickhouse-alive? [node test] (info "Checking server alive on" node) diff --git a/tests/jepsen.nukeeper/test/jepsen/nukeeper_test.clj b/tests/jepsen.nukeeper/test/jepsen/nukeeper_test.clj index 824aa40d2c8..1a3e8646574 100644 --- a/tests/jepsen.nukeeper/test/jepsen/nukeeper_test.clj +++ b/tests/jepsen.nukeeper/test/jepsen/nukeeper_test.clj @@ -22,6 +22,8 @@ (zk/create conn "/0") (println (zk/children conn "/")) (zk/set-data conn "/0" (data/to-bytes "777") -1) + (zk-multi-create-many-seq-nodes conn "/seq-" 5) + (println (zk/children conn "/")) (Thread/sleep 5000) (println "VALUE" (data/to-string (:data (zk/data conn "/0")))) (is (= (data/to-string (:data (zk/data conn "/0"))) "777")) From c01171c626e0344915abce370a0dc777a7ce93f9 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Thu, 18 Mar 2021 14:58:43 +0300 Subject: [PATCH 037/155] Fixed tests --- src/Dictionaries/SSDCacheDictionaryStorage.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/Dictionaries/SSDCacheDictionaryStorage.h b/src/Dictionaries/SSDCacheDictionaryStorage.h index f28f9ab37cd..d0b4a5ca835 100644 --- a/src/Dictionaries/SSDCacheDictionaryStorage.h +++ b/src/Dictionaries/SSDCacheDictionaryStorage.h @@ -1339,10 +1339,13 @@ private: if (it == nullptr) return; + /// In case of complex key in arena key is serialized from hash table + KeyType key_copy = it->getKey(); + index.erase(key); - if constexpr(std::is_same_v) - complex_key_arena.free(const_cast(key.data), key.size); + if constexpr (std::is_same_v) + complex_key_arena.free(const_cast(key_copy.data), key_copy.size); } SSDCacheDictionaryStorageConfiguration configuration; From 5324d75505f49b7fe9fdfb630e0c00399115a4e1 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Thu, 18 Mar 2021 17:12:25 +0300 Subject: [PATCH 038/155] Fixed tests --- .../0_stateless/01684_ssd_cache_dictionary_simple_key.sql | 6 +++--- .../0_stateless/01685_ssd_cache_dictionary_complex_key.sql | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/queries/0_stateless/01684_ssd_cache_dictionary_simple_key.sql b/tests/queries/0_stateless/01684_ssd_cache_dictionary_simple_key.sql index 
2fe1e54fe6c..9dbad1289f1 100644 --- a/tests/queries/0_stateless/01684_ssd_cache_dictionary_simple_key.sql +++ b/tests/queries/0_stateless/01684_ssd_cache_dictionary_simple_key.sql @@ -22,7 +22,7 @@ CREATE DICTIONARY 01684_database_for_cache_dictionary.cache_dictionary_simple_ke PRIMARY KEY id SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'simple_key_simple_attributes_source_table')) LIFETIME(MIN 1 MAX 1000) -LAYOUT(SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 8192 PATH '/home/yetti/Documents/ClickHouse/build_address/programs/0d')); +LAYOUT(SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 8192 PATH '/var/lib/clickhouse/clickhouse_dicts/0d')); SELECT 'Dictionary cache_dictionary_simple_key_simple_attributes'; SELECT 'dictGet existing value'; @@ -66,7 +66,7 @@ CREATE DICTIONARY 01684_database_for_cache_dictionary.cache_dictionary_simple_ke PRIMARY KEY id SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'simple_key_complex_attributes_source_table')) LIFETIME(MIN 1 MAX 1000) -LAYOUT(SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 8192 PATH '/home/yetti/Documents/ClickHouse/build_address/programs/1d')); +LAYOUT(SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 8192 PATH '/var/lib/clickhouse/clickhouse_dicts/1d')); SELECT 'Dictionary cache_dictionary_simple_key_complex_attributes'; SELECT 'dictGet existing value'; @@ -108,7 +108,7 @@ CREATE DICTIONARY 01684_database_for_cache_dictionary.cache_dictionary_simple_ke PRIMARY KEY id SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'simple_key_hierarchy_table')) LIFETIME(MIN 1 MAX 1000) -LAYOUT(SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 8192 PATH '/home/yetti/Documents/ClickHouse/build_address/programs/2d')); +LAYOUT(SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 8192 PATH '/var/lib/clickhouse/clickhouse_dicts/2d')); SELECT 'Dictionary cache_dictionary_simple_key_hierarchy'; SELECT 'dictGet'; diff --git a/tests/queries/0_stateless/01685_ssd_cache_dictionary_complex_key.sql b/tests/queries/0_stateless/01685_ssd_cache_dictionary_complex_key.sql index f65aa445284..03a7e1d80df 100644 --- a/tests/queries/0_stateless/01685_ssd_cache_dictionary_complex_key.sql +++ b/tests/queries/0_stateless/01685_ssd_cache_dictionary_complex_key.sql @@ -24,7 +24,7 @@ CREATE DICTIONARY 01685_database_for_cache_dictionary.cache_dictionary_complex_k PRIMARY KEY id, id_key SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'complex_key_simple_attributes_source_table' DB '01685_database_for_cache_dictionary')) LIFETIME(MIN 1 MAX 1000) -LAYOUT(COMPLEX_KEY_SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 8192 PATH '/home/yetti/Documents/ClickHouse/build_address/programs/0d')); +LAYOUT(COMPLEX_KEY_SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 8192 PATH '/var/lib/clickhouse/clickhouse_dicts/0d')); SELECT 'Dictionary cache_dictionary_complex_key_simple_attributes'; SELECT 'dictGet existing value'; @@ -71,7 +71,7 @@ CREATE DICTIONARY 01685_database_for_cache_dictionary.cache_dictionary_complex_k PRIMARY KEY id, id_key SOURCE(CLICKHOUSE(HOST 'localhost' PORT tcpPort() USER 'default' TABLE 'complex_key_complex_attributes_source_table' DB '01685_database_for_cache_dictionary')) LIFETIME(MIN 1 MAX 1000) -LAYOUT(COMPLEX_KEY_SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 8192 PATH '/home/yetti/Documents/ClickHouse/build_address/programs/1d')); +LAYOUT(COMPLEX_KEY_SSD_CACHE(BLOCK_SIZE 4096 FILE_SIZE 8192 PATH '/var/lib/clickhouse/clickhouse_dicts/1d')); SELECT 'Dictionary cache_dictionary_complex_key_complex_attributes'; SELECT 'dictGet existing value'; From 0137a6baac723f94c3ee5401bbde98b2e0c51379 Mon 
Sep 17 00:00:00 2001 From: alesapin Date: Thu, 18 Mar 2021 23:55:11 +0300 Subject: [PATCH 039/155] Add test founding bug --- src/Coordination/NuKeeperSnapshotManager.cpp | 1 + src/Coordination/NuKeeperStorage.cpp | 11 +++ src/Coordination/NuKeeperStorage.h | 4 ++ .../src/jepsen/nukeeper/counter.clj | 7 +- .../src/jepsen/nukeeper/main.clj | 9 +-- .../src/jepsen/nukeeper/nemesis.clj | 6 +- .../src/jepsen/nukeeper/queue.clj | 67 +++++++++++++++++++ .../src/jepsen/nukeeper/set.clj | 7 +- .../src/jepsen/nukeeper/unique.clj | 7 +- .../src/jepsen/nukeeper/utils.clj | 36 +++++++++- .../test/jepsen/nukeeper_test.clj | 27 +++++--- 11 files changed, 150 insertions(+), 32 deletions(-) create mode 100644 tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj diff --git a/src/Coordination/NuKeeperSnapshotManager.cpp b/src/Coordination/NuKeeperSnapshotManager.cpp index f5a97619976..5cc7bc356be 100644 --- a/src/Coordination/NuKeeperSnapshotManager.cpp +++ b/src/Coordination/NuKeeperSnapshotManager.cpp @@ -161,6 +161,7 @@ void NuKeeperStorageSnapshot::serialize(const NuKeeperStorageSnapshot & snapshot SnapshotMetadataPtr NuKeeperStorageSnapshot::deserialize(NuKeeperStorage & storage, ReadBuffer & in) { + storage.clearData(); uint8_t version; readBinary(version, in); if (static_cast(version) > SnapshotVersion::V0) diff --git a/src/Coordination/NuKeeperStorage.cpp b/src/Coordination/NuKeeperStorage.cpp index 2440d6f6613..0b773aeaafd 100644 --- a/src/Coordination/NuKeeperStorage.cpp +++ b/src/Coordination/NuKeeperStorage.cpp @@ -752,4 +752,15 @@ void NuKeeperStorage::clearDeadWatches(int64_t session_id) } } +void NuKeeperStorage::clearData() +{ + container.clear(); + ephemerals.clear(); + sessions_and_watchers.clear(); + session_expiry_queue.clear(); + session_and_timeout.clear(); + session_id_counter = 1; + zxid = 0; +} + } diff --git a/src/Coordination/NuKeeperStorage.h b/src/Coordination/NuKeeperStorage.h index c49df88159f..b44a077c277 100644 --- a/src/Coordination/NuKeeperStorage.h +++ b/src/Coordination/NuKeeperStorage.h @@ -82,6 +82,8 @@ public: public: NuKeeperStorage(int64_t tick_time_ms); + void clearData(); + int64_t getSessionID(int64_t session_timeout_ms) { auto result = session_id_counter++; @@ -131,4 +133,6 @@ public: } }; +using NuKeeperStoragePtr = std::unique_ptr; + } diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj index 1bdf3f89186..48b270517a4 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj @@ -1,5 +1,5 @@ (ns jepsen.nukeeper.counter - (:require + (:require [clojure.tools.logging :refer :all] [jepsen [checker :as checker] @@ -9,11 +9,9 @@ [zookeeper :as zk]) (:import (org.apache.zookeeper ZooKeeper KeeperException KeeperException$BadVersionException))) - (defn r [_ _] {:type :invoke, :f :read}) (defn add [_ _] {:type :invoke, :f :add, :value (rand-int 5)}) - (defrecord CounterClient [conn nodename] client/Client (open! [this test node] @@ -39,7 +37,8 @@ (teardown! [_ test]) - (close! [_ test])) + (close! [_ test] + (zk/close conn))) (defn workload "A generator, client, and checker for a set test." 
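A note on the `storage.clearData()` call that this patch adds at the top of `NuKeeperStorageSnapshot::deserialize`: deserialization inserts nodes into whatever storage object it is handed, so applying a snapshot to a storage that already holds state would merge the stale tree with the restored one and keep the old zxid and session counters. The sketch below illustrates the intended restore flow as it stands after this patch; the driver function and the concrete path and tick-time values are hypothetical, while the manager and storage names follow the patch (a follow-up commit below additionally re-creates the root node "/" inside clearData()).

#include <Coordination/NuKeeperSnapshotManager.h>

using namespace DB;

/// Hypothetical driver: restore a storage from the newest on-disk snapshot.
void restoreFromDiskExample()
{
    NuKeeperSnapshotManager manager("./snapshots", /* snapshots_to_keep */ 3);
    NuKeeperStorage storage(/* tick_time_ms */ 500);

    if (auto buffer = manager.deserializeLatestSnapshotBufferFromDisk())
    {
        /// deserialize() now clears the container, ephemerals, watches and
        /// counters first, so the restored tree cannot be polluted by state
        /// left over from a previous restore on the same object.
        auto snapshot_meta = NuKeeperSnapshotManager::deserializeSnapshotFromBuffer(&storage, buffer);
        /// The metadata tells the caller where to resume the log.
        size_t last_applied_idx = snapshot_meta->get_last_log_idx();
        (void)last_applied_idx;
    }
}
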
diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj index 0f9619a7653..b8854638ed0 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -5,6 +5,7 @@ [jepsen.nukeeper.nemesis :as custom-nemesis] [jepsen.nukeeper.register :as register] [jepsen.nukeeper.unique :as unique] + [jepsen.nukeeper.queue :as queue] [jepsen.nukeeper.counter :as counter] [jepsen.nukeeper.constants :refer :all] [clojure.string :as str] @@ -23,7 +24,6 @@ [jepsen.os.ubuntu :as ubuntu] [jepsen.checker.timeline :as timeline] [clojure.java.io :as io] - [knossos.model :as model] [zookeeper.data :as data] [zookeeper :as zk]) (:import (org.apache.zookeeper ZooKeeper KeeperException KeeperException$BadVersionException))) @@ -69,7 +69,7 @@ (info node "tearing down clickhouse") (cu/stop-daemon! (str binary-path "/clickhouse") pidfile) (c/su - ;(c/exec :rm :-f (str binary-path "/clickhouse")) + (c/exec :rm :-f (str binary-path "/clickhouse")) (c/exec :rm :-rf dir) (c/exec :rm :-rf logdir) (c/exec :rm :-rf "/etc/clickhouse-server"))) @@ -87,7 +87,8 @@ {"set" set/workload "register" register/workload "unique-ids" unique/workload - "counter" counter/workload}) + "counter" counter/workload + "queue" queue/workload}) (def cli-opts "Additional command line options." @@ -130,7 +131,7 @@ opts {:name (str "clickhouse-keeper quorum=" quorum " " (name (:workload opts)) " " (name (:nemesis opts))) :os ubuntu/os - :db (db "rbtorrent:a122093aee0bdcb70ca42d5e5fb4ba5544372f5f") + :db (db "rbtorrent:711cf0ff9281804eb53875d0c12499df1c2a0adc") :pure-generators true :client (:client workload) :nemesis (:nemesis current-nemesis) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj index bf22f9ad1f6..59f3cb52dae 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj @@ -111,13 +111,13 @@ {"random-node-killer" {:nemesis (random-node-killer-nemesis) :generator (start-stop-generator 5 5)} "all-nodes-killer" {:nemesis (all-nodes-killer-nemesis) - :generator (start-stop-generator 1 10)} + :generator (start-stop-generator 1 10)} "simple-partitioner" {:nemesis (nemesis/partition-random-halves) :generator (start-stop-generator 5 5)} "random-node-hammer-time" {:nemesis (random-node-hammer-time-nemesis) - :generator (start-stop-generator 5 5)} + :generator (start-stop-generator 5 5)} "all-nodes-hammer-time" {:nemesis (all-nodes-hammer-time-nemesis) - :generator (start-stop-generator 1 10)} + :generator (start-stop-generator 1 10)} "logs-corruptor" {:nemesis (logs-corruption-nemesis) :generator (corruption-generator)} "snapshots-corruptor" {:nemesis (snapshots-corruption-nemesis) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj new file mode 100644 index 00000000000..f6f7abb51b6 --- /dev/null +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj @@ -0,0 +1,67 @@ +(ns jepsen.nukeeper.queue + (:require + [clojure.tools.logging :refer :all] + [jepsen + [checker :as checker] + [client :as client] + [generator :as gen]] + [jepsen.nukeeper.utils :refer :all] + [zookeeper :as zk]) + (:import (org.apache.zookeeper ZooKeeper KeeperException KeeperException$BadVersionException))) + +(defn enqueue [val _ _] {:type :invoke, :f :enqueue :value val}) +(defn dequeue [_ _] {:type :invoke, :f :dequeue}) + +(defrecord QueueClient [conn 
nodename] + client/Client + (open! [this test node] + (assoc + (assoc this + :conn (zk-connect node 9181 30000)) + :nodename node)) + + (setup! [this test]) + + (invoke! [this test op] + (case (:f op) + :enqueue (try + (do + (zk-create-if-not-exists conn (str "/" (:value op)) "") + (assoc op :type :ok)) + (catch Exception _ (assoc op :type :info, :error :connect-error))) + :dequeue + (try + (let [result (zk-multi-delete-first-child conn "/")] + (if (not (nil? result)) + (assoc op :type :ok :value result) + (assoc op :type :fail :value result))) + (catch KeeperException$BadVersionException _ (assoc op :type :fail, :error :bad-version)) + (catch Exception _ (assoc op :type :info, :error :connect-error))) + :drain + (try + (loop [result '()] + (let [deleted-child (zk-multi-delete-first-child conn "/")] + (if (not (nil? deleted-child)) + (recur (concat result [deleted-child])) + (assoc op :type :ok :value result)))) + (catch Exception _ (assoc op :type :info, :error :connect-error))))) + + (teardown! [_ test]) + + (close! [_ test] + (zk/close conn))) + +(defn sorted-str-range + [n] + (sort (map (fn [v] (str v)) (take n (range))))) + +(defn workload + "A generator, client, and checker for a set test." + [opts] + {:client (QueueClient. nil nil) + :checker (checker/total-queue) + :generator (->> (sorted-str-range 10000) + (map (fn [x] + (rand-nth [{:type :invoke, :f :enqueue :value x} + {:type :invoke, :f :dequeue}])))) + :final-generator (gen/once {:type :invoke, :f :drain, :value nil})}) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj index c30ec9635a1..3213042a3cc 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj @@ -23,8 +23,8 @@ (invoke! [this test op] (case (:f op) :read (assoc op - :type :ok - :value (read-string (:data (zk-get-str conn k)))) + :type :ok + :value (read-string (:data (zk-get-str conn k)))) :add (try (do (zk-add-to-set conn k (:value op)) @@ -34,7 +34,8 @@ (teardown! [_ test]) - (close! [_ test])) + (close! [_ test] + (zk/close conn))) (defn workload "A generator, client, and checker for a set test." diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/unique.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/unique.clj index 9c753dfe0ab..9dfb906bc17 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/unique.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/unique.clj @@ -9,10 +9,6 @@ [zookeeper :as zk]) (:import (org.apache.zookeeper ZooKeeper KeeperException KeeperException$BadVersionException))) -(defn parse-and-get-counter - [path] - (Integer/parseInt (apply str (take-last 10 (seq (str path)))))) - (defrecord UniqueClient [conn nodename] client/Client (open! [this test node] @@ -33,7 +29,8 @@ (teardown! [_ test]) - (close! [_ test])) + (close! [_ test] + (zk/close conn))) (defn workload "A generator, client, and checker for a set test." diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj index 6fd2f3c87f4..fd2b2b5acb3 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj @@ -7,8 +7,9 @@ [jepsen.nukeeper.constants :refer :all] [jepsen.control :as c] [clojure.tools.logging :refer :all]) - (:import (org.apache.zookeeper CreateMode - ZooKeeper))) + (:import (org.apache.zookeeper.data Stat) + (org.apache.zookeeper CreateMode + ZooKeeper))) (defn parse-long "Parses a string to a Long. 
Passes through `nil` and empty strings." @@ -16,6 +17,10 @@ (if (and s (> (count s) 0)) (Long/parseLong s))) +(defn parse-and-get-counter + [path] + (Integer/parseInt (apply str (take-last 10 (seq (str path)))))) + (defn zk-range [] (map (fn [v] (str "/" v)) (range))) @@ -48,6 +53,13 @@ [conn path] (zk/children conn path)) +(defn zk-list-with-stat + [conn path] + (let [stat (new Stat) + children (seq (.getChildren conn path false stat))] + {:children children + :stat (zi/stat-to-map stat)})) + (defn zk-cas [conn path old-value new-value] (let [current-value (zk-get-str conn path)] @@ -81,6 +93,26 @@ CreateMode/PERSISTENT_SEQUENTIAL) (recur (inc i))))))) +(defn zk-parent-path + [path] + (let [rslash_pos (str/last-index-of path "/")] + (if (> rslash_pos 0) + (subs path 0 rslash_pos) + "/"))) + +(defn zk-multi-delete-first-child + [conn path] + (let [{children :children stat :stat} (zk-list-with-stat conn path) + txn (.transaction conn) + first-child (first (sort children))] + (if (not (nil? first-child)) + (do (.check txn path (:version stat)) + (.setData txn path (data/to-bytes "") -1) ; I'm just checking multitransactions + (.delete txn (str path first-child) -1) + (.commit txn) + first-child) + nil))) + (defn clickhouse-alive? [node test] (info "Checking server alive on" node) diff --git a/tests/jepsen.nukeeper/test/jepsen/nukeeper_test.clj b/tests/jepsen.nukeeper/test/jepsen/nukeeper_test.clj index 1a3e8646574..1981e01ebcb 100644 --- a/tests/jepsen.nukeeper/test/jepsen/nukeeper_test.clj +++ b/tests/jepsen.nukeeper/test/jepsen/nukeeper_test.clj @@ -15,16 +15,21 @@ (deftest a-test (testing "nukeeper connection" (let [conn (zk/connect "localhost:9181" :timeout-msec 5000)] - (println (take 10 (zk-range))) - (multidelete conn) - (multicreate conn) - (zk/create-all conn "/0") - (zk/create conn "/0") + ;(println (take 10 (zk-range))) + ;(multidelete conn) + ;(multicreate conn) + ;(zk/create-all conn "/0") + ;(zk/create conn "/0") + ;(println (zk/children conn "/")) + ;(zk/set-data conn "/0" (data/to-bytes "777") -1) + (println (zk-parent-path "/sasds/dasda/das")) + (println (zk-parent-path "/sasds")) + (zk-multi-create-many-seq-nodes conn "/a-" 5) (println (zk/children conn "/")) - (zk/set-data conn "/0" (data/to-bytes "777") -1) - (zk-multi-create-many-seq-nodes conn "/seq-" 5) - (println (zk/children conn "/")) - (Thread/sleep 5000) - (println "VALUE" (data/to-string (:data (zk/data conn "/0")))) - (is (= (data/to-string (:data (zk/data conn "/0"))) "777")) + (println (zk-list-with-stat conn "/")) + (println (zk-multi-delete-first-child conn "/")) + (println (zk-list-with-stat conn "/")) + ;(Thread/sleep 5000) + ;(println "VALUE" (data/to-string (:data (zk/data conn "/0")))) + ;(is (= (data/to-string (:data (zk/data conn "/0"))) "777")) (zk/close conn)))) From 26541471137806f701d7e8c24a9b00c298844cf2 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 19 Mar 2021 00:14:43 +0300 Subject: [PATCH 040/155] Fix on fix --- src/Coordination/NuKeeperStorage.cpp | 2 ++ tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/Coordination/NuKeeperStorage.cpp b/src/Coordination/NuKeeperStorage.cpp index 0b773aeaafd..62f998761ea 100644 --- a/src/Coordination/NuKeeperStorage.cpp +++ b/src/Coordination/NuKeeperStorage.cpp @@ -761,6 +761,8 @@ void NuKeeperStorage::clearData() session_and_timeout.clear(); session_id_counter = 1; zxid = 0; + + container.insert("/", Node()); } } diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj 
b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj index b8854638ed0..e852c7c4720 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -131,7 +131,7 @@ opts {:name (str "clickhouse-keeper quorum=" quorum " " (name (:workload opts)) " " (name (:nemesis opts))) :os ubuntu/os - :db (db "rbtorrent:711cf0ff9281804eb53875d0c12499df1c2a0adc") + :db (db "rbtorrent:af3f7a797953f7f359bd3550fe3fd4a68fd27345") :pure-generators true :client (:client workload) :nemesis (:nemesis current-nemesis) From 6aa9039f7dabe289da918f9bbfdbb2950516cabe Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov Date: Fri, 19 Mar 2021 02:05:43 +0300 Subject: [PATCH 041/155] float frames and lag/lead_in_frame --- docker/test/performance-comparison/perf.py | 5 +- .../sql-reference/window-functions/index.md | 4 +- src/Core/Field.h | 23 ++ src/Interpreters/WindowDescription.cpp | 49 ++-- src/Interpreters/WindowDescription.h | 5 +- src/Parsers/ASTWindowDefinition.cpp | 6 +- src/Parsers/ExpressionElementParsers.cpp | 54 ++-- src/Processors/Transforms/WindowTransform.cpp | 243 ++++++++++++++++-- src/Processors/Transforms/WindowTransform.h | 9 +- tests/performance/window_functions.xml | 42 +++ .../01591_window_functions.reference | 39 +++ .../0_stateless/01591_window_functions.sql | 18 ++ 12 files changed, 412 insertions(+), 85 deletions(-) diff --git a/docker/test/performance-comparison/perf.py b/docker/test/performance-comparison/perf.py index f1c5df146aa..c74da2fe8e3 100755 --- a/docker/test/performance-comparison/perf.py +++ b/docker/test/performance-comparison/perf.py @@ -264,7 +264,7 @@ for query_index in queries_to_run: try: prewarm_id = f'{query_prefix}.prewarm0' # Will also detect too long queries during warmup stage - res = c.execute(q, query_id = prewarm_id, settings = {'max_execution_time': 10}) + res = c.execute(q, query_id = prewarm_id, settings = {'max_execution_time': args.max_query_seconds}) print(f'prewarm\t{query_index}\t{prewarm_id}\t{conn_index}\t{c.last_query.elapsed}') except KeyboardInterrupt: raise @@ -311,7 +311,8 @@ for query_index in queries_to_run: for conn_index, c in enumerate(this_query_connections): try: - res = c.execute(q, query_id = run_id) + res = c.execute(q, query_id = run_id, + settings = {'max_execution_time': args.max_query_seconds}) except Exception as e: # Add query id to the exception to make debugging easier. e.args = (run_id, *e.args) diff --git a/docs/en/sql-reference/window-functions/index.md b/docs/en/sql-reference/window-functions/index.md index cbf03a44d46..3d18bc123f9 100644 --- a/docs/en/sql-reference/window-functions/index.md +++ b/docs/en/sql-reference/window-functions/index.md @@ -23,7 +23,9 @@ ClickHouse supports the standard grammar for defining windows and window functio | `GROUPS` frame | not supported | | Calculating aggregate functions over a frame (`sum(value) over (order by time)`) | all aggregate functions are supported | | `rank()`, `dense_rank()`, `row_number()` | supported | -| `lag/lead(value, offset)` | not supported, replace with `any(value) over (.... rows between preceding and preceding)`, or `following` for `lead`| +| `lag/lead(value, offset)` | Not supported. Workarounds: | +| | 1) replace with `any(value) over (.... rows between preceding and preceding)`, or `following` for `lead`| +| | 2) use `lag_in_frame/lead_in_frame`, which are analogous, but respect the window frame. 
To get behavior identical to `lag/lead`, use `rows between unbounded preceding and unbounded following` | ## References diff --git a/src/Core/Field.h b/src/Core/Field.h index 3a52186167f..30c3938e455 100644 --- a/src/Core/Field.h +++ b/src/Core/Field.h @@ -946,3 +946,26 @@ void writeFieldText(const Field & x, WriteBuffer & buf); String toString(const Field & x); } + +template <> +struct fmt::formatter +{ + constexpr auto parse(format_parse_context & ctx) + { + auto it = ctx.begin(); + auto end = ctx.end(); + + /// Only support {}. + if (it != end && *it != '}') + throw format_error("invalid format"); + + return it; + } + + template + auto format(const DB::Field & x, FormatContext & ctx) + { + return format_to(ctx.out(), "{}", toString(x)); + } +}; + diff --git a/src/Interpreters/WindowDescription.cpp b/src/Interpreters/WindowDescription.cpp index e922f49c896..e81a1d3235c 100644 --- a/src/Interpreters/WindowDescription.cpp +++ b/src/Interpreters/WindowDescription.cpp @@ -1,5 +1,6 @@ #include +#include #include #include @@ -60,7 +61,7 @@ void WindowFrame::toString(WriteBuffer & buf) const } else { - buf << abs(begin_offset); + buf << applyVisitor(FieldVisitorToString(), begin_offset); buf << " " << (begin_preceding ? "PRECEDING" : "FOLLOWING"); } @@ -77,7 +78,7 @@ void WindowFrame::toString(WriteBuffer & buf) const } else { - buf << abs(end_offset); + buf << applyVisitor(FieldVisitorToString(), end_offset); buf << " " << (end_preceding ? "PRECEDING" : "FOLLOWING"); } @@ -121,23 +122,37 @@ void WindowFrame::checkValid() const if (end_type == BoundaryType::Offset && begin_type == BoundaryType::Offset) { - // Frame starting with following rows can't have preceding rows. - if (!(end_preceding && !begin_preceding)) + // Frame start offset must be less or equal that the frame end offset. + bool begin_less_equal_end; + if (begin_preceding && end_preceding) { - // Frame start offset must be less or equal that the frame end offset. - const bool begin_before_end - = begin_offset * (begin_preceding ? -1 : 1) - <= end_offset * (end_preceding ? -1 : 1); - - if (!begin_before_end) - { - throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Frame start offset {} {} does not precede the frame end offset {} {}", - begin_offset, begin_preceding ? "PRECEDING" : "FOLLOWING", - end_offset, end_preceding ? "PRECEDING" : "FOLLOWING"); - } - return; + begin_less_equal_end = begin_offset >= end_offset; } + else if (begin_preceding && !end_preceding) + { + begin_less_equal_end = true; + } + else if (!begin_preceding && end_preceding) + { + begin_less_equal_end = false; + } + else if (!begin_preceding && !end_preceding) + { + begin_less_equal_end = begin_offset <= end_offset; + } + else + { + assert(false); + } + + if (!begin_less_equal_end) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Frame start offset {} {} does not precede the frame end offset {} {}", + begin_offset, begin_preceding ? "PRECEDING" : "FOLLOWING", + end_offset, end_preceding ? "PRECEDING" : "FOLLOWING"); + } + return; } throw Exception(ErrorCodes::BAD_ARGUMENTS, diff --git a/src/Interpreters/WindowDescription.h b/src/Interpreters/WindowDescription.h index faad4649f91..70a4e0e44e0 100644 --- a/src/Interpreters/WindowDescription.h +++ b/src/Interpreters/WindowDescription.h @@ -44,14 +44,13 @@ struct WindowFrame // Offset might be both preceding and following, controlled by begin_preceding, // but the offset value must be positive. 
BoundaryType begin_type = BoundaryType::Unbounded; - // This should have been a Field but I'm getting some crazy linker errors. - int64_t begin_offset = 0; + Field begin_offset = 0; bool begin_preceding = true; // Here as well, Unbounded can only be UNBOUNDED FOLLOWING, and end_preceding // must be false. BoundaryType end_type = BoundaryType::Current; - int64_t end_offset = 0; + Field end_offset = 0; bool end_preceding = false; diff --git a/src/Parsers/ASTWindowDefinition.cpp b/src/Parsers/ASTWindowDefinition.cpp index aee951fc1f3..ff08bda65ed 100644 --- a/src/Parsers/ASTWindowDefinition.cpp +++ b/src/Parsers/ASTWindowDefinition.cpp @@ -70,7 +70,8 @@ void ASTWindowDefinition::formatImpl(const FormatSettings & settings, } else { - settings.ostr << abs(frame.begin_offset); + settings.ostr << applyVisitor(FieldVisitorToString(), + frame.begin_offset); settings.ostr << " " << (!frame.begin_preceding ? "FOLLOWING" : "PRECEDING"); } @@ -85,7 +86,8 @@ void ASTWindowDefinition::formatImpl(const FormatSettings & settings, } else { - settings.ostr << abs(frame.end_offset); + settings.ostr << applyVisitor(FieldVisitorToString(), + frame.end_offset); settings.ostr << " " << (!frame.end_preceding ? "FOLLOWING" : "PRECEDING"); } diff --git a/src/Parsers/ExpressionElementParsers.cpp b/src/Parsers/ExpressionElementParsers.cpp index a54573432a1..39e3a0af5b7 100644 --- a/src/Parsers/ExpressionElementParsers.cpp +++ b/src/Parsers/ExpressionElementParsers.cpp @@ -581,30 +581,20 @@ static bool tryParseFrameDefinition(ASTWindowDefinition * node, IParser::Pos & p else if (parser_literal.parse(pos, ast_literal, expected)) { const Field & value = ast_literal->as().value; - if (!isInt64FieldType(value.getType())) + if ((node->frame.type == WindowFrame::FrameType::Rows + || node->frame.type == WindowFrame::FrameType::Groups) + && !(value.getType() == Field::Types::UInt64 + || (value.getType() == Field::Types::Int64 + && value.get() >= 0))) { throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Only integer frame offsets are supported, '{}' is not supported.", + "Frame offset for '{}' frame must be a nonnegative integer, '{}' of type '{}' given.", + WindowFrame::toString(node->frame.type), + applyVisitor(FieldVisitorToString(), value), Field::Types::toString(value.getType())); } - node->frame.begin_offset = value.get(); + node->frame.begin_offset = value; node->frame.begin_type = WindowFrame::BoundaryType::Offset; - // We can easily get a UINT64_MAX here, which doesn't even fit into - // int64_t. Not sure what checks we are going to need here after we - // support floats and dates. 
- if (node->frame.begin_offset > INT_MAX || node->frame.begin_offset < INT_MIN) - { - throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Frame offset must be between {} and {}, but {} is given", - INT_MAX, INT_MIN, node->frame.begin_offset); - } - - if (node->frame.begin_offset < 0) - { - throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Frame start offset must be greater than zero, {} given", - node->frame.begin_offset); - } } else { @@ -652,28 +642,20 @@ static bool tryParseFrameDefinition(ASTWindowDefinition * node, IParser::Pos & p else if (parser_literal.parse(pos, ast_literal, expected)) { const Field & value = ast_literal->as().value; - if (!isInt64FieldType(value.getType())) + if ((node->frame.type == WindowFrame::FrameType::Rows + || node->frame.type == WindowFrame::FrameType::Groups) + && !(value.getType() == Field::Types::UInt64 + || (value.getType() == Field::Types::Int64 + && value.get() >= 0))) { throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Only integer frame offsets are supported, '{}' is not supported.", + "Frame offset for '{}' frame must be a nonnegative integer, '{}' of type '{}' given.", + WindowFrame::toString(node->frame.type), + applyVisitor(FieldVisitorToString(), value), Field::Types::toString(value.getType())); } - node->frame.end_offset = value.get(); + node->frame.end_offset = value; node->frame.end_type = WindowFrame::BoundaryType::Offset; - - if (node->frame.end_offset > INT_MAX || node->frame.end_offset < INT_MIN) - { - throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Frame offset must be between {} and {}, but {} is given", - INT_MAX, INT_MIN, node->frame.end_offset); - } - - if (node->frame.end_offset < 0) - { - throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Frame end offset must be greater than zero, {} given", - node->frame.end_offset); - } } else { diff --git a/src/Processors/Transforms/WindowTransform.cpp b/src/Processors/Transforms/WindowTransform.cpp index 0013e0061e2..a8e0ed8519b 100644 --- a/src/Processors/Transforms/WindowTransform.cpp +++ b/src/Processors/Transforms/WindowTransform.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include @@ -27,7 +28,8 @@ public: virtual ~IWindowFunction() = default; // Must insert the result for current_row. - virtual void windowInsertResultInto(IColumn & to, const WindowTransform * transform) = 0; + virtual void windowInsertResultInto(const WindowTransform * transform, + size_t function_index) = 0; }; // Compares ORDER BY column values at given rows to find the boundaries of frame: @@ -37,7 +39,7 @@ template static int compareValuesWithOffset(const IColumn * _compared_column, size_t compared_row, const IColumn * _reference_column, size_t reference_row, - uint64_t _offset, + const Field & _offset, bool offset_is_preceding) { // Casting the columns to the known type here makes it faster, probably @@ -46,7 +48,7 @@ static int compareValuesWithOffset(const IColumn * _compared_column, _compared_column); const auto * reference_column = assert_cast( _reference_column); - const auto offset = static_cast(_offset); + const auto offset = _offset.get(); const auto compared_value_data = compared_column->getDataAt(compared_row); assert(compared_value_data.size == sizeof(typename ColumnType::ValueType)); @@ -101,6 +103,54 @@ static int compareValuesWithOffset(const IColumn * _compared_column, } } +// A specialization of compareValuesWithOffset for floats. 
+template +static int compareValuesWithOffsetFloat(const IColumn * _compared_column, + size_t compared_row, const IColumn * _reference_column, + size_t reference_row, + const Field & _offset, + bool offset_is_preceding) +{ + // Casting the columns to the known type here makes it faster, probably + // because the getData call can be devirtualized. + const auto * compared_column = assert_cast( + _compared_column); + const auto * reference_column = assert_cast( + _reference_column); + // The underlying field type is Float64 for Float32 as well. get() + // would be a reinterpret_cast and yield an incorrect result. + const auto offset = _offset.get(); + + const auto compared_value_data = compared_column->getDataAt(compared_row); + assert(compared_value_data.size == sizeof(typename ColumnType::ValueType)); + auto compared_value = unalignedLoad( + compared_value_data.data); + + const auto reference_value_data = reference_column->getDataAt(reference_row); + assert(reference_value_data.size == sizeof(typename ColumnType::ValueType)); + auto reference_value = unalignedLoad( + reference_value_data.data); + + // Floats overflow to Inf and the comparison will work normally, so we don't + // have to do anything. + if (offset_is_preceding) + { + reference_value -= offset; + } + else + { + reference_value += offset; + } + + const auto result = compared_value < reference_value ? -1 + : compared_value == reference_value ? 0 : 1; + +// fmt::print(stderr, "compared {}, offset {}, reference {}, result {}\n", +// compared_value, offset, reference_value, result); + + return result; +} + // Helper macros to dispatch on type of the ORDER BY column #define APPLY_FOR_ONE_TYPE(FUNCTION, TYPE) \ else if (typeid_cast(column)) \ @@ -114,14 +164,20 @@ if (false) /* NOLINT */ \ { \ /* Do nothing, a starter condition. */ \ } \ -APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector) \ APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector) \ -APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector) \ APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector) \ -APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector) \ APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector) \ -APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector) \ APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector) \ +\ +APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector) \ +APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector) \ +APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector) \ +APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector) \ +APPLY_FOR_ONE_TYPE(FUNCTION, ColumnVector) \ +\ +APPLY_FOR_ONE_TYPE(FUNCTION##Float, ColumnVector) \ +APPLY_FOR_ONE_TYPE(FUNCTION##Float, ColumnVector) \ +\ else \ { \ throw Exception(ErrorCodes::NOT_IMPLEMENTED, \ @@ -193,9 +249,28 @@ WindowTransform::WindowTransform(const Block & input_header_, == WindowFrame::BoundaryType::Offset)) { assert(order_by_indices.size() == 1); - const IColumn * column = input_header.getByPosition( - order_by_indices[0]).column.get(); + const auto & entry = input_header.getByPosition(order_by_indices[0]); + const IColumn * column = entry.column.get(); APPLY_FOR_TYPES(compareValuesWithOffset) + + // Check that the offset type matches the window type. + // Convert the offsets to the ORDER BY column type. We can't just check + // that it matches, because e.g. the int literals are always (U)Int64, + // but the column might be Int8 and so on. 
+ if (window_description.frame.begin_type + == WindowFrame::BoundaryType::Offset) + { + window_description.frame.begin_offset = convertFieldToTypeOrThrow( + window_description.frame.begin_offset, + *entry.type); + } + if (window_description.frame.end_type + == WindowFrame::BoundaryType::Offset) + { + window_description.frame.end_offset = convertFieldToTypeOrThrow( + window_description.frame.end_offset, + *entry.type); + } } } @@ -391,7 +466,7 @@ void WindowTransform::advanceFrameStartRowsOffset() { // Just recalculate it each time by walking blocks. const auto [moved_row, offset_left] = moveRowNumber(current_row, - window_description.frame.begin_offset + window_description.frame.begin_offset.get() * (window_description.frame.begin_preceding ? -1 : 1)); frame_start = moved_row; @@ -638,7 +713,7 @@ void WindowTransform::advanceFrameEndRowsOffset() // Walk the specified offset from the current row. The "+1" is needed // because the frame_end is a past-the-end pointer. const auto [moved_row, offset_left] = moveRowNumber(current_row, - window_description.frame.end_offset + window_description.frame.end_offset.get() * (window_description.frame.end_preceding ? -1 : 1) + 1); @@ -852,14 +927,14 @@ void WindowTransform::writeOutCurrentRow() for (size_t wi = 0; wi < workspaces.size(); ++wi) { auto & ws = workspaces[wi]; - IColumn * result_column = block.output_columns[wi].get(); if (ws.window_function_impl) { - ws.window_function_impl->windowInsertResultInto(*result_column, this); + ws.window_function_impl->windowInsertResultInto(this, wi); } else { + IColumn * result_column = block.output_columns[wi].get(); const auto * a = ws.aggregate_function.get(); auto * buf = ws.aggregate_function_state.data(); // FIXME does it also allocate the result on the arena? @@ -1275,8 +1350,11 @@ struct WindowFunctionRank final : public WindowFunction DataTypePtr getReturnType() const override { return std::make_shared(); } - void windowInsertResultInto(IColumn & to, const WindowTransform * transform) override + void windowInsertResultInto(const WindowTransform * transform, + size_t function_index) override { + IColumn & to = *transform->blockAt(transform->current_row) + .output_columns[function_index]; assert_cast(to).getData().push_back( transform->peer_group_start_row_number); } @@ -1292,8 +1370,11 @@ struct WindowFunctionDenseRank final : public WindowFunction DataTypePtr getReturnType() const override { return std::make_shared(); } - void windowInsertResultInto(IColumn & to, const WindowTransform * transform) override + void windowInsertResultInto(const WindowTransform * transform, + size_t function_index) override { + IColumn & to = *transform->blockAt(transform->current_row) + .output_columns[function_index]; assert_cast(to).getData().push_back( transform->peer_group_number); } @@ -1309,13 +1390,122 @@ struct WindowFunctionRowNumber final : public WindowFunction DataTypePtr getReturnType() const override { return std::make_shared(); } - void windowInsertResultInto(IColumn & to, const WindowTransform * transform) override + void windowInsertResultInto(const WindowTransform * transform, + size_t function_index) override { + IColumn & to = *transform->blockAt(transform->current_row) + .output_columns[function_index]; assert_cast(to).getData().push_back( transform->current_row_number); } }; +template +struct WindowFunctionLagLeadInFrame final : public WindowFunction +{ + WindowFunctionLagLeadInFrame(const std::string & name_, + const DataTypes & argument_types_, const Array & parameters_) + : WindowFunction(name_, 
argument_types_, parameters_) + { + if (!parameters.empty()) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Function {} cannot be parameterized", name_); + } + + if (argument_types.empty()) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Function {} takes at least one argument", name_); + } + + if (argument_types.size() == 1) + { + return; + } + + if (!isInt64FieldType(argument_types[1]->getDefault().getType())) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Offset must be an integer, '{}' given", + argument_types[1]->getName()); + } + + if (argument_types.size() == 2) + { + return; + } + + if (!getLeastSupertype({argument_types[0], argument_types[2]})) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "The default value type '{}' is not convertible to the argument type '{}'", + argument_types[2]->getName(), + argument_types[0]->getName()); + } + + if (argument_types.size() > 3) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Function '{}' accepts at most 3 arguments, {} given", + name, argument_types.size()); + } + } + + DataTypePtr getReturnType() const override + { return argument_types[0]; } + + void windowInsertResultInto(const WindowTransform * transform, + size_t function_index) override + { + auto & current_block = transform->blockAt(transform->current_row); + IColumn & to = *current_block.output_columns[function_index]; + auto & workspace = transform->workspaces[function_index]; + + int offset = 1; + if (argument_types.size() > 1) + { + offset = (*current_block.input_columns[ + workspace.argument_column_indices[1]])[ + transform->current_row.row].get(); + if (offset < 0) + { + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "The offset for function {} must be nonnegative, {} given", + getName(), offset); + } + } + + const auto [target_row, offset_left] = transform->moveRowNumber( + transform->current_row, offset * (is_lead ? 1 : -1)); + + if (offset_left != 0 + || target_row < transform->frame_start + || transform->frame_end <= target_row) + { + // Offset is outside the frame. + if (argument_types.size() > 2) + { + // Column with default values is specified. + to.insertFrom(*current_block.input_columns[ + workspace.argument_column_indices[2]], + transform->current_row.row); + } + else + { + to.insertDefault(); + } + } + else + { + // Offset is inside the frame. + to.insertFrom(*transform->blockAt(target_row).input_columns[ + workspace.argument_column_indices[0]], + target_row.row); + } + } +}; + void registerWindowFunctions(AggregateFunctionFactory & factory) { // Why didn't I implement lag/lead yet? Because they are a mess. I imagine @@ -1327,9 +1517,10 @@ void registerWindowFunctions(AggregateFunctionFactory & factory) // the whole partition like Postgres does, because using a linear amount // of additional memory is not an option when we have a lot of data. We must // be able to process at least the lag/lead in streaming fashion. - // Our best bet is probably rewriting, say `lag(value, offset)` to - // `any(value) over (rows between offset preceding and offset preceding)`, - // at the query planning stage. + // A partial solution for constant offsets is rewriting, say `lag(value, offset) + // to `any(value) over (rows between offset preceding and offset preceding)`. + // We also implement non-standard functions `lag/lead_in_frame`, that are + // analogous to `lag/lead`, but respect the frame. 
// Functions like cume_dist() do require materializing the entire // partition, but it's probably also simpler to implement them by rewriting // to a (rows between unbounded preceding and unbounded following) frame, @@ -1355,6 +1546,20 @@ void registerWindowFunctions(AggregateFunctionFactory & factory) return std::make_shared(name, argument_types, parameters); }); + + factory.registerFunction("lag_in_frame", [](const std::string & name, + const DataTypes & argument_types, const Array & parameters) + { + return std::make_shared>( + name, argument_types, parameters); + }); + + factory.registerFunction("lead_in_frame", [](const std::string & name, + const DataTypes & argument_types, const Array & parameters) + { + return std::make_shared>( + name, argument_types, parameters); + }); } } diff --git a/src/Processors/Transforms/WindowTransform.h b/src/Processors/Transforms/WindowTransform.h index 5001b984e9a..882bf429c0a 100644 --- a/src/Processors/Transforms/WindowTransform.h +++ b/src/Processors/Transforms/WindowTransform.h @@ -110,7 +110,9 @@ public: Status prepare() override; void work() override; -private: + /* + * Implementation details. + */ void advancePartitionEnd(); bool arePeers(const RowNumber & x, const RowNumber & y) const; @@ -321,10 +323,7 @@ public: int (* compare_values_with_offset) ( const IColumn * compared_column, size_t compared_row, const IColumn * reference_column, size_t reference_row, - // We can make it a Field later if we need the Decimals. Now we only - // have ints and datetime, and the underlying Field type for them is - // uint64_t anyway. - uint64_t offset, + const Field & offset, bool offset_is_preceding); }; diff --git a/tests/performance/window_functions.xml b/tests/performance/window_functions.xml index 622e349d060..8db168b1a97 100644 --- a/tests/performance/window_functions.xml +++ b/tests/performance/window_functions.xml @@ -110,4 +110,46 @@ format Null + + + select lead_in_frame(number) over w + from + (select number, intDiv(number, 1111) p, mod(number, 111) o + from numbers(10000000)) t + window w as (partition by p order by o + rows between unbounded preceding and unbounded following) + format Null + + + + + select any(number) over w + from + (select number, intDiv(number, 1111) p, mod(number, 111) o + from numbers(10000000)) t + window w as (partition by p order by o + rows between 1 following and 1 following) + format Null + + + + select lead_in_frame(number, number) over w + from + (select number, intDiv(number, 1111) p, mod(number, 111) o + from numbers(10000000)) t + window w as (partition by p order by o + rows between unbounded preceding and unbounded following) + format Null + + + + select lead_in_frame(number, number, number) over w + from + (select number, intDiv(number, 1111) p, mod(number, 111) o + from numbers(10000000)) t + window w as (partition by p order by o + rows between unbounded preceding and unbounded following) + format Null + + diff --git a/tests/queries/0_stateless/01591_window_functions.reference b/tests/queries/0_stateless/01591_window_functions.reference index d2543f0db75..a1130fc51d7 100644 --- a/tests/queries/0_stateless/01591_window_functions.reference +++ b/tests/queries/0_stateless/01591_window_functions.reference @@ -974,6 +974,32 @@ from numbers(5); 1 3 2 4 3 \N +-- variants of lag/lead that respect the frame +select number, p, pp, + lag_in_frame(number, number - pp, number * 11) over w as lag, + lead_in_frame(number, number - pp, number * 11) over w as lead +from (select number, intDiv(number, 5) p, p * 5 pp from 
numbers(16)) +window w as (partition by p order by number + rows between unbounded preceding and unbounded following) +order by number +settings max_block_size = 3; +; +0 0 0 0 0 +1 0 0 0 2 +2 0 0 0 4 +3 0 0 0 33 +4 0 0 0 44 +5 1 5 5 5 +6 1 5 5 7 +7 1 5 5 9 +8 1 5 5 88 +9 1 5 5 99 +10 2 10 10 10 +11 2 10 10 12 +12 2 10 10 14 +13 2 10 10 143 +14 2 10 10 154 +15 3 15 15 15 -- case-insensitive SQL-standard synonyms for any and anyLast select number, @@ -993,3 +1019,16 @@ order by number 7 6 8 8 7 9 9 8 9 +-- floating point RANGE frame +select + count(*) over (order by (toFloat32(number) as f32) range 5. preceding), + count(*) over (order by (toFloat64(number) as f64) range 5. preceding) +from numbers(7) +; +1 1 +2 2 +3 3 +4 4 +5 5 +6 6 +6 6 diff --git a/tests/queries/0_stateless/01591_window_functions.sql b/tests/queries/0_stateless/01591_window_functions.sql index 03bd8371e23..9ac009e672d 100644 --- a/tests/queries/0_stateless/01591_window_functions.sql +++ b/tests/queries/0_stateless/01591_window_functions.sql @@ -336,6 +336,17 @@ select over (order by number rows between 1 following and 1 following) from numbers(5); +-- variants of lag/lead that respect the frame +select number, p, pp, + lag_in_frame(number, number - pp, number * 11) over w as lag, + lead_in_frame(number, number - pp, number * 11) over w as lead +from (select number, intDiv(number, 5) p, p * 5 pp from numbers(16)) +window w as (partition by p order by number + rows between unbounded preceding and unbounded following) +order by number +settings max_block_size = 3; +; + -- case-insensitive SQL-standard synonyms for any and anyLast select number, @@ -345,3 +356,10 @@ from numbers(10) window w as (order by number range between 1 preceding and 1 following) order by number ; + +-- floating point RANGE frame +select + count(*) over (order by (toFloat32(number) as f32) range 5. preceding), + count(*) over (order by (toFloat64(number) as f64) range 5. 
preceding) +from numbers(7) +; From 4bf2e94fa48d49ae734e9598e7b941a9b3066b9e Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov Date: Fri, 19 Mar 2021 03:02:35 +0300 Subject: [PATCH 042/155] clang is too smart --- src/Interpreters/WindowDescription.cpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/Interpreters/WindowDescription.cpp b/src/Interpreters/WindowDescription.cpp index e81a1d3235c..a97ef41204a 100644 --- a/src/Interpreters/WindowDescription.cpp +++ b/src/Interpreters/WindowDescription.cpp @@ -136,14 +136,10 @@ void WindowFrame::checkValid() const { begin_less_equal_end = false; } - else if (!begin_preceding && !end_preceding) + else /* if (!begin_preceding && !end_preceding) */ { begin_less_equal_end = begin_offset <= end_offset; } - else - { - assert(false); - } if (!begin_less_equal_end) { From 81c408cb7f8bd11a121906be33cec4b6e5770553 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 19 Mar 2021 11:08:43 +0300 Subject: [PATCH 043/155] Return meta and storage from snapshot --- src/Coordination/NuKeeperSnapshotManager.cpp | 18 ++++--- src/Coordination/NuKeeperSnapshotManager.h | 10 ++-- src/Coordination/NuKeeperStateMachine.cpp | 26 +++++----- src/Coordination/NuKeeperStateMachine.h | 4 +- src/Coordination/NuKeeperStorage.cpp | 13 ----- src/Coordination/NuKeeperStorage.h | 2 - src/Coordination/tests/gtest_for_build.cpp | 52 +++++++++---------- .../src/jepsen/nukeeper/main.clj | 4 +- 8 files changed, 59 insertions(+), 70 deletions(-) diff --git a/src/Coordination/NuKeeperSnapshotManager.cpp b/src/Coordination/NuKeeperSnapshotManager.cpp index 5cc7bc356be..1caa1ea94b8 100644 --- a/src/Coordination/NuKeeperSnapshotManager.cpp +++ b/src/Coordination/NuKeeperSnapshotManager.cpp @@ -161,7 +161,6 @@ void NuKeeperStorageSnapshot::serialize(const NuKeeperStorageSnapshot & snapshot SnapshotMetadataPtr NuKeeperStorageSnapshot::deserialize(NuKeeperStorage & storage, ReadBuffer & in) { - storage.clearData(); uint8_t version; readBinary(version, in); if (static_cast(version) > SnapshotVersion::V0) @@ -242,9 +241,10 @@ NuKeeperStorageSnapshot::~NuKeeperStorageSnapshot() storage->disableSnapshotMode(); } -NuKeeperSnapshotManager::NuKeeperSnapshotManager(const std::string & snapshots_path_, size_t snapshots_to_keep_) +NuKeeperSnapshotManager::NuKeeperSnapshotManager(const std::string & snapshots_path_, size_t snapshots_to_keep_, size_t storage_tick_time_) : snapshots_path(snapshots_path_) , snapshots_to_keep(snapshots_to_keep_) + , storage_tick_time(storage_tick_time_) { namespace fs = std::filesystem; @@ -326,22 +326,24 @@ nuraft::ptr NuKeeperSnapshotManager::serializeSnapshotToBuffer(c return writer.getBuffer(); } -SnapshotMetadataPtr NuKeeperSnapshotManager::deserializeSnapshotFromBuffer(NuKeeperStorage * storage, nuraft::ptr buffer) +SnapshotMetaAndStorage NuKeeperSnapshotManager::deserializeSnapshotFromBuffer(nuraft::ptr buffer) const { ReadBufferFromNuraftBuffer reader(buffer); CompressedReadBuffer compressed_reader(reader); - return NuKeeperStorageSnapshot::deserialize(*storage, compressed_reader); + auto storage = std::make_unique(storage_tick_time); + auto snapshot_metadata = NuKeeperStorageSnapshot::deserialize(*storage, compressed_reader); + return std::make_pair(snapshot_metadata, std::move(storage)); } -SnapshotMetadataPtr NuKeeperSnapshotManager::restoreFromLatestSnapshot(NuKeeperStorage * storage) +SnapshotMetaAndStorage NuKeeperSnapshotManager::restoreFromLatestSnapshot() { if (existing_snapshots.empty()) - return nullptr; + return {}; auto buffer = 
deserializeLatestSnapshotBufferFromDisk(); if (!buffer) - return nullptr; - return deserializeSnapshotFromBuffer(storage, buffer); + return {}; + return deserializeSnapshotFromBuffer(buffer); } void NuKeeperSnapshotManager::removeOutdatedSnapshotsIfNeeded() diff --git a/src/Coordination/NuKeeperSnapshotManager.h b/src/Coordination/NuKeeperSnapshotManager.h index 422baf11a65..d844a52eaf4 100644 --- a/src/Coordination/NuKeeperSnapshotManager.h +++ b/src/Coordination/NuKeeperSnapshotManager.h @@ -40,17 +40,20 @@ public: using NuKeeperStorageSnapshotPtr = std::shared_ptr; using CreateSnapshotCallback = std::function; + +using SnapshotMetaAndStorage = std::pair; + class NuKeeperSnapshotManager { public: - NuKeeperSnapshotManager(const std::string & snapshots_path_, size_t snapshots_to_keep_); + NuKeeperSnapshotManager(const std::string & snapshots_path_, size_t snapshots_to_keep_, size_t storage_tick_time_ = 500); - SnapshotMetadataPtr restoreFromLatestSnapshot(NuKeeperStorage * storage); + SnapshotMetaAndStorage restoreFromLatestSnapshot(); static nuraft::ptr serializeSnapshotToBuffer(const NuKeeperStorageSnapshot & snapshot); std::string serializeSnapshotBufferToDisk(nuraft::buffer & buffer, size_t up_to_log_idx); - static SnapshotMetadataPtr deserializeSnapshotFromBuffer(NuKeeperStorage * storage, nuraft::ptr buffer); + SnapshotMetaAndStorage deserializeSnapshotFromBuffer(nuraft::ptr buffer) const; nuraft::ptr deserializeSnapshotBufferFromDisk(size_t up_to_log_idx) const; nuraft::ptr deserializeLatestSnapshotBufferFromDisk(); @@ -74,6 +77,7 @@ private: const std::string snapshots_path; const size_t snapshots_to_keep; std::map existing_snapshots; + size_t storage_tick_time; }; struct CreateSnapshotTask diff --git a/src/Coordination/NuKeeperStateMachine.cpp b/src/Coordination/NuKeeperStateMachine.cpp index 58a7ca3d5bc..32bb4269f20 100644 --- a/src/Coordination/NuKeeperStateMachine.cpp +++ b/src/Coordination/NuKeeperStateMachine.cpp @@ -37,8 +37,7 @@ NuKeeperStorage::RequestForSession parseRequest(nuraft::buffer & data) NuKeeperStateMachine::NuKeeperStateMachine(ResponsesQueue & responses_queue_, SnapshotsQueue & snapshots_queue_, const std::string & snapshots_path_, const CoordinationSettingsPtr & coordination_settings_) : coordination_settings(coordination_settings_) - , storage(coordination_settings->dead_session_check_period_ms.totalMilliseconds()) - , snapshot_manager(snapshots_path_, coordination_settings->snapshots_to_keep) + , snapshot_manager(snapshots_path_, coordination_settings->snapshots_to_keep, coordination_settings->dead_session_check_period_ms.totalMicroseconds()) , responses_queue(responses_queue_) , snapshots_queue(snapshots_queue_) , last_committed_idx(0) @@ -60,7 +59,7 @@ void NuKeeperStateMachine::init() try { latest_snapshot_buf = snapshot_manager.deserializeSnapshotBufferFromDisk(latest_log_index); - latest_snapshot_meta = snapshot_manager.deserializeSnapshotFromBuffer(&storage, latest_snapshot_buf); + std::tie(latest_snapshot_meta, storage) = snapshot_manager.deserializeSnapshotFromBuffer(latest_snapshot_buf); last_committed_idx = latest_snapshot_meta->get_last_log_idx(); loaded = true; break; @@ -83,6 +82,9 @@ void NuKeeperStateMachine::init() { LOG_DEBUG(log, "No existing snapshots, last committed log index {}", last_committed_idx); } + + if (!storage) + storage = std::make_unique(coordination_settings->dead_session_check_period_ms.totalMilliseconds()); } nuraft::ptr NuKeeperStateMachine::commit(const size_t log_idx, nuraft::buffer & data) @@ -96,7 +98,7 @@ 
nuraft::ptr NuKeeperStateMachine::commit(const size_t log_idx, n nuraft::buffer_serializer bs(response); { std::lock_guard lock(storage_lock); - session_id = storage.getSessionID(session_timeout_ms); + session_id = storage->getSessionID(session_timeout_ms); bs.put_i64(session_id); } LOG_DEBUG(log, "Session ID response {} with timeout {}", session_id, session_timeout_ms); @@ -109,7 +111,7 @@ nuraft::ptr NuKeeperStateMachine::commit(const size_t log_idx, n NuKeeperStorage::ResponsesForSessions responses_for_sessions; { std::lock_guard lock(storage_lock); - responses_for_sessions = storage.processRequest(request_for_session.request, request_for_session.session_id, log_idx); + responses_for_sessions = storage->processRequest(request_for_session.request, request_for_session.session_id, log_idx); for (auto & response_for_session : responses_for_sessions) responses_queue.push(response_for_session); } @@ -133,7 +135,7 @@ bool NuKeeperStateMachine::apply_snapshot(nuraft::snapshot & s) { std::lock_guard lock(storage_lock); - snapshot_manager.deserializeSnapshotFromBuffer(&storage, latest_snapshot_ptr); + std::tie(latest_snapshot_meta, storage) = snapshot_manager.deserializeSnapshotFromBuffer(latest_snapshot_ptr); } last_committed_idx = s.get_last_log_idx(); return true; @@ -157,7 +159,7 @@ void NuKeeperStateMachine::create_snapshot( CreateSnapshotTask snapshot_task; { std::lock_guard lock(storage_lock); - snapshot_task.snapshot = std::make_shared(&storage, snapshot_meta_copy); + snapshot_task.snapshot = std::make_shared(storage.get(), snapshot_meta_copy); } snapshot_task.create_snapshot = [this, when_done] (NuKeeperStorageSnapshotPtr && snapshot) @@ -179,7 +181,7 @@ void NuKeeperStateMachine::create_snapshot( { /// Must do it with lock (clearing elements from list) std::lock_guard lock(storage_lock); - storage.clearGarbageAfterSnapshot(); + storage->clearGarbageAfterSnapshot(); /// Destroy snapshot with lock snapshot.reset(); LOG_TRACE(log, "Cleared garbage after snapshot"); @@ -214,7 +216,7 @@ void NuKeeperStateMachine::save_logical_snp_obj( if (obj_id == 0) { std::lock_guard lock(storage_lock); - NuKeeperStorageSnapshot snapshot(&storage, s.get_last_log_idx()); + NuKeeperStorageSnapshot snapshot(storage.get(), s.get_last_log_idx()); cloned_buffer = snapshot_manager.serializeSnapshotToBuffer(snapshot); } else @@ -271,7 +273,7 @@ void NuKeeperStateMachine::processReadRequest(const NuKeeperStorage::RequestForS NuKeeperStorage::ResponsesForSessions responses; { std::lock_guard lock(storage_lock); - responses = storage.processRequest(request_for_session.request, request_for_session.session_id, std::nullopt); + responses = storage->processRequest(request_for_session.request, request_for_session.session_id, std::nullopt); } for (const auto & response : responses) responses_queue.push(response); @@ -280,13 +282,13 @@ void NuKeeperStateMachine::processReadRequest(const NuKeeperStorage::RequestForS std::unordered_set NuKeeperStateMachine::getDeadSessions() { std::lock_guard lock(storage_lock); - return storage.getDeadSessions(); + return storage->getDeadSessions(); } void NuKeeperStateMachine::shutdownStorage() { std::lock_guard lock(storage_lock); - storage.finalize(); + storage->finalize(); } } diff --git a/src/Coordination/NuKeeperStateMachine.h b/src/Coordination/NuKeeperStateMachine.h index 905f3448c1a..af9ad6de4d2 100644 --- a/src/Coordination/NuKeeperStateMachine.h +++ b/src/Coordination/NuKeeperStateMachine.h @@ -52,7 +52,7 @@ public: NuKeeperStorage & getStorage() { - return storage; + return 
*storage; } void processReadRequest(const NuKeeperStorage::RequestForSession & request_for_session); @@ -68,7 +68,7 @@ private: CoordinationSettingsPtr coordination_settings; - NuKeeperStorage storage; + NuKeeperStoragePtr storage; NuKeeperSnapshotManager snapshot_manager; diff --git a/src/Coordination/NuKeeperStorage.cpp b/src/Coordination/NuKeeperStorage.cpp index 62f998761ea..2440d6f6613 100644 --- a/src/Coordination/NuKeeperStorage.cpp +++ b/src/Coordination/NuKeeperStorage.cpp @@ -752,17 +752,4 @@ void NuKeeperStorage::clearDeadWatches(int64_t session_id) } } -void NuKeeperStorage::clearData() -{ - container.clear(); - ephemerals.clear(); - sessions_and_watchers.clear(); - session_expiry_queue.clear(); - session_and_timeout.clear(); - session_id_counter = 1; - zxid = 0; - - container.insert("/", Node()); -} - } diff --git a/src/Coordination/NuKeeperStorage.h b/src/Coordination/NuKeeperStorage.h index b44a077c277..058eed55cab 100644 --- a/src/Coordination/NuKeeperStorage.h +++ b/src/Coordination/NuKeeperStorage.h @@ -82,8 +82,6 @@ public: public: NuKeeperStorage(int64_t tick_time_ms); - void clearData(); - int64_t getSessionID(int64_t session_timeout_ms) { auto result = session_id_counter++; diff --git a/src/Coordination/tests/gtest_for_build.cpp b/src/Coordination/tests/gtest_for_build.cpp index 01146248f63..d90b711498e 100644 --- a/src/Coordination/tests/gtest_for_build.cpp +++ b/src/Coordination/tests/gtest_for_build.cpp @@ -897,25 +897,25 @@ TEST(CoordinationTest, TestStorageSnapshotSimple) manager.serializeSnapshotBufferToDisk(*buf, 2); EXPECT_TRUE(fs::exists("./snapshots/snapshot_2.bin")); - DB::NuKeeperStorage restored_storage(500); auto debuf = manager.deserializeSnapshotBufferFromDisk(2); - manager.deserializeSnapshotFromBuffer(&restored_storage, debuf); - EXPECT_EQ(restored_storage.container.size(), 3); - EXPECT_EQ(restored_storage.container.getValue("/").children.size(), 1); - EXPECT_EQ(restored_storage.container.getValue("/hello").children.size(), 1); - EXPECT_EQ(restored_storage.container.getValue("/hello/somepath").children.size(), 0); + auto [snapshot_meta, restored_storage] = manager.deserializeSnapshotFromBuffer(debuf); - EXPECT_EQ(restored_storage.container.getValue("/").data, ""); - EXPECT_EQ(restored_storage.container.getValue("/hello").data, "world"); - EXPECT_EQ(restored_storage.container.getValue("/hello/somepath").data, "somedata"); - EXPECT_EQ(restored_storage.session_id_counter, 7); - EXPECT_EQ(restored_storage.zxid, 2); - EXPECT_EQ(restored_storage.ephemerals.size(), 2); - EXPECT_EQ(restored_storage.ephemerals[3].size(), 1); - EXPECT_EQ(restored_storage.ephemerals[1].size(), 1); - EXPECT_EQ(restored_storage.session_and_timeout.size(), 2); + EXPECT_EQ(restored_storage->container.size(), 3); + EXPECT_EQ(restored_storage->container.getValue("/").children.size(), 1); + EXPECT_EQ(restored_storage->container.getValue("/hello").children.size(), 1); + EXPECT_EQ(restored_storage->container.getValue("/hello/somepath").children.size(), 0); + + EXPECT_EQ(restored_storage->container.getValue("/").data, ""); + EXPECT_EQ(restored_storage->container.getValue("/hello").data, "world"); + EXPECT_EQ(restored_storage->container.getValue("/hello/somepath").data, "somedata"); + EXPECT_EQ(restored_storage->session_id_counter, 7); + EXPECT_EQ(restored_storage->zxid, 2); + EXPECT_EQ(restored_storage->ephemerals.size(), 2); + EXPECT_EQ(restored_storage->ephemerals[3].size(), 1); + EXPECT_EQ(restored_storage->ephemerals[1].size(), 1); + 
EXPECT_EQ(restored_storage->session_and_timeout.size(), 2); } TEST(CoordinationTest, TestStorageSnapshotMoreWrites) @@ -946,15 +946,14 @@ TEST(CoordinationTest, TestStorageSnapshotMoreWrites) manager.serializeSnapshotBufferToDisk(*buf, 50); EXPECT_TRUE(fs::exists("./snapshots/snapshot_50.bin")); - DB::NuKeeperStorage restored_storage(500); auto debuf = manager.deserializeSnapshotBufferFromDisk(50); - manager.deserializeSnapshotFromBuffer(&restored_storage, debuf); + auto [meta, restored_storage] = manager.deserializeSnapshotFromBuffer(debuf); - EXPECT_EQ(restored_storage.container.size(), 51); + EXPECT_EQ(restored_storage->container.size(), 51); for (size_t i = 0; i < 50; ++i) { - EXPECT_EQ(restored_storage.container.getValue("/hello_" + std::to_string(i)).data, "world_" + std::to_string(i)); + EXPECT_EQ(restored_storage->container.getValue("/hello_" + std::to_string(i)).data, "world_" + std::to_string(i)); } } @@ -987,14 +986,13 @@ TEST(CoordinationTest, TestStorageSnapshotManySnapshots) EXPECT_TRUE(fs::exists("./snapshots/snapshot_250.bin")); - DB::NuKeeperStorage restored_storage(500); - manager.restoreFromLatestSnapshot(&restored_storage); + auto [meta, restored_storage] = manager.restoreFromLatestSnapshot(); - EXPECT_EQ(restored_storage.container.size(), 251); + EXPECT_EQ(restored_storage->container.size(), 251); for (size_t i = 0; i < 250; ++i) { - EXPECT_EQ(restored_storage.container.getValue("/hello_" + std::to_string(i)).data, "world_" + std::to_string(i)); + EXPECT_EQ(restored_storage->container.getValue("/hello_" + std::to_string(i)).data, "world_" + std::to_string(i)); } } @@ -1040,12 +1038,11 @@ TEST(CoordinationTest, TestStorageSnapshotMode) EXPECT_FALSE(storage.container.contains("/hello_" + std::to_string(i))); } - DB::NuKeeperStorage restored_storage(500); - manager.restoreFromLatestSnapshot(&restored_storage); + auto [meta, restored_storage] = manager.restoreFromLatestSnapshot(); for (size_t i = 0; i < 50; ++i) { - EXPECT_EQ(restored_storage.container.getValue("/hello_" + std::to_string(i)).data, "world_" + std::to_string(i)); + EXPECT_EQ(restored_storage->container.getValue("/hello_" + std::to_string(i)).data, "world_" + std::to_string(i)); } } @@ -1071,8 +1068,7 @@ TEST(CoordinationTest, TestStorageSnapshotBroken) plain_buf.truncate(34); plain_buf.sync(); - DB::NuKeeperStorage restored_storage(500); - EXPECT_THROW(manager.restoreFromLatestSnapshot(&restored_storage), DB::Exception); + EXPECT_THROW(manager.restoreFromLatestSnapshot(), DB::Exception); } nuraft::ptr getBufferFromZKRequest(int64_t session_id, const Coordination::ZooKeeperRequestPtr & request) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj index e852c7c4720..9ef3ab4ca2d 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -69,7 +69,7 @@ (info node "tearing down clickhouse") (cu/stop-daemon! 
(str binary-path "/clickhouse") pidfile) (c/su - (c/exec :rm :-f (str binary-path "/clickhouse")) + ;(c/exec :rm :-f (str binary-path "/clickhouse")) (c/exec :rm :-rf dir) (c/exec :rm :-rf logdir) (c/exec :rm :-rf "/etc/clickhouse-server"))) @@ -131,7 +131,7 @@ opts {:name (str "clickhouse-keeper quorum=" quorum " " (name (:workload opts)) " " (name (:nemesis opts))) :os ubuntu/os - :db (db "rbtorrent:af3f7a797953f7f359bd3550fe3fd4a68fd27345") + :db (db "rbtorrent:71c60699aa56568ded73c4a48cecd2fd5e0956cb") :pure-generators true :client (:client workload) :nemesis (:nemesis current-nemesis) From 58eac8a8b4d15699e2bc8d6784d66076fcb4c2d1 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 19 Mar 2021 12:40:59 +0300 Subject: [PATCH 044/155] Add non-symmetric network partitioners --- .../src/jepsen/nukeeper/nemesis.clj | 35 ++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj index 59f3cb52dae..9e5841ad8e4 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj @@ -93,6 +93,33 @@ (corruptor-nemesis coordinationdir (fn [path] (c/exec :rm :-fr path)))) +(defn partition-bridge-nemesis + [] + (nemesis/partitioner nemesis/bridge)) + +(defn blind-node + [nodes] + (let [[[victim] others] (nemesis/split-one nodes)] + {victim (into #{} others)})) + + +(defn blind-node-partition-nemesis + [] + (nemesis/partitioner blind-node)) + +(defn blind-others + [nodes] + (let [[[victim] others] (nemesis/split-one nodes)] + (into {} (map (fn [node] [node #{victim}])) others))) + +(defn blind-others-partition-nemesis + [] + (nemesis/partitioner blind-others)) + +(defn network-non-symmetric-nemesis + [] + (nemesis/partitioner nemesis/bridge)) + (defn start-stop-generator [time-corrupt time-ok] (->> @@ -125,4 +152,10 @@ "logs-and-snapshots-corruptor" {:nemesis (logs-and-snapshots-corruption-nemesis) :generator (corruption-generator)} "drop-data-corruptor" {:nemesis (drop-all-corruption-nemesis) - :generator (corruption-generator)}}) + :generator (corruption-generator)} + "bridge-partitioner" {:nemesis (partition-bridge-nemesis) + :generator (start-stop-generator 5 5)} + "blind-node-partitioner" {:nemesis (blind-node-partition-nemesis) + :generator (start-stop-generator 5 5)} + "blind-others-partitioner" {:nemesis (blind-others-partition-nemesis) + :generator (start-stop-generator 5 5)}}) From 260a978636cc4273a49739b8a786a0665652706b Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 19 Mar 2021 13:46:14 +0300 Subject: [PATCH 045/155] Check linearizeability for queue workload --- .../src/jepsen/nukeeper/main.clj | 3 ++- .../src/jepsen/nukeeper/queue.clj | 20 +++++++++++++++++-- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj index 9ef3ab4ca2d..0d93368595b 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -88,7 +88,8 @@ "register" register/workload "unique-ids" unique/workload "counter" counter/workload - "queue" queue/workload}) + "total-queue" queue/total-workload + "linear-queue" queue/linear-workload}) (def cli-opts "Additional command line options." 
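As an aside on the partitioners introduced in PATCH 044 above: their effect is easiest to see on a concrete node list. A hypothetical REPL sketch (assuming jepsen.nemesis grudge-map semantics, where each key node drops traffic coming from the nodes in its value set; split-one picks the victim at random, so n1 below is only illustrative):

    user=> (blind-node ["n1" "n2" "n3"])
    {"n1" #{"n2" "n3"}}              ; the victim stops hearing everyone else

    user=> (blind-others ["n1" "n2" "n3"])
    {"n2" #{"n1"}, "n3" #{"n1"}}     ; everyone else stops hearing the victim

Both cuts are one-directional, which is the "non-symmetric" part of the commit title; the bridge partitioner, by contrast, reuses the stock nemesis/bridge topology.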
diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj index f6f7abb51b6..fa6b96944b2 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj @@ -5,6 +5,8 @@ [checker :as checker] [client :as client] [generator :as gen]] + [knossos.model :as model] + [jepsen.checker.timeline :as timeline] [jepsen.nukeeper.utils :refer :all] [zookeeper :as zk]) (:import (org.apache.zookeeper ZooKeeper KeeperException KeeperException$BadVersionException))) @@ -55,13 +57,27 @@ [n] (sort (map (fn [v] (str v)) (take n (range))))) -(defn workload +(defn total-workload "A generator, client, and checker for a set test." [opts] {:client (QueueClient. nil nil) - :checker (checker/total-queue) + :checker (checker/compose + {:total-queue (checker/total-queue) + :timeline (timeline/html)}) :generator (->> (sorted-str-range 10000) (map (fn [x] (rand-nth [{:type :invoke, :f :enqueue :value x} {:type :invoke, :f :dequeue}])))) :final-generator (gen/once {:type :invoke, :f :drain, :value nil})}) + +(defn linear-workload + [opts] + {:client (QueueClient. nil nil) + :checker (checker/compose + {:linear (checker/linearizable {:model (model/unordered-queue) + :algorithm :linear}) + :timeline (timeline/html)}) + :generator (->> (sorted-str-range 10000) + (map (fn [x] + (rand-nth [{:type :invoke, :f :enqueue :value x} + {:type :invoke, :f :dequeue}]))))}) From 2c00b48f858763bc0efef83489b12f3dea8f9841 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 19 Mar 2021 15:10:18 +0300 Subject: [PATCH 046/155] Add an ability to run N random tests --- .../src/jepsen/nukeeper/main.clj | 27 +++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj index 0d93368595b..86297473180 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -1,6 +1,7 @@ (ns jepsen.nukeeper.main (:require [clojure.tools.logging :refer :all] [jepsen.nukeeper.utils :refer :all] + [clojure.pprint :refer [pprint]] [jepsen.nukeeper.set :as set] [jepsen.nukeeper.nemesis :as custom-nemesis] [jepsen.nukeeper.register :as register] @@ -94,10 +95,10 @@ (def cli-opts "Additional command line options." [["-w" "--workload NAME" "What workload should we run?" - :missing (str "--workload " (cli/one-of workloads)) + :default "set" :validate [workloads (cli/one-of workloads)]] [nil "--nemesis NAME" "Which nemesis will poison our lives?" - :missing (str "--nemesis " (cli/one-of custom-nemesis/custom-nemesises)) + :default "random-node-killer" :validate [custom-nemesis/custom-nemesises (cli/one-of custom-nemesis/custom-nemesises)]] ["-q" "--quorum" "Use quorum reads, instead of reading from any primary."] ["-r" "--rate HZ" "Approximate number of requests per second, per thread." @@ -125,6 +126,7 @@ "Given an options map from the command line runner (e.g. :nodes, :ssh, :concurrency, ...), constructs a test map." 
[opts] + (info "Test opts\n" (with-out-str (pprint opts))) (let [quorum (boolean (:quorum opts)) workload ((get workloads (:workload opts)) opts) current-nemesis (get custom-nemesis/custom-nemesises (:nemesis opts))] @@ -150,11 +152,32 @@ (gen/sleep 10) (gen/clients (:final-generator workload)))}))) +(def all-nemesises (keys custom-nemesis/custom-nemesises)) + +(def all-workloads (keys workloads)) + +(defn all-test-options + "Takes base cli options, a collection of nemeses, workloads, and a test count, + and constructs a sequence of test options." + [cli nemeses workloads] + (take (:test-count cli) (shuffle (for [n nemeses, w workloads] + (assoc cli + :nemesis n + :workload w + :test-count 1))))) + +(defn all-tests + "Turns CLI options into a sequence of tests." + [test-fn cli] + (map test-fn (all-test-options cli all-nemesises all-workloads))) + (defn -main "Handles command line arguments. Can either run a test, or a web server for browsing results." [& args] (cli/run! (merge (cli/single-test-cmd {:test-fn nukeeper-test :opt-spec cli-opts}) + (cli/test-all-cmd {:tests-fn (partial all-tests nukeeper-test) + :opt-spec cli-opts}) (cli/serve-cmd)) args)) From 95cf05b0ad346b18f10b426723b58269e038c226 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 19 Mar 2021 15:25:44 +0300 Subject: [PATCH 047/155] Fix style and add sync --- .../jepsen.nukeeper/src/jepsen/nukeeper/counter.clj | 8 +++++--- tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj | 8 ++++---- .../jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj | 5 ++--- tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj | 12 +++++++----- tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj | 8 +++++--- tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj | 5 +++++ 6 files changed, 28 insertions(+), 18 deletions(-) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj index 48b270517a4..6f0cee113c6 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj @@ -25,9 +25,11 @@ (invoke! [this test op] (case (:f op) :read (try - (assoc op - :type :ok - :value (count (zk-list conn "/"))) + (do + (zk-sync conn) + (assoc op + :type :ok + :value (count (zk-list conn "/")))) (catch Exception _ (assoc op :type :fail, :error :connect-error))) :add (try (do diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj index 86297473180..feca05d8190 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -161,10 +161,10 @@ and constructs a sequence of test options." [cli nemeses workloads] (take (:test-count cli) (shuffle (for [n nemeses, w workloads] - (assoc cli - :nemesis n - :workload w - :test-count 1))))) + (assoc cli + :nemesis n + :workload w + :test-count 1))))) (defn all-tests "Turns CLI options into a sequence of tests." 
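With test-all-cmd wired into -main above, the whole workload × nemesis matrix can be driven from one command. A hypothetical invocation (flag names assumed from the stock jepsen.cli option set plus the cli-opts defined above):

    lein run test-all --nodes-file nodes.txt --test-count 10 --time-limit 300

all-test-options shuffles every workload/nemesis pair and keeps :test-count of them, forcing :test-count 1 on each generated test so that the shuffle alone decides the mix.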
diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj index 9e5841ad8e4..ec39c2b3e35 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj @@ -102,7 +102,6 @@ (let [[[victim] others] (nemesis/split-one nodes)] {victim (into #{} others)})) - (defn blind-node-partition-nemesis [] (nemesis/partitioner blind-node)) @@ -156,6 +155,6 @@ "bridge-partitioner" {:nemesis (partition-bridge-nemesis) :generator (start-stop-generator 5 5)} "blind-node-partitioner" {:nemesis (blind-node-partition-nemesis) - :generator (start-stop-generator 5 5)} + :generator (start-stop-generator 5 5)} "blind-others-partitioner" {:nemesis (blind-others-partition-nemesis) - :generator (start-stop-generator 5 5)}}) + :generator (start-stop-generator 5 5)}}) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj index fa6b96944b2..323d74acd67 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj @@ -41,11 +41,13 @@ (catch Exception _ (assoc op :type :info, :error :connect-error))) :drain (try - (loop [result '()] - (let [deleted-child (zk-multi-delete-first-child conn "/")] - (if (not (nil? deleted-child)) - (recur (concat result [deleted-child])) - (assoc op :type :ok :value result)))) + (do + (zk-sync conn) + (loop [result '()] + (let [deleted-child (zk-multi-delete-first-child conn "/")] + (if (not (nil? deleted-child)) + (recur (concat result [deleted-child])) + (assoc op :type :ok :value result))))) (catch Exception _ (assoc op :type :info, :error :connect-error))))) (teardown! [_ test]) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj index 3213042a3cc..23461591eaf 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj @@ -22,9 +22,11 @@ (invoke! [this test op] (case (:f op) - :read (assoc op - :type :ok - :value (read-string (:data (zk-get-str conn k)))) + :read (do + (zk-sync conn) + (assoc op + :type :ok + :value (read-string (:data (zk-get-str conn k))))) :add (try (do (zk-add-to-set conn k (:value op)) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj index fd2b2b5acb3..c7e46a75d5f 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj @@ -93,6 +93,11 @@ CreateMode/PERSISTENT_SEQUENTIAL) (recur (inc i))))))) +; sync call not implemented in zookeeper-clj and don't have sync version in java API +(defn zk-sync + [conn] + (zk-set conn "/" "" -1)) + (defn zk-parent-path [path] (let [rslash_pos (str/last-index-of path "/")] From 0bf897993236d68d979a37d59d2d1fa81c6ef394 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 19 Mar 2021 15:27:49 +0300 Subject: [PATCH 048/155] Remove redundant code from counter --- tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj index 6f0cee113c6..48b270517a4 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj @@ -25,11 +25,9 @@ (invoke! 
[this test op] (case (:f op) :read (try - (do - (zk-sync conn) - (assoc op - :type :ok - :value (count (zk-list conn "/")))) + (assoc op + :type :ok + :value (count (zk-list conn "/"))) (catch Exception _ (assoc op :type :fail, :error :connect-error))) :add (try (do From 1845df25f316c522717031e3e9866b9368ff6ba1 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Fri, 19 Mar 2021 17:02:48 +0300 Subject: [PATCH 049/155] fix possibly dangling reference to Context --- src/Interpreters/InterpreterCreateQuery.cpp | 8 ++++++-- src/Storages/StorageFactory.cpp | 1 + src/Storages/StorageURL.cpp | 9 ++++----- src/Storages/StorageURL.h | 1 - .../integration/test_odbc_interaction/test.py | 18 ++++++++++++++++++ 5 files changed, 29 insertions(+), 8 deletions(-) diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index d1af86e7b11..24cb8608ab3 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -260,7 +260,8 @@ BlockIO InterpreterCreateQuery::createDatabase(ASTCreateQuery & create) renamed = true; } - database->loadStoredObjects(context, has_force_restore_data_flag, create.attach && force_attach); + /// We use global context here, because storages lifetime is bigger than query context lifetime + database->loadStoredObjects(context.getGlobalContext(), has_force_restore_data_flag, create.attach && force_attach); } catch (...) { @@ -970,7 +971,10 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, if (create.as_table_function) { const auto & factory = TableFunctionFactory::instance(); - res = factory.get(create.as_table_function, context)->execute(create.as_table_function, context, create.table, properties.columns); + /// We should use global context here because there will be no query context on server startup + /// and because storage lifetime is bigger than query context lifetime. 
+ auto table_func = factory.get(create.as_table_function, context.getGlobalContext()); + res = table_func->execute(create.as_table_function, context.getGlobalContext(), create.table, properties.columns); res->renameInMemory({create.database, create.table, create.uuid}); } else diff --git a/src/Storages/StorageFactory.cpp b/src/Storages/StorageFactory.cpp index 85f3bea9e0c..7aaec9b7e76 100644 --- a/src/Storages/StorageFactory.cpp +++ b/src/Storages/StorageFactory.cpp @@ -179,6 +179,7 @@ StoragePtr StorageFactory::get( .attach = query.attach, .has_force_restore_data_flag = has_force_restore_data_flag }; + assert(&arguments.context == &arguments.context.getGlobalContext()); auto res = storages.at(name).creator_fn(arguments); if (!empty_engine_args.empty()) diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index ca984f9ece9..b59f4b4a02a 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -42,12 +42,11 @@ IStorageURLBase::IStorageURLBase( const String & compression_method_) : IStorage(table_id_) , uri(uri_) - , context_global(context_) , compression_method(compression_method_) , format_name(format_name_) , format_settings(format_settings_) { - context_global.getRemoteHostFilter().checkURL(uri); + context_.getRemoteHostFilter().checkURL(uri); StorageInMemoryMetadata storage_metadata; storage_metadata.setColumns(columns_); @@ -237,11 +236,11 @@ Pipe IStorageURLBase::read( chooseCompressionMethod(request_uri.getPath(), compression_method))); } -BlockOutputStreamPtr IStorageURLBase::write(const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, const Context & /*context*/) +BlockOutputStreamPtr IStorageURLBase::write(const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, const Context & context) { return std::make_shared(uri, format_name, - format_settings, metadata_snapshot->getSampleBlock(), context_global, - ConnectionTimeouts::getHTTPTimeouts(context_global), + format_settings, metadata_snapshot->getSampleBlock(), context, + ConnectionTimeouts::getHTTPTimeouts(context), chooseCompressionMethod(uri.toString(), compression_method)); } diff --git a/src/Storages/StorageURL.h b/src/Storages/StorageURL.h index 21b2e3e27a1..0ea86980b8c 100644 --- a/src/Storages/StorageURL.h +++ b/src/Storages/StorageURL.h @@ -45,7 +45,6 @@ protected: const String & compression_method_); Poco::URI uri; - const Context & context_global; String compression_method; String format_name; // For URL engine, we use format settings from server context + `SETTINGS` diff --git a/tests/integration/test_odbc_interaction/test.py b/tests/integration/test_odbc_interaction/test.py index 6bb6a6ee777..6232168f2e6 100644 --- a/tests/integration/test_odbc_interaction/test.py +++ b/tests/integration/test_odbc_interaction/test.py @@ -74,6 +74,9 @@ def started_cluster(): node1.exec_in_container( ["bash", "-c", "echo 'CREATE TABLE t4(X INTEGER PRIMARY KEY ASC, Y, Z);' | sqlite3 {}".format(sqlite_db)], privileged=True, user='root') + node1.exec_in_container( + ["bash", "-c", "echo 'CREATE TABLE tf1(x INTEGER PRIMARY KEY ASC, y, z);' | sqlite3 {}".format(sqlite_db)], + privileged=True, user='root') print("sqlite tables created") mysql_conn = get_mysql_conn() print("mysql connection received") @@ -177,6 +180,21 @@ def test_sqlite_simple_select_function_works(started_cluster): assert node1.query( "select count(), sum(x) from odbc('DSN={}', '{}') group by x".format(sqlite_setup["DSN"], 't1')) == "1\t1\n" +def test_sqlite_table_function(started_cluster): + sqlite_setup = 
node1.odbc_drivers["SQLite3"] + sqlite_db = sqlite_setup["Database"] + + node1.exec_in_container(["bash", "-c", "echo 'INSERT INTO tf1 values(1, 2, 3);' | sqlite3 {}".format(sqlite_db)], + privileged=True, user='root') + node1.query("create table odbc_tf as odbc('DSN={}', '{}')".format(sqlite_setup["DSN"], 'tf1')) + assert node1.query("select * from odbc_tf") == "1\t2\t3\n" + + assert node1.query("select y from odbc_tf") == "2\n" + assert node1.query("select z from odbc_tf") == "3\n" + assert node1.query("select x from odbc_tf") == "1\n" + assert node1.query("select x, y from odbc_tf") == "1\t2\n" + assert node1.query("select z, x, y from odbc_tf") == "3\t1\t2\n" + assert node1.query("select count(), sum(x) from odbc_tf group by x") == "1\t1\n" def test_sqlite_simple_select_storage_works(started_cluster): sqlite_setup = node1.odbc_drivers["SQLite3"] From 3166f0cbfcba2bec42c4e0adaf94fe949f9fc41d Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov Date: Fri, 19 Mar 2021 20:18:17 +0300 Subject: [PATCH 050/155] cleanup --- src/Parsers/ExpressionElementParsers.cpp | 4 ++-- src/Processors/Transforms/WindowTransform.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Parsers/ExpressionElementParsers.cpp b/src/Parsers/ExpressionElementParsers.cpp index 39e3a0af5b7..913813d5486 100644 --- a/src/Parsers/ExpressionElementParsers.cpp +++ b/src/Parsers/ExpressionElementParsers.cpp @@ -588,7 +588,7 @@ static bool tryParseFrameDefinition(ASTWindowDefinition * node, IParser::Pos & p && value.get() >= 0))) { throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Frame offset for '{}' frame must be a nonnegative integer, '{}' of type '{}' given.", + "Frame offset for '{}' frame must be a nonnegative integer, '{}' of type '{}' given.", WindowFrame::toString(node->frame.type), applyVisitor(FieldVisitorToString(), value), Field::Types::toString(value.getType())); @@ -649,7 +649,7 @@ static bool tryParseFrameDefinition(ASTWindowDefinition * node, IParser::Pos & p && value.get() >= 0))) { throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Frame offset for '{}' frame must be a nonnegative integer, '{}' of type '{}' given.", + "Frame offset for '{}' frame must be a nonnegative integer, '{}' of type '{}' given.", WindowFrame::toString(node->frame.type), applyVisitor(FieldVisitorToString(), value), Field::Types::toString(value.getType())); diff --git a/src/Processors/Transforms/WindowTransform.cpp b/src/Processors/Transforms/WindowTransform.cpp index c562fb4ec2c..3a97698453a 100644 --- a/src/Processors/Transforms/WindowTransform.cpp +++ b/src/Processors/Transforms/WindowTransform.cpp @@ -1463,9 +1463,9 @@ struct WindowFunctionLagLeadInFrame final : public WindowFunction void windowInsertResultInto(const WindowTransform * transform, size_t function_index) override { - auto & current_block = transform->blockAt(transform->current_row); + const auto & current_block = transform->blockAt(transform->current_row); IColumn & to = *current_block.output_columns[function_index]; - auto & workspace = transform->workspaces[function_index]; + const auto & workspace = transform->workspaces[function_index]; int offset = 1; if (argument_types.size() > 1) From 3159b9dacf545134b1f85829d059123d5d71474a Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 19 Mar 2021 21:53:09 +0300 Subject: [PATCH 051/155] Disable zookeeper logger Better --- tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj | 6 +++++- tests/jepsen.nukeeper/test/jepsen/nukeeper_test.clj | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git 
a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj index feca05d8190..b7f2bb0b98b 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -27,7 +27,9 @@ [clojure.java.io :as io] [zookeeper.data :as data] [zookeeper :as zk]) - (:import (org.apache.zookeeper ZooKeeper KeeperException KeeperException$BadVersionException))) + (:import (org.apache.zookeeper ZooKeeper KeeperException KeeperException$BadVersionException) + (ch.qos.logback.classic Level) + (org.slf4j Logger LoggerFactory))) (defn cluster-config [test node config-template] @@ -175,6 +177,8 @@ "Handles command line arguments. Can either run a test, or a web server for browsing results." [& args] + (.setLevel + (LoggerFactory/getLogger "org.apache.zookeeper") Level/OFF) (cli/run! (merge (cli/single-test-cmd {:test-fn nukeeper-test :opt-spec cli-opts}) (cli/test-all-cmd {:tests-fn (partial all-tests nukeeper-test) diff --git a/tests/jepsen.nukeeper/test/jepsen/nukeeper_test.clj b/tests/jepsen.nukeeper/test/jepsen/nukeeper_test.clj index 1981e01ebcb..db84ff33ee3 100644 --- a/tests/jepsen.nukeeper/test/jepsen/nukeeper_test.clj +++ b/tests/jepsen.nukeeper/test/jepsen/nukeeper_test.clj @@ -2,7 +2,9 @@ (:require [clojure.test :refer :all] [jepsen.nukeeper.utils :refer :all] [zookeeper :as zk] - [zookeeper.data :as data])) + [zookeeper.data :as data]) + (:import (ch.qos.logback.classic Level) + (org.slf4j Logger LoggerFactory))) (defn multicreate [conn] @@ -14,6 +16,8 @@ (deftest a-test (testing "nukeeper connection" + (.setLevel + (LoggerFactory/getLogger "org.apache.zookeeper") Level/OFF) (let [conn (zk/connect "localhost:9181" :timeout-msec 5000)] ;(println (take 10 (zk-range))) ;(multidelete conn) From 957c053f7e3604f6366e0e569fb1bdfdf8fcd8cb Mon Sep 17 00:00:00 2001 From: Ivan Lezhankin Date: Fri, 19 Mar 2021 23:29:01 +0300 Subject: [PATCH 052/155] Fix segfault --- src/IO/PeekableReadBuffer.cpp | 29 ++++++++++++++++++++----- src/IO/PeekableReadBuffer.h | 5 +---- src/Interpreters/InterserverIOHandler.h | 2 -- src/Server/HTTP/HTMLForm.cpp | 5 +++++ 4 files changed, 29 insertions(+), 12 deletions(-) diff --git a/src/IO/PeekableReadBuffer.cpp b/src/IO/PeekableReadBuffer.cpp index 1d999d586b2..551f87d7363 100644 --- a/src/IO/PeekableReadBuffer.cpp +++ b/src/IO/PeekableReadBuffer.cpp @@ -82,6 +82,7 @@ bool PeekableReadBuffer::peekNext() checkpoint.emplace(memory.data()); checkpoint_in_own_memory = true; } + if (currentlyReadFromOwnMemory()) { /// Update buffer size @@ -99,7 +100,6 @@ bool PeekableReadBuffer::peekNext() pos_offset = 0; } BufferBase::set(memory.data(), peeked_size + bytes_to_copy, pos_offset); - } peeked_size += bytes_to_copy; @@ -113,12 +113,21 @@ void PeekableReadBuffer::rollbackToCheckpoint(bool drop) { checkStateCorrect(); - if (!checkpoint) - throw DB::Exception("There is no checkpoint", ErrorCodes::LOGICAL_ERROR); - else if (checkpointInOwnMemory() == currentlyReadFromOwnMemory()) + assert(checkpoint); + + if (checkpointInOwnMemory() == currentlyReadFromOwnMemory()) + { + /// Both checkpoint and position are in the same buffer. pos = *checkpoint; - else /// Checkpoint is in own memory and pos is not. Switch to reading from own memory + } + else + { + /// Checkpoint is in own memory and position is not. + assert(checkpointInOwnMemory()); + + /// Switch to reading from own memory. 
BufferBase::set(memory.data(), peeked_size, *checkpoint - memory.data());
+    }
 
     if (drop)
         dropCheckpoint();
@@ -134,10 +143,11 @@ bool PeekableReadBuffer::nextImpl()
     checkStateCorrect();
 
     bool res;
+    bool checkpoint_at_end = checkpoint && *checkpoint == working_buffer.end();
 
     if (checkpoint)
     {
-        if (currentlyReadFromOwnMemory())
+        if (currentlyReadFromOwnMemory() || checkpoint_at_end)
             res = sub_buf.hasPendingData() || sub_buf.next();
         else
             res = peekNext();
@@ -163,6 +173,13 @@ bool PeekableReadBuffer::nextImpl()
     BufferBase::set(sub_working.begin(), sub_working.size(), sub_buf.offset());
     nextimpl_working_buffer_offset = sub_buf.offset();
 
+    if (checkpoint_at_end)
+    {
+        checkpoint.emplace(working_buffer.begin());
+        peeked_size = 0;
+        checkpoint_in_own_memory = false;
+    }
+
     checkStateCorrect();
     return res;
 }
diff --git a/src/IO/PeekableReadBuffer.h b/src/IO/PeekableReadBuffer.h
index 4f6e669b31d..4515c6f8ce5 100644
--- a/src/IO/PeekableReadBuffer.h
+++ b/src/IO/PeekableReadBuffer.h
@@ -43,10 +43,7 @@ public:
     /// Forget checkpoint and all data between checkpoint and position
     ALWAYS_INLINE inline void dropCheckpoint()
     {
-#ifndef NDEBUG
-        if (!checkpoint)
-            throw DB::Exception("There is no checkpoint", ErrorCodes::LOGICAL_ERROR);
-#endif
+        assert(checkpoint);
         if (!currentlyReadFromOwnMemory())
         {
             /// Don't need to store unread data anymore
diff --git a/src/Interpreters/InterserverIOHandler.h b/src/Interpreters/InterserverIOHandler.h
index b4768c30f32..b0c95ed3835 100644
--- a/src/Interpreters/InterserverIOHandler.h
+++ b/src/Interpreters/InterserverIOHandler.h
@@ -9,8 +9,6 @@
 #include 
 #include 
 
-#include 
-
 #include 
 #include 
 #include 
diff --git a/src/Server/HTTP/HTMLForm.cpp b/src/Server/HTTP/HTMLForm.cpp
index ca407858c33..a00950c8e27 100644
--- a/src/Server/HTTP/HTMLForm.cpp
+++ b/src/Server/HTTP/HTMLForm.cpp
@@ -369,6 +369,11 @@ bool HTMLForm::MultipartReadBuffer::nextImpl()
     else
         boundary_hit = startsWith(line, boundary);
 
+    if (!line.empty())
+        /// If we don't make sure that the memory is contiguous, part of the line may end up in internal memory
+        /// while the other part is in the sub-buffer, and then we would be unable to set up our working buffer properly.
+        in.makeContinuousMemoryFromCheckpointToPos();
+
     in.rollbackToCheckpoint(true);
 
     /// Rolling back to checkpoint may change underlying buffers.
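For readers who have not used PeekableReadBuffer before, here is a minimal sketch of the checkpoint contract that the HTMLForm change above depends on (illustrative only, not part of the patch; the member functions are those of src/IO/PeekableReadBuffer.h):

    /// Peek one line, then roll back as if nothing had been read.
    static void peekLine(DB::PeekableReadBuffer & in)
    {
        in.setCheckpoint();

        /// Reading may continue from one memory region into the other, so the
        /// bytes in [checkpoint, pos) are not necessarily contiguous.
        while (!in.eof() && *in.position() != '\n')
            ++in.position();

        /// Glue the span together before inspecting it...
        in.makeContinuousMemoryFromCheckpointToPos();

        /// ...then return to the checkpointed position, dropping the checkpoint.
        in.rollbackToCheckpoint(/* drop = */ true);
    }

MultipartReadBuffer::nextImpl() was missing the makeContinuousMemoryFromCheckpointToPos() step, which is exactly what the hunk above adds.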
From c5f918f198d55150f55b81722993a3bbca06fbdc Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Sun, 21 Mar 2021 19:26:06 +0300 Subject: [PATCH 053/155] CacheDictionaryStorage insert into default value fix --- src/Dictionaries/CacheDictionaryStorage.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/Dictionaries/CacheDictionaryStorage.h b/src/Dictionaries/CacheDictionaryStorage.h index 874796d879b..7694176d4aa 100644 --- a/src/Dictionaries/CacheDictionaryStorage.h +++ b/src/Dictionaries/CacheDictionaryStorage.h @@ -276,10 +276,11 @@ private: } else { + auto & data = column_typed.getData(); + for (size_t fetched_key_index = 0; fetched_key_index < fetched_columns_index; ++fetched_key_index) { auto fetched_key = fetched_keys[fetched_key_index]; - auto & data = column_typed.getData(); if (unlikely(fetched_key.is_default)) column_typed.insert(default_value_provider.getDefaultValue(fetched_key_index)); @@ -312,6 +313,7 @@ private: size_t cell_index = getCellIndexForInsert(key); auto & cell = cells[cell_index]; + bool cell_was_default = cell.is_default; cell.is_default = false; bool was_inserted = cell.deadline == 0; @@ -387,9 +389,11 @@ private: StringRef string_ref_value = StringRef {string_value.data(), string_value.size()}; StringRef inserted_value = copyStringInArena(string_ref_value); - StringRef previous_value = container[index_to_use]; - char * data = const_cast(previous_value.data); - arena.free(data, previous_value.size); + if (!cell_was_default) + { + StringRef previous_value = container[index_to_use]; + arena.free(const_cast(previous_value.data), previous_value.size); + } container[index_to_use] = inserted_value; } From 5ec7dbbdad1aa37d626bcc57ba5d8b324849650d Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 22 Mar 2021 13:06:09 +0300 Subject: [PATCH 054/155] Add lightweight run and fix queue workload --- .../resources/test_keeper_config.xml | 2 +- .../src/jepsen/nukeeper/main.clj | 36 ++++++++++++++----- .../src/jepsen/nukeeper/queue.clj | 10 ++---- .../src/jepsen/nukeeper/utils.clj | 20 +++++++---- 4 files changed, 46 insertions(+), 22 deletions(-) diff --git a/tests/jepsen.nukeeper/resources/test_keeper_config.xml b/tests/jepsen.nukeeper/resources/test_keeper_config.xml index 7ef34d4bea1..c69fb0f228c 100644 --- a/tests/jepsen.nukeeper/resources/test_keeper_config.xml +++ b/tests/jepsen.nukeeper/resources/test_keeper_config.xml @@ -7,7 +7,7 @@ 10000 30000 false - 60000 + 120000 trace {quorum_reads} {snapshot_distance} diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj index b7f2bb0b98b..4e7c16930d4 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -122,7 +122,8 @@ [nil "--ops-per-key NUM" "Maximum number of operations on any given key." :default 100 :parse-fn parse-long - :validate [pos? "Must be a positive integer."]]]) + :validate [pos? "Must be a positive integer."]] + [nil, "--lightweight-run", "Subset of workloads/nemesises which is simple to validate"]]) (defn nukeeper-test "Given an options map from the command line runner (e.g. 
:nodes, :ssh,
   :concurrency, ...), constructs a test map."
   [opts]
@@ -136,7 +137,7 @@
            opts
            {:name (str "clickhouse-keeper quorum=" quorum " " (name (:workload opts)) " " (name (:nemesis opts)))
             :os ubuntu/os
-            :db (db "rbtorrent:71c60699aa56568ded73c4a48cecd2fd5e0956cb")
+            :db (db "rbtorrent:5fecc75309f38e302c95b4a226b2de60dfbb5681")
             :pure-generators true
             :client (:client workload)
             :nemesis (:nemesis current-nemesis)
@@ -158,20 +159,39 @@ (def all-nemesises (keys custom-nemesis/custom-nemesises))
 
 (def all-workloads (keys workloads))
 
+(def lightweight-workloads ["set" "unique-ids" "counter" "total-queue"])
+
+(def useful-nemesises ["random-node-killer"
+                       "simple-partitioner"
+                       "logs-and-snapshots-corruptor"
+                       "drop-data-corruptor"
+                       "bridge-partitioner"
+                       "blind-node-partitioner"
+                       "blind-others-partitioner"])
+
+(defn cart [colls]
+  (if (empty? colls)
+    '(())
+    (for [more (cart (rest colls))
+          x (first colls)]
+      (cons x more))))
+
 (defn all-test-options
   "Takes base cli options, a collection of nemeses, workloads, and a test count,
   and constructs a sequence of test options."
-  [cli nemeses workloads]
-  (take (:test-count cli) (shuffle (for [n nemeses, w workloads]
+  [cli workload-nemeses-collection]
+  (take (:test-count cli)
+        (shuffle (for [[workload nemesis] workload-nemeses-collection]
                   (assoc cli
-                         :nemesis n
-                         :workload w
+                         :nemesis nemesis
+                         :workload workload
                          :test-count 1)))))
-
 (defn all-tests
   "Turns CLI options into a sequence of tests."
   [test-fn cli]
-  (map test-fn (all-test-options cli all-nemesises all-workloads)))
+  (if (boolean (:lightweight-run cli))
+    (map test-fn (all-test-options cli (cart [all-workloads all-nemesises])))
+    (map test-fn (all-test-options cli (cart [lightweight-workloads useful-nemesises]))))
 
 (defn -main
diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj
index 323d74acd67..951c0822ad2 100644
--- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj
+++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj
@@ -37,17 +37,13 @@
         (if (not (nil? result))
           (assoc op :type :ok :value result)
           (assoc op :type :fail :value result)))
-      (catch KeeperException$BadVersionException _ (assoc op :type :fail, :error :bad-version))
      (catch Exception _ (assoc op :type :info, :error :connect-error)))
    :drain
+    ; draining via delete takes too long, just list all nodes
    (try
      (do
        (zk-sync conn)
-        (loop [result '()]
-          (let [deleted-child (zk-multi-delete-first-child conn "/")]
-            (if (not (nil? deleted-child))
-              (recur (concat result [deleted-child]))
-              (assoc op :type :ok :value result)))))
+        (assoc op :type :ok :value (into #{} (map #(str %1) (zk-list conn "/")))))
      (catch Exception _ (assoc op :type :info, :error :connect-error)))))

  (teardown! 
[_ test])
@@ -66,7 +62,7 @@
    :checker (checker/compose
              {:total-queue (checker/total-queue)
               :timeline (timeline/html)})
-   :generator (->> (sorted-str-range 10000)
+   :generator (->> (sorted-str-range 50000)
                    (map (fn [x]
                           (rand-nth [{:type :invoke, :f :enqueue :value x}
                                      {:type :invoke, :f :dequeue}]))))
diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj
index c7e46a75d5f..fe415ff9e51 100644
--- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj
+++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj
@@ -9,7 +9,8 @@
   [clojure.tools.logging :refer :all])
  (:import (org.apache.zookeeper.data Stat)
           (org.apache.zookeeper CreateMode
-                                ZooKeeper)))
+                                ZooKeeper)
+          (org.apache.zookeeper ZooKeeper KeeperException KeeperException$BadVersionException)))
 
@@ -111,11 +112,18 @@
         txn (.transaction conn)
         first-child (first (sort children))]
     (if (not (nil? first-child))
-      (do (.check txn path (:version stat))
-          (.setData txn path (data/to-bytes "") -1) ; I'm just checking multitransactions
-          (.delete txn (str path first-child) -1)
-          (.commit txn)
-          first-child)
+      (try
+        (do (.check txn path (:version stat))
+            (.setData txn path (data/to-bytes "") -1) ; I'm just checking multitransactions
+            (.delete txn (str path first-child) -1)
+            (.commit txn)
+            first-child)
+        (catch KeeperException$BadVersionException _ nil)
+        ; Even if we get a connection loss, the delete may actually have been executed.
+        ; This function is used for the queue model, which strictly requires
+        ; all enqueued elements to be dequeued, but allows duplicates.
+        ; So even in the case when we are not sure about the delete, we return first-child.
+        (catch Exception _ first-child))
      nil)))
 
 (defn clickhouse-alive?
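The connection-loss comment above makes more sense next to how Jepsen's total-queue checker scores histories. A hypothetical fragment (checker semantics assumed from jepsen.checker/total-queue, which tolerates duplicate dequeues but flags lost elements):

    ;; Not part of the patch: an enqueue succeeds, then a dequeue times out.
    [{:process 0, :type :invoke, :f :enqueue, :value "5"}
     {:process 0, :type :ok,     :f :enqueue, :value "5"}
     {:process 1, :type :invoke, :f :dequeue}
     {:process 1, :type :info,   :f :dequeue, :error :connect-error}]

If the multi-transaction actually deleted the child but the client reported nil, "5" would never appear as dequeued or drained and would be counted :lost; returning first-child on an uncertain delete can at worst produce a duplicate, which the model permits.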
From 043b3cc7b589ec29ff03f3c3e005fd6c2718e05c Mon Sep 17 00:00:00 2001
From: alesapin 
Date: Mon, 22 Mar 2021 13:45:22 +0300
Subject: [PATCH 055/155] Fix startup when leadership changed

---
 src/Coordination/CoordinationSettings.h            |  1 +
 src/Coordination/NuKeeperServer.cpp                | 13 +++++++++++++
 tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj |  2 +-
 3 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/src/Coordination/CoordinationSettings.h b/src/Coordination/CoordinationSettings.h
index c816f8089d5..45eb1348ac6 100644
--- a/src/Coordination/CoordinationSettings.h
+++ b/src/Coordination/CoordinationSettings.h
@@ -31,6 +31,7 @@ struct Settings;
     M(UInt64, rotate_log_storage_interval, 10000, "How many records will be stored in one log storage file", 0) \
     M(UInt64, snapshots_to_keep, 3, "How many compressed snapshots to keep on disk", 0) \
     M(UInt64, stale_log_gap, 10000, "When node became stale and should receive snapshots from leader", 0) \
+    M(UInt64, fresh_log_gap, 200, "When node became fresh", 0) \
     M(Bool, quorum_reads, false, "Execute read requests as writes through whole RAFT consesus with similar speed", 0) \
     M(Bool, force_sync, true, " Call fsync on each change in RAFT changelog", 0)
 
diff --git a/src/Coordination/NuKeeperServer.cpp b/src/Coordination/NuKeeperServer.cpp
index 2081c969523..bfff7bf8f69 100644
--- a/src/Coordination/NuKeeperServer.cpp
+++ b/src/Coordination/NuKeeperServer.cpp
@@ -61,6 +61,7 @@ void NuKeeperServer::startup()
     params.reserved_log_items_ = coordination_settings->reserved_log_items;
     params.snapshot_distance_ = coordination_settings->snapshot_distance;
     params.stale_log_gap_ = coordination_settings->stale_log_gap;
+    params.fresh_log_gap_ = coordination_settings->fresh_log_gap;
     params.client_req_timeout_ = coordination_settings->operation_timeout_ms.totalMilliseconds();
     params.auto_forwarding_ = coordination_settings->auto_forwarding;
     params.auto_forwarding_req_timeout_ = coordination_settings->operation_timeout_ms.totalMilliseconds() * 2;
@@ -202,6 +203,18 @@ nuraft::cb_func::ReturnCode NuKeeperServer::callbackFunc(nuraft::cb_func::Type t
             set_initialized();
             return nuraft::cb_func::ReturnCode::Ok;
         }
+        case nuraft::cb_func::BecomeFollower:
+        {
+            auto leader_index = raft_instance->get_leader_committed_log_idx();
+            auto our_index = raft_instance->get_committed_log_idx();
+            /// This may happen when we start the RAFT cluster from scratch.
+            /// This node first became the leader, and after that some other node became the leader.
+            /// BecomeFresh for this node will not be called, because it was already fresh
+            /// when it was the leader.
+            if (isLeaderAlive() && leader_index < our_index + coordination_settings->fresh_log_gap)
+                set_initialized();
+            return nuraft::cb_func::ReturnCode::Ok;
+        }
         case nuraft::cb_func::BecomeFresh:
         {
             set_initialized(); /// We are fresh follower, ready to serve requests.
diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj index 4e7c16930d4..5167da96c59 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -137,7 +137,7 @@ opts {:name (str "clickhouse-keeper quorum=" quorum " " (name (:workload opts)) " " (name (:nemesis opts))) :os ubuntu/os - :db (db "rbtorrent:5fecc75309f38e302c95b4a226b2de60dfbb5681") + :db (db "rbtorrent:156b85947eac9c85ef5d0ef15757a9f9e7c9e430") :pure-generators true :client (:client workload) :nemesis (:nemesis current-nemesis) From bf3a4361caaaa60696ed4fcc3a9d9978b4818503 Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 22 Mar 2021 13:49:47 +0300 Subject: [PATCH 056/155] Followup fix --- tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj index 5167da96c59..dfa1cfd913e 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -190,8 +190,8 @@ "Turns CLI options into a sequence of tests." [test-fn cli] (if (boolean (:lightweight-run cli)) - (map test-fn (all-test-options cli (cart [all-workloads all-nemesises]))) - (map test-fn (all-test-options cli (cart [lightweight-workloads useful-nemesises]))))) + (map test-fn (all-test-options cli (cart [lightweight-workloads useful-nemesises]))) + (map test-fn (all-test-options cli (cart [all-workloads all-nemesises]))))) (defn -main "Handles command line arguments. Can either run a test, or a web server for From 56840aba5ae88f7b3e88162ad0daf09afb465c8e Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Mon, 22 Mar 2021 16:02:32 +0300 Subject: [PATCH 057/155] Fixed tests --- src/Dictionaries/CacheDictionaryStorage.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/src/Dictionaries/CacheDictionaryStorage.h b/src/Dictionaries/CacheDictionaryStorage.h index 7694176d4aa..ffab7f1f9cf 100644 --- a/src/Dictionaries/CacheDictionaryStorage.h +++ b/src/Dictionaries/CacheDictionaryStorage.h @@ -421,6 +421,7 @@ private: auto & cell = cells[cell_index]; bool was_inserted = cell.deadline == 0; + bool cell_was_default = cell.is_default; cell.is_default = true; @@ -444,6 +445,23 @@ private: } else { + for (size_t attribute_index = 0; attribute_index < attributes.size(); ++attribute_index) + { + getAttributeContainer(attribute_index, [&](const auto & container) + { + using ElementType = std::decay_t; + + if constexpr (std::is_same_v) + { + if (!cell_was_default) + { + StringRef previous_value = container[cell.element_index]; + arena.free(const_cast(previous_value.data), previous_value.size); + } + } + }); + } + if (cell.key != key) { if constexpr (std::is_same_v) From fc3e11a06844d95e45c8e8eb514157fd95ff8431 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Mon, 22 Mar 2021 17:09:38 +0300 Subject: [PATCH 058/155] fix --- src/Interpreters/InterpreterCreateQuery.cpp | 2 +- src/TableFunctions/ITableFunctionXDBC.cpp | 16 +++++++++++----- src/TableFunctions/ITableFunctionXDBC.h | 4 +++- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 24cb8608ab3..7034e74eaf8 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -971,9 +971,9 @@ bool 
InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, if (create.as_table_function) { const auto & factory = TableFunctionFactory::instance(); + auto table_func = factory.get(create.as_table_function, context); /// We should use global context here because there will be no query context on server startup /// and because storage lifetime is bigger than query context lifetime. - auto table_func = factory.get(create.as_table_function, context.getGlobalContext()); res = table_func->execute(create.as_table_function, context.getGlobalContext(), create.table, properties.columns); res->renameInMemory({create.database, create.table, create.uuid}); } diff --git a/src/TableFunctions/ITableFunctionXDBC.cpp b/src/TableFunctions/ITableFunctionXDBC.cpp index e04a86b5abf..21c78d199db 100644 --- a/src/TableFunctions/ITableFunctionXDBC.cpp +++ b/src/TableFunctions/ITableFunctionXDBC.cpp @@ -55,15 +55,21 @@ void ITableFunctionXDBC::parseArguments(const ASTPtr & ast_function, const Conte connection_string = args[0]->as().value.safeGet(); remote_table_name = args[1]->as().value.safeGet(); } +} - /// Have to const_cast, because bridges store their commands inside context - helper = createBridgeHelper(const_cast(context), context.getSettingsRef().http_receive_timeout.value, connection_string); - helper->startBridgeSync(); +void ITableFunctionXDBC::startBridgeIfNot(const Context & context) const +{ + if (!helper) + { + /// Have to const_cast, because bridges store their commands inside context + helper = createBridgeHelper(const_cast(context), context.getSettingsRef().http_receive_timeout.value, connection_string); + helper->startBridgeSync(); + } } ColumnsDescription ITableFunctionXDBC::getActualTableStructure(const Context & context) const { - assert(helper); + startBridgeIfNot(context); /* Infer external table structure */ Poco::URI columns_info_uri = helper->getColumnsInfoURI(); @@ -87,7 +93,7 @@ ColumnsDescription ITableFunctionXDBC::getActualTableStructure(const Context & c StoragePtr ITableFunctionXDBC::executeImpl(const ASTPtr & /*ast_function*/, const Context & context, const std::string & table_name, ColumnsDescription /*cached_columns*/) const { - assert(helper); + startBridgeIfNot(context); auto columns = getActualTableStructure(context); auto result = std::make_shared(StorageID(getDatabaseName(), table_name), schema_name, remote_table_name, columns, context, helper); result->startup(); diff --git a/src/TableFunctions/ITableFunctionXDBC.h b/src/TableFunctions/ITableFunctionXDBC.h index fb0a0fd1185..f3ff64c2f2d 100644 --- a/src/TableFunctions/ITableFunctionXDBC.h +++ b/src/TableFunctions/ITableFunctionXDBC.h @@ -29,10 +29,12 @@ private: void parseArguments(const ASTPtr & ast_function, const Context & context) override; + void startBridgeIfNot(const Context & context) const; + String connection_string; String schema_name; String remote_table_name; - BridgeHelperPtr helper; + mutable BridgeHelperPtr helper; }; class TableFunctionJDBC : public ITableFunctionXDBC From 6d4d669f96cadf7c2994e51e994ec6d6c2c4a99e Mon Sep 17 00:00:00 2001 From: Ivan Lezhankin Date: Mon, 22 Mar 2021 20:20:42 +0300 Subject: [PATCH 059/155] Move checkpoint to sub-buffer only from internal memory --- src/IO/PeekableReadBuffer.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/IO/PeekableReadBuffer.cpp b/src/IO/PeekableReadBuffer.cpp index 551f87d7363..15fdd9448ec 100644 --- a/src/IO/PeekableReadBuffer.cpp +++ b/src/IO/PeekableReadBuffer.cpp @@ -143,11 +143,11 @@ bool PeekableReadBuffer::nextImpl() 
checkStateCorrect(); bool res; - bool checkpoint_at_end = checkpoint && *checkpoint == working_buffer.end(); + bool checkpoint_at_end = checkpoint && *checkpoint == working_buffer.end() && currentlyReadFromOwnMemory(); if (checkpoint) { - if (currentlyReadFromOwnMemory() || checkpoint_at_end) + if (currentlyReadFromOwnMemory()) res = sub_buf.hasPendingData() || sub_buf.next(); else res = peekNext(); From b824df2d3f35f9a4d9cd6f2bfb0ae465654b1c20 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 22 Mar 2021 22:43:26 +0300 Subject: [PATCH 060/155] Add tzdata to Docker --- docker/client/Dockerfile | 1 + docker/server/Dockerfile | 1 + 2 files changed, 2 insertions(+) diff --git a/docker/client/Dockerfile b/docker/client/Dockerfile index 8443eae691b..d9cd68254b7 100644 --- a/docker/client/Dockerfile +++ b/docker/client/Dockerfile @@ -18,6 +18,7 @@ RUN apt-get update \ clickhouse-client=$version \ clickhouse-common-static=$version \ locales \ + tzdata \ && rm -rf /var/lib/apt/lists/* /var/cache/debconf \ && apt-get clean diff --git a/docker/server/Dockerfile b/docker/server/Dockerfile index 295784a6184..414eb23d044 100644 --- a/docker/server/Dockerfile +++ b/docker/server/Dockerfile @@ -32,6 +32,7 @@ RUN groupadd -r clickhouse --gid=101 \ clickhouse-server=$version \ locales \ wget \ + tzdata \ && rm -rf \ /var/lib/apt/lists/* \ /var/cache/debconf \ From 84be58453b5129f3b516eb3011f53806b6bb6f21 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 22 Mar 2021 22:50:04 +0300 Subject: [PATCH 061/155] Add function timezoneOf and fix inconsistencies --- .../registerFunctionsMiscellaneous.cpp | 6 +- src/Functions/timezone.cpp | 9 +- src/Functions/timezoneOf.cpp | 111 ++++++++++++++++++ .../{toTimeZone.cpp => toTimezone.cpp} | 9 +- .../0_stateless/01767_timezoneOf.reference | 1 + tests/queries/0_stateless/01767_timezoneOf.sh | 7 ++ 6 files changed, 133 insertions(+), 10 deletions(-) create mode 100644 src/Functions/timezoneOf.cpp rename src/Functions/{toTimeZone.cpp => toTimezone.cpp} (90%) create mode 100644 tests/queries/0_stateless/01767_timezoneOf.reference create mode 100755 tests/queries/0_stateless/01767_timezoneOf.sh diff --git a/src/Functions/registerFunctionsMiscellaneous.cpp b/src/Functions/registerFunctionsMiscellaneous.cpp index 592f0d6774d..ca9bc32486e 100644 --- a/src/Functions/registerFunctionsMiscellaneous.cpp +++ b/src/Functions/registerFunctionsMiscellaneous.cpp @@ -41,7 +41,8 @@ void registerFunctionThrowIf(FunctionFactory &); void registerFunctionVersion(FunctionFactory &); void registerFunctionBuildId(FunctionFactory &); void registerFunctionUptime(FunctionFactory &); -void registerFunctionTimeZone(FunctionFactory &); +void registerFunctionTimezone(FunctionFactory &); +void registerFunctionTimezoneOf(FunctionFactory &); void registerFunctionRunningAccumulate(FunctionFactory &); void registerFunctionRunningDifference(FunctionFactory &); void registerFunctionRunningDifferenceStartingWithFirstValue(FunctionFactory &); @@ -111,7 +112,8 @@ void registerFunctionsMiscellaneous(FunctionFactory & factory) registerFunctionVersion(factory); registerFunctionBuildId(factory); registerFunctionUptime(factory); - registerFunctionTimeZone(factory); + registerFunctionTimezone(factory); + registerFunctionTimezoneOf(factory); registerFunctionRunningAccumulate(factory); registerFunctionRunningDifference(factory); registerFunctionRunningDifferenceStartingWithFirstValue(factory); diff --git a/src/Functions/timezone.cpp b/src/Functions/timezone.cpp index 4522f21c8b2..2cd0c28612b 100644 --- 
a/src/Functions/timezone.cpp +++ b/src/Functions/timezone.cpp @@ -12,13 +12,13 @@ namespace /** Returns the server time zone. */ -class FunctionTimeZone : public IFunction +class FunctionTimezone : public IFunction { public: static constexpr auto name = "timezone"; static FunctionPtr create(const Context &) { - return std::make_shared<FunctionTimeZone>(); + return std::make_shared<FunctionTimezone>(); } String getName() const override @@ -45,9 +45,10 @@ public: } -void registerFunctionTimeZone(FunctionFactory & factory) +void registerFunctionTimezone(FunctionFactory & factory) { - factory.registerFunction<FunctionTimeZone>(); + factory.registerFunction<FunctionTimezone>(); + factory.registerAlias("timeZone", "timezone"); } } diff --git a/src/Functions/timezoneOf.cpp b/src/Functions/timezoneOf.cpp new file mode 100644 index 00000000000..cdf686e276b --- /dev/null +++ b/src/Functions/timezoneOf.cpp @@ -0,0 +1,111 @@ +#include +#include +#include +#include +#include +#include +#include + + +namespace DB +{ +namespace +{ + + +/** timezoneOf(x) - get the name of the timezone of DateTime data type. + * Example: Europe/Moscow. + */ +class ExecutableFunctionTimezoneOf : public IExecutableFunctionImpl +{ +public: + static constexpr auto name = "timezoneOf"; + String getName() const override { return name; } + + bool useDefaultImplementationForNulls() const override { return false; } + bool useDefaultImplementationForLowCardinalityColumns() const override { return false; } + + /// Execute the function on the columns. + ColumnPtr execute(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override + { + DataTypePtr type_no_nullable = removeNullable(arguments[0].type); + + return DataTypeString().createColumnConst(input_rows_count, + dynamic_cast<const TimezoneMixin &>(*type_no_nullable).getTimeZone().getTimeZone()); + } +}; + + +class BaseFunctionTimezoneOf : public IFunctionBaseImpl +{ +public: + BaseFunctionTimezoneOf(DataTypes argument_types_, DataTypePtr return_type_) + : argument_types(std::move(argument_types_)), return_type(std::move(return_type_)) {} + + static constexpr auto name = "timezoneOf"; + String getName() const override { return name; } + + bool isDeterministic() const override { return true; } + bool isDeterministicInScopeOfQuery() const override { return true; } + + const DataTypes & getArgumentTypes() const override { return argument_types; } + const DataTypePtr & getResultType() const override { return return_type; } + + ExecutableFunctionImplPtr prepare(const ColumnsWithTypeAndName &) const override + { + return std::make_unique<ExecutableFunctionTimezoneOf>(); + } + + ColumnPtr getResultIfAlwaysReturnsConstantAndHasArguments(const ColumnsWithTypeAndName & arguments) const override + { + DataTypePtr type_no_nullable = removeNullable(arguments[0].type); + + return DataTypeString().createColumnConst(1, + dynamic_cast<const TimezoneMixin &>(*type_no_nullable).getTimeZone().getTimeZone()); + } + +private: + DataTypes argument_types; + DataTypePtr return_type; +}; + + +class FunctionTimezoneOfBuilder : public IFunctionOverloadResolverImpl +{ +public: + static constexpr auto name = "timezoneOf"; + String getName() const override { return name; } + static FunctionOverloadResolverImplPtr create(const Context &) { return std::make_unique<FunctionTimezoneOfBuilder>(); } + + size_t getNumberOfArguments() const override { return 1; } + + DataTypePtr getReturnType(const DataTypes & types) const override + { + DataTypePtr type_no_nullable = removeNullable(types[0]); + + if (isDateTime(type_no_nullable) || isDateTime64(type_no_nullable)) + return std::make_shared<DataTypeString>(); + else + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Bad 
argument for function {}, should be DateTime or DateTime64", name); + } + + FunctionBaseImplPtr build(const ColumnsWithTypeAndName & arguments, const DataTypePtr & return_type) const override + { + return std::make_unique<BaseFunctionTimezoneOf>(DataTypes{arguments[0].type}, return_type); + } + + bool useDefaultImplementationForNulls() const override { return false; } + bool useDefaultImplementationForLowCardinalityColumns() const override { return false; } + ColumnNumbers getArgumentsThatDontImplyNullableReturnType(size_t /*number_of_arguments*/) const override { return {0}; } +}; + +} + +void registerFunctionTimezoneOf(FunctionFactory & factory) +{ + factory.registerFunction<FunctionTimezoneOfBuilder>(); + factory.registerAlias("timeZoneOf", "timezoneOf"); +} + +} + diff --git a/src/Functions/toTimeZone.cpp b/src/Functions/toTimezone.cpp similarity index 90% rename from src/Functions/toTimeZone.cpp rename to src/Functions/toTimezone.cpp index fbf3a0778a6..d12f926b284 100644 --- a/src/Functions/toTimeZone.cpp +++ b/src/Functions/toTimezone.cpp @@ -21,11 +21,11 @@ namespace { /// Just changes time zone information for data type. The calculation is free. -class FunctionToTimeZone : public IFunction +class FunctionToTimezone : public IFunction { public: - static constexpr auto name = "toTimeZone"; - static FunctionPtr create(const Context &) { return std::make_shared<FunctionToTimeZone>(); } + static constexpr auto name = "toTimezone"; + static FunctionPtr create(const Context &) { return std::make_shared<FunctionToTimezone>(); } String getName() const override { @@ -64,7 +64,8 @@ public: void registerFunctionToTimeZone(FunctionFactory & factory) { - factory.registerFunction<FunctionToTimeZone>(); + factory.registerFunction<FunctionToTimezone>(); + factory.registerAlias("toTimeZone", "toTimezone"); } } diff --git a/tests/queries/0_stateless/01767_timezoneOf.reference b/tests/queries/0_stateless/01767_timezoneOf.reference new file mode 100644 index 00000000000..63c027eecfd --- /dev/null +++ b/tests/queries/0_stateless/01767_timezoneOf.reference @@ -0,0 +1 @@ +Asia/Tehran Asia/Tehran Asia/Tehran Africa/Accra diff --git a/tests/queries/0_stateless/01767_timezoneOf.sh b/tests/queries/0_stateless/01767_timezoneOf.sh new file mode 100755 index 00000000000..428db2ee737 --- /dev/null +++ b/tests/queries/0_stateless/01767_timezoneOf.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CUR_DIR"/../shell_config.sh + +TZ=Asia/Tehran $CLICKHOUSE_LOCAL --query "SELECT timezone(), timezoneOf(now()), timeZone(), timeZoneOf(toTimezone(toNullable(now()), 'Africa/Accra'))" From a9c25579ce392bd9e5ed0324524cddfd8850d6fb Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 22 Mar 2021 22:53:39 +0300 Subject: [PATCH 062/155] More tests --- tests/queries/0_stateless/01767_timezoneOf.reference | 2 +- tests/queries/0_stateless/01767_timezoneOf.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/01767_timezoneOf.reference b/tests/queries/0_stateless/01767_timezoneOf.reference index 63c027eecfd..0a8a8c32d4e 100644 --- a/tests/queries/0_stateless/01767_timezoneOf.reference +++ b/tests/queries/0_stateless/01767_timezoneOf.reference @@ -1 +1 @@ -Asia/Tehran Asia/Tehran Asia/Tehran Africa/Accra +Asia/Tehran Asia/Tehran Asia/Tehran Africa/Accra Pacific/Pitcairn diff --git a/tests/queries/0_stateless/01767_timezoneOf.sh b/tests/queries/0_stateless/01767_timezoneOf.sh index 428db2ee737..9dee051ee3f 100755 --- a/tests/queries/0_stateless/01767_timezoneOf.sh +++ b/tests/queries/0_stateless/01767_timezoneOf.sh @@ -4,4 +4,4 @@ CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . "$CUR_DIR"/../shell_config.sh -TZ=Asia/Tehran $CLICKHOUSE_LOCAL --query "SELECT timezone(), timezoneOf(now()), timeZone(), timeZoneOf(toTimezone(toNullable(now()), 'Africa/Accra'))" +TZ=Asia/Tehran $CLICKHOUSE_LOCAL --query "SELECT timezone(), timezoneOf(now()), timeZone(), timeZoneOf(toTimezone(toNullable(now()), 'Africa/Accra')), timeZoneOf(toTimeZone(now64(3), 'Pacific/Pitcairn'))" From 9845ff6694cea2786be7b88f9c90db307d399e7e Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 22 Mar 2021 23:03:51 +0300 Subject: [PATCH 063/155] Move db to separate file --- .../src/jepsen/nukeeper/constants.clj | 26 +++-- .../src/jepsen/nukeeper/db.clj | 99 +++++++++++++++++++ .../src/jepsen/nukeeper/main.clj | 57 +---------- .../src/jepsen/nukeeper/nemesis.clj | 8 +- .../src/jepsen/nukeeper/utils.clj | 26 +++-- 5 files changed, 138 insertions(+), 78 deletions(-) create mode 100644 tests/jepsen.nukeeper/src/jepsen/nukeeper/db.clj diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/constants.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/constants.clj index 511ff8e3bf3..95b142e43f9 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/constants.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/constants.clj @@ -1,12 +1,18 @@ (ns jepsen.nukeeper.constants) -(def dir "/var/lib/clickhouse") -(def binary "clickhouse") -(def logdir "/var/log/clickhouse-server") -(def logfile "/var/log/clickhouse-server/stderr.log") -(def serverlog "/var/log/clickhouse-server/clickhouse-server.log") -(def snapshotsdir "/var/lib/clickhouse/coordination/snapshots") -(def coordinationdir "/var/lib/clickhouse/coordination") -(def logsdir "/var/lib/clickhouse/coordination/logs") -(def pidfile (str dir "/clickhouse.pid")) -(def binary-path "/tmp") +(def common-prefix "/tmp/clickhouse") + +(def binary-name "clickhouse") + +(def binary-path (str common-prefix "/" binary-name)) +(def pid-file-path (str common-prefix "/clickhouse.pid")) + +(def data-dir (str common-prefix "/db")) +(def logs-dir (str common-prefix "/logs")) +(def configs-dir (str common-prefix "/config")) +(def sub-configs-dir (str configs-dir "/config.d")) +(def coordination-data-dir (str data-dir "/coordination")) +(def coordination-snapshots-dir (str coordination-data-dir "/snapshots")) +(def 
coordination-logs-dir (str coordination-data-dir "/logs")) + +(def stderr-file (str logs-dir "/stderr.log")) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/db.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/db.clj new file mode 100644 index 00000000000..b4bcd363740 --- /dev/null +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/db.clj @@ -0,0 +1,99 @@ +(ns jepsen.nukeeper.db + (:require [clojure.tools.logging :refer :all] + [jepsen + [control :as c] + [db :as db] + [util :as util :refer [meh]]] + [jepsen.nukeeper.constants :refer :all] + [jepsen.nukeeper.utils :refer :all] + [clojure.java.io :as io] + [jepsen.control.util :as cu] + [jepsen.os.ubuntu :as ubuntu])) + +(defn get-clickhouse-sky + [version] + (c/exec :sky :get :-d common-prefix :-N :Backbone version)) + +(defn get-clickhouse-url + [url] + (let [download-result (cu/wget! url)] + (do (c/exec :mv download-result common-prefix) + (str common-prefix "/" download-result)))) + +(defn unpack-deb + [path] + (do + (c/exec :dpkg :-x path :.) + (c/exec :mv "usr/bin/clickhouse" common-prefix))) + +(defn unpack-tgz + [path] + (do + (c/exec :tar :-zxvf path :.) + (c/exec :mv "usr/bin/clickhouse" common-prefix))) + +(defn prepare-dirs + [] + (do + (c/exec :rm :-rf common-prefix) + (c/exec :mkdir :-p common-prefix) + (c/exec :mkdir :-p data-dir) + (c/exec :mkdir :-p logs-dir) + (c/exec :mkdir :-p configs-dir) + (c/exec :mkdir :-p sub-configs-dir) + (c/exec :touch stderr-file) + (c/exec :chown :-R :root common-prefix))) + +(defn cluster-config + [test node config-template] + (let [nodes (:nodes test) + replacement-map {#"\{srv1\}" (get nodes 0) + #"\{srv2\}" (get nodes 1) + #"\{srv3\}" (get nodes 2) + #"\{id\}" (str (inc (.indexOf nodes node))) + #"\{quorum_reads\}" (str (boolean (:quorum test))) + #"\{snapshot_distance\}" (str (:snapshot-distance test)) + #"\{stale_log_gap\}" (str (:stale-log-gap test)) + #"\{reserved_log_items\}" (str (:reserved-log-items test))}] + (reduce #(clojure.string/replace %1 (get %2 0) (get %2 1)) config-template replacement-map))) + +(defn install-configs + [test node] + (c/exec :echo (slurp (io/resource "config.xml")) :> (str configs-dir "/config.xml")) + (c/exec :echo (slurp (io/resource "users.xml")) :> (str configs-dir "/users.xml")) + (c/exec :echo (slurp (io/resource "listen.xml")) :> (str sub-configs-dir "/listen.xml")) + (c/exec :echo (cluster-config test node (slurp (io/resource "test_keeper_config.xml"))) :> (str sub-configs-dir "/test_keeper_config.xml"))) + +(defn db + [version] + (reify db/DB + (setup! [_ test node] + (c/su + (do + (info "Preparing directories") + (prepare-dirs) + (info "Downloading clickhouse") + (get-clickhouse-sky version) + (info "Installing configs") + (install-configs test node) + (info "Starting server") + (start-clickhouse! node test) + (info "ClickHouse started")))) + + + (teardown! [_ test node] + (info node "Tearing down clickhouse") + (kill-clickhouse! node test) + (c/su + ;(c/exec :rm :-f binary-path) + (c/exec :rm :-rf data-dir) + (c/exec :rm :-rf logs-dir) + (c/exec :rm :-rf configs-dir))) + + db/LogFiles + (log-files [_ test node] + (c/su + (kill-clickhouse! 
node test) + (c/cd data-dir + (c/exec :tar :czf "coordination.tar.gz" "coordination"))) + [stderr-file (str logs-dir "/clickhouse-server.log") (str data-dir "/coordination.tar.gz")]))) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj index dfa1cfd913e..e027b956937 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -3,6 +3,7 @@ [jepsen.nukeeper.utils :refer :all] [clojure.pprint :refer [pprint]] [jepsen.nukeeper.set :as set] + [jepsen.nukeeper.db :refer :all] [jepsen.nukeeper.nemesis :as custom-nemesis] [jepsen.nukeeper.register :as register] [jepsen.nukeeper.unique :as unique] @@ -31,60 +32,6 @@ (ch.qos.logback.classic Level) (org.slf4j Logger LoggerFactory))) -(defn cluster-config - [test node config-template] - (let [nodes (:nodes test) - replacement-map {#"\{srv1\}" (get nodes 0) - #"\{srv2\}" (get nodes 1) - #"\{srv3\}" (get nodes 2) - #"\{id\}" (str (inc (.indexOf nodes node))) - #"\{quorum_reads\}" (str (boolean (:quorum test))) - #"\{snapshot_distance\}" (str (:snapshot-distance test)) - #"\{stale_log_gap\}" (str (:stale-log-gap test)) - #"\{reserved_log_items\}" (str (:reserved-log-items test))}] - (reduce #(clojure.string/replace %1 (get %2 0) (get %2 1)) config-template replacement-map))) - -(defn db - [version] - (reify db/DB - (setup! [_ test node] - (info node "installing clickhouse" version) - (c/su - (if-not (cu/exists? (str binary-path "/clickhouse")) - (c/exec :sky :get :-d binary-path :-N :Backbone version)) - (c/exec :mkdir :-p logdir) - (c/exec :touch logfile) - (c/exec (str binary-path "/clickhouse") :install) - (c/exec :chown :-R :root dir) - (c/exec :chown :-R :root logdir) - (c/exec :echo (slurp (io/resource "listen.xml")) :> "/etc/clickhouse-server/config.d/listen.xml") - (c/exec :echo (cluster-config test node (slurp (io/resource "test_keeper_config.xml"))) :> "/etc/clickhouse-server/config.d/test_keeper_config.xml") - (cu/start-daemon! - {:pidfile pidfile - :logfile logfile - :chdir dir} - (str binary-path "/clickhouse") - :server - :--config "/etc/clickhouse-server/config.xml") - (wait-clickhouse-alive! node test))) - - (teardown! [_ test node] - (info node "tearing down clickhouse") - (cu/stop-daemon! (str binary-path "/clickhouse") pidfile) - (c/su - ;(c/exec :rm :-f (str binary-path "/clickhouse")) - (c/exec :rm :-rf dir) - (c/exec :rm :-rf logdir) - (c/exec :rm :-rf "/etc/clickhouse-server"))) - - db/LogFiles - (log-files [_ test node] - (c/su - (cu/stop-daemon! (str binary-path "/clickhouse") pidfile) - (c/cd dir - (c/exec :tar :czf "coordination.tar.gz" "coordination"))) - [logfile serverlog (str dir "/coordination.tar.gz")]))) - (def workloads "A map of workload names to functions that construct workloads, given opts." 
{"set" set/workload @@ -137,7 +84,7 @@ opts {:name (str "clickhouse-keeper quorum=" quorum " " (name (:workload opts)) " " (name (:nemesis opts))) :os ubuntu/os - :db (db "rbtorrent:156b85947eac9c85ef5d0ef15757a9f9e7c9e430") + :db (db "rbtorrent:a284492c715974b69f73add62b4ff590110369af") :pure-generators true :client (:client workload) :nemesis (:nemesis current-nemesis) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj index ec39c2b3e35..8314d29f575 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj @@ -76,21 +76,21 @@ (defn logs-corruption-nemesis [] - (corruptor-nemesis logsdir #(corrupt-file (select-last-file %1)))) + (corruptor-nemesis coordination-logs-dir #(corrupt-file (select-last-file %1)))) (defn snapshots-corruption-nemesis [] - (corruptor-nemesis snapshotsdir #(corrupt-file (select-last-file %1)))) + (corruptor-nemesis coordination-snapshots-dir #(corrupt-file (select-last-file %1)))) (defn logs-and-snapshots-corruption-nemesis [] - (corruptor-nemesis coordinationdir (fn [path] + (corruptor-nemesis coordination-data-dir (fn [path] (do (corrupt-file (select-last-file (str path "/snapshots"))) (corrupt-file (select-last-file (str path "/logs"))))))) (defn drop-all-corruption-nemesis [] - (corruptor-nemesis coordinationdir (fn [path] + (corruptor-nemesis coordination-data-dir (fn [path] (c/exec :rm :-fr path)))) (defn partition-bridge-nemesis diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj index fe415ff9e51..30774c24dae 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj @@ -130,7 +130,7 @@ [node test] (info "Checking server alive on" node) (try - (c/exec (str binary-path "/clickhouse") :client :--query "SELECT 1") + (c/exec binary-path :client :--query "SELECT 1") (catch Exception _ false))) (defn wait-clickhouse-alive! @@ -144,18 +144,26 @@ [node test] (info "Killing server on node" node) (c/su - (cu/stop-daemon! (str binary-path "/clickhouse") pidfile) - (c/exec :rm :-fr (str dir "/status")))) + (cu/stop-daemon! binary-path pid-file-path) + (c/exec :rm :-fr (str data-dir "/status")))) (defn start-clickhouse! [node test] (info "Starting server on node" node) (c/su (cu/start-daemon! - {:pidfile pidfile - :logfile logfile - :chdir dir} - (str binary-path "/clickhouse") + {:pidfile pid-file-path + :logfile stderr-file + :chdir data-dir} + binary-path :server - :--config "/etc/clickhouse-server/config.xml")) - (wait-clickhouse-alive! node test)) + :--config (str configs-dir "/config.xml") + :-- + :--path data-dir + :--user_files_path (str data-dir "/user_files") + :--top_level_domains_path (str data-dir "/top_level_domains") + :--logger.log (str logs-dir "/clickhouse-server.log") + :--logger.errorlog (str logs-dir "/clickhouse-server.err.log") + :--test_keeper_server.snapshot_storage_path coordination-snapshots-dir + :--test_keeper_server.logs_storage_path coordination-logs-dir) + (wait-clickhouse-alive! 
node test))) From 3952a8e976cef6a912b93d90b95b0d60a752d262 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 22 Mar 2021 23:42:09 +0300 Subject: [PATCH 064/155] Fix UBSan report in addMonths --- base/common/DateLUTImpl.h | 2 +- tests/queries/0_stateless/01770_add_months_ubsan.reference | 1 + tests/queries/0_stateless/01770_add_months_ubsan.sql | 2 ++ 3 files changed, 4 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/01770_add_months_ubsan.reference create mode 100644 tests/queries/0_stateless/01770_add_months_ubsan.sql diff --git a/base/common/DateLUTImpl.h b/base/common/DateLUTImpl.h index 43fc1b8befd..1a44c670650 100644 --- a/base/common/DateLUTImpl.h +++ b/base/common/DateLUTImpl.h @@ -1073,7 +1073,7 @@ public: { const Values & values = lut[toLUTIndex(v)]; - Int64 month = static_cast(values.month) + delta; + Int64 month = values.month + static_cast(delta); /// Cast is to avoid UB in signed integer overflow. if (month > 0) { diff --git a/tests/queries/0_stateless/01770_add_months_ubsan.reference b/tests/queries/0_stateless/01770_add_months_ubsan.reference new file mode 100644 index 00000000000..573541ac970 --- /dev/null +++ b/tests/queries/0_stateless/01770_add_months_ubsan.reference @@ -0,0 +1 @@ +0 diff --git a/tests/queries/0_stateless/01770_add_months_ubsan.sql b/tests/queries/0_stateless/01770_add_months_ubsan.sql new file mode 100644 index 00000000000..039434ff9bc --- /dev/null +++ b/tests/queries/0_stateless/01770_add_months_ubsan.sql @@ -0,0 +1,2 @@ +-- Result does not make sense but UBSan report should not be triggered. +SELECT ignore(now() + INTERVAL 9223372036854775807 MONTH); From 8d0210b510dc723cc1737f6ac6aade02dfb7cc11 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 23 Mar 2021 01:16:41 +0300 Subject: [PATCH 065/155] Expose DateTime64 minmax part index in system.parts and system.parts_columns #18244 --- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 55 +++++++++++-------- src/Storages/MergeTree/IMergeTreeDataPart.h | 10 ++-- src/Storages/MergeTree/MergeTreeData.cpp | 22 +++++--- src/Storages/System/StorageSystemParts.cpp | 11 ++-- .../System/StorageSystemPartsColumns.cpp | 17 ++++-- ...max_time_system_parts_datetime64.reference | 2 + ...3_min_max_time_system_parts_datetime64.sql | 9 +++ 7 files changed, 81 insertions(+), 45 deletions(-) create mode 100644 tests/queries/0_stateless/01773_min_max_time_system_parts_datetime64.reference create mode 100644 tests/queries/0_stateless/01773_min_max_time_system_parts_datetime64.sql diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 1f18c894465..50a3169de0e 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -333,40 +333,49 @@ IMergeTreeDataPart::State IMergeTreeDataPart::getState() const } -DayNum IMergeTreeDataPart::getMinDate() const +std::pair IMergeTreeDataPart::getMinMaxDate() const { if (storage.minmax_idx_date_column_pos != -1 && minmax_idx.initialized) - return DayNum(minmax_idx.hyperrectangle[storage.minmax_idx_date_column_pos].left.get()); + { + const auto & hyperrectangle = minmax_idx.hyperrectangle[storage.minmax_idx_date_column_pos]; + return {DayNum(hyperrectangle.left.get()), DayNum(hyperrectangle.right.get())}; + } else - return DayNum(); + return {}; } - -DayNum IMergeTreeDataPart::getMaxDate() const -{ - if (storage.minmax_idx_date_column_pos != -1 && minmax_idx.initialized) - return 
DayNum(minmax_idx.hyperrectangle[storage.minmax_idx_date_column_pos].right.get<UInt64>()); - else - return DayNum(); -} - -time_t IMergeTreeDataPart::getMinTime() const +std::pair<time_t, time_t> IMergeTreeDataPart::getMinMaxTime() const { if (storage.minmax_idx_time_column_pos != -1 && minmax_idx.initialized) - return minmax_idx.hyperrectangle[storage.minmax_idx_time_column_pos].left.get<UInt64>(); + { + const auto & hyperrectangle = minmax_idx.hyperrectangle[storage.minmax_idx_time_column_pos]; + + /// The case of DateTime + if (hyperrectangle.left.getType() == Field::Types::UInt64) + { + assert(hyperrectangle.right.getType() == Field::Types::UInt64); + return {hyperrectangle.left.get<UInt64>(), hyperrectangle.right.get<UInt64>()}; + } + /// The case of DateTime64 + else if (hyperrectangle.left.getType() == Field::Types::Decimal64) + { + assert(hyperrectangle.right.getType() == Field::Types::Decimal64); + + auto left = hyperrectangle.left.get<DecimalField<Decimal64>>(); + auto right = hyperrectangle.right.get<DecimalField<Decimal64>>(); + + assert(left.getScale() == right.getScale()); + + return { left.getValue() / left.getScaleMultiplier(), right.getValue() / right.getScaleMultiplier() }; + } + else + throw Exception(ErrorCodes::LOGICAL_ERROR, "Part minmax index by time is neither DateTime nor DateTime64"); + } else - return 0; + return {}; } -time_t IMergeTreeDataPart::getMaxTime() const -{ - if (storage.minmax_idx_time_column_pos != -1 && minmax_idx.initialized) - return minmax_idx.hyperrectangle[storage.minmax_idx_time_column_pos].right.get<UInt64>(); - else - return 0; -} - void IMergeTreeDataPart::setColumns(const NamesAndTypesList & new_columns) { columns = new_columns; diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index 83f8c672001..92b05e5cbd2 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -155,13 +155,11 @@ public: bool contains(const IMergeTreeDataPart & other) const { return info.contains(other.info); } - /// If the partition key includes date column (a common case), these functions will return min and max values for this column. - DayNum getMinDate() const; - DayNum getMaxDate() const; + /// If the partition key includes a date column (a common case), this function will return min and max values for that column. + std::pair<DayNum, DayNum> getMinMaxDate() const; - /// otherwise, if the partition key includes dateTime column (also a common case), these functions will return min and max values for this column. - time_t getMinTime() const; - time_t getMaxTime() const; + /// Otherwise, if the partition key includes a dateTime column (also a common case), this function will return min and max values for that column. + std::pair<time_t, time_t> getMinMaxTime() const; bool isEmpty() const { return rows_count == 0; }
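To make the DateTime64 branch above concrete: the DecimalField stores a scaled integer tick count, and dividing by the scale multiplier (10^scale) truncates it to whole seconds. A minimal standalone sketch of that arithmetic (editor's illustration, not part of this patch):

```cpp
#include <cstdint>
#include <cstdio>

int main()
{
    // DateTime64(3) stores milliseconds; scale = 3, multiplier = 10^3.
    int64_t raw_value = 1616431445123;  // ticks for "2021-03-22 17:24:05.123"
    int64_t scale_multiplier = 1000;

    // Same arithmetic as getMinMaxTime(): truncating division yields seconds.
    int64_t seconds = raw_value / scale_multiplier;
    printf("%lld\n", (long long)seconds);  // prints 1616431445
}
```

diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index cf69de44a27..d02f9df4ad1 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -469,15 +469,19 @@ void MergeTreeData::checkPartitionKeyAndInitMinMax(const KeyDescription & new_pa DataTypes minmax_idx_columns_types = getMinMaxColumnsTypes(new_partition_key); /// Try to find the date column in columns used by the partition key (a common case). - bool encountered_date_column = false; + /// If there is none, a DateTime or DateTime64 column would also suffice. 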
+ + bool has_date_column = false; + bool has_datetime_column = false; + for (size_t i = 0; i < minmax_idx_columns_types.size(); ++i) { - if (typeid_cast(minmax_idx_columns_types[i].get())) + if (isDate(minmax_idx_columns_types[i])) { - if (!encountered_date_column) + if (!has_date_column) { minmax_idx_date_column_pos = i; - encountered_date_column = true; + has_date_column = true; } else { @@ -486,16 +490,18 @@ void MergeTreeData::checkPartitionKeyAndInitMinMax(const KeyDescription & new_pa } } } - if (!encountered_date_column) + if (!has_date_column) { for (size_t i = 0; i < minmax_idx_columns_types.size(); ++i) { - if (typeid_cast(minmax_idx_columns_types[i].get())) + if (isDateTime(minmax_idx_columns_types[i]) + || isDateTime64(minmax_idx_columns_types[i]) + ) { - if (!encountered_date_column) + if (!has_datetime_column) { minmax_idx_time_column_pos = i; - encountered_date_column = true; + has_datetime_column = true; } else { diff --git a/src/Storages/System/StorageSystemParts.cpp b/src/Storages/System/StorageSystemParts.cpp index eece092206d..6a643dbe1b9 100644 --- a/src/Storages/System/StorageSystemParts.cpp +++ b/src/Storages/System/StorageSystemParts.cpp @@ -137,14 +137,17 @@ void StorageSystemParts::processNextStorage( if (columns_mask[src_index++]) columns[res_index++]->insert(static_cast(part.use_count() - 1)); + auto min_max_date = part->getMinMaxDate(); + auto min_max_time = part->getMinMaxTime(); + if (columns_mask[src_index++]) - columns[res_index++]->insert(part->getMinDate()); + columns[res_index++]->insert(min_max_date.first); if (columns_mask[src_index++]) - columns[res_index++]->insert(part->getMaxDate()); + columns[res_index++]->insert(min_max_date.second); if (columns_mask[src_index++]) - columns[res_index++]->insert(static_cast(part->getMinTime())); + columns[res_index++]->insert(static_cast(min_max_time.first)); if (columns_mask[src_index++]) - columns[res_index++]->insert(static_cast(part->getMaxTime())); + columns[res_index++]->insert(static_cast(min_max_time.second)); if (columns_mask[src_index++]) columns[res_index++]->insert(part->info.partition_id); if (columns_mask[src_index++]) diff --git a/src/Storages/System/StorageSystemPartsColumns.cpp b/src/Storages/System/StorageSystemPartsColumns.cpp index 8754e424281..703de70d17f 100644 --- a/src/Storages/System/StorageSystemPartsColumns.cpp +++ b/src/Storages/System/StorageSystemPartsColumns.cpp @@ -32,6 +32,8 @@ StorageSystemPartsColumns::StorageSystemPartsColumns(const StorageID & table_id_ {"refcount", std::make_shared()}, {"min_date", std::make_shared()}, {"max_date", std::make_shared()}, + {"min_time", std::make_shared()}, + {"max_time", std::make_shared()}, {"partition_id", std::make_shared()}, {"min_block_number", std::make_shared()}, {"max_block_number", std::make_shared()}, @@ -95,8 +97,10 @@ void StorageSystemPartsColumns::processNextStorage( /// For convenience, in returned refcount, don't add references that was due to local variables in this method: all_parts, active_parts. 
auto use_count = part.use_count() - 1; - auto min_date = part->getMinDate(); - auto max_date = part->getMaxDate(); + + auto min_max_date = part->getMinMaxDate(); + auto min_max_time = part->getMinMaxTime(); + auto index_size_in_bytes = part->getIndexSizeInBytes(); auto index_size_in_allocated_bytes = part->getIndexSizeInAllocatedBytes(); @@ -141,9 +145,14 @@ void StorageSystemPartsColumns::processNextStorage( columns[res_index++]->insert(UInt64(use_count)); if (columns_mask[src_index++]) - columns[res_index++]->insert(min_date); + columns[res_index++]->insert(min_max_date.first); if (columns_mask[src_index++]) - columns[res_index++]->insert(max_date); + columns[res_index++]->insert(min_max_date.second); + if (columns_mask[src_index++]) + columns[res_index++]->insert(static_cast(min_max_time.first)); + if (columns_mask[src_index++]) + columns[res_index++]->insert(static_cast(min_max_time.second)); + if (columns_mask[src_index++]) columns[res_index++]->insert(part->info.partition_id); if (columns_mask[src_index++]) diff --git a/tests/queries/0_stateless/01773_min_max_time_system_parts_datetime64.reference b/tests/queries/0_stateless/01773_min_max_time_system_parts_datetime64.reference new file mode 100644 index 00000000000..1cea52ec1c2 --- /dev/null +++ b/tests/queries/0_stateless/01773_min_max_time_system_parts_datetime64.reference @@ -0,0 +1,2 @@ +2000-01-02 03:04:05 2001-02-03 04:05:06 +2000-01-02 03:04:05 2001-02-03 04:05:06 diff --git a/tests/queries/0_stateless/01773_min_max_time_system_parts_datetime64.sql b/tests/queries/0_stateless/01773_min_max_time_system_parts_datetime64.sql new file mode 100644 index 00000000000..5a1f809b03b --- /dev/null +++ b/tests/queries/0_stateless/01773_min_max_time_system_parts_datetime64.sql @@ -0,0 +1,9 @@ +DROP TABLE IF EXISTS test; +CREATE TABLE test (time DateTime64(3)) ENGINE = MergeTree ORDER BY tuple() PARTITION BY toStartOfInterval(time, INTERVAL 2 YEAR); + +INSERT INTO test VALUES ('2000-01-02 03:04:05.123'), ('2001-02-03 04:05:06.789'); + +SELECT min_time, max_time FROM system.parts WHERE table = 'test' AND database = currentDatabase(); +SELECT min_time, max_time FROM system.parts_columns WHERE table = 'test' AND database = currentDatabase(); + +DROP TABLE test; From cce2e0acaffaaf5c26b452d26455851b21acaab2 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 23 Mar 2021 10:28:14 +0300 Subject: [PATCH 066/155] Fix typo --- utils/nukeeper-data-dumper/main.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/nukeeper-data-dumper/main.cpp b/utils/nukeeper-data-dumper/main.cpp index 0340c94c5a0..c80aeb473e2 100644 --- a/utils/nukeeper-data-dumper/main.cpp +++ b/utils/nukeeper-data-dumper/main.cpp @@ -66,7 +66,7 @@ int main(int argc, char *argv[]) state_machine->init(); size_t last_commited_index = state_machine->last_commit_index(); - LOG_INFO(logger, "Last commited index: {}", last_commited_index); + LOG_INFO(logger, "Last committed index: {}", last_commited_index); DB::NuKeeperLogStore changelog(argv[2], 10000000, true); changelog.init(last_commited_index, 10000000000UL); /// collect all logs From 77935931120d33e58446be32765fe24566294760 Mon Sep 17 00:00:00 2001 From: filimonov <1549571+filimonov@users.noreply.github.com> Date: Tue, 23 Mar 2021 10:21:51 +0100 Subject: [PATCH 067/155] Update Dockerfile.alpine --- docker/server/Dockerfile.alpine | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/server/Dockerfile.alpine b/docker/server/Dockerfile.alpine index 0f9de1996ab..ea64c839cb0 100644 --- 
a/docker/server/Dockerfile.alpine +++ b/docker/server/Dockerfile.alpine @@ -21,7 +21,7 @@ RUN addgroup -S -g 101 clickhouse \ && chown clickhouse:clickhouse /var/lib/clickhouse \ && chown root:clickhouse /var/log/clickhouse-server \ && chmod +x /entrypoint.sh \ - && apk add --no-cache su-exec bash \ + && apk add --no-cache su-exec bash tzdata \ && chmod ugo+Xrw -R /var/lib/clickhouse /var/log/clickhouse-server /etc/clickhouse-server /etc/clickhouse-client # we need to allow "others" access to clickhouse folder, because docker container From 4cbf741e527275ac73e8c475b0a2125ab8a1ca39 Mon Sep 17 00:00:00 2001 From: filimonov <1549571+filimonov@users.noreply.github.com> Date: Tue, 23 Mar 2021 11:48:28 +0100 Subject: [PATCH 068/155] Update Dockerfile.alpine --- docker/server/Dockerfile.alpine | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docker/server/Dockerfile.alpine b/docker/server/Dockerfile.alpine index ea64c839cb0..cd192c0c9da 100644 --- a/docker/server/Dockerfile.alpine +++ b/docker/server/Dockerfile.alpine @@ -22,6 +22,8 @@ RUN addgroup -S -g 101 clickhouse \ && chown root:clickhouse /var/log/clickhouse-server \ && chmod +x /entrypoint.sh \ && apk add --no-cache su-exec bash tzdata \ + && cp /usr/share/zoneinfo/UTC /etc/localtime \ + && echo "UTC" > /etc/timezone \ && chmod ugo+Xrw -R /var/lib/clickhouse /var/log/clickhouse-server /etc/clickhouse-server /etc/clickhouse-client # we need to allow "others" access to clickhouse folder, because docker container From c938f4f2fe5fc5d07ce3e1fc0616979a80b32cc7 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Tue, 23 Mar 2021 14:29:29 +0300 Subject: [PATCH 069/155] fix --- src/Interpreters/InterpreterCreateQuery.cpp | 4 +--- src/Storages/StorageURL.cpp | 18 +++++++++++++++--- src/Storages/StorageURL.h | 6 +----- src/TableFunctions/ITableFunction.cpp | 14 +++++++++++--- 4 files changed, 28 insertions(+), 14 deletions(-) diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 7034e74eaf8..f8bcbf02ab4 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -972,9 +972,7 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create, { const auto & factory = TableFunctionFactory::instance(); auto table_func = factory.get(create.as_table_function, context); - /// We should use global context here because there will be no query context on server startup - /// and because storage lifetime is bigger than query context lifetime. 
- res = table_func->execute(create.as_table_function, context.getGlobalContext(), create.table, properties.columns); + res = table_func->execute(create.as_table_function, context, create.table, properties.columns); res->renameInMemory({create.database, create.table, create.uuid}); } else diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index b59f4b4a02a..2d3879340dc 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -33,7 +33,7 @@ namespace ErrorCodes IStorageURLBase::IStorageURLBase( const Poco::URI & uri_, - const Context & context_, + const Context & /*context_*/, const StorageID & table_id_, const String & format_name_, const std::optional & format_settings_, @@ -46,8 +46,6 @@ IStorageURLBase::IStorageURLBase( , format_name(format_name_) , format_settings(format_settings_) { - context_.getRemoteHostFilter().checkURL(uri); - StorageInMemoryMetadata storage_metadata; storage_metadata.setColumns(columns_); storage_metadata.setConstraints(constraints_); @@ -244,6 +242,20 @@ BlockOutputStreamPtr IStorageURLBase::write(const ASTPtr & /*query*/, const Stor chooseCompressionMethod(uri.toString(), compression_method)); } +StorageURL::StorageURL(const Poco::URI & uri_, + const StorageID & table_id_, + const String & format_name_, + const std::optional & format_settings_, + const ColumnsDescription & columns_, + const ConstraintsDescription & constraints_, + Context & context_, + const String & compression_method_) + : IStorageURLBase(uri_, context_, table_id_, format_name_, + format_settings_, columns_, constraints_, compression_method_) +{ + context_.getRemoteHostFilter().checkURL(uri); +} + void registerStorageURL(StorageFactory & factory) { factory.registerStorage("URL", [](const StorageFactory::Arguments & args) diff --git a/src/Storages/StorageURL.h b/src/Storages/StorageURL.h index 0ea86980b8c..2b2384b1043 100644 --- a/src/Storages/StorageURL.h +++ b/src/Storages/StorageURL.h @@ -113,11 +113,7 @@ public: const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, Context & context_, - const String & compression_method_) - : IStorageURLBase(uri_, context_, table_id_, format_name_, - format_settings_, columns_, constraints_, compression_method_) - { - } + const String & compression_method_); String getName() const override { diff --git a/src/TableFunctions/ITableFunction.cpp b/src/TableFunctions/ITableFunction.cpp index 804a5b232ec..b637838c6da 100644 --- a/src/TableFunctions/ITableFunction.cpp +++ b/src/TableFunctions/ITableFunction.cpp @@ -20,12 +20,20 @@ StoragePtr ITableFunction::execute(const ASTPtr & ast_function, const Context & ProfileEvents::increment(ProfileEvents::TableFunctionExecute); context.checkAccess(AccessType::CREATE_TEMPORARY_TABLE | StorageFactory::instance().getSourceAccessType(getStorageTypeName())); - if (cached_columns.empty() || (hasStaticStructure() && cached_columns == getActualTableStructure(context))) + if (cached_columns.empty()) return executeImpl(ast_function, context, table_name, std::move(cached_columns)); - auto get_storage = [=, tf = shared_from_this()]() -> StoragePtr + /// We have table structure, so it's CREATE AS table_function(). + /// We should use global context here because there will be no query context on server startup + /// and because storage lifetime is bigger than query context lifetime. 
+ const Context & global_context = context.getGlobalContext(); + if (hasStaticStructure() && cached_columns == getActualTableStructure(context)) + return executeImpl(ast_function, global_context, table_name, std::move(cached_columns)); + + auto this_table_function = shared_from_this(); + auto get_storage = [=, &global_context]() -> StoragePtr { - return tf->executeImpl(ast_function, context, table_name, cached_columns); + return this_table_function->executeImpl(ast_function, global_context, table_name, cached_columns); }; /// It will request actual table structure and create underlying storage lazily From be76defcfa7e8ba640f9bdcc46d2cd64323c4154 Mon Sep 17 00:00:00 2001 From: Ivan Lezhankin Date: Tue, 23 Mar 2021 14:59:36 +0300 Subject: [PATCH 070/155] Remove check for absent checkpoint on rollback --- src/IO/tests/gtest_peekable_read_buffer.cpp | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/src/IO/tests/gtest_peekable_read_buffer.cpp b/src/IO/tests/gtest_peekable_read_buffer.cpp index 8c491338bd3..ddb947d8b2f 100644 --- a/src/IO/tests/gtest_peekable_read_buffer.cpp +++ b/src/IO/tests/gtest_peekable_read_buffer.cpp @@ -48,20 +48,6 @@ try readAndAssert(peekable, "01234"); } -#ifndef ABORT_ON_LOGICAL_ERROR - bool exception = false; - try - { - peekable.rollbackToCheckpoint(); - } - catch (DB::Exception & e) - { - if (e.code() != DB::ErrorCodes::LOGICAL_ERROR) - throw; - exception = true; - } - ASSERT_TRUE(exception); -#endif assertAvailable(peekable, "56789"); readAndAssert(peekable, "56"); From 0c525b4ec4fcd3f6225333f7c9a9f7ecea5abdbb Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 23 Mar 2021 15:07:21 +0300 Subject: [PATCH 071/155] Add an ability to run from .deb and .tgz package --- src/Coordination/NuKeeperServer.cpp | 4 +- src/Coordination/NuKeeperServer.h | 1 + .../src/jepsen/nukeeper/constants.clj | 2 +- .../src/jepsen/nukeeper/db.clj | 47 +++++++++++++++---- .../src/jepsen/nukeeper/main.clj | 8 ++-- .../src/jepsen/nukeeper/utils.clj | 2 +- 6 files changed, 49 insertions(+), 15 deletions(-) diff --git a/src/Coordination/NuKeeperServer.cpp b/src/Coordination/NuKeeperServer.cpp index bfff7bf8f69..62af9656fb9 100644 --- a/src/Coordination/NuKeeperServer.cpp +++ b/src/Coordination/NuKeeperServer.cpp @@ -199,7 +199,8 @@ nuraft::cb_func::ReturnCode NuKeeperServer::callbackFunc(nuraft::cb_func::Type t { case nuraft::cb_func::BecomeLeader: { - if (commited_store) /// We become leader and store is empty, ready to serve requests + /// We become leader and store is empty or we already committed it + if (commited_store || initial_batch_committed) set_initialized(); return nuraft::cb_func::ReturnCode::Ok; } @@ -224,6 +225,7 @@ nuraft::cb_func::ReturnCode NuKeeperServer::callbackFunc(nuraft::cb_func::Type t { if (isLeader()) /// We have committed our log store and we are leader, ready to serve requests. 
set_initialized(); + initial_batch_committed = true; return nuraft::cb_func::ReturnCode::Ok; } default: /// ignore other events diff --git a/src/Coordination/NuKeeperServer.h b/src/Coordination/NuKeeperServer.h index 17099045640..ba25d5c181b 100644 --- a/src/Coordination/NuKeeperServer.h +++ b/src/Coordination/NuKeeperServer.h @@ -33,6 +33,7 @@ private: std::mutex initialized_mutex; bool initialized_flag = false; std::condition_variable initialized_cv; + std::atomic initial_batch_committed = false; nuraft::cb_func::ReturnCode callbackFunc(nuraft::cb_func::Type type, nuraft::cb_func::Param * param); diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/constants.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/constants.clj index 95b142e43f9..d6245d450f5 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/constants.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/constants.clj @@ -1,6 +1,6 @@ (ns jepsen.nukeeper.constants) -(def common-prefix "/tmp/clickhouse") +(def common-prefix "/home/robot-clickhouse") (def binary-name "clickhouse") diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/db.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/db.clj index b4bcd363740..106af25be17 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/db.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/db.clj @@ -12,7 +12,8 @@ (defn get-clickhouse-sky [version] - (c/exec :sky :get :-d common-prefix :-N :Backbone version)) + (c/exec :sky :get :-d common-prefix :-N :Backbone version) + (str common-prefix "/clickhouse")) (defn get-clickhouse-url [url] @@ -20,22 +21,47 @@ (do (c/exec :mv download-result common-prefix) (str common-prefix "/" download-result)))) +(defn download-clickhouse + [source] + (info "Downloading clickhouse from" source) + (cond + (clojure.string/starts-with? source "rbtorrent:") (get-clickhouse-sky source) + (clojure.string/starts-with? source "http") (get-clickhouse-url source) + :else (throw (Exception. (str "Don't know how to download clickhouse from" source))))) + (defn unpack-deb [path] (do - (c/exec :dpkg :-x path :.) - (c/exec :mv "usr/bin/clickhouse" common-prefix))) + (c/exec :dpkg :-x path common-prefix) + (c/exec :rm :-f path) + (c/exec :mv (str common-prefix "/usr/bin/clickhouse") common-prefix) + (c/exec :rm :-rf (str common-prefix "/usr") (str common-prefix "/etc")))) (defn unpack-tgz [path] (do - (c/exec :tar :-zxvf path :.) - (c/exec :mv "usr/bin/clickhouse" common-prefix))) + (c/exec :mkdir :-p (str common-prefix "/unpacked")) + (c/exec :tar :-zxvf path :-C (str common-prefix "/unpacked")) + (c/exec :rm :-f path) + (let [subdir (c/exec :ls (str common-prefix "/unpacked"))] + (c/exec :mv (str common-prefix "/unpacked/" subdir "/usr/bin/clickhouse") common-prefix) + (c/exec :rm :-fr (str common-prefix "/unpacked"))))) + +(defn chmod-binary + [path] + (c/exec :chmod :+x path)) + +(defn install-downloaded-clickhouse + [path] + (cond + (clojure.string/ends-with? path ".deb") (unpack-deb path) + (clojure.string/ends-with? path ".tgz") (unpack-tgz path) + (clojure.string/ends-with? path "clickhouse") (chmod-binary path) + :else (throw (Exception. (str "Don't know how to install clickhouse from path" path))))) (defn prepare-dirs [] (do - (c/exec :rm :-rf common-prefix) (c/exec :mkdir :-p common-prefix) (c/exec :mkdir :-p data-dir) (c/exec :mkdir :-p logs-dir) @@ -72,8 +98,10 @@ (do (info "Preparing directories") (prepare-dirs) - (info "Downloading clickhouse") - (get-clickhouse-sky version) + (if (not (cu/exists? 
binary-path)) + (do (info "Downloading clickhouse") + (install-downloaded-clickhouse (download-clickhouse version))) + (info "Binary already exists on path" binary-path "skipping download")) (info "Installing configs") (install-configs test node) (info "Starting server") @@ -113,7 +113,8 @@ (info node "Tearing down clickhouse") (kill-clickhouse! node test) (c/su - ;(c/exec :rm :-f binary-path) + (c/exec :rm :-rf binary-path) + (c/exec :rm :-rf pid-file-path) (c/exec :rm :-rf data-dir) (c/exec :rm :-rf logs-dir) (c/exec :rm :-rf configs-dir))) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj index e027b956937..f3db61c6d53 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -70,7 +70,9 @@ :default 100 :parse-fn parse-long :validate [pos? "Must be a positive integer."]] - [nil, "--lightweight-run", "Subset of workloads/nemesises which is simple to validate"]]) + [nil, "--lightweight-run" "Subset of workloads/nemesises which is simple to validate"] + ["-c" "--clickhouse-source URL" "URL for clickhouse deb or tgz package" + :default "https://clickhouse-builds.s3.yandex.net/21677/ef82333089156907a0979669d9374c2e18daabe5/clickhouse_build_check/clang-11_relwithdebuginfo_none_bundled_unsplitted_disable_False_deb/clickhouse-common-static_21.4.1.6313_amd64.deb"]]) (defn nukeeper-test "Given an options map from the command line runner (e.g. :nodes, :ssh, @@ -82,9 +84,9 @@ current-nemesis (get custom-nemesis/custom-nemesises (:nemesis opts))] (merge tests/noop-test opts - {:name (str "clickhouse-keeper quorum=" quorum " " (name (:workload opts)) " " (name (:nemesis opts))) + {:name (str "clickhouse-keeper-quorum=" quorum "-" (name (:workload opts)) "-" (name (:nemesis opts))) :os ubuntu/os - :db (db "rbtorrent:a284492c715974b69f73add62b4ff590110369af") + :db (db (:clickhouse-source opts)) :pure-generators true :client (:client workload) :nemesis (:nemesis current-nemesis) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj index 30774c24dae..0e0db2d3a6d 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj @@ -130,7 +130,7 @@ [node test] (info "Checking server alive on" node) (try - (c/exec (str binary-path "/clickhouse") :client :--query "SELECT 1") + (c/exec binary-path :client :--query "SELECT 1") (catch Exception _ false))) (defn wait-clickhouse-alive! @@ -144,18 +144,26 @@ [node test] (info "Killing server on node" node) (c/su - (cu/stop-daemon! (str binary-path "/clickhouse") pidfile) - (c/exec :rm :-fr (str dir "/status")))) + (cu/stop-daemon! binary-path pid-file-path) + (c/exec :rm :-fr (str data-dir "/status")))) (defn start-clickhouse! [node test] (info "Starting server on node" node) (c/su (cu/start-daemon! - {:pidfile pidfile - :logfile logfile - :chdir dir} - (str binary-path "/clickhouse") + {:pidfile pid-file-path + :logfile stderr-file + :chdir data-dir} + binary-path :server - :--config "/etc/clickhouse-server/config.xml")) - (wait-clickhouse-alive! node test)) + :--config (str configs-dir "/config.xml") + :-- + :--path (str data-dir "/") + :--user_files_path (str data-dir "/user_files") + :--top_level_domains_path (str data-dir "/top_level_domains") + :--logger.log (str logs-dir "/clickhouse-server.log") + :--logger.errorlog (str logs-dir "/clickhouse-server.err.log") + :--test_keeper_server.snapshot_storage_path coordination-snapshots-dir + :--test_keeper_server.logs_storage_path coordination-logs-dir) (wait-clickhouse-alive! 
+ */ template <DictionaryKeyType dictionary_key_type> class CacheDictionaryStorage final : public ICacheDictionaryStorage { @@ -484,7 +486,7 @@ private: PaddedPODArray<KeyType> result; result.reserve(size); - for (auto cell : cells) + for (auto & cell : cells) { if (cell.deadline == 0) continue;
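The doc comment added above describes a compact cache layout: probing stops after a fixed collision chain length, and a matching cell yields an index into separate per-attribute arrays rather than the values themselves. As a hedged sketch of that idea (all names invented for illustration; this is not the actual ClickHouse code):

```cpp
#include <cstdint>
#include <cstddef>
#include <optional>
#include <vector>

struct Cell { uint64_t key = 0; size_t attribute_index = 0; bool occupied = false; };

constexpr size_t max_collision_length = 10;  // fixed probe limit

std::optional<size_t> find(const std::vector<Cell> & cells, uint64_t key)
{
    size_t mask = cells.size() - 1;  // assumes cells.size() is a power of two
    size_t pos = key & mask;
    for (size_t i = 0; i < max_collision_length; ++i)
    {
        const Cell & cell = cells[(pos + i) & mask];
        if (cell.occupied && cell.key == key)
            return cell.attribute_index;  // index into the attribute arrays
    }
    return std::nullopt;  // treated as "not found in cache"
}
```

diff --git a/src/Dictionaries/SSDCacheDictionaryStorage.h b/src/Dictionaries/SSDCacheDictionaryStorage.h index d0b4a5ca835..baac725e184 100644 --- a/src/Dictionaries/SSDCacheDictionaryStorage.h +++ b/src/Dictionaries/SSDCacheDictionaryStorage.h @@ -336,9 +336,7 @@ inline bool operator==(const SSDCacheIndex & lhs, const SSDCacheIndex & rhs) { return lhs.block_index == rhs.block_index && lhs.offset_in_block == rhs.offset_in_block; } -/** SSDCacheMemoryBuffer initialized with block size and memory buffer blocks size. - * Allocate block_size * memory_buffer_blocks_size bytes with page alignment. - * Logically represents multiple memory_buffer_blocks_size blocks and current write block. +/** Logically represents multiple memory_buffer_blocks_size SSDCacheBlocks and current write block. * If key cannot be written into current_write_block, current block keys size and check summ is written * and buffer increase index of current_write_block_index. * If current_write_block_index == memory_buffer_blocks_size write key will always returns true. @@ -443,7 +441,7 @@ private: size_t current_block_index = 0; }; -/// TODO: Add documentation +/// Logically represents multiple memory_buffer_blocks_size SSDCacheBlocks on file system template class SSDCacheFileBuffer : private boost::noncopyable { @@ -796,7 +794,13 @@ private: size_t current_blocks_size = 0; }; -/// TODO: Add documentation +/** ICacheDictionaryStorage implementation that keeps column data serialized in memory index and in disk partitions. + * Data is first writen in memory buffer. + * If memory buffer is full then buffer is flushed to disk partition. + * If memory buffer cannot be flushed to associated disk partition, then if partition + * can be allocated (current partition index < max_partitions_size) storage allocates new partition, if not old partitions are reused. + * Index maps key to partition block and offset. + */ template class SSDCacheDictionaryStorage final : public ICacheDictionaryStorage { From 46f4c60839e32a2d46740143dcf59774dfa23d5d Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Tue, 23 Mar 2021 15:15:44 +0300 Subject: [PATCH 073/155] Small simplification in ExternalLoader. --- src/Interpreters/ExternalLoader.cpp | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/src/Interpreters/ExternalLoader.cpp b/src/Interpreters/ExternalLoader.cpp index 73257ba5185..853fe296d1c 100644 --- a/src/Interpreters/ExternalLoader.cpp +++ b/src/Interpreters/ExternalLoader.cpp @@ -818,13 +818,10 @@ private: if (!min_id) min_id = getMinIDToFinishLoading(forced_to_reload); - if (info->state_id >= min_id) - return true; /// stop - if (info->loading_id < min_id) startLoading(*info, forced_to_reload, *min_id); - /// Wait for the next event if loading wasn't completed, and stop otherwise. + /// Wait for the next event if loading wasn't completed, or stop otherwise. 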
return (info->state_id >= min_id); }; @@ -850,9 +847,6 @@ private: if (filter && !filter(name)) continue; - if (info.state_id >= min_id) - continue; - if (info.loading_id < min_id) startLoading(info, forced_to_reload, *min_id); From 83255cbd64305910fd2a72b0f1d00fcb63ed5ea6 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 23 Mar 2021 15:19:37 +0300 Subject: [PATCH 074/155] Add option to reuse same binary --- tests/jepsen.nukeeper/src/jepsen/nukeeper/db.clj | 7 ++++--- tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj | 3 ++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/db.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/db.clj index 106af25be17..7bc2b9c6cea 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/db.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/db.clj @@ -91,14 +91,14 @@ (c/exec :echo (cluster-config test node (slurp (io/resource "test_keeper_config.xml"))) :> (str sub-configs-dir "/test_keeper_config.xml"))) (defn db - [version] + [version reuse-binary] (reify db/DB (setup! [_ test node] (c/su (do (info "Preparing directories") (prepare-dirs) - (if (not (cu/exists? binary-path)) + (if (or (not (cu/exists? binary-path)) (not reuse-binary)) (do (info "Downloading clickhouse") (install-downloaded-clickhouse (download-clickhouse version))) (info "Binary already exsist on path" binary-path "skipping download")) @@ -113,7 +113,8 @@ (info node "Tearing down clickhouse") (kill-clickhouse! node test) (c/su - (c/exec :rm :-rf binary-path) + (if (not reuse-binary) + (c/exec :rm :-rf binary-path)) (c/exec :rm :-rf pid-file-path) (c/exec :rm :-rf data-dir) (c/exec :rm :-rf logs-dir) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj index f3db61c6d53..45a1f442d24 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -71,6 +71,7 @@ :parse-fn parse-long :validate [pos? 
"Must be a positive integer."]] [nil, "--lightweight-run" "Subset of workloads/nemesises which is simple to validate"] + [nil, "--reuse-binary" "Use already downloaded binary if it exists, don't remove it on shutdown"] ["-c" "--clickhouse-source URL" "URL for clickhouse deb or tgz package" :default "https://clickhouse-builds.s3.yandex.net/21677/ef82333089156907a0979669d9374c2e18daabe5/clickhouse_build_check/clang-11_relwithdebuginfo_none_bundled_unsplitted_disable_False_deb/clickhouse-common-static_21.4.1.6313_amd64.deb"]]) @@ -86,7 +87,7 @@ opts {:name (str "clickhouse-keeper-quorum=" quorum "-" (name (:workload opts)) "-" (name (:nemesis opts))) :os ubuntu/os - :db (db (:clickhouse-source opts)) + :db (db (:clickhouse-source opts) (boolean (:reuse-binary opts))) :pure-generators true :client (:client workload) :nemesis (:nemesis current-nemesis) From 0c70fe1a6d3a5f4db78a97f445bbac72824c6cff Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov Date: Tue, 23 Mar 2021 17:38:58 +0300 Subject: [PATCH 075/155] fix field get --- src/Processors/Transforms/WindowTransform.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/Processors/Transforms/WindowTransform.cpp b/src/Processors/Transforms/WindowTransform.cpp index 3a97698453a..121b9c818e1 100644 --- a/src/Processors/Transforms/WindowTransform.cpp +++ b/src/Processors/Transforms/WindowTransform.cpp @@ -117,9 +117,7 @@ static int compareValuesWithOffsetFloat(const IColumn * _compared_column, _compared_column); const auto * reference_column = assert_cast( _reference_column); - // The underlying field type is Float64 for Float32 as well. get() - // would be a reinterpret_cast and yield an incorrect result. - const auto offset = _offset.get(); + const auto offset = _offset.get(); const auto compared_value_data = compared_column->getDataAt(compared_row); assert(compared_value_data.size == sizeof(typename ColumnType::ValueType)); From bde02c72f3f1b082f9337a5aed88b5a50570979f Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Tue, 23 Mar 2021 18:14:22 +0300 Subject: [PATCH 076/155] Fixed typos --- src/Dictionaries/SSDCacheDictionaryStorage.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Dictionaries/SSDCacheDictionaryStorage.h b/src/Dictionaries/SSDCacheDictionaryStorage.h index baac725e184..67f0465a2c7 100644 --- a/src/Dictionaries/SSDCacheDictionaryStorage.h +++ b/src/Dictionaries/SSDCacheDictionaryStorage.h @@ -795,7 +795,7 @@ private: }; /** ICacheDictionaryStorage implementation that keeps column data serialized in memory index and in disk partitions. - * Data is first writen in memory buffer. + * Data is first written in memory buffer. * If memory buffer is full then buffer is flushed to disk partition. * If memory buffer cannot be flushed to associated disk partition, then if partition * can be allocated (current partition index < max_partitions_size) storage allocates new partition, if not old partitions are reused. From 4716791a1abf6522c9628d429497229479fba568 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 23 Mar 2021 19:06:13 +0300 Subject: [PATCH 077/155] Better README.md --- tests/jepsen.nukeeper/README.md | 137 +++++++++++++++++++++++++++++++- 1 file changed, 135 insertions(+), 2 deletions(-) diff --git a/tests/jepsen.nukeeper/README.md b/tests/jepsen.nukeeper/README.md index f72409e080f..6bcd7a37069 100644 --- a/tests/jepsen.nukeeper/README.md +++ b/tests/jepsen.nukeeper/README.md @@ -1,10 +1,143 @@ # jepsen.nukeeper -A Clojure library designed to ... well, that part is up to you. 
+A Clojure library designed to test a ZooKeeper-like implementation inside ClickHouse. + +## Test scenarios (workloads) + +### CAS register + +CAS Register has three operations: read number, write number, compare-and-swap number. This register is simulated as a single ZooKeeper node. Read transforms to ZooKeeper's `getData` request. Write transforms to the `set` request. Compare-and-swap is implemented via `getData` + compare in code + `set` of the new value with the `version` from `getData`. + +In this test, we use a linearizable checker, so Jepsen validates that the history was linearizable. One of the heaviest workloads. + +Strictly requires `quorum_reads` to be true. + +### Set + +Set has two operations: add a number to the set and read all values from the set. This workload is simulated on a single ZooKeeper node with a string value that represents a Clojure set data structure. The add operation is very similar to compare-and-swap. We read the string value from the ZooKeeper node with `getData`, parse it to a Clojure set, add the new value to the set and try to write it with the received version. + +In this test, Jepsen validates that all successfully added values can be read. The generator for this workload performs only add operations until a timeout and after that tries to read the set once. + +### Unique IDs + +In the Unique IDs workload we have only one operation: generate a new unique number. It's implemented using ZooKeeper's sequential nodes. For each generate request, the client just creates a new sequential node in ZooKeeper with a fixed prefix. After that it cuts the prefix off the returned path and parses the number from the remaining part. + +Jepsen checks that all returned IDs were unique. + +### Counter + +The Counter workload has two operations: read the counter value and add some number to the counter. Its implementation is quite weird. We add the number `N` to the counter by creating `N` sequential nodes in a single ZooKeeper transaction. Counter read is implemented as a `getChildren` ZooKeeper request and a count of all returned nodes. + +Jepsen checks that the counter value lies in the interval of possible values. Strictly requires `quorum_reads` to be true. + +### Total queue + +Simulates an unordered queue with three operations: enqueue number, dequeue, and drain. The enqueue operation uses a `create` request with the node name equal to the number. The `dequeue` operation is more interesting. We list (`getChildren`) all nodes and remember the parent node version. After that we choose the smallest one and prepare the transaction: `check` the parent node version + set an empty value to the parent node + delete the smallest child node. The drain operation is just `getChildren` on the parent path. + +Jepsen checks that all enqueued values were dequeued or drained. Duplicates are allowed because Jepsen doesn't know the value of an unknown-status (`:info`) dequeue operation. So when we try to `dequeue` some element we should return it even if our delete transaction failed with a `Connection loss` error. + +### Linear queue + +Same as the total queue, but without the drain operation. Checks linearizability between enqueue and dequeue. Sometimes consumes more than 10GB of memory during validation even for very short histories.
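As a rough illustration of how these workloads drive the server, here is what the compare-and-swap step of the CAS register workload described above might look like in C++, written against an invented `ZkClient` wrapper (every name below is hypothetical; the real tests drive ZooKeeper from Clojure):

```cpp
#include <string>
#include <utility>

// Hypothetical minimal client interface, for illustration only.
struct ZkClient
{
    // Returns the node value together with the version from its Stat.
    virtual std::pair<std::string, int> getData(const std::string & path) = 0;
    // Conditional write: succeeds only if the node is still at `version`.
    virtual bool set(const std::string & path, const std::string & value, int version) = 0;
    virtual ~ZkClient() = default;
};

// getData + compare in code + versioned set, as described above.
bool compareAndSwap(ZkClient & zk, const std::string & path,
                    const std::string & expected, const std::string & desired)
{
    auto [value, version] = zk.getData(path);
    if (value != expected)
        return false;                       // current value does not match
    return zk.set(path, desired, version);  // false if someone wrote in between
}
```

+ + +## Nemesis + +We use almost all standard nemeses with small changes for our storage. + +### Random node killer (random-node-killer) + +Sleeps for 5 seconds, kills a random node, sleeps for 5 more seconds, and starts it back. + +### All nodes killer (all-nodes-killer) + +Kills all nodes at once, sleeps for 5 seconds, and starts them back.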
+
+### Simple partitioner (simple-partitioner)
+
+Partitions one node from the others using iptables. No one can see the victim, and the victim cannot see anybody.
+
+### Random node stop (random-node-hammer-time)
+
+Sends `SIGSTOP` to a random node. Sleeps 5 seconds. Sends `SIGCONT`.
+
+### All nodes stop (all-nodes-hammer-time)
+
+Sends `SIGSTOP` to all nodes. Sleeps 5 seconds. Sends `SIGCONT`.
+
+### Logs corruptor (logs-corruptor)
+
+Corrupts the latest log (changes one random byte) in `clickhouse_path/coordination/logs`. Restarts the nodes.
+
+### Snapshots corruptor (snapshots-corruptor)
+
+Corrupts the latest snapshot (changes one random byte) in `clickhouse_path/coordination/snapshots`. Restarts the nodes.
+
+### Logs and snapshots corruptor (logs-and-snapshots-corruptor)
+
+Corrupts both the latest log and the latest snapshot. Restarts the node.
+
+### Drop data corruptor (drop-data-corruptor)
+
+Drops all data from `clickhouse_path/coordinator`. Restarts the node.
+
+### Bridge partitioner (bridge-partitioner)
+
+Two nodes cannot see each other but can see a third node, and that node can see both of them.
+
+### Blind node partitioner (blind-node-partitioner)
+
+One of the nodes cannot see the others, but they can see it.
+
+### Blind others partitioner (blind-others-partitioner)
+
+Two nodes cannot see one node, but it can see both of them.
 
 ## Usage
 
-FIXME
+### Dependencies
+
+- leiningen (https://leiningen.org/)
+- clojure (https://clojure.org/)
+- JVM
+
+### Options for `lein run`
+
+- `test` Run a single test.
+- `test-all` Run all available tests from the test set.
+- `-w (--workload)` One of the workloads. Option for a single `test`.
+- `--nemesis` One of the nemeses. Option for a single `test`.
+- `-q (--quorum)` Run the test with quorum reads.
+- `-r (--rate)` How many operations per second Jepsen will generate in a single thread.
+- `-s (--snapshot-distance)` ClickHouse Keeper setting. How often we will create a new snapshot.
+- `--stale-log-gap` ClickHouse Keeper setting. A leader will send a snapshot instead of the log to a node if that node's committed index is less than the leader's minus this setting's value.
+- `--reserved-log-items` ClickHouse Keeper setting. How many log items to keep after the snapshot.
+- `--ops-per-key` Option for the CAS register workload. Total ops that will be generated for a single register.
+- `--lightweight-run` Run some lightweight tests without linearizability checks. Option for a `test-all` run.
+- `--reuse-binary` Don't download the clickhouse binary if it already exists on the node.
+- `--clickhouse-source` URL to a clickhouse `.deb`, `.tgz`, or binary.
+- `--time-limit` How long (in seconds) Jepsen will generate new operations.
+- `--nodes-file` File with nodes for SSH, newline-separated.
+- `--username` SSH username for the nodes.
+- `--password` SSH password for the nodes.
+- `--concurrency` How many threads Jepsen will use for concurrent requests.
+- `--test-count` How many times to run a single test, or how many tests to run from the test set.
+
+### Examples:
+
+1. Run the `Set` workload with `logs-and-snapshots-corruptor` ten times:
+
+```sh
+$ lein run test --nodes-file nodes.txt --username root --password '' --time-limit 30 --concurrency 50 -r 50 --workload set --nemesis logs-and-snapshots-corruptor --clickhouse-source 'https://clickhouse-builds.s3.yandex.net/someurl/clickhouse-common-static_21.4.1.6321_amd64.deb' -q --test-count 10 --reuse-binary
+```
+
+2. 
Run ten random tests from `lightweight-run` with some custom Keeper settings: + +``` sh +$ lein run test-all --nodes-file nodes.txt --username root --password '' --time-limit 30 --concurrency 50 -r 50 --snapshot-distance 100 --stale-log-gap 100 --reserved-log-items 10 --lightweight-run --clickhouse-source 'someurl' -q --reuse-binary --test-count 10 +``` + ## License From ba6ccbab42fd19da1322f8443de737fc8ef08edc Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 23 Mar 2021 19:07:41 +0300 Subject: [PATCH 078/155] Fix header --- tests/jepsen.nukeeper/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/jepsen.nukeeper/README.md b/tests/jepsen.nukeeper/README.md index 6bcd7a37069..8f3754b8f7b 100644 --- a/tests/jepsen.nukeeper/README.md +++ b/tests/jepsen.nukeeper/README.md @@ -1,4 +1,4 @@ -# jepsen.nukeeper +# Jepsen tests ClickHouse Keeper A Clojure library designed to test ZooKeeper-like implementation inside ClickHouse. From 1f4df07e08fadddf8f3f2ef205691460bd191622 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Tue, 23 Mar 2021 20:58:30 +0300 Subject: [PATCH 079/155] Update used version of simdjson to 0.9.1 --- .gitmodules | 2 +- contrib/simdjson | 2 +- src/Functions/SimdJSONParser.h | 20 ++++++++++---------- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/.gitmodules b/.gitmodules index 7a2c5600e65..f9bc8a56a5c 100644 --- a/.gitmodules +++ b/.gitmodules @@ -93,7 +93,7 @@ url = https://github.com/ClickHouse-Extras/libunwind.git [submodule "contrib/simdjson"] path = contrib/simdjson - url = https://github.com/ClickHouse-Extras/simdjson.git + url = https://github.com/simdjson/simdjson.git [submodule "contrib/rapidjson"] path = contrib/rapidjson url = https://github.com/ClickHouse-Extras/rapidjson diff --git a/contrib/simdjson b/contrib/simdjson index 3190d66a490..95b4870e20b 160000 --- a/contrib/simdjson +++ b/contrib/simdjson @@ -1 +1 @@ -Subproject commit 3190d66a49059092a1753dc35595923debfc1698 +Subproject commit 95b4870e20be5f97d9dcf63b23b1c6f520c366c1 diff --git a/src/Functions/SimdJSONParser.h b/src/Functions/SimdJSONParser.h index a9adfa27e2c..7ff3c45130d 100644 --- a/src/Functions/SimdJSONParser.h +++ b/src/Functions/SimdJSONParser.h @@ -42,11 +42,11 @@ struct SimdJSONParser ALWAYS_INLINE bool isBool() const { return element.type() == simdjson::dom::element_type::BOOL; } ALWAYS_INLINE bool isNull() const { return element.type() == simdjson::dom::element_type::NULL_VALUE; } - ALWAYS_INLINE Int64 getInt64() const { return element.get_int64().first; } - ALWAYS_INLINE UInt64 getUInt64() const { return element.get_uint64().first; } - ALWAYS_INLINE double getDouble() const { return element.get_double().first; } - ALWAYS_INLINE bool getBool() const { return element.get_bool().first; } - ALWAYS_INLINE std::string_view getString() const { return element.get_string().first; } + ALWAYS_INLINE Int64 getInt64() const { return element.get_int64().value_unsafe(); } + ALWAYS_INLINE UInt64 getUInt64() const { return element.get_uint64().value_unsafe(); } + ALWAYS_INLINE double getDouble() const { return element.get_double().value_unsafe(); } + ALWAYS_INLINE bool getBool() const { return element.get_bool().value_unsafe(); } + ALWAYS_INLINE std::string_view getString() const { return element.get_string().value_unsafe(); } ALWAYS_INLINE Array getArray() const; ALWAYS_INLINE Object getObject() const; @@ -75,7 +75,7 @@ struct SimdJSONParser ALWAYS_INLINE Iterator begin() const { return array.begin(); } ALWAYS_INLINE Iterator end() const { return 
array.end(); }
         ALWAYS_INLINE size_t size() const { return array.size(); }
-        ALWAYS_INLINE Element operator[](size_t index) const { assert(index < size()); return array.at(index).first; }
+        ALWAYS_INLINE Element operator[](size_t index) const { assert(index < size()); return array.at(index).value_unsafe(); }
 
     private:
         simdjson::dom::array array;
@@ -111,7 +111,7 @@ struct SimdJSONParser
         if (x.error())
             return false;
 
-        result = x.first;
+        result = x.value_unsafe();
         return true;
     }
@@ -137,7 +137,7 @@ struct SimdJSONParser
         if (document.error())
             return false;
 
-        result = document.first;
+        result = document.value_unsafe();
         return true;
     }
@@ -155,12 +155,12 @@ private:
 
 inline ALWAYS_INLINE SimdJSONParser::Array SimdJSONParser::Element::getArray() const
 {
-    return element.get_array().first;
+    return element.get_array().value_unsafe();
 }
 
 inline ALWAYS_INLINE SimdJSONParser::Object SimdJSONParser::Element::getObject() const
 {
-    return element.get_object().first;
+    return element.get_object().value_unsafe();
 }
 
 }

From 912144307d974afa5ecbf81e671f6adfba9aa231 Mon Sep 17 00:00:00 2001
From: Azat Khuzhin
Date: Tue, 23 Mar 2021 23:18:02 +0300
Subject: [PATCH 080/155] Fix type of the ErrorCodes

---
 src/Common/ErrorCodes.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp
index 586c0fbde4d..30714cb82ae 100644
--- a/src/Common/ErrorCodes.cpp
+++ b/src/Common/ErrorCodes.cpp
@@ -560,7 +560,7 @@ namespace DB
 {
 namespace ErrorCodes
 {
-#define M(VALUE, NAME) extern const Value NAME = VALUE;
+#define M(VALUE, NAME) extern const ErrorCode NAME = VALUE;
     APPLY_FOR_ERROR_CODES(M)
 #undef M

From 7154b36a2da6cefac708ed2b0ebc67615ff9ead9 Mon Sep 17 00:00:00 2001
From: Dmitriy
Date: Wed, 24 Mar 2021 00:08:07 +0300
Subject: [PATCH 081/155] Add ORC output format

MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Documented data output in the ORC format.

---
 docs/en/interfaces/formats.md | 51 ++++++++++++++++++++---------------
 1 file changed, 29 insertions(+), 22 deletions(-)

diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md
index ee2235b7861..940fc8cd636 100644
--- a/docs/en/interfaces/formats.md
+++ b/docs/en/interfaces/formats.md
@@ -50,7 +50,7 @@ The supported formats are:
 | [Parquet](#data-format-parquet) | ✔ | ✔ |
 | [Arrow](#data-format-arrow) | ✔ | ✔ |
 | [ArrowStream](#data-format-arrow-stream) | ✔ | ✔ |
-| [ORC](#data-format-orc) | ✔ | ✗ |
+| [ORC](#data-format-orc) | ✔ | ✔ |
 | [RowBinary](#rowbinary) | ✔ | ✔ |
 | [RowBinaryWithNamesAndTypes](#rowbinarywithnamesandtypes) | ✔ | ✔ |
 | [Native](#native) | ✔ | ✔ |
@@ -1284,36 +1284,37 @@ To exchange data with Hadoop, you can use [HDFS table engine](../engines/table-e
 
 ## ORC {#data-format-orc}
 
-[Apache ORC](https://orc.apache.org/) is a columnar storage format widespread in the Hadoop ecosystem. You can only insert data in this format to ClickHouse.
+[Apache ORC](https://orc.apache.org/) is a columnar storage format widespread in the [Hadoop](https://hadoop.apache.org/) ecosystem.
 
 ### Data Types Matching {#data_types-matching-3}
 
-The table below shows supported data types and how they match ClickHouse [data types](../sql-reference/data-types/index.md) in `INSERT` queries.
-| ORC data type (`INSERT`) | ClickHouse data type | -|--------------------------|-----------------------------------------------------| -| `UINT8`, `BOOL` | [UInt8](../sql-reference/data-types/int-uint.md) | -| `INT8` | [Int8](../sql-reference/data-types/int-uint.md) | -| `UINT16` | [UInt16](../sql-reference/data-types/int-uint.md) | -| `INT16` | [Int16](../sql-reference/data-types/int-uint.md) | -| `UINT32` | [UInt32](../sql-reference/data-types/int-uint.md) | -| `INT32` | [Int32](../sql-reference/data-types/int-uint.md) | -| `UINT64` | [UInt64](../sql-reference/data-types/int-uint.md) | -| `INT64` | [Int64](../sql-reference/data-types/int-uint.md) | -| `FLOAT`, `HALF_FLOAT` | [Float32](../sql-reference/data-types/float.md) | -| `DOUBLE` | [Float64](../sql-reference/data-types/float.md) | -| `DATE32` | [Date](../sql-reference/data-types/date.md) | -| `DATE64`, `TIMESTAMP` | [DateTime](../sql-reference/data-types/datetime.md) | -| `STRING`, `BINARY` | [String](../sql-reference/data-types/string.md) | -| `DECIMAL` | [Decimal](../sql-reference/data-types/decimal.md) | +| ORC data type (`INSERT`) | ClickHouse data type | ORC data type (`SELECT`) | +|--------------------------|-----------------------------------------------------|--------------------------| +| `UINT8`, `BOOL` | [UInt8](../sql-reference/data-types/int-uint.md) | `UINT8` | +| `INT8` | [Int8](../sql-reference/data-types/int-uint.md) | `INT8` | +| `UINT16` | [UInt16](../sql-reference/data-types/int-uint.md) | `UINT16` | +| `INT16` | [Int16](../sql-reference/data-types/int-uint.md) | `INT16` | +| `UINT32` | [UInt32](../sql-reference/data-types/int-uint.md) | `UINT32` | +| `INT32` | [Int32](../sql-reference/data-types/int-uint.md) | `INT32` | +| `UINT64` | [UInt64](../sql-reference/data-types/int-uint.md) | `UINT64` | +| `INT64` | [Int64](../sql-reference/data-types/int-uint.md) | `INT64` | +| `FLOAT`, `HALF_FLOAT` | [Float32](../sql-reference/data-types/float.md) | `FLOAT` | +| `DOUBLE` | [Float64](../sql-reference/data-types/float.md) | `DOUBLE` | +| `DATE32` | [Date](../sql-reference/data-types/date.md) | `DATE32` | +| `DATE64`, `TIMESTAMP` | [DateTime](../sql-reference/data-types/datetime.md) | `TIMESTAMP` | +| `STRING`, `BINARY` | [String](../sql-reference/data-types/string.md) | `BINARY` | +| `DECIMAL` | [Decimal](../sql-reference/data-types/decimal.md) | `DECIMAL` | +| `-` | [Array](../sql-reference/data-types/array.md) | `LIST` | -ClickHouse supports configurable precision of the `Decimal` type. The `INSERT` query treats the ORC `DECIMAL` type as the ClickHouse `Decimal128` type. +ClickHouse supports configurable precision of the `Decimal` type. The `INSERT` or `SELECT` query treats the ORC `DECIMAL` type as the ClickHouse `Decimal128` type. -Unsupported ORC data types: `DATE32`, `TIME32`, `FIXED_SIZE_BINARY`, `JSON`, `UUID`, `ENUM`. +Unsupported ORC data types: `TIME32`, `FIXED_SIZE_BINARY`, `JSON`, `UUID`, `ENUM`. The data types of ClickHouse table columns don’t have to match the corresponding ORC data fields. When inserting data, ClickHouse interprets data types according to the table above and then [casts](../sql-reference/functions/type-conversion-functions.md#type_conversion_function-cast) the data to the data type set for the ClickHouse table column. 
-### Inserting Data {#inserting-data-2}
+### Inserting and Selecting Data {#inserting-and-selecting-data-1}
 
 You can insert ORC data from a file into ClickHouse table by the following command:
 
@@ -1321,6 +1322,12 @@ You can insert ORC data from a file into ClickHouse table by the following comma
 $ cat filename.orc | clickhouse-client --query="INSERT INTO some_table FORMAT ORC"
 ```
 
+You can select data from a ClickHouse table and save them into some file in the ORC format by the following command:
+
+``` bash
+$ clickhouse-client --query="SELECT * FROM {some_table} FORMAT ORC" > {filename.orc}
+```
+
 To exchange data with Hadoop, you can use [HDFS table engine](../engines/table-engines/integrations/hdfs.md).
 
 ## LineAsString {#lineasstring}

From 767eba04f99a19cdcd933a73c3f3decee5a1a63c Mon Sep 17 00:00:00 2001
From: Dmitriy
Date: Wed, 24 Mar 2021 00:22:38 +0300
Subject: [PATCH 082/155] Update ORC format

MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixed the anchors.

---
 docs/en/interfaces/formats.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md
index 940fc8cd636..0d582fab12b 100644
--- a/docs/en/interfaces/formats.md
+++ b/docs/en/interfaces/formats.md
@@ -1288,7 +1288,7 @@ To exchange data with Hadoop, you can use [HDFS table engine](../engines/table-e
 
 ### Data Types Matching {#data_types-matching-3}
 
-The table below shows supported data types and how they match ClickHouse [data types](../sql-reference/data-types/index.md).
+The table below shows supported data types and how they match ClickHouse [data types](../sql-reference/data-types/index.md) in `INSERT` and `SELECT` queries.
 
@@ -1314,7 +1314,7 @@ Unsupported ORC data types: `TIME32`, `FIXED_SIZE_BINARY`, `JSON`, `UUID`, `ENUM
 
 The data types of ClickHouse table columns don’t have to match the corresponding ORC data fields. When inserting data, ClickHouse interprets data types according to the table above and then [casts](../sql-reference/functions/type-conversion-functions.md#type_conversion_function-cast) the data to the data type set for the ClickHouse table column.
-### Inserting and Selecting Data {#inserting-and-selecting-data-1} +### Inserting Data {#inserting-data-2} You can insert ORC data from a file into ClickHouse table by the following command: @@ -1322,6 +1322,8 @@ You can insert ORC data from a file into ClickHouse table by the following comma $ cat filename.orc | clickhouse-client --query="INSERT INTO some_table FORMAT ORC" ``` +### Selecting Data {#selecting-data-2} + You can select data from a ClickHouse table and save them into some file in the ORC format by the following command: ``` bash From f2ef536dfb9e7d6b7fd9b6cd95f75293840d9729 Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov Date: Wed, 24 Mar 2021 01:40:27 +0300 Subject: [PATCH 083/155] fix formatting --- src/Parsers/ASTSelectQuery.cpp | 2 +- src/Parsers/ASTWindowDefinition.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Parsers/ASTSelectQuery.cpp b/src/Parsers/ASTSelectQuery.cpp index aa5508bf190..89ef08e0289 100644 --- a/src/Parsers/ASTSelectQuery.cpp +++ b/src/Parsers/ASTSelectQuery.cpp @@ -137,7 +137,7 @@ void ASTSelectQuery::formatImpl(const FormatSettings & s, FormatState & state, F if (window()) { s.ostr << (s.hilite ? hilite_keyword : "") << s.nl_or_ws << indent_str << - "WINDOW " << (s.hilite ? hilite_none : ""); + "WINDOW" << (s.hilite ? hilite_none : ""); window()->formatImpl(s, state, frame); } diff --git a/src/Parsers/ASTWindowDefinition.cpp b/src/Parsers/ASTWindowDefinition.cpp index ff08bda65ed..a645960bd0a 100644 --- a/src/Parsers/ASTWindowDefinition.cpp +++ b/src/Parsers/ASTWindowDefinition.cpp @@ -37,7 +37,7 @@ void ASTWindowDefinition::formatImpl(const FormatSettings & settings, { if (partition_by) { - settings.ostr << "PARTITION BY "; + settings.ostr << "PARTITION BY"; partition_by->formatImpl(settings, state, format_frame); } @@ -48,7 +48,7 @@ void ASTWindowDefinition::formatImpl(const FormatSettings & settings, if (order_by) { - settings.ostr << "ORDER BY "; + settings.ostr << "ORDER BY"; order_by->formatImpl(settings, state, format_frame); } From e0d1f6d80fc3b727acc3b3e336da48bfb005fa61 Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov Date: Wed, 24 Mar 2021 01:52:16 +0300 Subject: [PATCH 084/155] fixes --- src/Processors/Transforms/WindowTransform.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/Processors/Transforms/WindowTransform.cpp b/src/Processors/Transforms/WindowTransform.cpp index 121b9c818e1..3ab16d0d1b4 100644 --- a/src/Processors/Transforms/WindowTransform.cpp +++ b/src/Processors/Transforms/WindowTransform.cpp @@ -49,6 +49,7 @@ static int compareValuesWithOffset(const IColumn * _compared_column, const auto * reference_column = assert_cast( _reference_column); const auto offset = _offset.get(); + assert(offset >= 0); const auto compared_value_data = compared_column->getDataAt(compared_row); assert(compared_value_data.size == sizeof(typename ColumnType::ValueType)); @@ -117,7 +118,8 @@ static int compareValuesWithOffsetFloat(const IColumn * _compared_column, _compared_column); const auto * reference_column = assert_cast( _reference_column); - const auto offset = _offset.get(); + const auto offset = _offset.get(); + assert(offset >= 0); const auto compared_value_data = compared_column->getDataAt(compared_row); assert(compared_value_data.size == sizeof(typename ColumnType::ValueType)); @@ -1403,6 +1405,7 @@ struct WindowFunctionRowNumber final : public WindowFunction } }; +// ClickHouse-specific variant of lag/lead that respects the window frame. 
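+// For example, lagInFrame(x, n) returns the value of x at the row n rows
+// before the current row, but only if that row lies within the window frame;
+// an optional third argument supplies the default value returned for rows
+// that fall outside the frame. Standard lag/lead, by contrast, operate over
+// the whole partition.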
template struct WindowFunctionLagLeadInFrame final : public WindowFunction { From 612d4fb073e37c4c6b89fb63e5ac864945b43959 Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Wed, 24 Mar 2021 02:03:14 +0300 Subject: [PATCH 085/155] Update IMergeTreeDataPart.cpp --- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 50a3169de0e..453edcdbbcd 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -359,7 +359,7 @@ std::pair IMergeTreeDataPart::getMinMaxTime() const /// The case of DateTime64 else if (hyperrectangle.left.getType() == Field::Types::Decimal64) { - assert(hyperrectangle.right.getType() == Field::Types::UInt64); + assert(hyperrectangle.right.getType() == Field::Types::Decimal64); auto left = hyperrectangle.left.get>(); auto right = hyperrectangle.right.get>(); From eae268f2f0ce9935a0be0dd3d05e0d99897aeb00 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 24 Mar 2021 03:15:48 +0300 Subject: [PATCH 086/155] Allow to search tuple of NULLs in a set --- src/Interpreters/convertFieldToType.cpp | 5 +++++ tests/queries/0_stateless/01774_tuple_null_in.reference | 2 ++ tests/queries/0_stateless/01774_tuple_null_in.sql | 2 ++ 3 files changed, 9 insertions(+) create mode 100644 tests/queries/0_stateless/01774_tuple_null_in.reference create mode 100644 tests/queries/0_stateless/01774_tuple_null_in.sql diff --git a/src/Interpreters/convertFieldToType.cpp b/src/Interpreters/convertFieldToType.cpp index d47f64cb1dc..5d124add0df 100644 --- a/src/Interpreters/convertFieldToType.cpp +++ b/src/Interpreters/convertFieldToType.cpp @@ -377,6 +377,11 @@ Field convertFieldToType(const Field & from_value, const IDataType & to_type, co else if (const auto * nullable_type = typeid_cast(&to_type)) { const IDataType & nested_type = *nullable_type->getNestedType(); + + /// NULL remains NULL after any conversion. 
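+        /// (The nested type is Nothing only for Nullable(Nothing), which is the
+        /// type of a bare NULL literal, so the conversion result here is always NULL.)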
+ if (WhichDataType(nested_type).isNothing()) + return {}; + if (from_type_hint && from_type_hint->equals(nested_type)) return from_value; return convertFieldToTypeImpl(from_value, nested_type, from_type_hint); diff --git a/tests/queries/0_stateless/01774_tuple_null_in.reference b/tests/queries/0_stateless/01774_tuple_null_in.reference new file mode 100644 index 00000000000..aa47d0d46d4 --- /dev/null +++ b/tests/queries/0_stateless/01774_tuple_null_in.reference @@ -0,0 +1,2 @@ +0 +0 diff --git a/tests/queries/0_stateless/01774_tuple_null_in.sql b/tests/queries/0_stateless/01774_tuple_null_in.sql new file mode 100644 index 00000000000..c9dad49e8ed --- /dev/null +++ b/tests/queries/0_stateless/01774_tuple_null_in.sql @@ -0,0 +1,2 @@ +SELECT (NULL, NULL) = (8, 0) OR (NULL, NULL) = (3, 2) OR (NULL, NULL) = (0, 0) OR (NULL, NULL) = (3, 1); +SELECT (NULL, NULL) IN ((NULL, 0), (3, 1), (3, 2), (8, 0)); From 3e74f56261ef0055553f69d014681c7f830400d6 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 24 Mar 2021 03:34:30 +0300 Subject: [PATCH 087/155] Fix missing check in decrypt for AEAD mode --- src/Functions/FunctionsAES.h | 18 ++++++++++++++---- .../01776_decrypt_aead_size_check.reference | 0 .../01776_decrypt_aead_size_check.sql | 1 + 3 files changed, 15 insertions(+), 4 deletions(-) create mode 100644 tests/queries/0_stateless/01776_decrypt_aead_size_check.reference create mode 100644 tests/queries/0_stateless/01776_decrypt_aead_size_check.sql diff --git a/src/Functions/FunctionsAES.h b/src/Functions/FunctionsAES.h index 132e94907f5..8af4a27ecc9 100644 --- a/src/Functions/FunctionsAES.h +++ b/src/Functions/FunctionsAES.h @@ -538,8 +538,9 @@ private: [[maybe_unused]] const auto block_size = static_cast(EVP_CIPHER_block_size(evp_cipher)); [[maybe_unused]] const auto iv_size = static_cast(EVP_CIPHER_iv_length(evp_cipher)); - const auto key_size = static_cast(EVP_CIPHER_key_length(evp_cipher)); - const auto tag_size = 16; // https://tools.ietf.org/html/rfc5116#section-5.1 + + const size_t key_size = static_cast(EVP_CIPHER_key_length(evp_cipher)); + static constexpr size_t tag_size = 16; // https://tools.ietf.org/html/rfc5116#section-5.1 auto decrypted_result_column = ColumnString::create(); auto & decrypted_result_column_data = decrypted_result_column->getChars(); @@ -549,9 +550,17 @@ private: size_t resulting_size = 0; for (size_t r = 0; r < input_rows_count; ++r) { - resulting_size += input_column->getDataAt(r).size + 1; + size_t string_size = input_column->getDataAt(r).size; + resulting_size += string_size + 1; /// With terminating zero. 
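+            /// For AEAD (GCM) the input additionally carries a 16-byte
+            /// authentication tag at its end (see RFC 5116); the tag is
+            /// verified rather than decrypted, so it is excluded below from
+            /// the expected plaintext size.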
+            if constexpr (mode == CipherMode::RFC5116_AEAD_AES_GCM)
+            {
+                if (string_size < tag_size)
+                    throw Exception("Encrypted data is smaller than the size of additional data for AEAD mode, cannot decrypt.",
+                        ErrorCodes::BAD_ARGUMENTS);
+
+                resulting_size -= tag_size;
+            }
         }
 
 #if defined(MEMORY_SANITIZER)
@@ -565,6 +574,7 @@ private:
             decrypted_result_column_data.resize(resulting_size);
 #endif
         }
+
         auto * decrypted = decrypted_result_column_data.data();
 
         KeyHolder key_holder;
@@ -631,7 +641,7 @@ private:
                 // 1.a.2: Set AAD if present
                 if (aad_column)
                 {
-                    const auto aad_data = aad_column->getDataAt(r);
+                    StringRef aad_data = aad_column->getDataAt(r);
                     int tmp_len = 0;
                     if (aad_data.size != 0 && EVP_DecryptUpdate(evp_ctx, nullptr, &tmp_len, reinterpret_cast(aad_data.data), aad_data.size) != 1)
diff --git a/tests/queries/0_stateless/01776_decrypt_aead_size_check.reference b/tests/queries/0_stateless/01776_decrypt_aead_size_check.reference
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/tests/queries/0_stateless/01776_decrypt_aead_size_check.sql b/tests/queries/0_stateless/01776_decrypt_aead_size_check.sql
new file mode 100644
index 00000000000..8730ed0eda2
--- /dev/null
+++ b/tests/queries/0_stateless/01776_decrypt_aead_size_check.sql
@@ -0,0 +1 @@
+SELECT decrypt('aes-128-gcm', 'text', 'key', 'IV'); -- { serverError 36 }

From 5dc9223288a5fe3f17a90faabded0c31e74178b7 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Wed, 24 Mar 2021 04:11:57 +0300
Subject: [PATCH 088/155] Fix Arcadia

---
 src/Functions/ya.make | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/Functions/ya.make b/src/Functions/ya.make
index 3ac64828b9c..aed2bd9b70d 100644
--- a/src/Functions/ya.make
+++ b/src/Functions/ya.make
@@ -467,6 +467,7 @@ SRCS(
     timeSlot.cpp
     timeSlots.cpp
     timezone.cpp
+    timezoneOf.cpp
     timezoneOffset.cpp
     toColumnTypeName.cpp
     toCustomWeek.cpp
@@ -506,7 +507,7 @@ SRCS(
     toStartOfTenMinutes.cpp
     toStartOfYear.cpp
     toTime.cpp
-    toTimeZone.cpp
+    toTimezone.cpp
     toTypeName.cpp
     toUnixTimestamp64Micro.cpp
     toUnixTimestamp64Milli.cpp

From 37948ac80a8ddc8f045721409598936757a2d3f2 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Wed, 24 Mar 2021 04:12:52 +0300
Subject: [PATCH 089/155] Fix style

---
 src/Functions/timezoneOf.cpp | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/Functions/timezoneOf.cpp b/src/Functions/timezoneOf.cpp
index cdf686e276b..1d007a6e10e 100644
--- a/src/Functions/timezoneOf.cpp
+++ b/src/Functions/timezoneOf.cpp
@@ -9,6 +9,13 @@
 namespace DB
 {
+
+namespace ErrorCodes
+{
+    extern const int BAD_ARGUMENTS;
+}
+
+
 namespace
 {

From 7c07b43597c9bebd4853dea50ff484be3a13ee01 Mon Sep 17 00:00:00 2001
From: Azat Khuzhin
Date: Tue, 23 Mar 2021 23:03:08 +0300
Subject: [PATCH 090/155] Convert system.errors.stack_trace from String into
 Array(UInt64)

This should decrease the overhead of error collection.
--- docs/en/operations/system-tables/errors.md | 11 ++++++- src/Common/ErrorCodes.cpp | 8 ++--- src/Common/ErrorCodes.h | 12 +++++--- src/Common/Exception.cpp | 34 ++++++++++++++++++--- src/Common/Exception.h | 4 +++ src/Storages/System/StorageSystemErrors.cpp | 12 ++++++-- 6 files changed, 65 insertions(+), 16 deletions(-) diff --git a/docs/en/operations/system-tables/errors.md b/docs/en/operations/system-tables/errors.md index 72a537f15b9..583cce88ca4 100644 --- a/docs/en/operations/system-tables/errors.md +++ b/docs/en/operations/system-tables/errors.md @@ -9,7 +9,7 @@ Columns: - `value` ([UInt64](../../sql-reference/data-types/int-uint.md)) — the number of times this error has been happened. - `last_error_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — time when the last error happened. - `last_error_message` ([String](../../sql-reference/data-types/string.md)) — message for the last error. -- `last_error_stacktrace` ([String](../../sql-reference/data-types/string.md)) — stacktrace for the last error. +- `last_error_trace` ([Array(UInt64)](../../sql-reference/data-types/array.md)) — A [stack trace](https://en.wikipedia.org/wiki/Stack_trace) which represents a list of physical addresses where the called methods are stored. - `remote` ([UInt8](../../sql-reference/data-types/int-uint.md)) — remote exception (i.e. received during one of the distributed query). **Example** @@ -25,3 +25,12 @@ LIMIT 1 │ CANNOT_OPEN_FILE │ 76 │ 1 │ └──────────────────┴──────┴───────┘ ``` + +``` sql +WITH arrayMap(x -> demangle(addressToSymbol(x)), last_error_trace) AS all +SELECT name, arrayStringConcat(all, '\n') AS res +FROM system.errors +LIMIT 1 +SETTINGS allow_introspection_functions=1\G +``` + diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp index 30714cb82ae..918bc301754 100644 --- a/src/Common/ErrorCodes.cpp +++ b/src/Common/ErrorCodes.cpp @@ -587,7 +587,7 @@ namespace ErrorCodes ErrorCode end() { return END + 1; } - void increment(ErrorCode error_code, bool remote, const std::string & message, const std::string & stacktrace) + void increment(ErrorCode error_code, bool remote, const std::string & message, const FramePointers & trace) { if (error_code >= end()) { @@ -596,10 +596,10 @@ namespace ErrorCodes error_code = end() - 1; } - values[error_code].increment(remote, message, stacktrace); + values[error_code].increment(remote, message, trace); } - void ErrorPairHolder::increment(bool remote, const std::string & message, const std::string & stacktrace) + void ErrorPairHolder::increment(bool remote, const std::string & message, const FramePointers & trace) { const auto now = std::chrono::system_clock::now(); @@ -609,7 +609,7 @@ namespace ErrorCodes ++error.count; error.message = message; - error.stacktrace = stacktrace; + error.trace = trace; error.error_time_ms = std::chrono::duration_cast(now.time_since_epoch()).count(); } ErrorPair ErrorPairHolder::get() diff --git a/src/Common/ErrorCodes.h b/src/Common/ErrorCodes.h index edb9be9e0c0..ffd0b8b8619 100644 --- a/src/Common/ErrorCodes.h +++ b/src/Common/ErrorCodes.h @@ -1,11 +1,12 @@ #pragma once -#include +#include #include #include #include -#include #include +#include +#include /** Allows to count number of simultaneously happening error codes. * See also Exception.cpp for incrementing part. @@ -19,6 +20,7 @@ namespace ErrorCodes /// ErrorCode identifier (index in array). using ErrorCode = int; using Value = size_t; + using FramePointers = std::vector; /// Get name of error_code by identifier. 
/// Returns statically allocated string. @@ -33,7 +35,7 @@ namespace ErrorCodes /// Message for the last error. std::string message; /// Stacktrace for the last error. - std::string stacktrace; + FramePointers trace; }; struct ErrorPair { @@ -46,7 +48,7 @@ namespace ErrorCodes { public: ErrorPair get(); - void increment(bool remote, const std::string & message, const std::string & stacktrace); + void increment(bool remote, const std::string & message, const FramePointers & trace); private: ErrorPair value; @@ -60,7 +62,7 @@ namespace ErrorCodes ErrorCode end(); /// Add value for specified error_code. - void increment(ErrorCode error_code, bool remote, const std::string & message, const std::string & stacktrace); + void increment(ErrorCode error_code, bool remote, const std::string & message, const FramePointers & trace); } } diff --git a/src/Common/Exception.cpp b/src/Common/Exception.cpp index 08afd0397f5..ff638af22ad 100644 --- a/src/Common/Exception.cpp +++ b/src/Common/Exception.cpp @@ -36,7 +36,7 @@ namespace ErrorCodes /// - Aborts the process if error code is LOGICAL_ERROR. /// - Increments error codes statistics. -void handle_error_code([[maybe_unused]] const std::string & msg, const std::string & stacktrace, int code, bool remote) +void handle_error_code([[maybe_unused]] const std::string & msg, int code, bool remote, const Exception::FramePointers & trace) { // In debug builds and builds with sanitizers, treat LOGICAL_ERROR as an assertion failure. // Log the message before we fail. @@ -47,20 +47,21 @@ void handle_error_code([[maybe_unused]] const std::string & msg, const std::stri abort(); } #endif - ErrorCodes::increment(code, remote, msg, stacktrace); + + ErrorCodes::increment(code, remote, msg, trace); } Exception::Exception(const std::string & msg, int code, bool remote_) : Poco::Exception(msg, code) , remote(remote_) { - handle_error_code(msg, getStackTraceString(), code, remote); + handle_error_code(msg, code, remote, getStackFramePointers()); } Exception::Exception(const std::string & msg, const Exception & nested, int code) : Poco::Exception(msg, nested, code) { - handle_error_code(msg, getStackTraceString(), code, remote); + handle_error_code(msg, code, remote, getStackFramePointers()); } Exception::Exception(CreateFromPocoTag, const Poco::Exception & exc) @@ -101,6 +102,31 @@ std::string Exception::getStackTraceString() const #endif } +Exception::FramePointers Exception::getStackFramePointers() const +{ + FramePointers trace; +#ifdef STD_EXCEPTION_HAS_STACK_TRACE + { + trace.resize(get_stack_trace_size()); + for (size_t i = 0; i < trace.size(); ++i) + { + trace[i] = get_stack_trace_frames()[i]; + } + } +#else + { + size_t stack_trace_size = trace.getSize(); + size_t stack_trace_offset = trace.getOffset(); + trace.resize(stack_trace_size - stack_trace_offset); + for (size_t i = stack_trace_offset; i < stack_trace_size; ++i) + { + trace[i] = trace.getFramePointers()[i]; + } + } +#endif + return trace; +} + void throwFromErrno(const std::string & s, int code, int the_errno) { diff --git a/src/Common/Exception.h b/src/Common/Exception.h index e487badafa5..79b4394948a 100644 --- a/src/Common/Exception.h +++ b/src/Common/Exception.h @@ -24,6 +24,8 @@ namespace DB class Exception : public Poco::Exception { public: + using FramePointers = std::vector; + Exception() = default; Exception(const std::string & msg, int code, bool remote_ = false); Exception(const std::string & msg, const Exception & nested, int code); @@ -66,6 +68,8 @@ public: bool isRemoteException() const { 
return remote; } std::string getStackTraceString() const; + /// Used for system.errors + FramePointers getStackFramePointers() const; private: #ifndef STD_EXCEPTION_HAS_STACK_TRACE diff --git a/src/Storages/System/StorageSystemErrors.cpp b/src/Storages/System/StorageSystemErrors.cpp index 5243cb11aa3..09d0aaddb3d 100644 --- a/src/Storages/System/StorageSystemErrors.cpp +++ b/src/Storages/System/StorageSystemErrors.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -16,7 +17,7 @@ NamesAndTypesList StorageSystemErrors::getNamesAndTypes() { "value", std::make_shared() }, { "last_error_time", std::make_shared() }, { "last_error_message", std::make_shared() }, - { "last_error_stacktrace", std::make_shared() }, + { "last_error_trace", std::make_shared(std::make_shared()) }, { "remote", std::make_shared() }, }; } @@ -34,7 +35,14 @@ void StorageSystemErrors::fillData(MutableColumns & res_columns, const Context & res_columns[col_num++]->insert(error.count); res_columns[col_num++]->insert(error.error_time_ms / 1000); res_columns[col_num++]->insert(error.message); - res_columns[col_num++]->insert(error.stacktrace); + { + Array trace_array; + trace_array.reserve(error.trace.size()); + for (size_t i = 0; i < error.trace.size(); ++i) + trace_array.emplace_back(reinterpret_cast(error.trace[i])); + + res_columns[col_num++]->insert(trace_array); + } res_columns[col_num++]->insert(remote); } }; From f164c2462ff3aae8e51fc85abff09037401ba474 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Wed, 24 Mar 2021 09:41:57 +0300 Subject: [PATCH 091/155] Update Exception.cpp Fix build --- src/Common/Exception.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/Common/Exception.cpp b/src/Common/Exception.cpp index ff638af22ad..e8a98021588 100644 --- a/src/Common/Exception.cpp +++ b/src/Common/Exception.cpp @@ -104,27 +104,27 @@ std::string Exception::getStackTraceString() const Exception::FramePointers Exception::getStackFramePointers() const { - FramePointers trace; + FramePointers frame_pointers; #ifdef STD_EXCEPTION_HAS_STACK_TRACE { - trace.resize(get_stack_trace_size()); - for (size_t i = 0; i < trace.size(); ++i) + frame_pointers.resize(get_stack_trace_size()); + for (size_t i = 0; i < frame_pointers.size(); ++i) { - trace[i] = get_stack_trace_frames()[i]; + frame_pointers[i] = get_stack_trace_frames()[i]; } } #else { size_t stack_trace_size = trace.getSize(); size_t stack_trace_offset = trace.getOffset(); - trace.resize(stack_trace_size - stack_trace_offset); + frame_pointers.reserve(stack_trace_size - stack_trace_offset); for (size_t i = stack_trace_offset; i < stack_trace_size; ++i) { - trace[i] = trace.getFramePointers()[i]; + frame_pointers.push_back(trace.getFramePointers()[i]); } } #endif - return trace; + return frame_pointers; } From 9d8b21a04dbca5d27fdb504a9ad667de5b540acb Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 24 Mar 2021 11:12:37 +0300 Subject: [PATCH 092/155] Fix ephemeral node removal --- src/Coordination/NuKeeperStorage.cpp | 14 ++++++---- src/Coordination/tests/gtest_for_build.cpp | 31 ++++++++++++++++++++++ 2 files changed, 40 insertions(+), 5 deletions(-) diff --git a/src/Coordination/NuKeeperStorage.cpp b/src/Coordination/NuKeeperStorage.cpp index 2440d6f6613..c1a8ebdfb44 100644 --- a/src/Coordination/NuKeeperStorage.cpp +++ b/src/Coordination/NuKeeperStorage.cpp @@ -233,7 +233,7 @@ struct NuKeeperStorageGetRequest final : public NuKeeperStorageRequest struct NuKeeperStorageRemoveRequest final : public 
NuKeeperStorageRequest { using NuKeeperStorageRequest::NuKeeperStorageRequest; - std::pair process(NuKeeperStorage::Container & container, NuKeeperStorage::Ephemerals & ephemerals, int64_t /*zxid*/, int64_t session_id) const override + std::pair process(NuKeeperStorage::Container & container, NuKeeperStorage::Ephemerals & ephemerals, int64_t /*zxid*/, int64_t /*session_id*/) const override { Coordination::ZooKeeperResponsePtr response_ptr = zk_request->makeResponse(); Coordination::ZooKeeperRemoveResponse & response = dynamic_cast(*response_ptr); @@ -257,7 +257,12 @@ struct NuKeeperStorageRemoveRequest final : public NuKeeperStorageRequest { auto prev_node = it->value; if (prev_node.stat.ephemeralOwner != 0) - ephemerals[session_id].erase(request.path); + { + auto ephemerals_it = ephemerals.find(prev_node.stat.ephemeralOwner); + ephemerals_it->second.erase(request.path); + if (ephemerals_it->second.empty()) + ephemerals.erase(ephemerals_it); + } auto child_basename = getBaseName(it->key); container.updateValue(parentPath(request.path), [&child_basename] (NuKeeperStorage::Node & parent) @@ -271,10 +276,10 @@ struct NuKeeperStorageRemoveRequest final : public NuKeeperStorageRequest container.erase(request.path); - undo = [prev_node, &container, &ephemerals, session_id, path = request.path, child_basename] + undo = [prev_node, &container, &ephemerals, path = request.path, child_basename] { if (prev_node.stat.ephemeralOwner != 0) - ephemerals[session_id].emplace(path); + ephemerals[prev_node.stat.ephemeralOwner].emplace(path); container.insert(path, prev_node); container.updateValue(parentPath(path), [&child_basename] (NuKeeperStorage::Node & parent) @@ -377,7 +382,6 @@ struct NuKeeperStorageSetRequest final : public NuKeeperStorageRequest { return processWatchesImpl(zk_request->getPath(), watches, list_watches, Coordination::Event::CHANGED); } - }; struct NuKeeperStorageListRequest final : public NuKeeperStorageRequest diff --git a/src/Coordination/tests/gtest_for_build.cpp b/src/Coordination/tests/gtest_for_build.cpp index d90b711498e..cc3dcc04e53 100644 --- a/src/Coordination/tests/gtest_for_build.cpp +++ b/src/Coordination/tests/gtest_for_build.cpp @@ -1232,6 +1232,37 @@ TEST(CoordinationTest, TestStateMachineAndLogStore) } } +TEST(CoordinationTest, TestEphemeralNodeRemove) +{ + using namespace Coordination; + using namespace DB; + + ChangelogDirTest snapshots("./snapshots"); + CoordinationSettingsPtr settings = std::make_shared(); + + ResponsesQueue queue; + SnapshotsQueue snapshots_queue{1}; + auto state_machine = std::make_shared(queue, snapshots_queue, "./snapshots", settings); + state_machine->init(); + + std::shared_ptr request_c = std::make_shared(); + request_c->path = "/hello"; + request_c->is_ephemeral = true; + auto entry_c = getLogEntryFromZKRequest(0, 1, request_c); + state_machine->commit(1, entry_c->get_buf()); + const auto & storage = state_machine->getStorage(); + + EXPECT_EQ(storage.ephemerals.size(), 1); + std::shared_ptr request_d = std::make_shared(); + request_d->path = "/hello"; + /// Delete from other session + auto entry_d = getLogEntryFromZKRequest(0, 2, request_d); + state_machine->commit(2, entry_d->get_buf()); + + EXPECT_EQ(storage.ephemerals.size(), 0); +} + + int main(int argc, char ** argv) { Poco::AutoPtr channel(new Poco::ConsoleChannel(std::cerr)); From 487d6bdcd91d8c1267650d2e8bc35b6077a1b071 Mon Sep 17 00:00:00 2001 From: fuqi Date: Wed, 24 Mar 2021 16:35:20 +0800 Subject: [PATCH 093/155] add test case fix order key check --- 
.../MergeTree/MergeTreeWhereOptimizer.cpp | 9 ++--- .../MergeTree/MergeTreeWhereOptimizer.h | 4 +-- ...der_key_to_prewhere_select_final.reference | 35 +++++++++++++++++++ ...37_move_order_to_prewhere_select_final.sql | 15 ++++++++ 4 files changed, 57 insertions(+), 6 deletions(-) create mode 100644 tests/queries/0_stateless/01737_move_order_key_to_prewhere_select_final.reference create mode 100644 tests/queries/0_stateless/01737_move_order_to_prewhere_select_final.sql diff --git a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp index 98e40bf394d..692d2ac4b94 100644 --- a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp +++ b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp @@ -37,7 +37,8 @@ MergeTreeWhereOptimizer::MergeTreeWhereOptimizer( : table_columns{ext::map( metadata_snapshot->getColumns().getAllPhysical(), [](const NameAndTypePair & col) { return col.name; })} , queried_columns{queried_columns_} - , primary_key_columns{metadata_snapshot->getPrimaryKey().column_names} + , sorting_key_names{NameSet( + metadata_snapshot->getSortingKey().column_names.begin(), metadata_snapshot->getSortingKey().column_names.end())} , block_with_constants{KeyCondition::getBlockWithConstants(query_info.query, query_info.syntax_analyzer_result, context)} , log{log_} , column_sizes{std::move(column_sizes_)} @@ -301,9 +302,9 @@ bool MergeTreeWhereOptimizer::isPrimaryKeyAtom(const ASTPtr & ast) const } -bool MergeTreeWhereOptimizer::isPrimaryKey(const String & column_name) const +bool MergeTreeWhereOptimizer::isSortingKey(const String & column_name) const { - return std::find(primary_key_columns.begin(), primary_key_columns.end(), column_name) != primary_key_columns.end(); + return sorting_key_names.count(column_name); } @@ -344,7 +345,7 @@ bool MergeTreeWhereOptimizer::cannotBeMoved(const ASTPtr & ptr, bool is_final) c /// disallow moving result of ARRAY JOIN to PREWHERE if (array_joined_names.count(*opt_name) || array_joined_names.count(Nested::extractTableName(*opt_name)) || - (is_final && !isPrimaryKey(*opt_name))) + (is_final && !isSortingKey(*opt_name))) return true; } diff --git a/src/Storages/MergeTree/MergeTreeWhereOptimizer.h b/src/Storages/MergeTree/MergeTreeWhereOptimizer.h index 85d1df583fa..8fd973e9ba3 100644 --- a/src/Storages/MergeTree/MergeTreeWhereOptimizer.h +++ b/src/Storages/MergeTree/MergeTreeWhereOptimizer.h @@ -85,7 +85,7 @@ private: bool isPrimaryKeyAtom(const ASTPtr & ast) const; - bool isPrimaryKey(const String & column_name) const; + bool isSortingKey(const String & column_name) const; bool isConstant(const ASTPtr & expr) const; @@ -106,7 +106,7 @@ private: String first_primary_key_column; const StringSet table_columns; const Names queried_columns; - const Names primary_key_columns; + const NameSet sorting_key_names; const Block block_with_constants; Poco::Logger * log; std::unordered_map column_sizes; diff --git a/tests/queries/0_stateless/01737_move_order_key_to_prewhere_select_final.reference b/tests/queries/0_stateless/01737_move_order_key_to_prewhere_select_final.reference new file mode 100644 index 00000000000..bde1e20ab10 --- /dev/null +++ b/tests/queries/0_stateless/01737_move_order_key_to_prewhere_select_final.reference @@ -0,0 +1,35 @@ +SELECT + x, + y, + z +FROM prewhere_move_select_final +PREWHERE y > 100 +SELECT + x, + y, + z +FROM prewhere_move_select_final +FINAL +PREWHERE y > 100 +SELECT + x, + y, + z +FROM prewhere_move_select_final +FINAL +WHERE z > 400 +SELECT + x, + y, + z +FROM 
prewhere_move_select_final +FINAL +WHERE z > 400 +SELECT + x, + y, + z +FROM prewhere_move_select_final +FINAL +PREWHERE y > 100 +WHERE (y > 100) AND (z > 400) \ No newline at end of file diff --git a/tests/queries/0_stateless/01737_move_order_to_prewhere_select_final.sql b/tests/queries/0_stateless/01737_move_order_to_prewhere_select_final.sql new file mode 100644 index 00000000000..a3a882c461a --- /dev/null +++ b/tests/queries/0_stateless/01737_move_order_to_prewhere_select_final.sql @@ -0,0 +1,15 @@ +DROP TABLE IF EXISTS prewhere_move_select_final; +CREATE TABLE prewhere_move_select_final (x Int, y Int, z Int) ENGINE = ReplacingMergeTree() ORDER BY (x, y); +INSERT INTO prewhere_move_select_final SELECT number, number * 2, number * 3 FROM numbers(1000); + +-- order key can be pushed down with final +EXPLAIN SYNTAX SELECT * FROM prewhere_move_select_final WHERE y > 100; +EXPLAIN SYNTAX SELECT * FROM prewhere_move_select_final FINAL WHERE y > 100; + +-- can not be pushed down +EXPLAIN SYNTAX SELECT * FROM prewhere_move_select_final FINAL WHERE z > 400; + +-- only y can be pushed down +EXPLAIN SYNTAX SELECT * FROM prewhere_move_select_final FINAL WHERE y > 100 and z > 400; + +DROP TABLE prewhere_move_select_final; From fb3af77098bf646e456fce5d1639dfc194623bb3 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Tue, 23 Mar 2021 21:01:40 +0300 Subject: [PATCH 094/155] Add test. --- .../0_stateless/00966_invalid_json_must_not_parse.reference | 4 ++++ .../queries/0_stateless/00966_invalid_json_must_not_parse.sql | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/tests/queries/0_stateless/00966_invalid_json_must_not_parse.reference b/tests/queries/0_stateless/00966_invalid_json_must_not_parse.reference index f7eb44d66e0..4521d575ff3 100644 --- a/tests/queries/0_stateless/00966_invalid_json_must_not_parse.reference +++ b/tests/queries/0_stateless/00966_invalid_json_must_not_parse.reference @@ -4,3 +4,7 @@ 0 0 0 +0 +0 +0 +0 diff --git a/tests/queries/0_stateless/00966_invalid_json_must_not_parse.sql b/tests/queries/0_stateless/00966_invalid_json_must_not_parse.sql index afcbc78cfd5..0e7fa55dbae 100644 --- a/tests/queries/0_stateless/00966_invalid_json_must_not_parse.sql +++ b/tests/queries/0_stateless/00966_invalid_json_must_not_parse.sql @@ -3,6 +3,8 @@ SET allow_simdjson=1; SELECT JSONLength('"HX-='); SELECT JSONLength('[9]\0\x42\xD3\x36\xE3'); SELECT JSONLength(unhex('5B30000E06D7AA5D')); +SELECT JSONLength('{"success"test:"123"}'); +SELECT isValidJSON('{"success"test:"123"}'); SET allow_simdjson=0; @@ -10,3 +12,5 @@ SET allow_simdjson=0; SELECT JSONLength('"HX-='); SELECT JSONLength('[9]\0\x42\xD3\x36\xE3'); SELECT JSONLength(unhex('5B30000E06D7AA5D')); +SELECT JSONLength('{"success"test:"123"}'); +SELECT isValidJSON('{"success"test:"123"}'); From d76edc33d57a6d86d919aa36824a0a7a034cc919 Mon Sep 17 00:00:00 2001 From: fuqi Date: Wed, 24 Mar 2021 18:34:20 +0800 Subject: [PATCH 095/155] rename test case name --- ...inal.sql => 01737_move_order_key_to_prewhere_select_final.sql} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename tests/queries/0_stateless/{01737_move_order_to_prewhere_select_final.sql => 01737_move_order_key_to_prewhere_select_final.sql} (100%) diff --git a/tests/queries/0_stateless/01737_move_order_to_prewhere_select_final.sql b/tests/queries/0_stateless/01737_move_order_key_to_prewhere_select_final.sql similarity index 100% rename from tests/queries/0_stateless/01737_move_order_to_prewhere_select_final.sql rename to 
tests/queries/0_stateless/01737_move_order_key_to_prewhere_select_final.sql From 57c6ebc844e53f18a634c0c647e3969b20458edd Mon Sep 17 00:00:00 2001 From: fuqi Date: Wed, 24 Mar 2021 19:37:47 +0800 Subject: [PATCH 096/155] fix test case --- ...737_move_order_key_to_prewhere_select_final.reference | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/tests/queries/0_stateless/01737_move_order_key_to_prewhere_select_final.reference b/tests/queries/0_stateless/01737_move_order_key_to_prewhere_select_final.reference index bde1e20ab10..95479cf37ba 100644 --- a/tests/queries/0_stateless/01737_move_order_key_to_prewhere_select_final.reference +++ b/tests/queries/0_stateless/01737_move_order_key_to_prewhere_select_final.reference @@ -24,12 +24,5 @@ SELECT z FROM prewhere_move_select_final FINAL -WHERE z > 400 -SELECT - x, - y, - z -FROM prewhere_move_select_final -FINAL PREWHERE y > 100 -WHERE (y > 100) AND (z > 400) \ No newline at end of file +WHERE (y > 100) AND (z > 400) From 02eee100a0554c69e85efb5d7cdd88253e7cc9f2 Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov Date: Wed, 24 Mar 2021 15:36:39 +0300 Subject: [PATCH 097/155] formatting fixes --- src/Parsers/ASTSelectQuery.cpp | 2 +- src/Parsers/ASTWindowDefinition.cpp | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/Parsers/ASTSelectQuery.cpp b/src/Parsers/ASTSelectQuery.cpp index 89ef08e0289..4715c7f201b 100644 --- a/src/Parsers/ASTSelectQuery.cpp +++ b/src/Parsers/ASTSelectQuery.cpp @@ -138,7 +138,7 @@ void ASTSelectQuery::formatImpl(const FormatSettings & s, FormatState & state, F { s.ostr << (s.hilite ? hilite_keyword : "") << s.nl_or_ws << indent_str << "WINDOW" << (s.hilite ? hilite_none : ""); - window()->formatImpl(s, state, frame); + window()->as().formatImplMultiline(s, state, frame); } if (orderBy()) diff --git a/src/Parsers/ASTWindowDefinition.cpp b/src/Parsers/ASTWindowDefinition.cpp index a645960bd0a..35374df6177 100644 --- a/src/Parsers/ASTWindowDefinition.cpp +++ b/src/Parsers/ASTWindowDefinition.cpp @@ -35,9 +35,11 @@ String ASTWindowDefinition::getID(char) const void ASTWindowDefinition::formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked format_frame) const { + format_frame.expression_list_prepend_whitespace = false; + if (partition_by) { - settings.ostr << "PARTITION BY"; + settings.ostr << "PARTITION BY "; partition_by->formatImpl(settings, state, format_frame); } @@ -48,7 +50,7 @@ void ASTWindowDefinition::formatImpl(const FormatSettings & settings, if (order_by) { - settings.ostr << "ORDER BY"; + settings.ostr << "ORDER BY "; order_by->formatImpl(settings, state, format_frame); } From 725c4f254473c71d3863cd47652d0c936f875d69 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Wed, 24 Mar 2021 22:20:50 +0300 Subject: [PATCH 098/155] Update 01774_tuple_null_in.sql --- tests/queries/0_stateless/01774_tuple_null_in.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/01774_tuple_null_in.sql b/tests/queries/0_stateless/01774_tuple_null_in.sql index c9dad49e8ed..a9cc39e8840 100644 --- a/tests/queries/0_stateless/01774_tuple_null_in.sql +++ b/tests/queries/0_stateless/01774_tuple_null_in.sql @@ -1,2 +1,2 @@ SELECT (NULL, NULL) = (8, 0) OR (NULL, NULL) = (3, 2) OR (NULL, NULL) = (0, 0) OR (NULL, NULL) = (3, 1); -SELECT (NULL, NULL) IN ((NULL, 0), (3, 1), (3, 2), (8, 0)); +SELECT (NULL, NULL) IN ((NULL, 0), (3, 1), (3, 2), (8, 0), (NULL, NULL)); From 6341b083fb2c6d4550d36b0217fa223f09b2ece1 Mon Sep 17 00:00:00 
2001 From: Alexey Milovidov Date: Wed, 24 Mar 2021 22:35:22 +0300 Subject: [PATCH 099/155] Add test to skip list --- docker/test/fasttest/run.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/test/fasttest/run.sh b/docker/test/fasttest/run.sh index 649f9f812e1..bbd5443ffb6 100755 --- a/docker/test/fasttest/run.sh +++ b/docker/test/fasttest/run.sh @@ -292,6 +292,7 @@ function run_tests 01318_decrypt # Depends on OpenSSL 01663_aes_msan # Depends on OpenSSL 01667_aes_args_check # Depends on OpenSSL + 01776_decrypt_aead_size_check # Depends on OpenSSL 01281_unsucceeded_insert_select_queries_counter 01292_create_user 01294_lazy_database_concurrent From b610afe7715f387260803ab41c57173ed20545f0 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 24 Mar 2021 22:40:45 +0300 Subject: [PATCH 100/155] Another fix --- base/common/DateLUTImpl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/base/common/DateLUTImpl.h b/base/common/DateLUTImpl.h index 1a44c670650..8d393465b82 100644 --- a/base/common/DateLUTImpl.h +++ b/base/common/DateLUTImpl.h @@ -1069,11 +1069,11 @@ public: } template - inline LUTIndex addMonthsIndex(DateOrTime v, Int64 delta) const + inline LUTIndex NO_SANITIZE_UNDEFINED addMonthsIndex(DateOrTime v, Int64 delta) const { const Values & values = lut[toLUTIndex(v)]; - Int64 month = values.month + static_cast(delta); /// Cast is to avoid UB in signed integer overflow. + Int64 month = values.month + delta; if (month > 0) { From c325ed65e24d390e6d86d3a47d5134128be507c0 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 24 Mar 2021 23:10:15 +0300 Subject: [PATCH 101/155] Fix UBSan report in mapPopulateSeries --- src/Functions/array/mapPopulateSeries.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Functions/array/mapPopulateSeries.cpp b/src/Functions/array/mapPopulateSeries.cpp index 2050e0c28ab..c025117af69 100644 --- a/src/Functions/array/mapPopulateSeries.cpp +++ b/src/Functions/array/mapPopulateSeries.cpp @@ -190,7 +190,7 @@ private: } static constexpr size_t MAX_ARRAY_SIZE = 1ULL << 30; - if (static_cast(max_key - min_key) > MAX_ARRAY_SIZE) + if (static_cast(max_key) - static_cast(min_key) > MAX_ARRAY_SIZE) throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Too large array size in the result of function {}", getName()); /* fill the result arrays */ From 2d8e82f3d9f43aca0217a7d33e0902e330ac5695 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 24 Mar 2021 23:12:58 +0300 Subject: [PATCH 102/155] Add a test #22094 --- .../0_stateless/01777_map_populate_series_ubsan.reference | 0 tests/queries/0_stateless/01777_map_populate_series_ubsan.sql | 2 ++ 2 files changed, 2 insertions(+) create mode 100644 tests/queries/0_stateless/01777_map_populate_series_ubsan.reference create mode 100644 tests/queries/0_stateless/01777_map_populate_series_ubsan.sql diff --git a/tests/queries/0_stateless/01777_map_populate_series_ubsan.reference b/tests/queries/0_stateless/01777_map_populate_series_ubsan.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/01777_map_populate_series_ubsan.sql b/tests/queries/0_stateless/01777_map_populate_series_ubsan.sql new file mode 100644 index 00000000000..5a8c182425a --- /dev/null +++ b/tests/queries/0_stateless/01777_map_populate_series_ubsan.sql @@ -0,0 +1,2 @@ +-- Should correctly throw exception about overflow: +SELECT mapPopulateSeries([-9223372036854775808, toUInt32(2)], [toUInt32(1023), -1]); -- { serverError 128 } From 
d1f72f81f5be0ea6460b24d28cbba881a6d9de0a Mon Sep 17 00:00:00 2001
From: Dmitriy
Date: Thu, 25 Mar 2021 00:21:08 +0300
Subject: [PATCH 103/155] Translate to Russian

MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Translated into Russian.

---
 docs/en/interfaces/formats.md |  2 +-
 docs/ru/interfaces/formats.md | 56 ++++++++++++++++++++---------------
 2 files changed, 33 insertions(+), 25 deletions(-)

diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md
index 0d582fab12b..5987ba0f676 100644
--- a/docs/en/interfaces/formats.md
+++ b/docs/en/interfaces/formats.md
@@ -1308,7 +1308,7 @@ The table below shows supported data types and how they match ClickHouse [data t
 | `DECIMAL` | [Decimal](../sql-reference/data-types/decimal.md) | `DECIMAL` |
 | `-` | [Array](../sql-reference/data-types/array.md) | `LIST` |
 
-ClickHouse supports configurable precision of the `Decimal` type. The `INSERT` or `SELECT` query treats the ORC `DECIMAL` type as the ClickHouse `Decimal128` type.
+ClickHouse supports configurable precision of the `Decimal` type. The `INSERT` query treats the ORC `DECIMAL` type as the ClickHouse `Decimal128` type.
 
 Unsupported ORC data types: `TIME32`, `FIXED_SIZE_BINARY`, `JSON`, `UUID`, `ENUM`.
 
diff --git a/docs/ru/interfaces/formats.md b/docs/ru/interfaces/formats.md
index 67cc80f5cd8..8ec26ec66f5 100644
--- a/docs/ru/interfaces/formats.md
+++ b/docs/ru/interfaces/formats.md
@@ -49,7 +49,7 @@ ClickHouse может принимать (`INSERT`) и отдавать (`SELECT
 | [Parquet](#data-format-parquet) | ✔ | ✔ |
 | [Arrow](#data-format-arrow) | ✔ | ✔ |
 | [ArrowStream](#data-format-arrow-stream) | ✔ | ✔ |
-| [ORC](#data-format-orc) | ✔ | ✗ |
+| [ORC](#data-format-orc) | ✔ | ✔ |
 | [RowBinary](#rowbinary) | ✔ | ✔ |
 | [RowBinaryWithNamesAndTypes](#rowbinarywithnamesandtypes) | ✔ | ✔ |
 | [Native](#native) | ✔ | ✔ |
@@ -1203,45 +1203,53 @@ $ clickhouse-client --query="SELECT * FROM {some_table} FORMAT Parquet" > {some_
 
 ## ORC {#data-format-orc}
 
-[Apache ORC](https://orc.apache.org/) - это column-oriented формат данных, распространённый в экосистеме Hadoop. Вы можете только вставлять данные этого формата в ClickHouse.
+[Apache ORC](https://orc.apache.org/) — это столбцовый формат данных, распространенный в экосистеме [Hadoop](https://hadoop.apache.org/).
 
 ### Соответствие типов данных {#sootvetstvie-tipov-dannykh-1}
 
-Таблица показывает поддержанные типы данных и их соответствие [типам данных](../sql-reference/data-types/index.md) ClickHouse для запросов `INSERT`.
+Таблица ниже содержит поддерживаемые типы данных и их соответствие [типам данных](../sql-reference/data-types/index.md) ClickHouse для запросов `INSERT` и `SELECT`.
-| Тип данных ORC (`INSERT`) | Тип данных ClickHouse | -|---------------------------|-----------------------------------------------------| -| `UINT8`, `BOOL` | [UInt8](../sql-reference/data-types/int-uint.md) | -| `INT8` | [Int8](../sql-reference/data-types/int-uint.md) | -| `UINT16` | [UInt16](../sql-reference/data-types/int-uint.md) | -| `INT16` | [Int16](../sql-reference/data-types/int-uint.md) | -| `UINT32` | [UInt32](../sql-reference/data-types/int-uint.md) | -| `INT32` | [Int32](../sql-reference/data-types/int-uint.md) | -| `UINT64` | [UInt64](../sql-reference/data-types/int-uint.md) | -| `INT64` | [Int64](../sql-reference/data-types/int-uint.md) | -| `FLOAT`, `HALF_FLOAT` | [Float32](../sql-reference/data-types/float.md) | -| `DOUBLE` | [Float64](../sql-reference/data-types/float.md) | -| `DATE32` | [Date](../sql-reference/data-types/date.md) | -| `DATE64`, `TIMESTAMP` | [DateTime](../sql-reference/data-types/datetime.md) | -| `STRING`, `BINARY` | [String](../sql-reference/data-types/string.md) | -| `DECIMAL` | [Decimal](../sql-reference/data-types/decimal.md) | +| Тип данных ORC (`INSERT`) | Тип данных ClickHouse | Тип данных ORC (`SELECT`) | +|---------------------------|-----------------------------------------------------|---------------------------| +| `UINT8`, `BOOL` | [UInt8](../sql-reference/data-types/int-uint.md) | `UINT8` | +| `INT8` | [Int8](../sql-reference/data-types/int-uint.md) | `INT8` | +| `UINT16` | [UInt16](../sql-reference/data-types/int-uint.md) | `UINT16` | +| `INT16` | [Int16](../sql-reference/data-types/int-uint.md) | `INT16` | +| `UINT32` | [UInt32](../sql-reference/data-types/int-uint.md) | `UINT32` | +| `INT32` | [Int32](../sql-reference/data-types/int-uint.md) | `INT32` | +| `UINT64` | [UInt64](../sql-reference/data-types/int-uint.md) | `UINT64` | +| `INT64` | [Int64](../sql-reference/data-types/int-uint.md) | `INT64` | +| `FLOAT`, `HALF_FLOAT` | [Float32](../sql-reference/data-types/float.md) | `FLOAT` | +| `DOUBLE` | [Float64](../sql-reference/data-types/float.md) | `DOUBLE` | +| `DATE32` | [Date](../sql-reference/data-types/date.md) | `DATE32` | +| `DATE64`, `TIMESTAMP` | [DateTime](../sql-reference/data-types/datetime.md) | `TIMESTAMP` | +| `STRING`, `BINARY` | [String](../sql-reference/data-types/string.md) | `BINARY` | +| `DECIMAL` | [Decimal](../sql-reference/data-types/decimal.md) | `DECIMAL` | +| `-` | [Array](../sql-reference/data-types/array.md) | `LIST` | -ClickHouse поддерживает настраиваемую точность для формата `Decimal`. При обработке запроса `INSERT`, ClickHouse обрабатывает тип данных Parquet `DECIMAL` как `Decimal128`. +ClickHouse поддерживает настраиваемую точность для формата `Decimal`. При обработке запроса `INSERT`, ClickHouse обрабатывает тип данных ORC `DECIMAL` как `Decimal128`. -Неподдержанные типы данных ORC: `DATE32`, `TIME32`, `FIXED_SIZE_BINARY`, `JSON`, `UUID`, `ENUM`. +Неподдерживаемые типы данных ORC: `TIME32`, `FIXED_SIZE_BINARY`, `JSON`, `UUID`, `ENUM`. -Типы данных столбцов в таблицах ClickHouse могут отличаться от типов данных для соответствующих полей ORC. При вставке данных, ClickHouse интерпретирует типы данных ORC согласно таблице соответствия, а затем [приводит](../sql-reference/functions/type-conversion-functions/#type_conversion_function-cast) данные к типу, установленному для столбца таблицы ClickHouse. +Типы данных столбцов в таблицах ClickHouse могут отличаться от типов данных для соответствующих полей ORC. 
При вставке данных ClickHouse интерпретирует типы данных ORC согласно таблице соответствия, а затем [приводит](../sql-reference/functions/type-conversion-functions/#type_conversion_function-cast) данные к типу, установленному для столбца таблицы ClickHouse. ### Вставка данных {#vstavka-dannykh-1} -Данные ORC можно вставить в таблицу ClickHouse командой: +Чтобы вставить в ClickHouse данные из файла в формате ORC, вы можете использовать команду следующего вида: ``` bash $ cat filename.orc | clickhouse-client --query="INSERT INTO some_table FORMAT ORC" ``` -Для обмена данных с Hadoop можно использовать [движок таблиц HDFS](../engines/table-engines/integrations/hdfs.md). +### Выборка данных {#vyborka-dannykh-1} +Чтобы получить данные из таблицы ClickHouse и сохранить их в файл формата ORC, вы можете использовать команду следующего вида: + +``` bash +$ clickhouse-client --query="SELECT * FROM {some_table} FORMAT ORC" > {filename.orc} +``` + +Для обмена данных с экосистемой Hadoop вы можете использовать [движок таблиц HDFS](../engines/table-engines/integrations/hdfs.md). ## LineAsString {#lineasstring} From 8121c52c53b969bd023dc5687ce680efd4a06f82 Mon Sep 17 00:00:00 2001 From: filimonov <1549571+filimonov@users.noreply.github.com> Date: Wed, 24 Mar 2021 22:24:07 +0100 Subject: [PATCH 104/155] Update entrypoint.sh fix for #22100 --- docker/server/entrypoint.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/docker/server/entrypoint.sh b/docker/server/entrypoint.sh index 0138a165505..1e665e0019c 100755 --- a/docker/server/entrypoint.sh +++ b/docker/server/entrypoint.sh @@ -46,9 +46,11 @@ DATA_DIR="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" -- TMP_DIR="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key=tmp_path || true)" USER_PATH="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key=user_files_path || true)" LOG_PATH="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key=logger.log || true)" -LOG_DIR="$(dirname "$LOG_PATH" || true)" +LOG_DIR="" +if [ -n "$LOG_PATH" ]; then LOG_DIR="$(dirname "$LOG_PATH")"; fi ERROR_LOG_PATH="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key=logger.errorlog || true)" -ERROR_LOG_DIR="$(dirname "$ERROR_LOG_PATH" || true)" +ERROR_LOG_DIR="" +if [ -n "$ERROR_LOG_PATH" ]; then LOG_DIR="$(dirname "$ERROR_LOG_PATH")"; fi FORMAT_SCHEMA_PATH="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key=format_schema_path || true)" CLICKHOUSE_USER="${CLICKHOUSE_USER:-default}" From 4b6b1311ce630513139a53a05c4a7be1036b686b Mon Sep 17 00:00:00 2001 From: filimonov <1549571+filimonov@users.noreply.github.com> Date: Wed, 24 Mar 2021 22:33:08 +0100 Subject: [PATCH 105/155] Update entrypoint.sh --- docker/server/entrypoint.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/server/entrypoint.sh b/docker/server/entrypoint.sh index 1e665e0019c..81e04bd7874 100755 --- a/docker/server/entrypoint.sh +++ b/docker/server/entrypoint.sh @@ -50,7 +50,7 @@ LOG_DIR="" if [ -n "$LOG_PATH" ]; then LOG_DIR="$(dirname "$LOG_PATH")"; fi ERROR_LOG_PATH="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" --key=logger.errorlog || true)" ERROR_LOG_DIR="" -if [ -n "$ERROR_LOG_PATH" ]; then LOG_DIR="$(dirname "$ERROR_LOG_PATH")"; fi +if [ -n "$ERROR_LOG_PATH" ]; then ERROR_LOG_DIR="$(dirname "$ERROR_LOG_PATH")"; fi FORMAT_SCHEMA_PATH="$(clickhouse extract-from-config --config-file "$CLICKHOUSE_CONFIG" 
--key=format_schema_path || true)" CLICKHOUSE_USER="${CLICKHOUSE_USER:-default}" From 1bdf12b3f1f5aafbd3bf4f0113105b93b3373e47 Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Thu, 25 Mar 2021 10:44:10 +0800 Subject: [PATCH 106/155] bump replxx --- contrib/replxx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/replxx b/contrib/replxx index cdb6e3f2ce4..2b24f14594d 160000 --- a/contrib/replxx +++ b/contrib/replxx @@ -1 +1 @@ -Subproject commit cdb6e3f2ce4464225daf9c8beeae7db98d590bdc +Subproject commit 2b24f14594d7606792b92544bb112a6322ba34d7 From d36d3f036dfda47bf0212e7810f799377b14aaff Mon Sep 17 00:00:00 2001 From: alesapin Date: Thu, 25 Mar 2021 13:04:16 +0300 Subject: [PATCH 107/155] Fix several races in NuRaft --- contrib/NuRaft | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/NuRaft b/contrib/NuRaft index 3d3683e7775..70468326ad5 160000 --- a/contrib/NuRaft +++ b/contrib/NuRaft @@ -1 +1 @@ -Subproject commit 3d3683e77753cfe015a05fae95ddf418e19f59e1 +Subproject commit 70468326ad5d72e9497944838484c591dae054ea From 640ba7928880d8bc42b4efc494e2dbd21203fbf5 Mon Sep 17 00:00:00 2001 From: alesapin Date: Thu, 25 Mar 2021 13:23:25 +0300 Subject: [PATCH 108/155] Remove data corruption from lightweight run --- tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj index 45a1f442d24..7380a9d9cbb 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -113,8 +113,10 @@ (def useful-nemesises ["random-node-killer" "simple-partitioner" - "logs-and-snapshots-corruptor" - "drop-data-corruptor" + "all-nodes-hammer-time" + ; can lead to a very rare data loss https://github.com/eBay/NuRaft/issues/185 + ;"logs-and-snapshots-corruptor" + ;"drop-data-corruptor" "bridge-partitioner" "blind-node-partitioner" "blind-others-partitioner"]) From b7622868fc03a76769180f95459137c3ca1c091b Mon Sep 17 00:00:00 2001 From: feng lv Date: Thu, 25 Mar 2021 11:10:41 +0000 Subject: [PATCH 109/155] remove useless code --- src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp index 747819c77eb..96a3dba12f7 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp @@ -551,11 +551,6 @@ QueryPlanPtr MergeTreeDataSelectExecutor::readFromParts( .checksum_on_read = settings.checksum_on_read, }; - /// PREWHERE - String prewhere_column; - if (select.prewhere()) - prewhere_column = select.prewhere()->getColumnName(); - struct DataSkippingIndexAndCondition { MergeTreeIndexPtr index; From 3d92cb46f7478bc38b4c1f3b6d192cb4cffd824c Mon Sep 17 00:00:00 2001 From: alesapin Date: Thu, 25 Mar 2021 14:22:19 +0300 Subject: [PATCH 110/155] Trying to fix my favorite test --- .../00992_system_parts_race_condition_zookeeper_long.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/queries/0_stateless/00992_system_parts_race_condition_zookeeper_long.sh b/tests/queries/0_stateless/00992_system_parts_race_condition_zookeeper_long.sh index 1e61c8d64f3..fe6246e02f6 100755 --- a/tests/queries/0_stateless/00992_system_parts_race_condition_zookeeper_long.sh +++ 
b/tests/queries/0_stateless/00992_system_parts_race_condition_zookeeper_long.sh @@ -74,7 +74,7 @@ timeout $TIMEOUT bash -c thread5 2> /dev/null & wait -$CLICKHOUSE_CLIENT -n -q " - DROP TABLE alter_table; - DROP TABLE alter_table2 -" +$CLICKHOUSE_CLIENT -n -q "DROP TABLE alter_table;" & +$CLICKHOUSE_CLIENT -n -q "DROP TABLE alter_table2;" & + +wait From 55ba4ec15e322bc332b845cd90d43bf6cff916e6 Mon Sep 17 00:00:00 2001 From: feng lv Date: Thu, 25 Mar 2021 10:41:03 +0000 Subject: [PATCH 111/155] Fix bar with invalid float value fix --- src/Functions/bar.cpp | 4 ++++ .../0_stateless/01774_bar_with_illegal_value.reference | 0 tests/queries/0_stateless/01774_bar_with_illegal_value.sql | 1 + 3 files changed, 5 insertions(+) create mode 100644 tests/queries/0_stateless/01774_bar_with_illegal_value.reference create mode 100644 tests/queries/0_stateless/01774_bar_with_illegal_value.sql diff --git a/src/Functions/bar.cpp b/src/Functions/bar.cpp index 7364311a1be..6f5298a8c5e 100644 --- a/src/Functions/bar.cpp +++ b/src/Functions/bar.cpp @@ -16,6 +16,7 @@ namespace ErrorCodes extern const int ARGUMENT_OUT_OF_BOUND; extern const int ILLEGAL_COLUMN; extern const int ILLEGAL_TYPE_OF_ARGUMENT; + extern const int BAD_ARGUMENTS; } namespace @@ -110,6 +111,9 @@ public: arguments[2].column->getFloat64(i), max_width); + if (!isFinite(width)) + throw Exception("Value of width must not be NaN and Inf", ErrorCodes::BAD_ARGUMENTS); + size_t next_size = current_offset + UnicodeBar::getWidthInBytes(width) + 1; dst_chars.resize(next_size); UnicodeBar::render(width, reinterpret_cast(&dst_chars[current_offset])); diff --git a/tests/queries/0_stateless/01774_bar_with_illegal_value.reference b/tests/queries/0_stateless/01774_bar_with_illegal_value.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/01774_bar_with_illegal_value.sql b/tests/queries/0_stateless/01774_bar_with_illegal_value.sql new file mode 100644 index 00000000000..60c7f303c13 --- /dev/null +++ b/tests/queries/0_stateless/01774_bar_with_illegal_value.sql @@ -0,0 +1 @@ +SELECT greatCircleAngle(1048575, 257, -9223372036854775808, 1048576) - NULL, bar(7, -inf, 1024); -- { serverError 36 } From 8ea697b7df4efa6fb01d2b418b895eb58ead71b9 Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov Date: Thu, 25 Mar 2021 18:49:01 +0300 Subject: [PATCH 112/155] use camelCase --- docs/en/sql-reference/window-functions/index.md | 2 +- src/Processors/Transforms/WindowTransform.cpp | 6 +++--- tests/performance/window_functions.xml | 6 +++--- tests/queries/0_stateless/01591_window_functions.reference | 4 ++-- tests/queries/0_stateless/01591_window_functions.sql | 4 ++-- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/docs/en/sql-reference/window-functions/index.md b/docs/en/sql-reference/window-functions/index.md index 3d18bc123f9..a646347ea60 100644 --- a/docs/en/sql-reference/window-functions/index.md +++ b/docs/en/sql-reference/window-functions/index.md @@ -25,7 +25,7 @@ ClickHouse supports the standard grammar for defining windows and window functio | `rank()`, `dense_rank()`, `row_number()` | supported | | `lag/lead(value, offset)` | Not supported. Workarounds: | | | 1) replace with `any(value) over (.... rows between preceding and preceding)`, or `following` for `lead`| -| | 2) use `lag_in_frame/lead_in_frame`, which are analogous, but respect the window frame. 
To get behavior identical to `lag/lead`, use `rows between unbounded preceding and unbounded following` | +| | 2) use `lagInFrame/leadInFrame`, which are analogous, but respect the window frame. To get behavior identical to `lag/lead`, use `rows between unbounded preceding and unbounded following` | ## References diff --git a/src/Processors/Transforms/WindowTransform.cpp b/src/Processors/Transforms/WindowTransform.cpp index 3ab16d0d1b4..4a5282c1e6b 100644 --- a/src/Processors/Transforms/WindowTransform.cpp +++ b/src/Processors/Transforms/WindowTransform.cpp @@ -1525,7 +1525,7 @@ void registerWindowFunctions(AggregateFunctionFactory & factory) // be able to process at least the lag/lead in streaming fashion. // A partial solution for constant offsets is rewriting, say `lag(value, offset) // to `any(value) over (rows between offset preceding and offset preceding)`. - // We also implement non-standard functions `lag/lead_in_frame`, that are + // We also implement non-standard functions `lag/leadInFrame`, that are // analogous to `lag/lead`, but respect the frame. // Functions like cume_dist() do require materializing the entire // partition, but it's probably also simpler to implement them by rewriting @@ -1553,14 +1553,14 @@ void registerWindowFunctions(AggregateFunctionFactory & factory) parameters); }); - factory.registerFunction("lag_in_frame", [](const std::string & name, + factory.registerFunction("lagInFrame", [](const std::string & name, const DataTypes & argument_types, const Array & parameters) { return std::make_shared>( name, argument_types, parameters); }); - factory.registerFunction("lead_in_frame", [](const std::string & name, + factory.registerFunction("leadInFrame", [](const std::string & name, const DataTypes & argument_types, const Array & parameters) { return std::make_shared>( diff --git a/tests/performance/window_functions.xml b/tests/performance/window_functions.xml index 8db168b1a97..6be3d59e2b0 100644 --- a/tests/performance/window_functions.xml +++ b/tests/performance/window_functions.xml @@ -112,7 +112,7 @@ - select lead_in_frame(number) over w + select leadInFrame(number) over w from (select number, intDiv(number, 1111) p, mod(number, 111) o from numbers(10000000)) t @@ -133,7 +133,7 @@ - select lead_in_frame(number, number) over w + select leadInFrame(number, number) over w from (select number, intDiv(number, 1111) p, mod(number, 111) o from numbers(10000000)) t @@ -143,7 +143,7 @@ - select lead_in_frame(number, number, number) over w + select leadInFrame(number, number, number) over w from (select number, intDiv(number, 1111) p, mod(number, 111) o from numbers(10000000)) t diff --git a/tests/queries/0_stateless/01591_window_functions.reference b/tests/queries/0_stateless/01591_window_functions.reference index 05228e5303b..14e5889a811 100644 --- a/tests/queries/0_stateless/01591_window_functions.reference +++ b/tests/queries/0_stateless/01591_window_functions.reference @@ -1004,8 +1004,8 @@ from numbers(5); 3 \N -- variants of lag/lead that respect the frame select number, p, pp, - lag_in_frame(number, number - pp, number * 11) over w as lag, - lead_in_frame(number, number - pp, number * 11) over w as lead + lagInFrame(number, number - pp, number * 11) over w as lag, + leadInFrame(number, number - pp, number * 11) over w as lead from (select number, intDiv(number, 5) p, p * 5 pp from numbers(16)) window w as (partition by p order by number rows between unbounded preceding and unbounded following) diff --git a/tests/queries/0_stateless/01591_window_functions.sql 
b/tests/queries/0_stateless/01591_window_functions.sql index 2783fc66d78..30847e09246 100644 --- a/tests/queries/0_stateless/01591_window_functions.sql +++ b/tests/queries/0_stateless/01591_window_functions.sql @@ -349,8 +349,8 @@ from numbers(5); -- variants of lag/lead that respect the frame select number, p, pp, - lag_in_frame(number, number - pp, number * 11) over w as lag, - lead_in_frame(number, number - pp, number * 11) over w as lead + lagInFrame(number, number - pp, number * 11) over w as lag, + leadInFrame(number, number - pp, number * 11) over w as lead from (select number, intDiv(number, 5) p, p * 5 pp from numbers(16)) window w as (partition by p order by number rows between unbounded preceding and unbounded following) From b179ae468c7f03bcfd00ee1c444394d0e8a996ec Mon Sep 17 00:00:00 2001 From: Dmitriy Date: Thu, 25 Mar 2021 19:11:30 +0300 Subject: [PATCH 113/155] Update formats.md MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Внес небольшие правки в русскую версию. --- docs/ru/interfaces/formats.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/ru/interfaces/formats.md b/docs/ru/interfaces/formats.md index 8ec26ec66f5..2cb09c2aa17 100644 --- a/docs/ru/interfaces/formats.md +++ b/docs/ru/interfaces/formats.md @@ -1235,7 +1235,7 @@ ClickHouse поддерживает настраиваемую точность ### Вставка данных {#vstavka-dannykh-1} -Чтобы вставить в ClickHouse данные из файла в формате ORC, вы можете использовать команду следующего вида: +Чтобы вставить в ClickHouse данные из файла в формате ORC, используйте команду следующего вида: ``` bash $ cat filename.orc | clickhouse-client --query="INSERT INTO some_table FORMAT ORC" @@ -1243,7 +1243,7 @@ $ cat filename.orc | clickhouse-client --query="INSERT INTO some_table FORMAT OR ### Выборка данных {#vyborka-dannykh-1} -Чтобы получить данные из таблицы ClickHouse и сохранить их в файл формата ORC, вы можете использовать команду следующего вида: +Чтобы получить данные из таблицы ClickHouse и сохранить их в файл формата ORC, используйте команду следующего вида: ``` bash $ clickhouse-client --query="SELECT * FROM {some_table} FORMAT ORC" > {filename.orc} From 177a017c5c5bbc7e4cd4c8b775c7d14b10aff4f8 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 25 Mar 2021 20:51:32 +0300 Subject: [PATCH 114/155] Minor modification #22115 --- src/Storages/HDFS/ReadBufferFromHDFS.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/HDFS/ReadBufferFromHDFS.cpp b/src/Storages/HDFS/ReadBufferFromHDFS.cpp index f3b0e3022f1..affb76314b1 100644 --- a/src/Storages/HDFS/ReadBufferFromHDFS.cpp +++ b/src/Storages/HDFS/ReadBufferFromHDFS.cpp @@ -26,7 +26,7 @@ struct ReadBufferFromHDFS::ReadBufferFromHDFSImpl HDFSBuilderWrapper builder; HDFSFSPtr fs; - explicit ReadBufferFromHDFSImpl(const std::string & hdfs_name_, + ReadBufferFromHDFSImpl(const std::string & hdfs_name_, const Poco::Util::AbstractConfiguration & config_) : hdfs_uri(hdfs_name_), builder(createHDFSBuilder(hdfs_uri, config_)) From a4aff546e95f4a0d72b65c7918c3e6739ca84ee7 Mon Sep 17 00:00:00 2001 From: Dmitriy Date: Thu, 25 Mar 2021 22:05:26 +0300 Subject: [PATCH 115/155] Fix the title MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Поправил название заголовка. 
--- docs/ru/interfaces/formats.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/interfaces/formats.md b/docs/ru/interfaces/formats.md index 2cb09c2aa17..f67997b58d6 100644 --- a/docs/ru/interfaces/formats.md +++ b/docs/ru/interfaces/formats.md @@ -1241,7 +1241,7 @@ ClickHouse поддерживает настраиваемую точность $ cat filename.orc | clickhouse-client --query="INSERT INTO some_table FORMAT ORC" ``` -### Выборка данных {#vyborka-dannykh-1} +### Вывод данных {#vyvod-dannykh-1} Чтобы получить данные из таблицы ClickHouse и сохранить их в файл формата ORC, используйте команду следующего вида: From a8ce138788de6ac2391d586d25d333c7b069660f Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 25 Mar 2021 23:08:47 +0300 Subject: [PATCH 116/155] Speedup codec NONE --- .../CachedCompressedReadBuffer.cpp | 2 +- src/Compression/CompressedReadBuffer.cpp | 6 ++-- src/Compression/CompressedReadBufferBase.cpp | 29 ++++++++++++++++++- src/Compression/CompressedReadBufferBase.h | 8 ++++- .../CompressedReadBufferFromFile.cpp | 6 ++-- 5 files changed, 42 insertions(+), 9 deletions(-) diff --git a/src/Compression/CachedCompressedReadBuffer.cpp b/src/Compression/CachedCompressedReadBuffer.cpp index 4b4d33954a9..0548de07859 100644 --- a/src/Compression/CachedCompressedReadBuffer.cpp +++ b/src/Compression/CachedCompressedReadBuffer.cpp @@ -51,7 +51,7 @@ bool CachedCompressedReadBuffer::nextImpl() { owned_cell->additional_bytes = codec->getAdditionalSizeAtTheEndOfBuffer(); owned_cell->data.resize(size_decompressed + owned_cell->additional_bytes); - decompress(owned_cell->data.data(), size_decompressed, size_compressed_without_checksum); + decompressTo(owned_cell->data.data(), size_decompressed, size_compressed_without_checksum); } diff --git a/src/Compression/CompressedReadBuffer.cpp b/src/Compression/CompressedReadBuffer.cpp index 6a082164231..6393723acfd 100644 --- a/src/Compression/CompressedReadBuffer.cpp +++ b/src/Compression/CompressedReadBuffer.cpp @@ -21,7 +21,7 @@ bool CompressedReadBuffer::nextImpl() memory.resize(size_decompressed + additional_size_at_the_end_of_buffer); working_buffer = Buffer(memory.data(), &memory[size_decompressed]); - decompress(working_buffer.begin(), size_decompressed, size_compressed_without_checksum); + decompress(working_buffer, size_decompressed, size_compressed_without_checksum); return true; } @@ -48,7 +48,7 @@ size_t CompressedReadBuffer::readBig(char * to, size_t n) /// If the decompressed block fits entirely where it needs to be copied. 
         if (size_decompressed + additional_size_at_the_end_of_buffer <= n - bytes_read)
         {
-            decompress(to + bytes_read, size_decompressed, size_compressed_without_checksum);
+            decompressTo(to + bytes_read, size_decompressed, size_compressed_without_checksum);
             bytes_read += size_decompressed;
             bytes += size_decompressed;
         }
@@ -63,7 +63,7 @@ size_t CompressedReadBuffer::readBig(char * to, size_t n)
             working_buffer = Buffer(memory.data(), &memory[size_decompressed]);
             pos = working_buffer.begin();

-            decompress(working_buffer.begin(), size_decompressed, size_compressed_without_checksum);
+            decompress(working_buffer, size_decompressed, size_compressed_without_checksum);

             bytes_read += read(to + bytes_read, n - bytes_read);
             break;
diff --git a/src/Compression/CompressedReadBufferBase.cpp b/src/Compression/CompressedReadBufferBase.cpp
index 8f5b779e4bc..65ba9607468 100644
--- a/src/Compression/CompressedReadBufferBase.cpp
+++ b/src/Compression/CompressedReadBufferBase.cpp
@@ -184,7 +184,7 @@ size_t CompressedReadBufferBase::readCompressedData(size_t & size_decompressed,
 }

-void CompressedReadBufferBase::decompress(char * to, size_t size_decompressed, size_t size_compressed_without_checksum)
+static void readHeaderAndGetCodec(const char * compressed_buffer, size_t size_decompressed, CompressionCodecPtr & codec, bool allow_different_codecs)
 {
     ProfileEvents::increment(ProfileEvents::CompressedReadBufferBlocks);
     ProfileEvents::increment(ProfileEvents::CompressedReadBufferBytes, size_decompressed);
@@ -210,11 +210,38 @@ void CompressedReadBufferBase::decompress(char * to, size_t size_decompressed, s
                 ErrorCodes::CANNOT_DECOMPRESS);
         }
     }
+}
+
+void CompressedReadBufferBase::decompressTo(char * to, size_t size_decompressed, size_t size_compressed_without_checksum)
+{
+    readHeaderAndGetCodec(compressed_buffer, size_decompressed, codec, allow_different_codecs);

     codec->decompress(compressed_buffer, size_compressed_without_checksum, to);
 }

+void CompressedReadBufferBase::decompress(BufferBase::Buffer & to, size_t size_decompressed, size_t size_compressed_without_checksum)
+{
+    readHeaderAndGetCodec(compressed_buffer, size_decompressed, codec, allow_different_codecs);
+
+    if (codec->isNone())
+    {
+        /// Shortcut for NONE codec to avoid extra memcpy.
+        /// We do it by changing the buffer `to` to point to the existing uncompressed data.
+
+        UInt8 header_size = ICompressionCodec::getHeaderSize();
+        if (size_compressed_without_checksum < header_size)
+            throw Exception(ErrorCodes::CORRUPTED_DATA,
+                "Can't decompress data: the compressed data size ({}, this should include the header size) is less than the header size ({})",
+                size_compressed_without_checksum, size_t(header_size));
+
+        to = BufferBase::Buffer(compressed_buffer + header_size, compressed_buffer + size_compressed_without_checksum);
+    }
+
+    codec->decompress(compressed_buffer, size_compressed_without_checksum, to.begin());
+}
+

 /// 'compressed_in' could be initialized lazily, but before first call of 'readCompressedData'.
CompressedReadBufferBase::CompressedReadBufferBase(ReadBuffer * in, bool allow_different_codecs_) : compressed_in(in), own_compressed_buffer(0), allow_different_codecs(allow_different_codecs_) diff --git a/src/Compression/CompressedReadBufferBase.h b/src/Compression/CompressedReadBufferBase.h index 60b8847f639..c1e928039ef 100644 --- a/src/Compression/CompressedReadBufferBase.h +++ b/src/Compression/CompressedReadBufferBase.h @@ -3,6 +3,7 @@ #include #include #include +#include namespace DB @@ -37,7 +38,12 @@ protected: /// Returns number of compressed bytes read. size_t readCompressedData(size_t & size_decompressed, size_t & size_compressed_without_checksum, bool always_copy); - void decompress(char * to, size_t size_decompressed, size_t size_compressed_without_checksum); + /// Decompress into memory pointed by `to` + void decompressTo(char * to, size_t size_decompressed, size_t size_compressed_without_checksum); + + /// This method can change location of `to` to avoid unnecessary copy if data is uncompressed. + /// It is more efficient for compression codec NONE but not suitable if you want to decompress into specific location. + void decompress(BufferBase::Buffer & to, size_t size_decompressed, size_t size_compressed_without_checksum); public: /// 'compressed_in' could be initialized lazily, but before first call of 'readCompressedData'. diff --git a/src/Compression/CompressedReadBufferFromFile.cpp b/src/Compression/CompressedReadBufferFromFile.cpp index 54f360f417b..ea12ec7e8b7 100644 --- a/src/Compression/CompressedReadBufferFromFile.cpp +++ b/src/Compression/CompressedReadBufferFromFile.cpp @@ -31,7 +31,7 @@ bool CompressedReadBufferFromFile::nextImpl() memory.resize(size_decompressed + additional_size_at_the_end_of_buffer); working_buffer = Buffer(memory.data(), &memory[size_decompressed]); - decompress(working_buffer.begin(), size_decompressed, size_compressed_without_checksum); + decompress(working_buffer, size_decompressed, size_compressed_without_checksum); return true; } @@ -108,7 +108,7 @@ size_t CompressedReadBufferFromFile::readBig(char * to, size_t n) /// If the decompressed block fits entirely where it needs to be copied. if (size_decompressed + additional_size_at_the_end_of_buffer <= n - bytes_read) { - decompress(to + bytes_read, size_decompressed, size_compressed_without_checksum); + decompressTo(to + bytes_read, size_decompressed, size_compressed_without_checksum); bytes_read += size_decompressed; bytes += size_decompressed; } @@ -124,7 +124,7 @@ size_t CompressedReadBufferFromFile::readBig(char * to, size_t n) working_buffer = Buffer(memory.data(), &memory[size_decompressed]); pos = working_buffer.begin(); - decompress(working_buffer.begin(), size_decompressed, size_compressed_without_checksum); + decompress(working_buffer, size_decompressed, size_compressed_without_checksum); bytes_read += read(to + bytes_read, n - bytes_read); break; From 48fe30e5a28592f326a38d8d8261e782a7b0ba7d Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Thu, 25 Mar 2021 23:41:03 +0300 Subject: [PATCH 117/155] Add missing logging for exception in InterserverIOHTTPHandler --- src/Server/InterserverIOHTTPHandler.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Server/InterserverIOHTTPHandler.cpp b/src/Server/InterserverIOHTTPHandler.cpp index 740072e8e9f..7d8dfaaf2c8 100644 --- a/src/Server/InterserverIOHTTPHandler.cpp +++ b/src/Server/InterserverIOHTTPHandler.cpp @@ -107,6 +107,7 @@ void InterserverIOHTTPHandler::handleRequest(HTTPServerRequest & request, HTTPSe } catch (...) 
{ + tryLogCurrentException(log); out.finalize(); } }; From e1de9600253c741e0d8f4b15c5c64dab658544b0 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Thu, 25 Mar 2021 23:41:03 +0300 Subject: [PATCH 118/155] Fix uncaught exception in InterserverIOHTTPHandler There was one more uncaught exception case [1]: 2021.03.19 18:11:00.845632 [ 17469 ] {} InterserverIOHTTPHandler: Done processing query ... 2021.03.19 18:11:31.698961 [ 80145 ] {} BaseDaemon: ######################################## 2021.03.19 18:11:31.699903 [ 80145 ] {} BaseDaemon: (version 21.4.1.6293 (official build), build id: 859E400E1C65C4702FE491420741DD8B58190002) (from thread 17469) (no query) Received signal Aborted (6) 2021.03.19 18:11:32.614075 [ 80145 ] {} BaseDaemon: 8. ./obj-x86_64-linux-gnu/../contrib/libcxxabi/src/cxa_handlers.cpp:89: std::terminate() @ 0x21e9b3a2 in /usr/bin/clickhouse 2021.03.19 18:11:43.831215 [ 80145 ] {} BaseDaemon: 10. ./obj-x86_64-linux-gnu/../src/Server/HTTP/WriteBufferFromHTTPServerResponse.cpp:201: ? @ 0x1be77038 in /usr/bin/clickhouse 2021.03.19 18:11:44.743193 [ 80145 ] {} BaseDaemon: 11. ./obj-x86_64-linux-gnu/../contrib/libcxx/include/memory:892: std::__1::allocator::destroy(DB::WriteBufferFromHTTPServerResponse*) @ 0x1bddd7c9 in /usr/bin/clickhouse 2021.03.19 18:11:45.283905 [ 80145 ] {} BaseDaemon: 12. ./obj-x86_64-linux-gnu/../contrib/libcxx/include/__memory/allocator_traits.h:541: void std::__1::allocator_traits >::__destroy(std::__1::integral_constant, std::__1::allocator&, DB::WriteBufferFromHTTPServerResponse*) @ 0x1bddd79d in /usr/bin/clickhouse 2021.03.19 18:11:45.805233 [ 80145 ] {} BaseDaemon: 13. ./obj-x86_64-linux-gnu/../contrib/libcxx/include/__memory/allocator_traits.h:487: void std::__1::allocator_traits >::destroy(std::__1::allocator&, DB::WriteBufferFromHTTPServerResponse*) @ 0x1bddd76d in /usr/bin/clickhouse 2021.03.19 18:11:46.351371 [ 80145 ] {} BaseDaemon: 14. ./obj-x86_64-linux-gnu/../contrib/libcxx/include/memory:2611: std::__1::__shared_ptr_emplace >::__on_zero_shared() @ 0x1bddd525 in /usr/bin/clickhouse 2021.03.19 18:11:46.579263 [ 80145 ] {} BaseDaemon: 15. ./obj-x86_64-linux-gnu/../contrib/libcxx/include/memory:2476: std::__1::__shared_count::__release_shared() @ 0x119490ed in /usr/bin/clickhouse 2021.03.19 18:11:46.790912 [ 80145 ] {} BaseDaemon: 16. ./obj-x86_64-linux-gnu/../contrib/libcxx/include/memory:2517: std::__1::__shared_weak_count::__release_shared() @ 0x1194908f in /usr/bin/clickhouse 2021.03.19 18:11:47.277990 [ 80145 ] {} BaseDaemon: 17. ./obj-x86_64-linux-gnu/../contrib/libcxx/include/memory:3213: std::__1::shared_ptr::~shared_ptr() @ 0x1bdd75fc in /usr/bin/clickhouse 2021.03.19 18:11:47.649213 [ 80145 ] {} BaseDaemon: 18. ./obj-x86_64-linux-gnu/../src/Server/InterserverIOHTTPHandler.h:34: DB::InterserverIOHTTPHandler::Output::~Output() @ 0x1bdf6bd5 in /usr/bin/clickhouse 2021.03.19 18:11:47.921556 [ 80145 ] {} BaseDaemon: 19. ./obj-x86_64-linux-gnu/../src/Server/InterserverIOHTTPHandler.cpp:154: DB::InterserverIOHTTPHandler::handleRequest(DB::HTTPServerRequest&, DB::HTTPServerResponse&) @ 0x1bdf653f in /usr/bin/clickhouse [1]: https://clickhouse-test-reports.s3.yandex.net/0/78c56b891383288cf3a893139e796fc87476412e/stress_test_(debug).html Since in case of no errors during processing we should call finalize, to ensure that it will not be called from dtor. 
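In other words (a hedged sketch of the resulting control flow, simplified from the diff below; `used_output.out` is the handler's WriteBufferFromHTTPServerResponse):

    if (auto [message, success] = checkAuthentication(request); success)
    {
        processQuery(request, response, used_output);  /// may throw; the surrounding
                                                       /// catch block finalizes and logs
        used_output.out->finalize();  /// success path: send the remaining data here, so
                                      /// ~WriteBufferFromHTTPServerResponse() finds
                                      /// nothing left to flush and cannot throw
    }
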
Fixes: #22046 Fixes: #22067 --- src/Server/InterserverIOHTTPHandler.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Server/InterserverIOHTTPHandler.cpp b/src/Server/InterserverIOHTTPHandler.cpp index 7d8dfaaf2c8..426e4ca2138 100644 --- a/src/Server/InterserverIOHTTPHandler.cpp +++ b/src/Server/InterserverIOHTTPHandler.cpp @@ -117,6 +117,7 @@ void InterserverIOHTTPHandler::handleRequest(HTTPServerRequest & request, HTTPSe if (auto [message, success] = checkAuthentication(request); success) { processQuery(request, response, used_output); + used_output.out->finalize(); LOG_DEBUG(log, "Done processing query"); } else From 50003e496a1551a49733b3c67b1b0fb4939af54b Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Thu, 25 Mar 2021 23:41:03 +0300 Subject: [PATCH 119/155] Use existing logger for logging from WriteBufferFromS3 dtor --- src/IO/WriteBufferFromS3.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/IO/WriteBufferFromS3.cpp b/src/IO/WriteBufferFromS3.cpp index 7373b24991a..93aaf9456b5 100644 --- a/src/IO/WriteBufferFromS3.cpp +++ b/src/IO/WriteBufferFromS3.cpp @@ -120,7 +120,7 @@ WriteBufferFromS3::~WriteBufferFromS3() } catch (...) { - tryLogCurrentException(__PRETTY_FUNCTION__); + tryLogCurrentException(log); } } From f1907acbcd96b3b5ce0e6073e7faa44479c3221b Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Thu, 25 Mar 2021 23:42:18 +0300 Subject: [PATCH 120/155] Use finalize() over next() for nested writers Refs: https://github.com/ClickHouse/ClickHouse/pull/21325#discussion_r585348309 --- src/IO/BrotliWriteBuffer.cpp | 2 +- src/IO/LZMADeflatingWriteBuffer.cpp | 2 +- src/IO/ZlibDeflatingWriteBuffer.cpp | 2 +- src/IO/ZstdDeflatingWriteBuffer.cpp | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/IO/BrotliWriteBuffer.cpp b/src/IO/BrotliWriteBuffer.cpp index e87eeb1a2be..512ed5fc93f 100644 --- a/src/IO/BrotliWriteBuffer.cpp +++ b/src/IO/BrotliWriteBuffer.cpp @@ -106,7 +106,7 @@ void BrotliWriteBuffer::finish() try { finishImpl(); - out->next(); + out->finalize(); finished = true; } catch (...) diff --git a/src/IO/LZMADeflatingWriteBuffer.cpp b/src/IO/LZMADeflatingWriteBuffer.cpp index 96f1d34b01b..7ea4f7945dc 100644 --- a/src/IO/LZMADeflatingWriteBuffer.cpp +++ b/src/IO/LZMADeflatingWriteBuffer.cpp @@ -105,7 +105,7 @@ void LZMADeflatingWriteBuffer::finish() try { finishImpl(); - out->next(); + out->finalize(); finished = true; } catch (...) diff --git a/src/IO/ZlibDeflatingWriteBuffer.cpp b/src/IO/ZlibDeflatingWriteBuffer.cpp index 5da82b52279..7e91820f298 100644 --- a/src/IO/ZlibDeflatingWriteBuffer.cpp +++ b/src/IO/ZlibDeflatingWriteBuffer.cpp @@ -107,7 +107,7 @@ void ZlibDeflatingWriteBuffer::finish() try { finishImpl(); - out->next(); + out->finalize(); finished = true; } catch (...) diff --git a/src/IO/ZstdDeflatingWriteBuffer.cpp b/src/IO/ZstdDeflatingWriteBuffer.cpp index 27694797db6..5b97588b33e 100644 --- a/src/IO/ZstdDeflatingWriteBuffer.cpp +++ b/src/IO/ZstdDeflatingWriteBuffer.cpp @@ -94,7 +94,7 @@ void ZstdDeflatingWriteBuffer::finish() try { finishImpl(); - out->next(); + out->finalize(); finished = true; } catch (...) 
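The four deflating buffers patched above now share one shape; condensed into a single hedged sketch (CompressingWriteBuffer is a placeholder name standing for the Brotli/LZMA/Zlib/Zstd variants, and the surrounding try/catch is elided):

    void CompressingWriteBuffer::finish()
    {
        if (finished)
            return;
        finishImpl();     /// drain the codec's internal state into 'out'
        out->finalize();  /// close the nested buffer for good; the previous
                          /// out->next() only flushed it and left finalization
                          /// to its destructor, which must not do real work
        finished = true;
    }
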
From 30cd1c614514973d88d8475daf88fa5bd6ae5a04 Mon Sep 17 00:00:00 2001
From: Azat Khuzhin
Date: Sun, 21 Mar 2021 10:58:31 +0300
Subject: [PATCH 121/155] Fix typo in FirstSignificantSubdomainCustomLookup name

---
 src/Functions/URL/FirstSignificantSubdomainCustomImpl.h | 8 ++++----
 .../URL/cutToFirstSignificantSubdomainCustom.cpp        | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/Functions/URL/FirstSignificantSubdomainCustomImpl.h b/src/Functions/URL/FirstSignificantSubdomainCustomImpl.h
index 244b32459c1..d6868834f75 100644
--- a/src/Functions/URL/FirstSignificantSubdomainCustomImpl.h
+++ b/src/Functions/URL/FirstSignificantSubdomainCustomImpl.h
@@ -17,10 +17,10 @@ namespace ErrorCodes
     extern const int ILLEGAL_TYPE_OF_ARGUMENT;
 }

-struct FirstSignificantSubdomainCustomtLookup
+struct FirstSignificantSubdomainCustomLookup
 {
     const TLDList & tld_list;
-    FirstSignificantSubdomainCustomtLookup(const std::string & tld_list_name)
+    FirstSignificantSubdomainCustomLookup(const std::string & tld_list_name)
         : tld_list(TLDListsHolder::getInstance().getTldList(tld_list_name))
     {
     }
@@ -63,7 +63,7 @@ public:
     ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & /*result_type*/, size_t /*input_rows_count*/) const override
     {
         const ColumnConst * column_tld_list_name = checkAndGetColumnConstStringOrFixedString(arguments[1].column.get());
-        FirstSignificantSubdomainCustomtLookup tld_lookup(column_tld_list_name->getValue());
+        FirstSignificantSubdomainCustomLookup tld_lookup(column_tld_list_name->getValue());

         /// FIXME: convertToFullColumnIfConst() is suboptimal
         auto column = arguments[0].column->convertToFullColumnIfConst();
@@ -79,7 +79,7 @@ public:
                 ErrorCodes::ILLEGAL_COLUMN);
     }

-    static void vector(FirstSignificantSubdomainCustomtLookup & tld_lookup,
+    static void vector(FirstSignificantSubdomainCustomLookup & tld_lookup,
         const ColumnString::Chars & data, const ColumnString::Offsets & offsets,
         ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets)
     {
diff --git a/src/Functions/URL/cutToFirstSignificantSubdomainCustom.cpp b/src/Functions/URL/cutToFirstSignificantSubdomainCustom.cpp
index 11fd27e317b..88d8fc7704e 100644
--- a/src/Functions/URL/cutToFirstSignificantSubdomainCustom.cpp
+++ b/src/Functions/URL/cutToFirstSignificantSubdomainCustom.cpp
@@ -10,7 +10,7 @@ struct CutToFirstSignificantSubdomainCustom
 {
     static size_t getReserveLengthForElement() { return 15; }

-    static void execute(FirstSignificantSubdomainCustomtLookup & tld_lookup, const Pos data, const size_t size, Pos & res_data, size_t & res_size)
+    static void execute(FirstSignificantSubdomainCustomLookup & tld_lookup, const Pos data, const size_t size, Pos & res_data, size_t & res_size)
     {
         res_data = data;
         res_size = 0;

From b68517f69ea0b7dd54716734bb1bf33f55c5956a Mon Sep 17 00:00:00 2001
From: Azat Khuzhin
Date: Sun, 21 Mar 2021 10:55:28 +0300
Subject: [PATCH 122/155] Fix cutToFirstSignificantSubdomainCustom()/firstSignificantSubdomainCustom() for 3+ level domains

Custom TLD lists (added in #17748) may contain domains of the 3rd level,
while the built-in TLD lists have no such records, so the built-in
variant is not affected.

Note that this will significantly increase hashtable lookups.
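The extra cost is visible in the executeCustom() hunk below; roughly (a condensed sketch with the result bookkeeping elided), every dot-delimited suffix of the host has to be probed against the list, because a custom record may sit at any level:

    /// For 'xx.blogspot.co.at' this probes "blogspot.co.at", then "co.at",
    /// then "at", one hashtable lookup per label, instead of the single
    /// second-level check that suffices for the built-in list.
    const char * pos = find_first_symbols<'.'>(begin, end);
    while (pos < end)
    {
        if (lookup(pos + 1, end - pos - 1))
            break;  /// listed suffix found; the real code derives the result here
        pos = find_first_symbols<'.'>(pos + 1, end);
    }
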
Fixes: #17748 --- .../URL/ExtractFirstSignificantSubdomain.h | 65 ++++++++++++++++++- .../cutToFirstSignificantSubdomainCustom.cpp | 2 +- .../0_stateless/01601_custom_tld.reference | 19 +++++- .../queries/0_stateless/01601_custom_tld.sql | 25 +++++-- 4 files changed, 101 insertions(+), 10 deletions(-) diff --git a/src/Functions/URL/ExtractFirstSignificantSubdomain.h b/src/Functions/URL/ExtractFirstSignificantSubdomain.h index c13b5f50156..974574058e9 100644 --- a/src/Functions/URL/ExtractFirstSignificantSubdomain.h +++ b/src/Functions/URL/ExtractFirstSignificantSubdomain.h @@ -90,7 +90,70 @@ struct ExtractFirstSignificantSubdomain res_data += last_3_periods[1] + 1 - begin; res_size = last_3_periods[0] - last_3_periods[1] - 1; } - } + } + + /// The difference with execute() is due to custom TLD list can have records of any level, + /// not only 2-nd level (like non-custom variant), so it requires more lookups. + template + static void executeCustom(const Lookup & lookup, const Pos data, const size_t size, Pos & res_data, size_t & res_size, Pos * out_domain_end = nullptr) + { + res_data = data; + res_size = 0; + + Pos tmp; + size_t domain_length; + ExtractDomain::execute(data, size, tmp, domain_length); + + if (domain_length == 0) + return; + + if (out_domain_end) + *out_domain_end = tmp + domain_length; + + /// cut useless dot + if (tmp[domain_length - 1] == '.') + --domain_length; + + res_data = tmp; + res_size = domain_length; + + auto begin = tmp; + auto end = begin + domain_length; + const char * last_2_periods[2]{}; + const char * prev = begin - 1; + + auto pos = find_first_symbols<'.'>(begin, end); + while (pos < end) + { + if (lookup(pos + 1, end - pos - 1)) + { + res_data += prev + 1 - begin; + res_size = end - 1 - prev; + return; + } + + last_2_periods[1] = last_2_periods[0]; + last_2_periods[0] = pos; + prev = pos; + pos = find_first_symbols<'.'>(pos + 1, end); + } + + /// if there is domain of the first level (i.e. 
no dots in the hostname) -> return nothing + if (!last_2_periods[0]) + return; + + /// if there is domain of the second level -> always return itself + if (!last_2_periods[1]) + { + res_size = last_2_periods[0] - begin; + return; + } + + /// if there is domain of the 3+ level, and zero records in TLD list -> + /// fallback to domain of the second level + res_data += last_2_periods[1] + 1 - begin; + res_size = last_2_periods[0] - last_2_periods[1] - 1; + } }; } diff --git a/src/Functions/URL/cutToFirstSignificantSubdomainCustom.cpp b/src/Functions/URL/cutToFirstSignificantSubdomainCustom.cpp index 88d8fc7704e..7532ddd00f2 100644 --- a/src/Functions/URL/cutToFirstSignificantSubdomainCustom.cpp +++ b/src/Functions/URL/cutToFirstSignificantSubdomainCustom.cpp @@ -18,7 +18,7 @@ struct CutToFirstSignificantSubdomainCustom Pos tmp_data; size_t tmp_length; Pos domain_end; - ExtractFirstSignificantSubdomain::execute(tld_lookup, data, size, tmp_data, tmp_length, &domain_end); + ExtractFirstSignificantSubdomain::executeCustom(tld_lookup, data, size, tmp_data, tmp_length, &domain_end); if (tmp_length == 0) return; diff --git a/tests/queries/0_stateless/01601_custom_tld.reference b/tests/queries/0_stateless/01601_custom_tld.reference index 98b99778396..e056505f273 100644 --- a/tests/queries/0_stateless/01601_custom_tld.reference +++ b/tests/queries/0_stateless/01601_custom_tld.reference @@ -1,11 +1,24 @@ -no-tld +-- no-tld + +foo.there-is-no-such-domain +foo.there-is-no-such-domain foo.there-is-no-such-domain foo.there-is-no-such-domain foo -generic +-- generic kernel kernel.biz.ss -difference +-- difference biz.ss kernel.biz.ss +-- 3+level +xx.blogspot.co.at +blogspot +xx.blogspot.co.at +blogspot +-- url +foobar.com +foobar.com +foobar.com +xx.blogspot.co.at diff --git a/tests/queries/0_stateless/01601_custom_tld.sql b/tests/queries/0_stateless/01601_custom_tld.sql index 6d68299c07d..688dd419858 100644 --- a/tests/queries/0_stateless/01601_custom_tld.sql +++ b/tests/queries/0_stateless/01601_custom_tld.sql @@ -1,16 +1,31 @@ -select 'no-tld'; -select cutToFirstSignificantSubdomainCustom('there-is-no-such-domain', 'public_suffix_list'); +select '-- no-tld'; -- even if there is no TLD, 2-nd level by default anyway -- FIXME: make this behavior optional (so that TLD for host never changed, either empty or something real) +select cutToFirstSignificantSubdomain('there-is-no-such-domain'); +select cutToFirstSignificantSubdomain('foo.there-is-no-such-domain'); +select cutToFirstSignificantSubdomain('bar.foo.there-is-no-such-domain'); +select cutToFirstSignificantSubdomainCustom('there-is-no-such-domain', 'public_suffix_list'); select cutToFirstSignificantSubdomainCustom('foo.there-is-no-such-domain', 'public_suffix_list'); select cutToFirstSignificantSubdomainCustom('bar.foo.there-is-no-such-domain', 'public_suffix_list'); select firstSignificantSubdomainCustom('bar.foo.there-is-no-such-domain', 'public_suffix_list'); -select 'generic'; -select firstSignificantSubdomainCustom('foo.kernel.biz.ss', 'public_suffix_list'); -- kernel.biz.ss +select '-- generic'; +select firstSignificantSubdomainCustom('foo.kernel.biz.ss', 'public_suffix_list'); -- kernel select cutToFirstSignificantSubdomainCustom('foo.kernel.biz.ss', 'public_suffix_list'); -- kernel.biz.ss -select 'difference'; +select '-- difference'; -- biz.ss is not in the default TLD list, hence: select cutToFirstSignificantSubdomain('foo.kernel.biz.ss'); -- biz.ss select cutToFirstSignificantSubdomainCustom('foo.kernel.biz.ss', 'public_suffix_list'); -- 
kernel.biz.ss + +select '-- 3+level'; +select cutToFirstSignificantSubdomainCustom('xx.blogspot.co.at', 'public_suffix_list'); -- xx.blogspot.co.at +select firstSignificantSubdomainCustom('xx.blogspot.co.at', 'public_suffix_list'); -- blogspot +select cutToFirstSignificantSubdomainCustom('foo.bar.xx.blogspot.co.at', 'public_suffix_list'); -- xx.blogspot.co.at +select firstSignificantSubdomainCustom('foo.bar.xx.blogspot.co.at', 'public_suffix_list'); -- blogspot + +select '-- url'; +select cutToFirstSignificantSubdomainCustom('http://foobar.com', 'public_suffix_list'); +select cutToFirstSignificantSubdomainCustom('http://foobar.com/foo', 'public_suffix_list'); +select cutToFirstSignificantSubdomainCustom('http://bar.foobar.com/foo', 'public_suffix_list'); +select cutToFirstSignificantSubdomainCustom('http://xx.blogspot.co.at', 'public_suffix_list'); From a12cc5f559682dee90bfe8703d92b84ec5e9b157 Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov Date: Fri, 26 Mar 2021 00:00:18 +0300 Subject: [PATCH 123/155] adjust perf test thresholds --- tests/performance/arithmetic.xml | 2 +- tests/performance/array_join.xml | 2 +- tests/performance/bounding_ratio.xml | 2 +- tests/performance/codecs_float_insert.xml | 2 +- tests/performance/codecs_int_insert.xml | 2 +- tests/performance/conditional.xml | 2 +- tests/performance/constant_column_search.xml | 2 +- tests/performance/date_time_64.xml | 2 +- tests/performance/date_time_long.xml | 2 +- tests/performance/direct_dictionary.xml | 2 +- tests/performance/float_formatting.xml | 2 +- tests/performance/fuzz_bits.xml | 2 +- tests/performance/general_purpose_hashes.xml | 2 +- tests/performance/generate_table_function.xml | 2 +- tests/performance/group_by_sundy_li.xml | 2 +- tests/performance/if_array_string.xml | 2 +- tests/performance/int_parsing.xml | 2 +- tests/performance/jit_small_requests.xml | 2 +- tests/performance/joins_in_memory.xml | 2 +- tests/performance/joins_in_memory_pmj.xml | 2 +- tests/performance/logical_functions_medium.xml | 2 +- tests/performance/logical_functions_small.xml | 2 +- tests/performance/math.xml | 2 +- tests/performance/optimized_select_final.xml | 2 +- tests/performance/optimized_select_final_one_part.xml | 2 +- tests/performance/or_null_default.xml | 2 +- tests/performance/parse_engine_file.xml | 2 +- tests/performance/random_string.xml | 2 +- tests/performance/sum.xml | 2 +- tests/performance/sum_map.xml | 2 +- tests/performance/url_hits.xml | 2 +- 31 files changed, 31 insertions(+), 31 deletions(-) diff --git a/tests/performance/arithmetic.xml b/tests/performance/arithmetic.xml index 0be61eb5823..bf5e7662e37 100644 --- a/tests/performance/arithmetic.xml +++ b/tests/performance/arithmetic.xml @@ -1,4 +1,4 @@ - + 30000000000 diff --git a/tests/performance/array_join.xml b/tests/performance/array_join.xml index ca280ce28ad..cf92b51f545 100644 --- a/tests/performance/array_join.xml +++ b/tests/performance/array_join.xml @@ -1,4 +1,4 @@ - + diff --git a/tests/performance/bounding_ratio.xml b/tests/performance/bounding_ratio.xml index e3a15f90013..e430136b624 100644 --- a/tests/performance/bounding_ratio.xml +++ b/tests/performance/bounding_ratio.xml @@ -1,4 +1,4 @@ - + SELECT boundingRatio(number, number) FROM numbers(100000000) SELECT (argMax(number, number) - argMin(number, number)) / (max(number) - min(number)) FROM numbers(100000000) diff --git a/tests/performance/codecs_float_insert.xml b/tests/performance/codecs_float_insert.xml index a7cb5152c09..b282bcc268f 100644 --- a/tests/performance/codecs_float_insert.xml +++ 
b/tests/performance/codecs_float_insert.xml @@ -1,5 +1,5 @@ - + 1 diff --git a/tests/performance/codecs_int_insert.xml b/tests/performance/codecs_int_insert.xml index caefaba3725..662df80ae70 100644 --- a/tests/performance/codecs_int_insert.xml +++ b/tests/performance/codecs_int_insert.xml @@ -1,4 +1,4 @@ - + 1 diff --git a/tests/performance/conditional.xml b/tests/performance/conditional.xml index 21623f45b05..91b6cb95ff2 100644 --- a/tests/performance/conditional.xml +++ b/tests/performance/conditional.xml @@ -1,4 +1,4 @@ - + SELECT count() FROM zeros(10000000) WHERE NOT ignore(if(rand() % 2, toDateTime('2019-02-04 01:24:31'), toDate('2019-02-04'))) SELECT count() FROM zeros(10000000) WHERE NOT ignore(multiIf(rand() % 2, toDateTime('2019-02-04 01:24:31'), toDate('2019-02-04'))) SELECT count() FROM zeros(10000000) WHERE NOT ignore(if(rand() % 2, [toDateTime('2019-02-04 01:24:31')], [toDate('2019-02-04')])) diff --git a/tests/performance/constant_column_search.xml b/tests/performance/constant_column_search.xml index cb76fd4cefb..71d8185d818 100644 --- a/tests/performance/constant_column_search.xml +++ b/tests/performance/constant_column_search.xml @@ -1,4 +1,4 @@ - + search diff --git a/tests/performance/date_time_64.xml b/tests/performance/date_time_64.xml index 838aba34d87..fd883416a33 100644 --- a/tests/performance/date_time_64.xml +++ b/tests/performance/date_time_64.xml @@ -1,4 +1,4 @@ - + hits_100m_single diff --git a/tests/performance/date_time_long.xml b/tests/performance/date_time_long.xml index 0c3d85f9659..c2eb42d3318 100644 --- a/tests/performance/date_time_long.xml +++ b/tests/performance/date_time_long.xml @@ -1,4 +1,4 @@ - + datetime_transform diff --git a/tests/performance/direct_dictionary.xml b/tests/performance/direct_dictionary.xml index eb1b4e0da00..cd9aa73a128 100644 --- a/tests/performance/direct_dictionary.xml +++ b/tests/performance/direct_dictionary.xml @@ -1,4 +1,4 @@ - + CREATE TABLE simple_direct_dictionary_test_table ( diff --git a/tests/performance/float_formatting.xml b/tests/performance/float_formatting.xml index d24ccd7664c..71d8aee3f89 100644 --- a/tests/performance/float_formatting.xml +++ b/tests/performance/float_formatting.xml @@ -3,7 +3,7 @@ is 10 times faster than toString(number % 100 + 0.5). The shorter queries are somewhat unstable, so ignore differences less than 10%. 
--> - + expr diff --git a/tests/performance/fuzz_bits.xml b/tests/performance/fuzz_bits.xml index 2679977cb1d..87064e520c2 100644 --- a/tests/performance/fuzz_bits.xml +++ b/tests/performance/fuzz_bits.xml @@ -1,4 +1,4 @@ - + diff --git a/tests/performance/general_purpose_hashes.xml b/tests/performance/general_purpose_hashes.xml index bd2fa9674f6..f34554360cf 100644 --- a/tests/performance/general_purpose_hashes.xml +++ b/tests/performance/general_purpose_hashes.xml @@ -1,4 +1,4 @@ - + gp_hash_func diff --git a/tests/performance/generate_table_function.xml b/tests/performance/generate_table_function.xml index bc49a7de1bd..0339a8c19e8 100644 --- a/tests/performance/generate_table_function.xml +++ b/tests/performance/generate_table_function.xml @@ -1,4 +1,4 @@ - + SELECT sum(NOT ignore(*)) FROM (SELECT * FROM generateRandom('ui64 UInt64, i64 Int64, ui32 UInt32, i32 Int32, ui16 UInt16, i16 Int16, ui8 UInt8, i8 Int8') LIMIT 1000000000); SELECT sum(NOT ignore(*)) FROM (SELECT * FROM generateRandom('ui64 UInt64, i64 Int64, ui32 UInt32, i32 Int32, ui16 UInt16, i16 Int16, ui8 UInt8, i8 Int8', 0, 10, 10) LIMIT 1000000000); SELECT sum(NOT ignore(*)) FROM (SELECT * FROM generateRandom('i Enum8(\'hello\' = 1, \'world\' = 5)', 0, 10, 10) LIMIT 1000000000); diff --git a/tests/performance/group_by_sundy_li.xml b/tests/performance/group_by_sundy_li.xml index c49712a8519..aebc305335c 100644 --- a/tests/performance/group_by_sundy_li.xml +++ b/tests/performance/group_by_sundy_li.xml @@ -1,4 +1,4 @@ - + 8 diff --git a/tests/performance/if_array_string.xml b/tests/performance/if_array_string.xml index 445b3c8c55a..5d33bfda51f 100644 --- a/tests/performance/if_array_string.xml +++ b/tests/performance/if_array_string.xml @@ -1,4 +1,4 @@ - + SELECT count() FROM zeros(10000000) WHERE NOT ignore(rand() % 2 ? ['Hello', 'World'] : ['a', 'b', 'c']) SELECT count() FROM zeros(10000000) WHERE NOT ignore(rand() % 2 ? materialize(['Hello', 'World']) : ['a', 'b', 'c']) SELECT count() FROM zeros(10000000) WHERE NOT ignore(rand() % 2 ? 
['Hello', 'World'] : materialize(['a', 'b', 'c'])) diff --git a/tests/performance/int_parsing.xml b/tests/performance/int_parsing.xml index 3b8620e46c3..32f904331ce 100644 --- a/tests/performance/int_parsing.xml +++ b/tests/performance/int_parsing.xml @@ -1,4 +1,4 @@ - + hits_100m_single hits_10m_single diff --git a/tests/performance/jit_small_requests.xml b/tests/performance/jit_small_requests.xml index c9abec0926b..d8f917fb9af 100644 --- a/tests/performance/jit_small_requests.xml +++ b/tests/performance/jit_small_requests.xml @@ -1,4 +1,4 @@ - + WITH bitXor(number, 0x4CF2D2BAAE6DA887) AS x0, diff --git a/tests/performance/joins_in_memory.xml b/tests/performance/joins_in_memory.xml index bac7679930f..fac6f2659c6 100644 --- a/tests/performance/joins_in_memory.xml +++ b/tests/performance/joins_in_memory.xml @@ -1,4 +1,4 @@ - + CREATE TABLE ints (i64 Int64, i32 Int32, i16 Int16, i8 Int8) ENGINE = Memory INSERT INTO ints SELECT number AS i64, i64 AS i32, i64 AS i16, i64 AS i8 FROM numbers(10000) diff --git a/tests/performance/joins_in_memory_pmj.xml b/tests/performance/joins_in_memory_pmj.xml index 5dd4395513d..87d1c0df14c 100644 --- a/tests/performance/joins_in_memory_pmj.xml +++ b/tests/performance/joins_in_memory_pmj.xml @@ -1,4 +1,4 @@ - + CREATE TABLE ints (i64 Int64, i32 Int32, i16 Int16, i8 Int8) ENGINE = Memory diff --git a/tests/performance/logical_functions_medium.xml b/tests/performance/logical_functions_medium.xml index be474894b54..19572191532 100644 --- a/tests/performance/logical_functions_medium.xml +++ b/tests/performance/logical_functions_medium.xml @@ -1,4 +1,4 @@ - + 1 diff --git a/tests/performance/logical_functions_small.xml b/tests/performance/logical_functions_small.xml index 3d70ef6811d..d5f6a7b99cb 100644 --- a/tests/performance/logical_functions_small.xml +++ b/tests/performance/logical_functions_small.xml @@ -1,4 +1,4 @@ - + 1 diff --git a/tests/performance/math.xml b/tests/performance/math.xml index 006e33548c9..35250351683 100644 --- a/tests/performance/math.xml +++ b/tests/performance/math.xml @@ -1,4 +1,4 @@ - + func_slow diff --git a/tests/performance/optimized_select_final.xml b/tests/performance/optimized_select_final.xml index 2c8254d2b88..d70fccc1330 100644 --- a/tests/performance/optimized_select_final.xml +++ b/tests/performance/optimized_select_final.xml @@ -1,4 +1,4 @@ - + 1 diff --git a/tests/performance/optimized_select_final_one_part.xml b/tests/performance/optimized_select_final_one_part.xml index 92c8eed859a..63541313ac9 100644 --- a/tests/performance/optimized_select_final_one_part.xml +++ b/tests/performance/optimized_select_final_one_part.xml @@ -1,4 +1,4 @@ - + 1 diff --git a/tests/performance/or_null_default.xml b/tests/performance/or_null_default.xml index 6fed0cce4d6..009719f66a5 100644 --- a/tests/performance/or_null_default.xml +++ b/tests/performance/or_null_default.xml @@ -1,4 +1,4 @@ - + SELECT sumOrNull(number) FROM numbers(100000000) SELECT sumOrDefault(toNullable(number)) FROM numbers(100000000) SELECT sumOrNull(number) FROM numbers(10000000) GROUP BY number % 1024 diff --git a/tests/performance/parse_engine_file.xml b/tests/performance/parse_engine_file.xml index 2459ed084cd..d49670b36b5 100644 --- a/tests/performance/parse_engine_file.xml +++ b/tests/performance/parse_engine_file.xml @@ -1,4 +1,4 @@ - + test.hits diff --git a/tests/performance/random_string.xml b/tests/performance/random_string.xml index 1a740ae077a..79f12373f1c 100644 --- a/tests/performance/random_string.xml +++ b/tests/performance/random_string.xml @@ -1,4 +1,4 
@@ - + SELECT count() FROM zeros(100000000) WHERE NOT ignore(randomString(10)) SELECT count() FROM zeros(100000000) WHERE NOT ignore(randomString(100)) SELECT count() FROM zeros(1000000) WHERE NOT ignore(randomString(1000)) diff --git a/tests/performance/sum.xml b/tests/performance/sum.xml index 32c194dab6f..9bee2a580c3 100644 --- a/tests/performance/sum.xml +++ b/tests/performance/sum.xml @@ -1,4 +1,4 @@ - + SELECT sum(number) FROM numbers(100000000) SELECT sum(toUInt32(number)) FROM numbers(100000000) SELECT sum(toUInt16(number)) FROM numbers(100000000) diff --git a/tests/performance/sum_map.xml b/tests/performance/sum_map.xml index bc9f9be2a18..b732c150220 100644 --- a/tests/performance/sum_map.xml +++ b/tests/performance/sum_map.xml @@ -1,4 +1,4 @@ - + 1 diff --git a/tests/performance/url_hits.xml b/tests/performance/url_hits.xml index a699ef6ba97..1813b2a72cb 100644 --- a/tests/performance/url_hits.xml +++ b/tests/performance/url_hits.xml @@ -1,4 +1,4 @@ - + hits_100m_single hits_10m_single From f3ca9db832d7ce6d92c49700fc3a944ed31cd817 Mon Sep 17 00:00:00 2001 From: Alexander Kuzmenkov Date: Fri, 26 Mar 2021 01:00:06 +0300 Subject: [PATCH 124/155] forgot abs() --- tests/performance/collations.xml | 2 +- tests/performance/direct_dictionary.xml | 2 +- tests/performance/float_parsing.xml | 2 +- tests/performance/if_array_string.xml | 2 +- tests/performance/synthetic_hardware_benchmark.xml | 2 +- tests/performance/visit_param_extract_raw.xml | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/performance/collations.xml b/tests/performance/collations.xml index 17b2d36b7e3..52ccede3798 100644 --- a/tests/performance/collations.xml +++ b/tests/performance/collations.xml @@ -1,4 +1,4 @@ - + diff --git a/tests/performance/direct_dictionary.xml b/tests/performance/direct_dictionary.xml index cd9aa73a128..68b52d917dd 100644 --- a/tests/performance/direct_dictionary.xml +++ b/tests/performance/direct_dictionary.xml @@ -1,4 +1,4 @@ - + CREATE TABLE simple_direct_dictionary_test_table ( diff --git a/tests/performance/float_parsing.xml b/tests/performance/float_parsing.xml index 33ab8ba6f10..eb8577bd127 100644 --- a/tests/performance/float_parsing.xml +++ b/tests/performance/float_parsing.xml @@ -1,4 +1,4 @@ - + expr diff --git a/tests/performance/if_array_string.xml b/tests/performance/if_array_string.xml index 5d33bfda51f..773509e1c4b 100644 --- a/tests/performance/if_array_string.xml +++ b/tests/performance/if_array_string.xml @@ -1,4 +1,4 @@ - + SELECT count() FROM zeros(10000000) WHERE NOT ignore(rand() % 2 ? ['Hello', 'World'] : ['a', 'b', 'c']) SELECT count() FROM zeros(10000000) WHERE NOT ignore(rand() % 2 ? materialize(['Hello', 'World']) : ['a', 'b', 'c']) SELECT count() FROM zeros(10000000) WHERE NOT ignore(rand() % 2 ? 
['Hello', 'World'] : materialize(['a', 'b', 'c'])) diff --git a/tests/performance/synthetic_hardware_benchmark.xml b/tests/performance/synthetic_hardware_benchmark.xml index 4b94f73a21d..ffcf30db5cb 100644 --- a/tests/performance/synthetic_hardware_benchmark.xml +++ b/tests/performance/synthetic_hardware_benchmark.xml @@ -1,4 +1,4 @@ - + 30000000000 diff --git a/tests/performance/visit_param_extract_raw.xml b/tests/performance/visit_param_extract_raw.xml index 67faeb1f743..358dcc9cc0e 100644 --- a/tests/performance/visit_param_extract_raw.xml +++ b/tests/performance/visit_param_extract_raw.xml @@ -1,4 +1,4 @@ - + param From ddbd95be2d2ca8acba467c88b73afaba2577b121 Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Fri, 26 Mar 2021 01:55:42 +0300 Subject: [PATCH 125/155] Suggestion from @l1tsolaiki --- docs/en/faq/integration/json-import.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/faq/integration/json-import.md b/docs/en/faq/integration/json-import.md index 7038cc539d2..3fa026c794a 100644 --- a/docs/en/faq/integration/json-import.md +++ b/docs/en/faq/integration/json-import.md @@ -19,7 +19,7 @@ $ echo '{"foo":"bar"}' | curl 'http://localhost:8123/?query=INSERT%20INTO%20test Using [CLI interface](../../interfaces/cli.md): ``` bash -$ echo '{"foo":"bar"}' | clickhouse-client ---query="INSERT INTO test FORMAT JSONEachRow" +$ echo '{"foo":"bar"}' | clickhouse-client --query="INSERT INTO test FORMAT JSONEachRow" ``` Instead of inserting data manually, you might consider to use one of [client libraries](../../interfaces/index.md) instead. From ad66c4a91609b53b46596bd37527fbf1493908c1 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 26 Mar 2021 01:56:24 +0300 Subject: [PATCH 126/155] Fix error --- src/Compression/CompressedReadBuffer.cpp | 2 +- src/Compression/CompressedReadBufferFromFile.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Compression/CompressedReadBuffer.cpp b/src/Compression/CompressedReadBuffer.cpp index 6393723acfd..78241ec1b69 100644 --- a/src/Compression/CompressedReadBuffer.cpp +++ b/src/Compression/CompressedReadBuffer.cpp @@ -61,9 +61,9 @@ size_t CompressedReadBuffer::readBig(char * to, size_t n) memory.resize(size_decompressed + additional_size_at_the_end_of_buffer); working_buffer = Buffer(memory.data(), &memory[size_decompressed]); - pos = working_buffer.begin(); decompress(working_buffer, size_decompressed, size_compressed_without_checksum); + pos = working_buffer.begin(); bytes_read += read(to + bytes_read, n - bytes_read); break; diff --git a/src/Compression/CompressedReadBufferFromFile.cpp b/src/Compression/CompressedReadBufferFromFile.cpp index ea12ec7e8b7..3a75ea14166 100644 --- a/src/Compression/CompressedReadBufferFromFile.cpp +++ b/src/Compression/CompressedReadBufferFromFile.cpp @@ -122,9 +122,9 @@ size_t CompressedReadBufferFromFile::readBig(char * to, size_t n) memory.resize(size_decompressed + additional_size_at_the_end_of_buffer); working_buffer = Buffer(memory.data(), &memory[size_decompressed]); - pos = working_buffer.begin(); decompress(working_buffer, size_decompressed, size_compressed_without_checksum); + pos = working_buffer.begin(); bytes_read += read(to + bytes_read, n - bytes_read); break; From e55f7e63333f5400023210ac194969e17d31d9de Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 26 Mar 2021 02:21:59 +0300 Subject: [PATCH 127/155] Fix error --- src/Compression/CompressedReadBufferBase.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/src/Compression/CompressedReadBufferBase.cpp b/src/Compression/CompressedReadBufferBase.cpp index 65ba9607468..eb4d6ea5986 100644 --- a/src/Compression/CompressedReadBufferBase.cpp +++ b/src/Compression/CompressedReadBufferBase.cpp @@ -237,8 +237,8 @@ void CompressedReadBufferBase::decompress(BufferBase::Buffer & to, size_t size_d to = BufferBase::Buffer(compressed_buffer + header_size, compressed_buffer + size_compressed_without_checksum); } - - codec->decompress(compressed_buffer, size_compressed_without_checksum, to.begin()); + else + codec->decompress(compressed_buffer, size_compressed_without_checksum, to.begin()); } From 3107920f315ee21c3bc5938d8b0521bd78cfcf88 Mon Sep 17 00:00:00 2001 From: Ilya Yatsishin <2159081+qoega@users.noreply.github.com> Date: Fri, 26 Mar 2021 09:43:33 +0300 Subject: [PATCH 128/155] Update requirements.txt --- docs/tools/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/tools/requirements.txt b/docs/tools/requirements.txt index 470bc5e8719..9605525edbf 100644 --- a/docs/tools/requirements.txt +++ b/docs/tools/requirements.txt @@ -26,7 +26,7 @@ numpy==1.19.2 Pygments==2.5.2 pymdown-extensions==8.0 python-slugify==4.0.1 -PyYAML==5.3.1 +PyYAML==5.4.1 repackage==0.7.3 requests==2.24.0 singledispatch==3.4.0.3 From c36f147b1633c7fab5cc23f9124f50ed6592d502 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Fri, 26 Mar 2021 10:05:56 +0300 Subject: [PATCH 129/155] Fix sleep_in_send_tables_status_ms/sleep_in_send_data_ms in integration tests --- tests/integration/test_secure_socket/test.py | 8 ++++---- .../configs/users.xml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/integration/test_secure_socket/test.py b/tests/integration/test_secure_socket/test.py index 337b6b05bd7..c2bad80bca0 100644 --- a/tests/integration/test_secure_socket/test.py +++ b/tests/integration/test_secure_socket/test.py @@ -12,7 +12,7 @@ NODES = {'node' + str(i): None for i in (1, 2)} config = ''' - {sleep_in_send_data} + {sleep_in_send_data_ms} ''' @@ -45,12 +45,12 @@ def started_cluster(): def test(started_cluster): - NODES['node2'].replace_config('/etc/clickhouse-server/users.d/users.xml', config.format(sleep_in_send_data=1000)) + NODES['node2'].replace_config('/etc/clickhouse-server/users.d/users.xml', config.format(sleep_in_send_data_ms=1000000)) attempts = 0 while attempts < 1000: - setting = NODES['node2'].http_query("SELECT value FROM system.settings WHERE name='sleep_in_send_data'") - if int(setting) == 1000: + setting = NODES['node2'].http_query("SELECT value FROM system.settings WHERE name='sleep_in_send_data_ms'") + if int(setting) == 1000000: break time.sleep(0.1) attempts += 1 diff --git a/tests/integration/test_system_clusters_actual_information/configs/users.xml b/tests/integration/test_system_clusters_actual_information/configs/users.xml index 156cd3a6b59..3dd68165fac 100644 --- a/tests/integration/test_system_clusters_actual_information/configs/users.xml +++ b/tests/integration/test_system_clusters_actual_information/configs/users.xml @@ -2,7 +2,7 @@ - 5 + 5000 From fa930d49c4c7f238ef0b7bd2d228654825d0fa9b Mon Sep 17 00:00:00 2001 From: tavplubix Date: Fri, 26 Mar 2021 10:29:58 +0300 Subject: [PATCH 130/155] Update gtest_peekable_read_buffer.cpp --- src/IO/tests/gtest_peekable_read_buffer.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/IO/tests/gtest_peekable_read_buffer.cpp b/src/IO/tests/gtest_peekable_read_buffer.cpp index ddb947d8b2f..2e5ca47c0aa 100644 --- 
a/src/IO/tests/gtest_peekable_read_buffer.cpp +++ b/src/IO/tests/gtest_peekable_read_buffer.cpp @@ -6,11 +6,6 @@ #include #include -namespace DB::ErrorCodes -{ - extern const int LOGICAL_ERROR; -} - static void readAndAssert(DB::ReadBuffer & buf, const char * str) { size_t n = strlen(str); From ba5c15103720b12ab4adeffb55035ee7c438e3e0 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 26 Mar 2021 13:20:07 +0300 Subject: [PATCH 131/155] Fix race condition on snapshots --- src/Coordination/NuKeeperStateMachine.cpp | 25 +++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/src/Coordination/NuKeeperStateMachine.cpp b/src/Coordination/NuKeeperStateMachine.cpp index 32bb4269f20..23485cb8b5b 100644 --- a/src/Coordination/NuKeeperStateMachine.cpp +++ b/src/Coordination/NuKeeperStateMachine.cpp @@ -4,6 +4,7 @@ #include #include #include +#include <future> namespace DB { @@ -227,7 +228,28 @@ void NuKeeperStateMachine::save_logical_snp_obj( nuraft::ptr<nuraft::buffer> snp_buf = s.serialize(); cloned_meta = nuraft::snapshot::deserialize(*snp_buf); - auto result_path = snapshot_manager.serializeSnapshotBufferToDisk(*cloned_buffer, s.get_last_log_idx()); + /// Sometimes NuRaft can call save and create snapshots from different threads + /// at onces. To avoid race conditions we serialize snapshots through snapshots_queue + /// TODO: make something better + CreateSnapshotTask snapshot_task; + std::shared_ptr<std::promise<void>> waiter = std::make_shared<std::promise<void>>(); + auto future = waiter->get_future(); + snapshot_task.snapshot = nullptr; + snapshot_task.create_snapshot = [this, waiter, cloned_buffer, log_idx = s.get_last_log_idx()] (NuKeeperStorageSnapshotPtr &&) + { + try + { + auto result_path = snapshot_manager.serializeSnapshotBufferToDisk(*cloned_buffer, log_idx); + LOG_DEBUG(log, "Saved snapshot {} to path {}", log_idx, result_path); + } + catch (...) + { + tryLogCurrentException(log); + } + waiter->set_value(); + }; + snapshots_queue.push(std::move(snapshot_task)); + future.wait(); { std::lock_guard lock(snapshots_lock); @@ -235,7 +257,6 @@ void NuKeeperStateMachine::save_logical_snp_obj( latest_snapshot_meta = cloned_meta; } - LOG_DEBUG(log, "Created snapshot {} with path {}", s.get_last_log_idx(), result_path); obj_id++; } From 331c5b66365d96541ea1f5a913b0b4beae747416 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 26 Mar 2021 13:55:39 +0300 Subject: [PATCH 132/155] Fix startup one more time --- src/Coordination/NuKeeperServer.cpp | 25 ++++++++++++++++--------- src/Coordination/NuKeeperServer.h | 2 +- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/src/Coordination/NuKeeperServer.cpp b/src/Coordination/NuKeeperServer.cpp index 62af9656fb9..7e6c10ca125 100644 --- a/src/Coordination/NuKeeperServer.cpp +++ b/src/Coordination/NuKeeperServer.cpp @@ -188,6 +188,9 @@ nuraft::cb_func::ReturnCode NuKeeperServer::callbackFunc(nuraft::cb_func::Type t if (next_index < last_commited || next_index - last_commited <= 1) commited_store = true; + if (initialized_flag) + return nuraft::cb_func::ReturnCode::Ok; + auto set_initialized = [this] () { std::unique_lock lock(initialized_mutex); @@ -205,15 +208,19 @@ nuraft::cb_func::ReturnCode NuKeeperServer::callbackFunc(nuraft::cb_func::Type t return nuraft::cb_func::ReturnCode::Ok; } case nuraft::cb_func::BecomeFollower: + case nuraft::cb_func::GotAppendEntryReqFromLeader: { - auto leader_index = raft_instance->get_leader_committed_log_idx(); - auto our_index = raft_instance->get_committed_log_idx(); - /// This may happen when we start RAFT claster from scratch.
- /// Node first became leader, and after that some other node became leader. - /// BecameFresh for this node will not be called because it was already fresh - /// when it was leader. - if (isLeaderAlive() && leader_index < our_index + coordination_settings->fresh_log_gap) - set_initialized(); + if (isLeaderAlive()) + { + auto leader_index = raft_instance->get_leader_committed_log_idx(); + auto our_index = raft_instance->get_committed_log_idx(); + /// This may happen when we start RAFT cluster from scratch. + /// Node first became leader, and after that some other node became leader. + /// BecameFresh for this node will not be called because it was already fresh + /// when it was leader. + if (leader_index < our_index + coordination_settings->fresh_log_gap) + set_initialized(); + } return nuraft::cb_func::ReturnCode::Ok; } case nuraft::cb_func::BecomeFresh: @@ -237,7 +244,7 @@ void NuKeeperServer::waitInit() { std::unique_lock lock(initialized_mutex); int64_t timeout = coordination_settings->startup_timeout.totalMilliseconds(); - if (!initialized_cv.wait_for(lock, std::chrono::milliseconds(timeout), [&] { return initialized_flag; })) + if (!initialized_cv.wait_for(lock, std::chrono::milliseconds(timeout), [&] { return initialized_flag.load(); })) throw Exception(ErrorCodes::RAFT_ERROR, "Failed to wait RAFT initialization"); } diff --git a/src/Coordination/NuKeeperServer.h b/src/Coordination/NuKeeperServer.h index ba25d5c181b..b5c13e62212 100644 --- a/src/Coordination/NuKeeperServer.h +++ b/src/Coordination/NuKeeperServer.h @@ -31,7 +31,7 @@ private: ResponsesQueue & responses_queue; std::mutex initialized_mutex; - bool initialized_flag = false; + std::atomic<bool> initialized_flag = false; std::condition_variable initialized_cv; std::atomic<bool> initial_batch_committed = false; From 2db57f0f1669ded0768a00becf7747249f99d930 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 26 Mar 2021 14:18:31 +0300 Subject: [PATCH 133/155] Followup fix --- src/Coordination/NuKeeperStorageDispatcher.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/Coordination/NuKeeperStorageDispatcher.cpp b/src/Coordination/NuKeeperStorageDispatcher.cpp index 3aed0d99568..5b35b9c4829 100644 --- a/src/Coordination/NuKeeperStorageDispatcher.cpp +++ b/src/Coordination/NuKeeperStorageDispatcher.cpp @@ -132,6 +132,10 @@ void NuKeeperStorageDispatcher::initialize(const Poco::Util::AbstractConfigurati coordination_settings->loadFromConfig("test_keeper_server.coordination_settings", config); + request_thread = ThreadFromGlobalPool([this] { requestThread(); }); + responses_thread = ThreadFromGlobalPool([this] { responseThread(); }); + snapshot_thread = ThreadFromGlobalPool([this] { snapshotThread(); }); + server = std::make_unique<NuKeeperServer>(myid, coordination_settings, config, responses_queue, snapshots_queue); try { @@ -148,10 +152,8 @@ void NuKeeperStorageDispatcher::initialize(const Poco::Util::AbstractConfigurati throw; } - request_thread = ThreadFromGlobalPool([this] { requestThread(); }); - responses_thread = ThreadFromGlobalPool([this] { responseThread(); }); + session_cleaner_thread = ThreadFromGlobalPool([this] { sessionCleanerTask(); }); - snapshot_thread = ThreadFromGlobalPool([this] { snapshotThread(); }); LOG_DEBUG(log, "Dispatcher initialized"); } From 9bdeb436c2671d92d3462374777bdf88b9e06d12 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 26 Mar 2021 15:06:36 +0300 Subject: [PATCH 134/155] Fix typo --- src/Coordination/NuKeeperStateMachine.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff
--git a/src/Coordination/NuKeeperStateMachine.cpp b/src/Coordination/NuKeeperStateMachine.cpp index 23485cb8b5b..a7037b8d644 100644 --- a/src/Coordination/NuKeeperStateMachine.cpp +++ b/src/Coordination/NuKeeperStateMachine.cpp @@ -229,7 +229,7 @@ void NuKeeperStateMachine::save_logical_snp_obj( cloned_meta = nuraft::snapshot::deserialize(*snp_buf); /// Sometimes NuRaft can call save and create snapshots from different threads - /// at onces. To avoid race conditions we serialize snapshots through snapshots_queue + /// at once. To avoid race conditions we serialize snapshots through snapshots_queue /// TODO: make something better CreateSnapshotTask snapshot_task; std::shared_ptr> waiter = std::make_shared>(); From b0e401088ada141b6d206a4f0b279c87cacbccee Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 26 Mar 2021 15:20:49 +0300 Subject: [PATCH 135/155] Make this test sequential --- tests/queries/skip_list.json | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/queries/skip_list.json b/tests/queries/skip_list.json index 981cf69d676..9c061da7428 100644 --- a/tests/queries/skip_list.json +++ b/tests/queries/skip_list.json @@ -582,6 +582,7 @@ "00980_zookeeper_merge_tree_alter_settings", "00988_constraints_replication_zookeeper", "00989_parallel_parts_loading", + "00992_system_parts_race_condition_zookeeper_long", "00993_system_parts_race_condition_drop_zookeeper", "01012_show_tables_limit", "01013_sync_replica_timeout_zookeeper", From 35d1443a9c075756b6fc2fbe624c97e59b2fe49b Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 26 Mar 2021 15:54:08 +0300 Subject: [PATCH 136/155] Don't wait when table shutdown called --- src/Storages/StorageReplicatedMergeTree.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 9b93d7183fd..0849f65477d 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -4932,7 +4932,7 @@ bool StorageReplicatedMergeTree::waitForTableReplicaToProcessLogEntry( const auto & stop_waiting = [&]() { - bool stop_waiting_itself = waiting_itself && is_dropped; + bool stop_waiting_itself = waiting_itself && (partial_shutdown_called || is_dropped); bool stop_waiting_non_active = !wait_for_non_active && !getZooKeeper()->exists(table_zookeeper_path + "/replicas/" + replica + "/is_active"); return stop_waiting_itself || stop_waiting_non_active; }; From 2f07056ef6ff60444ab333d3357431f88fa4f0d5 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 26 Mar 2021 16:39:09 +0300 Subject: [PATCH 137/155] More stable last get --- tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj index 7380a9d9cbb..f0b4998dad0 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -103,7 +103,7 @@ (gen/nemesis (gen/once {:type :info, :f :stop})) (gen/log "Waiting for recovery") (gen/sleep 10) - (gen/clients (:final-generator workload)))}))) + (gen/clients (gen/until-ok (:final-generator workload))))}))) (def all-nemesises (keys custom-nemesis/custom-nemesises)) From 8b08c0c3a667316926c2380612d7917cf78e370a Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 26 Mar 2021 17:45:52 +0300 Subject: [PATCH 138/155] Fix test_odbc_interaction --- tests/integration/test_odbc_interaction/test.py | 1 + 1 file changed, 1 
insertion(+) diff --git a/tests/integration/test_odbc_interaction/test.py b/tests/integration/test_odbc_interaction/test.py index 6232168f2e6..2ef71927bdf 100644 --- a/tests/integration/test_odbc_interaction/test.py +++ b/tests/integration/test_odbc_interaction/test.py @@ -360,6 +360,7 @@ def test_bridge_dies_with_parent(started_cluster): assert clickhouse_pid is None assert bridge_pid is None + node1.start_clickhouse(20) def test_odbc_postgres_date_data_type(started_cluster): From 0fae73071c5a7fd6deb8068363050b7fa5e89124 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 26 Mar 2021 18:14:42 +0300 Subject: [PATCH 139/155] Fix flaky test --- tests/integration/test_dictionaries_update_and_reload/test.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_dictionaries_update_and_reload/test.py b/tests/integration/test_dictionaries_update_and_reload/test.py index 5c8abcda38e..533a29dc245 100644 --- a/tests/integration/test_dictionaries_update_and_reload/test.py +++ b/tests/integration/test_dictionaries_update_and_reload/test.py @@ -141,7 +141,8 @@ def test_reload_after_loading(started_cluster): time.sleep(1) # see the comment above replace_in_file_in_container('/etc/clickhouse-server/config.d/executable.xml', '81', '82') replace_in_file_in_container('/etc/clickhouse-server/config.d/file.txt', '101', '102') - query("SYSTEM RELOAD DICTIONARIES") + query("SYSTEM RELOAD DICTIONARY 'file'") + query("SYSTEM RELOAD DICTIONARY 'executable'") assert query("SELECT dictGetInt32('executable', 'a', toUInt64(7))") == "82\n" assert query("SELECT dictGetInt32('file', 'a', toUInt64(9))") == "102\n" From 5c0c6a9aecc7f05b6f0d73f6a84e937feaf42021 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Fri, 26 Mar 2021 18:16:15 +0300 Subject: [PATCH 140/155] Delete unused users.xml --- .../configs/users.xml | 8 -------- 1 file changed, 8 deletions(-) delete mode 100644 tests/integration/test_system_clusters_actual_information/configs/users.xml diff --git a/tests/integration/test_system_clusters_actual_information/configs/users.xml b/tests/integration/test_system_clusters_actual_information/configs/users.xml deleted file mode 100644 index 3dd68165fac..00000000000 --- a/tests/integration/test_system_clusters_actual_information/configs/users.xml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - 5000 - - - From 482704c4343389fc2e07d7361a2fba22cfbbf1cd Mon Sep 17 00:00:00 2001 From: Sergey Demurin Date: Fri, 26 Mar 2021 18:29:07 +0300 Subject: [PATCH 141/155] Update other-functions.md fix typo --- docs/ru/sql-reference/functions/other-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/functions/other-functions.md b/docs/ru/sql-reference/functions/other-functions.md index 54537b7735d..f9b3e5c3e68 100644 --- a/docs/ru/sql-reference/functions/other-functions.md +++ b/docs/ru/sql-reference/functions/other-functions.md @@ -672,7 +672,7 @@ neighbor(column, offset[, default_value]) Функция может получить доступ к значению в столбце соседней строки только внутри обрабатываемого в данный момент блока данных. Порядок строк, используемый при вычислении функции `neighbor`, может отличаться от порядка строк, возвращаемых пользователю. -Чтобы этого не случилось, вы можете сделать подзапрос с [ORDER BY](../../sql-reference/statements/select/order-by.md) и вызвать функцию изне подзапроса. 
+Чтобы этого не случилось, вы можете сделать подзапрос с [ORDER BY](../../sql-reference/statements/select/order-by.md) и вызвать функцию извне подзапроса. **Аргументы** From 1b0a9461f0e6500d98be27c36c082ba4fe471d66 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 26 Mar 2021 18:30:35 +0300 Subject: [PATCH 142/155] Fix more tests --- tests/integration/test_dictionaries_postgresql/test.py | 2 +- .../test_distributed_inter_server_secret/test.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_dictionaries_postgresql/test.py b/tests/integration/test_dictionaries_postgresql/test.py index 5ceb6496b90..10d9f4213e1 100644 --- a/tests/integration/test_dictionaries_postgresql/test.py +++ b/tests/integration/test_dictionaries_postgresql/test.py @@ -80,7 +80,7 @@ def test_load_dictionaries(started_cluster): create_dict(table_name) dict_name = 'dict0' - node1.query("SYSTEM RELOAD DICTIONARIES") + node1.query("SYSTEM RELOAD DICTIONARY {}".format(dict_name)) assert node1.query("SELECT count() FROM `test`.`dict_table_{}`".format(table_name)).rstrip() == '10000' assert node1.query("SELECT dictGetUInt32('{}', 'id', toUInt64(0))".format(dict_name)) == '0\n' assert node1.query("SELECT dictGetUInt32('{}', 'value', toUInt64(9999))".format(dict_name)) == '9999\n' diff --git a/tests/integration/test_distributed_inter_server_secret/test.py b/tests/integration/test_distributed_inter_server_secret/test.py index b1daf2271d0..1a0e5a3dd91 100644 --- a/tests/integration/test_distributed_inter_server_secret/test.py +++ b/tests/integration/test_distributed_inter_server_secret/test.py @@ -97,12 +97,14 @@ def test_insecure(): n1.query('SELECT * FROM dist_insecure') def test_insecure_insert_async(): + n1.query("TRUNCATE TABLE data") n1.query('INSERT INTO dist_insecure SELECT * FROM numbers(2)') n1.query('SYSTEM FLUSH DISTRIBUTED ON CLUSTER insecure dist_insecure') assert int(n1.query('SELECT count() FROM dist_insecure')) == 2 n1.query('TRUNCATE TABLE data ON CLUSTER insecure') def test_insecure_insert_sync(): + n1.query("TRUNCATE TABLE data") n1.query('INSERT INTO dist_insecure SELECT * FROM numbers(2)', settings={'insert_distributed_sync': 1}) assert int(n1.query('SELECT count() FROM dist_insecure')) == 2 n1.query('TRUNCATE TABLE data ON CLUSTER secure') @@ -111,12 +113,14 @@ def test_secure(): n1.query('SELECT * FROM dist_secure') def test_secure_insert_async(): + n1.query("TRUNCATE TABLE data") n1.query('INSERT INTO dist_secure SELECT * FROM numbers(2)') n1.query('SYSTEM FLUSH DISTRIBUTED ON CLUSTER secure dist_secure') assert int(n1.query('SELECT count() FROM dist_secure')) == 2 n1.query('TRUNCATE TABLE data ON CLUSTER secure') def test_secure_insert_sync(): + n1.query("TRUNCATE TABLE data") n1.query('INSERT INTO dist_secure SELECT * FROM numbers(2)', settings={'insert_distributed_sync': 1}) assert int(n1.query('SELECT count() FROM dist_secure')) == 2 n1.query('TRUNCATE TABLE data ON CLUSTER secure') @@ -126,6 +130,7 @@ def test_secure_insert_sync(): # Buffer() flush happens with global context, that does not have user # And so Context::user/ClientInfo::current_user/ClientInfo::initial_user will be empty def test_secure_insert_buffer_async(): + n1.query("TRUNCATE TABLE data") n1.query('INSERT INTO dist_secure_buffer SELECT * FROM numbers(2)') n1.query('SYSTEM FLUSH DISTRIBUTED ON CLUSTER secure dist_secure') # no Buffer flush happened @@ -141,6 +146,7 @@ def test_secure_disagree(): n1.query('SELECT * FROM dist_secure_disagree') def test_secure_disagree_insert(): + 
n1.query("TRUNCATE TABLE data") n1.query('INSERT INTO dist_secure_disagree SELECT * FROM numbers(2)') with pytest.raises(QueryRuntimeException, match='.*Hash mismatch.*'): n1.query('SYSTEM FLUSH DISTRIBUTED ON CLUSTER secure_disagree dist_secure_disagree') From 9bb0dc48b28bcb2fd2c12eafbad931331d599103 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 26 Mar 2021 18:37:26 +0300 Subject: [PATCH 143/155] Fix one more test --- .../test_max_http_connections_for_replication/test.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/integration/test_max_http_connections_for_replication/test.py b/tests/integration/test_max_http_connections_for_replication/test.py index 2dc4e2a8810..634697c8668 100644 --- a/tests/integration/test_max_http_connections_for_replication/test.py +++ b/tests/integration/test_max_http_connections_for_replication/test.py @@ -43,6 +43,8 @@ def start_small_cluster(): def test_single_endpoint_connections_count(start_small_cluster): + node1.query("TRUNCATE TABLE test_table") + node2.query("SYSTEM SYNC REPLICA test_table") def task(count): print(("Inserting ten times from {}".format(count))) for i in range(count, count + 10): @@ -58,9 +60,11 @@ def test_single_endpoint_connections_count(start_small_cluster): def test_keepalive_timeout(start_small_cluster): - current_count = int(node1.query("select count() from test_table").strip()) + node1.query("TRUNCATE TABLE test_table") + node2.query("SYSTEM SYNC REPLICA test_table") + node1.query("insert into test_table values ('2017-06-16', 777, 0)") - assert_eq_with_retry(node2, "select count() from test_table", str(current_count + 1)) + assert_eq_with_retry(node2, "select count() from test_table", str(1)) # Server keepAliveTimeout is 3 seconds, default client session timeout is 8 # lets sleep in that interval time.sleep(4) @@ -69,7 +73,7 @@ def test_keepalive_timeout(start_small_cluster): time.sleep(3) - assert_eq_with_retry(node2, "select count() from test_table", str(current_count + 2)) + assert_eq_with_retry(node2, "select count() from test_table", str(2)) assert not node2.contains_in_log("No message received"), "Found 'No message received' in clickhouse-server.log" From aa2244bad5b66271dd766c41698189c9504b4d47 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 26 Mar 2021 18:59:14 +0300 Subject: [PATCH 144/155] Fix more --- tests/integration/test_ttl_replicated/test.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/integration/test_ttl_replicated/test.py b/tests/integration/test_ttl_replicated/test.py index 389e249790f..67614b88029 100644 --- a/tests/integration/test_ttl_replicated/test.py +++ b/tests/integration/test_ttl_replicated/test.py @@ -396,6 +396,10 @@ def test_ttl_compatibility(started_cluster, node_left, node_right, num_run): node_right.query("OPTIMIZE TABLE test_ttl_group_by FINAL") node_right.query("OPTIMIZE TABLE test_ttl_where FINAL") + node_left.query("SYSTEM SYNC REPLICA test_ttl_delete", timeout=20) + node_left.query("SYSTEM SYNC REPLICA test_ttl_group_by", timeout=20) + node_left.query("SYSTEM SYNC REPLICA test_ttl_where", timeout=20) + assert node_left.query("SELECT id FROM test_ttl_delete ORDER BY id") == "2\n4\n" assert node_right.query("SELECT id FROM test_ttl_delete ORDER BY id") == "2\n4\n" From f32704101b5e415b341e28e653ab7239e7c440b1 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 26 Mar 2021 19:56:08 +0300 Subject: [PATCH 145/155] Add retries to final operations --- tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj | 5 ++--- 
tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj | 2 +- tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj | 8 +++----- tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj | 10 +++++----- tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj | 11 +++++++++++ 5 files changed, 22 insertions(+), 14 deletions(-) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj index 48b270517a4..7e2cd00736f 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj @@ -24,11 +24,10 @@ (invoke! [this test op] (case (:f op) - :read (try + :read (exec-with-retries 30 (fn [] (assoc op :type :ok - :value (count (zk-list conn "/"))) - (catch Exception _ (assoc op :type :fail, :error :connect-error))) + :value (count (zk-list conn "/"))))) :add (try (do (zk-multi-create-many-seq-nodes conn "/seq-" (:value op)) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj index f0b4998dad0..7380a9d9cbb 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -103,7 +103,7 @@ (gen/nemesis (gen/once {:type :info, :f :stop})) (gen/log "Waiting for recovery") (gen/sleep 10) - (gen/clients (gen/until-ok (:final-generator workload))))}))) + (gen/clients (:final-generator workload)))}))) (def all-nemesises (keys custom-nemesis/custom-nemesises)) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj index 951c0822ad2..494e0357bc1 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj @@ -40,11 +40,9 @@ (catch Exception _ (assoc op :type :info, :error :connect-error))) :drain ; drain via delete is to long, just list all nodes - (try - (do - (zk-sync conn) - (assoc op :type :ok :value (into #{} (map #(str %1) (zk-list conn "/"))))) - (catch Exception _ (assoc op :type :info, :error :connect-error))))) + (exec-with-retries 30 (fn [] + (zk-sync conn) + (assoc op :type :ok :value (into #{} (map #(str %1) (zk-list conn "/")))))))) (teardown! [_ test]) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj index 23461591eaf..01cc10e9a0f 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj @@ -22,11 +22,11 @@ (invoke! [this test op] (case (:f op) - :read (do - (zk-sync conn) - (assoc op - :type :ok - :value (read-string (:data (zk-get-str conn k))))) + :read (exec-with-retries 30 (fn [] + (zk-sync conn) + (assoc op + :type :ok + :value (read-string (:data (zk-get-str conn k)))))) :add (try (do (zk-add-to-set conn k (:value op)) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj index 0e0db2d3a6d..032a8829514 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj @@ -167,3 +167,14 @@ :--test_keeper_server.snapshot_storage_path coordination-snapshots-dir :--test_keeper_server.logs_storage_path coordination-logs-dir) (wait-clickhouse-alive! node test))) + +(defn exec-with-retries + [retries f & args] + (let [res (try {:value (apply f args)} + (catch Exception e + (if (zero? 
retries) + (throw e) + {:exception e})))] + (if (:exception res) + (do (Thread/sleep 1000) (recur (dec retries) f args)) + (:value res)))) From e101fbab53ec7c984bd0e6cc8ec5e6d9fd1a897a Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 26 Mar 2021 19:57:23 +0300 Subject: [PATCH 146/155] Fix style --- .../src/jepsen/nukeeper/counter.clj | 6 +-- .../src/jepsen/nukeeper/db.clj | 51 +++++++++---------- .../src/jepsen/nukeeper/main.clj | 8 +-- .../src/jepsen/nukeeper/nemesis.clj | 8 +-- .../src/jepsen/nukeeper/queue.clj | 4 +- .../src/jepsen/nukeeper/set.clj | 8 +-- .../src/jepsen/nukeeper/utils.clj | 12 ++--- 7 files changed, 48 insertions(+), 49 deletions(-) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj index 7e2cd00736f..b426a8ea90d 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/counter.clj @@ -25,9 +25,9 @@ (invoke! [this test op] (case (:f op) :read (exec-with-retries 30 (fn [] - (assoc op - :type :ok - :value (count (zk-list conn "/"))))) + (assoc op + :type :ok + :value (count (zk-list conn "/"))))) :add (try (do (zk-multi-create-many-seq-nodes conn "/seq-" (:value op)) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/db.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/db.clj index 7bc2b9c6cea..d82d628cc95 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/db.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/db.clj @@ -32,20 +32,20 @@ (defn unpack-deb [path] (do - (c/exec :dpkg :-x path common-prefix) - (c/exec :rm :-f path) - (c/exec :mv (str common-prefix "/usr/bin/clickhouse") common-prefix) - (c/exec :rm :-rf (str common-prefix "/usr") (str common-prefix "/etc")))) + (c/exec :dpkg :-x path common-prefix) + (c/exec :rm :-f path) + (c/exec :mv (str common-prefix "/usr/bin/clickhouse") common-prefix) + (c/exec :rm :-rf (str common-prefix "/usr") (str common-prefix "/etc")))) (defn unpack-tgz [path] (do - (c/exec :mkdir :-p (str common-prefix "/unpacked")) - (c/exec :tar :-zxvf path :-C (str common-prefix "/unpacked")) - (c/exec :rm :-f path) - (let [subdir (c/exec :ls (str common-prefix "/unpacked"))] - (c/exec :mv (str common-prefix "/unpacked/" subdir "/usr/bin/clickhouse") common-prefix) - (c/exec :rm :-fr (str common-prefix "/unpacked"))))) + (c/exec :mkdir :-p (str common-prefix "/unpacked")) + (c/exec :tar :-zxvf path :-C (str common-prefix "/unpacked")) + (c/exec :rm :-f path) + (let [subdir (c/exec :ls (str common-prefix "/unpacked"))] + (c/exec :mv (str common-prefix "/unpacked/" subdir "/usr/bin/clickhouse") common-prefix) + (c/exec :rm :-fr (str common-prefix "/unpacked"))))) (defn chmod-binary [path] @@ -85,10 +85,10 @@ (defn install-configs [test node] - (c/exec :echo (slurp (io/resource "config.xml")) :> (str configs-dir "/config.xml")) - (c/exec :echo (slurp (io/resource "users.xml")) :> (str configs-dir "/users.xml")) - (c/exec :echo (slurp (io/resource "listen.xml")) :> (str sub-configs-dir "/listen.xml")) - (c/exec :echo (cluster-config test node (slurp (io/resource "test_keeper_config.xml"))) :> (str sub-configs-dir "/test_keeper_config.xml"))) + (c/exec :echo (slurp (io/resource "config.xml")) :> (str configs-dir "/config.xml")) + (c/exec :echo (slurp (io/resource "users.xml")) :> (str configs-dir "/users.xml")) + (c/exec :echo (slurp (io/resource "listen.xml")) :> (str sub-configs-dir "/listen.xml")) + (c/exec :echo (cluster-config test node (slurp (io/resource "test_keeper_config.xml"))) :> (str sub-configs-dir 
"/test_keeper_config.xml"))) (defn db [version reuse-binary] @@ -96,25 +96,24 @@ (setup! [_ test node] (c/su (do - (info "Preparing directories") - (prepare-dirs) - (if (or (not (cu/exists? binary-path)) (not reuse-binary)) + (info "Preparing directories") + (prepare-dirs) + (if (or (not (cu/exists? binary-path)) (not reuse-binary)) (do (info "Downloading clickhouse") - (install-downloaded-clickhouse (download-clickhouse version))) + (install-downloaded-clickhouse (download-clickhouse version))) (info "Binary already exsist on path" binary-path "skipping download")) - (info "Installing configs") - (install-configs test node) - (info "Starting server") - (start-clickhouse! node test) - (info "ClickHouse started")))) - + (info "Installing configs") + (install-configs test node) + (info "Starting server") + (start-clickhouse! node test) + (info "ClickHouse started")))) (teardown! [_ test node] (info node "Tearing down clickhouse") (kill-clickhouse! node test) (c/su (if (not reuse-binary) - (c/exec :rm :-rf binary-path)) + (c/exec :rm :-rf binary-path)) (c/exec :rm :-rf pid-file-path) (c/exec :rm :-rf data-dir) (c/exec :rm :-rf logs-dir) @@ -125,5 +124,5 @@ (c/su (kill-clickhouse! node test) (c/cd data-dir - (c/exec :tar :czf "coordination.tar.gz" "coordination"))) + (c/exec :tar :czf "coordination.tar.gz" "coordination"))) [stderr-file (str logs-dir "/clickhouse-server.log") (str data-dir "/coordination.tar.gz")]))) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj index 7380a9d9cbb..b9439097e85 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/main.clj @@ -134,10 +134,10 @@ [cli worload-nemeseis-collection] (take (:test-count cli) (shuffle (for [[workload nemesis] worload-nemeseis-collection] - (assoc cli - :nemesis nemesis - :workload workload - :test-count 1))))) + (assoc cli + :nemesis nemesis + :workload workload + :test-count 1))))) (defn all-tests "Turns CLI options into a sequence of tests." [test-fn cli] diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj index 8314d29f575..7d4941cdc8e 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/nemesis.clj @@ -85,13 +85,13 @@ (defn logs-and-snapshots-corruption-nemesis [] (corruptor-nemesis coordination-data-dir (fn [path] - (do - (corrupt-file (select-last-file (str path "/snapshots"))) - (corrupt-file (select-last-file (str path "/logs"))))))) + (do + (corrupt-file (select-last-file (str path "/snapshots"))) + (corrupt-file (select-last-file (str path "/logs"))))))) (defn drop-all-corruption-nemesis [] (corruptor-nemesis coordination-data-dir (fn [path] - (c/exec :rm :-fr path)))) + (c/exec :rm :-fr path)))) (defn partition-bridge-nemesis [] diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj index 494e0357bc1..308778983aa 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/queue.clj @@ -41,8 +41,8 @@ :drain ; drain via delete is to long, just list all nodes (exec-with-retries 30 (fn [] - (zk-sync conn) - (assoc op :type :ok :value (into #{} (map #(str %1) (zk-list conn "/")))))))) + (zk-sync conn) + (assoc op :type :ok :value (into #{} (map #(str %1) (zk-list conn "/")))))))) (teardown! 
[_ test]) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj index 01cc10e9a0f..f9d21a8dc62 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/set.clj @@ -23,10 +23,10 @@ (invoke! [this test op] (case (:f op) :read (exec-with-retries 30 (fn [] - (zk-sync conn) - (assoc op - :type :ok - :value (read-string (:data (zk-get-str conn k)))))) + (zk-sync conn) + (assoc op + :type :ok + :value (read-string (:data (zk-get-str conn k)))))) :add (try (do (zk-add-to-set conn k (:value op)) diff --git a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj index 032a8829514..cfe9add238b 100644 --- a/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj +++ b/tests/jepsen.nukeeper/src/jepsen/nukeeper/utils.clj @@ -113,11 +113,11 @@ first-child (first (sort children))] (if (not (nil? first-child)) (try - (do (.check txn path (:version stat)) - (.setData txn path (data/to-bytes "") -1) ; I'm just checking multitransactions - (.delete txn (str path first-child) -1) - (.commit txn) - first-child) + (do (.check txn path (:version stat)) + (.setData txn path (data/to-bytes "") -1) ; I'm just checking multitransactions + (.delete txn (str path first-child) -1) + (.commit txn) + first-child) (catch KeeperException$BadVersionException _ nil) ; Even if we got connection loss, delete may actually be executed. ; This function is used for queue model, which strictly require @@ -166,7 +166,7 @@ :--logger.errorlog (str logs-dir "/clickhouse-server.err.log") :--test_keeper_server.snapshot_storage_path coordination-snapshots-dir :--test_keeper_server.logs_storage_path coordination-logs-dir) - (wait-clickhouse-alive! node test))) + (wait-clickhouse-alive! node test))) (defn exec-with-retries [retries f & args] From 48ba36b682b26b6bd5524df6be19c1938478179d Mon Sep 17 00:00:00 2001 From: Robert Hodges Date: Fri, 26 Mar 2021 10:34:48 -0700 Subject: [PATCH 147/155] Update postgresql.md Corrected typo in PostgreSQL Table Engine page title. --- docs/en/engines/table-engines/integrations/postgresql.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/engines/table-engines/integrations/postgresql.md b/docs/en/engines/table-engines/integrations/postgresql.md index 1a2ccf3e0dc..8326038407f 100644 --- a/docs/en/engines/table-engines/integrations/postgresql.md +++ b/docs/en/engines/table-engines/integrations/postgresql.md @@ -3,7 +3,7 @@ toc_priority: 8 toc_title: PostgreSQL --- -# PosgtreSQL {#postgresql} +# PostgreSQL {#postgresql} The PostgreSQL engine allows you to perform `SELECT` queries on data that is stored on a remote PostgreSQL server. 
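
A note on the snapshot fix in patch 131 above: when a third-party library (here NuRaft) may invoke its save and create-snapshot callbacks from several threads at once, the fix funnels all on-disk serialization through a single consumer queue and blocks each caller on a std::promise/std::future pair until its queued task has run. The following sketch is a minimal, self-contained illustration of that pattern, not the actual ClickHouse code: SerializedExecutor and submitAndWait are invented names, and the real implementation uses ThreadFromGlobalPool and the dedicated snapshots_queue rather than a bare std::thread and std::queue.

```cpp
// Minimal sketch of the "serialize conflicting work through one queue" pattern
// from patch 131. SerializedExecutor and submitAndWait are invented names.
#include <condition_variable>
#include <functional>
#include <future>
#include <iostream>
#include <mutex>
#include <queue>
#include <thread>
#include <utility>

class SerializedExecutor
{
public:
    SerializedExecutor() : worker([this] { run(); }) {}

    ~SerializedExecutor()
    {
        {
            std::lock_guard<std::mutex> lock(mutex);
            stopped = true;
        }
        cv.notify_one();
        worker.join();
    }

    /// Enqueue a task and block until the single worker thread has executed it,
    /// mirroring how save_logical_snp_obj pushes into snapshots_queue and waits.
    void submitAndWait(std::function<void()> task)
    {
        auto waiter = std::make_shared<std::promise<void>>();
        auto future = waiter->get_future();
        {
            std::lock_guard<std::mutex> lock(mutex);
            tasks.push([task = std::move(task), waiter]
            {
                task();
                waiter->set_value();  /// Wake up the submitter.
            });
        }
        cv.notify_one();
        future.wait();
    }

private:
    void run()
    {
        while (true)
        {
            std::function<void()> task;
            {
                std::unique_lock<std::mutex> lock(mutex);
                cv.wait(lock, [this] { return stopped || !tasks.empty(); });
                if (stopped && tasks.empty())
                    return;
                task = std::move(tasks.front());
                tasks.pop();
            }
            /// Only this thread ever runs tasks, so snapshot writes are serialized.
            task();
        }
    }

    std::mutex mutex;
    std::condition_variable cv;
    std::queue<std::function<void()>> tasks;
    bool stopped = false;
    std::thread worker;  /// Declared last so the other members are ready when it starts.
};

int main()
{
    SerializedExecutor executor;
    /// Two "snapshot saves" submitted from different threads never overlap on disk.
    std::thread t1([&] { executor.submitAndWait([] { std::cout << "save snapshot 1\n"; }); });
    std::thread t2([&] { executor.submitAndWait([] { std::cout << "save snapshot 2\n"; }); });
    t1.join();
    t2.join();
}
```

The visible cost is that the submitter is synchronous again (future.wait() returns only after the worker has finished), which is precisely what makes concurrent snapshot saves race-free; the "TODO: make something better" comment in the patch acknowledges this serialization point.
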
From e32beab913ba29ac92b1d91b31aa9171c980a6e6 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 26 Mar 2021 22:08:53 +0300 Subject: [PATCH 148/155] Add a test for mmap IO --- .../1_stateful/00162_mmap_compression_none.reference | 1 + tests/queries/1_stateful/00162_mmap_compression_none.sql | 8 ++++++++ 2 files changed, 9 insertions(+) create mode 100644 tests/queries/1_stateful/00162_mmap_compression_none.reference create mode 100644 tests/queries/1_stateful/00162_mmap_compression_none.sql diff --git a/tests/queries/1_stateful/00162_mmap_compression_none.reference b/tests/queries/1_stateful/00162_mmap_compression_none.reference new file mode 100644 index 00000000000..3495cc537c1 --- /dev/null +++ b/tests/queries/1_stateful/00162_mmap_compression_none.reference @@ -0,0 +1 @@ +687074654 diff --git a/tests/queries/1_stateful/00162_mmap_compression_none.sql b/tests/queries/1_stateful/00162_mmap_compression_none.sql new file mode 100644 index 00000000000..2178644214a --- /dev/null +++ b/tests/queries/1_stateful/00162_mmap_compression_none.sql @@ -0,0 +1,8 @@ +DROP TABLE IF EXISTS hits_none; +CREATE TABLE hits_none (Title String CODEC(NONE)) ENGINE = MergeTree ORDER BY tuple(); +INSERT INTO hits_none SELECT Title FROM test.hits; + +SET min_bytes_to_use_mmap_io = 1; +SELECT sum(length(Title)) FROM hits_none; + +DROP TABLE hits_none; From 1e08304fb1ea8b24a9debaf56d3afd40558b993a Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 26 Mar 2021 22:12:56 +0300 Subject: [PATCH 149/155] Add performance test --- tests/performance/codec_none.xml | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 tests/performance/codec_none.xml diff --git a/tests/performance/codec_none.xml b/tests/performance/codec_none.xml new file mode 100644 index 00000000000..e6eb9773a66 --- /dev/null +++ b/tests/performance/codec_none.xml @@ -0,0 +1,13 @@ + + + hits_10m_single + + + CREATE TABLE hits_none (Title String CODEC(NONE)) ENGINE = MergeTree ORDER BY tuple() + INSERT INTO hits_none SELECT Title FROM test.hits + OPTIMIZE TABLE hits_none FINAL + + + + DROP TABLE hits_none + From d01af5e9f8d17b0e4541cb2dda0df847e28dcaa8 Mon Sep 17 00:00:00 2001 From: Ilya Yatsishin <2159081+qoega@users.noreply.github.com> Date: Fri, 26 Mar 2021 22:44:40 +0300 Subject: [PATCH 150/155] touch to rebuild --- docker/test/sqlancer/Dockerfile | 1 - 1 file changed, 1 deletion(-) diff --git a/docker/test/sqlancer/Dockerfile b/docker/test/sqlancer/Dockerfile index 6bcdc3df5cd..253ca1b729a 100644 --- a/docker/test/sqlancer/Dockerfile +++ b/docker/test/sqlancer/Dockerfile @@ -2,7 +2,6 @@ FROM ubuntu:20.04 RUN apt-get update --yes && env DEBIAN_FRONTEND=noninteractive apt-get install wget unzip git openjdk-14-jdk maven python3 --yes --no-install-recommends - RUN wget https://github.com/sqlancer/sqlancer/archive/master.zip -O /sqlancer.zip RUN mkdir /sqlancer && \ cd /sqlancer && \ From 0e3571478d754d50d0397341ccb276d80d1ee993 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 27 Mar 2021 02:42:22 +0300 Subject: [PATCH 151/155] Code review changes --- src/Compression/CompressedReadBufferBase.cpp | 4 ++-- src/Compression/ICompressionCodec.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Compression/CompressedReadBufferBase.cpp b/src/Compression/CompressedReadBufferBase.cpp index eb4d6ea5986..79757d6f151 100644 --- a/src/Compression/CompressedReadBufferBase.cpp +++ b/src/Compression/CompressedReadBufferBase.cpp @@ -232,8 +232,8 @@ void 
CompressedReadBufferBase::decompress(BufferBase::Buffer & to, size_t size_d UInt8 header_size = ICompressionCodec::getHeaderSize(); if (size_compressed_without_checksum < header_size) throw Exception(ErrorCodes::CORRUPTED_DATA, - "Can't decompress data: the compressed data size ({}), this should include header size) is less than the header size ({})", - size_compressed_without_checksum, size_t(header_size)); + "Can't decompress data: the compressed data size ({}, this should include header size) is less than the header size ({})", + size_compressed_without_checksum, static_cast<size_t>(header_size)); to = BufferBase::Buffer(compressed_buffer + header_size, compressed_buffer + size_compressed_without_checksum); } diff --git a/src/Compression/ICompressionCodec.cpp b/src/Compression/ICompressionCodec.cpp index dec2b633046..46a12e50828 100644 --- a/src/Compression/ICompressionCodec.cpp +++ b/src/Compression/ICompressionCodec.cpp @@ -98,7 +98,7 @@ UInt32 ICompressionCodec::decompress(const char * source, UInt32 source_size, ch UInt8 header_size = getHeaderSize(); if (source_size < header_size) - throw Exception(ErrorCodes::CORRUPTED_DATA, "Can't decompress data: the compressed data size ({}), this should include header size) is less than the header size ({})", source_size, size_t(header_size)); + throw Exception(ErrorCodes::CORRUPTED_DATA, "Can't decompress data: the compressed data size ({}, this should include header size) is less than the header size ({})", source_size, static_cast<size_t>(header_size)); uint8_t our_method = getMethodByte(); uint8_t method = source[0]; From 613d1e3c17adf8a7d02520d1392e98a57b269e89 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Sun, 28 Mar 2021 04:47:27 +0300 Subject: [PATCH 152/155] Update version_date.tsv after release 21.3.4.25 --- utils/list-versions/version_date.tsv | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/list-versions/version_date.tsv b/utils/list-versions/version_date.tsv index 231d22b50da..628806902b2 100644 --- a/utils/list-versions/version_date.tsv +++ b/utils/list-versions/version_date.tsv @@ -1,3 +1,4 @@ +v21.3.4.25-lts 2021-03-28 v21.3.3.14-lts 2021-03-19 v21.3.2.5-lts 2021-03-12 v21.2.6.1-stable 2021-03-15 From 771493f03a1bb23a571a653cf3328d7cb2de22a0 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Sun, 28 Mar 2021 05:03:48 +0300 Subject: [PATCH 153/155] Update version_date.tsv after release 21.2.7.11 --- utils/list-versions/version_date.tsv | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/list-versions/version_date.tsv b/utils/list-versions/version_date.tsv index 628806902b2..799492cdd90 100644 --- a/utils/list-versions/version_date.tsv +++ b/utils/list-versions/version_date.tsv @@ -1,6 +1,7 @@ v21.3.4.25-lts 2021-03-28 v21.3.3.14-lts 2021-03-19 v21.3.2.5-lts 2021-03-12 +v21.2.7.11-stable 2021-03-28 v21.2.6.1-stable 2021-03-15 v21.2.5.5-stable 2021-03-02 v21.2.4.6-stable 2021-02-20 From 21ea7bf9ab484b794c34aeda0003239cb0eb0728 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 28 Mar 2021 21:09:21 +0300 Subject: [PATCH 154/155] Add results from Kimmo Linna --- website/benchmark/hardware/index.html | 1 + .../benchmark/hardware/results/asus_a15.json | 54 +++++++++++++++++++ 2 files changed, 55 insertions(+) create mode 100644 website/benchmark/hardware/results/asus_a15.json diff --git a/website/benchmark/hardware/index.html b/website/benchmark/hardware/index.html index 92da6328f0f..a57930b279d 100644 --- a/website/benchmark/hardware/index.html +++ b/website/benchmark/hardware/index.html @@ -75,6 +75,7 @@ Results for
Raspberry Pi and Digital Ocean CPU-optimized are from Fritz Wijay Results for Digitalocean (Storage-intesinve VMs) + (CPU/GP) are from Yiğit Konur and Metehan Çetinkaya of seo.do.
Results for 2x AMD EPYC 7F72 3.2 Ghz (Total 96 Cores, IBM Cloud's Bare Metal Service) from Yiğit Konur and Metehan Çetinkaya of seo.do.
Results for 2x AMD EPYC 7742 (128 physical cores, 1 TB DDR4-3200 RAM) from Yedige Davletgaliyev and Nikita Zhavoronkov of blockchair.com.
+Results for ASUS A15 (Ryzen laptop) are from Kimmo Linna.

diff --git a/website/benchmark/hardware/results/asus_a15.json b/website/benchmark/hardware/results/asus_a15.json new file mode 100644 index 00000000000..983dbde8681 --- /dev/null +++ b/website/benchmark/hardware/results/asus_a15.json @@ -0,0 +1,54 @@ +[ + { + "system": "Asus A15", + "system_full": "Asus A15 (16 × AMD Ryzen 7 4800H, 16 GiB RAM)", + "time": "2021-03-23 00:00:00", + "kind": "laptop", + "result": + [ +[0.004, 0.003, 0.003], +[0.019, 0.013, 0.012], +[0.053, 0.041, 0.037], +[0.106, 0.057, 0.056], +[0.158, 0.115, 0.110], +[0.324, 0.266, 0.262], +[0.027, 0.024, 0.026], +[0.017, 0.016, 0.017], +[0.644, 0.589, 0.582], +[0.733, 0.679, 0.679], +[0.233, 0.201, 0.197], +[0.276, 0.235, 0.236], +[1.025, 0.962, 0.962], +[1.342, 1.270, 1.264], +[1.170, 1.129, 1.124], +[1.375, 1.346, 1.351], +[3.271, 3.210, 3.242], +[1.960, 1.898, 1.907], +[5.997, 5.965, 5.983], +[0.106, 0.065, 0.055], +[1.264, 0.990, 0.989], +[1.555, 1.241, 1.239], +[3.798, 3.307, 3.280], +[1.949, 1.022, 0.995], +[0.393, 0.292, 0.292], +[0.307, 0.254, 0.255], +[0.378, 0.297, 0.290], +[1.632, 1.399, 1.386], +[2.111, 1.909, 1.900], +[3.349, 3.352, 3.357], +[0.892, 0.824, 0.816], +[1.505, 1.392, 1.378], +[9.105, 8.951, 8.914], +[5.195, 4.975, 4.919], +[5.150, 5.021, 4.955], +[1.756, 1.743, 1.749], +[0.161, 0.154, 0.158], +[0.108, 0.058, 0.055], +[0.101, 0.102, 0.052], +[0.365, 0.309, 0.334], +[0.050, 0.023, 0.023], +[0.037, 0.019, 0.015], +[0.023, 0.013, 0.018] + ] + } +] From a0a3380d91670b6f4e05aacb0e50dfa6ca161ad8 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 28 Mar 2021 22:46:45 +0300 Subject: [PATCH 155/155] Remove useless headers --- src/Core/NamesAndTypes.cpp | 1 - src/Storages/IStorage.cpp | 3 --- 2 files changed, 4 deletions(-) diff --git a/src/Core/NamesAndTypes.cpp b/src/Core/NamesAndTypes.cpp index e96ce1824d2..7b1779d4346 100644 --- a/src/Core/NamesAndTypes.cpp +++ b/src/Core/NamesAndTypes.cpp @@ -6,7 +6,6 @@ #include #include #include -#include namespace DB diff --git a/src/Storages/IStorage.cpp b/src/Storages/IStorage.cpp index 2cbc36e02fe..39f6d1f632e 100644 --- a/src/Storages/IStorage.cpp +++ b/src/Storages/IStorage.cpp @@ -1,8 +1,5 @@ #include -#include -#include - #include #include #include
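
A closing note on the CompressedReadBuffer changes (patches 126, 127 and 151): the reason pos = working_buffer.begin() must be taken only after the decompress() call is visible in patch 151's hunk, where one branch of decompress() re-points the destination buffer into the compressed block instead of copying, so a position captured before the call would keep referencing the old memory. The sketch below illustrates both that hazard and the header-size validation; it is a simplified stand-in under assumptions, not the real ICompressionCodec interface: the Buffer struct, this decompress() signature, the 0x02 method byte and the 9-byte header are all chosen here for illustration.

```cpp
// Minimal sketch (invented types) of why `pos` must be captured after decompress():
// for the "no compression" path, decompress() re-points the buffer, it does not copy.
#include <cstring>
#include <iostream>
#include <stdexcept>
#include <string>

struct Buffer
{
    char * begin = nullptr;
    char * end = nullptr;
};

constexpr size_t header_size = 9;   /// assumed: 1 method byte + 2 x 4-byte sizes
constexpr char method_none = 0x02;  /// assumed method byte for the NONE codec

/// Either copies the payload into `to` or, for NONE, re-points `to` at the payload.
void decompress(char * compressed, size_t compressed_size, Buffer & to)
{
    /// The validation that patch 151 rewords: the size must at least cover the header.
    if (compressed_size < header_size)
        throw std::runtime_error(
            "Can't decompress data: the compressed data size (" + std::to_string(compressed_size)
            + ", this should include header size) is less than the header size ("
            + std::to_string(header_size) + ")");

    const size_t payload_size = compressed_size - header_size;
    if (compressed[0] == method_none)
    {
        /// No copy: the destination buffer now points into the compressed block,
        /// which is why callers must take `pos` only *after* this call.
        to.begin = compressed + header_size;
        to.end = to.begin + payload_size;
    }
    else
    {
        /// A real codec would decode here; memcpy stands in for the decoding step.
        std::memcpy(to.begin, compressed + header_size, payload_size);
        to.end = to.begin + payload_size;
    }
}

int main()
{
    std::string block = std::string(header_size, '\0') + "payload";
    block[0] = method_none;

    char scratch[64] = {};
    Buffer working_buffer{scratch, scratch + sizeof(scratch)};

    decompress(block.data(), block.size(), working_buffer);

    /// Correct order (what patches 126/127 enforce): capture the position after
    /// decompress(). Capturing it before would leave it pointing at `scratch`
    /// while working_buffer now points into `block`.
    char * pos = working_buffer.begin;
    std::cout << std::string(pos, static_cast<size_t>(working_buffer.end - pos)) << '\n';  /// prints "payload"
}
```
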