Compare commits

...

140 Commits

Author SHA1 Message Date
Vasily Nemkov
c0ae0240dc
Merge 1964ffe688 into c0c83236b6 2024-09-19 11:45:51 +05:30
Yakov Olkhovskiy
c0c83236b6
Merge pull request #69570 from alexkats/fix-azure
Mask azure connection string sensitive info
2024-09-19 05:40:47 +00:00
Yarik Briukhovetskyi
3eb5bc1a0f
Merge pull request #68963 from yariks5s/hive_partitioning_filtration
Filtering for hive partitioning
2024-09-18 22:16:58 +00:00
Robert Schulze
b94a7167a8
Merge pull request #69580 from rschu1ze/bump-libpqxx
Bump libpqxx to v7.7.5
2024-09-18 18:56:12 +00:00
Alex Katsman
b88cd79959 Mask azure connection string sensitive info 2024-09-18 18:32:22 +00:00
Konstantin Bogdanov
64e58baba1
Merge pull request #69682 from ClickHouse/more-asserts-for-hashjoin
Try fix asserts failure in `HashJoin`
2024-09-18 18:20:27 +00:00
max-vostrikov
a3fe155579
Merge pull request #69737 from ClickHouse/test_printf
added some edge cases for printf tests
2024-09-18 17:49:57 +00:00
Antonio Andelic
a997cfad2b
Merge pull request #68108 from ClickHouse/keeper-some-improvement2
Keeper improvements package
2024-09-18 16:35:57 +00:00
maxvostrikov
f4b4b3cc35 added some edge cases for printf tests
added some edge cases for printf tests
2024-09-18 17:22:36 +02:00
Konstantin Bogdanov
cb24849396
Move assert 2024-09-18 15:24:48 +02:00
Antonio Andelic
4f73c677ac Merge branch 'master' into keeper-some-improvement2 2024-09-18 13:19:24 +02:00
Yarik Briukhovetskyi
143d9f0201
Merge branch 'ClickHouse:master' into hive_partitioning_filtration 2024-09-18 11:11:04 +02:00
Antonio Andelic
3106653852 Fix watches 2024-09-18 10:47:40 +02:00
Konstantin Bogdanov
b08e727aef
Count allocated bytes from scratch after rerange 2024-09-17 19:02:10 +02:00
Yarik Briukhovetskyi
f52cdfb795
Merge branch 'ClickHouse:master' into hive_partitioning_filtration 2024-09-17 18:50:43 +02:00
Konstantin Bogdanov
a210f98819
Lint 2024-09-17 18:28:27 +02:00
Konstantin Bogdanov
7c5d55c6b2
Lint 2024-09-17 18:10:51 +02:00
Konstantin Bogdanov
80259659ff
More asserts 2024-09-17 18:03:19 +02:00
Antonio Andelic
8db3dddb3d Fix watches count and list request 2024-09-17 16:15:55 +02:00
Yarik Briukhovetskyi
3a7c68a052
Update src/Storages/VirtualColumnUtils.cpp
Co-authored-by: Kruglov Pavel <48961922+Avogar@users.noreply.github.com>
2024-09-17 15:39:26 +02:00
Yarik Briukhovetskyi
e8d50aa97f
review 2024-09-17 15:02:33 +02:00
Yarik Briukhovetskyi
cb92aaf968
fix 03232_file_path_normalizing 2024-09-17 11:26:13 +02:00
Antonio Andelic
f3654b8fc8 Merge branch 'master' into keeper-some-improvement2 2024-09-17 10:35:38 +02:00
Antonio Andelic
676b6238d0 Update comments 2024-09-17 10:30:39 +02:00
Antonio Andelic
e876997ebb Merge branch 'master' into keeper-some-improvement2 2024-09-17 10:28:02 +02:00
Yarik Briukhovetskyi
0cdec0acf1
fix logical error 2024-09-16 19:13:30 +02:00
Yarik Briukhovetskyi
04f23332c3
fix filter issue 2024-09-16 15:59:22 +02:00
Yarik Briukhovetskyi
7d5203f8a7
add resize for partitioning_columns 2024-09-13 21:38:48 +02:00
Yarik Briukhovetskyi
0d1d750437
fix crash 2024-09-13 20:43:51 +02:00
Yarik Briukhovetskyi
ad31d86a15
move the block inserting 2024-09-13 19:58:19 +02:00
Yarik Briukhovetskyi
991279e5c6
revert 2024-09-13 19:23:00 +02:00
Yarik Briukhovetskyi
c184aae686
review 2024-09-13 16:40:01 +02:00
Yarik Briukhovetskyi
14a6b0422b
disable optimize_count_from_files 2024-09-13 16:33:17 +02:00
Robert Schulze
aab0d3dd9e
Bump to 7.7.5 2024-09-12 19:42:32 +00:00
Robert Schulze
5a34b9f24e
Bump to 7.6.1 2024-09-12 19:14:41 +00:00
Robert Schulze
a0a4858e00
Scratch build of libpqxx at 7.5.3 + patches 2024-09-12 18:55:35 +00:00
Yarik Briukhovetskyi
e8cec05d08
shellcheck 2024-09-11 13:52:20 +02:00
Yarik Briukhovetskyi
2876a4e714
add retries 2024-09-11 13:32:12 +02:00
Andrey Zvonov
1964ffe688 fix build 2024-09-09 14:24:24 +00:00
Andrey Zvonov
773a9ba52e Merge branch 'master' of github.com:ClickHouse/ClickHouse into zvonand-partmut 2024-09-09 11:01:42 +00:00
Andrey Zvonov
15606f1cef resolve conflicts 2024-09-08 17:28:13 +00:00
Andrey Zvonov
742c382db9 small improvements 2024-09-08 17:23:30 +00:00
Antonio Andelic
65019c4b9b Merge branch 'master' into keeper-some-improvement2 2024-09-07 20:59:04 +02:00
Antonio Andelic
190339c4e6 Fix snapshot sync 2024-09-07 17:34:59 +02:00
Antonio Andelic
5a86371b02 Merge branch 'master' into keeper-some-improvement2 2024-09-07 11:32:44 +02:00
Yarik Briukhovetskyi
a903e1a726
remove logging + fixing bug 2024-09-06 20:24:18 +02:00
Antonio Andelic
03c7f3817b Correct lock order 2024-09-06 15:41:04 +02:00
Antonio Andelic
f44eaa808d Merge branch 'master' into keeper-some-improvement2 2024-09-06 09:35:56 +02:00
Antonio Andelic
e388f6f99b Remove useless log 2024-09-06 09:35:02 +02:00
Yarik Briukhovetskyi
2fa6be55ff
tests fix 2024-09-04 17:02:01 +02:00
Antonio Andelic
a3e233a537 Fix watch 2024-09-04 15:19:56 +02:00
Andrey Zvonov
5039e89b9b Merge branch 'master' of github.com:ClickHouse/ClickHouse into zvonand-partmut 2024-09-04 12:55:52 +00:00
Yarik Briukhovetskyi
8896d1b78b
try to fix tests 2024-09-04 14:46:29 +02:00
Antonio Andelic
955412888c Merge branch 'master' into keeper-some-improvement2 2024-09-04 11:30:29 +02:00
Antonio Andelic
9633563fbd Fix 2024-09-04 11:30:05 +02:00
Yarik Briukhovetskyi
f688b903db
empty commit 2024-09-03 15:58:22 +02:00
Yarik Briukhovetskyi
21f9669836
empty commit 2024-09-03 15:41:43 +02:00
Yarik Briukhovetskyi
1a386ae4d5
Merge branch 'ClickHouse:master' into hive_partitioning_filtration 2024-09-03 15:35:31 +02:00
Yarik Briukhovetskyi
24f4e87f8b
revert debugging in tests 2024-09-03 15:20:22 +02:00
Antonio Andelic
79fc8d67ad More fixes 2024-09-02 15:46:04 +02:00
Antonio Andelic
596ba574e3 Merge branch 'master' into keeper-some-improvement2 2024-09-02 09:31:02 +02:00
Antonio Andelic
e968984d17 More changes 2024-09-02 08:25:17 +02:00
Yarik Briukhovetskyi
620640a042
just to test 2024-08-30 12:58:21 +02:00
Yarik Briukhovetskyi
ec469a117d
testing 2024-08-30 00:56:35 +02:00
Yarik Briukhovetskyi
7a879980d8
try to fix tests 2024-08-29 18:25:11 +02:00
Yarik Briukhovetskyi
2adc61c215
add flush logs 2024-08-29 16:39:22 +02:00
Andrey Zvonov
e3da119d88 Merge branch 'master' of github.com:ClickHouse/ClickHouse into zvonand-partmut 2024-08-29 14:01:32 +00:00
Andrey Zvonov
c3290bd88c merge master 2024-08-29 13:56:15 +00:00
Yarik Briukhovetskyi
afc4d08aad
add no-fasttest tag 2024-08-29 13:31:05 +02:00
yariks5s
edc5d8dd92 fix path 2024-08-28 23:15:01 +00:00
yariks5s
d6b2a9d534 CLICKHOUSE_LOCAL -> CLIENT 2024-08-28 22:32:44 +00:00
yariks5s
dc97bd6b92 review + testing the code 2024-08-28 17:22:47 +00:00
Yarik Briukhovetskyi
60c6eb2610
trying to fix the test 2024-08-27 19:42:47 +02:00
Yarik Briukhovetskyi
9133505952
fix the test 2024-08-27 19:16:05 +02:00
Yarik Briukhovetskyi
2741bf00e4 chmod +x 2024-08-27 16:53:14 +00:00
Yarik Briukhovetskyi
4eca00a666
fix style 2024-08-27 18:10:41 +02:00
Yarik Briukhovetskyi
c6804122cb
fix shell 2024-08-27 16:52:29 +02:00
Yarik Briukhovetskyi
189cbe25fe
init 2024-08-27 16:28:18 +02:00
Antonio Andelic
c61fc591c4 Use functions instead of classes 2024-08-13 11:33:17 +02:00
Antonio Andelic
dcbc590302 Merge branch 'master' into keeper-some-improvement2 2024-08-13 09:01:10 +02:00
Antonio Andelic
b6c3619543 Whitespace 2024-08-09 15:41:11 +02:00
Antonio Andelic
b2172af817 Merge branch 'master' into keeper-some-improvement2 2024-08-09 14:50:52 +02:00
Antonio Andelic
5ea4844d69 Merge branch 'master' into keeper-some-improvement2 2024-08-07 11:26:33 +02:00
Andrey Zvonov
3c2d8023e1 Merge branch 'master' of github.com:ClickHouse/ClickHouse into zvonand-partmut 2024-07-27 10:05:25 +00:00
Andrey Zvonov
6971623a5b Merge branch 'master' of github.com:ClickHouse/ClickHouse into zvonand-partmut 2024-07-26 12:27:14 +00:00
Andrey Zvonov
3197dd9033 Merge branch 'master' of github.com:ClickHouse/ClickHouse into zvonand-partmut 2024-07-22 18:52:34 +00:00
Andrey Zvonov
13d5b1e474 Merge branch 'master' of github.com:ClickHouse/ClickHouse into zvonand-partmut 2024-07-22 15:15:51 +00:00
Antonio Andelic
48e7057200 Merge branch 'master' into keeper-some-improvement2 2024-07-22 16:51:20 +02:00
Andrey Zvonov
bb97dca72d Merge branch 'master' of github.com:ClickHouse/ClickHouse into zvonand-partmut 2024-07-22 14:43:47 +00:00
Antonio Andelic
5a96290cce Merge branch 'master' into keeper-some-improvement2 2024-07-10 12:45:43 +02:00
Andrey Zvonov
6087d4032e merge master 2024-07-05 11:45:51 +00:00
Andrey Zvonov
b323a212f6 remove excessive test 2024-07-05 11:25:15 +00:00
Antonio Andelic
7e22af06f1 Merge branch 'master' into keeper-some-improvement2 2024-07-02 09:01:48 +02:00
Andrey Zvonov
1b5340a25c Merge branch 'master' of github.com:ClickHouse/ClickHouse into zvonand-partmut 2024-06-24 17:37:49 +00:00
Andrey Zvonov
c04bbdf191 remove unused variables(2) 2024-06-24 15:16:13 +00:00
Andrey Zvonov
7c4dd00662 remove unused variables 2024-06-24 13:19:44 +00:00
Andrey Zvonov
fcbd389d45 Merge branch 'master' of github.com:ClickHouse/ClickHouse into zvonand-partmut 2024-06-24 13:18:02 +00:00
Andrey Zvonov
77189321f6 remove debug gtest 2024-06-24 08:11:06 +00:00
Andrey Zvonov
4f1df2baef remove debug output from unittest 2024-06-21 18:50:27 +00:00
Andrey Zvonov
3870072aa3 remove unused function 2024-06-21 16:09:09 +00:00
Andrey Zvonov
6b8f80a6bc fix shellcheck in test 2024-06-21 14:57:55 +00:00
Andrey Zvonov
644e52a274 merge latest master 2024-06-21 14:30:15 +00:00
Andrey Zvonov
e80e29c1bc try make test not flaky 2024-06-21 13:17:20 +00:00
Antonio Andelic
ac78184fe7 Merge branch 'tracing-try-2' into keeper-some-improvement2 2024-06-18 11:04:00 +02:00
Antonio Andelic
1777ff37c0 Merge branch 'master' into keeper-some-improvement2 2024-06-18 11:03:38 +02:00
Antonio Andelic
7dca59da56 Revert "Merge branch 'use-thread-from-global-pool-in-poco-threadpool' into keeper-some-improvement"
This reverts commit 737d7484c5, reversing
changes made to b3a742304e.
2024-06-17 09:03:49 +02:00
Antonio Andelic
0fa45c3954 More parallel storage 2024-06-11 16:39:35 +02:00
Antonio Andelic
c802d7d58a Writing improvements 2024-06-11 14:35:26 +02:00
Antonio Andelic
5ab06caffc Merge branch 'keeper-parallel-storage' into keeper-some-improvement2 2024-06-11 10:18:27 +02:00
Antonio Andelic
737d7484c5 Merge branch 'use-thread-from-global-pool-in-poco-threadpool' into keeper-some-improvement 2024-06-11 09:46:58 +02:00
Antonio Andelic
b3a742304e Merge branch 'master' into keeper-some-improvement 2024-06-11 09:46:41 +02:00
kssenii
6514d72fea Move servers pool back 2024-06-10 18:53:51 +02:00
kssenii
c3d4b429d9 Fix merge 2024-06-10 15:39:54 +02:00
kssenii
7ff848c2c8 Merge remote-tracking branch 'origin/master' into use-thread-from-global-pool-in-poco-threadpool 2024-06-10 15:20:03 +02:00
kssenii
a11ba3f437 Fix shutdown 2024-06-10 15:19:03 +02:00
kssenii
6604d94271 Ping CI: skip fast test to see all stateless runs 2024-06-07 17:11:49 +02:00
kssenii
e30fa1da4d Fix ThreadStatus 2024-06-07 15:03:13 +02:00
kssenii
7ea3345e0d Use ThreadFromGlobalPool in Poco::ThreadPool 2024-06-06 17:25:15 +02:00
kssenii
1e97d73bd0 Squashed commit of the following:
commit 27fe0439fa
Merge: bfb1c4c793 bb469e0d45
Author: Antonio Andelic <antonio@clickhouse.com>
Date:   Thu Jun 6 14:36:02 2024 +0200

    Merge branch 'master' into fix-global-trace-collector

commit bfb1c4c793
Author: Antonio Andelic <antonio@clickhouse.com>
Date:   Thu Jun 6 11:29:42 2024 +0200

    better

commit fcee260b25
Author: Antonio Andelic <antonio2368@users.noreply.github.com>
Date:   Thu Jun 6 11:22:48 2024 +0200

    Update src/Interpreters/TraceCollector.h

    Co-authored-by: alesapin <alesapin@clickhouse.com>

commit 1d3cf17053
Author: Antonio Andelic <antonio@clickhouse.com>
Date:   Thu Jun 6 11:11:08 2024 +0200

    Fix global trace collector
2024-06-06 17:13:37 +02:00
Antonio Andelic
f0e9703384 Some small improvements 2024-06-06 09:45:07 +02:00
Antonio Andelic
514941627b Merge branch 'master' into keeper-parallel-storage 2024-06-05 15:31:57 +02:00
Antonio Andelic
acc08c65d9 Add stopwatch 2024-05-22 11:56:45 +02:00
Antonio Andelic
f1e4403f98 Merge branch 'master' into keeper-parallel-storage 2024-05-22 11:39:57 +02:00
Antonio Andelic
b1d53f0472 Merge branch 'master' into keeper-parallel-storage 2024-04-29 15:13:19 +02:00
Vasily Nemkov
615b8265dc Less flaky test 2024-04-03 13:29:23 +00:00
Vasily Nemkov
8a31162f47 Fixed binary_tidy build
fixes readability-redundant-access-specifiers warning
2024-04-02 11:38:45 +00:00
Vasily Nemkov
e0673c5014 Updated .reference to match changes in the test 2024-04-01 16:11:52 +00:00
Vasily Nemkov
345011385f Fixed style issue 2024-03-29 21:19:36 +00:00
Vasily Nemkov
e984bb5bff Merge remote-tracking branch 'upstream/master' into fix_replace_partition_stop_the_world 2024-03-29 19:47:37 +00:00
Vasily Nemkov
8ace74e4b2 Fixed crash caused by dereferencing nullptr. 2024-03-29 19:26:15 +00:00
Antonio Andelic
bc3cfb008e Merge branch 'master' into keeper-parallel-storage 2024-03-25 13:14:57 +01:00
Vasily Nemkov
a0ade7efc0 Fixed build issues 2024-03-13 11:29:14 +00:00
Vasily Nemkov
b24e415b0a Using PartitionActionBlocker to block merges on certain partitions
+ some tests
2024-03-08 15:02:26 +00:00
Antonio Andelic
9791a2ea40 Merge branch 'keeper-batch-flushes' into keeper-parallel-storage 2023-09-08 16:26:12 +00:00
Antonio Andelic
9fb9d16737 Merge branch 'keeper-batch-flushes' into keeper-parallel-storage 2023-09-06 13:30:05 +00:00
Antonio Andelic
6be1d0724a More mutex 2023-09-06 13:04:08 +00:00
Antonio Andelic
9238520490 Merge branch 'master' into keeper-parallel-storage 2023-09-06 10:57:33 +00:00
Antonio Andelic
dd1bb579df Better 2023-09-05 12:05:37 +00:00
Antonio Andelic
57943798b7 Merge branch 'master' into keeper-parallel-storage 2023-09-05 08:46:38 +00:00
Antonio Andelic
b43c3d75a2 Initial implementation 2023-09-04 14:49:49 +00:00
66 changed files with 4281 additions and 2764 deletions

contrib/libpqxx vendored (2 changes)

@ -1 +1 @@
Subproject commit c995193a3a14d71f4711f1f421f65a1a1db64640
Subproject commit 41e4c331564167cca97ad6eccbd5b8879c2ca044

View File

@ -1,9 +1,9 @@
set (LIBRARY_DIR "${ClickHouse_SOURCE_DIR}/contrib/libpqxx")
set (SRCS
"${LIBRARY_DIR}/src/strconv.cxx"
"${LIBRARY_DIR}/src/array.cxx"
"${LIBRARY_DIR}/src/binarystring.cxx"
"${LIBRARY_DIR}/src/blob.cxx"
"${LIBRARY_DIR}/src/connection.cxx"
"${LIBRARY_DIR}/src/cursor.cxx"
"${LIBRARY_DIR}/src/encodings.cxx"
@ -12,59 +12,25 @@ set (SRCS
"${LIBRARY_DIR}/src/field.cxx"
"${LIBRARY_DIR}/src/largeobject.cxx"
"${LIBRARY_DIR}/src/notification.cxx"
"${LIBRARY_DIR}/src/params.cxx"
"${LIBRARY_DIR}/src/pipeline.cxx"
"${LIBRARY_DIR}/src/result.cxx"
"${LIBRARY_DIR}/src/robusttransaction.cxx"
"${LIBRARY_DIR}/src/row.cxx"
"${LIBRARY_DIR}/src/sql_cursor.cxx"
"${LIBRARY_DIR}/src/strconv.cxx"
"${LIBRARY_DIR}/src/stream_from.cxx"
"${LIBRARY_DIR}/src/stream_to.cxx"
"${LIBRARY_DIR}/src/subtransaction.cxx"
"${LIBRARY_DIR}/src/time.cxx"
"${LIBRARY_DIR}/src/transaction.cxx"
"${LIBRARY_DIR}/src/transaction_base.cxx"
"${LIBRARY_DIR}/src/row.cxx"
"${LIBRARY_DIR}/src/params.cxx"
"${LIBRARY_DIR}/src/util.cxx"
"${LIBRARY_DIR}/src/version.cxx"
"${LIBRARY_DIR}/src/wait.cxx"
)
# Need to explicitly include each header file, because in the directory include/pqxx there are also files
# like just 'array'. So if including the whole directory with `target_include_directories`, it will make
# conflicts with all includes of <array>.
set (HDRS
"${LIBRARY_DIR}/include/pqxx/array.hxx"
"${LIBRARY_DIR}/include/pqxx/params.hxx"
"${LIBRARY_DIR}/include/pqxx/binarystring.hxx"
"${LIBRARY_DIR}/include/pqxx/composite.hxx"
"${LIBRARY_DIR}/include/pqxx/connection.hxx"
"${LIBRARY_DIR}/include/pqxx/cursor.hxx"
"${LIBRARY_DIR}/include/pqxx/dbtransaction.hxx"
"${LIBRARY_DIR}/include/pqxx/errorhandler.hxx"
"${LIBRARY_DIR}/include/pqxx/except.hxx"
"${LIBRARY_DIR}/include/pqxx/field.hxx"
"${LIBRARY_DIR}/include/pqxx/isolation.hxx"
"${LIBRARY_DIR}/include/pqxx/largeobject.hxx"
"${LIBRARY_DIR}/include/pqxx/nontransaction.hxx"
"${LIBRARY_DIR}/include/pqxx/notification.hxx"
"${LIBRARY_DIR}/include/pqxx/pipeline.hxx"
"${LIBRARY_DIR}/include/pqxx/prepared_statement.hxx"
"${LIBRARY_DIR}/include/pqxx/result.hxx"
"${LIBRARY_DIR}/include/pqxx/robusttransaction.hxx"
"${LIBRARY_DIR}/include/pqxx/row.hxx"
"${LIBRARY_DIR}/include/pqxx/separated_list.hxx"
"${LIBRARY_DIR}/include/pqxx/strconv.hxx"
"${LIBRARY_DIR}/include/pqxx/stream_from.hxx"
"${LIBRARY_DIR}/include/pqxx/stream_to.hxx"
"${LIBRARY_DIR}/include/pqxx/subtransaction.hxx"
"${LIBRARY_DIR}/include/pqxx/transaction.hxx"
"${LIBRARY_DIR}/include/pqxx/transaction_base.hxx"
"${LIBRARY_DIR}/include/pqxx/types.hxx"
"${LIBRARY_DIR}/include/pqxx/util.hxx"
"${LIBRARY_DIR}/include/pqxx/version.hxx"
"${LIBRARY_DIR}/include/pqxx/zview.hxx"
)
add_library(_libpqxx ${SRCS} ${HDRS})
add_library(_libpqxx ${SRCS})
target_link_libraries(_libpqxx PUBLIC ch_contrib::libpq)
target_include_directories (_libpqxx SYSTEM BEFORE PUBLIC "${LIBRARY_DIR}/include")

View File

@ -17,7 +17,7 @@ public:
bool isCancelled() const { return *counter > 0; }
/// Temporarily blocks corresponding actions (while the returned object is alive)
/// Temporary blocks corresponding actions (while the returned object is alive)
friend class ActionLock;
ActionLock cancel() { return ActionLock(*this); }
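
For orientation, a minimal usage sketch of the RAII API shown above (the caller and the blocker name are hypothetical, not part of this diff): the blocked state lasts exactly as long as the returned ActionLock is alive.

#include <Common/ActionBlocker.h>
#include <Common/ActionLock.h>

// Hypothetical caller: pause the guarded action for the duration of some critical work.
void withActionPaused(DB::ActionBlocker & merges_blocker)
{
    DB::ActionLock lock = merges_blocker.cancel();  // counter incremented, isCancelled() is now true
    // ... do work that must not overlap with the blocked action ...
}   // lock destroyed here: counter decremented, the action is allowed again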

View File

@ -181,12 +181,6 @@ void SetACLRequest::addRootPath(const String & root_path) { Coordination::addRoo
void GetACLRequest::addRootPath(const String & root_path) { Coordination::addRootPath(path, root_path); }
void SyncRequest::addRootPath(const String & root_path) { Coordination::addRootPath(path, root_path); }
void MultiRequest::addRootPath(const String & root_path)
{
for (auto & request : requests)
request->addRootPath(root_path);
}
void CreateResponse::removeRootPath(const String & root_path) { Coordination::removeRootPath(path_created, root_path); }
void WatchResponse::removeRootPath(const String & root_path) { Coordination::removeRootPath(path, root_path); }

View File

@ -408,11 +408,17 @@ struct ReconfigResponse : virtual Response
size_t bytesSize() const override { return value.size() + sizeof(stat); }
};
template <typename T>
struct MultiRequest : virtual Request
{
Requests requests;
std::vector<T> requests;
void addRootPath(const String & root_path) override
{
for (auto & request : requests)
request->addRootPath(root_path);
}
void addRootPath(const String & root_path) override;
String getPath() const override { return {}; }
size_t bytesSize() const override
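
A simplified sketch of what the template parameter buys (the types below are illustrative stand-ins, not the real ClickHouse declarations): each protocol instantiates MultiRequest with its own concrete pointer type, so containers of requests keep their protocol-specific interface (the ZooKeeperMultiResponse constructor further down, for example, no longer needs a dynamic_cast).

#include <cstddef>
#include <memory>
#include <string>
#include <vector>

struct Request
{
    virtual ~Request() = default;
    virtual void addRootPath(const std::string &) = 0;
};

struct ZooKeeperRequest : virtual Request
{
    virtual std::size_t sizeImpl() const = 0;   // protocol-specific API
};

template <typename T>
struct MultiRequest : virtual Request
{
    std::vector<T> requests;                    // T = shared_ptr<Request>, shared_ptr<ZooKeeperRequest>, ...

    void addRootPath(const std::string & root) override
    {
        for (auto & request : requests)
            request->addRootPath(root);         // works for any pointer-like T
    }
};

// e.g. a ZooKeeper multi-request holds ZooKeeperRequest pointers directly, so code
// iterating `requests` can use the ZooKeeper-specific interface without downcasting:
//   struct ZooKeeperMultiRequest : MultiRequest<std::shared_ptr<ZooKeeperRequest>>, ... { ... };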

View File

@ -184,7 +184,7 @@ struct TestKeeperReconfigRequest final : ReconfigRequest, TestKeeperRequest
std::pair<ResponsePtr, Undo> process(TestKeeper::Container & container, int64_t zxid) const override;
};
struct TestKeeperMultiRequest final : MultiRequest, TestKeeperRequest
struct TestKeeperMultiRequest final : MultiRequest<RequestPtr>, TestKeeperRequest
{
explicit TestKeeperMultiRequest(const Requests & generic_requests)
: TestKeeperMultiRequest(std::span(generic_requests))

View File

@ -18,14 +18,16 @@ using namespace DB;
void ZooKeeperResponse::write(WriteBuffer & out) const
{
/// Excessive copy to calculate length.
WriteBufferFromOwnString buf;
Coordination::write(xid, buf);
Coordination::write(zxid, buf);
Coordination::write(error, buf);
auto response_size = Coordination::size(xid) + Coordination::size(zxid) + Coordination::size(error);
if (error == Error::ZOK)
writeImpl(buf);
Coordination::write(buf.str(), out);
response_size += sizeImpl();
Coordination::write(static_cast<int32_t>(response_size), out);
Coordination::write(xid, out);
Coordination::write(zxid, out);
Coordination::write(error, out);
if (error == Error::ZOK)
writeImpl(out);
}
std::string ZooKeeperRequest::toString(bool short_format) const
@ -41,12 +43,12 @@ std::string ZooKeeperRequest::toString(bool short_format) const
void ZooKeeperRequest::write(WriteBuffer & out) const
{
/// Excessive copy to calculate length.
WriteBufferFromOwnString buf;
Coordination::write(xid, buf);
Coordination::write(getOpNum(), buf);
writeImpl(buf);
Coordination::write(buf.str(), out);
auto request_size = Coordination::size(xid) + Coordination::size(getOpNum()) + sizeImpl();
Coordination::write(static_cast<int32_t>(request_size), out);
Coordination::write(xid, out);
Coordination::write(getOpNum(), out);
writeImpl(out);
}
void ZooKeeperSyncRequest::writeImpl(WriteBuffer & out) const
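
Both write() hunks above drop the old pattern of serializing into a temporary WriteBufferFromOwnString just to learn the length; instead the length prefix is computed up front from per-field size() helpers and the fields are streamed directly into the output buffer. A standalone sketch of the accounting, with the field layout simplified (an illustration of the idea, not the real Coordination helpers):

#include <cstddef>
#include <cstdint>
#include <string>

// Three fixed-width header fields plus a length-prefixed body, mirroring the
// xid / zxid / error framing above.
std::size_t responseSize(int32_t xid, int64_t zxid, int32_t error, const std::string & payload)
{
    std::size_t total = sizeof(xid) + sizeof(zxid) + sizeof(error);
    if (error == 0)                                   // Error::ZOK: only then a body follows
        total += sizeof(int32_t) + payload.size();    // length-prefixed body
    return total;   // written first as the int32 frame length, then each field is written directly
}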
@ -54,6 +56,11 @@ void ZooKeeperSyncRequest::writeImpl(WriteBuffer & out) const
Coordination::write(path, out);
}
size_t ZooKeeperSyncRequest::sizeImpl() const
{
return Coordination::size(path);
}
void ZooKeeperSyncRequest::readImpl(ReadBuffer & in)
{
Coordination::read(path, in);
@ -74,6 +81,11 @@ void ZooKeeperSyncResponse::writeImpl(WriteBuffer & out) const
Coordination::write(path, out);
}
size_t ZooKeeperSyncResponse::sizeImpl() const
{
return Coordination::size(path);
}
void ZooKeeperReconfigRequest::writeImpl(WriteBuffer & out) const
{
Coordination::write(joining, out);
@ -82,6 +94,11 @@ void ZooKeeperReconfigRequest::writeImpl(WriteBuffer & out) const
Coordination::write(version, out);
}
size_t ZooKeeperReconfigRequest::sizeImpl() const
{
return Coordination::size(joining) + Coordination::size(leaving) + Coordination::size(new_members) + Coordination::size(version);
}
void ZooKeeperReconfigRequest::readImpl(ReadBuffer & in)
{
Coordination::read(joining, in);
@ -109,6 +126,11 @@ void ZooKeeperReconfigResponse::writeImpl(WriteBuffer & out) const
Coordination::write(stat, out);
}
size_t ZooKeeperReconfigResponse::sizeImpl() const
{
return Coordination::size(value) + Coordination::size(stat);
}
void ZooKeeperWatchResponse::readImpl(ReadBuffer & in)
{
Coordination::read(type, in);
@ -123,6 +145,11 @@ void ZooKeeperWatchResponse::writeImpl(WriteBuffer & out) const
Coordination::write(path, out);
}
size_t ZooKeeperWatchResponse::sizeImpl() const
{
return Coordination::size(type) + Coordination::size(state) + Coordination::size(path);
}
void ZooKeeperWatchResponse::write(WriteBuffer & out) const
{
if (error == Error::ZOK)
@ -137,6 +164,11 @@ void ZooKeeperAuthRequest::writeImpl(WriteBuffer & out) const
Coordination::write(data, out);
}
size_t ZooKeeperAuthRequest::sizeImpl() const
{
return Coordination::size(type) + Coordination::size(scheme) + Coordination::size(data);
}
void ZooKeeperAuthRequest::readImpl(ReadBuffer & in)
{
Coordination::read(type, in);
@ -175,6 +207,12 @@ void ZooKeeperCreateRequest::writeImpl(WriteBuffer & out) const
Coordination::write(flags, out);
}
size_t ZooKeeperCreateRequest::sizeImpl() const
{
int32_t flags = 0;
return Coordination::size(path) + Coordination::size(data) + Coordination::size(acls) + Coordination::size(flags);
}
void ZooKeeperCreateRequest::readImpl(ReadBuffer & in)
{
Coordination::read(path, in);
@ -211,12 +249,22 @@ void ZooKeeperCreateResponse::writeImpl(WriteBuffer & out) const
Coordination::write(path_created, out);
}
size_t ZooKeeperCreateResponse::sizeImpl() const
{
return Coordination::size(path_created);
}
void ZooKeeperRemoveRequest::writeImpl(WriteBuffer & out) const
{
Coordination::write(path, out);
Coordination::write(version, out);
}
size_t ZooKeeperRemoveRequest::sizeImpl() const
{
return Coordination::size(path) + Coordination::size(version);
}
std::string ZooKeeperRemoveRequest::toStringImpl(bool /*short_format*/) const
{
return fmt::format(
@ -244,6 +292,11 @@ void ZooKeeperRemoveRecursiveRequest::readImpl(ReadBuffer & in)
Coordination::read(remove_nodes_limit, in);
}
size_t ZooKeeperRemoveRecursiveRequest::sizeImpl() const
{
return Coordination::size(path) + Coordination::size(remove_nodes_limit);
}
std::string ZooKeeperRemoveRecursiveRequest::toStringImpl(bool /*short_format*/) const
{
return fmt::format(
@ -259,6 +312,11 @@ void ZooKeeperExistsRequest::writeImpl(WriteBuffer & out) const
Coordination::write(has_watch, out);
}
size_t ZooKeeperExistsRequest::sizeImpl() const
{
return Coordination::size(path) + Coordination::size(has_watch);
}
void ZooKeeperExistsRequest::readImpl(ReadBuffer & in)
{
Coordination::read(path, in);
@ -280,12 +338,22 @@ void ZooKeeperExistsResponse::writeImpl(WriteBuffer & out) const
Coordination::write(stat, out);
}
size_t ZooKeeperExistsResponse::sizeImpl() const
{
return Coordination::size(stat);
}
void ZooKeeperGetRequest::writeImpl(WriteBuffer & out) const
{
Coordination::write(path, out);
Coordination::write(has_watch, out);
}
size_t ZooKeeperGetRequest::sizeImpl() const
{
return Coordination::size(path) + Coordination::size(has_watch);
}
void ZooKeeperGetRequest::readImpl(ReadBuffer & in)
{
Coordination::read(path, in);
@ -309,6 +377,11 @@ void ZooKeeperGetResponse::writeImpl(WriteBuffer & out) const
Coordination::write(stat, out);
}
size_t ZooKeeperGetResponse::sizeImpl() const
{
return Coordination::size(data) + Coordination::size(stat);
}
void ZooKeeperSetRequest::writeImpl(WriteBuffer & out) const
{
Coordination::write(path, out);
@ -316,6 +389,11 @@ void ZooKeeperSetRequest::writeImpl(WriteBuffer & out) const
Coordination::write(version, out);
}
size_t ZooKeeperSetRequest::sizeImpl() const
{
return Coordination::size(path) + Coordination::size(data) + Coordination::size(version);
}
void ZooKeeperSetRequest::readImpl(ReadBuffer & in)
{
Coordination::read(path, in);
@ -342,12 +420,22 @@ void ZooKeeperSetResponse::writeImpl(WriteBuffer & out) const
Coordination::write(stat, out);
}
size_t ZooKeeperSetResponse::sizeImpl() const
{
return Coordination::size(stat);
}
void ZooKeeperListRequest::writeImpl(WriteBuffer & out) const
{
Coordination::write(path, out);
Coordination::write(has_watch, out);
}
size_t ZooKeeperListRequest::sizeImpl() const
{
return Coordination::size(path) + Coordination::size(has_watch);
}
void ZooKeeperListRequest::readImpl(ReadBuffer & in)
{
Coordination::read(path, in);
@ -366,6 +454,11 @@ void ZooKeeperFilteredListRequest::writeImpl(WriteBuffer & out) const
Coordination::write(static_cast<uint8_t>(list_request_type), out);
}
size_t ZooKeeperFilteredListRequest::sizeImpl() const
{
return Coordination::size(path) + Coordination::size(has_watch) + Coordination::size(static_cast<uint8_t>(list_request_type));
}
void ZooKeeperFilteredListRequest::readImpl(ReadBuffer & in)
{
Coordination::read(path, in);
@ -397,6 +490,11 @@ void ZooKeeperListResponse::writeImpl(WriteBuffer & out) const
Coordination::write(stat, out);
}
size_t ZooKeeperListResponse::sizeImpl() const
{
return Coordination::size(names) + Coordination::size(stat);
}
void ZooKeeperSimpleListResponse::readImpl(ReadBuffer & in)
{
Coordination::read(names, in);
@ -407,6 +505,11 @@ void ZooKeeperSimpleListResponse::writeImpl(WriteBuffer & out) const
Coordination::write(names, out);
}
size_t ZooKeeperSimpleListResponse::sizeImpl() const
{
return Coordination::size(names);
}
void ZooKeeperSetACLRequest::writeImpl(WriteBuffer & out) const
{
Coordination::write(path, out);
@ -414,6 +517,11 @@ void ZooKeeperSetACLRequest::writeImpl(WriteBuffer & out) const
Coordination::write(version, out);
}
size_t ZooKeeperSetACLRequest::sizeImpl() const
{
return Coordination::size(path) + Coordination::size(acls) + Coordination::size(version);
}
void ZooKeeperSetACLRequest::readImpl(ReadBuffer & in)
{
Coordination::read(path, in);
@ -431,6 +539,11 @@ void ZooKeeperSetACLResponse::writeImpl(WriteBuffer & out) const
Coordination::write(stat, out);
}
size_t ZooKeeperSetACLResponse::sizeImpl() const
{
return Coordination::size(stat);
}
void ZooKeeperSetACLResponse::readImpl(ReadBuffer & in)
{
Coordination::read(stat, in);
@ -446,6 +559,11 @@ void ZooKeeperGetACLRequest::writeImpl(WriteBuffer & out) const
Coordination::write(path, out);
}
size_t ZooKeeperGetACLRequest::sizeImpl() const
{
return Coordination::size(path);
}
std::string ZooKeeperGetACLRequest::toStringImpl(bool /*short_format*/) const
{
return fmt::format("path = {}", path);
@ -457,6 +575,11 @@ void ZooKeeperGetACLResponse::writeImpl(WriteBuffer & out) const
Coordination::write(stat, out);
}
size_t ZooKeeperGetACLResponse::sizeImpl() const
{
return Coordination::size(acl) + Coordination::size(stat);
}
void ZooKeeperGetACLResponse::readImpl(ReadBuffer & in)
{
Coordination::read(acl, in);
@ -469,6 +592,11 @@ void ZooKeeperCheckRequest::writeImpl(WriteBuffer & out) const
Coordination::write(version, out);
}
size_t ZooKeeperCheckRequest::sizeImpl() const
{
return Coordination::size(path) + Coordination::size(version);
}
void ZooKeeperCheckRequest::readImpl(ReadBuffer & in)
{
Coordination::read(path, in);
@ -494,6 +622,11 @@ void ZooKeeperErrorResponse::writeImpl(WriteBuffer & out) const
Coordination::write(error, out);
}
size_t ZooKeeperErrorResponse::sizeImpl() const
{
return Coordination::size(error);
}
void ZooKeeperMultiRequest::checkOperationType(OperationType type)
{
chassert(!operation_type.has_value() || *operation_type == type);
@ -596,6 +729,27 @@ void ZooKeeperMultiRequest::writeImpl(WriteBuffer & out) const
Coordination::write(error, out);
}
size_t ZooKeeperMultiRequest::sizeImpl() const
{
size_t total_size = 0;
for (const auto & request : requests)
{
const auto & zk_request = dynamic_cast<const ZooKeeperRequest &>(*request);
bool done = false;
int32_t error = -1;
total_size
+= Coordination::size(zk_request.getOpNum()) + Coordination::size(done) + Coordination::size(error) + zk_request.sizeImpl();
}
OpNum op_num = OpNum::Error;
bool done = true;
int32_t error = -1;
return total_size + Coordination::size(op_num) + Coordination::size(done) + Coordination::size(error);
}
void ZooKeeperMultiRequest::readImpl(ReadBuffer & in)
{
while (true)
@ -729,31 +883,54 @@ void ZooKeeperMultiResponse::writeImpl(WriteBuffer & out) const
}
}
ZooKeeperResponsePtr ZooKeeperHeartbeatRequest::makeResponse() const { return setTime(std::make_shared<ZooKeeperHeartbeatResponse>()); }
ZooKeeperResponsePtr ZooKeeperSyncRequest::makeResponse() const { return setTime(std::make_shared<ZooKeeperSyncResponse>()); }
ZooKeeperResponsePtr ZooKeeperAuthRequest::makeResponse() const { return setTime(std::make_shared<ZooKeeperAuthResponse>()); }
ZooKeeperResponsePtr ZooKeeperRemoveRequest::makeResponse() const { return setTime(std::make_shared<ZooKeeperRemoveResponse>()); }
ZooKeeperResponsePtr ZooKeeperRemoveRecursiveRequest::makeResponse() const { return setTime(std::make_shared<ZooKeeperRemoveRecursiveResponse>()); }
ZooKeeperResponsePtr ZooKeeperExistsRequest::makeResponse() const { return setTime(std::make_shared<ZooKeeperExistsResponse>()); }
ZooKeeperResponsePtr ZooKeeperGetRequest::makeResponse() const { return setTime(std::make_shared<ZooKeeperGetResponse>()); }
ZooKeeperResponsePtr ZooKeeperSetRequest::makeResponse() const { return setTime(std::make_shared<ZooKeeperSetResponse>()); }
ZooKeeperResponsePtr ZooKeeperReconfigRequest::makeResponse() const { return setTime(std::make_shared<ZooKeeperReconfigResponse>()); }
ZooKeeperResponsePtr ZooKeeperListRequest::makeResponse() const { return setTime(std::make_shared<ZooKeeperListResponse>()); }
ZooKeeperResponsePtr ZooKeeperSimpleListRequest::makeResponse() const { return setTime(std::make_shared<ZooKeeperSimpleListResponse>()); }
size_t ZooKeeperMultiResponse::sizeImpl() const
{
size_t total_size = 0;
for (const auto & response : responses)
{
const ZooKeeperResponse & zk_response = dynamic_cast<const ZooKeeperResponse &>(*response);
OpNum op_num = zk_response.getOpNum();
bool done = false;
Error op_error = zk_response.error;
total_size += Coordination::size(op_num) + Coordination::size(done) + Coordination::size(op_error);
if (op_error == Error::ZOK || op_num == OpNum::Error)
total_size += zk_response.sizeImpl();
}
/// Footer.
OpNum op_num = OpNum::Error;
bool done = true;
int32_t error_read = - 1;
return total_size + Coordination::size(op_num) + Coordination::size(done) + Coordination::size(error_read);
}
ZooKeeperResponsePtr ZooKeeperHeartbeatRequest::makeResponse() const { return std::make_shared<ZooKeeperHeartbeatResponse>(); }
ZooKeeperResponsePtr ZooKeeperSyncRequest::makeResponse() const { return std::make_shared<ZooKeeperSyncResponse>(); }
ZooKeeperResponsePtr ZooKeeperAuthRequest::makeResponse() const { return std::make_shared<ZooKeeperAuthResponse>(); }
ZooKeeperResponsePtr ZooKeeperRemoveRequest::makeResponse() const { return std::make_shared<ZooKeeperRemoveResponse>(); }
ZooKeeperResponsePtr ZooKeeperRemoveRecursiveRequest::makeResponse() const { return std::make_shared<ZooKeeperRemoveRecursiveResponse>(); }
ZooKeeperResponsePtr ZooKeeperExistsRequest::makeResponse() const { return std::make_shared<ZooKeeperExistsResponse>(); }
ZooKeeperResponsePtr ZooKeeperGetRequest::makeResponse() const { return std::make_shared<ZooKeeperGetResponse>(); }
ZooKeeperResponsePtr ZooKeeperSetRequest::makeResponse() const { return std::make_shared<ZooKeeperSetResponse>(); }
ZooKeeperResponsePtr ZooKeeperReconfigRequest::makeResponse() const { return std::make_shared<ZooKeeperReconfigResponse>(); }
ZooKeeperResponsePtr ZooKeeperListRequest::makeResponse() const { return std::make_shared<ZooKeeperListResponse>(); }
ZooKeeperResponsePtr ZooKeeperSimpleListRequest::makeResponse() const { return std::make_shared<ZooKeeperSimpleListResponse>(); }
ZooKeeperResponsePtr ZooKeeperCreateRequest::makeResponse() const
{
if (not_exists)
return setTime(std::make_shared<ZooKeeperCreateIfNotExistsResponse>());
return setTime(std::make_shared<ZooKeeperCreateResponse>());
return std::make_shared<ZooKeeperCreateIfNotExistsResponse>();
return std::make_shared<ZooKeeperCreateResponse>();
}
ZooKeeperResponsePtr ZooKeeperCheckRequest::makeResponse() const
{
if (not_exists)
return setTime(std::make_shared<ZooKeeperCheckNotExistsResponse>());
return std::make_shared<ZooKeeperCheckNotExistsResponse>();
return setTime(std::make_shared<ZooKeeperCheckResponse>());
return std::make_shared<ZooKeeperCheckResponse>();
}
ZooKeeperResponsePtr ZooKeeperMultiRequest::makeResponse() const
@ -764,11 +941,12 @@ ZooKeeperResponsePtr ZooKeeperMultiRequest::makeResponse() const
else
response = std::make_shared<ZooKeeperMultiReadResponse>(requests);
return setTime(std::move(response));
return std::move(response);
}
ZooKeeperResponsePtr ZooKeeperCloseRequest::makeResponse() const { return setTime(std::make_shared<ZooKeeperCloseResponse>()); }
ZooKeeperResponsePtr ZooKeeperSetACLRequest::makeResponse() const { return setTime(std::make_shared<ZooKeeperSetACLResponse>()); }
ZooKeeperResponsePtr ZooKeeperGetACLRequest::makeResponse() const { return setTime(std::make_shared<ZooKeeperGetACLResponse>()); }
ZooKeeperResponsePtr ZooKeeperCloseRequest::makeResponse() const { return std::make_shared<ZooKeeperCloseResponse>(); }
ZooKeeperResponsePtr ZooKeeperSetACLRequest::makeResponse() const { return std::make_shared<ZooKeeperSetACLResponse>(); }
ZooKeeperResponsePtr ZooKeeperGetACLRequest::makeResponse() const { return std::make_shared<ZooKeeperGetACLResponse>(); }
void ZooKeeperSessionIDRequest::writeImpl(WriteBuffer & out) const
{
@ -777,6 +955,11 @@ void ZooKeeperSessionIDRequest::writeImpl(WriteBuffer & out) const
Coordination::write(server_id, out);
}
size_t ZooKeeperSessionIDRequest::sizeImpl() const
{
return Coordination::size(internal_id) + Coordination::size(session_timeout_ms) + Coordination::size(server_id);
}
void ZooKeeperSessionIDRequest::readImpl(ReadBuffer & in)
{
Coordination::read(internal_id, in);
@ -803,6 +986,11 @@ void ZooKeeperSessionIDResponse::writeImpl(WriteBuffer & out) const
Coordination::write(server_id, out);
}
size_t ZooKeeperSessionIDResponse::sizeImpl() const
{
return Coordination::size(internal_id) + Coordination::size(session_id) + Coordination::size(server_id);
}
void ZooKeeperRequest::createLogElements(LogElements & elems) const
{
@ -960,40 +1148,6 @@ std::shared_ptr<ZooKeeperRequest> ZooKeeperRequest::read(ReadBuffer & in)
return request;
}
ZooKeeperRequest::~ZooKeeperRequest()
{
if (!request_created_time_ns)
return;
UInt64 elapsed_ns = clock_gettime_ns() - request_created_time_ns;
constexpr UInt64 max_request_time_ns = 1000000000ULL; /// 1 sec
if (max_request_time_ns < elapsed_ns)
{
LOG_TEST(getLogger(__PRETTY_FUNCTION__), "Processing of request xid={} took {} ms", xid, elapsed_ns / 1000000UL);
}
}
ZooKeeperResponsePtr ZooKeeperRequest::setTime(ZooKeeperResponsePtr response) const
{
if (request_created_time_ns)
{
response->response_created_time_ns = clock_gettime_ns();
}
return response;
}
ZooKeeperResponse::~ZooKeeperResponse()
{
if (!response_created_time_ns)
return;
UInt64 elapsed_ns = clock_gettime_ns() - response_created_time_ns;
constexpr UInt64 max_request_time_ns = 1000000000ULL; /// 1 sec
if (max_request_time_ns < elapsed_ns)
{
LOG_TEST(getLogger(__PRETTY_FUNCTION__), "Processing of response xid={} took {} ms", xid, elapsed_ns / 1000000UL);
}
}
ZooKeeperRequestPtr ZooKeeperRequestFactory::get(OpNum op_num) const
{
auto it = op_num_to_request.find(op_num);
@ -1015,7 +1169,6 @@ void registerZooKeeperRequest(ZooKeeperRequestFactory & factory)
factory.registerRequest(num, []
{
auto res = std::make_shared<RequestT>();
res->request_created_time_ns = clock_gettime_ns();
if constexpr (num == OpNum::MultiRead)
res->operation_type = ZooKeeperMultiRequest::OperationType::Read;

View File

@ -7,13 +7,11 @@
#include <boost/noncopyable.hpp>
#include <IO/ReadBuffer.h>
#include <IO/WriteBuffer.h>
#include <unordered_map>
#include <vector>
#include <memory>
#include <cstdint>
#include <optional>
#include <functional>
#include <span>
namespace Coordination
@ -25,13 +23,11 @@ struct ZooKeeperResponse : virtual Response
{
XID xid = 0;
UInt64 response_created_time_ns = 0;
ZooKeeperResponse() = default;
ZooKeeperResponse(const ZooKeeperResponse &) = default;
~ZooKeeperResponse() override;
virtual void readImpl(ReadBuffer &) = 0;
virtual void writeImpl(WriteBuffer &) const = 0;
virtual size_t sizeImpl() const = 0;
virtual void write(WriteBuffer & out) const;
virtual OpNum getOpNum() const = 0;
virtual void fillLogElements(LogElements & elems, size_t idx) const;
@ -51,13 +47,11 @@ struct ZooKeeperRequest : virtual Request
bool restored_from_zookeeper_log = false;
UInt64 request_created_time_ns = 0;
UInt64 thread_id = 0;
String query_id;
ZooKeeperRequest() = default;
ZooKeeperRequest(const ZooKeeperRequest &) = default;
~ZooKeeperRequest() override;
virtual OpNum getOpNum() const = 0;
@ -66,6 +60,7 @@ struct ZooKeeperRequest : virtual Request
std::string toString(bool short_format = false) const;
virtual void writeImpl(WriteBuffer &) const = 0;
virtual size_t sizeImpl() const = 0;
virtual void readImpl(ReadBuffer &) = 0;
virtual std::string toStringImpl(bool /*short_format*/) const { return ""; }
@ -73,7 +68,6 @@ struct ZooKeeperRequest : virtual Request
static std::shared_ptr<ZooKeeperRequest> read(ReadBuffer & in);
virtual ZooKeeperResponsePtr makeResponse() const = 0;
ZooKeeperResponsePtr setTime(ZooKeeperResponsePtr response) const;
virtual bool isReadRequest() const = 0;
virtual void createLogElements(LogElements & elems) const;
@ -86,6 +80,7 @@ struct ZooKeeperHeartbeatRequest final : ZooKeeperRequest
String getPath() const override { return {}; }
OpNum getOpNum() const override { return OpNum::Heartbeat; }
void writeImpl(WriteBuffer &) const override {}
size_t sizeImpl() const override { return 0; }
void readImpl(ReadBuffer &) override {}
ZooKeeperResponsePtr makeResponse() const override;
bool isReadRequest() const override { return false; }
@ -97,6 +92,7 @@ struct ZooKeeperSyncRequest final : ZooKeeperRequest
String getPath() const override { return path; }
OpNum getOpNum() const override { return OpNum::Sync; }
void writeImpl(WriteBuffer & out) const override;
size_t sizeImpl() const override;
void readImpl(ReadBuffer & in) override;
std::string toStringImpl(bool short_format) const override;
ZooKeeperResponsePtr makeResponse() const override;
@ -109,6 +105,7 @@ struct ZooKeeperSyncResponse final : SyncResponse, ZooKeeperResponse
{
void readImpl(ReadBuffer & in) override;
void writeImpl(WriteBuffer & out) const override;
size_t sizeImpl() const override;
OpNum getOpNum() const override { return OpNum::Sync; }
};
@ -122,6 +119,7 @@ struct ZooKeeperReconfigRequest final : ZooKeeperRequest
String getPath() const override { return keeper_config_path; }
OpNum getOpNum() const override { return OpNum::Reconfig; }
void writeImpl(WriteBuffer & out) const override;
size_t sizeImpl() const override;
void readImpl(ReadBuffer & in) override;
std::string toStringImpl(bool short_format) const override;
ZooKeeperResponsePtr makeResponse() const override;
@ -138,6 +136,7 @@ struct ZooKeeperReconfigResponse final : ReconfigResponse, ZooKeeperResponse
{
void readImpl(ReadBuffer & in) override;
void writeImpl(WriteBuffer & out) const override;
size_t sizeImpl() const override;
OpNum getOpNum() const override { return OpNum::Reconfig; }
};
@ -145,6 +144,7 @@ struct ZooKeeperHeartbeatResponse final : ZooKeeperResponse
{
void readImpl(ReadBuffer &) override {}
void writeImpl(WriteBuffer &) const override {}
size_t sizeImpl() const override { return 0; }
OpNum getOpNum() const override { return OpNum::Heartbeat; }
};
@ -153,6 +153,7 @@ struct ZooKeeperWatchResponse final : WatchResponse, ZooKeeperResponse
void readImpl(ReadBuffer & in) override;
void writeImpl(WriteBuffer & out) const override;
size_t sizeImpl() const override;
void write(WriteBuffer & out) const override;
@ -175,6 +176,7 @@ struct ZooKeeperAuthRequest final : ZooKeeperRequest
String getPath() const override { return {}; }
OpNum getOpNum() const override { return OpNum::Auth; }
void writeImpl(WriteBuffer & out) const override;
size_t sizeImpl() const override;
void readImpl(ReadBuffer & in) override;
std::string toStringImpl(bool short_format) const override;
@ -189,6 +191,7 @@ struct ZooKeeperAuthResponse final : ZooKeeperResponse
{
void readImpl(ReadBuffer &) override {}
void writeImpl(WriteBuffer &) const override {}
size_t sizeImpl() const override { return 0; }
OpNum getOpNum() const override { return OpNum::Auth; }
@ -200,6 +203,7 @@ struct ZooKeeperCloseRequest final : ZooKeeperRequest
String getPath() const override { return {}; }
OpNum getOpNum() const override { return OpNum::Close; }
void writeImpl(WriteBuffer &) const override {}
size_t sizeImpl() const override { return 0; }
void readImpl(ReadBuffer &) override {}
ZooKeeperResponsePtr makeResponse() const override;
@ -214,6 +218,7 @@ struct ZooKeeperCloseResponse final : ZooKeeperResponse
}
void writeImpl(WriteBuffer &) const override {}
size_t sizeImpl() const override { return 0; }
OpNum getOpNum() const override { return OpNum::Close; }
};
@ -228,6 +233,7 @@ struct ZooKeeperCreateRequest final : public CreateRequest, ZooKeeperRequest
OpNum getOpNum() const override { return not_exists ? OpNum::CreateIfNotExists : OpNum::Create; }
void writeImpl(WriteBuffer & out) const override;
size_t sizeImpl() const override;
void readImpl(ReadBuffer & in) override;
std::string toStringImpl(bool short_format) const override;
@ -244,6 +250,7 @@ struct ZooKeeperCreateResponse : CreateResponse, ZooKeeperResponse
void readImpl(ReadBuffer & in) override;
void writeImpl(WriteBuffer & out) const override;
size_t sizeImpl() const override;
OpNum getOpNum() const override { return OpNum::Create; }
@ -265,6 +272,7 @@ struct ZooKeeperRemoveRequest final : RemoveRequest, ZooKeeperRequest
OpNum getOpNum() const override { return OpNum::Remove; }
void writeImpl(WriteBuffer & out) const override;
size_t sizeImpl() const override;
void readImpl(ReadBuffer & in) override;
std::string toStringImpl(bool short_format) const override;
@ -280,6 +288,7 @@ struct ZooKeeperRemoveResponse final : RemoveResponse, ZooKeeperResponse
{
void readImpl(ReadBuffer &) override {}
void writeImpl(WriteBuffer &) const override {}
size_t sizeImpl() const override { return 0; }
OpNum getOpNum() const override { return OpNum::Remove; }
size_t bytesSize() const override { return RemoveResponse::bytesSize() + sizeof(xid) + sizeof(zxid); }
@ -293,6 +302,7 @@ struct ZooKeeperRemoveRecursiveRequest final : RemoveRecursiveRequest, ZooKeeper
OpNum getOpNum() const override { return OpNum::RemoveRecursive; }
void writeImpl(WriteBuffer & out) const override;
void readImpl(ReadBuffer & in) override;
size_t sizeImpl() const override;
std::string toStringImpl(bool short_format) const override;
ZooKeeperResponsePtr makeResponse() const override;
@ -305,6 +315,7 @@ struct ZooKeeperRemoveRecursiveResponse : RemoveRecursiveResponse, ZooKeeperResp
{
void readImpl(ReadBuffer &) override {}
void writeImpl(WriteBuffer &) const override {}
size_t sizeImpl() const override { return 0; }
OpNum getOpNum() const override { return OpNum::RemoveRecursive; }
size_t bytesSize() const override { return RemoveRecursiveResponse::bytesSize() + sizeof(xid) + sizeof(zxid); }
@ -317,6 +328,7 @@ struct ZooKeeperExistsRequest final : ExistsRequest, ZooKeeperRequest
OpNum getOpNum() const override { return OpNum::Exists; }
void writeImpl(WriteBuffer & out) const override;
size_t sizeImpl() const override;
void readImpl(ReadBuffer & in) override;
std::string toStringImpl(bool short_format) const override;
@ -330,6 +342,7 @@ struct ZooKeeperExistsResponse final : ExistsResponse, ZooKeeperResponse
{
void readImpl(ReadBuffer & in) override;
void writeImpl(WriteBuffer & out) const override;
size_t sizeImpl() const override;
OpNum getOpNum() const override { return OpNum::Exists; }
size_t bytesSize() const override { return ExistsResponse::bytesSize() + sizeof(xid) + sizeof(zxid); }
@ -344,6 +357,7 @@ struct ZooKeeperGetRequest final : GetRequest, ZooKeeperRequest
OpNum getOpNum() const override { return OpNum::Get; }
void writeImpl(WriteBuffer & out) const override;
size_t sizeImpl() const override;
void readImpl(ReadBuffer & in) override;
std::string toStringImpl(bool short_format) const override;
@ -357,6 +371,7 @@ struct ZooKeeperGetResponse final : GetResponse, ZooKeeperResponse
{
void readImpl(ReadBuffer & in) override;
void writeImpl(WriteBuffer & out) const override;
size_t sizeImpl() const override;
OpNum getOpNum() const override { return OpNum::Get; }
size_t bytesSize() const override { return GetResponse::bytesSize() + sizeof(xid) + sizeof(zxid); }
@ -371,6 +386,7 @@ struct ZooKeeperSetRequest final : SetRequest, ZooKeeperRequest
OpNum getOpNum() const override { return OpNum::Set; }
void writeImpl(WriteBuffer & out) const override;
size_t sizeImpl() const override;
void readImpl(ReadBuffer & in) override;
std::string toStringImpl(bool short_format) const override;
ZooKeeperResponsePtr makeResponse() const override;
@ -385,6 +401,7 @@ struct ZooKeeperSetResponse final : SetResponse, ZooKeeperResponse
{
void readImpl(ReadBuffer & in) override;
void writeImpl(WriteBuffer & out) const override;
size_t sizeImpl() const override;
OpNum getOpNum() const override { return OpNum::Set; }
size_t bytesSize() const override { return SetResponse::bytesSize() + sizeof(xid) + sizeof(zxid); }
@ -399,6 +416,7 @@ struct ZooKeeperListRequest : ListRequest, ZooKeeperRequest
OpNum getOpNum() const override { return OpNum::List; }
void writeImpl(WriteBuffer & out) const override;
size_t sizeImpl() const override;
void readImpl(ReadBuffer & in) override;
std::string toStringImpl(bool short_format) const override;
ZooKeeperResponsePtr makeResponse() const override;
@ -419,6 +437,7 @@ struct ZooKeeperFilteredListRequest final : ZooKeeperListRequest
OpNum getOpNum() const override { return OpNum::FilteredList; }
void writeImpl(WriteBuffer & out) const override;
size_t sizeImpl() const override;
void readImpl(ReadBuffer & in) override;
std::string toStringImpl(bool short_format) const override;
@ -429,6 +448,7 @@ struct ZooKeeperListResponse : ListResponse, ZooKeeperResponse
{
void readImpl(ReadBuffer & in) override;
void writeImpl(WriteBuffer & out) const override;
size_t sizeImpl() const override;
OpNum getOpNum() const override { return OpNum::List; }
size_t bytesSize() const override { return ListResponse::bytesSize() + sizeof(xid) + sizeof(zxid); }
@ -440,6 +460,7 @@ struct ZooKeeperSimpleListResponse final : ZooKeeperListResponse
{
void readImpl(ReadBuffer & in) override;
void writeImpl(WriteBuffer & out) const override;
size_t sizeImpl() const override;
OpNum getOpNum() const override { return OpNum::SimpleList; }
size_t bytesSize() const override { return ZooKeeperListResponse::bytesSize() - sizeof(stat); }
@ -452,6 +473,7 @@ struct ZooKeeperCheckRequest : CheckRequest, ZooKeeperRequest
OpNum getOpNum() const override { return not_exists ? OpNum::CheckNotExists : OpNum::Check; }
void writeImpl(WriteBuffer & out) const override;
size_t sizeImpl() const override;
void readImpl(ReadBuffer & in) override;
std::string toStringImpl(bool short_format) const override;
@ -467,6 +489,7 @@ struct ZooKeeperCheckResponse : CheckResponse, ZooKeeperResponse
{
void readImpl(ReadBuffer &) override {}
void writeImpl(WriteBuffer &) const override {}
size_t sizeImpl() const override { return 0; }
OpNum getOpNum() const override { return OpNum::Check; }
size_t bytesSize() const override { return CheckResponse::bytesSize() + sizeof(xid) + sizeof(zxid); }
@ -483,6 +506,7 @@ struct ZooKeeperErrorResponse final : ErrorResponse, ZooKeeperResponse
{
void readImpl(ReadBuffer & in) override;
void writeImpl(WriteBuffer & out) const override;
size_t sizeImpl() const override;
OpNum getOpNum() const override { return OpNum::Error; }
@ -493,6 +517,7 @@ struct ZooKeeperSetACLRequest final : SetACLRequest, ZooKeeperRequest
{
OpNum getOpNum() const override { return OpNum::SetACL; }
void writeImpl(WriteBuffer & out) const override;
size_t sizeImpl() const override;
void readImpl(ReadBuffer & in) override;
std::string toStringImpl(bool short_format) const override;
ZooKeeperResponsePtr makeResponse() const override;
@ -505,6 +530,7 @@ struct ZooKeeperSetACLResponse final : SetACLResponse, ZooKeeperResponse
{
void readImpl(ReadBuffer & in) override;
void writeImpl(WriteBuffer & out) const override;
size_t sizeImpl() const override;
OpNum getOpNum() const override { return OpNum::SetACL; }
size_t bytesSize() const override { return SetACLResponse::bytesSize() + sizeof(xid) + sizeof(zxid); }
@ -514,6 +540,7 @@ struct ZooKeeperGetACLRequest final : GetACLRequest, ZooKeeperRequest
{
OpNum getOpNum() const override { return OpNum::GetACL; }
void writeImpl(WriteBuffer & out) const override;
size_t sizeImpl() const override;
void readImpl(ReadBuffer & in) override;
std::string toStringImpl(bool short_format) const override;
ZooKeeperResponsePtr makeResponse() const override;
@ -526,12 +553,13 @@ struct ZooKeeperGetACLResponse final : GetACLResponse, ZooKeeperResponse
{
void readImpl(ReadBuffer & in) override;
void writeImpl(WriteBuffer & out) const override;
size_t sizeImpl() const override;
OpNum getOpNum() const override { return OpNum::GetACL; }
size_t bytesSize() const override { return GetACLResponse::bytesSize() + sizeof(xid) + sizeof(zxid); }
};
struct ZooKeeperMultiRequest final : MultiRequest, ZooKeeperRequest
struct ZooKeeperMultiRequest final : MultiRequest<ZooKeeperRequestPtr>, ZooKeeperRequest
{
OpNum getOpNum() const override;
ZooKeeperMultiRequest() = default;
@ -540,6 +568,7 @@ struct ZooKeeperMultiRequest final : MultiRequest, ZooKeeperRequest
ZooKeeperMultiRequest(std::span<const Coordination::RequestPtr> generic_requests, const ACLs & default_acls);
void writeImpl(WriteBuffer & out) const override;
size_t sizeImpl() const override;
void readImpl(ReadBuffer & in) override;
std::string toStringImpl(bool short_format) const override;
@ -563,12 +592,14 @@ private:
struct ZooKeeperMultiResponse : MultiResponse, ZooKeeperResponse
{
explicit ZooKeeperMultiResponse(const Requests & requests)
ZooKeeperMultiResponse() = default;
explicit ZooKeeperMultiResponse(const std::vector<ZooKeeperRequestPtr> & requests)
{
responses.reserve(requests.size());
for (const auto & request : requests)
responses.emplace_back(dynamic_cast<const ZooKeeperRequest &>(*request).makeResponse());
responses.emplace_back(request->makeResponse());
}
explicit ZooKeeperMultiResponse(const Responses & responses_)
@ -579,6 +610,7 @@ struct ZooKeeperMultiResponse : MultiResponse, ZooKeeperResponse
void readImpl(ReadBuffer & in) override;
void writeImpl(WriteBuffer & out) const override;
size_t sizeImpl() const override;
size_t bytesSize() const override { return MultiResponse::bytesSize() + sizeof(xid) + sizeof(zxid); }
@ -609,6 +641,7 @@ struct ZooKeeperSessionIDRequest final : ZooKeeperRequest
Coordination::OpNum getOpNum() const override { return OpNum::SessionID; }
String getPath() const override { return {}; }
void writeImpl(WriteBuffer & out) const override;
size_t sizeImpl() const override;
void readImpl(ReadBuffer & in) override;
Coordination::ZooKeeperResponsePtr makeResponse() const override;
@ -627,6 +660,7 @@ struct ZooKeeperSessionIDResponse final : ZooKeeperResponse
void readImpl(ReadBuffer & in) override;
void writeImpl(WriteBuffer & out) const override;
size_t sizeImpl() const override;
Coordination::OpNum getOpNum() const override { return OpNum::SessionID; }
};

View File

@ -42,6 +42,32 @@ void write(const Error & x, WriteBuffer & out)
write(static_cast<int32_t>(x), out);
}
size_t size(OpNum x)
{
return size(static_cast<int32_t>(x));
}
size_t size(const std::string & s)
{
return size(static_cast<int32_t>(s.size())) + s.size();
}
size_t size(const ACL & acl)
{
return size(acl.permissions) + size(acl.scheme) + size(acl.id);
}
size_t size(const Stat & stat)
{
return size(stat.czxid) + size(stat.mzxid) + size(stat.ctime) + size(stat.mtime) + size(stat.version) + size(stat.cversion)
+ size(stat.aversion) + size(stat.ephemeralOwner) + size(stat.dataLength) + size(stat.numChildren) + size(stat.pzxid);
}
size_t size(const Error & x)
{
return size(static_cast<int32_t>(x));
}
void read(OpNum & x, ReadBuffer & in)
{
int32_t raw_op_num;

View File

@ -43,6 +43,36 @@ void write(const std::vector<T> & arr, WriteBuffer & out)
write(elem, out);
}
template <typename T>
requires is_arithmetic_v<T>
size_t size(T x)
{
return sizeof(x);
}
size_t size(OpNum x);
size_t size(const std::string & s);
size_t size(const ACL & acl);
size_t size(const Stat & stat);
size_t size(const Error & x);
template <size_t N>
size_t size(const std::array<char, N>)
{
return size(static_cast<int32_t>(N)) + N;
}
template <typename T>
size_t size(const std::vector<T> & arr)
{
size_t total_size = size(static_cast<int32_t>(arr.size()));
for (const auto & elem : arr)
total_size += size(elem);
return total_size;
}
template <typename T>
requires is_arithmetic_v<T>
void read(T & x, ReadBuffer & in)

View File

@ -0,0 +1,128 @@
#include <gtest/gtest.h>
#include <chrono>
#include <mutex>
#include <stdexcept>
#include <string_view>
#include <vector>
#include <thread>
#include <Common/ActionBlocker.h>
#include <Common/ActionLock.h>
using namespace DB;
TEST(ActionBlocker, TestDefaultConstructor)
{
ActionBlocker blocker;
EXPECT_FALSE(blocker.isCancelled());
EXPECT_EQ(0, blocker.getCounter().load());
}
TEST(ActionBlocker, TestCancelForever)
{
ActionBlocker blocker;
blocker.cancelForever();
EXPECT_TRUE(blocker.isCancelled());
EXPECT_EQ(1, blocker.getCounter().load());
}
TEST(ActionBlocker, TestCancel)
{
ActionBlocker blocker;
{
auto lock = blocker.cancel();
EXPECT_TRUE(blocker.isCancelled());
EXPECT_EQ(1, blocker.getCounter().load());
}
// automatically un-cancelled on `lock` destruction
EXPECT_FALSE(blocker.isCancelled());
}
TEST(ActionLock, TestDefaultConstructor)
{
ActionLock locker;
EXPECT_TRUE(locker.expired());
}
TEST(ActionLock, TestConstructorWithActionBlocker)
{
ActionBlocker blocker;
ActionLock lock(blocker);
EXPECT_FALSE(lock.expired());
EXPECT_TRUE(blocker.isCancelled());
EXPECT_EQ(1, blocker.getCounter().load());
}
TEST(ActionLock, TestMoveAssignmentToEmpty)
{
ActionBlocker blocker;
{
ActionLock lock;
lock = blocker.cancel();
EXPECT_TRUE(blocker.isCancelled());
}
// automatically un-cancelled on `lock` destruction
EXPECT_FALSE(blocker.isCancelled());
EXPECT_EQ(0, blocker.getCounter().load());
}
TEST(ActionLock, TestMoveAssignmentToNonEmpty)
{
ActionBlocker blocker;
{
auto lock = blocker.cancel();
EXPECT_TRUE(blocker.isCancelled());
// cause a move
lock = blocker.cancel();
// blocker should remain locked
EXPECT_TRUE(blocker.isCancelled());
}
// automatically un-cancelled on `lock` destruction
EXPECT_FALSE(blocker.isCancelled());
EXPECT_EQ(0, blocker.getCounter().load());
}
TEST(ActionLock, TestMoveAssignmentToNonEmpty2)
{
ActionBlocker blocker1;
ActionBlocker blocker2;
{
auto lock = blocker1.cancel();
EXPECT_TRUE(blocker1.isCancelled());
// cause a move
lock = blocker2.cancel();
// blocker2 should remain locked, blocker1 unlocked
EXPECT_TRUE(blocker2.isCancelled());
EXPECT_FALSE(blocker1.isCancelled());
}
// automatically un-cancelled on `lock` destruction
EXPECT_FALSE(blocker1.isCancelled());
EXPECT_FALSE(blocker2.isCancelled());
EXPECT_EQ(0, blocker1.getCounter().load());
EXPECT_EQ(0, blocker2.getCounter().load());
}
TEST(ActionLock, TestExpiration)
{
ActionLock lock;
{
ActionBlocker blocker;
lock = blocker.cancel();
EXPECT_FALSE(lock.expired());
}
EXPECT_TRUE(lock.expired());
}


@ -45,6 +45,7 @@ uint64_t ACLMap::convertACLs(const Coordination::ACLs & acls)
if (acls.empty())
return 0;
std::lock_guard lock(map_mutex);
if (acl_to_num.contains(acls))
return acl_to_num[acls];
@ -62,6 +63,7 @@ Coordination::ACLs ACLMap::convertNumber(uint64_t acls_id) const
if (acls_id == 0)
return Coordination::ACLs{};
std::lock_guard lock(map_mutex);
if (!num_to_acl.contains(acls_id))
throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown ACL id {}. It's a bug", acls_id);
@ -70,6 +72,7 @@ Coordination::ACLs ACLMap::convertNumber(uint64_t acls_id) const
void ACLMap::addMapping(uint64_t acls_id, const Coordination::ACLs & acls)
{
std::lock_guard lock(map_mutex);
num_to_acl[acls_id] = acls;
acl_to_num[acls] = acls_id;
max_acl_id = std::max(acls_id + 1, max_acl_id); /// max_acl_id points to the next free slot
@ -77,11 +80,13 @@ void ACLMap::addMapping(uint64_t acls_id, const Coordination::ACLs & acls)
void ACLMap::addUsage(uint64_t acl_id)
{
std::lock_guard lock(map_mutex);
usage_counter[acl_id]++;
}
void ACLMap::removeUsage(uint64_t acl_id)
{
std::lock_guard lock(map_mutex);
if (!usage_counter.contains(acl_id))
return;


@ -32,6 +32,8 @@ private:
NumToACLMap num_to_acl;
UsageCounter usage_counter;
uint64_t max_acl_id{1};
mutable std::mutex map_mutex;
public:
/// Convert ACL to number. If it's a new ACL then add it to the map


@ -301,11 +301,13 @@ String MonitorCommand::run()
print(ret, "server_state", keeper_info.getRole());
print(ret, "znode_count", state_machine.getNodesCount());
print(ret, "watch_count", state_machine.getTotalWatchesCount());
print(ret, "ephemerals_count", state_machine.getTotalEphemeralNodesCount());
print(ret, "approximate_data_size", state_machine.getApproximateDataSize());
print(ret, "key_arena_size", state_machine.getKeyArenaSize());
const auto & storage_stats = state_machine.getStorageStats();
print(ret, "znode_count", storage_stats.nodes_count.load(std::memory_order_relaxed));
print(ret, "watch_count", storage_stats.total_watches_count.load(std::memory_order_relaxed));
print(ret, "ephemerals_count", storage_stats.total_emphemeral_nodes_count.load(std::memory_order_relaxed));
print(ret, "approximate_data_size", storage_stats.approximate_data_size.load(std::memory_order_relaxed));
print(ret, "key_arena_size", 0);
print(ret, "latest_snapshot_size", state_machine.getLatestSnapshotSize());
#if defined(OS_LINUX) || defined(OS_DARWIN)
@ -387,6 +389,7 @@ String ServerStatCommand::run()
auto & stats = keeper_dispatcher.getKeeperConnectionStats();
Keeper4LWInfo keeper_info = keeper_dispatcher.getKeeper4LWInfo();
const auto & storage_stats = keeper_dispatcher.getStateMachine().getStorageStats();
write("ClickHouse Keeper version", String(VERSION_DESCRIBE) + "-" + VERSION_GITHASH);
@ -398,9 +401,9 @@ String ServerStatCommand::run()
write("Sent", toString(stats.getPacketsSent()));
write("Connections", toString(keeper_info.alive_connections_count));
write("Outstanding", toString(keeper_info.outstanding_requests_count));
write("Zxid", formatZxid(keeper_info.last_zxid));
write("Zxid", formatZxid(storage_stats.last_zxid.load(std::memory_order_relaxed)));
write("Mode", keeper_info.getRole());
write("Node count", toString(keeper_info.total_nodes_count));
write("Node count", toString(storage_stats.nodes_count.load(std::memory_order_relaxed)));
return buf.str();
}
@ -416,6 +419,7 @@ String StatCommand::run()
auto & stats = keeper_dispatcher.getKeeperConnectionStats();
Keeper4LWInfo keeper_info = keeper_dispatcher.getKeeper4LWInfo();
const auto & storage_stats = keeper_dispatcher.getStateMachine().getStorageStats();
write("ClickHouse Keeper version", String(VERSION_DESCRIBE) + "-" + VERSION_GITHASH);
@ -431,9 +435,9 @@ String StatCommand::run()
write("Sent", toString(stats.getPacketsSent()));
write("Connections", toString(keeper_info.alive_connections_count));
write("Outstanding", toString(keeper_info.outstanding_requests_count));
write("Zxid", formatZxid(keeper_info.last_zxid));
write("Zxid", formatZxid(storage_stats.last_zxid.load(std::memory_order_relaxed)));
write("Mode", keeper_info.getRole());
write("Node count", toString(keeper_info.total_nodes_count));
write("Node count", toString(storage_stats.nodes_count.load(std::memory_order_relaxed)));
return buf.str();
}


@ -1,7 +1,5 @@
#pragma once
#include <string>
#include <base/types.h>
#include <Common/Exception.h>
@ -30,9 +28,6 @@ struct Keeper4LWInfo
uint64_t follower_count;
uint64_t synced_follower_count;
uint64_t total_nodes_count;
int64_t last_zxid;
String getRole() const
{
if (is_standalone)


@ -38,15 +38,16 @@ void updateKeeperInformation(KeeperDispatcher & keeper_dispatcher, AsynchronousM
is_follower = static_cast<size_t>(keeper_info.is_follower);
is_exceeding_mem_soft_limit = static_cast<size_t>(keeper_info.is_exceeding_mem_soft_limit);
zxid = keeper_info.last_zxid;
const auto & state_machine = keeper_dispatcher.getStateMachine();
znode_count = state_machine.getNodesCount();
watch_count = state_machine.getTotalWatchesCount();
ephemerals_count = state_machine.getTotalEphemeralNodesCount();
approximate_data_size = state_machine.getApproximateDataSize();
key_arena_size = state_machine.getKeyArenaSize();
session_with_watches = state_machine.getSessionsWithWatchesCount();
paths_watched = state_machine.getWatchedPathsCount();
const auto & storage_stats = state_machine.getStorageStats();
zxid = storage_stats.last_zxid.load(std::memory_order_relaxed);
znode_count = storage_stats.nodes_count.load(std::memory_order_relaxed);
watch_count = storage_stats.total_watches_count.load(std::memory_order_relaxed);
ephemerals_count = storage_stats.total_emphemeral_nodes_count.load(std::memory_order_relaxed);
approximate_data_size = storage_stats.approximate_data_size.load(std::memory_order_relaxed);
key_arena_size = 0;
session_with_watches = storage_stats.sessions_with_watches_count.load(std::memory_order_relaxed);
paths_watched = storage_stats.watched_paths_count.load(std::memory_order_relaxed);
# if defined(__linux__) || defined(__APPLE__)
open_file_descriptor_count = getCurrentProcessFDCount();


@ -305,7 +305,7 @@ void KeeperDispatcher::requestThread()
if (has_read_request)
{
if (server->isLeaderAlive())
server->putLocalReadRequest(request);
server->putLocalReadRequest({request});
else
addErrorResponses({request}, Coordination::Error::ZCONNECTIONLOSS);
}


@ -1207,8 +1207,6 @@ Keeper4LWInfo KeeperServer::getPartiallyFilled4LWInfo() const
result.synced_follower_count = getSyncedFollowerCount();
}
result.is_exceeding_mem_soft_limit = isExceedingMemorySoftLimit();
result.total_nodes_count = getKeeperStateMachine()->getNodesCount();
result.last_zxid = getKeeperStateMachine()->getLastProcessedZxid();
return result;
}


@ -78,20 +78,20 @@ namespace
writeBinary(false, out);
/// Serialize stat
writeBinary(node.czxid, out);
writeBinary(node.mzxid, out);
writeBinary(node.ctime(), out);
writeBinary(node.mtime, out);
writeBinary(node.version, out);
writeBinary(node.cversion, out);
writeBinary(node.aversion, out);
writeBinary(node.ephemeralOwner(), out);
writeBinary(node.stats.czxid, out);
writeBinary(node.stats.mzxid, out);
writeBinary(node.stats.ctime(), out);
writeBinary(node.stats.mtime, out);
writeBinary(node.stats.version, out);
writeBinary(node.stats.cversion, out);
writeBinary(node.stats.aversion, out);
writeBinary(node.stats.ephemeralOwner(), out);
if (version < SnapshotVersion::V6)
writeBinary(static_cast<int32_t>(node.getData().size()), out);
writeBinary(node.numChildren(), out);
writeBinary(node.pzxid, out);
writeBinary(static_cast<int32_t>(node.stats.data_size), out);
writeBinary(node.stats.numChildren(), out);
writeBinary(node.stats.pzxid, out);
writeBinary(node.seqNum(), out);
writeBinary(node.stats.seqNum(), out);
if (version >= SnapshotVersion::V4 && version <= SnapshotVersion::V5)
writeBinary(node.sizeInBytes(), out);
@ -100,11 +100,11 @@ namespace
template<typename Node>
void readNode(Node & node, ReadBuffer & in, SnapshotVersion version, ACLMap & acl_map)
{
readVarUInt(node.data_size, in);
if (node.data_size != 0)
readVarUInt(node.stats.data_size, in);
if (node.stats.data_size != 0)
{
node.data = std::unique_ptr<char[]>(new char[node.data_size]);
in.readStrict(node.data.get(), node.data_size);
node.data = std::unique_ptr<char[]>(new char[node.stats.data_size]);
in.readStrict(node.data.get(), node.stats.data_size);
}
if (version >= SnapshotVersion::V1)
@ -141,19 +141,19 @@ namespace
}
/// Deserialize stat
readBinary(node.czxid, in);
readBinary(node.mzxid, in);
readBinary(node.stats.czxid, in);
readBinary(node.stats.mzxid, in);
int64_t ctime;
readBinary(ctime, in);
node.setCtime(ctime);
readBinary(node.mtime, in);
readBinary(node.version, in);
readBinary(node.cversion, in);
readBinary(node.aversion, in);
node.stats.setCtime(ctime);
readBinary(node.stats.mtime, in);
readBinary(node.stats.version, in);
readBinary(node.stats.cversion, in);
readBinary(node.stats.aversion, in);
int64_t ephemeral_owner = 0;
readBinary(ephemeral_owner, in);
if (ephemeral_owner != 0)
node.setEphemeralOwner(ephemeral_owner);
node.stats.setEphemeralOwner(ephemeral_owner);
if (version < SnapshotVersion::V6)
{
@ -163,14 +163,14 @@ namespace
int32_t num_children = 0;
readBinary(num_children, in);
if (ephemeral_owner == 0)
node.setNumChildren(num_children);
node.stats.setNumChildren(num_children);
readBinary(node.pzxid, in);
readBinary(node.stats.pzxid, in);
int32_t seq_num = 0;
readBinary(seq_num, in);
if (ephemeral_owner == 0)
node.setSeqNum(seq_num);
node.stats.setSeqNum(seq_num);
if (version >= SnapshotVersion::V4 && version <= SnapshotVersion::V5)
{
@ -256,7 +256,7 @@ void KeeperStorageSnapshot<Storage>::serialize(const KeeperStorageSnapshot<Stora
/// Benign race condition possible while taking snapshot: NuRaft decides to create a snapshot at some log id
/// and only after some time we lock storage and enable snapshot mode. So snapshot_container_size can be
/// slightly bigger than required.
if (node.mzxid > snapshot.zxid)
if (node.stats.mzxid > snapshot.zxid)
break;
writeBinary(path, out);
writeNode(node, snapshot.version, out);
@ -306,7 +306,7 @@ void KeeperStorageSnapshot<Storage>::serialize(const KeeperStorageSnapshot<Stora
}
template<typename Storage>
void KeeperStorageSnapshot<Storage>::deserialize(SnapshotDeserializationResult<Storage> & deserialization_result, ReadBuffer & in, KeeperContextPtr keeper_context)
void KeeperStorageSnapshot<Storage>::deserialize(SnapshotDeserializationResult<Storage> & deserialization_result, ReadBuffer & in, KeeperContextPtr keeper_context) TSA_NO_THREAD_SAFETY_ANALYSIS
{
uint8_t version;
readBinary(version, in);
@ -435,13 +435,13 @@ void KeeperStorageSnapshot<Storage>::deserialize(SnapshotDeserializationResult<S
}
}
auto ephemeral_owner = node.ephemeralOwner();
auto ephemeral_owner = node.stats.ephemeralOwner();
if constexpr (!use_rocksdb)
if (!node.isEphemeral() && node.numChildren() > 0)
node.getChildren().reserve(node.numChildren());
if (!node.stats.isEphemeral() && node.stats.numChildren() > 0)
node.getChildren().reserve(node.stats.numChildren());
if (ephemeral_owner != 0)
storage.ephemerals[node.ephemeralOwner()].insert(std::string{path});
storage.committed_ephemerals[node.stats.ephemeralOwner()].insert(std::string{path});
if (recalculate_digest)
storage.nodes_digest += node.getDigest(path);
@ -467,16 +467,25 @@ void KeeperStorageSnapshot<Storage>::deserialize(SnapshotDeserializationResult<S
{
if (itr.key != "/")
{
if (itr.value.numChildren() != static_cast<int32_t>(itr.value.getChildren().size()))
if (itr.value.stats.numChildren() != static_cast<int32_t>(itr.value.getChildren().size()))
{
#ifdef NDEBUG
/// TODO (alesapin) remove this, it should be always CORRUPTED_DATA.
LOG_ERROR(getLogger("KeeperSnapshotManager"), "Children counter in stat.numChildren {}"
" is different from actual children size {} for node {}", itr.value.numChildren(), itr.value.getChildren().size(), itr.key);
LOG_ERROR(
getLogger("KeeperSnapshotManager"),
"Children counter in stat.numChildren {}"
" is different from actual children size {} for node {}",
itr.value.stats.numChildren(),
itr.value.getChildren().size(),
itr.key);
#else
throw Exception(ErrorCodes::LOGICAL_ERROR, "Children counter in stat.numChildren {}"
" is different from actual children size {} for node {}",
itr.value.numChildren(), itr.value.getChildren().size(), itr.key);
throw Exception(
ErrorCodes::LOGICAL_ERROR,
"Children counter in stat.numChildren {}"
" is different from actual children size {} for node {}",
itr.value.stats.numChildren(),
itr.value.getChildren().size(),
itr.key);
#endif
}
}
@ -511,7 +520,7 @@ void KeeperStorageSnapshot<Storage>::deserialize(SnapshotDeserializationResult<S
session_auth_counter++;
}
if (!ids.empty())
storage.session_and_auth[active_session_id] = ids;
storage.committed_session_and_auth[active_session_id] = ids;
}
current_session_size++;
}
@ -527,6 +536,8 @@ void KeeperStorageSnapshot<Storage>::deserialize(SnapshotDeserializationResult<S
buffer->pos(0);
deserialization_result.cluster_config = ClusterConfig::deserialize(*buffer);
}
storage.updateStats();
}
template<typename Storage>
@ -544,7 +555,7 @@ KeeperStorageSnapshot<Storage>::KeeperStorageSnapshot(Storage * storage_, uint64
begin = storage->getSnapshotIteratorBegin();
session_and_timeout = storage->getActiveSessions();
acl_map = storage->acl_map.getMapping();
session_and_auth = storage->session_and_auth;
session_and_auth = storage->committed_session_and_auth;
}
template<typename Storage>
@ -563,7 +574,7 @@ KeeperStorageSnapshot<Storage>::KeeperStorageSnapshot(
begin = storage->getSnapshotIteratorBegin();
session_and_timeout = storage->getActiveSessions();
acl_map = storage->acl_map.getMapping();
session_and_auth = storage->session_and_auth;
session_and_auth = storage->committed_session_and_auth;
}
template<typename Storage>


@ -36,6 +36,11 @@ namespace ProfileEvents
extern const Event KeeperStorageLockWaitMicroseconds;
}
namespace CurrentMetrics
{
extern const Metric KeeperAliveConnections;
}
namespace DB
{
@ -56,6 +61,7 @@ IKeeperStateMachine::IKeeperStateMachine(
, snapshots_queue(snapshots_queue_)
, min_request_size_to_cache(keeper_context_->getCoordinationSettings()->min_request_size_for_cache)
, log(getLogger("KeeperStateMachine"))
, read_pool(CurrentMetrics::KeeperAliveConnections, CurrentMetrics::KeeperAliveConnections, CurrentMetrics::KeeperAliveConnections, 100, 10000, 10000)
, superdigest(superdigest_)
, keeper_context(keeper_context_)
, snapshot_manager_s3(snapshot_manager_s3_)
@ -175,18 +181,20 @@ void assertDigest(
}
}
struct TSA_SCOPED_LOCKABLE LockGuardWithStats final
template <bool shared = false>
struct LockGuardWithStats final
{
std::unique_lock<std::mutex> lock;
explicit LockGuardWithStats(std::mutex & mutex) TSA_ACQUIRE(mutex)
using LockType = std::conditional_t<shared, std::shared_lock<SharedMutex>, std::unique_lock<SharedMutex>>;
LockType lock;
explicit LockGuardWithStats(SharedMutex & mutex)
{
Stopwatch watch;
std::unique_lock l(mutex);
LockType l(mutex);
ProfileEvents::increment(ProfileEvents::KeeperStorageLockWaitMicroseconds, watch.elapsedMicroseconds());
lock = std::move(l);
}
~LockGuardWithStats() TSA_RELEASE() = default;
~LockGuardWithStats() = default;
};
}
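The reworked LockGuardWithStats chooses its lock type at compile time — std::shared_lock for readers (shared = true), std::unique_lock for writers — while still charging the wait time to KeeperStorageLockWaitMicroseconds. A standalone sketch of the same pattern, with std::shared_mutex and std::chrono standing in for ClickHouse's SharedMutex and Stopwatch:

#include <chrono>
#include <mutex>
#include <shared_mutex>
#include <type_traits>

template <bool shared = false>
struct TimedLockGuard
{
    using LockType = std::conditional_t<shared, std::shared_lock<std::shared_mutex>, std::unique_lock<std::shared_mutex>>;

    LockType lock;
    std::chrono::microseconds wait_time{0};

    explicit TimedLockGuard(std::shared_mutex & mutex)
    {
        const auto start = std::chrono::steady_clock::now();
        LockType l(mutex); /// blocks here, shared or exclusive depending on the template argument
        wait_time = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::steady_clock::now() - start);
        lock = std::move(l);
    }
};

/// Usage: readers take TimedLockGuard<true>, writers take TimedLockGuard<> on the same std::shared_mutex.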
@ -312,13 +320,12 @@ bool KeeperStateMachine<Storage>::preprocess(const KeeperStorageBase::RequestFor
if (op_num == Coordination::OpNum::SessionID || op_num == Coordination::OpNum::Reconfig)
return true;
LockGuardWithStats lock(storage_and_responses_lock);
if (storage->isFinalized())
return false;
try
{
LockGuardWithStats<true> lock(storage_mutex);
storage->preprocessRequest(
request_for_session.request,
request_for_session.session_id,
@ -335,7 +342,12 @@ bool KeeperStateMachine<Storage>::preprocess(const KeeperStorageBase::RequestFor
}
if (keeper_context->digestEnabled() && request_for_session.digest)
assertDigest(*request_for_session.digest, storage->getNodesDigest(false), *request_for_session.request, request_for_session.log_idx, false);
assertDigest(
*request_for_session.digest,
storage->getNodesDigest(false, /*lock_transaction_mutex=*/true),
*request_for_session.request,
request_for_session.log_idx,
false);
return true;
}
@ -343,7 +355,7 @@ bool KeeperStateMachine<Storage>::preprocess(const KeeperStorageBase::RequestFor
template<typename Storage>
void KeeperStateMachine<Storage>::reconfigure(const KeeperStorageBase::RequestForSession& request_for_session)
{
LockGuardWithStats lock(storage_and_responses_lock);
LockGuardWithStats lock(storage_mutex);
KeeperStorageBase::ResponseForSession response = processReconfiguration(request_for_session);
if (!responses_queue.push(response))
{
@ -461,7 +473,7 @@ nuraft::ptr<nuraft::buffer> KeeperStateMachine<Storage>::commit(const uint64_t l
response_for_session.response = response;
response_for_session.request = request_for_session->request;
LockGuardWithStats lock(storage_and_responses_lock);
LockGuardWithStats lock(storage_mutex);
session_id = storage->getSessionID(session_id_request.session_timeout_ms);
LOG_DEBUG(log, "Session ID response {} with timeout {}", session_id, session_id_request.session_timeout_ms);
response->session_id = session_id;
@ -472,24 +484,31 @@ nuraft::ptr<nuraft::buffer> KeeperStateMachine<Storage>::commit(const uint64_t l
if (op_num == Coordination::OpNum::Close)
{
std::lock_guard lock(request_cache_mutex);
std::lock_guard cache_lock(request_cache_mutex);
parsed_request_cache.erase(request_for_session->session_id);
}
LockGuardWithStats lock(storage_and_responses_lock);
KeeperStorageBase::ResponsesForSessions responses_for_sessions
= storage->processRequest(request_for_session->request, request_for_session->session_id, request_for_session->zxid);
for (auto & response_for_session : responses_for_sessions)
{
if (response_for_session.response->xid != Coordination::WATCH_XID)
response_for_session.request = request_for_session->request;
LockGuardWithStats<true> lock(storage_mutex);
std::lock_guard response_lock(process_and_responses_lock);
KeeperStorageBase::ResponsesForSessions responses_for_sessions
= storage->processRequest(request_for_session->request, request_for_session->session_id, request_for_session->zxid);
for (auto & response_for_session : responses_for_sessions)
{
if (response_for_session.response->xid != Coordination::WATCH_XID)
response_for_session.request = request_for_session->request;
try_push(response_for_session);
try_push(response_for_session);
}
}
if (keeper_context->digestEnabled() && request_for_session->digest)
assertDigest(*request_for_session->digest, storage->getNodesDigest(true), *request_for_session->request, request_for_session->log_idx, true);
assertDigest(
*request_for_session->digest,
storage->getNodesDigest(true, /*lock_transaction_mutex=*/true),
*request_for_session->request,
request_for_session->log_idx,
true);
}
ProfileEvents::increment(ProfileEvents::KeeperCommits);
@ -534,8 +553,6 @@ bool KeeperStateMachine<Storage>::apply_snapshot(nuraft::snapshot & s)
}
{ /// deserialize and apply snapshot to storage
LockGuardWithStats lock(storage_and_responses_lock);
SnapshotDeserializationResult<Storage> snapshot_deserialization_result;
if (latest_snapshot_ptr)
snapshot_deserialization_result = snapshot_manager.deserializeSnapshotFromBuffer(latest_snapshot_ptr);
@ -543,6 +560,7 @@ bool KeeperStateMachine<Storage>::apply_snapshot(nuraft::snapshot & s)
snapshot_deserialization_result
= snapshot_manager.deserializeSnapshotFromBuffer(snapshot_manager.deserializeSnapshotBufferFromDisk(s.get_last_log_idx()));
LockGuardWithStats storage_lock(storage_mutex);
/// maybe some logs were preprocessed with log idx larger than the snapshot idx
/// we have to apply them to the new storage
storage->applyUncommittedState(*snapshot_deserialization_result.storage, snapshot_deserialization_result.snapshot_meta->get_last_log_idx());
@ -587,16 +605,7 @@ void KeeperStateMachine<Storage>::rollbackRequest(const KeeperStorageBase::Reque
if (request_for_session.request->getOpNum() == Coordination::OpNum::SessionID)
return;
LockGuardWithStats lock(storage_and_responses_lock);
storage->rollbackRequest(request_for_session.zxid, allow_missing);
}
template<typename Storage>
void KeeperStateMachine<Storage>::rollbackRequestNoLock(const KeeperStorageBase::RequestForSession & request_for_session, bool allow_missing)
{
if (request_for_session.request->getOpNum() == Coordination::OpNum::SessionID)
return;
LockGuardWithStats lock(storage_mutex);
storage->rollbackRequest(request_for_session.zxid, allow_missing);
}
@ -616,7 +625,7 @@ void KeeperStateMachine<Storage>::create_snapshot(nuraft::snapshot & s, nuraft::
auto snapshot_meta_copy = nuraft::snapshot::deserialize(*snp_buf);
CreateSnapshotTask snapshot_task;
{ /// lock storage for a short period time to turn on "snapshot mode". After that we can read consistent storage state without locking.
LockGuardWithStats lock(storage_and_responses_lock);
LockGuardWithStats lock(storage_mutex);
snapshot_task.snapshot = std::make_shared<KeeperStorageSnapshot<Storage>>(storage.get(), snapshot_meta_copy, getClusterConfig());
}
@ -681,7 +690,7 @@ void KeeperStateMachine<Storage>::create_snapshot(nuraft::snapshot & s, nuraft::
}
{
/// Destroy snapshot with lock
LockGuardWithStats lock(storage_and_responses_lock);
LockGuardWithStats lock(storage_mutex);
LOG_TRACE(log, "Clearing garbage after snapshot");
/// Turn off "snapshot mode" and clear outdate part of storage state
storage->clearGarbageAfterSnapshot();
@ -824,10 +833,10 @@ template<typename Storage>
void KeeperStateMachine<Storage>::processReadRequest(const KeeperStorageBase::RequestForSession & request_for_session)
{
/// Pure local request, just process it with storage
LockGuardWithStats lock(storage_and_responses_lock);
LockGuardWithStats<true> storage_lock(storage_mutex);
std::lock_guard response_lock(process_and_responses_lock);
auto responses = storage->processRequest(
request_for_session.request, request_for_session.session_id, std::nullopt, true /*check_acl*/, true /*is_local*/);
for (auto & response_for_session : responses)
{
if (response_for_session.response->xid != Coordination::WATCH_XID)
@ -840,112 +849,116 @@ void KeeperStateMachine<Storage>::processReadRequest(const KeeperStorageBase::Re
template<typename Storage>
void KeeperStateMachine<Storage>::shutdownStorage()
{
LockGuardWithStats lock(storage_and_responses_lock);
LockGuardWithStats lock(storage_mutex);
storage->finalize();
}
template<typename Storage>
std::vector<int64_t> KeeperStateMachine<Storage>::getDeadSessions()
{
LockGuardWithStats lock(storage_and_responses_lock);
LockGuardWithStats lock(storage_mutex);
return storage->getDeadSessions();
}
template<typename Storage>
int64_t KeeperStateMachine<Storage>::getNextZxid() const
{
LockGuardWithStats lock(storage_and_responses_lock);
return storage->getNextZXID();
}
template<typename Storage>
KeeperStorageBase::Digest KeeperStateMachine<Storage>::getNodesDigest() const
{
LockGuardWithStats lock(storage_and_responses_lock);
return storage->getNodesDigest(false);
LockGuardWithStats lock(storage_mutex);
return storage->getNodesDigest(false, /*lock_transaction_mutex=*/true);
}
template<typename Storage>
uint64_t KeeperStateMachine<Storage>::getLastProcessedZxid() const
{
LockGuardWithStats lock(storage_and_responses_lock);
return storage->getZXID();
}
template<typename Storage>
const KeeperStorageBase::Stats & KeeperStateMachine<Storage>::getStorageStats() const TSA_NO_THREAD_SAFETY_ANALYSIS
{
return storage->getStorageStats();
}
template<typename Storage>
uint64_t KeeperStateMachine<Storage>::getNodesCount() const
{
LockGuardWithStats lock(storage_and_responses_lock);
LockGuardWithStats lock(storage_mutex);
return storage->getNodesCount();
}
template<typename Storage>
uint64_t KeeperStateMachine<Storage>::getTotalWatchesCount() const
{
LockGuardWithStats lock(storage_and_responses_lock);
LockGuardWithStats lock(storage_mutex);
return storage->getTotalWatchesCount();
}
template<typename Storage>
uint64_t KeeperStateMachine<Storage>::getWatchedPathsCount() const
{
LockGuardWithStats lock(storage_and_responses_lock);
LockGuardWithStats lock(storage_mutex);
return storage->getWatchedPathsCount();
}
template<typename Storage>
uint64_t KeeperStateMachine<Storage>::getSessionsWithWatchesCount() const
{
LockGuardWithStats lock(storage_and_responses_lock);
LockGuardWithStats lock(storage_mutex);
return storage->getSessionsWithWatchesCount();
}
template<typename Storage>
uint64_t KeeperStateMachine<Storage>::getTotalEphemeralNodesCount() const
{
LockGuardWithStats lock(storage_and_responses_lock);
LockGuardWithStats lock(storage_mutex);
return storage->getTotalEphemeralNodesCount();
}
template<typename Storage>
uint64_t KeeperStateMachine<Storage>::getSessionWithEphemeralNodesCount() const
{
LockGuardWithStats lock(storage_and_responses_lock);
LockGuardWithStats lock(storage_mutex);
return storage->getSessionWithEphemeralNodesCount();
}
template<typename Storage>
void KeeperStateMachine<Storage>::dumpWatches(WriteBufferFromOwnString & buf) const
{
LockGuardWithStats lock(storage_and_responses_lock);
LockGuardWithStats lock(storage_mutex);
storage->dumpWatches(buf);
}
template<typename Storage>
void KeeperStateMachine<Storage>::dumpWatchesByPath(WriteBufferFromOwnString & buf) const
{
LockGuardWithStats lock(storage_and_responses_lock);
LockGuardWithStats lock(storage_mutex);
storage->dumpWatchesByPath(buf);
}
template<typename Storage>
void KeeperStateMachine<Storage>::dumpSessionsAndEphemerals(WriteBufferFromOwnString & buf) const
{
LockGuardWithStats lock(storage_and_responses_lock);
LockGuardWithStats lock(storage_mutex);
storage->dumpSessionsAndEphemerals(buf);
}
template<typename Storage>
uint64_t KeeperStateMachine<Storage>::getApproximateDataSize() const
{
LockGuardWithStats lock(storage_and_responses_lock);
LockGuardWithStats lock(storage_mutex);
return storage->getApproximateDataSize();
}
template<typename Storage>
uint64_t KeeperStateMachine<Storage>::getKeyArenaSize() const
{
LockGuardWithStats lock(storage_and_responses_lock);
LockGuardWithStats lock(storage_mutex);
return storage->getArenaDataSize();
}
@ -988,7 +1001,7 @@ ClusterConfigPtr IKeeperStateMachine::getClusterConfig() const
template<typename Storage>
void KeeperStateMachine<Storage>::recalculateStorageStats()
{
LockGuardWithStats lock(storage_and_responses_lock);
LockGuardWithStats lock(storage_mutex);
LOG_INFO(log, "Recalculating storage stats");
storage->recalculateStats();
LOG_INFO(log, "Done recalculating storage stats");


@ -85,6 +85,8 @@ public:
/// Introspection functions for 4lw commands
virtual uint64_t getLastProcessedZxid() const = 0;
virtual const KeeperStorageBase::Stats & getStorageStats() const = 0;
virtual uint64_t getNodesCount() const = 0;
virtual uint64_t getTotalWatchesCount() const = 0;
virtual uint64_t getWatchedPathsCount() const = 0;
@ -124,12 +126,16 @@ protected:
/// Mutex for snapshots
mutable std::mutex snapshots_lock;
/// Lock for storage and responses_queue. It's important to process requests
/// Lock for the storage
/// Storage works in a thread-safe way ONLY for preprocessing/processing.
/// In any other case, a unique storage lock needs to be taken
mutable SharedMutex storage_mutex;
/// Lock for processing and responses_queue. It's important to process requests
/// and push them to the responses queue while holding this lock. Otherwise
/// we can get strange cases when, for example, a client sends a read request with
/// a watch and receives the watch notification before the response for the
/// request itself.
mutable std::mutex storage_and_responses_lock;
mutable std::mutex process_and_responses_lock;
std::unordered_map<int64_t, std::unordered_map<Coordination::XID, std::shared_ptr<KeeperStorageBase::RequestForSession>>> parsed_request_cache;
uint64_t min_request_size_to_cache{0};
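A minimal sketch of the locking order those comments describe (names are illustrative, types simplified): reads hold the storage mutex shared so they can run concurrently, while a separate exclusive lock serializes processing a request and pushing its responses, so a client can never observe a watch notification before the response that registered the watch.

#include <deque>
#include <mutex>
#include <shared_mutex>
#include <string>

struct Response { std::string payload; };

std::shared_mutex storage_mutex;          /// guards the storage itself
std::mutex process_and_responses_lock;    /// guards processing + pushing to the responses queue
std::deque<Response> responses_queue;

void processReadRequest(const std::string & request)
{
    std::shared_lock storage_lock(storage_mutex);               /// readers run in parallel
    std::lock_guard response_lock(process_and_responses_lock);  /// one request's responses are pushed atomically
    responses_queue.push_back(Response{"response for " + request});
    responses_queue.push_back(Response{"watch triggered by " + request});
}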
@ -146,6 +152,7 @@ protected:
mutable std::mutex cluster_config_lock;
ClusterConfigPtr cluster_config;
ThreadPool read_pool;
/// Special part of ACL system -- superdigest specified in server config.
const std::string superdigest;
@ -153,10 +160,8 @@ protected:
KeeperSnapshotManagerS3 * snapshot_manager_s3;
virtual KeeperStorageBase::ResponseForSession processReconfiguration(
const KeeperStorageBase::RequestForSession& request_for_session)
TSA_REQUIRES(storage_and_responses_lock) = 0;
virtual KeeperStorageBase::ResponseForSession processReconfiguration(const KeeperStorageBase::RequestForSession & request_for_session)
= 0;
};
/// ClickHouse Keeper state machine. Wrapper for KeeperStorage.
@ -189,10 +194,6 @@ public:
// (can happen in case of exception during preprocessing)
void rollbackRequest(const KeeperStorageBase::RequestForSession & request_for_session, bool allow_missing) override;
void rollbackRequestNoLock(
const KeeperStorageBase::RequestForSession & request_for_session,
bool allow_missing) TSA_NO_THREAD_SAFETY_ANALYSIS;
/// Apply preliminarily saved (save_logical_snp_obj) snapshot to our state.
bool apply_snapshot(nuraft::snapshot & s) override;
@ -205,7 +206,7 @@ public:
// This should be used only for tests or keeper-data-dumper because it violates
// TSA -- we can't acquire the lock outside of this class or return a storage under lock
// in a reasonable way.
Storage & getStorageUnsafe() TSA_NO_THREAD_SAFETY_ANALYSIS
Storage & getStorageUnsafe()
{
return *storage;
}
@ -224,6 +225,8 @@ public:
/// Introspection functions for 4lw commands
uint64_t getLastProcessedZxid() const override;
const KeeperStorageBase::Stats & getStorageStats() const override;
uint64_t getNodesCount() const override;
uint64_t getTotalWatchesCount() const override;
uint64_t getWatchedPathsCount() const override;
@ -245,12 +248,12 @@ public:
private:
/// Main state machine logic
std::unique_ptr<Storage> storage; //TSA_PT_GUARDED_BY(storage_and_responses_lock);
std::unique_ptr<Storage> storage;
/// Save/Load and Serialize/Deserialize logic for snapshots.
KeeperSnapshotManager<Storage> snapshot_manager;
KeeperStorageBase::ResponseForSession processReconfiguration(const KeeperStorageBase::RequestForSession & request_for_session)
TSA_REQUIRES(storage_and_responses_lock) override;
KeeperStorageBase::ResponseForSession processReconfiguration(const KeeperStorageBase::RequestForSession & request_for_session) override;
};
}

File diff suppressed because it is too large


@ -1,10 +1,16 @@
#pragma once
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include <Coordination/ACLMap.h>
#include <Coordination/SessionExpiryQueue.h>
#include <Coordination/SnapshotableHashTable.h>
#include "Common/StringHashForHeterogeneousLookup.h"
#include <Common/SharedMutex.h>
#include <Common/Concepts.h>
#include <base/defines.h>
#include <absl/container/flat_hash_set.h>
@ -23,14 +29,11 @@ using ResponseCallback = std::function<void(const Coordination::ZooKeeperRespons
using ChildrenSet = absl::flat_hash_set<StringRef, StringRefHash>;
using SessionAndTimeout = std::unordered_map<int64_t, int64_t>;
/// KeeperRocksNodeInfo is used in RocksDB keeper.
/// It is serialized directly as POD to RocksDB.
struct KeeperRocksNodeInfo
struct NodeStats
{
int64_t czxid{0};
int64_t mzxid{0};
int64_t pzxid{0};
uint64_t acl_id = 0; /// 0 -- no ACL by default
int64_t mtime{0};
@ -38,225 +41,9 @@ struct KeeperRocksNodeInfo
int32_t cversion{0};
int32_t aversion{0};
int32_t seq_num = 0;
mutable UInt64 digest = 0; /// we cached digest for this node.
/// as ctime can't be negative because it stores the timestamp when the
/// node was created, we can use the MSB for a bool
struct
{
bool is_ephemeral : 1;
int64_t ctime : 63;
} is_ephemeral_and_ctime{false, 0};
/// ephemeral nodes cannot have children so a node can set either
/// ephemeral_owner OR seq_num + num_children
union
{
int64_t ephemeral_owner;
struct
{
int32_t seq_num;
int32_t num_children;
} children_info;
} ephemeral_or_children_data{0};
bool isEphemeral() const
{
return is_ephemeral_and_ctime.is_ephemeral;
}
int64_t ephemeralOwner() const
{
if (isEphemeral())
return ephemeral_or_children_data.ephemeral_owner;
return 0;
}
void setEphemeralOwner(int64_t ephemeral_owner)
{
is_ephemeral_and_ctime.is_ephemeral = ephemeral_owner != 0;
ephemeral_or_children_data.ephemeral_owner = ephemeral_owner;
}
int32_t numChildren() const
{
if (isEphemeral())
return 0;
return ephemeral_or_children_data.children_info.num_children;
}
void setNumChildren(int32_t num_children)
{
ephemeral_or_children_data.children_info.num_children = num_children;
}
/// dummy interface for test
void addChild(StringRef) {}
auto getChildren() const
{
return std::vector<int>(numChildren());
}
void increaseNumChildren()
{
chassert(!isEphemeral());
++ephemeral_or_children_data.children_info.num_children;
}
void decreaseNumChildren()
{
chassert(!isEphemeral());
--ephemeral_or_children_data.children_info.num_children;
}
int32_t seqNum() const
{
if (isEphemeral())
return 0;
return ephemeral_or_children_data.children_info.seq_num;
}
void setSeqNum(int32_t seq_num_)
{
ephemeral_or_children_data.children_info.seq_num = seq_num_;
}
void increaseSeqNum()
{
chassert(!isEphemeral());
++ephemeral_or_children_data.children_info.seq_num;
}
int64_t ctime() const
{
return is_ephemeral_and_ctime.ctime;
}
void setCtime(uint64_t ctime)
{
is_ephemeral_and_ctime.ctime = ctime;
}
uint32_t data_size{0};
void copyStats(const Coordination::Stat & stat);
};
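The bit-packing above is the main space optimization NodeStats keeps: ctime is never negative, so its sign bit doubles as the is_ephemeral flag, and because ephemeral nodes cannot have children, ephemeral_owner shares its 8 bytes with {seq_num, num_children}. A standalone sketch of the same layout (illustrative names, not the production type):

#include <cstdint>
#include <iostream>

struct PackedNodeInfo
{
    /// Sign bit of ctime reused as the ephemeral flag.
    struct
    {
        bool is_ephemeral : 1;
        int64_t ctime : 63;
    } is_ephemeral_and_ctime{false, 0};

    /// Ephemeral nodes have no children, so the two uses share one word.
    union
    {
        int64_t ephemeral_owner;
        struct
        {
            int32_t seq_num;
            int32_t num_children;
        } children_info;
    } ephemeral_or_children_data{0};

    int64_t ephemeralOwner() const
    {
        return is_ephemeral_and_ctime.is_ephemeral ? ephemeral_or_children_data.ephemeral_owner : 0;
    }

    int32_t numChildren() const
    {
        return is_ephemeral_and_ctime.is_ephemeral ? 0 : ephemeral_or_children_data.children_info.num_children;
    }
};

int main()
{
    PackedNodeInfo node;
    node.is_ephemeral_and_ctime.ctime = 1726600000000; /// millisecond timestamp fits in 63 bits
    node.ephemeral_or_children_data.children_info.num_children = 3;
    std::cout << node.numChildren() << ' ' << node.ephemeralOwner() << '\n'; /// prints "3 0"
    std::cout << sizeof(PackedNodeInfo) << '\n'; /// two 8-byte words on common 64-bit ABIs
}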
/// KeeperRocksNode is the memory structure used by RocksDB
struct KeeperRocksNode : public KeeperRocksNodeInfo
{
#if USE_ROCKSDB
friend struct RocksDBContainer<KeeperRocksNode>;
#endif
using Meta = KeeperRocksNodeInfo;
uint64_t size_bytes = 0; // only for compatibility, should be deprecated
uint64_t sizeInBytes() const { return data_size + sizeof(KeeperRocksNodeInfo); }
void setData(String new_data)
{
data_size = static_cast<uint32_t>(new_data.size());
if (data_size != 0)
{
data = std::unique_ptr<char[]>(new char[new_data.size()]);
memcpy(data.get(), new_data.data(), data_size);
}
}
void shallowCopy(const KeeperRocksNode & other)
{
czxid = other.czxid;
mzxid = other.mzxid;
pzxid = other.pzxid;
acl_id = other.acl_id; /// 0 -- no ACL by default
mtime = other.mtime;
is_ephemeral_and_ctime = other.is_ephemeral_and_ctime;
ephemeral_or_children_data = other.ephemeral_or_children_data;
data_size = other.data_size;
if (data_size != 0)
{
data = std::unique_ptr<char[]>(new char[data_size]);
memcpy(data.get(), other.data.get(), data_size);
}
version = other.version;
cversion = other.cversion;
aversion = other.aversion;
/// cached_digest = other.cached_digest;
}
void invalidateDigestCache() const;
UInt64 getDigest(std::string_view path) const;
String getEncodedString();
void decodeFromString(const String & buffer_str);
void recalculateSize() {}
std::string_view getData() const noexcept { return {data.get(), data_size}; }
void setResponseStat(Coordination::Stat & response_stat) const
{
response_stat.czxid = czxid;
response_stat.mzxid = mzxid;
response_stat.ctime = ctime();
response_stat.mtime = mtime;
response_stat.version = version;
response_stat.cversion = cversion;
response_stat.aversion = aversion;
response_stat.ephemeralOwner = ephemeralOwner();
response_stat.dataLength = static_cast<int32_t>(data_size);
response_stat.numChildren = numChildren();
response_stat.pzxid = pzxid;
}
void reset()
{
serialized = false;
}
bool empty() const
{
return data_size == 0 && mzxid == 0;
}
std::unique_ptr<char[]> data{nullptr};
uint32_t data_size{0};
private:
bool serialized = false;
};
/// KeeperMemNode should be as small as possible to reduce the memory footprint
/// of stored nodes
/// New fields should be added to the struct only if it's really necessary
struct KeeperMemNode
{
int64_t czxid{0};
int64_t mzxid{0};
int64_t pzxid{0};
uint64_t acl_id = 0; /// 0 -- no ACL by default
int64_t mtime{0};
std::unique_ptr<char[]> data{nullptr};
uint32_t data_size{0};
int32_t version{0};
int32_t cversion{0};
int32_t aversion{0};
mutable uint64_t cached_digest = 0;
KeeperMemNode() = default;
KeeperMemNode & operator=(const KeeperMemNode & other);
KeeperMemNode(const KeeperMemNode & other);
KeeperMemNode & operator=(KeeperMemNode && other) noexcept;
KeeperMemNode(KeeperMemNode && other) noexcept;
bool empty() const;
bool isEphemeral() const
{
@ -287,6 +74,7 @@ struct KeeperMemNode
void setNumChildren(int32_t num_children)
{
is_ephemeral_and_ctime.is_ephemeral = false;
ephemeral_or_children_data.children_info.num_children = num_children;
}
@ -331,34 +119,6 @@ struct KeeperMemNode
is_ephemeral_and_ctime.ctime = ctime;
}
void copyStats(const Coordination::Stat & stat);
void setResponseStat(Coordination::Stat & response_stat) const;
/// Object memory size
uint64_t sizeInBytes() const;
void setData(const String & new_data);
std::string_view getData() const noexcept { return {data.get(), data_size}; }
void addChild(StringRef child_path);
void removeChild(StringRef child_path);
const auto & getChildren() const noexcept { return children; }
auto & getChildren() { return children; }
// Invalidate the calculated digest so it's recalculated again on the next
// getDigest call
void invalidateDigestCache() const;
// get the calculated digest of the node
UInt64 getDigest(std::string_view path) const;
// copy only necessary information for preprocessing and digest calculation
// (e.g. we don't need to copy list of children)
void shallowCopy(const KeeperMemNode & other);
private:
/// as ctime can't be negative because it stores the timestamp when the
/// node was created, we can use the MSB for a bool
@ -379,7 +139,132 @@ private:
int32_t num_children;
} children_info;
} ephemeral_or_children_data{0};
};
/// KeeperRocksNodeInfo is used in RocksDB keeper.
/// It is serialized directly as POD to RocksDB.
struct KeeperRocksNodeInfo
{
NodeStats stats;
uint64_t acl_id = 0; /// 0 -- no ACL by default
/// dummy interface for test
void addChild(StringRef) {}
auto getChildren() const
{
return std::vector<int>(stats.numChildren());
}
void copyStats(const Coordination::Stat & stat);
};
/// KeeperRocksNode is the memory structure used by RocksDB
struct KeeperRocksNode : public KeeperRocksNodeInfo
{
#if USE_ROCKSDB
friend struct RocksDBContainer<KeeperRocksNode>;
#endif
using Meta = KeeperRocksNodeInfo;
uint64_t size_bytes = 0; // only for compatibility, should be deprecated
uint64_t sizeInBytes() const { return stats.data_size + sizeof(KeeperRocksNodeInfo); }
void setData(String new_data)
{
stats.data_size = static_cast<uint32_t>(new_data.size());
if (stats.data_size != 0)
{
data = std::unique_ptr<char[]>(new char[new_data.size()]);
memcpy(data.get(), new_data.data(), stats.data_size);
}
}
void shallowCopy(const KeeperRocksNode & other)
{
stats = other.stats;
acl_id = other.acl_id;
if (stats.data_size != 0)
{
data = std::unique_ptr<char[]>(new char[stats.data_size]);
memcpy(data.get(), other.data.get(), stats.data_size);
}
/// cached_digest = other.cached_digest;
}
void invalidateDigestCache() const;
UInt64 getDigest(std::string_view path) const;
String getEncodedString();
void decodeFromString(const String & buffer_str);
void recalculateSize() {}
std::string_view getData() const noexcept { return {data.get(), stats.data_size}; }
void setResponseStat(Coordination::Stat & response_stat) const;
void reset()
{
serialized = false;
}
bool empty() const
{
return stats.data_size == 0 && stats.mzxid == 0;
}
std::unique_ptr<char[]> data{nullptr};
mutable UInt64 cached_digest = 0; /// we cached digest for this node.
private:
bool serialized = false;
};
/// KeeperMemNode should be as small as possible to reduce the memory footprint
/// of stored nodes
/// New fields should be added to the struct only if it's really necessary
struct KeeperMemNode
{
NodeStats stats;
std::unique_ptr<char[]> data{nullptr};
mutable uint64_t cached_digest = 0;
uint64_t acl_id = 0; /// 0 -- no ACL by default
KeeperMemNode() = default;
KeeperMemNode & operator=(const KeeperMemNode & other);
KeeperMemNode(const KeeperMemNode & other);
KeeperMemNode & operator=(KeeperMemNode && other) noexcept;
KeeperMemNode(KeeperMemNode && other) noexcept;
bool empty() const;
void copyStats(const Coordination::Stat & stat);
void setResponseStat(Coordination::Stat & response_stat) const;
/// Object memory size
uint64_t sizeInBytes() const;
void setData(const String & new_data);
std::string_view getData() const noexcept { return {data.get(), stats.data_size}; }
void addChild(StringRef child_path);
void removeChild(StringRef child_path);
const auto & getChildren() const noexcept { return children; }
auto & getChildren() { return children; }
// Invalidate the calculated digest so it's recalculated again on the next
// getDigest call
void invalidateDigestCache() const;
// get the calculated digest of the node
UInt64 getDigest(std::string_view path) const;
// copy only necessary information for preprocessing and digest calculation
// (e.g. we don't need to copy list of children)
void shallowCopy(const KeeperMemNode & other);
private:
ChildrenSet children{};
};
@ -430,18 +315,187 @@ public:
};
using Ephemerals = std::unordered_map<int64_t, std::unordered_set<std::string>>;
using SessionAndWatcher = std::unordered_map<int64_t, std::unordered_set<std::string>>;
struct WatchInfo
{
std::string_view path;
bool is_list_watch;
bool operator==(const WatchInfo &) const = default;
};
struct WatchInfoHash
{
auto operator()(WatchInfo info) const
{
SipHash hash;
hash.update(info.path);
hash.update(info.is_list_watch);
return hash.get64();
}
};
using SessionAndWatcher = std::unordered_map<int64_t, std::unordered_set<WatchInfo, WatchInfoHash>>;
using SessionIDs = std::unordered_set<int64_t>;
/// Just vector of SHA1 from user:password
using AuthIDs = std::vector<AuthID>;
using SessionAndAuth = std::unordered_map<int64_t, AuthIDs>;
using Watches = std::unordered_map<String /* path, relative of root_path */, SessionIDs>;
using Watches = std::unordered_map<
String /* path, relative of root_path */,
SessionIDs,
StringHashForHeterogeneousLookup,
StringHashForHeterogeneousLookup::transparent_key_equal>;
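Two container tweaks appear here: sessions_and_watchers now keys an unordered_set by a small (path, is_list_watch) struct, which needs operator== and a hasher, and Watches gets a transparent hash/equal so it can be probed with std::string_view without materializing a String. A standalone sketch of both ideas, with std::hash standing in for SipHash and StringHashForHeterogeneousLookup:

#include <cstddef>
#include <cstdint>
#include <functional>
#include <string>
#include <string_view>
#include <unordered_map>
#include <unordered_set>

struct WatchKey
{
    std::string_view path;
    bool is_list_watch;
    bool operator==(const WatchKey &) const = default;
};

struct WatchKeyHash
{
    size_t operator()(const WatchKey & key) const
    {
        size_t h = std::hash<std::string_view>{}(key.path);
        /// simple hash combine; the real code feeds both fields into SipHash
        return h ^ (std::hash<bool>{}(key.is_list_watch) + 0x9e3779b97f4a7c15ULL + (h << 6) + (h >> 2));
    }
};

using WatchesPerSession = std::unordered_map<int64_t, std::unordered_set<WatchKey, WatchKeyHash>>;

struct TransparentStringHash
{
    using is_transparent = void; /// enables heterogeneous find/contains (C++20)
    size_t operator()(std::string_view s) const { return std::hash<std::string_view>{}(s); }
};

using SessionsForPath = std::unordered_set<int64_t>;
using WatchesByPath = std::unordered_map<std::string, SessionsForPath, TransparentStringHash, std::equal_to<>>;

/// Usage: watches.find(std::string_view{"/some/path"}) avoids building a temporary std::string key.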
// Applying ZooKeeper request to storage consists of two steps:
// - preprocessing which, instead of applying the changes directly to storage,
// generates deltas with those changes, denoted with the request ZXID
// - processing which applies deltas with the correct ZXID to the storage
//
// Delta objects allow us two things:
// - fetch the latest, uncommitted state of an object by getting the committed
// state of that same object from the storage and applying the deltas
// in the same order as they are defined
// - quickly commit the changes to the storage
struct CreateNodeDelta
{
Coordination::Stat stat;
Coordination::ACLs acls;
String data;
};
struct RemoveNodeDelta
{
int32_t version{-1};
NodeStats stat;
Coordination::ACLs acls;
String data;
};
struct UpdateNodeStatDelta
{
template <is_any_of<KeeperMemNode, KeeperRocksNode> Node>
explicit UpdateNodeStatDelta(const Node & node)
: old_stats(node.stats)
, new_stats(node.stats)
{}
NodeStats old_stats;
NodeStats new_stats;
int32_t version{-1};
};
struct UpdateNodeDataDelta
{
std::string old_data;
std::string new_data;
int32_t version{-1};
};
struct SetACLDelta
{
Coordination::ACLs old_acls;
Coordination::ACLs new_acls;
int32_t version{-1};
};
struct ErrorDelta
{
Coordination::Error error;
};
struct FailedMultiDelta
{
std::vector<Coordination::Error> error_codes;
Coordination::Error global_error{Coordination::Error::ZOK};
};
// Denotes end of a subrequest in multi request
struct SubDeltaEnd
{
};
struct AddAuthDelta
{
int64_t session_id;
std::shared_ptr<AuthID> auth_id;
};
struct CloseSessionDelta
{
int64_t session_id;
};
using Operation = std::variant<
CreateNodeDelta,
RemoveNodeDelta,
UpdateNodeStatDelta,
UpdateNodeDataDelta,
SetACLDelta,
AddAuthDelta,
ErrorDelta,
SubDeltaEnd,
FailedMultiDelta,
CloseSessionDelta>;
struct Delta
{
Delta(String path_, int64_t zxid_, Operation operation_) : path(std::move(path_)), zxid(zxid_), operation(std::move(operation_)) { }
Delta(int64_t zxid_, Coordination::Error error) : Delta("", zxid_, ErrorDelta{error}) { }
Delta(int64_t zxid_, Operation subdelta) : Delta("", zxid_, subdelta) { }
String path;
int64_t zxid;
Operation operation;
};
using DeltaIterator = std::list<KeeperStorageBase::Delta>::const_iterator;
struct DeltaRange
{
DeltaIterator begin_it;
DeltaIterator end_it;
auto begin() const
{
return begin_it;
}
auto end() const
{
return end_it;
}
bool empty() const
{
return begin_it == end_it;
}
const auto & front() const
{
return *begin_it;
}
};
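DeltaRange above is a cheap, non-owning view over a contiguous run of deltas inside the shared std::list, so commit and rollback can walk just one transaction's deltas without copying or splicing the list. A standalone sketch of the idea (Delta reduced to two fields for illustration):

#include <cstdint>
#include <iostream>
#include <iterator>
#include <list>
#include <string>

struct Delta { std::string path; int64_t zxid; };

struct DeltaRange
{
    std::list<Delta>::const_iterator begin_it;
    std::list<Delta>::const_iterator end_it;

    auto begin() const { return begin_it; }
    auto end() const { return end_it; }
    bool empty() const { return begin_it == end_it; }
};

int main()
{
    std::list<Delta> deltas{{"/a", 1}, {"/b", 2}, {"/b", 3}};
    /// View over the deltas with zxid >= 2 only; the list itself is untouched.
    DeltaRange range{std::next(deltas.begin()), deltas.end()};
    for (const auto & delta : range)
        std::cout << delta.path << ' ' << delta.zxid << '\n';
}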
struct Stats
{
std::atomic<uint64_t> nodes_count = 0;
std::atomic<uint64_t> approximate_data_size = 0;
std::atomic<uint64_t> total_watches_count = 0;
std::atomic<uint64_t> watched_paths_count = 0;
std::atomic<uint64_t> sessions_with_watches_count = 0;
std::atomic<uint64_t> session_with_ephemeral_nodes_count = 0;
std::atomic<uint64_t> total_emphemeral_nodes_count = 0;
std::atomic<int64_t> last_zxid = 0;
};
Stats stats;
static bool checkDigest(const Digest & first, const Digest & second);
};
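The Stats block is what the 4lw commands and asynchronous metrics read without taking the storage lock: the counters are monitoring-only, so both writers and readers use std::memory_order_relaxed — no ordering guarantees, just eventually visible values. A tiny sketch of that pattern (illustrative names):

#include <atomic>
#include <cstdint>

struct MonitoringCounters
{
    std::atomic<uint64_t> nodes_count{0};
    std::atomic<int64_t> last_zxid{0};
};

/// Writer side: bump counters with relaxed ordering, e.g. after committing a create.
void onNodeCreated(MonitoringCounters & counters, int64_t zxid)
{
    counters.nodes_count.fetch_add(1, std::memory_order_relaxed);
    counters.last_zxid.store(zxid, std::memory_order_relaxed);
}

/// Reader side (4lw command / metrics thread): relaxed loads, no lock required.
uint64_t reportNodeCount(const MonitoringCounters & counters)
{
    return counters.nodes_count.load(std::memory_order_relaxed);
}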
/// Keeper state machine, almost equal to ZooKeeper's state machine.
/// Implements all logic of operations, data changes, sessions allocation.
/// In-memory and not thread safe.
@ -472,143 +526,49 @@ public:
int64_t session_id_counter{1};
SessionAndAuth session_and_auth;
mutable SharedMutex auth_mutex;
SessionAndAuth committed_session_and_auth;
mutable SharedMutex storage_mutex;
/// Main hashtable with nodes. Contain all information about data.
/// All other structures except session_and_timeout can be restored from
/// container.
Container container;
// Applying ZooKeeper request to storage consists of two steps:
// - preprocessing which, instead of applying the changes directly to storage,
// generates deltas with those changes, denoted with the request ZXID
// - processing which applies deltas with the correct ZXID to the storage
//
// Delta objects allow us two things:
// - fetch the latest, uncommitted state of an object by getting the committed
// state of that same object from the storage and applying the deltas
// in the same order as they are defined
// - quickly commit the changes to the storage
struct CreateNodeDelta
{
Coordination::Stat stat;
Coordination::ACLs acls;
String data;
};
struct RemoveNodeDelta
{
int32_t version{-1};
int64_t ephemeral_owner{0};
};
struct UpdateNodeDelta
{
std::function<void(Node &)> update_fn;
int32_t version{-1};
};
struct SetACLDelta
{
Coordination::ACLs acls;
int32_t version{-1};
};
struct ErrorDelta
{
Coordination::Error error;
};
struct FailedMultiDelta
{
std::vector<Coordination::Error> error_codes;
Coordination::Error global_error{Coordination::Error::ZOK};
};
// Denotes end of a subrequest in multi request
struct SubDeltaEnd
{
};
struct AddAuthDelta
{
int64_t session_id;
AuthID auth_id;
};
struct CloseSessionDelta
{
int64_t session_id;
};
using Operation = std::
variant<CreateNodeDelta, RemoveNodeDelta, UpdateNodeDelta, SetACLDelta, AddAuthDelta, ErrorDelta, SubDeltaEnd, FailedMultiDelta, CloseSessionDelta>;
struct Delta
{
Delta(String path_, int64_t zxid_, Operation operation_) : path(std::move(path_)), zxid(zxid_), operation(std::move(operation_)) { }
Delta(int64_t zxid_, Coordination::Error error) : Delta("", zxid_, ErrorDelta{error}) { }
Delta(int64_t zxid_, Operation subdelta) : Delta("", zxid_, subdelta) { }
String path;
int64_t zxid;
Operation operation;
};
struct UncommittedState
{
explicit UncommittedState(KeeperStorage & storage_) : storage(storage_) { }
void addDelta(Delta new_delta);
void addDeltas(std::vector<Delta> new_deltas);
void commit(int64_t commit_zxid);
void addDeltas(std::list<Delta> new_deltas);
void cleanup(int64_t commit_zxid);
void rollback(int64_t rollback_zxid);
void rollback(std::list<Delta> rollback_deltas);
std::shared_ptr<Node> getNode(StringRef path) const;
std::shared_ptr<Node> getNode(StringRef path, bool should_lock_storage = true) const;
const Node * getActualNodeView(StringRef path, const Node & storage_node) const;
Coordination::ACLs getACLs(StringRef path) const;
void applyDeltas(const std::list<Delta> & new_deltas);
void applyDelta(const Delta & delta);
void rollbackDelta(const Delta & delta);
bool hasACL(int64_t session_id, bool is_local, std::function<bool(const AuthID &)> predicate) const;
void forEachAuthInSession(int64_t session_id, std::function<void(const AuthID &)> func) const;
std::shared_ptr<Node> tryGetNodeFromStorage(StringRef path) const;
std::shared_ptr<Node> tryGetNodeFromStorage(StringRef path, bool should_lock_storage = true) const;
std::unordered_map<int64_t, std::list<const AuthID *>> session_and_auth;
std::unordered_set<int64_t> closed_sessions;
using ZxidToNodes = std::map<int64_t, std::unordered_set<std::string_view>>;
struct UncommittedNode
{
std::shared_ptr<Node> node{nullptr};
Coordination::ACLs acls{};
int64_t zxid{0};
};
std::optional<Coordination::ACLs> acls{};
std::unordered_set<uint64_t> applied_zxids{};
struct Hash
{
auto operator()(const std::string_view view) const
{
SipHash hash;
hash.update(view);
return hash.get64();
}
using is_transparent = void; // required to make find() work with different type than key_type
};
struct Equal
{
auto operator()(const std::string_view a,
const std::string_view b) const
{
return a == b;
}
using is_transparent = void; // required to make find() work with different type than key_type
void materializeACL(const ACLMap & current_acl_map);
};
struct PathCmp
@ -624,10 +584,15 @@ public:
using is_transparent = void; // required to make find() work with different type than key_type
};
mutable std::map<std::string, UncommittedNode, PathCmp> nodes;
std::unordered_map<std::string, std::list<const Delta *>, Hash, Equal> deltas_for_path;
Ephemerals ephemerals;
std::list<Delta> deltas;
std::unordered_map<int64_t, std::list<std::pair<int64_t, std::shared_ptr<AuthID>>>> session_and_auth;
mutable std::map<std::string, UncommittedNode, PathCmp> nodes;
mutable ZxidToNodes zxid_to_nodes;
mutable std::mutex deltas_mutex;
std::list<Delta> deltas TSA_GUARDED_BY(deltas_mutex);
KeeperStorage<Container> & storage;
};
@ -637,7 +602,7 @@ public:
// with zxid > last_zxid
void applyUncommittedState(KeeperStorage & other, int64_t last_log_idx);
Coordination::Error commit(int64_t zxid);
Coordination::Error commit(DeltaRange deltas);
// Create node in the storage
// Returns false if it failed to create the node, true otherwise
@ -655,12 +620,11 @@ public:
bool checkACL(StringRef path, int32_t permissions, int64_t session_id, bool is_local);
void unregisterEphemeralPath(int64_t session_id, const std::string & path);
std::mutex ephemeral_mutex;
/// Mapping session_id -> set of ephemeral nodes paths
Ephemerals ephemerals;
/// Mapping session_id -> set of watched nodes paths
SessionAndWatcher sessions_and_watchers;
Ephemerals committed_ephemerals;
size_t committed_ephemeral_nodes{0};
/// Expiration queue for session, allows to get dead sessions at some point of time
SessionExpiryQueue session_expiry_queue;
/// All active sessions with timeout
@ -669,8 +633,10 @@ public:
/// ACLMap for more compact ACLs storage inside nodes.
ACLMap acl_map;
mutable std::mutex transaction_mutex;
/// Global id of all requests applied to storage
int64_t zxid{0};
int64_t zxid TSA_GUARDED_BY(transaction_mutex) = 0;
// older Keeper node (pre V5 snapshots) can create snapshots and receive logs from newer Keeper nodes
// this can lead to some inconsistencies, e.g. from snapshot it will use log_idx as zxid
@ -687,11 +653,16 @@ public:
int64_t log_idx = 0;
};
std::deque<TransactionInfo> uncommitted_transactions;
std::list<TransactionInfo> uncommitted_transactions TSA_GUARDED_BY(transaction_mutex);
uint64_t nodes_digest{0};
uint64_t nodes_digest = 0;
bool finalized{false};
std::atomic<bool> finalized{false};
/// Mapping session_id -> set of watched nodes paths
SessionAndWatcher sessions_and_watchers;
size_t total_watches_count = 0;
/// Currently active watches (node_path -> subscribed sessions)
Watches watches;
@ -700,45 +671,30 @@ public:
void clearDeadWatches(int64_t session_id);
/// Get current committed zxid
int64_t getZXID() const { return zxid; }
int64_t getZXID() const;
int64_t getNextZXID() const
{
if (uncommitted_transactions.empty())
return zxid + 1;
int64_t getNextZXID() const;
int64_t getNextZXIDLocked() const TSA_REQUIRES(transaction_mutex);
return uncommitted_transactions.back().zxid + 1;
}
Digest getNodesDigest(bool committed) const;
Digest getNodesDigest(bool committed, bool lock_transaction_mutex) const;
KeeperContextPtr keeper_context;
const String superdigest;
bool initialized{false};
std::atomic<bool> initialized{false};
KeeperStorage(int64_t tick_time_ms, const String & superdigest_, const KeeperContextPtr & keeper_context_, bool initialize_system_nodes = true);
void initializeSystemNodes();
void initializeSystemNodes() TSA_NO_THREAD_SAFETY_ANALYSIS;
/// Allocate new session id with the specified timeouts
int64_t getSessionID(int64_t session_timeout_ms)
{
auto result = session_id_counter++;
session_and_timeout.emplace(result, session_timeout_ms);
session_expiry_queue.addNewSessionOrUpdate(result, session_timeout_ms);
return result;
}
int64_t getSessionID(int64_t session_timeout_ms);
/// Add session id. Used when restoring KeeperStorage from snapshot.
void addSessionID(int64_t session_id, int64_t session_timeout_ms)
{
session_and_timeout.emplace(session_id, session_timeout_ms);
session_expiry_queue.addNewSessionOrUpdate(session_id, session_timeout_ms);
}
void addSessionID(int64_t session_id, int64_t session_timeout_ms) TSA_NO_THREAD_SAFETY_ANALYSIS;
UInt64 calculateNodesDigest(UInt64 current_digest, const std::vector<Delta> & new_deltas) const;
UInt64 calculateNodesDigest(UInt64 current_digest, const std::list<Delta> & new_deltas) const;
/// Process user request and return response.
/// check_acl = false only when converting data from ZooKeeper.
@ -765,42 +721,39 @@ public:
/// Set of methods for creating snapshots
/// Turn on snapshot mode, so data inside Container is not deleted, but replaced with new version.
void enableSnapshotMode(size_t up_to_version)
{
container.enableSnapshotMode(up_to_version);
}
void enableSnapshotMode(size_t up_to_version);
/// Turn off snapshot mode.
void disableSnapshotMode()
{
container.disableSnapshotMode();
}
void disableSnapshotMode();
Container::const_iterator getSnapshotIteratorBegin() const { return container.begin(); }
Container::const_iterator getSnapshotIteratorBegin() const;
/// Clear outdated data from internal container.
void clearGarbageAfterSnapshot() { container.clearOutdatedNodes(); }
void clearGarbageAfterSnapshot();
/// Get all active sessions
const SessionAndTimeout & getActiveSessions() const { return session_and_timeout; }
SessionAndTimeout getActiveSessions() const;
/// Get all dead sessions
std::vector<int64_t> getDeadSessions() const { return session_expiry_queue.getExpiredSessions(); }
std::vector<int64_t> getDeadSessions() const;
void updateStats();
const Stats & getStorageStats() const;
/// Introspection functions mostly used in 4-letter commands
uint64_t getNodesCount() const { return container.size(); }
uint64_t getNodesCount() const;
uint64_t getApproximateDataSize() const { return container.getApproximateDataSize(); }
uint64_t getApproximateDataSize() const;
uint64_t getArenaDataSize() const { return container.keyArenaSize(); }
uint64_t getArenaDataSize() const;
uint64_t getTotalWatchesCount() const;
uint64_t getWatchedPathsCount() const { return watches.size() + list_watches.size(); }
uint64_t getWatchedPathsCount() const;
uint64_t getSessionsWithWatchesCount() const;
uint64_t getSessionWithEphemeralNodesCount() const { return ephemerals.size(); }
uint64_t getSessionWithEphemeralNodesCount() const;
uint64_t getTotalEphemeralNodesCount() const;
void dumpWatches(WriteBufferFromOwnString & buf) const;


@ -155,11 +155,11 @@ public:
ReadBufferFromOwnString buffer(iter->value().ToStringView());
typename Node::Meta & meta = new_pair->value;
readPODBinary(meta, buffer);
readVarUInt(new_pair->value.data_size, buffer);
if (new_pair->value.data_size)
readVarUInt(new_pair->value.stats.data_size, buffer);
if (new_pair->value.stats.data_size)
{
new_pair->value.data = std::unique_ptr<char[]>(new char[new_pair->value.data_size]);
buffer.readStrict(new_pair->value.data.get(), new_pair->value.data_size);
new_pair->value.data = std::unique_ptr<char[]>(new char[new_pair->value.stats.data_size]);
buffer.readStrict(new_pair->value.data.get(), new_pair->value.stats.data_size);
}
pair = new_pair;
}
@ -211,7 +211,7 @@ public:
}
}
std::vector<std::pair<std::string, Node>> getChildren(const std::string & key_)
std::vector<std::pair<std::string, Node>> getChildren(const std::string & key_, bool read_data = false)
{
rocksdb::ReadOptions read_options;
read_options.total_order_seek = true;
@ -232,6 +232,15 @@ public:
typename Node::Meta & meta = node;
/// We do not read data here
readPODBinary(meta, buffer);
if (read_data)
{
readVarUInt(meta.stats.data_size, buffer);
if (meta.stats.data_size)
{
node.data = std::unique_ptr<char[]>(new char[meta.stats.data_size]);
buffer.readStrict(node.data.get(), meta.stats.data_size);
}
}
std::string real_key(iter->key().data() + len, iter->key().size() - len);
// std::cout << "real key: " << real_key << std::endl;
result.emplace_back(std::move(real_key), std::move(node));
@ -268,11 +277,11 @@ public:
typename Node::Meta & meta = kv->value;
readPODBinary(meta, buffer);
/// TODO: Sometimes we don't need to load data.
readVarUInt(kv->value.data_size, buffer);
if (kv->value.data_size)
readVarUInt(kv->value.stats.data_size, buffer);
if (kv->value.stats.data_size)
{
kv->value.data = std::unique_ptr<char[]>(new char[kv->value.data_size]);
buffer.readStrict(kv->value.data.get(), kv->value.data_size);
kv->value.data = std::unique_ptr<char[]>(new char[kv->value.stats.data_size]);
buffer.readStrict(kv->value.data.get(), kv->value.stats.data_size);
}
return const_iterator(kv);
}
@ -281,7 +290,7 @@ public:
{
auto it = find(key);
chassert(it != end());
return MockNode(it->value.numChildren(), it->value.getData());
return MockNode(it->value.stats.numChildren(), it->value.getData());
}
const_iterator updateValue(StringRef key_, ValueUpdater updater)

View File

@ -93,7 +93,7 @@ void deserializeACLMap(Storage & storage, ReadBuffer & in)
}
template<typename Storage>
int64_t deserializeStorageData(Storage & storage, ReadBuffer & in, LoggerPtr log)
int64_t deserializeStorageData(Storage & storage, ReadBuffer & in, LoggerPtr log) TSA_NO_THREAD_SAFETY_ANALYSIS
{
int64_t max_zxid = 0;
std::string path;
@ -108,33 +108,33 @@ int64_t deserializeStorageData(Storage & storage, ReadBuffer & in, LoggerPtr log
Coordination::read(node.acl_id, in);
/// Deserialize stat
Coordination::read(node.czxid, in);
Coordination::read(node.mzxid, in);
Coordination::read(node.stats.czxid, in);
Coordination::read(node.stats.mzxid, in);
/// For some reason ZXID specified in filename can be smaller
/// than the actual zxid from nodes. In this case we will use the zxid from the nodes.
max_zxid = std::max(max_zxid, node.mzxid);
max_zxid = std::max(max_zxid, node.stats.mzxid);
int64_t ctime;
Coordination::read(ctime, in);
node.setCtime(ctime);
Coordination::read(node.mtime, in);
Coordination::read(node.version, in);
Coordination::read(node.cversion, in);
Coordination::read(node.aversion, in);
node.stats.setCtime(ctime);
Coordination::read(node.stats.mtime, in);
Coordination::read(node.stats.version, in);
Coordination::read(node.stats.cversion, in);
Coordination::read(node.stats.aversion, in);
int64_t ephemeral_owner;
Coordination::read(ephemeral_owner, in);
if (ephemeral_owner != 0)
node.setEphemeralOwner(ephemeral_owner);
Coordination::read(node.pzxid, in);
node.stats.setEphemeralOwner(ephemeral_owner);
Coordination::read(node.stats.pzxid, in);
if (!path.empty())
{
if (ephemeral_owner == 0)
node.setSeqNum(node.cversion);
node.stats.setSeqNum(node.stats.cversion);
storage.container.insertOrReplace(path, node);
if (ephemeral_owner != 0)
storage.ephemerals[ephemeral_owner].insert(path);
storage.committed_ephemerals[ephemeral_owner].insert(path);
storage.acl_map.addUsage(node.acl_id);
}
@ -149,7 +149,13 @@ int64_t deserializeStorageData(Storage & storage, ReadBuffer & in, LoggerPtr log
if (itr.key != "/")
{
auto parent_path = parentNodePath(itr.key);
storage.container.updateValue(parent_path, [my_path = itr.key] (typename Storage::Node & value) { value.addChild(getBaseNodeName(my_path)); value.increaseNumChildren(); });
storage.container.updateValue(
parent_path,
[my_path = itr.key](typename Storage::Node & value)
{
value.addChild(getBaseNodeName(my_path));
value.stats.increaseNumChildren();
});
}
}
@ -157,7 +163,7 @@ int64_t deserializeStorageData(Storage & storage, ReadBuffer & in, LoggerPtr log
}
template<typename Storage>
void deserializeKeeperStorageFromSnapshot(Storage & storage, const std::string & snapshot_path, LoggerPtr log)
void deserializeKeeperStorageFromSnapshot(Storage & storage, const std::string & snapshot_path, LoggerPtr log) TSA_NO_THREAD_SAFETY_ANALYSIS
{
LOG_INFO(log, "Deserializing storage snapshot {}", snapshot_path);
int64_t zxid = getZxidFromName(snapshot_path);
@ -487,7 +493,7 @@ bool hasErrorsInMultiRequest(Coordination::ZooKeeperRequestPtr request)
}
template<typename Storage>
bool deserializeTxn(Storage & storage, ReadBuffer & in, LoggerPtr /*log*/)
bool deserializeTxn(Storage & storage, ReadBuffer & in, LoggerPtr /*log*/) TSA_NO_THREAD_SAFETY_ANALYSIS
{
int64_t checksum;
Coordination::read(checksum, in);
@ -568,7 +574,7 @@ void deserializeLogAndApplyToStorage(Storage & storage, const std::string & log_
}
template<typename Storage>
void deserializeLogsAndApplyToStorage(Storage & storage, const std::string & path, LoggerPtr log)
void deserializeLogsAndApplyToStorage(Storage & storage, const std::string & path, LoggerPtr log) TSA_NO_THREAD_SAFETY_ANALYSIS
{
std::map<int64_t, std::string> existing_logs;
for (const auto & p : fs::directory_iterator(path))

View File

@ -1,6 +1,7 @@
#include <chrono>
#include <gtest/gtest.h>
#include "base/defines.h"
#include "config.h"
#if USE_NURAFT
@ -1540,7 +1541,7 @@ void addNode(Storage & storage, const std::string & path, const std::string & da
using Node = typename Storage::Node;
Node node{};
node.setData(data);
node.setEphemeralOwner(ephemeral_owner);
node.stats.setEphemeralOwner(ephemeral_owner);
storage.container.insertOrReplace(path, node);
auto child_it = storage.container.find(path);
auto child_path = DB::getBaseNodeName(child_it->key);
@ -1549,7 +1550,7 @@ void addNode(Storage & storage, const std::string & path, const std::string & da
[&](auto & parent)
{
parent.addChild(child_path);
parent.increaseNumChildren();
parent.stats.increaseNumChildren();
});
}
@ -1570,9 +1571,9 @@ TYPED_TEST(CoordinationTest, TestStorageSnapshotSimple)
addNode(storage, "/hello1", "world", 1);
addNode(storage, "/hello2", "somedata", 3);
storage.session_id_counter = 5;
storage.zxid = 2;
storage.ephemerals[3] = {"/hello2"};
storage.ephemerals[1] = {"/hello1"};
TSA_SUPPRESS_WARNING_FOR_WRITE(storage.zxid) = 2;
storage.committed_ephemerals[3] = {"/hello2"};
storage.committed_ephemerals[1] = {"/hello1"};
storage.getSessionID(130);
storage.getSessionID(130);
@ -1601,10 +1602,10 @@ TYPED_TEST(CoordinationTest, TestStorageSnapshotSimple)
EXPECT_EQ(restored_storage->container.getValue("/hello1").getData(), "world");
EXPECT_EQ(restored_storage->container.getValue("/hello2").getData(), "somedata");
EXPECT_EQ(restored_storage->session_id_counter, 7);
EXPECT_EQ(restored_storage->zxid, 2);
EXPECT_EQ(restored_storage->ephemerals.size(), 2);
EXPECT_EQ(restored_storage->ephemerals[3].size(), 1);
EXPECT_EQ(restored_storage->ephemerals[1].size(), 1);
EXPECT_EQ(restored_storage->getZXID(), 2);
EXPECT_EQ(restored_storage->committed_ephemerals.size(), 2);
EXPECT_EQ(restored_storage->committed_ephemerals[3].size(), 1);
EXPECT_EQ(restored_storage->committed_ephemerals[1].size(), 1);
EXPECT_EQ(restored_storage->session_and_timeout.size(), 2);
}
@ -2027,7 +2028,7 @@ TYPED_TEST(CoordinationTest, TestEphemeralNodeRemove)
state_machine->commit(1, entry_c->get_buf());
const auto & storage = state_machine->getStorageUnsafe();
EXPECT_EQ(storage.ephemerals.size(), 1);
EXPECT_EQ(storage.committed_ephemerals.size(), 1);
std::shared_ptr<ZooKeeperRemoveRequest> request_d = std::make_shared<ZooKeeperRemoveRequest>();
request_d->path = "/hello";
/// Delete from other session
@ -2035,7 +2036,7 @@ TYPED_TEST(CoordinationTest, TestEphemeralNodeRemove)
state_machine->pre_commit(2, entry_d->get_buf());
state_machine->commit(2, entry_d->get_buf());
EXPECT_EQ(storage.ephemerals.size(), 0);
EXPECT_EQ(storage.committed_ephemerals.size(), 0);
}
@ -2590,9 +2591,9 @@ TYPED_TEST(CoordinationTest, TestStorageSnapshotDifferentCompressions)
addNode(storage, "/hello1", "world", 1);
addNode(storage, "/hello2", "somedata", 3);
storage.session_id_counter = 5;
storage.zxid = 2;
storage.ephemerals[3] = {"/hello2"};
storage.ephemerals[1] = {"/hello1"};
TSA_SUPPRESS_WARNING_FOR_WRITE(storage.zxid) = 2;
storage.committed_ephemerals[3] = {"/hello2"};
storage.committed_ephemerals[1] = {"/hello1"};
storage.getSessionID(130);
storage.getSessionID(130);
@ -2617,10 +2618,10 @@ TYPED_TEST(CoordinationTest, TestStorageSnapshotDifferentCompressions)
EXPECT_EQ(restored_storage->container.getValue("/hello1").getData(), "world");
EXPECT_EQ(restored_storage->container.getValue("/hello2").getData(), "somedata");
EXPECT_EQ(restored_storage->session_id_counter, 7);
EXPECT_EQ(restored_storage->zxid, 2);
EXPECT_EQ(restored_storage->ephemerals.size(), 2);
EXPECT_EQ(restored_storage->ephemerals[3].size(), 1);
EXPECT_EQ(restored_storage->ephemerals[1].size(), 1);
EXPECT_EQ(restored_storage->getZXID(), 2);
EXPECT_EQ(restored_storage->committed_ephemerals.size(), 2);
EXPECT_EQ(restored_storage->committed_ephemerals[3].size(), 1);
EXPECT_EQ(restored_storage->committed_ephemerals[1].size(), 1);
EXPECT_EQ(restored_storage->session_and_timeout.size(), 2);
}
@ -2805,13 +2806,13 @@ TYPED_TEST(CoordinationTest, TestStorageSnapshotEqual)
storage.session_id_counter = 5;
storage.ephemerals[3] = {"/hello"};
storage.ephemerals[1] = {"/hello/somepath"};
storage.committed_ephemerals[3] = {"/hello"};
storage.committed_ephemerals[1] = {"/hello/somepath"};
for (size_t j = 0; j < 3333; ++j)
storage.getSessionID(130 * j);
DB::KeeperStorageSnapshot<Storage> snapshot(&storage, storage.zxid);
DB::KeeperStorageSnapshot<Storage> snapshot(&storage, storage.getZXID());
auto buf = manager.serializeSnapshotToBuffer(snapshot);
@ -3315,7 +3316,7 @@ TYPED_TEST(CoordinationTest, TestCheckNotExistsRequest)
create_path("/test_node");
auto node_it = storage.container.find("/test_node");
ASSERT_NE(node_it, storage.container.end());
auto node_version = node_it->value.version;
auto node_version = node_it->value.stats.version;
{
SCOPED_TRACE("CheckNotExists returns ZNODEEXISTS");
@ -3566,12 +3567,12 @@ TYPED_TEST(CoordinationTest, TestRemoveRecursiveRequest)
{
SCOPED_TRACE("Recursive Remove Ephemeral");
create("/T7", zkutil::CreateMode::Ephemeral);
ASSERT_EQ(storage.ephemerals.size(), 1);
ASSERT_EQ(storage.committed_ephemerals.size(), 1);
auto responses = remove_recursive("/T7", 100);
ASSERT_EQ(responses.size(), 1);
ASSERT_EQ(responses[0].response->error, Coordination::Error::ZOK);
ASSERT_EQ(storage.ephemerals.size(), 0);
ASSERT_EQ(storage.committed_ephemerals.size(), 0);
ASSERT_FALSE(exists("/T7"));
}
@ -3581,12 +3582,12 @@ TYPED_TEST(CoordinationTest, TestRemoveRecursiveRequest)
create("/T8/A", zkutil::CreateMode::Persistent);
create("/T8/B", zkutil::CreateMode::Ephemeral);
create("/T8/A/C", zkutil::CreateMode::Ephemeral);
ASSERT_EQ(storage.ephemerals.size(), 1);
ASSERT_EQ(storage.committed_ephemerals.size(), 1);
auto responses = remove_recursive("/T8", 4);
ASSERT_EQ(responses.size(), 1);
ASSERT_EQ(responses[0].response->error, Coordination::Error::ZOK);
ASSERT_EQ(storage.ephemerals.size(), 0);
ASSERT_EQ(storage.committed_ephemerals.size(), 0);
ASSERT_FALSE(exists("/T8"));
ASSERT_FALSE(exists("/T8/A"));
ASSERT_FALSE(exists("/T8/B"));
@ -3889,14 +3890,26 @@ TYPED_TEST(CoordinationTest, TestRemoveRecursiveWatches)
auto responses = storage.processRequest(remove_request, 1, new_zxid);
ASSERT_EQ(responses.size(), 7);
/// request response is last
ASSERT_EQ(dynamic_cast<Coordination::ZooKeeperWatchResponse *>(responses.back().response.get()), nullptr);
for (size_t i = 0; i < 7; ++i)
std::unordered_map<std::string, std::vector<Coordination::Event>> expected_watch_responses
{
{"/A/B/D", {Coordination::Event::DELETED}},
{"/A/B", {Coordination::Event::CHILD, Coordination::Event::DELETED}},
{"/A/C", {Coordination::Event::DELETED}},
{"/A", {Coordination::Event::CHILD, Coordination::Event::DELETED}},
};
std::unordered_map<std::string, std::vector<Coordination::Event>> actual_watch_responses;
for (size_t i = 0; i < 6; ++i)
{
ASSERT_EQ(responses[i].response->error, Coordination::Error::ZOK);
if (const auto * watch_response = dynamic_cast<Coordination::ZooKeeperWatchResponse *>(responses[i].response.get()))
ASSERT_EQ(watch_response->type, Coordination::Event::DELETED);
const auto & watch_response = dynamic_cast<Coordination::ZooKeeperWatchResponse &>(*responses[i].response);
actual_watch_responses[watch_response.path].push_back(static_cast<Coordination::Event>(watch_response.type));
}
ASSERT_EQ(expected_watch_responses, actual_watch_responses);
ASSERT_EQ(storage.watches.size(), 0);
ASSERT_EQ(storage.list_watches.size(), 0);

View File

@ -151,6 +151,15 @@ Names NamesAndTypesList::getNames() const
return res;
}
NameSet NamesAndTypesList::getNameSet() const
{
NameSet res;
res.reserve(size());
for (const NameAndTypePair & column : *this)
res.insert(column.name);
return res;
}
DataTypes NamesAndTypesList::getTypes() const
{
DataTypes res;

View File

@ -100,6 +100,7 @@ public:
void getDifference(const NamesAndTypesList & rhs, NamesAndTypesList & deleted, NamesAndTypesList & added) const;
Names getNames() const;
NameSet getNameSet() const;
DataTypes getTypes() const;
/// Remove columns whose names are not in `names`.

View File

@ -338,11 +338,8 @@ size_t HashJoin::getTotalRowCount() const
return res;
}
size_t HashJoin::getTotalByteCount() const
void HashJoin::doDebugAsserts() const
{
if (!data)
return 0;
#ifndef NDEBUG
size_t debug_blocks_allocated_size = 0;
for (const auto & block : data->blocks)
@ -360,6 +357,14 @@ size_t HashJoin::getTotalByteCount() const
throw Exception(ErrorCodes::LOGICAL_ERROR, "data->blocks_nullmaps_allocated_size != debug_blocks_nullmaps_allocated_size ({} != {})",
data->blocks_nullmaps_allocated_size, debug_blocks_nullmaps_allocated_size);
#endif
}
size_t HashJoin::getTotalByteCount() const
{
if (!data)
return 0;
doDebugAsserts();
size_t res = 0;
@ -544,9 +549,11 @@ bool HashJoin::addBlockToJoin(const Block & source_block_, bool check_limits)
have_compressed = true;
}
doDebugAsserts();
data->blocks_allocated_size += block_to_save.allocatedBytes();
data->blocks.emplace_back(std::move(block_to_save));
Block * stored_block = &data->blocks.back();
doDebugAsserts();
if (rows)
data->empty = false;
@ -634,9 +641,11 @@ bool HashJoin::addBlockToJoin(const Block & source_block_, bool check_limits)
if (!flag_per_row && !is_inserted)
{
doDebugAsserts();
LOG_TRACE(log, "Skipping inserting block with {} rows", rows);
data->blocks_allocated_size -= stored_block->allocatedBytes();
data->blocks.pop_back();
doDebugAsserts();
}
if (!check_limits)
@ -683,6 +692,8 @@ void HashJoin::shrinkStoredBlocksToFit(size_t & total_bytes_in_join, bool force_
for (auto & stored_block : data->blocks)
{
doDebugAsserts();
size_t old_size = stored_block.allocatedBytes();
stored_block = stored_block.shrinkToFit();
size_t new_size = stored_block.allocatedBytes();
@ -700,6 +711,8 @@ void HashJoin::shrinkStoredBlocksToFit(size_t & total_bytes_in_join, bool force_
else
/// Sometimes after clone the resized block can be bigger than the original
data->blocks_allocated_size += new_size - old_size;
doDebugAsserts();
}
auto new_total_bytes_in_join = getTotalByteCount();
@ -1416,7 +1429,13 @@ void HashJoin::tryRerangeRightTableDataImpl(Map & map [[maybe_unused]])
};
BlocksList sorted_blocks;
visit_rows_map(sorted_blocks, map);
doDebugAsserts();
data->blocks.swap(sorted_blocks);
size_t new_blocks_allocated_size = 0;
for (const auto & block : data->blocks)
new_blocks_allocated_size += block.allocatedBytes();
data->blocks_allocated_size = new_blocks_allocated_size;
doDebugAsserts();
}
}
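
The asserts above protect `blocks_allocated_size`, a counter that is normally maintained incrementally; after `tryRerangeRightTableDataImpl` swaps in the reordered block list, the counter is recounted from scratch instead of being patched. A minimal standalone sketch of the same pattern, with invented names (not the HashJoin code):

#include <cassert>
#include <list>
#include <vector>

struct BlockStore
{
    std::list<std::vector<char>> blocks;
    size_t blocks_allocated_size = 0; /// cached sum of per-block allocations

    void doDebugAsserts() const
    {
#ifndef NDEBUG
        size_t actual = 0;
        for (const auto & block : blocks)
            actual += block.capacity();
        assert(actual == blocks_allocated_size); /// the cache must match reality at every boundary
#endif
    }

    void add(std::vector<char> block)
    {
        doDebugAsserts();
        blocks.emplace_back(std::move(block));
        blocks_allocated_size += blocks.back().capacity(); /// cheap incremental update
        doDebugAsserts();
    }

    void rerange(std::list<std::vector<char>> sorted_blocks)
    {
        doDebugAsserts();
        blocks.swap(sorted_blocks);
        /// After swapping in a rebuilt list, recount from scratch instead of patching the cached value.
        blocks_allocated_size = 0;
        for (const auto & block : blocks)
            blocks_allocated_size += block.capacity();
        doDebugAsserts();
    }
};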

View File

@ -470,6 +470,7 @@ private:
void tryRerangeRightTableData() override;
template <JoinKind KIND, typename Map, JoinStrictness STRICTNESS>
void tryRerangeRightTableDataImpl(Map & map);
void doDebugAsserts() const;
};
}

View File

@ -722,7 +722,14 @@ void ASTFunction::formatImplWithoutAlias(const FormatSettings & settings, Format
assert_cast<const ASTFunction *>(argument.get())->arguments->children[0]->formatImpl(settings, state, nested_dont_need_parens);
settings.ostr << (settings.hilite ? hilite_operator : "") << " = " << (settings.hilite ? hilite_none : "");
}
settings.ostr << "'[HIDDEN]'";
if (!secret_arguments.replacement.empty())
{
settings.ostr << "'" << secret_arguments.replacement << "'";
}
else
{
settings.ostr << "'[HIDDEN]'";
}
if (size <= secret_arguments.start + secret_arguments.count && !secret_arguments.are_named)
break; /// All other arguments should also be hidden.
continue;

View File

@ -1,6 +1,7 @@
#pragma once
#include <Common/KnownObjectNames.h>
#include <Common/re2.h>
#include <Core/QualifiedTableName.h>
#include <base/defines.h>
#include <boost/algorithm/string/predicate.hpp>
@ -49,6 +50,11 @@ public:
bool are_named = false; /// Arguments like `password = 'password'` are considered as named arguments.
/// E.g. "headers" in `url('..', headers('foo' = '[HIDDEN]'))`
std::vector<std::string> nested_maps;
/// Full replacement of an argument. Only supported when count is 1; otherwise all masked arguments would be replaced with this single string.
/// It's needed in cases where we don't want to hide the entire argument but only part of it, e.g. "connection_string" in
/// `azureBlobStorage('DefaultEndpointsProtocol=https;AccountKey=secretkey;...', ...)` should be replaced with
/// `azureBlobStorage('DefaultEndpointsProtocol=https;AccountKey=[HIDDEN];...', ...)`.
std::string replacement;
bool hasSecrets() const
{
@ -74,6 +80,7 @@ protected:
result.are_named = argument_is_named;
}
chassert(index >= result.start); /// We always check arguments consecutively
chassert(result.replacement.empty()); /// A full replacement shouldn't be combined with masking of other arguments
result.count = index + 1 - result.start;
if (!argument_is_named)
result.are_named = false;
@ -199,32 +206,39 @@ protected:
void findAzureBlobStorageFunctionSecretArguments(bool is_cluster_function)
{
/// azureBlobStorage('cluster_name', 'conn_string/storage_account_url', ...) has 'conn_string/storage_account_url' as its second argument.
/// azureBlobStorageCluster('cluster_name', 'conn_string/storage_account_url', ...) has 'conn_string/storage_account_url' as its second argument.
size_t url_arg_idx = is_cluster_function ? 1 : 0;
if (!is_cluster_function && isNamedCollectionName(0))
{
/// azureBlobStorage(named_collection, ..., account_key = 'account_key', ...)
if (maskAzureConnectionString(-1, true, 1))
return;
findSecretNamedArgument("account_key", 1);
return;
}
else if (is_cluster_function && isNamedCollectionName(1))
{
/// azureBlobStorageCluster(cluster, named_collection, ..., account_key = 'account_key', ...)
if (maskAzureConnectionString(-1, true, 2))
return;
findSecretNamedArgument("account_key", 2);
return;
}
/// We should check other arguments first because we don't need to do any replacement in case storage_account_url is not used
/// azureBlobStorage(connection_string|storage_account_url, container_name, blobpath, account_name, account_key, format, compression, structure)
/// azureBlobStorageCluster(cluster, connection_string|storage_account_url, container_name, blobpath, [account_name, account_key, format, compression, structure])
if (maskAzureConnectionString(url_arg_idx))
return;
/// We should check other arguments first because we don't need to do any replacement in case of
/// azureBlobStorage(connection_string|storage_account_url, container_name, blobpath, format, [account_name, account_key, ...])
/// azureBlobStorageCluster(cluster, connection_string|storage_account_url, container_name, blobpath, format, [account_name, account_key, ...])
size_t count = function->arguments->size();
if ((url_arg_idx + 4 <= count) && (count <= url_arg_idx + 7))
{
String second_arg;
if (tryGetStringFromArgument(url_arg_idx + 3, &second_arg))
String fourth_arg;
if (tryGetStringFromArgument(url_arg_idx + 3, &fourth_arg))
{
if (second_arg == "auto" || KnownFormatNames::instance().exists(second_arg))
if (fourth_arg == "auto" || KnownFormatNames::instance().exists(fourth_arg))
return; /// The argument at position url + 3 is a format: azureBlobStorage('conn_string', 'container', 'blobpath', 'format', ...)
}
}
@ -234,6 +248,40 @@ protected:
markSecretArgument(url_arg_idx + 4);
}
bool maskAzureConnectionString(ssize_t url_arg_idx, bool argument_is_named = false, size_t start = 0)
{
String url_arg;
if (argument_is_named)
{
url_arg_idx = findNamedArgument(&url_arg, "connection_string", start);
if (url_arg_idx == -1 || url_arg.empty())
url_arg_idx = findNamedArgument(&url_arg, "storage_account_url", start);
if (url_arg_idx == -1 || url_arg.empty())
return false;
}
else
{
if (!tryGetStringFromArgument(url_arg_idx, &url_arg))
return false;
}
if (!url_arg.starts_with("http"))
{
static re2::RE2 account_key_pattern = "AccountKey=.*?(;|$)";
if (RE2::Replace(&url_arg, account_key_pattern, "AccountKey=[HIDDEN]\\1"))
{
chassert(result.count == 0); /// A full replacement shouldn't be combined with masking of other arguments
result.start = url_arg_idx;
result.are_named = argument_is_named;
result.count = 1;
result.replacement = url_arg;
return true;
}
}
return false;
}
void findURLSecretArguments()
{
if (!isNamedCollectionName(0))
@ -513,8 +561,9 @@ protected:
return function->arguments->at(arg_idx)->isIdentifier();
}
/// Looks for a secret argument with a specified name. This function looks for arguments in format `key=value` where the key is specified.
void findSecretNamedArgument(const std::string_view & key, size_t start = 0)
/// Looks for an argument with a specified name. This function looks for arguments in format `key=value` where the key is specified.
/// Returns -1 if no argument was found.
ssize_t findNamedArgument(String * res, const std::string_view & key, size_t start = 0)
{
for (size_t i = start; i < function->arguments->size(); ++i)
{
@ -531,8 +580,22 @@ protected:
continue;
if (found_key == key)
markSecretArgument(i, /* argument_is_named= */ true);
{
tryGetStringFromArgument(*equals_func->arguments->at(1), res);
return i;
}
}
return -1;
}
/// Looks for a secret argument with a specified name. This function looks for arguments in format `key=value` where the key is specified.
/// If the argument is found, it is marked as a secret.
void findSecretNamedArgument(const std::string_view & key, size_t start = 0)
{
ssize_t arg_idx = findNamedArgument(nullptr, key, start);
if (arg_idx >= 0)
markSecretArgument(arg_idx, /* argument_is_named= */ true);
}
};
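
For illustration, the following standalone sketch applies the same RE2 replacement that `maskAzureConnectionString` uses above, on an invented connection string (assuming re2 is available as <re2/re2.h>; the key value is fake):

#include <iostream>
#include <string>
#include <re2/re2.h>

int main()
{
    /// Invented connection string for illustration only.
    std::string conn = "DefaultEndpointsProtocol=https;AccountName=devaccount;"
                       "AccountKey=ZmFrZV9rZXk=;EndpointSuffix=core.windows.net";

    static const re2::RE2 account_key_pattern = "AccountKey=.*?(;|$)";
    if (RE2::Replace(&conn, account_key_pattern, "AccountKey=[HIDDEN]\\1"))
        std::cout << conn << std::endl;
    /// Prints:
    /// DefaultEndpointsProtocol=https;AccountName=devaccount;AccountKey=[HIDDEN];EndpointSuffix=core.windows.net
}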

View File

@ -8,6 +8,7 @@
#include <IO/copyData.h>
#include <IO/ConnectionTimeouts.h>
#include <Common/Throttler.h>
#include <Common/ActionBlocker.h>
namespace zkutil

View File

@ -179,6 +179,20 @@ static void addMissedColumnsToSerializationInfos(
}
}
bool MergeTask::GlobalRuntimeContext::isCancelled() const
{
return (future_part ? merges_blocker->isCancelledForPartition(future_part->part_info.partition_id) : merges_blocker->isCancelled())
|| merge_list_element_ptr->is_cancelled.load(std::memory_order_relaxed);
}
void MergeTask::GlobalRuntimeContext::checkOperationIsNotCanceled() const
{
if (isCancelled())
{
throw Exception(ErrorCodes::ABORTED, "Cancelled merging parts");
}
}
/// PK columns are sorted and merged, ordinary columns are gathered using info from merge step
void MergeTask::ExecuteAndFinalizeHorizontalPart::extractMergingAndGatheringColumns() const
{
@ -272,8 +286,7 @@ bool MergeTask::ExecuteAndFinalizeHorizontalPart::prepare() const
const String local_tmp_suffix = global_ctx->parent_part ? global_ctx->suffix : "";
if (global_ctx->merges_blocker->isCancelled() || global_ctx->merge_list_element_ptr->is_cancelled.load(std::memory_order_relaxed))
throw Exception(ErrorCodes::ABORTED, "Cancelled merging parts");
global_ctx->checkOperationIsNotCanceled();
/// We don't want to perform merge assigned with TTL as normal merge, so
/// throw exception
@ -518,9 +531,10 @@ bool MergeTask::ExecuteAndFinalizeHorizontalPart::prepare() const
ctx->is_cancelled = [merges_blocker = global_ctx->merges_blocker,
ttl_merges_blocker = global_ctx->ttl_merges_blocker,
need_remove = ctx->need_remove_expired_values,
merge_list_element = global_ctx->merge_list_element_ptr]() -> bool
merge_list_element = global_ctx->merge_list_element_ptr,
partition_id = global_ctx->future_part->part_info.partition_id]() -> bool
{
return merges_blocker->isCancelled()
return merges_blocker->isCancelledForPartition(partition_id)
|| (need_remove && ttl_merges_blocker->isCancelled())
|| merge_list_element->is_cancelled.load(std::memory_order_relaxed);
};
@ -797,8 +811,7 @@ void MergeTask::ExecuteAndFinalizeHorizontalPart::finalize() const
global_ctx->merging_executor.reset();
global_ctx->merged_pipeline.reset();
if (global_ctx->merges_blocker->isCancelled() || global_ctx->merge_list_element_ptr->is_cancelled.load(std::memory_order_relaxed))
throw Exception(ErrorCodes::ABORTED, "Cancelled merging parts");
global_ctx->checkOperationIsNotCanceled();
if (ctx->need_remove_expired_values && global_ctx->ttl_merges_blocker->isCancelled())
throw Exception(ErrorCodes::ABORTED, "Cancelled merging parts with expired TTL");
@ -1061,8 +1074,7 @@ bool MergeTask::VerticalMergeStage::executeVerticalMergeForOneColumn() const
{
Block block;
if (global_ctx->merges_blocker->isCancelled()
|| global_ctx->merge_list_element_ptr->is_cancelled.load(std::memory_order_relaxed)
if (global_ctx->isCancelled()
|| !ctx->executor->pull(block))
return false;
@ -1078,8 +1090,7 @@ bool MergeTask::VerticalMergeStage::executeVerticalMergeForOneColumn() const
void MergeTask::VerticalMergeStage::finalizeVerticalMergeForOneColumn() const
{
const String & column_name = ctx->it_name_and_type->name;
if (global_ctx->merges_blocker->isCancelled() || global_ctx->merge_list_element_ptr->is_cancelled.load(std::memory_order_relaxed))
throw Exception(ErrorCodes::ABORTED, "Cancelled merging parts");
global_ctx->checkOperationIsNotCanceled();
ctx->executor.reset();
auto changed_checksums = ctx->column_to->fillChecksums(global_ctx->new_data_part, global_ctx->checksums_gathered_columns);

View File

@ -27,6 +27,7 @@
#include <Storages/MergeTree/MergeProgress.h>
#include <Storages/MergeTree/MergeTreeData.h>
#include <Storages/MergeTree/MergeTreeIndices.h>
#include <Storages/MergeTree/PartitionActionBlocker.h>
namespace ProfileEvents
{
@ -86,7 +87,7 @@ public:
MergeTreeTransactionPtr txn,
MergeTreeData * data_,
MergeTreeDataMergerMutator * mutator_,
ActionBlocker * merges_blocker_,
PartitionActionBlocker * merges_blocker_,
ActionBlocker * ttl_merges_blocker_)
{
global_ctx = std::make_shared<GlobalRuntimeContext>();
@ -161,7 +162,7 @@ private:
MergeListElement * merge_list_element_ptr{nullptr};
MergeTreeData * data{nullptr};
MergeTreeDataMergerMutator * mutator{nullptr};
ActionBlocker * merges_blocker{nullptr};
PartitionActionBlocker * merges_blocker{nullptr};
ActionBlocker * ttl_merges_blocker{nullptr};
StorageSnapshotPtr storage_snapshot{nullptr};
StorageMetadataPtr metadata_snapshot{nullptr};
@ -215,7 +216,12 @@ private:
MergeTreeData::MergingParams merging_params{};
scope_guard temporary_directory_lock;
UInt64 prev_elapsed_ms{0};
// Throws an exception if the merge was cancelled in any way.
void checkOperationIsNotCanceled() const;
bool isCancelled() const;
};
using GlobalRuntimeContextPtr = std::shared_ptr<GlobalRuntimeContext>;

View File

@ -207,7 +207,7 @@ public:
/** Is used to cancel all merges and mutations. On a cancel() call all currently running actions will throw an exception soon.
* All new attempts to start a merge or mutation will throw an exception until all 'LockHolder' objects are destroyed.
*/
ActionBlocker merges_blocker;
PartitionActionBlocker merges_blocker;
ActionBlocker ttl_merges_blocker;
private:

View File

@ -9,6 +9,7 @@
#include <Storages/Statistics/Statistics.h>
#include <Columns/ColumnsNumber.h>
#include <Parsers/queryToString.h>
#include <Interpreters/Context.h>
#include <Interpreters/Squashing.h>
#include <Interpreters/MergeTreeTransaction.h>
#include <Interpreters/PreparedSets.h>
@ -65,14 +66,6 @@ namespace ErrorCodes
namespace MutationHelpers
{
static bool checkOperationIsNotCanceled(ActionBlocker & merges_blocker, MergeListEntry * mutate_entry)
{
if (merges_blocker.isCancelled() || (*mutate_entry)->is_cancelled)
throw Exception(ErrorCodes::ABORTED, "Cancelled mutating parts");
return true;
}
static bool haveMutationsOfDynamicColumns(const MergeTreeData::DataPartPtr & data_part, const MutationCommands & commands)
{
for (const auto & command : commands)
@ -985,7 +978,7 @@ struct MutationContext
{
MergeTreeData * data;
MergeTreeDataMergerMutator * mutator;
ActionBlocker * merges_blocker;
PartitionActionBlocker * merges_blocker;
TableLockHolder * holder;
MergeListEntry * mutate_entry;
@ -1049,6 +1042,17 @@ struct MutationContext
scope_guard temporary_directory_lock;
bool checkOperationIsNotCanceled() const
{
if ((new_data_part ? merges_blocker->isCancelledForPartition(new_data_part->info.partition_id) : merges_blocker->isCancelled())
|| (*mutate_entry)->is_cancelled)
{
throw Exception(ErrorCodes::ABORTED, "Cancelled mutating parts");
}
return true;
}
/// Whether we need to count lightweight delete rows in this mutation
bool count_lightweight_deleted_rows;
UInt64 execute_elapsed_ns = 0;
@ -1056,7 +1060,6 @@ struct MutationContext
using MutationContextPtr = std::shared_ptr<MutationContext>;
// This class is responsible for:
// 1. get projection pipeline and a sink to write parts
// 2. build an executor that can write block to the input stream (actually we can write through it to generate as many parts as possible)
@ -1186,9 +1189,7 @@ bool PartMergerWriter::mutateOriginalPartAndPrepareProjections()
Block cur_block;
Block projection_header;
MutationHelpers::checkOperationIsNotCanceled(*ctx->merges_blocker, ctx->mutate_entry);
if (!ctx->mutating_executor->pull(cur_block))
if (!ctx->checkOperationIsNotCanceled() || !ctx->mutating_executor->pull(cur_block))
{
finalizeTempProjections();
return false;
@ -1897,7 +1898,7 @@ MutateTask::MutateTask(
const MergeTreeTransactionPtr & txn,
MergeTreeData & data_,
MergeTreeDataMergerMutator & mutator_,
ActionBlocker & merges_blocker_,
PartitionActionBlocker & merges_blocker_,
bool need_prefix_)
: ctx(std::make_shared<MutationContext>())
{
@ -1939,7 +1940,7 @@ bool MutateTask::execute()
}
case State::NEED_EXECUTE:
{
MutationHelpers::checkOperationIsNotCanceled(*ctx->merges_blocker, ctx->mutate_entry);
ctx->checkOperationIsNotCanceled();
if (task->executeStep())
return true;
@ -2032,7 +2033,7 @@ static bool canSkipMutationCommandForPart(const MergeTreeDataPartPtr & part, con
bool MutateTask::prepare()
{
ProfileEvents::increment(ProfileEvents::MutationTotalParts);
MutationHelpers::checkOperationIsNotCanceled(*ctx->merges_blocker, ctx->mutate_entry);
ctx->checkOperationIsNotCanceled();
if (ctx->future_part->parts.size() != 1)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Trying to mutate {} parts, not one. "

View File

@ -4,6 +4,7 @@
#include <Storages/MergeTree/MergeProgress.h>
#include <Storages/MergeTree/FutureMergedMutatedPart.h>
#include <Storages/MergeTree/IMergedBlockOutputStream.h>
#include <Storages/MergeTree/PartitionActionBlocker.h>
#include <Storages/MutationCommands.h>
#include <Interpreters/MutationsInterpreter.h>
@ -35,7 +36,7 @@ public:
const MergeTreeTransactionPtr & txn,
MergeTreeData & data_,
MergeTreeDataMergerMutator & mutator_,
ActionBlocker & merges_blocker_,
PartitionActionBlocker & merges_blocker_,
bool need_prefix_);
bool execute();

View File

@ -0,0 +1,86 @@
#include <Storages/MergeTree/PartitionActionBlocker.h>
#include <IO/WriteBufferFromString.h>
#include <IO/WriteHelpers.h>
#include <IO/Operators.h>
namespace DB
{
ActionLock PartitionActionBlocker::cancelForPartition(const std::string & partition_id)
{
std::unique_lock lock(mutex);
const size_t prev_size = partition_blockers.size();
ActionLock result = partition_blockers[partition_id].cancel();
// Clean up stale `ActionBlocker` instances once in a while, to prevent unbounded growth.
cleanup_counter++;
if (prev_size != partition_blockers.size() && cleanup_counter > 32) // 32 is arbitrary.
compactPartitionBlockersLocked();
return result;
}
bool PartitionActionBlocker::isCancelledForPartition(const std::string & partition_id) const
{
if (isCancelled())
return true;
std::shared_lock lock(mutex);
return isCancelledForPartitionOnlyLocked(partition_id);
}
bool PartitionActionBlocker::isCancelledForPartitionOnlyLocked(const std::string & partition_id) const
{
auto p = partition_blockers.find(partition_id);
return p != partition_blockers.end() && p->second.isCancelled();
}
size_t PartitionActionBlocker::countPartitionBlockers() const
{
std::shared_lock lock(mutex);
return partition_blockers.size();
}
void PartitionActionBlocker::compactPartitionBlockers()
{
std::unique_lock lock(mutex);
compactPartitionBlockersLocked();
}
void PartitionActionBlocker::compactPartitionBlockersLocked()
{
std::erase_if(partition_blockers, [](const auto & p)
{
return !p.second.isCancelled();
});
cleanup_counter = 0;
}
std::string PartitionActionBlocker::formatDebug() const
{
std::shared_lock lock(mutex);
WriteBufferFromOwnString out;
out << "Global lock: " << global_blocker.getCounter().load()
<< "\n"
<< partition_blockers.size() << " live partition locks: {";
size_t i = 0;
for (const auto & p : partition_blockers)
{
out << "\n\t" << DB::double_quote << p.first << ": " << p.second.getCounter().load();
if (++i < partition_blockers.size())
out << ",";
else
out << "\n";
}
out << "}";
return out.str();
}
}

View File

@ -0,0 +1,69 @@
#pragma once
#include <memory>
#include <shared_mutex>
#include <string>
#include <unordered_map>
#include <Common/ActionLock.h>
#include <Common/ActionBlocker.h>
namespace DB
{
/// Locks actions on a given MergeTree-family table.
/// Like ActionBlocker, but has two modes:
/// - 'partition-specific' lock, @see `cancelForPartition()` and `isCancelledForPartition()`.
/// Only actions on a *specific partition* are prohibited from starting
/// and must interrupt at the first chance.
/// Any number of partitions can be locked simultaneously.
///
/// - 'global' lock, @see `cancel()`, `isCancelled()`, and `cancelForever()`.
/// Any new actions for *ALL* partitions are prohibited from starting
/// and are required to interrupt at the first possible moment,
/// as if every individual partition were locked with a 'partition-specific' lock.
///
/// Any lock can be set multiple times and is considered fully unlocked
/// only when it has been unlocked the same number of times (by destructing/resetting the `ActionLock`).
///
/// There can be any number of locks active at a given point on a single PartitionActionBlocker instance:
/// - the 'global' lock held `N` times,
/// - `M` 'partition' locks, each held a different number of times.
class PartitionActionBlocker
{
public:
PartitionActionBlocker() = default;
bool isCancelled() const { return global_blocker.isCancelled(); }
bool isCancelledForPartition(const std::string & /*partition_id*/) const;
/// Temporarily blocks corresponding actions (while the returned object is alive)
friend class ActionLock;
ActionLock cancel() { return ActionLock(global_blocker); }
ActionLock cancelForPartition(const std::string & partition_id);
/// Cancel the actions forever.
void cancelForever() { global_blocker.cancelForever(); }
const std::atomic<int> & getCounter() const { return global_blocker.getCounter(); }
size_t countPartitionBlockers() const;
/// Cleanup partition blockers
void compactPartitionBlockers();
std::string formatDebug() const;
private:
void compactPartitionBlockersLocked();
bool isCancelledForPartitionOnlyLocked(const std::string & partition_id) const;
ActionBlocker global_blocker;
mutable std::shared_mutex mutex;
std::unordered_map<std::string, ActionBlocker> partition_blockers;
size_t cleanup_counter = 0;
};
}
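
A brief usage sketch of the API declared above (the partition id is invented; the unit tests in the next file exercise the same behaviour in more detail):

#include <cassert>
#include <Storages/MergeTree/PartitionActionBlocker.h>

void partitionBlockerExample()
{
    DB::PartitionActionBlocker blocker;

    {
        /// Block merges only for one partition; other partitions are unaffected.
        auto partition_lock = blocker.cancelForPartition("2024_09");
        assert(blocker.isCancelledForPartition("2024_09"));
        assert(!blocker.isCancelled());
    } /// lock destroyed -> the partition is unblocked again

    {
        /// Block everything; every partition now reports as cancelled.
        auto global_lock = blocker.cancel();
        assert(blocker.isCancelledForPartition("2024_09"));
        assert(blocker.isCancelled());
    }
}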

View File

@ -0,0 +1,257 @@
#include <gtest/gtest.h>
#include <vector>
#include <thread>
#include <fmt/format.h>
#include <Storages/MergeTree/PartitionActionBlocker.h>
#include <iomanip>
using namespace DB;
TEST(PartitionActionBlocker, TestDefaultConstructor)
{
PartitionActionBlocker blocker;
EXPECT_FALSE(blocker.isCancelled());
// EXPECT_EQ(0, blocker.getCounter().load());
}
TEST(PartitionActionBlocker, TestCancelForever)
{
PartitionActionBlocker blocker;
blocker.cancelForever();
EXPECT_TRUE(blocker.isCancelled());
// EXPECT_EQ(1, blocker.getCounter().load());
}
TEST(PartitionActionBlocker, TestCancel)
{
PartitionActionBlocker blocker;
{
auto lock = blocker.cancel();
EXPECT_TRUE(blocker.isCancelled());
// EXPECT_EQ(1, blocker.getCounter().load());
}
// automatically un-cancelled on `lock` destruction
EXPECT_FALSE(blocker.isCancelled());
}
TEST(PartitionActionBlocker, TestCancelForPartition)
{
const std::string partition_id = "some partition id";
const std::string partition_id2 = "some other partition id";
PartitionActionBlocker blocker;
{
auto lock = blocker.cancelForPartition(partition_id);
// blocker is not locked fully
EXPECT_FALSE(blocker.isCancelled());
// EXPECT_EQ(0, blocker.getCounter().load());
// blocker reports that only this partition is locked
EXPECT_TRUE(blocker.isCancelledForPartition(partition_id));
// doesn't affect other partitions
EXPECT_FALSE(blocker.isCancelledForPartition(partition_id2));
}
EXPECT_FALSE(blocker.isCancelled());
EXPECT_FALSE(blocker.isCancelledForPartition(partition_id));
}
TEST(PartitionActionBlocker, TestCancelForTwoPartitions)
{
const std::string partition_id1 = "some partition id";
const std::string partition_id2 = "some other partition id";
PartitionActionBlocker blocker;
{
auto lock1 = blocker.cancelForPartition(partition_id1);
auto lock2 = blocker.cancelForPartition(partition_id2);
// blocker is not locked fully
EXPECT_FALSE(blocker.isCancelled());
// EXPECT_EQ(0, blocker.getCounter().load());
// blocker reports that both partitions are locked
EXPECT_TRUE(blocker.isCancelledForPartition(partition_id1));
EXPECT_TRUE(blocker.isCancelledForPartition(partition_id2));
}
// blocker is not locked fully
EXPECT_FALSE(blocker.isCancelled());
// EXPECT_EQ(0, blocker.getCounter().load());
// the partitions are no longer locked
EXPECT_FALSE(blocker.isCancelledForPartition(partition_id1));
}
TEST(PartitionActionBlocker, TestCancelForSamePartitionTwice)
{
// Partition is unlocked only when all locks are destroyed.
const std::string partition_id = "some partition id";
const std::string partition_id2 = "some other partition id";
// Lock `partition_id` twice, make sure that global lock
// and other partitions are unaffected.
// Check that `partition_id` is unlocked only after both locks are destroyed.
PartitionActionBlocker blocker;
{
auto lock1 = blocker.cancelForPartition(partition_id);
{
auto lock2 = blocker.cancelForPartition(partition_id);
EXPECT_FALSE(blocker.isCancelled());
EXPECT_TRUE(blocker.isCancelledForPartition(partition_id));
EXPECT_FALSE(blocker.isCancelledForPartition(partition_id2));
}
EXPECT_FALSE(blocker.isCancelled());
EXPECT_TRUE(blocker.isCancelledForPartition(partition_id));
EXPECT_FALSE(blocker.isCancelledForPartition(partition_id2));
}
// All locks lifted
EXPECT_FALSE(blocker.isCancelled());
EXPECT_FALSE(blocker.isCancelledForPartition(partition_id));
}
TEST(PartitionActionBlocker, TestCancelAndThenCancelForPartition)
{
// Partition is unlocked only when all locks are destroyed.
const std::string partition_id = "some partition id";
const std::string partition_id2 = "some other partition id";
PartitionActionBlocker blocker;
{
auto global_lock = blocker.cancel();
{
auto lock1 = blocker.cancelForPartition(partition_id);
EXPECT_TRUE(blocker.isCancelled());
EXPECT_TRUE(blocker.isCancelledForPartition(partition_id));
EXPECT_TRUE(blocker.isCancelledForPartition(partition_id2));
}
EXPECT_TRUE(blocker.isCancelled());
EXPECT_TRUE(blocker.isCancelledForPartition(partition_id));
EXPECT_TRUE(blocker.isCancelledForPartition(partition_id2));
}
// All locks lifted
EXPECT_FALSE(blocker.isCancelled());
EXPECT_FALSE(blocker.isCancelledForPartition(partition_id));
EXPECT_FALSE(blocker.isCancelledForPartition(partition_id2));
}
TEST(PartitionActionBlocker, TestCancelForPartitionAndThenCancel)
{
// Partition is unlocked only when all locks are destroyed.
const std::string partition_id = "some partition id";
const std::string partition_id2 = "some other partition id";
PartitionActionBlocker blocker;
{
auto lock1 = blocker.cancelForPartition(partition_id);
{
auto global_lock = blocker.cancel();
EXPECT_TRUE(blocker.isCancelled());
EXPECT_TRUE(blocker.isCancelledForPartition(partition_id));
EXPECT_TRUE(blocker.isCancelledForPartition(partition_id2));
}
// global_lock has been released, so only 1 partition is locked now.
EXPECT_FALSE(blocker.isCancelled());
EXPECT_TRUE(blocker.isCancelledForPartition(partition_id));
EXPECT_FALSE(blocker.isCancelledForPartition(partition_id2));
}
// All locks lifted
EXPECT_FALSE(blocker.isCancelled());
EXPECT_FALSE(blocker.isCancelledForPartition(partition_id));
EXPECT_FALSE(blocker.isCancelledForPartition(partition_id2));
}
TEST(PartitionActionBlocker, TestAutomaticCompactPartitionBlockers)
{
const size_t partitions_count = 100;
const std::string partition_id = "some partition id";
PartitionActionBlocker blocker;
for (size_t i = 0; i < partitions_count; ++i)
{
blocker.cancelForPartition(partition_id + "_" + std::to_string(i));
}
// Automatic cleanup happens once in a while, 100 stale locks should trigger it.
EXPECT_LT(blocker.countPartitionBlockers(), partitions_count);
}
TEST(PartitionActionBlocker, TestCompactPartitionBlockers)
{
const size_t partitions_count = 100;
const std::string partition_id = "some partition id";
PartitionActionBlocker blocker;
for (size_t i = 0; i < partitions_count; ++i)
{
blocker.cancelForPartition(partition_id + "_" + std::to_string(i));
}
// Manually clean up all stale blockers (all blockers in this case).
blocker.compactPartitionBlockers();
EXPECT_EQ(0, blocker.countPartitionBlockers());
}
TEST(PartitionActionBlocker, TestCompactPartitionBlockersDoesntRemoveActiveBlockers)
{
const size_t partitions_count = 100;
const std::string partition_id = "some partition id";
PartitionActionBlocker blocker;
auto lock_foo = blocker.cancelForPartition("FOO");
for (size_t i = 0; i < partitions_count; ++i)
{
blocker.cancelForPartition(partition_id + "_" + std::to_string(i));
}
auto lock_bar = blocker.cancelForPartition("BAR");
EXPECT_LT(2, blocker.countPartitionBlockers());
// Manually clean up all stale blockers (all except those held by lock_foo and lock_bar).
blocker.compactPartitionBlockers();
EXPECT_EQ(2, blocker.countPartitionBlockers());
}
TEST(PartitionActionBlocker, TestFormatDebug)
{
// Do not validate contents, just make sure that something is printed out
const size_t partitions_count = 100;
const std::string partition_id = "some partition id";
PartitionActionBlocker blocker;
auto global_lock = blocker.cancel();
auto lock_foo = blocker.cancelForPartition("FOO");
auto lock_foo2 = blocker.cancelForPartition("FOO");
for (size_t i = 0; i < partitions_count; ++i)
{
blocker.cancelForPartition(partition_id + "_" + std::to_string(i));
}
auto lock_bar = blocker.cancelForPartition("BAR");
EXPECT_NE("", blocker.formatDebug());
}

View File

@ -223,7 +223,7 @@ void StorageAzureConfiguration::fromAST(ASTs & engine_args, ContextPtr context,
{
account_name = fourth_arg;
account_key = checkAndGetLiteralArgument<String>(engine_args[4], "account_key");
auto sixth_arg = checkAndGetLiteralArgument<String>(engine_args[5], "format/account_name");
auto sixth_arg = checkAndGetLiteralArgument<String>(engine_args[5], "format/structure");
if (is_format_arg(sixth_arg))
{
format = sixth_arg;
@ -257,10 +257,10 @@ void StorageAzureConfiguration::fromAST(ASTs & engine_args, ContextPtr context,
}
else if (with_structure && engine_args.size() == 8)
{
auto fourth_arg = checkAndGetLiteralArgument<String>(engine_args[3], "format/account_name");
auto fourth_arg = checkAndGetLiteralArgument<String>(engine_args[3], "account_name");
account_name = fourth_arg;
account_key = checkAndGetLiteralArgument<String>(engine_args[4], "account_key");
auto sixth_arg = checkAndGetLiteralArgument<String>(engine_args[5], "format/account_name");
auto sixth_arg = checkAndGetLiteralArgument<String>(engine_args[5], "format");
if (!is_format_arg(sixth_arg))
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown format {}", sixth_arg);
format = sixth_arg;

View File

@ -131,7 +131,7 @@ std::shared_ptr<StorageObjectStorageSource::IIterator> StorageObjectStorageSourc
else
{
ConfigurationPtr copy_configuration = configuration->clone();
auto filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns);
auto filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns, local_context);
if (filter_dag)
{
auto keys = configuration->getPaths();
@ -142,7 +142,7 @@ std::shared_ptr<StorageObjectStorageSource::IIterator> StorageObjectStorageSourc
VirtualColumnUtils::buildSetsForDAG(*filter_dag, local_context);
auto actions = std::make_shared<ExpressionActions>(std::move(*filter_dag));
VirtualColumnUtils::filterByPathOrFile(keys, paths, actions, virtual_columns);
VirtualColumnUtils::filterByPathOrFile(keys, paths, actions, virtual_columns, local_context);
copy_configuration->setPaths(keys);
}
@ -489,6 +489,7 @@ StorageObjectStorageSource::GlobIterator::GlobIterator(
, virtual_columns(virtual_columns_)
, throw_on_zero_files_match(throw_on_zero_files_match_)
, read_keys(read_keys_)
, local_context(context_)
, file_progress_callback(file_progress_callback_)
{
if (configuration->isNamespaceWithGlobs())
@ -510,7 +511,7 @@ StorageObjectStorageSource::GlobIterator::GlobIterator(
}
recursive = key_with_globs == "/**";
if (auto filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns))
if (auto filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns, local_context))
{
VirtualColumnUtils::buildSetsForDAG(*filter_dag, getContext());
filter_expr = std::make_shared<ExpressionActions>(std::move(*filter_dag));
@ -585,7 +586,7 @@ StorageObjectStorage::ObjectInfoPtr StorageObjectStorageSource::GlobIterator::ne
for (const auto & object_info : new_batch)
paths.push_back(getUniqueStoragePathIdentifier(*configuration, *object_info, false));
VirtualColumnUtils::filterByPathOrFile(new_batch, paths, filter_expr, virtual_columns);
VirtualColumnUtils::filterByPathOrFile(new_batch, paths, filter_expr, virtual_columns, local_context);
LOG_TEST(logger, "Filtered files: {} -> {}", paths.size(), new_batch.size());
}

View File

@ -220,6 +220,7 @@ private:
bool is_finished = false;
bool first_iteration = true;
std::mutex next_mutex;
const ContextPtr local_context;
std::function<void(FileProgress)> file_progress_callback;
};

View File

@ -1141,13 +1141,13 @@ StorageFileSource::FilesIterator::FilesIterator(
{
std::optional<ActionsDAG> filter_dag;
if (!distributed_processing && !archive_info && !files.empty())
filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns);
filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns, context_);
if (filter_dag)
{
VirtualColumnUtils::buildSetsForDAG(*filter_dag, context_);
auto actions = std::make_shared<ExpressionActions>(std::move(*filter_dag));
VirtualColumnUtils::filterByPathOrFile(files, files, actions, virtual_columns);
VirtualColumnUtils::filterByPathOrFile(files, files, actions, virtual_columns, context_);
}
}

View File

@ -1145,7 +1145,7 @@ bool StorageMergeTree::merge(
{
std::unique_lock lock(currently_processing_in_background_mutex);
if (merger_mutator.merges_blocker.isCancelled())
if (merger_mutator.merges_blocker.isCancelledForPartition(partition_id))
throw Exception(ErrorCodes::ABORTED, "Cancelled merging parts");
merge_mutate_entry = selectPartsToMerge(
@ -1413,8 +1413,19 @@ bool StorageMergeTree::scheduleDataProcessingJob(BackgroundJobsAssignee & assign
has_mutations = !current_mutations_by_version.empty();
}
auto isCancelled = [&merges_blocker = merger_mutator.merges_blocker](const MergeMutateSelectedEntryPtr & entry)
{
if (entry->future_part)
return merges_blocker.isCancelledForPartition(entry->future_part->part_info.partition_id);
return merges_blocker.isCancelled();
};
if (merge_entry)
{
if (isCancelled(merge_entry))
return false;
auto task = std::make_shared<MergePlainMergeTreeTask>(*this, metadata_snapshot, /* deduplicate */ false, Names{}, /* cleanup */ false, merge_entry, shared_lock, common_assignee_trigger);
task->setCurrentTransaction(std::move(transaction_for_merge), std::move(txn));
bool scheduled = assignee.scheduleMergeMutateTask(task);
@ -1426,6 +1437,9 @@ bool StorageMergeTree::scheduleDataProcessingJob(BackgroundJobsAssignee & assign
}
if (mutate_entry)
{
if (isCancelled(mutate_entry))
return false;
/// We take a new metadata snapshot here, because mutation commands can be executed only with a metadata snapshot
/// that is at least as fresh as the commands themselves. In an extremely rare case an alter can happen
/// between taking the snapshot above and selecting the commands. That is why we take a new snapshot here.
@ -1645,6 +1659,66 @@ bool StorageMergeTree::optimize(
return true;
}
namespace
{
size_t countOccurrences(const StorageMergeTree::DataParts & haystack, const DataPartsVector & needle)
{
size_t total = 0;
for (const auto & n : needle)
total += haystack.count(n);
return total;
}
auto getNameWithState(const auto & parts)
{
return std::views::transform(parts, [](const auto & p)
{
return p->getNameWithState();
});
}
}
// Same as stopMergesAndWait, but waits only for merges on parts belonging to a certain partition.
ActionLock StorageMergeTree::stopMergesAndWaitForPartition(String partition_id)
{
LOG_DEBUG(log, "StorageMergeTree::stopMergesAndWaitForPartition partition_id: `{}`", partition_id);
/// Stop all merges and prevent new ones from starting, but unlike stopMergesAndWait(), only wait for the merges on a small set of parts to finish.
std::unique_lock lock(currently_processing_in_background_mutex);
/// Asks to complete merges and does not allow them to start.
/// This protects against "revival" of data for a removed partition after completion of merge.
auto merge_blocker = merger_mutator.merges_blocker.cancelForPartition(partition_id);
const DataPartsVector parts_to_wait = getDataPartsVectorInPartitionForInternalUsage(MergeTreeDataPartState::Active, partition_id);
LOG_TRACE(log, "StorageMergeTree::stopMergesAndWaitForPartition parts to wait: {} ({} items)",
fmt::join(getNameWithState(parts_to_wait), ", "), parts_to_wait.size());
LOG_DEBUG(log, "StorageMergeTree::stopMergesAndWaitForPartition all mutating parts: {} ({} items)",
fmt::join(getNameWithState(currently_merging_mutating_parts), ", "), currently_merging_mutating_parts.size());
// TODO allow to stop merges in specific partition only (like it's done in ReplicatedMergeTree)
while (size_t still_merging = countOccurrences(currently_merging_mutating_parts, parts_to_wait))
{
LOG_DEBUG(log, "StorageMergeTree::stopMergesAndWaitForPartition Waiting for currently running merges ({} {} parts are merging right now)",
fmt::join(getNameWithState(currently_merging_mutating_parts), ", "), still_merging);
if (std::cv_status::timeout == currently_processing_in_background_condition.wait_for(
lock, std::chrono::seconds(DBMS_DEFAULT_LOCK_ACQUIRE_TIMEOUT_SEC)))
{
throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, "Timeout while waiting for already running merges");
}
}
LOG_DEBUG(log, "StorageMergeTree::stopMergesAndWaitForPartition done waiting, still merging {} ({} items)",
fmt::join(getNameWithState(currently_merging_mutating_parts), ", "), currently_merging_mutating_parts.size());
return merge_blocker;
}
ActionLock StorageMergeTree::stopMergesAndWait()
{
/// TODO allow to stop merges in specific partition only (like it's done in ReplicatedMergeTree)
@ -2079,10 +2153,14 @@ PartitionCommandsResultInfo StorageMergeTree::attachPartition(
void StorageMergeTree::replacePartitionFrom(const StoragePtr & source_table, const ASTPtr & partition, bool replace, ContextPtr local_context)
{
assertNotReadonly();
LOG_DEBUG(log, "StorageMergeTree::replacePartitionFrom\tsource_table: {}, replace: {}", source_table->getStorageID().getShortName(), replace);
auto lock1 = lockForShare(local_context->getCurrentQueryId(), local_context->getSettingsRef().lock_acquire_timeout);
auto lock2 = source_table->lockForShare(local_context->getCurrentQueryId(), local_context->getSettingsRef().lock_acquire_timeout);
auto merges_blocker = stopMergesAndWait();
const String partition_id = getPartitionIDFromQuery(partition, local_context);
auto merges_blocker = stopMergesAndWaitForPartition(partition_id);
auto source_metadata_snapshot = source_table->getInMemoryMetadataPtr();
auto my_metadata_snapshot = getInMemoryMetadataPtr();
@ -2091,7 +2169,7 @@ void StorageMergeTree::replacePartitionFrom(const StoragePtr & source_table, con
MergeTreeData & src_data = checkStructureAndGetMergeTreeData(source_table, source_metadata_snapshot, my_metadata_snapshot);
DataPartsVector src_parts;
String partition_id;
bool is_all = partition->as<ASTPartition>()->all;
if (is_all)
{

View File

@ -189,6 +189,7 @@ private:
/// If not force, then take merges selector and check that part is not participating in background operations.
MergeTreeDataPartPtr outdatePart(MergeTreeTransaction * txn, const String & part_name, bool force, bool clear_without_timeout = true);
ActionLock stopMergesAndWait();
ActionLock stopMergesAndWaitForPartition(String partition_id);
/// Allocate block number for new mutation, write mutation to disk
/// and into in-memory structures. Wake up merge-mutation task.

View File

@ -227,7 +227,7 @@ public:
std::optional<ActionsDAG> filter_dag;
if (!uris.empty())
filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns);
filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns, context);
if (filter_dag)
{
@ -238,7 +238,7 @@ public:
VirtualColumnUtils::buildSetsForDAG(*filter_dag, context);
auto actions = std::make_shared<ExpressionActions>(std::move(*filter_dag));
VirtualColumnUtils::filterByPathOrFile(uris, paths, actions, virtual_columns);
VirtualColumnUtils::filterByPathOrFile(uris, paths, actions, virtual_columns, context);
}
}

View File

@ -1,5 +1,6 @@
#include <memory>
#include <stack>
#include <unordered_set>
#include <Core/NamesAndTypes.h>
#include <Core/TypeId.h>
@ -46,6 +47,7 @@
#include "Functions/IFunction.h"
#include "Functions/IFunctionAdaptors.h"
#include "Functions/indexHint.h"
#include <IO/ReadBufferFromString.h>
#include <Interpreters/convertFieldToType.h>
#include <Parsers/makeASTForLogicalFunction.h>
#include <Columns/ColumnSet.h>
@ -124,9 +126,18 @@ void filterBlockWithExpression(const ExpressionActionsPtr & actions, Block & blo
}
}
NamesAndTypesList getCommonVirtualsForFileLikeStorage()
{
return {{"_path", std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>())},
{"_file", std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>())},
{"_size", makeNullable(std::make_shared<DataTypeUInt64>())},
{"_time", makeNullable(std::make_shared<DataTypeDateTime>())},
{"_etag", std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>())}};
}
NameSet getVirtualNamesForFileLikeStorage()
{
return {"_path", "_file", "_size", "_time", "_etag"};
return getCommonVirtualsForFileLikeStorage().getNameSet();
}
std::unordered_map<std::string, std::string> parseHivePartitioningKeysAndValues(const String & path)
@ -154,8 +165,10 @@ VirtualColumnsDescription getVirtualsForFileLikeStorage(ColumnsDescription & sto
{
VirtualColumnsDescription desc;
auto add_virtual = [&](const auto & name, const auto & type)
auto add_virtual = [&](const NameAndTypePair & pair)
{
const auto & name = pair.getNameInStorage();
const auto & type = pair.getTypeInStorage();
if (storage_columns.has(name))
{
if (!context->getSettingsRef().use_hive_partitioning)
@ -172,11 +185,8 @@ VirtualColumnsDescription getVirtualsForFileLikeStorage(ColumnsDescription & sto
desc.addEphemeral(name, type, "");
};
add_virtual("_path", std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>()));
add_virtual("_file", std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>()));
add_virtual("_size", makeNullable(std::make_shared<DataTypeUInt64>()));
add_virtual("_time", makeNullable(std::make_shared<DataTypeDateTime>()));
add_virtual("_etag", std::make_shared<DataTypeLowCardinality>(std::make_shared<DataTypeString>()));
for (const auto & item : getCommonVirtualsForFileLikeStorage())
add_virtual(item);
if (context->getSettingsRef().use_hive_partitioning)
{
@ -188,16 +198,16 @@ VirtualColumnsDescription getVirtualsForFileLikeStorage(ColumnsDescription & sto
if (type == nullptr)
type = std::make_shared<DataTypeString>();
if (type->canBeInsideLowCardinality())
add_virtual(item.first, std::make_shared<DataTypeLowCardinality>(type));
add_virtual({item.first, std::make_shared<DataTypeLowCardinality>(type)});
else
add_virtual(item.first, type);
add_virtual({item.first, type});
}
}
return desc;
}
static void addPathAndFileToVirtualColumns(Block & block, const String & path, size_t idx)
static void addPathAndFileToVirtualColumns(Block & block, const String & path, size_t idx, const FormatSettings & format_settings, bool use_hive_partitioning)
{
if (block.has("_path"))
block.getByName("_path").column->assumeMutableRef().insert(path);
@@ -214,18 +224,34 @@ static void addPathAndFileToVirtualColumns(Block & block, const String & path, s
block.getByName("_file").column->assumeMutableRef().insert(file);
}
if (use_hive_partitioning)
{
auto keys_and_values = parseHivePartitioningKeysAndValues(path);
for (const auto & [key, value] : keys_and_values)
{
if (const auto * column = block.findByName(key))
{
ReadBufferFromString buf(value);
column->type->getDefaultSerialization()->deserializeWholeText(column->column->assumeMutableRef(), buf, format_settings);
}
}
}
block.getByName("_idx").column->assumeMutableRef().insert(idx);
}
std::optional<ActionsDAG> createPathAndFileFilterDAG(const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns)
std::optional<ActionsDAG> createPathAndFileFilterDAG(const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns, const ContextPtr & context)
{
if (!predicate || virtual_columns.empty())
return {};
Block block;
NameSet common_virtuals;
if (context->getSettingsRef().use_hive_partitioning)
common_virtuals = getVirtualNamesForFileLikeStorage();
for (const auto & column : virtual_columns)
{
if (column.name == "_file" || column.name == "_path")
if (column.name == "_file" || column.name == "_path" || !common_virtuals.contains(column.name))
block.insert({column.type->createColumn(), column.type, column.name});
}
@@ -233,18 +259,19 @@ std::optional<ActionsDAG> createPathAndFileFilterDAG(const ActionsDAG::Node * pr
return splitFilterDagForAllowedInputs(predicate, &block);
}
ColumnPtr getFilterByPathAndFileIndexes(const std::vector<String> & paths, const ExpressionActionsPtr & actions, const NamesAndTypesList & virtual_columns)
ColumnPtr getFilterByPathAndFileIndexes(const std::vector<String> & paths, const ExpressionActionsPtr & actions, const NamesAndTypesList & virtual_columns, const ContextPtr & context)
{
Block block;
NameSet common_virtuals = getVirtualNamesForFileLikeStorage();
for (const auto & column : virtual_columns)
{
if (column.name == "_file" || column.name == "_path")
if (column.name == "_file" || column.name == "_path" || !common_virtuals.contains(column.name))
block.insert({column.type->createColumn(), column.type, column.name});
}
block.insert({ColumnUInt64::create(), std::make_shared<DataTypeUInt64>(), "_idx"});
for (size_t i = 0; i != paths.size(); ++i)
addPathAndFileToVirtualColumns(block, paths[i], i);
addPathAndFileToVirtualColumns(block, paths[i], i, getFormatSettings(context), context->getSettingsRef().use_hive_partitioning);
filterBlockWithExpression(actions, block);
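For readers unfamiliar with the hive-partitioning convention these changes target, the keys and values come straight from path segments of the form key=value. A minimal standalone sketch of that parsing idea (illustrative only; not the actual parseHivePartitioningKeysAndValues implementation):

#include <iostream>
#include <map>
#include <sstream>
#include <string>

// Collect "key=value" path segments, as in hive-style layouts such as
// ".../partitioning/column0=Elizabeth/sample.parquet".
std::map<std::string, std::string> parseHiveKeysAndValues(const std::string & path)
{
    std::map<std::string, std::string> result;
    std::istringstream stream(path);
    std::string segment;
    while (std::getline(stream, segment, '/'))
    {
        auto pos = segment.find('=');
        if (pos != std::string::npos && pos > 0)
            result[segment.substr(0, pos)] = segment.substr(pos + 1);
    }
    return result;
}

int main()
{
    auto kv = parseHiveKeysAndValues("data_hive/partitioning/column0=Elizabeth/sample.parquet");
    for (const auto & [key, value] : kv)
        std::cout << key << " -> " << value << "\n";   // column0 -> Elizabeth
}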

View File

@@ -75,14 +75,14 @@ VirtualColumnsDescription getVirtualsForFileLikeStorage(
const std::string & sample_path = "",
std::optional<FormatSettings> format_settings_ = std::nullopt);
std::optional<ActionsDAG> createPathAndFileFilterDAG(const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns);
std::optional<ActionsDAG> createPathAndFileFilterDAG(const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns, const ContextPtr & context);
ColumnPtr getFilterByPathAndFileIndexes(const std::vector<String> & paths, const ExpressionActionsPtr & actions, const NamesAndTypesList & virtual_columns);
ColumnPtr getFilterByPathAndFileIndexes(const std::vector<String> & paths, const ExpressionActionsPtr & actions, const NamesAndTypesList & virtual_columns, const ContextPtr & context);
template <typename T>
void filterByPathOrFile(std::vector<T> & sources, const std::vector<String> & paths, const ExpressionActionsPtr & actions, const NamesAndTypesList & virtual_columns)
void filterByPathOrFile(std::vector<T> & sources, const std::vector<String> & paths, const ExpressionActionsPtr & actions, const NamesAndTypesList & virtual_columns, const ContextPtr & context)
{
auto indexes_column = getFilterByPathAndFileIndexes(paths, actions, virtual_columns);
auto indexes_column = getFilterByPathAndFileIndexes(paths, actions, virtual_columns, context);
const auto & indexes = typeid_cast<const ColumnUInt64 &>(*indexes_column).getData();
if (indexes.size() == sources.size())
return;

View File

@@ -1,5 +1,6 @@
import pytest
import random, string
import re
from helpers.cluster import ClickHouseCluster
from helpers.test_tools import TSV
@@ -336,6 +337,10 @@ def test_create_database():
def test_table_functions():
password = new_password()
azure_conn_string = cluster.env_variables["AZURITE_CONNECTION_STRING"]
account_key_pattern = re.compile("AccountKey=.*?(;|$)")
masked_azure_conn_string = re.sub(
account_key_pattern, "AccountKey=[HIDDEN]\\1", azure_conn_string
)
azure_storage_account_url = cluster.env_variables["AZURITE_STORAGE_ACCOUNT_URL"]
azure_account_name = "devstoreaccount1"
azure_account_key = "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw=="
@@ -467,23 +472,23 @@ def test_table_functions():
"CREATE TABLE tablefunc30 (x int) AS s3('http://minio1:9001/root/data/test9.csv.gz', 'NOSIGN', 'CSV')",
"CREATE TABLE tablefunc31 (`x` int) AS s3('http://minio1:9001/root/data/test10.csv.gz', 'minio', '[HIDDEN]')",
"CREATE TABLE tablefunc32 (`x` int) AS deltaLake('http://minio1:9001/root/data/test11.csv.gz', 'minio', '[HIDDEN]')",
f"CREATE TABLE tablefunc33 (x int) AS azureBlobStorage('{azure_conn_string}', 'cont', 'test_simple.csv', 'CSV')",
f"CREATE TABLE tablefunc34 (x int) AS azureBlobStorage('{azure_conn_string}', 'cont', 'test_simple_1.csv', 'CSV', 'none')",
f"CREATE TABLE tablefunc35 (x int) AS azureBlobStorage('{azure_conn_string}', 'cont', 'test_simple_2.csv', 'CSV', 'none', 'auto')",
f"CREATE TABLE tablefunc33 (`x` int) AS azureBlobStorage('{masked_azure_conn_string}', 'cont', 'test_simple.csv', 'CSV')",
f"CREATE TABLE tablefunc34 (`x` int) AS azureBlobStorage('{masked_azure_conn_string}', 'cont', 'test_simple_1.csv', 'CSV', 'none')",
f"CREATE TABLE tablefunc35 (`x` int) AS azureBlobStorage('{masked_azure_conn_string}', 'cont', 'test_simple_2.csv', 'CSV', 'none', 'auto')",
f"CREATE TABLE tablefunc36 (`x` int) AS azureBlobStorage('{azure_storage_account_url}', 'cont', 'test_simple_3.csv', '{azure_account_name}', '[HIDDEN]')",
f"CREATE TABLE tablefunc37 (`x` int) AS azureBlobStorage('{azure_storage_account_url}', 'cont', 'test_simple_4.csv', '{azure_account_name}', '[HIDDEN]', 'CSV')",
f"CREATE TABLE tablefunc38 (`x` int) AS azureBlobStorage('{azure_storage_account_url}', 'cont', 'test_simple_5.csv', '{azure_account_name}', '[HIDDEN]', 'CSV', 'none')",
f"CREATE TABLE tablefunc39 (`x` int) AS azureBlobStorage('{azure_storage_account_url}', 'cont', 'test_simple_6.csv', '{azure_account_name}', '[HIDDEN]', 'CSV', 'none', 'auto')",
f"CREATE TABLE tablefunc40 (x int) AS azureBlobStorage(named_collection_2, connection_string = '{azure_conn_string}', container = 'cont', blob_path = 'test_simple_7.csv', format = 'CSV')",
f"CREATE TABLE tablefunc40 (`x` int) AS azureBlobStorage(named_collection_2, connection_string = '{masked_azure_conn_string}', container = 'cont', blob_path = 'test_simple_7.csv', format = 'CSV')",
f"CREATE TABLE tablefunc41 (`x` int) AS azureBlobStorage(named_collection_2, storage_account_url = '{azure_storage_account_url}', container = 'cont', blob_path = 'test_simple_8.csv', account_name = '{azure_account_name}', account_key = '[HIDDEN]')",
f"CREATE TABLE tablefunc42 (x int) AS azureBlobStorageCluster('test_shard_localhost', '{azure_conn_string}', 'cont', 'test_simple_9.csv', 'CSV')",
f"CREATE TABLE tablefunc43 (x int) AS azureBlobStorageCluster('test_shard_localhost', '{azure_conn_string}', 'cont', 'test_simple_10.csv', 'CSV', 'none')",
f"CREATE TABLE tablefunc44 (x int) AS azureBlobStorageCluster('test_shard_localhost', '{azure_conn_string}', 'cont', 'test_simple_11.csv', 'CSV', 'none', 'auto')",
f"CREATE TABLE tablefunc42 (`x` int) AS azureBlobStorageCluster('test_shard_localhost', '{masked_azure_conn_string}', 'cont', 'test_simple_9.csv', 'CSV')",
f"CREATE TABLE tablefunc43 (`x` int) AS azureBlobStorageCluster('test_shard_localhost', '{masked_azure_conn_string}', 'cont', 'test_simple_10.csv', 'CSV', 'none')",
f"CREATE TABLE tablefunc44 (`x` int) AS azureBlobStorageCluster('test_shard_localhost', '{masked_azure_conn_string}', 'cont', 'test_simple_11.csv', 'CSV', 'none', 'auto')",
f"CREATE TABLE tablefunc45 (`x` int) AS azureBlobStorageCluster('test_shard_localhost', '{azure_storage_account_url}', 'cont', 'test_simple_12.csv', '{azure_account_name}', '[HIDDEN]')",
f"CREATE TABLE tablefunc46 (`x` int) AS azureBlobStorageCluster('test_shard_localhost', '{azure_storage_account_url}', 'cont', 'test_simple_13.csv', '{azure_account_name}', '[HIDDEN]', 'CSV')",
f"CREATE TABLE tablefunc47 (`x` int) AS azureBlobStorageCluster('test_shard_localhost', '{azure_storage_account_url}', 'cont', 'test_simple_14.csv', '{azure_account_name}', '[HIDDEN]', 'CSV', 'none')",
f"CREATE TABLE tablefunc48 (`x` int) AS azureBlobStorageCluster('test_shard_localhost', '{azure_storage_account_url}', 'cont', 'test_simple_15.csv', '{azure_account_name}', '[HIDDEN]', 'CSV', 'none', 'auto')",
f"CREATE TABLE tablefunc49 (x int) AS azureBlobStorageCluster('test_shard_localhost', named_collection_2, connection_string = '{azure_conn_string}', container = 'cont', blob_path = 'test_simple_16.csv', format = 'CSV')",
f"CREATE TABLE tablefunc49 (`x` int) AS azureBlobStorageCluster('test_shard_localhost', named_collection_2, connection_string = '{masked_azure_conn_string}', container = 'cont', blob_path = 'test_simple_16.csv', format = 'CSV')",
f"CREATE TABLE tablefunc50 (`x` int) AS azureBlobStorageCluster('test_shard_localhost', named_collection_2, storage_account_url = '{azure_storage_account_url}', container = 'cont', blob_path = 'test_simple_17.csv', account_name = '{azure_account_name}', account_key = '[HIDDEN]')",
"CREATE TABLE tablefunc51 (`x` int) AS iceberg('http://minio1:9001/root/data/test11.csv.gz', 'minio', '[HIDDEN]')",
],
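The masking behaviour the updated expectations encode is straightforward: everything between "AccountKey=" and the next ';' (or the end of the string) is replaced with [HIDDEN], and the separator is kept. A hedged standalone C++ sketch of the same rule (the test itself does this with Python's re.sub, as shown above; the connection string below is a made-up example):

#include <iostream>
#include <regex>
#include <string>

int main()
{
    // Example connection string (illustrative values only).
    std::string conn = "DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;"
                       "AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzF;BlobEndpoint=http://azurite1:10000/devstoreaccount1;";

    // Replace the AccountKey value, keeping the trailing separator (';' or end of string).
    std::regex account_key("AccountKey=[^;]*(;|$)");
    std::cout << std::regex_replace(conn, account_key, "AccountKey=[HIDDEN]$1") << "\n";
    // -> ...;AccountKey=[HIDDEN];BlobEndpoint=...
}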

View File

@@ -0,0 +1,33 @@
-- https://github.com/ClickHouse/ClickHouse/issues/45328
-- Check that replacing one partition on a table with `ALTER TABLE REPLACE PARTITION`
-- doesn't wait for mutations on other partitions.
-- { echo }
DROP TABLE IF EXISTS t1;
DROP TABLE IF EXISTS t2;
CREATE TABLE t1
(
`p` UInt8,
`i` UInt64
)
ENGINE = MergeTree
PARTITION BY p
ORDER BY tuple();
INSERT INTO t1 VALUES (1, 1), (2, 2);
CREATE TABLE t2 AS t1;
INSERT INTO t2 VALUES (2, 2000);
-- mutation that is supposed to be running in the background while REPLACE is performed.
-- sleep(1) is arbitrary here; we just need the mutation to take long enough to be noticeable.
ALTER TABLE t1 UPDATE i = if(sleep(1), 0, 9000) IN PARTITION id '1' WHERE p == 1;
-- check that mutation is started
SELECT not(is_done) as is_running FROM system.mutations WHERE database==currentDatabase() AND table=='t1';
1
ALTER TABLE t1 REPLACE PARTITION id '2' FROM t2;
-- check that mutation is still running
SELECT is_done, latest_fail_reason, parts_to_do FROM system.mutations WHERE database==currentDatabase() AND table=='t1';
0 1
-- Expecting that mutation hasn't finished yet (since ALTER TABLE .. REPLACE wasn't waiting for it),
-- so row with `p == 1` still has the old value of `i == 1`.
SELECT * FROM t1 ORDER BY p;
1 1
2 2000

View File

@@ -0,0 +1,39 @@
-- https://github.com/ClickHouse/ClickHouse/issues/45328
-- Check that replacing one partition on a table with `ALTER TABLE REPLACE PARTITION`
-- doesn't wait for mutations on other partitions.
-- { echo }
DROP TABLE IF EXISTS t1;
DROP TABLE IF EXISTS t2;
CREATE TABLE t1
(
`p` UInt8,
`i` UInt64
)
ENGINE = MergeTree
PARTITION BY p
ORDER BY tuple();
INSERT INTO t1 VALUES (1, 1), (2, 2);
CREATE TABLE t2 AS t1;
INSERT INTO t2 VALUES (2, 2000);
-- mutation that is supposed to be running in the background while REPLACE is performed.
-- sleep(1) is arbitrary here; we just need the mutation to take long enough to be noticeable.
ALTER TABLE t1 UPDATE i = if(sleep(1), 0, 9000) IN PARTITION id '1' WHERE p == 1;
-- check that mutation is started
SELECT not(is_done) as is_running FROM system.mutations WHERE database==currentDatabase() AND table=='t1';
ALTER TABLE t1 REPLACE PARTITION id '2' FROM t2;
-- check that mutation is still running
SELECT is_done, latest_fail_reason, parts_to_do FROM system.mutations WHERE database==currentDatabase() AND table=='t1';
-- Expecting that mutation hasn't finished yet (since ALTER TABLE .. REPLACE wasn't waiting for it),
-- so row with `p == 1` still has the old value of `i == 1`.
SELECT * FROM t1 ORDER BY p;
DROP TABLE IF EXISTS t1;
DROP TABLE IF EXISTS t2;

View File

@@ -1,21 +1,31 @@
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
%d: 123
%d: -123
%d: 0
%d: 9223372036854775807
%i: 123
%u: 123
%o: 173
%x: 7b
%X: 7B
%f: 0.000000
%f: 123.456000
%f: -123.456000
%F: 123.456000
%e: 1.234560e+02
%E: 1.234560E+02
%g: 123.456
%G: 123.456
%a: 0x1.edd2f1a9fbe77p+6
%A: 0X1.EDD2F1A9FBE77P+6
%s: abc
┌─printf('%%s: %s', '\n\t')─┐
1. │ %s:
└───────────────────────────┘
%s:
%%: %
%.5d: 00123
%.2f: 123.46
%.2e: 1.23e+02
%.2g: 1.2e+02
%.2s: ab

View File

@@ -1,39 +1,47 @@
-- Testing integer formats
select printf('%%d: %d', 123) = '%d: 123';
select printf('%%i: %i', 123) = '%i: 123';
select printf('%%u: %u', 123) = '%u: 123';
select printf('%%o: %o', 123) = '%o: 173';
select printf('%%x: %x', 123) = '%x: 7b';
select printf('%%X: %X', 123) = '%X: 7B';
select printf('%%d: %d', 123);
select printf('%%d: %d', -123);
select printf('%%d: %d', 0);
select printf('%%d: %d', 9223372036854775807);
select printf('%%i: %i', 123);
select printf('%%u: %u', 123);
select printf('%%o: %o', 123);
select printf('%%x: %x', 123);
select printf('%%X: %X', 123);
-- Testing floating point formats
select printf('%%f: %f', 123.456) = '%f: 123.456000';
select printf('%%F: %F', 123.456) = '%F: 123.456000';
select printf('%%e: %e', 123.456) = '%e: 1.234560e+02';
select printf('%%E: %E', 123.456) = '%E: 1.234560E+02';
select printf('%%g: %g', 123.456) = '%g: 123.456';
select printf('%%G: %G', 123.456) = '%G: 123.456';
select printf('%%a: %a', 123.456) = '%a: 0x1.edd2f1a9fbe77p+6';
select printf('%%A: %A', 123.456) = '%A: 0X1.EDD2F1A9FBE77P+6';
select printf('%%f: %f', 0.0);
select printf('%%f: %f', 123.456);
select printf('%%f: %f', -123.456);
select printf('%%F: %F', 123.456);
select printf('%%e: %e', 123.456);
select printf('%%E: %E', 123.456);
select printf('%%g: %g', 123.456);
select printf('%%G: %G', 123.456);
select printf('%%a: %a', 123.456);
select printf('%%A: %A', 123.456);
-- Testing character formats
select printf('%%s: %s', 'abc') = '%s: abc';
select printf('%%s: %s', 'abc');
SELECT printf('%%s: %s', '\n\t') FORMAT PrettyCompact;
select printf('%%s: %s', '');
-- Testing the %% specifier
select printf('%%%%: %%') = '%%: %';
select printf('%%%%: %%');
-- Testing integer formats with precision
select printf('%%.5d: %.5d', 123) = '%.5d: 00123';
select printf('%%.5d: %.5d', 123);
-- Testing floating point formats with precision
select printf('%%.2f: %.2f', 123.456) = '%.2f: 123.46';
select printf('%%.2e: %.2e', 123.456) = '%.2e: 1.23e+02';
select printf('%%.2g: %.2g', 123.456) = '%.2g: 1.2e+02';
select printf('%%.2f: %.2f', 123.456);
select printf('%%.2e: %.2e', 123.456);
select printf('%%.2g: %.2g', 123.456);
-- Testing character formats with precision
select printf('%%.2s: %.2s', 'abc') = '%.2s: ab';
select printf('%%.2s: %.2s', 'abc');
select printf('%%X: %X', 123.123); -- { serverError BAD_ARGUMENTS }
select printf('%%A: %A', 'abc'); -- { serverError BAD_ARGUMENTS }
select printf('%%s: %s', 100); -- { serverError BAD_ARGUMENTS }
select printf('%%n: %n', 100); -- { serverError BAD_ARGUMENTS }
select printf('%%f: %f', 0); -- { serverError BAD_ARGUMENTS }
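The values in the rewritten reference above follow standard C printf semantics, so they are easy to sanity-check outside ClickHouse. A quick standalone check (illustration only; ClickHouse implements its printf() function separately, it just follows the same format specifiers):

#include <cstdio>

int main()
{
    std::printf("%%.5d: %.5d\n", 123);       // %.5d: 00123
    std::printf("%%.2f: %.2f\n", 123.456);   // %.2f: 123.46
    std::printf("%%.2e: %.2e\n", 123.456);   // %.2e: 1.23e+02
    std::printf("%%.2g: %.2g\n", 123.456);   // %.2g: 1.2e+02
    std::printf("%%a: %a\n", 123.456);       // %a: 0x1.edd2f1a9fbe77p+6
    std::printf("%%.2s: %.2s\n", "abc");     // %.2s: ab
}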

View File

@@ -33,8 +33,8 @@ Cross Elizabeth
[1,2,3] 42.42
Array(Int64) LowCardinality(Float64)
101
2070
2070
2071
2071
b
1
1

View File

@@ -0,0 +1,6 @@
1
1
1
1
1
1

View File

@@ -0,0 +1,72 @@
#!/usr/bin/env bash
# Tags: no-fasttest
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh
DATA_DIR=$USER_FILES_PATH/$CLICKHOUSE_TEST_UNIQUE_NAME
mkdir -p $DATA_DIR
cp -r $CURDIR/data_hive/ $DATA_DIR
$CLICKHOUSE_CLIENT --query_id="test_03231_1_$CLICKHOUSE_TEST_UNIQUE_NAME" --query "
SELECT countDistinct(_path) FROM file('$DATA_DIR/data_hive/partitioning/column0=*/sample.parquet') WHERE column0 = 'Elizabeth' SETTINGS use_hive_partitioning=1, optimize_count_from_files=0;
"
$CLICKHOUSE_CLIENT --query "
SYSTEM FLUSH LOGS;
"
for _ in {1..5}; do
count=$( $CLICKHOUSE_CLIENT --query "
SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log
WHERE query_id='test_03231_1_$CLICKHOUSE_TEST_UNIQUE_NAME' AND
current_database = currentDatabase() and type='QueryFinish';" )
if [[ "$count" == "1" ]]; then
echo "1"
break
fi
sleep 1
done
$CLICKHOUSE_CLIENT --query_id="test_03231_2_$CLICKHOUSE_TEST_UNIQUE_NAME" --query "
SELECT countDistinct(_path) FROM file('$DATA_DIR/data_hive/partitioning/identifier=*/email.csv') WHERE identifier = 2070 SETTINGS use_hive_partitioning=1, optimize_count_from_files=0;
"
$CLICKHOUSE_CLIENT --query "
SYSTEM FLUSH LOGS;
"
for _ in {1..5}; do
count=$( $CLICKHOUSE_CLIENT --query "
SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log
WHERE query_id='test_03231_2_$CLICKHOUSE_TEST_UNIQUE_NAME' AND
current_database = currentDatabase() and type='QueryFinish';" )
if [[ "$count" == "1" ]]; then
echo "1"
break
fi
sleep 1
done
$CLICKHOUSE_CLIENT --query_id="test_03231_3_$CLICKHOUSE_TEST_UNIQUE_NAME" --query "
SELECT countDistinct(_path) FROM file('$DATA_DIR/data_hive/partitioning/array=*/sample.parquet') WHERE array = [1,2,3] SETTINGS use_hive_partitioning=1, optimize_count_from_files=0;
"
$CLICKHOUSE_CLIENT --query "
SYSTEM FLUSH LOGS;
"
for _ in {1..5}; do
count=$( $CLICKHOUSE_CLIENT --query "
SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log
WHERE query_id='test_03231_3_$CLICKHOUSE_TEST_UNIQUE_NAME' AND
current_database = currentDatabase() and type='QueryFinish';" )
if [[ "$count" == "1" ]]; then
echo "1"
break
fi
sleep 1
done
rm -rf $DATA_DIR

View File

@@ -1 +1 @@
data_hive/partitioning/column0=Elizabeth/sample.parquet
data_hive/partitioning/non_existing_column=Elizabeth/sample.parquet

View File

@@ -5,4 +5,4 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh
$CLICKHOUSE_LOCAL -q "SELECT substring(_path, position(_path, 'data_hive')) FROM file('$CURDIR/data_hive/partitioning/column0=*/sample.parquet') LIMIT 1;"
$CLICKHOUSE_LOCAL -q "SELECT substring(_path, position(_path, 'data_hive')) FROM file('$CURDIR/data_hive/partitioning/non_existing_column=*/sample.parquet') LIMIT 1;"

View File

@@ -0,0 +1,5 @@
_login_email,_identifier,_first_name,_last_name
laura@example.com,2070,Laura,Grey
craig@example.com,4081,Craig,Johnson
mary@example.com,9346,Mary,Jenkins
jamie@example.com,5079,Jamie,Smith

View File

@@ -1114,6 +1114,7 @@ void Runner::runBenchmarkFromLog()
else
{
request_from_log->connection = get_zookeeper_connection(request_from_log->session_id);
request_from_log->executor_id %= concurrency;
push_request(std::move(*request_from_log));
}
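The one-line change above folds executor ids taken from the replayed log back into the configured concurrency range; a tiny illustration of the modulo mapping (the values are made up):

#include <cstddef>
#include <iostream>

int main()
{
    const std::size_t concurrency = 8;
    for (std::size_t executor_id : {3, 8, 21})
        std::cout << executor_id << " -> " << executor_id % concurrency << "\n";   // 3, 0, 5
}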

View File

@@ -28,13 +28,13 @@ void dumpMachine(std::shared_ptr<KeeperStateMachine<DB::KeeperMemoryStorage>> ma
keys.pop();
std::cout << key << "\n";
auto value = storage.container.getValue(key);
std::cout << "\tStat: {version: " << value.version <<
", mtime: " << value.mtime <<
", emphemeralOwner: " << value.ephemeralOwner() <<
", czxid: " << value.czxid <<
", mzxid: " << value.mzxid <<
", numChildren: " << value.numChildren() <<
", dataLength: " << value.data_size <<
std::cout << "\tStat: {version: " << value.stats.version <<
", mtime: " << value.stats.mtime <<
", emphemeralOwner: " << value.stats.ephemeralOwner() <<
", czxid: " << value.stats.czxid <<
", mzxid: " << value.stats.mzxid <<
", numChildren: " << value.stats.numChildren() <<
", dataLength: " << value.stats.data_size <<
"}" << std::endl;
std::cout << "\tData: " << storage.container.getValue(key).getData() << std::endl;
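The dumper change above follows a refactor in which per-node Stat fields now live in a nested stats member rather than directly on the node. A rough standalone sketch of that shape (the names are illustrative, not the actual Keeper classes):

#include <cstdint>
#include <iostream>
#include <string>

struct NodeStats
{
    int32_t version = 0;
    int64_t mtime = 0;
    int64_t czxid = 0;
    int64_t mzxid = 0;
    uint64_t data_size = 0;
};

struct Node
{
    NodeStats stats;
    std::string data;
};

void dumpNode(const Node & node)
{
    std::cout << "\tStat: {version: " << node.stats.version
              << ", mtime: " << node.stats.mtime
              << ", czxid: " << node.stats.czxid
              << ", mzxid: " << node.stats.mzxid
              << ", dataLength: " << node.stats.data_size << "}\n";
    std::cout << "\tData: " << node.data << "\n";
}

int main()
{
    dumpNode(Node{{1, 1726600000, 42, 43, 5}, "hello"});
}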