From 7803ecaee54558e91f696793c477fc67e8554c5d Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Tue, 5 Apr 2022 23:10:23 +0200 Subject: [PATCH 01/55] Fix performance tests --- tests/performance/group_array_sorted.xml | 30 ++++++----------------- tests/performance/joins_in_memory_pmj.xml | 28 +++++++-------------- 2 files changed, 16 insertions(+), 42 deletions(-) diff --git a/tests/performance/group_array_sorted.xml b/tests/performance/group_array_sorted.xml index 7d26459b4b1..d0092e38659 100644 --- a/tests/performance/group_array_sorted.xml +++ b/tests/performance/group_array_sorted.xml @@ -1,27 +1,11 @@ - - 10 - - - - items - - 1000 - 100000 - 10000000 - - - + CREATE TABLE test (`id` UInt64, `value` UInt64, `text` String ) ENGINE = Memory + INSERT INTO test SELECT number as id, rand64() as value, toString(number) as text FROM system.numbers_mt LIMIT 10000000 + + SELECT groupArraySorted(100000)(id, value) FROM test + SELECT groupArraySorted(100000)(text, value) FROM test + SELECT groupArraySorted(100000)((id, text), value) FROM test + SELECT groupArraySorted(100000)(text) FROM test - CREATE TABLE test ( `id` UInt64, `value` UInt64, `text` String ) ENGINE = Memory - INSERT INTO test SELECT number as id, rand64() as value, toString(number) as text FROM numbers({items}) - SELECT groupArraySorted(10)(id, value) FROM test - SELECT groupArraySorted(10)(text, value) FROM test - SELECT groupArraySorted(10)((id, text), value) FROM test - SELECT groupArraySorted(10)(text) FROM test - SELECT groupArraySorted(10000)(id, value) FROM test - SELECT groupArraySorted(10000)(text, value) FROM test - SELECT groupArraySorted(10000)((id, text), value) FROM test - SELECT groupArraySorted(10000)(text) FROM test DROP TABLE IF EXISTS test diff --git a/tests/performance/joins_in_memory_pmj.xml b/tests/performance/joins_in_memory_pmj.xml index 6eaf0bdd81a..9ebc48585f4 100644 --- a/tests/performance/joins_in_memory_pmj.xml +++ b/tests/performance/joins_in_memory_pmj.xml @@ -6,31 +6,21 @@ 0 - INSERT INTO ints SELECT number AS i64, i64 AS i32, i64 AS i16, i64 AS i8 FROM numbers(10000) settings query_plan_filter_push_down = 0 - INSERT INTO ints SELECT 10000 + number % 1000 AS i64, i64 AS i32, i64 AS i16, i64 AS i8 FROM numbers(10000) settings query_plan_filter_push_down = 0 - INSERT INTO ints SELECT 20000 + number % 100 AS i64, i64 AS i32, i64 AS i16, i64 AS i8 FROM numbers(10000) settings query_plan_filter_push_down = 0 - INSERT INTO ints SELECT 30000 + number % 10 AS i64, i64 AS i32, i64 AS i16, i64 AS i8 FROM numbers(10000) settings query_plan_filter_push_down = 0 - INSERT INTO ints SELECT 40000 + number % 1 AS i64, i64 AS i32, i64 AS i16, i64 AS i8 FROM numbers(10000) settings query_plan_filter_push_down = 0 - - SELECT COUNT() FROM ints l ANY LEFT JOIN ints r USING i64 WHERE i32 = 20042 settings query_plan_filter_push_down = 0 - SELECT COUNT() FROM ints l ANY LEFT JOIN ints r USING i64,i32,i16,i8 WHERE i32 = 20042 settings query_plan_filter_push_down = 0 - SELECT COUNT() FROM ints l ANY LEFT JOIN ints r ON l.i64 = r.i64 WHERE i32 = 20042 settings query_plan_filter_push_down = 0 - SELECT COUNT() FROM ints l ANY LEFT JOIN ints r USING i64 WHERE i32 IN(42, 10042, 20042, 30042, 40042) settings query_plan_filter_push_down = 0 - - SELECT COUNT() FROM ints l INNER JOIN ints r USING i64 WHERE i32 = 20042 settings query_plan_filter_push_down = 0 - SELECT COUNT() FROM ints l INNER JOIN ints r USING i64,i32,i16,i8 WHERE i32 = 20042 settings query_plan_filter_push_down = 0 - SELECT COUNT() FROM ints l INNER JOIN ints r ON l.i64 = r.i64 WHERE i32 = 20042 settings query_plan_filter_push_down = 0 - SELECT COUNT() FROM ints l INNER JOIN ints r USING i64 WHERE i32 IN(42, 10042, 20042, 30042, 40042) settings query_plan_filter_push_down = 0 + INSERT INTO ints SELECT number AS i64, i64 AS i32, i64 AS i16, i64 AS i8 FROM numbers(1000000) settings query_plan_filter_push_down = 0 + INSERT INTO ints SELECT 10000 + number % 1000 AS i64, i64 AS i32, i64 AS i16, i64 AS i8 FROM numbers(1000000) settings query_plan_filter_push_down = 0 + INSERT INTO ints SELECT 20000 + number % 100 AS i64, i64 AS i32, i64 AS i16, i64 AS i8 FROM numbers(1000000) settings query_plan_filter_push_down = 0 + INSERT INTO ints SELECT 30000 + number % 10 AS i64, i64 AS i32, i64 AS i16, i64 AS i8 FROM numbers(1000000) settings query_plan_filter_push_down = 0 + INSERT INTO ints SELECT 40000 + number % 1 AS i64, i64 AS i32, i64 AS i16, i64 AS i8 FROM numbers(1000000) settings query_plan_filter_push_down = 0 SELECT COUNT() FROM ints l LEFT JOIN ints r USING i64 WHERE i32 = 20042 settings query_plan_filter_push_down = 0 SELECT COUNT() FROM ints l LEFT JOIN ints r USING i64,i32,i16,i8 WHERE i32 = 20042 settings query_plan_filter_push_down = 0 SELECT COUNT() FROM ints l LEFT JOIN ints r ON l.i64 = r.i64 WHERE i32 = 20042 settings query_plan_filter_push_down = 0 SELECT COUNT() FROM ints l LEFT JOIN ints r USING i64 WHERE i32 IN(42, 10042, 20042, 30042, 40042) settings query_plan_filter_push_down = 0 - SELECT COUNT() FROM ints l ANY LEFT JOIN ints r USING i64 WHERE i32 = 20042 SETTINGS partial_merge_join_optimizations = 0, query_plan_filter_push_down = 0 - SELECT COUNT() FROM ints l ANY LEFT JOIN ints r USING i64,i32,i16,i8 WHERE i32 = 20042 SETTINGS partial_merge_join_optimizations = 0, query_plan_filter_push_down = 0 - SELECT COUNT() FROM ints l ANY LEFT JOIN ints r ON l.i64 = r.i64 WHERE i32 = 20042 SETTINGS partial_merge_join_optimizations = 0, query_plan_filter_push_down = 0 - SELECT COUNT() FROM ints l ANY LEFT JOIN ints r USING i64 WHERE i32 IN(42, 10042, 20042, 30042, 40042) SETTINGS partial_merge_join_optimizations = 0, query_plan_filter_push_down = 0 + SELECT COUNT() FROM ints l ANY LEFT JOIN ints r USING i64 WHERE i32 = 20042 SETTINGS partial_merge_join_optimizations = 0, query_plan_filter_push_down = 0 + SELECT COUNT() FROM ints l ANY LEFT JOIN ints r USING i64,i32,i16,i8 WHERE i32 = 20042 SETTINGS partial_merge_join_optimizations = 0, query_plan_filter_push_down = 0 + SELECT COUNT() FROM ints l ANY LEFT JOIN ints r ON l.i64 = r.i64 WHERE i32 = 20042 SETTINGS partial_merge_join_optimizations = 0, query_plan_filter_push_down = 0 + SELECT COUNT() FROM ints l ANY LEFT JOIN ints r USING i64 WHERE i32 IN(42, 10042, 20042, 30042, 40042) SETTINGS partial_merge_join_optimizations = 0, query_plan_filter_push_down = 0 SELECT COUNT() FROM ints l INNER JOIN ints r USING i64 WHERE i32 = 20042 SETTINGS partial_merge_join_optimizations = 0, query_plan_filter_push_down = 0 SELECT COUNT() FROM ints l INNER JOIN ints r USING i64,i32,i16,i8 WHERE i32 = 20042 SETTINGS partial_merge_join_optimizations = 0, query_plan_filter_push_down = 0 From 8cf67ed4c0a45b08f01f1860189d2ba8af0b22f3 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Tue, 12 Apr 2022 15:22:00 +0200 Subject: [PATCH 02/55] Fix constant_column_search performance tests --- tests/performance/constant_column_search.xml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/performance/constant_column_search.xml b/tests/performance/constant_column_search.xml index cb76fd4cefb..993d7c6483f 100644 --- a/tests/performance/constant_column_search.xml +++ b/tests/performance/constant_column_search.xml @@ -10,10 +10,10 @@ - - - - + 0), count(position(URL, 'google')) FROM hits_100m_single]]> + + + From 9b332c1e31fa81865c59c70de0d75a86d2a83fa7 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Tue, 12 Apr 2022 17:06:21 +0200 Subject: [PATCH 03/55] Fix early_constant_folding performance test --- tests/performance/early_constant_folding.xml | 9 --------- .../0_stateless/02267_early_constant_folding.reference | 8 ++++++++ .../queries/0_stateless/02267_early_constant_folding.sql | 1 + 3 files changed, 9 insertions(+), 9 deletions(-) delete mode 100644 tests/performance/early_constant_folding.xml create mode 100644 tests/queries/0_stateless/02267_early_constant_folding.reference create mode 100644 tests/queries/0_stateless/02267_early_constant_folding.sql diff --git a/tests/performance/early_constant_folding.xml b/tests/performance/early_constant_folding.xml deleted file mode 100644 index 9def3efd891..00000000000 --- a/tests/performance/early_constant_folding.xml +++ /dev/null @@ -1,9 +0,0 @@ - - - hits_100m_single - - 1 - - - SELECT count(JavaEnable) FROM hits_100m_single WHERE WatchID = 1 OR Title = 'next' OR URL = 'prev' OR OriginalURL = '???' OR 1 - diff --git a/tests/queries/0_stateless/02267_early_constant_folding.reference b/tests/queries/0_stateless/02267_early_constant_folding.reference new file mode 100644 index 00000000000..da2d9a690ee --- /dev/null +++ b/tests/queries/0_stateless/02267_early_constant_folding.reference @@ -0,0 +1,8 @@ +(Expression) +ExpressionTransform + (SettingQuotaAndLimits) + (ReadFromStorage) + AggregatingTransform + StrictResize + ExpressionTransform + SourceFromSingleChunk 0 → 1 diff --git a/tests/queries/0_stateless/02267_early_constant_folding.sql b/tests/queries/0_stateless/02267_early_constant_folding.sql new file mode 100644 index 00000000000..2835a0a24ea --- /dev/null +++ b/tests/queries/0_stateless/02267_early_constant_folding.sql @@ -0,0 +1 @@ +EXPLAIN PIPELINE SELECT count(JavaEnable) FROM hits_100m_single WHERE WatchID = 1 OR Title = 'next' OR URL = 'prev' OR OriginalURL = '???' OR 1 From 3d36698f56a9c53613e7ea5da54a097bd2b2d8ac Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Tue, 12 Apr 2022 17:06:38 +0200 Subject: [PATCH 04/55] Fix group_by_sundy_li performance test --- tests/performance/group_by_sundy_li.xml | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/performance/group_by_sundy_li.xml b/tests/performance/group_by_sundy_li.xml index f1187e48528..07c5aa5cd7f 100644 --- a/tests/performance/group_by_sundy_li.xml +++ b/tests/performance/group_by_sundy_li.xml @@ -1,6 +1,7 @@ 8 + 0 @@ -15,10 +16,10 @@ ORDER BY (d, n) - insert into a select '2000-01-01', ['aa','bb','cc','dd'][number % 4 + 1], number from numbers_mt(100000000) - insert into a select '2000-01-02', ['aa','bb','cc','dd'][number % 4 + 1], number from numbers_mt(100000000) - insert into a select '2000-01-03', ['aa','bb','cc','dd'][number % 4 + 1], number from numbers_mt(100000000) - insert into a select '2000-01-04', ['aa','bb','cc','dd'][number % 4 + 1], number from numbers_mt(100000000) + insert into a select '2000-01-01', ['aa','bb','cc','dd'][number % 4 + 1], number from numbers_mt(100000000) + insert into a select '2000-01-02', ['aa','bb','cc','dd'][number % 4 + 1], number from numbers_mt(100000000) + insert into a select '2000-01-03', ['aa','bb','cc','dd'][number % 4 + 1], number from numbers_mt(100000000) + insert into a select '2000-01-04', ['aa','bb','cc','dd'][number % 4 + 1], number from numbers_mt(100000000) OPTIMIZE TABLE a FINAL From 04089be14427379dc3aa24abb16903981dbcff7a Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Tue, 12 Apr 2022 17:46:16 +0200 Subject: [PATCH 05/55] Fix hash_table_sizes_stats performance test --- tests/performance/hash_table_sizes_stats.xml | 52 +++++++++++--------- 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/tests/performance/hash_table_sizes_stats.xml b/tests/performance/hash_table_sizes_stats.xml index 5bdd83126b8..79b3523d5de 100644 --- a/tests/performance/hash_table_sizes_stats.xml +++ b/tests/performance/hash_table_sizes_stats.xml @@ -1,29 +1,37 @@ - - hits_10m_single - hits_100m_single - - 1000000000 - SELECT number FROM numbers(5000000) GROUP BY number FORMAT Null - SELECT number FROM numbers(10000000) GROUP BY number FORMAT Null - SELECT number FROM numbers_mt(500000) GROUP BY number FORMAT Null - SELECT number FROM numbers_mt(1000000) GROUP BY number FORMAT Null - SELECT number FROM numbers_mt(10000000) GROUP BY number FORMAT Null - SELECT number FROM numbers_mt(50000000) GROUP BY number FORMAT Null - WITH number % 524289 AS k, toUInt64(k) AS k1, k1 + 1 AS k2 SELECT k1, k2, count() FROM numbers(100000000) GROUP BY k1, k2 FORMAT Null - SELECT number FROM numbers_mt(10000000) GROUP BY number FORMAT Null SETTINGS group_by_two_level_threshold = 1e12, group_by_two_level_threshold_bytes = 1e12 - SELECT number FROM numbers_mt(50000000) GROUP BY number FORMAT Null SETTINGS group_by_two_level_threshold = 1e12, group_by_two_level_threshold_bytes = 1e12 + + hits_100m_single + - SELECT WatchID FROM hits_10m_single GROUP BY WatchID FORMAT Null - SELECT WatchID FROM hits_100m_single GROUP BY WatchID FORMAT Null - SELECT ClientIP AS x, x - 1, x - 2, x - 3, count() AS c FROM hits_10m_single GROUP BY x, x - 1, x - 2, x - 3 ORDER BY c DESC LIMIT 10 - SELECT ClientIP AS x, x - 1, x - 2, x - 3, count() AS c FROM hits_100m_single GROUP BY x, x - 1, x - 2, x - 3 ORDER BY c DESC LIMIT 10 - SELECT WatchID, ClientIP, count() AS c, sum(Refresh), avg(ResolutionWidth) FROM hits_10m_single WHERE SearchPhrase != '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10 - SELECT WatchID, ClientIP, count() AS c, sum(Refresh), avg(ResolutionWidth) FROM hits_100m_single WHERE SearchPhrase != '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10 - SELECT min(MobilePhoneModel) FROM hits_10m_single WHERE MobilePhoneModel != '' GROUP BY intHash32(UserID) % 1000000 FORMAT Null - SELECT min(MobilePhoneModel) FROM hits_100m_single WHERE MobilePhoneModel != '' GROUP BY intHash32(UserID) % 1000000 FORMAT Null + + + table_name + + hits_100m_single + + + + + numbers_threshold_value + + 5000000 + 10000000 + + + + + SELECT number FROM numbers({numbers_threshold_value}) GROUP BY number FORMAT Null + SELECT number FROM numbers_mt({numbers_threshold_value}) GROUP BY number FORMAT Null + SELECT number FROM numbers_mt({numbers_threshold_value}) GROUP BY number FORMAT Null + WITH number % 524289 AS k, toUInt64(k) AS k1, k1 + 1 AS k2 SELECT k1, k2, count() FROM numbers({numbers_threshold_value}) GROUP BY k1, k2 FORMAT Null + SELECT number FROM numbers_mt({numbers_threshold_value}) GROUP BY number FORMAT Null SETTINGS group_by_two_level_threshold = 1e12, group_by_two_level_threshold_bytes = 1e12 + + SELECT WatchID FROM {table_name} GROUP BY WatchID FORMAT Null + SELECT ClientIP AS x, x - 1, x - 2, x - 3, count() AS c FROM {table_name} GROUP BY x, x - 1, x - 2, x - 3 ORDER BY c DESC LIMIT 10 + SELECT WatchID, ClientIP, count() AS c, sum(Refresh), avg(ResolutionWidth) FROM {table_name} WHERE SearchPhrase != '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10 + SELECT min(MobilePhoneModel) FROM {table_name} WHERE MobilePhoneModel != '' GROUP BY intHash32(UserID) % 1000000 FORMAT Null From aa0464d81efacca224ff13dd3171e63774483438 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Tue, 12 Apr 2022 18:50:51 +0200 Subject: [PATCH 06/55] Fixed tests --- tests/queries/0_stateless/02267_early_constant_folding.sql | 1 - .../00172_early_constant_folding.reference} | 0 tests/queries/1_stateful/00172_early_constant_folding.sql | 1 + 3 files changed, 1 insertion(+), 1 deletion(-) delete mode 100644 tests/queries/0_stateless/02267_early_constant_folding.sql rename tests/queries/{0_stateless/02267_early_constant_folding.reference => 1_stateful/00172_early_constant_folding.reference} (100%) create mode 100644 tests/queries/1_stateful/00172_early_constant_folding.sql diff --git a/tests/queries/0_stateless/02267_early_constant_folding.sql b/tests/queries/0_stateless/02267_early_constant_folding.sql deleted file mode 100644 index 2835a0a24ea..00000000000 --- a/tests/queries/0_stateless/02267_early_constant_folding.sql +++ /dev/null @@ -1 +0,0 @@ -EXPLAIN PIPELINE SELECT count(JavaEnable) FROM hits_100m_single WHERE WatchID = 1 OR Title = 'next' OR URL = 'prev' OR OriginalURL = '???' OR 1 diff --git a/tests/queries/0_stateless/02267_early_constant_folding.reference b/tests/queries/1_stateful/00172_early_constant_folding.reference similarity index 100% rename from tests/queries/0_stateless/02267_early_constant_folding.reference rename to tests/queries/1_stateful/00172_early_constant_folding.reference diff --git a/tests/queries/1_stateful/00172_early_constant_folding.sql b/tests/queries/1_stateful/00172_early_constant_folding.sql new file mode 100644 index 00000000000..480c42fc136 --- /dev/null +++ b/tests/queries/1_stateful/00172_early_constant_folding.sql @@ -0,0 +1 @@ +EXPLAIN PIPELINE SELECT count(JavaEnable) FROM test.hits WHERE WatchID = 1 OR Title = 'next' OR URL = 'prev' OR OriginalURL = '???' OR 1; From 2a950102bce237bf7c8c801766e761a2d9cc0d02 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Wed, 13 Apr 2022 16:41:21 +0200 Subject: [PATCH 07/55] Fixed tests --- tests/queries/1_stateful/00172_early_constant_folding.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/1_stateful/00172_early_constant_folding.sql b/tests/queries/1_stateful/00172_early_constant_folding.sql index 480c42fc136..cc3d2274ecd 100644 --- a/tests/queries/1_stateful/00172_early_constant_folding.sql +++ b/tests/queries/1_stateful/00172_early_constant_folding.sql @@ -1 +1 @@ -EXPLAIN PIPELINE SELECT count(JavaEnable) FROM test.hits WHERE WatchID = 1 OR Title = 'next' OR URL = 'prev' OR OriginalURL = '???' OR 1; +EXPLAIN PIPELINE SELECT count(JavaEnable) FROM test.hits WHERE WatchID = 1 OR Title = 'next' OR URL = 'prev' OR URL = '???' OR 1; From 3a715aba790b98cd4587465027239101ab218e16 Mon Sep 17 00:00:00 2001 From: Tyler Hannan Date: Wed, 13 Apr 2022 22:54:06 +0200 Subject: [PATCH 08/55] add corei9 benchmark --- website/benchmark/hardware/index.html | 1 + .../results/intel_core_i9_11900kf.json | 54 +++++++++++++++++++ 2 files changed, 55 insertions(+) create mode 100644 website/benchmark/hardware/results/intel_core_i9_11900kf.json diff --git a/website/benchmark/hardware/index.html b/website/benchmark/hardware/index.html index 42c87c334c0..476cd672b30 100644 --- a/website/benchmark/hardware/index.html +++ b/website/benchmark/hardware/index.html @@ -95,6 +95,7 @@ Results for AWS instance type im4gn.8xlarge are from Ananth Gundabattula Results for AWS instance type im4gn.16xlarge are from Ananth Gundabattula (Darwinium).
Results for AWS instance type i3.2xlarge are from Ananth Gundabattula (Darwinium).
Results for 2x EPYC 7702 on ZFS mirror NVME are from Alibek A.
+Results for Intel 11th Gen Core i9-11900KF are from Tim Xian.

diff --git a/website/benchmark/hardware/results/intel_core_i9_11900kf.json b/website/benchmark/hardware/results/intel_core_i9_11900kf.json new file mode 100644 index 00000000000..6bc774bb14a --- /dev/null +++ b/website/benchmark/hardware/results/intel_core_i9_11900kf.json @@ -0,0 +1,54 @@ +[ + { + "system": "Intel(R) 11th Gen Core i9-11900KF", + "system_full": "Intel(R) 11th Gen Core i9-11900KF, 64 GB RAM, 1 TB SSD", + "time": "2022-03-27 04:04:00", + "kind": "server", + "result": + [ + [0.001, 0.001, 0.001], + [0.010, 0.007, 0.006], + [0.030, 0.022, 0.020], + [0.100, 0.033, 0.035], + [0.114, 0.079, 0.074], + [0.241, 0.209, 0.225], + [0.002, 0.001, 0.001], + [0.008, 0.007, 0.007], + [0.565, 0.519, 0.511], + [0.629, 0.590, 0.599], + [0.159, 0.130, 0.134], + [0.190, 0.149, 0.150], + [0.976, 0.927, 0.915], + [1.273, 1.208, 1.199], + [1.086, 1.044, 1.041], + [1.229, 1.196, 1.200], + [3.206, 3.491, 3.213], + [1.841, 1.774, 1.809], + [5.919, 5.897, 5.799], + [0.104, 0.039, 0.037], + [1.176, 0.639, 0.694], + [1.407, 0.814, 0.825], + [2.984, 2.391, 2.121], + [2.100, 0.770, 0.716], + [0.342, 0.220, 0.211], + [0.222, 0.211, 0.189], + [0.346, 0.222, 0.224], + [1.272, 0.832, 0.822], + [1.507, 1.306, 1.282], + [3.619, 3.573, 3.597], + [0.761, 0.695, 0.703], + [1.375, 1.217, 1.229], + [8.576, 9.686, 9.070], + [5.634, 5.699, 5.801], + [6.090, 5.789, 5.797], + [1.996, 2.057, 1.946], + [0.119, 0.105, 0.112], + [0.049, 0.040, 0.040], + [0.048, 0.038, 0.038], + [0.261, 0.237, 0.231], + [0.029, 0.013, 0.014], + [0.017, 0.013, 0.011], + [0.003, 0.002, 0.003], + ] + } +] From 2fcc00380de969f6907f1396ca539d57c724ac8b Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 15 Apr 2022 14:00:52 +0200 Subject: [PATCH 09/55] Get rid of iheritance --- src/Disks/DiskWebServer.cpp | 6 +++--- src/Disks/IDiskRemote.cpp | 3 ++- src/Disks/IDiskRemote.h | 28 ++++++++++------------------ 3 files changed, 15 insertions(+), 22 deletions(-) diff --git a/src/Disks/DiskWebServer.cpp b/src/Disks/DiskWebServer.cpp index 2f8929982e3..f7b399c4c97 100644 --- a/src/Disks/DiskWebServer.cpp +++ b/src/Disks/DiskWebServer.cpp @@ -165,10 +165,10 @@ std::unique_ptr DiskWebServer::readFile(const String & p auto remote_path = fs_path.parent_path() / (escapeForFileName(fs_path.stem()) + fs_path.extension().string()); remote_path = remote_path.string().substr(url.size()); - RemoteMetadata meta(path, remote_path); - meta.remote_fs_objects.emplace_back(remote_path, iter->second.size); + std::vector blobs_to_read; + blobs_to_read.emplace_back(remote_path, iter->second.size); - auto web_impl = std::make_unique(url, meta.remote_fs_root_path, meta.remote_fs_objects, getContext(), read_settings); + auto web_impl = std::make_unique(url, path, blobs_to_read, getContext(), read_settings); if (read_settings.remote_fs_method == RemoteFSReadMethod::threadpool) { diff --git a/src/Disks/IDiskRemote.cpp b/src/Disks/IDiskRemote.cpp index ead951084ad..cf03ce3c27d 100644 --- a/src/Disks/IDiskRemote.cpp +++ b/src/Disks/IDiskRemote.cpp @@ -154,7 +154,8 @@ IDiskRemote::Metadata::Metadata( const String & remote_fs_root_path_, DiskPtr metadata_disk_, const String & metadata_file_path_) - : RemoteMetadata(remote_fs_root_path_, metadata_file_path_) + : remote_fs_root_path(remote_fs_root_path_) + , metadata_file_path(metadata_file_path_) , metadata_disk(metadata_disk_) , total_size(0), ref_count(0) { diff --git a/src/Disks/IDiskRemote.h b/src/Disks/IDiskRemote.h index ac2b1634d05..7ab11aa7aca 100644 --- a/src/Disks/IDiskRemote.h +++ b/src/Disks/IDiskRemote.h @@ -184,10 +184,18 @@ private: using RemoteDiskPtr = std::shared_ptr; +/// Remote FS (S3, HDFS) metadata file layout: +/// FS objects, their number and total size of all FS objects. +/// Each FS object represents a file path in remote FS and its size. -/// Minimum info, required to be passed to ReadIndirectBufferFromRemoteFS -struct RemoteMetadata +struct IDiskRemote::Metadata { + using Updater = std::function; + /// Metadata file version. + static constexpr UInt32 VERSION_ABSOLUTE_PATHS = 1; + static constexpr UInt32 VERSION_RELATIVE_PATHS = 2; + static constexpr UInt32 VERSION_READ_ONLY_FLAG = 3; + /// Remote FS objects paths and their sizes. std::vector remote_fs_objects; @@ -197,22 +205,6 @@ struct RemoteMetadata /// Relative path to metadata file on local FS. const String metadata_file_path; - RemoteMetadata(const String & remote_fs_root_path_, const String & metadata_file_path_) - : remote_fs_root_path(remote_fs_root_path_), metadata_file_path(metadata_file_path_) {} -}; - -/// Remote FS (S3, HDFS) metadata file layout: -/// FS objects, their number and total size of all FS objects. -/// Each FS object represents a file path in remote FS and its size. - -struct IDiskRemote::Metadata : RemoteMetadata -{ - using Updater = std::function; - /// Metadata file version. - static constexpr UInt32 VERSION_ABSOLUTE_PATHS = 1; - static constexpr UInt32 VERSION_RELATIVE_PATHS = 2; - static constexpr UInt32 VERSION_READ_ONLY_FLAG = 3; - DiskPtr metadata_disk; /// Total size of all remote FS (S3, HDFS) objects. From eb7593f78683e580ecb7c254e3324c6f37e572ff Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 15 Apr 2022 16:24:38 +0200 Subject: [PATCH 10/55] Remove more trash --- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 10 ---------- src/Storages/MergeTree/IMergeTreeDataPart.h | 3 --- src/Storages/MergeTree/MergeTreeData.cpp | 5 ----- src/Storages/MergeTree/MergedBlockOutputStream.cpp | 2 -- src/Storages/MergeTree/MutateTask.cpp | 1 - src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp | 11 ++++++----- src/Storages/StorageReplicatedMergeTree.cpp | 14 ++++++++++++-- 7 files changed, 18 insertions(+), 28 deletions(-) diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index d704d8fc435..43887f89394 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -1514,8 +1514,6 @@ try SyncGuardPtr sync_guard; if (storage.getSettings()->fsync_part_directory) sync_guard = volume->getDisk()->getDirectorySyncGuard(to); - - storage.lockSharedData(*this); } catch (...) { @@ -1530,14 +1528,6 @@ catch (...) throw; } -void IMergeTreeDataPart::cleanupOldName(const String & old_part_name) const -{ - if (name == old_part_name) - return; - - storage.unlockSharedData(*this, old_part_name); -} - std::optional IMergeTreeDataPart::keepSharedDataInDecoupledStorage() const { /// NOTE: It's needed for zero-copy replication diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index 19df88c5466..7307619a70e 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -354,9 +354,6 @@ public: /// Changes only relative_dir_name, you need to update other metadata (name, is_temp) explicitly virtual void renameTo(const String & new_relative_path, bool remove_new_dir_if_exists) const; - /// Cleanup shared locks made with old name after part renaming - virtual void cleanupOldName(const String & old_part_name) const; - /// Makes clone of a part in detached/ directory via hard links virtual void makeCloneInDetached(const String & prefix, const StorageMetadataPtr & metadata_snapshot) const; diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 29b3083c38f..89b57137979 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -2663,8 +2663,6 @@ bool MergeTreeData::renameTempPartAndReplace( MergeTreePartInfo part_info = part->info; String part_name; - String old_part_name = part->name; - if (DataPartPtr existing_part_in_partition = getAnyPartInPartition(part->info.partition_id, lock)) { if (part->partition.value != existing_part_in_partition->partition.value) @@ -2786,9 +2784,6 @@ bool MergeTreeData::renameTempPartAndReplace( out_covered_parts->emplace_back(std::move(covered_part)); } - /// Cleanup shared locks made with old name - part->cleanupOldName(old_part_name); - return true; } diff --git a/src/Storages/MergeTree/MergedBlockOutputStream.cpp b/src/Storages/MergeTree/MergedBlockOutputStream.cpp index 6acbfacd4c1..09711e512a5 100644 --- a/src/Storages/MergeTree/MergedBlockOutputStream.cpp +++ b/src/Storages/MergeTree/MergedBlockOutputStream.cpp @@ -100,8 +100,6 @@ void MergedBlockOutputStream::Finalizer::Impl::finish() if (sync) file->sync(); } - - part->storage.lockSharedData(*part); } MergedBlockOutputStream::Finalizer::~Finalizer() diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index b9bebc665b2..96b0aa7ef7d 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -481,7 +481,6 @@ void finalizeMutatedPart( MergeTreeData::DataPart::calculateTotalSizeOnDisk(new_data_part->volume->getDisk(), part_path)); new_data_part->default_codec = codec; new_data_part->calculateColumnsAndSecondaryIndicesSizesOnDisk(); - new_data_part->storage.lockSharedData(*new_data_part); } } diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp index 187e4eb96c5..96f829ac96c 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp @@ -314,8 +314,6 @@ void ReplicatedMergeTreeSink::commitPart( bool is_already_existing_part = false; - String old_part_name = part->name; - while (true) { /// Obtain incremental block number and lock it. The lock holds our intention to add the block to the filesystem. @@ -499,6 +497,8 @@ void ReplicatedMergeTreeSink::commitPart( part->name); } + storage.lockSharedData(*part, false); + Coordination::Responses responses; Coordination::Error multi_code = zookeeper->tryMultiNoThrow(ops, responses); /// 1 RTT @@ -553,11 +553,13 @@ void ReplicatedMergeTreeSink::commitPart( } else if (multi_code == Coordination::Error::ZNODEEXISTS && failed_op_path == quorum_info.status_path) { + storage.unlockSharedData(*part); transaction.rollback(); throw Exception("Another quorum insert has been already started", ErrorCodes::UNSATISFIED_QUORUM_FOR_PREVIOUS_WRITE); } else { + storage.unlockSharedData(*part); /// NOTE: We could be here if the node with the quorum existed, but was quickly removed. transaction.rollback(); throw Exception("Unexpected logical error while adding block " + toString(block_number) + " with ID '" + block_id + "': " @@ -567,12 +569,14 @@ void ReplicatedMergeTreeSink::commitPart( } else if (Coordination::isHardwareError(multi_code)) { + storage.unlockSharedData(*part); transaction.rollback(); throw Exception("Unrecoverable network error while adding block " + toString(block_number) + " with ID '" + block_id + "': " + Coordination::errorMessage(multi_code), ErrorCodes::UNEXPECTED_ZOOKEEPER_ERROR); } else { + storage.unlockSharedData(*part); transaction.rollback(); throw Exception("Unexpected ZooKeeper error while adding block " + toString(block_number) + " with ID '" + block_id + "': " + Coordination::errorMessage(multi_code), ErrorCodes::UNEXPECTED_ZOOKEEPER_ERROR); @@ -595,9 +599,6 @@ void ReplicatedMergeTreeSink::commitPart( waitForQuorum(zookeeper, part->name, quorum_info.status_path, quorum_info.is_active_node_value); } - - /// Cleanup shared locks made with old name - part->cleanupOldName(old_part_name); } void ReplicatedMergeTreeSink::onStart() diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index db1de14f6a6..67d4d05bb9c 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -1450,11 +1450,14 @@ MergeTreeData::DataPartsVector StorageReplicatedMergeTree::checkPartChecksumsAnd { auto zookeeper = getZooKeeper(); + while (true) { Coordination::Requests ops; NameSet absent_part_paths_on_replicas; + lockSharedData(*part, false); + /// Checksums are checked here and `ops` is filled. In fact, the part is added to ZK just below, when executing `multi`. checkPartChecksumsAndAddCommitOps(zookeeper, part, ops, part->name, &absent_part_paths_on_replicas); @@ -1493,7 +1496,10 @@ MergeTreeData::DataPartsVector StorageReplicatedMergeTree::checkPartChecksumsAnd LOG_INFO(log, "The part {} on a replica suddenly appeared, will recheck checksums", e.getPathForFirstFailedOp()); } else + { + unlockSharedData(*part); throw; + } } } } @@ -7296,7 +7302,9 @@ void StorageReplicatedMergeTree::createTableSharedID() void StorageReplicatedMergeTree::lockSharedDataTemporary(const String & part_name, const String & part_id, const DiskPtr & disk) const { - if (!disk || !disk->supportZeroCopyReplication()) + auto settings = getSettings(); + + if (!disk || !disk->supportZeroCopyReplication() || !settings->allow_remote_fs_zero_copy_replication) return; zkutil::ZooKeeperPtr zookeeper = tryGetZooKeeper(); @@ -7320,7 +7328,9 @@ void StorageReplicatedMergeTree::lockSharedDataTemporary(const String & part_nam void StorageReplicatedMergeTree::lockSharedData(const IMergeTreeDataPart & part, bool replace_existing_lock) const { - if (!part.volume || !part.isStoredOnDisk()) + auto settings = getSettings(); + + if (!part.volume || !part.isStoredOnDisk() || !settings->allow_remote_fs_zero_copy_replication) return; DiskPtr disk = part.volume->getDisk(); From 5a8419a48e1939e07a307cc40cd7a78a87363367 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 15 Apr 2022 17:05:17 +0200 Subject: [PATCH 11/55] Remove more trash --- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 24 ++++++++----------- src/Storages/MergeTree/IMergeTreeDataPart.h | 10 +++++++- src/Storages/MergeTree/MutateTask.cpp | 12 ++++++++++ src/Storages/StorageReplicatedMergeTree.cpp | 14 ++++------- src/Storages/StorageReplicatedMergeTree.h | 3 --- 5 files changed, 35 insertions(+), 28 deletions(-) diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 43887f89394..ba2463a18e6 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -504,10 +504,8 @@ void IMergeTreeDataPart::removeIfNeeded() if (parent_part) { - std::optional keep_shared_data = keepSharedDataInDecoupledStorage(); - if (!keep_shared_data.has_value()) - return; - projectionRemove(parent_part->getFullRelativePath(), *keep_shared_data); + bool keep_shared_data = keepSharedDataInDecoupledStorage(); + projectionRemove(parent_part->getFullRelativePath(), keep_shared_data); } else remove(); @@ -1528,7 +1526,7 @@ catch (...) throw; } -std::optional IMergeTreeDataPart::keepSharedDataInDecoupledStorage() const +bool IMergeTreeDataPart::keepSharedDataInDecoupledStorage() const { /// NOTE: It's needed for zero-copy replication if (force_keep_shared_data) @@ -1554,9 +1552,7 @@ void IMergeTreeDataPart::remove() const assert(assertHasValidVersionMetadata()); part_is_probably_removed_from_disk = true; - std::optional keep_shared_data = keepSharedDataInDecoupledStorage(); - if (!keep_shared_data.has_value()) - return; + bool keep_shared_data = keepSharedDataInDecoupledStorage(); if (!isStoredOnDisk()) return; @@ -1567,7 +1563,7 @@ void IMergeTreeDataPart::remove() const if (isProjectionPart()) { LOG_WARNING(storage.log, "Projection part {} should be removed by its parent {}.", name, parent_part->name); - projectionRemove(parent_part->getFullRelativePath(), *keep_shared_data); + projectionRemove(parent_part->getFullRelativePath(), keep_shared_data); return; } @@ -1599,7 +1595,7 @@ void IMergeTreeDataPart::remove() const LOG_WARNING(storage.log, "Directory {} (to which part must be renamed before removing) already exists. Most likely this is due to unclean restart or race condition. Removing it.", fullPath(disk, to)); try { - disk->removeSharedRecursive(fs::path(to) / "", *keep_shared_data); + disk->removeSharedRecursive(fs::path(to) / "", keep_shared_data); } catch (...) { @@ -1626,7 +1622,7 @@ void IMergeTreeDataPart::remove() const std::unordered_set projection_directories; for (const auto & [p_name, projection_part] : projection_parts) { - projection_part->projectionRemove(to, *keep_shared_data); + projection_part->projectionRemove(to, keep_shared_data); projection_directories.emplace(p_name + ".proj"); } @@ -1634,7 +1630,7 @@ void IMergeTreeDataPart::remove() const if (checksums.empty()) { /// If the part is not completely written, we cannot use fast path by listing files. - disk->removeSharedRecursive(fs::path(to) / "", *keep_shared_data); + disk->removeSharedRecursive(fs::path(to) / "", keep_shared_data); } else { @@ -1663,7 +1659,7 @@ void IMergeTreeDataPart::remove() const request.emplace_back(fs::path(to) / DELETE_ON_DESTROY_MARKER_FILE_NAME, true); request.emplace_back(fs::path(to) / TXN_VERSION_METADATA_FILE_NAME, true); - disk->removeSharedFiles(request, *keep_shared_data); + disk->removeSharedFiles(request, keep_shared_data); disk->removeDirectory(to); } catch (...) @@ -1672,7 +1668,7 @@ void IMergeTreeDataPart::remove() const LOG_ERROR(storage.log, "Cannot quickly remove directory {} by removing files; fallback to recursive removal. Reason: {}", fullPath(disk, to), getCurrentExceptionMessage(false)); - disk->removeSharedRecursive(fs::path(to) / "", *keep_shared_data); + disk->removeSharedRecursive(fs::path(to) / "", keep_shared_data); } } } diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index 7307619a70e..e9c0738e114 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -331,6 +331,14 @@ public: mutable VersionMetadata version; + struct HardlinkedFiles + { + std::string source_part_name; + std::vector hardlinks_from_source_part; + }; + + HardlinkedFiles hardlinked_files; + /// For data in RAM ('index') UInt64 getIndexSizeInBytes() const; UInt64 getIndexSizeInAllocatedBytes() const; @@ -512,7 +520,7 @@ protected: String getRelativePathForDetachedPart(const String & prefix) const; - std::optional keepSharedDataInDecoupledStorage() const; + bool keepSharedDataInDecoupledStorage() const; void initializePartMetadataManager(); diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index 96b0aa7ef7d..c5e5338d6fb 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -1070,6 +1070,7 @@ private: ctx->new_data_part->version.setCreationTID(tid, nullptr); ctx->new_data_part->storeVersionMetadata(); + std::vector hardlinked_files; /// Create hardlinks for unchanged files for (auto it = ctx->disk->iterateDirectory(ctx->source_part->getFullRelativePath()); it->isValid(); it->next()) { @@ -1083,10 +1084,12 @@ private: { return rename_pair.first == file_name; }); + if (rename_it != ctx->files_to_rename.end()) { if (rename_it->second.empty()) continue; + destination += rename_it->second; } else @@ -1094,8 +1097,13 @@ private: destination += it->name(); } + if (!ctx->disk->isDirectory(it->path())) + { ctx->disk->createHardLink(it->path(), destination); + hardlinked_files.push_back(it->name()); + } + else if (!endsWith(".tmp_proj", it->name())) // ignore projection tmp merge dir { // it's a projection part directory @@ -1104,10 +1112,14 @@ private: { String p_destination = fs::path(destination) / p_it->name(); ctx->disk->createHardLink(p_it->path(), p_destination); + hardlinked_files.push_back(p_it->name()); } } } + ctx->new_data_part->hardlinked_files.source_part_name = ctx->source_part->name; + ctx->new_data_part->hardlinked_files.hardlinks_from_source_part = hardlinked_files; + (*ctx->mutate_entry)->columns_written = ctx->storage_columns.size() - ctx->updated_header.columns(); ctx->new_data_part->checksums = ctx->source_part->checksums; diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 67d4d05bb9c..e97b3f7c6e4 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -7344,8 +7344,9 @@ void StorageReplicatedMergeTree::lockSharedData(const IMergeTreeDataPart & part, String id = part.getUniqueId(); boost::replace_all(id, "/", "_"); - Strings zc_zookeeper_paths = getZeroCopyPartPath(*getSettings(), disk->getType(), getTableSharedID(), - part.name, zookeeper_path); + Strings zc_zookeeper_paths = getZeroCopyPartPath( + *getSettings(), disk->getType(), getTableSharedID(), part.name, zookeeper_path); + for (const auto & zc_zookeeper_path : zc_zookeeper_paths) { String zookeeper_node = fs::path(zc_zookeeper_path) / id / replica_name; @@ -7356,14 +7357,7 @@ void StorageReplicatedMergeTree::lockSharedData(const IMergeTreeDataPart & part, } } - bool StorageReplicatedMergeTree::unlockSharedData(const IMergeTreeDataPart & part) const -{ - return unlockSharedData(part, part.name); -} - - -bool StorageReplicatedMergeTree::unlockSharedData(const IMergeTreeDataPart & part, const String & name) const { if (!part.volume || !part.isStoredOnDisk()) return true; @@ -7386,7 +7380,7 @@ bool StorageReplicatedMergeTree::unlockSharedData(const IMergeTreeDataPart & par return true; } - return unlockSharedDataByID(part.getUniqueId(), getTableSharedID(), name, replica_name, disk, getZooKeeper(), *getSettings(), log, + return unlockSharedDataByID(part.getUniqueId(), getTableSharedID(), part.name, replica_name, disk, getZooKeeper(), *getSettings(), log, zookeeper_path); } diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index 59fb3f124c7..de4bf698184 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -238,9 +238,6 @@ public: /// Return false if data is still used by another node bool unlockSharedData(const IMergeTreeDataPart & part) const override; - /// Remove lock with old name for shared data part after rename - bool unlockSharedData(const IMergeTreeDataPart & part, const String & name) const override; - /// Unlock shared data part in zookeeper by part id /// Return true if data unlocked /// Return false if data is still used by another node From 1706ae9e1597aab77c59d65015cf8d96ae66f506 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 15 Apr 2022 18:36:23 +0200 Subject: [PATCH 12/55] Some trash implementation --- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 24 ++++---- src/Storages/MergeTree/IMergeTreeDataPart.h | 2 +- src/Storages/MergeTree/MergeTreeData.cpp | 29 ++++++++- src/Storages/MergeTree/MergeTreeData.h | 5 +- src/Storages/StorageReplicatedMergeTree.cpp | 59 +++++++++++++++---- src/Storages/StorageReplicatedMergeTree.h | 6 +- 6 files changed, 89 insertions(+), 36 deletions(-) diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index ba2463a18e6..6e48d9c0277 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -504,8 +504,8 @@ void IMergeTreeDataPart::removeIfNeeded() if (parent_part) { - bool keep_shared_data = keepSharedDataInDecoupledStorage(); - projectionRemove(parent_part->getFullRelativePath(), keep_shared_data); + auto [can_remove, _] = canRemovePart(); + projectionRemove(parent_part->getFullRelativePath(), !can_remove); } else remove(); @@ -1526,13 +1526,13 @@ catch (...) throw; } -bool IMergeTreeDataPart::keepSharedDataInDecoupledStorage() const +std::pair> IMergeTreeDataPart::canRemovePart() const { /// NOTE: It's needed for zero-copy replication if (force_keep_shared_data) - return true; + return std::make_pair(false, std::vector{}); - return !storage.unlockSharedData(*this); + return storage.unlockSharedData(*this); } void IMergeTreeDataPart::initializePartMetadataManager() @@ -1552,7 +1552,7 @@ void IMergeTreeDataPart::remove() const assert(assertHasValidVersionMetadata()); part_is_probably_removed_from_disk = true; - bool keep_shared_data = keepSharedDataInDecoupledStorage(); + auto [can_remove, files_not_to_remove] = canRemovePart(); if (!isStoredOnDisk()) return; @@ -1563,7 +1563,7 @@ void IMergeTreeDataPart::remove() const if (isProjectionPart()) { LOG_WARNING(storage.log, "Projection part {} should be removed by its parent {}.", name, parent_part->name); - projectionRemove(parent_part->getFullRelativePath(), keep_shared_data); + projectionRemove(parent_part->getFullRelativePath(), !can_remove); return; } @@ -1595,7 +1595,7 @@ void IMergeTreeDataPart::remove() const LOG_WARNING(storage.log, "Directory {} (to which part must be renamed before removing) already exists. Most likely this is due to unclean restart or race condition. Removing it.", fullPath(disk, to)); try { - disk->removeSharedRecursive(fs::path(to) / "", keep_shared_data); + disk->removeSharedRecursive(fs::path(to) / "", !can_remove); } catch (...) { @@ -1622,7 +1622,7 @@ void IMergeTreeDataPart::remove() const std::unordered_set projection_directories; for (const auto & [p_name, projection_part] : projection_parts) { - projection_part->projectionRemove(to, keep_shared_data); + projection_part->projectionRemove(to, !can_remove); projection_directories.emplace(p_name + ".proj"); } @@ -1630,7 +1630,7 @@ void IMergeTreeDataPart::remove() const if (checksums.empty()) { /// If the part is not completely written, we cannot use fast path by listing files. - disk->removeSharedRecursive(fs::path(to) / "", keep_shared_data); + disk->removeSharedRecursive(fs::path(to) / "", !can_remove); } else { @@ -1659,7 +1659,7 @@ void IMergeTreeDataPart::remove() const request.emplace_back(fs::path(to) / DELETE_ON_DESTROY_MARKER_FILE_NAME, true); request.emplace_back(fs::path(to) / TXN_VERSION_METADATA_FILE_NAME, true); - disk->removeSharedFiles(request, keep_shared_data); + disk->removeSharedFiles(request, !can_remove); disk->removeDirectory(to); } catch (...) @@ -1668,7 +1668,7 @@ void IMergeTreeDataPart::remove() const LOG_ERROR(storage.log, "Cannot quickly remove directory {} by removing files; fallback to recursive removal. Reason: {}", fullPath(disk, to), getCurrentExceptionMessage(false)); - disk->removeSharedRecursive(fs::path(to) / "", keep_shared_data); + disk->removeSharedRecursive(fs::path(to) / "", !can_remove); } } } diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index e9c0738e114..3597120a05b 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -520,7 +520,7 @@ protected: String getRelativePathForDetachedPart(const String & prefix) const; - bool keepSharedDataInDecoupledStorage() const; + std::pair> canRemovePart() const; void initializePartMetadataManager(); diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 89b57137979..c46616dccb3 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -1575,6 +1575,7 @@ MergeTreeData::DataPartsVector MergeTreeData::grabOldParts(bool force) if (!lock.try_lock()) return res; + bool need_remove_parts_in_order = supportsReplication() && getSettings()->allow_remote_fs_zero_copy_replication; time_t now = time(nullptr); std::vector parts_to_delete; @@ -1588,13 +1589,22 @@ MergeTreeData::DataPartsVector MergeTreeData::grabOldParts(bool force) /// Do not remove outdated part if it may be visible for some transaction if (!part->version.canBeRemoved()) + { + if (need_remove_parts_in_order) + break; continue; + } auto part_remove_time = part->remove_time.load(std::memory_order_relaxed); /// Grab only parts that are not used by anyone (SELECTs for example). if (!part.unique()) + { + if (need_remove_parts_in_order) + break; + continue; + } if ((part_remove_time < now && now - part_remove_time > getSettings()->old_parts_lifetime.totalSeconds()) || force @@ -1603,6 +1613,8 @@ MergeTreeData::DataPartsVector MergeTreeData::grabOldParts(bool force) { parts_to_delete.emplace_back(it); } + else if (need_remove_parts_in_order) + break; } res.reserve(parts_to_delete.size()); @@ -3328,7 +3340,6 @@ void MergeTreeData::swapActivePart(MergeTreeData::DataPartPtr part_copy) /// We do not check allow_remote_fs_zero_copy_replication here because data may be shared /// when allow_remote_fs_zero_copy_replication turned on and off again - original_active_part->force_keep_shared_data = false; if (original_active_part->volume->getDisk()->supportZeroCopyReplication() && @@ -5724,6 +5735,10 @@ MergeTreeData::MutableDataPartPtr MergeTreeData::cloneAndLoadDataPartOnSameDisk( if (disk->exists(dst_part_path)) throw Exception("Part in " + fullPath(disk, dst_part_path) + " already exists", ErrorCodes::DIRECTORY_ALREADY_EXISTS); + auto single_disk_volume = std::make_shared(disk->getName(), disk, 0); + auto dst_data_part = createPart(dst_part_name, dst_part_info, single_disk_volume, tmp_dst_part_name); + + /// If source part is in memory, flush it to disk and clone it already in on-disk format if (auto src_part_in_memory = asInMemoryPart(src_part)) { @@ -5732,14 +5747,22 @@ MergeTreeData::MutableDataPartPtr MergeTreeData::cloneAndLoadDataPartOnSameDisk( src_part_in_memory->flushToDisk(src_relative_data_path, flushed_part_path, metadata_snapshot); src_part_path = fs::path(src_relative_data_path) / flushed_part_path / ""; } + else + { + dst_data_part->hardlinked_files.source_part_name = src_part->name; + + for (auto it = disk->iterateDirectory(src_part_path); it->isValid(); it->next()) + { + if (it->name() != IMergeTreeDataPart::DELETE_ON_DESTROY_MARKER_FILE_NAME && it->name() != IMergeTreeDataPart::TXN_VERSION_METADATA_FILE_NAME) + dst_data_part->hardlinked_files.hardlinks_from_source_part.push_back(it->name()); + } + } LOG_DEBUG(log, "Cloning part {} to {}", fullPath(disk, src_part_path), fullPath(disk, dst_part_path)); localBackup(disk, src_part_path, dst_part_path, /* make_source_readonly */ false); disk->removeFileIfExists(fs::path(dst_part_path) / IMergeTreeDataPart::DELETE_ON_DESTROY_MARKER_FILE_NAME); disk->removeFileIfExists(fs::path(dst_part_path) / IMergeTreeDataPart::TXN_VERSION_METADATA_FILE_NAME); - auto single_disk_volume = std::make_shared(disk->getName(), disk, 0); - auto dst_data_part = createPart(dst_part_name, dst_part_info, single_disk_volume, tmp_dst_part_name); /// We should write version metadata on part creation to distinguish it from parts that were created without transaction. TransactionID tid = txn ? txn->tid : Tx::PrehistoricTID; diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index 1601a6714d3..25e0ee3e7c6 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -943,10 +943,7 @@ public: /// Unlock shared data part in zookeeper /// Overridden in StorageReplicatedMergeTree - virtual bool unlockSharedData(const IMergeTreeDataPart &) const { return true; } - - /// Remove lock with old name for shared data part after rename - virtual bool unlockSharedData(const IMergeTreeDataPart &, const String &) const { return true; } + virtual std::pair> unlockSharedData(const IMergeTreeDataPart &) const { return std::make_pair(true, std::vector{}); } /// Fetch part only if some replica has it on shared storage like S3 /// Overridden in StorageReplicatedMergeTree diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index e97b3f7c6e4..96ff6a41d65 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -75,6 +75,7 @@ #include #include +#include #include #include @@ -7345,7 +7346,16 @@ void StorageReplicatedMergeTree::lockSharedData(const IMergeTreeDataPart & part, boost::replace_all(id, "/", "_"); Strings zc_zookeeper_paths = getZeroCopyPartPath( - *getSettings(), disk->getType(), getTableSharedID(), part.name, zookeeper_path); + *getSettings(), disk->getType(), getTableSharedID(), + part.name, zookeeper_path); + + String path_to_set; + if (!part.hardlinked_files.hardlinks_from_source_part.empty()) + { + path_to_set = getZeroCopyPartPath( + *getSettings(), disk->getType(), getTableSharedID(), + part.hardlinked_files.source_part_name, zookeeper_path)[0]; + } for (const auto & zc_zookeeper_path : zc_zookeeper_paths) { @@ -7353,18 +7363,20 @@ void StorageReplicatedMergeTree::lockSharedData(const IMergeTreeDataPart & part, LOG_TRACE(log, "Set zookeeper persistent lock {}", zookeeper_node); - createZeroCopyLockNode(zookeeper, zookeeper_node, zkutil::CreateMode::Persistent, replace_existing_lock); + createZeroCopyLockNode( + zookeeper, zookeeper_node, zkutil::CreateMode::Persistent, + replace_existing_lock, path_to_set, part.hardlinked_files.hardlinks_from_source_part); } } -bool StorageReplicatedMergeTree::unlockSharedData(const IMergeTreeDataPart & part) const +std::pair> StorageReplicatedMergeTree::unlockSharedData(const IMergeTreeDataPart & part) const { if (!part.volume || !part.isStoredOnDisk()) - return true; + return std::make_pair(true, std::vector{}); DiskPtr disk = part.volume->getDisk(); if (!disk || !disk->supportZeroCopyReplication()) - return true; + return std::make_pair(true, std::vector{}); /// If part is temporary refcount file may be absent auto ref_count_path = fs::path(part.getFullRelativePath()) / IMergeTreeDataPart::FILE_FOR_REFERENCES_CHECK; @@ -7372,20 +7384,20 @@ bool StorageReplicatedMergeTree::unlockSharedData(const IMergeTreeDataPart & par { auto ref_count = disk->getRefCount(ref_count_path); if (ref_count > 0) /// Keep part shard info for frozen backups - return false; + return std::make_pair(false, std::vector{}); } else { /// Temporary part with some absent file cannot be locked in shared mode - return true; + return std::make_pair(true, std::vector{}); } return unlockSharedDataByID(part.getUniqueId(), getTableSharedID(), part.name, replica_name, disk, getZooKeeper(), *getSettings(), log, zookeeper_path); } - -bool StorageReplicatedMergeTree::unlockSharedDataByID(String part_id, const String & table_uuid, const String & part_name, +std::pair> StorageReplicatedMergeTree::unlockSharedDataByID( + String part_id, const String & table_uuid, const String & part_name, const String & replica_name_, DiskPtr disk, zkutil::ZooKeeperPtr zookeeper_ptr, const MergeTreeSettings & settings, Poco::Logger * logger, const String & zookeeper_path_old) { @@ -7394,9 +7406,13 @@ bool StorageReplicatedMergeTree::unlockSharedDataByID(String part_id, const Stri Strings zc_zookeeper_paths = getZeroCopyPartPath(settings, disk->getType(), table_uuid, part_name, zookeeper_path_old); bool part_has_no_more_locks = true; + std::vector files_not_to_remove; for (const auto & zc_zookeeper_path : zc_zookeeper_paths) { + files_not_to_remove.clear(); + auto content = zookeeper_ptr->get(zc_zookeeper_path); + boost::split(files_not_to_remove, content, boost::is_any_of("\n ")); String zookeeper_part_uniq_node = fs::path(zc_zookeeper_path) / part_id; /// Delete our replica node for part from zookeeper (we are not interested in it anymore) @@ -7442,7 +7458,7 @@ bool StorageReplicatedMergeTree::unlockSharedDataByID(String part_id, const Stri } } - return part_has_no_more_locks; + return std::make_pair(part_has_no_more_locks, files_not_to_remove); } @@ -7832,7 +7848,7 @@ bool StorageReplicatedMergeTree::createEmptyPartInsteadOfLost(zkutil::ZooKeeperP } -void StorageReplicatedMergeTree::createZeroCopyLockNode(const zkutil::ZooKeeperPtr & zookeeper, const String & zookeeper_node, int32_t mode, bool replace_existing_lock) +void StorageReplicatedMergeTree::createZeroCopyLockNode(const zkutil::ZooKeeperPtr & zookeeper, const String & zookeeper_node, int32_t mode, bool replace_existing_lock, const std::string & path_to_set, const std::vector & hardlinked_files) { /// In rare case other replica can remove path between createAncestors and createIfNotExists /// So we make up to 5 attempts @@ -7852,6 +7868,11 @@ void StorageReplicatedMergeTree::createZeroCopyLockNode(const zkutil::ZooKeeperP Coordination::Requests ops; ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_node, -1)); ops.emplace_back(zkutil::makeCreateRequest(zookeeper_node, "", mode)); + if (!path_to_set.empty() && !hardlinked_files.empty()) + { + std::string data = boost::algorithm::join(hardlinked_files, "\n"); + ops.emplace_back(zkutil::makeSetRequest(path_to_set, data, -1)); + } Coordination::Responses responses; auto error = zookeeper->tryMulti(ops, responses); if (error == Coordination::Error::ZOK) @@ -7859,7 +7880,16 @@ void StorageReplicatedMergeTree::createZeroCopyLockNode(const zkutil::ZooKeeperP } else { - auto error = zookeeper->tryCreate(zookeeper_node, "", mode); + Coordination::Requests ops; + if (!path_to_set.empty() && !hardlinked_files.empty()) + { + std::string data = boost::algorithm::join(hardlinked_files, "\n"); + ops.emplace_back(zkutil::makeSetRequest(path_to_set, data, -1)); + } + ops.emplace_back(zkutil::makeCreateRequest(zookeeper_node, "", mode)); + + Coordination::Responses responses; + auto error = zookeeper->tryMulti(ops, responses); if (error == Coordination::Error::ZOK || error == Coordination::Error::ZNODEEXISTS) break; } @@ -8002,9 +8032,12 @@ bool StorageReplicatedMergeTree::removeSharedDetachedPart(DiskPtr disk, const St if (disk->getRefCount(checksums) == 0) { String id = disk->getUniqueId(checksums); - keep_shared = !StorageReplicatedMergeTree::unlockSharedDataByID(id, table_uuid, part_name, + bool can_remove = false; + std::tie(can_remove, std::ignore) = StorageReplicatedMergeTree::unlockSharedDataByID(id, table_uuid, part_name, detached_replica_name, disk, zookeeper, getContext()->getReplicatedMergeTreeSettings(), log, detached_zookeeper_path); + + keep_shared = !can_remove; } else keep_shared = true; diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index de4bf698184..2fb2494ca64 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -236,12 +236,12 @@ public: /// Unlock shared data part in zookeeper /// Return true if data unlocked /// Return false if data is still used by another node - bool unlockSharedData(const IMergeTreeDataPart & part) const override; + std::pair> unlockSharedData(const IMergeTreeDataPart & part) const override; /// Unlock shared data part in zookeeper by part id /// Return true if data unlocked /// Return false if data is still used by another node - static bool unlockSharedDataByID(String part_id, const String & table_uuid, const String & part_name, const String & replica_name_, + static std::pair> unlockSharedDataByID(String part_id, const String & table_uuid, const String & part_name, const String & replica_name_, DiskPtr disk, zkutil::ZooKeeperPtr zookeeper_, const MergeTreeSettings & settings, Poco::Logger * logger, const String & zookeeper_path_old); @@ -756,7 +756,7 @@ private: static Strings getZeroCopyPartPath(const MergeTreeSettings & settings, DiskType disk_type, const String & table_uuid, const String & part_name, const String & zookeeper_path_old); - static void createZeroCopyLockNode(const zkutil::ZooKeeperPtr & zookeeper, const String & zookeeper_node, int32_t mode = zkutil::CreateMode::Persistent, bool replace_existing_lock = false); + static void createZeroCopyLockNode(const zkutil::ZooKeeperPtr & zookeeper, const String & zookeeper_node, int32_t mode = zkutil::CreateMode::Persistent, bool replace_existing_lock = false, const std::string & path_to_set = "", const std::vector & hardlinked_files = {}); bool removeDetachedPart(DiskPtr disk, const String & path, const String & part_name, bool is_freezed) override; From a97754f462c362c32f7375e6541bab333dea5aa7 Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 18 Apr 2022 13:39:09 +0200 Subject: [PATCH 13/55] Fix --- src/Storages/MergeTree/MergeTreeData.cpp | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index c46616dccb3..de6658e536c 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -5735,10 +5735,8 @@ MergeTreeData::MutableDataPartPtr MergeTreeData::cloneAndLoadDataPartOnSameDisk( if (disk->exists(dst_part_path)) throw Exception("Part in " + fullPath(disk, dst_part_path) + " already exists", ErrorCodes::DIRECTORY_ALREADY_EXISTS); - auto single_disk_volume = std::make_shared(disk->getName(), disk, 0); - auto dst_data_part = createPart(dst_part_name, dst_part_info, single_disk_volume, tmp_dst_part_name); - + bool src_memory_part = false; /// If source part is in memory, flush it to disk and clone it already in on-disk format if (auto src_part_in_memory = asInMemoryPart(src_part)) { @@ -5746,8 +5744,18 @@ MergeTreeData::MutableDataPartPtr MergeTreeData::cloneAndLoadDataPartOnSameDisk( auto flushed_part_path = src_part_in_memory->getRelativePathForPrefix(tmp_part_prefix); src_part_in_memory->flushToDisk(src_relative_data_path, flushed_part_path, metadata_snapshot); src_part_path = fs::path(src_relative_data_path) / flushed_part_path / ""; + src_memory_part = true; } - else + + LOG_DEBUG(log, "Cloning part {} to {}", fullPath(disk, src_part_path), fullPath(disk, dst_part_path)); + localBackup(disk, src_part_path, dst_part_path, /* make_source_readonly */ false); + disk->removeFileIfExists(fs::path(dst_part_path) / IMergeTreeDataPart::DELETE_ON_DESTROY_MARKER_FILE_NAME); + disk->removeFileIfExists(fs::path(dst_part_path) / IMergeTreeDataPart::TXN_VERSION_METADATA_FILE_NAME); + + auto single_disk_volume = std::make_shared(disk->getName(), disk, 0); + auto dst_data_part = createPart(dst_part_name, dst_part_info, single_disk_volume, tmp_dst_part_name); + + if (!src_memory_part && !asInMemoryPart(dst_data_part)) { dst_data_part->hardlinked_files.source_part_name = src_part->name; @@ -5758,12 +5766,6 @@ MergeTreeData::MutableDataPartPtr MergeTreeData::cloneAndLoadDataPartOnSameDisk( } } - LOG_DEBUG(log, "Cloning part {} to {}", fullPath(disk, src_part_path), fullPath(disk, dst_part_path)); - localBackup(disk, src_part_path, dst_part_path, /* make_source_readonly */ false); - disk->removeFileIfExists(fs::path(dst_part_path) / IMergeTreeDataPart::DELETE_ON_DESTROY_MARKER_FILE_NAME); - disk->removeFileIfExists(fs::path(dst_part_path) / IMergeTreeDataPart::TXN_VERSION_METADATA_FILE_NAME); - - /// We should write version metadata on part creation to distinguish it from parts that were created without transaction. TransactionID tid = txn ? txn->tid : Tx::PrehistoricTID; dst_data_part->version.setCreationTID(tid, nullptr); From bd7b3847c1ffe46b9bd43b3aadce1f3da273a8d8 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 19 Apr 2022 01:09:09 +0200 Subject: [PATCH 14/55] Some code --- src/Disks/DiskCacheWrapper.cpp | 15 +++--- src/Disks/DiskCacheWrapper.h | 5 +- src/Disks/DiskDecorator.cpp | 8 ++-- src/Disks/DiskDecorator.h | 4 +- src/Disks/DiskEncrypted.h | 17 ++++++- src/Disks/DiskRestartProxy.cpp | 8 ++-- src/Disks/DiskRestartProxy.h | 4 +- src/Disks/DiskWebServer.h | 2 +- src/Disks/IDisk.h | 9 ++-- src/Disks/IDiskRemote.cpp | 48 ++++++++++++------- src/Disks/IDiskRemote.h | 11 +++-- src/Disks/S3/DiskS3.cpp | 2 +- src/Storages/MergeTree/DataPartsExchange.cpp | 2 +- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 20 ++++---- src/Storages/MergeTree/IMergeTreeDataPart.h | 4 +- src/Storages/MergeTree/MergeTreeData.cpp | 2 +- src/Storages/MergeTree/MergeTreeData.h | 2 +- src/Storages/MergeTree/MutateTask.cpp | 6 +-- src/Storages/StorageReplicatedMergeTree.cpp | 26 +++++----- src/Storages/StorageReplicatedMergeTree.h | 6 +-- .../0_stateless/02067_lost_part_s3.reference | 3 ++ .../0_stateless/02067_lost_part_s3.sql | 26 ++++++++++ 22 files changed, 148 insertions(+), 82 deletions(-) create mode 100644 tests/queries/0_stateless/02067_lost_part_s3.reference create mode 100644 tests/queries/0_stateless/02067_lost_part_s3.sql diff --git a/src/Disks/DiskCacheWrapper.cpp b/src/Disks/DiskCacheWrapper.cpp index cc2c330975a..8e355f70432 100644 --- a/src/Disks/DiskCacheWrapper.cpp +++ b/src/Disks/DiskCacheWrapper.cpp @@ -309,23 +309,26 @@ void DiskCacheWrapper::removeSharedFile(const String & path, bool keep_s3) DiskDecorator::removeSharedFile(path, keep_s3); } -void DiskCacheWrapper::removeSharedRecursive(const String & path, bool keep_s3) +void DiskCacheWrapper::removeSharedRecursive(const String & path, bool keep_all, const NameSet & files_to_keep) { if (cache_disk->exists(path)) - cache_disk->removeSharedRecursive(path, keep_s3); - DiskDecorator::removeSharedRecursive(path, keep_s3); + cache_disk->removeSharedRecursive(path, keep_all, files_to_keep); + DiskDecorator::removeSharedRecursive(path, keep_all, files_to_keep); } -void DiskCacheWrapper::removeSharedFiles(const RemoveBatchRequest & files, bool keep_s3) +void DiskCacheWrapper::removeSharedFiles(const RemoveBatchRequest & files, bool keep_all, const NameSet & files_to_keep) { for (const auto & file : files) { if (cache_disk->exists(file.path)) - cache_disk->removeSharedFile(file.path, keep_s3); + { + bool keep_file = keep_all || files_to_keep.contains(fs::path(file.path).filename()); + cache_disk->removeSharedFile(file.path, keep_file); + } } - DiskDecorator::removeSharedFiles(files, keep_s3); + DiskDecorator::removeSharedFiles(files, keep_all, files_to_keep); } void DiskCacheWrapper::createHardLink(const String & src_path, const String & dst_path) diff --git a/src/Disks/DiskCacheWrapper.h b/src/Disks/DiskCacheWrapper.h index e413a3742f3..0e0b2e25f2c 100644 --- a/src/Disks/DiskCacheWrapper.h +++ b/src/Disks/DiskCacheWrapper.h @@ -47,8 +47,9 @@ public: void removeDirectory(const String & path) override; void removeRecursive(const String & path) override; void removeSharedFile(const String & path, bool keep_s3) override; - void removeSharedRecursive(const String & path, bool keep_s3) override; - void removeSharedFiles(const RemoveBatchRequest & files, bool keep_s3) override; + + void removeSharedRecursive(const String & path, bool keep_all, const NameSet & files_to_keep) override; + void removeSharedFiles(const RemoveBatchRequest & files, bool keep_all, const NameSet & files_to_keep) override; void createHardLink(const String & src_path, const String & dst_path) override; ReservationPtr reserve(UInt64 bytes) override; diff --git a/src/Disks/DiskDecorator.cpp b/src/Disks/DiskDecorator.cpp index 14f507af55d..44358847961 100644 --- a/src/Disks/DiskDecorator.cpp +++ b/src/Disks/DiskDecorator.cpp @@ -151,14 +151,14 @@ void DiskDecorator::removeSharedFile(const String & path, bool keep_s3) delegate->removeSharedFile(path, keep_s3); } -void DiskDecorator::removeSharedFiles(const RemoveBatchRequest & files, bool keep_in_remote_fs) +void DiskDecorator::removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only) { - delegate->removeSharedFiles(files, keep_in_remote_fs); + delegate->removeSharedFiles(files, keep_all_batch_metadata, file_names_remove_metadata_only); } -void DiskDecorator::removeSharedRecursive(const String & path, bool keep_s3) +void DiskDecorator::removeSharedRecursive(const String & path, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only) { - delegate->removeSharedRecursive(path, keep_s3); + delegate->removeSharedRecursive(path, keep_all_batch_metadata, file_names_remove_metadata_only); } void DiskDecorator::setLastModified(const String & path, const Poco::Timestamp & timestamp) diff --git a/src/Disks/DiskDecorator.h b/src/Disks/DiskDecorator.h index e5c9c7699bf..6c70e9a3571 100644 --- a/src/Disks/DiskDecorator.h +++ b/src/Disks/DiskDecorator.h @@ -52,8 +52,8 @@ public: void removeDirectory(const String & path) override; void removeRecursive(const String & path) override; void removeSharedFile(const String & path, bool keep_s3) override; - void removeSharedRecursive(const String & path, bool keep_s3) override; - void removeSharedFiles(const RemoveBatchRequest & files, bool keep_in_remote_fs) override; + void removeSharedRecursive(const String & path, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only) override; + void removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only) override; void setLastModified(const String & path, const Poco::Timestamp & timestamp) override; Poco::Timestamp getLastModified(const String & path) override; void setReadOnly(const String & path) override; diff --git a/src/Disks/DiskEncrypted.h b/src/Disks/DiskEncrypted.h index 07a2ad81010..28156551e85 100644 --- a/src/Disks/DiskEncrypted.h +++ b/src/Disks/DiskEncrypted.h @@ -159,10 +159,23 @@ public: delegate->removeSharedFile(wrapped_path, flag); } - void removeSharedRecursive(const String & path, bool flag) override + void removeSharedRecursive(const String & path, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only) override { auto wrapped_path = wrappedPath(path); - delegate->removeSharedRecursive(wrapped_path, flag); + delegate->removeSharedRecursive(wrapped_path, keep_all_batch_metadata, file_names_remove_metadata_only); + } + + void removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only) override + { + for (const auto & file : files) + { + auto wrapped_path = wrappedPath(file.path); + bool keep = keep_all_batch_metadata || file_names_remove_metadata_only.contains(fs::path(file.path).filename()); + if (file.if_exists) + delegate->removeSharedFileIfExists(wrapped_path, keep); + else + delegate->removeSharedFile(wrapped_path, keep); + } } void removeSharedFileIfExists(const String & path, bool flag) override diff --git a/src/Disks/DiskRestartProxy.cpp b/src/Disks/DiskRestartProxy.cpp index 8045a0e8c72..ed0bc74b1a8 100644 --- a/src/Disks/DiskRestartProxy.cpp +++ b/src/Disks/DiskRestartProxy.cpp @@ -251,16 +251,16 @@ void DiskRestartProxy::removeSharedFile(const String & path, bool keep_s3) DiskDecorator::removeSharedFile(path, keep_s3); } -void DiskRestartProxy::removeSharedFiles(const RemoveBatchRequest & files, bool keep_in_remote_fs) +void DiskRestartProxy::removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only) { ReadLock lock (mutex); - DiskDecorator::removeSharedFiles(files, keep_in_remote_fs); + DiskDecorator::removeSharedFiles(files, keep_all_batch_metadata, file_names_remove_metadata_only); } -void DiskRestartProxy::removeSharedRecursive(const String & path, bool keep_s3) +void DiskRestartProxy::removeSharedRecursive(const String & path, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only) { ReadLock lock (mutex); - DiskDecorator::removeSharedRecursive(path, keep_s3); + DiskDecorator::removeSharedRecursive(path, keep_all_batch_metadata, file_names_remove_metadata_only); } void DiskRestartProxy::setLastModified(const String & path, const Poco::Timestamp & timestamp) diff --git a/src/Disks/DiskRestartProxy.h b/src/Disks/DiskRestartProxy.h index baa57386e68..03bdc047218 100644 --- a/src/Disks/DiskRestartProxy.h +++ b/src/Disks/DiskRestartProxy.h @@ -54,8 +54,8 @@ public: void removeDirectory(const String & path) override; void removeRecursive(const String & path) override; void removeSharedFile(const String & path, bool keep_s3) override; - void removeSharedFiles(const RemoveBatchRequest & files, bool keep_in_remote_fs) override; - void removeSharedRecursive(const String & path, bool keep_s3) override; + void removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only) override; + void removeSharedRecursive(const String & path, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only) override; void setLastModified(const String & path, const Poco::Timestamp & timestamp) override; Poco::Timestamp getLastModified(const String & path) override; void setReadOnly(const String & path) override; diff --git a/src/Disks/DiskWebServer.h b/src/Disks/DiskWebServer.h index 98f92fe5986..dd699921f7c 100644 --- a/src/Disks/DiskWebServer.h +++ b/src/Disks/DiskWebServer.h @@ -139,7 +139,7 @@ public: throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Disk {} is read-only", getName()); } - void removeSharedRecursive(const String &, bool) override + void removeSharedRecursive(const String &, bool, const NameSet &) override { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Disk {} is read-only", getName()); } diff --git a/src/Disks/IDisk.h b/src/Disks/IDisk.h index 81cdf47e1fb..30b8ac2c44b 100644 --- a/src/Disks/IDisk.h +++ b/src/Disks/IDisk.h @@ -197,7 +197,7 @@ public: /// Remove file or directory with all children. Use with extra caution. Throws exception if file doesn't exists. /// Differs from removeRecursive for S3/HDFS disks /// Second bool param is a flag to remove (true) or keep (false) shared data on S3 - virtual void removeSharedRecursive(const String & path, bool) { removeRecursive(path); } + virtual void removeSharedRecursive(const String & path, bool, const NameSet &) { removeRecursive(path); } /// Remove file or directory if it exists. /// Differs from removeFileIfExists for S3/HDFS disks @@ -237,14 +237,15 @@ public: /// Batch request to remove multiple files. /// May be much faster for blob storage. - virtual void removeSharedFiles(const RemoveBatchRequest & files, bool keep_in_remote_fs) + virtual void removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only) { for (const auto & file : files) { + bool keep_file = keep_all_batch_metadata || file_names_remove_metadata_only.contains(fs::path(file.path).filename()); if (file.if_exists) - removeSharedFileIfExists(file.path, keep_in_remote_fs); + removeSharedFileIfExists(file.path, keep_file); else - removeSharedFile(file.path, keep_in_remote_fs); + removeSharedFile(file.path, keep_file); } } diff --git a/src/Disks/IDiskRemote.cpp b/src/Disks/IDiskRemote.cpp index cf03ce3c27d..a546ab56dbf 100644 --- a/src/Disks/IDiskRemote.cpp +++ b/src/Disks/IDiskRemote.cpp @@ -271,7 +271,7 @@ std::unordered_map IDiskRemote::getSerializedMetadata(const std: return metadatas; } -void IDiskRemote::removeMetadata(const String & path, std::vector & paths_to_remove) +void IDiskRemote::removeMetadata(const String & path, std::unordered_map> & paths_to_remove) { LOG_TRACE(log, "Remove file by path: {}", backQuote(metadata_disk->getPath() + path)); @@ -283,13 +283,13 @@ void IDiskRemote::removeMetadata(const String & path, std::vector & try { - auto metadata_updater = [&paths_to_remove, this] (Metadata & metadata) + auto metadata_updater = [&paths_to_remove, path, this] (Metadata & metadata) { if (metadata.ref_count == 0) { for (const auto & [remote_fs_object_path, _] : metadata.remote_fs_objects) { - paths_to_remove.push_back(remote_fs_root_path + remote_fs_object_path); + paths_to_remove[path].push_back(remote_fs_root_path + remote_fs_object_path); if (cache) { @@ -328,7 +328,7 @@ void IDiskRemote::removeMetadata(const String & path, std::vector & } -void IDiskRemote::removeMetadataRecursive(const String & path, std::vector & paths_to_remove) +void IDiskRemote::removeMetadataRecursive(const String & path, std::unordered_map> & paths_to_remove) { checkStackSize(); /// This is needed to prevent stack overflow in case of cyclic symlinks. @@ -483,27 +483,27 @@ void IDiskRemote::replaceFile(const String & from_path, const String & to_path) void IDiskRemote::removeSharedFile(const String & path, bool delete_metadata_only) { - std::vector paths_to_remove; + std::unordered_map> paths_to_remove; removeMetadata(path, paths_to_remove); if (!delete_metadata_only) - removeFromRemoteFS(paths_to_remove); + removeFromRemoteFS(paths_to_remove[path]); } void IDiskRemote::removeSharedFileIfExists(const String & path, bool delete_metadata_only) { - std::vector paths_to_remove; + std::unordered_map> paths_to_remove; if (metadata_disk->exists(path)) { removeMetadata(path, paths_to_remove); if (!delete_metadata_only) - removeFromRemoteFS(paths_to_remove); + removeFromRemoteFS(paths_to_remove[path]); } } -void IDiskRemote::removeSharedFiles(const RemoveBatchRequest & files, bool delete_metadata_only) +void IDiskRemote::removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only) { - std::vector paths_to_remove; + std::unordered_map> paths_to_remove; for (const auto & file : files) { bool skip = file.if_exists && !metadata_disk->exists(file.path); @@ -511,17 +511,33 @@ void IDiskRemote::removeSharedFiles(const RemoveBatchRequest & files, bool delet removeMetadata(file.path, paths_to_remove); } - if (!delete_metadata_only) - removeFromRemoteFS(paths_to_remove); + if (!keep_all_batch_metadata) + { + std::vector remove_from_remote; + for (auto && [path, remote_paths] : paths_to_remove) + { + if (!file_names_remove_metadata_only.contains(fs::path(path).filename())) + remove_from_remote.insert(remove_from_remote.end(), remote_paths.begin(), remote_paths.end()); + } + removeFromRemoteFS(remove_from_remote); + } } -void IDiskRemote::removeSharedRecursive(const String & path, bool delete_metadata_only) +void IDiskRemote::removeSharedRecursive(const String & path, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only) { - std::vector paths_to_remove; + std::unordered_map> paths_to_remove; removeMetadataRecursive(path, paths_to_remove); - if (!delete_metadata_only) - removeFromRemoteFS(paths_to_remove); + if (!keep_all_batch_metadata) + { + std::vector remove_from_remote; + for (auto && [local_path, remote_paths] : paths_to_remove) + { + if (!file_names_remove_metadata_only.contains(fs::path(local_path).filename())) + remove_from_remote.insert(remove_from_remote.end(), remote_paths.begin(), remote_paths.end()); + } + removeFromRemoteFS(remove_from_remote); + } } diff --git a/src/Disks/IDiskRemote.h b/src/Disks/IDiskRemote.h index 7ab11aa7aca..901403f4689 100644 --- a/src/Disks/IDiskRemote.h +++ b/src/Disks/IDiskRemote.h @@ -107,15 +107,16 @@ public: void removeFileIfExists(const String & path) override { removeSharedFileIfExists(path, false); } - void removeRecursive(const String & path) override { removeSharedRecursive(path, false); } + void removeRecursive(const String & path) override { removeSharedRecursive(path, false, {}); } + void removeSharedFile(const String & path, bool delete_metadata_only) override; void removeSharedFileIfExists(const String & path, bool delete_metadata_only) override; - void removeSharedFiles(const RemoveBatchRequest & files, bool delete_metadata_only) override; + void removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only) override; - void removeSharedRecursive(const String & path, bool delete_metadata_only) override; + void removeSharedRecursive(const String & path, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only) override; void listFiles(const String & path, std::vector & file_names) override; @@ -170,9 +171,9 @@ protected: FileCachePtr cache; private: - void removeMetadata(const String & path, std::vector & paths_to_remove); + void removeMetadata(const String & path, std::unordered_map> & paths_to_remove); - void removeMetadataRecursive(const String & path, std::vector & paths_to_remove); + void removeMetadataRecursive(const String & path, std::unordered_map> & paths_to_remove); bool tryReserve(UInt64 bytes); diff --git a/src/Disks/S3/DiskS3.cpp b/src/Disks/S3/DiskS3.cpp index 6ed53bcb9c2..524a31a3bbb 100644 --- a/src/Disks/S3/DiskS3.cpp +++ b/src/Disks/S3/DiskS3.cpp @@ -764,7 +764,7 @@ void DiskS3::restore() bool cleanup_s3 = information.source_bucket != bucket || information.source_path != remote_fs_root_path; for (const auto & root : data_roots) if (exists(root)) - removeSharedRecursive(root + '/', !cleanup_s3); + removeSharedRecursive(root + '/', !cleanup_s3, {}); restoreFiles(information); restoreFileOperations(information); diff --git a/src/Storages/MergeTree/DataPartsExchange.cpp b/src/Storages/MergeTree/DataPartsExchange.cpp index 0dcccc33266..a74e4505789 100644 --- a/src/Storages/MergeTree/DataPartsExchange.cpp +++ b/src/Storages/MergeTree/DataPartsExchange.cpp @@ -826,7 +826,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToDiskRemoteMeta( /// NOTE The is_cancelled flag also makes sense to check every time you read over the network, /// performing a poll with a not very large timeout. /// And now we check it only between read chunks (in the `copyData` function). - disk->removeSharedRecursive(part_download_path, true); + disk->removeSharedRecursive(part_download_path, true, {}); throw Exception("Fetching of part was cancelled", ErrorCodes::ABORTED); } diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 6e48d9c0277..13fd6d6d09e 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -1526,11 +1526,11 @@ catch (...) throw; } -std::pair> IMergeTreeDataPart::canRemovePart() const +std::pair IMergeTreeDataPart::canRemovePart() const { /// NOTE: It's needed for zero-copy replication if (force_keep_shared_data) - return std::make_pair(false, std::vector{}); + return std::make_pair(false, NameSet{}); return storage.unlockSharedData(*this); } @@ -1595,7 +1595,7 @@ void IMergeTreeDataPart::remove() const LOG_WARNING(storage.log, "Directory {} (to which part must be renamed before removing) already exists. Most likely this is due to unclean restart or race condition. Removing it.", fullPath(disk, to)); try { - disk->removeSharedRecursive(fs::path(to) / "", !can_remove); + disk->removeSharedRecursive(fs::path(to) / "", !can_remove, files_not_to_remove); } catch (...) { @@ -1630,7 +1630,7 @@ void IMergeTreeDataPart::remove() const if (checksums.empty()) { /// If the part is not completely written, we cannot use fast path by listing files. - disk->removeSharedRecursive(fs::path(to) / "", !can_remove); + disk->removeSharedRecursive(fs::path(to) / "", !can_remove, files_not_to_remove); } else { @@ -1659,7 +1659,7 @@ void IMergeTreeDataPart::remove() const request.emplace_back(fs::path(to) / DELETE_ON_DESTROY_MARKER_FILE_NAME, true); request.emplace_back(fs::path(to) / TXN_VERSION_METADATA_FILE_NAME, true); - disk->removeSharedFiles(request, !can_remove); + disk->removeSharedFiles(request, !can_remove, files_not_to_remove); disk->removeDirectory(to); } catch (...) @@ -1668,7 +1668,7 @@ void IMergeTreeDataPart::remove() const LOG_ERROR(storage.log, "Cannot quickly remove directory {} by removing files; fallback to recursive removal. Reason: {}", fullPath(disk, to), getCurrentExceptionMessage(false)); - disk->removeSharedRecursive(fs::path(to) / "", !can_remove); + disk->removeSharedRecursive(fs::path(to) / "", !can_remove, files_not_to_remove); } } } @@ -1689,7 +1689,7 @@ void IMergeTreeDataPart::projectionRemove(const String & parent_to, bool keep_sh "Cannot quickly remove directory {} by removing files; fallback to recursive removal. Reason: checksums.txt is missing", fullPath(disk, to)); /// If the part is not completely written, we cannot use fast path by listing files. - disk->removeSharedRecursive(fs::path(to) / "", keep_shared_data); + disk->removeSharedRecursive(fs::path(to) / "", keep_shared_data, {}); } else { @@ -1713,8 +1713,8 @@ void IMergeTreeDataPart::projectionRemove(const String & parent_to, bool keep_sh request.emplace_back(fs::path(to) / DEFAULT_COMPRESSION_CODEC_FILE_NAME, true); request.emplace_back(fs::path(to) / DELETE_ON_DESTROY_MARKER_FILE_NAME, true); - disk->removeSharedFiles(request, keep_shared_data); - disk->removeSharedRecursive(to, keep_shared_data); + disk->removeSharedFiles(request, keep_shared_data, {}); + disk->removeSharedRecursive(to, keep_shared_data, {}); } catch (...) { @@ -1722,7 +1722,7 @@ void IMergeTreeDataPart::projectionRemove(const String & parent_to, bool keep_sh LOG_ERROR(storage.log, "Cannot quickly remove directory {} by removing files; fallback to recursive removal. Reason: {}", fullPath(disk, to), getCurrentExceptionMessage(false)); - disk->removeSharedRecursive(fs::path(to) / "", keep_shared_data); + disk->removeSharedRecursive(fs::path(to) / "", keep_shared_data, {}); } } } diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index 3597120a05b..c7e119388c5 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -334,7 +334,7 @@ public: struct HardlinkedFiles { std::string source_part_name; - std::vector hardlinks_from_source_part; + NameSet hardlinks_from_source_part; }; HardlinkedFiles hardlinked_files; @@ -520,7 +520,7 @@ protected: String getRelativePathForDetachedPart(const String & prefix) const; - std::pair> canRemovePart() const; + std::pair canRemovePart() const; void initializePartMetadataManager(); diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index de6658e536c..e47077baffa 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -5762,7 +5762,7 @@ MergeTreeData::MutableDataPartPtr MergeTreeData::cloneAndLoadDataPartOnSameDisk( for (auto it = disk->iterateDirectory(src_part_path); it->isValid(); it->next()) { if (it->name() != IMergeTreeDataPart::DELETE_ON_DESTROY_MARKER_FILE_NAME && it->name() != IMergeTreeDataPart::TXN_VERSION_METADATA_FILE_NAME) - dst_data_part->hardlinked_files.hardlinks_from_source_part.push_back(it->name()); + dst_data_part->hardlinked_files.hardlinks_from_source_part.insert(it->name()); } } diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index 25e0ee3e7c6..7d51a8fa37c 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -943,7 +943,7 @@ public: /// Unlock shared data part in zookeeper /// Overridden in StorageReplicatedMergeTree - virtual std::pair> unlockSharedData(const IMergeTreeDataPart &) const { return std::make_pair(true, std::vector{}); } + virtual std::pair unlockSharedData(const IMergeTreeDataPart &) const { return std::make_pair(true, NameSet{}); } /// Fetch part only if some replica has it on shared storage like S3 /// Overridden in StorageReplicatedMergeTree diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index c5e5338d6fb..5c960898e3c 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -1070,7 +1070,7 @@ private: ctx->new_data_part->version.setCreationTID(tid, nullptr); ctx->new_data_part->storeVersionMetadata(); - std::vector hardlinked_files; + NameSet hardlinked_files; /// Create hardlinks for unchanged files for (auto it = ctx->disk->iterateDirectory(ctx->source_part->getFullRelativePath()); it->isValid(); it->next()) { @@ -1101,7 +1101,7 @@ private: if (!ctx->disk->isDirectory(it->path())) { ctx->disk->createHardLink(it->path(), destination); - hardlinked_files.push_back(it->name()); + hardlinked_files.insert(it->name()); } else if (!endsWith(".tmp_proj", it->name())) // ignore projection tmp merge dir @@ -1112,7 +1112,7 @@ private: { String p_destination = fs::path(destination) / p_it->name(); ctx->disk->createHardLink(p_it->path(), p_destination); - hardlinked_files.push_back(p_it->name()); + hardlinked_files.insert(p_it->name()); } } } diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 96ff6a41d65..4403fd2617f 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -7369,14 +7369,14 @@ void StorageReplicatedMergeTree::lockSharedData(const IMergeTreeDataPart & part, } } -std::pair> StorageReplicatedMergeTree::unlockSharedData(const IMergeTreeDataPart & part) const +std::pair StorageReplicatedMergeTree::unlockSharedData(const IMergeTreeDataPart & part) const { if (!part.volume || !part.isStoredOnDisk()) - return std::make_pair(true, std::vector{}); + return std::make_pair(true, NameSet{}); DiskPtr disk = part.volume->getDisk(); if (!disk || !disk->supportZeroCopyReplication()) - return std::make_pair(true, std::vector{}); + return std::make_pair(true, NameSet{}); /// If part is temporary refcount file may be absent auto ref_count_path = fs::path(part.getFullRelativePath()) / IMergeTreeDataPart::FILE_FOR_REFERENCES_CHECK; @@ -7384,19 +7384,19 @@ std::pair> StorageReplicatedMergeTree::unlockShar { auto ref_count = disk->getRefCount(ref_count_path); if (ref_count > 0) /// Keep part shard info for frozen backups - return std::make_pair(false, std::vector{}); + return std::make_pair(false, NameSet{}); } else { /// Temporary part with some absent file cannot be locked in shared mode - return std::make_pair(true, std::vector{}); + return std::make_pair(true, NameSet{}); } return unlockSharedDataByID(part.getUniqueId(), getTableSharedID(), part.name, replica_name, disk, getZooKeeper(), *getSettings(), log, zookeeper_path); } -std::pair> StorageReplicatedMergeTree::unlockSharedDataByID( +std::pair StorageReplicatedMergeTree::unlockSharedDataByID( String part_id, const String & table_uuid, const String & part_name, const String & replica_name_, DiskPtr disk, zkutil::ZooKeeperPtr zookeeper_ptr, const MergeTreeSettings & settings, Poco::Logger * logger, const String & zookeeper_path_old) @@ -7406,13 +7406,14 @@ std::pair> StorageReplicatedMergeTree::unlockShar Strings zc_zookeeper_paths = getZeroCopyPartPath(settings, disk->getType(), table_uuid, part_name, zookeeper_path_old); bool part_has_no_more_locks = true; - std::vector files_not_to_remove; + NameSet files_not_to_remove; for (const auto & zc_zookeeper_path : zc_zookeeper_paths) { files_not_to_remove.clear(); - auto content = zookeeper_ptr->get(zc_zookeeper_path); - boost::split(files_not_to_remove, content, boost::is_any_of("\n ")); + auto files_not_to_remove_str = zookeeper_ptr->get(zc_zookeeper_path); + boost::split(files_not_to_remove, files_not_to_remove_str, boost::is_any_of("\n ")); + String zookeeper_part_uniq_node = fs::path(zc_zookeeper_path) / part_id; /// Delete our replica node for part from zookeeper (we are not interested in it anymore) @@ -7848,7 +7849,7 @@ bool StorageReplicatedMergeTree::createEmptyPartInsteadOfLost(zkutil::ZooKeeperP } -void StorageReplicatedMergeTree::createZeroCopyLockNode(const zkutil::ZooKeeperPtr & zookeeper, const String & zookeeper_node, int32_t mode, bool replace_existing_lock, const std::string & path_to_set, const std::vector & hardlinked_files) +void StorageReplicatedMergeTree::createZeroCopyLockNode(const zkutil::ZooKeeperPtr & zookeeper, const String & zookeeper_node, int32_t mode, bool replace_existing_lock, const std::string & path_to_set, const NameSet & hardlinked_files) { /// In rare case other replica can remove path between createAncestors and createIfNotExists /// So we make up to 5 attempts @@ -8025,6 +8026,7 @@ bool StorageReplicatedMergeTree::removeSharedDetachedPart(DiskPtr disk, const St bool keep_shared = false; zkutil::ZooKeeperPtr zookeeper = getZooKeeper(); + NameSet files_not_to_remove; fs::path checksums = fs::path(path) / IMergeTreeDataPart::FILE_FOR_REFERENCES_CHECK; if (disk->exists(checksums)) @@ -8033,7 +8035,7 @@ bool StorageReplicatedMergeTree::removeSharedDetachedPart(DiskPtr disk, const St { String id = disk->getUniqueId(checksums); bool can_remove = false; - std::tie(can_remove, std::ignore) = StorageReplicatedMergeTree::unlockSharedDataByID(id, table_uuid, part_name, + std::tie(can_remove, files_not_to_remove) = StorageReplicatedMergeTree::unlockSharedDataByID(id, table_uuid, part_name, detached_replica_name, disk, zookeeper, getContext()->getReplicatedMergeTreeSettings(), log, detached_zookeeper_path); @@ -8043,7 +8045,7 @@ bool StorageReplicatedMergeTree::removeSharedDetachedPart(DiskPtr disk, const St keep_shared = true; } - disk->removeSharedRecursive(path, keep_shared); + disk->removeSharedRecursive(path, keep_shared, files_not_to_remove); return keep_shared; } diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index 2fb2494ca64..10efd6982b5 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -236,12 +236,12 @@ public: /// Unlock shared data part in zookeeper /// Return true if data unlocked /// Return false if data is still used by another node - std::pair> unlockSharedData(const IMergeTreeDataPart & part) const override; + std::pair unlockSharedData(const IMergeTreeDataPart & part) const override; /// Unlock shared data part in zookeeper by part id /// Return true if data unlocked /// Return false if data is still used by another node - static std::pair> unlockSharedDataByID(String part_id, const String & table_uuid, const String & part_name, const String & replica_name_, + static std::pair unlockSharedDataByID(String part_id, const String & table_uuid, const String & part_name, const String & replica_name_, DiskPtr disk, zkutil::ZooKeeperPtr zookeeper_, const MergeTreeSettings & settings, Poco::Logger * logger, const String & zookeeper_path_old); @@ -756,7 +756,7 @@ private: static Strings getZeroCopyPartPath(const MergeTreeSettings & settings, DiskType disk_type, const String & table_uuid, const String & part_name, const String & zookeeper_path_old); - static void createZeroCopyLockNode(const zkutil::ZooKeeperPtr & zookeeper, const String & zookeeper_node, int32_t mode = zkutil::CreateMode::Persistent, bool replace_existing_lock = false, const std::string & path_to_set = "", const std::vector & hardlinked_files = {}); + static void createZeroCopyLockNode(const zkutil::ZooKeeperPtr & zookeeper, const String & zookeeper_node, int32_t mode = zkutil::CreateMode::Persistent, bool replace_existing_lock = false, const std::string & path_to_set = "", const NameSet & hardlinked_files = {}); bool removeDetachedPart(DiskPtr disk, const String & path, const String & part_name, bool is_freezed) override; diff --git a/tests/queries/0_stateless/02067_lost_part_s3.reference b/tests/queries/0_stateless/02067_lost_part_s3.reference new file mode 100644 index 00000000000..f6f7853d3b6 --- /dev/null +++ b/tests/queries/0_stateless/02067_lost_part_s3.reference @@ -0,0 +1,3 @@ +10000 +10000 +10000 diff --git a/tests/queries/0_stateless/02067_lost_part_s3.sql b/tests/queries/0_stateless/02067_lost_part_s3.sql new file mode 100644 index 00000000000..f3f529a7052 --- /dev/null +++ b/tests/queries/0_stateless/02067_lost_part_s3.sql @@ -0,0 +1,26 @@ +drop table if exists partslost_0; +drop table if exists partslost_1; +drop table if exists partslost_2; + +CREATE TABLE partslost_0 (x String) ENGINE=ReplicatedMergeTree('/clickhouse/table/partslost/0', '0') ORDER BY tuple() SETTINGS min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0, old_parts_lifetime = 1, cleanup_delay_period = 1, cleanup_delay_period_random_add = 1; + +CREATE TABLE partslost_1 (x String) ENGINE=ReplicatedMergeTree('/clickhouse/table/partslost/0', '1') ORDER BY tuple() SETTINGS min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0, old_parts_lifetime = 1, cleanup_delay_period = 1, cleanup_delay_period_random_add = 1; + +CREATE TABLE partslost_2 (x String) ENGINE=ReplicatedMergeTree('/clickhouse/table/partslost/0', '2') ORDER BY tuple() SETTINGS min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0, old_parts_lifetime = 1, cleanup_delay_period = 1, cleanup_delay_period_random_add = 1; + + +INSERT INTO partslost_0 SELECT toString(number) AS x from system.numbers LIMIT 10000; + +ALTER TABLE partslost_0 ADD INDEX idx x TYPE tokenbf_v1(285000, 3, 12345) GRANULARITY 3; +ALTER TABLE partslost_0 MATERIALIZE INDEX idx; + +select sleep(3) FORMAT Null; +select sleep(3) FORMAT Null; +select sleep(3) FORMAT Null; +select sleep(3) FORMAT Null; + +ALTER TABLE partslost_0 DROP INDEX idx; + +select count() from partslost_0; +select count() from partslost_1; +select count() from partslost_2; From c81cb9e5632698d9bca5646f31093ea398fc8ea1 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 19 Apr 2022 01:12:07 +0200 Subject: [PATCH 15/55] Better --- src/Storages/StorageReplicatedMergeTree.cpp | 3 ++- tests/queries/0_stateless/02067_lost_part_s3.sql | 13 ++++++++++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 4403fd2617f..f4df673a483 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -7412,7 +7412,8 @@ std::pair StorageReplicatedMergeTree::unlockSharedDataByID( { files_not_to_remove.clear(); auto files_not_to_remove_str = zookeeper_ptr->get(zc_zookeeper_path); - boost::split(files_not_to_remove, files_not_to_remove_str, boost::is_any_of("\n ")); + if (!files_not_to_remove_str.empty()) + boost::split(files_not_to_remove, files_not_to_remove_str, boost::is_any_of("\n ")); String zookeeper_part_uniq_node = fs::path(zc_zookeeper_path) / part_id; diff --git a/tests/queries/0_stateless/02067_lost_part_s3.sql b/tests/queries/0_stateless/02067_lost_part_s3.sql index f3f529a7052..ac02318c261 100644 --- a/tests/queries/0_stateless/02067_lost_part_s3.sql +++ b/tests/queries/0_stateless/02067_lost_part_s3.sql @@ -1,6 +1,6 @@ -drop table if exists partslost_0; -drop table if exists partslost_1; -drop table if exists partslost_2; +DROP TABLE IF EXISTS partslost_0; +DROP TABLE IF EXISTS partslost_1; +DROP TABLE IF EXISTS partslost_2; CREATE TABLE partslost_0 (x String) ENGINE=ReplicatedMergeTree('/clickhouse/table/partslost/0', '0') ORDER BY tuple() SETTINGS min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0, old_parts_lifetime = 1, cleanup_delay_period = 1, cleanup_delay_period_random_add = 1; @@ -12,8 +12,11 @@ CREATE TABLE partslost_2 (x String) ENGINE=ReplicatedMergeTree('/clickhouse/tabl INSERT INTO partslost_0 SELECT toString(number) AS x from system.numbers LIMIT 10000; ALTER TABLE partslost_0 ADD INDEX idx x TYPE tokenbf_v1(285000, 3, 12345) GRANULARITY 3; + ALTER TABLE partslost_0 MATERIALIZE INDEX idx; +-- FIXME +select sleep(3) FORMAT Null; select sleep(3) FORMAT Null; select sleep(3) FORMAT Null; select sleep(3) FORMAT Null; @@ -24,3 +27,7 @@ ALTER TABLE partslost_0 DROP INDEX idx; select count() from partslost_0; select count() from partslost_1; select count() from partslost_2; + +DROP TABLE IF EXISTS partslost_0; +DROP TABLE IF EXISTS partslost_1; +DROP TABLE IF EXISTS partslost_2; From 8ade9fba529540c3246c855f427bb2be4268d8e1 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 19 Apr 2022 11:56:14 +0200 Subject: [PATCH 16/55] Fix --- src/Storages/MergeTree/MergeTreeData.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index e47077baffa..f18f5f147a7 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -1576,6 +1576,7 @@ MergeTreeData::DataPartsVector MergeTreeData::grabOldParts(bool force) return res; bool need_remove_parts_in_order = supportsReplication() && getSettings()->allow_remote_fs_zero_copy_replication; + time_t now = time(nullptr); std::vector parts_to_delete; @@ -1592,6 +1593,7 @@ MergeTreeData::DataPartsVector MergeTreeData::grabOldParts(bool force) { if (need_remove_parts_in_order) break; + continue; } @@ -1614,7 +1616,9 @@ MergeTreeData::DataPartsVector MergeTreeData::grabOldParts(bool force) parts_to_delete.emplace_back(it); } else if (need_remove_parts_in_order) + { break; + } } res.reserve(parts_to_delete.size()); From ee8d26ff0e57427467bc5d4158b79b29932249a2 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 19 Apr 2022 13:27:55 +0200 Subject: [PATCH 17/55] Better test and fix for move --- src/Storages/MergeTree/MergeTreeData.cpp | 17 +++++++++++++++++ src/Storages/MergeTree/MergeTreePartsMover.cpp | 4 ++++ .../queries/0_stateless/02067_lost_part_s3.sql | 9 ++++----- 3 files changed, 25 insertions(+), 5 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index f18f5f147a7..6229d4bb08e 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -1576,6 +1576,19 @@ MergeTreeData::DataPartsVector MergeTreeData::grabOldParts(bool force) return res; bool need_remove_parts_in_order = supportsReplication() && getSettings()->allow_remote_fs_zero_copy_replication; + if (need_remove_parts_in_order) + { + bool has_zero_copy_disk = false; + for (const auto & disk : getDisks()) + { + if (disk->supportZeroCopyReplication()) + { + has_zero_copy_disk = true; + break; + } + } + need_remove_parts_in_order = has_zero_copy_disk; + } time_t now = time(nullptr); std::vector parts_to_delete; @@ -3364,6 +3377,10 @@ void MergeTreeData::swapActivePart(MergeTreeData::DataPartPtr part_copy) ssize_t diff_rows = part_copy->rows_count - original_active_part->rows_count; increaseDataVolume(diff_bytes, diff_rows, /* parts= */ 0); + /// Move parts are non replicated operations, so we take lock here. + /// All other locks are taken in StorageReplicatedMergeTree + lockSharedData(*part_copy); + auto disk = original_active_part->volume->getDisk(); String marker_path = fs::path(original_active_part->getFullRelativePath()) / IMergeTreeDataPart::DELETE_ON_DESTROY_MARKER_FILE_NAME; try diff --git a/src/Storages/MergeTree/MergeTreePartsMover.cpp b/src/Storages/MergeTree/MergeTreePartsMover.cpp index 83b58960ad1..bb625d74ead 100644 --- a/src/Storages/MergeTree/MergeTreePartsMover.cpp +++ b/src/Storages/MergeTree/MergeTreePartsMover.cpp @@ -214,10 +214,14 @@ MergeTreeData::DataPartPtr MergeTreePartsMover::clonePart(const MergeTreeMoveEnt LOG_WARNING(log, "Path {} already exists. Will remove it and clone again.", fullPath(disk, path_to_clone + relative_path)); disk->removeRecursive(fs::path(path_to_clone) / relative_path / ""); } + disk->createDirectories(path_to_clone); bool is_fetched = data->tryToFetchIfShared(*part, disk, fs::path(path_to_clone) / part->name); if (!is_fetched) + { + LOG_INFO(log, "Part {} was not fetched, we are the first who move it to another disk, so we will copy it", part->name); part->volume->getDisk()->copy(fs::path(data->getRelativeDataPath()) / relative_path / "", disk, path_to_clone); + } part->volume->getDisk()->removeFileIfExists(fs::path(path_to_clone) / IMergeTreeDataPart::DELETE_ON_DESTROY_MARKER_FILE_NAME); } else diff --git a/tests/queries/0_stateless/02067_lost_part_s3.sql b/tests/queries/0_stateless/02067_lost_part_s3.sql index ac02318c261..ee3297331cd 100644 --- a/tests/queries/0_stateless/02067_lost_part_s3.sql +++ b/tests/queries/0_stateless/02067_lost_part_s3.sql @@ -2,11 +2,11 @@ DROP TABLE IF EXISTS partslost_0; DROP TABLE IF EXISTS partslost_1; DROP TABLE IF EXISTS partslost_2; -CREATE TABLE partslost_0 (x String) ENGINE=ReplicatedMergeTree('/clickhouse/table/partslost/0', '0') ORDER BY tuple() SETTINGS min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0, old_parts_lifetime = 1, cleanup_delay_period = 1, cleanup_delay_period_random_add = 1; +CREATE TABLE partslost_0 (x String) ENGINE=ReplicatedMergeTree('/clickhouse/table/{database}_02067_lost/partslost', '0') ORDER BY tuple() SETTINGS min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0, old_parts_lifetime = 1, cleanup_delay_period = 1, cleanup_delay_period_random_add = 1; -CREATE TABLE partslost_1 (x String) ENGINE=ReplicatedMergeTree('/clickhouse/table/partslost/0', '1') ORDER BY tuple() SETTINGS min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0, old_parts_lifetime = 1, cleanup_delay_period = 1, cleanup_delay_period_random_add = 1; +CREATE TABLE partslost_1 (x String) ENGINE=ReplicatedMergeTree('/clickhouse/table/{database}_02067_lost/partslost', '1') ORDER BY tuple() SETTINGS min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0, old_parts_lifetime = 1, cleanup_delay_period = 1, cleanup_delay_period_random_add = 1; -CREATE TABLE partslost_2 (x String) ENGINE=ReplicatedMergeTree('/clickhouse/table/partslost/0', '2') ORDER BY tuple() SETTINGS min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0, old_parts_lifetime = 1, cleanup_delay_period = 1, cleanup_delay_period_random_add = 1; +CREATE TABLE partslost_2 (x String) ENGINE=ReplicatedMergeTree('/clickhouse/table/{database}_02067_lost/partslost', '2') ORDER BY tuple() SETTINGS min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0, old_parts_lifetime = 1, cleanup_delay_period = 1, cleanup_delay_period_random_add = 1; INSERT INTO partslost_0 SELECT toString(number) AS x from system.numbers LIMIT 10000; @@ -15,8 +15,7 @@ ALTER TABLE partslost_0 ADD INDEX idx x TYPE tokenbf_v1(285000, 3, 12345) GRANUL ALTER TABLE partslost_0 MATERIALIZE INDEX idx; --- FIXME -select sleep(3) FORMAT Null; +-- In worst case doesn't check anything, but it's not flaky select sleep(3) FORMAT Null; select sleep(3) FORMAT Null; select sleep(3) FORMAT Null; From 2ad60301671cbd4be9e56eaaa03c94ba8ccfb4f5 Mon Sep 17 00:00:00 2001 From: Tyler Hannan Date: Tue, 19 Apr 2022 13:42:17 +0200 Subject: [PATCH 18/55] add AWS m5a.4xlarge benchmark --- website/benchmark/hardware/index.html | 1 + .../hardware/results/aws_m5a_4xlarge.json | 55 +++++++++++++++++++ 2 files changed, 56 insertions(+) create mode 100644 website/benchmark/hardware/results/aws_m5a_4xlarge.json diff --git a/website/benchmark/hardware/index.html b/website/benchmark/hardware/index.html index 42c87c334c0..fb6a572f050 100644 --- a/website/benchmark/hardware/index.html +++ b/website/benchmark/hardware/index.html @@ -95,6 +95,7 @@ Results for AWS instance type im4gn.8xlarge are from Ananth Gundabattula Results for AWS instance type im4gn.16xlarge are from Ananth Gundabattula (Darwinium).
Results for AWS instance type i3.2xlarge are from Ananth Gundabattula (Darwinium).
Results for 2x EPYC 7702 on ZFS mirror NVME are from Alibek A.
+Results for AWS instance type m5a.4xlarge are from Daniel Chimeno.

diff --git a/website/benchmark/hardware/results/aws_m5a_4xlarge.json b/website/benchmark/hardware/results/aws_m5a_4xlarge.json new file mode 100644 index 00000000000..48d7168eec5 --- /dev/null +++ b/website/benchmark/hardware/results/aws_m5a_4xlarge.json @@ -0,0 +1,55 @@ +[ + { + "system": "AWS m5a.4xlarge", + "system_full": "AWS m5a.4xlarge 16 vCPU, 512 GiB RAM, EBS GP3", + "time": "2020-08-13 00:00:00", + "kind": "cloud", + "result": + [ + [0.002, 0.002, 0.002], + [0.028, 0.017, 0.016], + [0.074, 0.040, 0.042], + [0.758, 0.061, 0.062], + [1.380, 0.151, 0.147], + [2.204, 0.394, 0.385], + [0.002, 0.002, 0.002], + [0.019, 0.018, 0.018], + [1.385, 0.751, 0.740], + [1.754, 0.873, 0.870], + [0.817, 0.222, 0.218], + [1.219, 0.268, 0.263], + [2.303, 1.092, 1.057], + [3.892, 1.431, 1.420], + [2.388, 1.199, 1.188], + [1.439, 1.329, 1.339], + [5.108, 3.816, 3.799], + [3.549, 2.273, 2.271], + [9.976, 7.023, 7.043], + [0.706, 0.068, 0.066], + [19.187, 1.101, 1.076], + [21.210, 1.315, 1.276], + [39.984, 3.065, 3.043], + [32.102, 1.328, 1.295], + [4.599, 0.341, 0.343], + [1.762, 0.294, 0.287], + [4.786, 0.345, 0.338], + [18.596, 1.190, 1.175], + [15.549, 1.973, 1.931], + [3.643, 3.628, 3.625], + [3.658, 0.781, 0.777], + [9.786, 1.383, 1.381], + [11.912, 10.221, 10.227], + [20.664, 5.942, 5.954], + [20.707, 6.092, 5.920], + [1.847, 1.758, 1.756], + [0.292, 0.221, 0.216], + [0.109, 0.086, 0.085], + [0.118, 0.084, 0.074], + [0.542, 0.452, 0.445], + [0.060, 0.026, 0.025], + [0.031, 0.019, 0.020], + [0.007, 0.005, 0.007] + ] + } +] + From cc06bc3d994268cd8731dd17ca310af144b2b846 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 19 Apr 2022 14:01:30 +0200 Subject: [PATCH 19/55] Add some clarifications --- src/Disks/DiskDecorator.cpp | 8 +++--- src/Disks/DiskDecorator.h | 4 +-- src/Disks/DiskEncrypted.h | 8 +++--- src/Disks/DiskRestartProxy.cpp | 8 +++--- src/Disks/DiskRestartProxy.h | 4 +-- src/Disks/IDisk.h | 9 ++++-- src/Disks/IDiskRemote.cpp | 8 +++--- src/Disks/IDiskRemote.h | 4 +-- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 3 +- src/Storages/MergeTree/IMergeTreeDataPart.h | 17 +++++++++++ src/Storages/MergeTree/MutateTask.cpp | 3 ++ src/Storages/StorageReplicatedMergeTree.cpp | 28 ++++++++++++------- src/Storages/StorageReplicatedMergeTree.h | 5 +++- 13 files changed, 72 insertions(+), 37 deletions(-) diff --git a/src/Disks/DiskDecorator.cpp b/src/Disks/DiskDecorator.cpp index 44358847961..47a6bd92c3a 100644 --- a/src/Disks/DiskDecorator.cpp +++ b/src/Disks/DiskDecorator.cpp @@ -151,14 +151,14 @@ void DiskDecorator::removeSharedFile(const String & path, bool keep_s3) delegate->removeSharedFile(path, keep_s3); } -void DiskDecorator::removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only) +void DiskDecorator::removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) { - delegate->removeSharedFiles(files, keep_all_batch_metadata, file_names_remove_metadata_only); + delegate->removeSharedFiles(files, keep_all_batch_data, file_names_remove_metadata_only); } -void DiskDecorator::removeSharedRecursive(const String & path, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only) +void DiskDecorator::removeSharedRecursive(const String & path, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) { - delegate->removeSharedRecursive(path, keep_all_batch_metadata, file_names_remove_metadata_only); + delegate->removeSharedRecursive(path, keep_all_batch_data, file_names_remove_metadata_only); } void DiskDecorator::setLastModified(const String & path, const Poco::Timestamp & timestamp) diff --git a/src/Disks/DiskDecorator.h b/src/Disks/DiskDecorator.h index 6c70e9a3571..27ddb001a83 100644 --- a/src/Disks/DiskDecorator.h +++ b/src/Disks/DiskDecorator.h @@ -52,8 +52,8 @@ public: void removeDirectory(const String & path) override; void removeRecursive(const String & path) override; void removeSharedFile(const String & path, bool keep_s3) override; - void removeSharedRecursive(const String & path, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only) override; - void removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only) override; + void removeSharedRecursive(const String & path, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) override; + void removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) override; void setLastModified(const String & path, const Poco::Timestamp & timestamp) override; Poco::Timestamp getLastModified(const String & path) override; void setReadOnly(const String & path) override; diff --git a/src/Disks/DiskEncrypted.h b/src/Disks/DiskEncrypted.h index 28156551e85..2f47dda59b3 100644 --- a/src/Disks/DiskEncrypted.h +++ b/src/Disks/DiskEncrypted.h @@ -159,18 +159,18 @@ public: delegate->removeSharedFile(wrapped_path, flag); } - void removeSharedRecursive(const String & path, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only) override + void removeSharedRecursive(const String & path, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) override { auto wrapped_path = wrappedPath(path); - delegate->removeSharedRecursive(wrapped_path, keep_all_batch_metadata, file_names_remove_metadata_only); + delegate->removeSharedRecursive(wrapped_path, keep_all_batch_data, file_names_remove_metadata_only); } - void removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only) override + void removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) override { for (const auto & file : files) { auto wrapped_path = wrappedPath(file.path); - bool keep = keep_all_batch_metadata || file_names_remove_metadata_only.contains(fs::path(file.path).filename()); + bool keep = keep_all_batch_data || file_names_remove_metadata_only.contains(fs::path(file.path).filename()); if (file.if_exists) delegate->removeSharedFileIfExists(wrapped_path, keep); else diff --git a/src/Disks/DiskRestartProxy.cpp b/src/Disks/DiskRestartProxy.cpp index ed0bc74b1a8..ba5d60901b3 100644 --- a/src/Disks/DiskRestartProxy.cpp +++ b/src/Disks/DiskRestartProxy.cpp @@ -251,16 +251,16 @@ void DiskRestartProxy::removeSharedFile(const String & path, bool keep_s3) DiskDecorator::removeSharedFile(path, keep_s3); } -void DiskRestartProxy::removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only) +void DiskRestartProxy::removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) { ReadLock lock (mutex); - DiskDecorator::removeSharedFiles(files, keep_all_batch_metadata, file_names_remove_metadata_only); + DiskDecorator::removeSharedFiles(files, keep_all_batch_data, file_names_remove_metadata_only); } -void DiskRestartProxy::removeSharedRecursive(const String & path, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only) +void DiskRestartProxy::removeSharedRecursive(const String & path, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) { ReadLock lock (mutex); - DiskDecorator::removeSharedRecursive(path, keep_all_batch_metadata, file_names_remove_metadata_only); + DiskDecorator::removeSharedRecursive(path, keep_all_batch_data, file_names_remove_metadata_only); } void DiskRestartProxy::setLastModified(const String & path, const Poco::Timestamp & timestamp) diff --git a/src/Disks/DiskRestartProxy.h b/src/Disks/DiskRestartProxy.h index 03bdc047218..25b4787b482 100644 --- a/src/Disks/DiskRestartProxy.h +++ b/src/Disks/DiskRestartProxy.h @@ -54,8 +54,8 @@ public: void removeDirectory(const String & path) override; void removeRecursive(const String & path) override; void removeSharedFile(const String & path, bool keep_s3) override; - void removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only) override; - void removeSharedRecursive(const String & path, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only) override; + void removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) override; + void removeSharedRecursive(const String & path, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) override; void setLastModified(const String & path, const Poco::Timestamp & timestamp) override; Poco::Timestamp getLastModified(const String & path) override; void setReadOnly(const String & path) override; diff --git a/src/Disks/IDisk.h b/src/Disks/IDisk.h index 30b8ac2c44b..cc47c4262f5 100644 --- a/src/Disks/IDisk.h +++ b/src/Disks/IDisk.h @@ -196,7 +196,8 @@ public: /// Remove file or directory with all children. Use with extra caution. Throws exception if file doesn't exists. /// Differs from removeRecursive for S3/HDFS disks - /// Second bool param is a flag to remove (true) or keep (false) shared data on S3 + /// Second bool param is a flag to remove (true) or keep (false) shared data on S3. + /// Third param determines which files cannot be removed even if second is true. virtual void removeSharedRecursive(const String & path, bool, const NameSet &) { removeRecursive(path); } /// Remove file or directory if it exists. @@ -237,11 +238,13 @@ public: /// Batch request to remove multiple files. /// May be much faster for blob storage. - virtual void removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only) + /// Second bool param is a flag to remove (true) or keep (false) shared data on S3. + /// Third param determines which files cannot be removed even if second is true. + virtual void removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) { for (const auto & file : files) { - bool keep_file = keep_all_batch_metadata || file_names_remove_metadata_only.contains(fs::path(file.path).filename()); + bool keep_file = keep_all_batch_data || file_names_remove_metadata_only.contains(fs::path(file.path).filename()); if (file.if_exists) removeSharedFileIfExists(file.path, keep_file); else diff --git a/src/Disks/IDiskRemote.cpp b/src/Disks/IDiskRemote.cpp index a546ab56dbf..d155616acb5 100644 --- a/src/Disks/IDiskRemote.cpp +++ b/src/Disks/IDiskRemote.cpp @@ -501,7 +501,7 @@ void IDiskRemote::removeSharedFileIfExists(const String & path, bool delete_meta } } -void IDiskRemote::removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only) +void IDiskRemote::removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) { std::unordered_map> paths_to_remove; for (const auto & file : files) @@ -511,7 +511,7 @@ void IDiskRemote::removeSharedFiles(const RemoveBatchRequest & files, bool keep_ removeMetadata(file.path, paths_to_remove); } - if (!keep_all_batch_metadata) + if (!keep_all_batch_data) { std::vector remove_from_remote; for (auto && [path, remote_paths] : paths_to_remove) @@ -523,12 +523,12 @@ void IDiskRemote::removeSharedFiles(const RemoveBatchRequest & files, bool keep_ } } -void IDiskRemote::removeSharedRecursive(const String & path, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only) +void IDiskRemote::removeSharedRecursive(const String & path, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) { std::unordered_map> paths_to_remove; removeMetadataRecursive(path, paths_to_remove); - if (!keep_all_batch_metadata) + if (!keep_all_batch_data) { std::vector remove_from_remote; for (auto && [local_path, remote_paths] : paths_to_remove) diff --git a/src/Disks/IDiskRemote.h b/src/Disks/IDiskRemote.h index 901403f4689..abb311ac32c 100644 --- a/src/Disks/IDiskRemote.h +++ b/src/Disks/IDiskRemote.h @@ -114,9 +114,9 @@ public: void removeSharedFileIfExists(const String & path, bool delete_metadata_only) override; - void removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only) override; + void removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) override; - void removeSharedRecursive(const String & path, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only) override; + void removeSharedRecursive(const String & path, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) override; void listFiles(const String & path, std::vector & file_names) override; diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 13fd6d6d09e..9fdf0c2cac7 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -1622,6 +1622,8 @@ void IMergeTreeDataPart::remove() const std::unordered_set projection_directories; for (const auto & [p_name, projection_part] : projection_parts) { + /// NOTE: projections currently unsupported with zero copy replication. + /// TODO: fix it. projection_part->projectionRemove(to, !can_remove); projection_directories.emplace(p_name + ".proj"); } @@ -1665,7 +1667,6 @@ void IMergeTreeDataPart::remove() const catch (...) { /// Recursive directory removal does many excessive "stat" syscalls under the hood. - LOG_ERROR(storage.log, "Cannot quickly remove directory {} by removing files; fallback to recursive removal. Reason: {}", fullPath(disk, to), getCurrentExceptionMessage(false)); disk->removeSharedRecursive(fs::path(to) / "", !can_remove, files_not_to_remove); diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index c7e119388c5..cf869a8f34e 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -331,12 +331,19 @@ public: mutable VersionMetadata version; + /// struct HardlinkedFiles { + /// Hardlinked from part std::string source_part_name; + /// Hardlinked files list NameSet hardlinks_from_source_part; }; + /// Files hardlinked from source part during mutation/backup + /// Need only for zero-copy replication implementation. + /// Don't use, it's only in memory and don't survive a server restart or table + /// DETACH/ATTACH. HardlinkedFiles hardlinked_files; /// For data in RAM ('index') @@ -520,6 +527,16 @@ protected: String getRelativePathForDetachedPart(const String & prefix) const; + /// Checks that part can be actually removed from disk. + /// In ordinary scenario always returns true, but in case of + /// zero-copy replication part can be hold by some other replicas. + /// + /// If method return false than only metadata of part from + /// local storage can be removed, leaving data in remove FS untouched. + /// + /// If method return true, than files can be actually removed from remote + /// storage storage, excluding files in the second returned argument. + /// They can be hardlinks to some newer parts. std::pair canRemovePart() const; void initializePartMetadataManager(); diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index 5c960898e3c..df02e9fe6f9 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -1117,6 +1117,9 @@ private: } } + /// Traking of hardlinked files required for zero-copy replication. + /// We don't remove them when we delete last copy of source part because + /// new part can use them. ctx->new_data_part->hardlinked_files.source_part_name = ctx->source_part->name; ctx->new_data_part->hardlinked_files.hardlinks_from_source_part = hardlinked_files; diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index f4df673a483..a4f176a213e 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -7349,12 +7349,12 @@ void StorageReplicatedMergeTree::lockSharedData(const IMergeTreeDataPart & part, *getSettings(), disk->getType(), getTableSharedID(), part.name, zookeeper_path); - String path_to_set; + String path_to_set_hardlinked_files; if (!part.hardlinked_files.hardlinks_from_source_part.empty()) { - path_to_set = getZeroCopyPartPath( - *getSettings(), disk->getType(), getTableSharedID(), - part.hardlinked_files.source_part_name, zookeeper_path)[0]; + path_to_set_hardlinked_files = getZeroCopyPartPath( + *getSettings(), disk->getType(), getTableSharedID(), + part.hardlinked_files.source_part_name, zookeeper_path)[0]; } for (const auto & zc_zookeeper_path : zc_zookeeper_paths) @@ -7365,7 +7365,7 @@ void StorageReplicatedMergeTree::lockSharedData(const IMergeTreeDataPart & part, createZeroCopyLockNode( zookeeper, zookeeper_node, zkutil::CreateMode::Persistent, - replace_existing_lock, path_to_set, part.hardlinked_files.hardlinks_from_source_part); + replace_existing_lock, path_to_set_hardlinked_files, part.hardlinked_files.hardlinks_from_source_part); } } @@ -7850,7 +7850,9 @@ bool StorageReplicatedMergeTree::createEmptyPartInsteadOfLost(zkutil::ZooKeeperP } -void StorageReplicatedMergeTree::createZeroCopyLockNode(const zkutil::ZooKeeperPtr & zookeeper, const String & zookeeper_node, int32_t mode, bool replace_existing_lock, const std::string & path_to_set, const NameSet & hardlinked_files) +void StorageReplicatedMergeTree::createZeroCopyLockNode( + const zkutil::ZooKeeperPtr & zookeeper, const String & zookeeper_node, int32_t mode, + bool replace_existing_lock, const String & path_to_set_hardlinked_files, const NameSet & hardlinked_files) { /// In rare case other replica can remove path between createAncestors and createIfNotExists /// So we make up to 5 attempts @@ -7870,10 +7872,13 @@ void StorageReplicatedMergeTree::createZeroCopyLockNode(const zkutil::ZooKeeperP Coordination::Requests ops; ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_node, -1)); ops.emplace_back(zkutil::makeCreateRequest(zookeeper_node, "", mode)); - if (!path_to_set.empty() && !hardlinked_files.empty()) + if (!path_to_set_hardlinked_files.empty() && !hardlinked_files.empty()) { std::string data = boost::algorithm::join(hardlinked_files, "\n"); - ops.emplace_back(zkutil::makeSetRequest(path_to_set, data, -1)); + /// List of files used to detect hardlinks. path_to_set_hardlinked_files -- + /// is a path to source part zero copy node. During part removal hardlinked + /// files will be left for source part. + ops.emplace_back(zkutil::makeSetRequest(path_to_set_hardlinked_files, data, -1)); } Coordination::Responses responses; auto error = zookeeper->tryMulti(ops, responses); @@ -7883,10 +7888,13 @@ void StorageReplicatedMergeTree::createZeroCopyLockNode(const zkutil::ZooKeeperP else { Coordination::Requests ops; - if (!path_to_set.empty() && !hardlinked_files.empty()) + if (!path_to_set_hardlinked_files.empty() && !hardlinked_files.empty()) { std::string data = boost::algorithm::join(hardlinked_files, "\n"); - ops.emplace_back(zkutil::makeSetRequest(path_to_set, data, -1)); + /// List of files used to detect hardlinks. path_to_set_hardlinked_files -- + /// is a path to source part zero copy node. During part removal hardlinked + /// files will be left for source part. + ops.emplace_back(zkutil::makeSetRequest(path_to_set_hardlinked_files, data, -1)); } ops.emplace_back(zkutil::makeCreateRequest(zookeeper_node, "", mode)); diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index 10efd6982b5..54d6977d115 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -756,7 +756,10 @@ private: static Strings getZeroCopyPartPath(const MergeTreeSettings & settings, DiskType disk_type, const String & table_uuid, const String & part_name, const String & zookeeper_path_old); - static void createZeroCopyLockNode(const zkutil::ZooKeeperPtr & zookeeper, const String & zookeeper_node, int32_t mode = zkutil::CreateMode::Persistent, bool replace_existing_lock = false, const std::string & path_to_set = "", const NameSet & hardlinked_files = {}); + static void createZeroCopyLockNode( + const zkutil::ZooKeeperPtr & zookeeper, const String & zookeeper_node, + int32_t mode = zkutil::CreateMode::Persistent, bool replace_existing_lock = false, + const String & path_to_set_hardlinked_files = "", const NameSet & hardlinked_files = {}); bool removeDetachedPart(DiskPtr disk, const String & path, const String & part_name, bool is_freezed) override; From 7c2fcee3518ca8a3c12c8bf947e82a25ed2a4659 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 19 Apr 2022 14:13:11 +0200 Subject: [PATCH 20/55] Simplify interface --- src/Disks/IDiskRemote.cpp | 16 ++++++++-------- src/Disks/IDiskRemote.h | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/Disks/IDiskRemote.cpp b/src/Disks/IDiskRemote.cpp index d155616acb5..bc224904a5a 100644 --- a/src/Disks/IDiskRemote.cpp +++ b/src/Disks/IDiskRemote.cpp @@ -271,7 +271,7 @@ std::unordered_map IDiskRemote::getSerializedMetadata(const std: return metadatas; } -void IDiskRemote::removeMetadata(const String & path, std::unordered_map> & paths_to_remove) +void IDiskRemote::removeMetadata(const String & path, std::vector & paths_to_remove) { LOG_TRACE(log, "Remove file by path: {}", backQuote(metadata_disk->getPath() + path)); @@ -289,7 +289,7 @@ void IDiskRemote::removeMetadata(const String & path, std::unordered_mapisFile(path)) { - removeMetadata(path, paths_to_remove); + removeMetadata(path, paths_to_remove[path]); } else { @@ -483,21 +483,21 @@ void IDiskRemote::replaceFile(const String & from_path, const String & to_path) void IDiskRemote::removeSharedFile(const String & path, bool delete_metadata_only) { - std::unordered_map> paths_to_remove; + std::vector paths_to_remove; removeMetadata(path, paths_to_remove); if (!delete_metadata_only) - removeFromRemoteFS(paths_to_remove[path]); + removeFromRemoteFS(paths_to_remove); } void IDiskRemote::removeSharedFileIfExists(const String & path, bool delete_metadata_only) { - std::unordered_map> paths_to_remove; + std::vector paths_to_remove; if (metadata_disk->exists(path)) { removeMetadata(path, paths_to_remove); if (!delete_metadata_only) - removeFromRemoteFS(paths_to_remove[path]); + removeFromRemoteFS(paths_to_remove); } } @@ -508,7 +508,7 @@ void IDiskRemote::removeSharedFiles(const RemoveBatchRequest & files, bool keep_ { bool skip = file.if_exists && !metadata_disk->exists(file.path); if (!skip) - removeMetadata(file.path, paths_to_remove); + removeMetadata(file.path, paths_to_remove[file.path]); } if (!keep_all_batch_data) diff --git a/src/Disks/IDiskRemote.h b/src/Disks/IDiskRemote.h index abb311ac32c..b2251e1abe3 100644 --- a/src/Disks/IDiskRemote.h +++ b/src/Disks/IDiskRemote.h @@ -171,7 +171,7 @@ protected: FileCachePtr cache; private: - void removeMetadata(const String & path, std::unordered_map> & paths_to_remove); + void removeMetadata(const String & path, std::vector & paths_to_remove); void removeMetadataRecursive(const String & path, std::unordered_map> & paths_to_remove); From 7cb7c120cc1f70b91e1e41e8e91a878bdd240995 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 19 Apr 2022 15:53:10 +0200 Subject: [PATCH 21/55] Less ugly --- src/Storages/MergeTree/IMergeTreeDataPart.h | 15 ----- src/Storages/MergeTree/MergeTreeData.cpp | 12 ++-- src/Storages/MergeTree/MergeTreeData.h | 13 +++- .../MergeTree/MutateFromLogEntryTask.cpp | 2 +- src/Storages/MergeTree/MutateTask.cpp | 15 +++-- src/Storages/MergeTree/MutateTask.h | 3 +- src/Storages/StorageMergeTree.cpp | 4 +- src/Storages/StorageReplicatedMergeTree.cpp | 60 +++++++++++++++---- src/Storages/StorageReplicatedMergeTree.h | 4 +- 9 files changed, 83 insertions(+), 45 deletions(-) diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index cf869a8f34e..2301bac55bd 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -331,21 +331,6 @@ public: mutable VersionMetadata version; - /// - struct HardlinkedFiles - { - /// Hardlinked from part - std::string source_part_name; - /// Hardlinked files list - NameSet hardlinks_from_source_part; - }; - - /// Files hardlinked from source part during mutation/backup - /// Need only for zero-copy replication implementation. - /// Don't use, it's only in memory and don't survive a server restart or table - /// DETACH/ATTACH. - HardlinkedFiles hardlinked_files; - /// For data in RAM ('index') UInt64 getIndexSizeInBytes() const; UInt64 getIndexSizeInAllocatedBytes() const; diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 6229d4bb08e..28221aac91e 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -5727,7 +5727,8 @@ MergeTreeData::MutableDataPartPtr MergeTreeData::cloneAndLoadDataPartOnSameDisk( const String & tmp_part_prefix, const MergeTreePartInfo & dst_part_info, const StorageMetadataPtr & metadata_snapshot, - const MergeTreeTransactionPtr & txn) + const MergeTreeTransactionPtr & txn, + HardlinkedFiles * hardlinked_files) { /// Check that the storage policy contains the disk where the src_part is located. bool does_storage_policy_allow_same_disk = false; @@ -5756,8 +5757,6 @@ MergeTreeData::MutableDataPartPtr MergeTreeData::cloneAndLoadDataPartOnSameDisk( if (disk->exists(dst_part_path)) throw Exception("Part in " + fullPath(disk, dst_part_path) + " already exists", ErrorCodes::DIRECTORY_ALREADY_EXISTS); - - bool src_memory_part = false; /// If source part is in memory, flush it to disk and clone it already in on-disk format if (auto src_part_in_memory = asInMemoryPart(src_part)) { @@ -5765,7 +5764,6 @@ MergeTreeData::MutableDataPartPtr MergeTreeData::cloneAndLoadDataPartOnSameDisk( auto flushed_part_path = src_part_in_memory->getRelativePathForPrefix(tmp_part_prefix); src_part_in_memory->flushToDisk(src_relative_data_path, flushed_part_path, metadata_snapshot); src_part_path = fs::path(src_relative_data_path) / flushed_part_path / ""; - src_memory_part = true; } LOG_DEBUG(log, "Cloning part {} to {}", fullPath(disk, src_part_path), fullPath(disk, dst_part_path)); @@ -5776,14 +5774,14 @@ MergeTreeData::MutableDataPartPtr MergeTreeData::cloneAndLoadDataPartOnSameDisk( auto single_disk_volume = std::make_shared(disk->getName(), disk, 0); auto dst_data_part = createPart(dst_part_name, dst_part_info, single_disk_volume, tmp_dst_part_name); - if (!src_memory_part && !asInMemoryPart(dst_data_part)) + if (hardlinked_files) { - dst_data_part->hardlinked_files.source_part_name = src_part->name; + hardlinked_files->source_part_name = src_part->name; for (auto it = disk->iterateDirectory(src_part_path); it->isValid(); it->next()) { if (it->name() != IMergeTreeDataPart::DELETE_ON_DESTROY_MARKER_FILE_NAME && it->name() != IMergeTreeDataPart::TXN_VERSION_METADATA_FILE_NAME) - dst_data_part->hardlinked_files.hardlinks_from_source_part.insert(it->name()); + hardlinked_files->hardlinks_from_source_part.insert(it->name()); } } diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index 7d51a8fa37c..42d9d984be3 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -767,9 +767,17 @@ public: MergeTreeData & checkStructureAndGetMergeTreeData(const StoragePtr & source_table, const StorageMetadataPtr & src_snapshot, const StorageMetadataPtr & my_snapshot) const; MergeTreeData & checkStructureAndGetMergeTreeData(IStorage & source_table, const StorageMetadataPtr & src_snapshot, const StorageMetadataPtr & my_snapshot) const; + struct HardlinkedFiles + { + /// Hardlinked from part + std::string source_part_name; + /// Hardlinked files list + NameSet hardlinks_from_source_part; + }; + MergeTreeData::MutableDataPartPtr cloneAndLoadDataPartOnSameDisk( const MergeTreeData::DataPartPtr & src_part, const String & tmp_part_prefix, const MergeTreePartInfo & dst_part_info, - const StorageMetadataPtr & metadata_snapshot, const MergeTreeTransactionPtr & txn); + const StorageMetadataPtr & metadata_snapshot, const MergeTreeTransactionPtr & txn, HardlinkedFiles * hardlinked_files); virtual std::vector getMutationsStatus() const = 0; @@ -937,9 +945,10 @@ public: bool scheduleDataMovingJob(BackgroundJobsAssignee & assignee); bool areBackgroundMovesNeeded() const; + /// Lock part in zookeeper for shared data in several nodes /// Overridden in StorageReplicatedMergeTree - virtual void lockSharedData(const IMergeTreeDataPart &, bool = false) const {} /// NOLINT + virtual void lockSharedData(const IMergeTreeDataPart &, bool = false, std::optional = {}) const {} /// NOLINT /// Unlock shared data part in zookeeper /// Overridden in StorageReplicatedMergeTree diff --git a/src/Storages/MergeTree/MutateFromLogEntryTask.cpp b/src/Storages/MergeTree/MutateFromLogEntryTask.cpp index de31fbe3c56..554053c1bd3 100644 --- a/src/Storages/MergeTree/MutateFromLogEntryTask.cpp +++ b/src/Storages/MergeTree/MutateFromLogEntryTask.cpp @@ -175,7 +175,7 @@ bool MutateFromLogEntryTask::finalize(ReplicatedMergeMutateTaskBase::PartLogWrit try { - storage.checkPartChecksumsAndCommit(*transaction_ptr, new_part); + storage.checkPartChecksumsAndCommit(*transaction_ptr, new_part, mutate_task->getHardlinkedFiles()); } catch (const Exception & e) { diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index df02e9fe6f9..83502f1031d 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -548,6 +548,8 @@ struct MutationContext ExecuteTTLType execute_ttl_type{ExecuteTTLType::NONE}; MergeTreeTransactionPtr txn; + + MergeTreeData::HardlinkedFiles hardlinked_files; }; using MutationContextPtr = std::shared_ptr; @@ -1120,8 +1122,8 @@ private: /// Traking of hardlinked files required for zero-copy replication. /// We don't remove them when we delete last copy of source part because /// new part can use them. - ctx->new_data_part->hardlinked_files.source_part_name = ctx->source_part->name; - ctx->new_data_part->hardlinked_files.hardlinks_from_source_part = hardlinked_files; + ctx->hardlinked_files.source_part_name = ctx->source_part->name; + ctx->hardlinked_files.hardlinks_from_source_part = hardlinked_files; (*ctx->mutate_entry)->columns_written = ctx->storage_columns.size() - ctx->updated_header.columns(); @@ -1297,7 +1299,7 @@ bool MutateTask::prepare() storage_from_source_part, ctx->metadata_snapshot, ctx->commands_for_part, Context::createCopy(context_for_reading))) { LOG_TRACE(ctx->log, "Part {} doesn't change up to mutation version {}", ctx->source_part->name, ctx->future_part->part_info.mutation); - promise.set_value(ctx->data->cloneAndLoadDataPartOnSameDisk(ctx->source_part, "tmp_clone_", ctx->future_part->part_info, ctx->metadata_snapshot, ctx->txn)); + promise.set_value(ctx->data->cloneAndLoadDataPartOnSameDisk(ctx->source_part, "tmp_clone_", ctx->future_part->part_info, ctx->metadata_snapshot, ctx->txn, &ctx->hardlinked_files)); return false; } else @@ -1388,7 +1390,7 @@ bool MutateTask::prepare() && ctx->files_to_rename.empty()) { LOG_TRACE(ctx->log, "Part {} doesn't change up to mutation version {} (optimized)", ctx->source_part->name, ctx->future_part->part_info.mutation); - promise.set_value(ctx->data->cloneAndLoadDataPartOnSameDisk(ctx->source_part, "tmp_mut_", ctx->future_part->part_info, ctx->metadata_snapshot, ctx->txn)); + promise.set_value(ctx->data->cloneAndLoadDataPartOnSameDisk(ctx->source_part, "tmp_mut_", ctx->future_part->part_info, ctx->metadata_snapshot, ctx->txn, &ctx->hardlinked_files)); return false; } @@ -1398,5 +1400,10 @@ bool MutateTask::prepare() return true; } +const MergeTreeData::HardlinkedFiles & MutateTask::getHardlinkedFiles() const +{ + return ctx->hardlinked_files; +} + } diff --git a/src/Storages/MergeTree/MutateTask.h b/src/Storages/MergeTree/MutateTask.h index aa38ee34b4a..40447cc63c8 100644 --- a/src/Storages/MergeTree/MutateTask.h +++ b/src/Storages/MergeTree/MutateTask.h @@ -44,6 +44,8 @@ public: return promise.get_future(); } + const MergeTreeData::HardlinkedFiles & getHardlinkedFiles() const; + private: bool prepare(); @@ -56,7 +58,6 @@ private: State state{State::NEED_PREPARE}; - std::promise promise; std::shared_ptr ctx; diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index e322d8785fa..50c0c083c99 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -1583,7 +1583,7 @@ void StorageMergeTree::replacePartitionFrom(const StoragePtr & source_table, con Int64 temp_index = insert_increment.get(); MergeTreePartInfo dst_part_info(partition_id, temp_index, temp_index, src_part->info.level); - auto dst_part = cloneAndLoadDataPartOnSameDisk(src_part, TMP_PREFIX, dst_part_info, my_metadata_snapshot, local_context->getCurrentTransaction()); + auto dst_part = cloneAndLoadDataPartOnSameDisk(src_part, TMP_PREFIX, dst_part_info, my_metadata_snapshot, local_context->getCurrentTransaction(), {}); dst_parts.emplace_back(std::move(dst_part)); } @@ -1669,7 +1669,7 @@ void StorageMergeTree::movePartitionToTable(const StoragePtr & dest_table, const Int64 temp_index = insert_increment.get(); MergeTreePartInfo dst_part_info(partition_id, temp_index, temp_index, src_part->info.level); - auto dst_part = dest_table_storage->cloneAndLoadDataPartOnSameDisk(src_part, TMP_PREFIX, dst_part_info, dest_metadata_snapshot, local_context->getCurrentTransaction()); + auto dst_part = dest_table_storage->cloneAndLoadDataPartOnSameDisk(src_part, TMP_PREFIX, dst_part_info, dest_metadata_snapshot, local_context->getCurrentTransaction(), {}); dst_parts.emplace_back(std::move(dst_part)); } diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index a4f176a213e..2389746c90b 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -1447,7 +1447,7 @@ void StorageReplicatedMergeTree::checkPartChecksumsAndAddCommitOps(const zkutil: } MergeTreeData::DataPartsVector StorageReplicatedMergeTree::checkPartChecksumsAndCommit(Transaction & transaction, - const DataPartPtr & part) + const DataPartPtr & part, std::optional hardlinked_files) { auto zookeeper = getZooKeeper(); @@ -1457,7 +1457,7 @@ MergeTreeData::DataPartsVector StorageReplicatedMergeTree::checkPartChecksumsAnd Coordination::Requests ops; NameSet absent_part_paths_on_replicas; - lockSharedData(*part, false); + lockSharedData(*part, false, hardlinked_files); /// Checksums are checked here and `ops` is filled. In fact, the part is added to ZK just below, when executing `multi`. checkPartChecksumsAndAddCommitOps(zookeeper, part, ops, part->name, &absent_part_paths_on_replicas); @@ -1992,6 +1992,8 @@ bool StorageReplicatedMergeTree::executeReplaceRange(const LogEntry & entry) /// A replica that will be used to fetch part String replica; + + MergeTreeData::HardlinkedFiles hardlinked_files; }; using PartDescriptionPtr = std::shared_ptr; @@ -2197,6 +2199,7 @@ bool StorageReplicatedMergeTree::executeReplaceRange(const LogEntry & entry) static const String TMP_PREFIX = "tmp_replace_from_"; + std::vector hardlinked_files_for_parts; auto obtain_part = [&] (PartDescriptionPtr & part_desc) { if (part_desc->src_table_part) @@ -2206,7 +2209,7 @@ bool StorageReplicatedMergeTree::executeReplaceRange(const LogEntry & entry) throw Exception("Checksums of " + part_desc->src_table_part->name + " is suddenly changed", ErrorCodes::UNFINISHED); part_desc->res_part = cloneAndLoadDataPartOnSameDisk( - part_desc->src_table_part, TMP_PREFIX + "clone_", part_desc->new_part_info, metadata_snapshot, NO_TRANSACTION_PTR); + part_desc->src_table_part, TMP_PREFIX + "clone_", part_desc->new_part_info, metadata_snapshot, NO_TRANSACTION_PTR, &part_desc->hardlinked_files); } else if (!part_desc->replica.empty()) { @@ -2253,8 +2256,11 @@ bool StorageReplicatedMergeTree::executeReplaceRange(const LogEntry & entry) { renameTempPartAndReplace(part_desc->res_part, NO_TRANSACTION_RAW, nullptr, &transaction); getCommitPartOps(ops, part_desc->res_part); + + lockSharedData(*part_desc->res_part, false, part_desc->hardlinked_files); } + if (!ops.empty()) zookeeper->multi(ops); @@ -2281,6 +2287,10 @@ bool StorageReplicatedMergeTree::executeReplaceRange(const LogEntry & entry) catch (...) { PartLog::addNewParts(getContext(), res_parts, watch.elapsed(), ExecutionStatus::fromCurrentException()); + + for (const auto & res_part : res_parts) + unlockSharedData(*res_part); + throw; } @@ -3933,12 +3943,13 @@ bool StorageReplicatedMergeTree::fetchPart(const String & part_name, const Stora InterserverCredentialsPtr credentials; std::optional tagger_ptr; std::function get_part; + MergeTreeData::HardlinkedFiles hardlinked_files; if (part_to_clone) { get_part = [&, part_to_clone]() { - return cloneAndLoadDataPartOnSameDisk(part_to_clone, "tmp_clone_", part_info, metadata_snapshot, NO_TRANSACTION_PTR); + return cloneAndLoadDataPartOnSameDisk(part_to_clone, "tmp_clone_", part_info, metadata_snapshot, NO_TRANSACTION_PTR, &hardlinked_files); }; } else @@ -3984,7 +3995,7 @@ bool StorageReplicatedMergeTree::fetchPart(const String & part_name, const Stora Transaction transaction(*this, NO_TRANSACTION_RAW); renameTempPartAndReplace(part, NO_TRANSACTION_RAW, nullptr, &transaction); - replaced_parts = checkPartChecksumsAndCommit(transaction, part); + replaced_parts = checkPartChecksumsAndCommit(transaction, part, hardlinked_files); /** If a quorum is tracked for this part, you must update it. * If you do not have time, in case of losing the session, when you restart the server - see the `ReplicatedMergeTreeRestartingThread::updateQuorumIfWeHavePart` method. @@ -6381,6 +6392,7 @@ void StorageReplicatedMergeTree::replacePartitionFrom( assert(replace == !LogEntry::ReplaceRangeEntry::isMovePartitionOrAttachFrom(drop_range)); String drop_range_fake_part_name = getPartNamePossiblyFake(format_version, drop_range); + std::vector hardlinked_files_for_parts; for (const auto & src_part : src_all_parts) { @@ -6411,13 +6423,16 @@ void StorageReplicatedMergeTree::replacePartitionFrom( UInt64 index = lock->getNumber(); MergeTreePartInfo dst_part_info(partition_id, index, index, src_part->info.level); - auto dst_part = cloneAndLoadDataPartOnSameDisk(src_part, TMP_PREFIX, dst_part_info, metadata_snapshot, NO_TRANSACTION_PTR); + MergeTreeData::HardlinkedFiles hardlinked_files; + + auto dst_part = cloneAndLoadDataPartOnSameDisk(src_part, TMP_PREFIX, dst_part_info, metadata_snapshot, NO_TRANSACTION_PTR, &hardlinked_files); src_parts.emplace_back(src_part); dst_parts.emplace_back(dst_part); ephemeral_locks.emplace_back(std::move(*lock)); block_id_paths.emplace_back(block_id_path); part_checksums.emplace_back(hash_hex); + hardlinked_files_for_parts.emplace_back(hardlinked_files); } ReplicatedMergeTreeLogEntryData entry; @@ -6475,6 +6490,9 @@ void StorageReplicatedMergeTree::replacePartitionFrom( renameTempPartAndReplace(part, query_context->getCurrentTransaction().get(), nullptr, &transaction, data_parts_lock); } + for (size_t i = 0; i < dst_parts.size(); ++i) + lockSharedData(*dst_parts[i], false, hardlinked_files_for_parts[i]); + Coordination::Error code = zookeeper->tryMulti(ops, op_results); if (code == Coordination::Error::ZOK) delimiting_block_lock->assumeUnlocked(); @@ -6502,6 +6520,9 @@ void StorageReplicatedMergeTree::replacePartitionFrom( catch (...) { PartLog::addNewParts(getContext(), dst_parts, watch.elapsed(), ExecutionStatus::fromCurrentException()); + for (const auto & dst_part : dst_parts) + unlockSharedData(*dst_part); + throw; } @@ -6603,6 +6624,8 @@ void StorageReplicatedMergeTree::movePartitionToTable(const StoragePtr & dest_ta String dest_alter_partition_version_path = dest_table_storage->zookeeper_path + "/alter_partition_version"; Coordination::Stat dest_alter_partition_version_stat; zookeeper->get(dest_alter_partition_version_path, &dest_alter_partition_version_stat); + std::vector hardlinked_files_for_parts; + for (const auto & src_part : src_all_parts) { if (!dest_table_storage->canReplacePartition(src_part)) @@ -6622,13 +6645,16 @@ void StorageReplicatedMergeTree::movePartitionToTable(const StoragePtr & dest_ta UInt64 index = lock->getNumber(); MergeTreePartInfo dst_part_info(partition_id, index, index, src_part->info.level); - auto dst_part = dest_table_storage->cloneAndLoadDataPartOnSameDisk(src_part, TMP_PREFIX, dst_part_info, dest_metadata_snapshot, NO_TRANSACTION_PTR); + + MergeTreeData::HardlinkedFiles hardlinked_files; + auto dst_part = dest_table_storage->cloneAndLoadDataPartOnSameDisk(src_part, TMP_PREFIX, dst_part_info, dest_metadata_snapshot, NO_TRANSACTION_PTR, &hardlinked_files); src_parts.emplace_back(src_part); dst_parts.emplace_back(dst_part); ephemeral_locks.emplace_back(std::move(*lock)); block_id_paths.emplace_back(block_id_path); part_checksums.emplace_back(hash_hex); + hardlinked_files_for_parts.emplace_back(hardlinked_files); } ReplicatedMergeTreeLogEntryData entry_delete; @@ -6695,6 +6721,9 @@ void StorageReplicatedMergeTree::movePartitionToTable(const StoragePtr & dest_ta for (MutableDataPartPtr & part : dst_parts) dest_table_storage->renameTempPartAndReplace(part, query_context->getCurrentTransaction().get(), nullptr, &transaction, lock); + for (size_t i = 0; i < dst_parts.size(); ++i) + lockSharedData(*dst_parts[i], false, hardlinked_files_for_parts[i]); + Coordination::Error code = zookeeper->tryMulti(ops, op_results); if (code == Coordination::Error::ZBADVERSION) continue; @@ -6710,6 +6739,10 @@ void StorageReplicatedMergeTree::movePartitionToTable(const StoragePtr & dest_ta catch (...) { PartLog::addNewParts(getContext(), dst_parts, watch.elapsed(), ExecutionStatus::fromCurrentException()); + + for (const auto & dst_part : dst_parts) + unlockSharedData(*dst_part); + throw; } @@ -7327,7 +7360,7 @@ void StorageReplicatedMergeTree::lockSharedDataTemporary(const String & part_nam } } -void StorageReplicatedMergeTree::lockSharedData(const IMergeTreeDataPart & part, bool replace_existing_lock) const +void StorageReplicatedMergeTree::lockSharedData(const IMergeTreeDataPart & part, bool replace_existing_lock, std::optional hardlinked_files) const { auto settings = getSettings(); @@ -7350,11 +7383,16 @@ void StorageReplicatedMergeTree::lockSharedData(const IMergeTreeDataPart & part, part.name, zookeeper_path); String path_to_set_hardlinked_files; - if (!part.hardlinked_files.hardlinks_from_source_part.empty()) + NameSet hardlinks; + + if (hardlinked_files.has_value() && !hardlinked_files->hardlinks_from_source_part.empty()) { + LOG_DEBUG(&Poco::Logger::get("DEBUG"), "ADDING SOME HARDLINKS {}", fmt::join(hardlinked_files->hardlinks_from_source_part, ", ")); path_to_set_hardlinked_files = getZeroCopyPartPath( *getSettings(), disk->getType(), getTableSharedID(), - part.hardlinked_files.source_part_name, zookeeper_path)[0]; + hardlinked_files->source_part_name, zookeeper_path)[0]; + + hardlinks = hardlinked_files->hardlinks_from_source_part; } for (const auto & zc_zookeeper_path : zc_zookeeper_paths) @@ -7365,7 +7403,7 @@ void StorageReplicatedMergeTree::lockSharedData(const IMergeTreeDataPart & part, createZeroCopyLockNode( zookeeper, zookeeper_node, zkutil::CreateMode::Persistent, - replace_existing_lock, path_to_set_hardlinked_files, part.hardlinked_files.hardlinks_from_source_part); + replace_existing_lock, path_to_set_hardlinked_files, hardlinks); } } diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index 54d6977d115..b6a003280f1 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -229,7 +229,7 @@ public: bool executeFetchShared(const String & source_replica, const String & new_part_name, const DiskPtr & disk, const String & path); /// Lock part in zookeeper for use shared data in several nodes - void lockSharedData(const IMergeTreeDataPart & part, bool replace_existing_lock) const override; + void lockSharedData(const IMergeTreeDataPart & part, bool replace_existing_lock, std::optional hardlinked_files = {}) const override; void lockSharedDataTemporary(const String & part_name, const String & part_id, const DiskPtr & disk) const; @@ -469,7 +469,7 @@ private: String getChecksumsForZooKeeper(const MergeTreeDataPartChecksums & checksums) const; /// Accepts a PreActive part, atomically checks its checksums with ones on other replicas and commit the part - DataPartsVector checkPartChecksumsAndCommit(Transaction & transaction, const DataPartPtr & part); + DataPartsVector checkPartChecksumsAndCommit(Transaction & transaction, const DataPartPtr & part, std::optional hardlinked_files = {}); bool partIsAssignedToBackgroundOperation(const DataPartPtr & part) const override; From 7919485db2357d5c3e8a6e8252842d60982e4531 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 19 Apr 2022 15:57:25 +0200 Subject: [PATCH 22/55] Fix --- src/Storages/StorageReplicatedMergeTree.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 2389746c90b..1487d165046 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -7387,7 +7387,6 @@ void StorageReplicatedMergeTree::lockSharedData(const IMergeTreeDataPart & part, if (hardlinked_files.has_value() && !hardlinked_files->hardlinks_from_source_part.empty()) { - LOG_DEBUG(&Poco::Logger::get("DEBUG"), "ADDING SOME HARDLINKS {}", fmt::join(hardlinked_files->hardlinks_from_source_part, ", ")); path_to_set_hardlinked_files = getZeroCopyPartPath( *getSettings(), disk->getType(), getTableSharedID(), hardlinked_files->source_part_name, zookeeper_path)[0]; From aea7c7755ad8d9e6752187d8b6ec64b656b974d3 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 19 Apr 2022 17:25:08 +0200 Subject: [PATCH 23/55] Fix style --- src/Storages/MergeTree/MutateTask.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index 83502f1031d..bf1d034263d 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -1119,7 +1119,7 @@ private: } } - /// Traking of hardlinked files required for zero-copy replication. + /// Tracking of hardlinked files required for zero-copy replication. /// We don't remove them when we delete last copy of source part because /// new part can use them. ctx->hardlinked_files.source_part_name = ctx->source_part->name; From 033a3475a57172aa2d3677dc93af24421897aa74 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 19 Apr 2022 17:57:12 +0200 Subject: [PATCH 24/55] Update src/Disks/IDiskRemote.cpp --- src/Disks/IDiskRemote.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Disks/IDiskRemote.cpp b/src/Disks/IDiskRemote.cpp index bc224904a5a..e4a610180d0 100644 --- a/src/Disks/IDiskRemote.cpp +++ b/src/Disks/IDiskRemote.cpp @@ -283,7 +283,7 @@ void IDiskRemote::removeMetadata(const String & path, std::vector & path try { - auto metadata_updater = [&paths_to_remove, path, this] (Metadata & metadata) + auto metadata_updater = [&paths_to_remove, this] (Metadata & metadata) { if (metadata.ref_count == 0) { From 158a25d5fabe422f72214901b244b851d6a3403e Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Wed, 20 Apr 2022 01:22:04 +0800 Subject: [PATCH 25/55] Fix column matcher and column transformer --- .../JoinToSubqueryTransformVisitor.cpp | 47 +++++-- src/Interpreters/PredicateRewriteVisitor.cpp | 4 +- .../TranslateQualifiedNamesVisitor.cpp | 62 ++++----- src/Parsers/ASTAsterisk.cpp | 2 + src/Parsers/ASTColumnsMatcher.cpp | 93 +++++++++++--- src/Parsers/ASTColumnsMatcher.h | 32 ++--- src/Parsers/ASTColumnsTransformers.cpp | 119 +++++++++++++++++- src/Parsers/ASTColumnsTransformers.h | 11 +- src/Parsers/ASTQualifiedAsterisk.cpp | 2 + src/Parsers/ExpressionElementParsers.cpp | 12 +- ...n_matcher_and_column_transformer.reference | 0 ..._column_matcher_and_column_transformer.sql | 64 ++++++++++ 12 files changed, 358 insertions(+), 90 deletions(-) create mode 100644 tests/queries/0_stateless/02271_fix_column_matcher_and_column_transformer.reference create mode 100644 tests/queries/0_stateless/02271_fix_column_matcher_and_column_transformer.sql diff --git a/src/Interpreters/JoinToSubqueryTransformVisitor.cpp b/src/Interpreters/JoinToSubqueryTransformVisitor.cpp index c43302e0de9..a6e43993196 100644 --- a/src/Interpreters/JoinToSubqueryTransformVisitor.cpp +++ b/src/Interpreters/JoinToSubqueryTransformVisitor.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -81,6 +82,7 @@ public: /// By default should_add_column_predicate returns true for any column name void addTableColumns( const String & table_name, + ASTs & columns, ShouldAddColumnPredicate should_add_column_predicate = [](const String &) { return true; }) { auto it = table_columns.find(table_name); @@ -105,7 +107,7 @@ public: else identifier = std::make_shared(std::vector{it->first, column.name}); - new_select_expression_list->children.emplace_back(std::move(identifier)); + columns.emplace_back(std::move(identifier)); } } } @@ -129,14 +131,18 @@ private: for (const auto & child : node.children) { - if (child->as()) + ASTs columns; + if (const auto * asterisk = child->as()) { has_asterisks = true; for (auto & table_name : data.tables_order) - data.addTableColumns(table_name); + data.addTableColumns(table_name, columns); + + for (const auto & transformer : asterisk->children) + IASTColumnsTransformer::transform(transformer, columns); } - else if (child->as()) + else if (const auto * qualified_asterisk = child->as()) { has_asterisks = true; @@ -144,17 +150,44 @@ private: throw Exception("Logical error: qualified asterisk must have exactly one child", ErrorCodes::LOGICAL_ERROR); auto & identifier = child->children[0]->as(); - data.addTableColumns(identifier.name()); + data.addTableColumns(identifier.name(), columns); + + // QualifiedAsterisk's transformers start to appear at child 1 + for (auto it = qualified_asterisk->children.begin() + 1; it != qualified_asterisk->children.end(); ++it) + { + IASTColumnsTransformer::transform(*it, columns); + } } - else if (auto * columns_matcher = child->as()) + else if (const auto * columns_list_matcher = child->as()) + { + has_asterisks = true; + + for (const auto & ident : columns_list_matcher->column_list->children) + columns.emplace_back(ident->clone()); + + for (const auto & transformer : columns_list_matcher->children) + IASTColumnsTransformer::transform(transformer, columns); + } + else if (const auto * columns_regexp_matcher = child->as()) { has_asterisks = true; for (auto & table_name : data.tables_order) - data.addTableColumns(table_name, [&](const String & column_name) { return columns_matcher->isColumnMatching(column_name); }); + data.addTableColumns( + table_name, + columns, + [&](const String & column_name) { return columns_regexp_matcher->isColumnMatching(column_name); }); + + for (const auto & transformer : columns_regexp_matcher->children) + IASTColumnsTransformer::transform(transformer, columns); } else data.new_select_expression_list->children.push_back(child); + + data.new_select_expression_list->children.insert( + data.new_select_expression_list->children.end(), + std::make_move_iterator(columns.begin()), + std::make_move_iterator(columns.end())); } if (!has_asterisks) diff --git a/src/Interpreters/PredicateRewriteVisitor.cpp b/src/Interpreters/PredicateRewriteVisitor.cpp index 74ecc247eb1..910c846d58b 100644 --- a/src/Interpreters/PredicateRewriteVisitor.cpp +++ b/src/Interpreters/PredicateRewriteVisitor.cpp @@ -96,8 +96,8 @@ void PredicateRewriteVisitorData::visitOtherInternalSelect(ASTSelectQuery & sele size_t alias_index = 0; for (auto & ref_select : temp_select_query->refSelect()->children) { - if (!ref_select->as() && !ref_select->as() && !ref_select->as() && - !ref_select->as()) + if (!ref_select->as() && !ref_select->as() && !ref_select->as() + && !ref_select->as() && !ref_select->as()) { if (const auto & alias = ref_select->tryGetAlias(); alias.empty()) ref_select->setAlias("--predicate_optimizer_" + toString(alias_index++)); diff --git a/src/Interpreters/TranslateQualifiedNamesVisitor.cpp b/src/Interpreters/TranslateQualifiedNamesVisitor.cpp index 6016d54c7dc..63f99b2a398 100644 --- a/src/Interpreters/TranslateQualifiedNamesVisitor.cpp +++ b/src/Interpreters/TranslateQualifiedNamesVisitor.cpp @@ -196,7 +196,7 @@ void TranslateQualifiedNamesMatcher::visit(ASTExpressionList & node, const ASTPt bool has_asterisk = false; for (const auto & child : node.children) { - if (child->as() || child->as()) + if (child->as() || child->as() || child->as()) { if (tables_with_columns.empty()) throw Exception("An asterisk cannot be replaced with empty columns.", ErrorCodes::LOGICAL_ERROR); @@ -229,47 +229,38 @@ void TranslateQualifiedNamesMatcher::visit(ASTExpressionList & node, const ASTPt for (const auto & column : *cols) { if (first_table || !data.join_using_columns.count(column.name)) - { addIdentifier(columns, table.table, column.name); - } } } - first_table = false; } - for (const auto & transformer : asterisk->children) - { - IASTColumnsTransformer::transform(transformer, columns); - } - } - else if (const auto * asterisk_pattern = child->as()) - { - if (asterisk_pattern->column_list) - { - for (const auto & ident : asterisk_pattern->column_list->children) - columns.emplace_back(ident->clone()); - } - else - { - bool first_table = true; - for (const auto & table : tables_with_columns) - { - for (const auto & column : table.columns) - { - if (asterisk_pattern->isColumnMatching(column.name) && (first_table || !data.join_using_columns.count(column.name))) - { - addIdentifier(columns, table.table, column.name); - } - } - first_table = false; - } - } - // ColumnsMatcher's transformers start to appear at child 1 - for (auto it = asterisk_pattern->children.begin() + 1; it != asterisk_pattern->children.end(); ++it) + for (const auto & transformer : asterisk->children) + IASTColumnsTransformer::transform(transformer, columns); + } + else if (auto * asterisk_column_list = child->as()) + { + for (const auto & ident : asterisk_column_list->column_list->children) + columns.emplace_back(ident->clone()); + + for (const auto & transformer : asterisk_column_list->children) + IASTColumnsTransformer::transform(transformer, columns); + } + else if (const auto * asterisk_regexp_pattern = child->as()) + { + bool first_table = true; + for (const auto & table : tables_with_columns) { - IASTColumnsTransformer::transform(*it, columns); + for (const auto & column : table.columns) + { + if (asterisk_regexp_pattern->isColumnMatching(column.name) && (first_table || !data.join_using_columns.count(column.name))) + addIdentifier(columns, table.table, column.name); + } + first_table = false; } + + for (const auto & transformer : asterisk_regexp_pattern->children) + IASTColumnsTransformer::transform(transformer, columns); } else if (const auto * qualified_asterisk = child->as()) { @@ -280,12 +271,11 @@ void TranslateQualifiedNamesMatcher::visit(ASTExpressionList & node, const ASTPt if (ident_db_and_name.satisfies(table.table, true)) { for (const auto & column : table.columns) - { addIdentifier(columns, table.table, column.name); - } break; } } + // QualifiedAsterisk's transformers start to appear at child 1 for (auto it = qualified_asterisk->children.begin() + 1; it != qualified_asterisk->children.end(); ++it) { diff --git a/src/Parsers/ASTAsterisk.cpp b/src/Parsers/ASTAsterisk.cpp index ed733e62ca3..e2f45d04fa4 100644 --- a/src/Parsers/ASTAsterisk.cpp +++ b/src/Parsers/ASTAsterisk.cpp @@ -17,6 +17,8 @@ void ASTAsterisk::appendColumnName(WriteBuffer & ostr) const { ostr.write('*'); void ASTAsterisk::formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const { settings.ostr << "*"; + + /// Format column transformers for (const auto & child : children) { settings.ostr << ' '; diff --git a/src/Parsers/ASTColumnsMatcher.cpp b/src/Parsers/ASTColumnsMatcher.cpp index 45799cb7ffe..8f167f99b37 100644 --- a/src/Parsers/ASTColumnsMatcher.cpp +++ b/src/Parsers/ASTColumnsMatcher.cpp @@ -1,64 +1,117 @@ -#include "ASTColumnsMatcher.h" +#include + +#include #include -#include #include #include -#include +#include namespace DB { + namespace ErrorCodes { extern const int CANNOT_COMPILE_REGEXP; } -ASTPtr ASTColumnsMatcher::clone() const +ASTPtr ASTColumnsRegexpMatcher::clone() const { - auto clone = std::make_shared(*this); + auto clone = std::make_shared(*this); clone->cloneChildren(); return clone; } -void ASTColumnsMatcher::appendColumnName(WriteBuffer & ostr) const { writeString(original_pattern, ostr); } +void ASTColumnsRegexpMatcher::appendColumnName(WriteBuffer & ostr) const +{ + writeCString("COLUMNS(", ostr); + writeQuotedString(original_pattern, ostr); + writeChar(')', ostr); +} -void ASTColumnsMatcher::updateTreeHashImpl(SipHash & hash_state) const +void ASTColumnsRegexpMatcher::updateTreeHashImpl(SipHash & hash_state) const { hash_state.update(original_pattern.size()); hash_state.update(original_pattern); IAST::updateTreeHashImpl(hash_state); } -void ASTColumnsMatcher::formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const +void ASTColumnsRegexpMatcher::formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const { settings.ostr << (settings.hilite ? hilite_keyword : "") << "COLUMNS" << (settings.hilite ? hilite_none : "") << "("; - if (column_list) - { - frame.expression_list_prepend_whitespace = false; - column_list->formatImpl(settings, state, frame); - } - else - settings.ostr << quoteString(original_pattern); + settings.ostr << quoteString(original_pattern); settings.ostr << ")"; - for (ASTs::const_iterator it = children.begin() + 1; it != children.end(); ++it) + + /// Format column transformers + for (const auto & child : children) { settings.ostr << ' '; - (*it)->formatImpl(settings, state, frame); + child->formatImpl(settings, state, frame); } } -void ASTColumnsMatcher::setPattern(String pattern) +void ASTColumnsRegexpMatcher::setPattern(String pattern) { original_pattern = std::move(pattern); column_matcher = std::make_shared(original_pattern, RE2::Quiet); if (!column_matcher->ok()) - throw DB::Exception("COLUMNS pattern " + original_pattern + " cannot be compiled: " + column_matcher->error(), DB::ErrorCodes::CANNOT_COMPILE_REGEXP); + throw DB::Exception( + "COLUMNS pattern " + original_pattern + " cannot be compiled: " + column_matcher->error(), + DB::ErrorCodes::CANNOT_COMPILE_REGEXP); } -bool ASTColumnsMatcher::isColumnMatching(const String & column_name) const +bool ASTColumnsRegexpMatcher::isColumnMatching(const String & column_name) const { return RE2::PartialMatch(column_name, *column_matcher); } +ASTPtr ASTColumnsListMatcher::clone() const +{ + auto clone = std::make_shared(*this); + clone->column_list = column_list->clone(); + clone->cloneChildren(); + return clone; +} + +void ASTColumnsListMatcher::updateTreeHashImpl(SipHash & hash_state) const +{ + column_list->updateTreeHash(hash_state); + IAST::updateTreeHashImpl(hash_state); +} + +void ASTColumnsListMatcher::appendColumnName(WriteBuffer & ostr) const +{ + writeCString("COLUMNS(", ostr); + for (auto it = column_list->children.begin(); it != column_list->children.end(); ++it) + { + if (it != column_list->children.begin()) + writeCString(", ", ostr); + + (*it)->appendColumnName(ostr); + } + writeChar(')', ostr); +} + +void ASTColumnsListMatcher::formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const +{ + settings.ostr << (settings.hilite ? hilite_keyword : "") << "COLUMNS" << (settings.hilite ? hilite_none : "") << "("; + + for (ASTs::const_iterator it = column_list->children.begin(); it != column_list->children.end(); ++it) + { + if (it != column_list->children.begin()) + { + settings.ostr << ", "; + } + (*it)->formatImpl(settings, state, frame); + } + settings.ostr << ")"; + + /// Format column transformers + for (const auto & child : children) + { + settings.ostr << ' '; + child->formatImpl(settings, state, frame); + } +} } diff --git a/src/Parsers/ASTColumnsMatcher.h b/src/Parsers/ASTColumnsMatcher.h index 76ece9c95cc..5aaf3cbe30d 100644 --- a/src/Parsers/ASTColumnsMatcher.h +++ b/src/Parsers/ASTColumnsMatcher.h @@ -2,10 +2,9 @@ #include - namespace re2 { - class RE2; +class RE2; } @@ -14,21 +13,13 @@ namespace DB class WriteBuffer; -namespace ErrorCodes -{ -} - -struct AsteriskSemantic; -struct AsteriskSemanticImpl; - - /** SELECT COLUMNS('regexp') is expanded to multiple columns like * (asterisk). * Optional transformers can be attached to further manipulate these expanded columns. */ -class ASTColumnsMatcher : public IAST +class ASTColumnsRegexpMatcher : public IAST { public: - String getID(char) const override { return "ColumnsMatcher"; } + String getID(char) const override { return "ColumnsRegexpMatcher"; } ASTPtr clone() const override; void appendColumnName(WriteBuffer & ostr) const override; @@ -36,17 +27,26 @@ public: bool isColumnMatching(const String & column_name) const; void updateTreeHashImpl(SipHash & hash_state) const override; - ASTPtr column_list; - protected: void formatImpl(const FormatSettings & settings, FormatState &, FormatStateStacked) const override; private: std::shared_ptr column_matcher; String original_pattern; - std::shared_ptr semantic; /// pimpl +}; - friend struct AsteriskSemantic; +/// Same as the above but use a list of column names to do matching. +class ASTColumnsListMatcher : public IAST +{ +public: + String getID(char) const override { return "ColumnsListMatcher"; } + ASTPtr clone() const override; + void appendColumnName(WriteBuffer & ostr) const override; + void updateTreeHashImpl(SipHash & hash_state) const override; + + ASTPtr column_list; +protected: + void formatImpl(const FormatSettings & settings, FormatState &, FormatStateStacked) const override; }; diff --git a/src/Parsers/ASTColumnsTransformers.cpp b/src/Parsers/ASTColumnsTransformers.cpp index 451ecf0d4dd..d90d1e747f4 100644 --- a/src/Parsers/ASTColumnsTransformers.cpp +++ b/src/Parsers/ASTColumnsTransformers.cpp @@ -105,6 +105,49 @@ void ASTColumnsApplyTransformer::transform(ASTs & nodes) const } } +void ASTColumnsApplyTransformer::appendColumnName(WriteBuffer & ostr) const +{ + writeCString("APPLY ", ostr); + if (!column_name_prefix.empty()) + writeChar('(', ostr); + + if (lambda) + lambda->appendColumnName(ostr); + else + { + writeString(func_name, ostr); + + if (parameters) + parameters->appendColumnName(ostr); + } + + if (!column_name_prefix.empty()) + { + writeCString(", '", ostr); + writeString(column_name_prefix, ostr); + writeCString("')", ostr); + } +} + +void ASTColumnsApplyTransformer::updateTreeHashImpl(SipHash & hash_state) const +{ + hash_state.update(func_name.size()); + hash_state.update(func_name); + if (parameters) + parameters->updateTreeHashImpl(hash_state); + + if (lambda) + lambda->updateTreeHashImpl(hash_state); + + hash_state.update(lambda_arg.size()); + hash_state.update(lambda_arg); + + hash_state.update(column_name_prefix.size()); + hash_state.update(column_name_prefix); + + IAST::updateTreeHashImpl(hash_state); +} + void ASTColumnsExceptTransformer::formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const { settings.ostr << (settings.hilite ? hilite_keyword : "") << "EXCEPT" << (is_strict ? " STRICT " : " ") << (settings.hilite ? hilite_none : ""); @@ -128,6 +171,38 @@ void ASTColumnsExceptTransformer::formatImpl(const FormatSettings & settings, Fo settings.ostr << ")"; } +void ASTColumnsExceptTransformer::appendColumnName(WriteBuffer & ostr) const +{ + writeCString("EXCEPT ", ostr); + if (is_strict) + writeCString("STRICT ", ostr); + + if (children.size() > 1) + writeChar('(', ostr); + + for (ASTs::const_iterator it = children.begin(); it != children.end(); ++it) + { + if (it != children.begin()) + writeCString(", ", ostr); + (*it)->appendColumnName(ostr); + } + + if (!original_pattern.empty()) + writeQuotedString(original_pattern, ostr); + + if (children.size() > 1) + writeChar(')', ostr); +} + +void ASTColumnsExceptTransformer::updateTreeHashImpl(SipHash & hash_state) const +{ + hash_state.update(is_strict); + hash_state.update(original_pattern.size()); + hash_state.update(original_pattern); + + IAST::updateTreeHashImpl(hash_state); +} + void ASTColumnsExceptTransformer::transform(ASTs & nodes) const { std::set expected_columns; @@ -201,6 +276,21 @@ void ASTColumnsReplaceTransformer::Replacement::formatImpl( settings.ostr << (settings.hilite ? hilite_keyword : "") << " AS " << (settings.hilite ? hilite_none : "") << backQuoteIfNeed(name); } +void ASTColumnsReplaceTransformer::Replacement::appendColumnName(WriteBuffer & ostr) const +{ + expr->appendColumnName(ostr); + writeCString(" AS ", ostr); + writeProbablyBackQuotedString(name, ostr); +} + +void ASTColumnsReplaceTransformer::Replacement::updateTreeHashImpl(SipHash & hash_state) const +{ + hash_state.update(name.size()); + hash_state.update(name); + expr->updateTreeHashImpl(hash_state); + IAST::updateTreeHashImpl(hash_state); +} + void ASTColumnsReplaceTransformer::formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const { settings.ostr << (settings.hilite ? hilite_keyword : "") << "REPLACE" << (is_strict ? " STRICT " : " ") << (settings.hilite ? hilite_none : ""); @@ -211,9 +301,8 @@ void ASTColumnsReplaceTransformer::formatImpl(const FormatSettings & settings, F for (ASTs::const_iterator it = children.begin(); it != children.end(); ++it) { if (it != children.begin()) - { settings.ostr << ", "; - } + (*it)->formatImpl(settings, state, frame); } @@ -221,6 +310,32 @@ void ASTColumnsReplaceTransformer::formatImpl(const FormatSettings & settings, F settings.ostr << ")"; } +void ASTColumnsReplaceTransformer::appendColumnName(WriteBuffer & ostr) const +{ + writeCString("REPLACE ", ostr); + if (is_strict) + writeCString("STRICT ", ostr); + + if (children.size() > 1) + writeChar('(', ostr); + + for (ASTs::const_iterator it = children.begin(); it != children.end(); ++it) + { + if (it != children.begin()) + writeCString(", ", ostr); + (*it)->appendColumnName(ostr); + } + + if (children.size() > 1) + writeChar(')', ostr); +} + +void ASTColumnsReplaceTransformer::updateTreeHashImpl(SipHash & hash_state) const +{ + hash_state.update(is_strict); + IAST::updateTreeHashImpl(hash_state); +} + void ASTColumnsReplaceTransformer::replaceChildren(ASTPtr & node, const ASTPtr & replacement, const String & name) { for (auto & child : node->children) diff --git a/src/Parsers/ASTColumnsTransformers.h b/src/Parsers/ASTColumnsTransformers.h index 1064beb44bd..0f16f6b93e7 100644 --- a/src/Parsers/ASTColumnsTransformers.h +++ b/src/Parsers/ASTColumnsTransformers.h @@ -30,6 +30,8 @@ public: return res; } void transform(ASTs & nodes) const override; + void appendColumnName(WriteBuffer & ostr) const override; + void updateTreeHashImpl(SipHash & hash_state) const override; // Case 1 APPLY (quantile(0.9)) String func_name; @@ -59,6 +61,8 @@ public: void transform(ASTs & nodes) const override; void setPattern(String pattern); bool isColumnMatching(const String & column_name) const; + void appendColumnName(WriteBuffer & ostr) const override; + void updateTreeHashImpl(SipHash & hash_state) const override; protected: void formatImpl(const FormatSettings & settings, FormatState &, FormatStateStacked) const override; @@ -76,12 +80,13 @@ public: ASTPtr clone() const override { auto replacement = std::make_shared(*this); - replacement->children.clear(); replacement->expr = expr->clone(); - replacement->children.push_back(replacement->expr); return replacement; } + void appendColumnName(WriteBuffer & ostr) const override; + void updateTreeHashImpl(SipHash & hash_state) const override; + String name; ASTPtr expr; @@ -98,6 +103,8 @@ public: return clone; } void transform(ASTs & nodes) const override; + void appendColumnName(WriteBuffer & ostr) const override; + void updateTreeHashImpl(SipHash & hash_state) const override; protected: void formatImpl(const FormatSettings & settings, FormatState &, FormatStateStacked) const override; diff --git a/src/Parsers/ASTQualifiedAsterisk.cpp b/src/Parsers/ASTQualifiedAsterisk.cpp index 2491dcb36b7..b755e4eb98c 100644 --- a/src/Parsers/ASTQualifiedAsterisk.cpp +++ b/src/Parsers/ASTQualifiedAsterisk.cpp @@ -17,6 +17,8 @@ void ASTQualifiedAsterisk::formatImpl(const FormatSettings & settings, FormatSta const auto & qualifier = children.at(0); qualifier->formatImpl(settings, state, frame); settings.ostr << ".*"; + + /// Format column transformers for (ASTs::const_iterator it = children.begin() + 1; it != children.end(); ++it) { settings.ostr << ' '; diff --git a/src/Parsers/ExpressionElementParsers.cpp b/src/Parsers/ExpressionElementParsers.cpp index 29c7846283e..c13e6831780 100644 --- a/src/Parsers/ExpressionElementParsers.cpp +++ b/src/Parsers/ExpressionElementParsers.cpp @@ -1796,16 +1796,18 @@ bool ParserColumnsMatcher::parseImpl(Pos & pos, ASTPtr & node, Expected & expect return false; ++pos; - auto res = std::make_shared(); + ASTPtr res; if (column_list) { - res->column_list = column_list; - res->children.push_back(res->column_list); + auto list_matcher = std::make_shared(); + list_matcher->column_list = column_list; + res = list_matcher; } else { - res->setPattern(regex_node->as().value.get()); - res->children.push_back(regex_node); + auto regexp_matcher = std::make_shared(); + regexp_matcher->setPattern(regex_node->as().value.get()); + res = regexp_matcher; } ParserColumnsTransformers transformers_p(allowed_transformers); diff --git a/tests/queries/0_stateless/02271_fix_column_matcher_and_column_transformer.reference b/tests/queries/0_stateless/02271_fix_column_matcher_and_column_transformer.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02271_fix_column_matcher_and_column_transformer.sql b/tests/queries/0_stateless/02271_fix_column_matcher_and_column_transformer.sql new file mode 100644 index 00000000000..f0c0e2bae46 --- /dev/null +++ b/tests/queries/0_stateless/02271_fix_column_matcher_and_column_transformer.sql @@ -0,0 +1,64 @@ +DROP TABLE IF EXISTS github_events; + +CREATE TABLE github_events +( + `file_time` DateTime, + `event_type` Enum8('CommitCommentEvent' = 1, 'CreateEvent' = 2, 'DeleteEvent' = 3, 'ForkEvent' = 4, 'GollumEvent' = 5, 'IssueCommentEvent' = 6, 'IssuesEvent' = 7, 'MemberEvent' = 8, 'PublicEvent' = 9, 'PullRequestEvent' = 10, 'PullRequestReviewCommentEvent' = 11, 'PushEvent' = 12, 'ReleaseEvent' = 13, 'SponsorshipEvent' = 14, 'WatchEvent' = 15, 'GistEvent' = 16, 'FollowEvent' = 17, 'DownloadEvent' = 18, 'PullRequestReviewEvent' = 19, 'ForkApplyEvent' = 20, 'Event' = 21, 'TeamAddEvent' = 22), + `actor_login` LowCardinality(String), + `repo_name` LowCardinality(String), + `created_at` DateTime, + `updated_at` DateTime, + `action` Enum8('none' = 0, 'created' = 1, 'added' = 2, 'edited' = 3, 'deleted' = 4, 'opened' = 5, 'closed' = 6, 'reopened' = 7, 'assigned' = 8, 'unassigned' = 9, 'labeled' = 10, 'unlabeled' = 11, 'review_requested' = 12, 'review_request_removed' = 13, 'synchronize' = 14, 'started' = 15, 'published' = 16, 'update' = 17, 'create' = 18, 'fork' = 19, 'merged' = 20), + `comment_id` UInt64, + `body` String, + `path` String, + `position` Int32, + `line` Int32, + `ref` LowCardinality(String), + `ref_type` Enum8('none' = 0, 'branch' = 1, 'tag' = 2, 'repository' = 3, 'unknown' = 4), + `creator_user_login` LowCardinality(String), + `number` UInt32, + `title` String, + `labels` Array(LowCardinality(String)), + `state` Enum8('none' = 0, 'open' = 1, 'closed' = 2), + `locked` UInt8, + `assignee` LowCardinality(String), + `assignees` Array(LowCardinality(String)), + `comments` UInt32, + `author_association` Enum8('NONE' = 0, 'CONTRIBUTOR' = 1, 'OWNER' = 2, 'COLLABORATOR' = 3, 'MEMBER' = 4, 'MANNEQUIN' = 5), + `closed_at` DateTime, + `merged_at` DateTime, + `merge_commit_sha` String, + `requested_reviewers` Array(LowCardinality(String)), + `requested_teams` Array(LowCardinality(String)), + `head_ref` LowCardinality(String), + `head_sha` String, + `base_ref` LowCardinality(String), + `base_sha` String, + `merged` UInt8, + `mergeable` UInt8, + `rebaseable` UInt8, + `mergeable_state` Enum8('unknown' = 0, 'dirty' = 1, 'clean' = 2, 'unstable' = 3, 'draft' = 4), + `merged_by` LowCardinality(String), + `review_comments` UInt32, + `maintainer_can_modify` UInt8, + `commits` UInt32, + `additions` UInt32, + `deletions` UInt32, + `changed_files` UInt32, + `diff_hunk` String, + `original_position` UInt32, + `commit_id` String, + `original_commit_id` String, + `push_size` UInt32, + `push_distinct_size` UInt32, + `member_login` LowCardinality(String), + `release_tag_name` String, + `release_name` String, + `review_state` Enum8('none' = 0, 'approved' = 1, 'changes_requested' = 2, 'commented' = 3, 'dismissed' = 4, 'pending' = 5) +) +ENGINE = MergeTree ORDER BY (event_type, repo_name, created_at); + +with top_repos as ( select repo_name from github_events where event_type = 'WatchEvent' and toDate(created_at) = today() - 1 group by repo_name order by count() desc limit 100 union distinct select repo_name from github_events where event_type = 'WatchEvent' and toMonday(created_at) = toMonday(today() - interval 1 week) group by repo_name order by count() desc limit 100 union distinct select repo_name from github_events where event_type = 'WatchEvent' and toStartOfMonth(created_at) = toStartOfMonth(today()) - interval 1 month group by repo_name order by count() desc limit 100 union distinct select repo_name from github_events where event_type = 'WatchEvent' and toYear(created_at) = toYear(today()) - 1 group by repo_name order by count() desc limit 100 ), last_day as ( select repo_name, count() as count_last_day, rowNumberInAllBlocks() + 1 as position_last_day from github_events where repo_name in (select repo_name from top_repos) and toDate(created_at) = today() - 1 group by repo_name order by count_last_day desc ), last_week as ( select repo_name, count() as count_last_week, rowNumberInAllBlocks() + 1 as position_last_week from github_events where repo_name in (select repo_name from top_repos) and toMonday(created_at) = toMonday(today()) - interval 1 week group by repo_name order by count_last_week desc ), last_month as ( select repo_name, count() as count_last_month, rowNumberInAllBlocks() + 1 as position_last_month from github_events where repo_name in (select repo_name from top_repos) and toStartOfMonth(created_at) = toStartOfMonth(today()) - interval 1 month group by repo_name order by count_last_month desc ) select d.repo_name, columns(count) from last_day d join last_week w on d.repo_name = w.repo_name join last_month m on d.repo_name = m.repo_name FORMAT TabSeparatedWithNamesAndTypes; -- { serverError 47 } + +DROP TABLE github_events; From 2eee79dc2a8264375e996637876697c4980218f0 Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 20 Apr 2022 13:26:20 +0200 Subject: [PATCH 26/55] Review fixes --- src/Disks/IDisk.h | 6 ++--- src/Storages/MergeTree/MergeTreeData.cpp | 27 ++++++++++++++-------- src/Storages/MergeTree/MergeTreePartInfo.h | 10 ++++++++ 3 files changed, 31 insertions(+), 12 deletions(-) diff --git a/src/Disks/IDisk.h b/src/Disks/IDisk.h index cc47c4262f5..669897e7c67 100644 --- a/src/Disks/IDisk.h +++ b/src/Disks/IDisk.h @@ -192,18 +192,18 @@ public: /// Remove file. Throws exception if file doesn't exists or if directory is not empty. /// Differs from removeFile for S3/HDFS disks /// Second bool param is a flag to remove (true) or keep (false) shared data on S3 - virtual void removeSharedFile(const String & path, bool) { removeFile(path); } + virtual void removeSharedFile(const String & path, bool /* keep_shared_data */) { removeFile(path); } /// Remove file or directory with all children. Use with extra caution. Throws exception if file doesn't exists. /// Differs from removeRecursive for S3/HDFS disks /// Second bool param is a flag to remove (true) or keep (false) shared data on S3. /// Third param determines which files cannot be removed even if second is true. - virtual void removeSharedRecursive(const String & path, bool, const NameSet &) { removeRecursive(path); } + virtual void removeSharedRecursive(const String & path, bool /* keep_all_shared_data */, const NameSet & /* file_names_remove_metadata_only */) { removeRecursive(path); } /// Remove file or directory if it exists. /// Differs from removeFileIfExists for S3/HDFS disks /// Second bool param is a flag to remove (true) or keep (false) shared data on S3 - virtual void removeSharedFileIfExists(const String & path, bool) { removeFileIfExists(path); } + virtual void removeSharedFileIfExists(const String & path, bool /* keep_shared_data */) { removeFileIfExists(path); } virtual String getCacheBasePath() const { return ""; } diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 6d2eea6cdd4..e5403574741 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -1592,6 +1592,19 @@ MergeTreeData::DataPartsVector MergeTreeData::grabOldParts(bool force) time_t now = time(nullptr); std::vector parts_to_delete; + std::vector skipped_parts; + + auto has_skipped_mutation_parent = [&skipped_parts, need_remove_parts_in_order] (const DataPartPtr & part) + { + if (!need_remove_parts_in_order) + return false; + + for (const auto & part_info : skipped_parts) + if (part->info.isMutationChildOf(part_info)) + return true; + + return false; + }; { auto parts_lock = lockParts(); @@ -1604,9 +1617,7 @@ MergeTreeData::DataPartsVector MergeTreeData::grabOldParts(bool force) /// Do not remove outdated part if it may be visible for some transaction if (!part->version.canBeRemoved()) { - if (need_remove_parts_in_order) - break; - + skipped_parts.push_back(part->info); continue; } @@ -1615,22 +1626,20 @@ MergeTreeData::DataPartsVector MergeTreeData::grabOldParts(bool force) /// Grab only parts that are not used by anyone (SELECTs for example). if (!part.unique()) { - if (need_remove_parts_in_order) - break; - + skipped_parts.push_back(part->info); continue; } - if ((part_remove_time < now && now - part_remove_time > getSettings()->old_parts_lifetime.totalSeconds()) + if ((part_remove_time < now && now - part_remove_time > getSettings()->old_parts_lifetime.totalSeconds() && !has_skipped_mutation_parent(part)) || force || isInMemoryPart(part) /// Remove in-memory parts immediately to not store excessive data in RAM || (part->version.creation_csn == Tx::RolledBackCSN && getSettings()->remove_rolled_back_parts_immediately)) { parts_to_delete.emplace_back(it); } - else if (need_remove_parts_in_order) + else { - break; + skipped_parts.push_back(part->info); } } diff --git a/src/Storages/MergeTree/MergeTreePartInfo.h b/src/Storages/MergeTree/MergeTreePartInfo.h index e9ff6f87f0b..d6d9f45bc1b 100644 --- a/src/Storages/MergeTree/MergeTreePartInfo.h +++ b/src/Storages/MergeTree/MergeTreePartInfo.h @@ -72,6 +72,16 @@ struct MergeTreePartInfo && strictly_contains_block_range; } + /// Part was created with mutation of parent_candidate part + bool isMutationChildOf(const MergeTreePartInfo & parent_candidate) const + { + return partition_id == parent_candidate.partition_id + && min_block == parent_candidate.min_block + && max_block == parent_candidate.max_block + && level == parent_candidate.level + && mutation >= parent_candidate.mutation; + } + /// Return part mutation version, if part wasn't mutated return zero Int64 getMutationVersion() const { From 5334522d5c0f6c10bf11a49967078aad5dc4590c Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 20 Apr 2022 13:33:42 +0200 Subject: [PATCH 27/55] Extreme settings --- src/Storages/MergeTree/MergeTreeSettings.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeSettings.h b/src/Storages/MergeTree/MergeTreeSettings.h index 0d984dc4dee..e31c0686a16 100644 --- a/src/Storages/MergeTree/MergeTreeSettings.h +++ b/src/Storages/MergeTree/MergeTreeSettings.h @@ -45,7 +45,7 @@ struct Settings; M(UInt64, number_of_free_entries_in_pool_to_lower_max_size_of_merge, 8, "When there is less than specified number of free entries in pool (or replicated queue), start to lower maximum size of merge to process (or to put in queue). This is to allow small merges to process - not filling the pool with long running merges.", 0) \ M(UInt64, number_of_free_entries_in_pool_to_execute_mutation, 20, "When there is less than specified number of free entries in pool, do not execute part mutations. This is to leave free threads for regular merges and avoid \"Too many parts\"", 0) \ M(UInt64, max_number_of_merges_with_ttl_in_pool, 2, "When there is more than specified number of merges with TTL entries in pool, do not assign new merge with TTL. This is to leave free threads for regular merges and avoid \"Too many parts\"", 0) \ - M(Seconds, old_parts_lifetime, 8 * 60, "How many seconds to keep obsolete parts.", 0) \ + M(Seconds, old_parts_lifetime, 1, "How many seconds to keep obsolete parts.", 0) \ M(Seconds, temporary_directories_lifetime, 86400, "How many seconds to keep tmp_-directories. You should not lower this value because merges and mutations may not be able to work with low value of this setting.", 0) \ M(Seconds, lock_acquire_timeout_for_background_operations, DBMS_DEFAULT_LOCK_ACQUIRE_TIMEOUT_SEC, "For background operations like merges, mutations etc. How many seconds before failing to acquire table locks.", 0) \ M(UInt64, min_rows_to_fsync_after_merge, 0, "Minimal number of rows to do fsync for part after merge (0 - disabled)", 0) \ @@ -100,8 +100,8 @@ struct Settings; \ /** Check delay of replicas settings. */ \ M(UInt64, min_relative_delay_to_measure, 120, "Calculate relative replica delay only if absolute delay is not less that this value.", 0) \ - M(UInt64, cleanup_delay_period, 30, "Period to clean old queue logs, blocks hashes and parts.", 0) \ - M(UInt64, cleanup_delay_period_random_add, 10, "Add uniformly distributed value from 0 to x seconds to cleanup_delay_period to avoid thundering herd effect and subsequent DoS of ZooKeeper in case of very large number of tables.", 0) \ + M(UInt64, cleanup_delay_period, 1, "Period to clean old queue logs, blocks hashes and parts.", 0) \ + M(UInt64, cleanup_delay_period_random_add, 1, "Add uniformly distributed value from 0 to x seconds to cleanup_delay_period to avoid thundering herd effect and subsequent DoS of ZooKeeper in case of very large number of tables.", 0) \ M(UInt64, min_relative_delay_to_close, 300, "Minimal delay from other replicas to close, stop serving requests and not return Ok during status check.", 0) \ M(UInt64, min_absolute_delay_to_close, 0, "Minimal absolute delay to close, stop serving requests and not return Ok during status check.", 0) \ M(UInt64, enable_vertical_merge_algorithm, 1, "Enable usage of Vertical merge algorithm.", 0) \ From 612a81ba24506bbbb9803ff8b2a406547ef0e032 Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 20 Apr 2022 13:58:31 +0200 Subject: [PATCH 28/55] Update src/Disks/IDisk.h Co-authored-by: tavplubix --- src/Disks/IDisk.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Disks/IDisk.h b/src/Disks/IDisk.h index 669897e7c67..3e80195e7ab 100644 --- a/src/Disks/IDisk.h +++ b/src/Disks/IDisk.h @@ -196,7 +196,7 @@ public: /// Remove file or directory with all children. Use with extra caution. Throws exception if file doesn't exists. /// Differs from removeRecursive for S3/HDFS disks - /// Second bool param is a flag to remove (true) or keep (false) shared data on S3. + /// Second bool param is a flag to remove (false) or keep (true) shared data on S3. /// Third param determines which files cannot be removed even if second is true. virtual void removeSharedRecursive(const String & path, bool /* keep_all_shared_data */, const NameSet & /* file_names_remove_metadata_only */) { removeRecursive(path); } From 41b57f78cf0408b6619d18cdbc1bb884d5b4b487 Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 20 Apr 2022 13:59:02 +0200 Subject: [PATCH 29/55] Revert extreme settings --- src/Storages/MergeTree/MergeTreeSettings.h | 6 +++--- src/Storages/MergeTree/MutateFromLogEntryTask.cpp | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeSettings.h b/src/Storages/MergeTree/MergeTreeSettings.h index e31c0686a16..0d984dc4dee 100644 --- a/src/Storages/MergeTree/MergeTreeSettings.h +++ b/src/Storages/MergeTree/MergeTreeSettings.h @@ -45,7 +45,7 @@ struct Settings; M(UInt64, number_of_free_entries_in_pool_to_lower_max_size_of_merge, 8, "When there is less than specified number of free entries in pool (or replicated queue), start to lower maximum size of merge to process (or to put in queue). This is to allow small merges to process - not filling the pool with long running merges.", 0) \ M(UInt64, number_of_free_entries_in_pool_to_execute_mutation, 20, "When there is less than specified number of free entries in pool, do not execute part mutations. This is to leave free threads for regular merges and avoid \"Too many parts\"", 0) \ M(UInt64, max_number_of_merges_with_ttl_in_pool, 2, "When there is more than specified number of merges with TTL entries in pool, do not assign new merge with TTL. This is to leave free threads for regular merges and avoid \"Too many parts\"", 0) \ - M(Seconds, old_parts_lifetime, 1, "How many seconds to keep obsolete parts.", 0) \ + M(Seconds, old_parts_lifetime, 8 * 60, "How many seconds to keep obsolete parts.", 0) \ M(Seconds, temporary_directories_lifetime, 86400, "How many seconds to keep tmp_-directories. You should not lower this value because merges and mutations may not be able to work with low value of this setting.", 0) \ M(Seconds, lock_acquire_timeout_for_background_operations, DBMS_DEFAULT_LOCK_ACQUIRE_TIMEOUT_SEC, "For background operations like merges, mutations etc. How many seconds before failing to acquire table locks.", 0) \ M(UInt64, min_rows_to_fsync_after_merge, 0, "Minimal number of rows to do fsync for part after merge (0 - disabled)", 0) \ @@ -100,8 +100,8 @@ struct Settings; \ /** Check delay of replicas settings. */ \ M(UInt64, min_relative_delay_to_measure, 120, "Calculate relative replica delay only if absolute delay is not less that this value.", 0) \ - M(UInt64, cleanup_delay_period, 1, "Period to clean old queue logs, blocks hashes and parts.", 0) \ - M(UInt64, cleanup_delay_period_random_add, 1, "Add uniformly distributed value from 0 to x seconds to cleanup_delay_period to avoid thundering herd effect and subsequent DoS of ZooKeeper in case of very large number of tables.", 0) \ + M(UInt64, cleanup_delay_period, 30, "Period to clean old queue logs, blocks hashes and parts.", 0) \ + M(UInt64, cleanup_delay_period_random_add, 10, "Add uniformly distributed value from 0 to x seconds to cleanup_delay_period to avoid thundering herd effect and subsequent DoS of ZooKeeper in case of very large number of tables.", 0) \ M(UInt64, min_relative_delay_to_close, 300, "Minimal delay from other replicas to close, stop serving requests and not return Ok during status check.", 0) \ M(UInt64, min_absolute_delay_to_close, 0, "Minimal absolute delay to close, stop serving requests and not return Ok during status check.", 0) \ M(UInt64, enable_vertical_merge_algorithm, 1, "Enable usage of Vertical merge algorithm.", 0) \ diff --git a/src/Storages/MergeTree/MutateFromLogEntryTask.cpp b/src/Storages/MergeTree/MutateFromLogEntryTask.cpp index 554053c1bd3..64c2d6fb8a6 100644 --- a/src/Storages/MergeTree/MutateFromLogEntryTask.cpp +++ b/src/Storages/MergeTree/MutateFromLogEntryTask.cpp @@ -115,7 +115,7 @@ ReplicatedMergeMutateTaskBase::PrepareResult MutateFromLogEntryTask::prepare() String dummy; if (!storage.findReplicaHavingCoveringPart(entry.new_part_name, true, dummy).empty()) { - LOG_DEBUG(log, "Mutation of part {} finished by some other replica, will download merged part", entry.new_part_name); + LOG_DEBUG(log, "Mutation of part {} finished by some other replica, will download mutated part", entry.new_part_name); return PrepareResult{ .prepared_successfully = false, .need_to_check_missing_part_in_fetch = true, @@ -127,7 +127,7 @@ ReplicatedMergeMutateTaskBase::PrepareResult MutateFromLogEntryTask::prepare() if (!zero_copy_lock) { - LOG_DEBUG(log, "Mutation of part {} started by some other replica, will wait it and fetch merged part", entry.new_part_name); + LOG_DEBUG(log, "Mutation of part {} started by some other replica, will wait it and mutated merged part", entry.new_part_name); return PrepareResult{ .prepared_successfully = false, .need_to_check_missing_part_in_fetch = false, From e8fba1644c0e3648213f9e0a8f3a87b94f970d40 Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 20 Apr 2022 14:13:29 +0200 Subject: [PATCH 30/55] Fix bug --- src/Storages/StorageReplicatedMergeTree.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 8f376e3bffa..a776347c5f4 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -7456,10 +7456,14 @@ std::pair StorageReplicatedMergeTree::unlockSharedDataByID( for (const auto & zc_zookeeper_path : zc_zookeeper_paths) { + String files_not_to_remove_str; + zookeeper_ptr->tryGet(zc_zookeeper_path, files_not_to_remove_str); + files_not_to_remove.clear(); - auto files_not_to_remove_str = zookeeper_ptr->get(zc_zookeeper_path); if (!files_not_to_remove_str.empty()) + { boost::split(files_not_to_remove, files_not_to_remove_str, boost::is_any_of("\n ")); + } String zookeeper_part_uniq_node = fs::path(zc_zookeeper_path) / part_id; From 829854113b34075676fc224c0db2df69a429ea6b Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 20 Apr 2022 15:10:36 +0200 Subject: [PATCH 31/55] Add a little thread safety --- src/Disks/IDiskRemote.cpp | 21 +++++++++++++++++---- src/Disks/IDiskRemote.h | 3 +++ src/Storages/StorageReplicatedMergeTree.cpp | 2 -- 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/src/Disks/IDiskRemote.cpp b/src/Disks/IDiskRemote.cpp index acc3517d30e..80f1a3ff116 100644 --- a/src/Disks/IDiskRemote.cpp +++ b/src/Disks/IDiskRemote.cpp @@ -45,7 +45,6 @@ IDiskRemote::Metadata IDiskRemote::Metadata::createAndStoreMetadata(const String return result; } - IDiskRemote::Metadata IDiskRemote::Metadata::readUpdateAndStoreMetadata(const String & remote_fs_root_path_, DiskPtr metadata_disk_, const String & metadata_file_path_, bool sync, IDiskRemote::MetadataUpdater updater) { Metadata result(remote_fs_root_path_, metadata_disk_, metadata_file_path_); @@ -55,7 +54,6 @@ IDiskRemote::Metadata IDiskRemote::Metadata::readUpdateAndStoreMetadata(const St return result; } - IDiskRemote::Metadata IDiskRemote::Metadata::createUpdateAndStoreMetadata(const String & remote_fs_root_path_, DiskPtr metadata_disk_, const String & metadata_file_path_, bool sync, IDiskRemote::MetadataUpdater updater) { Metadata result(remote_fs_root_path_, metadata_disk_, metadata_file_path_); @@ -64,6 +62,16 @@ IDiskRemote::Metadata IDiskRemote::Metadata::createUpdateAndStoreMetadata(const return result; } +IDiskRemote::Metadata IDiskRemote::Metadata::readUpdateStoreMetadataAndRemove(const String & remote_fs_root_path_, DiskPtr metadata_disk_, const String & metadata_file_path_, bool sync, IDiskRemote::MetadataUpdater updater) +{ + Metadata result(remote_fs_root_path_, metadata_disk_, metadata_file_path_); + if (updater(result)) + result.save(sync); + metadata_disk_->removeFile(metadata_file_path_); + + return result; + +} IDiskRemote::Metadata IDiskRemote::Metadata::createAndStoreMetadataIfNotExists(const String & remote_fs_root_path_, DiskPtr metadata_disk_, const String & metadata_file_path_, bool sync, bool overwrite) { @@ -231,6 +239,12 @@ IDiskRemote::Metadata IDiskRemote::readUpdateAndStoreMetadata(const String & pat } +IDiskRemote::Metadata IDiskRemote::readUpdateStoreMetadataAndRemove(const String & path, bool sync, IDiskRemote::MetadataUpdater updater) +{ + std::unique_lock lock(metadata_mutex); + return Metadata::readUpdateStoreMetadataAndRemove(remote_fs_root_path, metadata_disk, path, sync, updater); +} + IDiskRemote::Metadata IDiskRemote::readOrCreateUpdateAndStoreMetadata(const String & path, WriteMode mode, bool sync, IDiskRemote::MetadataUpdater updater) { if (mode == WriteMode::Rewrite || !metadata_disk->exists(path)) @@ -308,8 +322,7 @@ void IDiskRemote::removeMetadata(const String & path, std::vector & path return true; }; - readUpdateAndStoreMetadata(path, false, metadata_updater); - metadata_disk->removeFile(path); + readUpdateStoreMetadataAndRemove(path, false, metadata_updater); /// If there is no references - delete content from remote FS. } catch (const Exception & e) diff --git a/src/Disks/IDiskRemote.h b/src/Disks/IDiskRemote.h index b2251e1abe3..65bcdf3e719 100644 --- a/src/Disks/IDiskRemote.h +++ b/src/Disks/IDiskRemote.h @@ -78,6 +78,8 @@ public: Metadata readMetadata(const String & path) const; Metadata readMetadataUnlocked(const String & path, std::shared_lock &) const; Metadata readUpdateAndStoreMetadata(const String & path, bool sync, MetadataUpdater updater); + Metadata readUpdateStoreMetadataAndRemove(const String & path, bool sync, MetadataUpdater updater); + Metadata readOrCreateUpdateAndStoreMetadata(const String & path, WriteMode mode, bool sync, MetadataUpdater updater); Metadata createAndStoreMetadata(const String & path, bool sync); @@ -229,6 +231,7 @@ struct IDiskRemote::Metadata static Metadata readMetadata(const String & remote_fs_root_path_, DiskPtr metadata_disk_, const String & metadata_file_path_); static Metadata readUpdateAndStoreMetadata(const String & remote_fs_root_path_, DiskPtr metadata_disk_, const String & metadata_file_path_, bool sync, Updater updater); + static Metadata readUpdateStoreMetadataAndRemove(const String & remote_fs_root_path_, DiskPtr metadata_disk_, const String & metadata_file_path_, bool sync, Updater updater); static Metadata createAndStoreMetadata(const String & remote_fs_root_path_, DiskPtr metadata_disk_, const String & metadata_file_path_, bool sync); static Metadata createUpdateAndStoreMetadata(const String & remote_fs_root_path_, DiskPtr metadata_disk_, const String & metadata_file_path_, bool sync, Updater updater); diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index a776347c5f4..858841a4986 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -7461,9 +7461,7 @@ std::pair StorageReplicatedMergeTree::unlockSharedDataByID( files_not_to_remove.clear(); if (!files_not_to_remove_str.empty()) - { boost::split(files_not_to_remove, files_not_to_remove_str, boost::is_any_of("\n ")); - } String zookeeper_part_uniq_node = fs::path(zc_zookeeper_path) / part_id; From 6ee0df251fb34130df88f9b734af0fc0f155be04 Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 20 Apr 2022 15:11:03 +0200 Subject: [PATCH 32/55] Add ditry settings --- src/Storages/MergeTree/MergeTreeSettings.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeSettings.h b/src/Storages/MergeTree/MergeTreeSettings.h index 0d984dc4dee..e31c0686a16 100644 --- a/src/Storages/MergeTree/MergeTreeSettings.h +++ b/src/Storages/MergeTree/MergeTreeSettings.h @@ -45,7 +45,7 @@ struct Settings; M(UInt64, number_of_free_entries_in_pool_to_lower_max_size_of_merge, 8, "When there is less than specified number of free entries in pool (or replicated queue), start to lower maximum size of merge to process (or to put in queue). This is to allow small merges to process - not filling the pool with long running merges.", 0) \ M(UInt64, number_of_free_entries_in_pool_to_execute_mutation, 20, "When there is less than specified number of free entries in pool, do not execute part mutations. This is to leave free threads for regular merges and avoid \"Too many parts\"", 0) \ M(UInt64, max_number_of_merges_with_ttl_in_pool, 2, "When there is more than specified number of merges with TTL entries in pool, do not assign new merge with TTL. This is to leave free threads for regular merges and avoid \"Too many parts\"", 0) \ - M(Seconds, old_parts_lifetime, 8 * 60, "How many seconds to keep obsolete parts.", 0) \ + M(Seconds, old_parts_lifetime, 1, "How many seconds to keep obsolete parts.", 0) \ M(Seconds, temporary_directories_lifetime, 86400, "How many seconds to keep tmp_-directories. You should not lower this value because merges and mutations may not be able to work with low value of this setting.", 0) \ M(Seconds, lock_acquire_timeout_for_background_operations, DBMS_DEFAULT_LOCK_ACQUIRE_TIMEOUT_SEC, "For background operations like merges, mutations etc. How many seconds before failing to acquire table locks.", 0) \ M(UInt64, min_rows_to_fsync_after_merge, 0, "Minimal number of rows to do fsync for part after merge (0 - disabled)", 0) \ @@ -100,8 +100,8 @@ struct Settings; \ /** Check delay of replicas settings. */ \ M(UInt64, min_relative_delay_to_measure, 120, "Calculate relative replica delay only if absolute delay is not less that this value.", 0) \ - M(UInt64, cleanup_delay_period, 30, "Period to clean old queue logs, blocks hashes and parts.", 0) \ - M(UInt64, cleanup_delay_period_random_add, 10, "Add uniformly distributed value from 0 to x seconds to cleanup_delay_period to avoid thundering herd effect and subsequent DoS of ZooKeeper in case of very large number of tables.", 0) \ + M(UInt64, cleanup_delay_period, 1, "Period to clean old queue logs, blocks hashes and parts.", 0) \ + M(UInt64, cleanup_delay_period_random_add, 1, "Add uniformly distributed value from 0 to x seconds to cleanup_delay_period to avoid thundering herd effect and subsequent DoS of ZooKeeper in case of very large number of tables.", 0) \ M(UInt64, min_relative_delay_to_close, 300, "Minimal delay from other replicas to close, stop serving requests and not return Ok during status check.", 0) \ M(UInt64, min_absolute_delay_to_close, 0, "Minimal absolute delay to close, stop serving requests and not return Ok during status check.", 0) \ M(UInt64, enable_vertical_merge_algorithm, 1, "Enable usage of Vertical merge algorithm.", 0) \ From c14e2e0b96dc42ad780006994c6b472502d6828e Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 20 Apr 2022 21:08:26 +0200 Subject: [PATCH 33/55] Fix more --- src/Disks/IDiskRemote.cpp | 2 ++ src/Storages/MergeTree/MergeTreeData.cpp | 1 + src/Storages/MergeTree/MergeTreeData.h | 4 ++++ src/Storages/MergeTree/MutateTask.cpp | 1 + src/Storages/MergeTree/MutateTask.h | 2 +- src/Storages/StorageReplicatedMergeTree.cpp | 2 +- src/Storages/StorageReplicatedMergeTree.h | 2 +- 7 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/Disks/IDiskRemote.cpp b/src/Disks/IDiskRemote.cpp index 80f1a3ff116..cd8e999103d 100644 --- a/src/Disks/IDiskRemote.cpp +++ b/src/Disks/IDiskRemote.cpp @@ -65,6 +65,7 @@ IDiskRemote::Metadata IDiskRemote::Metadata::createUpdateAndStoreMetadata(const IDiskRemote::Metadata IDiskRemote::Metadata::readUpdateStoreMetadataAndRemove(const String & remote_fs_root_path_, DiskPtr metadata_disk_, const String & metadata_file_path_, bool sync, IDiskRemote::MetadataUpdater updater) { Metadata result(remote_fs_root_path_, metadata_disk_, metadata_file_path_); + result.load(); if (updater(result)) result.save(sync); metadata_disk_->removeFile(metadata_file_path_); @@ -303,6 +304,7 @@ void IDiskRemote::removeMetadata(const String & path, std::vector & path { for (const auto & [remote_fs_object_path, _] : metadata.remote_fs_objects) { + paths_to_remove.push_back(remote_fs_root_path + remote_fs_object_path); if (cache) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index e5403574741..595a3fffa1b 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -5786,6 +5786,7 @@ MergeTreeData::MutableDataPartPtr MergeTreeData::cloneAndLoadDataPartOnSameDisk( if (hardlinked_files) { hardlinked_files->source_part_name = src_part->name; + hardlinked_files->source_table_shared_id = src_part->storage.getTableSharedID(); for (auto it = disk->iterateDirectory(src_part_path); it->isValid(); it->next()) { diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index 283a719f26e..fcc23c9c2ef 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -771,6 +771,8 @@ public: struct HardlinkedFiles { + /// Shared table uuid where hardlinks live + std::string source_table_shared_id; /// Hardlinked from part std::string source_part_name; /// Hardlinked files list @@ -964,6 +966,8 @@ public: /// Remove local files and remote files if needed virtual bool removeDetachedPart(DiskPtr disk, const String & path, const String & part_name, bool is_freezed); + virtual String getTableSharedID() const { return ""; } + /// Store metadata for replicated tables /// Do nothing for non-replicated tables virtual void createAndStoreFreezeMetadata(DiskPtr disk, DataPartPtr part, String backup_part_path) const; diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index 5e7cc8ff171..f449e5e3c04 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -1122,6 +1122,7 @@ private: /// Tracking of hardlinked files required for zero-copy replication. /// We don't remove them when we delete last copy of source part because /// new part can use them. + ctx->hardlinked_files.source_table_shared_id = ctx->source_part->storage.getTableSharedID(); ctx->hardlinked_files.source_part_name = ctx->source_part->name; ctx->hardlinked_files.hardlinks_from_source_part = hardlinked_files; diff --git a/src/Storages/MergeTree/MutateTask.h b/src/Storages/MergeTree/MutateTask.h index 40447cc63c8..f4848dac3b3 100644 --- a/src/Storages/MergeTree/MutateTask.h +++ b/src/Storages/MergeTree/MutateTask.h @@ -13,7 +13,7 @@ namespace DB class MutateTask; -using MutateTaskPtr = std::shared_ptr;\ +using MutateTaskPtr = std::shared_ptr; class MergeTreeDataMergerMutator; diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 858841a4986..49eae770eb9 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -7397,7 +7397,7 @@ void StorageReplicatedMergeTree::lockSharedData(const IMergeTreeDataPart & part, if (hardlinked_files.has_value() && !hardlinked_files->hardlinks_from_source_part.empty()) { path_to_set_hardlinked_files = getZeroCopyPartPath( - *getSettings(), disk->getType(), getTableSharedID(), + *getSettings(), disk->getType(), hardlinked_files->source_table_shared_id, hardlinked_files->source_part_name, zookeeper_path)[0]; hardlinks = hardlinked_files->hardlinks_from_source_part; diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index fb284893436..845182445a5 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -283,7 +283,7 @@ public: String getZooKeeperName() const { return zookeeper_name; } // Return table id, common for different replicas - String getTableSharedID() const; + String getTableSharedID() const override; static String getDefaultZooKeeperName() { return default_zookeeper_name; } From cbfe00bb47a4069baf86b4d98f58ff776fce17be Mon Sep 17 00:00:00 2001 From: alesapin Date: Thu, 21 Apr 2022 00:30:13 +0200 Subject: [PATCH 34/55] Fix tidy --- src/Storages/MergeTree/DataPartsExchange.cpp | 2 +- .../MergeTree/ReplicatedMergeTreeSink.cpp | 2 +- src/Storages/StorageReplicatedMergeTree.h | 2 +- ...71_replace_partition_many_tables.reference | 5 ++ .../02271_replace_partition_many_tables.sql | 82 +++++++++++++++++++ 5 files changed, 90 insertions(+), 3 deletions(-) create mode 100644 tests/queries/0_stateless/02271_replace_partition_many_tables.reference create mode 100644 tests/queries/0_stateless/02271_replace_partition_many_tables.sql diff --git a/src/Storages/MergeTree/DataPartsExchange.cpp b/src/Storages/MergeTree/DataPartsExchange.cpp index 166314ab7c2..dd36a45f829 100644 --- a/src/Storages/MergeTree/DataPartsExchange.cpp +++ b/src/Storages/MergeTree/DataPartsExchange.cpp @@ -850,7 +850,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToDiskRemoteMeta( new_data_part->modification_time = time(nullptr); new_data_part->loadColumnsChecksumsIndexes(true, false); - data.lockSharedData(*new_data_part, /* replace_existing_lock = */ true); + data.lockSharedData(*new_data_part, /* replace_existing_lock = */ true, {}); LOG_DEBUG(log, "Download of part {} unique id {} metadata onto disk {} finished.", part_name, part_id, disk->getName()); diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp index 96f829ac96c..c96c180b83d 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp @@ -497,7 +497,7 @@ void ReplicatedMergeTreeSink::commitPart( part->name); } - storage.lockSharedData(*part, false); + storage.lockSharedData(*part, false, {}); Coordination::Responses responses; Coordination::Error multi_code = zookeeper->tryMultiNoThrow(ops, responses); /// 1 RTT diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index 845182445a5..ff389e6f3a3 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -236,7 +236,7 @@ public: bool executeFetchShared(const String & source_replica, const String & new_part_name, const DiskPtr & disk, const String & path); /// Lock part in zookeeper for use shared data in several nodes - void lockSharedData(const IMergeTreeDataPart & part, bool replace_existing_lock, std::optional hardlinked_files = {}) const override; + void lockSharedData(const IMergeTreeDataPart & part, bool replace_existing_lock, std::optional hardlinked_files) const override; void lockSharedDataTemporary(const String & part_name, const String & part_id, const DiskPtr & disk) const; diff --git a/tests/queries/0_stateless/02271_replace_partition_many_tables.reference b/tests/queries/0_stateless/02271_replace_partition_many_tables.reference new file mode 100644 index 00000000000..627e1097cda --- /dev/null +++ b/tests/queries/0_stateless/02271_replace_partition_many_tables.reference @@ -0,0 +1,5 @@ +1 +1 +1 +1 +1 diff --git a/tests/queries/0_stateless/02271_replace_partition_many_tables.sql b/tests/queries/0_stateless/02271_replace_partition_many_tables.sql new file mode 100644 index 00000000000..a31a2bee58a --- /dev/null +++ b/tests/queries/0_stateless/02271_replace_partition_many_tables.sql @@ -0,0 +1,82 @@ +DROP TABLE IF EXISTS replace_partition_source; +DROP TABLE IF EXISTS replace_partition_dest1; +DROP TABLE IF EXISTS replace_partition_dest1_2; +DROP TABLE IF EXISTS replace_partition_dest2; +DROP TABLE IF EXISTS replace_partition_dest2_2; + +CREATE TABLE replace_partition_source +( + key UInt64 +) +ENGINE = ReplicatedMergeTree('/test/02271_replace_partition_many/{database}/source', '1') +PARTITION BY key +ORDER BY tuple(); + +INSERT INTO replace_partition_source VALUES (1); + +CREATE TABLE replace_partition_dest1 +( + key UInt64 +) +ENGINE = ReplicatedMergeTree('/test/02271_replace_partition_many/{database}/dest1', '1') +PARTITION BY key +ORDER BY tuple(); + +CREATE TABLE replace_partition_dest1_2 +( + key UInt64 +) +ENGINE = ReplicatedMergeTree('/test/02271_replace_partition_many/{database}/dest1', '2') +PARTITION BY key +ORDER BY tuple(); + + +CREATE TABLE replace_partition_dest2 +( + key UInt64 +) +ENGINE = ReplicatedMergeTree('/test/02271_replace_partition_many/{database}/dest2', '1') +PARTITION BY key +ORDER BY tuple(); + +CREATE TABLE replace_partition_dest2_2 +( + key UInt64 +) +ENGINE = ReplicatedMergeTree('/test/02271_replace_partition_many/{database}/dest2', '2') +PARTITION BY key +ORDER BY tuple(); + + +ALTER TABLE replace_partition_dest1 REPLACE PARTITION 1 FROM replace_partition_source; +ALTER TABLE replace_partition_dest2 REPLACE PARTITION 1 FROM replace_partition_source; + +OPTIMIZE TABLE replace_partition_source FINAL; + +SELECT sleep(3) FORMAT Null; +SELECT sleep(3) FORMAT Null; +SELECT sleep(3) FORMAT Null; +SELECT sleep(3) FORMAT Null; +SELECT sleep(3) FORMAT Null; + +OPTIMIZE TABLE replace_partition_dest1_2 FINAL; +OPTIMIZE TABLE replace_partition_dest2_2 FINAL; + +SELECT sleep(3) FORMAT Null; +SELECT sleep(3) FORMAT Null; +SELECT sleep(3) FORMAT Null; +SELECT sleep(3) FORMAT Null; +SELECT sleep(3) FORMAT Null; + +SELECT * FROM replace_partition_source; +SELECT * FROM replace_partition_dest1; +SELECT * FROM replace_partition_dest2; +SELECT * FROM replace_partition_dest1_2; +SELECT * FROM replace_partition_dest2_2; + + +--DROP TABLE IF EXISTS replace_partition_source; +--DROP TABLE IF EXISTS replace_partition_dest1; +--DROP TABLE IF EXISTS replace_partition_dest1_2; +--DROP TABLE IF EXISTS replace_partition_dest2; +--DROP TABLE IF EXISTS replace_partition_dest2_2; From 397603e9be288f87cc13041298ff8781d423cd08 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Thu, 21 Apr 2022 13:44:56 +0200 Subject: [PATCH 35/55] ExecutableUserDefinedFunction fix usage in GROUP BY --- src/Interpreters/TreeOptimizer.cpp | 15 ++++++++++++--- ...table_user_defined_function_group_by.reference | 10 ++++++++++ ..._executable_user_defined_function_group_by.sql | 1 + 3 files changed, 23 insertions(+), 3 deletions(-) create mode 100644 tests/queries/0_stateless/02285_executable_user_defined_function_group_by.reference create mode 100644 tests/queries/0_stateless/02285_executable_user_defined_function_group_by.sql diff --git a/src/Interpreters/TreeOptimizer.cpp b/src/Interpreters/TreeOptimizer.cpp index badeb3e4ee2..cb0fdc5b162 100644 --- a/src/Interpreters/TreeOptimizer.cpp +++ b/src/Interpreters/TreeOptimizer.cpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -138,10 +139,18 @@ void optimizeGroupBy(ASTSelectQuery * select_query, ContextPtr context) continue; } } - else if (!function_factory.get(function->name, context)->isInjective({})) + else { - ++i; - continue; + FunctionOverloadResolverPtr function_builder = UserDefinedExecutableFunctionFactory::instance().tryGet(function->name, context); + + if (!function_builder) + function_builder = function_factory.get(function->name, context); + + if (function_builder->isInjective({})) + { + ++i; + continue; + } } /// copy shared pointer to args in order to ensure lifetime diff --git a/tests/queries/0_stateless/02285_executable_user_defined_function_group_by.reference b/tests/queries/0_stateless/02285_executable_user_defined_function_group_by.reference new file mode 100644 index 00000000000..1c59de1a03b --- /dev/null +++ b/tests/queries/0_stateless/02285_executable_user_defined_function_group_by.reference @@ -0,0 +1,10 @@ +0 +2 +4 +6 +8 +10 +12 +14 +16 +18 diff --git a/tests/queries/0_stateless/02285_executable_user_defined_function_group_by.sql b/tests/queries/0_stateless/02285_executable_user_defined_function_group_by.sql new file mode 100644 index 00000000000..96db8c088dc --- /dev/null +++ b/tests/queries/0_stateless/02285_executable_user_defined_function_group_by.sql @@ -0,0 +1 @@ +SELECT test_function(number, number) as a FROM numbers(10) GROUP BY a ORDER BY a; From 5465415751741cf388c61dbb623fc06281a0a08b Mon Sep 17 00:00:00 2001 From: alesapin Date: Thu, 21 Apr 2022 14:39:12 +0200 Subject: [PATCH 36/55] Fix replace/move partition with zero copy replication --- src/Disks/DiskDecorator.cpp | 5 +++ src/Disks/DiskDecorator.h | 1 + src/Disks/DiskEncrypted.cpp | 30 ++++++++++++++ src/Disks/DiskEncrypted.h | 2 + src/Disks/DiskLocal.cpp | 8 ++++ src/Disks/DiskLocal.h | 2 + src/Disks/DiskRestartProxy.cpp | 6 +++ src/Disks/DiskRestartProxy.h | 1 + src/Disks/IDisk.cpp | 40 ++++++++++++------- src/Disks/IDisk.h | 7 +++- src/Storages/MergeTree/IMergeTreeDataPart.h | 2 + src/Storages/MergeTree/MergeTreeData.cpp | 15 +++++-- src/Storages/MergeTree/MergeTreeData.h | 6 ++- .../MergeTree/MergeTreeDataPartCompact.cpp | 5 +++ .../MergeTree/MergeTreeDataPartCompact.h | 2 + .../MergeTree/MergeTreeDataPartInMemory.h | 1 + .../MergeTree/MergeTreeDataPartWide.cpp | 5 +++ .../MergeTree/MergeTreeDataPartWide.h | 2 + src/Storages/MergeTree/MutateTask.cpp | 4 +- src/Storages/MergeTree/localBackup.cpp | 20 +++++++--- src/Storages/MergeTree/localBackup.h | 2 +- src/Storages/StorageMergeTree.cpp | 4 +- src/Storages/StorageReplicatedMergeTree.cpp | 28 ++++++++++--- 23 files changed, 161 insertions(+), 37 deletions(-) diff --git a/src/Disks/DiskDecorator.cpp b/src/Disks/DiskDecorator.cpp index 47a6bd92c3a..80cfc23d210 100644 --- a/src/Disks/DiskDecorator.cpp +++ b/src/Disks/DiskDecorator.cpp @@ -108,6 +108,11 @@ void DiskDecorator::copy(const String & from_path, const std::shared_ptr delegate->copy(from_path, to_disk, to_path); } +void DiskDecorator::copyDirectoryContent(const String & from_dir, const std::shared_ptr & to_disk, const String & to_dir) +{ + delegate->copyDirectoryContent(from_dir, to_disk, to_dir); +} + void DiskDecorator::listFiles(const String & path, std::vector & file_names) { delegate->listFiles(path, file_names); diff --git a/src/Disks/DiskDecorator.h b/src/Disks/DiskDecorator.h index 27ddb001a83..d707eb3e51d 100644 --- a/src/Disks/DiskDecorator.h +++ b/src/Disks/DiskDecorator.h @@ -33,6 +33,7 @@ public: void moveFile(const String & from_path, const String & to_path) override; void replaceFile(const String & from_path, const String & to_path) override; void copy(const String & from_path, const std::shared_ptr & to_disk, const String & to_path) override; + void copyDirectoryContent(const String & from_dir, const std::shared_ptr & to_disk, const String & to_dir) override; void listFiles(const String & path, std::vector & file_names) override; std::unique_ptr readFile( diff --git a/src/Disks/DiskEncrypted.cpp b/src/Disks/DiskEncrypted.cpp index 3cee205fafc..4ac59af95ab 100644 --- a/src/Disks/DiskEncrypted.cpp +++ b/src/Disks/DiskEncrypted.cpp @@ -249,6 +249,36 @@ void DiskEncrypted::copy(const String & from_path, const std::shared_ptr copyThroughBuffers(from_path, to_disk, to_path); } + +void DiskEncrypted::copyDirectoryContent(const String & from_dir, const std::shared_ptr & to_disk, const String & to_dir) +{ + /// Check if we can copy the file without deciphering. + if (isSameDiskType(*this, *to_disk)) + { + /// Disk type is the same, check if the key is the same too. + if (auto * to_disk_enc = typeid_cast(to_disk.get())) + { + auto from_settings = current_settings.get(); + auto to_settings = to_disk_enc->current_settings.get(); + if (from_settings->keys == to_settings->keys) + { + /// Keys are the same so we can simply copy the encrypted file. + auto wrapped_from_path = wrappedPath(from_dir); + auto to_delegate = to_disk_enc->delegate; + auto wrapped_to_path = to_disk_enc->wrappedPath(to_dir); + delegate->copyDirectoryContent(wrapped_from_path, to_delegate, wrapped_to_path); + return; + } + } + } + + if (!to_disk->exists(to_dir)) + to_disk->createDirectories(to_dir); + + /// Copy the file through buffers with deciphering. + copyThroughBuffers(from_dir, to_disk, to_dir); +} + std::unique_ptr DiskEncrypted::readFile( const String & path, const ReadSettings & settings, diff --git a/src/Disks/DiskEncrypted.h b/src/Disks/DiskEncrypted.h index 2f47dda59b3..14793818f07 100644 --- a/src/Disks/DiskEncrypted.h +++ b/src/Disks/DiskEncrypted.h @@ -117,6 +117,8 @@ public: void copy(const String & from_path, const std::shared_ptr & to_disk, const String & to_path) override; + void copyDirectoryContent(const String & from_dir, const std::shared_ptr & to_disk, const String & to_dir) override; + std::unique_ptr readFile( const String & path, const ReadSettings & settings, diff --git a/src/Disks/DiskLocal.cpp b/src/Disks/DiskLocal.cpp index d81782a8af1..29277d0cb4e 100644 --- a/src/Disks/DiskLocal.cpp +++ b/src/Disks/DiskLocal.cpp @@ -440,6 +440,14 @@ void DiskLocal::copy(const String & from_path, const std::shared_ptr & to copyThroughBuffers(from_path, to_disk, to_path); /// Base implementation. } +void DiskLocal::copyDirectoryContent(const String & from_dir, const std::shared_ptr & to_disk, const String & to_dir) +{ + if (isSameDiskType(*this, *to_disk)) + fs::copy(from_dir, to_dir, fs::copy_options::recursive | fs::copy_options::overwrite_existing); /// Use more optimal way. + else + copyThroughBuffers(from_dir, to_disk, to_dir); /// Base implementation. +} + SyncGuardPtr DiskLocal::getDirectorySyncGuard(const String & path) const { return std::make_unique(fs::path(disk_path) / path); diff --git a/src/Disks/DiskLocal.h b/src/Disks/DiskLocal.h index 59dcf5e5c13..14ce8831c50 100644 --- a/src/Disks/DiskLocal.h +++ b/src/Disks/DiskLocal.h @@ -68,6 +68,8 @@ public: void copy(const String & from_path, const std::shared_ptr & to_disk, const String & to_path) override; + void copyDirectoryContent(const String & from_dir, const std::shared_ptr & to_disk, const String & to_dir) override; + void listFiles(const String & path, std::vector & file_names) override; std::unique_ptr readFile( diff --git a/src/Disks/DiskRestartProxy.cpp b/src/Disks/DiskRestartProxy.cpp index ba5d60901b3..8bb31cec55f 100644 --- a/src/Disks/DiskRestartProxy.cpp +++ b/src/Disks/DiskRestartProxy.cpp @@ -200,6 +200,12 @@ void DiskRestartProxy::copy(const String & from_path, const std::shared_ptr & to_disk, const String & to_dir) +{ + ReadLock lock (mutex); + DiskDecorator::copyDirectoryContent(from_dir, to_disk, to_dir); +} + void DiskRestartProxy::listFiles(const String & path, std::vector & file_names) { ReadLock lock (mutex); diff --git a/src/Disks/DiskRestartProxy.h b/src/Disks/DiskRestartProxy.h index 25b4787b482..07d1524af08 100644 --- a/src/Disks/DiskRestartProxy.h +++ b/src/Disks/DiskRestartProxy.h @@ -42,6 +42,7 @@ public: void moveFile(const String & from_path, const String & to_path) override; void replaceFile(const String & from_path, const String & to_path) override; void copy(const String & from_path, const DiskPtr & to_disk, const String & to_path) override; + void copyDirectoryContent(const String & from_dir, const std::shared_ptr & to_disk, const String & to_dir) override; void listFiles(const String & path, std::vector & file_names) override; std::unique_ptr readFile( const String & path, diff --git a/src/Disks/IDisk.cpp b/src/Disks/IDisk.cpp index 42d5f5fce10..a68a32858a1 100644 --- a/src/Disks/IDisk.cpp +++ b/src/Disks/IDisk.cpp @@ -4,7 +4,6 @@ #include #include #include -#include #include namespace DB @@ -20,13 +19,13 @@ bool IDisk::isDirectoryEmpty(const String & path) return !iterateDirectory(path)->isValid(); } -void copyFile(IDisk & from_disk, const String & from_path, IDisk & to_disk, const String & to_path) +void IDisk::copyFile(const String & from_file_path, IDisk & to_disk, const String & to_file_path) { LOG_DEBUG(&Poco::Logger::get("IDisk"), "Copying from {} (path: {}) {} to {} (path: {}) {}.", - from_disk.getName(), from_disk.getPath(), from_path, to_disk.getName(), to_disk.getPath(), to_path); + getName(), getPath(), from_file_path, to_disk.getName(), to_disk.getPath(), to_file_path); - auto in = from_disk.readFile(from_path); - auto out = to_disk.writeFile(to_path); + auto in = readFile(from_file_path); + auto out = to_disk.writeFile(to_file_path); copyData(*in, *out); out->finalize(); } @@ -34,7 +33,7 @@ void copyFile(IDisk & from_disk, const String & from_path, IDisk & to_disk, cons using ResultsCollector = std::vector>; -void asyncCopy(IDisk & from_disk, String from_path, IDisk & to_disk, String to_path, Executor & exec, ResultsCollector & results) +void asyncCopy(IDisk & from_disk, String from_path, IDisk & to_disk, String to_path, Executor & exec, ResultsCollector & results, bool copy_root_dir) { if (from_disk.isFile(from_path)) { @@ -42,28 +41,32 @@ void asyncCopy(IDisk & from_disk, String from_path, IDisk & to_disk, String to_p [&from_disk, from_path, &to_disk, to_path]() { setThreadName("DiskCopier"); - DB::copyFile(from_disk, from_path, to_disk, fs::path(to_path) / fileName(from_path)); + from_disk.copyFile(from_path, to_disk, fs::path(to_path) / fileName(from_path)); }); results.push_back(std::move(result)); } else { - fs::path dir_name = fs::path(from_path).parent_path().filename(); - fs::path dest(fs::path(to_path) / dir_name); - to_disk.createDirectories(dest); + fs::path dest(to_path); + if (copy_root_dir) + { + fs::path dir_name = fs::path(from_path).parent_path().filename(); + dest /= dir_name; + to_disk.createDirectories(dest); + } for (auto it = from_disk.iterateDirectory(from_path); it->isValid(); it->next()) - asyncCopy(from_disk, it->path(), to_disk, dest, exec, results); + asyncCopy(from_disk, it->path(), to_disk, dest, exec, results, true); } } -void IDisk::copyThroughBuffers(const String & from_path, const std::shared_ptr & to_disk, const String & to_path) +void IDisk::copyThroughBuffers(const String & from_path, const std::shared_ptr & to_disk, const String & to_path, bool copy_root_dir) { auto & exec = to_disk->getExecutor(); ResultsCollector results; - asyncCopy(*this, from_path, *to_disk, to_path, exec, results); + asyncCopy(*this, from_path, *to_disk, to_path, exec, results, copy_root_dir); for (auto & result : results) result.wait(); @@ -73,7 +76,16 @@ void IDisk::copyThroughBuffers(const String & from_path, const std::shared_ptr & to_disk, const String & to_path) { - copyThroughBuffers(from_path, to_disk, to_path); + copyThroughBuffers(from_path, to_disk, to_path, true); +} + + +void IDisk::copyDirectoryContent(const String & from_dir, const std::shared_ptr & to_disk, const String & to_dir) +{ + if (!to_disk->exists(to_dir)) + to_disk->createDirectories(to_dir); + + copyThroughBuffers(from_dir, to_disk, to_dir, false); } void IDisk::truncateFile(const String &, size_t) diff --git a/src/Disks/IDisk.h b/src/Disks/IDisk.h index 3e80195e7ab..b2489944fbd 100644 --- a/src/Disks/IDisk.h +++ b/src/Disks/IDisk.h @@ -160,6 +160,11 @@ public: /// Recursively copy data containing at `from_path` to `to_path` located at `to_disk`. virtual void copy(const String & from_path, const std::shared_ptr & to_disk, const String & to_path); + virtual void copyDirectoryContent(const String & from_dir, const std::shared_ptr & to_disk, const String & to_dir); + + /// Copy file `from_file_path` to `to_file_path` located at `to_disk`. + virtual void copyFile(const String & from_file_path, IDisk & to_disk, const String & to_file_path); + /// List files at `path` and add their names to `file_names` virtual void listFiles(const String & path, std::vector & file_names) = 0; @@ -347,7 +352,7 @@ protected: /// Base implementation of the function copy(). /// It just opens two files, reads data by portions from the first file, and writes it to the second one. /// A derived class may override copy() to provide a faster implementation. - void copyThroughBuffers(const String & from_path, const std::shared_ptr & to_disk, const String & to_path); + void copyThroughBuffers(const String & from_path, const std::shared_ptr & to_disk, const String & to_path, bool copy_root_dir = true); private: std::unique_ptr executor; diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index 2301bac55bd..0b57d726ccd 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -105,6 +105,8 @@ public: virtual bool isStoredOnRemoteDisk() const = 0; + virtual bool isStoredOnRemoteDiskWithZeroCopySupport() const = 0; + virtual bool supportsVerticalMerge() const { return false; } /// NOTE: Returns zeros if column files are not found in checksums. diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 595a3fffa1b..199328f5895 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -5737,7 +5737,8 @@ MergeTreeData::MutableDataPartPtr MergeTreeData::cloneAndLoadDataPartOnSameDisk( const MergeTreePartInfo & dst_part_info, const StorageMetadataPtr & metadata_snapshot, const MergeTreeTransactionPtr & txn, - HardlinkedFiles * hardlinked_files) + HardlinkedFiles * hardlinked_files, + bool copy_instead_of_hardlink) { /// Check that the storage policy contains the disk where the src_part is located. bool does_storage_policy_allow_same_disk = false; @@ -5775,15 +5776,21 @@ MergeTreeData::MutableDataPartPtr MergeTreeData::cloneAndLoadDataPartOnSameDisk( src_part_path = fs::path(src_relative_data_path) / flushed_part_path / ""; } - LOG_DEBUG(log, "Cloning part {} to {}", fullPath(disk, src_part_path), fullPath(disk, dst_part_path)); - localBackup(disk, src_part_path, dst_part_path, /* make_source_readonly */ false); + String with_copy; + if (copy_instead_of_hardlink) + with_copy = " (copying data)"; + + LOG_DEBUG(log, "Cloning part {} to {}{}", fullPath(disk, src_part_path), fullPath(disk, dst_part_path), with_copy); + + localBackup(disk, src_part_path, dst_part_path, /* make_source_readonly */ false, {}, /* copy_instead_of_hardlinks */ copy_instead_of_hardlink); + disk->removeFileIfExists(fs::path(dst_part_path) / IMergeTreeDataPart::DELETE_ON_DESTROY_MARKER_FILE_NAME); disk->removeFileIfExists(fs::path(dst_part_path) / IMergeTreeDataPart::TXN_VERSION_METADATA_FILE_NAME); auto single_disk_volume = std::make_shared(disk->getName(), disk, 0); auto dst_data_part = createPart(dst_part_name, dst_part_info, single_disk_volume, tmp_dst_part_name); - if (hardlinked_files) + if (!copy_instead_of_hardlink && hardlinked_files) { hardlinked_files->source_part_name = src_part->name; hardlinked_files->source_table_shared_id = src_part->storage.getTableSharedID(); diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index fcc23c9c2ef..7590d2e1d49 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -780,8 +780,10 @@ public: }; MergeTreeData::MutableDataPartPtr cloneAndLoadDataPartOnSameDisk( - const MergeTreeData::DataPartPtr & src_part, const String & tmp_part_prefix, const MergeTreePartInfo & dst_part_info, - const StorageMetadataPtr & metadata_snapshot, const MergeTreeTransactionPtr & txn, HardlinkedFiles * hardlinked_files); + const MergeTreeData::DataPartPtr & src_part, const String & tmp_part_prefix, + const MergeTreePartInfo & dst_part_info, const StorageMetadataPtr & metadata_snapshot, + const MergeTreeTransactionPtr & txn, HardlinkedFiles * hardlinked_files, + bool copy_instead_of_hardlink); virtual std::vector getMutationsStatus() const = 0; diff --git a/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp b/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp index 14713541d3f..41ff202bf14 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp @@ -187,6 +187,11 @@ bool MergeTreeDataPartCompact::isStoredOnRemoteDisk() const return volume->getDisk()->isRemote(); } +bool MergeTreeDataPartCompact::isStoredOnRemoteDiskWithZeroCopySupport() const +{ + return volume->getDisk()->supportZeroCopyReplication(); +} + MergeTreeDataPartCompact::~MergeTreeDataPartCompact() { removeIfNeeded(); diff --git a/src/Storages/MergeTree/MergeTreeDataPartCompact.h b/src/Storages/MergeTree/MergeTreeDataPartCompact.h index 38bfa11652a..79d1af06db5 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartCompact.h +++ b/src/Storages/MergeTree/MergeTreeDataPartCompact.h @@ -58,6 +58,8 @@ public: bool isStoredOnRemoteDisk() const override; + bool isStoredOnRemoteDiskWithZeroCopySupport() const override; + bool hasColumnFiles(const NameAndTypePair & column) const override; String getFileNameForColumn(const NameAndTypePair & /* column */) const override { return DATA_FILE_NAME; } diff --git a/src/Storages/MergeTree/MergeTreeDataPartInMemory.h b/src/Storages/MergeTree/MergeTreeDataPartInMemory.h index c5ee9ebd01f..d64245ca616 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartInMemory.h +++ b/src/Storages/MergeTree/MergeTreeDataPartInMemory.h @@ -45,6 +45,7 @@ public: bool isStoredOnDisk() const override { return false; } bool isStoredOnRemoteDisk() const override { return false; } + bool isStoredOnRemoteDiskWithZeroCopySupport() const override { return false; } bool hasColumnFiles(const NameAndTypePair & column) const override { return !!getColumnPosition(column.getNameInStorage()); } String getFileNameForColumn(const NameAndTypePair & /* column */) const override { return ""; } void renameTo(const String & new_relative_path, bool remove_new_dir_if_exists) const override; diff --git a/src/Storages/MergeTree/MergeTreeDataPartWide.cpp b/src/Storages/MergeTree/MergeTreeDataPartWide.cpp index c8ddf3c8e2f..aa5dc5422c0 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWide.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWide.cpp @@ -146,6 +146,11 @@ bool MergeTreeDataPartWide::isStoredOnRemoteDisk() const return volume->getDisk()->isRemote(); } +bool MergeTreeDataPartWide::isStoredOnRemoteDiskWithZeroCopySupport() const +{ + return volume->getDisk()->supportZeroCopyReplication(); +} + MergeTreeDataPartWide::~MergeTreeDataPartWide() { removeIfNeeded(); diff --git a/src/Storages/MergeTree/MergeTreeDataPartWide.h b/src/Storages/MergeTree/MergeTreeDataPartWide.h index 078dda394ee..bc2c399c100 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWide.h +++ b/src/Storages/MergeTree/MergeTreeDataPartWide.h @@ -52,6 +52,8 @@ public: bool isStoredOnRemoteDisk() const override; + bool isStoredOnRemoteDiskWithZeroCopySupport() const override; + bool supportsVerticalMerge() const override { return true; } String getFileNameForColumn(const NameAndTypePair & column) const override; diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index f449e5e3c04..73b703aa9ca 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -1300,7 +1300,7 @@ bool MutateTask::prepare() storage_from_source_part, ctx->metadata_snapshot, ctx->commands_for_part, Context::createCopy(context_for_reading))) { LOG_TRACE(ctx->log, "Part {} doesn't change up to mutation version {}", ctx->source_part->name, ctx->future_part->part_info.mutation); - promise.set_value(ctx->data->cloneAndLoadDataPartOnSameDisk(ctx->source_part, "tmp_clone_", ctx->future_part->part_info, ctx->metadata_snapshot, ctx->txn, &ctx->hardlinked_files)); + promise.set_value(ctx->data->cloneAndLoadDataPartOnSameDisk(ctx->source_part, "tmp_clone_", ctx->future_part->part_info, ctx->metadata_snapshot, ctx->txn, &ctx->hardlinked_files, false)); return false; } else @@ -1391,7 +1391,7 @@ bool MutateTask::prepare() && ctx->files_to_rename.empty()) { LOG_TRACE(ctx->log, "Part {} doesn't change up to mutation version {} (optimized)", ctx->source_part->name, ctx->future_part->part_info.mutation); - promise.set_value(ctx->data->cloneAndLoadDataPartOnSameDisk(ctx->source_part, "tmp_mut_", ctx->future_part->part_info, ctx->metadata_snapshot, ctx->txn, &ctx->hardlinked_files)); + promise.set_value(ctx->data->cloneAndLoadDataPartOnSameDisk(ctx->source_part, "tmp_mut_", ctx->future_part->part_info, ctx->metadata_snapshot, ctx->txn, &ctx->hardlinked_files, false)); return false; } diff --git a/src/Storages/MergeTree/localBackup.cpp b/src/Storages/MergeTree/localBackup.cpp index 8bb3e4cf78a..10aff144914 100644 --- a/src/Storages/MergeTree/localBackup.cpp +++ b/src/Storages/MergeTree/localBackup.cpp @@ -13,9 +13,13 @@ namespace ErrorCodes extern const int DIRECTORY_ALREADY_EXISTS; } +namespace +{ -static void localBackupImpl(const DiskPtr & disk, const String & source_path, const String & destination_path, bool make_source_readonly, size_t level, - std::optional max_level) +void localBackupImpl( + const DiskPtr & disk, const String & source_path, + const String & destination_path, bool make_source_readonly, size_t level, + std::optional max_level) { if (max_level && level > *max_level) return; @@ -43,8 +47,6 @@ static void localBackupImpl(const DiskPtr & disk, const String & source_path, co } } -namespace -{ class CleanupOnFail { public: @@ -81,7 +83,10 @@ private: }; } -void localBackup(const DiskPtr & disk, const String & source_path, const String & destination_path, bool make_source_readonly, std::optional max_level) +void localBackup( + const DiskPtr & disk, const String & source_path, + const String & destination_path, bool make_source_readonly, + std::optional max_level, bool copy_instead_of_hardlinks) { if (disk->exists(destination_path) && !disk->isDirectoryEmpty(destination_path)) { @@ -101,7 +106,10 @@ void localBackup(const DiskPtr & disk, const String & source_path, const String { try { - localBackupImpl(disk, source_path, destination_path, make_source_readonly, 0, max_level); + if (copy_instead_of_hardlinks) + disk->copyDirectoryContent(source_path, disk, destination_path); + else + localBackupImpl(disk, source_path, destination_path, make_source_readonly, 0, max_level); } catch (const DB::ErrnoException & e) { diff --git a/src/Storages/MergeTree/localBackup.h b/src/Storages/MergeTree/localBackup.h index c6a46620447..2eac45ac6a3 100644 --- a/src/Storages/MergeTree/localBackup.h +++ b/src/Storages/MergeTree/localBackup.h @@ -20,6 +20,6 @@ namespace DB * If max_level is specified, than only files which depth relative source_path less or equal max_level will be copied. * So, if max_level=0 than only direct file child are copied. */ -void localBackup(const DiskPtr & disk, const String & source_path, const String & destination_path, bool make_source_readonly = true, std::optional max_level = {}); +void localBackup(const DiskPtr & disk, const String & source_path, const String & destination_path, bool make_source_readonly = true, std::optional max_level = {}, bool copy_instead_of_hardlinks = false); } diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index 4b8c9665939..8db6fc6c88e 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -1583,7 +1583,7 @@ void StorageMergeTree::replacePartitionFrom(const StoragePtr & source_table, con Int64 temp_index = insert_increment.get(); MergeTreePartInfo dst_part_info(partition_id, temp_index, temp_index, src_part->info.level); - auto dst_part = cloneAndLoadDataPartOnSameDisk(src_part, TMP_PREFIX, dst_part_info, my_metadata_snapshot, local_context->getCurrentTransaction(), {}); + auto dst_part = cloneAndLoadDataPartOnSameDisk(src_part, TMP_PREFIX, dst_part_info, my_metadata_snapshot, local_context->getCurrentTransaction(), {}, false); dst_parts.emplace_back(std::move(dst_part)); } @@ -1669,7 +1669,7 @@ void StorageMergeTree::movePartitionToTable(const StoragePtr & dest_table, const Int64 temp_index = insert_increment.get(); MergeTreePartInfo dst_part_info(partition_id, temp_index, temp_index, src_part->info.level); - auto dst_part = dest_table_storage->cloneAndLoadDataPartOnSameDisk(src_part, TMP_PREFIX, dst_part_info, dest_metadata_snapshot, local_context->getCurrentTransaction(), {}); + auto dst_part = dest_table_storage->cloneAndLoadDataPartOnSameDisk(src_part, TMP_PREFIX, dst_part_info, dest_metadata_snapshot, local_context->getCurrentTransaction(), {}, false); dst_parts.emplace_back(std::move(dst_part)); } diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 49eae770eb9..b9fde9ee041 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -1940,6 +1940,7 @@ bool StorageReplicatedMergeTree::executeReplaceRange(const LogEntry & entry) entry.znode_name, entry_replace.drop_range_part_name, entry_replace.new_part_names.size(), entry_replace.from_database, entry_replace.from_table); auto metadata_snapshot = getInMemoryMetadataPtr(); + auto storage_settings_ptr = getSettings(); MergeTreePartInfo drop_range = MergeTreePartInfo::fromPartName(entry_replace.drop_range_part_name, format_version); /// Range with only one block has special meaning: it's ATTACH PARTITION or MOVE PARTITION, so there is no drop range @@ -2092,6 +2093,14 @@ bool StorageReplicatedMergeTree::executeReplaceRange(const LogEntry & entry) continue; } + bool avoid_copy_local_part = storage_settings_ptr->allow_remote_fs_zero_copy_replication && src_part->isStoredOnRemoteDiskWithZeroCopySupport(); + + if (avoid_copy_local_part) + { + LOG_DEBUG(log, "Avoid copy local part {} from table {} because of zero-copy replication", part_desc->src_part_name, source_table_id.getNameForLogs()); + continue; + } + String checksum_hex = src_part->checksums.getTotalChecksumHex(); if (checksum_hex != part_desc->checksum_hex) @@ -2200,16 +2209,16 @@ bool StorageReplicatedMergeTree::executeReplaceRange(const LogEntry & entry) static const String TMP_PREFIX = "tmp_replace_from_"; std::vector hardlinked_files_for_parts; + auto obtain_part = [&] (PartDescriptionPtr & part_desc) { if (part_desc->src_table_part) { - if (part_desc->checksum_hex != part_desc->src_table_part->checksums.getTotalChecksumHex()) throw Exception("Checksums of " + part_desc->src_table_part->name + " is suddenly changed", ErrorCodes::UNFINISHED); part_desc->res_part = cloneAndLoadDataPartOnSameDisk( - part_desc->src_table_part, TMP_PREFIX + "clone_", part_desc->new_part_info, metadata_snapshot, NO_TRANSACTION_PTR, &part_desc->hardlinked_files); + part_desc->src_table_part, TMP_PREFIX + "clone_", part_desc->new_part_info, metadata_snapshot, NO_TRANSACTION_PTR, &part_desc->hardlinked_files, false); } else if (!part_desc->replica.empty()) { @@ -3949,7 +3958,7 @@ bool StorageReplicatedMergeTree::fetchPart(const String & part_name, const Stora { get_part = [&, part_to_clone]() { - return cloneAndLoadDataPartOnSameDisk(part_to_clone, "tmp_clone_", part_info, metadata_snapshot, NO_TRANSACTION_PTR, &hardlinked_files); + return cloneAndLoadDataPartOnSameDisk(part_to_clone, "tmp_clone_", part_info, metadata_snapshot, NO_TRANSACTION_PTR, &hardlinked_files, false); }; } else @@ -6350,6 +6359,7 @@ void StorageReplicatedMergeTree::replacePartitionFrom( /// First argument is true, because we possibly will add new data to current table. auto lock1 = lockForShare(query_context->getCurrentQueryId(), query_context->getSettingsRef().lock_acquire_timeout); auto lock2 = source_table->lockForShare(query_context->getCurrentQueryId(), query_context->getSettingsRef().lock_acquire_timeout); + auto storage_settings_ptr = getSettings(); auto source_metadata_snapshot = source_table->getInMemoryMetadataPtr(); auto metadata_snapshot = getInMemoryMetadataPtr(); @@ -6434,7 +6444,10 @@ void StorageReplicatedMergeTree::replacePartitionFrom( MergeTreePartInfo dst_part_info(partition_id, index, index, src_part->info.level); MergeTreeData::HardlinkedFiles hardlinked_files; - auto dst_part = cloneAndLoadDataPartOnSameDisk(src_part, TMP_PREFIX, dst_part_info, metadata_snapshot, NO_TRANSACTION_PTR, &hardlinked_files); + bool copy_instead_of_hardlink = storage_settings_ptr->allow_remote_fs_zero_copy_replication + && src_part->isStoredOnRemoteDiskWithZeroCopySupport(); + + auto dst_part = cloneAndLoadDataPartOnSameDisk(src_part, TMP_PREFIX, dst_part_info, metadata_snapshot, NO_TRANSACTION_PTR, &hardlinked_files, copy_instead_of_hardlink); src_parts.emplace_back(src_part); dst_parts.emplace_back(dst_part); @@ -6564,6 +6577,7 @@ void StorageReplicatedMergeTree::movePartitionToTable(const StoragePtr & dest_ta { auto lock1 = lockForShare(query_context->getCurrentQueryId(), query_context->getSettingsRef().lock_acquire_timeout); auto lock2 = dest_table->lockForShare(query_context->getCurrentQueryId(), query_context->getSettingsRef().lock_acquire_timeout); + auto storage_settings_ptr = getSettings(); auto dest_table_storage = std::dynamic_pointer_cast(dest_table); if (!dest_table_storage) @@ -6656,7 +6670,11 @@ void StorageReplicatedMergeTree::movePartitionToTable(const StoragePtr & dest_ta MergeTreePartInfo dst_part_info(partition_id, index, index, src_part->info.level); MergeTreeData::HardlinkedFiles hardlinked_files; - auto dst_part = dest_table_storage->cloneAndLoadDataPartOnSameDisk(src_part, TMP_PREFIX, dst_part_info, dest_metadata_snapshot, NO_TRANSACTION_PTR, &hardlinked_files); + + bool copy_instead_of_hardlink = storage_settings_ptr->allow_remote_fs_zero_copy_replication + && src_part->isStoredOnRemoteDiskWithZeroCopySupport(); + + auto dst_part = dest_table_storage->cloneAndLoadDataPartOnSameDisk(src_part, TMP_PREFIX, dst_part_info, dest_metadata_snapshot, NO_TRANSACTION_PTR, &hardlinked_files, copy_instead_of_hardlink); src_parts.emplace_back(src_part); dst_parts.emplace_back(dst_part); From cbced6567d954af6c4f435ebfc2742c2f4bb55f1 Mon Sep 17 00:00:00 2001 From: alesapin Date: Thu, 21 Apr 2022 17:32:23 +0200 Subject: [PATCH 37/55] Fix build --- src/Disks/IDisk.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Disks/IDisk.cpp b/src/Disks/IDisk.cpp index a68a32858a1..b4d1c52c1d8 100644 --- a/src/Disks/IDisk.cpp +++ b/src/Disks/IDisk.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include namespace DB From 485c1d407286d1c1142a817e4a73c26a545e10fe Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Thu, 21 Apr 2022 19:10:11 +0200 Subject: [PATCH 38/55] Fixed tests --- src/Interpreters/TreeOptimizer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Interpreters/TreeOptimizer.cpp b/src/Interpreters/TreeOptimizer.cpp index cb0fdc5b162..b25ed415734 100644 --- a/src/Interpreters/TreeOptimizer.cpp +++ b/src/Interpreters/TreeOptimizer.cpp @@ -146,7 +146,7 @@ void optimizeGroupBy(ASTSelectQuery * select_query, ContextPtr context) if (!function_builder) function_builder = function_factory.get(function->name, context); - if (function_builder->isInjective({})) + if (!function_builder->isInjective({})) { ++i; continue; From 64383bcb0a67486268bc9db8f564338d7af009bd Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 22 Apr 2022 11:35:36 +0200 Subject: [PATCH 39/55] Better --- src/Storages/MergeTree/MergeTreeData.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 0ca3915f831..3571e09ff3c 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -3369,7 +3369,7 @@ void MergeTreeData::swapActivePart(MergeTreeData::DataPartPtr part_copy) original_active_part->force_keep_shared_data = false; if (original_active_part->volume->getDisk()->supportZeroCopyReplication() && - part_copy->volume->getDisk()->supportZeroCopyReplication() && + part_copy->isStoredOnRemoteDiskWithZeroCopySupport() && original_active_part->getUniqueId() == part_copy->getUniqueId()) { /// May be when several volumes use the same S3/HDFS storage From 2559d62d74c43360a1ace7912a08e837ee568ea1 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 22 Apr 2022 15:20:02 +0200 Subject: [PATCH 40/55] Better logging and bugfixes --- src/Disks/IDisk.h | 1 + src/Storages/StorageReplicatedMergeTree.cpp | 82 ++++++++++++++++++--- 2 files changed, 71 insertions(+), 12 deletions(-) diff --git a/src/Disks/IDisk.h b/src/Disks/IDisk.h index b2489944fbd..c4578d51b6e 100644 --- a/src/Disks/IDisk.h +++ b/src/Disks/IDisk.h @@ -160,6 +160,7 @@ public: /// Recursively copy data containing at `from_path` to `to_path` located at `to_disk`. virtual void copy(const String & from_path, const std::shared_ptr & to_disk, const String & to_path); + /// Recursively copy files from from_dir to to_dir. Create to_dir if not exists. virtual void copyDirectoryContent(const String & from_dir, const std::shared_ptr & to_disk, const String & to_dir); /// Copy file `from_file_path` to `to_file_path` located at `to_disk`. diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index b9fde9ee041..cf5cf861ffa 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -6749,7 +6749,7 @@ void StorageReplicatedMergeTree::movePartitionToTable(const StoragePtr & dest_ta dest_table_storage->renameTempPartAndReplace(part, query_context->getCurrentTransaction().get(), nullptr, &transaction, lock); for (size_t i = 0; i < dst_parts.size(); ++i) - lockSharedData(*dst_parts[i], false, hardlinked_files_for_parts[i]); + dest_table_storage->lockSharedData(*dst_parts[i], false, hardlinked_files_for_parts[i]); Coordination::Error code = zookeeper->tryMulti(ops, op_results); if (code == Coordination::Error::ZBADVERSION) @@ -6768,7 +6768,7 @@ void StorageReplicatedMergeTree::movePartitionToTable(const StoragePtr & dest_ta PartLog::addNewParts(getContext(), dst_parts, watch.elapsed(), ExecutionStatus::fromCurrentException()); for (const auto & dst_part : dst_parts) - unlockSharedData(*dst_part); + dest_table_storage->unlockSharedData(*dst_part); throw; } @@ -7486,9 +7486,12 @@ std::pair StorageReplicatedMergeTree::unlockSharedDataByID( /// Delete our replica node for part from zookeeper (we are not interested in it anymore) String zookeeper_part_replica_node = fs::path(zookeeper_part_uniq_node) / replica_name_; - LOG_TRACE(logger, "Remove zookeeper lock {}", zookeeper_part_replica_node); + LOG_TRACE(logger, "Remove zookeeper lock {} for part {}", zookeeper_part_replica_node, part_name); - zookeeper_ptr->tryRemove(zookeeper_part_replica_node); + if (auto ec = zookeeper_ptr->tryRemove(zookeeper_part_replica_node); ec != Coordination::Error::ZOK && ec != Coordination::Error::ZNONODE) + { + throw zkutil::KeeperException(ec, zookeeper_part_replica_node); + } /// Check, maybe we were the last replica and can remove part forever Strings children; @@ -7496,33 +7499,62 @@ std::pair StorageReplicatedMergeTree::unlockSharedDataByID( if (!children.empty()) { - LOG_TRACE(logger, "Found zookeper locks for {}", zookeeper_part_uniq_node); + LOG_TRACE(logger, "Found {} ({}) zookeper locks for {}", zookeeper_part_uniq_node, children.size(), fmt::join(children, ", ")); part_has_no_more_locks = false; continue; } auto error_code = zookeeper_ptr->tryRemove(zookeeper_part_uniq_node); - LOG_TRACE(logger, "Remove parent zookeeper lock {} : {}", zookeeper_part_uniq_node, error_code != Coordination::Error::ZNOTEMPTY); + if (error_code == Coordination::Error::ZOK) + { + LOG_TRACE(logger, "Removed last parent zookeeper lock {} for part {} with id {}", zookeeper_part_uniq_node, part_name, part_id); + } + else if (error_code == Coordination::Error::ZNOTEMPTY) + { + LOG_TRACE(logger, "Cannot remove last parent zookeeper lock {} for part {} with id {}, another replica locked part concurrently", zookeeper_part_uniq_node, part_name, part_id); + } + else if (error_code == Coordination::Error::ZNONODE) + { + LOG_TRACE(logger, "Node with parent zookeeper lock {} for part {} with id {} doesn't exist", zookeeper_part_uniq_node, part_name, part_id); + } + else + { + throw zkutil::KeeperException(error_code, zookeeper_part_uniq_node); + } + /// Even when we have lock with same part name, but with different uniq, we can remove files on S3 children.clear(); String zookeeper_part_node = fs::path(zookeeper_part_uniq_node).parent_path(); zookeeper_ptr->tryGetChildren(zookeeper_part_node, children); + if (children.empty()) { /// Cleanup after last uniq removing error_code = zookeeper_ptr->tryRemove(zookeeper_part_node); - LOG_TRACE(logger, "Remove parent zookeeper lock {} : {}", zookeeper_part_node, error_code != Coordination::Error::ZNOTEMPTY); + if (error_code == Coordination::Error::ZOK) + { + LOG_TRACE(logger, "Removed last parent zookeeper lock {} for part {} (part is finally unlocked)", zookeeper_part_uniq_node, part_name); + } + else if (error_code == Coordination::Error::ZNOTEMPTY) + { + LOG_TRACE(logger, "Cannot remove last parent zookeeper lock {} for part {}, another replica locked part concurrently", zookeeper_part_uniq_node, part_name); + } + else if (error_code == Coordination::Error::ZNONODE) + { + LOG_TRACE(logger, "Node with parent zookeeper lock {} for part {} doesn't exist (part was unlocked before)", zookeeper_part_uniq_node, part_name); + } + else + { + throw zkutil::KeeperException(error_code, zookeeper_part_uniq_node); + } } else { - LOG_TRACE(logger, "Can't remove parent zookeeper lock {} : {}", zookeeper_part_node, children.size()); - for (auto & c : children) - { - LOG_TRACE(logger, "Child node {}", c); - } + LOG_TRACE(logger, "Can't remove parent zookeeper lock {} for part {}, because children {} ({}) were concurrently created", + zookeeper_part_node, part_name, children.size(), fmt::join(children, ", ")); } } @@ -7859,6 +7891,8 @@ bool StorageReplicatedMergeTree::createEmptyPartInsteadOfLost(zkutil::ZooKeeperP throw Exception(ErrorCodes::INCORRECT_DATA, "Tried to create empty part {}, but it replaces existing parts {}.", lost_part_name, fmt::join(part_names, ", ")); } + lockSharedData(*new_data_part, false, {}); + while (true) { @@ -7906,6 +7940,7 @@ bool StorageReplicatedMergeTree::createEmptyPartInsteadOfLost(zkutil::ZooKeeperP } catch (const Exception & ex) { + unlockSharedData(*new_data_part); LOG_WARNING(log, "Cannot commit empty part {} with error {}", lost_part_name, ex.displayText()); return false; } @@ -7923,6 +7958,7 @@ void StorageReplicatedMergeTree::createZeroCopyLockNode( /// In rare case other replica can remove path between createAncestors and createIfNotExists /// So we make up to 5 attempts + bool created = false; for (int attempts = 5; attempts > 0; --attempts) { try @@ -7949,7 +7985,14 @@ void StorageReplicatedMergeTree::createZeroCopyLockNode( Coordination::Responses responses; auto error = zookeeper->tryMulti(ops, responses); if (error == Coordination::Error::ZOK) + { + created = true; break; + } + else if (error == Coordination::Error::ZNONODE && mode != zkutil::CreateMode::Persistent) + { + throw Exception(ErrorCodes::NOT_FOUND_NODE, "Cannot create ephemeral zero copy lock {} because part was unlocked from zookeeper", zookeeper_node); + } } else { @@ -7967,16 +8010,31 @@ void StorageReplicatedMergeTree::createZeroCopyLockNode( Coordination::Responses responses; auto error = zookeeper->tryMulti(ops, responses); if (error == Coordination::Error::ZOK || error == Coordination::Error::ZNODEEXISTS) + { + created = true; break; + } + else if (error == Coordination::Error::ZNONODE && mode != zkutil::CreateMode::Persistent) + { + /// Ephemeral locks used during fetches so if parent node was removed we cannot do anything + throw Exception(ErrorCodes::NOT_FOUND_NODE, "Cannot create ephemeral zero copy lock {} because part was unlocked from zookeeper", zookeeper_node); + } } } catch (const zkutil::KeeperException & e) { if (e.code == Coordination::Error::ZNONODE) continue; + throw; } } + + if (!created) + { + String mode_str = mode == zkutil::CreateMode::Persistent ? "persistent" : "ephemral"; + throw Exception(ErrorCodes::NOT_FOUND_NODE, "Cannot create {} zero copy lock {} because part was unlocked from zookeeper", mode_str, zookeeper_node); + } } From ebdd53ef8fc1449daf35d4d73c2982afd52f89a6 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 22 Apr 2022 19:18:18 +0200 Subject: [PATCH 41/55] Fix shared fetch --- src/Storages/MergeTree/DataPartsExchange.cpp | 1 + src/Storages/StorageReplicatedMergeTree.cpp | 8 ++++---- src/Storages/StorageReplicatedMergeTree.h | 3 ++- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/Storages/MergeTree/DataPartsExchange.cpp b/src/Storages/MergeTree/DataPartsExchange.cpp index dd36a45f829..064447c54ad 100644 --- a/src/Storages/MergeTree/DataPartsExchange.cpp +++ b/src/Storages/MergeTree/DataPartsExchange.cpp @@ -534,6 +534,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::fetchPart( { if (e.code() != ErrorCodes::S3_ERROR && e.code() != ErrorCodes::ZERO_COPY_REPLICATION_ERROR) throw; + LOG_WARNING(log, fmt::runtime(e.message() + " Will retry fetching part without zero-copy.")); /// Try again but without zero-copy return fetchPart(metadata_snapshot, context, part_name, replica_path, host, port, timeouts, diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index cf5cf861ffa..15b8b642098 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -3844,7 +3844,7 @@ bool StorageReplicatedMergeTree::partIsLastQuorumPart(const MergeTreePartInfo & bool StorageReplicatedMergeTree::fetchPart(const String & part_name, const StorageMetadataPtr & metadata_snapshot, - const String & source_replica_path, bool to_detached, size_t quorum, zkutil::ZooKeeper::Ptr zookeeper_) + const String & source_replica_path, bool to_detached, size_t quorum, zkutil::ZooKeeper::Ptr zookeeper_, bool try_fetch_shared) { auto zookeeper = zookeeper_ ? zookeeper_ : getZooKeeper(); const auto part_info = MergeTreePartInfo::fromPartName(part_name, format_version); @@ -3991,7 +3991,7 @@ bool StorageReplicatedMergeTree::fetchPart(const String & part_name, const Stora to_detached, "", &tagger_ptr, - true); + try_fetch_shared); }; } @@ -5743,7 +5743,7 @@ void StorageReplicatedMergeTree::fetchPartition( try { /// part name , metadata, part_path , true, 0, zookeeper - if (!fetchPart(part_name, metadata_snapshot, part_path, true, 0, zookeeper)) + if (!fetchPart(part_name, metadata_snapshot, part_path, true, 0, zookeeper, /* try_shared = */ false)) throw Exception(ErrorCodes::UNFINISHED, "Failed to fetch part {} from {}", part_name, from_); } catch (const DB::Exception & e) @@ -5881,7 +5881,7 @@ void StorageReplicatedMergeTree::fetchPartition( try { - fetched = fetchPart(part, metadata_snapshot, best_replica_path, true, 0, zookeeper); + fetched = fetchPart(part, metadata_snapshot, best_replica_path, true, 0, zookeeper, /* try_shared = */ false); } catch (const DB::Exception & e) { diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index ff389e6f3a3..096ef20cf7f 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -621,7 +621,8 @@ private: const String & replica_path, bool to_detached, size_t quorum, - zkutil::ZooKeeper::Ptr zookeeper_ = nullptr); + zkutil::ZooKeeper::Ptr zookeeper_ = nullptr, + bool try_fetch_shared = true); /** Download the specified part from the specified replica. * Used for replace local part on the same s3-shared part in hybrid storage. From f183eabaa286166bd3c206f1e20f790254298dd3 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 22 Apr 2022 22:36:55 +0200 Subject: [PATCH 42/55] Return normal settings --- src/Storages/MergeTree/MergeTreeSettings.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeSettings.h b/src/Storages/MergeTree/MergeTreeSettings.h index e31c0686a16..0d984dc4dee 100644 --- a/src/Storages/MergeTree/MergeTreeSettings.h +++ b/src/Storages/MergeTree/MergeTreeSettings.h @@ -45,7 +45,7 @@ struct Settings; M(UInt64, number_of_free_entries_in_pool_to_lower_max_size_of_merge, 8, "When there is less than specified number of free entries in pool (or replicated queue), start to lower maximum size of merge to process (or to put in queue). This is to allow small merges to process - not filling the pool with long running merges.", 0) \ M(UInt64, number_of_free_entries_in_pool_to_execute_mutation, 20, "When there is less than specified number of free entries in pool, do not execute part mutations. This is to leave free threads for regular merges and avoid \"Too many parts\"", 0) \ M(UInt64, max_number_of_merges_with_ttl_in_pool, 2, "When there is more than specified number of merges with TTL entries in pool, do not assign new merge with TTL. This is to leave free threads for regular merges and avoid \"Too many parts\"", 0) \ - M(Seconds, old_parts_lifetime, 1, "How many seconds to keep obsolete parts.", 0) \ + M(Seconds, old_parts_lifetime, 8 * 60, "How many seconds to keep obsolete parts.", 0) \ M(Seconds, temporary_directories_lifetime, 86400, "How many seconds to keep tmp_-directories. You should not lower this value because merges and mutations may not be able to work with low value of this setting.", 0) \ M(Seconds, lock_acquire_timeout_for_background_operations, DBMS_DEFAULT_LOCK_ACQUIRE_TIMEOUT_SEC, "For background operations like merges, mutations etc. How many seconds before failing to acquire table locks.", 0) \ M(UInt64, min_rows_to_fsync_after_merge, 0, "Minimal number of rows to do fsync for part after merge (0 - disabled)", 0) \ @@ -100,8 +100,8 @@ struct Settings; \ /** Check delay of replicas settings. */ \ M(UInt64, min_relative_delay_to_measure, 120, "Calculate relative replica delay only if absolute delay is not less that this value.", 0) \ - M(UInt64, cleanup_delay_period, 1, "Period to clean old queue logs, blocks hashes and parts.", 0) \ - M(UInt64, cleanup_delay_period_random_add, 1, "Add uniformly distributed value from 0 to x seconds to cleanup_delay_period to avoid thundering herd effect and subsequent DoS of ZooKeeper in case of very large number of tables.", 0) \ + M(UInt64, cleanup_delay_period, 30, "Period to clean old queue logs, blocks hashes and parts.", 0) \ + M(UInt64, cleanup_delay_period_random_add, 10, "Add uniformly distributed value from 0 to x seconds to cleanup_delay_period to avoid thundering herd effect and subsequent DoS of ZooKeeper in case of very large number of tables.", 0) \ M(UInt64, min_relative_delay_to_close, 300, "Minimal delay from other replicas to close, stop serving requests and not return Ok during status check.", 0) \ M(UInt64, min_absolute_delay_to_close, 0, "Minimal absolute delay to close, stop serving requests and not return Ok during status check.", 0) \ M(UInt64, enable_vertical_merge_algorithm, 1, "Enable usage of Vertical merge algorithm.", 0) \ From d18ed43b06559a4ba108446178bd10b0f7ea2a37 Mon Sep 17 00:00:00 2001 From: Julio Jimenez Date: Fri, 22 Apr 2022 22:28:59 -0400 Subject: [PATCH 43/55] CodeQL experiment excluding contrib Signed-off-by: Julio Jimenez --- .github/workflows/codeql.yml | 87 ++++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) create mode 100644 .github/workflows/codeql.yml diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml new file mode 100644 index 00000000000..d9a14ac6982 --- /dev/null +++ b/.github/workflows/codeql.yml @@ -0,0 +1,87 @@ +# For most projects, this workflow file will not need changing; you simply need +# to commit it to your repository. +# +# You may wish to alter this file to override the set of languages analyzed, +# or to provide custom queries or build logic. +# +# ******** NOTE ******** +# We have attempted to detect the languages in your repository. Please check +# the `language` matrix defined below to confirm you have the correct set of +# supported CodeQL languages. +# +name: "CodeQL" + +on: + # push: + # branches: [ master ] + # pull_request: + # # The branches below must be a subset of the branches above + # branches: [ master ] + schedule: + - cron: '0 */6 * * *' + workflow_dispatch: + +env: + CC: clang-14 + CXX: clang++-14 + +jobs: + analyze: + name: Analyze + runs-on: ubuntu-latest + permissions: + actions: read + contents: read + security-events: write + + strategy: + fail-fast: false + matrix: + language: [ 'cpp' ] + # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] + # Learn more about CodeQL language support at https://git.io/codeql-language-support + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + with: + submodules: 'true' + + # Initializes the CodeQL tools for scanning. + - name: Initialize CodeQL + uses: github/codeql-action/init@v2 + with: + languages: ${{ matrix.language }} + # If you wish to specify custom queries, you can do so here or in a config file. + # By default, queries listed here will override any specified in a config file. + # Prefix the list here with "+" to use these queries and those in the config file. + # queries: ./path/to/local/query, your-org/your-repo/queries@main + + # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). + # If this step fails, then you should remove it and run the build manually (see below) + # - name: Autobuild + # uses: github/codeql-action/autobuild@v2 + + # ℹ️ Command-line programs to run using the OS shell. + # 📚 https://git.io/JvXDl + + # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines + # and modify them (or add more) to build your code if your project + # uses a compiled language + + #- run: | + # make bootstrap + # make release + + - name: Build + run: | + sudo apt-get install -yq git cmake python ninja-build + sudo bash -c "$(wget -O - https://apt.llvm.org/llvm.sh)" + mkdir build + cd build + cmake -DUSE_STATIC_LIBRARIES=0 -DSPLIT_SHARED_LIBRARIES=1 -DCLICKHOUSE_SPLIT_BINARY=1 .. + ninja + rm -rf ../contrib + + - name: Perform CodeQL Analysis + uses: github/codeql-action/analyze@v2 \ No newline at end of file From a17ccb4fe3ec8ff160c55199286b07199ee4202b Mon Sep 17 00:00:00 2001 From: Julio Jimenez Date: Fri, 22 Apr 2022 22:47:33 -0400 Subject: [PATCH 44/55] get the label checker to run again ;-) Signed-off-by: Julio Jimenez --- .github/workflows/codeql.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index d9a14ac6982..0ed305a0315 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -84,4 +84,4 @@ jobs: rm -rf ../contrib - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@v2 \ No newline at end of file + uses: github/codeql-action/analyze@v2 From 2b04c194358f8921d319af07ccd0c502267b91a4 Mon Sep 17 00:00:00 2001 From: alesapin Date: Sat, 23 Apr 2022 11:58:56 +0200 Subject: [PATCH 45/55] Satisfy clang-tidy --- src/Storages/StorageReplicatedMergeTree.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 15b8b642098..ea69f658aa9 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -5743,7 +5743,7 @@ void StorageReplicatedMergeTree::fetchPartition( try { /// part name , metadata, part_path , true, 0, zookeeper - if (!fetchPart(part_name, metadata_snapshot, part_path, true, 0, zookeeper, /* try_shared = */ false)) + if (!fetchPart(part_name, metadata_snapshot, part_path, true, 0, zookeeper, /* try_fetch_shared = */ false)) throw Exception(ErrorCodes::UNFINISHED, "Failed to fetch part {} from {}", part_name, from_); } catch (const DB::Exception & e) @@ -5881,7 +5881,7 @@ void StorageReplicatedMergeTree::fetchPartition( try { - fetched = fetchPart(part, metadata_snapshot, best_replica_path, true, 0, zookeeper, /* try_shared = */ false); + fetched = fetchPart(part, metadata_snapshot, best_replica_path, true, 0, zookeeper, /* try_fetch_shared = */ false); } catch (const DB::Exception & e) { From c22b2f46bca3a43187c4b0a392831f92bd9995ff Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 24 Apr 2022 03:46:13 +0300 Subject: [PATCH 46/55] Fix style: https://s3.amazonaws.com/clickhouse-test-reports/36561/a17ccb4fe3ec8ff160c55199286b07199ee4202b/style_check__actions_/style_output.txt --- .github/workflows/codeql.yml | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 0ed305a0315..934f43123c7 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -11,7 +11,7 @@ # name: "CodeQL" -on: +"on": # push: # branches: [ master ] # pull_request: @@ -37,7 +37,7 @@ jobs: strategy: fail-fast: false matrix: - language: [ 'cpp' ] + language: ['cpp'] # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] # Learn more about CodeQL language support at https://git.io/codeql-language-support @@ -65,14 +65,6 @@ jobs: # ℹ️ Command-line programs to run using the OS shell. # 📚 https://git.io/JvXDl - # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines - # and modify them (or add more) to build your code if your project - # uses a compiled language - - #- run: | - # make bootstrap - # make release - - name: Build run: | sudo apt-get install -yq git cmake python ninja-build From 909646247d591df484a8982609f405403f997419 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 24 Apr 2022 05:04:13 +0300 Subject: [PATCH 47/55] Update TranslateQualifiedNamesVisitor.cpp --- src/Interpreters/TranslateQualifiedNamesVisitor.cpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/Interpreters/TranslateQualifiedNamesVisitor.cpp b/src/Interpreters/TranslateQualifiedNamesVisitor.cpp index 030e02378d5..35fb0828b3e 100644 --- a/src/Interpreters/TranslateQualifiedNamesVisitor.cpp +++ b/src/Interpreters/TranslateQualifiedNamesVisitor.cpp @@ -253,12 +253,9 @@ void TranslateQualifiedNamesMatcher::visit(ASTExpressionList & node, const ASTPt { for (const auto & column : table.columns) { - for (const auto & column : table.columns) + if (asterisk_regexp_pattern->isColumnMatching(column.name) && (first_table || !data.join_using_columns.count(column.name))) { - if (asterisk_regexp_pattern->isColumnMatching(column.name) && (first_table || !data.join_using_columns.count(column.name))) - { - addIdentifier(columns, table.table, column.name); - } + addIdentifier(columns, table.table, column.name); } } first_table = false; From f5ed37714eebfcd67cd1874297d7538c41f84d66 Mon Sep 17 00:00:00 2001 From: Julio Jimenez Date: Sat, 23 Apr 2022 23:57:24 -0400 Subject: [PATCH 48/55] retry checks From 1f7b3f0dc5b3a8b22efae47926b0783d25dbc750 Mon Sep 17 00:00:00 2001 From: Julio Jimenez Date: Sun, 24 Apr 2022 00:03:04 -0400 Subject: [PATCH 49/55] fix style Signed-off-by: Julio Jimenez --- .github/workflows/codeql.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 0ed305a0315..6349895de59 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -11,7 +11,7 @@ # name: "CodeQL" -on: +"on": # push: # branches: [ master ] # pull_request: @@ -37,7 +37,7 @@ jobs: strategy: fail-fast: false matrix: - language: [ 'cpp' ] + language: ['cpp'] # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] # Learn more about CodeQL language support at https://git.io/codeql-language-support @@ -69,7 +69,7 @@ jobs: # and modify them (or add more) to build your code if your project # uses a compiled language - #- run: | + # - run: | # make bootstrap # make release From 8e60720095d29f2b6add0ed54aaf3b0c93ecd97c Mon Sep 17 00:00:00 2001 From: homeward <97138894+hf400159@users.noreply.github.com> Date: Sun, 24 Apr 2022 15:36:39 +0800 Subject: [PATCH 50/55] Update adopters.md --- docs/zh/introduction/adopters.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/zh/introduction/adopters.md b/docs/zh/introduction/adopters.md index 5e1e6bfe571..de57330082b 100644 --- a/docs/zh/introduction/adopters.md +++ b/docs/zh/introduction/adopters.md @@ -5,8 +5,9 @@ sidebar_label: "ClickHouse用户" # ClickHouse用户 {#clickhouse-adopters} -!!! warning "免责声明" +:::note "免责声明" 如下使用ClickHouse的公司和他们的成功案例来源于公开资源,因此和实际情况可能有所出入。如果您分享您公司使用ClickHouse的故事,我们将不胜感激 [将其添加到列表](https://github.com/ClickHouse/ClickHouse/edit/master/docs/en/introduction/adopters.md),但请确保你这样做不会有任何保密协议的问题。也欢迎提供来自其他公司的出版物的更新。 +::: | 公司简介 | 行业 | 用例 | 群集大小 | (Un)压缩数据大小\* | 参考资料 | |-----------------------------------------------------------------|------------------|----------------|---------------------------------------------------|----------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| From 1ae00fe9de0b5336801b59def315569531bb2cb7 Mon Sep 17 00:00:00 2001 From: homeward <97138894+hf400159@users.noreply.github.com> Date: Sun, 24 Apr 2022 15:36:57 +0800 Subject: [PATCH 51/55] Update adopters.md --- docs/zh/introduction/adopters.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/zh/introduction/adopters.md b/docs/zh/introduction/adopters.md index de57330082b..52c4f6362ee 100644 --- a/docs/zh/introduction/adopters.md +++ b/docs/zh/introduction/adopters.md @@ -6,7 +6,9 @@ sidebar_label: "ClickHouse用户" # ClickHouse用户 {#clickhouse-adopters} :::note "免责声明" - 如下使用ClickHouse的公司和他们的成功案例来源于公开资源,因此和实际情况可能有所出入。如果您分享您公司使用ClickHouse的故事,我们将不胜感激 [将其添加到列表](https://github.com/ClickHouse/ClickHouse/edit/master/docs/en/introduction/adopters.md),但请确保你这样做不会有任何保密协议的问题。也欢迎提供来自其他公司的出版物的更新。 + +如下使用ClickHouse的公司和他们的成功案例来源于公开资源,因此和实际情况可能有所出入。如果您分享您公司使用ClickHouse的故事,我们将不胜感激 [将其添加到列表](https://github.com/ClickHouse/ClickHouse/edit/master/docs/en/introduction/adopters.md),但请确保你这样做不会有任何保密协议的问题。也欢迎提供来自其他公司的出版物的更新。 + ::: | 公司简介 | 行业 | 用例 | 群集大小 | (Un)压缩数据大小\* | 参考资料 | From e801f27ee97e0661e4d840126943f332a7b85560 Mon Sep 17 00:00:00 2001 From: homeward <97138894+hf400159@users.noreply.github.com> Date: Sun, 24 Apr 2022 15:37:27 +0800 Subject: [PATCH 52/55] Update adopters.md --- docs/zh/introduction/adopters.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/zh/introduction/adopters.md b/docs/zh/introduction/adopters.md index 52c4f6362ee..fb0025448cb 100644 --- a/docs/zh/introduction/adopters.md +++ b/docs/zh/introduction/adopters.md @@ -5,7 +5,7 @@ sidebar_label: "ClickHouse用户" # ClickHouse用户 {#clickhouse-adopters} -:::note "免责声明" +:::note 免责声明 如下使用ClickHouse的公司和他们的成功案例来源于公开资源,因此和实际情况可能有所出入。如果您分享您公司使用ClickHouse的故事,我们将不胜感激 [将其添加到列表](https://github.com/ClickHouse/ClickHouse/edit/master/docs/en/introduction/adopters.md),但请确保你这样做不会有任何保密协议的问题。也欢迎提供来自其他公司的出版物的更新。 From b1794607c398c18e3f34cf6b9bda037d3be30528 Mon Sep 17 00:00:00 2001 From: homeward <97138894+hf400159@users.noreply.github.com> Date: Sun, 24 Apr 2022 15:40:35 +0800 Subject: [PATCH 53/55] Update adopters.md --- docs/zh/introduction/adopters.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/zh/introduction/adopters.md b/docs/zh/introduction/adopters.md index fb0025448cb..ff499ac3c74 100644 --- a/docs/zh/introduction/adopters.md +++ b/docs/zh/introduction/adopters.md @@ -7,7 +7,7 @@ sidebar_label: "ClickHouse用户" :::note 免责声明 -如下使用ClickHouse的公司和他们的成功案例来源于公开资源,因此和实际情况可能有所出入。如果您分享您公司使用ClickHouse的故事,我们将不胜感激 [将其添加到列表](https://github.com/ClickHouse/ClickHouse/edit/master/docs/en/introduction/adopters.md),但请确保你这样做不会有任何保密协议的问题。也欢迎提供来自其他公司的出版物的更新。 +如下使用ClickHouse的公司和他们的成功案例来源于公开资源,因此和实际情况可能有所出入。如果您分享您公司使用ClickHouse的故事,我们将不胜感激 [将其添加到列表](https://github.com/ClickHouse/ClickHouse/edit/master/docs/zh/introduction/adopters.md),但请确保你这样做不会有任何保密协议的问题。也欢迎提供来自其他公司的出版物的更新。 ::: From bc19f4e633fe0645a860743a3b0fb3e039330042 Mon Sep 17 00:00:00 2001 From: homeward <97138894+hf400159@users.noreply.github.com> Date: Sun, 24 Apr 2022 15:46:58 +0800 Subject: [PATCH 54/55] Update adopters.md --- docs/zh/introduction/adopters.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/zh/introduction/adopters.md b/docs/zh/introduction/adopters.md index ff499ac3c74..2219894df5c 100644 --- a/docs/zh/introduction/adopters.md +++ b/docs/zh/introduction/adopters.md @@ -6,9 +6,7 @@ sidebar_label: "ClickHouse用户" # ClickHouse用户 {#clickhouse-adopters} :::note 免责声明 - 如下使用ClickHouse的公司和他们的成功案例来源于公开资源,因此和实际情况可能有所出入。如果您分享您公司使用ClickHouse的故事,我们将不胜感激 [将其添加到列表](https://github.com/ClickHouse/ClickHouse/edit/master/docs/zh/introduction/adopters.md),但请确保你这样做不会有任何保密协议的问题。也欢迎提供来自其他公司的出版物的更新。 - ::: | 公司简介 | 行业 | 用例 | 群集大小 | (Un)压缩数据大小\* | 参考资料 | From 74bb6349c2d0f5444fa199205eddebe0f9b4729a Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 25 Apr 2022 01:15:08 +0300 Subject: [PATCH 55/55] Revert "Strict taskstats parser" --- programs/server/Server.cpp | 7 +----- src/Common/TaskStatsInfoGetter.cpp | 36 ++++++++++++++++-------------- src/Common/ThreadProfileEvents.cpp | 14 ------------ src/Common/ThreadProfileEvents.h | 18 +++++++-------- 4 files changed, 28 insertions(+), 47 deletions(-) diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index 085018fbb87..786fff9b0e4 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -1507,8 +1507,7 @@ int Server::main(const std::vector & /*args*/) } #if defined(OS_LINUX) - auto tasks_stats_provider = TasksStatsCounters::findBestAvailableProvider(); - if (tasks_stats_provider == TasksStatsCounters::MetricsProvider::None) + if (!TasksStatsCounters::checkIfAvailable()) { LOG_INFO(log, "It looks like this system does not have procfs mounted at /proc location," " neither clickhouse-server process has CAP_NET_ADMIN capability." @@ -1519,10 +1518,6 @@ int Server::main(const std::vector & /*args*/) " It also doesn't work if you run clickhouse-server inside network namespace as it happens in some containers.", executable_path); } - else - { - LOG_INFO(log, "Tasks stats provider: {}", TasksStatsCounters::metricsProviderString(tasks_stats_provider)); - } if (!hasLinuxCapability(CAP_SYS_NICE)) { diff --git a/src/Common/TaskStatsInfoGetter.cpp b/src/Common/TaskStatsInfoGetter.cpp index a74b4c2b89b..f2ef0f6d2aa 100644 --- a/src/Common/TaskStatsInfoGetter.cpp +++ b/src/Common/TaskStatsInfoGetter.cpp @@ -265,24 +265,26 @@ void TaskStatsInfoGetter::getStat(::taskstats & out_stats, pid_t tid) const { NetlinkMessage answer = query(netlink_socket_fd, taskstats_family_id, tid, TASKSTATS_CMD_GET, TASKSTATS_CMD_ATTR_PID, &tid, sizeof(tid)); - const NetlinkMessage::Attribute * attr = &answer.payload.attribute; - if (attr->header.nla_type != TASKSTATS_TYPE_AGGR_PID) - throw Exception("Expected TASKSTATS_TYPE_AGGR_PID", ErrorCodes::NETLINK_ERROR); + for (const NetlinkMessage::Attribute * attr = &answer.payload.attribute; + attr < answer.end(); + attr = attr->next()) + { + if (attr->header.nla_type == TASKSTATS_TYPE_AGGR_TGID || attr->header.nla_type == TASKSTATS_TYPE_AGGR_PID) + { + for (const NetlinkMessage::Attribute * nested_attr = reinterpret_cast(attr->payload); + nested_attr < attr->next(); + nested_attr = nested_attr->next()) + { + if (nested_attr->header.nla_type == TASKSTATS_TYPE_STATS) + { + out_stats = unalignedLoad<::taskstats>(nested_attr->payload); + return; + } + } + } + } - /// TASKSTATS_TYPE_AGGR_PID - const NetlinkMessage::Attribute * nested_attr = reinterpret_cast(attr->payload); - if (nested_attr->header.nla_type != TASKSTATS_TYPE_PID) - throw Exception("Expected TASKSTATS_TYPE_PID", ErrorCodes::NETLINK_ERROR); - if (nested_attr == nested_attr->next()) - throw Exception("No TASKSTATS_TYPE_STATS packet after TASKSTATS_TYPE_PID", ErrorCodes::NETLINK_ERROR); - nested_attr = nested_attr->next(); - if (nested_attr->header.nla_type != TASKSTATS_TYPE_STATS) - throw Exception("Expected TASKSTATS_TYPE_STATS", ErrorCodes::NETLINK_ERROR); - - out_stats = unalignedLoad<::taskstats>(nested_attr->payload); - - if (attr->next() != answer.end()) - throw Exception("Unexpected end of response", ErrorCodes::NETLINK_ERROR); + throw Exception("There is no TASKSTATS_TYPE_STATS attribute in the Netlink response", ErrorCodes::NETLINK_ERROR); } diff --git a/src/Common/ThreadProfileEvents.cpp b/src/Common/ThreadProfileEvents.cpp index 1a7b4d88322..e28e5d3e85e 100644 --- a/src/Common/ThreadProfileEvents.cpp +++ b/src/Common/ThreadProfileEvents.cpp @@ -67,20 +67,6 @@ namespace ProfileEvents namespace DB { -const char * TasksStatsCounters::metricsProviderString(MetricsProvider provider) -{ - switch (provider) - { - case MetricsProvider::None: - return "none"; - case MetricsProvider::Procfs: - return "procfs"; - case MetricsProvider::Netlink: - return "netlink"; - } - __builtin_unreachable(); -} - bool TasksStatsCounters::checkIfAvailable() { return findBestAvailableProvider() != MetricsProvider::None; diff --git a/src/Common/ThreadProfileEvents.h b/src/Common/ThreadProfileEvents.h index 0f2ad7dfd74..befd759c71b 100644 --- a/src/Common/ThreadProfileEvents.h +++ b/src/Common/ThreadProfileEvents.h @@ -176,17 +176,7 @@ extern PerfEventsCounters current_thread_counters; class TasksStatsCounters { public: - enum class MetricsProvider - { - None, - Procfs, - Netlink, - }; - - static const char * metricsProviderString(MetricsProvider provider); static bool checkIfAvailable(); - static MetricsProvider findBestAvailableProvider(); - static std::unique_ptr create(UInt64 tid); void reset(); @@ -196,8 +186,16 @@ private: ::taskstats stats; //-V730_NOINIT std::function<::taskstats()> stats_getter; + enum class MetricsProvider + { + None, + Procfs, + Netlink + }; + explicit TasksStatsCounters(UInt64 tid, MetricsProvider provider); + static MetricsProvider findBestAvailableProvider(); static void incrementProfileEvents(const ::taskstats & prev, const ::taskstats & curr, ProfileEvents::Counters & profile_events); };