From 7803ecaee54558e91f696793c477fc67e8554c5d Mon Sep 17 00:00:00 2001
From: Maksim Kita
Date: Tue, 5 Apr 2022 23:10:23 +0200
Subject: [PATCH 01/55] Fix performance tests
---
tests/performance/group_array_sorted.xml | 30 ++++++-----------------
tests/performance/joins_in_memory_pmj.xml | 28 +++++++--------------
2 files changed, 16 insertions(+), 42 deletions(-)
diff --git a/tests/performance/group_array_sorted.xml b/tests/performance/group_array_sorted.xml
index 7d26459b4b1..d0092e38659 100644
--- a/tests/performance/group_array_sorted.xml
+++ b/tests/performance/group_array_sorted.xml
@@ -1,27 +1,11 @@
-
- 10
-
-
-
- items
-
- 1000
- 100000
- 10000000
-
-
-
+ CREATE TABLE test (`id` UInt64, `value` UInt64, `text` String ) ENGINE = Memory
+ INSERT INTO test SELECT number as id, rand64() as value, toString(number) as text FROM system.numbers_mt LIMIT 10000000
+
+ SELECT groupArraySorted(100000)(id, value) FROM test
+ SELECT groupArraySorted(100000)(text, value) FROM test
+ SELECT groupArraySorted(100000)((id, text), value) FROM test
+ SELECT groupArraySorted(100000)(text) FROM test
- CREATE TABLE test ( `id` UInt64, `value` UInt64, `text` String ) ENGINE = Memory
- INSERT INTO test SELECT number as id, rand64() as value, toString(number) as text FROM numbers({items})
- SELECT groupArraySorted(10)(id, value) FROM test
- SELECT groupArraySorted(10)(text, value) FROM test
- SELECT groupArraySorted(10)((id, text), value) FROM test
- SELECT groupArraySorted(10)(text) FROM test
- SELECT groupArraySorted(10000)(id, value) FROM test
- SELECT groupArraySorted(10000)(text, value) FROM test
- SELECT groupArraySorted(10000)((id, text), value) FROM test
- SELECT groupArraySorted(10000)(text) FROM test
DROP TABLE IF EXISTS test
diff --git a/tests/performance/joins_in_memory_pmj.xml b/tests/performance/joins_in_memory_pmj.xml
index 6eaf0bdd81a..9ebc48585f4 100644
--- a/tests/performance/joins_in_memory_pmj.xml
+++ b/tests/performance/joins_in_memory_pmj.xml
@@ -6,31 +6,21 @@
0
- INSERT INTO ints SELECT number AS i64, i64 AS i32, i64 AS i16, i64 AS i8 FROM numbers(10000) settings query_plan_filter_push_down = 0
- INSERT INTO ints SELECT 10000 + number % 1000 AS i64, i64 AS i32, i64 AS i16, i64 AS i8 FROM numbers(10000) settings query_plan_filter_push_down = 0
- INSERT INTO ints SELECT 20000 + number % 100 AS i64, i64 AS i32, i64 AS i16, i64 AS i8 FROM numbers(10000) settings query_plan_filter_push_down = 0
- INSERT INTO ints SELECT 30000 + number % 10 AS i64, i64 AS i32, i64 AS i16, i64 AS i8 FROM numbers(10000) settings query_plan_filter_push_down = 0
- INSERT INTO ints SELECT 40000 + number % 1 AS i64, i64 AS i32, i64 AS i16, i64 AS i8 FROM numbers(10000) settings query_plan_filter_push_down = 0
-
- SELECT COUNT() FROM ints l ANY LEFT JOIN ints r USING i64 WHERE i32 = 20042 settings query_plan_filter_push_down = 0
- SELECT COUNT() FROM ints l ANY LEFT JOIN ints r USING i64,i32,i16,i8 WHERE i32 = 20042 settings query_plan_filter_push_down = 0
- SELECT COUNT() FROM ints l ANY LEFT JOIN ints r ON l.i64 = r.i64 WHERE i32 = 20042 settings query_plan_filter_push_down = 0
- SELECT COUNT() FROM ints l ANY LEFT JOIN ints r USING i64 WHERE i32 IN(42, 10042, 20042, 30042, 40042) settings query_plan_filter_push_down = 0
-
- SELECT COUNT() FROM ints l INNER JOIN ints r USING i64 WHERE i32 = 20042 settings query_plan_filter_push_down = 0
- SELECT COUNT() FROM ints l INNER JOIN ints r USING i64,i32,i16,i8 WHERE i32 = 20042 settings query_plan_filter_push_down = 0
- SELECT COUNT() FROM ints l INNER JOIN ints r ON l.i64 = r.i64 WHERE i32 = 20042 settings query_plan_filter_push_down = 0
- SELECT COUNT() FROM ints l INNER JOIN ints r USING i64 WHERE i32 IN(42, 10042, 20042, 30042, 40042) settings query_plan_filter_push_down = 0
+ INSERT INTO ints SELECT number AS i64, i64 AS i32, i64 AS i16, i64 AS i8 FROM numbers(1000000) settings query_plan_filter_push_down = 0
+ INSERT INTO ints SELECT 10000 + number % 1000 AS i64, i64 AS i32, i64 AS i16, i64 AS i8 FROM numbers(1000000) settings query_plan_filter_push_down = 0
+ INSERT INTO ints SELECT 20000 + number % 100 AS i64, i64 AS i32, i64 AS i16, i64 AS i8 FROM numbers(1000000) settings query_plan_filter_push_down = 0
+ INSERT INTO ints SELECT 30000 + number % 10 AS i64, i64 AS i32, i64 AS i16, i64 AS i8 FROM numbers(1000000) settings query_plan_filter_push_down = 0
+ INSERT INTO ints SELECT 40000 + number % 1 AS i64, i64 AS i32, i64 AS i16, i64 AS i8 FROM numbers(1000000) settings query_plan_filter_push_down = 0
SELECT COUNT() FROM ints l LEFT JOIN ints r USING i64 WHERE i32 = 20042 settings query_plan_filter_push_down = 0
SELECT COUNT() FROM ints l LEFT JOIN ints r USING i64,i32,i16,i8 WHERE i32 = 20042 settings query_plan_filter_push_down = 0
SELECT COUNT() FROM ints l LEFT JOIN ints r ON l.i64 = r.i64 WHERE i32 = 20042 settings query_plan_filter_push_down = 0
SELECT COUNT() FROM ints l LEFT JOIN ints r USING i64 WHERE i32 IN(42, 10042, 20042, 30042, 40042) settings query_plan_filter_push_down = 0
- SELECT COUNT() FROM ints l ANY LEFT JOIN ints r USING i64 WHERE i32 = 20042 SETTINGS partial_merge_join_optimizations = 0, query_plan_filter_push_down = 0
- SELECT COUNT() FROM ints l ANY LEFT JOIN ints r USING i64,i32,i16,i8 WHERE i32 = 20042 SETTINGS partial_merge_join_optimizations = 0, query_plan_filter_push_down = 0
- SELECT COUNT() FROM ints l ANY LEFT JOIN ints r ON l.i64 = r.i64 WHERE i32 = 20042 SETTINGS partial_merge_join_optimizations = 0, query_plan_filter_push_down = 0
- SELECT COUNT() FROM ints l ANY LEFT JOIN ints r USING i64 WHERE i32 IN(42, 10042, 20042, 30042, 40042) SETTINGS partial_merge_join_optimizations = 0, query_plan_filter_push_down = 0
+ SELECT COUNT() FROM ints l ANY LEFT JOIN ints r USING i64 WHERE i32 = 20042 SETTINGS partial_merge_join_optimizations = 0, query_plan_filter_push_down = 0
+ SELECT COUNT() FROM ints l ANY LEFT JOIN ints r USING i64,i32,i16,i8 WHERE i32 = 20042 SETTINGS partial_merge_join_optimizations = 0, query_plan_filter_push_down = 0
+ SELECT COUNT() FROM ints l ANY LEFT JOIN ints r ON l.i64 = r.i64 WHERE i32 = 20042 SETTINGS partial_merge_join_optimizations = 0, query_plan_filter_push_down = 0
+ SELECT COUNT() FROM ints l ANY LEFT JOIN ints r USING i64 WHERE i32 IN(42, 10042, 20042, 30042, 40042) SETTINGS partial_merge_join_optimizations = 0, query_plan_filter_push_down = 0
SELECT COUNT() FROM ints l INNER JOIN ints r USING i64 WHERE i32 = 20042 SETTINGS partial_merge_join_optimizations = 0, query_plan_filter_push_down = 0
SELECT COUNT() FROM ints l INNER JOIN ints r USING i64,i32,i16,i8 WHERE i32 = 20042 SETTINGS partial_merge_join_optimizations = 0, query_plan_filter_push_down = 0
From 8cf67ed4c0a45b08f01f1860189d2ba8af0b22f3 Mon Sep 17 00:00:00 2001
From: Maksim Kita
Date: Tue, 12 Apr 2022 15:22:00 +0200
Subject: [PATCH 02/55] Fix constant_column_search performance tests
---
tests/performance/constant_column_search.xml | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/tests/performance/constant_column_search.xml b/tests/performance/constant_column_search.xml
index cb76fd4cefb..993d7c6483f 100644
--- a/tests/performance/constant_column_search.xml
+++ b/tests/performance/constant_column_search.xml
@@ -10,10 +10,10 @@
-
-
-
-
+ 0), count(position(URL, 'google')) FROM hits_100m_single]]>
+
+
+
From 9b332c1e31fa81865c59c70de0d75a86d2a83fa7 Mon Sep 17 00:00:00 2001
From: Maksim Kita
Date: Tue, 12 Apr 2022 17:06:21 +0200
Subject: [PATCH 03/55] Fix early_constant_folding performance test
---
tests/performance/early_constant_folding.xml | 9 ---------
.../0_stateless/02267_early_constant_folding.reference | 8 ++++++++
.../queries/0_stateless/02267_early_constant_folding.sql | 1 +
3 files changed, 9 insertions(+), 9 deletions(-)
delete mode 100644 tests/performance/early_constant_folding.xml
create mode 100644 tests/queries/0_stateless/02267_early_constant_folding.reference
create mode 100644 tests/queries/0_stateless/02267_early_constant_folding.sql
diff --git a/tests/performance/early_constant_folding.xml b/tests/performance/early_constant_folding.xml
deleted file mode 100644
index 9def3efd891..00000000000
--- a/tests/performance/early_constant_folding.xml
+++ /dev/null
@@ -1,9 +0,0 @@
-
-
- hits_100m_single
-
- 1
-
-
- SELECT count(JavaEnable) FROM hits_100m_single WHERE WatchID = 1 OR Title = 'next' OR URL = 'prev' OR OriginalURL = '???' OR 1
-
diff --git a/tests/queries/0_stateless/02267_early_constant_folding.reference b/tests/queries/0_stateless/02267_early_constant_folding.reference
new file mode 100644
index 00000000000..da2d9a690ee
--- /dev/null
+++ b/tests/queries/0_stateless/02267_early_constant_folding.reference
@@ -0,0 +1,8 @@
+(Expression)
+ExpressionTransform
+ (SettingQuotaAndLimits)
+ (ReadFromStorage)
+ AggregatingTransform
+ StrictResize
+ ExpressionTransform
+ SourceFromSingleChunk 0 → 1
diff --git a/tests/queries/0_stateless/02267_early_constant_folding.sql b/tests/queries/0_stateless/02267_early_constant_folding.sql
new file mode 100644
index 00000000000..2835a0a24ea
--- /dev/null
+++ b/tests/queries/0_stateless/02267_early_constant_folding.sql
@@ -0,0 +1 @@
+EXPLAIN PIPELINE SELECT count(JavaEnable) FROM hits_100m_single WHERE WatchID = 1 OR Title = 'next' OR URL = 'prev' OR OriginalURL = '???' OR 1
From 3d36698f56a9c53613e7ea5da54a097bd2b2d8ac Mon Sep 17 00:00:00 2001
From: Maksim Kita
Date: Tue, 12 Apr 2022 17:06:38 +0200
Subject: [PATCH 04/55] Fix group_by_sundy_li performance test
---
tests/performance/group_by_sundy_li.xml | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/tests/performance/group_by_sundy_li.xml b/tests/performance/group_by_sundy_li.xml
index f1187e48528..07c5aa5cd7f 100644
--- a/tests/performance/group_by_sundy_li.xml
+++ b/tests/performance/group_by_sundy_li.xml
@@ -1,6 +1,7 @@
8
+ 0
@@ -15,10 +16,10 @@
ORDER BY (d, n)
- insert into a select '2000-01-01', ['aa','bb','cc','dd'][number % 4 + 1], number from numbers_mt(100000000)
- insert into a select '2000-01-02', ['aa','bb','cc','dd'][number % 4 + 1], number from numbers_mt(100000000)
- insert into a select '2000-01-03', ['aa','bb','cc','dd'][number % 4 + 1], number from numbers_mt(100000000)
- insert into a select '2000-01-04', ['aa','bb','cc','dd'][number % 4 + 1], number from numbers_mt(100000000)
+ insert into a select '2000-01-01', ['aa','bb','cc','dd'][number % 4 + 1], number from numbers_mt(100000000)
+ insert into a select '2000-01-02', ['aa','bb','cc','dd'][number % 4 + 1], number from numbers_mt(100000000)
+ insert into a select '2000-01-03', ['aa','bb','cc','dd'][number % 4 + 1], number from numbers_mt(100000000)
+ insert into a select '2000-01-04', ['aa','bb','cc','dd'][number % 4 + 1], number from numbers_mt(100000000)
OPTIMIZE TABLE a FINAL
From 04089be14427379dc3aa24abb16903981dbcff7a Mon Sep 17 00:00:00 2001
From: Maksim Kita
Date: Tue, 12 Apr 2022 17:46:16 +0200
Subject: [PATCH 05/55] Fix hash_table_sizes_stats performance test
---
tests/performance/hash_table_sizes_stats.xml | 52 +++++++++++---------
1 file changed, 30 insertions(+), 22 deletions(-)
diff --git a/tests/performance/hash_table_sizes_stats.xml b/tests/performance/hash_table_sizes_stats.xml
index 5bdd83126b8..79b3523d5de 100644
--- a/tests/performance/hash_table_sizes_stats.xml
+++ b/tests/performance/hash_table_sizes_stats.xml
@@ -1,29 +1,37 @@
-
- hits_10m_single
- hits_100m_single
-
-
1000000000
- SELECT number FROM numbers(5000000) GROUP BY number FORMAT Null
- SELECT number FROM numbers(10000000) GROUP BY number FORMAT Null
- SELECT number FROM numbers_mt(500000) GROUP BY number FORMAT Null
- SELECT number FROM numbers_mt(1000000) GROUP BY number FORMAT Null
- SELECT number FROM numbers_mt(10000000) GROUP BY number FORMAT Null
- SELECT number FROM numbers_mt(50000000) GROUP BY number FORMAT Null
- WITH number % 524289 AS k, toUInt64(k) AS k1, k1 + 1 AS k2 SELECT k1, k2, count() FROM numbers(100000000) GROUP BY k1, k2 FORMAT Null
- SELECT number FROM numbers_mt(10000000) GROUP BY number FORMAT Null SETTINGS group_by_two_level_threshold = 1e12, group_by_two_level_threshold_bytes = 1e12
- SELECT number FROM numbers_mt(50000000) GROUP BY number FORMAT Null SETTINGS group_by_two_level_threshold = 1e12, group_by_two_level_threshold_bytes = 1e12
+
+ hits_100m_single
+
- SELECT WatchID FROM hits_10m_single GROUP BY WatchID FORMAT Null
- SELECT WatchID FROM hits_100m_single GROUP BY WatchID FORMAT Null
- SELECT ClientIP AS x, x - 1, x - 2, x - 3, count() AS c FROM hits_10m_single GROUP BY x, x - 1, x - 2, x - 3 ORDER BY c DESC LIMIT 10
- SELECT ClientIP AS x, x - 1, x - 2, x - 3, count() AS c FROM hits_100m_single GROUP BY x, x - 1, x - 2, x - 3 ORDER BY c DESC LIMIT 10
- SELECT WatchID, ClientIP, count() AS c, sum(Refresh), avg(ResolutionWidth) FROM hits_10m_single WHERE SearchPhrase != '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10
- SELECT WatchID, ClientIP, count() AS c, sum(Refresh), avg(ResolutionWidth) FROM hits_100m_single WHERE SearchPhrase != '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10
- SELECT min(MobilePhoneModel) FROM hits_10m_single WHERE MobilePhoneModel != '' GROUP BY intHash32(UserID) % 1000000 FORMAT Null
- SELECT min(MobilePhoneModel) FROM hits_100m_single WHERE MobilePhoneModel != '' GROUP BY intHash32(UserID) % 1000000 FORMAT Null
+
+
+ table_name
+
+ hits_100m_single
+
+
+
+
+ numbers_threshold_value
+
+ 5000000
+ 10000000
+
+
+
+
+ SELECT number FROM numbers({numbers_threshold_value}) GROUP BY number FORMAT Null
+ SELECT number FROM numbers_mt({numbers_threshold_value}) GROUP BY number FORMAT Null
+ SELECT number FROM numbers_mt({numbers_threshold_value}) GROUP BY number FORMAT Null
+ WITH number % 524289 AS k, toUInt64(k) AS k1, k1 + 1 AS k2 SELECT k1, k2, count() FROM numbers({numbers_threshold_value}) GROUP BY k1, k2 FORMAT Null
+ SELECT number FROM numbers_mt({numbers_threshold_value}) GROUP BY number FORMAT Null SETTINGS group_by_two_level_threshold = 1e12, group_by_two_level_threshold_bytes = 1e12
+
+ SELECT WatchID FROM {table_name} GROUP BY WatchID FORMAT Null
+ SELECT ClientIP AS x, x - 1, x - 2, x - 3, count() AS c FROM {table_name} GROUP BY x, x - 1, x - 2, x - 3 ORDER BY c DESC LIMIT 10
+ SELECT WatchID, ClientIP, count() AS c, sum(Refresh), avg(ResolutionWidth) FROM {table_name} WHERE SearchPhrase != '' GROUP BY WatchID, ClientIP ORDER BY c DESC LIMIT 10
+ SELECT min(MobilePhoneModel) FROM {table_name} WHERE MobilePhoneModel != '' GROUP BY intHash32(UserID) % 1000000 FORMAT Null
From aa0464d81efacca224ff13dd3171e63774483438 Mon Sep 17 00:00:00 2001
From: Maksim Kita
Date: Tue, 12 Apr 2022 18:50:51 +0200
Subject: [PATCH 06/55] Fixed tests
---
tests/queries/0_stateless/02267_early_constant_folding.sql | 1 -
.../00172_early_constant_folding.reference} | 0
tests/queries/1_stateful/00172_early_constant_folding.sql | 1 +
3 files changed, 1 insertion(+), 1 deletion(-)
delete mode 100644 tests/queries/0_stateless/02267_early_constant_folding.sql
rename tests/queries/{0_stateless/02267_early_constant_folding.reference => 1_stateful/00172_early_constant_folding.reference} (100%)
create mode 100644 tests/queries/1_stateful/00172_early_constant_folding.sql
diff --git a/tests/queries/0_stateless/02267_early_constant_folding.sql b/tests/queries/0_stateless/02267_early_constant_folding.sql
deleted file mode 100644
index 2835a0a24ea..00000000000
--- a/tests/queries/0_stateless/02267_early_constant_folding.sql
+++ /dev/null
@@ -1 +0,0 @@
-EXPLAIN PIPELINE SELECT count(JavaEnable) FROM hits_100m_single WHERE WatchID = 1 OR Title = 'next' OR URL = 'prev' OR OriginalURL = '???' OR 1
diff --git a/tests/queries/0_stateless/02267_early_constant_folding.reference b/tests/queries/1_stateful/00172_early_constant_folding.reference
similarity index 100%
rename from tests/queries/0_stateless/02267_early_constant_folding.reference
rename to tests/queries/1_stateful/00172_early_constant_folding.reference
diff --git a/tests/queries/1_stateful/00172_early_constant_folding.sql b/tests/queries/1_stateful/00172_early_constant_folding.sql
new file mode 100644
index 00000000000..480c42fc136
--- /dev/null
+++ b/tests/queries/1_stateful/00172_early_constant_folding.sql
@@ -0,0 +1 @@
+EXPLAIN PIPELINE SELECT count(JavaEnable) FROM test.hits WHERE WatchID = 1 OR Title = 'next' OR URL = 'prev' OR OriginalURL = '???' OR 1;
From 2a950102bce237bf7c8c801766e761a2d9cc0d02 Mon Sep 17 00:00:00 2001
From: Maksim Kita
Date: Wed, 13 Apr 2022 16:41:21 +0200
Subject: [PATCH 07/55] Fixed tests
---
tests/queries/1_stateful/00172_early_constant_folding.sql | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/tests/queries/1_stateful/00172_early_constant_folding.sql b/tests/queries/1_stateful/00172_early_constant_folding.sql
index 480c42fc136..cc3d2274ecd 100644
--- a/tests/queries/1_stateful/00172_early_constant_folding.sql
+++ b/tests/queries/1_stateful/00172_early_constant_folding.sql
@@ -1 +1 @@
-EXPLAIN PIPELINE SELECT count(JavaEnable) FROM test.hits WHERE WatchID = 1 OR Title = 'next' OR URL = 'prev' OR OriginalURL = '???' OR 1;
+EXPLAIN PIPELINE SELECT count(JavaEnable) FROM test.hits WHERE WatchID = 1 OR Title = 'next' OR URL = 'prev' OR URL = '???' OR 1;
From 3a715aba790b98cd4587465027239101ab218e16 Mon Sep 17 00:00:00 2001
From: Tyler Hannan
Date: Wed, 13 Apr 2022 22:54:06 +0200
Subject: [PATCH 08/55] add corei9 benchmark
---
website/benchmark/hardware/index.html | 1 +
.../results/intel_core_i9_11900kf.json | 54 +++++++++++++++++++
2 files changed, 55 insertions(+)
create mode 100644 website/benchmark/hardware/results/intel_core_i9_11900kf.json
diff --git a/website/benchmark/hardware/index.html b/website/benchmark/hardware/index.html
index 42c87c334c0..476cd672b30 100644
--- a/website/benchmark/hardware/index.html
+++ b/website/benchmark/hardware/index.html
@@ -95,6 +95,7 @@ Results for AWS instance type im4gn.8xlarge are from Ananth Gundabattula
Results for AWS instance type im4gn.16xlarge are from Ananth Gundabattula (Darwinium).
Results for AWS instance type i3.2xlarge are from Ananth Gundabattula (Darwinium).
Results for 2x EPYC 7702 on ZFS mirror NVME are from Alibek A.
+Results for Intel 11th Gen Core i9-11900KF are from Tim Xian.
diff --git a/website/benchmark/hardware/results/intel_core_i9_11900kf.json b/website/benchmark/hardware/results/intel_core_i9_11900kf.json
new file mode 100644
index 00000000000..6bc774bb14a
--- /dev/null
+++ b/website/benchmark/hardware/results/intel_core_i9_11900kf.json
@@ -0,0 +1,54 @@
+[
+ {
+ "system": "Intel(R) 11th Gen Core i9-11900KF",
+ "system_full": "Intel(R) 11th Gen Core i9-11900KF, 64 GB RAM, 1 TB SSD",
+ "time": "2022-03-27 04:04:00",
+ "kind": "server",
+ "result":
+ [
+ [0.001, 0.001, 0.001],
+ [0.010, 0.007, 0.006],
+ [0.030, 0.022, 0.020],
+ [0.100, 0.033, 0.035],
+ [0.114, 0.079, 0.074],
+ [0.241, 0.209, 0.225],
+ [0.002, 0.001, 0.001],
+ [0.008, 0.007, 0.007],
+ [0.565, 0.519, 0.511],
+ [0.629, 0.590, 0.599],
+ [0.159, 0.130, 0.134],
+ [0.190, 0.149, 0.150],
+ [0.976, 0.927, 0.915],
+ [1.273, 1.208, 1.199],
+ [1.086, 1.044, 1.041],
+ [1.229, 1.196, 1.200],
+ [3.206, 3.491, 3.213],
+ [1.841, 1.774, 1.809],
+ [5.919, 5.897, 5.799],
+ [0.104, 0.039, 0.037],
+ [1.176, 0.639, 0.694],
+ [1.407, 0.814, 0.825],
+ [2.984, 2.391, 2.121],
+ [2.100, 0.770, 0.716],
+ [0.342, 0.220, 0.211],
+ [0.222, 0.211, 0.189],
+ [0.346, 0.222, 0.224],
+ [1.272, 0.832, 0.822],
+ [1.507, 1.306, 1.282],
+ [3.619, 3.573, 3.597],
+ [0.761, 0.695, 0.703],
+ [1.375, 1.217, 1.229],
+ [8.576, 9.686, 9.070],
+ [5.634, 5.699, 5.801],
+ [6.090, 5.789, 5.797],
+ [1.996, 2.057, 1.946],
+ [0.119, 0.105, 0.112],
+ [0.049, 0.040, 0.040],
+ [0.048, 0.038, 0.038],
+ [0.261, 0.237, 0.231],
+ [0.029, 0.013, 0.014],
+ [0.017, 0.013, 0.011],
+ [0.003, 0.002, 0.003],
+ ]
+ }
+]
From 2fcc00380de969f6907f1396ca539d57c724ac8b Mon Sep 17 00:00:00 2001
From: alesapin
Date: Fri, 15 Apr 2022 14:00:52 +0200
Subject: [PATCH 09/55] Get rid of iheritance
---
src/Disks/DiskWebServer.cpp | 6 +++---
src/Disks/IDiskRemote.cpp | 3 ++-
src/Disks/IDiskRemote.h | 28 ++++++++++------------------
3 files changed, 15 insertions(+), 22 deletions(-)
diff --git a/src/Disks/DiskWebServer.cpp b/src/Disks/DiskWebServer.cpp
index 2f8929982e3..f7b399c4c97 100644
--- a/src/Disks/DiskWebServer.cpp
+++ b/src/Disks/DiskWebServer.cpp
@@ -165,10 +165,10 @@ std::unique_ptr DiskWebServer::readFile(const String & p
auto remote_path = fs_path.parent_path() / (escapeForFileName(fs_path.stem()) + fs_path.extension().string());
remote_path = remote_path.string().substr(url.size());
- RemoteMetadata meta(path, remote_path);
- meta.remote_fs_objects.emplace_back(remote_path, iter->second.size);
+ std::vector blobs_to_read;
+ blobs_to_read.emplace_back(remote_path, iter->second.size);
- auto web_impl = std::make_unique(url, meta.remote_fs_root_path, meta.remote_fs_objects, getContext(), read_settings);
+ auto web_impl = std::make_unique(url, path, blobs_to_read, getContext(), read_settings);
if (read_settings.remote_fs_method == RemoteFSReadMethod::threadpool)
{
diff --git a/src/Disks/IDiskRemote.cpp b/src/Disks/IDiskRemote.cpp
index ead951084ad..cf03ce3c27d 100644
--- a/src/Disks/IDiskRemote.cpp
+++ b/src/Disks/IDiskRemote.cpp
@@ -154,7 +154,8 @@ IDiskRemote::Metadata::Metadata(
const String & remote_fs_root_path_,
DiskPtr metadata_disk_,
const String & metadata_file_path_)
- : RemoteMetadata(remote_fs_root_path_, metadata_file_path_)
+ : remote_fs_root_path(remote_fs_root_path_)
+ , metadata_file_path(metadata_file_path_)
, metadata_disk(metadata_disk_)
, total_size(0), ref_count(0)
{
diff --git a/src/Disks/IDiskRemote.h b/src/Disks/IDiskRemote.h
index ac2b1634d05..7ab11aa7aca 100644
--- a/src/Disks/IDiskRemote.h
+++ b/src/Disks/IDiskRemote.h
@@ -184,10 +184,18 @@ private:
using RemoteDiskPtr = std::shared_ptr;
+/// Remote FS (S3, HDFS) metadata file layout:
+/// FS objects, their number and total size of all FS objects.
+/// Each FS object represents a file path in remote FS and its size.
-/// Minimum info, required to be passed to ReadIndirectBufferFromRemoteFS
-struct RemoteMetadata
+struct IDiskRemote::Metadata
{
+ using Updater = std::function;
+ /// Metadata file version.
+ static constexpr UInt32 VERSION_ABSOLUTE_PATHS = 1;
+ static constexpr UInt32 VERSION_RELATIVE_PATHS = 2;
+ static constexpr UInt32 VERSION_READ_ONLY_FLAG = 3;
+
/// Remote FS objects paths and their sizes.
std::vector remote_fs_objects;
@@ -197,22 +205,6 @@ struct RemoteMetadata
/// Relative path to metadata file on local FS.
const String metadata_file_path;
- RemoteMetadata(const String & remote_fs_root_path_, const String & metadata_file_path_)
- : remote_fs_root_path(remote_fs_root_path_), metadata_file_path(metadata_file_path_) {}
-};
-
-/// Remote FS (S3, HDFS) metadata file layout:
-/// FS objects, their number and total size of all FS objects.
-/// Each FS object represents a file path in remote FS and its size.
-
-struct IDiskRemote::Metadata : RemoteMetadata
-{
- using Updater = std::function;
- /// Metadata file version.
- static constexpr UInt32 VERSION_ABSOLUTE_PATHS = 1;
- static constexpr UInt32 VERSION_RELATIVE_PATHS = 2;
- static constexpr UInt32 VERSION_READ_ONLY_FLAG = 3;
-
DiskPtr metadata_disk;
/// Total size of all remote FS (S3, HDFS) objects.
From eb7593f78683e580ecb7c254e3324c6f37e572ff Mon Sep 17 00:00:00 2001
From: alesapin
Date: Fri, 15 Apr 2022 16:24:38 +0200
Subject: [PATCH 10/55] Remove more trash
---
src/Storages/MergeTree/IMergeTreeDataPart.cpp | 10 ----------
src/Storages/MergeTree/IMergeTreeDataPart.h | 3 ---
src/Storages/MergeTree/MergeTreeData.cpp | 5 -----
src/Storages/MergeTree/MergedBlockOutputStream.cpp | 2 --
src/Storages/MergeTree/MutateTask.cpp | 1 -
src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp | 11 ++++++-----
src/Storages/StorageReplicatedMergeTree.cpp | 14 ++++++++++++--
7 files changed, 18 insertions(+), 28 deletions(-)
diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp
index d704d8fc435..43887f89394 100644
--- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp
+++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp
@@ -1514,8 +1514,6 @@ try
SyncGuardPtr sync_guard;
if (storage.getSettings()->fsync_part_directory)
sync_guard = volume->getDisk()->getDirectorySyncGuard(to);
-
- storage.lockSharedData(*this);
}
catch (...)
{
@@ -1530,14 +1528,6 @@ catch (...)
throw;
}
-void IMergeTreeDataPart::cleanupOldName(const String & old_part_name) const
-{
- if (name == old_part_name)
- return;
-
- storage.unlockSharedData(*this, old_part_name);
-}
-
std::optional IMergeTreeDataPart::keepSharedDataInDecoupledStorage() const
{
/// NOTE: It's needed for zero-copy replication
diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h
index 19df88c5466..7307619a70e 100644
--- a/src/Storages/MergeTree/IMergeTreeDataPart.h
+++ b/src/Storages/MergeTree/IMergeTreeDataPart.h
@@ -354,9 +354,6 @@ public:
/// Changes only relative_dir_name, you need to update other metadata (name, is_temp) explicitly
virtual void renameTo(const String & new_relative_path, bool remove_new_dir_if_exists) const;
- /// Cleanup shared locks made with old name after part renaming
- virtual void cleanupOldName(const String & old_part_name) const;
-
/// Makes clone of a part in detached/ directory via hard links
virtual void makeCloneInDetached(const String & prefix, const StorageMetadataPtr & metadata_snapshot) const;
diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp
index 29b3083c38f..89b57137979 100644
--- a/src/Storages/MergeTree/MergeTreeData.cpp
+++ b/src/Storages/MergeTree/MergeTreeData.cpp
@@ -2663,8 +2663,6 @@ bool MergeTreeData::renameTempPartAndReplace(
MergeTreePartInfo part_info = part->info;
String part_name;
- String old_part_name = part->name;
-
if (DataPartPtr existing_part_in_partition = getAnyPartInPartition(part->info.partition_id, lock))
{
if (part->partition.value != existing_part_in_partition->partition.value)
@@ -2786,9 +2784,6 @@ bool MergeTreeData::renameTempPartAndReplace(
out_covered_parts->emplace_back(std::move(covered_part));
}
- /// Cleanup shared locks made with old name
- part->cleanupOldName(old_part_name);
-
return true;
}
diff --git a/src/Storages/MergeTree/MergedBlockOutputStream.cpp b/src/Storages/MergeTree/MergedBlockOutputStream.cpp
index 6acbfacd4c1..09711e512a5 100644
--- a/src/Storages/MergeTree/MergedBlockOutputStream.cpp
+++ b/src/Storages/MergeTree/MergedBlockOutputStream.cpp
@@ -100,8 +100,6 @@ void MergedBlockOutputStream::Finalizer::Impl::finish()
if (sync)
file->sync();
}
-
- part->storage.lockSharedData(*part);
}
MergedBlockOutputStream::Finalizer::~Finalizer()
diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp
index b9bebc665b2..96b0aa7ef7d 100644
--- a/src/Storages/MergeTree/MutateTask.cpp
+++ b/src/Storages/MergeTree/MutateTask.cpp
@@ -481,7 +481,6 @@ void finalizeMutatedPart(
MergeTreeData::DataPart::calculateTotalSizeOnDisk(new_data_part->volume->getDisk(), part_path));
new_data_part->default_codec = codec;
new_data_part->calculateColumnsAndSecondaryIndicesSizesOnDisk();
- new_data_part->storage.lockSharedData(*new_data_part);
}
}
diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp
index 187e4eb96c5..96f829ac96c 100644
--- a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp
+++ b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp
@@ -314,8 +314,6 @@ void ReplicatedMergeTreeSink::commitPart(
bool is_already_existing_part = false;
- String old_part_name = part->name;
-
while (true)
{
/// Obtain incremental block number and lock it. The lock holds our intention to add the block to the filesystem.
@@ -499,6 +497,8 @@ void ReplicatedMergeTreeSink::commitPart(
part->name);
}
+ storage.lockSharedData(*part, false);
+
Coordination::Responses responses;
Coordination::Error multi_code = zookeeper->tryMultiNoThrow(ops, responses); /// 1 RTT
@@ -553,11 +553,13 @@ void ReplicatedMergeTreeSink::commitPart(
}
else if (multi_code == Coordination::Error::ZNODEEXISTS && failed_op_path == quorum_info.status_path)
{
+ storage.unlockSharedData(*part);
transaction.rollback();
throw Exception("Another quorum insert has been already started", ErrorCodes::UNSATISFIED_QUORUM_FOR_PREVIOUS_WRITE);
}
else
{
+ storage.unlockSharedData(*part);
/// NOTE: We could be here if the node with the quorum existed, but was quickly removed.
transaction.rollback();
throw Exception("Unexpected logical error while adding block " + toString(block_number) + " with ID '" + block_id + "': "
@@ -567,12 +569,14 @@ void ReplicatedMergeTreeSink::commitPart(
}
else if (Coordination::isHardwareError(multi_code))
{
+ storage.unlockSharedData(*part);
transaction.rollback();
throw Exception("Unrecoverable network error while adding block " + toString(block_number) + " with ID '" + block_id + "': "
+ Coordination::errorMessage(multi_code), ErrorCodes::UNEXPECTED_ZOOKEEPER_ERROR);
}
else
{
+ storage.unlockSharedData(*part);
transaction.rollback();
throw Exception("Unexpected ZooKeeper error while adding block " + toString(block_number) + " with ID '" + block_id + "': "
+ Coordination::errorMessage(multi_code), ErrorCodes::UNEXPECTED_ZOOKEEPER_ERROR);
@@ -595,9 +599,6 @@ void ReplicatedMergeTreeSink::commitPart(
waitForQuorum(zookeeper, part->name, quorum_info.status_path, quorum_info.is_active_node_value);
}
-
- /// Cleanup shared locks made with old name
- part->cleanupOldName(old_part_name);
}
void ReplicatedMergeTreeSink::onStart()
diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp
index db1de14f6a6..67d4d05bb9c 100644
--- a/src/Storages/StorageReplicatedMergeTree.cpp
+++ b/src/Storages/StorageReplicatedMergeTree.cpp
@@ -1450,11 +1450,14 @@ MergeTreeData::DataPartsVector StorageReplicatedMergeTree::checkPartChecksumsAnd
{
auto zookeeper = getZooKeeper();
+
while (true)
{
Coordination::Requests ops;
NameSet absent_part_paths_on_replicas;
+ lockSharedData(*part, false);
+
/// Checksums are checked here and `ops` is filled. In fact, the part is added to ZK just below, when executing `multi`.
checkPartChecksumsAndAddCommitOps(zookeeper, part, ops, part->name, &absent_part_paths_on_replicas);
@@ -1493,7 +1496,10 @@ MergeTreeData::DataPartsVector StorageReplicatedMergeTree::checkPartChecksumsAnd
LOG_INFO(log, "The part {} on a replica suddenly appeared, will recheck checksums", e.getPathForFirstFailedOp());
}
else
+ {
+ unlockSharedData(*part);
throw;
+ }
}
}
}
@@ -7296,7 +7302,9 @@ void StorageReplicatedMergeTree::createTableSharedID()
void StorageReplicatedMergeTree::lockSharedDataTemporary(const String & part_name, const String & part_id, const DiskPtr & disk) const
{
- if (!disk || !disk->supportZeroCopyReplication())
+ auto settings = getSettings();
+
+ if (!disk || !disk->supportZeroCopyReplication() || !settings->allow_remote_fs_zero_copy_replication)
return;
zkutil::ZooKeeperPtr zookeeper = tryGetZooKeeper();
@@ -7320,7 +7328,9 @@ void StorageReplicatedMergeTree::lockSharedDataTemporary(const String & part_nam
void StorageReplicatedMergeTree::lockSharedData(const IMergeTreeDataPart & part, bool replace_existing_lock) const
{
- if (!part.volume || !part.isStoredOnDisk())
+ auto settings = getSettings();
+
+ if (!part.volume || !part.isStoredOnDisk() || !settings->allow_remote_fs_zero_copy_replication)
return;
DiskPtr disk = part.volume->getDisk();
From 5a8419a48e1939e07a307cc40cd7a78a87363367 Mon Sep 17 00:00:00 2001
From: alesapin
Date: Fri, 15 Apr 2022 17:05:17 +0200
Subject: [PATCH 11/55] Remove more trash
---
src/Storages/MergeTree/IMergeTreeDataPart.cpp | 24 ++++++++-----------
src/Storages/MergeTree/IMergeTreeDataPart.h | 10 +++++++-
src/Storages/MergeTree/MutateTask.cpp | 12 ++++++++++
src/Storages/StorageReplicatedMergeTree.cpp | 14 ++++-------
src/Storages/StorageReplicatedMergeTree.h | 3 ---
5 files changed, 35 insertions(+), 28 deletions(-)
diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp
index 43887f89394..ba2463a18e6 100644
--- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp
+++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp
@@ -504,10 +504,8 @@ void IMergeTreeDataPart::removeIfNeeded()
if (parent_part)
{
- std::optional<bool> keep_shared_data = keepSharedDataInDecoupledStorage();
- if (!keep_shared_data.has_value())
- return;
- projectionRemove(parent_part->getFullRelativePath(), *keep_shared_data);
+ bool keep_shared_data = keepSharedDataInDecoupledStorage();
+ projectionRemove(parent_part->getFullRelativePath(), keep_shared_data);
}
else
remove();
@@ -1528,7 +1526,7 @@ catch (...)
throw;
}
-std::optional<bool> IMergeTreeDataPart::keepSharedDataInDecoupledStorage() const
+bool IMergeTreeDataPart::keepSharedDataInDecoupledStorage() const
{
/// NOTE: It's needed for zero-copy replication
if (force_keep_shared_data)
@@ -1554,9 +1552,7 @@ void IMergeTreeDataPart::remove() const
assert(assertHasValidVersionMetadata());
part_is_probably_removed_from_disk = true;
- std::optional<bool> keep_shared_data = keepSharedDataInDecoupledStorage();
- if (!keep_shared_data.has_value())
- return;
+ bool keep_shared_data = keepSharedDataInDecoupledStorage();
if (!isStoredOnDisk())
return;
@@ -1567,7 +1563,7 @@ void IMergeTreeDataPart::remove() const
if (isProjectionPart())
{
LOG_WARNING(storage.log, "Projection part {} should be removed by its parent {}.", name, parent_part->name);
- projectionRemove(parent_part->getFullRelativePath(), *keep_shared_data);
+ projectionRemove(parent_part->getFullRelativePath(), keep_shared_data);
return;
}
@@ -1599,7 +1595,7 @@ void IMergeTreeDataPart::remove() const
LOG_WARNING(storage.log, "Directory {} (to which part must be renamed before removing) already exists. Most likely this is due to unclean restart or race condition. Removing it.", fullPath(disk, to));
try
{
- disk->removeSharedRecursive(fs::path(to) / "", *keep_shared_data);
+ disk->removeSharedRecursive(fs::path(to) / "", keep_shared_data);
}
catch (...)
{
@@ -1626,7 +1622,7 @@ void IMergeTreeDataPart::remove() const
std::unordered_set<String> projection_directories;
for (const auto & [p_name, projection_part] : projection_parts)
{
- projection_part->projectionRemove(to, *keep_shared_data);
+ projection_part->projectionRemove(to, keep_shared_data);
projection_directories.emplace(p_name + ".proj");
}
@@ -1634,7 +1630,7 @@ void IMergeTreeDataPart::remove() const
if (checksums.empty())
{
/// If the part is not completely written, we cannot use fast path by listing files.
- disk->removeSharedRecursive(fs::path(to) / "", *keep_shared_data);
+ disk->removeSharedRecursive(fs::path(to) / "", keep_shared_data);
}
else
{
@@ -1663,7 +1659,7 @@ void IMergeTreeDataPart::remove() const
request.emplace_back(fs::path(to) / DELETE_ON_DESTROY_MARKER_FILE_NAME, true);
request.emplace_back(fs::path(to) / TXN_VERSION_METADATA_FILE_NAME, true);
- disk->removeSharedFiles(request, *keep_shared_data);
+ disk->removeSharedFiles(request, keep_shared_data);
disk->removeDirectory(to);
}
catch (...)
@@ -1672,7 +1668,7 @@ void IMergeTreeDataPart::remove() const
LOG_ERROR(storage.log, "Cannot quickly remove directory {} by removing files; fallback to recursive removal. Reason: {}", fullPath(disk, to), getCurrentExceptionMessage(false));
- disk->removeSharedRecursive(fs::path(to) / "", *keep_shared_data);
+ disk->removeSharedRecursive(fs::path(to) / "", keep_shared_data);
}
}
}
diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h
index 7307619a70e..e9c0738e114 100644
--- a/src/Storages/MergeTree/IMergeTreeDataPart.h
+++ b/src/Storages/MergeTree/IMergeTreeDataPart.h
@@ -331,6 +331,14 @@ public:
mutable VersionMetadata version;
+ struct HardlinkedFiles
+ {
+ std::string source_part_name;
+ std::vector<std::string> hardlinks_from_source_part;
+ };
+
+ HardlinkedFiles hardlinked_files;
+
/// For data in RAM ('index')
UInt64 getIndexSizeInBytes() const;
UInt64 getIndexSizeInAllocatedBytes() const;
@@ -512,7 +520,7 @@ protected:
String getRelativePathForDetachedPart(const String & prefix) const;
- std::optional<bool> keepSharedDataInDecoupledStorage() const;
+ bool keepSharedDataInDecoupledStorage() const;
void initializePartMetadataManager();
diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp
index 96b0aa7ef7d..c5e5338d6fb 100644
--- a/src/Storages/MergeTree/MutateTask.cpp
+++ b/src/Storages/MergeTree/MutateTask.cpp
@@ -1070,6 +1070,7 @@ private:
ctx->new_data_part->version.setCreationTID(tid, nullptr);
ctx->new_data_part->storeVersionMetadata();
+ std::vector<std::string> hardlinked_files;
/// Create hardlinks for unchanged files
for (auto it = ctx->disk->iterateDirectory(ctx->source_part->getFullRelativePath()); it->isValid(); it->next())
{
@@ -1083,10 +1084,12 @@ private:
{
return rename_pair.first == file_name;
});
+
if (rename_it != ctx->files_to_rename.end())
{
if (rename_it->second.empty())
continue;
+
destination += rename_it->second;
}
else
@@ -1094,8 +1097,13 @@ private:
destination += it->name();
}
+
if (!ctx->disk->isDirectory(it->path()))
+ {
ctx->disk->createHardLink(it->path(), destination);
+ hardlinked_files.push_back(it->name());
+ }
+
else if (!endsWith(".tmp_proj", it->name())) // ignore projection tmp merge dir
{
// it's a projection part directory
@@ -1104,10 +1112,14 @@ private:
{
String p_destination = fs::path(destination) / p_it->name();
ctx->disk->createHardLink(p_it->path(), p_destination);
+ hardlinked_files.push_back(p_it->name());
}
}
}
+ ctx->new_data_part->hardlinked_files.source_part_name = ctx->source_part->name;
+ ctx->new_data_part->hardlinked_files.hardlinks_from_source_part = hardlinked_files;
+
(*ctx->mutate_entry)->columns_written = ctx->storage_columns.size() - ctx->updated_header.columns();
ctx->new_data_part->checksums = ctx->source_part->checksums;
diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp
index 67d4d05bb9c..e97b3f7c6e4 100644
--- a/src/Storages/StorageReplicatedMergeTree.cpp
+++ b/src/Storages/StorageReplicatedMergeTree.cpp
@@ -7344,8 +7344,9 @@ void StorageReplicatedMergeTree::lockSharedData(const IMergeTreeDataPart & part,
String id = part.getUniqueId();
boost::replace_all(id, "/", "_");
- Strings zc_zookeeper_paths = getZeroCopyPartPath(*getSettings(), disk->getType(), getTableSharedID(),
- part.name, zookeeper_path);
+ Strings zc_zookeeper_paths = getZeroCopyPartPath(
+ *getSettings(), disk->getType(), getTableSharedID(), part.name, zookeeper_path);
+
for (const auto & zc_zookeeper_path : zc_zookeeper_paths)
{
String zookeeper_node = fs::path(zc_zookeeper_path) / id / replica_name;
@@ -7356,14 +7357,7 @@ void StorageReplicatedMergeTree::lockSharedData(const IMergeTreeDataPart & part,
}
}
-
bool StorageReplicatedMergeTree::unlockSharedData(const IMergeTreeDataPart & part) const
-{
- return unlockSharedData(part, part.name);
-}
-
-
-bool StorageReplicatedMergeTree::unlockSharedData(const IMergeTreeDataPart & part, const String & name) const
{
if (!part.volume || !part.isStoredOnDisk())
return true;
@@ -7386,7 +7380,7 @@ bool StorageReplicatedMergeTree::unlockSharedData(const IMergeTreeDataPart & par
return true;
}
- return unlockSharedDataByID(part.getUniqueId(), getTableSharedID(), name, replica_name, disk, getZooKeeper(), *getSettings(), log,
+ return unlockSharedDataByID(part.getUniqueId(), getTableSharedID(), part.name, replica_name, disk, getZooKeeper(), *getSettings(), log,
zookeeper_path);
}
diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h
index 59fb3f124c7..de4bf698184 100644
--- a/src/Storages/StorageReplicatedMergeTree.h
+++ b/src/Storages/StorageReplicatedMergeTree.h
@@ -238,9 +238,6 @@ public:
/// Return false if data is still used by another node
bool unlockSharedData(const IMergeTreeDataPart & part) const override;
- /// Remove lock with old name for shared data part after rename
- bool unlockSharedData(const IMergeTreeDataPart & part, const String & name) const override;
-
/// Unlock shared data part in zookeeper by part id
/// Return true if data unlocked
/// Return false if data is still used by another node
From 1706ae9e1597aab77c59d65015cf8d96ae66f506 Mon Sep 17 00:00:00 2001
From: alesapin
Date: Fri, 15 Apr 2022 18:36:23 +0200
Subject: [PATCH 12/55] Some trash implementation
---
src/Storages/MergeTree/IMergeTreeDataPart.cpp | 24 ++++----
src/Storages/MergeTree/IMergeTreeDataPart.h | 2 +-
src/Storages/MergeTree/MergeTreeData.cpp | 29 ++++++++-
src/Storages/MergeTree/MergeTreeData.h | 5 +-
src/Storages/StorageReplicatedMergeTree.cpp | 59 +++++++++++++++----
src/Storages/StorageReplicatedMergeTree.h | 6 +-
6 files changed, 89 insertions(+), 36 deletions(-)
diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp
index ba2463a18e6..6e48d9c0277 100644
--- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp
+++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp
@@ -504,8 +504,8 @@ void IMergeTreeDataPart::removeIfNeeded()
if (parent_part)
{
- bool keep_shared_data = keepSharedDataInDecoupledStorage();
- projectionRemove(parent_part->getFullRelativePath(), keep_shared_data);
+ auto [can_remove, _] = canRemovePart();
+ projectionRemove(parent_part->getFullRelativePath(), !can_remove);
}
else
remove();
@@ -1526,13 +1526,13 @@ catch (...)
throw;
}
-bool IMergeTreeDataPart::keepSharedDataInDecoupledStorage() const
+std::pair<bool, std::vector<std::string>> IMergeTreeDataPart::canRemovePart() const
{
/// NOTE: It's needed for zero-copy replication
if (force_keep_shared_data)
- return true;
+ return std::make_pair(false, std::vector<std::string>{});
- return !storage.unlockSharedData(*this);
+ return storage.unlockSharedData(*this);
}
void IMergeTreeDataPart::initializePartMetadataManager()
@@ -1552,7 +1552,7 @@ void IMergeTreeDataPart::remove() const
assert(assertHasValidVersionMetadata());
part_is_probably_removed_from_disk = true;
- bool keep_shared_data = keepSharedDataInDecoupledStorage();
+ auto [can_remove, files_not_to_remove] = canRemovePart();
if (!isStoredOnDisk())
return;
@@ -1563,7 +1563,7 @@ void IMergeTreeDataPart::remove() const
if (isProjectionPart())
{
LOG_WARNING(storage.log, "Projection part {} should be removed by its parent {}.", name, parent_part->name);
- projectionRemove(parent_part->getFullRelativePath(), keep_shared_data);
+ projectionRemove(parent_part->getFullRelativePath(), !can_remove);
return;
}
@@ -1595,7 +1595,7 @@ void IMergeTreeDataPart::remove() const
LOG_WARNING(storage.log, "Directory {} (to which part must be renamed before removing) already exists. Most likely this is due to unclean restart or race condition. Removing it.", fullPath(disk, to));
try
{
- disk->removeSharedRecursive(fs::path(to) / "", keep_shared_data);
+ disk->removeSharedRecursive(fs::path(to) / "", !can_remove);
}
catch (...)
{
@@ -1622,7 +1622,7 @@ void IMergeTreeDataPart::remove() const
std::unordered_set<String> projection_directories;
for (const auto & [p_name, projection_part] : projection_parts)
{
- projection_part->projectionRemove(to, keep_shared_data);
+ projection_part->projectionRemove(to, !can_remove);
projection_directories.emplace(p_name + ".proj");
}
@@ -1630,7 +1630,7 @@ void IMergeTreeDataPart::remove() const
if (checksums.empty())
{
/// If the part is not completely written, we cannot use fast path by listing files.
- disk->removeSharedRecursive(fs::path(to) / "", keep_shared_data);
+ disk->removeSharedRecursive(fs::path(to) / "", !can_remove);
}
else
{
@@ -1659,7 +1659,7 @@ void IMergeTreeDataPart::remove() const
request.emplace_back(fs::path(to) / DELETE_ON_DESTROY_MARKER_FILE_NAME, true);
request.emplace_back(fs::path(to) / TXN_VERSION_METADATA_FILE_NAME, true);
- disk->removeSharedFiles(request, keep_shared_data);
+ disk->removeSharedFiles(request, !can_remove);
disk->removeDirectory(to);
}
catch (...)
@@ -1668,7 +1668,7 @@ void IMergeTreeDataPart::remove() const
LOG_ERROR(storage.log, "Cannot quickly remove directory {} by removing files; fallback to recursive removal. Reason: {}", fullPath(disk, to), getCurrentExceptionMessage(false));
- disk->removeSharedRecursive(fs::path(to) / "", keep_shared_data);
+ disk->removeSharedRecursive(fs::path(to) / "", !can_remove);
}
}
}
diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h
index e9c0738e114..3597120a05b 100644
--- a/src/Storages/MergeTree/IMergeTreeDataPart.h
+++ b/src/Storages/MergeTree/IMergeTreeDataPart.h
@@ -520,7 +520,7 @@ protected:
String getRelativePathForDetachedPart(const String & prefix) const;
- bool keepSharedDataInDecoupledStorage() const;
+ std::pair<bool, std::vector<std::string>> canRemovePart() const;
void initializePartMetadataManager();
diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp
index 89b57137979..c46616dccb3 100644
--- a/src/Storages/MergeTree/MergeTreeData.cpp
+++ b/src/Storages/MergeTree/MergeTreeData.cpp
@@ -1575,6 +1575,7 @@ MergeTreeData::DataPartsVector MergeTreeData::grabOldParts(bool force)
if (!lock.try_lock())
return res;
+ bool need_remove_parts_in_order = supportsReplication() && getSettings()->allow_remote_fs_zero_copy_replication;
time_t now = time(nullptr);
std::vector parts_to_delete;
@@ -1588,13 +1589,22 @@ MergeTreeData::DataPartsVector MergeTreeData::grabOldParts(bool force)
/// Do not remove outdated part if it may be visible for some transaction
if (!part->version.canBeRemoved())
+ {
+ if (need_remove_parts_in_order)
+ break;
continue;
+ }
auto part_remove_time = part->remove_time.load(std::memory_order_relaxed);
/// Grab only parts that are not used by anyone (SELECTs for example).
if (!part.unique())
+ {
+ if (need_remove_parts_in_order)
+ break;
+
continue;
+ }
if ((part_remove_time < now && now - part_remove_time > getSettings()->old_parts_lifetime.totalSeconds())
|| force
@@ -1603,6 +1613,8 @@ MergeTreeData::DataPartsVector MergeTreeData::grabOldParts(bool force)
{
parts_to_delete.emplace_back(it);
}
+ else if (need_remove_parts_in_order)
+ break;
}
res.reserve(parts_to_delete.size());
@@ -3328,7 +3340,6 @@ void MergeTreeData::swapActivePart(MergeTreeData::DataPartPtr part_copy)
/// We do not check allow_remote_fs_zero_copy_replication here because data may be shared
/// when allow_remote_fs_zero_copy_replication turned on and off again
-
original_active_part->force_keep_shared_data = false;
if (original_active_part->volume->getDisk()->supportZeroCopyReplication() &&
@@ -5724,6 +5735,10 @@ MergeTreeData::MutableDataPartPtr MergeTreeData::cloneAndLoadDataPartOnSameDisk(
if (disk->exists(dst_part_path))
throw Exception("Part in " + fullPath(disk, dst_part_path) + " already exists", ErrorCodes::DIRECTORY_ALREADY_EXISTS);
+ auto single_disk_volume = std::make_shared<SingleDiskVolume>(disk->getName(), disk, 0);
+ auto dst_data_part = createPart(dst_part_name, dst_part_info, single_disk_volume, tmp_dst_part_name);
+
+
/// If source part is in memory, flush it to disk and clone it already in on-disk format
if (auto src_part_in_memory = asInMemoryPart(src_part))
{
@@ -5732,14 +5747,22 @@ MergeTreeData::MutableDataPartPtr MergeTreeData::cloneAndLoadDataPartOnSameDisk(
src_part_in_memory->flushToDisk(src_relative_data_path, flushed_part_path, metadata_snapshot);
src_part_path = fs::path(src_relative_data_path) / flushed_part_path / "";
}
+ else
+ {
+ dst_data_part->hardlinked_files.source_part_name = src_part->name;
+
+ for (auto it = disk->iterateDirectory(src_part_path); it->isValid(); it->next())
+ {
+ if (it->name() != IMergeTreeDataPart::DELETE_ON_DESTROY_MARKER_FILE_NAME && it->name() != IMergeTreeDataPart::TXN_VERSION_METADATA_FILE_NAME)
+ dst_data_part->hardlinked_files.hardlinks_from_source_part.push_back(it->name());
+ }
+ }
LOG_DEBUG(log, "Cloning part {} to {}", fullPath(disk, src_part_path), fullPath(disk, dst_part_path));
localBackup(disk, src_part_path, dst_part_path, /* make_source_readonly */ false);
disk->removeFileIfExists(fs::path(dst_part_path) / IMergeTreeDataPart::DELETE_ON_DESTROY_MARKER_FILE_NAME);
disk->removeFileIfExists(fs::path(dst_part_path) / IMergeTreeDataPart::TXN_VERSION_METADATA_FILE_NAME);
- auto single_disk_volume = std::make_shared<SingleDiskVolume>(disk->getName(), disk, 0);
- auto dst_data_part = createPart(dst_part_name, dst_part_info, single_disk_volume, tmp_dst_part_name);
/// We should write version metadata on part creation to distinguish it from parts that were created without transaction.
TransactionID tid = txn ? txn->tid : Tx::PrehistoricTID;
diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h
index 1601a6714d3..25e0ee3e7c6 100644
--- a/src/Storages/MergeTree/MergeTreeData.h
+++ b/src/Storages/MergeTree/MergeTreeData.h
@@ -943,10 +943,7 @@ public:
/// Unlock shared data part in zookeeper
/// Overridden in StorageReplicatedMergeTree
- virtual bool unlockSharedData(const IMergeTreeDataPart &) const { return true; }
-
- /// Remove lock with old name for shared data part after rename
- virtual bool unlockSharedData(const IMergeTreeDataPart &, const String &) const { return true; }
+ virtual std::pair<bool, std::vector<std::string>> unlockSharedData(const IMergeTreeDataPart &) const { return std::make_pair(true, std::vector<std::string>{}); }
/// Fetch part only if some replica has it on shared storage like S3
/// Overridden in StorageReplicatedMergeTree
diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp
index e97b3f7c6e4..96ff6a41d65 100644
--- a/src/Storages/StorageReplicatedMergeTree.cpp
+++ b/src/Storages/StorageReplicatedMergeTree.cpp
@@ -75,6 +75,7 @@
#include
#include
+#include
#include
#include
@@ -7345,7 +7346,16 @@ void StorageReplicatedMergeTree::lockSharedData(const IMergeTreeDataPart & part,
boost::replace_all(id, "/", "_");
Strings zc_zookeeper_paths = getZeroCopyPartPath(
- *getSettings(), disk->getType(), getTableSharedID(), part.name, zookeeper_path);
+ *getSettings(), disk->getType(), getTableSharedID(),
+ part.name, zookeeper_path);
+
+ String path_to_set;
+ if (!part.hardlinked_files.hardlinks_from_source_part.empty())
+ {
+ path_to_set = getZeroCopyPartPath(
+ *getSettings(), disk->getType(), getTableSharedID(),
+ part.hardlinked_files.source_part_name, zookeeper_path)[0];
+ }
for (const auto & zc_zookeeper_path : zc_zookeeper_paths)
{
@@ -7353,18 +7363,20 @@ void StorageReplicatedMergeTree::lockSharedData(const IMergeTreeDataPart & part,
LOG_TRACE(log, "Set zookeeper persistent lock {}", zookeeper_node);
- createZeroCopyLockNode(zookeeper, zookeeper_node, zkutil::CreateMode::Persistent, replace_existing_lock);
+ createZeroCopyLockNode(
+ zookeeper, zookeeper_node, zkutil::CreateMode::Persistent,
+ replace_existing_lock, path_to_set, part.hardlinked_files.hardlinks_from_source_part);
}
}
-bool StorageReplicatedMergeTree::unlockSharedData(const IMergeTreeDataPart & part) const
+std::pair<bool, std::vector<std::string>> StorageReplicatedMergeTree::unlockSharedData(const IMergeTreeDataPart & part) const
{
if (!part.volume || !part.isStoredOnDisk())
- return true;
+ return std::make_pair(true, std::vector<std::string>{});
DiskPtr disk = part.volume->getDisk();
if (!disk || !disk->supportZeroCopyReplication())
- return true;
+ return std::make_pair(true, std::vector<std::string>{});
/// If part is temporary refcount file may be absent
auto ref_count_path = fs::path(part.getFullRelativePath()) / IMergeTreeDataPart::FILE_FOR_REFERENCES_CHECK;
@@ -7372,20 +7384,20 @@ bool StorageReplicatedMergeTree::unlockSharedData(const IMergeTreeDataPart & par
{
auto ref_count = disk->getRefCount(ref_count_path);
if (ref_count > 0) /// Keep part shard info for frozen backups
- return false;
+ return std::make_pair(false, std::vector<std::string>{});
}
else
{
/// Temporary part with some absent file cannot be locked in shared mode
- return true;
+ return std::make_pair(true, std::vector<std::string>{});
}
return unlockSharedDataByID(part.getUniqueId(), getTableSharedID(), part.name, replica_name, disk, getZooKeeper(), *getSettings(), log,
zookeeper_path);
}
-
-bool StorageReplicatedMergeTree::unlockSharedDataByID(String part_id, const String & table_uuid, const String & part_name,
+std::pair<bool, std::vector<std::string>> StorageReplicatedMergeTree::unlockSharedDataByID(
+ String part_id, const String & table_uuid, const String & part_name,
const String & replica_name_, DiskPtr disk, zkutil::ZooKeeperPtr zookeeper_ptr, const MergeTreeSettings & settings,
Poco::Logger * logger, const String & zookeeper_path_old)
{
@@ -7394,9 +7406,13 @@ bool StorageReplicatedMergeTree::unlockSharedDataByID(String part_id, const Stri
Strings zc_zookeeper_paths = getZeroCopyPartPath(settings, disk->getType(), table_uuid, part_name, zookeeper_path_old);
bool part_has_no_more_locks = true;
+ std::vector<std::string> files_not_to_remove;
for (const auto & zc_zookeeper_path : zc_zookeeper_paths)
{
+ files_not_to_remove.clear();
+ auto content = zookeeper_ptr->get(zc_zookeeper_path);
+ boost::split(files_not_to_remove, content, boost::is_any_of("\n "));
String zookeeper_part_uniq_node = fs::path(zc_zookeeper_path) / part_id;
/// Delete our replica node for part from zookeeper (we are not interested in it anymore)
@@ -7442,7 +7458,7 @@ bool StorageReplicatedMergeTree::unlockSharedDataByID(String part_id, const Stri
}
}
- return part_has_no_more_locks;
+ return std::make_pair(part_has_no_more_locks, files_not_to_remove);
}
@@ -7832,7 +7848,7 @@ bool StorageReplicatedMergeTree::createEmptyPartInsteadOfLost(zkutil::ZooKeeperP
}
-void StorageReplicatedMergeTree::createZeroCopyLockNode(const zkutil::ZooKeeperPtr & zookeeper, const String & zookeeper_node, int32_t mode, bool replace_existing_lock)
+void StorageReplicatedMergeTree::createZeroCopyLockNode(const zkutil::ZooKeeperPtr & zookeeper, const String & zookeeper_node, int32_t mode, bool replace_existing_lock, const std::string & path_to_set, const std::vector<std::string> & hardlinked_files)
{
/// In rare case other replica can remove path between createAncestors and createIfNotExists
/// So we make up to 5 attempts
@@ -7852,6 +7868,11 @@ void StorageReplicatedMergeTree::createZeroCopyLockNode(const zkutil::ZooKeeperP
Coordination::Requests ops;
ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_node, -1));
ops.emplace_back(zkutil::makeCreateRequest(zookeeper_node, "", mode));
+ if (!path_to_set.empty() && !hardlinked_files.empty())
+ {
+ std::string data = boost::algorithm::join(hardlinked_files, "\n");
+ ops.emplace_back(zkutil::makeSetRequest(path_to_set, data, -1));
+ }
Coordination::Responses responses;
auto error = zookeeper->tryMulti(ops, responses);
if (error == Coordination::Error::ZOK)
@@ -7859,7 +7880,16 @@ void StorageReplicatedMergeTree::createZeroCopyLockNode(const zkutil::ZooKeeperP
}
else
{
- auto error = zookeeper->tryCreate(zookeeper_node, "", mode);
+ Coordination::Requests ops;
+ if (!path_to_set.empty() && !hardlinked_files.empty())
+ {
+ std::string data = boost::algorithm::join(hardlinked_files, "\n");
+ ops.emplace_back(zkutil::makeSetRequest(path_to_set, data, -1));
+ }
+ ops.emplace_back(zkutil::makeCreateRequest(zookeeper_node, "", mode));
+
+ Coordination::Responses responses;
+ auto error = zookeeper->tryMulti(ops, responses);
if (error == Coordination::Error::ZOK || error == Coordination::Error::ZNODEEXISTS)
break;
}
@@ -8002,9 +8032,12 @@ bool StorageReplicatedMergeTree::removeSharedDetachedPart(DiskPtr disk, const St
if (disk->getRefCount(checksums) == 0)
{
String id = disk->getUniqueId(checksums);
- keep_shared = !StorageReplicatedMergeTree::unlockSharedDataByID(id, table_uuid, part_name,
+ bool can_remove = false;
+ std::tie(can_remove, std::ignore) = StorageReplicatedMergeTree::unlockSharedDataByID(id, table_uuid, part_name,
detached_replica_name, disk, zookeeper, getContext()->getReplicatedMergeTreeSettings(), log,
detached_zookeeper_path);
+
+ keep_shared = !can_remove;
}
else
keep_shared = true;
diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h
index de4bf698184..2fb2494ca64 100644
--- a/src/Storages/StorageReplicatedMergeTree.h
+++ b/src/Storages/StorageReplicatedMergeTree.h
@@ -236,12 +236,12 @@ public:
/// Unlock shared data part in zookeeper
/// Return true if data unlocked
/// Return false if data is still used by another node
- bool unlockSharedData(const IMergeTreeDataPart & part) const override;
+ std::pair<bool, std::vector<std::string>> unlockSharedData(const IMergeTreeDataPart & part) const override;
/// Unlock shared data part in zookeeper by part id
/// Return true if data unlocked
/// Return false if data is still used by another node
- static bool unlockSharedDataByID(String part_id, const String & table_uuid, const String & part_name, const String & replica_name_,
+ static std::pair<bool, std::vector<std::string>> unlockSharedDataByID(String part_id, const String & table_uuid, const String & part_name, const String & replica_name_,
DiskPtr disk, zkutil::ZooKeeperPtr zookeeper_, const MergeTreeSettings & settings, Poco::Logger * logger,
const String & zookeeper_path_old);
@@ -756,7 +756,7 @@ private:
static Strings getZeroCopyPartPath(const MergeTreeSettings & settings, DiskType disk_type, const String & table_uuid,
const String & part_name, const String & zookeeper_path_old);
- static void createZeroCopyLockNode(const zkutil::ZooKeeperPtr & zookeeper, const String & zookeeper_node, int32_t mode = zkutil::CreateMode::Persistent, bool replace_existing_lock = false);
+ static void createZeroCopyLockNode(const zkutil::ZooKeeperPtr & zookeeper, const String & zookeeper_node, int32_t mode = zkutil::CreateMode::Persistent, bool replace_existing_lock = false, const std::string & path_to_set = "", const std::vector<std::string> & hardlinked_files = {});
bool removeDetachedPart(DiskPtr disk, const String & path, const String & part_name, bool is_freezed) override;
From a97754f462c362c32f7375e6541bab333dea5aa7 Mon Sep 17 00:00:00 2001
From: alesapin
Date: Mon, 18 Apr 2022 13:39:09 +0200
Subject: [PATCH 13/55] Fix
---
src/Storages/MergeTree/MergeTreeData.cpp | 22 ++++++++++++----------
1 file changed, 12 insertions(+), 10 deletions(-)
diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp
index c46616dccb3..de6658e536c 100644
--- a/src/Storages/MergeTree/MergeTreeData.cpp
+++ b/src/Storages/MergeTree/MergeTreeData.cpp
@@ -5735,10 +5735,8 @@ MergeTreeData::MutableDataPartPtr MergeTreeData::cloneAndLoadDataPartOnSameDisk(
if (disk->exists(dst_part_path))
throw Exception("Part in " + fullPath(disk, dst_part_path) + " already exists", ErrorCodes::DIRECTORY_ALREADY_EXISTS);
- auto single_disk_volume = std::make_shared<SingleDiskVolume>(disk->getName(), disk, 0);
- auto dst_data_part = createPart(dst_part_name, dst_part_info, single_disk_volume, tmp_dst_part_name);
-
+ bool src_memory_part = false;
/// If source part is in memory, flush it to disk and clone it already in on-disk format
if (auto src_part_in_memory = asInMemoryPart(src_part))
{
@@ -5746,8 +5744,18 @@ MergeTreeData::MutableDataPartPtr MergeTreeData::cloneAndLoadDataPartOnSameDisk(
auto flushed_part_path = src_part_in_memory->getRelativePathForPrefix(tmp_part_prefix);
src_part_in_memory->flushToDisk(src_relative_data_path, flushed_part_path, metadata_snapshot);
src_part_path = fs::path(src_relative_data_path) / flushed_part_path / "";
+ src_memory_part = true;
}
- else
+
+ LOG_DEBUG(log, "Cloning part {} to {}", fullPath(disk, src_part_path), fullPath(disk, dst_part_path));
+ localBackup(disk, src_part_path, dst_part_path, /* make_source_readonly */ false);
+ disk->removeFileIfExists(fs::path(dst_part_path) / IMergeTreeDataPart::DELETE_ON_DESTROY_MARKER_FILE_NAME);
+ disk->removeFileIfExists(fs::path(dst_part_path) / IMergeTreeDataPart::TXN_VERSION_METADATA_FILE_NAME);
+
+ auto single_disk_volume = std::make_shared<SingleDiskVolume>(disk->getName(), disk, 0);
+ auto dst_data_part = createPart(dst_part_name, dst_part_info, single_disk_volume, tmp_dst_part_name);
+
+ if (!src_memory_part && !asInMemoryPart(dst_data_part))
{
dst_data_part->hardlinked_files.source_part_name = src_part->name;
@@ -5758,12 +5766,6 @@ MergeTreeData::MutableDataPartPtr MergeTreeData::cloneAndLoadDataPartOnSameDisk(
}
}
- LOG_DEBUG(log, "Cloning part {} to {}", fullPath(disk, src_part_path), fullPath(disk, dst_part_path));
- localBackup(disk, src_part_path, dst_part_path, /* make_source_readonly */ false);
- disk->removeFileIfExists(fs::path(dst_part_path) / IMergeTreeDataPart::DELETE_ON_DESTROY_MARKER_FILE_NAME);
- disk->removeFileIfExists(fs::path(dst_part_path) / IMergeTreeDataPart::TXN_VERSION_METADATA_FILE_NAME);
-
-
/// We should write version metadata on part creation to distinguish it from parts that were created without transaction.
TransactionID tid = txn ? txn->tid : Tx::PrehistoricTID;
dst_data_part->version.setCreationTID(tid, nullptr);
From bd7b3847c1ffe46b9bd43b3aadce1f3da273a8d8 Mon Sep 17 00:00:00 2001
From: alesapin
Date: Tue, 19 Apr 2022 01:09:09 +0200
Subject: [PATCH 14/55] Some code
---
src/Disks/DiskCacheWrapper.cpp | 15 +++---
src/Disks/DiskCacheWrapper.h | 5 +-
src/Disks/DiskDecorator.cpp | 8 ++--
src/Disks/DiskDecorator.h | 4 +-
src/Disks/DiskEncrypted.h | 17 ++++++-
src/Disks/DiskRestartProxy.cpp | 8 ++--
src/Disks/DiskRestartProxy.h | 4 +-
src/Disks/DiskWebServer.h | 2 +-
src/Disks/IDisk.h | 9 ++--
src/Disks/IDiskRemote.cpp | 48 ++++++++++++-------
src/Disks/IDiskRemote.h | 11 +++--
src/Disks/S3/DiskS3.cpp | 2 +-
src/Storages/MergeTree/DataPartsExchange.cpp | 2 +-
src/Storages/MergeTree/IMergeTreeDataPart.cpp | 20 ++++----
src/Storages/MergeTree/IMergeTreeDataPart.h | 4 +-
src/Storages/MergeTree/MergeTreeData.cpp | 2 +-
src/Storages/MergeTree/MergeTreeData.h | 2 +-
src/Storages/MergeTree/MutateTask.cpp | 6 +--
src/Storages/StorageReplicatedMergeTree.cpp | 26 +++++-----
src/Storages/StorageReplicatedMergeTree.h | 6 +--
.../0_stateless/02067_lost_part_s3.reference | 3 ++
.../0_stateless/02067_lost_part_s3.sql | 26 ++++++++++
22 files changed, 148 insertions(+), 82 deletions(-)
create mode 100644 tests/queries/0_stateless/02067_lost_part_s3.reference
create mode 100644 tests/queries/0_stateless/02067_lost_part_s3.sql
diff --git a/src/Disks/DiskCacheWrapper.cpp b/src/Disks/DiskCacheWrapper.cpp
index cc2c330975a..8e355f70432 100644
--- a/src/Disks/DiskCacheWrapper.cpp
+++ b/src/Disks/DiskCacheWrapper.cpp
@@ -309,23 +309,26 @@ void DiskCacheWrapper::removeSharedFile(const String & path, bool keep_s3)
DiskDecorator::removeSharedFile(path, keep_s3);
}
-void DiskCacheWrapper::removeSharedRecursive(const String & path, bool keep_s3)
+void DiskCacheWrapper::removeSharedRecursive(const String & path, bool keep_all, const NameSet & files_to_keep)
{
if (cache_disk->exists(path))
- cache_disk->removeSharedRecursive(path, keep_s3);
- DiskDecorator::removeSharedRecursive(path, keep_s3);
+ cache_disk->removeSharedRecursive(path, keep_all, files_to_keep);
+ DiskDecorator::removeSharedRecursive(path, keep_all, files_to_keep);
}
-void DiskCacheWrapper::removeSharedFiles(const RemoveBatchRequest & files, bool keep_s3)
+void DiskCacheWrapper::removeSharedFiles(const RemoveBatchRequest & files, bool keep_all, const NameSet & files_to_keep)
{
for (const auto & file : files)
{
if (cache_disk->exists(file.path))
- cache_disk->removeSharedFile(file.path, keep_s3);
+ {
+ bool keep_file = keep_all || files_to_keep.contains(fs::path(file.path).filename());
+ cache_disk->removeSharedFile(file.path, keep_file);
+ }
}
- DiskDecorator::removeSharedFiles(files, keep_s3);
+ DiskDecorator::removeSharedFiles(files, keep_all, files_to_keep);
}
void DiskCacheWrapper::createHardLink(const String & src_path, const String & dst_path)
diff --git a/src/Disks/DiskCacheWrapper.h b/src/Disks/DiskCacheWrapper.h
index e413a3742f3..0e0b2e25f2c 100644
--- a/src/Disks/DiskCacheWrapper.h
+++ b/src/Disks/DiskCacheWrapper.h
@@ -47,8 +47,9 @@ public:
void removeDirectory(const String & path) override;
void removeRecursive(const String & path) override;
void removeSharedFile(const String & path, bool keep_s3) override;
- void removeSharedRecursive(const String & path, bool keep_s3) override;
- void removeSharedFiles(const RemoveBatchRequest & files, bool keep_s3) override;
+
+ void removeSharedRecursive(const String & path, bool keep_all, const NameSet & files_to_keep) override;
+ void removeSharedFiles(const RemoveBatchRequest & files, bool keep_all, const NameSet & files_to_keep) override;
void createHardLink(const String & src_path, const String & dst_path) override;
ReservationPtr reserve(UInt64 bytes) override;
diff --git a/src/Disks/DiskDecorator.cpp b/src/Disks/DiskDecorator.cpp
index 14f507af55d..44358847961 100644
--- a/src/Disks/DiskDecorator.cpp
+++ b/src/Disks/DiskDecorator.cpp
@@ -151,14 +151,14 @@ void DiskDecorator::removeSharedFile(const String & path, bool keep_s3)
delegate->removeSharedFile(path, keep_s3);
}
-void DiskDecorator::removeSharedFiles(const RemoveBatchRequest & files, bool keep_in_remote_fs)
+void DiskDecorator::removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only)
{
- delegate->removeSharedFiles(files, keep_in_remote_fs);
+ delegate->removeSharedFiles(files, keep_all_batch_metadata, file_names_remove_metadata_only);
}
-void DiskDecorator::removeSharedRecursive(const String & path, bool keep_s3)
+void DiskDecorator::removeSharedRecursive(const String & path, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only)
{
- delegate->removeSharedRecursive(path, keep_s3);
+ delegate->removeSharedRecursive(path, keep_all_batch_metadata, file_names_remove_metadata_only);
}
void DiskDecorator::setLastModified(const String & path, const Poco::Timestamp & timestamp)
diff --git a/src/Disks/DiskDecorator.h b/src/Disks/DiskDecorator.h
index e5c9c7699bf..6c70e9a3571 100644
--- a/src/Disks/DiskDecorator.h
+++ b/src/Disks/DiskDecorator.h
@@ -52,8 +52,8 @@ public:
void removeDirectory(const String & path) override;
void removeRecursive(const String & path) override;
void removeSharedFile(const String & path, bool keep_s3) override;
- void removeSharedRecursive(const String & path, bool keep_s3) override;
- void removeSharedFiles(const RemoveBatchRequest & files, bool keep_in_remote_fs) override;
+ void removeSharedRecursive(const String & path, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only) override;
+ void removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only) override;
void setLastModified(const String & path, const Poco::Timestamp & timestamp) override;
Poco::Timestamp getLastModified(const String & path) override;
void setReadOnly(const String & path) override;
diff --git a/src/Disks/DiskEncrypted.h b/src/Disks/DiskEncrypted.h
index 07a2ad81010..28156551e85 100644
--- a/src/Disks/DiskEncrypted.h
+++ b/src/Disks/DiskEncrypted.h
@@ -159,10 +159,23 @@ public:
delegate->removeSharedFile(wrapped_path, flag);
}
- void removeSharedRecursive(const String & path, bool flag) override
+ void removeSharedRecursive(const String & path, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only) override
{
auto wrapped_path = wrappedPath(path);
- delegate->removeSharedRecursive(wrapped_path, flag);
+ delegate->removeSharedRecursive(wrapped_path, keep_all_batch_metadata, file_names_remove_metadata_only);
+ }
+
+ void removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only) override
+ {
+ for (const auto & file : files)
+ {
+ auto wrapped_path = wrappedPath(file.path);
+ bool keep = keep_all_batch_metadata || file_names_remove_metadata_only.contains(fs::path(file.path).filename());
+ if (file.if_exists)
+ delegate->removeSharedFileIfExists(wrapped_path, keep);
+ else
+ delegate->removeSharedFile(wrapped_path, keep);
+ }
}
void removeSharedFileIfExists(const String & path, bool flag) override
diff --git a/src/Disks/DiskRestartProxy.cpp b/src/Disks/DiskRestartProxy.cpp
index 8045a0e8c72..ed0bc74b1a8 100644
--- a/src/Disks/DiskRestartProxy.cpp
+++ b/src/Disks/DiskRestartProxy.cpp
@@ -251,16 +251,16 @@ void DiskRestartProxy::removeSharedFile(const String & path, bool keep_s3)
DiskDecorator::removeSharedFile(path, keep_s3);
}
-void DiskRestartProxy::removeSharedFiles(const RemoveBatchRequest & files, bool keep_in_remote_fs)
+void DiskRestartProxy::removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only)
{
ReadLock lock (mutex);
- DiskDecorator::removeSharedFiles(files, keep_in_remote_fs);
+ DiskDecorator::removeSharedFiles(files, keep_all_batch_metadata, file_names_remove_metadata_only);
}
-void DiskRestartProxy::removeSharedRecursive(const String & path, bool keep_s3)
+void DiskRestartProxy::removeSharedRecursive(const String & path, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only)
{
ReadLock lock (mutex);
- DiskDecorator::removeSharedRecursive(path, keep_s3);
+ DiskDecorator::removeSharedRecursive(path, keep_all_batch_metadata, file_names_remove_metadata_only);
}
void DiskRestartProxy::setLastModified(const String & path, const Poco::Timestamp & timestamp)
diff --git a/src/Disks/DiskRestartProxy.h b/src/Disks/DiskRestartProxy.h
index baa57386e68..03bdc047218 100644
--- a/src/Disks/DiskRestartProxy.h
+++ b/src/Disks/DiskRestartProxy.h
@@ -54,8 +54,8 @@ public:
void removeDirectory(const String & path) override;
void removeRecursive(const String & path) override;
void removeSharedFile(const String & path, bool keep_s3) override;
- void removeSharedFiles(const RemoveBatchRequest & files, bool keep_in_remote_fs) override;
- void removeSharedRecursive(const String & path, bool keep_s3) override;
+ void removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only) override;
+ void removeSharedRecursive(const String & path, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only) override;
void setLastModified(const String & path, const Poco::Timestamp & timestamp) override;
Poco::Timestamp getLastModified(const String & path) override;
void setReadOnly(const String & path) override;
diff --git a/src/Disks/DiskWebServer.h b/src/Disks/DiskWebServer.h
index 98f92fe5986..dd699921f7c 100644
--- a/src/Disks/DiskWebServer.h
+++ b/src/Disks/DiskWebServer.h
@@ -139,7 +139,7 @@ public:
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Disk {} is read-only", getName());
}
- void removeSharedRecursive(const String &, bool) override
+ void removeSharedRecursive(const String &, bool, const NameSet &) override
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Disk {} is read-only", getName());
}
diff --git a/src/Disks/IDisk.h b/src/Disks/IDisk.h
index 81cdf47e1fb..30b8ac2c44b 100644
--- a/src/Disks/IDisk.h
+++ b/src/Disks/IDisk.h
@@ -197,7 +197,7 @@ public:
/// Remove file or directory with all children. Use with extra caution. Throws exception if file doesn't exists.
/// Differs from removeRecursive for S3/HDFS disks
/// Second bool param is a flag to remove (true) or keep (false) shared data on S3
- virtual void removeSharedRecursive(const String & path, bool) { removeRecursive(path); }
+ virtual void removeSharedRecursive(const String & path, bool, const NameSet &) { removeRecursive(path); }
/// Remove file or directory if it exists.
/// Differs from removeFileIfExists for S3/HDFS disks
@@ -237,14 +237,15 @@ public:
/// Batch request to remove multiple files.
/// May be much faster for blob storage.
- virtual void removeSharedFiles(const RemoveBatchRequest & files, bool keep_in_remote_fs)
+ virtual void removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only)
{
for (const auto & file : files)
{
+ bool keep_file = keep_all_batch_metadata || file_names_remove_metadata_only.contains(fs::path(file.path).filename());
if (file.if_exists)
- removeSharedFileIfExists(file.path, keep_in_remote_fs);
+ removeSharedFileIfExists(file.path, keep_file);
else
- removeSharedFile(file.path, keep_in_remote_fs);
+ removeSharedFile(file.path, keep_file);
}
}
diff --git a/src/Disks/IDiskRemote.cpp b/src/Disks/IDiskRemote.cpp
index cf03ce3c27d..a546ab56dbf 100644
--- a/src/Disks/IDiskRemote.cpp
+++ b/src/Disks/IDiskRemote.cpp
@@ -271,7 +271,7 @@ std::unordered_map IDiskRemote::getSerializedMetadata(const std:
return metadatas;
}
-void IDiskRemote::removeMetadata(const String & path, std::vector<String> & paths_to_remove)
+void IDiskRemote::removeMetadata(const String & path, std::unordered_map<String, std::vector<String>> & paths_to_remove)
{
LOG_TRACE(log, "Remove file by path: {}", backQuote(metadata_disk->getPath() + path));
@@ -283,13 +283,13 @@ void IDiskRemote::removeMetadata(const String & path, std::vector &
try
{
- auto metadata_updater = [&paths_to_remove, this] (Metadata & metadata)
+ auto metadata_updater = [&paths_to_remove, path, this] (Metadata & metadata)
{
if (metadata.ref_count == 0)
{
for (const auto & [remote_fs_object_path, _] : metadata.remote_fs_objects)
{
- paths_to_remove.push_back(remote_fs_root_path + remote_fs_object_path);
+ paths_to_remove[path].push_back(remote_fs_root_path + remote_fs_object_path);
if (cache)
{
@@ -328,7 +328,7 @@ void IDiskRemote::removeMetadata(const String & path, std::vector &
}
-void IDiskRemote::removeMetadataRecursive(const String & path, std::vector<String> & paths_to_remove)
+void IDiskRemote::removeMetadataRecursive(const String & path, std::unordered_map<String, std::vector<String>> & paths_to_remove)
{
checkStackSize(); /// This is needed to prevent stack overflow in case of cyclic symlinks.
@@ -483,27 +483,27 @@ void IDiskRemote::replaceFile(const String & from_path, const String & to_path)
void IDiskRemote::removeSharedFile(const String & path, bool delete_metadata_only)
{
- std::vector<String> paths_to_remove;
+ std::unordered_map<String, std::vector<String>> paths_to_remove;
removeMetadata(path, paths_to_remove);
if (!delete_metadata_only)
- removeFromRemoteFS(paths_to_remove);
+ removeFromRemoteFS(paths_to_remove[path]);
}
void IDiskRemote::removeSharedFileIfExists(const String & path, bool delete_metadata_only)
{
- std::vector<String> paths_to_remove;
+ std::unordered_map<String, std::vector<String>> paths_to_remove;
if (metadata_disk->exists(path))
{
removeMetadata(path, paths_to_remove);
if (!delete_metadata_only)
- removeFromRemoteFS(paths_to_remove);
+ removeFromRemoteFS(paths_to_remove[path]);
}
}
-void IDiskRemote::removeSharedFiles(const RemoveBatchRequest & files, bool delete_metadata_only)
+void IDiskRemote::removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only)
{
- std::vector<String> paths_to_remove;
+ std::unordered_map<String, std::vector<String>> paths_to_remove;
for (const auto & file : files)
{
bool skip = file.if_exists && !metadata_disk->exists(file.path);
@@ -511,17 +511,33 @@ void IDiskRemote::removeSharedFiles(const RemoveBatchRequest & files, bool delet
removeMetadata(file.path, paths_to_remove);
}
- if (!delete_metadata_only)
- removeFromRemoteFS(paths_to_remove);
+ if (!keep_all_batch_metadata)
+ {
+ std::vector<String> remove_from_remote;
+ for (auto && [path, remote_paths] : paths_to_remove)
+ {
+ if (!file_names_remove_metadata_only.contains(fs::path(path).filename()))
+ remove_from_remote.insert(remove_from_remote.end(), remote_paths.begin(), remote_paths.end());
+ }
+ removeFromRemoteFS(remove_from_remote);
+ }
}
-void IDiskRemote::removeSharedRecursive(const String & path, bool delete_metadata_only)
+void IDiskRemote::removeSharedRecursive(const String & path, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only)
{
- std::vector<String> paths_to_remove;
+ std::unordered_map<String, std::vector<String>> paths_to_remove;
removeMetadataRecursive(path, paths_to_remove);
- if (!delete_metadata_only)
- removeFromRemoteFS(paths_to_remove);
+ if (!keep_all_batch_metadata)
+ {
+ std::vector<String> remove_from_remote;
+ for (auto && [local_path, remote_paths] : paths_to_remove)
+ {
+ if (!file_names_remove_metadata_only.contains(fs::path(local_path).filename()))
+ remove_from_remote.insert(remove_from_remote.end(), remote_paths.begin(), remote_paths.end());
+ }
+ removeFromRemoteFS(remove_from_remote);
+ }
}
diff --git a/src/Disks/IDiskRemote.h b/src/Disks/IDiskRemote.h
index 7ab11aa7aca..901403f4689 100644
--- a/src/Disks/IDiskRemote.h
+++ b/src/Disks/IDiskRemote.h
@@ -107,15 +107,16 @@ public:
void removeFileIfExists(const String & path) override { removeSharedFileIfExists(path, false); }
- void removeRecursive(const String & path) override { removeSharedRecursive(path, false); }
+ void removeRecursive(const String & path) override { removeSharedRecursive(path, false, {}); }
+
void removeSharedFile(const String & path, bool delete_metadata_only) override;
void removeSharedFileIfExists(const String & path, bool delete_metadata_only) override;
- void removeSharedFiles(const RemoveBatchRequest & files, bool delete_metadata_only) override;
+ void removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only) override;
- void removeSharedRecursive(const String & path, bool delete_metadata_only) override;
+ void removeSharedRecursive(const String & path, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only) override;
void listFiles(const String & path, std::vector<String> & file_names) override;
@@ -170,9 +171,9 @@ protected:
FileCachePtr cache;
private:
- void removeMetadata(const String & path, std::vector<String> & paths_to_remove);
+ void removeMetadata(const String & path, std::unordered_map<String, std::vector<String>> & paths_to_remove);
- void removeMetadataRecursive(const String & path, std::vector<String> & paths_to_remove);
+ void removeMetadataRecursive(const String & path, std::unordered_map<String, std::vector<String>> & paths_to_remove);
bool tryReserve(UInt64 bytes);
diff --git a/src/Disks/S3/DiskS3.cpp b/src/Disks/S3/DiskS3.cpp
index 6ed53bcb9c2..524a31a3bbb 100644
--- a/src/Disks/S3/DiskS3.cpp
+++ b/src/Disks/S3/DiskS3.cpp
@@ -764,7 +764,7 @@ void DiskS3::restore()
bool cleanup_s3 = information.source_bucket != bucket || information.source_path != remote_fs_root_path;
for (const auto & root : data_roots)
if (exists(root))
- removeSharedRecursive(root + '/', !cleanup_s3);
+ removeSharedRecursive(root + '/', !cleanup_s3, {});
restoreFiles(information);
restoreFileOperations(information);
diff --git a/src/Storages/MergeTree/DataPartsExchange.cpp b/src/Storages/MergeTree/DataPartsExchange.cpp
index 0dcccc33266..a74e4505789 100644
--- a/src/Storages/MergeTree/DataPartsExchange.cpp
+++ b/src/Storages/MergeTree/DataPartsExchange.cpp
@@ -826,7 +826,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToDiskRemoteMeta(
/// NOTE The is_cancelled flag also makes sense to check every time you read over the network,
/// performing a poll with a not very large timeout.
/// And now we check it only between read chunks (in the `copyData` function).
- disk->removeSharedRecursive(part_download_path, true);
+ disk->removeSharedRecursive(part_download_path, true, {});
throw Exception("Fetching of part was cancelled", ErrorCodes::ABORTED);
}
diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp
index 6e48d9c0277..13fd6d6d09e 100644
--- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp
+++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp
@@ -1526,11 +1526,11 @@ catch (...)
throw;
}
-std::pair<bool, std::vector<String>> IMergeTreeDataPart::canRemovePart() const
+std::pair<bool, NameSet> IMergeTreeDataPart::canRemovePart() const
{
/// NOTE: It's needed for zero-copy replication
if (force_keep_shared_data)
- return std::make_pair(false, std::vector<String>{});
+ return std::make_pair(false, NameSet{});
return storage.unlockSharedData(*this);
}
@@ -1595,7 +1595,7 @@ void IMergeTreeDataPart::remove() const
LOG_WARNING(storage.log, "Directory {} (to which part must be renamed before removing) already exists. Most likely this is due to unclean restart or race condition. Removing it.", fullPath(disk, to));
try
{
- disk->removeSharedRecursive(fs::path(to) / "", !can_remove);
+ disk->removeSharedRecursive(fs::path(to) / "", !can_remove, files_not_to_remove);
}
catch (...)
{
@@ -1630,7 +1630,7 @@ void IMergeTreeDataPart::remove() const
if (checksums.empty())
{
/// If the part is not completely written, we cannot use fast path by listing files.
- disk->removeSharedRecursive(fs::path(to) / "", !can_remove);
+ disk->removeSharedRecursive(fs::path(to) / "", !can_remove, files_not_to_remove);
}
else
{
@@ -1659,7 +1659,7 @@ void IMergeTreeDataPart::remove() const
request.emplace_back(fs::path(to) / DELETE_ON_DESTROY_MARKER_FILE_NAME, true);
request.emplace_back(fs::path(to) / TXN_VERSION_METADATA_FILE_NAME, true);
- disk->removeSharedFiles(request, !can_remove);
+ disk->removeSharedFiles(request, !can_remove, files_not_to_remove);
disk->removeDirectory(to);
}
catch (...)
@@ -1668,7 +1668,7 @@ void IMergeTreeDataPart::remove() const
LOG_ERROR(storage.log, "Cannot quickly remove directory {} by removing files; fallback to recursive removal. Reason: {}", fullPath(disk, to), getCurrentExceptionMessage(false));
- disk->removeSharedRecursive(fs::path(to) / "", !can_remove);
+ disk->removeSharedRecursive(fs::path(to) / "", !can_remove, files_not_to_remove);
}
}
}
@@ -1689,7 +1689,7 @@ void IMergeTreeDataPart::projectionRemove(const String & parent_to, bool keep_sh
"Cannot quickly remove directory {} by removing files; fallback to recursive removal. Reason: checksums.txt is missing",
fullPath(disk, to));
/// If the part is not completely written, we cannot use fast path by listing files.
- disk->removeSharedRecursive(fs::path(to) / "", keep_shared_data);
+ disk->removeSharedRecursive(fs::path(to) / "", keep_shared_data, {});
}
else
{
@@ -1713,8 +1713,8 @@ void IMergeTreeDataPart::projectionRemove(const String & parent_to, bool keep_sh
request.emplace_back(fs::path(to) / DEFAULT_COMPRESSION_CODEC_FILE_NAME, true);
request.emplace_back(fs::path(to) / DELETE_ON_DESTROY_MARKER_FILE_NAME, true);
- disk->removeSharedFiles(request, keep_shared_data);
- disk->removeSharedRecursive(to, keep_shared_data);
+ disk->removeSharedFiles(request, keep_shared_data, {});
+ disk->removeSharedRecursive(to, keep_shared_data, {});
}
catch (...)
{
@@ -1722,7 +1722,7 @@ void IMergeTreeDataPart::projectionRemove(const String & parent_to, bool keep_sh
LOG_ERROR(storage.log, "Cannot quickly remove directory {} by removing files; fallback to recursive removal. Reason: {}", fullPath(disk, to), getCurrentExceptionMessage(false));
- disk->removeSharedRecursive(fs::path(to) / "", keep_shared_data);
+ disk->removeSharedRecursive(fs::path(to) / "", keep_shared_data, {});
}
}
}
diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h
index 3597120a05b..c7e119388c5 100644
--- a/src/Storages/MergeTree/IMergeTreeDataPart.h
+++ b/src/Storages/MergeTree/IMergeTreeDataPart.h
@@ -334,7 +334,7 @@ public:
struct HardlinkedFiles
{
std::string source_part_name;
- std::vector<String> hardlinks_from_source_part;
+ NameSet hardlinks_from_source_part;
};
HardlinkedFiles hardlinked_files;
@@ -520,7 +520,7 @@ protected:
String getRelativePathForDetachedPart(const String & prefix) const;
- std::pair<bool, std::vector<String>> canRemovePart() const;
+ std::pair<bool, NameSet> canRemovePart() const;
void initializePartMetadataManager();
diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp
index de6658e536c..e47077baffa 100644
--- a/src/Storages/MergeTree/MergeTreeData.cpp
+++ b/src/Storages/MergeTree/MergeTreeData.cpp
@@ -5762,7 +5762,7 @@ MergeTreeData::MutableDataPartPtr MergeTreeData::cloneAndLoadDataPartOnSameDisk(
for (auto it = disk->iterateDirectory(src_part_path); it->isValid(); it->next())
{
if (it->name() != IMergeTreeDataPart::DELETE_ON_DESTROY_MARKER_FILE_NAME && it->name() != IMergeTreeDataPart::TXN_VERSION_METADATA_FILE_NAME)
- dst_data_part->hardlinked_files.hardlinks_from_source_part.push_back(it->name());
+ dst_data_part->hardlinked_files.hardlinks_from_source_part.insert(it->name());
}
}
diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h
index 25e0ee3e7c6..7d51a8fa37c 100644
--- a/src/Storages/MergeTree/MergeTreeData.h
+++ b/src/Storages/MergeTree/MergeTreeData.h
@@ -943,7 +943,7 @@ public:
/// Unlock shared data part in zookeeper
/// Overridden in StorageReplicatedMergeTree
- virtual std::pair<bool, std::vector<String>> unlockSharedData(const IMergeTreeDataPart &) const { return std::make_pair(true, std::vector<String>{}); }
+ virtual std::pair<bool, NameSet> unlockSharedData(const IMergeTreeDataPart &) const { return std::make_pair(true, NameSet{}); }
/// Fetch part only if some replica has it on shared storage like S3
/// Overridden in StorageReplicatedMergeTree
diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp
index c5e5338d6fb..5c960898e3c 100644
--- a/src/Storages/MergeTree/MutateTask.cpp
+++ b/src/Storages/MergeTree/MutateTask.cpp
@@ -1070,7 +1070,7 @@ private:
ctx->new_data_part->version.setCreationTID(tid, nullptr);
ctx->new_data_part->storeVersionMetadata();
- std::vector<String> hardlinked_files;
+ NameSet hardlinked_files;
/// Create hardlinks for unchanged files
for (auto it = ctx->disk->iterateDirectory(ctx->source_part->getFullRelativePath()); it->isValid(); it->next())
{
@@ -1101,7 +1101,7 @@ private:
if (!ctx->disk->isDirectory(it->path()))
{
ctx->disk->createHardLink(it->path(), destination);
- hardlinked_files.push_back(it->name());
+ hardlinked_files.insert(it->name());
}
else if (!endsWith(".tmp_proj", it->name())) // ignore projection tmp merge dir
@@ -1112,7 +1112,7 @@ private:
{
String p_destination = fs::path(destination) / p_it->name();
ctx->disk->createHardLink(p_it->path(), p_destination);
- hardlinked_files.push_back(p_it->name());
+ hardlinked_files.insert(p_it->name());
}
}
}
diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp
index 96ff6a41d65..4403fd2617f 100644
--- a/src/Storages/StorageReplicatedMergeTree.cpp
+++ b/src/Storages/StorageReplicatedMergeTree.cpp
@@ -7369,14 +7369,14 @@ void StorageReplicatedMergeTree::lockSharedData(const IMergeTreeDataPart & part,
}
}
-std::pair<bool, std::vector<String>> StorageReplicatedMergeTree::unlockSharedData(const IMergeTreeDataPart & part) const
+std::pair<bool, NameSet> StorageReplicatedMergeTree::unlockSharedData(const IMergeTreeDataPart & part) const
{
if (!part.volume || !part.isStoredOnDisk())
- return std::make_pair(true, std::vector<String>{});
+ return std::make_pair(true, NameSet{});
DiskPtr disk = part.volume->getDisk();
if (!disk || !disk->supportZeroCopyReplication())
- return std::make_pair(true, std::vector<String>{});
+ return std::make_pair(true, NameSet{});
/// If part is temporary refcount file may be absent
auto ref_count_path = fs::path(part.getFullRelativePath()) / IMergeTreeDataPart::FILE_FOR_REFERENCES_CHECK;
@@ -7384,19 +7384,19 @@ std::pair> StorageReplicatedMergeTree::unlockShar
{
auto ref_count = disk->getRefCount(ref_count_path);
if (ref_count > 0) /// Keep part shard info for frozen backups
- return std::make_pair(false, std::vector<String>{});
+ return std::make_pair(false, NameSet{});
}
else
{
/// Temporary part with some absent file cannot be locked in shared mode
- return std::make_pair(true, std::vector<String>{});
+ return std::make_pair(true, NameSet{});
}
return unlockSharedDataByID(part.getUniqueId(), getTableSharedID(), part.name, replica_name, disk, getZooKeeper(), *getSettings(), log,
zookeeper_path);
}
-std::pair<bool, std::vector<String>> StorageReplicatedMergeTree::unlockSharedDataByID(
+std::pair<bool, NameSet> StorageReplicatedMergeTree::unlockSharedDataByID(
String part_id, const String & table_uuid, const String & part_name,
const String & replica_name_, DiskPtr disk, zkutil::ZooKeeperPtr zookeeper_ptr, const MergeTreeSettings & settings,
Poco::Logger * logger, const String & zookeeper_path_old)
@@ -7406,13 +7406,14 @@ std::pair> StorageReplicatedMergeTree::unlockShar
Strings zc_zookeeper_paths = getZeroCopyPartPath(settings, disk->getType(), table_uuid, part_name, zookeeper_path_old);
bool part_has_no_more_locks = true;
- std::vector<String> files_not_to_remove;
+ NameSet files_not_to_remove;
for (const auto & zc_zookeeper_path : zc_zookeeper_paths)
{
files_not_to_remove.clear();
- auto content = zookeeper_ptr->get(zc_zookeeper_path);
- boost::split(files_not_to_remove, content, boost::is_any_of("\n "));
+ auto files_not_to_remove_str = zookeeper_ptr->get(zc_zookeeper_path);
+ boost::split(files_not_to_remove, files_not_to_remove_str, boost::is_any_of("\n "));
+
String zookeeper_part_uniq_node = fs::path(zc_zookeeper_path) / part_id;
/// Delete our replica node for part from zookeeper (we are not interested in it anymore)
@@ -7848,7 +7849,7 @@ bool StorageReplicatedMergeTree::createEmptyPartInsteadOfLost(zkutil::ZooKeeperP
}
-void StorageReplicatedMergeTree::createZeroCopyLockNode(const zkutil::ZooKeeperPtr & zookeeper, const String & zookeeper_node, int32_t mode, bool replace_existing_lock, const std::string & path_to_set, const std::vector<String> & hardlinked_files)
+void StorageReplicatedMergeTree::createZeroCopyLockNode(const zkutil::ZooKeeperPtr & zookeeper, const String & zookeeper_node, int32_t mode, bool replace_existing_lock, const std::string & path_to_set, const NameSet & hardlinked_files)
{
/// In rare case other replica can remove path between createAncestors and createIfNotExists
/// So we make up to 5 attempts
@@ -8025,6 +8026,7 @@ bool StorageReplicatedMergeTree::removeSharedDetachedPart(DiskPtr disk, const St
bool keep_shared = false;
zkutil::ZooKeeperPtr zookeeper = getZooKeeper();
+ NameSet files_not_to_remove;
fs::path checksums = fs::path(path) / IMergeTreeDataPart::FILE_FOR_REFERENCES_CHECK;
if (disk->exists(checksums))
@@ -8033,7 +8035,7 @@ bool StorageReplicatedMergeTree::removeSharedDetachedPart(DiskPtr disk, const St
{
String id = disk->getUniqueId(checksums);
bool can_remove = false;
- std::tie(can_remove, std::ignore) = StorageReplicatedMergeTree::unlockSharedDataByID(id, table_uuid, part_name,
+ std::tie(can_remove, files_not_to_remove) = StorageReplicatedMergeTree::unlockSharedDataByID(id, table_uuid, part_name,
detached_replica_name, disk, zookeeper, getContext()->getReplicatedMergeTreeSettings(), log,
detached_zookeeper_path);
@@ -8043,7 +8045,7 @@ bool StorageReplicatedMergeTree::removeSharedDetachedPart(DiskPtr disk, const St
keep_shared = true;
}
- disk->removeSharedRecursive(path, keep_shared);
+ disk->removeSharedRecursive(path, keep_shared, files_not_to_remove);
return keep_shared;
}
diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h
index 2fb2494ca64..10efd6982b5 100644
--- a/src/Storages/StorageReplicatedMergeTree.h
+++ b/src/Storages/StorageReplicatedMergeTree.h
@@ -236,12 +236,12 @@ public:
/// Unlock shared data part in zookeeper
/// Return true if data unlocked
/// Return false if data is still used by another node
- std::pair<bool, std::vector<String>> unlockSharedData(const IMergeTreeDataPart & part) const override;
+ std::pair<bool, NameSet> unlockSharedData(const IMergeTreeDataPart & part) const override;
/// Unlock shared data part in zookeeper by part id
/// Return true if data unlocked
/// Return false if data is still used by another node
- static std::pair<bool, std::vector<String>> unlockSharedDataByID(String part_id, const String & table_uuid, const String & part_name, const String & replica_name_,
+ static std::pair<bool, NameSet> unlockSharedDataByID(String part_id, const String & table_uuid, const String & part_name, const String & replica_name_,
DiskPtr disk, zkutil::ZooKeeperPtr zookeeper_, const MergeTreeSettings & settings, Poco::Logger * logger,
const String & zookeeper_path_old);
@@ -756,7 +756,7 @@ private:
static Strings getZeroCopyPartPath(const MergeTreeSettings & settings, DiskType disk_type, const String & table_uuid,
const String & part_name, const String & zookeeper_path_old);
- static void createZeroCopyLockNode(const zkutil::ZooKeeperPtr & zookeeper, const String & zookeeper_node, int32_t mode = zkutil::CreateMode::Persistent, bool replace_existing_lock = false, const std::string & path_to_set = "", const std::vector<String> & hardlinked_files = {});
+ static void createZeroCopyLockNode(const zkutil::ZooKeeperPtr & zookeeper, const String & zookeeper_node, int32_t mode = zkutil::CreateMode::Persistent, bool replace_existing_lock = false, const std::string & path_to_set = "", const NameSet & hardlinked_files = {});
bool removeDetachedPart(DiskPtr disk, const String & path, const String & part_name, bool is_freezed) override;
diff --git a/tests/queries/0_stateless/02067_lost_part_s3.reference b/tests/queries/0_stateless/02067_lost_part_s3.reference
new file mode 100644
index 00000000000..f6f7853d3b6
--- /dev/null
+++ b/tests/queries/0_stateless/02067_lost_part_s3.reference
@@ -0,0 +1,3 @@
+10000
+10000
+10000
diff --git a/tests/queries/0_stateless/02067_lost_part_s3.sql b/tests/queries/0_stateless/02067_lost_part_s3.sql
new file mode 100644
index 00000000000..f3f529a7052
--- /dev/null
+++ b/tests/queries/0_stateless/02067_lost_part_s3.sql
@@ -0,0 +1,26 @@
+drop table if exists partslost_0;
+drop table if exists partslost_1;
+drop table if exists partslost_2;
+
+CREATE TABLE partslost_0 (x String) ENGINE=ReplicatedMergeTree('/clickhouse/table/partslost/0', '0') ORDER BY tuple() SETTINGS min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0, old_parts_lifetime = 1, cleanup_delay_period = 1, cleanup_delay_period_random_add = 1;
+
+CREATE TABLE partslost_1 (x String) ENGINE=ReplicatedMergeTree('/clickhouse/table/partslost/0', '1') ORDER BY tuple() SETTINGS min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0, old_parts_lifetime = 1, cleanup_delay_period = 1, cleanup_delay_period_random_add = 1;
+
+CREATE TABLE partslost_2 (x String) ENGINE=ReplicatedMergeTree('/clickhouse/table/partslost/0', '2') ORDER BY tuple() SETTINGS min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0, old_parts_lifetime = 1, cleanup_delay_period = 1, cleanup_delay_period_random_add = 1;
+
+
+INSERT INTO partslost_0 SELECT toString(number) AS x from system.numbers LIMIT 10000;
+
+ALTER TABLE partslost_0 ADD INDEX idx x TYPE tokenbf_v1(285000, 3, 12345) GRANULARITY 3;
+ALTER TABLE partslost_0 MATERIALIZE INDEX idx;
+
+select sleep(3) FORMAT Null;
+select sleep(3) FORMAT Null;
+select sleep(3) FORMAT Null;
+select sleep(3) FORMAT Null;
+
+ALTER TABLE partslost_0 DROP INDEX idx;
+
+select count() from partslost_0;
+select count() from partslost_1;
+select count() from partslost_2;
From c81cb9e5632698d9bca5646f31093ea398fc8ea1 Mon Sep 17 00:00:00 2001
From: alesapin
Date: Tue, 19 Apr 2022 01:12:07 +0200
Subject: [PATCH 15/55] Better
---
src/Storages/StorageReplicatedMergeTree.cpp | 3 ++-
tests/queries/0_stateless/02067_lost_part_s3.sql | 13 ++++++++++---
2 files changed, 12 insertions(+), 4 deletions(-)
diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp
index 4403fd2617f..f4df673a483 100644
--- a/src/Storages/StorageReplicatedMergeTree.cpp
+++ b/src/Storages/StorageReplicatedMergeTree.cpp
@@ -7412,7 +7412,8 @@ std::pair StorageReplicatedMergeTree::unlockSharedDataByID(
{
files_not_to_remove.clear();
auto files_not_to_remove_str = zookeeper_ptr->get(zc_zookeeper_path);
- boost::split(files_not_to_remove, files_not_to_remove_str, boost::is_any_of("\n "));
+ if (!files_not_to_remove_str.empty())
+ boost::split(files_not_to_remove, files_not_to_remove_str, boost::is_any_of("\n "));
String zookeeper_part_uniq_node = fs::path(zc_zookeeper_path) / part_id;
diff --git a/tests/queries/0_stateless/02067_lost_part_s3.sql b/tests/queries/0_stateless/02067_lost_part_s3.sql
index f3f529a7052..ac02318c261 100644
--- a/tests/queries/0_stateless/02067_lost_part_s3.sql
+++ b/tests/queries/0_stateless/02067_lost_part_s3.sql
@@ -1,6 +1,6 @@
-drop table if exists partslost_0;
-drop table if exists partslost_1;
-drop table if exists partslost_2;
+DROP TABLE IF EXISTS partslost_0;
+DROP TABLE IF EXISTS partslost_1;
+DROP TABLE IF EXISTS partslost_2;
CREATE TABLE partslost_0 (x String) ENGINE=ReplicatedMergeTree('/clickhouse/table/partslost/0', '0') ORDER BY tuple() SETTINGS min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0, old_parts_lifetime = 1, cleanup_delay_period = 1, cleanup_delay_period_random_add = 1;
@@ -12,8 +12,11 @@ CREATE TABLE partslost_2 (x String) ENGINE=ReplicatedMergeTree('/clickhouse/tabl
INSERT INTO partslost_0 SELECT toString(number) AS x from system.numbers LIMIT 10000;
ALTER TABLE partslost_0 ADD INDEX idx x TYPE tokenbf_v1(285000, 3, 12345) GRANULARITY 3;
+
ALTER TABLE partslost_0 MATERIALIZE INDEX idx;
+-- FIXME
+select sleep(3) FORMAT Null;
select sleep(3) FORMAT Null;
select sleep(3) FORMAT Null;
select sleep(3) FORMAT Null;
@@ -24,3 +27,7 @@ ALTER TABLE partslost_0 DROP INDEX idx;
select count() from partslost_0;
select count() from partslost_1;
select count() from partslost_2;
+
+DROP TABLE IF EXISTS partslost_0;
+DROP TABLE IF EXISTS partslost_1;
+DROP TABLE IF EXISTS partslost_2;
From 8ade9fba529540c3246c855f427bb2be4268d8e1 Mon Sep 17 00:00:00 2001
From: alesapin
Date: Tue, 19 Apr 2022 11:56:14 +0200
Subject: [PATCH 16/55] Fix
---
src/Storages/MergeTree/MergeTreeData.cpp | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp
index e47077baffa..f18f5f147a7 100644
--- a/src/Storages/MergeTree/MergeTreeData.cpp
+++ b/src/Storages/MergeTree/MergeTreeData.cpp
@@ -1576,6 +1576,7 @@ MergeTreeData::DataPartsVector MergeTreeData::grabOldParts(bool force)
return res;
bool need_remove_parts_in_order = supportsReplication() && getSettings()->allow_remote_fs_zero_copy_replication;
+
time_t now = time(nullptr);
std::vector parts_to_delete;
@@ -1592,6 +1593,7 @@ MergeTreeData::DataPartsVector MergeTreeData::grabOldParts(bool force)
{
if (need_remove_parts_in_order)
break;
+
continue;
}
@@ -1614,7 +1616,9 @@ MergeTreeData::DataPartsVector MergeTreeData::grabOldParts(bool force)
parts_to_delete.emplace_back(it);
}
else if (need_remove_parts_in_order)
+ {
break;
+ }
}
res.reserve(parts_to_delete.size());
From ee8d26ff0e57427467bc5d4158b79b29932249a2 Mon Sep 17 00:00:00 2001
From: alesapin
Date: Tue, 19 Apr 2022 13:27:55 +0200
Subject: [PATCH 17/55] Better test and fix for move
---
src/Storages/MergeTree/MergeTreeData.cpp | 17 +++++++++++++++++
src/Storages/MergeTree/MergeTreePartsMover.cpp | 4 ++++
.../queries/0_stateless/02067_lost_part_s3.sql | 9 ++++-----
3 files changed, 25 insertions(+), 5 deletions(-)
diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp
index f18f5f147a7..6229d4bb08e 100644
--- a/src/Storages/MergeTree/MergeTreeData.cpp
+++ b/src/Storages/MergeTree/MergeTreeData.cpp
@@ -1576,6 +1576,19 @@ MergeTreeData::DataPartsVector MergeTreeData::grabOldParts(bool force)
return res;
bool need_remove_parts_in_order = supportsReplication() && getSettings()->allow_remote_fs_zero_copy_replication;
+ if (need_remove_parts_in_order)
+ {
+ bool has_zero_copy_disk = false;
+ for (const auto & disk : getDisks())
+ {
+ if (disk->supportZeroCopyReplication())
+ {
+ has_zero_copy_disk = true;
+ break;
+ }
+ }
+ need_remove_parts_in_order = has_zero_copy_disk;
+ }
time_t now = time(nullptr);
std::vector parts_to_delete;
@@ -3364,6 +3377,10 @@ void MergeTreeData::swapActivePart(MergeTreeData::DataPartPtr part_copy)
ssize_t diff_rows = part_copy->rows_count - original_active_part->rows_count;
increaseDataVolume(diff_bytes, diff_rows, /* parts= */ 0);
+ /// Move parts are non replicated operations, so we take lock here.
+ /// All other locks are taken in StorageReplicatedMergeTree
+ lockSharedData(*part_copy);
+
auto disk = original_active_part->volume->getDisk();
String marker_path = fs::path(original_active_part->getFullRelativePath()) / IMergeTreeDataPart::DELETE_ON_DESTROY_MARKER_FILE_NAME;
try
diff --git a/src/Storages/MergeTree/MergeTreePartsMover.cpp b/src/Storages/MergeTree/MergeTreePartsMover.cpp
index 83b58960ad1..bb625d74ead 100644
--- a/src/Storages/MergeTree/MergeTreePartsMover.cpp
+++ b/src/Storages/MergeTree/MergeTreePartsMover.cpp
@@ -214,10 +214,14 @@ MergeTreeData::DataPartPtr MergeTreePartsMover::clonePart(const MergeTreeMoveEnt
LOG_WARNING(log, "Path {} already exists. Will remove it and clone again.", fullPath(disk, path_to_clone + relative_path));
disk->removeRecursive(fs::path(path_to_clone) / relative_path / "");
}
+
disk->createDirectories(path_to_clone);
bool is_fetched = data->tryToFetchIfShared(*part, disk, fs::path(path_to_clone) / part->name);
if (!is_fetched)
+ {
+ LOG_INFO(log, "Part {} was not fetched, we are the first who move it to another disk, so we will copy it", part->name);
part->volume->getDisk()->copy(fs::path(data->getRelativeDataPath()) / relative_path / "", disk, path_to_clone);
+ }
part->volume->getDisk()->removeFileIfExists(fs::path(path_to_clone) / IMergeTreeDataPart::DELETE_ON_DESTROY_MARKER_FILE_NAME);
}
else
diff --git a/tests/queries/0_stateless/02067_lost_part_s3.sql b/tests/queries/0_stateless/02067_lost_part_s3.sql
index ac02318c261..ee3297331cd 100644
--- a/tests/queries/0_stateless/02067_lost_part_s3.sql
+++ b/tests/queries/0_stateless/02067_lost_part_s3.sql
@@ -2,11 +2,11 @@ DROP TABLE IF EXISTS partslost_0;
DROP TABLE IF EXISTS partslost_1;
DROP TABLE IF EXISTS partslost_2;
-CREATE TABLE partslost_0 (x String) ENGINE=ReplicatedMergeTree('/clickhouse/table/partslost/0', '0') ORDER BY tuple() SETTINGS min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0, old_parts_lifetime = 1, cleanup_delay_period = 1, cleanup_delay_period_random_add = 1;
+CREATE TABLE partslost_0 (x String) ENGINE=ReplicatedMergeTree('/clickhouse/table/{database}_02067_lost/partslost', '0') ORDER BY tuple() SETTINGS min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0, old_parts_lifetime = 1, cleanup_delay_period = 1, cleanup_delay_period_random_add = 1;
-CREATE TABLE partslost_1 (x String) ENGINE=ReplicatedMergeTree('/clickhouse/table/partslost/0', '1') ORDER BY tuple() SETTINGS min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0, old_parts_lifetime = 1, cleanup_delay_period = 1, cleanup_delay_period_random_add = 1;
+CREATE TABLE partslost_1 (x String) ENGINE=ReplicatedMergeTree('/clickhouse/table/{database}_02067_lost/partslost', '1') ORDER BY tuple() SETTINGS min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0, old_parts_lifetime = 1, cleanup_delay_period = 1, cleanup_delay_period_random_add = 1;
-CREATE TABLE partslost_2 (x String) ENGINE=ReplicatedMergeTree('/clickhouse/table/partslost/0', '2') ORDER BY tuple() SETTINGS min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0, old_parts_lifetime = 1, cleanup_delay_period = 1, cleanup_delay_period_random_add = 1;
+CREATE TABLE partslost_2 (x String) ENGINE=ReplicatedMergeTree('/clickhouse/table/{database}_02067_lost/partslost', '2') ORDER BY tuple() SETTINGS min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0, old_parts_lifetime = 1, cleanup_delay_period = 1, cleanup_delay_period_random_add = 1;
INSERT INTO partslost_0 SELECT toString(number) AS x from system.numbers LIMIT 10000;
@@ -15,8 +15,7 @@ ALTER TABLE partslost_0 ADD INDEX idx x TYPE tokenbf_v1(285000, 3, 12345) GRANUL
ALTER TABLE partslost_0 MATERIALIZE INDEX idx;
--- FIXME
-select sleep(3) FORMAT Null;
+-- In worst case doesn't check anything, but it's not flaky
select sleep(3) FORMAT Null;
select sleep(3) FORMAT Null;
select sleep(3) FORMAT Null;
From 2ad60301671cbd4be9e56eaaa03c94ba8ccfb4f5 Mon Sep 17 00:00:00 2001
From: Tyler Hannan
Date: Tue, 19 Apr 2022 13:42:17 +0200
Subject: [PATCH 18/55] add AWS m5a.4xlarge benchmark
---
website/benchmark/hardware/index.html | 1 +
.../hardware/results/aws_m5a_4xlarge.json | 55 +++++++++++++++++++
2 files changed, 56 insertions(+)
create mode 100644 website/benchmark/hardware/results/aws_m5a_4xlarge.json
diff --git a/website/benchmark/hardware/index.html b/website/benchmark/hardware/index.html
index 42c87c334c0..fb6a572f050 100644
--- a/website/benchmark/hardware/index.html
+++ b/website/benchmark/hardware/index.html
@@ -95,6 +95,7 @@ Results for AWS instance type im4gn.8xlarge are from Ananth Gundabattula
Results for AWS instance type im4gn.16xlarge are from Ananth Gundabattula (Darwinium).
Results for AWS instance type i3.2xlarge are from Ananth Gundabattula (Darwinium).
Results for 2x EPYC 7702 on ZFS mirror NVME are from Alibek A.
+Results for AWS instance type m5a.4xlarge are from Daniel Chimeno.
diff --git a/website/benchmark/hardware/results/aws_m5a_4xlarge.json b/website/benchmark/hardware/results/aws_m5a_4xlarge.json
new file mode 100644
index 00000000000..48d7168eec5
--- /dev/null
+++ b/website/benchmark/hardware/results/aws_m5a_4xlarge.json
@@ -0,0 +1,55 @@
+[
+ {
+ "system": "AWS m5a.4xlarge",
+ "system_full": "AWS m5a.4xlarge 16 vCPU, 512 GiB RAM, EBS GP3",
+ "time": "2020-08-13 00:00:00",
+ "kind": "cloud",
+ "result":
+ [
+ [0.002, 0.002, 0.002],
+ [0.028, 0.017, 0.016],
+ [0.074, 0.040, 0.042],
+ [0.758, 0.061, 0.062],
+ [1.380, 0.151, 0.147],
+ [2.204, 0.394, 0.385],
+ [0.002, 0.002, 0.002],
+ [0.019, 0.018, 0.018],
+ [1.385, 0.751, 0.740],
+ [1.754, 0.873, 0.870],
+ [0.817, 0.222, 0.218],
+ [1.219, 0.268, 0.263],
+ [2.303, 1.092, 1.057],
+ [3.892, 1.431, 1.420],
+ [2.388, 1.199, 1.188],
+ [1.439, 1.329, 1.339],
+ [5.108, 3.816, 3.799],
+ [3.549, 2.273, 2.271],
+ [9.976, 7.023, 7.043],
+ [0.706, 0.068, 0.066],
+ [19.187, 1.101, 1.076],
+ [21.210, 1.315, 1.276],
+ [39.984, 3.065, 3.043],
+ [32.102, 1.328, 1.295],
+ [4.599, 0.341, 0.343],
+ [1.762, 0.294, 0.287],
+ [4.786, 0.345, 0.338],
+ [18.596, 1.190, 1.175],
+ [15.549, 1.973, 1.931],
+ [3.643, 3.628, 3.625],
+ [3.658, 0.781, 0.777],
+ [9.786, 1.383, 1.381],
+ [11.912, 10.221, 10.227],
+ [20.664, 5.942, 5.954],
+ [20.707, 6.092, 5.920],
+ [1.847, 1.758, 1.756],
+ [0.292, 0.221, 0.216],
+ [0.109, 0.086, 0.085],
+ [0.118, 0.084, 0.074],
+ [0.542, 0.452, 0.445],
+ [0.060, 0.026, 0.025],
+ [0.031, 0.019, 0.020],
+ [0.007, 0.005, 0.007]
+ ]
+ }
+]
+
From cc06bc3d994268cd8731dd17ca310af144b2b846 Mon Sep 17 00:00:00 2001
From: alesapin
Date: Tue, 19 Apr 2022 14:01:30 +0200
Subject: [PATCH 19/55] Add some clarifications
---
src/Disks/DiskDecorator.cpp | 8 +++---
src/Disks/DiskDecorator.h | 4 +--
src/Disks/DiskEncrypted.h | 8 +++---
src/Disks/DiskRestartProxy.cpp | 8 +++---
src/Disks/DiskRestartProxy.h | 4 +--
src/Disks/IDisk.h | 9 ++++--
src/Disks/IDiskRemote.cpp | 8 +++---
src/Disks/IDiskRemote.h | 4 +--
src/Storages/MergeTree/IMergeTreeDataPart.cpp | 3 +-
src/Storages/MergeTree/IMergeTreeDataPart.h | 17 +++++++++++
src/Storages/MergeTree/MutateTask.cpp | 3 ++
src/Storages/StorageReplicatedMergeTree.cpp | 28 ++++++++++++-------
src/Storages/StorageReplicatedMergeTree.h | 5 +++-
13 files changed, 72 insertions(+), 37 deletions(-)
diff --git a/src/Disks/DiskDecorator.cpp b/src/Disks/DiskDecorator.cpp
index 44358847961..47a6bd92c3a 100644
--- a/src/Disks/DiskDecorator.cpp
+++ b/src/Disks/DiskDecorator.cpp
@@ -151,14 +151,14 @@ void DiskDecorator::removeSharedFile(const String & path, bool keep_s3)
delegate->removeSharedFile(path, keep_s3);
}
-void DiskDecorator::removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only)
+void DiskDecorator::removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only)
{
- delegate->removeSharedFiles(files, keep_all_batch_metadata, file_names_remove_metadata_only);
+ delegate->removeSharedFiles(files, keep_all_batch_data, file_names_remove_metadata_only);
}
-void DiskDecorator::removeSharedRecursive(const String & path, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only)
+void DiskDecorator::removeSharedRecursive(const String & path, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only)
{
- delegate->removeSharedRecursive(path, keep_all_batch_metadata, file_names_remove_metadata_only);
+ delegate->removeSharedRecursive(path, keep_all_batch_data, file_names_remove_metadata_only);
}
void DiskDecorator::setLastModified(const String & path, const Poco::Timestamp & timestamp)
diff --git a/src/Disks/DiskDecorator.h b/src/Disks/DiskDecorator.h
index 6c70e9a3571..27ddb001a83 100644
--- a/src/Disks/DiskDecorator.h
+++ b/src/Disks/DiskDecorator.h
@@ -52,8 +52,8 @@ public:
void removeDirectory(const String & path) override;
void removeRecursive(const String & path) override;
void removeSharedFile(const String & path, bool keep_s3) override;
- void removeSharedRecursive(const String & path, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only) override;
- void removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only) override;
+ void removeSharedRecursive(const String & path, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) override;
+ void removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) override;
void setLastModified(const String & path, const Poco::Timestamp & timestamp) override;
Poco::Timestamp getLastModified(const String & path) override;
void setReadOnly(const String & path) override;
diff --git a/src/Disks/DiskEncrypted.h b/src/Disks/DiskEncrypted.h
index 28156551e85..2f47dda59b3 100644
--- a/src/Disks/DiskEncrypted.h
+++ b/src/Disks/DiskEncrypted.h
@@ -159,18 +159,18 @@ public:
delegate->removeSharedFile(wrapped_path, flag);
}
- void removeSharedRecursive(const String & path, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only) override
+ void removeSharedRecursive(const String & path, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) override
{
auto wrapped_path = wrappedPath(path);
- delegate->removeSharedRecursive(wrapped_path, keep_all_batch_metadata, file_names_remove_metadata_only);
+ delegate->removeSharedRecursive(wrapped_path, keep_all_batch_data, file_names_remove_metadata_only);
}
- void removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only) override
+ void removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) override
{
for (const auto & file : files)
{
auto wrapped_path = wrappedPath(file.path);
- bool keep = keep_all_batch_metadata || file_names_remove_metadata_only.contains(fs::path(file.path).filename());
+ bool keep = keep_all_batch_data || file_names_remove_metadata_only.contains(fs::path(file.path).filename());
if (file.if_exists)
delegate->removeSharedFileIfExists(wrapped_path, keep);
else
diff --git a/src/Disks/DiskRestartProxy.cpp b/src/Disks/DiskRestartProxy.cpp
index ed0bc74b1a8..ba5d60901b3 100644
--- a/src/Disks/DiskRestartProxy.cpp
+++ b/src/Disks/DiskRestartProxy.cpp
@@ -251,16 +251,16 @@ void DiskRestartProxy::removeSharedFile(const String & path, bool keep_s3)
DiskDecorator::removeSharedFile(path, keep_s3);
}
-void DiskRestartProxy::removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only)
+void DiskRestartProxy::removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only)
{
ReadLock lock (mutex);
- DiskDecorator::removeSharedFiles(files, keep_all_batch_metadata, file_names_remove_metadata_only);
+ DiskDecorator::removeSharedFiles(files, keep_all_batch_data, file_names_remove_metadata_only);
}
-void DiskRestartProxy::removeSharedRecursive(const String & path, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only)
+void DiskRestartProxy::removeSharedRecursive(const String & path, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only)
{
ReadLock lock (mutex);
- DiskDecorator::removeSharedRecursive(path, keep_all_batch_metadata, file_names_remove_metadata_only);
+ DiskDecorator::removeSharedRecursive(path, keep_all_batch_data, file_names_remove_metadata_only);
}
void DiskRestartProxy::setLastModified(const String & path, const Poco::Timestamp & timestamp)
diff --git a/src/Disks/DiskRestartProxy.h b/src/Disks/DiskRestartProxy.h
index 03bdc047218..25b4787b482 100644
--- a/src/Disks/DiskRestartProxy.h
+++ b/src/Disks/DiskRestartProxy.h
@@ -54,8 +54,8 @@ public:
void removeDirectory(const String & path) override;
void removeRecursive(const String & path) override;
void removeSharedFile(const String & path, bool keep_s3) override;
- void removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only) override;
- void removeSharedRecursive(const String & path, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only) override;
+ void removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) override;
+ void removeSharedRecursive(const String & path, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) override;
void setLastModified(const String & path, const Poco::Timestamp & timestamp) override;
Poco::Timestamp getLastModified(const String & path) override;
void setReadOnly(const String & path) override;
diff --git a/src/Disks/IDisk.h b/src/Disks/IDisk.h
index 30b8ac2c44b..cc47c4262f5 100644
--- a/src/Disks/IDisk.h
+++ b/src/Disks/IDisk.h
@@ -196,7 +196,8 @@ public:
/// Remove file or directory with all children. Use with extra caution. Throws exception if file doesn't exists.
/// Differs from removeRecursive for S3/HDFS disks
- /// Second bool param is a flag to remove (true) or keep (false) shared data on S3
+ /// Second bool param is a flag to remove (true) or keep (false) shared data on S3.
+ /// Third param determines which files cannot be removed even if second is true.
virtual void removeSharedRecursive(const String & path, bool, const NameSet &) { removeRecursive(path); }
/// Remove file or directory if it exists.
@@ -237,11 +238,13 @@ public:
/// Batch request to remove multiple files.
/// May be much faster for blob storage.
- virtual void removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only)
+ /// Second bool param is a flag to remove (true) or keep (false) shared data on S3.
+ /// Third param determines which files cannot be removed even if second is true.
+ virtual void removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only)
{
for (const auto & file : files)
{
- bool keep_file = keep_all_batch_metadata || file_names_remove_metadata_only.contains(fs::path(file.path).filename());
+ bool keep_file = keep_all_batch_data || file_names_remove_metadata_only.contains(fs::path(file.path).filename());
if (file.if_exists)
removeSharedFileIfExists(file.path, keep_file);
else
diff --git a/src/Disks/IDiskRemote.cpp b/src/Disks/IDiskRemote.cpp
index a546ab56dbf..d155616acb5 100644
--- a/src/Disks/IDiskRemote.cpp
+++ b/src/Disks/IDiskRemote.cpp
@@ -501,7 +501,7 @@ void IDiskRemote::removeSharedFileIfExists(const String & path, bool delete_meta
}
}
-void IDiskRemote::removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only)
+void IDiskRemote::removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only)
{
std::unordered_map<String, std::vector<String>> paths_to_remove;
for (const auto & file : files)
@@ -511,7 +511,7 @@ void IDiskRemote::removeSharedFiles(const RemoveBatchRequest & files, bool keep_
removeMetadata(file.path, paths_to_remove);
}
- if (!keep_all_batch_metadata)
+ if (!keep_all_batch_data)
{
std::vector remove_from_remote;
for (auto && [path, remote_paths] : paths_to_remove)
@@ -523,12 +523,12 @@ void IDiskRemote::removeSharedFiles(const RemoveBatchRequest & files, bool keep_
}
}
-void IDiskRemote::removeSharedRecursive(const String & path, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only)
+void IDiskRemote::removeSharedRecursive(const String & path, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only)
{
std::unordered_map> paths_to_remove;
removeMetadataRecursive(path, paths_to_remove);
- if (!keep_all_batch_metadata)
+ if (!keep_all_batch_data)
{
std::vector remove_from_remote;
for (auto && [local_path, remote_paths] : paths_to_remove)
diff --git a/src/Disks/IDiskRemote.h b/src/Disks/IDiskRemote.h
index 901403f4689..abb311ac32c 100644
--- a/src/Disks/IDiskRemote.h
+++ b/src/Disks/IDiskRemote.h
@@ -114,9 +114,9 @@ public:
void removeSharedFileIfExists(const String & path, bool delete_metadata_only) override;
- void removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only) override;
+ void removeSharedFiles(const RemoveBatchRequest & files, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) override;
- void removeSharedRecursive(const String & path, bool keep_all_batch_metadata, const NameSet & file_names_remove_metadata_only) override;
+ void removeSharedRecursive(const String & path, bool keep_all_batch_data, const NameSet & file_names_remove_metadata_only) override;
void listFiles(const String & path, std::vector & file_names) override;
diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp
index 13fd6d6d09e..9fdf0c2cac7 100644
--- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp
+++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp
@@ -1622,6 +1622,8 @@ void IMergeTreeDataPart::remove() const
std::unordered_set projection_directories;
for (const auto & [p_name, projection_part] : projection_parts)
{
+ /// NOTE: projections currently unsupported with zero copy replication.
+ /// TODO: fix it.
projection_part->projectionRemove(to, !can_remove);
projection_directories.emplace(p_name + ".proj");
}
@@ -1665,7 +1667,6 @@ void IMergeTreeDataPart::remove() const
catch (...)
{
/// Recursive directory removal does many excessive "stat" syscalls under the hood.
-
LOG_ERROR(storage.log, "Cannot quickly remove directory {} by removing files; fallback to recursive removal. Reason: {}", fullPath(disk, to), getCurrentExceptionMessage(false));
disk->removeSharedRecursive(fs::path(to) / "", !can_remove, files_not_to_remove);
diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h
index c7e119388c5..cf869a8f34e 100644
--- a/src/Storages/MergeTree/IMergeTreeDataPart.h
+++ b/src/Storages/MergeTree/IMergeTreeDataPart.h
@@ -331,12 +331,19 @@ public:
mutable VersionMetadata version;
+ ///
struct HardlinkedFiles
{
+ /// Hardlinked from part
std::string source_part_name;
+ /// Hardlinked files list
NameSet hardlinks_from_source_part;
};
+ /// Files hardlinked from source part during mutation/backup
+ /// Need only for zero-copy replication implementation.
+ /// Don't use, it's only in memory and don't survive a server restart or table
+ /// DETACH/ATTACH.
HardlinkedFiles hardlinked_files;
/// For data in RAM ('index')
@@ -520,6 +527,16 @@ protected:
String getRelativePathForDetachedPart(const String & prefix) const;
+ /// Checks that part can be actually removed from disk.
+ /// In ordinary scenario always returns true, but in case of
+ /// zero-copy replication part can be held by some other replicas.
+ ///
+ /// If method returns false, then only metadata of part from
+ /// local storage can be removed, leaving data in remote FS untouched.
+ ///
+ /// If method returns true, then files can be actually removed from remote
+ /// storage, excluding files in the second returned argument.
+ /// They can be hardlinks to some newer parts.
std::pair<bool, NameSet> canRemovePart() const;
void initializePartMetadataManager();
diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp
index 5c960898e3c..df02e9fe6f9 100644
--- a/src/Storages/MergeTree/MutateTask.cpp
+++ b/src/Storages/MergeTree/MutateTask.cpp
@@ -1117,6 +1117,9 @@ private:
}
}
+ /// Tracking of hardlinked files required for zero-copy replication.
+ /// We don't remove them when we delete the last copy of source part because
+ /// the new part can use them.
ctx->new_data_part->hardlinked_files.source_part_name = ctx->source_part->name;
ctx->new_data_part->hardlinked_files.hardlinks_from_source_part = hardlinked_files;
diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp
index f4df673a483..a4f176a213e 100644
--- a/src/Storages/StorageReplicatedMergeTree.cpp
+++ b/src/Storages/StorageReplicatedMergeTree.cpp
@@ -7349,12 +7349,12 @@ void StorageReplicatedMergeTree::lockSharedData(const IMergeTreeDataPart & part,
*getSettings(), disk->getType(), getTableSharedID(),
part.name, zookeeper_path);
- String path_to_set;
+ String path_to_set_hardlinked_files;
if (!part.hardlinked_files.hardlinks_from_source_part.empty())
{
- path_to_set = getZeroCopyPartPath(
- *getSettings(), disk->getType(), getTableSharedID(),
- part.hardlinked_files.source_part_name, zookeeper_path)[0];
+ path_to_set_hardlinked_files = getZeroCopyPartPath(
+ *getSettings(), disk->getType(), getTableSharedID(),
+ part.hardlinked_files.source_part_name, zookeeper_path)[0];
}
for (const auto & zc_zookeeper_path : zc_zookeeper_paths)
@@ -7365,7 +7365,7 @@ void StorageReplicatedMergeTree::lockSharedData(const IMergeTreeDataPart & part,
createZeroCopyLockNode(
zookeeper, zookeeper_node, zkutil::CreateMode::Persistent,
- replace_existing_lock, path_to_set, part.hardlinked_files.hardlinks_from_source_part);
+ replace_existing_lock, path_to_set_hardlinked_files, part.hardlinked_files.hardlinks_from_source_part);
}
}
@@ -7850,7 +7850,9 @@ bool StorageReplicatedMergeTree::createEmptyPartInsteadOfLost(zkutil::ZooKeeperP
}
-void StorageReplicatedMergeTree::createZeroCopyLockNode(const zkutil::ZooKeeperPtr & zookeeper, const String & zookeeper_node, int32_t mode, bool replace_existing_lock, const std::string & path_to_set, const NameSet & hardlinked_files)
+void StorageReplicatedMergeTree::createZeroCopyLockNode(
+ const zkutil::ZooKeeperPtr & zookeeper, const String & zookeeper_node, int32_t mode,
+ bool replace_existing_lock, const String & path_to_set_hardlinked_files, const NameSet & hardlinked_files)
{
/// In rare case other replica can remove path between createAncestors and createIfNotExists
/// So we make up to 5 attempts
@@ -7870,10 +7872,13 @@ void StorageReplicatedMergeTree::createZeroCopyLockNode(const zkutil::ZooKeeperP
Coordination::Requests ops;
ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_node, -1));
ops.emplace_back(zkutil::makeCreateRequest(zookeeper_node, "", mode));
- if (!path_to_set.empty() && !hardlinked_files.empty())
+ if (!path_to_set_hardlinked_files.empty() && !hardlinked_files.empty())
{
std::string data = boost::algorithm::join(hardlinked_files, "\n");
- ops.emplace_back(zkutil::makeSetRequest(path_to_set, data, -1));
+ /// List of files used to detect hardlinks. path_to_set_hardlinked_files --
+ /// is a path to source part zero copy node. During part removal hardlinked
+ /// files will be left for source part.
+ ops.emplace_back(zkutil::makeSetRequest(path_to_set_hardlinked_files, data, -1));
}
Coordination::Responses responses;
auto error = zookeeper->tryMulti(ops, responses);
@@ -7883,10 +7888,13 @@ void StorageReplicatedMergeTree::createZeroCopyLockNode(const zkutil::ZooKeeperP
else
{
Coordination::Requests ops;
- if (!path_to_set.empty() && !hardlinked_files.empty())
+ if (!path_to_set_hardlinked_files.empty() && !hardlinked_files.empty())
{
std::string data = boost::algorithm::join(hardlinked_files, "\n");
- ops.emplace_back(zkutil::makeSetRequest(path_to_set, data, -1));
+ /// List of files used to detect hardlinks. path_to_set_hardlinked_files --
+ /// is a path to source part zero copy node. During part removal hardlinked
+ /// files will be left for source part.
+ ops.emplace_back(zkutil::makeSetRequest(path_to_set_hardlinked_files, data, -1));
}
ops.emplace_back(zkutil::makeCreateRequest(zookeeper_node, "", mode));
diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h
index 10efd6982b5..54d6977d115 100644
--- a/src/Storages/StorageReplicatedMergeTree.h
+++ b/src/Storages/StorageReplicatedMergeTree.h
@@ -756,7 +756,10 @@ private:
static Strings getZeroCopyPartPath(const MergeTreeSettings & settings, DiskType disk_type, const String & table_uuid,
const String & part_name, const String & zookeeper_path_old);
- static void createZeroCopyLockNode(const zkutil::ZooKeeperPtr & zookeeper, const String & zookeeper_node, int32_t mode = zkutil::CreateMode::Persistent, bool replace_existing_lock = false, const std::string & path_to_set = "", const NameSet & hardlinked_files = {});
+ static void createZeroCopyLockNode(
+ const zkutil::ZooKeeperPtr & zookeeper, const String & zookeeper_node,
+ int32_t mode = zkutil::CreateMode::Persistent, bool replace_existing_lock = false,
+ const String & path_to_set_hardlinked_files = "", const NameSet & hardlinked_files = {});
bool removeDetachedPart(DiskPtr disk, const String & path, const String & part_name, bool is_freezed) override;
From 7c2fcee3518ca8a3c12c8bf947e82a25ed2a4659 Mon Sep 17 00:00:00 2001
From: alesapin
Date: Tue, 19 Apr 2022 14:13:11 +0200
Subject: [PATCH 20/55] Simplify interface
---
src/Disks/IDiskRemote.cpp | 16 ++++++++--------
src/Disks/IDiskRemote.h | 2 +-
2 files changed, 9 insertions(+), 9 deletions(-)
diff --git a/src/Disks/IDiskRemote.cpp b/src/Disks/IDiskRemote.cpp
index d155616acb5..bc224904a5a 100644
--- a/src/Disks/IDiskRemote.cpp
+++ b/src/Disks/IDiskRemote.cpp
@@ -271,7 +271,7 @@ std::unordered_map IDiskRemote::getSerializedMetadata(const std:
return metadatas;
}
-void IDiskRemote::removeMetadata(const String & path, std::unordered_map<String, std::vector<String>> & paths_to_remove)
+void IDiskRemote::removeMetadata(const String & path, std::vector<String> & paths_to_remove)
{
LOG_TRACE(log, "Remove file by path: {}", backQuote(metadata_disk->getPath() + path));
@@ -289,7 +289,7 @@ void IDiskRemote::removeMetadata(const String & path, std::unordered_mapisFile(path))
{
- removeMetadata(path, paths_to_remove);
+ removeMetadata(path, paths_to_remove[path]);
}
else
{
@@ -483,21 +483,21 @@ void IDiskRemote::replaceFile(const String & from_path, const String & to_path)
void IDiskRemote::removeSharedFile(const String & path, bool delete_metadata_only)
{
- std::unordered_map<String, std::vector<String>> paths_to_remove;
+ std::vector<String> paths_to_remove;
removeMetadata(path, paths_to_remove);
if (!delete_metadata_only)
- removeFromRemoteFS(paths_to_remove[path]);
+ removeFromRemoteFS(paths_to_remove);
}
void IDiskRemote::removeSharedFileIfExists(const String & path, bool delete_metadata_only)
{
- std::unordered_map<String, std::vector<String>> paths_to_remove;
+ std::vector<String> paths_to_remove;
if (metadata_disk->exists(path))
{
removeMetadata(path, paths_to_remove);
if (!delete_metadata_only)
- removeFromRemoteFS(paths_to_remove[path]);
+ removeFromRemoteFS(paths_to_remove);
}
}
@@ -508,7 +508,7 @@ void IDiskRemote::removeSharedFiles(const RemoveBatchRequest & files, bool keep_
{
bool skip = file.if_exists && !metadata_disk->exists(file.path);
if (!skip)
- removeMetadata(file.path, paths_to_remove);
+ removeMetadata(file.path, paths_to_remove[file.path]);
}
if (!keep_all_batch_data)
diff --git a/src/Disks/IDiskRemote.h b/src/Disks/IDiskRemote.h
index abb311ac32c..b2251e1abe3 100644
--- a/src/Disks/IDiskRemote.h
+++ b/src/Disks/IDiskRemote.h
@@ -171,7 +171,7 @@ protected:
FileCachePtr cache;
private:
- void removeMetadata(const String & path, std::unordered_map<String, std::vector<String>> & paths_to_remove);
+ void removeMetadata(const String & path, std::vector<String> & paths_to_remove);
void removeMetadataRecursive(const String & path, std::unordered_map> & paths_to_remove);
From 7cb7c120cc1f70b91e1e41e8e91a878bdd240995 Mon Sep 17 00:00:00 2001
From: alesapin
Date: Tue, 19 Apr 2022 15:53:10 +0200
Subject: [PATCH 21/55] Less ugly
---
src/Storages/MergeTree/IMergeTreeDataPart.h | 15 -----
src/Storages/MergeTree/MergeTreeData.cpp | 12 ++--
src/Storages/MergeTree/MergeTreeData.h | 13 +++-
.../MergeTree/MutateFromLogEntryTask.cpp | 2 +-
src/Storages/MergeTree/MutateTask.cpp | 15 +++--
src/Storages/MergeTree/MutateTask.h | 3 +-
src/Storages/StorageMergeTree.cpp | 4 +-
src/Storages/StorageReplicatedMergeTree.cpp | 60 +++++++++++++++----
src/Storages/StorageReplicatedMergeTree.h | 4 +-
9 files changed, 83 insertions(+), 45 deletions(-)
diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h
index cf869a8f34e..2301bac55bd 100644
--- a/src/Storages/MergeTree/IMergeTreeDataPart.h
+++ b/src/Storages/MergeTree/IMergeTreeDataPart.h
@@ -331,21 +331,6 @@ public:
mutable VersionMetadata version;
- ///
- struct HardlinkedFiles
- {
- /// Hardlinked from part
- std::string source_part_name;
- /// Hardlinked files list
- NameSet hardlinks_from_source_part;
- };
-
- /// Files hardlinked from source part during mutation/backup
- /// Need only for zero-copy replication implementation.
- /// Don't use, it's only in memory and don't survive a server restart or table
- /// DETACH/ATTACH.
- HardlinkedFiles hardlinked_files;
-
/// For data in RAM ('index')
UInt64 getIndexSizeInBytes() const;
UInt64 getIndexSizeInAllocatedBytes() const;
diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp
index 6229d4bb08e..28221aac91e 100644
--- a/src/Storages/MergeTree/MergeTreeData.cpp
+++ b/src/Storages/MergeTree/MergeTreeData.cpp
@@ -5727,7 +5727,8 @@ MergeTreeData::MutableDataPartPtr MergeTreeData::cloneAndLoadDataPartOnSameDisk(
const String & tmp_part_prefix,
const MergeTreePartInfo & dst_part_info,
const StorageMetadataPtr & metadata_snapshot,
- const MergeTreeTransactionPtr & txn)
+ const MergeTreeTransactionPtr & txn,
+ HardlinkedFiles * hardlinked_files)
{
/// Check that the storage policy contains the disk where the src_part is located.
bool does_storage_policy_allow_same_disk = false;
@@ -5756,8 +5757,6 @@ MergeTreeData::MutableDataPartPtr MergeTreeData::cloneAndLoadDataPartOnSameDisk(
if (disk->exists(dst_part_path))
throw Exception("Part in " + fullPath(disk, dst_part_path) + " already exists", ErrorCodes::DIRECTORY_ALREADY_EXISTS);
-
- bool src_memory_part = false;
/// If source part is in memory, flush it to disk and clone it already in on-disk format
if (auto src_part_in_memory = asInMemoryPart(src_part))
{
@@ -5765,7 +5764,6 @@ MergeTreeData::MutableDataPartPtr MergeTreeData::cloneAndLoadDataPartOnSameDisk(
auto flushed_part_path = src_part_in_memory->getRelativePathForPrefix(tmp_part_prefix);
src_part_in_memory->flushToDisk(src_relative_data_path, flushed_part_path, metadata_snapshot);
src_part_path = fs::path(src_relative_data_path) / flushed_part_path / "";
- src_memory_part = true;
}
LOG_DEBUG(log, "Cloning part {} to {}", fullPath(disk, src_part_path), fullPath(disk, dst_part_path));
@@ -5776,14 +5774,14 @@ MergeTreeData::MutableDataPartPtr MergeTreeData::cloneAndLoadDataPartOnSameDisk(
auto single_disk_volume = std::make_shared(disk->getName(), disk, 0);
auto dst_data_part = createPart(dst_part_name, dst_part_info, single_disk_volume, tmp_dst_part_name);
- if (!src_memory_part && !asInMemoryPart(dst_data_part))
+ if (hardlinked_files)
{
- dst_data_part->hardlinked_files.source_part_name = src_part->name;
+ hardlinked_files->source_part_name = src_part->name;
for (auto it = disk->iterateDirectory(src_part_path); it->isValid(); it->next())
{
if (it->name() != IMergeTreeDataPart::DELETE_ON_DESTROY_MARKER_FILE_NAME && it->name() != IMergeTreeDataPart::TXN_VERSION_METADATA_FILE_NAME)
- dst_data_part->hardlinked_files.hardlinks_from_source_part.insert(it->name());
+ hardlinked_files->hardlinks_from_source_part.insert(it->name());
}
}
diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h
index 7d51a8fa37c..42d9d984be3 100644
--- a/src/Storages/MergeTree/MergeTreeData.h
+++ b/src/Storages/MergeTree/MergeTreeData.h
@@ -767,9 +767,17 @@ public:
MergeTreeData & checkStructureAndGetMergeTreeData(const StoragePtr & source_table, const StorageMetadataPtr & src_snapshot, const StorageMetadataPtr & my_snapshot) const;
MergeTreeData & checkStructureAndGetMergeTreeData(IStorage & source_table, const StorageMetadataPtr & src_snapshot, const StorageMetadataPtr & my_snapshot) const;
+ struct HardlinkedFiles
+ {
+ /// Hardlinked from part
+ std::string source_part_name;
+ /// Hardlinked files list
+ NameSet hardlinks_from_source_part;
+ };
+
MergeTreeData::MutableDataPartPtr cloneAndLoadDataPartOnSameDisk(
const MergeTreeData::DataPartPtr & src_part, const String & tmp_part_prefix, const MergeTreePartInfo & dst_part_info,
- const StorageMetadataPtr & metadata_snapshot, const MergeTreeTransactionPtr & txn);
+ const StorageMetadataPtr & metadata_snapshot, const MergeTreeTransactionPtr & txn, HardlinkedFiles * hardlinked_files);
virtual std::vector getMutationsStatus() const = 0;
@@ -937,9 +945,10 @@ public:
bool scheduleDataMovingJob(BackgroundJobsAssignee & assignee);
bool areBackgroundMovesNeeded() const;
+
/// Lock part in zookeeper for shared data in several nodes
/// Overridden in StorageReplicatedMergeTree
- virtual void lockSharedData(const IMergeTreeDataPart &, bool = false) const {} /// NOLINT
+ virtual void lockSharedData(const IMergeTreeDataPart &, bool = false, std::optional = {}) const {} /// NOLINT
/// Unlock shared data part in zookeeper
/// Overridden in StorageReplicatedMergeTree
diff --git a/src/Storages/MergeTree/MutateFromLogEntryTask.cpp b/src/Storages/MergeTree/MutateFromLogEntryTask.cpp
index de31fbe3c56..554053c1bd3 100644
--- a/src/Storages/MergeTree/MutateFromLogEntryTask.cpp
+++ b/src/Storages/MergeTree/MutateFromLogEntryTask.cpp
@@ -175,7 +175,7 @@ bool MutateFromLogEntryTask::finalize(ReplicatedMergeMutateTaskBase::PartLogWrit
try
{
- storage.checkPartChecksumsAndCommit(*transaction_ptr, new_part);
+ storage.checkPartChecksumsAndCommit(*transaction_ptr, new_part, mutate_task->getHardlinkedFiles());
}
catch (const Exception & e)
{
diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp
index df02e9fe6f9..83502f1031d 100644
--- a/src/Storages/MergeTree/MutateTask.cpp
+++ b/src/Storages/MergeTree/MutateTask.cpp
@@ -548,6 +548,8 @@ struct MutationContext
ExecuteTTLType execute_ttl_type{ExecuteTTLType::NONE};
MergeTreeTransactionPtr txn;
+
+ MergeTreeData::HardlinkedFiles hardlinked_files;
};
using MutationContextPtr = std::shared_ptr;
@@ -1120,8 +1122,8 @@ private:
/// Traking of hardlinked files required for zero-copy replication.
/// We don't remove them when we delete last copy of source part because
/// new part can use them.
- ctx->new_data_part->hardlinked_files.source_part_name = ctx->source_part->name;
- ctx->new_data_part->hardlinked_files.hardlinks_from_source_part = hardlinked_files;
+ ctx->hardlinked_files.source_part_name = ctx->source_part->name;
+ ctx->hardlinked_files.hardlinks_from_source_part = hardlinked_files;
(*ctx->mutate_entry)->columns_written = ctx->storage_columns.size() - ctx->updated_header.columns();
@@ -1297,7 +1299,7 @@ bool MutateTask::prepare()
storage_from_source_part, ctx->metadata_snapshot, ctx->commands_for_part, Context::createCopy(context_for_reading)))
{
LOG_TRACE(ctx->log, "Part {} doesn't change up to mutation version {}", ctx->source_part->name, ctx->future_part->part_info.mutation);
- promise.set_value(ctx->data->cloneAndLoadDataPartOnSameDisk(ctx->source_part, "tmp_clone_", ctx->future_part->part_info, ctx->metadata_snapshot, ctx->txn));
+ promise.set_value(ctx->data->cloneAndLoadDataPartOnSameDisk(ctx->source_part, "tmp_clone_", ctx->future_part->part_info, ctx->metadata_snapshot, ctx->txn, &ctx->hardlinked_files));
return false;
}
else
@@ -1388,7 +1390,7 @@ bool MutateTask::prepare()
&& ctx->files_to_rename.empty())
{
LOG_TRACE(ctx->log, "Part {} doesn't change up to mutation version {} (optimized)", ctx->source_part->name, ctx->future_part->part_info.mutation);
- promise.set_value(ctx->data->cloneAndLoadDataPartOnSameDisk(ctx->source_part, "tmp_mut_", ctx->future_part->part_info, ctx->metadata_snapshot, ctx->txn));
+ promise.set_value(ctx->data->cloneAndLoadDataPartOnSameDisk(ctx->source_part, "tmp_mut_", ctx->future_part->part_info, ctx->metadata_snapshot, ctx->txn, &ctx->hardlinked_files));
return false;
}
@@ -1398,5 +1400,10 @@ bool MutateTask::prepare()
return true;
}
+const MergeTreeData::HardlinkedFiles & MutateTask::getHardlinkedFiles() const
+{
+ return ctx->hardlinked_files;
+}
+
}
diff --git a/src/Storages/MergeTree/MutateTask.h b/src/Storages/MergeTree/MutateTask.h
index aa38ee34b4a..40447cc63c8 100644
--- a/src/Storages/MergeTree/MutateTask.h
+++ b/src/Storages/MergeTree/MutateTask.h
@@ -44,6 +44,8 @@ public:
return promise.get_future();
}
+ const MergeTreeData::HardlinkedFiles & getHardlinkedFiles() const;
+
private:
bool prepare();
@@ -56,7 +58,6 @@ private:
State state{State::NEED_PREPARE};
-
std::promise promise;
std::shared_ptr ctx;
diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp
index e322d8785fa..50c0c083c99 100644
--- a/src/Storages/StorageMergeTree.cpp
+++ b/src/Storages/StorageMergeTree.cpp
@@ -1583,7 +1583,7 @@ void StorageMergeTree::replacePartitionFrom(const StoragePtr & source_table, con
Int64 temp_index = insert_increment.get();
MergeTreePartInfo dst_part_info(partition_id, temp_index, temp_index, src_part->info.level);
- auto dst_part = cloneAndLoadDataPartOnSameDisk(src_part, TMP_PREFIX, dst_part_info, my_metadata_snapshot, local_context->getCurrentTransaction());
+ auto dst_part = cloneAndLoadDataPartOnSameDisk(src_part, TMP_PREFIX, dst_part_info, my_metadata_snapshot, local_context->getCurrentTransaction(), {});
dst_parts.emplace_back(std::move(dst_part));
}
@@ -1669,7 +1669,7 @@ void StorageMergeTree::movePartitionToTable(const StoragePtr & dest_table, const
Int64 temp_index = insert_increment.get();
MergeTreePartInfo dst_part_info(partition_id, temp_index, temp_index, src_part->info.level);
- auto dst_part = dest_table_storage->cloneAndLoadDataPartOnSameDisk(src_part, TMP_PREFIX, dst_part_info, dest_metadata_snapshot, local_context->getCurrentTransaction());
+ auto dst_part = dest_table_storage->cloneAndLoadDataPartOnSameDisk(src_part, TMP_PREFIX, dst_part_info, dest_metadata_snapshot, local_context->getCurrentTransaction(), {});
dst_parts.emplace_back(std::move(dst_part));
}
diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp
index a4f176a213e..2389746c90b 100644
--- a/src/Storages/StorageReplicatedMergeTree.cpp
+++ b/src/Storages/StorageReplicatedMergeTree.cpp
@@ -1447,7 +1447,7 @@ void StorageReplicatedMergeTree::checkPartChecksumsAndAddCommitOps(const zkutil:
}
MergeTreeData::DataPartsVector StorageReplicatedMergeTree::checkPartChecksumsAndCommit(Transaction & transaction,
- const DataPartPtr & part)
+ const DataPartPtr & part, std::optional hardlinked_files)
{
auto zookeeper = getZooKeeper();
@@ -1457,7 +1457,7 @@ MergeTreeData::DataPartsVector StorageReplicatedMergeTree::checkPartChecksumsAnd
Coordination::Requests ops;
NameSet absent_part_paths_on_replicas;
- lockSharedData(*part, false);
+ lockSharedData(*part, false, hardlinked_files);
/// Checksums are checked here and `ops` is filled. In fact, the part is added to ZK just below, when executing `multi`.
checkPartChecksumsAndAddCommitOps(zookeeper, part, ops, part->name, &absent_part_paths_on_replicas);
@@ -1992,6 +1992,8 @@ bool StorageReplicatedMergeTree::executeReplaceRange(const LogEntry & entry)
/// A replica that will be used to fetch part
String replica;
+
+ MergeTreeData::HardlinkedFiles hardlinked_files;
};
using PartDescriptionPtr = std::shared_ptr;
@@ -2197,6 +2199,7 @@ bool StorageReplicatedMergeTree::executeReplaceRange(const LogEntry & entry)
static const String TMP_PREFIX = "tmp_replace_from_";
+ std::vector hardlinked_files_for_parts;
auto obtain_part = [&] (PartDescriptionPtr & part_desc)
{
if (part_desc->src_table_part)
@@ -2206,7 +2209,7 @@ bool StorageReplicatedMergeTree::executeReplaceRange(const LogEntry & entry)
throw Exception("Checksums of " + part_desc->src_table_part->name + " is suddenly changed", ErrorCodes::UNFINISHED);
part_desc->res_part = cloneAndLoadDataPartOnSameDisk(
- part_desc->src_table_part, TMP_PREFIX + "clone_", part_desc->new_part_info, metadata_snapshot, NO_TRANSACTION_PTR);
+ part_desc->src_table_part, TMP_PREFIX + "clone_", part_desc->new_part_info, metadata_snapshot, NO_TRANSACTION_PTR, &part_desc->hardlinked_files);
}
else if (!part_desc->replica.empty())
{
@@ -2253,8 +2256,11 @@ bool StorageReplicatedMergeTree::executeReplaceRange(const LogEntry & entry)
{
renameTempPartAndReplace(part_desc->res_part, NO_TRANSACTION_RAW, nullptr, &transaction);
getCommitPartOps(ops, part_desc->res_part);
+
+ lockSharedData(*part_desc->res_part, false, part_desc->hardlinked_files);
}
+
if (!ops.empty())
zookeeper->multi(ops);
@@ -2281,6 +2287,10 @@ bool StorageReplicatedMergeTree::executeReplaceRange(const LogEntry & entry)
catch (...)
{
PartLog::addNewParts(getContext(), res_parts, watch.elapsed(), ExecutionStatus::fromCurrentException());
+
+ for (const auto & res_part : res_parts)
+ unlockSharedData(*res_part);
+
throw;
}
@@ -3933,12 +3943,13 @@ bool StorageReplicatedMergeTree::fetchPart(const String & part_name, const Stora
InterserverCredentialsPtr credentials;
std::optional tagger_ptr;
std::function get_part;
+ MergeTreeData::HardlinkedFiles hardlinked_files;
if (part_to_clone)
{
get_part = [&, part_to_clone]()
{
- return cloneAndLoadDataPartOnSameDisk(part_to_clone, "tmp_clone_", part_info, metadata_snapshot, NO_TRANSACTION_PTR);
+ return cloneAndLoadDataPartOnSameDisk(part_to_clone, "tmp_clone_", part_info, metadata_snapshot, NO_TRANSACTION_PTR, &hardlinked_files);
};
}
else
@@ -3984,7 +3995,7 @@ bool StorageReplicatedMergeTree::fetchPart(const String & part_name, const Stora
Transaction transaction(*this, NO_TRANSACTION_RAW);
renameTempPartAndReplace(part, NO_TRANSACTION_RAW, nullptr, &transaction);
- replaced_parts = checkPartChecksumsAndCommit(transaction, part);
+ replaced_parts = checkPartChecksumsAndCommit(transaction, part, hardlinked_files);
/** If a quorum is tracked for this part, you must update it.
* If you do not have time, in case of losing the session, when you restart the server - see the `ReplicatedMergeTreeRestartingThread::updateQuorumIfWeHavePart` method.
@@ -6381,6 +6392,7 @@ void StorageReplicatedMergeTree::replacePartitionFrom(
assert(replace == !LogEntry::ReplaceRangeEntry::isMovePartitionOrAttachFrom(drop_range));
String drop_range_fake_part_name = getPartNamePossiblyFake(format_version, drop_range);
+ std::vector hardlinked_files_for_parts;
for (const auto & src_part : src_all_parts)
{
@@ -6411,13 +6423,16 @@ void StorageReplicatedMergeTree::replacePartitionFrom(
UInt64 index = lock->getNumber();
MergeTreePartInfo dst_part_info(partition_id, index, index, src_part->info.level);
- auto dst_part = cloneAndLoadDataPartOnSameDisk(src_part, TMP_PREFIX, dst_part_info, metadata_snapshot, NO_TRANSACTION_PTR);
+ MergeTreeData::HardlinkedFiles hardlinked_files;
+
+ auto dst_part = cloneAndLoadDataPartOnSameDisk(src_part, TMP_PREFIX, dst_part_info, metadata_snapshot, NO_TRANSACTION_PTR, &hardlinked_files);
src_parts.emplace_back(src_part);
dst_parts.emplace_back(dst_part);
ephemeral_locks.emplace_back(std::move(*lock));
block_id_paths.emplace_back(block_id_path);
part_checksums.emplace_back(hash_hex);
+ hardlinked_files_for_parts.emplace_back(hardlinked_files);
}
ReplicatedMergeTreeLogEntryData entry;
@@ -6475,6 +6490,9 @@ void StorageReplicatedMergeTree::replacePartitionFrom(
renameTempPartAndReplace(part, query_context->getCurrentTransaction().get(), nullptr, &transaction, data_parts_lock);
}
+ for (size_t i = 0; i < dst_parts.size(); ++i)
+ lockSharedData(*dst_parts[i], false, hardlinked_files_for_parts[i]);
+
Coordination::Error code = zookeeper->tryMulti(ops, op_results);
if (code == Coordination::Error::ZOK)
delimiting_block_lock->assumeUnlocked();
@@ -6502,6 +6520,9 @@ void StorageReplicatedMergeTree::replacePartitionFrom(
catch (...)
{
PartLog::addNewParts(getContext(), dst_parts, watch.elapsed(), ExecutionStatus::fromCurrentException());
+ for (const auto & dst_part : dst_parts)
+ unlockSharedData(*dst_part);
+
throw;
}
@@ -6603,6 +6624,8 @@ void StorageReplicatedMergeTree::movePartitionToTable(const StoragePtr & dest_ta
String dest_alter_partition_version_path = dest_table_storage->zookeeper_path + "/alter_partition_version";
Coordination::Stat dest_alter_partition_version_stat;
zookeeper->get(dest_alter_partition_version_path, &dest_alter_partition_version_stat);
+ std::vector hardlinked_files_for_parts;
+
for (const auto & src_part : src_all_parts)
{
if (!dest_table_storage->canReplacePartition(src_part))
@@ -6622,13 +6645,16 @@ void StorageReplicatedMergeTree::movePartitionToTable(const StoragePtr & dest_ta
UInt64 index = lock->getNumber();
MergeTreePartInfo dst_part_info(partition_id, index, index, src_part->info.level);
- auto dst_part = dest_table_storage->cloneAndLoadDataPartOnSameDisk(src_part, TMP_PREFIX, dst_part_info, dest_metadata_snapshot, NO_TRANSACTION_PTR);
+
+ MergeTreeData::HardlinkedFiles hardlinked_files;
+ auto dst_part = dest_table_storage->cloneAndLoadDataPartOnSameDisk(src_part, TMP_PREFIX, dst_part_info, dest_metadata_snapshot, NO_TRANSACTION_PTR, &hardlinked_files);
src_parts.emplace_back(src_part);
dst_parts.emplace_back(dst_part);
ephemeral_locks.emplace_back(std::move(*lock));
block_id_paths.emplace_back(block_id_path);
part_checksums.emplace_back(hash_hex);
+ hardlinked_files_for_parts.emplace_back(hardlinked_files);
}
ReplicatedMergeTreeLogEntryData entry_delete;
@@ -6695,6 +6721,9 @@ void StorageReplicatedMergeTree::movePartitionToTable(const StoragePtr & dest_ta
for (MutableDataPartPtr & part : dst_parts)
dest_table_storage->renameTempPartAndReplace(part, query_context->getCurrentTransaction().get(), nullptr, &transaction, lock);
+ for (size_t i = 0; i < dst_parts.size(); ++i)
+ lockSharedData(*dst_parts[i], false, hardlinked_files_for_parts[i]);
+
Coordination::Error code = zookeeper->tryMulti(ops, op_results);
if (code == Coordination::Error::ZBADVERSION)
continue;
@@ -6710,6 +6739,10 @@ void StorageReplicatedMergeTree::movePartitionToTable(const StoragePtr & dest_ta
catch (...)
{
PartLog::addNewParts(getContext(), dst_parts, watch.elapsed(), ExecutionStatus::fromCurrentException());
+
+ for (const auto & dst_part : dst_parts)
+ unlockSharedData(*dst_part);
+
throw;
}
@@ -7327,7 +7360,7 @@ void StorageReplicatedMergeTree::lockSharedDataTemporary(const String & part_nam
}
}
-void StorageReplicatedMergeTree::lockSharedData(const IMergeTreeDataPart & part, bool replace_existing_lock) const
+void StorageReplicatedMergeTree::lockSharedData(const IMergeTreeDataPart & part, bool replace_existing_lock, std::optional hardlinked_files) const
{
auto settings = getSettings();
@@ -7350,11 +7383,16 @@ void StorageReplicatedMergeTree::lockSharedData(const IMergeTreeDataPart & part,
part.name, zookeeper_path);
String path_to_set_hardlinked_files;
- if (!part.hardlinked_files.hardlinks_from_source_part.empty())
+ NameSet hardlinks;
+
+ if (hardlinked_files.has_value() && !hardlinked_files->hardlinks_from_source_part.empty())
{
+ LOG_DEBUG(&Poco::Logger::get("DEBUG"), "ADDING SOME HARDLINKS {}", fmt::join(hardlinked_files->hardlinks_from_source_part, ", "));
path_to_set_hardlinked_files = getZeroCopyPartPath(
*getSettings(), disk->getType(), getTableSharedID(),
- part.hardlinked_files.source_part_name, zookeeper_path)[0];
+ hardlinked_files->source_part_name, zookeeper_path)[0];
+
+ hardlinks = hardlinked_files->hardlinks_from_source_part;
}
for (const auto & zc_zookeeper_path : zc_zookeeper_paths)
@@ -7365,7 +7403,7 @@ void StorageReplicatedMergeTree::lockSharedData(const IMergeTreeDataPart & part,
createZeroCopyLockNode(
zookeeper, zookeeper_node, zkutil::CreateMode::Persistent,
- replace_existing_lock, path_to_set_hardlinked_files, part.hardlinked_files.hardlinks_from_source_part);
+ replace_existing_lock, path_to_set_hardlinked_files, hardlinks);
}
}
diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h
index 54d6977d115..b6a003280f1 100644
--- a/src/Storages/StorageReplicatedMergeTree.h
+++ b/src/Storages/StorageReplicatedMergeTree.h
@@ -229,7 +229,7 @@ public:
bool executeFetchShared(const String & source_replica, const String & new_part_name, const DiskPtr & disk, const String & path);
/// Lock part in zookeeper for use shared data in several nodes
- void lockSharedData(const IMergeTreeDataPart & part, bool replace_existing_lock) const override;
+ void lockSharedData(const IMergeTreeDataPart & part, bool replace_existing_lock, std::optional hardlinked_files = {}) const override;
void lockSharedDataTemporary(const String & part_name, const String & part_id, const DiskPtr & disk) const;
@@ -469,7 +469,7 @@ private:
String getChecksumsForZooKeeper(const MergeTreeDataPartChecksums & checksums) const;
/// Accepts a PreActive part, atomically checks its checksums with ones on other replicas and commit the part
- DataPartsVector checkPartChecksumsAndCommit(Transaction & transaction, const DataPartPtr & part);
+ DataPartsVector checkPartChecksumsAndCommit(Transaction & transaction, const DataPartPtr & part, std::optional hardlinked_files = {});
bool partIsAssignedToBackgroundOperation(const DataPartPtr & part) const override;
From 7919485db2357d5c3e8a6e8252842d60982e4531 Mon Sep 17 00:00:00 2001
From: alesapin
Date: Tue, 19 Apr 2022 15:57:25 +0200
Subject: [PATCH 22/55] Fix
---
src/Storages/StorageReplicatedMergeTree.cpp | 1 -
1 file changed, 1 deletion(-)
diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp
index 2389746c90b..1487d165046 100644
--- a/src/Storages/StorageReplicatedMergeTree.cpp
+++ b/src/Storages/StorageReplicatedMergeTree.cpp
@@ -7387,7 +7387,6 @@ void StorageReplicatedMergeTree::lockSharedData(const IMergeTreeDataPart & part,
if (hardlinked_files.has_value() && !hardlinked_files->hardlinks_from_source_part.empty())
{
- LOG_DEBUG(&Poco::Logger::get("DEBUG"), "ADDING SOME HARDLINKS {}", fmt::join(hardlinked_files->hardlinks_from_source_part, ", "));
path_to_set_hardlinked_files = getZeroCopyPartPath(
*getSettings(), disk->getType(), getTableSharedID(),
hardlinked_files->source_part_name, zookeeper_path)[0];
From aea7c7755ad8d9e6752187d8b6ec64b656b974d3 Mon Sep 17 00:00:00 2001
From: alesapin
Date: Tue, 19 Apr 2022 17:25:08 +0200
Subject: [PATCH 23/55] Fix style
---
src/Storages/MergeTree/MutateTask.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp
index 83502f1031d..bf1d034263d 100644
--- a/src/Storages/MergeTree/MutateTask.cpp
+++ b/src/Storages/MergeTree/MutateTask.cpp
@@ -1119,7 +1119,7 @@ private:
}
}
- /// Traking of hardlinked files required for zero-copy replication.
+ /// Tracking of hardlinked files required for zero-copy replication.
/// We don't remove them when we delete last copy of source part because
/// new part can use them.
ctx->hardlinked_files.source_part_name = ctx->source_part->name;
From 033a3475a57172aa2d3677dc93af24421897aa74 Mon Sep 17 00:00:00 2001
From: alesapin
Date: Tue, 19 Apr 2022 17:57:12 +0200
Subject: [PATCH 24/55] Update src/Disks/IDiskRemote.cpp
---
src/Disks/IDiskRemote.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/Disks/IDiskRemote.cpp b/src/Disks/IDiskRemote.cpp
index bc224904a5a..e4a610180d0 100644
--- a/src/Disks/IDiskRemote.cpp
+++ b/src/Disks/IDiskRemote.cpp
@@ -283,7 +283,7 @@ void IDiskRemote::removeMetadata(const String & path, std::vector & path
try
{
- auto metadata_updater = [&paths_to_remove, path, this] (Metadata & metadata)
+ auto metadata_updater = [&paths_to_remove, this] (Metadata & metadata)
{
if (metadata.ref_count == 0)
{
From 158a25d5fabe422f72214901b244b851d6a3403e Mon Sep 17 00:00:00 2001
From: Amos Bird
Date: Wed, 20 Apr 2022 01:22:04 +0800
Subject: [PATCH 25/55] Fix column matcher and column transformer
---
.../JoinToSubqueryTransformVisitor.cpp | 47 +++++--
src/Interpreters/PredicateRewriteVisitor.cpp | 4 +-
.../TranslateQualifiedNamesVisitor.cpp | 62 ++++-----
src/Parsers/ASTAsterisk.cpp | 2 +
src/Parsers/ASTColumnsMatcher.cpp | 93 +++++++++++---
src/Parsers/ASTColumnsMatcher.h | 32 ++---
src/Parsers/ASTColumnsTransformers.cpp | 119 +++++++++++++++++-
src/Parsers/ASTColumnsTransformers.h | 11 +-
src/Parsers/ASTQualifiedAsterisk.cpp | 2 +
src/Parsers/ExpressionElementParsers.cpp | 12 +-
...n_matcher_and_column_transformer.reference | 0
..._column_matcher_and_column_transformer.sql | 64 ++++++++++
12 files changed, 358 insertions(+), 90 deletions(-)
create mode 100644 tests/queries/0_stateless/02271_fix_column_matcher_and_column_transformer.reference
create mode 100644 tests/queries/0_stateless/02271_fix_column_matcher_and_column_transformer.sql
diff --git a/src/Interpreters/JoinToSubqueryTransformVisitor.cpp b/src/Interpreters/JoinToSubqueryTransformVisitor.cpp
index c43302e0de9..a6e43993196 100644
--- a/src/Interpreters/JoinToSubqueryTransformVisitor.cpp
+++ b/src/Interpreters/JoinToSubqueryTransformVisitor.cpp
@@ -12,6 +12,7 @@
#include
#include
#include
+#include
#include
#include
#include
@@ -81,6 +82,7 @@ public:
/// By default should_add_column_predicate returns true for any column name
void addTableColumns(
const String & table_name,
+ ASTs & columns,
ShouldAddColumnPredicate should_add_column_predicate = [](const String &) { return true; })
{
auto it = table_columns.find(table_name);
@@ -105,7 +107,7 @@ public:
else
identifier = std::make_shared(std::vector{it->first, column.name});
- new_select_expression_list->children.emplace_back(std::move(identifier));
+ columns.emplace_back(std::move(identifier));
}
}
}
@@ -129,14 +131,18 @@ private:
for (const auto & child : node.children)
{
- if (child->as())
+ ASTs columns;
+ if (const auto * asterisk = child->as())
{
has_asterisks = true;
for (auto & table_name : data.tables_order)
- data.addTableColumns(table_name);
+ data.addTableColumns(table_name, columns);
+
+ for (const auto & transformer : asterisk->children)
+ IASTColumnsTransformer::transform(transformer, columns);
}
- else if (child->as())
+ else if (const auto * qualified_asterisk = child->as())
{
has_asterisks = true;
@@ -144,17 +150,44 @@ private:
throw Exception("Logical error: qualified asterisk must have exactly one child", ErrorCodes::LOGICAL_ERROR);
auto & identifier = child->children[0]->as();
- data.addTableColumns(identifier.name());
+ data.addTableColumns(identifier.name(), columns);
+
+ // QualifiedAsterisk's transformers start to appear at child 1
+ for (auto it = qualified_asterisk->children.begin() + 1; it != qualified_asterisk->children.end(); ++it)
+ {
+ IASTColumnsTransformer::transform(*it, columns);
+ }
}
- else if (auto * columns_matcher = child->as())
+ else if (const auto * columns_list_matcher = child->as())
+ {
+ has_asterisks = true;
+
+ for (const auto & ident : columns_list_matcher->column_list->children)
+ columns.emplace_back(ident->clone());
+
+ for (const auto & transformer : columns_list_matcher->children)
+ IASTColumnsTransformer::transform(transformer, columns);
+ }
+ else if (const auto * columns_regexp_matcher = child->as())
{
has_asterisks = true;
for (auto & table_name : data.tables_order)
- data.addTableColumns(table_name, [&](const String & column_name) { return columns_matcher->isColumnMatching(column_name); });
+ data.addTableColumns(
+ table_name,
+ columns,
+ [&](const String & column_name) { return columns_regexp_matcher->isColumnMatching(column_name); });
+
+ for (const auto & transformer : columns_regexp_matcher->children)
+ IASTColumnsTransformer::transform(transformer, columns);
}
else
data.new_select_expression_list->children.push_back(child);
+
+ data.new_select_expression_list->children.insert(
+ data.new_select_expression_list->children.end(),
+ std::make_move_iterator(columns.begin()),
+ std::make_move_iterator(columns.end()));
}
if (!has_asterisks)
diff --git a/src/Interpreters/PredicateRewriteVisitor.cpp b/src/Interpreters/PredicateRewriteVisitor.cpp
index 74ecc247eb1..910c846d58b 100644
--- a/src/Interpreters/PredicateRewriteVisitor.cpp
+++ b/src/Interpreters/PredicateRewriteVisitor.cpp
@@ -96,8 +96,8 @@ void PredicateRewriteVisitorData::visitOtherInternalSelect(ASTSelectQuery & sele
size_t alias_index = 0;
for (auto & ref_select : temp_select_query->refSelect()->children)
{
- if (!ref_select->as() && !ref_select->as() && !ref_select->as() &&
- !ref_select->as())
+ if (!ref_select->as() && !ref_select->as() && !ref_select->as()
+ && !ref_select->as() && !ref_select->as())
{
if (const auto & alias = ref_select->tryGetAlias(); alias.empty())
ref_select->setAlias("--predicate_optimizer_" + toString(alias_index++));
diff --git a/src/Interpreters/TranslateQualifiedNamesVisitor.cpp b/src/Interpreters/TranslateQualifiedNamesVisitor.cpp
index 6016d54c7dc..63f99b2a398 100644
--- a/src/Interpreters/TranslateQualifiedNamesVisitor.cpp
+++ b/src/Interpreters/TranslateQualifiedNamesVisitor.cpp
@@ -196,7 +196,7 @@ void TranslateQualifiedNamesMatcher::visit(ASTExpressionList & node, const ASTPt
bool has_asterisk = false;
for (const auto & child : node.children)
{
- if (child->as() || child->as())
+ if (child->as() || child->as() || child->as())
{
if (tables_with_columns.empty())
throw Exception("An asterisk cannot be replaced with empty columns.", ErrorCodes::LOGICAL_ERROR);
@@ -229,47 +229,38 @@ void TranslateQualifiedNamesMatcher::visit(ASTExpressionList & node, const ASTPt
for (const auto & column : *cols)
{
if (first_table || !data.join_using_columns.count(column.name))
- {
addIdentifier(columns, table.table, column.name);
- }
}
}
-
first_table = false;
}
- for (const auto & transformer : asterisk->children)
- {
- IASTColumnsTransformer::transform(transformer, columns);
- }
- }
- else if (const auto * asterisk_pattern = child->as())
- {
- if (asterisk_pattern->column_list)
- {
- for (const auto & ident : asterisk_pattern->column_list->children)
- columns.emplace_back(ident->clone());
- }
- else
- {
- bool first_table = true;
- for (const auto & table : tables_with_columns)
- {
- for (const auto & column : table.columns)
- {
- if (asterisk_pattern->isColumnMatching(column.name) && (first_table || !data.join_using_columns.count(column.name)))
- {
- addIdentifier(columns, table.table, column.name);
- }
- }
- first_table = false;
- }
- }
- // ColumnsMatcher's transformers start to appear at child 1
- for (auto it = asterisk_pattern->children.begin() + 1; it != asterisk_pattern->children.end(); ++it)
+ for (const auto & transformer : asterisk->children)
+ IASTColumnsTransformer::transform(transformer, columns);
+ }
+ else if (auto * asterisk_column_list = child->as())
+ {
+ for (const auto & ident : asterisk_column_list->column_list->children)
+ columns.emplace_back(ident->clone());
+
+ for (const auto & transformer : asterisk_column_list->children)
+ IASTColumnsTransformer::transform(transformer, columns);
+ }
+ else if (const auto * asterisk_regexp_pattern = child->as())
+ {
+ bool first_table = true;
+ for (const auto & table : tables_with_columns)
{
- IASTColumnsTransformer::transform(*it, columns);
+ for (const auto & column : table.columns)
+ {
+ if (asterisk_regexp_pattern->isColumnMatching(column.name) && (first_table || !data.join_using_columns.count(column.name)))
+ addIdentifier(columns, table.table, column.name);
+ }
+ first_table = false;
}
+
+ for (const auto & transformer : asterisk_regexp_pattern->children)
+ IASTColumnsTransformer::transform(transformer, columns);
}
else if (const auto * qualified_asterisk = child->as())
{
@@ -280,12 +271,11 @@ void TranslateQualifiedNamesMatcher::visit(ASTExpressionList & node, const ASTPt
if (ident_db_and_name.satisfies(table.table, true))
{
for (const auto & column : table.columns)
- {
addIdentifier(columns, table.table, column.name);
- }
break;
}
}
+
// QualifiedAsterisk's transformers start to appear at child 1
for (auto it = qualified_asterisk->children.begin() + 1; it != qualified_asterisk->children.end(); ++it)
{
diff --git a/src/Parsers/ASTAsterisk.cpp b/src/Parsers/ASTAsterisk.cpp
index ed733e62ca3..e2f45d04fa4 100644
--- a/src/Parsers/ASTAsterisk.cpp
+++ b/src/Parsers/ASTAsterisk.cpp
@@ -17,6 +17,8 @@ void ASTAsterisk::appendColumnName(WriteBuffer & ostr) const { ostr.write('*');
void ASTAsterisk::formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const
{
settings.ostr << "*";
+
+ /// Format column transformers
for (const auto & child : children)
{
settings.ostr << ' ';
diff --git a/src/Parsers/ASTColumnsMatcher.cpp b/src/Parsers/ASTColumnsMatcher.cpp
index 45799cb7ffe..8f167f99b37 100644
--- a/src/Parsers/ASTColumnsMatcher.cpp
+++ b/src/Parsers/ASTColumnsMatcher.cpp
@@ -1,64 +1,117 @@
-#include "ASTColumnsMatcher.h"
+#include <Parsers/ASTColumnsMatcher.h>
+
+#include <IO/WriteHelpers.h>
#include <IO/Operators.h>
-#include <re2/re2.h>
#include <Common/SipHash.h>
#include <Common/quoteString.h>
-#include <IO/WriteHelpers.h>
+#include <re2/re2.h>
namespace DB
{
+
namespace ErrorCodes
{
extern const int CANNOT_COMPILE_REGEXP;
}
-ASTPtr ASTColumnsMatcher::clone() const
+ASTPtr ASTColumnsRegexpMatcher::clone() const
{
- auto clone = std::make_shared<ASTColumnsMatcher>(*this);
+ auto clone = std::make_shared<ASTColumnsRegexpMatcher>(*this);
clone->cloneChildren();
return clone;
}
-void ASTColumnsMatcher::appendColumnName(WriteBuffer & ostr) const { writeString(original_pattern, ostr); }
+void ASTColumnsRegexpMatcher::appendColumnName(WriteBuffer & ostr) const
+{
+ writeCString("COLUMNS(", ostr);
+ writeQuotedString(original_pattern, ostr);
+ writeChar(')', ostr);
+}
-void ASTColumnsMatcher::updateTreeHashImpl(SipHash & hash_state) const
+void ASTColumnsRegexpMatcher::updateTreeHashImpl(SipHash & hash_state) const
{
hash_state.update(original_pattern.size());
hash_state.update(original_pattern);
IAST::updateTreeHashImpl(hash_state);
}
-void ASTColumnsMatcher::formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const
+void ASTColumnsRegexpMatcher::formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const
{
settings.ostr << (settings.hilite ? hilite_keyword : "") << "COLUMNS" << (settings.hilite ? hilite_none : "") << "(";
- if (column_list)
- {
- frame.expression_list_prepend_whitespace = false;
- column_list->formatImpl(settings, state, frame);
- }
- else
- settings.ostr << quoteString(original_pattern);
+ settings.ostr << quoteString(original_pattern);
settings.ostr << ")";
- for (ASTs::const_iterator it = children.begin() + 1; it != children.end(); ++it)
+
+ /// Format column transformers
+ for (const auto & child : children)
{
settings.ostr << ' ';
- (*it)->formatImpl(settings, state, frame);
+ child->formatImpl(settings, state, frame);
}
}
-void ASTColumnsMatcher::setPattern(String pattern)
+void ASTColumnsRegexpMatcher::setPattern(String pattern)
{
original_pattern = std::move(pattern);
column_matcher = std::make_shared<re2::RE2>(original_pattern, RE2::Quiet);
if (!column_matcher->ok())
- throw DB::Exception("COLUMNS pattern " + original_pattern + " cannot be compiled: " + column_matcher->error(), DB::ErrorCodes::CANNOT_COMPILE_REGEXP);
+ throw DB::Exception(
+ "COLUMNS pattern " + original_pattern + " cannot be compiled: " + column_matcher->error(),
+ DB::ErrorCodes::CANNOT_COMPILE_REGEXP);
}
-bool ASTColumnsMatcher::isColumnMatching(const String & column_name) const
+bool ASTColumnsRegexpMatcher::isColumnMatching(const String & column_name) const
{
return RE2::PartialMatch(column_name, *column_matcher);
}
+ASTPtr ASTColumnsListMatcher::clone() const
+{
+ auto clone = std::make_shared<ASTColumnsListMatcher>(*this);
+ clone->column_list = column_list->clone();
+ clone->cloneChildren();
+ return clone;
+}
+
+void ASTColumnsListMatcher::updateTreeHashImpl(SipHash & hash_state) const
+{
+ column_list->updateTreeHash(hash_state);
+ IAST::updateTreeHashImpl(hash_state);
+}
+
+void ASTColumnsListMatcher::appendColumnName(WriteBuffer & ostr) const
+{
+ writeCString("COLUMNS(", ostr);
+ for (auto it = column_list->children.begin(); it != column_list->children.end(); ++it)
+ {
+ if (it != column_list->children.begin())
+ writeCString(", ", ostr);
+
+ (*it)->appendColumnName(ostr);
+ }
+ writeChar(')', ostr);
+}
+
+void ASTColumnsListMatcher::formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const
+{
+ settings.ostr << (settings.hilite ? hilite_keyword : "") << "COLUMNS" << (settings.hilite ? hilite_none : "") << "(";
+
+ for (ASTs::const_iterator it = column_list->children.begin(); it != column_list->children.end(); ++it)
+ {
+ if (it != column_list->children.begin())
+ {
+ settings.ostr << ", ";
+ }
+ (*it)->formatImpl(settings, state, frame);
+ }
+ settings.ostr << ")";
+
+ /// Format column transformers
+ for (const auto & child : children)
+ {
+ settings.ostr << ' ';
+ child->formatImpl(settings, state, frame);
+ }
+}
}
diff --git a/src/Parsers/ASTColumnsMatcher.h b/src/Parsers/ASTColumnsMatcher.h
index 76ece9c95cc..5aaf3cbe30d 100644
--- a/src/Parsers/ASTColumnsMatcher.h
+++ b/src/Parsers/ASTColumnsMatcher.h
@@ -2,10 +2,9 @@
#include <Parsers/IAST.h>
-
namespace re2
{
- class RE2;
+class RE2;
}
@@ -14,21 +13,13 @@ namespace DB
class WriteBuffer;
-namespace ErrorCodes
-{
-}
-
-struct AsteriskSemantic;
-struct AsteriskSemanticImpl;
-
-
/** SELECT COLUMNS('regexp') is expanded to multiple columns like * (asterisk).
* Optional transformers can be attached to further manipulate these expanded columns.
*/
-class ASTColumnsMatcher : public IAST
+class ASTColumnsRegexpMatcher : public IAST
{
public:
- String getID(char) const override { return "ColumnsMatcher"; }
+ String getID(char) const override { return "ColumnsRegexpMatcher"; }
ASTPtr clone() const override;
void appendColumnName(WriteBuffer & ostr) const override;
@@ -36,17 +27,26 @@ public:
bool isColumnMatching(const String & column_name) const;
void updateTreeHashImpl(SipHash & hash_state) const override;
- ASTPtr column_list;
-
protected:
void formatImpl(const FormatSettings & settings, FormatState &, FormatStateStacked) const override;
private:
std::shared_ptr<re2::RE2> column_matcher;
String original_pattern;
- std::shared_ptr semantic; /// pimpl
+};
- friend struct AsteriskSemantic;
+/// Same as the above but use a list of column names to do matching.
+class ASTColumnsListMatcher : public IAST
+{
+public:
+ String getID(char) const override { return "ColumnsListMatcher"; }
+ ASTPtr clone() const override;
+ void appendColumnName(WriteBuffer & ostr) const override;
+ void updateTreeHashImpl(SipHash & hash_state) const override;
+
+ ASTPtr column_list;
+protected:
+ void formatImpl(const FormatSettings & settings, FormatState &, FormatStateStacked) const override;
};
diff --git a/src/Parsers/ASTColumnsTransformers.cpp b/src/Parsers/ASTColumnsTransformers.cpp
index 451ecf0d4dd..d90d1e747f4 100644
--- a/src/Parsers/ASTColumnsTransformers.cpp
+++ b/src/Parsers/ASTColumnsTransformers.cpp
@@ -105,6 +105,49 @@ void ASTColumnsApplyTransformer::transform(ASTs & nodes) const
}
}
+void ASTColumnsApplyTransformer::appendColumnName(WriteBuffer & ostr) const
+{
+ writeCString("APPLY ", ostr);
+ if (!column_name_prefix.empty())
+ writeChar('(', ostr);
+
+ if (lambda)
+ lambda->appendColumnName(ostr);
+ else
+ {
+ writeString(func_name, ostr);
+
+ if (parameters)
+ parameters->appendColumnName(ostr);
+ }
+
+ if (!column_name_prefix.empty())
+ {
+ writeCString(", '", ostr);
+ writeString(column_name_prefix, ostr);
+ writeCString("')", ostr);
+ }
+}
+
+void ASTColumnsApplyTransformer::updateTreeHashImpl(SipHash & hash_state) const
+{
+ hash_state.update(func_name.size());
+ hash_state.update(func_name);
+ if (parameters)
+ parameters->updateTreeHashImpl(hash_state);
+
+ if (lambda)
+ lambda->updateTreeHashImpl(hash_state);
+
+ hash_state.update(lambda_arg.size());
+ hash_state.update(lambda_arg);
+
+ hash_state.update(column_name_prefix.size());
+ hash_state.update(column_name_prefix);
+
+ IAST::updateTreeHashImpl(hash_state);
+}
+
void ASTColumnsExceptTransformer::formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const
{
settings.ostr << (settings.hilite ? hilite_keyword : "") << "EXCEPT" << (is_strict ? " STRICT " : " ") << (settings.hilite ? hilite_none : "");
@@ -128,6 +171,38 @@ void ASTColumnsExceptTransformer::formatImpl(const FormatSettings & settings, Fo
settings.ostr << ")";
}
+void ASTColumnsExceptTransformer::appendColumnName(WriteBuffer & ostr) const
+{
+ writeCString("EXCEPT ", ostr);
+ if (is_strict)
+ writeCString("STRICT ", ostr);
+
+ if (children.size() > 1)
+ writeChar('(', ostr);
+
+ for (ASTs::const_iterator it = children.begin(); it != children.end(); ++it)
+ {
+ if (it != children.begin())
+ writeCString(", ", ostr);
+ (*it)->appendColumnName(ostr);
+ }
+
+ if (!original_pattern.empty())
+ writeQuotedString(original_pattern, ostr);
+
+ if (children.size() > 1)
+ writeChar(')', ostr);
+}
+
+void ASTColumnsExceptTransformer::updateTreeHashImpl(SipHash & hash_state) const
+{
+ hash_state.update(is_strict);
+ hash_state.update(original_pattern.size());
+ hash_state.update(original_pattern);
+
+ IAST::updateTreeHashImpl(hash_state);
+}
+
void ASTColumnsExceptTransformer::transform(ASTs & nodes) const
{
std::set<String> expected_columns;
@@ -201,6 +276,21 @@ void ASTColumnsReplaceTransformer::Replacement::formatImpl(
settings.ostr << (settings.hilite ? hilite_keyword : "") << " AS " << (settings.hilite ? hilite_none : "") << backQuoteIfNeed(name);
}
+void ASTColumnsReplaceTransformer::Replacement::appendColumnName(WriteBuffer & ostr) const
+{
+ expr->appendColumnName(ostr);
+ writeCString(" AS ", ostr);
+ writeProbablyBackQuotedString(name, ostr);
+}
+
+void ASTColumnsReplaceTransformer::Replacement::updateTreeHashImpl(SipHash & hash_state) const
+{
+ hash_state.update(name.size());
+ hash_state.update(name);
+ expr->updateTreeHashImpl(hash_state);
+ IAST::updateTreeHashImpl(hash_state);
+}
+
void ASTColumnsReplaceTransformer::formatImpl(const FormatSettings & settings, FormatState & state, FormatStateStacked frame) const
{
settings.ostr << (settings.hilite ? hilite_keyword : "") << "REPLACE" << (is_strict ? " STRICT " : " ") << (settings.hilite ? hilite_none : "");
@@ -211,9 +301,8 @@ void ASTColumnsReplaceTransformer::formatImpl(const FormatSettings & settings, F
for (ASTs::const_iterator it = children.begin(); it != children.end(); ++it)
{
if (it != children.begin())
- {
settings.ostr << ", ";
- }
+
(*it)->formatImpl(settings, state, frame);
}
@@ -221,6 +310,32 @@ void ASTColumnsReplaceTransformer::formatImpl(const FormatSettings & settings, F
settings.ostr << ")";
}
+void ASTColumnsReplaceTransformer::appendColumnName(WriteBuffer & ostr) const
+{
+ writeCString("REPLACE ", ostr);
+ if (is_strict)
+ writeCString("STRICT ", ostr);
+
+ if (children.size() > 1)
+ writeChar('(', ostr);
+
+ for (ASTs::const_iterator it = children.begin(); it != children.end(); ++it)
+ {
+ if (it != children.begin())
+ writeCString(", ", ostr);
+ (*it)->appendColumnName(ostr);
+ }
+
+ if (children.size() > 1)
+ writeChar(')', ostr);
+}
+
+void ASTColumnsReplaceTransformer::updateTreeHashImpl(SipHash & hash_state) const
+{
+ hash_state.update(is_strict);
+ IAST::updateTreeHashImpl(hash_state);
+}
+
void ASTColumnsReplaceTransformer::replaceChildren(ASTPtr & node, const ASTPtr & replacement, const String & name)
{
for (auto & child : node->children)
diff --git a/src/Parsers/ASTColumnsTransformers.h b/src/Parsers/ASTColumnsTransformers.h
index 1064beb44bd..0f16f6b93e7 100644
--- a/src/Parsers/ASTColumnsTransformers.h
+++ b/src/Parsers/ASTColumnsTransformers.h
@@ -30,6 +30,8 @@ public:
return res;
}
void transform(ASTs & nodes) const override;
+ void appendColumnName(WriteBuffer & ostr) const override;
+ void updateTreeHashImpl(SipHash & hash_state) const override;
// Case 1 APPLY (quantile(0.9))
String func_name;
@@ -59,6 +61,8 @@ public:
void transform(ASTs & nodes) const override;
void setPattern(String pattern);
bool isColumnMatching(const String & column_name) const;
+ void appendColumnName(WriteBuffer & ostr) const override;
+ void updateTreeHashImpl(SipHash & hash_state) const override;
protected:
void formatImpl(const FormatSettings & settings, FormatState &, FormatStateStacked) const override;
@@ -76,12 +80,13 @@ public:
ASTPtr clone() const override
{
auto replacement = std::make_shared<Replacement>(*this);
- replacement->children.clear();
replacement->expr = expr->clone();
- replacement->children.push_back(replacement->expr);
return replacement;
}
+ void appendColumnName(WriteBuffer & ostr) const override;
+ void updateTreeHashImpl(SipHash & hash_state) const override;
+
String name;
ASTPtr expr;
@@ -98,6 +103,8 @@ public:
return clone;
}
void transform(ASTs & nodes) const override;
+ void appendColumnName(WriteBuffer & ostr) const override;
+ void updateTreeHashImpl(SipHash & hash_state) const override;
protected:
void formatImpl(const FormatSettings & settings, FormatState &, FormatStateStacked) const override;
diff --git a/src/Parsers/ASTQualifiedAsterisk.cpp b/src/Parsers/ASTQualifiedAsterisk.cpp
index 2491dcb36b7..b755e4eb98c 100644
--- a/src/Parsers/ASTQualifiedAsterisk.cpp
+++ b/src/Parsers/ASTQualifiedAsterisk.cpp
@@ -17,6 +17,8 @@ void ASTQualifiedAsterisk::formatImpl(const FormatSettings & settings, FormatSta
const auto & qualifier = children.at(0);
qualifier->formatImpl(settings, state, frame);
settings.ostr << ".*";
+
+ /// Format column transformers
for (ASTs::const_iterator it = children.begin() + 1; it != children.end(); ++it)
{
settings.ostr << ' ';
diff --git a/src/Parsers/ExpressionElementParsers.cpp b/src/Parsers/ExpressionElementParsers.cpp
index 29c7846283e..c13e6831780 100644
--- a/src/Parsers/ExpressionElementParsers.cpp
+++ b/src/Parsers/ExpressionElementParsers.cpp
@@ -1796,16 +1796,18 @@ bool ParserColumnsMatcher::parseImpl(Pos & pos, ASTPtr & node, Expected & expect
return false;
++pos;
- auto res = std::make_shared<ASTColumnsMatcher>();
+ ASTPtr res;
if (column_list)
{
- res->column_list = column_list;
- res->children.push_back(res->column_list);
+ auto list_matcher = std::make_shared<ASTColumnsListMatcher>();
+ list_matcher->column_list = column_list;
+ res = list_matcher;
}
else
{
- res->setPattern(regex_node->as<ASTLiteral &>().value.get<const String &>());
- res->children.push_back(regex_node);
+ auto regexp_matcher = std::make_shared<ASTColumnsRegexpMatcher>();
+ regexp_matcher->setPattern(regex_node->as<ASTLiteral &>().value.get<const String &>());
+ res = regexp_matcher;
}
ParserColumnsTransformers transformers_p(allowed_transformers);
diff --git a/tests/queries/0_stateless/02271_fix_column_matcher_and_column_transformer.reference b/tests/queries/0_stateless/02271_fix_column_matcher_and_column_transformer.reference
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/tests/queries/0_stateless/02271_fix_column_matcher_and_column_transformer.sql b/tests/queries/0_stateless/02271_fix_column_matcher_and_column_transformer.sql
new file mode 100644
index 00000000000..f0c0e2bae46
--- /dev/null
+++ b/tests/queries/0_stateless/02271_fix_column_matcher_and_column_transformer.sql
@@ -0,0 +1,64 @@
+DROP TABLE IF EXISTS github_events;
+
+CREATE TABLE github_events
+(
+ `file_time` DateTime,
+ `event_type` Enum8('CommitCommentEvent' = 1, 'CreateEvent' = 2, 'DeleteEvent' = 3, 'ForkEvent' = 4, 'GollumEvent' = 5, 'IssueCommentEvent' = 6, 'IssuesEvent' = 7, 'MemberEvent' = 8, 'PublicEvent' = 9, 'PullRequestEvent' = 10, 'PullRequestReviewCommentEvent' = 11, 'PushEvent' = 12, 'ReleaseEvent' = 13, 'SponsorshipEvent' = 14, 'WatchEvent' = 15, 'GistEvent' = 16, 'FollowEvent' = 17, 'DownloadEvent' = 18, 'PullRequestReviewEvent' = 19, 'ForkApplyEvent' = 20, 'Event' = 21, 'TeamAddEvent' = 22),
+ `actor_login` LowCardinality(String),
+ `repo_name` LowCardinality(String),
+ `created_at` DateTime,
+ `updated_at` DateTime,
+ `action` Enum8('none' = 0, 'created' = 1, 'added' = 2, 'edited' = 3, 'deleted' = 4, 'opened' = 5, 'closed' = 6, 'reopened' = 7, 'assigned' = 8, 'unassigned' = 9, 'labeled' = 10, 'unlabeled' = 11, 'review_requested' = 12, 'review_request_removed' = 13, 'synchronize' = 14, 'started' = 15, 'published' = 16, 'update' = 17, 'create' = 18, 'fork' = 19, 'merged' = 20),
+ `comment_id` UInt64,
+ `body` String,
+ `path` String,
+ `position` Int32,
+ `line` Int32,
+ `ref` LowCardinality(String),
+ `ref_type` Enum8('none' = 0, 'branch' = 1, 'tag' = 2, 'repository' = 3, 'unknown' = 4),
+ `creator_user_login` LowCardinality(String),
+ `number` UInt32,
+ `title` String,
+ `labels` Array(LowCardinality(String)),
+ `state` Enum8('none' = 0, 'open' = 1, 'closed' = 2),
+ `locked` UInt8,
+ `assignee` LowCardinality(String),
+ `assignees` Array(LowCardinality(String)),
+ `comments` UInt32,
+ `author_association` Enum8('NONE' = 0, 'CONTRIBUTOR' = 1, 'OWNER' = 2, 'COLLABORATOR' = 3, 'MEMBER' = 4, 'MANNEQUIN' = 5),
+ `closed_at` DateTime,
+ `merged_at` DateTime,
+ `merge_commit_sha` String,
+ `requested_reviewers` Array(LowCardinality(String)),
+ `requested_teams` Array(LowCardinality(String)),
+ `head_ref` LowCardinality(String),
+ `head_sha` String,
+ `base_ref` LowCardinality(String),
+ `base_sha` String,
+ `merged` UInt8,
+ `mergeable` UInt8,
+ `rebaseable` UInt8,
+ `mergeable_state` Enum8('unknown' = 0, 'dirty' = 1, 'clean' = 2, 'unstable' = 3, 'draft' = 4),
+ `merged_by` LowCardinality(String),
+ `review_comments` UInt32,
+ `maintainer_can_modify` UInt8,
+ `commits` UInt32,
+ `additions` UInt32,
+ `deletions` UInt32,
+ `changed_files` UInt32,
+ `diff_hunk` String,
+ `original_position` UInt32,
+ `commit_id` String,
+ `original_commit_id` String,
+ `push_size` UInt32,
+ `push_distinct_size` UInt32,
+ `member_login` LowCardinality(String),
+ `release_tag_name` String,
+ `release_name` String,
+ `review_state` Enum8('none' = 0, 'approved' = 1, 'changes_requested' = 2, 'commented' = 3, 'dismissed' = 4, 'pending' = 5)
+)
+ENGINE = MergeTree ORDER BY (event_type, repo_name, created_at);
+
+with top_repos as ( select repo_name from github_events where event_type = 'WatchEvent' and toDate(created_at) = today() - 1 group by repo_name order by count() desc limit 100 union distinct select repo_name from github_events where event_type = 'WatchEvent' and toMonday(created_at) = toMonday(today() - interval 1 week) group by repo_name order by count() desc limit 100 union distinct select repo_name from github_events where event_type = 'WatchEvent' and toStartOfMonth(created_at) = toStartOfMonth(today()) - interval 1 month group by repo_name order by count() desc limit 100 union distinct select repo_name from github_events where event_type = 'WatchEvent' and toYear(created_at) = toYear(today()) - 1 group by repo_name order by count() desc limit 100 ), last_day as ( select repo_name, count() as count_last_day, rowNumberInAllBlocks() + 1 as position_last_day from github_events where repo_name in (select repo_name from top_repos) and toDate(created_at) = today() - 1 group by repo_name order by count_last_day desc ), last_week as ( select repo_name, count() as count_last_week, rowNumberInAllBlocks() + 1 as position_last_week from github_events where repo_name in (select repo_name from top_repos) and toMonday(created_at) = toMonday(today()) - interval 1 week group by repo_name order by count_last_week desc ), last_month as ( select repo_name, count() as count_last_month, rowNumberInAllBlocks() + 1 as position_last_month from github_events where repo_name in (select repo_name from top_repos) and toStartOfMonth(created_at) = toStartOfMonth(today()) - interval 1 month group by repo_name order by count_last_month desc ) select d.repo_name, columns(count) from last_day d join last_week w on d.repo_name = w.repo_name join last_month m on d.repo_name = m.repo_name FORMAT TabSeparatedWithNamesAndTypes; -- { serverError 47 }
+
+DROP TABLE github_events;
From 2eee79dc2a8264375e996637876697c4980218f0 Mon Sep 17 00:00:00 2001
From: alesapin
Date: Wed, 20 Apr 2022 13:26:20 +0200
Subject: [PATCH 26/55] Review fixes
---
src/Disks/IDisk.h | 6 ++---
src/Storages/MergeTree/MergeTreeData.cpp | 27 ++++++++++++++--------
src/Storages/MergeTree/MergeTreePartInfo.h | 10 ++++++++
3 files changed, 31 insertions(+), 12 deletions(-)
diff --git a/src/Disks/IDisk.h b/src/Disks/IDisk.h
index cc47c4262f5..669897e7c67 100644
--- a/src/Disks/IDisk.h
+++ b/src/Disks/IDisk.h
@@ -192,18 +192,18 @@ public:
/// Remove file. Throws exception if file doesn't exists or if directory is not empty.
/// Differs from removeFile for S3/HDFS disks
/// Second bool param is a flag to remove (true) or keep (false) shared data on S3
- virtual void removeSharedFile(const String & path, bool) { removeFile(path); }
+ virtual void removeSharedFile(const String & path, bool /* keep_shared_data */) { removeFile(path); }
/// Remove file or directory with all children. Use with extra caution. Throws exception if file doesn't exists.
/// Differs from removeRecursive for S3/HDFS disks
/// Second bool param is a flag to remove (true) or keep (false) shared data on S3.
/// Third param determines which files cannot be removed even if second is true.
- virtual void removeSharedRecursive(const String & path, bool, const NameSet &) { removeRecursive(path); }
+ virtual void removeSharedRecursive(const String & path, bool /* keep_all_shared_data */, const NameSet & /* file_names_remove_metadata_only */) { removeRecursive(path); }
/// Remove file or directory if it exists.
/// Differs from removeFileIfExists for S3/HDFS disks
/// Second bool param is a flag to remove (true) or keep (false) shared data on S3
- virtual void removeSharedFileIfExists(const String & path, bool) { removeFileIfExists(path); }
+ virtual void removeSharedFileIfExists(const String & path, bool /* keep_shared_data */) { removeFileIfExists(path); }
virtual String getCacheBasePath() const { return ""; }
diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp
index 6d2eea6cdd4..e5403574741 100644
--- a/src/Storages/MergeTree/MergeTreeData.cpp
+++ b/src/Storages/MergeTree/MergeTreeData.cpp
@@ -1592,6 +1592,19 @@ MergeTreeData::DataPartsVector MergeTreeData::grabOldParts(bool force)
time_t now = time(nullptr);
std::vector<DataPartIteratorByStateAndInfo> parts_to_delete;
+ std::vector<MergeTreePartInfo> skipped_parts;
+
+ auto has_skipped_mutation_parent = [&skipped_parts, need_remove_parts_in_order] (const DataPartPtr & part)
+ {
+ if (!need_remove_parts_in_order)
+ return false;
+
+ for (const auto & part_info : skipped_parts)
+ if (part->info.isMutationChildOf(part_info))
+ return true;
+
+ return false;
+ };
{
auto parts_lock = lockParts();
@@ -1604,9 +1617,7 @@ MergeTreeData::DataPartsVector MergeTreeData::grabOldParts(bool force)
/// Do not remove outdated part if it may be visible for some transaction
if (!part->version.canBeRemoved())
{
- if (need_remove_parts_in_order)
- break;
-
+ skipped_parts.push_back(part->info);
continue;
}
@@ -1615,22 +1626,20 @@ MergeTreeData::DataPartsVector MergeTreeData::grabOldParts(bool force)
/// Grab only parts that are not used by anyone (SELECTs for example).
if (!part.unique())
{
- if (need_remove_parts_in_order)
- break;
-
+ skipped_parts.push_back(part->info);
continue;
}
- if ((part_remove_time < now && now - part_remove_time > getSettings()->old_parts_lifetime.totalSeconds())
+ if ((part_remove_time < now && now - part_remove_time > getSettings()->old_parts_lifetime.totalSeconds() && !has_skipped_mutation_parent(part))
|| force
|| isInMemoryPart(part) /// Remove in-memory parts immediately to not store excessive data in RAM
|| (part->version.creation_csn == Tx::RolledBackCSN && getSettings()->remove_rolled_back_parts_immediately))
{
parts_to_delete.emplace_back(it);
}
- else if (need_remove_parts_in_order)
+ else
{
- break;
+ skipped_parts.push_back(part->info);
}
}
diff --git a/src/Storages/MergeTree/MergeTreePartInfo.h b/src/Storages/MergeTree/MergeTreePartInfo.h
index e9ff6f87f0b..d6d9f45bc1b 100644
--- a/src/Storages/MergeTree/MergeTreePartInfo.h
+++ b/src/Storages/MergeTree/MergeTreePartInfo.h
@@ -72,6 +72,16 @@ struct MergeTreePartInfo
&& strictly_contains_block_range;
}
+ /// Part was created with mutation of parent_candidate part
+ bool isMutationChildOf(const MergeTreePartInfo & parent_candidate) const
+ {
+ return partition_id == parent_candidate.partition_id
+ && min_block == parent_candidate.min_block
+ && max_block == parent_candidate.max_block
+ && level == parent_candidate.level
+ && mutation >= parent_candidate.mutation;
+ }
+
/// Return part mutation version, if part wasn't mutated return zero
Int64 getMutationVersion() const
{
From 5334522d5c0f6c10bf11a49967078aad5dc4590c Mon Sep 17 00:00:00 2001
From: alesapin
Date: Wed, 20 Apr 2022 13:33:42 +0200
Subject: [PATCH 27/55] Extreme settings
---
src/Storages/MergeTree/MergeTreeSettings.h | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/src/Storages/MergeTree/MergeTreeSettings.h b/src/Storages/MergeTree/MergeTreeSettings.h
index 0d984dc4dee..e31c0686a16 100644
--- a/src/Storages/MergeTree/MergeTreeSettings.h
+++ b/src/Storages/MergeTree/MergeTreeSettings.h
@@ -45,7 +45,7 @@ struct Settings;
M(UInt64, number_of_free_entries_in_pool_to_lower_max_size_of_merge, 8, "When there is less than specified number of free entries in pool (or replicated queue), start to lower maximum size of merge to process (or to put in queue). This is to allow small merges to process - not filling the pool with long running merges.", 0) \
M(UInt64, number_of_free_entries_in_pool_to_execute_mutation, 20, "When there is less than specified number of free entries in pool, do not execute part mutations. This is to leave free threads for regular merges and avoid \"Too many parts\"", 0) \
M(UInt64, max_number_of_merges_with_ttl_in_pool, 2, "When there is more than specified number of merges with TTL entries in pool, do not assign new merge with TTL. This is to leave free threads for regular merges and avoid \"Too many parts\"", 0) \
- M(Seconds, old_parts_lifetime, 8 * 60, "How many seconds to keep obsolete parts.", 0) \
+ M(Seconds, old_parts_lifetime, 1, "How many seconds to keep obsolete parts.", 0) \
M(Seconds, temporary_directories_lifetime, 86400, "How many seconds to keep tmp_-directories. You should not lower this value because merges and mutations may not be able to work with low value of this setting.", 0) \
M(Seconds, lock_acquire_timeout_for_background_operations, DBMS_DEFAULT_LOCK_ACQUIRE_TIMEOUT_SEC, "For background operations like merges, mutations etc. How many seconds before failing to acquire table locks.", 0) \
M(UInt64, min_rows_to_fsync_after_merge, 0, "Minimal number of rows to do fsync for part after merge (0 - disabled)", 0) \
@@ -100,8 +100,8 @@ struct Settings;
\
/** Check delay of replicas settings. */ \
M(UInt64, min_relative_delay_to_measure, 120, "Calculate relative replica delay only if absolute delay is not less that this value.", 0) \
- M(UInt64, cleanup_delay_period, 30, "Period to clean old queue logs, blocks hashes and parts.", 0) \
- M(UInt64, cleanup_delay_period_random_add, 10, "Add uniformly distributed value from 0 to x seconds to cleanup_delay_period to avoid thundering herd effect and subsequent DoS of ZooKeeper in case of very large number of tables.", 0) \
+ M(UInt64, cleanup_delay_period, 1, "Period to clean old queue logs, blocks hashes and parts.", 0) \
+ M(UInt64, cleanup_delay_period_random_add, 1, "Add uniformly distributed value from 0 to x seconds to cleanup_delay_period to avoid thundering herd effect and subsequent DoS of ZooKeeper in case of very large number of tables.", 0) \
M(UInt64, min_relative_delay_to_close, 300, "Minimal delay from other replicas to close, stop serving requests and not return Ok during status check.", 0) \
M(UInt64, min_absolute_delay_to_close, 0, "Minimal absolute delay to close, stop serving requests and not return Ok during status check.", 0) \
M(UInt64, enable_vertical_merge_algorithm, 1, "Enable usage of Vertical merge algorithm.", 0) \
From 612a81ba24506bbbb9803ff8b2a406547ef0e032 Mon Sep 17 00:00:00 2001
From: alesapin
Date: Wed, 20 Apr 2022 13:58:31 +0200
Subject: [PATCH 28/55] Update src/Disks/IDisk.h
Co-authored-by: tavplubix
---
src/Disks/IDisk.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/Disks/IDisk.h b/src/Disks/IDisk.h
index 669897e7c67..3e80195e7ab 100644
--- a/src/Disks/IDisk.h
+++ b/src/Disks/IDisk.h
@@ -196,7 +196,7 @@ public:
/// Remove file or directory with all children. Use with extra caution. Throws exception if file doesn't exists.
/// Differs from removeRecursive for S3/HDFS disks
- /// Second bool param is a flag to remove (true) or keep (false) shared data on S3.
+ /// Second bool param is a flag to remove (false) or keep (true) shared data on S3.
/// Third param determines which files cannot be removed even if second is true.
virtual void removeSharedRecursive(const String & path, bool /* keep_all_shared_data */, const NameSet & /* file_names_remove_metadata_only */) { removeRecursive(path); }
From 41b57f78cf0408b6619d18cdbc1bb884d5b4b487 Mon Sep 17 00:00:00 2001
From: alesapin
Date: Wed, 20 Apr 2022 13:59:02 +0200
Subject: [PATCH 29/55] Revert extreme settings
---
src/Storages/MergeTree/MergeTreeSettings.h | 6 +++---
src/Storages/MergeTree/MutateFromLogEntryTask.cpp | 4 ++--
2 files changed, 5 insertions(+), 5 deletions(-)
diff --git a/src/Storages/MergeTree/MergeTreeSettings.h b/src/Storages/MergeTree/MergeTreeSettings.h
index e31c0686a16..0d984dc4dee 100644
--- a/src/Storages/MergeTree/MergeTreeSettings.h
+++ b/src/Storages/MergeTree/MergeTreeSettings.h
@@ -45,7 +45,7 @@ struct Settings;
M(UInt64, number_of_free_entries_in_pool_to_lower_max_size_of_merge, 8, "When there is less than specified number of free entries in pool (or replicated queue), start to lower maximum size of merge to process (or to put in queue). This is to allow small merges to process - not filling the pool with long running merges.", 0) \
M(UInt64, number_of_free_entries_in_pool_to_execute_mutation, 20, "When there is less than specified number of free entries in pool, do not execute part mutations. This is to leave free threads for regular merges and avoid \"Too many parts\"", 0) \
M(UInt64, max_number_of_merges_with_ttl_in_pool, 2, "When there is more than specified number of merges with TTL entries in pool, do not assign new merge with TTL. This is to leave free threads for regular merges and avoid \"Too many parts\"", 0) \
- M(Seconds, old_parts_lifetime, 1, "How many seconds to keep obsolete parts.", 0) \
+ M(Seconds, old_parts_lifetime, 8 * 60, "How many seconds to keep obsolete parts.", 0) \
M(Seconds, temporary_directories_lifetime, 86400, "How many seconds to keep tmp_-directories. You should not lower this value because merges and mutations may not be able to work with low value of this setting.", 0) \
M(Seconds, lock_acquire_timeout_for_background_operations, DBMS_DEFAULT_LOCK_ACQUIRE_TIMEOUT_SEC, "For background operations like merges, mutations etc. How many seconds before failing to acquire table locks.", 0) \
M(UInt64, min_rows_to_fsync_after_merge, 0, "Minimal number of rows to do fsync for part after merge (0 - disabled)", 0) \
@@ -100,8 +100,8 @@ struct Settings;
\
/** Check delay of replicas settings. */ \
M(UInt64, min_relative_delay_to_measure, 120, "Calculate relative replica delay only if absolute delay is not less that this value.", 0) \
- M(UInt64, cleanup_delay_period, 1, "Period to clean old queue logs, blocks hashes and parts.", 0) \
- M(UInt64, cleanup_delay_period_random_add, 1, "Add uniformly distributed value from 0 to x seconds to cleanup_delay_period to avoid thundering herd effect and subsequent DoS of ZooKeeper in case of very large number of tables.", 0) \
+ M(UInt64, cleanup_delay_period, 30, "Period to clean old queue logs, blocks hashes and parts.", 0) \
+ M(UInt64, cleanup_delay_period_random_add, 10, "Add uniformly distributed value from 0 to x seconds to cleanup_delay_period to avoid thundering herd effect and subsequent DoS of ZooKeeper in case of very large number of tables.", 0) \
M(UInt64, min_relative_delay_to_close, 300, "Minimal delay from other replicas to close, stop serving requests and not return Ok during status check.", 0) \
M(UInt64, min_absolute_delay_to_close, 0, "Minimal absolute delay to close, stop serving requests and not return Ok during status check.", 0) \
M(UInt64, enable_vertical_merge_algorithm, 1, "Enable usage of Vertical merge algorithm.", 0) \
diff --git a/src/Storages/MergeTree/MutateFromLogEntryTask.cpp b/src/Storages/MergeTree/MutateFromLogEntryTask.cpp
index 554053c1bd3..64c2d6fb8a6 100644
--- a/src/Storages/MergeTree/MutateFromLogEntryTask.cpp
+++ b/src/Storages/MergeTree/MutateFromLogEntryTask.cpp
@@ -115,7 +115,7 @@ ReplicatedMergeMutateTaskBase::PrepareResult MutateFromLogEntryTask::prepare()
String dummy;
if (!storage.findReplicaHavingCoveringPart(entry.new_part_name, true, dummy).empty())
{
- LOG_DEBUG(log, "Mutation of part {} finished by some other replica, will download merged part", entry.new_part_name);
+ LOG_DEBUG(log, "Mutation of part {} finished by some other replica, will download mutated part", entry.new_part_name);
return PrepareResult{
.prepared_successfully = false,
.need_to_check_missing_part_in_fetch = true,
@@ -127,7 +127,7 @@ ReplicatedMergeMutateTaskBase::PrepareResult MutateFromLogEntryTask::prepare()
if (!zero_copy_lock)
{
- LOG_DEBUG(log, "Mutation of part {} started by some other replica, will wait it and fetch merged part", entry.new_part_name);
+ LOG_DEBUG(log, "Mutation of part {} started by some other replica, will wait it and fetch mutated part", entry.new_part_name);
return PrepareResult{
.prepared_successfully = false,
.need_to_check_missing_part_in_fetch = false,
From e8fba1644c0e3648213f9e0a8f3a87b94f970d40 Mon Sep 17 00:00:00 2001
From: alesapin
Date: Wed, 20 Apr 2022 14:13:29 +0200
Subject: [PATCH 30/55] Fix bug
---
src/Storages/StorageReplicatedMergeTree.cpp | 6 +++++-
1 file changed, 5 insertions(+), 1 deletion(-)
diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp
index 8f376e3bffa..a776347c5f4 100644
--- a/src/Storages/StorageReplicatedMergeTree.cpp
+++ b/src/Storages/StorageReplicatedMergeTree.cpp
@@ -7456,10 +7456,14 @@ std::pair StorageReplicatedMergeTree::unlockSharedDataByID(
for (const auto & zc_zookeeper_path : zc_zookeeper_paths)
{
+ String files_not_to_remove_str;
+ zookeeper_ptr->tryGet(zc_zookeeper_path, files_not_to_remove_str);
+
files_not_to_remove.clear();
- auto files_not_to_remove_str = zookeeper_ptr->get(zc_zookeeper_path);
if (!files_not_to_remove_str.empty())
+ {
boost::split(files_not_to_remove, files_not_to_remove_str, boost::is_any_of("\n "));
+ }
String zookeeper_part_uniq_node = fs::path(zc_zookeeper_path) / part_id;
From 829854113b34075676fc224c0db2df69a429ea6b Mon Sep 17 00:00:00 2001
From: alesapin
Date: Wed, 20 Apr 2022 15:10:36 +0200
Subject: [PATCH 31/55] Add a little thread safety
---
src/Disks/IDiskRemote.cpp | 21 +++++++++++++++++----
src/Disks/IDiskRemote.h | 3 +++
src/Storages/StorageReplicatedMergeTree.cpp | 2 --
3 files changed, 20 insertions(+), 6 deletions(-)
diff --git a/src/Disks/IDiskRemote.cpp b/src/Disks/IDiskRemote.cpp
index acc3517d30e..80f1a3ff116 100644
--- a/src/Disks/IDiskRemote.cpp
+++ b/src/Disks/IDiskRemote.cpp
@@ -45,7 +45,6 @@ IDiskRemote::Metadata IDiskRemote::Metadata::createAndStoreMetadata(const String
return result;
}
-
IDiskRemote::Metadata IDiskRemote::Metadata::readUpdateAndStoreMetadata(const String & remote_fs_root_path_, DiskPtr metadata_disk_, const String & metadata_file_path_, bool sync, IDiskRemote::MetadataUpdater updater)
{
Metadata result(remote_fs_root_path_, metadata_disk_, metadata_file_path_);
@@ -55,7 +54,6 @@ IDiskRemote::Metadata IDiskRemote::Metadata::readUpdateAndStoreMetadata(const St
return result;
}
-
IDiskRemote::Metadata IDiskRemote::Metadata::createUpdateAndStoreMetadata(const String & remote_fs_root_path_, DiskPtr metadata_disk_, const String & metadata_file_path_, bool sync, IDiskRemote::MetadataUpdater updater)
{
Metadata result(remote_fs_root_path_, metadata_disk_, metadata_file_path_);
@@ -64,6 +62,16 @@ IDiskRemote::Metadata IDiskRemote::Metadata::createUpdateAndStoreMetadata(const
return result;
}
+IDiskRemote::Metadata IDiskRemote::Metadata::readUpdateStoreMetadataAndRemove(const String & remote_fs_root_path_, DiskPtr metadata_disk_, const String & metadata_file_path_, bool sync, IDiskRemote::MetadataUpdater updater)
+{
+ Metadata result(remote_fs_root_path_, metadata_disk_, metadata_file_path_);
+ if (updater(result))
+ result.save(sync);
+ metadata_disk_->removeFile(metadata_file_path_);
+
+ return result;
+
+}
IDiskRemote::Metadata IDiskRemote::Metadata::createAndStoreMetadataIfNotExists(const String & remote_fs_root_path_, DiskPtr metadata_disk_, const String & metadata_file_path_, bool sync, bool overwrite)
{
@@ -231,6 +239,12 @@ IDiskRemote::Metadata IDiskRemote::readUpdateAndStoreMetadata(const String & pat
}
+IDiskRemote::Metadata IDiskRemote::readUpdateStoreMetadataAndRemove(const String & path, bool sync, IDiskRemote::MetadataUpdater updater)
+{
+ std::unique_lock lock(metadata_mutex);
+ return Metadata::readUpdateStoreMetadataAndRemove(remote_fs_root_path, metadata_disk, path, sync, updater);
+}
+
IDiskRemote::Metadata IDiskRemote::readOrCreateUpdateAndStoreMetadata(const String & path, WriteMode mode, bool sync, IDiskRemote::MetadataUpdater updater)
{
if (mode == WriteMode::Rewrite || !metadata_disk->exists(path))
@@ -308,8 +322,7 @@ void IDiskRemote::removeMetadata(const String & path, std::vector & path
return true;
};
- readUpdateAndStoreMetadata(path, false, metadata_updater);
- metadata_disk->removeFile(path);
+ readUpdateStoreMetadataAndRemove(path, false, metadata_updater);
/// If there is no references - delete content from remote FS.
}
catch (const Exception & e)
diff --git a/src/Disks/IDiskRemote.h b/src/Disks/IDiskRemote.h
index b2251e1abe3..65bcdf3e719 100644
--- a/src/Disks/IDiskRemote.h
+++ b/src/Disks/IDiskRemote.h
@@ -78,6 +78,8 @@ public:
Metadata readMetadata(const String & path) const;
Metadata readMetadataUnlocked(const String & path, std::shared_lock &) const;
Metadata readUpdateAndStoreMetadata(const String & path, bool sync, MetadataUpdater updater);
+ Metadata readUpdateStoreMetadataAndRemove(const String & path, bool sync, MetadataUpdater updater);
+
Metadata readOrCreateUpdateAndStoreMetadata(const String & path, WriteMode mode, bool sync, MetadataUpdater updater);
Metadata createAndStoreMetadata(const String & path, bool sync);
@@ -229,6 +231,7 @@ struct IDiskRemote::Metadata
static Metadata readMetadata(const String & remote_fs_root_path_, DiskPtr metadata_disk_, const String & metadata_file_path_);
static Metadata readUpdateAndStoreMetadata(const String & remote_fs_root_path_, DiskPtr metadata_disk_, const String & metadata_file_path_, bool sync, Updater updater);
+ static Metadata readUpdateStoreMetadataAndRemove(const String & remote_fs_root_path_, DiskPtr metadata_disk_, const String & metadata_file_path_, bool sync, Updater updater);
static Metadata createAndStoreMetadata(const String & remote_fs_root_path_, DiskPtr metadata_disk_, const String & metadata_file_path_, bool sync);
static Metadata createUpdateAndStoreMetadata(const String & remote_fs_root_path_, DiskPtr metadata_disk_, const String & metadata_file_path_, bool sync, Updater updater);
diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp
index a776347c5f4..858841a4986 100644
--- a/src/Storages/StorageReplicatedMergeTree.cpp
+++ b/src/Storages/StorageReplicatedMergeTree.cpp
@@ -7461,9 +7461,7 @@ std::pair StorageReplicatedMergeTree::unlockSharedDataByID(
files_not_to_remove.clear();
if (!files_not_to_remove_str.empty())
- {
boost::split(files_not_to_remove, files_not_to_remove_str, boost::is_any_of("\n "));
- }
String zookeeper_part_uniq_node = fs::path(zc_zookeeper_path) / part_id;
From 6ee0df251fb34130df88f9b734af0fc0f155be04 Mon Sep 17 00:00:00 2001
From: alesapin
Date: Wed, 20 Apr 2022 15:11:03 +0200
Subject: [PATCH 32/55] Add dirty settings
---
src/Storages/MergeTree/MergeTreeSettings.h | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/src/Storages/MergeTree/MergeTreeSettings.h b/src/Storages/MergeTree/MergeTreeSettings.h
index 0d984dc4dee..e31c0686a16 100644
--- a/src/Storages/MergeTree/MergeTreeSettings.h
+++ b/src/Storages/MergeTree/MergeTreeSettings.h
@@ -45,7 +45,7 @@ struct Settings;
M(UInt64, number_of_free_entries_in_pool_to_lower_max_size_of_merge, 8, "When there is less than specified number of free entries in pool (or replicated queue), start to lower maximum size of merge to process (or to put in queue). This is to allow small merges to process - not filling the pool with long running merges.", 0) \
M(UInt64, number_of_free_entries_in_pool_to_execute_mutation, 20, "When there is less than specified number of free entries in pool, do not execute part mutations. This is to leave free threads for regular merges and avoid \"Too many parts\"", 0) \
M(UInt64, max_number_of_merges_with_ttl_in_pool, 2, "When there is more than specified number of merges with TTL entries in pool, do not assign new merge with TTL. This is to leave free threads for regular merges and avoid \"Too many parts\"", 0) \
- M(Seconds, old_parts_lifetime, 8 * 60, "How many seconds to keep obsolete parts.", 0) \
+ M(Seconds, old_parts_lifetime, 1, "How many seconds to keep obsolete parts.", 0) \
M(Seconds, temporary_directories_lifetime, 86400, "How many seconds to keep tmp_-directories. You should not lower this value because merges and mutations may not be able to work with low value of this setting.", 0) \
M(Seconds, lock_acquire_timeout_for_background_operations, DBMS_DEFAULT_LOCK_ACQUIRE_TIMEOUT_SEC, "For background operations like merges, mutations etc. How many seconds before failing to acquire table locks.", 0) \
M(UInt64, min_rows_to_fsync_after_merge, 0, "Minimal number of rows to do fsync for part after merge (0 - disabled)", 0) \
@@ -100,8 +100,8 @@ struct Settings;
\
/** Check delay of replicas settings. */ \
M(UInt64, min_relative_delay_to_measure, 120, "Calculate relative replica delay only if absolute delay is not less that this value.", 0) \
- M(UInt64, cleanup_delay_period, 30, "Period to clean old queue logs, blocks hashes and parts.", 0) \
- M(UInt64, cleanup_delay_period_random_add, 10, "Add uniformly distributed value from 0 to x seconds to cleanup_delay_period to avoid thundering herd effect and subsequent DoS of ZooKeeper in case of very large number of tables.", 0) \
+ M(UInt64, cleanup_delay_period, 1, "Period to clean old queue logs, blocks hashes and parts.", 0) \
+ M(UInt64, cleanup_delay_period_random_add, 1, "Add uniformly distributed value from 0 to x seconds to cleanup_delay_period to avoid thundering herd effect and subsequent DoS of ZooKeeper in case of very large number of tables.", 0) \
M(UInt64, min_relative_delay_to_close, 300, "Minimal delay from other replicas to close, stop serving requests and not return Ok during status check.", 0) \
M(UInt64, min_absolute_delay_to_close, 0, "Minimal absolute delay to close, stop serving requests and not return Ok during status check.", 0) \
M(UInt64, enable_vertical_merge_algorithm, 1, "Enable usage of Vertical merge algorithm.", 0) \
From c14e2e0b96dc42ad780006994c6b472502d6828e Mon Sep 17 00:00:00 2001
From: alesapin
Date: Wed, 20 Apr 2022 21:08:26 +0200
Subject: [PATCH 33/55] Fix more
---
src/Disks/IDiskRemote.cpp | 2 ++
src/Storages/MergeTree/MergeTreeData.cpp | 1 +
src/Storages/MergeTree/MergeTreeData.h | 4 ++++
src/Storages/MergeTree/MutateTask.cpp | 1 +
src/Storages/MergeTree/MutateTask.h | 2 +-
src/Storages/StorageReplicatedMergeTree.cpp | 2 +-
src/Storages/StorageReplicatedMergeTree.h | 2 +-
7 files changed, 11 insertions(+), 3 deletions(-)
diff --git a/src/Disks/IDiskRemote.cpp b/src/Disks/IDiskRemote.cpp
index 80f1a3ff116..cd8e999103d 100644
--- a/src/Disks/IDiskRemote.cpp
+++ b/src/Disks/IDiskRemote.cpp
@@ -65,6 +65,7 @@ IDiskRemote::Metadata IDiskRemote::Metadata::createUpdateAndStoreMetadata(const
IDiskRemote::Metadata IDiskRemote::Metadata::readUpdateStoreMetadataAndRemove(const String & remote_fs_root_path_, DiskPtr metadata_disk_, const String & metadata_file_path_, bool sync, IDiskRemote::MetadataUpdater updater)
{
Metadata result(remote_fs_root_path_, metadata_disk_, metadata_file_path_);
+ result.load();
if (updater(result))
result.save(sync);
metadata_disk_->removeFile(metadata_file_path_);
@@ -303,6 +304,7 @@ void IDiskRemote::removeMetadata(const String & path, std::vector & path
{
for (const auto & [remote_fs_object_path, _] : metadata.remote_fs_objects)
{
+
paths_to_remove.push_back(remote_fs_root_path + remote_fs_object_path);
if (cache)
diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp
index e5403574741..595a3fffa1b 100644
--- a/src/Storages/MergeTree/MergeTreeData.cpp
+++ b/src/Storages/MergeTree/MergeTreeData.cpp
@@ -5786,6 +5786,7 @@ MergeTreeData::MutableDataPartPtr MergeTreeData::cloneAndLoadDataPartOnSameDisk(
if (hardlinked_files)
{
hardlinked_files->source_part_name = src_part->name;
+ hardlinked_files->source_table_shared_id = src_part->storage.getTableSharedID();
for (auto it = disk->iterateDirectory(src_part_path); it->isValid(); it->next())
{
diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h
index 283a719f26e..fcc23c9c2ef 100644
--- a/src/Storages/MergeTree/MergeTreeData.h
+++ b/src/Storages/MergeTree/MergeTreeData.h
@@ -771,6 +771,8 @@ public:
struct HardlinkedFiles
{
+ /// Shared table uuid where hardlinks live
+ std::string source_table_shared_id;
/// Hardlinked from part
std::string source_part_name;
/// Hardlinked files list
@@ -964,6 +966,8 @@ public:
/// Remove local files and remote files if needed
virtual bool removeDetachedPart(DiskPtr disk, const String & path, const String & part_name, bool is_freezed);
+ virtual String getTableSharedID() const { return ""; }
+
/// Store metadata for replicated tables
/// Do nothing for non-replicated tables
virtual void createAndStoreFreezeMetadata(DiskPtr disk, DataPartPtr part, String backup_part_path) const;
diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp
index 5e7cc8ff171..f449e5e3c04 100644
--- a/src/Storages/MergeTree/MutateTask.cpp
+++ b/src/Storages/MergeTree/MutateTask.cpp
@@ -1122,6 +1122,7 @@ private:
/// Tracking of hardlinked files required for zero-copy replication.
/// We don't remove them when we delete last copy of source part because
/// new part can use them.
+ ctx->hardlinked_files.source_table_shared_id = ctx->source_part->storage.getTableSharedID();
ctx->hardlinked_files.source_part_name = ctx->source_part->name;
ctx->hardlinked_files.hardlinks_from_source_part = hardlinked_files;
diff --git a/src/Storages/MergeTree/MutateTask.h b/src/Storages/MergeTree/MutateTask.h
index 40447cc63c8..f4848dac3b3 100644
--- a/src/Storages/MergeTree/MutateTask.h
+++ b/src/Storages/MergeTree/MutateTask.h
@@ -13,7 +13,7 @@ namespace DB
class MutateTask;
-using MutateTaskPtr = std::shared_ptr;\
+using MutateTaskPtr = std::shared_ptr;
class MergeTreeDataMergerMutator;
diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp
index 858841a4986..49eae770eb9 100644
--- a/src/Storages/StorageReplicatedMergeTree.cpp
+++ b/src/Storages/StorageReplicatedMergeTree.cpp
@@ -7397,7 +7397,7 @@ void StorageReplicatedMergeTree::lockSharedData(const IMergeTreeDataPart & part,
if (hardlinked_files.has_value() && !hardlinked_files->hardlinks_from_source_part.empty())
{
path_to_set_hardlinked_files = getZeroCopyPartPath(
- *getSettings(), disk->getType(), getTableSharedID(),
+ *getSettings(), disk->getType(), hardlinked_files->source_table_shared_id,
hardlinked_files->source_part_name, zookeeper_path)[0];
hardlinks = hardlinked_files->hardlinks_from_source_part;
diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h
index fb284893436..845182445a5 100644
--- a/src/Storages/StorageReplicatedMergeTree.h
+++ b/src/Storages/StorageReplicatedMergeTree.h
@@ -283,7 +283,7 @@ public:
String getZooKeeperName() const { return zookeeper_name; }
// Return table id, common for different replicas
- String getTableSharedID() const;
+ String getTableSharedID() const override;
static String getDefaultZooKeeperName() { return default_zookeeper_name; }
From cbfe00bb47a4069baf86b4d98f58ff776fce17be Mon Sep 17 00:00:00 2001
From: alesapin
Date: Thu, 21 Apr 2022 00:30:13 +0200
Subject: [PATCH 34/55] Fix tidy
---
src/Storages/MergeTree/DataPartsExchange.cpp | 2 +-
.../MergeTree/ReplicatedMergeTreeSink.cpp | 2 +-
src/Storages/StorageReplicatedMergeTree.h | 2 +-
...71_replace_partition_many_tables.reference | 5 ++
.../02271_replace_partition_many_tables.sql | 82 +++++++++++++++++++
5 files changed, 90 insertions(+), 3 deletions(-)
create mode 100644 tests/queries/0_stateless/02271_replace_partition_many_tables.reference
create mode 100644 tests/queries/0_stateless/02271_replace_partition_many_tables.sql
diff --git a/src/Storages/MergeTree/DataPartsExchange.cpp b/src/Storages/MergeTree/DataPartsExchange.cpp
index 166314ab7c2..dd36a45f829 100644
--- a/src/Storages/MergeTree/DataPartsExchange.cpp
+++ b/src/Storages/MergeTree/DataPartsExchange.cpp
@@ -850,7 +850,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::downloadPartToDiskRemoteMeta(
new_data_part->modification_time = time(nullptr);
new_data_part->loadColumnsChecksumsIndexes(true, false);
- data.lockSharedData(*new_data_part, /* replace_existing_lock = */ true);
+ data.lockSharedData(*new_data_part, /* replace_existing_lock = */ true, {});
LOG_DEBUG(log, "Download of part {} unique id {} metadata onto disk {} finished.",
part_name, part_id, disk->getName());
diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp
index 96f829ac96c..c96c180b83d 100644
--- a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp
+++ b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp
@@ -497,7 +497,7 @@ void ReplicatedMergeTreeSink::commitPart(
part->name);
}
- storage.lockSharedData(*part, false);
+ storage.lockSharedData(*part, false, {});
Coordination::Responses responses;
Coordination::Error multi_code = zookeeper->tryMultiNoThrow(ops, responses); /// 1 RTT
diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h
index 845182445a5..ff389e6f3a3 100644
--- a/src/Storages/StorageReplicatedMergeTree.h
+++ b/src/Storages/StorageReplicatedMergeTree.h
@@ -236,7 +236,7 @@ public:
bool executeFetchShared(const String & source_replica, const String & new_part_name, const DiskPtr & disk, const String & path);
/// Lock part in zookeeper for use shared data in several nodes
- void lockSharedData(const IMergeTreeDataPart & part, bool replace_existing_lock, std::optional hardlinked_files = {}) const override;
+ void lockSharedData(const IMergeTreeDataPart & part, bool replace_existing_lock, std::optional hardlinked_files) const override;
void lockSharedDataTemporary(const String & part_name, const String & part_id, const DiskPtr & disk) const;
diff --git a/tests/queries/0_stateless/02271_replace_partition_many_tables.reference b/tests/queries/0_stateless/02271_replace_partition_many_tables.reference
new file mode 100644
index 00000000000..627e1097cda
--- /dev/null
+++ b/tests/queries/0_stateless/02271_replace_partition_many_tables.reference
@@ -0,0 +1,5 @@
+1
+1
+1
+1
+1
diff --git a/tests/queries/0_stateless/02271_replace_partition_many_tables.sql b/tests/queries/0_stateless/02271_replace_partition_many_tables.sql
new file mode 100644
index 00000000000..a31a2bee58a
--- /dev/null
+++ b/tests/queries/0_stateless/02271_replace_partition_many_tables.sql
@@ -0,0 +1,82 @@
+DROP TABLE IF EXISTS replace_partition_source;
+DROP TABLE IF EXISTS replace_partition_dest1;
+DROP TABLE IF EXISTS replace_partition_dest1_2;
+DROP TABLE IF EXISTS replace_partition_dest2;
+DROP TABLE IF EXISTS replace_partition_dest2_2;
+
+CREATE TABLE replace_partition_source
+(
+ key UInt64
+)
+ENGINE = ReplicatedMergeTree('/test/02271_replace_partition_many/{database}/source', '1')
+PARTITION BY key
+ORDER BY tuple();
+
+INSERT INTO replace_partition_source VALUES (1);
+
+CREATE TABLE replace_partition_dest1
+(
+ key UInt64
+)
+ENGINE = ReplicatedMergeTree('/test/02271_replace_partition_many/{database}/dest1', '1')
+PARTITION BY key
+ORDER BY tuple();
+
+CREATE TABLE replace_partition_dest1_2
+(
+ key UInt64
+)
+ENGINE = ReplicatedMergeTree('/test/02271_replace_partition_many/{database}/dest1', '2')
+PARTITION BY key
+ORDER BY tuple();
+
+
+CREATE TABLE replace_partition_dest2
+(
+ key UInt64
+)
+ENGINE = ReplicatedMergeTree('/test/02271_replace_partition_many/{database}/dest2', '1')
+PARTITION BY key
+ORDER BY tuple();
+
+CREATE TABLE replace_partition_dest2_2
+(
+ key UInt64
+)
+ENGINE = ReplicatedMergeTree('/test/02271_replace_partition_many/{database}/dest2', '2')
+PARTITION BY key
+ORDER BY tuple();
+
+
+ALTER TABLE replace_partition_dest1 REPLACE PARTITION 1 FROM replace_partition_source;
+ALTER TABLE replace_partition_dest2 REPLACE PARTITION 1 FROM replace_partition_source;
+
+OPTIMIZE TABLE replace_partition_source FINAL;
+
+SELECT sleep(3) FORMAT Null;
+SELECT sleep(3) FORMAT Null;
+SELECT sleep(3) FORMAT Null;
+SELECT sleep(3) FORMAT Null;
+SELECT sleep(3) FORMAT Null;
+
+OPTIMIZE TABLE replace_partition_dest1_2 FINAL;
+OPTIMIZE TABLE replace_partition_dest2_2 FINAL;
+
+SELECT sleep(3) FORMAT Null;
+SELECT sleep(3) FORMAT Null;
+SELECT sleep(3) FORMAT Null;
+SELECT sleep(3) FORMAT Null;
+SELECT sleep(3) FORMAT Null;
+
+SELECT * FROM replace_partition_source;
+SELECT * FROM replace_partition_dest1;
+SELECT * FROM replace_partition_dest2;
+SELECT * FROM replace_partition_dest1_2;
+SELECT * FROM replace_partition_dest2_2;
+
+
+--DROP TABLE IF EXISTS replace_partition_source;
+--DROP TABLE IF EXISTS replace_partition_dest1;
+--DROP TABLE IF EXISTS replace_partition_dest1_2;
+--DROP TABLE IF EXISTS replace_partition_dest2;
+--DROP TABLE IF EXISTS replace_partition_dest2_2;
From 397603e9be288f87cc13041298ff8781d423cd08 Mon Sep 17 00:00:00 2001
From: Maksim Kita
Date: Thu, 21 Apr 2022 13:44:56 +0200
Subject: [PATCH 35/55] ExecutableUserDefinedFunction fix usage in GROUP BY
---
src/Interpreters/TreeOptimizer.cpp | 15 ++++++++++++---
...table_user_defined_function_group_by.reference | 10 ++++++++++
..._executable_user_defined_function_group_by.sql | 1 +
3 files changed, 23 insertions(+), 3 deletions(-)
create mode 100644 tests/queries/0_stateless/02285_executable_user_defined_function_group_by.reference
create mode 100644 tests/queries/0_stateless/02285_executable_user_defined_function_group_by.sql
diff --git a/src/Interpreters/TreeOptimizer.cpp b/src/Interpreters/TreeOptimizer.cpp
index badeb3e4ee2..cb0fdc5b162 100644
--- a/src/Interpreters/TreeOptimizer.cpp
+++ b/src/Interpreters/TreeOptimizer.cpp
@@ -21,6 +21,7 @@
#include
#include
#include
+#include
#include
#include
@@ -138,10 +139,18 @@ void optimizeGroupBy(ASTSelectQuery * select_query, ContextPtr context)
continue;
}
}
- else if (!function_factory.get(function->name, context)->isInjective({}))
+ else
{
- ++i;
- continue;
+ FunctionOverloadResolverPtr function_builder = UserDefinedExecutableFunctionFactory::instance().tryGet(function->name, context);
+
+ if (!function_builder)
+ function_builder = function_factory.get(function->name, context);
+
+ if (function_builder->isInjective({}))
+ {
+ ++i;
+ continue;
+ }
}
/// copy shared pointer to args in order to ensure lifetime
diff --git a/tests/queries/0_stateless/02285_executable_user_defined_function_group_by.reference b/tests/queries/0_stateless/02285_executable_user_defined_function_group_by.reference
new file mode 100644
index 00000000000..1c59de1a03b
--- /dev/null
+++ b/tests/queries/0_stateless/02285_executable_user_defined_function_group_by.reference
@@ -0,0 +1,10 @@
+0
+2
+4
+6
+8
+10
+12
+14
+16
+18
diff --git a/tests/queries/0_stateless/02285_executable_user_defined_function_group_by.sql b/tests/queries/0_stateless/02285_executable_user_defined_function_group_by.sql
new file mode 100644
index 00000000000..96db8c088dc
--- /dev/null
+++ b/tests/queries/0_stateless/02285_executable_user_defined_function_group_by.sql
@@ -0,0 +1 @@
+SELECT test_function(number, number) as a FROM numbers(10) GROUP BY a ORDER BY a;
From 5465415751741cf388c61dbb623fc06281a0a08b Mon Sep 17 00:00:00 2001
From: alesapin
Date: Thu, 21 Apr 2022 14:39:12 +0200
Subject: [PATCH 36/55] Fix replace/move partition with zero copy replication
---
src/Disks/DiskDecorator.cpp | 5 +++
src/Disks/DiskDecorator.h | 1 +
src/Disks/DiskEncrypted.cpp | 30 ++++++++++++++
src/Disks/DiskEncrypted.h | 2 +
src/Disks/DiskLocal.cpp | 8 ++++
src/Disks/DiskLocal.h | 2 +
src/Disks/DiskRestartProxy.cpp | 6 +++
src/Disks/DiskRestartProxy.h | 1 +
src/Disks/IDisk.cpp | 40 ++++++++++++-------
src/Disks/IDisk.h | 7 +++-
src/Storages/MergeTree/IMergeTreeDataPart.h | 2 +
src/Storages/MergeTree/MergeTreeData.cpp | 15 +++++--
src/Storages/MergeTree/MergeTreeData.h | 6 ++-
.../MergeTree/MergeTreeDataPartCompact.cpp | 5 +++
.../MergeTree/MergeTreeDataPartCompact.h | 2 +
.../MergeTree/MergeTreeDataPartInMemory.h | 1 +
.../MergeTree/MergeTreeDataPartWide.cpp | 5 +++
.../MergeTree/MergeTreeDataPartWide.h | 2 +
src/Storages/MergeTree/MutateTask.cpp | 4 +-
src/Storages/MergeTree/localBackup.cpp | 20 +++++++---
src/Storages/MergeTree/localBackup.h | 2 +-
src/Storages/StorageMergeTree.cpp | 4 +-
src/Storages/StorageReplicatedMergeTree.cpp | 28 ++++++++++---
23 files changed, 161 insertions(+), 37 deletions(-)
diff --git a/src/Disks/DiskDecorator.cpp b/src/Disks/DiskDecorator.cpp
index 47a6bd92c3a..80cfc23d210 100644
--- a/src/Disks/DiskDecorator.cpp
+++ b/src/Disks/DiskDecorator.cpp
@@ -108,6 +108,11 @@ void DiskDecorator::copy(const String & from_path, const std::shared_ptr
delegate->copy(from_path, to_disk, to_path);
}
+void DiskDecorator::copyDirectoryContent(const String & from_dir, const std::shared_ptr & to_disk, const String & to_dir)
+{
+ delegate->copyDirectoryContent(from_dir, to_disk, to_dir);
+}
+
void DiskDecorator::listFiles(const String & path, std::vector & file_names)
{
delegate->listFiles(path, file_names);
diff --git a/src/Disks/DiskDecorator.h b/src/Disks/DiskDecorator.h
index 27ddb001a83..d707eb3e51d 100644
--- a/src/Disks/DiskDecorator.h
+++ b/src/Disks/DiskDecorator.h
@@ -33,6 +33,7 @@ public:
void moveFile(const String & from_path, const String & to_path) override;
void replaceFile(const String & from_path, const String & to_path) override;
void copy(const String & from_path, const std::shared_ptr & to_disk, const String & to_path) override;
+ void copyDirectoryContent(const String & from_dir, const std::shared_ptr & to_disk, const String & to_dir) override;
void listFiles(const String & path, std::vector & file_names) override;
std::unique_ptr readFile(
diff --git a/src/Disks/DiskEncrypted.cpp b/src/Disks/DiskEncrypted.cpp
index 3cee205fafc..4ac59af95ab 100644
--- a/src/Disks/DiskEncrypted.cpp
+++ b/src/Disks/DiskEncrypted.cpp
@@ -249,6 +249,36 @@ void DiskEncrypted::copy(const String & from_path, const std::shared_ptr
copyThroughBuffers(from_path, to_disk, to_path);
}
+
+void DiskEncrypted::copyDirectoryContent(const String & from_dir, const std::shared_ptr & to_disk, const String & to_dir)
+{
+ /// Check if we can copy the file without deciphering.
+ if (isSameDiskType(*this, *to_disk))
+ {
+ /// Disk type is the same, check if the key is the same too.
+ if (auto * to_disk_enc = typeid_cast(to_disk.get()))
+ {
+ auto from_settings = current_settings.get();
+ auto to_settings = to_disk_enc->current_settings.get();
+ if (from_settings->keys == to_settings->keys)
+ {
+ /// Keys are the same so we can simply copy the encrypted file.
+ auto wrapped_from_path = wrappedPath(from_dir);
+ auto to_delegate = to_disk_enc->delegate;
+ auto wrapped_to_path = to_disk_enc->wrappedPath(to_dir);
+ delegate->copyDirectoryContent(wrapped_from_path, to_delegate, wrapped_to_path);
+ return;
+ }
+ }
+ }
+
+ if (!to_disk->exists(to_dir))
+ to_disk->createDirectories(to_dir);
+
+ /// Copy the file through buffers with deciphering.
+ copyThroughBuffers(from_dir, to_disk, to_dir);
+}
+
std::unique_ptr DiskEncrypted::readFile(
const String & path,
const ReadSettings & settings,
diff --git a/src/Disks/DiskEncrypted.h b/src/Disks/DiskEncrypted.h
index 2f47dda59b3..14793818f07 100644
--- a/src/Disks/DiskEncrypted.h
+++ b/src/Disks/DiskEncrypted.h
@@ -117,6 +117,8 @@ public:
void copy(const String & from_path, const std::shared_ptr & to_disk, const String & to_path) override;
+ void copyDirectoryContent(const String & from_dir, const std::shared_ptr & to_disk, const String & to_dir) override;
+
std::unique_ptr readFile(
const String & path,
const ReadSettings & settings,
diff --git a/src/Disks/DiskLocal.cpp b/src/Disks/DiskLocal.cpp
index d81782a8af1..29277d0cb4e 100644
--- a/src/Disks/DiskLocal.cpp
+++ b/src/Disks/DiskLocal.cpp
@@ -440,6 +440,14 @@ void DiskLocal::copy(const String & from_path, const std::shared_ptr & to
copyThroughBuffers(from_path, to_disk, to_path); /// Base implementation.
}
+void DiskLocal::copyDirectoryContent(const String & from_dir, const std::shared_ptr & to_disk, const String & to_dir)
+{
+ if (isSameDiskType(*this, *to_disk))
+ fs::copy(from_dir, to_dir, fs::copy_options::recursive | fs::copy_options::overwrite_existing); /// Use more optimal way.
+ else
+ copyThroughBuffers(from_dir, to_disk, to_dir); /// Base implementation.
+}
+
SyncGuardPtr DiskLocal::getDirectorySyncGuard(const String & path) const
{
return std::make_unique(fs::path(disk_path) / path);
diff --git a/src/Disks/DiskLocal.h b/src/Disks/DiskLocal.h
index 59dcf5e5c13..14ce8831c50 100644
--- a/src/Disks/DiskLocal.h
+++ b/src/Disks/DiskLocal.h
@@ -68,6 +68,8 @@ public:
void copy(const String & from_path, const std::shared_ptr