From 1cbfaf6f0d2136b63e59892f2883cddfdbdd7c7b Mon Sep 17 00:00:00 2001
From: yariks5s
Date: Wed, 14 Feb 2024 23:06:04 +0000
Subject: [PATCH 01/19] init

---
 src/Interpreters/convertFieldToType.cpp       | 10 ++++
 ..._fix_datetime64_scale_conversion.reference | 33 +++++++++++++
 .../02997_fix_datetime64_scale_conversion.sql | 47 +++++++++++++++++++
 3 files changed, 90 insertions(+)
 create mode 100644 tests/queries/0_stateless/02997_fix_datetime64_scale_conversion.reference
 create mode 100644 tests/queries/0_stateless/02997_fix_datetime64_scale_conversion.sql

diff --git a/src/Interpreters/convertFieldToType.cpp b/src/Interpreters/convertFieldToType.cpp
index c3b8405659a..f26d1818ff6 100644
--- a/src/Interpreters/convertFieldToType.cpp
+++ b/src/Interpreters/convertFieldToType.cpp
@@ -251,6 +251,16 @@ Field convertFieldToTypeImpl(const Field & src, const IDataType & type, const ID
 
     if (which_type.isDateTime64() && src.getType() == Field::Types::Decimal64)
     {
+        const int scale_diff = static_cast<const DataTypeDateTime64 &>(type).getScale() - src.get<DecimalField<DateTime64>>().getScale();
+        /// in case if we need to make DateTime64(a) from DateTime64(b), we need to convert datetime value to the right scale
+        if (scale_diff != 0)
+        {
+            const auto & date_time64_type = static_cast<const DataTypeDateTime64 &>(type);
+            const UInt64 value = scale_diff > 0 ? src.get<DecimalField<DateTime64>>().getValue().value * UInt64(pow(10, scale_diff)) : src.get<DecimalField<DateTime64>>().getValue().value / UInt64(pow(10, -scale_diff));
+            return DecimalField(
+                DecimalUtils::decimalFromComponentsWithMultiplier<DateTime64>(value, 0, 1),
+                date_time64_type.getScale());
+        }
         /// Already in needed type.
         return src;
     }
diff --git a/tests/queries/0_stateless/02997_fix_datetime64_scale_conversion.reference b/tests/queries/0_stateless/02997_fix_datetime64_scale_conversion.reference
new file mode 100644
index 00000000000..cb9ed94f7a6
--- /dev/null
+++ b/tests/queries/0_stateless/02997_fix_datetime64_scale_conversion.reference
@@ -0,0 +1,33 @@
+2023-01-01 00:00:00.00
+2023-01-01 01:01:01.00
+2023-01-02 02:02:02.00
+2023-01-03 03:03:03.00
+2023-01-04 04:04:04.00
+2023-01-05 05:05:05.00
+2023-01-06 06:06:06.00
+2023-01-07 07:07:07.00
+2023-01-08 08:08:08.00
+2023-01-09 09:09:09.00
+2023-01-09 09:09:09.12
+2023-01-01 00:00:00.000
+2023-01-01 01:01:01.000
+2023-01-02 02:02:02.000
+2023-01-03 03:03:03.000
+2023-01-04 04:04:04.000
+2023-01-05 05:05:05.000
+2023-01-06 06:06:06.000
+2023-01-07 07:07:07.000
+2023-01-08 08:08:08.000
+2023-01-09 09:09:09.000
+2023-01-09 09:09:09.123
+2023-01-01 00:00:00.000000
+2023-01-01 01:01:01.000000
+2023-01-02 02:02:02.000000
+2023-01-03 03:03:03.000000
+2023-01-04 04:04:04.000000
+2023-01-05 05:05:05.000000
+2023-01-06 06:06:06.000000
+2023-01-07 07:07:07.000000
+2023-01-08 08:08:08.000000
+2023-01-09 09:09:09.000000
+2023-01-09 09:09:09.123456
diff --git a/tests/queries/0_stateless/02997_fix_datetime64_scale_conversion.sql b/tests/queries/0_stateless/02997_fix_datetime64_scale_conversion.sql
new file mode 100644
index 00000000000..de468de87ae
--- /dev/null
+++ b/tests/queries/0_stateless/02997_fix_datetime64_scale_conversion.sql
@@ -0,0 +1,47 @@
+DROP TABLE IF EXISTS test_2;
+CREATE TABLE IF NOT EXISTS test_2 (a DateTime64(2)) engine = MergeTree order by a;
+INSERT INTO test_2 VALUES (toDateTime64('2023-01-01 00:00:00', 1));
+INSERT INTO test_2 VALUES (toDateTime64('2023-01-01 01:01:01', 1));
+INSERT INTO test_2 VALUES (toDateTime64('2023-01-02 02:02:02', 2));
+INSERT INTO test_2 VALUES (toDateTime64('2023-01-03 03:03:03', 3));
+INSERT INTO test_2 VALUES (toDateTime64('2023-01-04 04:04:04', 4));
+INSERT INTO test_2 VALUES (toDateTime64('2023-01-05 05:05:05', 5));
+INSERT INTO test_2 VALUES (toDateTime64('2023-01-06 06:06:06', 6));
+INSERT INTO test_2 VALUES (toDateTime64('2023-01-07 07:07:07', 7));
+INSERT INTO test_2 VALUES (toDateTime64('2023-01-08 08:08:08', 8));
+INSERT INTO test_2 VALUES (toDateTime64('2023-01-09 09:09:09', 9));
+INSERT INTO test_2 VALUES (toDateTime64('2023-01-09 09:09:09.123456789', 9));
+SELECT * FROM test_2 ORDER BY a;
+DROP TABLE test_2;
+
+DROP TABLE IF EXISTS test_3;
+CREATE TABLE IF NOT EXISTS test_3 (a DateTime64(3)) engine = MergeTree order by a;
+INSERT INTO test_3 VALUES (toDateTime64('2023-01-01 00:00:00', 1));
+INSERT INTO test_3 VALUES (toDateTime64('2023-01-01 01:01:01', 1));
+INSERT INTO test_3 VALUES (toDateTime64('2023-01-02 02:02:02', 2));
+INSERT INTO test_3 VALUES (toDateTime64('2023-01-03 03:03:03', 3));
+INSERT INTO test_3 VALUES (toDateTime64('2023-01-04 04:04:04', 4));
+INSERT INTO test_3 VALUES (toDateTime64('2023-01-05 05:05:05', 5));
+INSERT INTO test_3 VALUES (toDateTime64('2023-01-06 06:06:06', 6));
+INSERT INTO test_3 VALUES (toDateTime64('2023-01-07 07:07:07', 7));
+INSERT INTO test_3 VALUES (toDateTime64('2023-01-08 08:08:08', 8));
+INSERT INTO test_3 VALUES (toDateTime64('2023-01-09 09:09:09', 9));
+INSERT INTO test_3 VALUES (toDateTime64('2023-01-09 09:09:09.123456789', 9));
+SELECT * FROM test_3 ORDER BY a;
+DROP TABLE test_3;
+
+DROP TABLE IF EXISTS test_6;
+CREATE TABLE IF NOT EXISTS test_6 (a DateTime64(6)) engine = MergeTree order by a;
+INSERT INTO test_6 VALUES (toDateTime64('2023-01-01 00:00:00', 1));
+INSERT INTO test_6 VALUES (toDateTime64('2023-01-01 01:01:01', 1));
+INSERT INTO test_6 VALUES (toDateTime64('2023-01-02 02:02:02', 2));
+INSERT INTO test_6 VALUES (toDateTime64('2023-01-03 03:03:03', 3));
+INSERT INTO test_6 VALUES (toDateTime64('2023-01-04 04:04:04', 4));
+INSERT INTO test_6 VALUES (toDateTime64('2023-01-05 05:05:05', 5));
+INSERT INTO test_6 VALUES (toDateTime64('2023-01-06 06:06:06', 6));
+INSERT INTO test_6 VALUES (toDateTime64('2023-01-07 07:07:07', 7));
+INSERT INTO test_6 VALUES (toDateTime64('2023-01-08 08:08:08', 8));
+INSERT INTO test_6 VALUES (toDateTime64('2023-01-09 09:09:09', 9));
+INSERT INTO test_6 VALUES (toDateTime64('2023-01-09 09:09:09.123456789', 9));
+SELECT * FROM test_6 ORDER BY a;
+DROP TABLE test_6;
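The hunk above is the core of the first commit: when a `DateTime64` literal arrives with a scale different from the target column's, its raw tick count is now multiplied or divided by the appropriate power of ten instead of being stored verbatim. A hedged SQL sketch of the behaviour the new 02997 test exercises (the table name is illustrative, not part of the patch):

```sql
-- A scale-1 literal stored into a DateTime64(2) column: the patch multiplies
-- the underlying tick count by 10, so the timestamp is preserved.
CREATE TABLE t (a DateTime64(2)) ENGINE = MergeTree ORDER BY a;
INSERT INTO t VALUES (toDateTime64('2023-01-01 00:00:00', 1));
SELECT * FROM t; -- expected: 2023-01-01 00:00:00.00
DROP TABLE t;
```

Note that this first version still computes the factor with floating-point `pow(10, scale_diff)`; later commits in the series replace that with exact integer scale multipliers.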
From 903d73526d4ddc8ef2fbf127b49945104f9f3e35 Mon Sep 17 00:00:00 2001
From: yariks5s
Date: Thu, 15 Feb 2024 15:26:24 +0000
Subject: [PATCH 02/19] fix test

---
 .../02346_non_negative_derivative.reference   | 54 +++++++++----------
 .../02346_non_negative_derivative.sql         | 34 ++++++------
 2 files changed, 44 insertions(+), 44 deletions(-)

diff --git a/tests/queries/0_stateless/02346_non_negative_derivative.reference b/tests/queries/0_stateless/02346_non_negative_derivative.reference
index b81af45962e..22e5f609ad7 100644
--- a/tests/queries/0_stateless/02346_non_negative_derivative.reference
+++ b/tests/queries/0_stateless/02346_non_negative_derivative.reference
@@ -1,63 +1,63 @@
 1
-1979-12-12 21:21:21.127 3.7 0
-2299-12-31 23:37:36.788 1.1 0
-2299-12-31 23:37:36.789 2.34 0
+1979-12-12 21:21:21.123 1.1 0
+1979-12-12 21:21:21.123 2.34 0
+1979-12-12 21:21:21.127 3.7 340.00000000000006
 1979-12-12 21:21:21.129 2.1 0
 1979-12-12 21:21:22.000 1.3345 0
 1979-12-12 21:21:23.000 1.54 0.20550000000000002
 1979-12-12 21:21:23.000 1.54 0
-1979-12-12 21:21:21.127 3.7 0
-2299-12-31 23:37:36.788 1.1 0
-2299-12-31 23:37:36.789 2.34 0
+1979-12-12 21:21:21.123 1.1 0
+1979-12-12 21:21:21.123 2.34 0
+1979-12-12 21:21:21.127 3.7 
0.0000010200000000000004 1979-12-12 21:21:21.129 2.1 0 1979-12-12 21:21:22.000 1.3345 0 1979-12-12 21:21:23.000 1.54 6.165000000000001e-10 1979-12-12 21:21:23.000 1.54 0 -1979-12-12 21:21:21.127 3.7 0 -2299-12-31 23:37:36.788 1.1 0 -2299-12-31 23:37:36.789 2.34 0 +1979-12-12 21:21:21.123 1.1 0 +1979-12-12 21:21:21.123 2.34 0 +1979-12-12 21:21:21.127 3.7 0.00136 1979-12-12 21:21:21.129 2.1 0 1979-12-12 21:21:22.000 1.3345 0 1979-12-12 21:21:23.000 1.54 8.22e-7 1979-12-12 21:21:23.000 1.54 0 -1979-12-12 21:21:21.127 3.7 0 -2299-12-31 23:37:36.788 1.1 0 -2299-12-31 23:37:36.789 2.34 0 +1979-12-12 21:21:21.123 1.1 0 +1979-12-12 21:21:21.123 2.34 0 +1979-12-12 21:21:21.127 3.7 1.7000000000000004 1979-12-12 21:21:21.129 2.1 0 1979-12-12 21:21:22.000 1.3345 0 1979-12-12 21:21:23.000 1.54 0.0010275000000000002 1979-12-12 21:21:23.000 1.54 0 -1979-12-12 21:21:21.127 3.7 0 -2299-12-31 23:37:36.788 1.1 0 -2299-12-31 23:37:36.789 2.34 0 +1979-12-12 21:21:21.123 1.1 0 +1979-12-12 21:21:21.123 2.34 0 +1979-12-12 21:21:21.127 3.7 2040.0000000000005 1979-12-12 21:21:21.129 2.1 0 1979-12-12 21:21:22.000 1.3345 0 1979-12-12 21:21:23.000 1.54 1.233 1979-12-12 21:21:23.000 1.54 0 -1979-12-12 21:21:21.127 3.7 0 -2299-12-31 23:37:36.788 1.1 0 -2299-12-31 23:37:36.789 2.34 0 +1979-12-12 21:21:21.123 1.1 0 +1979-12-12 21:21:21.123 2.34 0 +1979-12-12 21:21:21.127 3.7 142800.00000000003 1979-12-12 21:21:21.129 2.1 0 1979-12-12 21:21:22.000 1.3345 0 1979-12-12 21:21:23.000 1.54 86.31 1979-12-12 21:21:23.000 1.54 0 -1979-12-12 21:21:21.127 3.7 0 -2299-12-31 23:37:36.788 1.1 0 -2299-12-31 23:37:36.789 2.34 0 +1979-12-12 21:21:21.123 1.1 0 +1979-12-12 21:21:21.123 2.34 0 +1979-12-12 21:21:21.127 3.7 9792000.000000002 1979-12-12 21:21:21.129 2.1 0 1979-12-12 21:21:22.000 1.3345 0 1979-12-12 21:21:23.000 1.54 5918.400000000001 1979-12-12 21:21:23.000 1.54 0 -1979-12-12 21:21:21.127 3.7 0 -2299-12-31 23:37:36.788 1.1 0 -2299-12-31 23:37:36.789 2.34 0 +1979-12-12 21:21:21.123 1.1 0 +1979-12-12 21:21:21.123 2.34 0 +1979-12-12 21:21:21.127 3.7 264384000.00000003 1979-12-12 21:21:21.129 2.1 0 1979-12-12 21:21:22.000 1.3345 0 1979-12-12 21:21:23.000 1.54 159796.80000000002 1979-12-12 21:21:23.000 1.54 0 -1979-12-12 21:21:21.127 3.7 0 -2299-12-31 23:37:36.788 1.1 0 -2299-12-31 23:37:36.789 2.34 0 +1979-12-12 21:21:21.123 1.1 0 +1979-12-12 21:21:21.123 2.34 0 +1979-12-12 21:21:21.127 3.7 2056320000.0000002 1979-12-12 21:21:21.129 2.1 0 1979-12-12 21:21:22.000 1.3345 0 1979-12-12 21:21:23.000 1.54 1242864 diff --git a/tests/queries/0_stateless/02346_non_negative_derivative.sql b/tests/queries/0_stateless/02346_non_negative_derivative.sql index 265a8afb2cb..704241da16c 100644 --- a/tests/queries/0_stateless/02346_non_negative_derivative.sql +++ b/tests/queries/0_stateless/02346_non_negative_derivative.sql @@ -18,7 +18,7 @@ SELECT ( SELECT ts, metric, - nonNegativeDerivative(metric, ts) OVER (PARTITION BY metric ORDER BY ts ASC Rows BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS deriv + nonNegativeDerivative(metric, ts) OVER (PARTITION BY metric ORDER BY ts, metric ASC Rows BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS deriv FROM nnd LIMIT 5, 1 ) = ( @@ -29,37 +29,37 @@ SELECT ( FROM nnd LIMIT 5, 1 ); -SELECT ts, metric, nonNegativeDerivative(metric, ts) OVER (PARTITION BY id>3 ORDER BY ts ASC Rows BETWEEN 1 PRECEDING AND 1 FOLLOWING) AS deriv FROM nnd; +SELECT ts, metric, nonNegativeDerivative(metric, ts) OVER (PARTITION BY id>3 ORDER BY ts, metric ASC Rows BETWEEN 1 PRECEDING AND 1 FOLLOWING) AS deriv FROM nnd; 
-- Nanosecond -SELECT ts, metric, nonNegativeDerivative(metric, ts, INTERVAL 3 NANOSECOND) OVER (PARTITION BY id>3 ORDER BY ts ASC Rows BETWEEN 2 PRECEDING AND 2 FOLLOWING) AS deriv FROM nnd; +SELECT ts, metric, nonNegativeDerivative(metric, ts, INTERVAL 3 NANOSECOND) OVER (PARTITION BY id>3 ORDER BY ts, metric ASC Rows BETWEEN 2 PRECEDING AND 2 FOLLOWING) AS deriv FROM nnd; -- Microsecond -SELECT ts, metric, nonNegativeDerivative(metric, ts, INTERVAL 4 MICROSECOND) OVER (PARTITION BY id>3 ORDER BY ts ASC Rows BETWEEN 1 PRECEDING AND 1 FOLLOWING) AS deriv FROM nnd; +SELECT ts, metric, nonNegativeDerivative(metric, ts, INTERVAL 4 MICROSECOND) OVER (PARTITION BY id>3 ORDER BY ts, metric ASC Rows BETWEEN 1 PRECEDING AND 1 FOLLOWING) AS deriv FROM nnd; -- Millisecond -SELECT ts, metric, nonNegativeDerivative(metric, ts, INTERVAL 5 MILLISECOND) OVER (PARTITION BY id>3 ORDER BY ts ASC Rows BETWEEN 1 PRECEDING AND 1 FOLLOWING) AS deriv FROM nnd; +SELECT ts, metric, nonNegativeDerivative(metric, ts, INTERVAL 5 MILLISECOND) OVER (PARTITION BY id>3 ORDER BY ts, metric ASC Rows BETWEEN 1 PRECEDING AND 1 FOLLOWING) AS deriv FROM nnd; -- Second -SELECT ts, metric, nonNegativeDerivative(metric, ts, INTERVAL 6 SECOND) OVER (PARTITION BY id>3 ORDER BY ts ASC Rows BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING) AS deriv FROM nnd; +SELECT ts, metric, nonNegativeDerivative(metric, ts, INTERVAL 6 SECOND) OVER (PARTITION BY id>3 ORDER BY ts, metric ASC Rows BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING) AS deriv FROM nnd; -- Minute -SELECT ts, metric, nonNegativeDerivative(metric, ts, INTERVAL 7 MINUTE) OVER (PARTITION BY id>3 ORDER BY ts ASC Rows BETWEEN UNBOUNDED PRECEDING AND 2 FOLLOWING) AS deriv FROM nnd; +SELECT ts, metric, nonNegativeDerivative(metric, ts, INTERVAL 7 MINUTE) OVER (PARTITION BY id>3 ORDER BY ts, metric ASC Rows BETWEEN UNBOUNDED PRECEDING AND 2 FOLLOWING) AS deriv FROM nnd; -- Hour -SELECT ts, metric, nonNegativeDerivative(metric, ts, INTERVAL 8 HOUR) OVER (PARTITION BY id>3 ORDER BY ts ASC Rows BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS deriv FROM nnd; +SELECT ts, metric, nonNegativeDerivative(metric, ts, INTERVAL 8 HOUR) OVER (PARTITION BY id>3 ORDER BY ts, metric ASC Rows BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS deriv FROM nnd; -- Day -SELECT ts, metric, nonNegativeDerivative(metric, ts, INTERVAL 9 DAY) OVER (PARTITION BY id>3 ORDER BY ts ASC Rows BETWEEN 3 PRECEDING AND 3 FOLLOWING) AS deriv FROM nnd; +SELECT ts, metric, nonNegativeDerivative(metric, ts, INTERVAL 9 DAY) OVER (PARTITION BY id>3 ORDER BY ts, metric ASC Rows BETWEEN 3 PRECEDING AND 3 FOLLOWING) AS deriv FROM nnd; -- Week -SELECT ts, metric, nonNegativeDerivative(metric, ts, INTERVAL 10 WEEK) OVER (PARTITION BY id>3 ORDER BY ts ASC Rows BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING) AS deriv FROM nnd; +SELECT ts, metric, nonNegativeDerivative(metric, ts, INTERVAL 10 WEEK) OVER (PARTITION BY id>3 ORDER BY ts, metric ASC Rows BETWEEN 1 PRECEDING AND UNBOUNDED FOLLOWING) AS deriv FROM nnd; -- shall not work for month, quarter, year (intervals with floating number of seconds) -- Month -SELECT ts, metric, nonNegativeDerivative(metric, ts, INTERVAL 11 MONTH) OVER (PARTITION BY metric ORDER BY ts ASC Rows BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS deriv FROM nnd; -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } +SELECT ts, metric, nonNegativeDerivative(metric, ts, INTERVAL 11 MONTH) OVER (PARTITION BY metric ORDER BY ts, metric ASC Rows BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS deriv 
FROM nnd; -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } -- Quarter -SELECT ts, metric, nonNegativeDerivative(metric, ts, INTERVAL 12 QUARTER) OVER (PARTITION BY metric ORDER BY ts ASC Rows BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS deriv FROM nnd; -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } +SELECT ts, metric, nonNegativeDerivative(metric, ts, INTERVAL 12 QUARTER) OVER (PARTITION BY metric ORDER BY ts, metric ASC Rows BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS deriv FROM nnd; -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } -- Year -SELECT ts, metric, nonNegativeDerivative(metric, ts, INTERVAL 13 YEAR) OVER (PARTITION BY metric ORDER BY ts ASC Rows BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS deriv FROM nnd; -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } +SELECT ts, metric, nonNegativeDerivative(metric, ts, INTERVAL 13 YEAR) OVER (PARTITION BY metric ORDER BY ts, metric ASC Rows BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS deriv FROM nnd; -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } -- test against wrong arguments/types -SELECT ts, metric, nonNegativeDerivative(metric, 1, INTERVAL 3 NANOSECOND) OVER (PARTITION BY metric ORDER BY ts ASC Rows BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS deriv FROM nnd; -- { serverError BAD_ARGUMENTS } -SELECT ts, metric, nonNegativeDerivative('string not datetime', ts, INTERVAL 3 NANOSECOND) OVER (PARTITION BY metric ORDER BY ts ASC Rows BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS deriv FROM nnd; -- { serverError BAD_ARGUMENTS } -SELECT ts, metric, nonNegativeDerivative(metric, ts, INTERVAL 3 NANOSECOND, id) OVER (PARTITION BY metric ORDER BY ts ASC Rows BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS deriv FROM nnd; -- { serverError BAD_ARGUMENTS } -SELECT ts, metric, nonNegativeDerivative(metric) OVER (PARTITION BY metric ORDER BY ts ASC Rows BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS deriv FROM nnd; -- { serverError BAD_ARGUMENTS } +SELECT ts, metric, nonNegativeDerivative(metric, 1, INTERVAL 3 NANOSECOND) OVER (PARTITION BY metric ORDER BY ts, metric ASC Rows BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS deriv FROM nnd; -- { serverError BAD_ARGUMENTS } +SELECT ts, metric, nonNegativeDerivative('string not datetime', ts, INTERVAL 3 NANOSECOND) OVER (PARTITION BY metric ORDER BY ts, metric ASC Rows BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS deriv FROM nnd; -- { serverError BAD_ARGUMENTS } +SELECT ts, metric, nonNegativeDerivative(metric, ts, INTERVAL 3 NANOSECOND, id) OVER (PARTITION BY metric ORDER BY ts, metric ASC Rows BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS deriv FROM nnd; -- { serverError BAD_ARGUMENTS } +SELECT ts, metric, nonNegativeDerivative(metric) OVER (PARTITION BY metric ORDER BY ts, metric ASC Rows BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING) AS deriv FROM nnd; -- { serverError BAD_ARGUMENTS } -- cleanup DROP TABLE IF EXISTS nnd; From 88f06886ac7627298845e76c8ec4b1b97f180822 Mon Sep 17 00:00:00 2001 From: yariks5s Date: Fri, 16 Feb 2024 18:48:49 +0000 Subject: [PATCH 03/19] enhance tests, review fixes --- src/Interpreters/convertFieldToType.cpp | 16 ++-- ..._fix_datetime64_scale_conversion.reference | 67 +++++++++++++++ .../02997_fix_datetime64_scale_conversion.sql | 83 ++++++++++++++++++- 3 files changed, 157 insertions(+), 9 deletions(-) diff --git a/src/Interpreters/convertFieldToType.cpp b/src/Interpreters/convertFieldToType.cpp index f26d1818ff6..b23c4cda34e 100644 --- a/src/Interpreters/convertFieldToType.cpp 
+++ b/src/Interpreters/convertFieldToType.cpp
@@ -251,15 +251,19 @@ Field convertFieldToTypeImpl(const Field & src, const IDataType & type, const ID
 
     if (which_type.isDateTime64() && src.getType() == Field::Types::Decimal64)
     {
-        const int scale_diff = static_cast<const DataTypeDateTime64 &>(type).getScale() - src.get<DecimalField<DateTime64>>().getScale();
-        /// in case if we need to make DateTime64(a) from DateTime64(b), we need to convert datetime value to the right scale
-        if (scale_diff != 0)
+        const auto & from_type = src.get<DecimalField<DateTime64>>();
+        const auto & to_type = static_cast<const DataTypeDateTime64 &>(type);
+
+        const auto scale_from = from_type.getScale();
+        const auto scale_to = to_type.getScale();
+        const auto scale_diff = scale_from > scale_to ? from_type.getScaleMultiplier() / to_type.getScaleMultiplier() : to_type.getScaleMultiplier() / from_type.getScaleMultiplier();
+        /// in case if we need to make DateTime64(a) from DateTime64(b), a != b, we need to convert datetime value to the right scale
+        if (scale_diff != 1)
         {
-            const auto & date_time64_type = static_cast<const DataTypeDateTime64 &>(type);
-            const UInt64 value = scale_diff > 0 ? src.get<DecimalField<DateTime64>>().getValue().value * UInt64(pow(10, scale_diff)) : src.get<DecimalField<DateTime64>>().getValue().value / UInt64(pow(10, -scale_diff));
+            const UInt64 value = scale_from > scale_to ? from_type.getValue().value / scale_diff : from_type.getValue().value * scale_diff;
             return DecimalField(
                 DecimalUtils::decimalFromComponentsWithMultiplier<DateTime64>(value, 0, 1),
-                date_time64_type.getScale());
+                scale_to);
         }
         /// Already in needed type.
         return src;
diff --git a/tests/queries/0_stateless/02997_fix_datetime64_scale_conversion.reference b/tests/queries/0_stateless/02997_fix_datetime64_scale_conversion.reference
index cb9ed94f7a6..c4ade2ace13 100644
--- a/tests/queries/0_stateless/02997_fix_datetime64_scale_conversion.reference
+++ b/tests/queries/0_stateless/02997_fix_datetime64_scale_conversion.reference
@@ -1,33 +1,100 @@
+2023-01-01 00:00:00
+2023-01-01 00:00:00
+2023-01-01 01:01:01
+2023-01-01 01:01:01
+2023-01-02 02:02:02
+2023-01-02 02:02:02
+2023-01-03 03:03:03
+2023-01-03 03:03:03
+2023-01-04 04:04:04
+2023-01-04 04:04:04
+2023-01-05 05:05:05
+2023-01-05 05:05:05
+2023-01-06 06:06:06
+2023-01-06 06:06:06
+2023-01-07 07:07:07
+2023-01-07 07:07:07
+2023-01-08 08:08:08
+2023-01-08 08:08:08
+2023-01-09 09:09:09
+2023-01-09 09:09:09
+2023-01-01 00:00:00.00
 2023-01-01 00:00:00.00
 2023-01-01 01:01:01.00
+2023-01-01 01:01:01.10
 2023-01-02 02:02:02.00
+2023-01-02 02:02:02.12
 2023-01-03 03:03:03.00
+2023-01-03 03:03:03.12
 2023-01-04 04:04:04.00
+2023-01-04 04:04:04.12
 2023-01-05 05:05:05.00
+2023-01-05 05:05:05.12
 2023-01-06 06:06:06.00
+2023-01-06 06:06:06.12
 2023-01-07 07:07:07.00
+2023-01-07 07:07:07.12
 2023-01-08 08:08:08.00
+2023-01-08 08:08:08.12
 2023-01-09 09:09:09.00
 2023-01-09 09:09:09.12
 2023-01-01 00:00:00.000
+2023-01-01 00:00:00.000
 2023-01-01 01:01:01.000
+2023-01-01 01:01:01.100
 2023-01-02 02:02:02.000
+2023-01-02 02:02:02.120
 2023-01-03 03:03:03.000
+2023-01-03 03:03:03.123
 2023-01-04 04:04:04.000
+2023-01-04 04:04:04.123
 2023-01-05 05:05:05.000
+2023-01-05 05:05:05.123
 2023-01-06 06:06:06.000
+2023-01-06 06:06:06.123
 2023-01-07 07:07:07.000
+2023-01-07 07:07:07.123
 2023-01-08 08:08:08.000
+2023-01-08 08:08:08.123
 2023-01-09 09:09:09.000
 2023-01-09 09:09:09.123
 2023-01-01 00:00:00.000000
+2023-01-01 00:00:00.000000
 2023-01-01 01:01:01.000000
+2023-01-01 01:01:01.100000
 2023-01-02 02:02:02.000000
+2023-01-02 02:02:02.120000
 2023-01-03 03:03:03.000000
+2023-01-03 03:03:03.123000
 2023-01-04 04:04:04.000000
+2023-01-04 04:04:04.123400
 2023-01-05 05:05:05.000000
+2023-01-05 05:05:05.123450 
2023-01-06 06:06:06.000000 +2023-01-06 06:06:06.123456 2023-01-07 07:07:07.000000 +2023-01-07 07:07:07.123456 2023-01-08 08:08:08.000000 +2023-01-08 08:08:08.123456 +2023-01-09 09:09:09.000000 +2023-01-09 09:09:09.123456 +2023-01-01 00:00:00.000000 +2023-01-01 00:00:00.000000 +2023-01-01 01:01:01.000000 +2023-01-01 01:01:01.100000 +2023-01-02 02:02:02.000000 +2023-01-02 02:02:02.120000 +2023-01-03 03:03:03.000000 +2023-01-03 03:03:03.123000 +2023-01-04 04:04:04.000000 +2023-01-04 04:04:04.123400 +2023-01-05 05:05:05.000000 +2023-01-05 05:05:05.123450 +2023-01-06 06:06:06.000000 +2023-01-06 06:06:06.123456 +2023-01-07 07:07:07.000000 +2023-01-07 07:07:07.123456 +2023-01-08 08:08:08.000000 +2023-01-08 08:08:08.123456 2023-01-09 09:09:09.000000 2023-01-09 09:09:09.123456 diff --git a/tests/queries/0_stateless/02997_fix_datetime64_scale_conversion.sql b/tests/queries/0_stateless/02997_fix_datetime64_scale_conversion.sql index de468de87ae..b905ef2b972 100644 --- a/tests/queries/0_stateless/02997_fix_datetime64_scale_conversion.sql +++ b/tests/queries/0_stateless/02997_fix_datetime64_scale_conversion.sql @@ -1,14 +1,48 @@ +DROP TABLE IF EXISTS test_0; +CREATE TABLE IF NOT EXISTS test_0 (a DateTime64(0)) engine = MergeTree order by a; +INSERT INTO test_0 VALUES (toDateTime64('2023-01-01 00:00:00', 0)); +INSERT INTO test_0 VALUES (toDateTime64('2023-01-01 00:00:00.123456789', 0)); +INSERT INTO test_0 VALUES (toDateTime64('2023-01-01 01:01:01', 1)); +INSERT INTO test_0 VALUES (toDateTime64('2023-01-01 01:01:01.123456789', 1)); +INSERT INTO test_0 VALUES (toDateTime64('2023-01-02 02:02:02', 2)); +INSERT INTO test_0 VALUES (toDateTime64('2023-01-02 02:02:02.123456789', 2)); +INSERT INTO test_0 VALUES (toDateTime64('2023-01-03 03:03:03', 3)); +INSERT INTO test_0 VALUES (toDateTime64('2023-01-03 03:03:03.123456789', 3)); +INSERT INTO test_0 VALUES (toDateTime64('2023-01-04 04:04:04', 4)); +INSERT INTO test_0 VALUES (toDateTime64('2023-01-04 04:04:04.123456789', 4)); +INSERT INTO test_0 VALUES (toDateTime64('2023-01-05 05:05:05', 5)); +INSERT INTO test_0 VALUES (toDateTime64('2023-01-05 05:05:05.123456789', 5)); +INSERT INTO test_0 VALUES (toDateTime64('2023-01-06 06:06:06', 6)); +INSERT INTO test_0 VALUES (toDateTime64('2023-01-06 06:06:06.123456789', 6)); +INSERT INTO test_0 VALUES (toDateTime64('2023-01-07 07:07:07', 7)); +INSERT INTO test_0 VALUES (toDateTime64('2023-01-07 07:07:07.123456789', 7)); +INSERT INTO test_0 VALUES (toDateTime64('2023-01-08 08:08:08', 8)); +INSERT INTO test_0 VALUES (toDateTime64('2023-01-08 08:08:08.123456789', 8)); +INSERT INTO test_0 VALUES (toDateTime64('2023-01-09 09:09:09', 9)); +INSERT INTO test_0 VALUES (toDateTime64('2023-01-09 09:09:09.123456789', 9)); +SELECT * FROM test_0 ORDER BY a; +DROP TABLE test_0; + DROP TABLE IF EXISTS test_2; CREATE TABLE IF NOT EXISTS test_2 (a DateTime64(2)) engine = MergeTree order by a; -INSERT INTO test_2 VALUES (toDateTime64('2023-01-01 00:00:00', 1)); +INSERT INTO test_2 VALUES (toDateTime64('2023-01-01 00:00:00', 0)); +INSERT INTO test_2 VALUES (toDateTime64('2023-01-01 00:00:00.123456789', 0)); INSERT INTO test_2 VALUES (toDateTime64('2023-01-01 01:01:01', 1)); +INSERT INTO test_2 VALUES (toDateTime64('2023-01-01 01:01:01.123456789', 1)); INSERT INTO test_2 VALUES (toDateTime64('2023-01-02 02:02:02', 2)); +INSERT INTO test_2 VALUES (toDateTime64('2023-01-02 02:02:02.123456789', 2)); INSERT INTO test_2 VALUES (toDateTime64('2023-01-03 03:03:03', 3)); +INSERT INTO test_2 VALUES (toDateTime64('2023-01-03 03:03:03.123456789', 3)); 
INSERT INTO test_2 VALUES (toDateTime64('2023-01-04 04:04:04', 4)); +INSERT INTO test_2 VALUES (toDateTime64('2023-01-04 04:04:04.123456789', 4)); INSERT INTO test_2 VALUES (toDateTime64('2023-01-05 05:05:05', 5)); +INSERT INTO test_2 VALUES (toDateTime64('2023-01-05 05:05:05.123456789', 5)); INSERT INTO test_2 VALUES (toDateTime64('2023-01-06 06:06:06', 6)); +INSERT INTO test_2 VALUES (toDateTime64('2023-01-06 06:06:06.123456789', 6)); INSERT INTO test_2 VALUES (toDateTime64('2023-01-07 07:07:07', 7)); +INSERT INTO test_2 VALUES (toDateTime64('2023-01-07 07:07:07.123456789', 7)); INSERT INTO test_2 VALUES (toDateTime64('2023-01-08 08:08:08', 8)); +INSERT INTO test_2 VALUES (toDateTime64('2023-01-08 08:08:08.123456789', 8)); INSERT INTO test_2 VALUES (toDateTime64('2023-01-09 09:09:09', 9)); INSERT INTO test_2 VALUES (toDateTime64('2023-01-09 09:09:09.123456789', 9)); SELECT * FROM test_2 ORDER BY a; @@ -16,15 +50,24 @@ DROP TABLE test_2; DROP TABLE IF EXISTS test_3; CREATE TABLE IF NOT EXISTS test_3 (a DateTime64(3)) engine = MergeTree order by a; -INSERT INTO test_3 VALUES (toDateTime64('2023-01-01 00:00:00', 1)); +INSERT INTO test_3 VALUES (toDateTime64('2023-01-01 00:00:00', 0)); +INSERT INTO test_3 VALUES (toDateTime64('2023-01-01 00:00:00.123456789', 0)); INSERT INTO test_3 VALUES (toDateTime64('2023-01-01 01:01:01', 1)); +INSERT INTO test_3 VALUES (toDateTime64('2023-01-01 01:01:01.123456789', 1)); INSERT INTO test_3 VALUES (toDateTime64('2023-01-02 02:02:02', 2)); +INSERT INTO test_3 VALUES (toDateTime64('2023-01-02 02:02:02.123456789', 2)); INSERT INTO test_3 VALUES (toDateTime64('2023-01-03 03:03:03', 3)); +INSERT INTO test_3 VALUES (toDateTime64('2023-01-03 03:03:03.123456789', 3)); INSERT INTO test_3 VALUES (toDateTime64('2023-01-04 04:04:04', 4)); +INSERT INTO test_3 VALUES (toDateTime64('2023-01-04 04:04:04.123456789', 4)); INSERT INTO test_3 VALUES (toDateTime64('2023-01-05 05:05:05', 5)); +INSERT INTO test_3 VALUES (toDateTime64('2023-01-05 05:05:05.123456789', 5)); INSERT INTO test_3 VALUES (toDateTime64('2023-01-06 06:06:06', 6)); +INSERT INTO test_3 VALUES (toDateTime64('2023-01-06 06:06:06.123456789', 6)); INSERT INTO test_3 VALUES (toDateTime64('2023-01-07 07:07:07', 7)); +INSERT INTO test_3 VALUES (toDateTime64('2023-01-07 07:07:07.123456789', 7)); INSERT INTO test_3 VALUES (toDateTime64('2023-01-08 08:08:08', 8)); +INSERT INTO test_3 VALUES (toDateTime64('2023-01-08 08:08:08.123456789', 8)); INSERT INTO test_3 VALUES (toDateTime64('2023-01-09 09:09:09', 9)); INSERT INTO test_3 VALUES (toDateTime64('2023-01-09 09:09:09.123456789', 9)); SELECT * FROM test_3 ORDER BY a; @@ -32,16 +75,50 @@ DROP TABLE test_3; DROP TABLE IF EXISTS test_6; CREATE TABLE IF NOT EXISTS test_6 (a DateTime64(6)) engine = MergeTree order by a; -INSERT INTO test_6 VALUES (toDateTime64('2023-01-01 00:00:00', 1)); +INSERT INTO test_6 VALUES (toDateTime64('2023-01-01 00:00:00', 0)); +INSERT INTO test_6 VALUES (toDateTime64('2023-01-01 00:00:00.123456789', 0)); INSERT INTO test_6 VALUES (toDateTime64('2023-01-01 01:01:01', 1)); +INSERT INTO test_6 VALUES (toDateTime64('2023-01-01 01:01:01.123456789', 1)); INSERT INTO test_6 VALUES (toDateTime64('2023-01-02 02:02:02', 2)); +INSERT INTO test_6 VALUES (toDateTime64('2023-01-02 02:02:02.123456789', 2)); INSERT INTO test_6 VALUES (toDateTime64('2023-01-03 03:03:03', 3)); +INSERT INTO test_6 VALUES (toDateTime64('2023-01-03 03:03:03.123456789', 3)); INSERT INTO test_6 VALUES (toDateTime64('2023-01-04 04:04:04', 4)); +INSERT INTO test_6 VALUES 
(toDateTime64('2023-01-04 04:04:04.123456789', 4)); INSERT INTO test_6 VALUES (toDateTime64('2023-01-05 05:05:05', 5)); +INSERT INTO test_6 VALUES (toDateTime64('2023-01-05 05:05:05.123456789', 5)); INSERT INTO test_6 VALUES (toDateTime64('2023-01-06 06:06:06', 6)); +INSERT INTO test_6 VALUES (toDateTime64('2023-01-06 06:06:06.123456789', 6)); INSERT INTO test_6 VALUES (toDateTime64('2023-01-07 07:07:07', 7)); +INSERT INTO test_6 VALUES (toDateTime64('2023-01-07 07:07:07.123456789', 7)); INSERT INTO test_6 VALUES (toDateTime64('2023-01-08 08:08:08', 8)); +INSERT INTO test_6 VALUES (toDateTime64('2023-01-08 08:08:08.123456789', 8)); INSERT INTO test_6 VALUES (toDateTime64('2023-01-09 09:09:09', 9)); INSERT INTO test_6 VALUES (toDateTime64('2023-01-09 09:09:09.123456789', 9)); SELECT * FROM test_6 ORDER BY a; DROP TABLE test_6; + +DROP TABLE IF EXISTS test_9; +CREATE TABLE IF NOT EXISTS test_9 (a DateTime64(6)) engine = MergeTree order by a; +INSERT INTO test_9 VALUES (toDateTime64('2023-01-01 00:00:00', 0)); +INSERT INTO test_9 VALUES (toDateTime64('2023-01-01 00:00:00.123456789', 0)); +INSERT INTO test_9 VALUES (toDateTime64('2023-01-01 01:01:01', 1)); +INSERT INTO test_9 VALUES (toDateTime64('2023-01-01 01:01:01.123456789', 1)); +INSERT INTO test_9 VALUES (toDateTime64('2023-01-02 02:02:02', 2)); +INSERT INTO test_9 VALUES (toDateTime64('2023-01-02 02:02:02.123456789', 2)); +INSERT INTO test_9 VALUES (toDateTime64('2023-01-03 03:03:03', 3)); +INSERT INTO test_9 VALUES (toDateTime64('2023-01-03 03:03:03.123456789', 3)); +INSERT INTO test_9 VALUES (toDateTime64('2023-01-04 04:04:04', 4)); +INSERT INTO test_9 VALUES (toDateTime64('2023-01-04 04:04:04.123456789', 4)); +INSERT INTO test_9 VALUES (toDateTime64('2023-01-05 05:05:05', 5)); +INSERT INTO test_9 VALUES (toDateTime64('2023-01-05 05:05:05.123456789', 5)); +INSERT INTO test_9 VALUES (toDateTime64('2023-01-06 06:06:06', 6)); +INSERT INTO test_9 VALUES (toDateTime64('2023-01-06 06:06:06.123456789', 6)); +INSERT INTO test_9 VALUES (toDateTime64('2023-01-07 07:07:07', 7)); +INSERT INTO test_9 VALUES (toDateTime64('2023-01-07 07:07:07.123456789', 7)); +INSERT INTO test_9 VALUES (toDateTime64('2023-01-08 08:08:08', 8)); +INSERT INTO test_9 VALUES (toDateTime64('2023-01-08 08:08:08.123456789', 8)); +INSERT INTO test_9 VALUES (toDateTime64('2023-01-09 09:09:09', 9)); +INSERT INTO test_9 VALUES (toDateTime64('2023-01-09 09:09:09.123456789', 9)); +SELECT * FROM test_9 ORDER BY a; +DROP TABLE test_9; From 4c9bd4b58a898aece49b4d06def681d7d1b2b7f4 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Mon, 19 Feb 2024 11:32:44 +0000 Subject: [PATCH 04/19] review --- src/Interpreters/convertFieldToType.cpp | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/src/Interpreters/convertFieldToType.cpp b/src/Interpreters/convertFieldToType.cpp index b23c4cda34e..60732bf374b 100644 --- a/src/Interpreters/convertFieldToType.cpp +++ b/src/Interpreters/convertFieldToType.cpp @@ -251,22 +251,21 @@ Field convertFieldToTypeImpl(const Field & src, const IDataType & type, const ID if (which_type.isDateTime64() && src.getType() == Field::Types::Decimal64) { + if (scale_multiplier_diff == 1) /// Already in needed type. 
+            return src;
+
+        /// in case if we need to make DateTime64(a) from DateTime64(b), a != b, we need to convert datetime value to the right scale
         const auto & from_type = src.get<DecimalField<DateTime64>>();
         const auto & to_type = static_cast<const DataTypeDateTime64 &>(type);
 
         const auto scale_from = from_type.getScale();
         const auto scale_to = to_type.getScale();
-        const auto scale_multiplier_diff = scale_from > scale_to ? from_type.getScaleMultiplier() / to_type.getScaleMultiplier() : to_type.getScaleMultiplier() / from_type.getScaleMultiplier();
-        /// in case if we need to make DateTime64(a) from DateTime64(b), a != b, we need to convert datetime value to the right scale
-        if (scale_multiplier_diff != 1)
-        {
-            const UInt64 value = scale_from > scale_to ? from_type.getValue().value / scale_multiplier_diff : from_type.getValue().value * scale_multiplier_diff;
-            return DecimalField(
-                DecimalUtils::decimalFromComponentsWithMultiplier<DateTime64>(value, 0, 1),
-                scale_to);
-        }
-        /// Already in needed type.
-        return src;
+        const auto scale_multiplier_diff = scale_from > scale_to ? from_type.getScaleMultiplier() / to_type.getScaleMultiplier() : to_type.getScaleMultiplier() / from_type.getScaleMultiplier();
+
+        const UInt64 value = scale_from > scale_to ? from_type.getValue().value / scale_multiplier_diff : from_type.getValue().value * scale_multiplier_diff;
+        return DecimalField(
+            DecimalUtils::decimalFromComponentsWithMultiplier<DateTime64>(value, 0, 1),
+            scale_to);
     }
 
     /// For toDate('xxx') in 1::Int64, we CAST `src` to UInt64, which may

From e14c1b45a165263aef3ebd1111104329ae9ee101 Mon Sep 17 00:00:00 2001
From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com>
Date: Mon, 19 Feb 2024 12:47:10 +0100
Subject: [PATCH 05/19] style

---
 src/Interpreters/convertFieldToType.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Interpreters/convertFieldToType.cpp b/src/Interpreters/convertFieldToType.cpp
index 60732bf374b..36137769861 100644
--- a/src/Interpreters/convertFieldToType.cpp
+++ b/src/Interpreters/convertFieldToType.cpp
@@ -261,7 +261,7 @@ Field convertFieldToTypeImpl(const Field & src, const IDataType & type, const ID
         const auto scale_from = from_type.getScale();
         const auto scale_to = to_type.getScale();
         const auto scale_multiplier_diff = scale_from > scale_to ? from_type.getScaleMultiplier() / to_type.getScaleMultiplier() : to_type.getScaleMultiplier() / from_type.getScaleMultiplier();
-        
+
         const UInt64 value = scale_from > scale_to ? from_type.getValue().value / scale_multiplier_diff : from_type.getValue().value * scale_multiplier_diff;
         return DecimalField(
             DecimalUtils::decimalFromComponentsWithMultiplier<DateTime64>(value, 0, 1),
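Patches 04 and 05 above keep refining the same conversion: the factor now comes from `getScaleMultiplier()` (exact integer arithmetic) rather than floating-point `pow(10, ...)`, and the next commit moves the `scale_multiplier_diff == 1` early return below the variable's definition, which patch 04 had placed before it. A hedged SQL sketch of the down-scaling direction this code path handles, mirroring the 02997 test above (table name illustrative):

```sql
-- A scale-9 literal stored into a DateTime64(3) column: the tick count is
-- divided by the multiplier ratio 10^6, truncating the extra digits.
CREATE TABLE t3 (a DateTime64(3)) ENGINE = MergeTree ORDER BY a;
INSERT INTO t3 VALUES (toDateTime64('2023-01-09 09:09:09.123456789', 9));
SELECT * FROM t3; -- expected: 2023-01-09 09:09:09.123
DROP TABLE t3;
```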
From 4616ecd7fb4972cf195f5537518d6e70b9db34e7 Mon Sep 17 00:00:00 2001
From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com>
Date: Mon, 19 Feb 2024 13:10:00 +0100
Subject: [PATCH 06/19] Update convertFieldToType.cpp

---
 src/Interpreters/convertFieldToType.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/Interpreters/convertFieldToType.cpp b/src/Interpreters/convertFieldToType.cpp
index 36137769861..e906c31305b 100644
--- a/src/Interpreters/convertFieldToType.cpp
+++ b/src/Interpreters/convertFieldToType.cpp
@@ -251,10 +251,6 @@ Field convertFieldToTypeImpl(const Field & src, const IDataType & type, const ID
 
     if (which_type.isDateTime64() && src.getType() == Field::Types::Decimal64)
     {
-        if (scale_multiplier_diff == 1) /// Already in needed type.
-            return src;
-
-        /// in case if we need to make DateTime64(a) from DateTime64(b), a != b, we need to convert datetime value to the right scale
         const auto & from_type = src.get<DecimalField<DateTime64>>();
         const auto & to_type = static_cast<const DataTypeDateTime64 &>(type);
@@ -262,6 +258,10 @@ Field convertFieldToTypeImpl(const Field & src, const IDataType & type, const ID
         const auto scale_from = from_type.getScale();
         const auto scale_to = to_type.getScale();
         const auto scale_multiplier_diff = scale_from > scale_to ? from_type.getScaleMultiplier() / to_type.getScaleMultiplier() : to_type.getScaleMultiplier() / from_type.getScaleMultiplier();
+        if (scale_multiplier_diff == 1) /// Already in needed type.
+            return src;
+
+        /// in case if we need to make DateTime64(a) from DateTime64(b), a != b, we need to convert datetime value to the right scale
         const UInt64 value = scale_from > scale_to ? from_type.getValue().value / scale_multiplier_diff : from_type.getValue().value * scale_multiplier_diff;
         return DecimalField(
             DecimalUtils::decimalFromComponentsWithMultiplier<DateTime64>(value, 0, 1),
             scale_to);

From f04debe6484fce2b8d8529a81d4d1bb81dc012fd Mon Sep 17 00:00:00 2001
From: Arthur Passos
Date: Mon, 26 Feb 2024 11:10:22 -0300
Subject: [PATCH 07/19] add test that validates attach partition fails if
 structure differs because of materialized column

---
 ...attach_partition_from_different_tables.sql | 22 +++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/tests/queries/0_stateless/02888_attach_partition_from_different_tables.sql b/tests/queries/0_stateless/02888_attach_partition_from_different_tables.sql
index 98f841394e1..5dbe45a0831 100644
--- a/tests/queries/0_stateless/02888_attach_partition_from_different_tables.sql
+++ b/tests/queries/0_stateless/02888_attach_partition_from_different_tables.sql
@@ -88,3 +88,25 @@ ALTER TABLE attach_partition_t6 ATTACH PARTITION tuple() FROM attach_partition_t
 
 SELECT * FROM attach_partition_t6 WHERE b = '1';
 SELECT b, sum(a) FROM attach_partition_t6 GROUP BY b ORDER BY b;
+
+CREATE TABLE attach_partition_t7 (
+    a UInt32,
+    b UInt32
+)
+    ENGINE = MergeTree
+PARTITION BY a ORDER BY a;
+
+ALTER TABLE attach_partition_t7
+    ADD COLUMN mat_column
+    UInt32 MATERIALIZED a+b;
+
+insert into attach_partition_t7 values (1, 2);
+
+CREATE TABLE attach_partition_t8 (
+    a UInt32,
+    b UInt32
+)
+    ENGINE = MergeTree
+PARTITION BY a ORDER BY a;
+
+ALTER TABLE attach_partition_t8 ATTACH PARTITION ID '1' FROM attach_partition_t7; -- {serverError 122};

From 6a57d665a403e327acfabe0e831e359aa68f2931 Mon Sep 17 00:00:00 2001
From: Arthur Passos
Date: Tue, 27 Feb 2024 16:01:09 -0300
Subject: [PATCH 08/19] create new file for test and use alias

---
 ...ffers_due_to_materialized_column.reference |  0
 ...ure_differs_due_to_materialized_column.sql | 21 +++++++++++++++++++
 2 files changed, 21 insertions(+)
 create mode 100644 tests/queries/0_stateless/02998_attach_partition_fails_structure_differs_due_to_materialized_column.reference
 create mode 100644 tests/queries/0_stateless/02998_attach_partition_fails_structure_differs_due_to_materialized_column.sql

diff --git a/tests/queries/0_stateless/02998_attach_partition_fails_structure_differs_due_to_materialized_column.reference b/tests/queries/0_stateless/02998_attach_partition_fails_structure_differs_due_to_materialized_column.reference
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/tests/queries/0_stateless/02998_attach_partition_fails_structure_differs_due_to_materialized_column.sql b/tests/queries/0_stateless/02998_attach_partition_fails_structure_differs_due_to_materialized_column.sql
new file mode 100644
index 
00000000000..c92d71893c4 --- /dev/null +++ b/tests/queries/0_stateless/02998_attach_partition_fails_structure_differs_due_to_materialized_column.sql @@ -0,0 +1,21 @@ +CREATE TABLE attach_partition_t7 ( + a UInt32, + b UInt32 +) + ENGINE = MergeTree +PARTITION BY a ORDER BY a; + +ALTER TABLE attach_partition_t7 + ADD COLUMN mat_column + UInt32 MATERIALIZED a+b; + +insert into attach_partition_t7 values (1, 2); + +CREATE TABLE attach_partition_t8 ( + a UInt32, + b UInt32 +) + ENGINE = MergeTree +PARTITION BY a ORDER BY a; + +ALTER TABLE attach_partition_t8 ATTACH PARTITION ID '1' FROM attach_partition_t7; -- {serverError INCOMPATIBLE_COLUMNS}; From 7424b15991f92444196e60b1ad4f47ef64aac23e Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Tue, 27 Feb 2024 17:22:51 -0300 Subject: [PATCH 09/19] rename test --- ...wed_if_structure_differs_due_to_materialized_column.reference} | 0 ...t_allowed_if_structure_differs_due_to_materialized_column.sql} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename tests/queries/0_stateless/{02998_attach_partition_fails_structure_differs_due_to_materialized_column.reference => 02998_attach_partition_not_allowed_if_structure_differs_due_to_materialized_column.reference} (100%) rename tests/queries/0_stateless/{02998_attach_partition_fails_structure_differs_due_to_materialized_column.sql => 02998_attach_partition_not_allowed_if_structure_differs_due_to_materialized_column.sql} (100%) diff --git a/tests/queries/0_stateless/02998_attach_partition_fails_structure_differs_due_to_materialized_column.reference b/tests/queries/0_stateless/02998_attach_partition_not_allowed_if_structure_differs_due_to_materialized_column.reference similarity index 100% rename from tests/queries/0_stateless/02998_attach_partition_fails_structure_differs_due_to_materialized_column.reference rename to tests/queries/0_stateless/02998_attach_partition_not_allowed_if_structure_differs_due_to_materialized_column.reference diff --git a/tests/queries/0_stateless/02998_attach_partition_fails_structure_differs_due_to_materialized_column.sql b/tests/queries/0_stateless/02998_attach_partition_not_allowed_if_structure_differs_due_to_materialized_column.sql similarity index 100% rename from tests/queries/0_stateless/02998_attach_partition_fails_structure_differs_due_to_materialized_column.sql rename to tests/queries/0_stateless/02998_attach_partition_not_allowed_if_structure_differs_due_to_materialized_column.sql From cd520b281d229b5f7ed00be989c353c8b308dfdb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Wed, 28 Feb 2024 16:57:58 +0100 Subject: [PATCH 10/19] Update 02888_attach_partition_from_different_tables.sql --- ...attach_partition_from_different_tables.sql | 22 ------------------- 1 file changed, 22 deletions(-) diff --git a/tests/queries/0_stateless/02888_attach_partition_from_different_tables.sql b/tests/queries/0_stateless/02888_attach_partition_from_different_tables.sql index 5dbe45a0831..98f841394e1 100644 --- a/tests/queries/0_stateless/02888_attach_partition_from_different_tables.sql +++ b/tests/queries/0_stateless/02888_attach_partition_from_different_tables.sql @@ -88,25 +88,3 @@ ALTER TABLE attach_partition_t6 ATTACH PARTITION tuple() FROM attach_partition_t SELECT * FROM attach_partition_t6 WHERE b = '1'; SELECT b, sum(a) FROM attach_partition_t6 GROUP BY b ORDER BY b; - -CREATE TABLE attach_partition_t7 ( - a UInt32, - b UInt32 -) - ENGINE = MergeTree -PARTITION BY a ORDER BY a; - -ALTER TABLE attach_partition_t7 - ADD COLUMN mat_column - UInt32 
MATERIALIZED a+b; - -insert into attach_partition_t7 values (1, 2); - -CREATE TABLE attach_partition_t8 ( - a UInt32, - b UInt32 -) - ENGINE = MergeTree -PARTITION BY a ORDER BY a; - -ALTER TABLE attach_partition_t8 ATTACH PARTITION ID '1' FROM attach_partition_t7; -- {serverError 122}; From 5f88a455ff3ac582b81de02d50eef835c19d434b Mon Sep 17 00:00:00 2001 From: Max Kainov Date: Tue, 27 Feb 2024 15:42:02 +0000 Subject: [PATCH 11/19] CI: remove input params for job scripts #job_style_check #job_Compatibility_check_amd64 #job_Compatibility_check_aarch64 --- .github/workflows/backport_branches.yml | 4 ---- .github/workflows/master.yml | 4 ---- .github/workflows/pull_request.yml | 4 ---- .github/workflows/release_branches.yml | 4 ---- tests/ci/ci_config.py | 8 +++++-- tests/ci/compatibility_check.py | 28 ++++++++++++++----------- 6 files changed, 22 insertions(+), 30 deletions(-) diff --git a/.github/workflows/backport_branches.yml b/.github/workflows/backport_branches.yml index 51670087ffe..f98760f28a5 100644 --- a/.github/workflows/backport_branches.yml +++ b/.github/workflows/backport_branches.yml @@ -67,8 +67,6 @@ jobs: test_name: Compatibility check (amd64) runner_type: style-checker data: ${{ needs.RunConfig.outputs.data }} - run_command: | - python3 compatibility_check.py --check-name "Compatibility check (amd64)" --check-glibc --check-distributions CompatibilityCheckAarch64: needs: [RunConfig, BuilderDebAarch64] if: ${{ !failure() && !cancelled() }} @@ -77,8 +75,6 @@ jobs: test_name: Compatibility check (aarch64) runner_type: style-checker data: ${{ needs.RunConfig.outputs.data }} - run_command: | - python3 compatibility_check.py --check-name "Compatibility check (aarch64)" --check-glibc ######################################################################################### #################################### ORDINARY BUILDS #################################### ######################################################################################### diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml index a76d3fdcf8d..5492d901656 100644 --- a/.github/workflows/master.yml +++ b/.github/workflows/master.yml @@ -73,8 +73,6 @@ jobs: test_name: Compatibility check (amd64) runner_type: style-checker data: ${{ needs.RunConfig.outputs.data }} - run_command: | - python3 compatibility_check.py --check-name "Compatibility check (amd64)" --check-glibc --check-distributions CompatibilityCheckAarch64: needs: [RunConfig, BuilderDebAarch64] if: ${{ !failure() && !cancelled() }} @@ -83,8 +81,6 @@ jobs: test_name: Compatibility check (aarch64) runner_type: style-checker data: ${{ needs.RunConfig.outputs.data }} - run_command: | - python3 compatibility_check.py --check-name "Compatibility check (aarch64)" --check-glibc ######################################################################################### #################################### ORDINARY BUILDS #################################### ######################################################################################### diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index 2494a8564bb..cdc1052cc3a 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -117,8 +117,6 @@ jobs: test_name: Compatibility check (amd64) runner_type: style-checker data: ${{ needs.RunConfig.outputs.data }} - run_command: | - python3 compatibility_check.py --check-name "Compatibility check (amd64)" --check-glibc --check-distributions CompatibilityCheckAarch64: needs: 
[RunConfig, BuilderDebAarch64] if: ${{ !failure() && !cancelled() }} @@ -127,8 +125,6 @@ jobs: test_name: Compatibility check (aarch64) runner_type: style-checker data: ${{ needs.RunConfig.outputs.data }} - run_command: | - python3 compatibility_check.py --check-name "Compatibility check (aarch64)" --check-glibc ######################################################################################### #################################### ORDINARY BUILDS #################################### ######################################################################################### diff --git a/.github/workflows/release_branches.yml b/.github/workflows/release_branches.yml index 57e90d79ebd..b56efd57296 100644 --- a/.github/workflows/release_branches.yml +++ b/.github/workflows/release_branches.yml @@ -68,8 +68,6 @@ jobs: test_name: Compatibility check (amd64) runner_type: style-checker data: ${{ needs.RunConfig.outputs.data }} - run_command: | - python3 compatibility_check.py --check-name "Compatibility check (amd64)" --check-glibc --check-distributions CompatibilityCheckAarch64: needs: [RunConfig, BuilderDebAarch64] if: ${{ !failure() && !cancelled() }} @@ -78,8 +76,6 @@ jobs: test_name: Compatibility check (aarch64) runner_type: style-checker data: ${{ needs.RunConfig.outputs.data }} - run_command: | - python3 compatibility_check.py --check-name "Compatibility check (aarch64)" --check-glibc ######################################################################################### #################################### ORDINARY BUILDS #################################### ######################################################################################### diff --git a/tests/ci/ci_config.py b/tests/ci/ci_config.py index 80994f71280..78ed6a3644c 100644 --- a/tests/ci/ci_config.py +++ b/tests/ci/ci_config.py @@ -398,6 +398,10 @@ bugfix_validate_check = DigestConfig( ], ) # common test params +compatibility_test_common_params = { + "digest": compatibility_check_digest, + "run_command": "compatibility_check.py", +} statless_test_common_params = { "digest": stateless_check_digest, "run_command": 'functional_test_check.py "$CHECK_NAME" $KILL_TIMEOUT', @@ -1038,13 +1042,13 @@ CI_CONFIG = CIConfig( JobNames.COMPATIBILITY_TEST: TestConfig( Build.PACKAGE_RELEASE, job_config=JobConfig( - required_on_release_branch=True, digest=compatibility_check_digest + required_on_release_branch=True, **compatibility_test_common_params # type: ignore ), ), JobNames.COMPATIBILITY_TEST_ARM: TestConfig( Build.PACKAGE_AARCH64, job_config=JobConfig( - required_on_release_branch=True, digest=compatibility_check_digest + required_on_release_branch=True, **compatibility_test_common_params # type: ignore ), ), JobNames.UNIT_TEST: TestConfig( diff --git a/tests/ci/compatibility_check.py b/tests/ci/compatibility_check.py index a2e6c94cf48..63a71e1542b 100644 --- a/tests/ci/compatibility_check.py +++ b/tests/ci/compatibility_check.py @@ -2,6 +2,7 @@ import argparse import logging +import os import subprocess import sys from distutils.version import StrictVersion @@ -118,11 +119,7 @@ def get_run_commands_distributions( def parse_args(): parser = argparse.ArgumentParser("Check compatibility with old distributions") - parser.add_argument("--check-name", required=True) - parser.add_argument("--check-glibc", action="store_true") - parser.add_argument( - "--check-distributions", action="store_true" - ) # currently hardcoded to x86, don't enable for ARM + parser.add_argument("--check-name", required=False) return parser.parse_args() @@ 
-130,6 +127,13 @@ def main(): logging.basicConfig(level=logging.INFO) args = parse_args() + check_name = args.check_name or os.getenv("CHECK_NAME") + assert check_name + check_glibc = True + # currently hardcoded to x86, don't enable for ARM + check_distributions = ( + "aarch64" not in check_name.lower() and "arm64" not in check_name.lower() + ) stopwatch = Stopwatch() @@ -146,7 +150,7 @@ def main(): "clickhouse-common-static_" in url or "clickhouse-server_" in url ) - download_builds_filter(args.check_name, reports_path, packages_path, url_filter) + download_builds_filter(check_name, reports_path, packages_path, url_filter) for package in packages_path.iterdir(): if package.suffix == ".deb": @@ -162,11 +166,11 @@ def main(): run_commands = [] - if args.check_glibc: + if check_glibc: check_glibc_commands = get_run_commands_glibc(packages_path, result_path) run_commands.extend(check_glibc_commands) - if args.check_distributions: + if check_distributions: centos_image = pull_image(get_docker_image(IMAGE_CENTOS)) ubuntu_image = pull_image(get_docker_image(IMAGE_UBUNTU)) check_distributions_commands = get_run_commands_distributions( @@ -191,9 +195,9 @@ def main(): # See https://sourceware.org/glibc/wiki/Glibc%20Timeline max_glibc_version = "" - if "amd64" in args.check_name: + if "amd64" in check_name: max_glibc_version = "2.4" - elif "aarch64" in args.check_name: + elif "aarch64" in check_name: max_glibc_version = "2.18" # because of build with newer sysroot? else: raise Exception("Can't determine max glibc version") @@ -201,8 +205,8 @@ def main(): state, description, test_results, additional_logs = process_result( result_path, server_log_path, - args.check_glibc, - args.check_distributions, + check_glibc, + check_distributions, max_glibc_version, ) From 7b55c61551d4669a1962590399fe077898a1fec3 Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Thu, 29 Feb 2024 03:27:32 -0800 Subject: [PATCH 12/19] Userspace page cache (#53770) * Userspace page cache * Maybe it'll build this time, who knows. * 'auto' went out of fashion, I guess * Documentation, tsan workaround, metric 'UnreclaimableRss', disable page cache in the test that uses DatabaseOrdinary * Moved CachedInMemoryReadBufferFromFile to object store level, changed settings, addressed other comments. 
* Fix * Another fix * Fix restricted seek, fix ppc64le build * Don't allow page cache with file cache * Adjust tests a little * Fix clang-tidy * Conflicts * Comments * Maybe unbroke AsynchronousBoundedReadBuffer * SettingsChangesHistory.h * Fix warning in test --- docs/en/operations/storing-data.md | 10 + .../example-datasets/opensky.mdx | 12 +- programs/server/Server.cpp | 7 + src/Access/Common/AccessType.h | 1 + src/Common/PageCache.cpp | 688 ++++++++++++++++++ src/Common/PageCache.h | 299 ++++++++ src/Common/ProfileEvents.cpp | 9 + src/Core/Defines.h | 9 + src/Core/ServerSettings.h | 7 +- src/Core/Settings.h | 4 + src/Core/SettingsChangesHistory.h | 3 + .../IO/AsynchronousBoundedReadBuffer.cpp | 12 +- .../IO/CachedOnDiskReadBufferFromFile.cpp | 2 +- src/Disks/IO/ReadBufferFromRemoteFSGather.cpp | 67 +- src/Disks/IO/ReadBufferFromRemoteFSGather.h | 9 +- src/Disks/IO/ThreadPoolRemoteFSReader.cpp | 2 + src/Disks/IO/ThreadPoolRemoteFSReader.h | 3 + .../AzureBlobStorage/AzureObjectStorage.cpp | 8 +- .../ObjectStorages/DiskObjectStorage.cpp | 3 +- .../ObjectStorages/HDFS/HDFSObjectStorage.cpp | 4 +- .../Local/LocalObjectStorage.cpp | 6 +- .../ObjectStorages/S3/S3ObjectStorage.cpp | 9 +- .../ObjectStorages/Web/WebObjectStorage.cpp | 7 +- src/IO/AsynchronousReader.h | 3 + src/IO/BufferBase.h | 3 + src/IO/CachedInMemoryReadBufferFromFile.cpp | 188 +++++ src/IO/CachedInMemoryReadBufferFromFile.h | 41 ++ src/IO/ReadBuffer.h | 19 +- src/IO/ReadSettings.h | 7 + src/Interpreters/Context.cpp | 41 +- src/Interpreters/Context.h | 5 + src/Interpreters/InterpreterSystemQuery.cpp | 9 + .../ServerAsynchronousMetrics.cpp | 12 + src/Interpreters/tests/gtest_page_cache.cpp | 267 +++++++ src/Parsers/ASTSystemQuery.h | 1 + src/Storages/MergeTree/IMergeTreeDataPart.cpp | 4 - src/Storages/StorageS3.cpp | 18 +- tests/clickhouse-test | 2 + .../01271_show_privileges.reference | 1 + .../0_stateless/02867_page_cache.reference | 23 + .../queries/0_stateless/02867_page_cache.sql | 105 +++ 41 files changed, 1854 insertions(+), 76 deletions(-) create mode 100644 src/Common/PageCache.cpp create mode 100644 src/Common/PageCache.h create mode 100644 src/IO/CachedInMemoryReadBufferFromFile.cpp create mode 100644 src/IO/CachedInMemoryReadBufferFromFile.h create mode 100644 src/Interpreters/tests/gtest_page_cache.cpp create mode 100644 tests/queries/0_stateless/02867_page_cache.reference create mode 100644 tests/queries/0_stateless/02867_page_cache.sql diff --git a/docs/en/operations/storing-data.md b/docs/en/operations/storing-data.md index 003277c8d4f..84251812c01 100644 --- a/docs/en/operations/storing-data.md +++ b/docs/en/operations/storing-data.md @@ -275,6 +275,16 @@ Cache profile events: - `CachedWriteBufferCacheWriteBytes`, `CachedWriteBufferCacheWriteMicroseconds` +## Using in-memory cache (userspace page cache) {#userspace-page-cache} + +The File Cache described above stores cached data in local files. Alternatively, object-store-based disks can be configured to use "Userspace Page Cache", which is RAM-only. Userspace page cache is recommended only if file cache can't be used for some reason, e.g. if the machine doesn't have a local disk at all. Note that file cache effectively uses RAM for caching too, since the OS caches contents of local files. + +To enable userspace page cache for disks that don't use file cache, use setting `use_page_cache_for_disks_without_file_cache`. + +By default, on Linux, the userspace page cache will use all available memory, similar to the OS page cache. 
In tools like `top` and `ps`, the clickhouse server process will typically show resident set size near 100% of the machine's RAM - this is normal, and most of this memory is actually reclaimable by the OS on memory pressure (`MADV_FREE`). This behavior can be disabled with server setting `page_cache_use_madv_free = 0`, making the userspace page cache just use a fixed amount of memory `page_cache_size` with no special interaction with the OS. On Mac OS, `page_cache_use_madv_free` is always disabled as it doesn't have lazy `MADV_FREE`. + +Unfortunately, `page_cache_use_madv_free` makes it difficult to tell if the server is close to running out of memory, since the RSS metric becomes useless. Async metric `UnreclaimableRSS` shows the amount of physical memory used by the server, excluding the memory reclaimable by the OS: `select value from system.asynchronous_metrics where metric = 'UnreclaimableRSS'`. Use it for monitoring instead of RSS. This metric is only available if `page_cache_use_madv_free` is enabled. + ## Storing Data on Web Server {#storing-data-on-webserver} There is a tool `clickhouse-static-files-uploader`, which prepares a data directory for a given table (`SELECT data_paths FROM system.tables WHERE name = 'table_name'`). For each table you need, you get a directory of files. These files can be uploaded to, for example, a web server with static files. After this preparation, you can load this table into any ClickHouse server via `DiskWeb`. diff --git a/docs/zh/getting-started/example-datasets/opensky.mdx b/docs/zh/getting-started/example-datasets/opensky.mdx index 92cd104e06e..b79c02ab780 100644 --- a/docs/zh/getting-started/example-datasets/opensky.mdx +++ b/docs/zh/getting-started/example-datasets/opensky.mdx @@ -1,4 +1,4 @@ ---- +--- slug: /zh/getting-started/example-datasets/opensky sidebar_label: 空中交通数据 description: 该数据集中的数据是从完整的 OpenSky 数据集中衍生而来的,对其中的数据进行了必要的清理,用以展示在 COVID-19 期间空中交通的发展。 @@ -53,12 +53,12 @@ CREATE TABLE opensky ls -1 flightlist_*.csv.gz | xargs -P100 -I{} bash -c 'gzip -c -d "{}" | clickhouse-client --date_time_input_format best_effort --query "INSERT INTO opensky FORMAT CSVWithNames"' ``` -- 这里我们将文件列表(`ls -1 flightlist_*.csv.gz`)传递给`xargs`以进行并行处理。 `xargs -P100` 指定最多使用 100 个并行工作程序,但由于我们只有 30 个文件,工作程序的数量将只有 30 个。 -- 对于每个文件,`xargs` 将通过 `bash -c` 为每个文件运行一个脚本文件。该脚本通过使用 `{}` 表示文件名占位符,然后 `xargs` 由命令进行填充(使用 `-I{}`)。 -- 该脚本会将文件 (`gzip -c -d "{}"`) 解压缩到标准输出(`-c` 参数),并将输出重定向到 `clickhouse-client`。 -- 我们还要求使用扩展解析器解析 [DateTime](../../sql-reference/data-types/datetime.md) 字段 ([--date_time_input_format best_effort](../../operations/settings/ settings.md#settings-date_time_input_format)) 以识别具有时区偏移的 ISO-8601 格式。 +- 这里我们将文件列表(`ls -1 flightlist_*.csv.gz`)传递给`xargs`以进行并行处理。 `xargs -P100` 指定最多使用 100 个并行工作程序,但由于我们只有 30 个文件,工作程序的数量将只有 30 个。 +- 对于每个文件,`xargs` 将通过 `bash -c` 为每个文件运行一个脚本文件。该脚本通过使用 `{}` 表示文件名占位符,然后 `xargs` 由命令进行填充(使用 `-I{}`)。 +- 该脚本会将文件 (`gzip -c -d "{}"`) 解压缩到标准输出(`-c` 参数),并将输出重定向到 `clickhouse-client`。 +- 我们还要求使用扩展解析器解析 [DateTime](/docs/zh/sql-reference/data-types/datetime.md) 字段 ([--date_time_input_format best_effort](/docs/zh/operations/settings/settings.md#settings-date_time_input_format)) 以识别具有时区偏移的 ISO-8601 格式。 -最后,`clickhouse-client` 会以 [CSVWithNames](../../interfaces/formats.md#csvwithnames) 格式读取输入数据然后执行插入。 +最后,`clickhouse-client` 会以 [CSVWithNames](/docs/zh/interfaces/formats.md#csvwithnames) 格式读取输入数据然后执行插入。 并行导入需要 24 秒。 diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index 6dc33042a05..786cb27d8c4 100644 --- a/programs/server/Server.cpp 
+++ b/programs/server/Server.cpp @@ -1228,6 +1228,13 @@ try } global_context->setMarkCache(mark_cache_policy, mark_cache_size, mark_cache_size_ratio); + size_t page_cache_size = server_settings.page_cache_size; + if (page_cache_size != 0) + global_context->setPageCache( + server_settings.page_cache_chunk_size, server_settings.page_cache_mmap_size, + page_cache_size, server_settings.page_cache_use_madv_free, + server_settings.page_cache_use_transparent_huge_pages); + String index_uncompressed_cache_policy = server_settings.index_uncompressed_cache_policy; size_t index_uncompressed_cache_size = server_settings.index_uncompressed_cache_size; double index_uncompressed_cache_size_ratio = server_settings.index_uncompressed_cache_size_ratio; diff --git a/src/Access/Common/AccessType.h b/src/Access/Common/AccessType.h index 8172a468f89..de3eda96bac 100644 --- a/src/Access/Common/AccessType.h +++ b/src/Access/Common/AccessType.h @@ -162,6 +162,7 @@ enum class AccessType M(SYSTEM_DROP_COMPILED_EXPRESSION_CACHE, "SYSTEM DROP COMPILED EXPRESSION, DROP COMPILED EXPRESSION CACHE, DROP COMPILED EXPRESSIONS", GLOBAL, SYSTEM_DROP_CACHE) \ M(SYSTEM_DROP_FILESYSTEM_CACHE, "SYSTEM DROP FILESYSTEM CACHE, DROP FILESYSTEM CACHE", GLOBAL, SYSTEM_DROP_CACHE) \ M(SYSTEM_SYNC_FILESYSTEM_CACHE, "SYSTEM REPAIR FILESYSTEM CACHE, REPAIR FILESYSTEM CACHE, SYNC FILESYSTEM CACHE", GLOBAL, SYSTEM) \ + M(SYSTEM_DROP_PAGE_CACHE, "SYSTEM DROP PAGE CACHE, DROP PAGE CACHE", GLOBAL, SYSTEM_DROP_CACHE) \ M(SYSTEM_DROP_SCHEMA_CACHE, "SYSTEM DROP SCHEMA CACHE, DROP SCHEMA CACHE", GLOBAL, SYSTEM_DROP_CACHE) \ M(SYSTEM_DROP_FORMAT_SCHEMA_CACHE, "SYSTEM DROP FORMAT SCHEMA CACHE, DROP FORMAT SCHEMA CACHE", GLOBAL, SYSTEM_DROP_CACHE) \ M(SYSTEM_DROP_S3_CLIENT_CACHE, "SYSTEM DROP S3 CLIENT, DROP S3 CLIENT CACHE", GLOBAL, SYSTEM_DROP_CACHE) \ diff --git a/src/Common/PageCache.cpp b/src/Common/PageCache.cpp new file mode 100644 index 00000000000..511ec23d431 --- /dev/null +++ b/src/Common/PageCache.cpp @@ -0,0 +1,688 @@ +#include "PageCache.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace ProfileEvents +{ + extern const Event PageCacheChunkMisses; + extern const Event PageCacheChunkShared; + extern const Event PageCacheChunkDataHits; + extern const Event PageCacheChunkDataPartialHits; + extern const Event PageCacheChunkDataMisses; + extern const Event PageCacheBytesUnpinnedRoundedToPages; + extern const Event PageCacheBytesUnpinnedRoundedToHugePages; +} + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int SYSTEM_ERROR; + extern const int MEMORY_LIMIT_EXCEEDED; + extern const int CANNOT_ALLOCATE_MEMORY; + extern const int INVALID_SETTING_VALUE; + extern const int FILE_DOESNT_EXIST; +} + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wunknown-warning-option" +#pragma clang diagnostic ignored "-Wreadability-make-member-function-const" + +PinnedPageChunk::PinnedPageChunk(PinnedPageChunk && c) noexcept + : cache(std::exchange(c.cache, nullptr)), chunk(std::exchange(c.chunk, nullptr)) {} + +PinnedPageChunk & PinnedPageChunk::operator=(PinnedPageChunk && c) noexcept +{ + if (cache) + cache->removeRef(chunk); + cache = std::exchange(c.cache, nullptr); + chunk = std::exchange(c.chunk, nullptr); + return *this; +} + +PinnedPageChunk::~PinnedPageChunk() noexcept +{ + if (cache) + cache->removeRef(chunk); +} + +PinnedPageChunk::PinnedPageChunk(PageCache * cache_, PageChunk * chunk_) noexcept : cache(cache_), chunk(chunk_) {} + +const PageChunk 
* PinnedPageChunk::getChunk() const { return chunk; } + +bool PinnedPageChunk::markPagePopulated(size_t page_idx) +{ + bool r = chunk->pages_populated.set(page_idx); + return r; +} + +void PinnedPageChunk::markPrefixPopulated(size_t bytes) +{ + for (size_t i = 0; i < (bytes + chunk->page_size - 1) / chunk->page_size; ++i) + markPagePopulated(i); +} + +bool PinnedPageChunk::isPrefixPopulated(size_t bytes) const +{ + for (size_t i = 0; i < (bytes + chunk->page_size - 1) / chunk->page_size; ++i) + if (!chunk->pages_populated.get(i)) + return false; + return true; +} + +AtomicBitSet::AtomicBitSet() = default; + +void AtomicBitSet::init(size_t nn) +{ + n = nn; + v = std::make_unique[]>((n + 7) / 8); +} + +bool AtomicBitSet::get(size_t i) const +{ + return (v[i / 8] & (1 << (i % 8))) != 0; +} + +bool AtomicBitSet::any() const +{ + for (size_t i = 0; i < (n + 7) / 8; ++i) + if (v[i]) + return true; + return false; +} + +bool AtomicBitSet::set(size_t i) const +{ + UInt8 prev = v[i / 8].fetch_or(1 << (i % 8)); + return (prev & (1 << (i % 8))) == 0; +} + +bool AtomicBitSet::set(size_t i, bool val) const +{ + if (val) + return set(i); + else + return unset(i); +} + +bool AtomicBitSet::unset(size_t i) const +{ + UInt8 prev = v[i / 8].fetch_and(~(1 << (i % 8))); + return (prev & (1 << (i % 8))) != 0; +} + +void AtomicBitSet::unsetAll() const +{ + for (size_t i = 0; i < (n + 7) / 8; ++i) + v[i].store(0, std::memory_order_relaxed); +} + +PageCache::PageCache(size_t bytes_per_chunk, size_t bytes_per_mmap, size_t bytes_total, bool use_madv_free_, bool use_huge_pages_) + : bytes_per_page(getPageSize()) + , use_madv_free(use_madv_free_) + , use_huge_pages(use_huge_pages_) + , rng(randomSeed()) +{ + if (bytes_per_chunk == 0 || bytes_per_mmap == 0) + throw Exception(ErrorCodes::INVALID_SETTING_VALUE, "Userspace page cache chunk size and mmap size can't be zero."); + + if (use_huge_pages) + { + use_huge_pages = false; + bool print_warning = false; +#ifdef OS_LINUX + try + { + ReadBufferFromFile in("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size"); + size_t huge_page_size; + readIntText(huge_page_size, in); + + if (huge_page_size == 0 || huge_page_size % bytes_per_page != 0) + throw Exception(ErrorCodes::SYSTEM_ERROR, "Invalid huge page size reported by the OS: {}", huge_page_size); + + /// THP can be configured to be 2 MiB or 1 GiB in size. 1 GiB is way too big for us. + if (huge_page_size <= (16 << 20)) + { + pages_per_big_page = huge_page_size / bytes_per_page; + use_huge_pages = true; + } + else + { + LOG_WARNING(&Poco::Logger::get("PageCache"), "The OS huge page size is too large for our purposes: {} KiB. Using regular pages. Userspace page cache will be relatively slow.", huge_page_size); + } + } + catch (Exception & e) + { + if (e.code() != ErrorCodes::FILE_DOESNT_EXIST) + throw; + print_warning = true; + } +#else + print_warning = true; +#endif + if (print_warning) + LOG_WARNING(&Poco::Logger::get("PageCache"), "The OS doesn't support transparent huge pages. 
Userspace page cache will be relatively slow."); + } + + pages_per_chunk = ((bytes_per_chunk - 1) / (bytes_per_page * pages_per_big_page) + 1) * pages_per_big_page; + chunks_per_mmap_target = (bytes_per_mmap - 1) / (bytes_per_page * pages_per_chunk) + 1; + max_mmaps = (bytes_total - 1) / (bytes_per_page * pages_per_chunk * chunks_per_mmap_target) + 1; +} + +PageCache::~PageCache() +{ + chassert(getPinnedSize() == 0); +} + +size_t PageCache::pageSize() const { return bytes_per_page; } +size_t PageCache::chunkSize() const { return bytes_per_page * pages_per_chunk; } +size_t PageCache::maxChunks() const { return chunks_per_mmap_target * max_mmaps; } + +size_t PageCache::getPinnedSize() const +{ + std::unique_lock lock(global_mutex); + return (total_chunks - lru.size()) * bytes_per_page * pages_per_chunk; +} + +PageCache::MemoryStats PageCache::getResidentSetSize() const +{ + MemoryStats stats; +#ifdef OS_LINUX + if (use_madv_free) + { + std::unordered_set<UInt64> cache_mmap_addrs; + for (const auto & m : mmaps) + cache_mmap_addrs.insert(reinterpret_cast<UInt64>(m.ptr)); + + ReadBufferFromFile in("/proc/self/smaps"); + + /// Parse the smaps contents, which is text consisting of entries like this: + /// + /// 117ba4a00000-117be4a00000 rw-p 00000000 00:00 0 + /// Size: 1048576 kB + /// KernelPageSize: 4 kB + /// MMUPageSize: 4 kB + /// Rss: 539516 kB + /// Pss: 539516 kB + /// ... + + auto read_token = [&] + { + String res; + while (!in.eof()) + { + char c = *in.position(); + if (c == '\n' || c == '\t' || c == ' ' || c == '-') + break; + res += c; + ++in.position(); + } + return res; + }; + + auto skip_whitespace = [&] + { + while (!in.eof()) + { + char c = *in.position(); + if (c != ' ' && c != '\t') + break; + ++in.position(); + } + }; + + bool current_range_is_cache = false; + size_t total_rss = 0; + size_t total_lazy_free = 0; + while (!in.eof()) + { + String s = read_token(); + if (!in.eof() && *in.position() == '-') + { + if (s.size() < 16) + s.insert(0, 16 - s.size(), '0'); + UInt64 addr = unhexUInt<UInt64>(s.c_str()); + current_range_is_cache = cache_mmap_addrs.contains(addr); + } + else if (s == "Rss:" || s == "LazyFree:") + { + skip_whitespace(); + size_t val; + readIntText(val, in); + skip_whitespace(); + String unit = read_token(); + if (unit != "kB") + throw Exception(ErrorCodes::SYSTEM_ERROR, "Unexpected units in /proc/self/smaps: {}", unit); + size_t bytes = val * 1024; + + if (s == "Rss:") + { + total_rss += bytes; + if (current_range_is_cache) + stats.page_cache_rss += bytes; + } + else + total_lazy_free += bytes; + } + skipToNextLineOrEOF(in); + } + stats.unreclaimable_rss = total_rss - std::min(total_lazy_free, total_rss); + + return stats; + } +#endif + + stats.page_cache_rss = bytes_per_page * pages_per_chunk * total_chunks; + return stats; +} + +PinnedPageChunk PageCache::getOrSet(PageCacheKey key, bool detached_if_missing, bool inject_eviction) +{ + PageChunk * chunk; + /// Make sure we increment exactly one of the counters about the fate of a chunk lookup.
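
As a sanity check on the geometry computed in the constructor above, here is a standalone sketch (not part of the patch; the default values are the ones this PR adds to ServerSettings.h) that evaluates the same ceiling-division arithmetic for 4 KiB pages and 2 MiB huge pages:

```cpp
#include <cstddef>
#include <cstdio>

int main()
{
    size_t bytes_per_page = 4096;           // typical OS page
    size_t pages_per_big_page = 512;        // 2 MiB THP / 4 KiB page
    size_t bytes_per_chunk = 2ull << 20;    // page_cache_chunk_size default
    size_t bytes_per_mmap = 1ull << 30;     // page_cache_mmap_size default
    size_t bytes_total = 10ull << 30;       // page_cache_size default

    /// Round the chunk up to a whole number of big (huge) pages.
    size_t pages_per_chunk = ((bytes_per_chunk - 1) / (bytes_per_page * pages_per_big_page) + 1) * pages_per_big_page;
    /// Round the mmap up to a whole number of chunks, and the total up to whole mmaps.
    size_t chunks_per_mmap = (bytes_per_mmap - 1) / (bytes_per_page * pages_per_chunk) + 1;
    size_t max_mmaps = (bytes_total - 1) / (bytes_per_page * pages_per_chunk * chunks_per_mmap) + 1;

    printf("%zu pages/chunk, %zu chunks/mmap, %zu mmaps\n", pages_per_chunk, chunks_per_mmap, max_mmaps);
    /// Prints: 512 pages/chunk, 512 chunks/mmap, 10 mmaps
}
```
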
+ bool incremented_profile_events = false; + + { + std::unique_lock lock(global_mutex); + + auto * it = chunk_by_key.find(key); + if (it == chunk_by_key.end()) + { + chunk = getFreeChunk(lock); + chassert(!chunk->key.has_value()); + + if (!detached_if_missing) + { + chunk->key = key; + chunk_by_key.insert({key, chunk}); + } + + ProfileEvents::increment(ProfileEvents::PageCacheChunkMisses); + incremented_profile_events = true; + } + else + { + chunk = it->getMapped(); + size_t prev_pin_count = chunk->pin_count.fetch_add(1); + + if (prev_pin_count == 0) + { + /// Not eligible for LRU eviction while pinned. + chassert(chunk->is_linked()); + lru.erase(lru.iterator_to(*chunk)); + + if (detached_if_missing) + { + /// Peek the first page to see if it's evicted. + /// (Why not use the full probing procedure instead, restoreChunkFromLimbo()? + /// Right here we can't do it because of how the two mutexes are organized. + /// And we want to do the check+detach before unlocking global_mutex, because + /// otherwise we may detach a chunk pinned by someone else, which may be unexpected + /// for that someone else. Or maybe the latter is fine, dropCache() already does it.) + if (chunk->pages_populated.get(0) && reinterpret_cast*>(chunk->data)->load(std::memory_order_relaxed) == 0) + evictChunk(chunk, lock); + } + + if (inject_eviction && chunk->key.has_value() && rng() % 10 == 0) + { + /// Simulate eviction of the chunk or some of its pages. + if (rng() % 2 == 0) + evictChunk(chunk, lock); + else + for (size_t i = 0; i < 20; ++i) + chunk->pages_populated.unset(rng() % (chunk->size / chunk->page_size)); + } + } + else + { + ProfileEvents::increment(ProfileEvents::PageCacheChunkShared); + incremented_profile_events = true; + } + } + } + + { + std::unique_lock chunk_lock(chunk->chunk_mutex); + + if (chunk->pages_state == PageChunkState::Limbo) + { + auto [pages_restored, pages_evicted] = restoreChunkFromLimbo(chunk, chunk_lock); + chunk->pages_state = PageChunkState::Stable; + + if (!incremented_profile_events) + { + if (pages_evicted == 0) + ProfileEvents::increment(ProfileEvents::PageCacheChunkDataHits); + else if (pages_evicted < pages_restored) + ProfileEvents::increment(ProfileEvents::PageCacheChunkDataPartialHits); + else + ProfileEvents::increment(ProfileEvents::PageCacheChunkDataMisses); + } + } + } + + return PinnedPageChunk(this, chunk); +} + +void PageCache::removeRef(PageChunk * chunk) noexcept +{ + /// Fast path if this is not the last reference. + size_t prev_pin_count = chunk->pin_count.load(); + if (prev_pin_count > 1 && chunk->pin_count.compare_exchange_strong(prev_pin_count, prev_pin_count - 1)) + return; + + { + std::unique_lock lock(global_mutex); + + prev_pin_count = chunk->pin_count.fetch_sub(1); + if (prev_pin_count > 1) + return; + + chassert(!chunk->is_linked()); + if (chunk->key.has_value()) + lru.push_back(*chunk); + else + /// Unpinning detached chunk. We'd rather reuse it soon, so put it at the front. + lru.push_front(*chunk); + } + + { + std::unique_lock chunk_lock(chunk->chunk_mutex); + + /// Need to be extra careful here because we unlocked global_mutex above, so other + /// getOrSet()/removeRef() calls could have happened during this brief period. 
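
The careful handling of `global_mutex` above is the heart of `removeRef()`: decrementing the pin count takes a lock-free fast path, and only the last unpin locks the mutex to relink the chunk into the LRU. A minimal sketch of that pattern, with hypothetical names:

```cpp
#include <atomic>
#include <cstddef>
#include <mutex>

struct Entry
{
    std::atomic<size_t> pin_count{0};
};

std::mutex global_mutex;

void unpin(Entry & e)
{
    /// Fast path: if we are not the last pinner, a single CAS avoids the global lock.
    size_t prev = e.pin_count.load();
    if (prev > 1 && e.pin_count.compare_exchange_strong(prev, prev - 1))
        return;

    /// Slow path: the last reference must relink the entry into the LRU under the lock.
    std::lock_guard lock(global_mutex);
    prev = e.pin_count.fetch_sub(1);
    if (prev == 1)
    {
        /* push the entry onto the LRU list here */
    }
}
```
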
+ if (use_madv_free && chunk->pages_state == PageChunkState::Stable && chunk->pin_count.load() == 0) + { + sendChunkToLimbo(chunk, chunk_lock); + chunk->pages_state = PageChunkState::Limbo; + } + } +} + +static void logUnexpectedSyscallError(std::string name) +{ + std::string message = fmt::format("{} failed: {}", name, errnoToString()); + LOG_WARNING(&Poco::Logger::get("PageCache"), "{}", message); +#if defined(ABORT_ON_LOGICAL_ERROR) + volatile bool true_ = true; + if (true_) // suppress warning about missing [[noreturn]] + abortOnFailedAssertion(message); +#endif +} + +void PageCache::sendChunkToLimbo(PageChunk * chunk [[maybe_unused]], std::unique_lock & /* chunk_mutex */) const noexcept +{ +#ifdef MADV_FREE // if we're not on a very old version of Linux + chassert(chunk->size == bytes_per_page * pages_per_chunk); + size_t populated_pages = 0; + size_t populated_big_pages = 0; + for (size_t big_page_idx = 0; big_page_idx < pages_per_chunk / pages_per_big_page; ++big_page_idx) + { + bool big_page_populated = false; + for (size_t sub_idx = 0; sub_idx < pages_per_big_page; ++sub_idx) + { + size_t idx = big_page_idx * pages_per_big_page + sub_idx; + if (!chunk->pages_populated.get(idx)) + continue; + big_page_populated = true; + populated_pages += 1; + + auto & byte = reinterpret_cast &>(chunk->data[idx * bytes_per_page]); + chunk->first_bit_of_each_page.set(idx, (byte.load(std::memory_order_relaxed) & 1) != 0); + byte.fetch_or(1, std::memory_order_relaxed); + } + if (big_page_populated) + populated_big_pages += 1; + } + int r = madvise(chunk->data, chunk->size, MADV_FREE); + if (r != 0) + logUnexpectedSyscallError("madvise(MADV_FREE)"); + + ProfileEvents::increment(ProfileEvents::PageCacheBytesUnpinnedRoundedToPages, bytes_per_page * populated_pages); + ProfileEvents::increment(ProfileEvents::PageCacheBytesUnpinnedRoundedToHugePages, bytes_per_page * pages_per_big_page * populated_big_pages); +#endif +} + +std::pair PageCache::restoreChunkFromLimbo(PageChunk * chunk, std::unique_lock & /* chunk_mutex */) const noexcept +{ + static_assert(sizeof(std::atomic) == 1, "char is not atomic?"); + // Make sure our strategic memory reads/writes are not reordered or optimized out. + auto * data = reinterpret_cast *>(chunk->data); + size_t pages_restored = 0; + size_t pages_evicted = 0; + for (size_t idx = 0; idx < chunk->size / bytes_per_page; ++idx) + { + if (!chunk->pages_populated.get(idx)) + continue; + + /// After MADV_FREE, it's guaranteed that: + /// * writing to the page makes it non-freeable again (reading doesn't), + /// * after the write, the page contents are either fully intact or fully zero-filled, + /// * even before the write, reads return either intact data (if the page wasn't freed) or zeroes (if it was, and the read page-faulted). + /// (And when doing the write there's no way to tell whether it page-faulted or not, AFAICT; that would make our life much easier!) + /// + /// With that in mind, we do the following dance to bring the page back from the MADV_FREE limbo: + /// 0. [in advance] Before doing MADV_FREE, make sure the page's first byte is not zero. + /// We do it by setting the lowest bit of the first byte to 1, after saving the original value of that bit into a bitset. + /// 1. Read the second byte. + /// 2. Write the second byte back. This makes the page non-freeable. + /// 3. Read the first byte. + /// 3a. If it's zero, the page was freed. + /// Set the second byte to 0, to keep the buffer zero-filled if the page was freed + /// between steps 1 and 2. + /// 3b. 
If it's nonzero, the page is intact. + /// Restore the lowest bit of the first byte to the saved original value from the bitset. + + char second_byte = data[idx * bytes_per_page + 1].load(std::memory_order_relaxed); + data[idx * bytes_per_page + 1].store(second_byte, std::memory_order_relaxed); + + char first_byte = data[idx * bytes_per_page].load(std::memory_order_relaxed); + if (first_byte == 0) + { + pages_evicted += 1; + data[idx * bytes_per_page + 1].store(0, std::memory_order_relaxed); + chunk->pages_populated.unset(idx); + } + else + { + pages_restored += 1; + chassert(first_byte & 1); + if (!chunk->first_bit_of_each_page.get(idx)) + data[idx * bytes_per_page].fetch_and(~1, std::memory_order_relaxed); + } + } + return {pages_restored, pages_evicted}; +} + +PageChunk * PageCache::getFreeChunk(std::unique_lock & lock /* global_mutex */) +{ + if (lru.empty() || (mmaps.size() < max_mmaps && lru.front().key.has_value())) + addMmap(lock); + if (lru.empty()) + throw Exception(ErrorCodes::MEMORY_LIMIT_EXCEEDED, "All chunks in the entire page cache ({:.3} GiB) are pinned.", + bytes_per_page * pages_per_chunk * total_chunks * 1. / (1l << 30)); + + PageChunk * chunk = &lru.front(); + lru.erase(lru.iterator_to(*chunk)); + + size_t prev_pin_count = chunk->pin_count.fetch_add(1); + chassert(prev_pin_count == 0); + + evictChunk(chunk, lock); + + return chunk; +} + +void PageCache::evictChunk(PageChunk * chunk, std::unique_lock & /* global_mutex */) +{ + if (chunk->key.has_value()) + { + size_t erased = chunk_by_key.erase(chunk->key.value()); + chassert(erased); + chunk->key.reset(); + } + + chunk->state.reset(); + + /// This is tricky. We're not holding the chunk_mutex, so another thread might be running + /// sendChunkToLimbo() or even restoreChunkFromLimbo() on this chunk right now. + /// + /// Nevertheless, it's correct and sufficient to clear pages_populated here because sendChunkToLimbo() + /// and restoreChunkFromLimbo() only touch pages_populated (only unsetting the bits), + /// first_bit_of_each_page, and the data; and we don't care about first_bit_of_each_page and the data. + /// + /// This is precarious, but I don't have better ideas. Note that this clearing (or something else) + /// must be done before unlocking the global_mutex because otherwise another call to getOrSet() might + /// return this chunk before we clear it. + chunk->pages_populated.unsetAll(); +} + +void PageCache::addMmap(std::unique_lock & /* global_mutex */) +{ + /// ASLR by hand. + void * address_hint = reinterpret_cast(std::uniform_int_distribution(0x100000000000UL, 0x700000000000UL)(rng)); + + mmaps.emplace_back(bytes_per_page, pages_per_chunk, pages_per_big_page, chunks_per_mmap_target, address_hint, use_huge_pages); + + size_t num_chunks = mmaps.back().num_chunks; + total_chunks += num_chunks; + for (size_t i = 0; i < num_chunks; ++i) + /// Link in reverse order, so they get assigned in increasing order. Not important, just seems nice. + lru.push_front(mmaps.back().chunks[num_chunks - 1 - i]); +} + +void PageCache::dropCache() +{ + std::unique_lock lock(global_mutex); + + /// Detach and free unpinned chunks. + bool logged_error = false; + for (PageChunk & chunk : lru) + { + evictChunk(&chunk, lock); + + if (use_madv_free) + { + /// This might happen in parallel with sendChunkToLimbo() or restoreChunkFromLimbo(), but it's ok. 
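
The probing steps spelled out above are easy to lose in the surrounding code, so here is the per-page probe in isolation. A hedged sketch: `page` is assumed to point at the first byte of one OS page whose first byte's lowest bit was forced to 1 before `MADV_FREE`, with the original bit value passed as `saved_first_bit`:

```cpp
#include <atomic>

/// Returns true if the page survived the MADV_FREE limbo, false if the OS reclaimed it.
bool probePage(std::atomic<char> * page, bool saved_first_bit)
{
    char second = page[1].load(std::memory_order_relaxed);
    page[1].store(second, std::memory_order_relaxed);   /// the write makes the page non-freeable

    char first = page[0].load(std::memory_order_relaxed);
    if (first == 0)
    {
        page[1].store(0, std::memory_order_relaxed);    /// keep the buffer fully zero-filled
        return false;                                   /// page was reclaimed
    }
    if (!saved_first_bit)
        page[0].fetch_and(~1, std::memory_order_relaxed); /// restore the original low bit
    return true;                                        /// page intact
}
```
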
+ int r = madvise(chunk.data, chunk.size, MADV_DONTNEED); + if (r != 0 && !logged_error) + { + logUnexpectedSyscallError("madvise(MADV_DONTNEED)"); + logged_error = true; + } + } + } + + /// Detach pinned chunks. + for (auto [key, chunk] : chunk_by_key) + { + chassert(chunk->key == key); + chassert(chunk->pin_count > 0); // otherwise it would have been evicted above + chunk->key.reset(); + } + chunk_by_key.clear(); +} + +PageCache::Mmap::Mmap(size_t bytes_per_page_, size_t pages_per_chunk_, size_t pages_per_big_page_, size_t num_chunks_, void * address_hint, bool use_huge_pages_) +{ + num_chunks = num_chunks_; + size = bytes_per_page_ * pages_per_chunk_ * num_chunks; + + size_t alignment = bytes_per_page_ * pages_per_big_page_; + address_hint = reinterpret_cast(reinterpret_cast(address_hint) / alignment * alignment); + + auto temp_chunks = std::make_unique(num_chunks); + + int flags = MAP_PRIVATE | MAP_ANONYMOUS; +#ifdef OS_LINUX + flags |= MAP_NORESERVE; +#endif + ptr = mmap(address_hint, size, PROT_READ | PROT_WRITE, flags, -1, 0); + if (MAP_FAILED == ptr) + throw ErrnoException(ErrorCodes::CANNOT_ALLOCATE_MEMORY, fmt::format("Cannot mmap {}.", ReadableSize(size))); + if (reinterpret_cast(ptr) % bytes_per_page_ != 0) + { + munmap(ptr, size); + throw Exception(ErrorCodes::SYSTEM_ERROR, "mmap returned unaligned address: {}", ptr); + } + + void * chunks_start = ptr; + +#ifdef OS_LINUX + if (madvise(ptr, size, MADV_DONTDUMP) != 0) + logUnexpectedSyscallError("madvise(MADV_DONTDUMP)"); + if (madvise(ptr, size, MADV_DONTFORK) != 0) + logUnexpectedSyscallError("madvise(MADV_DONTFORK)"); + + if (use_huge_pages_) + { + if (reinterpret_cast(ptr) % alignment != 0) + { + LOG_DEBUG(&Poco::Logger::get("PageCache"), "mmap() returned address not aligned on huge page boundary."); + chunks_start = reinterpret_cast((reinterpret_cast(ptr) / alignment + 1) * alignment); + chassert(reinterpret_cast(chunks_start) % alignment == 0); + num_chunks -= 1; + } + + if (madvise(ptr, size, MADV_HUGEPAGE) != 0) + LOG_WARNING(&Poco::Logger::get("PageCache"), + "madvise(MADV_HUGEPAGE) failed: {}. Userspace page cache will be relatively slow.", errnoToString()); + } +#else + (void)use_huge_pages_; +#endif + + chunks = std::move(temp_chunks); + for (size_t i = 0; i < num_chunks; ++i) + { + PageChunk * chunk = &chunks[i]; + chunk->data = reinterpret_cast(chunks_start) + bytes_per_page_ * pages_per_chunk_ * i; + chunk->size = bytes_per_page_ * pages_per_chunk_; + chunk->page_size = bytes_per_page_; + chunk->big_page_size = bytes_per_page_ * pages_per_big_page_; + chunk->pages_populated.init(pages_per_chunk_); + chunk->first_bit_of_each_page.init(pages_per_chunk_); + } +} + +PageCache::Mmap::Mmap(Mmap && m) noexcept : ptr(std::exchange(m.ptr, nullptr)), size(std::exchange(m.size, 0)), chunks(std::move(m.chunks)), num_chunks(std::exchange(m.num_chunks, 0)) {} + +PageCache::Mmap::~Mmap() noexcept +{ + if (ptr && 0 != munmap(ptr, size)) + logUnexpectedSyscallError("munmap"); +} + +void FileChunkState::reset() {} + +PageCacheKey FileChunkAddress::hash() const +{ + SipHash hash(offset); + hash.update(path.data(), path.size()); + if (!file_version.empty()) + { + hash.update("\0", 1); + hash.update(file_version.data(), file_version.size()); + } + return hash.get128(); +} + +std::string FileChunkAddress::toString() const +{ + return fmt::format("{}:{}{}{}", path, offset, file_version.empty() ? 
"" : ":", file_version); +} + +#pragma clang diagnostic pop + +} diff --git a/src/Common/PageCache.h b/src/Common/PageCache.h new file mode 100644 index 00000000000..7ff376baa6b --- /dev/null +++ b/src/Common/PageCache.h @@ -0,0 +1,299 @@ +#pragma once + +#include +#include +#include +#include +#include + +/// "Userspace page cache" +/// A cache for contents of remote files. +/// Uses MADV_FREE to allow Linux to evict pages from our cache under memory pressure. +/// Typically takes up almost all of the available memory, similar to the actual page cache. +/// +/// Intended for caching data retrieved from distributed cache, but can be used for other things too, +/// just replace FileChunkState with a discriminated union, or something, if needed. +/// +/// There are two fixed-size units of caching here: +/// * OS pages, typically 4 KiB each. +/// * Page chunks, 2 MiB each (configurable with page_cache_block_size setting). +/// +/// Each file is logically split into aligned 2 MiB blocks, which are mapped to page chunks inside the cache. +/// They are cached independently from each other. +/// +/// Each page chunk has a contiguous 2 MiB buffer that can be pinned and directly used e.g. by ReadBuffers. +/// While pinned (by at least one PinnedPageChunk), the pages are not reclaimable by the OS. +/// +/// Inside each page chunk, any subset of pages may be populated. Unpopulated pages may or not be +/// mapped to any physical RAM. We maintain a bitmask that keeps track of which pages are populated. +/// Pages become unpopulated if they're reclaimed by the OS (when the page chunk is not pinned), +/// or if we just never populate them in the first place (e.g. if a file is shorter than 2 MiB we +/// still create a 2 MiB page chunk, but use only a prefix of it). +/// +/// There are two separate eviction mechanisms at play: +/// * LRU eviction of page chunks in PageCache. +/// * OS reclaiming pages on memory pressure. We have no control over the eviction policy. +/// It probably picks the pages in the same order in which they were marked with MADV_FREE, so +/// effectively in the same LRU order as our policy in PageCache. +/// When using PageCache in oversubscribed fashion, using all available memory and relying on OS eviction, +/// the PageCache's eviction policy mostly doesn't matter. It just needs to be similar enough to the OS's +/// policy that we rarely evict chunks with unevicted pages. +/// +/// We mmap memory directly instead of using allocator because this enables: +/// * knowing how much RAM the cache is using, via /proc/self/smaps, +/// * MADV_HUGEPAGE (use transparent huge pages - this makes MADV_FREE 10x less slow), +/// * MAP_NORESERVE (don't reserve swap space - otherwise large mmaps usually fail), +/// * MADV_DONTDUMP (don't include in core dumps), +/// * page-aligned addresses without padding. +/// +/// madvise(MADV_FREE) call is slow: ~6 GiB/s (doesn't scale with more threads). Enabling transparent +/// huge pages (MADV_HUGEPAGE) makes it 10x less slow, so we do that. That makes the physical RAM allocation +/// work at 2 MiB granularity instead of 4 KiB, so the cache becomes less suitable for small files. +/// If this turns out to be a problem, we may consider allowing different mmaps to have different flags, +/// some having no huge pages. +/// Note that we do our bookkeeping at small-page granularity even if huge pages are enabled. 
+/// +/// It's unfortunate that Linux's MADV_FREE eviction doesn't use the two-list strategy like the real +/// page cache (IIUC, MADV_FREE puts the pages at the head of the inactive list, and they can never +/// get to the active list). +/// If this turns out to be a problem, we could make PageCache do chunk eviction based on observed +/// system memory usage, so that most eviction is done by us, and the MADV_FREE eviction kicks in +/// only as a last resort. Then we can make PageCache's eviction policy arbitrarily more sophisticated. + +namespace DB +{ + +/// Hash of FileChunkAddress. +using PageCacheKey = UInt128; + +/// Identifies a chunk of a file or object. +/// We assume that contents of such file/object don't change (without file_version changing), so +/// cache invalidation is never needed. +struct FileChunkAddress +{ + /// Path, usually prefixed with storage system name and anything else needed to make it unique. + /// E.g. "s3:<bucket>/" + std::string path; + /// Optional string with ETag, or file modification time, or anything else. + std::string file_version; + size_t offset = 0; + + PageCacheKey hash() const; + + std::string toString() const; +}; + +struct AtomicBitSet +{ + size_t n = 0; + std::unique_ptr<std::atomic<UInt8>[]> v; + + AtomicBitSet(); + + void init(size_t n); + + bool get(size_t i) const; + bool any() const; + /// These return true if the bit was changed, false if it already had the target value. + /// (These methods are logically not const, but clang insists that I make them const, and + /// '#pragma clang diagnostic ignored' doesn't seem to work.) + bool set(size_t i) const; + bool set(size_t i, bool val) const; + bool unset(size_t i) const; + void unsetAll() const; +}; + +enum class PageChunkState +{ + /// Pages are not reclaimable by the OS, the buffer has correct contents. + Stable, + /// Pages are reclaimable by the OS, the buffer contents are altered (first bit of each page set to 1). + Limbo, +}; + +/// (This is a separate struct just in case we want to use this cache for other things in future. +/// Then this struct would be the customization point, while the rest of PageChunk can stay unchanged.) +struct FileChunkState +{ + std::mutex download_mutex; + + void reset(); +}; + +using PageChunkLRUListHook = boost::intrusive::list_base_hook<>; + +/// Cache entry. +struct PageChunk : public PageChunkLRUListHook +{ + char * data; + size_t size; // in bytes + /// Page size for use in pages_populated and first_bit_of_each_page. Same as PageCache::pageSize(). + size_t page_size; + + /// Actual eviction granularity. Just for information. If huge pages are used, huge page size, otherwise page_size. + size_t big_page_size; + + mutable FileChunkState state; + + AtomicBitSet pages_populated; + +private: + friend class PinnedPageChunk; + friend class PageCache; + + /// If nullopt, the chunk is "detached", i.e. not associated with any key. + /// Detached chunks may still be pinned. Chunk may get detached even while pinned, in particular when dropping cache. + /// Protected by global_mutex. + std::optional<PageCacheKey> key; + + /// Refcount for usage of this chunk. When zero, the pages are reclaimable by the OS, and + /// the PageChunk itself is evictable (linked into PageCache::lru). + std::atomic<size_t> pin_count {0}; + + /// Bit mask containing the first bit of data from each page. Needed for the weird probing procedure when un-MADV_FREE-ing the pages. + AtomicBitSet first_bit_of_each_page; + + /// Locked when changing pages_state, along with the corresponding expensive MADV_FREE/un-MADV_FREE operation.
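
For orientation, a short sketch of how a caller could form the key for one aligned block of a remote object. The `s3:<bucket>/` prefix mirrors the `cache_path_prefix` that `S3ObjectStorage` passes later in this patch; the bucket, path, and ETag values are invented:

```cpp
#include <Common/PageCache.h>

using namespace DB;

PageCacheKey keyForThirdBlock()
{
    FileChunkAddress addr;
    addr.path = "s3:mybucket/data/part_0.bin";  /// storage prefix + object path
    addr.file_version = "etag-abc";             /// invalidates the key when the object changes
    addr.offset = 4 << 20;                      /// block-aligned offset of the third 2 MiB block
    return addr.hash();                         /// SipHash over offset, path and version
}
```
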
+ mutable std::mutex chunk_mutex; + + /// Normally pin_count == 0 <=> state == PageChunkState::Limbo, + /// pin_count > 0 <=> state == PageChunkState::Stable. + /// This separate field is needed because of synchronization: pin_count is changed with global_mutex locked, + /// this field is changed with chunk_mutex locked, and we never have to lock both mutexes at once. + PageChunkState pages_state = PageChunkState::Stable; +}; + +class PageCache; + +/// Handle for a cache entry. Neither the entry nor its pages can get evicted while there's at least one PinnedPageChunk pointing to it. +class PinnedPageChunk +{ +public: + const PageChunk * getChunk() const; + + /// Sets the bit in pages_populated. Returns true if it actually changed (i.e. was previously 0). + bool markPagePopulated(size_t page_idx); + + /// Calls markPagePopulated() for pages 0..ceil(bytes/page_size). + void markPrefixPopulated(size_t bytes); + + bool isPrefixPopulated(size_t bytes) const; + + PinnedPageChunk() = default; + ~PinnedPageChunk() noexcept; + + PinnedPageChunk(PinnedPageChunk &&) noexcept; + PinnedPageChunk & operator=(PinnedPageChunk &&) noexcept; + +private: + friend class PageCache; + + PageCache * cache = nullptr; + PageChunk * chunk = nullptr; + + PinnedPageChunk(PageCache * cache_, PageChunk * chunk_) noexcept; +}; + +class PageCache +{ +public: + PageCache(size_t bytes_per_chunk, size_t bytes_per_mmap, size_t bytes_total, bool use_madv_free, bool use_huge_pages); + ~PageCache(); + + /// Get or insert a chunk for the given key. + /// + /// If detached_if_missing = true, and the key is not present in the cache, the returned chunk + /// won't be associated with the key and will be evicted as soon as it's unpinned. + /// It's like "get if exists, otherwise return null", but instead of null we return a usable + /// temporary buffer, for convenience. Pinning and page eviction make the story more complicated: + /// * If the chunk for this key is pinned, we return it even if it's not fully populated + /// (because PageCache doesn't know what "fully populated" means). + /// * If the chunk exists, but some of its pages were evicted, we detach it. (Currently we only + /// check the first page here.) + PinnedPageChunk getOrSet(PageCacheKey key, bool detached_if_missing, bool inject_eviction); + + /// OS page size, e.g. 4 KiB on x86, 4 KiB or 64 KiB on aarch64. + /// + /// If transparent huge pages are enabled, this is still the regular page size, and all our bookkeeping + /// is still based on regular page size (e.g. pages_populated), because (a) it's cheap anyway, + /// and (b) I'm not sure if Linux guarantees that MADV_FREE reclamation always happens at huge page + /// granularity, and wouldn't want to rely on this even if it does. + size_t pageSize() const; + size_t chunkSize() const; + size_t maxChunks() const; + + struct MemoryStats + { + /// How many bytes of actual RAM are used for the cache pages. Doesn't include metadata + /// and overhead (e.g. PageChunk structs). + size_t page_cache_rss = 0; + /// Resident set size for the whole process, excluding any MADV_FREE pages (PageCache's or not). + /// This can be used as a more useful memory usage number for clickhouse server, instead of RSS. + /// Populated only if MADV_FREE is used, otherwise zero. + std::optional unreclaimable_rss; + }; + + /// Reads /proc/self/smaps, so not very fast. + MemoryStats getResidentSetSize() const; + + /// Total length of memory ranges currently pinned by PinnedPageChunk-s, including unpopulated pages. 
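
A hedged usage sketch of the API declared here: pin a chunk, refill it on a miss, and rely on the `PinnedPageChunk`'s lifetime to keep the pages resident. Real callers also have to coordinate concurrent fills (that is what `FileChunkState::download_mutex` is for); `fill_from_remote` is an invented callback:

```cpp
#include <Common/PageCache.h>

using namespace DB;

void readBlock(PageCache & cache, const FileChunkAddress & addr, size_t bytes,
               void (*fill_from_remote)(char * buf, size_t n))
{
    PinnedPageChunk chunk = cache.getOrSet(addr.hash(), /* detached_if_missing */ false, /* inject_eviction */ false);

    if (!chunk.isPrefixPopulated(bytes))
    {
        /// Miss, or the OS reclaimed some pages: refill and record what is now valid.
        fill_from_remote(chunk.getChunk()->data, bytes);
        chunk.markPrefixPopulated(bytes);
    }

    /// While `chunk` is alive the pages are pinned, so chunk.getChunk()->data is safe to read.
}
```
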
+ size_t getPinnedSize() const; + + /// Clears the key -> chunk mapping. Frees memory (MADV_DONTNEED) of all chunks that are not pinned. + /// Doesn't unmap any virtual memory. Detaches but doesn't free the pinned chunks. + /// Locks the global mutex for the duration of the operation, which may block queries for hundreds of milliseconds. + void dropCache(); + +private: + friend class PinnedPageChunk; + + struct Mmap + { + void * ptr = nullptr; + size_t size = 0; + + std::unique_ptr chunks; + size_t num_chunks = 0; // might be smaller than chunks_per_mmap_target because of alignment + + Mmap(Mmap &&) noexcept; + Mmap(size_t bytes_per_page, size_t pages_per_chunk, size_t pages_per_big_page, size_t num_chunks, void * address_hint, bool use_huge_pages_); + ~Mmap() noexcept; + }; + + size_t bytes_per_page; + size_t pages_per_chunk; + size_t chunks_per_mmap_target; + size_t max_mmaps; + size_t pages_per_big_page = 1; // if huge pages are used, huge_page_size/page_size, otherwise 1 + bool use_madv_free = true; + bool use_huge_pages = true; + + mutable std::mutex global_mutex; + + pcg64 rng; + + std::vector mmaps; + size_t total_chunks = 0; + + /// All non-pinned chunks, including ones not assigned to any file. Least recently used is begin(). + boost::intrusive::list, boost::intrusive::constant_time_size> lru; + + HashMap chunk_by_key; + + /// Get a usable chunk, doing eviction or allocation if needed. + /// Caller is responsible for clearing pages_populated. + PageChunk * getFreeChunk(std::unique_lock & /* global_mutex */); + void addMmap(std::unique_lock & /* global_mutex */); + void evictChunk(PageChunk * chunk, std::unique_lock & /* global_mutex */); + + void removeRef(PageChunk * chunk) noexcept; + + /// These may run in parallel with getFreeChunk(), so be very careful about which fields of the PageChunk we touch here. + void sendChunkToLimbo(PageChunk * chunk, std::unique_lock & /* chunk_mutex */) const noexcept; + /// Returns {pages_restored, pages_evicted}. + std::pair restoreChunkFromLimbo(PageChunk * chunk, std::unique_lock & /* chunk_mutex */) const noexcept; +}; + +using PageCachePtr = std::shared_ptr; + +} diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp index d8ca1ab9e93..3a8659b8b27 100644 --- a/src/Common/ProfileEvents.cpp +++ b/src/Common/ProfileEvents.cpp @@ -63,6 +63,15 @@ M(MarkCacheMisses, "Number of times an entry has not been found in the mark cache, so we had to load a mark file in memory, which is a costly operation, adding to query latency.") \ M(QueryCacheHits, "Number of times a query result has been found in the query cache (and query computation was avoided). Only updated for SELECT queries with SETTING use_query_cache = 1.") \ M(QueryCacheMisses, "Number of times a query result has not been found in the query cache (and required query computation). Only updated for SELECT queries with SETTING use_query_cache = 1.") \ + /* Each page cache chunk access increments exactly one of the following 5 PageCacheChunk* counters. */ \ + /* Something like hit rate: (PageCacheChunkShared + PageCacheChunkDataHits) / [sum of all 5]. 
*/ \ + M(PageCacheChunkMisses, "Number of times a chunk has not been found in the userspace page cache.") \ + M(PageCacheChunkShared, "Number of times a chunk has been found in the userspace page cache, already in use by another thread.") \ + M(PageCacheChunkDataHits, "Number of times a chunk has been found in the userspace page cache, not in use, with all pages intact.") \ + M(PageCacheChunkDataPartialHits, "Number of times a chunk has been found in the userspace page cache, not in use, but some of its pages were evicted by the OS.") \ + M(PageCacheChunkDataMisses, "Number of times a chunk has been found in the userspace page cache, not in use, but all its pages were evicted by the OS.") \ + M(PageCacheBytesUnpinnedRoundedToPages, "Total size of populated pages in chunks that became evictable in PageCache. Rounded up to whole pages.") \ + M(PageCacheBytesUnpinnedRoundedToHugePages, "See PageCacheBytesUnpinnedRoundedToPages, but rounded to huge pages. Use the ratio between the two as a measure of memory waste from using huge pages.") \ M(CreatedReadBufferOrdinary, "Number of times ordinary read buffer was created for reading data (while choosing among other read methods).") \ M(CreatedReadBufferDirectIO, "Number of times a read buffer with O_DIRECT was created for reading data (while choosing among other read methods).") \ M(CreatedReadBufferDirectIOFailed, "Number of times a read buffer with O_DIRECT was attempted to be created for reading data (while choosing among other read methods), but the OS did not allow it (due to lack of filesystem support or other reasons) and we fallen back to the ordinary reading method.") \ diff --git a/src/Core/Defines.h b/src/Core/Defines.h index bf9fb1db6bc..cc6f49aa361 100644 --- a/src/Core/Defines.h +++ b/src/Core/Defines.h @@ -70,6 +70,15 @@ static constexpr auto DBMS_DEFAULT_MAX_QUERY_SIZE = 262144; /// Max depth of hierarchical dictionary static constexpr auto DBMS_HIERARCHICAL_DICTIONARY_MAX_DEPTH = 1000; +#ifdef OS_LINUX +#define DBMS_DEFAULT_PAGE_CACHE_USE_MADV_FREE true +#else +/// On Mac OS, MADV_FREE is not lazy, so page_cache_use_madv_free should be disabled. +/// On FreeBSD, it may work but we haven't tested it. +#define DBMS_DEFAULT_PAGE_CACHE_USE_MADV_FREE false +#endif + + /// Default maximum (total and entry) sizes and policies of various caches static constexpr auto DEFAULT_UNCOMPRESSED_CACHE_POLICY = "SLRU"; static constexpr auto DEFAULT_UNCOMPRESSED_CACHE_MAX_SIZE = 0_MiB; diff --git a/src/Core/ServerSettings.h b/src/Core/ServerSettings.h index 3713d0c3206..a54fb42b464 100644 --- a/src/Core/ServerSettings.h +++ b/src/Core/ServerSettings.h @@ -65,7 +65,7 @@ namespace DB M(UInt64, max_concurrent_insert_queries, 0, "Maximum number of concurrently INSERT queries. Zero means unlimited.", 0) \ M(UInt64, max_concurrent_select_queries, 0, "Maximum number of concurrently SELECT queries. Zero means unlimited.", 0) \ \ - M(Double, cache_size_to_ram_max_ratio, 0.5, "Set cache size ro RAM max ratio. Allows to lower cache size on low-memory systems.", 0) \ + M(Double, cache_size_to_ram_max_ratio, 0.5, "Set cache size to RAM max ratio. Allows to lower cache size on low-memory systems.", 0) \ M(String, uncompressed_cache_policy, DEFAULT_UNCOMPRESSED_CACHE_POLICY, "Uncompressed cache policy name.", 0) \ M(UInt64, uncompressed_cache_size, DEFAULT_UNCOMPRESSED_CACHE_MAX_SIZE, "Size of cache for uncompressed blocks. 
Zero means disabled.", 0) \ M(Double, uncompressed_cache_size_ratio, DEFAULT_UNCOMPRESSED_CACHE_SIZE_RATIO, "The size of the protected queue in the uncompressed cache relative to the cache's total size.", 0) \ @@ -78,6 +78,11 @@ namespace DB M(String, index_mark_cache_policy, DEFAULT_INDEX_MARK_CACHE_POLICY, "Secondary index mark cache policy name.", 0) \ M(UInt64, index_mark_cache_size, DEFAULT_INDEX_MARK_CACHE_MAX_SIZE, "Size of cache for secondary index marks. Zero means disabled.", 0) \ M(Double, index_mark_cache_size_ratio, DEFAULT_INDEX_MARK_CACHE_SIZE_RATIO, "The size of the protected queue in the secondary index mark cache relative to the cache's total size.", 0) \ + M(UInt64, page_cache_chunk_size, 2 << 20, "Bytes per chunk in userspace page cache. Rounded up to a multiple of page size (typically 4 KiB) or huge page size (typically 2 MiB, only if page_cache_use_thp is enabled).", 0) \ + M(UInt64, page_cache_mmap_size, 1 << 30, "Bytes per memory mapping in userspace page cache. Not important.", 0) \ + M(UInt64, page_cache_size, 10ul << 30, "Amount of virtual memory to map for userspace page cache. If page_cache_use_madv_free is enabled, it's recommended to set this higher than the machine's RAM size. Use 0 to disable userspace page cache.", 0) \ + M(Bool, page_cache_use_madv_free, DBMS_DEFAULT_PAGE_CACHE_USE_MADV_FREE, "If true, the userspace page cache will allow the OS to automatically reclaim memory from the cache on memory pressure (using MADV_FREE).", 0) \ + M(Bool, page_cache_use_transparent_huge_pages, true, "Userspace will attempt to use transparent huge pages on Linux. This is best-effort.", 0) \ M(UInt64, mmap_cache_size, DEFAULT_MMAP_CACHE_MAX_SIZE, "A cache for mmapped files.", 0) \ \ M(Bool, disable_internal_dns_cache, false, "Disable internal DNS caching at all.", 0) \ diff --git a/src/Core/Settings.h b/src/Core/Settings.h index ae6ea165cc9..7d1112af3a7 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -777,6 +777,10 @@ class IColumn; M(Bool, throw_on_error_from_cache_on_write_operations, false, "Ignore error from cache when caching on write operations (INSERT, merges)", 0) \ M(UInt64, filesystem_cache_segments_batch_size, 20, "Limit on size of a single batch of file segments that a read buffer can request from cache. Too low value will lead to excessive requests to cache, too large may slow down eviction from cache", 0) \ \ + M(Bool, use_page_cache_for_disks_without_file_cache, false, "Use userspace page cache for remote disks that don't have filesystem cache enabled.", 0) \ + M(Bool, read_from_page_cache_if_exists_otherwise_bypass_cache, false, "Use userspace page cache in passive mode, similar to read_from_filesystem_cache_if_exists_otherwise_bypass_cache.", 0) \ + M(Bool, page_cache_inject_eviction, false, "Userspace page cache will sometimes invalidate some pages at random. Intended for testing.", 0) \ + \ M(Bool, load_marks_asynchronously, false, "Load MergeTree marks asynchronously", 0) \ M(Bool, enable_filesystem_read_prefetches_log, false, "Log to system.filesystem prefetch_log during query. 
Should be used only for testing or debugging, not recommended to be turned on by default", 0) \ M(Bool, allow_prefetched_read_pool_for_remote_filesystem, true, "Prefer prefetched threadpool if all parts are on remote filesystem", 0) \ diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index e8d013d13ec..02ee641903c 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -114,6 +114,9 @@ static std::map sett {"default_normal_view_sql_security", "INVOKER", "INVOKER", "Allows to set default `SQL SECURITY` option while creating a normal view"}, {"mysql_map_string_to_text_in_show_columns", false, true, "Reduce the configuration effort to connect ClickHouse with BI tools."}, {"mysql_map_fixed_string_to_text_in_show_columns", false, true, "Reduce the configuration effort to connect ClickHouse with BI tools."}, + {"use_page_cache_for_disks_without_file_cache", false, false, "Added userspace page cache"}, + {"read_from_page_cache_if_exists_otherwise_bypass_cache", false, false, "Added userspace page cache"}, + {"page_cache_inject_eviction", false, false, "Added userspace page cache"}, }}, {"24.1", {{"print_pretty_type_names", false, true, "Better user experience."}, {"input_format_json_read_bools_as_strings", false, true, "Allow to read bools as strings in JSON formats by default"}, diff --git a/src/Disks/IO/AsynchronousBoundedReadBuffer.cpp b/src/Disks/IO/AsynchronousBoundedReadBuffer.cpp index 2373640704b..1a9cd2c994c 100644 --- a/src/Disks/IO/AsynchronousBoundedReadBuffer.cpp +++ b/src/Disks/IO/AsynchronousBoundedReadBuffer.cpp @@ -129,6 +129,7 @@ void AsynchronousBoundedReadBuffer::setReadUntilPosition(size_t position) /// new read until position is after the current position in the working buffer file_offset_of_buffer_end = position; working_buffer.resize(working_buffer.size() - (file_offset_of_buffer_end - position)); + pos = std::min(pos, working_buffer.end()); } else { @@ -235,9 +236,6 @@ bool AsynchronousBoundedReadBuffer::nextImpl() file_offset_of_buffer_end = impl->getFileOffsetOfBufferEnd(); - /// In case of multiple files for the same file in clickhouse (i.e. log family) - /// file_offset_of_buffer_end will not match getImplementationBufferOffset() - /// so we use [impl->getImplementationBufferOffset(), impl->getFileSize()] chassert(file_offset_of_buffer_end <= impl->getFileSize()); if (read_until_position && (file_offset_of_buffer_end > *read_until_position)) @@ -264,7 +262,7 @@ off_t AsynchronousBoundedReadBuffer::seek(off_t offset, int whence) size_t new_pos; if (whence == SEEK_SET) { - assert(offset >= 0); + chassert(offset >= 0); new_pos = offset; } else if (whence == SEEK_CUR) @@ -290,8 +288,8 @@ off_t AsynchronousBoundedReadBuffer::seek(off_t offset, int whence) /// Position is still inside the buffer. /// Probably it is at the end of the buffer - then we will load data on the following 'next' call. pos = working_buffer.end() - file_offset_of_buffer_end + new_pos; - assert(pos >= working_buffer.begin()); - assert(pos <= working_buffer.end()); + chassert(pos >= working_buffer.begin()); + chassert(pos <= working_buffer.end()); return new_pos; } @@ -317,7 +315,7 @@ off_t AsynchronousBoundedReadBuffer::seek(off_t offset, int whence) break; } - assert(!prefetch_future.valid()); + chassert(!prefetch_future.valid()); /// First reset the buffer so the next read will fetch new data to the buffer. 
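
The pointer arithmetic in the in-buffer seek above is compact; a toy model with invented numbers (grouped to avoid forming an out-of-range intermediate pointer) shows why it lands on the right byte:

```cpp
#include <cassert>
#include <cstddef>

int main()
{
    /// The working buffer holds file bytes [file_offset_of_buffer_end - size, file_offset_of_buffer_end),
    /// so an absolute file position maps to a pointer counted back from the buffer's end.
    char buf[100];
    size_t file_offset_of_buffer_end = 350;   /// buffer covers file range [250, 350)
    char * working_buffer_end = buf + 100;

    size_t new_pos = 300;                     /// absolute file position to seek to
    char * pos = working_buffer_end - (file_offset_of_buffer_end - new_pos);

    assert(pos == buf + 50);                  /// byte 300 sits 50 bytes before the buffer's end
    (void)pos;
}
```
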
resetWorkingBuffer(); diff --git a/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp b/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp index 7ce3d58dcd8..47ee5858562 100644 --- a/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp +++ b/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp @@ -1215,7 +1215,7 @@ size_t CachedOnDiskReadBufferFromFile::getRemainingSizeToRead() void CachedOnDiskReadBufferFromFile::setReadUntilPosition(size_t position) { - if (!allow_seeks_after_first_read) + if (initialized && !allow_seeks_after_first_read) throw Exception(ErrorCodes::LOGICAL_ERROR, "Method `setReadUntilPosition()` not allowed"); if (read_until_position == position) diff --git a/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp b/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp index 0b3ecca3587..417f7615dd7 100644 --- a/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp +++ b/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -16,12 +17,16 @@ using namespace DB; namespace { -bool withCache(const ReadSettings & settings) +bool withFileCache(const ReadSettings & settings) { return settings.remote_fs_cache && settings.enable_filesystem_cache && (!CurrentThread::getQueryId().empty() || settings.read_from_filesystem_cache_if_exists_otherwise_bypass_cache || !settings.avoid_readthrough_cache_outside_query_context); } +bool withPageCache(const ReadSettings & settings, bool with_file_cache) +{ + return settings.page_cache && !with_file_cache && settings.use_page_cache_for_disks_without_file_cache; +} } namespace DB @@ -34,7 +39,7 @@ namespace ErrorCodes size_t chooseBufferSizeForRemoteReading(const DB::ReadSettings & settings, size_t file_size) { /// Only when cache is used we could download bigger portions of FileSegments than what we actually gonna read within particular task. - if (!withCache(settings)) + if (!withFileCache(settings)) return settings.remote_fs_buffer_size; /// Buffers used for prefetch and pre-download better to have enough size, but not bigger than the whole file. @@ -44,27 +49,30 @@ size_t chooseBufferSizeForRemoteReading(const DB::ReadSettings & settings, size_ ReadBufferFromRemoteFSGather::ReadBufferFromRemoteFSGather( ReadBufferCreator && read_buffer_creator_, const StoredObjects & blobs_to_read_, + const std::string & cache_path_prefix_, const ReadSettings & settings_, std::shared_ptr cache_log_, bool use_external_buffer_) - : ReadBufferFromFileBase( - use_external_buffer_ ? 0 : chooseBufferSizeForRemoteReading(settings_, getTotalSize(blobs_to_read_)), nullptr, 0) + : ReadBufferFromFileBase(use_external_buffer_ ? 0 : chooseBufferSizeForRemoteReading( + settings_, getTotalSize(blobs_to_read_)), nullptr, 0) , settings(settings_) , blobs_to_read(blobs_to_read_) , read_buffer_creator(std::move(read_buffer_creator_)) + , cache_path_prefix(cache_path_prefix_) , cache_log(settings.enable_filesystem_cache_log ? 
cache_log_ : nullptr) , query_id(CurrentThread::getQueryId()) , use_external_buffer(use_external_buffer_) - , with_cache(withCache(settings)) + , with_file_cache(withFileCache(settings)) + , with_page_cache(withPageCache(settings, with_file_cache)) , log(getLogger("ReadBufferFromRemoteFSGather")) { if (!blobs_to_read.empty()) current_object = blobs_to_read.front(); } -SeekableReadBufferPtr ReadBufferFromRemoteFSGather::createImplementationBuffer(const StoredObject & object) +SeekableReadBufferPtr ReadBufferFromRemoteFSGather::createImplementationBuffer(const StoredObject & object, size_t start_offset) { - if (current_buf && !with_cache) + if (current_buf && !with_file_cache) { appendUncachedReadInfo(); } @@ -72,30 +80,45 @@ SeekableReadBufferPtr ReadBufferFromRemoteFSGather::createImplementationBuffer(c current_object = object; const auto & object_path = object.remote_path; - size_t current_read_until_position = read_until_position ? read_until_position : object.bytes_size; - auto current_read_buffer_creator = [=, this]() { return read_buffer_creator(object_path, current_read_until_position); }; + std::unique_ptr buf; #ifndef CLICKHOUSE_KEEPER_STANDALONE_BUILD - if (with_cache) + if (with_file_cache) { auto cache_key = settings.remote_fs_cache->createKeyForPath(object_path); - return std::make_shared( + buf = std::make_unique( object_path, cache_key, settings.remote_fs_cache, FileCache::getCommonUser(), - std::move(current_read_buffer_creator), + [=, this]() { return read_buffer_creator(/* restricted_seek */true, object_path); }, settings, query_id, object.bytes_size, /* allow_seeks */false, /* use_external_buffer */true, - read_until_position ? std::optional(read_until_position) : std::nullopt, + /* read_until_position */std::nullopt, cache_log); } #endif - return current_read_buffer_creator(); + /// Can't wrap CachedOnDiskReadBufferFromFile in CachedInMemoryReadBufferFromFile because the + /// former doesn't support seeks. + if (with_page_cache && !buf) + { + auto inner = read_buffer_creator(/* restricted_seek */false, object_path); + auto cache_key = FileChunkAddress { .path = cache_path_prefix + object_path }; + buf = std::make_unique( + cache_key, settings.page_cache, std::move(inner), settings); + } + + if (!buf) + buf = read_buffer_creator(/* restricted_seek */true, object_path); + + if (read_until_position > start_offset && read_until_position < start_offset + object.bytes_size) + buf->setReadUntilPosition(read_until_position - start_offset); + + return buf; } void ReadBufferFromRemoteFSGather::appendUncachedReadInfo() @@ -124,12 +147,12 @@ void ReadBufferFromRemoteFSGather::initialize() return; /// One clickhouse file can be split into multiple files in remote fs. 
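
The net effect of the `createImplementationBuffer()` changes above is a three-way ladder: the filesystem cache wins when enabled, otherwise the userspace page cache wraps a freely seekable inner buffer, otherwise the plain remote buffer is used with restricted seek. A distilled sketch with stub types standing in for the real buffer classes (CachedOnDiskReadBufferFromFile, CachedInMemoryReadBufferFromFile, and the storage-specific read buffer):

```cpp
#include <memory>
#include <utility>

struct ReadBuffer { virtual ~ReadBuffer() = default; };
struct FileCacheBuffer : ReadBuffer {};                                   /// on-disk file cache, restricted seek
struct PageCacheBuffer : ReadBuffer { std::unique_ptr<ReadBuffer> inner; }; /// in-memory page cache over a seekable inner buffer
struct PlainRemoteBuffer : ReadBuffer {};                                 /// direct remote read, restricted seek

std::unique_ptr<ReadBuffer> makeBuffer(bool with_file_cache, bool with_page_cache)
{
    std::unique_ptr<ReadBuffer> buf;
    if (with_file_cache)
        buf = std::make_unique<FileCacheBuffer>();
    if (!buf && with_page_cache)
    {
        auto pc = std::make_unique<PageCacheBuffer>();
        pc->inner = std::make_unique<PlainRemoteBuffer>();
        buf = std::move(pc);
    }
    if (!buf)
        buf = std::make_unique<PlainRemoteBuffer>();
    return buf;
}
```
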
- auto current_buf_offset = file_offset_of_buffer_end; + size_t start_offset = 0; for (size_t i = 0; i < blobs_to_read.size(); ++i) { const auto & object = blobs_to_read[i]; - if (object.bytes_size > current_buf_offset) + if (start_offset + object.bytes_size > file_offset_of_buffer_end) { LOG_TEST(log, "Reading from file: {} ({})", object.remote_path, object.local_path); @@ -137,14 +160,14 @@ void ReadBufferFromRemoteFSGather::initialize() if (!current_buf || current_buf_idx != i) { current_buf_idx = i; - current_buf = createImplementationBuffer(object); + current_buf = createImplementationBuffer(object, start_offset); } - current_buf->seek(current_buf_offset, SEEK_SET); + current_buf->seek(file_offset_of_buffer_end - start_offset, SEEK_SET); return; } - current_buf_offset -= object.bytes_size; + start_offset += object.bytes_size; } current_buf_idx = blobs_to_read.size(); current_buf = nullptr; @@ -171,14 +194,14 @@ bool ReadBufferFromRemoteFSGather::nextImpl() bool ReadBufferFromRemoteFSGather::moveToNextBuffer() { /// If there is no available buffers - nothing to read. - if (current_buf_idx + 1 >= blobs_to_read.size()) + if (current_buf_idx + 1 >= blobs_to_read.size() || (read_until_position && file_offset_of_buffer_end >= read_until_position)) return false; ++current_buf_idx; const auto & object = blobs_to_read[current_buf_idx]; LOG_TEST(log, "Reading from next file: {} ({})", object.remote_path, object.local_path); - current_buf = createImplementationBuffer(object); + current_buf = createImplementationBuffer(object, file_offset_of_buffer_end); return true; } @@ -263,7 +286,7 @@ off_t ReadBufferFromRemoteFSGather::seek(off_t offset, int whence) ReadBufferFromRemoteFSGather::~ReadBufferFromRemoteFSGather() { - if (!with_cache) + if (!with_file_cache) appendUncachedReadInfo(); } diff --git a/src/Disks/IO/ReadBufferFromRemoteFSGather.h b/src/Disks/IO/ReadBufferFromRemoteFSGather.h index f6b7506a54f..8362b354e23 100644 --- a/src/Disks/IO/ReadBufferFromRemoteFSGather.h +++ b/src/Disks/IO/ReadBufferFromRemoteFSGather.h @@ -21,11 +21,12 @@ class ReadBufferFromRemoteFSGather final : public ReadBufferFromFileBase friend class ReadIndirectBufferFromRemoteFS; public: - using ReadBufferCreator = std::function(const std::string & path, size_t read_until_position)>; + using ReadBufferCreator = std::function(bool restricted_seek, const std::string & path)>; ReadBufferFromRemoteFSGather( ReadBufferCreator && read_buffer_creator_, const StoredObjects & blobs_to_read_, + const std::string & cache_path_prefix_, const ReadSettings & settings_, std::shared_ptr cache_log_, bool use_external_buffer_); @@ -53,7 +54,7 @@ public: bool isContentCached(size_t offset, size_t size) override; private: - SeekableReadBufferPtr createImplementationBuffer(const StoredObject & object); + SeekableReadBufferPtr createImplementationBuffer(const StoredObject & object, size_t start_offset); bool nextImpl() override; @@ -70,10 +71,12 @@ private: const ReadSettings settings; const StoredObjects blobs_to_read; const ReadBufferCreator read_buffer_creator; + const std::string cache_path_prefix; const std::shared_ptr cache_log; const String query_id; const bool use_external_buffer; - const bool with_cache; + const bool with_file_cache; + const bool with_page_cache; size_t read_until_position = 0; size_t file_offset_of_buffer_end = 0; diff --git a/src/Disks/IO/ThreadPoolRemoteFSReader.cpp b/src/Disks/IO/ThreadPoolRemoteFSReader.cpp index f3caf62ffd5..590fc4c4656 100644 --- a/src/Disks/IO/ThreadPoolRemoteFSReader.cpp +++ 
b/src/Disks/IO/ThreadPoolRemoteFSReader.cpp @@ -152,6 +152,8 @@ IAsynchronousReader::Result ThreadPoolRemoteFSReader::execute(Request request, b IAsynchronousReader::Result read_result; if (result) { + chassert(reader.buffer().begin() == request.buf); + chassert(reader.buffer().end() <= request.buf + request.size); read_result.size = reader.buffer().size(); read_result.offset = reader.offset(); ProfileEvents::increment(ProfileEvents::ThreadpoolReaderReadBytes, read_result.size); diff --git a/src/Disks/IO/ThreadPoolRemoteFSReader.h b/src/Disks/IO/ThreadPoolRemoteFSReader.h index abc251b2b10..eacce5a54ac 100644 --- a/src/Disks/IO/ThreadPoolRemoteFSReader.h +++ b/src/Disks/IO/ThreadPoolRemoteFSReader.h @@ -29,6 +29,9 @@ private: class RemoteFSFileDescriptor : public IAsynchronousReader::IFileDescriptor { public: + /// `reader_` implementation must ensure that next() places data at the start of internal_buffer, + /// even if there was previously a seek. I.e. seek() shouldn't leave pending data (no short seek + /// optimization), and nextImpl() shouldn't assign nextimpl_working_buffer_offset. explicit RemoteFSFileDescriptor( SeekableReadBuffer & reader_, std::shared_ptr async_read_counters_) diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp index 74389aedb64..136f69ab729 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp @@ -206,7 +206,7 @@ std::unique_ptr AzureObjectStorage::readObjects( /// NOL auto read_buffer_creator = [this, settings_ptr, disk_read_settings] - (const std::string & path, size_t read_until_position) -> std::unique_ptr + (bool restricted_seek, const std::string & path) -> std::unique_ptr { return std::make_unique( client.get(), @@ -215,8 +215,7 @@ std::unique_ptr AzureObjectStorage::readObjects( /// NOL settings_ptr->max_single_read_retries, settings_ptr->max_single_download_retries, /* use_external_buffer */true, - /* restricted_seek */true, - read_until_position); + restricted_seek); }; switch (read_settings.remote_fs_method) @@ -226,16 +225,17 @@ std::unique_ptr AzureObjectStorage::readObjects( /// NOL return std::make_unique( std::move(read_buffer_creator), objects, + "azure:", disk_read_settings, global_context->getFilesystemCacheLog(), /* use_external_buffer */false); - } case RemoteFSReadMethod::threadpool: { auto impl = std::make_unique( std::move(read_buffer_creator), objects, + "azure:", disk_read_settings, global_context->getFilesystemCacheLog(), /* use_external_buffer */true); diff --git a/src/Disks/ObjectStorages/DiskObjectStorage.cpp b/src/Disks/ObjectStorages/DiskObjectStorage.cpp index 2a648f28f14..16183ec20c1 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorage.cpp +++ b/src/Disks/ObjectStorages/DiskObjectStorage.cpp @@ -527,10 +527,9 @@ std::unique_ptr DiskObjectStorage::readFile( std::optional read_hint, std::optional file_size) const { - auto storage_objects = metadata_storage->getStorageObjects(path); + const auto storage_objects = metadata_storage->getStorageObjects(path); const bool file_can_be_empty = !file_size.has_value() || *file_size == 0; - if (storage_objects.empty() && file_can_be_empty) return std::make_unique(); diff --git a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp index fa5e227d853..f8545ecfe39 100644 --- a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp +++ 
b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp @@ -60,7 +60,7 @@ std::unique_ptr HDFSObjectStorage::readObjects( /// NOLI auto disk_read_settings = patchSettings(read_settings); auto read_buffer_creator = [this, disk_read_settings] - (const std::string & path, size_t /* read_until_position */) -> std::unique_ptr + (bool /* restricted_seek */, const std::string & path) -> std::unique_ptr { size_t begin_of_path = path.find('/', path.find("//") + 2); auto hdfs_path = path.substr(begin_of_path); @@ -71,7 +71,7 @@ std::unique_ptr HDFSObjectStorage::readObjects( /// NOLI }; return std::make_unique( - std::move(read_buffer_creator), objects, disk_read_settings, nullptr, /* use_external_buffer */false); + std::move(read_buffer_creator), objects, "hdfs:", disk_read_settings, nullptr, /* use_external_buffer */false); } std::unique_ptr HDFSObjectStorage::writeObject( /// NOLINT diff --git a/src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp b/src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp index 02700b358e0..7fd4536f266 100644 --- a/src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp +++ b/src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp @@ -47,7 +47,7 @@ std::unique_ptr LocalObjectStorage::readObjects( /// NOL auto modified_settings = patchSettings(read_settings); auto global_context = Context::getGlobalContextInstance(); auto read_buffer_creator = - [=] (const std::string & file_path, size_t /* read_until_position */) + [=] (bool /* restricted_seek */, const std::string & file_path) -> std::unique_ptr { return createReadBufferFromFileBase(file_path, modified_settings, read_hint, file_size); @@ -58,13 +58,13 @@ std::unique_ptr LocalObjectStorage::readObjects( /// NOL case RemoteFSReadMethod::read: { return std::make_unique( - std::move(read_buffer_creator), objects, modified_settings, + std::move(read_buffer_creator), objects, "file:", modified_settings, global_context->getFilesystemCacheLog(), /* use_external_buffer */false); } case RemoteFSReadMethod::threadpool: { auto impl = std::make_unique( - std::move(read_buffer_creator), objects, modified_settings, + std::move(read_buffer_creator), objects, "file:", modified_settings, global_context->getFilesystemCacheLog(), /* use_external_buffer */true); auto & reader = global_context->getThreadPoolReader(FilesystemReaderType::ASYNCHRONOUS_REMOTE_FS_READER); diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index 5771eb1ebe0..d89c7c93e51 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -171,7 +171,7 @@ std::unique_ptr S3ObjectStorage::readObjects( /// NOLINT auto read_buffer_creator = [this, settings_ptr, disk_read_settings] - (const std::string & path, size_t read_until_position) -> std::unique_ptr + (bool restricted_seek, const std::string & path) -> std::unique_ptr { return std::make_unique( client.get(), @@ -182,8 +182,8 @@ std::unique_ptr S3ObjectStorage::readObjects( /// NOLINT disk_read_settings, /* use_external_buffer */true, /* offset */0, - read_until_position, - /* restricted_seek */true); + /* read_until_position */0, + restricted_seek); }; switch (read_settings.remote_fs_method) @@ -193,16 +193,17 @@ std::unique_ptr S3ObjectStorage::readObjects( /// NOLINT return std::make_unique( std::move(read_buffer_creator), objects, + "s3:" + uri.bucket + "/", disk_read_settings, global_context->getFilesystemCacheLog(), /* use_external_buffer */false); - } case RemoteFSReadMethod::threadpool: { auto impl 
= std::make_unique( std::move(read_buffer_creator), objects, + "s3:" + uri.bucket + "/", disk_read_settings, global_context->getFilesystemCacheLog(), /* use_external_buffer */true); diff --git a/src/Disks/ObjectStorages/Web/WebObjectStorage.cpp b/src/Disks/ObjectStorages/Web/WebObjectStorage.cpp index 786b23caf48..48de0bf4168 100644 --- a/src/Disks/ObjectStorages/Web/WebObjectStorage.cpp +++ b/src/Disks/ObjectStorages/Web/WebObjectStorage.cpp @@ -252,14 +252,13 @@ std::unique_ptr WebObjectStorage::readObject( /// NOLINT { auto read_buffer_creator = [this, read_settings] - (const std::string & path_, size_t read_until_position) -> std::unique_ptr + (bool /* restricted_seek */, const std::string & path_) -> std::unique_ptr { return std::make_unique( fs::path(url) / path_, getContext(), read_settings, - /* use_external_buffer */true, - read_until_position); + /* use_external_buffer */true); }; auto global_context = Context::getGlobalContextInstance(); @@ -271,6 +270,7 @@ std::unique_ptr WebObjectStorage::readObject( /// NOLINT return std::make_unique( std::move(read_buffer_creator), StoredObjects{object}, + "url:" + url + "/", read_settings, global_context->getFilesystemCacheLog(), /* use_external_buffer */false); @@ -280,6 +280,7 @@ std::unique_ptr WebObjectStorage::readObject( /// NOLINT auto impl = std::make_unique( std::move(read_buffer_creator), StoredObjects{object}, + "url:" + url + "/", read_settings, global_context->getFilesystemCacheLog(), /* use_external_buffer */true); diff --git a/src/IO/AsynchronousReader.h b/src/IO/AsynchronousReader.h index 279a399caad..f9590b4419f 100644 --- a/src/IO/AsynchronousReader.h +++ b/src/IO/AsynchronousReader.h @@ -54,6 +54,9 @@ public: struct Result { + /// The read data is at [buf + offset, buf + size), where `buf` is from Request struct. + /// (Notice that `offset` is included in `size`.) + /// size /// Less than requested amount of data can be returned. /// If size is zero - the file has ended. diff --git a/src/IO/BufferBase.h b/src/IO/BufferBase.h index 4c0a467b155..1a087dd87fa 100644 --- a/src/IO/BufferBase.h +++ b/src/IO/BufferBase.h @@ -60,6 +60,9 @@ public: BufferBase(Position ptr, size_t size, size_t offset) : pos(ptr + offset), working_buffer(ptr, ptr + size), internal_buffer(ptr, ptr + size) {} + /// Assign the buffers and pos. + /// Be careful when calling this from ReadBuffer::nextImpl() implementations: `offset` is + /// effectively ignored because ReadBuffer::next() reassigns `pos`. 
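+    /// For example, in a hypothetical nextImpl() (illustrative sketch, not code from this file;
+    /// `readFromSource` is an assumed helper), a nonzero offset passed here has no effect:
+    ///
+    ///     bool nextImpl() override
+    ///     {
+    ///         size_t n = readFromSource(internal_buffer.begin(), internal_buffer.size());
+    ///         if (n == 0)
+    ///             return false;
+    ///         set(internal_buffer.begin(), n, 3);  /// the offset 3 is lost:
+    ///         return true;  /// next() sets pos = working_buffer.begin() + nextimpl_working_buffer_offset,
+    ///     }                 /// so assign nextimpl_working_buffer_offset to actually skip bytes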
void set(Position ptr, size_t size, size_t offset) { internal_buffer = Buffer(ptr, ptr + size); diff --git a/src/IO/CachedInMemoryReadBufferFromFile.cpp b/src/IO/CachedInMemoryReadBufferFromFile.cpp new file mode 100644 index 00000000000..384d2229f14 --- /dev/null +++ b/src/IO/CachedInMemoryReadBufferFromFile.cpp @@ -0,0 +1,188 @@ +#include "CachedInMemoryReadBufferFromFile.h" +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int UNEXPECTED_END_OF_FILE; + extern const int CANNOT_SEEK_THROUGH_FILE; + extern const int SEEK_POSITION_OUT_OF_BOUND; +} + +CachedInMemoryReadBufferFromFile::CachedInMemoryReadBufferFromFile( + FileChunkAddress cache_key_, PageCachePtr cache_, std::unique_ptr in_, const ReadSettings & settings_) + : ReadBufferFromFileBase(0, nullptr, 0, in_->getFileSize()), cache_key(cache_key_), cache(cache_), settings(settings_), in(std::move(in_)) + , read_until_position(file_size.value()) +{ + cache_key.offset = 0; +} + +String CachedInMemoryReadBufferFromFile::getFileName() const +{ + return in->getFileName(); +} + +off_t CachedInMemoryReadBufferFromFile::seek(off_t off, int whence) +{ + if (whence != SEEK_SET) + throw Exception(ErrorCodes::CANNOT_SEEK_THROUGH_FILE, "Only SEEK_SET mode is allowed."); + + size_t offset = static_cast(off); + if (offset > file_size.value()) + throw Exception(ErrorCodes::SEEK_POSITION_OUT_OF_BOUND, "Seek position is out of bounds. Offset: {}", off); + + if (offset >= file_offset_of_buffer_end - working_buffer.size() && offset <= file_offset_of_buffer_end) + { + pos = working_buffer.end() - (file_offset_of_buffer_end - offset); + chassert(getPosition() == off); + return off; + } + + resetWorkingBuffer(); + + file_offset_of_buffer_end = offset; + chunk.reset(); + + chassert(getPosition() == off); + return off; +} + +off_t CachedInMemoryReadBufferFromFile::getPosition() +{ + return file_offset_of_buffer_end - available(); +} + +size_t CachedInMemoryReadBufferFromFile::getFileOffsetOfBufferEnd() const +{ + return file_offset_of_buffer_end; +} + +void CachedInMemoryReadBufferFromFile::setReadUntilPosition(size_t position) +{ + read_until_position = position; + if (position < static_cast(getPosition())) + { + resetWorkingBuffer(); + chunk.reset(); + } + else if (position < file_offset_of_buffer_end) + { + size_t diff = file_offset_of_buffer_end - position; + working_buffer.resize(working_buffer.size() - diff); + file_offset_of_buffer_end -= diff; + } +} + +void CachedInMemoryReadBufferFromFile::setReadUntilEnd() +{ + setReadUntilPosition(file_size.value()); +} + +bool CachedInMemoryReadBufferFromFile::nextImpl() +{ + chassert(read_until_position <= file_size.value()); + if (file_offset_of_buffer_end >= read_until_position) + return false; + + if (chunk.has_value() && file_offset_of_buffer_end >= cache_key.offset + cache->chunkSize()) + { + chassert(file_offset_of_buffer_end == cache_key.offset + cache->chunkSize()); + chunk.reset(); + } + + if (!chunk.has_value()) + { + cache_key.offset = file_offset_of_buffer_end / cache->chunkSize() * cache->chunkSize(); + chunk = cache->getOrSet(cache_key.hash(), settings.read_from_page_cache_if_exists_otherwise_bypass_cache, settings.page_cache_inject_eviction); + + size_t chunk_size = std::min(cache->chunkSize(), file_size.value() - cache_key.offset); + + std::unique_lock download_lock(chunk->getChunk()->state.download_mutex); + + if (!chunk->isPrefixPopulated(chunk_size)) + { + /// A few things could be improved here, which may or may not be worth the added complexity: + /// 
* If the next file chunk is in cache, use in->setReadUntilPosition() to limit the read to
+            ///   just one chunk. More generally, look ahead in the cache to count how many next chunks
+            ///   need to be downloaded. (Up to some limit? And avoid changing `in`'s until-position if
+            ///   it's already reasonable; otherwise we'd increase it by one chunk every chunk, discarding
+            ///   a half-completed HTTP request every time.)
+            /// * If only a subset of pages are missing from this chunk, download only them,
+            ///   with some threshold for avoiding short seeks.
+            ///   In particular, if a previous download failed in the middle of the chunk, we could
+            ///   resume from that position instead of from the beginning of the chunk.
+            ///   (It's also possible in principle that a proper subset of chunk's pages was reclaimed
+            ///   by the OS. But, for performance purposes, we should completely ignore that, because
+            ///   (a) PageCache normally uses 2 MiB transparent huge pages and has just one such page
+            ///   per chunk, and (b) even with 4 KiB pages partial chunk eviction is extremely rare.)
+            /// * If our [position, read_until_position) covers only part of the chunk, we could download
+            ///   just that part. (Which would be bad if someone else needs the rest of the chunk and has
+            ///   to do a whole new HTTP request to get it. Unclear what the policy should be.)
+            /// * Instead of doing in->next() in a loop until we get the whole chunk, we could return the
+            ///   results as soon as in->next() produces them.
+            ///   (But this would make the download_mutex situation much more complex, similar to the
+            ///   FileSegment::State::PARTIALLY_DOWNLOADED and FileSegment::setRemoteFileReader() stuff.)
+
+            Buffer prev_in_buffer = in->internalBuffer();
+            SCOPE_EXIT({ in->set(prev_in_buffer.begin(), prev_in_buffer.size()); });
+
+            size_t pos = 0;
+            while (pos < chunk_size)
+            {
+                char * piece_start = chunk->getChunk()->data + pos;
+                size_t piece_size = chunk_size - pos;
+                in->set(piece_start, piece_size);
+                if (pos == 0)
+                    in->seek(cache_key.offset, SEEK_SET);
+                else
+                    chassert(!in->available());
+
+                if (in->eof())
+                    throw Exception(ErrorCodes::UNEXPECTED_END_OF_FILE, "File {} ended after {} bytes, but we expected {}",
+                        getFileName(), cache_key.offset + pos, file_size.value());
+
+                chassert(in->position() >= piece_start && in->buffer().end() <= piece_start + piece_size);
+                chassert(in->getPosition() == static_cast<off_t>(cache_key.offset + pos));
+
+                size_t n = in->available();
+                chassert(n);
+                if (in->position() != piece_start)
+                    memmove(piece_start, in->position(), n);
+                in->position() += n;
+                pos += n;
+            }
+
+            chunk->markPrefixPopulated(chunk_size);
+        }
+    }
+
+    nextimpl_working_buffer_offset = file_offset_of_buffer_end - cache_key.offset;
+    working_buffer = Buffer(
+        chunk->getChunk()->data,
+        chunk->getChunk()->data + std::min(chunk->getChunk()->size, read_until_position - cache_key.offset));
+    pos = working_buffer.begin() + nextimpl_working_buffer_offset;
+
+    if (!internal_buffer.empty())
+    {
+        /// We were given an external buffer to read into. Copy the data into it.
+        /// Would be nice to avoid this copy, somehow, maybe by making ReadBufferFromRemoteFSGather
+        /// and AsynchronousBoundedReadBuffer explicitly aware of the page cache.
+ size_t n = std::min(available(), internal_buffer.size()); + memcpy(internal_buffer.begin(), pos, n); + working_buffer = Buffer(internal_buffer.begin(), internal_buffer.begin() + n); + pos = working_buffer.begin(); + nextimpl_working_buffer_offset = 0; + } + + file_offset_of_buffer_end += available(); + + return true; +} + +} diff --git a/src/IO/CachedInMemoryReadBufferFromFile.h b/src/IO/CachedInMemoryReadBufferFromFile.h new file mode 100644 index 00000000000..300c2e82386 --- /dev/null +++ b/src/IO/CachedInMemoryReadBufferFromFile.h @@ -0,0 +1,41 @@ +#pragma once + +#include +#include + +namespace DB +{ + +class CachedInMemoryReadBufferFromFile : public ReadBufferFromFileBase +{ +public: + /// `in_` must support using external buffer. I.e. we assign its internal_buffer before each next() + /// call and expect the read data to be put into that buffer. + /// `in_` should be seekable and should be able to read the whole file from 0 to in_->getFileSize(); + /// if you set `in_`'s read-until-position bypassing CachedInMemoryReadBufferFromFile then + /// CachedInMemoryReadBufferFromFile will break. + CachedInMemoryReadBufferFromFile(FileChunkAddress cache_key_, PageCachePtr cache_, std::unique_ptr in_, const ReadSettings & settings_); + + String getFileName() const override; + off_t seek(off_t off, int whence) override; + off_t getPosition() override; + size_t getFileOffsetOfBufferEnd() const override; + bool supportsRightBoundedReads() const override { return true; } + void setReadUntilPosition(size_t position) override; + void setReadUntilEnd() override; + +private: + FileChunkAddress cache_key; // .offset is offset of `chunk` start + PageCachePtr cache; + ReadSettings settings; + std::unique_ptr in; + + size_t file_offset_of_buffer_end = 0; + size_t read_until_position; + + std::optional chunk; + + bool nextImpl() override; +}; + +} diff --git a/src/IO/ReadBuffer.h b/src/IO/ReadBuffer.h index b45bc8f3dbc..00325734354 100644 --- a/src/IO/ReadBuffer.h +++ b/src/IO/ReadBuffer.h @@ -225,11 +225,22 @@ public: * - seek() to a position above the until position (even if you setReadUntilPosition() to a * higher value right after the seek!), * - * Typical implementations discard any current buffers and connections, even if the position is - * adjusted only a little. + * Implementations are recommended to: + * - Allow the read-until-position to go below current position, e.g.: + * // Read block [300, 400) + * setReadUntilPosition(400); + * seek(300); + * next(); + * // Read block [100, 200) + * setReadUntilPosition(200); // oh oh, this is below the current position, but should be allowed + * seek(100); // but now everything's fine again + * next(); + * // (Swapping the order of seek and setReadUntilPosition doesn't help: then it breaks if the order of blocks is reversed.) + * - Check if new read-until-position value is equal to the current value and do nothing in this case, + * so that the caller doesn't have to. * - * Typical usage is to call it right after creating the ReadBuffer, before it started doing any - * work. + * Typical implementations discard any current buffers and connections when the + * read-until-position changes even by a small (nonzero) amount. 
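+     *
+     * A minimal sketch of that no-op check (illustrative; `read_until_position` and
+     * `resetBuffersAndConnection` are assumed members of a concrete implementation):
+     *
+     *     void setReadUntilPosition(size_t position) override
+     *     {
+     *         if (position == read_until_position)
+     *             return;  /// unchanged - keep current buffers and connections
+     *         read_until_position = position;
+     *         resetBuffersAndConnection();
+     *     }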
     */
     virtual void setReadUntilPosition(size_t /* position */) {}
 
diff --git a/src/IO/ReadSettings.h b/src/IO/ReadSettings.h
index c397689d6ad..f4dc7880be4 100644
--- a/src/IO/ReadSettings.h
+++ b/src/IO/ReadSettings.h
@@ -61,6 +61,7 @@ enum class RemoteFSReadMethod
 };
 
 class MMappedFileCache;
+class PageCache;
 
 struct ReadSettings
 {
@@ -102,6 +103,12 @@ struct ReadSettings
     bool avoid_readthrough_cache_outside_query_context = true;
     size_t filesystem_cache_segments_batch_size = 20;
 
+    /// Userspace page cache settings; assigned from query settings in Context::getReadSettings().
+    bool use_page_cache_for_disks_without_file_cache = false;
+    bool read_from_page_cache_if_exists_otherwise_bypass_cache = false;
+    bool page_cache_inject_eviction = false;
+    std::shared_ptr<PageCache> page_cache;
+
     size_t filesystem_cache_max_download_size = (128UL * 1024 * 1024 * 1024);
     bool skip_download_if_exceeds_query_cache = true;
 
diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp
index 8304a876fb1..53fd7d9b45f 100644
--- a/src/Interpreters/Context.cpp
+++ b/src/Interpreters/Context.cpp
@@ -17,6 +17,7 @@
 #include 
 #include 
 #include 
+#include <Common/PageCache.h>
 #include 
 #include 
 #include 
@@ -294,6 +295,7 @@ struct ContextSharedPart : boost::noncopyable
     mutable MarkCachePtr index_mark_cache TSA_GUARDED_BY(mutex); /// Cache of marks in compressed files of MergeTree indices.
     mutable MMappedFileCachePtr mmap_cache TSA_GUARDED_BY(mutex); /// Cache of mmapped files to avoid frequent open/map/unmap/close and to reuse from several threads.
     AsynchronousMetrics * asynchronous_metrics TSA_GUARDED_BY(mutex) = nullptr; /// Points to asynchronous metrics
+    mutable PageCachePtr page_cache TSA_GUARDED_BY(mutex); /// Userspace page cache.
     ProcessList process_list; /// Executing queries at the moment.
     SessionTracker session_tracker;
     GlobalOvercommitTracker global_overcommit_tracker;
@@ -1228,7 +1230,7 @@ void Context::setUser(const UUID & user_id_, const std::optional<std::vector<UUID>> & current_roles_)
 
     /// NOTE: AccessControl::read<User>() and other AccessControl's functions may require some IO work,
-    /// so Context::getLock() must be unlocked while we're doing this.
+    /// so Context::getLocalLock() and Context::getGlobalLock() must be unlocked while we're doing this.
     auto & access_control = getAccessControl();
     auto user = access_control.read<User>(user_id_);
 
@@ -1358,7 +1360,7 @@ void Context::checkAccess(const AccessRightsElements & elements) const { return checkAccessImpl<>(elements); }
 
 std::shared_ptr<const ContextAccess> Context::getAccess() const
 {
-    /// A helper function to collect parameters for calculating access rights, called with Context::getLock() acquired.
+    /// A helper function to collect parameters for calculating access rights, called with Context::getLocalSharedLock() acquired.
     auto get_params = [this]()
     {
         /// If setUserID() was never called then this must be the global context with the full access.
@@ -1385,7 +1387,8 @@ std::shared_ptr<const ContextAccess> Context::getAccess() const
     }
 
     /// Calculate new access rights according to the collected parameters.
-    /// NOTE: AccessControl::getContextAccess() may require some IO work, so Context::getLock() must be unlocked while we're doing this.
+    /// NOTE: AccessControl::getContextAccess() may require some IO work, so Context::getLocalLock()
+    /// and Context::getGlobalLock() must be unlocked while we're doing this.
auto res = getAccessControl().getContextAccess(*params); { @@ -2714,6 +2717,33 @@ void Context::clearUncompressedCache() const shared->uncompressed_cache->clear(); } +void Context::setPageCache(size_t bytes_per_chunk, size_t bytes_per_mmap, size_t bytes_total, bool use_madv_free, bool use_huge_pages) +{ + std::lock_guard lock(shared->mutex); + + if (shared->page_cache) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Page cache has been already created."); + + shared->page_cache = std::make_shared(bytes_per_chunk, bytes_per_mmap, bytes_total, use_madv_free, use_huge_pages); +} + +PageCachePtr Context::getPageCache() const +{ + SharedLockGuard lock(shared->mutex); + return shared->page_cache; +} + +void Context::dropPageCache() const +{ + PageCachePtr cache; + { + SharedLockGuard lock(shared->mutex); + cache = shared->page_cache; + } + if (cache) + cache->dropCache(); +} + void Context::setMarkCache(const String & cache_policy, size_t max_cache_size_in_bytes, double size_ratio) { std::lock_guard lock(shared->mutex); @@ -5130,6 +5160,11 @@ ReadSettings Context::getReadSettings() const res.filesystem_cache_max_download_size = settings.filesystem_cache_max_download_size; res.skip_download_if_exceeds_query_cache = settings.skip_download_if_exceeds_query_cache; + res.page_cache = getPageCache(); + res.use_page_cache_for_disks_without_file_cache = settings.use_page_cache_for_disks_without_file_cache; + res.read_from_page_cache_if_exists_otherwise_bypass_cache = settings.read_from_page_cache_if_exists_otherwise_bypass_cache; + res.page_cache_inject_eviction = settings.page_cache_inject_eviction; + res.remote_read_min_bytes_for_seek = settings.remote_read_min_bytes_for_seek; /// Zero read buffer will not make progress. diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index 7bbff9c63bb..ec5a044b28f 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -79,6 +79,7 @@ class RefreshSet; class Cluster; class Compiler; class MarkCache; +class PageCache; class MMappedFileCache; class UncompressedCache; class ProcessList; @@ -968,6 +969,10 @@ public: std::shared_ptr getUncompressedCache() const; void clearUncompressedCache() const; + void setPageCache(size_t bytes_per_chunk, size_t bytes_per_mmap, size_t bytes_total, bool use_madv_free, bool use_huge_pages); + std::shared_ptr getPageCache() const; + void dropPageCache() const; + void setMarkCache(const String & cache_policy, size_t max_cache_size_in_bytes, double size_ratio); void updateMarkCacheConfiguration(const Poco::Util::AbstractConfiguration & config); std::shared_ptr getMarkCache() const; diff --git a/src/Interpreters/InterpreterSystemQuery.cpp b/src/Interpreters/InterpreterSystemQuery.cpp index a078d99facf..4bb47a8c9e3 100644 --- a/src/Interpreters/InterpreterSystemQuery.cpp +++ b/src/Interpreters/InterpreterSystemQuery.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -460,6 +461,13 @@ BlockIO InterpreterSystemQuery::execute() { throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Not implemented"); } + case Type::DROP_PAGE_CACHE: + { + getContext()->checkAccess(AccessType::SYSTEM_DROP_PAGE_CACHE); + + getContext()->dropPageCache(); + break; + } case Type::DROP_SCHEMA_CACHE: { getContext()->checkAccess(AccessType::SYSTEM_DROP_SCHEMA_CACHE); @@ -1201,6 +1209,7 @@ AccessRightsElements InterpreterSystemQuery::getRequiredAccessForDDLOnCluster() case Type::DROP_INDEX_UNCOMPRESSED_CACHE: case Type::DROP_FILESYSTEM_CACHE: case Type::SYNC_FILESYSTEM_CACHE: + case 
Type::DROP_PAGE_CACHE: case Type::DROP_SCHEMA_CACHE: case Type::DROP_FORMAT_SCHEMA_CACHE: case Type::DROP_S3_CLIENT_CACHE: diff --git a/src/Interpreters/ServerAsynchronousMetrics.cpp b/src/Interpreters/ServerAsynchronousMetrics.cpp index bdf314f35b9..fe7ccd64ffe 100644 --- a/src/Interpreters/ServerAsynchronousMetrics.cpp +++ b/src/Interpreters/ServerAsynchronousMetrics.cpp @@ -9,6 +9,8 @@ #include #include +#include + #include #include @@ -77,6 +79,16 @@ void ServerAsynchronousMetrics::updateImpl(TimePoint update_time, TimePoint curr new_values["MarkCacheFiles"] = { mark_cache->count(), "Total number of mark files cached in the mark cache" }; } + if (auto page_cache = getContext()->getPageCache()) + { + auto rss = page_cache->getResidentSetSize(); + new_values["PageCacheBytes"] = { rss.page_cache_rss, "Userspace page cache memory usage in bytes" }; + new_values["PageCachePinnedBytes"] = { page_cache->getPinnedSize(), "Userspace page cache memory that's currently in use and can't be evicted" }; + + if (rss.unreclaimable_rss.has_value()) + new_values["UnreclaimableRSS"] = { *rss.unreclaimable_rss, "The amount of physical memory used by the server process, in bytes, excluding memory reclaimable by the OS (MADV_FREE)" }; + } + if (auto uncompressed_cache = getContext()->getUncompressedCache()) { new_values["UncompressedCacheBytes"] = { uncompressed_cache->sizeInBytes(), diff --git a/src/Interpreters/tests/gtest_page_cache.cpp b/src/Interpreters/tests/gtest_page_cache.cpp new file mode 100644 index 00000000000..1e2688c0ca2 --- /dev/null +++ b/src/Interpreters/tests/gtest_page_cache.cpp @@ -0,0 +1,267 @@ +#include +#include +#include + +#ifdef OS_LINUX +#include +#endif + +using namespace DB; + +namespace ProfileEvents +{ + extern const Event PageCacheChunkMisses; + extern const Event PageCacheChunkShared; + extern const Event PageCacheChunkDataHits; + extern const Event PageCacheChunkDataPartialHits; + extern const Event PageCacheChunkDataMisses; +} + +#define CHECK(x) \ + do { \ + if (!(x)) \ + { \ + std::cerr << "check on line " << __LINE__ << " failed: " << #x << std::endl; \ + std::abort(); \ + } \ + } while (false) + +size_t estimateRAMSize() +{ +#ifdef OS_LINUX + struct sysinfo info; + int r = sysinfo(&info); + CHECK(r == 0); + return static_cast(info.totalram * info.mem_unit); +#else + return 128ul << 30; +#endif +} + +/// Do random reads and writes in PageCache from multiple threads, check that the data read matches the data written. +TEST(PageCache, DISABLED_Stress) +{ + /// There doesn't seem to be a reasonable way to simulate memory pressure or force the eviction of MADV_FREE-d pages. + /// So we actually map more virtual memory than we have RAM and fill it all up a few times. + /// This takes an eternity (a few minutes), but idk how else to hit MADV_FREE eviction. + /// Expect ~1 GB/s, bottlenecked by page faults. 
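+    /// (The cache below is sized to RAM + 10% and num_keys is 1.5x the chunk count, so the
+    /// working set is guaranteed not to fit and eviction has to happen.)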
+ size_t ram_size = estimateRAMSize(); + PageCache cache(2 << 20, 1 << 30, ram_size + ram_size / 10, /* use_madv_free */ true, /* use_huge_pages */ true); + + CHECK(cache.getResidentSetSize().page_cache_rss); + + const size_t num_keys = static_cast(cache.maxChunks() * 1.5); + const size_t pages_per_chunk = cache.chunkSize() / cache.pageSize(); + const size_t items_per_page = cache.pageSize() / 8; + + const size_t passes = 2; + const size_t step = 20; + const size_t num_threads = 20; + const size_t chunks_touched = num_keys * passes * num_threads / step; + std::atomic progress {0}; + std::atomic threads_finished {0}; + + std::atomic total_racing_writes {0}; + + auto thread_func = [&] + { + pcg64 rng(randomSeed()); + std::vector pinned; + + /// Stats. + size_t racing_writes = 0; + + for (size_t i = 0; i < num_keys * passes; i += step) + { + progress += 1; + + /// Touch the chunks sequentially + noise (to increase interference across threads), or at random 10% of the time. + size_t key_idx; + if (rng() % 10 == 0) + key_idx = std::uniform_int_distribution(0, num_keys - 1)(rng); + else + key_idx = (i + std::uniform_int_distribution(0, num_keys / 1000)(rng)) % num_keys; + + /// For some keys, always use detached_if_missing = true and check that cache always misses. + bool key_detached_if_missing = key_idx % 100 == 42; + bool detached_if_missing = key_detached_if_missing || i % 101 == 42; + + PageCacheKey key = key_idx * 0xcafebabeb0bad00dul; // a simple reversible hash (the constant can be any odd number) + + PinnedPageChunk chunk = cache.getOrSet(key, detached_if_missing, /* inject_eviction */ false); + + if (key_detached_if_missing) + CHECK(!chunk.getChunk()->pages_populated.any()); + + for (size_t page_idx = 0; page_idx < pages_per_chunk; ++page_idx) + { + bool populated = chunk.getChunk()->pages_populated.get(page_idx); + /// Generate page contents deterministically from key and page index. + size_t start = key_idx * page_idx; + if (start % 37 == 13) + { + /// Leave ~1/37 of the pages unpopulated. + CHECK(!populated); + } + else + { + /// We may write/read the same memory from multiple threads in parallel here. + std::atomic * items = reinterpret_cast *>(chunk.getChunk()->data + cache.pageSize() * page_idx); + if (populated) + { + for (size_t j = 0; j < items_per_page; ++j) + CHECK(items[j].load(std::memory_order_relaxed) == start + j); + } + else + { + for (size_t j = 0; j < items_per_page; ++j) + items[j].store(start + j, std::memory_order_relaxed); + if (!chunk.markPagePopulated(page_idx)) + racing_writes += 1; + } + } + } + + pinned.push_back(std::move(chunk)); + CHECK(cache.getPinnedSize() >= cache.chunkSize()); + /// Unpin 2 chunks on average. + while (rng() % 3 != 0 && !pinned.empty()) + { + size_t idx = rng() % pinned.size(); + if (idx != pinned.size() - 1) + pinned[idx] = std::move(pinned.back()); + pinned.pop_back(); + } + } + + total_racing_writes += racing_writes; + threads_finished += 1; + }; + + std::cout << fmt::format("doing {:.1f} passes over {:.1f} GiB of virtual memory\nthis will take a few minutes, progress printed every 10 seconds", + chunks_touched * 1. / cache.maxChunks(), cache.maxChunks() * cache.chunkSize() * 1. / (1ul << 30)) << std::endl; + + auto start_time = std::chrono::steady_clock::now(); + + std::vector threads; + for (size_t i = 0; i < num_threads; ++i) + threads.emplace_back(thread_func); + + for (size_t poll = 0;; ++poll) + { + if (threads_finished == num_threads) + break; + if (poll % 100 == 0) + std::cout << fmt::format("{:.3f}%", progress.load() * 100. 
/ num_keys / passes / num_threads * step) << std::endl;
+        std::this_thread::sleep_for(std::chrono::milliseconds(100));
+    }
+    for (std::thread & t : threads)
+        t.join();
+
+    auto end_time = std::chrono::steady_clock::now();
+    double elapsed_seconds = std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time).count();
+    double touched_gib = chunks_touched * cache.chunkSize() * 1. / (1ul << 30);
+    std::cout << fmt::format("touched {:.1f} GiB in {:.1f} seconds, that's {:.3f} GiB/s",
+        touched_gib, elapsed_seconds, touched_gib / elapsed_seconds) << std::endl;
+
+    auto & counters = CurrentThread::getProfileEvents();
+
+    std::cout << "stats:"
+        << "\nchunk misses: " << counters[ProfileEvents::PageCacheChunkMisses].load()
+        << "\nchunk shared: " << counters[ProfileEvents::PageCacheChunkShared].load()
+        << "\nchunk data misses: " << counters[ProfileEvents::PageCacheChunkDataMisses].load()
+        << "\nchunk data partial hits: " << counters[ProfileEvents::PageCacheChunkDataPartialHits].load()
+        << "\nchunk data hits: " << counters[ProfileEvents::PageCacheChunkDataHits].load()
+        << "\nracing page writes: " << total_racing_writes << std::endl;
+
+    /// Check that we at least hit all the cases.
+    CHECK(counters[ProfileEvents::PageCacheChunkMisses].load() > 0);
+    CHECK(counters[ProfileEvents::PageCacheChunkShared].load() > 0);
+    CHECK(counters[ProfileEvents::PageCacheChunkDataMisses].load() > 0);
+    /// Partial hits are rare enough that sometimes this is zero, so don't check it.
+    /// That's good news because we don't need to implement downloading parts of a chunk.
+    /// CHECK(counters[ProfileEvents::PageCacheChunkDataPartialHits].load() > 0);
+    CHECK(counters[ProfileEvents::PageCacheChunkDataHits].load() > 0);
+    CHECK(total_racing_writes > 0);
+    CHECK(cache.getPinnedSize() == 0);
+
+    size_t rss = cache.getResidentSetSize().page_cache_rss;
+    std::cout << "RSS: " << rss * 1. / (1ul << 30) << " GiB" << std::endl;
+    /// This can be flaky if the system has < 10% free memory. If this turns out to be a problem, feel free to remove or reduce.
+    CHECK(rss > ram_size / 10);
+
+    cache.dropCache();
+
+#ifdef OS_LINUX
+    /// MADV_DONTNEED is not synchronous, and we're freeing lots of pages. Let's give Linux a lot of time.
+    std::this_thread::sleep_for(std::chrono::seconds(10));
+    size_t new_rss = cache.getResidentSetSize().page_cache_rss;
+    std::cout << "RSS after dropping cache: " << new_rss * 1. / (1ul << 30) << " GiB" << std::endl;
+    CHECK(new_rss < rss / 2);
+#endif
+}
+
+/// Benchmark that measures the PageCache overhead for cache hits. Doesn't touch the actual data, so
+/// memory bandwidth mostly doesn't factor into this.
+/// This measures the overhead of things like madvise(MADV_FREE) and probing the pages (restoreChunkFromLimbo()).
+/// Disabled in CI, run manually with --gtest_also_run_disabled_tests --gtest_filter=PageCache.DISABLED_HitsBench
+TEST(PageCache, DISABLED_HitsBench)
+{
+    /// Do a few runs, with and without MADV_FREE.
+    for (size_t num_threads = 1; num_threads <= 16; num_threads *= 2)
+    {
+        for (size_t run = 0; run < 8; ++ run)
+        {
+            bool use_madv_free = run % 2 == 1;
+            bool use_huge_pages = run % 4 / 2 == 1;
+
+            PageCache cache(2 << 20, 1ul << 30, 20ul << 30, use_madv_free, use_huge_pages);
+            size_t passes = 3;
+            std::atomic<size_t> total_misses {0};
+
+            /// Prepopulate all chunks.
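+            /// getOrSet() returns the chunk pinned; filling its data and calling markPrefixPopulated()
+            /// below makes every later getOrSet() of the same key count as a data hit.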
+            for (size_t i = 0; i < cache.maxChunks(); ++i)
+            {
+                PageCacheKey key = i * 0xcafebabeb0bad00dul;
+                PinnedPageChunk chunk = cache.getOrSet(key, /* detached_if_missing */ false, /* inject_eviction */ false);
+                memset(chunk.getChunk()->data, 42, chunk.getChunk()->size);
+                chunk.markPrefixPopulated(cache.chunkSize());
+            }
+
+            auto thread_func = [&]
+            {
+                pcg64 rng(randomSeed());
+                size_t misses = 0;
+                for (size_t i = 0; i < cache.maxChunks() * passes; ++i)
+                {
+                    PageCacheKey key = rng() % cache.maxChunks() * 0xcafebabeb0bad00dul;
+                    PinnedPageChunk chunk = cache.getOrSet(key, /* detached_if_missing */ false, /* inject_eviction */ false);
+                    if (!chunk.isPrefixPopulated(cache.chunkSize()))
+                        misses += 1;
+                }
+                total_misses += misses;
+            };
+
+            auto start_time = std::chrono::steady_clock::now();
+
+            std::vector<std::thread> threads;
+            for (size_t i = 0; i < num_threads; ++i)
+                threads.emplace_back(thread_func);
+
+            for (std::thread & t : threads)
+                t.join();
+
+            auto end_time = std::chrono::steady_clock::now();
+            double elapsed_seconds = std::chrono::duration_cast<std::chrono::duration<double>>(end_time - start_time).count();
+            double fetched_gib = cache.chunkSize() * cache.maxChunks() * passes * 1. / (1ul << 30);
+            std::cout << fmt::format(
+                "threads {}, run {}, use_madv_free = {}, use_huge_pages = {}\nrequested {:.1f} GiB in {:.1f} seconds\n"
+                "that's {:.1f} GiB/s, or overhead of {:.3}us/{:.1}MiB\n",
+                num_threads, run, use_madv_free, use_huge_pages, fetched_gib, elapsed_seconds, fetched_gib / elapsed_seconds,
+                elapsed_seconds * 1e6 / cache.maxChunks() / passes, cache.chunkSize() * 1. / (1 << 20)) << std::endl;
+
+            if (total_misses != 0)
+                std::cout << "!got " << total_misses.load() << " misses! perhaps your system doesn't have enough free memory, consider decreasing cache size in the benchmark code" << std::endl;
+        }
+    }
+}
diff --git a/src/Parsers/ASTSystemQuery.h b/src/Parsers/ASTSystemQuery.h
index 9aa90f499d0..48be7f6b84f 100644
--- a/src/Parsers/ASTSystemQuery.h
+++ b/src/Parsers/ASTSystemQuery.h
@@ -31,6 +31,7 @@ public:
         DROP_COMPILED_EXPRESSION_CACHE,
         DROP_FILESYSTEM_CACHE,
         DROP_DISK_METADATA_CACHE,
+        DROP_PAGE_CACHE,
         DROP_SCHEMA_CACHE,
         DROP_FORMAT_SCHEMA_CACHE,
         DROP_S3_CLIENT_CACHE,
diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp
index 39ad28d3dae..a9bdceacef0 100644
--- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp
+++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp
@@ -1637,10 +1637,6 @@ bool IMergeTreeDataPart::assertHasValidVersionMetadata() const
     size_t file_size = getDataPartStorage().getFileSize(TXN_VERSION_METADATA_FILE_NAME);
     auto buf = getDataPartStorage().readFile(TXN_VERSION_METADATA_FILE_NAME, ReadSettings().adjustBufferSize(file_size), file_size, std::nullopt);
 
-    /// FIXME https://github.com/ClickHouse/ClickHouse/issues/48465
-    if (dynamic_cast(buf.get()))
-        return true;
-
     readStringUntilEOF(content, *buf);
     ReadBufferFromString str_buf{content};
     VersionMetadata file;
diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp
index da90dbb4076..53a18d3cc5b 100644
--- a/src/Storages/StorageS3.cpp
+++ b/src/Storages/StorageS3.cpp
@@ -727,7 +727,7 @@ std::unique_ptr<ReadBuffer> 
StorageS3Source::createAsyncS3ReadBuffer( read_settings, /* use_external_buffer */true, /* offset */0, - read_until_position, - /* restricted_seek */true, + /* read_until_position */0, + restricted_seek, object_size); }; + auto modified_settings{read_settings}; + /// User's S3 object may change, don't cache it. + modified_settings.use_page_cache_for_disks_without_file_cache = false; + + /// FIXME: Changing this setting to default value breaks something around parquet reading + modified_settings.remote_read_min_bytes_for_seek = modified_settings.remote_fs_buffer_size; + auto s3_impl = std::make_unique( std::move(read_buffer_creator), StoredObjects{StoredObject{key, /* local_path */ "", object_size}}, + "", read_settings, /* cache_log */nullptr, /* use_external_buffer */true); - auto modified_settings{read_settings}; - /// FIXME: Changing this setting to default value breaks something around parquet reading - modified_settings.remote_read_min_bytes_for_seek = modified_settings.remote_fs_buffer_size; - auto & pool_reader = context->getThreadPoolReader(FilesystemReaderType::ASYNCHRONOUS_REMOTE_FS_READER); auto async_reader = std::make_unique( std::move(s3_impl), pool_reader, modified_settings, diff --git a/tests/clickhouse-test b/tests/clickhouse-test index f438c6f4f31..d44c80bc410 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -699,6 +699,8 @@ class SettingsRandomizer: get_localzone(), ] ), + "use_page_cache_for_disks_without_file_cache": lambda: random.random() < 0.7, + "page_cache_inject_eviction": lambda: random.random() < 0.5, } @staticmethod diff --git a/tests/queries/0_stateless/01271_show_privileges.reference b/tests/queries/0_stateless/01271_show_privileges.reference index e1f5213790d..88f18c52536 100644 --- a/tests/queries/0_stateless/01271_show_privileges.reference +++ b/tests/queries/0_stateless/01271_show_privileges.reference @@ -112,6 +112,7 @@ SYSTEM DROP QUERY CACHE ['SYSTEM DROP QUERY','DROP QUERY CACHE','DROP QUERY'] GL SYSTEM DROP COMPILED EXPRESSION CACHE ['SYSTEM DROP COMPILED EXPRESSION','DROP COMPILED EXPRESSION CACHE','DROP COMPILED EXPRESSIONS'] GLOBAL SYSTEM DROP CACHE SYSTEM DROP FILESYSTEM CACHE ['SYSTEM DROP FILESYSTEM CACHE','DROP FILESYSTEM CACHE'] GLOBAL SYSTEM DROP CACHE SYSTEM SYNC FILESYSTEM CACHE ['SYSTEM REPAIR FILESYSTEM CACHE','REPAIR FILESYSTEM CACHE','SYNC FILESYSTEM CACHE'] GLOBAL SYSTEM +SYSTEM DROP PAGE CACHE ['SYSTEM DROP PAGE CACHE','DROP PAGE CACHE'] GLOBAL SYSTEM DROP CACHE SYSTEM DROP SCHEMA CACHE ['SYSTEM DROP SCHEMA CACHE','DROP SCHEMA CACHE'] GLOBAL SYSTEM DROP CACHE SYSTEM DROP FORMAT SCHEMA CACHE ['SYSTEM DROP FORMAT SCHEMA CACHE','DROP FORMAT SCHEMA CACHE'] GLOBAL SYSTEM DROP CACHE SYSTEM DROP S3 CLIENT CACHE ['SYSTEM DROP S3 CLIENT','DROP S3 CLIENT CACHE'] GLOBAL SYSTEM DROP CACHE diff --git a/tests/queries/0_stateless/02867_page_cache.reference b/tests/queries/0_stateless/02867_page_cache.reference new file mode 100644 index 00000000000..5502059508a --- /dev/null +++ b/tests/queries/0_stateless/02867_page_cache.reference @@ -0,0 +1,23 @@ +54975576145920 +PageCacheBytesUnpinnedRoundedToHugePages 1 +PageCacheBytesUnpinnedRoundedToPages 1 +PageCacheChunkMisses 1 +ReadBufferFromS3Bytes 1 +54975576145920 +PageCacheBytesUnpinnedRoundedToHugePages 1 +PageCacheBytesUnpinnedRoundedToPages 1 +PageCacheChunkDataHits 1 +54975576145920 +PageCacheBytesUnpinnedRoundedToHugePages 1 +PageCacheBytesUnpinnedRoundedToPages 1 +PageCacheChunkMisses 1 +ReadBufferFromS3Bytes 1 +54975576145920 +PageCacheBytesUnpinnedRoundedToHugePages 1 
+PageCacheBytesUnpinnedRoundedToPages 1 +PageCacheChunkMisses 1 +ReadBufferFromS3Bytes 1 +54975576145920 +PageCacheBytesUnpinnedRoundedToHugePages 1 +PageCacheBytesUnpinnedRoundedToPages 1 +PageCacheChunkDataHits 1 diff --git a/tests/queries/0_stateless/02867_page_cache.sql b/tests/queries/0_stateless/02867_page_cache.sql new file mode 100644 index 00000000000..8765b30ebc3 --- /dev/null +++ b/tests/queries/0_stateless/02867_page_cache.sql @@ -0,0 +1,105 @@ +-- Tags: no-fasttest, no-parallel +-- no-fasttest because we need an S3 storage policy +-- no-parallel because we look at server-wide counters about page cache usage + +set use_page_cache_for_disks_without_file_cache = 1; +set page_cache_inject_eviction = 0; +set enable_filesystem_cache = 0; +set use_uncompressed_cache = 0; + +create table events_snapshot engine Memory as select * from system.events; +create view events_diff as + -- round all stats to 70 MiB to leave a lot of leeway for overhead + with if(event like '%Bytes%', 70*1024*1024, 35) as granularity, + -- cache hits counter can vary a lot depending on other settings: + -- e.g. if merge_tree_min_bytes_for_concurrent_read is small, multiple threads will read each chunk + -- so we just check that the value is not too low + if(event in ( + 'PageCacheBytesUnpinnedRoundedToPages', 'PageCacheBytesUnpinnedRoundedToHugePages', + 'PageCacheChunkDataHits'), 1, 1000) as clamp + select event, min2(intDiv(new.value - old.value, granularity), clamp) as diff + from system.events new + left outer join events_snapshot old + on old.event = new.event + where diff != 0 and + event in ( + 'ReadBufferFromS3Bytes', 'PageCacheChunkMisses', 'PageCacheChunkDataMisses', + 'PageCacheChunkDataHits', 'PageCacheChunkDataPartialHits', + 'PageCacheBytesUnpinnedRoundedToPages', 'PageCacheBytesUnpinnedRoundedToHugePages') + order by event; + +drop table if exists page_cache_03055; +create table page_cache_03055 (k Int64 CODEC(NONE)) engine MergeTree order by k settings storage_policy = 's3_cache'; + +-- Write an 80 MiB file (40 x 2 MiB chunks), and a few small files. +system stop merges page_cache_03055; +insert into page_cache_03055 select * from numbers(10485760) settings max_block_size=100000000, preferred_block_size_bytes=1000000000; + +select * from events_diff; +truncate table events_snapshot; +insert into events_snapshot select * from system.events; + +system start merges page_cache_03055; +optimize table page_cache_03055 final; +truncate table events_snapshot; +insert into events_snapshot select * from system.events; + +-- Cold read, should miss cache. (Populating cache on write is not implemented yet.) + +select sum(k) from page_cache_03055; + +select * from events_diff where event not in ('PageCacheChunkDataHits'); +truncate table events_snapshot; +insert into events_snapshot select * from system.events; + +-- Repeat read, should hit cache. + +select sum(k) from page_cache_03055; + +select * from events_diff; +truncate table events_snapshot; +insert into events_snapshot select * from system.events; + +-- Drop cache and read again, should miss. Also don't write to cache. + +system drop page cache; + +select sum(k) from page_cache_03055 settings read_from_page_cache_if_exists_otherwise_bypass_cache = 1; + +-- Data could be read multiple times because we're not writing to cache. 
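+-- (Hence the diff >= 1 clamp below instead of exact counter comparison.)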
+select event, if(event in ('PageCacheChunkMisses', 'ReadBufferFromS3Bytes'), diff >= 1, diff) from events_diff where event not in ('PageCacheChunkDataHits'); +truncate table events_snapshot; +insert into events_snapshot select * from system.events; + +-- Repeat read, should still miss, but populate cache. + +select sum(k) from page_cache_03055; + +select * from events_diff where event not in ('PageCacheChunkDataHits'); +truncate table events_snapshot; +insert into events_snapshot select * from system.events; + +-- Read again, hit the cache. + +select sum(k) from page_cache_03055 settings read_from_page_cache_if_exists_otherwise_bypass_cache = 1; + +select * from events_diff; +truncate table events_snapshot; +insert into events_snapshot select * from system.events; + + +-- Known limitation: cache is not invalidated if a table is dropped and created again at the same path. +-- set allow_deprecated_database_ordinary=1; +-- create database test_03055 engine = Ordinary; +-- create table test_03055.t (k Int64) engine MergeTree order by k settings storage_policy = 's3_cache'; +-- insert into test_03055.t values (1); +-- select * from test_03055.t; +-- drop table test_03055.t; +-- create table test_03055.t (k Int64) engine MergeTree order by k settings storage_policy = 's3_cache'; +-- insert into test_03055.t values (2); +-- select * from test_03055.t; + + +drop table events_snapshot; +drop table page_cache_03055; +drop view events_diff; From 8f2881f0379389a04ebd11f7c06182c53550bdcb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Thu, 29 Feb 2024 12:49:14 +0100 Subject: [PATCH 13/19] Remove broken test while we fix it --- ...3_parsedatetimebesteffort_syslog.reference | 37 ------------- .../02783_parsedatetimebesteffort_syslog.sql | 54 ------------------- 2 files changed, 91 deletions(-) delete mode 100644 tests/queries/0_stateless/02783_parsedatetimebesteffort_syslog.reference delete mode 100644 tests/queries/0_stateless/02783_parsedatetimebesteffort_syslog.sql diff --git a/tests/queries/0_stateless/02783_parsedatetimebesteffort_syslog.reference b/tests/queries/0_stateless/02783_parsedatetimebesteffort_syslog.reference deleted file mode 100644 index 1340b3affe3..00000000000 --- a/tests/queries/0_stateless/02783_parsedatetimebesteffort_syslog.reference +++ /dev/null @@ -1,37 +0,0 @@ -The reference time point is 2023-06-30 23:59:30 -─────────────────────────────────────────────── -The argument is before the reference time point -─────────────────────────────────────────────── -Row 1: -────── -syslog_arg: Jun 30 23:58:30 -res: 2023-06-30 23:58:30 -res_null: 2023-06-30 23:58:30 -res_zero: 2023-06-30 23:58:30 -res_us: 2023-06-30 23:58:30 -res_us_null: 2023-06-30 23:58:30 -res_us_zero: 2023-06-30 23:58:30 -res64: 2023-06-30 23:58:30.000 -res64_null: 2023-06-30 23:58:30.000 -res64_zero: 2023-06-30 23:58:30.000 -res64_us: 2023-06-30 23:58:30.000 -res64_us_null: 2023-06-30 23:58:30.000 -res64_us_zero: 2023-06-30 23:58:30.000 -────────────────────────────────────────────── -The argument is after the reference time point -────────────────────────────────────────────── -Row 1: -────── -syslog_arg: Jul 1 00:00:30 -res: 2022-07-01 00:00:30 -res_null: 2022-07-01 00:00:30 -res_zero: 2022-07-01 00:00:30 -res_us: 2022-07-01 00:00:30 -res_us_null: 2022-07-01 00:00:30 -res_us_zero: 2022-07-01 00:00:30 -res64: 2022-07-01 00:00:30.000 -res64_null: 2022-07-01 00:00:30.000 -res64_zero: 2022-07-01 00:00:30.000 -res64_us: 2022-07-01 00:00:30.000 -res64_us_null: 2022-07-01 00:00:30.000 
-res64_us_zero: 2022-07-01 00:00:30.000 diff --git a/tests/queries/0_stateless/02783_parsedatetimebesteffort_syslog.sql b/tests/queries/0_stateless/02783_parsedatetimebesteffort_syslog.sql deleted file mode 100644 index c67722393ab..00000000000 --- a/tests/queries/0_stateless/02783_parsedatetimebesteffort_syslog.sql +++ /dev/null @@ -1,54 +0,0 @@ -SET session_timezone = 'UTC'; - -SELECT 'The reference time point is 2023-06-30 23:59:30'; -SELECT '───────────────────────────────────────────────'; -SELECT 'The argument is before the reference time point'; -SELECT '───────────────────────────────────────────────'; - -WITH - toDateTime('2023-06-30 23:59:30') AS dt_ref, - now() AS dt_now, - date_sub(MINUTE, 1, dt_now) as dt_before, - dateDiff(SECOND, dt_ref, dt_now) AS time_shift, - formatDateTime(dt_before, '%b %e %T') AS syslog_before -SELECT - formatDateTime(dt_before - time_shift, '%b %e %T') AS syslog_arg, - parseDateTimeBestEffort(syslog_before) - time_shift AS res, - parseDateTimeBestEffortOrNull(syslog_before) - time_shift AS res_null, - parseDateTimeBestEffortOrZero(syslog_before) - time_shift AS res_zero, - parseDateTimeBestEffortUS(syslog_before) - time_shift AS res_us, - parseDateTimeBestEffortUSOrNull(syslog_before) - time_shift AS res_us_null, - parseDateTimeBestEffortUSOrZero(syslog_before) - time_shift AS res_us_zero, - parseDateTime64BestEffort(syslog_before) - time_shift AS res64, - parseDateTime64BestEffortOrNull(syslog_before) - time_shift AS res64_null, - parseDateTime64BestEffortOrZero(syslog_before) - time_shift AS res64_zero, - parseDateTime64BestEffortUS(syslog_before) - time_shift AS res64_us, - parseDateTime64BestEffortUSOrNull(syslog_before) - time_shift AS res64_us_null, - parseDateTime64BestEffortUSOrZero(syslog_before) - time_shift AS res64_us_zero -FORMAT Vertical; - -SELECT '──────────────────────────────────────────────'; -SELECT 'The argument is after the reference time point'; -SELECT '──────────────────────────────────────────────'; - -WITH - toDateTime('2023-06-30 23:59:30') AS dt_ref, - now() AS dt_now, - date_add(MINUTE, 1, dt_now) as dt_after, - dateDiff(SECOND, dt_ref, dt_now) AS time_shift, - formatDateTime(dt_after, '%b %e %T') AS syslog_after -SELECT - formatDateTime(dt_after - time_shift, '%b %e %T') AS syslog_arg, - parseDateTimeBestEffort(syslog_after) - time_shift AS res, - parseDateTimeBestEffortOrNull(syslog_after) - time_shift AS res_null, - parseDateTimeBestEffortOrZero(syslog_after) - time_shift AS res_zero, - parseDateTimeBestEffortUS(syslog_after) - time_shift AS res_us, - parseDateTimeBestEffortUSOrNull(syslog_after) - time_shift AS res_us_null, - parseDateTimeBestEffortUSOrZero(syslog_after) - time_shift AS res_us_zero, - parseDateTime64BestEffort(syslog_after) - time_shift AS res64, - parseDateTime64BestEffortOrNull(syslog_after) - time_shift AS res64_null, - parseDateTime64BestEffortOrZero(syslog_after) - time_shift AS res64_zero, - parseDateTime64BestEffortUS(syslog_after) - time_shift AS res64_us, - parseDateTime64BestEffortUSOrNull(syslog_after) - time_shift AS res64_us_null, - parseDateTime64BestEffortUSOrZero(syslog_after) - time_shift AS res64_us_zero -FORMAT Vertical; From 15e1191f743ca4c7bed0266acb3f59cce9863bd1 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Thu, 29 Feb 2024 13:20:17 +0100 Subject: [PATCH 14/19] Revert "Userspace page cache (#53770)" This reverts commit 7b55c61551d4669a1962590399fe077898a1fec3. 
--- docs/en/operations/storing-data.md | 10 - .../example-datasets/opensky.mdx | 12 +- programs/server/Server.cpp | 7 - src/Access/Common/AccessType.h | 1 - src/Common/PageCache.cpp | 688 ------------------ src/Common/PageCache.h | 299 -------- src/Common/ProfileEvents.cpp | 9 - src/Core/Defines.h | 9 - src/Core/ServerSettings.h | 7 +- src/Core/Settings.h | 4 - src/Core/SettingsChangesHistory.h | 3 - .../IO/AsynchronousBoundedReadBuffer.cpp | 12 +- .../IO/CachedOnDiskReadBufferFromFile.cpp | 2 +- src/Disks/IO/ReadBufferFromRemoteFSGather.cpp | 67 +- src/Disks/IO/ReadBufferFromRemoteFSGather.h | 9 +- src/Disks/IO/ThreadPoolRemoteFSReader.cpp | 2 - src/Disks/IO/ThreadPoolRemoteFSReader.h | 3 - .../AzureBlobStorage/AzureObjectStorage.cpp | 8 +- .../ObjectStorages/DiskObjectStorage.cpp | 3 +- .../ObjectStorages/HDFS/HDFSObjectStorage.cpp | 4 +- .../Local/LocalObjectStorage.cpp | 6 +- .../ObjectStorages/S3/S3ObjectStorage.cpp | 9 +- .../ObjectStorages/Web/WebObjectStorage.cpp | 7 +- src/IO/AsynchronousReader.h | 3 - src/IO/BufferBase.h | 3 - src/IO/CachedInMemoryReadBufferFromFile.cpp | 188 ----- src/IO/CachedInMemoryReadBufferFromFile.h | 41 -- src/IO/ReadBuffer.h | 19 +- src/IO/ReadSettings.h | 7 - src/Interpreters/Context.cpp | 41 +- src/Interpreters/Context.h | 5 - src/Interpreters/InterpreterSystemQuery.cpp | 9 - .../ServerAsynchronousMetrics.cpp | 12 - src/Interpreters/tests/gtest_page_cache.cpp | 267 ------- src/Parsers/ASTSystemQuery.h | 1 - src/Storages/MergeTree/IMergeTreeDataPart.cpp | 4 + src/Storages/StorageS3.cpp | 18 +- tests/clickhouse-test | 2 - .../01271_show_privileges.reference | 1 - .../0_stateless/02867_page_cache.reference | 23 - .../queries/0_stateless/02867_page_cache.sql | 105 --- 41 files changed, 76 insertions(+), 1854 deletions(-) delete mode 100644 src/Common/PageCache.cpp delete mode 100644 src/Common/PageCache.h delete mode 100644 src/IO/CachedInMemoryReadBufferFromFile.cpp delete mode 100644 src/IO/CachedInMemoryReadBufferFromFile.h delete mode 100644 src/Interpreters/tests/gtest_page_cache.cpp delete mode 100644 tests/queries/0_stateless/02867_page_cache.reference delete mode 100644 tests/queries/0_stateless/02867_page_cache.sql diff --git a/docs/en/operations/storing-data.md b/docs/en/operations/storing-data.md index 84251812c01..003277c8d4f 100644 --- a/docs/en/operations/storing-data.md +++ b/docs/en/operations/storing-data.md @@ -275,16 +275,6 @@ Cache profile events: - `CachedWriteBufferCacheWriteBytes`, `CachedWriteBufferCacheWriteMicroseconds` -## Using in-memory cache (userspace page cache) {#userspace-page-cache} - -The File Cache described above stores cached data in local files. Alternatively, object-store-based disks can be configured to use "Userspace Page Cache", which is RAM-only. Userspace page cache is recommended only if file cache can't be used for some reason, e.g. if the machine doesn't have a local disk at all. Note that file cache effectively uses RAM for caching too, since the OS caches contents of local files. - -To enable userspace page cache for disks that don't use file cache, use setting `use_page_cache_for_disks_without_file_cache`. - -By default, on Linux, the userspace page cache will use all available memory, similar to the OS page cache. In tools like `top` and `ps`, the clickhouse server process will typically show resident set size near 100% of the machine's RAM - this is normal, and most of this memory is actually reclaimable by the OS on memory pressure (`MADV_FREE`). 
This behavior can be disabled with server setting `page_cache_use_madv_free = 0`, making the userspace page cache just use a fixed amount of memory `page_cache_size` with no special interaction with the OS. On Mac OS, `page_cache_use_madv_free` is always disabled as it doesn't have lazy `MADV_FREE`. - -Unfortunately, `page_cache_use_madv_free` makes it difficult to tell if the server is close to running out of memory, since the RSS metric becomes useless. Async metric `UnreclaimableRSS` shows the amount of physical memory used by the server, excluding the memory reclaimable by the OS: `select value from system.asynchronous_metrics where metric = 'UnreclaimableRSS'`. Use it for monitoring instead of RSS. This metric is only available if `page_cache_use_madv_free` is enabled. - ## Storing Data on Web Server {#storing-data-on-webserver} There is a tool `clickhouse-static-files-uploader`, which prepares a data directory for a given table (`SELECT data_paths FROM system.tables WHERE name = 'table_name'`). For each table you need, you get a directory of files. These files can be uploaded to, for example, a web server with static files. After this preparation, you can load this table into any ClickHouse server via `DiskWeb`. diff --git a/docs/zh/getting-started/example-datasets/opensky.mdx b/docs/zh/getting-started/example-datasets/opensky.mdx index b79c02ab780..92cd104e06e 100644 --- a/docs/zh/getting-started/example-datasets/opensky.mdx +++ b/docs/zh/getting-started/example-datasets/opensky.mdx @@ -1,4 +1,4 @@ ---- +--- slug: /zh/getting-started/example-datasets/opensky sidebar_label: 空中交通数据 description: 该数据集中的数据是从完整的 OpenSky 数据集中衍生而来的,对其中的数据进行了必要的清理,用以展示在 COVID-19 期间空中交通的发展。 @@ -53,12 +53,12 @@ CREATE TABLE opensky ls -1 flightlist_*.csv.gz | xargs -P100 -I{} bash -c 'gzip -c -d "{}" | clickhouse-client --date_time_input_format best_effort --query "INSERT INTO opensky FORMAT CSVWithNames"' ``` -- 这里我们将文件列表(`ls -1 flightlist_*.csv.gz`)传递给`xargs`以进行并行处理。 `xargs -P100` 指定最多使用 100 个并行工作程序,但由于我们只有 30 个文件,工作程序的数量将只有 30 个。 -- 对于每个文件,`xargs` 将通过 `bash -c` 为每个文件运行一个脚本文件。该脚本通过使用 `{}` 表示文件名占位符,然后 `xargs` 由命令进行填充(使用 `-I{}`)。 -- 该脚本会将文件 (`gzip -c -d "{}"`) 解压缩到标准输出(`-c` 参数),并将输出重定向到 `clickhouse-client`。 -- 我们还要求使用扩展解析器解析 [DateTime](/docs/zh/sql-reference/data-types/datetime.md) 字段 ([--date_time_input_format best_effort](/docs/zh/operations/settings/settings.md#settings-date_time_input_format)) 以识别具有时区偏移的 ISO-8601 格式。 +- 这里我们将文件列表(`ls -1 flightlist_*.csv.gz`)传递给`xargs`以进行并行处理。 `xargs -P100` 指定最多使用 100 个并行工作程序,但由于我们只有 30 个文件,工作程序的数量将只有 30 个。 +- 对于每个文件,`xargs` 将通过 `bash -c` 为每个文件运行一个脚本文件。该脚本通过使用 `{}` 表示文件名占位符,然后 `xargs` 由命令进行填充(使用 `-I{}`)。 +- 该脚本会将文件 (`gzip -c -d "{}"`) 解压缩到标准输出(`-c` 参数),并将输出重定向到 `clickhouse-client`。 +- 我们还要求使用扩展解析器解析 [DateTime](../../sql-reference/data-types/datetime.md) 字段 ([--date_time_input_format best_effort](../../operations/settings/ settings.md#settings-date_time_input_format)) 以识别具有时区偏移的 ISO-8601 格式。 -最后,`clickhouse-client` 会以 [CSVWithNames](/docs/zh/interfaces/formats.md#csvwithnames) 格式读取输入数据然后执行插入。 +最后,`clickhouse-client` 会以 [CSVWithNames](../../interfaces/formats.md#csvwithnames) 格式读取输入数据然后执行插入。 并行导入需要 24 秒。 diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index 786cb27d8c4..6dc33042a05 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -1228,13 +1228,6 @@ try } global_context->setMarkCache(mark_cache_policy, mark_cache_size, mark_cache_size_ratio); - size_t page_cache_size = server_settings.page_cache_size; - if (page_cache_size != 0) 
- global_context->setPageCache( - server_settings.page_cache_chunk_size, server_settings.page_cache_mmap_size, - page_cache_size, server_settings.page_cache_use_madv_free, - server_settings.page_cache_use_transparent_huge_pages); - String index_uncompressed_cache_policy = server_settings.index_uncompressed_cache_policy; size_t index_uncompressed_cache_size = server_settings.index_uncompressed_cache_size; double index_uncompressed_cache_size_ratio = server_settings.index_uncompressed_cache_size_ratio; diff --git a/src/Access/Common/AccessType.h b/src/Access/Common/AccessType.h index de3eda96bac..8172a468f89 100644 --- a/src/Access/Common/AccessType.h +++ b/src/Access/Common/AccessType.h @@ -162,7 +162,6 @@ enum class AccessType M(SYSTEM_DROP_COMPILED_EXPRESSION_CACHE, "SYSTEM DROP COMPILED EXPRESSION, DROP COMPILED EXPRESSION CACHE, DROP COMPILED EXPRESSIONS", GLOBAL, SYSTEM_DROP_CACHE) \ M(SYSTEM_DROP_FILESYSTEM_CACHE, "SYSTEM DROP FILESYSTEM CACHE, DROP FILESYSTEM CACHE", GLOBAL, SYSTEM_DROP_CACHE) \ M(SYSTEM_SYNC_FILESYSTEM_CACHE, "SYSTEM REPAIR FILESYSTEM CACHE, REPAIR FILESYSTEM CACHE, SYNC FILESYSTEM CACHE", GLOBAL, SYSTEM) \ - M(SYSTEM_DROP_PAGE_CACHE, "SYSTEM DROP PAGE CACHE, DROP PAGE CACHE", GLOBAL, SYSTEM_DROP_CACHE) \ M(SYSTEM_DROP_SCHEMA_CACHE, "SYSTEM DROP SCHEMA CACHE, DROP SCHEMA CACHE", GLOBAL, SYSTEM_DROP_CACHE) \ M(SYSTEM_DROP_FORMAT_SCHEMA_CACHE, "SYSTEM DROP FORMAT SCHEMA CACHE, DROP FORMAT SCHEMA CACHE", GLOBAL, SYSTEM_DROP_CACHE) \ M(SYSTEM_DROP_S3_CLIENT_CACHE, "SYSTEM DROP S3 CLIENT, DROP S3 CLIENT CACHE", GLOBAL, SYSTEM_DROP_CACHE) \ diff --git a/src/Common/PageCache.cpp b/src/Common/PageCache.cpp deleted file mode 100644 index 511ec23d431..00000000000 --- a/src/Common/PageCache.cpp +++ /dev/null @@ -1,688 +0,0 @@ -#include "PageCache.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace ProfileEvents -{ - extern const Event PageCacheChunkMisses; - extern const Event PageCacheChunkShared; - extern const Event PageCacheChunkDataHits; - extern const Event PageCacheChunkDataPartialHits; - extern const Event PageCacheChunkDataMisses; - extern const Event PageCacheBytesUnpinnedRoundedToPages; - extern const Event PageCacheBytesUnpinnedRoundedToHugePages; -} - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int SYSTEM_ERROR; - extern const int MEMORY_LIMIT_EXCEEDED; - extern const int CANNOT_ALLOCATE_MEMORY; - extern const int INVALID_SETTING_VALUE; - extern const int FILE_DOESNT_EXIST; -} - -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wunknown-warning-option" -#pragma clang diagnostic ignored "-Wreadability-make-member-function-const" - -PinnedPageChunk::PinnedPageChunk(PinnedPageChunk && c) noexcept - : cache(std::exchange(c.cache, nullptr)), chunk(std::exchange(c.chunk, nullptr)) {} - -PinnedPageChunk & PinnedPageChunk::operator=(PinnedPageChunk && c) noexcept -{ - if (cache) - cache->removeRef(chunk); - cache = std::exchange(c.cache, nullptr); - chunk = std::exchange(c.chunk, nullptr); - return *this; -} - -PinnedPageChunk::~PinnedPageChunk() noexcept -{ - if (cache) - cache->removeRef(chunk); -} - -PinnedPageChunk::PinnedPageChunk(PageCache * cache_, PageChunk * chunk_) noexcept : cache(cache_), chunk(chunk_) {} - -const PageChunk * PinnedPageChunk::getChunk() const { return chunk; } - -bool PinnedPageChunk::markPagePopulated(size_t page_idx) -{ - bool r = chunk->pages_populated.set(page_idx); - return r; -} - -void PinnedPageChunk::markPrefixPopulated(size_t 
bytes) -{ - for (size_t i = 0; i < (bytes + chunk->page_size - 1) / chunk->page_size; ++i) - markPagePopulated(i); -} - -bool PinnedPageChunk::isPrefixPopulated(size_t bytes) const -{ - for (size_t i = 0; i < (bytes + chunk->page_size - 1) / chunk->page_size; ++i) - if (!chunk->pages_populated.get(i)) - return false; - return true; -} - -AtomicBitSet::AtomicBitSet() = default; - -void AtomicBitSet::init(size_t nn) -{ - n = nn; - v = std::make_unique[]>((n + 7) / 8); -} - -bool AtomicBitSet::get(size_t i) const -{ - return (v[i / 8] & (1 << (i % 8))) != 0; -} - -bool AtomicBitSet::any() const -{ - for (size_t i = 0; i < (n + 7) / 8; ++i) - if (v[i]) - return true; - return false; -} - -bool AtomicBitSet::set(size_t i) const -{ - UInt8 prev = v[i / 8].fetch_or(1 << (i % 8)); - return (prev & (1 << (i % 8))) == 0; -} - -bool AtomicBitSet::set(size_t i, bool val) const -{ - if (val) - return set(i); - else - return unset(i); -} - -bool AtomicBitSet::unset(size_t i) const -{ - UInt8 prev = v[i / 8].fetch_and(~(1 << (i % 8))); - return (prev & (1 << (i % 8))) != 0; -} - -void AtomicBitSet::unsetAll() const -{ - for (size_t i = 0; i < (n + 7) / 8; ++i) - v[i].store(0, std::memory_order_relaxed); -} - -PageCache::PageCache(size_t bytes_per_chunk, size_t bytes_per_mmap, size_t bytes_total, bool use_madv_free_, bool use_huge_pages_) - : bytes_per_page(getPageSize()) - , use_madv_free(use_madv_free_) - , use_huge_pages(use_huge_pages_) - , rng(randomSeed()) -{ - if (bytes_per_chunk == 0 || bytes_per_mmap == 0) - throw Exception(ErrorCodes::INVALID_SETTING_VALUE, "Userspace page cache chunk size and mmap size can't be zero."); - - if (use_huge_pages) - { - use_huge_pages = false; - bool print_warning = false; -#ifdef OS_LINUX - try - { - ReadBufferFromFile in("/sys/kernel/mm/transparent_hugepage/hpage_pmd_size"); - size_t huge_page_size; - readIntText(huge_page_size, in); - - if (huge_page_size == 0 || huge_page_size % bytes_per_page != 0) - throw Exception(ErrorCodes::SYSTEM_ERROR, "Invalid huge page size reported by the OS: {}", huge_page_size); - - /// THP can be configured to be 2 MiB or 1 GiB in size. 1 GiB is way too big for us. - if (huge_page_size <= (16 << 20)) - { - pages_per_big_page = huge_page_size / bytes_per_page; - use_huge_pages = true; - } - else - { - LOG_WARNING(&Poco::Logger::get("PageCache"), "The OS huge page size is too large for our purposes: {} KiB. Using regular pages. Userspace page cache will be relatively slow.", huge_page_size); - } - } - catch (Exception & e) - { - if (e.code() != ErrorCodes::FILE_DOESNT_EXIST) - throw; - print_warning = true; - } -#else - print_warning = true; -#endif - if (print_warning) - LOG_WARNING(&Poco::Logger::get("PageCache"), "The OS doesn't support transparent huge pages. 
Userspace page cache will be relatively slow."); - } - - pages_per_chunk = ((bytes_per_chunk - 1) / (bytes_per_page * pages_per_big_page) + 1) * pages_per_big_page; - chunks_per_mmap_target = (bytes_per_mmap - 1) / (bytes_per_page * pages_per_chunk) + 1; - max_mmaps = (bytes_total - 1) / (bytes_per_page * pages_per_chunk * chunks_per_mmap_target) + 1; -} - -PageCache::~PageCache() -{ - chassert(getPinnedSize() == 0); -} - -size_t PageCache::pageSize() const { return bytes_per_page; } -size_t PageCache::chunkSize() const { return bytes_per_page * pages_per_chunk; } -size_t PageCache::maxChunks() const { return chunks_per_mmap_target * max_mmaps; } - -size_t PageCache::getPinnedSize() const -{ - std::unique_lock lock(global_mutex); - return (total_chunks - lru.size()) * bytes_per_page * pages_per_chunk; -} - -PageCache::MemoryStats PageCache::getResidentSetSize() const -{ - MemoryStats stats; -#ifdef OS_LINUX - if (use_madv_free) - { - std::unordered_set cache_mmap_addrs; - for (const auto & m : mmaps) - cache_mmap_addrs.insert(reinterpret_cast(m.ptr)); - - ReadBufferFromFile in("/proc/self/smaps"); - - /// Parse the smaps contents, which is text consisting of entries like this: - /// - /// 117ba4a00000-117be4a00000 rw-p 00000000 00:00 0 - /// Size: 1048576 kB - /// KernelPageSize: 4 kB - /// MMUPageSize: 4 kB - /// Rss: 539516 kB - /// Pss: 539516 kB - /// ... - - auto read_token = [&] - { - String res; - while (!in.eof()) - { - char c = *in.position(); - if (c == '\n' || c == '\t' || c == ' ' || c == '-') - break; - res += c; - ++in.position(); - } - return res; - }; - - auto skip_whitespace = [&] - { - while (!in.eof()) - { - char c = *in.position(); - if (c != ' ' && c != '\t') - break; - ++in.position(); - } - }; - - bool current_range_is_cache = false; - size_t total_rss = 0; - size_t total_lazy_free = 0; - while (!in.eof()) - { - String s = read_token(); - if (!in.eof() && *in.position() == '-') - { - if (s.size() < 16) - s.insert(0, 16 - s.size(), '0'); - UInt64 addr = unhexUInt(s.c_str()); - current_range_is_cache = cache_mmap_addrs.contains(addr); - } - else if (s == "Rss:" || s == "LazyFree") - { - skip_whitespace(); - size_t val; - readIntText(val, in); - skip_whitespace(); - String unit = read_token(); - if (unit != "kB") - throw Exception(ErrorCodes::SYSTEM_ERROR, "Unexpected units in /proc/self/smaps: {}", unit); - size_t bytes = val * 1024; - - if (s == "Rss:") - { - total_rss += bytes; - if (current_range_is_cache) - stats.page_cache_rss += bytes; - } - else - total_lazy_free += bytes; - } - skipToNextLineOrEOF(in); - } - stats.unreclaimable_rss = total_rss - std::min(total_lazy_free, total_rss); - - return stats; - } -#endif - - stats.page_cache_rss = bytes_per_page * pages_per_chunk * total_chunks; - return stats; -} - -PinnedPageChunk PageCache::getOrSet(PageCacheKey key, bool detached_if_missing, bool inject_eviction) -{ - PageChunk * chunk; - /// Make sure we increment exactly one of the counters about the fate of a chunk lookup. 
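-    /// Exactly one of five outcomes is counted (see ProfileEvents.cpp): PageCacheChunkMisses
-    /// (no such chunk), PageCacheChunkShared (found, already pinned by another thread), or, for
-    /// a previously unpinned chunk, PageCacheChunkData{Hits,PartialHits,Misses} depending on
-    /// how many of its pages the OS evicted while the chunk sat in limbo.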
- bool incremented_profile_events = false; - - { - std::unique_lock lock(global_mutex); - - auto * it = chunk_by_key.find(key); - if (it == chunk_by_key.end()) - { - chunk = getFreeChunk(lock); - chassert(!chunk->key.has_value()); - - if (!detached_if_missing) - { - chunk->key = key; - chunk_by_key.insert({key, chunk}); - } - - ProfileEvents::increment(ProfileEvents::PageCacheChunkMisses); - incremented_profile_events = true; - } - else - { - chunk = it->getMapped(); - size_t prev_pin_count = chunk->pin_count.fetch_add(1); - - if (prev_pin_count == 0) - { - /// Not eligible for LRU eviction while pinned. - chassert(chunk->is_linked()); - lru.erase(lru.iterator_to(*chunk)); - - if (detached_if_missing) - { - /// Peek the first page to see if it's evicted. - /// (Why not use the full probing procedure instead, restoreChunkFromLimbo()? - /// Right here we can't do it because of how the two mutexes are organized. - /// And we want to do the check+detach before unlocking global_mutex, because - /// otherwise we may detach a chunk pinned by someone else, which may be unexpected - /// for that someone else. Or maybe the latter is fine, dropCache() already does it.) - if (chunk->pages_populated.get(0) && reinterpret_cast*>(chunk->data)->load(std::memory_order_relaxed) == 0) - evictChunk(chunk, lock); - } - - if (inject_eviction && chunk->key.has_value() && rng() % 10 == 0) - { - /// Simulate eviction of the chunk or some of its pages. - if (rng() % 2 == 0) - evictChunk(chunk, lock); - else - for (size_t i = 0; i < 20; ++i) - chunk->pages_populated.unset(rng() % (chunk->size / chunk->page_size)); - } - } - else - { - ProfileEvents::increment(ProfileEvents::PageCacheChunkShared); - incremented_profile_events = true; - } - } - } - - { - std::unique_lock chunk_lock(chunk->chunk_mutex); - - if (chunk->pages_state == PageChunkState::Limbo) - { - auto [pages_restored, pages_evicted] = restoreChunkFromLimbo(chunk, chunk_lock); - chunk->pages_state = PageChunkState::Stable; - - if (!incremented_profile_events) - { - if (pages_evicted == 0) - ProfileEvents::increment(ProfileEvents::PageCacheChunkDataHits); - else if (pages_evicted < pages_restored) - ProfileEvents::increment(ProfileEvents::PageCacheChunkDataPartialHits); - else - ProfileEvents::increment(ProfileEvents::PageCacheChunkDataMisses); - } - } - } - - return PinnedPageChunk(this, chunk); -} - -void PageCache::removeRef(PageChunk * chunk) noexcept -{ - /// Fast path if this is not the last reference. - size_t prev_pin_count = chunk->pin_count.load(); - if (prev_pin_count > 1 && chunk->pin_count.compare_exchange_strong(prev_pin_count, prev_pin_count - 1)) - return; - - { - std::unique_lock lock(global_mutex); - - prev_pin_count = chunk->pin_count.fetch_sub(1); - if (prev_pin_count > 1) - return; - - chassert(!chunk->is_linked()); - if (chunk->key.has_value()) - lru.push_back(*chunk); - else - /// Unpinning detached chunk. We'd rather reuse it soon, so put it at the front. - lru.push_front(*chunk); - } - - { - std::unique_lock chunk_lock(chunk->chunk_mutex); - - /// Need to be extra careful here because we unlocked global_mutex above, so other - /// getOrSet()/removeRef() calls could have happened during this brief period. 
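-        /// Illustrative interleaving: another thread may call getOrSet() and re-pin the chunk
-        /// between our fetch_sub() above and this point; pin_count would then be nonzero here,
-        /// which is why the condition below re-checks it under chunk_mutex before sending the
-        /// chunk to limbo.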
- if (use_madv_free && chunk->pages_state == PageChunkState::Stable && chunk->pin_count.load() == 0) - { - sendChunkToLimbo(chunk, chunk_lock); - chunk->pages_state = PageChunkState::Limbo; - } - } -} - -static void logUnexpectedSyscallError(std::string name) -{ - std::string message = fmt::format("{} failed: {}", name, errnoToString()); - LOG_WARNING(&Poco::Logger::get("PageCache"), "{}", message); -#if defined(ABORT_ON_LOGICAL_ERROR) - volatile bool true_ = true; - if (true_) // suppress warning about missing [[noreturn]] - abortOnFailedAssertion(message); -#endif -} - -void PageCache::sendChunkToLimbo(PageChunk * chunk [[maybe_unused]], std::unique_lock & /* chunk_mutex */) const noexcept -{ -#ifdef MADV_FREE // if we're not on a very old version of Linux - chassert(chunk->size == bytes_per_page * pages_per_chunk); - size_t populated_pages = 0; - size_t populated_big_pages = 0; - for (size_t big_page_idx = 0; big_page_idx < pages_per_chunk / pages_per_big_page; ++big_page_idx) - { - bool big_page_populated = false; - for (size_t sub_idx = 0; sub_idx < pages_per_big_page; ++sub_idx) - { - size_t idx = big_page_idx * pages_per_big_page + sub_idx; - if (!chunk->pages_populated.get(idx)) - continue; - big_page_populated = true; - populated_pages += 1; - - auto & byte = reinterpret_cast &>(chunk->data[idx * bytes_per_page]); - chunk->first_bit_of_each_page.set(idx, (byte.load(std::memory_order_relaxed) & 1) != 0); - byte.fetch_or(1, std::memory_order_relaxed); - } - if (big_page_populated) - populated_big_pages += 1; - } - int r = madvise(chunk->data, chunk->size, MADV_FREE); - if (r != 0) - logUnexpectedSyscallError("madvise(MADV_FREE)"); - - ProfileEvents::increment(ProfileEvents::PageCacheBytesUnpinnedRoundedToPages, bytes_per_page * populated_pages); - ProfileEvents::increment(ProfileEvents::PageCacheBytesUnpinnedRoundedToHugePages, bytes_per_page * pages_per_big_page * populated_big_pages); -#endif -} - -std::pair PageCache::restoreChunkFromLimbo(PageChunk * chunk, std::unique_lock & /* chunk_mutex */) const noexcept -{ - static_assert(sizeof(std::atomic) == 1, "char is not atomic?"); - // Make sure our strategic memory reads/writes are not reordered or optimized out. - auto * data = reinterpret_cast *>(chunk->data); - size_t pages_restored = 0; - size_t pages_evicted = 0; - for (size_t idx = 0; idx < chunk->size / bytes_per_page; ++idx) - { - if (!chunk->pages_populated.get(idx)) - continue; - - /// After MADV_FREE, it's guaranteed that: - /// * writing to the page makes it non-freeable again (reading doesn't), - /// * after the write, the page contents are either fully intact or fully zero-filled, - /// * even before the write, reads return either intact data (if the page wasn't freed) or zeroes (if it was, and the read page-faulted). - /// (And when doing the write there's no way to tell whether it page-faulted or not, AFAICT; that would make our life much easier!) - /// - /// With that in mind, we do the following dance to bring the page back from the MADV_FREE limbo: - /// 0. [in advance] Before doing MADV_FREE, make sure the page's first byte is not zero. - /// We do it by setting the lowest bit of the first byte to 1, after saving the original value of that bit into a bitset. - /// 1. Read the second byte. - /// 2. Write the second byte back. This makes the page non-freeable. - /// 3. Read the first byte. - /// 3a. If it's zero, the page was freed. - /// Set the second byte to 0, to keep the buffer zero-filled if the page was freed - /// between steps 1 and 2. - /// 3b. 
If it's nonzero, the page is intact. - /// Restore the lowest bit of the first byte to the saved original value from the bitset. - - char second_byte = data[idx * bytes_per_page + 1].load(std::memory_order_relaxed); - data[idx * bytes_per_page + 1].store(second_byte, std::memory_order_relaxed); - - char first_byte = data[idx * bytes_per_page].load(std::memory_order_relaxed); - if (first_byte == 0) - { - pages_evicted += 1; - data[idx * bytes_per_page + 1].store(0, std::memory_order_relaxed); - chunk->pages_populated.unset(idx); - } - else - { - pages_restored += 1; - chassert(first_byte & 1); - if (!chunk->first_bit_of_each_page.get(idx)) - data[idx * bytes_per_page].fetch_and(~1, std::memory_order_relaxed); - } - } - return {pages_restored, pages_evicted}; -} - -PageChunk * PageCache::getFreeChunk(std::unique_lock & lock /* global_mutex */) -{ - if (lru.empty() || (mmaps.size() < max_mmaps && lru.front().key.has_value())) - addMmap(lock); - if (lru.empty()) - throw Exception(ErrorCodes::MEMORY_LIMIT_EXCEEDED, "All chunks in the entire page cache ({:.3} GiB) are pinned.", - bytes_per_page * pages_per_chunk * total_chunks * 1. / (1l << 30)); - - PageChunk * chunk = &lru.front(); - lru.erase(lru.iterator_to(*chunk)); - - size_t prev_pin_count = chunk->pin_count.fetch_add(1); - chassert(prev_pin_count == 0); - - evictChunk(chunk, lock); - - return chunk; -} - -void PageCache::evictChunk(PageChunk * chunk, std::unique_lock & /* global_mutex */) -{ - if (chunk->key.has_value()) - { - size_t erased = chunk_by_key.erase(chunk->key.value()); - chassert(erased); - chunk->key.reset(); - } - - chunk->state.reset(); - - /// This is tricky. We're not holding the chunk_mutex, so another thread might be running - /// sendChunkToLimbo() or even restoreChunkFromLimbo() on this chunk right now. - /// - /// Nevertheless, it's correct and sufficient to clear pages_populated here because sendChunkToLimbo() - /// and restoreChunkFromLimbo() only touch pages_populated (only unsetting the bits), - /// first_bit_of_each_page, and the data; and we don't care about first_bit_of_each_page and the data. - /// - /// This is precarious, but I don't have better ideas. Note that this clearing (or something else) - /// must be done before unlocking the global_mutex because otherwise another call to getOrSet() might - /// return this chunk before we clear it. - chunk->pages_populated.unsetAll(); -} - -void PageCache::addMmap(std::unique_lock & /* global_mutex */) -{ - /// ASLR by hand. - void * address_hint = reinterpret_cast(std::uniform_int_distribution(0x100000000000UL, 0x700000000000UL)(rng)); - - mmaps.emplace_back(bytes_per_page, pages_per_chunk, pages_per_big_page, chunks_per_mmap_target, address_hint, use_huge_pages); - - size_t num_chunks = mmaps.back().num_chunks; - total_chunks += num_chunks; - for (size_t i = 0; i < num_chunks; ++i) - /// Link in reverse order, so they get assigned in increasing order. Not important, just seems nice. - lru.push_front(mmaps.back().chunks[num_chunks - 1 - i]); -} - -void PageCache::dropCache() -{ - std::unique_lock lock(global_mutex); - - /// Detach and free unpinned chunks. - bool logged_error = false; - for (PageChunk & chunk : lru) - { - evictChunk(&chunk, lock); - - if (use_madv_free) - { - /// This might happen in parallel with sendChunkToLimbo() or restoreChunkFromLimbo(), but it's ok. 
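-            /// Unlike the lazy MADV_FREE used by sendChunkToLimbo(), MADV_DONTNEED drops the
-            /// pages immediately (a later access faults in zero-filled pages), so this frees
-            /// the memory right away instead of merely making it reclaimable.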
- int r = madvise(chunk.data, chunk.size, MADV_DONTNEED); - if (r != 0 && !logged_error) - { - logUnexpectedSyscallError("madvise(MADV_DONTNEED)"); - logged_error = true; - } - } - } - - /// Detach pinned chunks. - for (auto [key, chunk] : chunk_by_key) - { - chassert(chunk->key == key); - chassert(chunk->pin_count > 0); // otherwise it would have been evicted above - chunk->key.reset(); - } - chunk_by_key.clear(); -} - -PageCache::Mmap::Mmap(size_t bytes_per_page_, size_t pages_per_chunk_, size_t pages_per_big_page_, size_t num_chunks_, void * address_hint, bool use_huge_pages_) -{ - num_chunks = num_chunks_; - size = bytes_per_page_ * pages_per_chunk_ * num_chunks; - - size_t alignment = bytes_per_page_ * pages_per_big_page_; - address_hint = reinterpret_cast(reinterpret_cast(address_hint) / alignment * alignment); - - auto temp_chunks = std::make_unique(num_chunks); - - int flags = MAP_PRIVATE | MAP_ANONYMOUS; -#ifdef OS_LINUX - flags |= MAP_NORESERVE; -#endif - ptr = mmap(address_hint, size, PROT_READ | PROT_WRITE, flags, -1, 0); - if (MAP_FAILED == ptr) - throw ErrnoException(ErrorCodes::CANNOT_ALLOCATE_MEMORY, fmt::format("Cannot mmap {}.", ReadableSize(size))); - if (reinterpret_cast(ptr) % bytes_per_page_ != 0) - { - munmap(ptr, size); - throw Exception(ErrorCodes::SYSTEM_ERROR, "mmap returned unaligned address: {}", ptr); - } - - void * chunks_start = ptr; - -#ifdef OS_LINUX - if (madvise(ptr, size, MADV_DONTDUMP) != 0) - logUnexpectedSyscallError("madvise(MADV_DONTDUMP)"); - if (madvise(ptr, size, MADV_DONTFORK) != 0) - logUnexpectedSyscallError("madvise(MADV_DONTFORK)"); - - if (use_huge_pages_) - { - if (reinterpret_cast(ptr) % alignment != 0) - { - LOG_DEBUG(&Poco::Logger::get("PageCache"), "mmap() returned address not aligned on huge page boundary."); - chunks_start = reinterpret_cast((reinterpret_cast(ptr) / alignment + 1) * alignment); - chassert(reinterpret_cast(chunks_start) % alignment == 0); - num_chunks -= 1; - } - - if (madvise(ptr, size, MADV_HUGEPAGE) != 0) - LOG_WARNING(&Poco::Logger::get("PageCache"), - "madvise(MADV_HUGEPAGE) failed: {}. Userspace page cache will be relatively slow.", errnoToString()); - } -#else - (void)use_huge_pages_; -#endif - - chunks = std::move(temp_chunks); - for (size_t i = 0; i < num_chunks; ++i) - { - PageChunk * chunk = &chunks[i]; - chunk->data = reinterpret_cast(chunks_start) + bytes_per_page_ * pages_per_chunk_ * i; - chunk->size = bytes_per_page_ * pages_per_chunk_; - chunk->page_size = bytes_per_page_; - chunk->big_page_size = bytes_per_page_ * pages_per_big_page_; - chunk->pages_populated.init(pages_per_chunk_); - chunk->first_bit_of_each_page.init(pages_per_chunk_); - } -} - -PageCache::Mmap::Mmap(Mmap && m) noexcept : ptr(std::exchange(m.ptr, nullptr)), size(std::exchange(m.size, 0)), chunks(std::move(m.chunks)), num_chunks(std::exchange(m.num_chunks, 0)) {} - -PageCache::Mmap::~Mmap() noexcept -{ - if (ptr && 0 != munmap(ptr, size)) - logUnexpectedSyscallError("munmap"); -} - -void FileChunkState::reset() {} - -PageCacheKey FileChunkAddress::hash() const -{ - SipHash hash(offset); - hash.update(path.data(), path.size()); - if (!file_version.empty()) - { - hash.update("\0", 1); - hash.update(file_version.data(), file_version.size()); - } - return hash.get128(); -} - -std::string FileChunkAddress::toString() const -{ - return fmt::format("{}:{}{}{}", path, offset, file_version.empty() ? 
"" : ":", file_version); -} - -#pragma clang diagnostic pop - -} diff --git a/src/Common/PageCache.h b/src/Common/PageCache.h deleted file mode 100644 index 7ff376baa6b..00000000000 --- a/src/Common/PageCache.h +++ /dev/null @@ -1,299 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include - -/// "Userspace page cache" -/// A cache for contents of remote files. -/// Uses MADV_FREE to allow Linux to evict pages from our cache under memory pressure. -/// Typically takes up almost all of the available memory, similar to the actual page cache. -/// -/// Intended for caching data retrieved from distributed cache, but can be used for other things too, -/// just replace FileChunkState with a discriminated union, or something, if needed. -/// -/// There are two fixed-size units of caching here: -/// * OS pages, typically 4 KiB each. -/// * Page chunks, 2 MiB each (configurable with page_cache_block_size setting). -/// -/// Each file is logically split into aligned 2 MiB blocks, which are mapped to page chunks inside the cache. -/// They are cached independently from each other. -/// -/// Each page chunk has a contiguous 2 MiB buffer that can be pinned and directly used e.g. by ReadBuffers. -/// While pinned (by at least one PinnedPageChunk), the pages are not reclaimable by the OS. -/// -/// Inside each page chunk, any subset of pages may be populated. Unpopulated pages may or not be -/// mapped to any physical RAM. We maintain a bitmask that keeps track of which pages are populated. -/// Pages become unpopulated if they're reclaimed by the OS (when the page chunk is not pinned), -/// or if we just never populate them in the first place (e.g. if a file is shorter than 2 MiB we -/// still create a 2 MiB page chunk, but use only a prefix of it). -/// -/// There are two separate eviction mechanisms at play: -/// * LRU eviction of page chunks in PageCache. -/// * OS reclaiming pages on memory pressure. We have no control over the eviction policy. -/// It probably picks the pages in the same order in which they were marked with MADV_FREE, so -/// effectively in the same LRU order as our policy in PageCache. -/// When using PageCache in oversubscribed fashion, using all available memory and relying on OS eviction, -/// the PageCache's eviction policy mostly doesn't matter. It just needs to be similar enough to the OS's -/// policy that we rarely evict chunks with unevicted pages. -/// -/// We mmap memory directly instead of using allocator because this enables: -/// * knowing how much RAM the cache is using, via /proc/self/smaps, -/// * MADV_HUGEPAGE (use transparent huge pages - this makes MADV_FREE 10x less slow), -/// * MAP_NORESERVE (don't reserve swap space - otherwise large mmaps usually fail), -/// * MADV_DONTDUMP (don't include in core dumps), -/// * page-aligned addresses without padding. -/// -/// madvise(MADV_FREE) call is slow: ~6 GiB/s (doesn't scale with more threads). Enabling transparent -/// huge pages (MADV_HUGEPAGE) makes it 10x less slow, so we do that. That makes the physical RAM allocation -/// work at 2 MiB granularity instead of 4 KiB, so the cache becomes less suitable for small files. -/// If this turns out to be a problem, we may consider allowing different mmaps to have different flags, -/// some having no huge pages. -/// Note that we do our bookkeeping at small-page granularity even if huge pages are enabled. 
-/// -/// It's unfortunate that Linux's MADV_FREE eviction doesn't use the two-list strategy like the real -/// page cache (IIUC, MADV_FREE puts the pages at the head of the inactive list, and they can never -/// get to the active list). -/// If this turns out to be a problem, we could make PageCache do chunk eviction based on observed -/// system memory usage, so that most eviction is done by us, and the MADV_FREE eviction kicks in -/// only as a last resort. Then we can make PageCache's eviction policy arbitrarily more sophisticated. - -namespace DB -{ - -/// Hash of FileChunkAddress. -using PageCacheKey = UInt128; - -/// Identifies a chunk of a file or object. -/// We assume that contents of such file/object don't change (without file_version changing), so -/// cache invalidation is needed. -struct FileChunkAddress -{ - /// Path, usually prefixed with storage system name and anything else needed to make it unique. - /// E.g. "s3:/" - std::string path; - /// Optional string with ETag, or file modification time, or anything else. - std::string file_version; - size_t offset = 0; - - PageCacheKey hash() const; - - std::string toString() const; -}; - -struct AtomicBitSet -{ - size_t n = 0; - std::unique_ptr[]> v; - - AtomicBitSet(); - - void init(size_t n); - - bool get(size_t i) const; - bool any() const; - /// These return true if the bit was changed, false if it already had the target value. - /// (These methods are logically not const, but clang insists that I make them const, and - /// '#pragma clang diagnostic ignored' doesn't seem to work.) - bool set(size_t i) const; - bool set(size_t i, bool val) const; - bool unset(size_t i) const; - void unsetAll() const; -}; - -enum class PageChunkState -{ - /// Pages are not reclaimable by the OS, the buffer has correct contents. - Stable, - /// Pages are reclaimable by the OS, the buffer contents are altered (first bit of each page set to 1). - Limbo, -}; - -/// (This is a separate struct just in case we want to use this cache for other things in future. -/// Then this struct would be the customization point, while the rest of PageChunk can stay unchanged.) -struct FileChunkState -{ - std::mutex download_mutex; - - void reset(); -}; - -using PageChunkLRUListHook = boost::intrusive::list_base_hook<>; - -/// Cache entry. -struct PageChunk : public PageChunkLRUListHook -{ - char * data; - size_t size; // in bytes - /// Page size for use in pages_populated and first_bit_of_each_page. Same as PageCache::pageSize(). - size_t page_size; - - /// Actual eviction granularity. Just for information. If huge pages are used, huge page size, otherwise page_size. - size_t big_page_size; - - mutable FileChunkState state; - - AtomicBitSet pages_populated; - -private: - friend class PinnedPageChunk; - friend class PageCache; - - /// If nullopt, the chunk is "detached", i.e. not associated with any key. - /// Detached chunks may still be pinned. Chunk may get detached even while pinned, in particular when dropping cache. - /// Protected by global_mutex. - std::optional key; - - /// Refcount for usage of this chunk. When zero, the pages are reclaimable by the OS, and - /// the PageChunk itself is evictable (linked into PageCache::lru). - std::atomic pin_count {0}; - - /// Bit mask containing the first bit of data from each page. Needed for the weird probing procedure when un-MADV_FREE-ing the pages. - AtomicBitSet first_bit_of_each_page; - - /// Locked when changing pages_state, along with the corresponding expensive MADV_FREE/un-MADV_FREE operation. 
- mutable std::mutex chunk_mutex; - - /// Normally pin_count == 0 <=> state == PageChunkState::Limbo, - /// pin_count > 0 <=> state == PageChunkState::Stable. - /// This separate field is needed because of synchronization: pin_count is changed with global_mutex locked, - /// this field is changed with chunk_mutex locked, and we never have to lock both mutexes at once. - PageChunkState pages_state = PageChunkState::Stable; -}; - -class PageCache; - -/// Handle for a cache entry. Neither the entry nor its pages can get evicted while there's at least one PinnedPageChunk pointing to it. -class PinnedPageChunk -{ -public: - const PageChunk * getChunk() const; - - /// Sets the bit in pages_populated. Returns true if it actually changed (i.e. was previously 0). - bool markPagePopulated(size_t page_idx); - - /// Calls markPagePopulated() for pages 0..ceil(bytes/page_size). - void markPrefixPopulated(size_t bytes); - - bool isPrefixPopulated(size_t bytes) const; - - PinnedPageChunk() = default; - ~PinnedPageChunk() noexcept; - - PinnedPageChunk(PinnedPageChunk &&) noexcept; - PinnedPageChunk & operator=(PinnedPageChunk &&) noexcept; - -private: - friend class PageCache; - - PageCache * cache = nullptr; - PageChunk * chunk = nullptr; - - PinnedPageChunk(PageCache * cache_, PageChunk * chunk_) noexcept; -}; - -class PageCache -{ -public: - PageCache(size_t bytes_per_chunk, size_t bytes_per_mmap, size_t bytes_total, bool use_madv_free, bool use_huge_pages); - ~PageCache(); - - /// Get or insert a chunk for the given key. - /// - /// If detached_if_missing = true, and the key is not present in the cache, the returned chunk - /// won't be associated with the key and will be evicted as soon as it's unpinned. - /// It's like "get if exists, otherwise return null", but instead of null we return a usable - /// temporary buffer, for convenience. Pinning and page eviction make the story more complicated: - /// * If the chunk for this key is pinned, we return it even if it's not fully populated - /// (because PageCache doesn't know what "fully populated" means). - /// * If the chunk exists, but some of its pages were evicted, we detach it. (Currently we only - /// check the first page here.) - PinnedPageChunk getOrSet(PageCacheKey key, bool detached_if_missing, bool inject_eviction); - - /// OS page size, e.g. 4 KiB on x86, 4 KiB or 64 KiB on aarch64. - /// - /// If transparent huge pages are enabled, this is still the regular page size, and all our bookkeeping - /// is still based on regular page size (e.g. pages_populated), because (a) it's cheap anyway, - /// and (b) I'm not sure if Linux guarantees that MADV_FREE reclamation always happens at huge page - /// granularity, and wouldn't want to rely on this even if it does. - size_t pageSize() const; - size_t chunkSize() const; - size_t maxChunks() const; - - struct MemoryStats - { - /// How many bytes of actual RAM are used for the cache pages. Doesn't include metadata - /// and overhead (e.g. PageChunk structs). - size_t page_cache_rss = 0; - /// Resident set size for the whole process, excluding any MADV_FREE pages (PageCache's or not). - /// This can be used as a more useful memory usage number for clickhouse server, instead of RSS. - /// Populated only if MADV_FREE is used, otherwise zero. - std::optional unreclaimable_rss; - }; - - /// Reads /proc/self/smaps, so not very fast. - MemoryStats getResidentSetSize() const; - - /// Total length of memory ranges currently pinned by PinnedPageChunk-s, including unpopulated pages. 
- size_t getPinnedSize() const; - - /// Clears the key -> chunk mapping. Frees memory (MADV_DONTNEED) of all chunks that are not pinned. - /// Doesn't unmap any virtual memory. Detaches but doesn't free the pinned chunks. - /// Locks the global mutex for the duration of the operation, which may block queries for hundreds of milliseconds. - void dropCache(); - -private: - friend class PinnedPageChunk; - - struct Mmap - { - void * ptr = nullptr; - size_t size = 0; - - std::unique_ptr chunks; - size_t num_chunks = 0; // might be smaller than chunks_per_mmap_target because of alignment - - Mmap(Mmap &&) noexcept; - Mmap(size_t bytes_per_page, size_t pages_per_chunk, size_t pages_per_big_page, size_t num_chunks, void * address_hint, bool use_huge_pages_); - ~Mmap() noexcept; - }; - - size_t bytes_per_page; - size_t pages_per_chunk; - size_t chunks_per_mmap_target; - size_t max_mmaps; - size_t pages_per_big_page = 1; // if huge pages are used, huge_page_size/page_size, otherwise 1 - bool use_madv_free = true; - bool use_huge_pages = true; - - mutable std::mutex global_mutex; - - pcg64 rng; - - std::vector mmaps; - size_t total_chunks = 0; - - /// All non-pinned chunks, including ones not assigned to any file. Least recently used is begin(). - boost::intrusive::list, boost::intrusive::constant_time_size> lru; - - HashMap chunk_by_key; - - /// Get a usable chunk, doing eviction or allocation if needed. - /// Caller is responsible for clearing pages_populated. - PageChunk * getFreeChunk(std::unique_lock & /* global_mutex */); - void addMmap(std::unique_lock & /* global_mutex */); - void evictChunk(PageChunk * chunk, std::unique_lock & /* global_mutex */); - - void removeRef(PageChunk * chunk) noexcept; - - /// These may run in parallel with getFreeChunk(), so be very careful about which fields of the PageChunk we touch here. - void sendChunkToLimbo(PageChunk * chunk, std::unique_lock & /* chunk_mutex */) const noexcept; - /// Returns {pages_restored, pages_evicted}. - std::pair restoreChunkFromLimbo(PageChunk * chunk, std::unique_lock & /* chunk_mutex */) const noexcept; -}; - -using PageCachePtr = std::shared_ptr; - -} diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp index 3a8659b8b27..d8ca1ab9e93 100644 --- a/src/Common/ProfileEvents.cpp +++ b/src/Common/ProfileEvents.cpp @@ -63,15 +63,6 @@ M(MarkCacheMisses, "Number of times an entry has not been found in the mark cache, so we had to load a mark file in memory, which is a costly operation, adding to query latency.") \ M(QueryCacheHits, "Number of times a query result has been found in the query cache (and query computation was avoided). Only updated for SELECT queries with SETTING use_query_cache = 1.") \ M(QueryCacheMisses, "Number of times a query result has not been found in the query cache (and required query computation). Only updated for SELECT queries with SETTING use_query_cache = 1.") \ - /* Each page cache chunk access increments exactly one of the following 5 PageCacheChunk* counters. */ \ - /* Something like hit rate: (PageCacheChunkShared + PageCacheChunkDataHits) / [sum of all 5]. 
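-       (Monitoring sketch, using the standard system.events table: \
-        SELECT event, value FROM system.events WHERE event LIKE 'PageCacheChunk%'.) \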
*/ \
-    M(PageCacheChunkMisses, "Number of times a chunk has not been found in the userspace page cache.") \
-    M(PageCacheChunkShared, "Number of times a chunk has been found in the userspace page cache, already in use by another thread.") \
-    M(PageCacheChunkDataHits, "Number of times a chunk has been found in the userspace page cache, not in use, with all pages intact.") \
-    M(PageCacheChunkDataPartialHits, "Number of times a chunk has been found in the userspace page cache, not in use, but some of its pages were evicted by the OS.") \
-    M(PageCacheChunkDataMisses, "Number of times a chunk has been found in the userspace page cache, not in use, but all its pages were evicted by the OS.") \
-    M(PageCacheBytesUnpinnedRoundedToPages, "Total size of populated pages in chunks that became evictable in PageCache. Rounded up to whole pages.") \
-    M(PageCacheBytesUnpinnedRoundedToHugePages, "See PageCacheBytesUnpinnedRoundedToPages, but rounded to huge pages. Use the ratio between the two as a measure of memory waste from using huge pages.") \
    M(CreatedReadBufferOrdinary, "Number of times ordinary read buffer was created for reading data (while choosing among other read methods).") \
    M(CreatedReadBufferDirectIO, "Number of times a read buffer with O_DIRECT was created for reading data (while choosing among other read methods).") \
    M(CreatedReadBufferDirectIOFailed, "Number of times a read buffer with O_DIRECT was attempted to be created for reading data (while choosing among other read methods), but the OS did not allow it (due to lack of filesystem support or other reasons) and we fell back to the ordinary reading method.") \

diff --git a/src/Core/Defines.h b/src/Core/Defines.h
index cc6f49aa361..bf9fb1db6bc 100644
--- a/src/Core/Defines.h
+++ b/src/Core/Defines.h
@@ -70,15 +70,6 @@ static constexpr auto DBMS_DEFAULT_MAX_QUERY_SIZE = 262144;
/// Max depth of hierarchical dictionary
static constexpr auto DBMS_HIERARCHICAL_DICTIONARY_MAX_DEPTH = 1000;

-#ifdef OS_LINUX
-#define DBMS_DEFAULT_PAGE_CACHE_USE_MADV_FREE true
-#else
-/// On macOS, MADV_FREE is not lazy, so page_cache_use_madv_free should be disabled.
-/// On FreeBSD, it may work, but we haven't tested it.
-#define DBMS_DEFAULT_PAGE_CACHE_USE_MADV_FREE false
-#endif
-
-
/// Default maximum (total and entry) sizes and policies of various caches
static constexpr auto DEFAULT_UNCOMPRESSED_CACHE_POLICY = "SLRU";
static constexpr auto DEFAULT_UNCOMPRESSED_CACHE_MAX_SIZE = 0_MiB;

diff --git a/src/Core/ServerSettings.h b/src/Core/ServerSettings.h
index a54fb42b464..3713d0c3206 100644
--- a/src/Core/ServerSettings.h
+++ b/src/Core/ServerSettings.h
@@ -65,7 +65,7 @@ namespace DB
    M(UInt64, max_concurrent_insert_queries, 0, "Maximum number of concurrently executed INSERT queries. Zero means unlimited.", 0) \
    M(UInt64, max_concurrent_select_queries, 0, "Maximum number of concurrently executed SELECT queries. Zero means unlimited.", 0) \
    \
-    M(Double, cache_size_to_ram_max_ratio, 0.5, "Set cache size to RAM max ratio. Allows to lower cache size on low-memory systems.", 0) \
+    M(Double, cache_size_to_ram_max_ratio, 0.5, "Set cache size to RAM max ratio. Allows lowering the cache size on low-memory systems.", 0) \
    M(String, uncompressed_cache_policy, DEFAULT_UNCOMPRESSED_CACHE_POLICY, "Uncompressed cache policy name.", 0) \
    M(UInt64, uncompressed_cache_size, DEFAULT_UNCOMPRESSED_CACHE_MAX_SIZE, "Size of cache for uncompressed blocks.
Zero means disabled.", 0) \
    M(Double, uncompressed_cache_size_ratio, DEFAULT_UNCOMPRESSED_CACHE_SIZE_RATIO, "The size of the protected queue in the uncompressed cache relative to the cache's total size.", 0) \
@@ -78,11 +78,6 @@ namespace DB
    M(String, index_mark_cache_policy, DEFAULT_INDEX_MARK_CACHE_POLICY, "Secondary index mark cache policy name.", 0) \
    M(UInt64, index_mark_cache_size, DEFAULT_INDEX_MARK_CACHE_MAX_SIZE, "Size of cache for secondary index marks. Zero means disabled.", 0) \
    M(Double, index_mark_cache_size_ratio, DEFAULT_INDEX_MARK_CACHE_SIZE_RATIO, "The size of the protected queue in the secondary index mark cache relative to the cache's total size.", 0) \
-    M(UInt64, page_cache_chunk_size, 2 << 20, "Bytes per chunk in userspace page cache. Rounded up to a multiple of page size (typically 4 KiB) or huge page size (typically 2 MiB, only if page_cache_use_transparent_huge_pages is enabled).", 0) \
-    M(UInt64, page_cache_mmap_size, 1 << 30, "Bytes per memory mapping in userspace page cache. Not important.", 0) \
-    M(UInt64, page_cache_size, 10ul << 30, "Amount of virtual memory to map for userspace page cache. If page_cache_use_madv_free is enabled, it's recommended to set this higher than the machine's RAM size. Use 0 to disable userspace page cache.", 0) \
-    M(Bool, page_cache_use_madv_free, DBMS_DEFAULT_PAGE_CACHE_USE_MADV_FREE, "If true, the userspace page cache will allow the OS to automatically reclaim memory from the cache on memory pressure (using MADV_FREE).", 0) \
-    M(Bool, page_cache_use_transparent_huge_pages, true, "The userspace page cache will attempt to use transparent huge pages on Linux. This is best-effort.", 0) \
    M(UInt64, mmap_cache_size, DEFAULT_MMAP_CACHE_MAX_SIZE, "A cache for mmapped files.", 0) \
    \
    M(Bool, disable_internal_dns_cache, false, "Disable internal DNS caching entirely.", 0) \

diff --git a/src/Core/Settings.h b/src/Core/Settings.h
index 7d1112af3a7..ae6ea165cc9 100644
--- a/src/Core/Settings.h
+++ b/src/Core/Settings.h
@@ -777,10 +777,6 @@ class IColumn;
    M(Bool, throw_on_error_from_cache_on_write_operations, false, "Ignore error from cache when caching on write operations (INSERT, merges)", 0) \
    M(UInt64, filesystem_cache_segments_batch_size, 20, "Limit on the size of a single batch of file segments that a read buffer can request from the cache. Too low a value will lead to excessive requests to the cache; too large a value may slow down eviction from the cache", 0) \
    \
-    M(Bool, use_page_cache_for_disks_without_file_cache, false, "Use userspace page cache for remote disks that don't have filesystem cache enabled.", 0) \
-    M(Bool, read_from_page_cache_if_exists_otherwise_bypass_cache, false, "Use userspace page cache in passive mode, similar to read_from_filesystem_cache_if_exists_otherwise_bypass_cache.", 0) \
-    M(Bool, page_cache_inject_eviction, false, "Userspace page cache will sometimes invalidate some pages at random. Intended for testing.", 0) \
-    \
    M(Bool, load_marks_asynchronously, false, "Load MergeTree marks asynchronously", 0) \
    M(Bool, enable_filesystem_read_prefetches_log, false, "Log to system.filesystem_prefetch_log during query.
Should be used only for testing or debugging, not recommended to be turned on by default", 0) \
    M(Bool, allow_prefetched_read_pool_for_remote_filesystem, true, "Prefer the prefetched threadpool if all parts are on a remote filesystem", 0) \

diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h
index 02ee641903c..e8d013d13ec 100644
--- a/src/Core/SettingsChangesHistory.h
+++ b/src/Core/SettingsChangesHistory.h
@@ -114,9 +114,6 @@ static std::map sett
    {"default_normal_view_sql_security", "INVOKER", "INVOKER", "Allows to set default `SQL SECURITY` option while creating a normal view"},
    {"mysql_map_string_to_text_in_show_columns", false, true, "Reduce the configuration effort to connect ClickHouse with BI tools."},
    {"mysql_map_fixed_string_to_text_in_show_columns", false, true, "Reduce the configuration effort to connect ClickHouse with BI tools."},
-    {"use_page_cache_for_disks_without_file_cache", false, false, "Added userspace page cache"},
-    {"read_from_page_cache_if_exists_otherwise_bypass_cache", false, false, "Added userspace page cache"},
-    {"page_cache_inject_eviction", false, false, "Added userspace page cache"},
    }},
    {"24.1", {{"print_pretty_type_names", false, true, "Better user experience."},
    {"input_format_json_read_bools_as_strings", false, true, "Allow to read bools as strings in JSON formats by default"},

diff --git a/src/Disks/IO/AsynchronousBoundedReadBuffer.cpp b/src/Disks/IO/AsynchronousBoundedReadBuffer.cpp
index 1a9cd2c994c..2373640704b 100644
--- a/src/Disks/IO/AsynchronousBoundedReadBuffer.cpp
+++ b/src/Disks/IO/AsynchronousBoundedReadBuffer.cpp
@@ -129,7 +129,6 @@ void AsynchronousBoundedReadBuffer::setReadUntilPosition(size_t position)
        /// new read until position is after the current position in the working buffer
        file_offset_of_buffer_end = position;
        working_buffer.resize(working_buffer.size() - (file_offset_of_buffer_end - position));
-        pos = std::min(pos, working_buffer.end());
    }
    else
    {
@@ -236,6 +235,9 @@ bool AsynchronousBoundedReadBuffer::nextImpl()

    file_offset_of_buffer_end = impl->getFileOffsetOfBufferEnd();

+    /// When one ClickHouse file is stored as multiple files in the remote fs (e.g. the Log family of engines),
+    /// file_offset_of_buffer_end will not match getImplementationBufferOffset(),
+    /// so we use [impl->getImplementationBufferOffset(), impl->getFileSize()].
    chassert(file_offset_of_buffer_end <= impl->getFileSize());

    if (read_until_position && (file_offset_of_buffer_end > *read_until_position))
@@ -262,7 +264,7 @@ off_t AsynchronousBoundedReadBuffer::seek(off_t offset, int whence)
    size_t new_pos;
    if (whence == SEEK_SET)
    {
-        chassert(offset >= 0);
+        assert(offset >= 0);
        new_pos = offset;
    }
    else if (whence == SEEK_CUR)
@@ -288,8 +290,8 @@ off_t AsynchronousBoundedReadBuffer::seek(off_t offset, int whence)
        /// Position is still inside the buffer.
        /// Probably it is at the end of the buffer - then we will load data on the following 'next' call.
        pos = working_buffer.end() - file_offset_of_buffer_end + new_pos;
-        chassert(pos >= working_buffer.begin());
-        chassert(pos <= working_buffer.end());
+        assert(pos >= working_buffer.begin());
+        assert(pos <= working_buffer.end());

        return new_pos;
    }
@@ -315,7 +317,7 @@ off_t AsynchronousBoundedReadBuffer::seek(off_t offset, int whence)
        break;
    }

-    chassert(!prefetch_future.valid());
+    assert(!prefetch_future.valid());

    /// First reset the buffer so the next read will fetch new data to the buffer.
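    /// (After the reset the working buffer is empty, so the subsequent nextImpl() call reads
    /// from impl at the new file_offset_of_buffer_end instead of serving stale bytes.)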
    resetWorkingBuffer();

diff --git a/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp b/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp
index 47ee5858562..7ce3d58dcd8 100644
--- a/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp
+++ b/src/Disks/IO/CachedOnDiskReadBufferFromFile.cpp
@@ -1215,7 +1215,7 @@ size_t CachedOnDiskReadBufferFromFile::getRemainingSizeToRead()

void CachedOnDiskReadBufferFromFile::setReadUntilPosition(size_t position)
{
-    if (initialized && !allow_seeks_after_first_read)
+    if (!allow_seeks_after_first_read)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Method `setReadUntilPosition()` not allowed");

    if (read_until_position == position)

diff --git a/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp b/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp
index 417f7615dd7..0b3ecca3587 100644
--- a/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp
+++ b/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp
@@ -5,7 +5,6 @@
#include
#include
#include
-#include
#include
#include
#include
@@ -17,16 +16,12 @@ using namespace DB;

namespace
{

-bool withFileCache(const ReadSettings & settings)
+bool withCache(const ReadSettings & settings)
{
    return settings.remote_fs_cache && settings.enable_filesystem_cache
        && (!CurrentThread::getQueryId().empty()
            || settings.read_from_filesystem_cache_if_exists_otherwise_bypass_cache
            || !settings.avoid_readthrough_cache_outside_query_context);
}

-bool withPageCache(const ReadSettings & settings, bool with_file_cache)
-{
-    return settings.page_cache && !with_file_cache && settings.use_page_cache_for_disks_without_file_cache;
-}
}

namespace DB
{

namespace ErrorCodes
{

size_t chooseBufferSizeForRemoteReading(const DB::ReadSettings & settings, size_t file_size)
{
    /// Only when the cache is used can we download bigger portions of FileSegments than we are actually going to read within a particular task.
-    if (!withFileCache(settings))
+    if (!withCache(settings))
        return settings.remote_fs_buffer_size;

    /// Buffers used for prefetch and pre-download should be large enough, but not bigger than the whole file.
@@ -49,30 +44,27 @@ size_t chooseBufferSizeForRemoteReading(const DB::ReadSettings & settings, size_

ReadBufferFromRemoteFSGather::ReadBufferFromRemoteFSGather(
    ReadBufferCreator && read_buffer_creator_,
    const StoredObjects & blobs_to_read_,
-    const std::string & cache_path_prefix_,
    const ReadSettings & settings_,
    std::shared_ptr cache_log_,
    bool use_external_buffer_)
-    : ReadBufferFromFileBase(use_external_buffer_ ? 0 : chooseBufferSizeForRemoteReading(
-        settings_, getTotalSize(blobs_to_read_)), nullptr, 0)
+    : ReadBufferFromFileBase(
+        use_external_buffer_ ? 0 : chooseBufferSizeForRemoteReading(settings_, getTotalSize(blobs_to_read_)), nullptr, 0)
    , settings(settings_)
    , blobs_to_read(blobs_to_read_)
    , read_buffer_creator(std::move(read_buffer_creator_))
-    , cache_path_prefix(cache_path_prefix_)
    , cache_log(settings.enable_filesystem_cache_log ?
cache_log_ : nullptr) , query_id(CurrentThread::getQueryId()) , use_external_buffer(use_external_buffer_) - , with_file_cache(withFileCache(settings)) - , with_page_cache(withPageCache(settings, with_file_cache)) + , with_cache(withCache(settings)) , log(getLogger("ReadBufferFromRemoteFSGather")) { if (!blobs_to_read.empty()) current_object = blobs_to_read.front(); } -SeekableReadBufferPtr ReadBufferFromRemoteFSGather::createImplementationBuffer(const StoredObject & object, size_t start_offset) +SeekableReadBufferPtr ReadBufferFromRemoteFSGather::createImplementationBuffer(const StoredObject & object) { - if (current_buf && !with_file_cache) + if (current_buf && !with_cache) { appendUncachedReadInfo(); } @@ -80,45 +72,30 @@ SeekableReadBufferPtr ReadBufferFromRemoteFSGather::createImplementationBuffer(c current_object = object; const auto & object_path = object.remote_path; - std::unique_ptr buf; + size_t current_read_until_position = read_until_position ? read_until_position : object.bytes_size; + auto current_read_buffer_creator = [=, this]() { return read_buffer_creator(object_path, current_read_until_position); }; #ifndef CLICKHOUSE_KEEPER_STANDALONE_BUILD - if (with_file_cache) + if (with_cache) { auto cache_key = settings.remote_fs_cache->createKeyForPath(object_path); - buf = std::make_unique( + return std::make_shared( object_path, cache_key, settings.remote_fs_cache, FileCache::getCommonUser(), - [=, this]() { return read_buffer_creator(/* restricted_seek */true, object_path); }, + std::move(current_read_buffer_creator), settings, query_id, object.bytes_size, /* allow_seeks */false, /* use_external_buffer */true, - /* read_until_position */std::nullopt, + read_until_position ? std::optional(read_until_position) : std::nullopt, cache_log); } #endif - /// Can't wrap CachedOnDiskReadBufferFromFile in CachedInMemoryReadBufferFromFile because the - /// former doesn't support seeks. - if (with_page_cache && !buf) - { - auto inner = read_buffer_creator(/* restricted_seek */false, object_path); - auto cache_key = FileChunkAddress { .path = cache_path_prefix + object_path }; - buf = std::make_unique( - cache_key, settings.page_cache, std::move(inner), settings); - } - - if (!buf) - buf = read_buffer_creator(/* restricted_seek */true, object_path); - - if (read_until_position > start_offset && read_until_position < start_offset + object.bytes_size) - buf->setReadUntilPosition(read_until_position - start_offset); - - return buf; + return current_read_buffer_creator(); } void ReadBufferFromRemoteFSGather::appendUncachedReadInfo() @@ -147,12 +124,12 @@ void ReadBufferFromRemoteFSGather::initialize() return; /// One clickhouse file can be split into multiple files in remote fs. 
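    /// Worked example (hypothetical sizes): with blobs of 3, 4 and 5 bytes and
    /// file_offset_of_buffer_end = 8, the loop below settles on the third blob and
    /// seeks to local offset 8 - (3 + 4) = 1.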
- size_t start_offset = 0; + auto current_buf_offset = file_offset_of_buffer_end; for (size_t i = 0; i < blobs_to_read.size(); ++i) { const auto & object = blobs_to_read[i]; - if (start_offset + object.bytes_size > file_offset_of_buffer_end) + if (object.bytes_size > current_buf_offset) { LOG_TEST(log, "Reading from file: {} ({})", object.remote_path, object.local_path); @@ -160,14 +137,14 @@ void ReadBufferFromRemoteFSGather::initialize() if (!current_buf || current_buf_idx != i) { current_buf_idx = i; - current_buf = createImplementationBuffer(object, start_offset); + current_buf = createImplementationBuffer(object); } - current_buf->seek(file_offset_of_buffer_end - start_offset, SEEK_SET); + current_buf->seek(current_buf_offset, SEEK_SET); return; } - start_offset += object.bytes_size; + current_buf_offset -= object.bytes_size; } current_buf_idx = blobs_to_read.size(); current_buf = nullptr; @@ -194,14 +171,14 @@ bool ReadBufferFromRemoteFSGather::nextImpl() bool ReadBufferFromRemoteFSGather::moveToNextBuffer() { /// If there is no available buffers - nothing to read. - if (current_buf_idx + 1 >= blobs_to_read.size() || (read_until_position && file_offset_of_buffer_end >= read_until_position)) + if (current_buf_idx + 1 >= blobs_to_read.size()) return false; ++current_buf_idx; const auto & object = blobs_to_read[current_buf_idx]; LOG_TEST(log, "Reading from next file: {} ({})", object.remote_path, object.local_path); - current_buf = createImplementationBuffer(object, file_offset_of_buffer_end); + current_buf = createImplementationBuffer(object); return true; } @@ -286,7 +263,7 @@ off_t ReadBufferFromRemoteFSGather::seek(off_t offset, int whence) ReadBufferFromRemoteFSGather::~ReadBufferFromRemoteFSGather() { - if (!with_file_cache) + if (!with_cache) appendUncachedReadInfo(); } diff --git a/src/Disks/IO/ReadBufferFromRemoteFSGather.h b/src/Disks/IO/ReadBufferFromRemoteFSGather.h index 8362b354e23..f6b7506a54f 100644 --- a/src/Disks/IO/ReadBufferFromRemoteFSGather.h +++ b/src/Disks/IO/ReadBufferFromRemoteFSGather.h @@ -21,12 +21,11 @@ class ReadBufferFromRemoteFSGather final : public ReadBufferFromFileBase friend class ReadIndirectBufferFromRemoteFS; public: - using ReadBufferCreator = std::function(bool restricted_seek, const std::string & path)>; + using ReadBufferCreator = std::function(const std::string & path, size_t read_until_position)>; ReadBufferFromRemoteFSGather( ReadBufferCreator && read_buffer_creator_, const StoredObjects & blobs_to_read_, - const std::string & cache_path_prefix_, const ReadSettings & settings_, std::shared_ptr cache_log_, bool use_external_buffer_); @@ -54,7 +53,7 @@ public: bool isContentCached(size_t offset, size_t size) override; private: - SeekableReadBufferPtr createImplementationBuffer(const StoredObject & object, size_t start_offset); + SeekableReadBufferPtr createImplementationBuffer(const StoredObject & object); bool nextImpl() override; @@ -71,12 +70,10 @@ private: const ReadSettings settings; const StoredObjects blobs_to_read; const ReadBufferCreator read_buffer_creator; - const std::string cache_path_prefix; const std::shared_ptr cache_log; const String query_id; const bool use_external_buffer; - const bool with_file_cache; - const bool with_page_cache; + const bool with_cache; size_t read_until_position = 0; size_t file_offset_of_buffer_end = 0; diff --git a/src/Disks/IO/ThreadPoolRemoteFSReader.cpp b/src/Disks/IO/ThreadPoolRemoteFSReader.cpp index 590fc4c4656..f3caf62ffd5 100644 --- a/src/Disks/IO/ThreadPoolRemoteFSReader.cpp +++ 
b/src/Disks/IO/ThreadPoolRemoteFSReader.cpp @@ -152,8 +152,6 @@ IAsynchronousReader::Result ThreadPoolRemoteFSReader::execute(Request request, b IAsynchronousReader::Result read_result; if (result) { - chassert(reader.buffer().begin() == request.buf); - chassert(reader.buffer().end() <= request.buf + request.size); read_result.size = reader.buffer().size(); read_result.offset = reader.offset(); ProfileEvents::increment(ProfileEvents::ThreadpoolReaderReadBytes, read_result.size); diff --git a/src/Disks/IO/ThreadPoolRemoteFSReader.h b/src/Disks/IO/ThreadPoolRemoteFSReader.h index eacce5a54ac..abc251b2b10 100644 --- a/src/Disks/IO/ThreadPoolRemoteFSReader.h +++ b/src/Disks/IO/ThreadPoolRemoteFSReader.h @@ -29,9 +29,6 @@ private: class RemoteFSFileDescriptor : public IAsynchronousReader::IFileDescriptor { public: - /// `reader_` implementation must ensure that next() places data at the start of internal_buffer, - /// even if there was previously a seek. I.e. seek() shouldn't leave pending data (no short seek - /// optimization), and nextImpl() shouldn't assign nextimpl_working_buffer_offset. explicit RemoteFSFileDescriptor( SeekableReadBuffer & reader_, std::shared_ptr async_read_counters_) diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp index 136f69ab729..74389aedb64 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp @@ -206,7 +206,7 @@ std::unique_ptr AzureObjectStorage::readObjects( /// NOL auto read_buffer_creator = [this, settings_ptr, disk_read_settings] - (bool restricted_seek, const std::string & path) -> std::unique_ptr + (const std::string & path, size_t read_until_position) -> std::unique_ptr { return std::make_unique( client.get(), @@ -215,7 +215,8 @@ std::unique_ptr AzureObjectStorage::readObjects( /// NOL settings_ptr->max_single_read_retries, settings_ptr->max_single_download_retries, /* use_external_buffer */true, - restricted_seek); + /* restricted_seek */true, + read_until_position); }; switch (read_settings.remote_fs_method) @@ -225,17 +226,16 @@ std::unique_ptr AzureObjectStorage::readObjects( /// NOL return std::make_unique( std::move(read_buffer_creator), objects, - "azure:", disk_read_settings, global_context->getFilesystemCacheLog(), /* use_external_buffer */false); + } case RemoteFSReadMethod::threadpool: { auto impl = std::make_unique( std::move(read_buffer_creator), objects, - "azure:", disk_read_settings, global_context->getFilesystemCacheLog(), /* use_external_buffer */true); diff --git a/src/Disks/ObjectStorages/DiskObjectStorage.cpp b/src/Disks/ObjectStorages/DiskObjectStorage.cpp index 16183ec20c1..2a648f28f14 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorage.cpp +++ b/src/Disks/ObjectStorages/DiskObjectStorage.cpp @@ -527,9 +527,10 @@ std::unique_ptr DiskObjectStorage::readFile( std::optional read_hint, std::optional file_size) const { - const auto storage_objects = metadata_storage->getStorageObjects(path); + auto storage_objects = metadata_storage->getStorageObjects(path); const bool file_can_be_empty = !file_size.has_value() || *file_size == 0; + if (storage_objects.empty() && file_can_be_empty) return std::make_unique(); diff --git a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp index f8545ecfe39..fa5e227d853 100644 --- a/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp +++ 
b/src/Disks/ObjectStorages/HDFS/HDFSObjectStorage.cpp @@ -60,7 +60,7 @@ std::unique_ptr HDFSObjectStorage::readObjects( /// NOLI auto disk_read_settings = patchSettings(read_settings); auto read_buffer_creator = [this, disk_read_settings] - (bool /* restricted_seek */, const std::string & path) -> std::unique_ptr + (const std::string & path, size_t /* read_until_position */) -> std::unique_ptr { size_t begin_of_path = path.find('/', path.find("//") + 2); auto hdfs_path = path.substr(begin_of_path); @@ -71,7 +71,7 @@ std::unique_ptr HDFSObjectStorage::readObjects( /// NOLI }; return std::make_unique( - std::move(read_buffer_creator), objects, "hdfs:", disk_read_settings, nullptr, /* use_external_buffer */false); + std::move(read_buffer_creator), objects, disk_read_settings, nullptr, /* use_external_buffer */false); } std::unique_ptr HDFSObjectStorage::writeObject( /// NOLINT diff --git a/src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp b/src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp index 7fd4536f266..02700b358e0 100644 --- a/src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp +++ b/src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp @@ -47,7 +47,7 @@ std::unique_ptr LocalObjectStorage::readObjects( /// NOL auto modified_settings = patchSettings(read_settings); auto global_context = Context::getGlobalContextInstance(); auto read_buffer_creator = - [=] (bool /* restricted_seek */, const std::string & file_path) + [=] (const std::string & file_path, size_t /* read_until_position */) -> std::unique_ptr { return createReadBufferFromFileBase(file_path, modified_settings, read_hint, file_size); @@ -58,13 +58,13 @@ std::unique_ptr LocalObjectStorage::readObjects( /// NOL case RemoteFSReadMethod::read: { return std::make_unique( - std::move(read_buffer_creator), objects, "file:", modified_settings, + std::move(read_buffer_creator), objects, modified_settings, global_context->getFilesystemCacheLog(), /* use_external_buffer */false); } case RemoteFSReadMethod::threadpool: { auto impl = std::make_unique( - std::move(read_buffer_creator), objects, "file:", modified_settings, + std::move(read_buffer_creator), objects, modified_settings, global_context->getFilesystemCacheLog(), /* use_external_buffer */true); auto & reader = global_context->getThreadPoolReader(FilesystemReaderType::ASYNCHRONOUS_REMOTE_FS_READER); diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index d89c7c93e51..5771eb1ebe0 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -171,7 +171,7 @@ std::unique_ptr S3ObjectStorage::readObjects( /// NOLINT auto read_buffer_creator = [this, settings_ptr, disk_read_settings] - (bool restricted_seek, const std::string & path) -> std::unique_ptr + (const std::string & path, size_t read_until_position) -> std::unique_ptr { return std::make_unique( client.get(), @@ -182,8 +182,8 @@ std::unique_ptr S3ObjectStorage::readObjects( /// NOLINT disk_read_settings, /* use_external_buffer */true, /* offset */0, - /* read_until_position */0, - restricted_seek); + read_until_position, + /* restricted_seek */true); }; switch (read_settings.remote_fs_method) @@ -193,17 +193,16 @@ std::unique_ptr S3ObjectStorage::readObjects( /// NOLINT return std::make_unique( std::move(read_buffer_creator), objects, - "s3:" + uri.bucket + "/", disk_read_settings, global_context->getFilesystemCacheLog(), /* use_external_buffer */false); + } case RemoteFSReadMethod::threadpool: { auto impl 
= std::make_unique( std::move(read_buffer_creator), objects, - "s3:" + uri.bucket + "/", disk_read_settings, global_context->getFilesystemCacheLog(), /* use_external_buffer */true); diff --git a/src/Disks/ObjectStorages/Web/WebObjectStorage.cpp b/src/Disks/ObjectStorages/Web/WebObjectStorage.cpp index 48de0bf4168..786b23caf48 100644 --- a/src/Disks/ObjectStorages/Web/WebObjectStorage.cpp +++ b/src/Disks/ObjectStorages/Web/WebObjectStorage.cpp @@ -252,13 +252,14 @@ std::unique_ptr WebObjectStorage::readObject( /// NOLINT { auto read_buffer_creator = [this, read_settings] - (bool /* restricted_seek */, const std::string & path_) -> std::unique_ptr + (const std::string & path_, size_t read_until_position) -> std::unique_ptr { return std::make_unique( fs::path(url) / path_, getContext(), read_settings, - /* use_external_buffer */true); + /* use_external_buffer */true, + read_until_position); }; auto global_context = Context::getGlobalContextInstance(); @@ -270,7 +271,6 @@ std::unique_ptr WebObjectStorage::readObject( /// NOLINT return std::make_unique( std::move(read_buffer_creator), StoredObjects{object}, - "url:" + url + "/", read_settings, global_context->getFilesystemCacheLog(), /* use_external_buffer */false); @@ -280,7 +280,6 @@ std::unique_ptr WebObjectStorage::readObject( /// NOLINT auto impl = std::make_unique( std::move(read_buffer_creator), StoredObjects{object}, - "url:" + url + "/", read_settings, global_context->getFilesystemCacheLog(), /* use_external_buffer */true); diff --git a/src/IO/AsynchronousReader.h b/src/IO/AsynchronousReader.h index f9590b4419f..279a399caad 100644 --- a/src/IO/AsynchronousReader.h +++ b/src/IO/AsynchronousReader.h @@ -54,9 +54,6 @@ public: struct Result { - /// The read data is at [buf + offset, buf + size), where `buf` is from Request struct. - /// (Notice that `offset` is included in `size`.) - /// size /// Less than requested amount of data can be returned. /// If size is zero - the file has ended. diff --git a/src/IO/BufferBase.h b/src/IO/BufferBase.h index 1a087dd87fa..4c0a467b155 100644 --- a/src/IO/BufferBase.h +++ b/src/IO/BufferBase.h @@ -60,9 +60,6 @@ public: BufferBase(Position ptr, size_t size, size_t offset) : pos(ptr + offset), working_buffer(ptr, ptr + size), internal_buffer(ptr, ptr + size) {} - /// Assign the buffers and pos. - /// Be careful when calling this from ReadBuffer::nextImpl() implementations: `offset` is - /// effectively ignored because ReadBuffer::next() reassigns `pos`. 
void set(Position ptr, size_t size, size_t offset) { internal_buffer = Buffer(ptr, ptr + size); diff --git a/src/IO/CachedInMemoryReadBufferFromFile.cpp b/src/IO/CachedInMemoryReadBufferFromFile.cpp deleted file mode 100644 index 384d2229f14..00000000000 --- a/src/IO/CachedInMemoryReadBufferFromFile.cpp +++ /dev/null @@ -1,188 +0,0 @@ -#include "CachedInMemoryReadBufferFromFile.h" -#include -#include -#include - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int UNEXPECTED_END_OF_FILE; - extern const int CANNOT_SEEK_THROUGH_FILE; - extern const int SEEK_POSITION_OUT_OF_BOUND; -} - -CachedInMemoryReadBufferFromFile::CachedInMemoryReadBufferFromFile( - FileChunkAddress cache_key_, PageCachePtr cache_, std::unique_ptr in_, const ReadSettings & settings_) - : ReadBufferFromFileBase(0, nullptr, 0, in_->getFileSize()), cache_key(cache_key_), cache(cache_), settings(settings_), in(std::move(in_)) - , read_until_position(file_size.value()) -{ - cache_key.offset = 0; -} - -String CachedInMemoryReadBufferFromFile::getFileName() const -{ - return in->getFileName(); -} - -off_t CachedInMemoryReadBufferFromFile::seek(off_t off, int whence) -{ - if (whence != SEEK_SET) - throw Exception(ErrorCodes::CANNOT_SEEK_THROUGH_FILE, "Only SEEK_SET mode is allowed."); - - size_t offset = static_cast(off); - if (offset > file_size.value()) - throw Exception(ErrorCodes::SEEK_POSITION_OUT_OF_BOUND, "Seek position is out of bounds. Offset: {}", off); - - if (offset >= file_offset_of_buffer_end - working_buffer.size() && offset <= file_offset_of_buffer_end) - { - pos = working_buffer.end() - (file_offset_of_buffer_end - offset); - chassert(getPosition() == off); - return off; - } - - resetWorkingBuffer(); - - file_offset_of_buffer_end = offset; - chunk.reset(); - - chassert(getPosition() == off); - return off; -} - -off_t CachedInMemoryReadBufferFromFile::getPosition() -{ - return file_offset_of_buffer_end - available(); -} - -size_t CachedInMemoryReadBufferFromFile::getFileOffsetOfBufferEnd() const -{ - return file_offset_of_buffer_end; -} - -void CachedInMemoryReadBufferFromFile::setReadUntilPosition(size_t position) -{ - read_until_position = position; - if (position < static_cast(getPosition())) - { - resetWorkingBuffer(); - chunk.reset(); - } - else if (position < file_offset_of_buffer_end) - { - size_t diff = file_offset_of_buffer_end - position; - working_buffer.resize(working_buffer.size() - diff); - file_offset_of_buffer_end -= diff; - } -} - -void CachedInMemoryReadBufferFromFile::setReadUntilEnd() -{ - setReadUntilPosition(file_size.value()); -} - -bool CachedInMemoryReadBufferFromFile::nextImpl() -{ - chassert(read_until_position <= file_size.value()); - if (file_offset_of_buffer_end >= read_until_position) - return false; - - if (chunk.has_value() && file_offset_of_buffer_end >= cache_key.offset + cache->chunkSize()) - { - chassert(file_offset_of_buffer_end == cache_key.offset + cache->chunkSize()); - chunk.reset(); - } - - if (!chunk.has_value()) - { - cache_key.offset = file_offset_of_buffer_end / cache->chunkSize() * cache->chunkSize(); - chunk = cache->getOrSet(cache_key.hash(), settings.read_from_page_cache_if_exists_otherwise_bypass_cache, settings.page_cache_inject_eviction); - - size_t chunk_size = std::min(cache->chunkSize(), file_size.value() - cache_key.offset); - - std::unique_lock download_lock(chunk->getChunk()->state.download_mutex); - - if (!chunk->isPrefixPopulated(chunk_size)) - { - /// A few things could be improved here, which may or may not be worth the added complexity: - 
/// * If the next file chunk is in cache, use in->setReadUntilPosition() to limit the read to - /// just one chunk. More generally, look ahead in the cache to count how many next chunks - /// need to be downloaded. (Up to some limit? And avoid changing `in`'s until-position if - /// it's already reasonable; otherwise we'd increase it by one chunk every chunk, discarding - /// a half-completed HTTP request every time.) - /// * If only a subset of pages are missing from this chunk, download only them, - /// with some threshold for avoiding short seeks. - /// In particular, if a previous download failed in the middle of the chunk, we could - /// resume from that position instead of from the beginning of the chunk. - /// (It's also possible in principle that a proper subset of chunk's pages was reclaimed - /// by the OS. But, for performance purposes, we should completely ignore that, because - /// (a) PageCache normally uses 2 MiB transparent huge pages and has just one such page - /// per chunk, and (b) even with 4 KiB pages partial chunk eviction is extremely rare.) - /// * If our [position, read_until_position) covers only part of the chunk, we could download - /// just that part. (Which would be bad if someone else needs the rest of the chunk and has - /// to do a whole new HTTP request to get it. Unclear what the policy should be.) - /// * Instead of doing in->next() in a loop until we get the whole chunk, we could return the - /// results as soon as in->next() produces them. - /// (But this would make the download_mutex situation much more complex, similar to the - /// FileSegment::State::PARTIALLY_DOWNLOADED and FileSegment::setRemoteFileReader() stuff.) - - Buffer prev_in_buffer = in->internalBuffer(); - SCOPE_EXIT({ in->set(prev_in_buffer.begin(), prev_in_buffer.size()); }); - - size_t pos = 0; - while (pos < chunk_size) - { - char * piece_start = chunk->getChunk()->data + pos; - size_t piece_size = chunk_size - pos; - in->set(piece_start, piece_size); - LOG_INFO(&Poco::Logger::get("asdqwe"), "this {:x}, in {:x}, path {}, size {}, offset {:x}, pos {:x}", reinterpret_cast(this), reinterpret_cast(in.get()), cache_key.path, file_size.value(), cache_key.offset, pos); - if (pos == 0) - in->seek(cache_key.offset, SEEK_SET); - else - chassert(!in->available()); - - if (in->eof()) - throw Exception(ErrorCodes::UNEXPECTED_END_OF_FILE, "File {} ended after {} bytes, but we expected {}", - getFileName(), cache_key.offset + pos, file_size.value()); - - chassert(in->position() >= piece_start && in->buffer().end() <= piece_start + piece_size); - chassert(in->getPosition() == static_cast(cache_key.offset + pos)); - - size_t n = in->available(); - chassert(n); - if (in->position() != piece_start) - memmove(piece_start, in->position(), n); - in->position() += n; - pos += n; - LOG_INFO(&Poco::Logger::get("asdqwe"), "this {:x}, got {:x} bytes", reinterpret_cast(this), n); - } - - chunk->markPrefixPopulated(chunk_size); - } - } - - nextimpl_working_buffer_offset = file_offset_of_buffer_end - cache_key.offset; - working_buffer = Buffer( - chunk->getChunk()->data, - chunk->getChunk()->data + std::min(chunk->getChunk()->size, read_until_position - cache_key.offset)); - pos = working_buffer.begin() + nextimpl_working_buffer_offset; - - if (!internal_buffer.empty()) - { - /// We were given an external buffer to read into. Copy the data into it. 
- /// Would be nice to avoid this copy, somehow, maybe by making ReadBufferFromRemoteFSGather - /// and AsynchronousBoundedReadBuffer explicitly aware of the page cache. - size_t n = std::min(available(), internal_buffer.size()); - memcpy(internal_buffer.begin(), pos, n); - working_buffer = Buffer(internal_buffer.begin(), internal_buffer.begin() + n); - pos = working_buffer.begin(); - nextimpl_working_buffer_offset = 0; - } - - file_offset_of_buffer_end += available(); - - return true; -} - -} diff --git a/src/IO/CachedInMemoryReadBufferFromFile.h b/src/IO/CachedInMemoryReadBufferFromFile.h deleted file mode 100644 index 300c2e82386..00000000000 --- a/src/IO/CachedInMemoryReadBufferFromFile.h +++ /dev/null @@ -1,41 +0,0 @@ -#pragma once - -#include -#include - -namespace DB -{ - -class CachedInMemoryReadBufferFromFile : public ReadBufferFromFileBase -{ -public: - /// `in_` must support using external buffer. I.e. we assign its internal_buffer before each next() - /// call and expect the read data to be put into that buffer. - /// `in_` should be seekable and should be able to read the whole file from 0 to in_->getFileSize(); - /// if you set `in_`'s read-until-position bypassing CachedInMemoryReadBufferFromFile then - /// CachedInMemoryReadBufferFromFile will break. - CachedInMemoryReadBufferFromFile(FileChunkAddress cache_key_, PageCachePtr cache_, std::unique_ptr in_, const ReadSettings & settings_); - - String getFileName() const override; - off_t seek(off_t off, int whence) override; - off_t getPosition() override; - size_t getFileOffsetOfBufferEnd() const override; - bool supportsRightBoundedReads() const override { return true; } - void setReadUntilPosition(size_t position) override; - void setReadUntilEnd() override; - -private: - FileChunkAddress cache_key; // .offset is offset of `chunk` start - PageCachePtr cache; - ReadSettings settings; - std::unique_ptr in; - - size_t file_offset_of_buffer_end = 0; - size_t read_until_position; - - std::optional chunk; - - bool nextImpl() override; -}; - -} diff --git a/src/IO/ReadBuffer.h b/src/IO/ReadBuffer.h index 00325734354..b45bc8f3dbc 100644 --- a/src/IO/ReadBuffer.h +++ b/src/IO/ReadBuffer.h @@ -225,22 +225,11 @@ public: * - seek() to a position above the until position (even if you setReadUntilPosition() to a * higher value right after the seek!), * - * Implementations are recommended to: - * - Allow the read-until-position to go below current position, e.g.: - * // Read block [300, 400) - * setReadUntilPosition(400); - * seek(300); - * next(); - * // Read block [100, 200) - * setReadUntilPosition(200); // oh oh, this is below the current position, but should be allowed - * seek(100); // but now everything's fine again - * next(); - * // (Swapping the order of seek and setReadUntilPosition doesn't help: then it breaks if the order of blocks is reversed.) - * - Check if new read-until-position value is equal to the current value and do nothing in this case, - * so that the caller doesn't have to. + * Typical implementations discard any current buffers and connections, even if the position is + * adjusted only a little. * - * Typical implementations discard any current buffers and connections when the - * read-until-position changes even by a small (nonzero) amount. + * Typical usage is to call it right after creating the ReadBuffer, before it started doing any + * work. 
*/ virtual void setReadUntilPosition(size_t /* position */) {} diff --git a/src/IO/ReadSettings.h b/src/IO/ReadSettings.h index f4dc7880be4..c397689d6ad 100644 --- a/src/IO/ReadSettings.h +++ b/src/IO/ReadSettings.h @@ -61,7 +61,6 @@ enum class RemoteFSReadMethod }; class MMappedFileCache; -class PageCache; struct ReadSettings { @@ -103,12 +102,6 @@ struct ReadSettings bool avoid_readthrough_cache_outside_query_context = true; size_t filesystem_cache_segments_batch_size = 20; - //asdqwe assign these two - bool use_page_cache_for_disks_without_file_cache = false; - bool read_from_page_cache_if_exists_otherwise_bypass_cache = false; - bool page_cache_inject_eviction = false; - std::shared_ptr page_cache; - size_t filesystem_cache_max_download_size = (128UL * 1024 * 1024 * 1024); bool skip_download_if_exceeds_query_cache = true; diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index 53fd7d9b45f..8304a876fb1 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include @@ -295,7 +294,6 @@ struct ContextSharedPart : boost::noncopyable mutable MarkCachePtr index_mark_cache TSA_GUARDED_BY(mutex); /// Cache of marks in compressed files of MergeTree indices. mutable MMappedFileCachePtr mmap_cache TSA_GUARDED_BY(mutex); /// Cache of mmapped files to avoid frequent open/map/unmap/close and to reuse from several threads. AsynchronousMetrics * asynchronous_metrics TSA_GUARDED_BY(mutex) = nullptr; /// Points to asynchronous metrics - mutable PageCachePtr page_cache TSA_GUARDED_BY(mutex); /// Userspace page cache. ProcessList process_list; /// Executing queries at the moment. SessionTracker session_tracker; GlobalOvercommitTracker global_overcommit_tracker; @@ -1230,7 +1228,7 @@ void Context::setUser(const UUID & user_id_, const std::optional() and other AccessControl's functions may require some IO work, - /// so Context::getLocalLock() and Context::getGlobalLock() must be unlocked while we're doing this. + /// so Context::getLock() must be unlocked while we're doing this. auto & access_control = getAccessControl(); auto user = access_control.read(user_id_); @@ -1360,7 +1358,7 @@ void Context::checkAccess(const AccessRightsElements & elements) const { return std::shared_ptr Context::getAccess() const { - /// A helper function to collect parameters for calculating access rights, called with Context::getLocalSharedLock() acquired. + /// A helper function to collect parameters for calculating access rights, called with Context::getLock() acquired. auto get_params = [this]() { /// If setUserID() was never called then this must be the global context with the full access. @@ -1387,8 +1385,7 @@ std::shared_ptr Context::getAccess() const } /// Calculate new access rights according to the collected parameters. - /// NOTE: AccessControl::getContextAccess() may require some IO work, so Context::getLocalLock() - /// and Context::getGlobalLock() must be unlocked while we're doing this. + /// NOTE: AccessControl::getContextAccess() may require some IO work, so Context::getLock() must be unlocked while we're doing this. 
auto res = getAccessControl().getContextAccess(*params); { @@ -2717,33 +2714,6 @@ void Context::clearUncompressedCache() const shared->uncompressed_cache->clear(); } -void Context::setPageCache(size_t bytes_per_chunk, size_t bytes_per_mmap, size_t bytes_total, bool use_madv_free, bool use_huge_pages) -{ - std::lock_guard lock(shared->mutex); - - if (shared->page_cache) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Page cache has been already created."); - - shared->page_cache = std::make_shared(bytes_per_chunk, bytes_per_mmap, bytes_total, use_madv_free, use_huge_pages); -} - -PageCachePtr Context::getPageCache() const -{ - SharedLockGuard lock(shared->mutex); - return shared->page_cache; -} - -void Context::dropPageCache() const -{ - PageCachePtr cache; - { - SharedLockGuard lock(shared->mutex); - cache = shared->page_cache; - } - if (cache) - cache->dropCache(); -} - void Context::setMarkCache(const String & cache_policy, size_t max_cache_size_in_bytes, double size_ratio) { std::lock_guard lock(shared->mutex); @@ -5160,11 +5130,6 @@ ReadSettings Context::getReadSettings() const res.filesystem_cache_max_download_size = settings.filesystem_cache_max_download_size; res.skip_download_if_exceeds_query_cache = settings.skip_download_if_exceeds_query_cache; - res.page_cache = getPageCache(); - res.use_page_cache_for_disks_without_file_cache = settings.use_page_cache_for_disks_without_file_cache; - res.read_from_page_cache_if_exists_otherwise_bypass_cache = settings.read_from_page_cache_if_exists_otherwise_bypass_cache; - res.page_cache_inject_eviction = settings.page_cache_inject_eviction; - res.remote_read_min_bytes_for_seek = settings.remote_read_min_bytes_for_seek; /// Zero read buffer will not make progress. diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index ec5a044b28f..7bbff9c63bb 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -79,7 +79,6 @@ class RefreshSet; class Cluster; class Compiler; class MarkCache; -class PageCache; class MMappedFileCache; class UncompressedCache; class ProcessList; @@ -969,10 +968,6 @@ public: std::shared_ptr getUncompressedCache() const; void clearUncompressedCache() const; - void setPageCache(size_t bytes_per_chunk, size_t bytes_per_mmap, size_t bytes_total, bool use_madv_free, bool use_huge_pages); - std::shared_ptr getPageCache() const; - void dropPageCache() const; - void setMarkCache(const String & cache_policy, size_t max_cache_size_in_bytes, double size_ratio); void updateMarkCacheConfiguration(const Poco::Util::AbstractConfiguration & config); std::shared_ptr getMarkCache() const; diff --git a/src/Interpreters/InterpreterSystemQuery.cpp b/src/Interpreters/InterpreterSystemQuery.cpp index 4bb47a8c9e3..a078d99facf 100644 --- a/src/Interpreters/InterpreterSystemQuery.cpp +++ b/src/Interpreters/InterpreterSystemQuery.cpp @@ -10,7 +10,6 @@ #include #include #include -#include #include #include #include @@ -461,13 +460,6 @@ BlockIO InterpreterSystemQuery::execute() { throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Not implemented"); } - case Type::DROP_PAGE_CACHE: - { - getContext()->checkAccess(AccessType::SYSTEM_DROP_PAGE_CACHE); - - getContext()->dropPageCache(); - break; - } case Type::DROP_SCHEMA_CACHE: { getContext()->checkAccess(AccessType::SYSTEM_DROP_SCHEMA_CACHE); @@ -1209,7 +1201,6 @@ AccessRightsElements InterpreterSystemQuery::getRequiredAccessForDDLOnCluster() case Type::DROP_INDEX_UNCOMPRESSED_CACHE: case Type::DROP_FILESYSTEM_CACHE: case Type::SYNC_FILESYSTEM_CACHE: - case 
Type::DROP_PAGE_CACHE: case Type::DROP_SCHEMA_CACHE: case Type::DROP_FORMAT_SCHEMA_CACHE: case Type::DROP_S3_CLIENT_CACHE: diff --git a/src/Interpreters/ServerAsynchronousMetrics.cpp b/src/Interpreters/ServerAsynchronousMetrics.cpp index fe7ccd64ffe..bdf314f35b9 100644 --- a/src/Interpreters/ServerAsynchronousMetrics.cpp +++ b/src/Interpreters/ServerAsynchronousMetrics.cpp @@ -9,8 +9,6 @@ #include #include -#include - #include #include @@ -79,16 +77,6 @@ void ServerAsynchronousMetrics::updateImpl(TimePoint update_time, TimePoint curr new_values["MarkCacheFiles"] = { mark_cache->count(), "Total number of mark files cached in the mark cache" }; } - if (auto page_cache = getContext()->getPageCache()) - { - auto rss = page_cache->getResidentSetSize(); - new_values["PageCacheBytes"] = { rss.page_cache_rss, "Userspace page cache memory usage in bytes" }; - new_values["PageCachePinnedBytes"] = { page_cache->getPinnedSize(), "Userspace page cache memory that's currently in use and can't be evicted" }; - - if (rss.unreclaimable_rss.has_value()) - new_values["UnreclaimableRSS"] = { *rss.unreclaimable_rss, "The amount of physical memory used by the server process, in bytes, excluding memory reclaimable by the OS (MADV_FREE)" }; - } - if (auto uncompressed_cache = getContext()->getUncompressedCache()) { new_values["UncompressedCacheBytes"] = { uncompressed_cache->sizeInBytes(), diff --git a/src/Interpreters/tests/gtest_page_cache.cpp b/src/Interpreters/tests/gtest_page_cache.cpp deleted file mode 100644 index 1e2688c0ca2..00000000000 --- a/src/Interpreters/tests/gtest_page_cache.cpp +++ /dev/null @@ -1,267 +0,0 @@ -#include -#include -#include - -#ifdef OS_LINUX -#include -#endif - -using namespace DB; - -namespace ProfileEvents -{ - extern const Event PageCacheChunkMisses; - extern const Event PageCacheChunkShared; - extern const Event PageCacheChunkDataHits; - extern const Event PageCacheChunkDataPartialHits; - extern const Event PageCacheChunkDataMisses; -} - -#define CHECK(x) \ - do { \ - if (!(x)) \ - { \ - std::cerr << "check on line " << __LINE__ << " failed: " << #x << std::endl; \ - std::abort(); \ - } \ - } while (false) - -size_t estimateRAMSize() -{ -#ifdef OS_LINUX - struct sysinfo info; - int r = sysinfo(&info); - CHECK(r == 0); - return static_cast(info.totalram * info.mem_unit); -#else - return 128ul << 30; -#endif -} - -/// Do random reads and writes in PageCache from multiple threads, check that the data read matches the data written. -TEST(PageCache, DISABLED_Stress) -{ - /// There doesn't seem to be a reasonable way to simulate memory pressure or force the eviction of MADV_FREE-d pages. - /// So we actually map more virtual memory than we have RAM and fill it all up a few times. - /// This takes an eternity (a few minutes), but idk how else to hit MADV_FREE eviction. - /// Expect ~1 GB/s, bottlenecked by page faults. 
- size_t ram_size = estimateRAMSize(); - PageCache cache(2 << 20, 1 << 30, ram_size + ram_size / 10, /* use_madv_free */ true, /* use_huge_pages */ true); - - CHECK(cache.getResidentSetSize().page_cache_rss); - - const size_t num_keys = static_cast(cache.maxChunks() * 1.5); - const size_t pages_per_chunk = cache.chunkSize() / cache.pageSize(); - const size_t items_per_page = cache.pageSize() / 8; - - const size_t passes = 2; - const size_t step = 20; - const size_t num_threads = 20; - const size_t chunks_touched = num_keys * passes * num_threads / step; - std::atomic progress {0}; - std::atomic threads_finished {0}; - - std::atomic total_racing_writes {0}; - - auto thread_func = [&] - { - pcg64 rng(randomSeed()); - std::vector pinned; - - /// Stats. - size_t racing_writes = 0; - - for (size_t i = 0; i < num_keys * passes; i += step) - { - progress += 1; - - /// Touch the chunks sequentially + noise (to increase interference across threads), or at random 10% of the time. - size_t key_idx; - if (rng() % 10 == 0) - key_idx = std::uniform_int_distribution(0, num_keys - 1)(rng); - else - key_idx = (i + std::uniform_int_distribution(0, num_keys / 1000)(rng)) % num_keys; - - /// For some keys, always use detached_if_missing = true and check that cache always misses. - bool key_detached_if_missing = key_idx % 100 == 42; - bool detached_if_missing = key_detached_if_missing || i % 101 == 42; - - PageCacheKey key = key_idx * 0xcafebabeb0bad00dul; // a simple reversible hash (the constant can be any odd number) - - PinnedPageChunk chunk = cache.getOrSet(key, detached_if_missing, /* inject_eviction */ false); - - if (key_detached_if_missing) - CHECK(!chunk.getChunk()->pages_populated.any()); - - for (size_t page_idx = 0; page_idx < pages_per_chunk; ++page_idx) - { - bool populated = chunk.getChunk()->pages_populated.get(page_idx); - /// Generate page contents deterministically from key and page index. - size_t start = key_idx * page_idx; - if (start % 37 == 13) - { - /// Leave ~1/37 of the pages unpopulated. - CHECK(!populated); - } - else - { - /// We may write/read the same memory from multiple threads in parallel here. - std::atomic * items = reinterpret_cast *>(chunk.getChunk()->data + cache.pageSize() * page_idx); - if (populated) - { - for (size_t j = 0; j < items_per_page; ++j) - CHECK(items[j].load(std::memory_order_relaxed) == start + j); - } - else - { - for (size_t j = 0; j < items_per_page; ++j) - items[j].store(start + j, std::memory_order_relaxed); - if (!chunk.markPagePopulated(page_idx)) - racing_writes += 1; - } - } - } - - pinned.push_back(std::move(chunk)); - CHECK(cache.getPinnedSize() >= cache.chunkSize()); - /// Unpin 2 chunks on average. - while (rng() % 3 != 0 && !pinned.empty()) - { - size_t idx = rng() % pinned.size(); - if (idx != pinned.size() - 1) - pinned[idx] = std::move(pinned.back()); - pinned.pop_back(); - } - } - - total_racing_writes += racing_writes; - threads_finished += 1; - }; - - std::cout << fmt::format("doing {:.1f} passes over {:.1f} GiB of virtual memory\nthis will take a few minutes, progress printed every 10 seconds", - chunks_touched * 1. / cache.maxChunks(), cache.maxChunks() * cache.chunkSize() * 1. / (1ul << 30)) << std::endl; - - auto start_time = std::chrono::steady_clock::now(); - - std::vector threads; - for (size_t i = 0; i < num_threads; ++i) - threads.emplace_back(thread_func); - - for (size_t poll = 0;; ++poll) - { - if (threads_finished == num_threads) - break; - if (poll % 100 == 0) - std::cout << fmt::format("{:.3f}%", progress.load() * 100. 
/ num_keys / passes / num_threads * step) << std::endl; - std::this_thread::sleep_for(std::chrono::milliseconds(100)); - } - for (std::thread & t : threads) - t.join(); - - auto end_time = std::chrono::steady_clock::now(); - double elapsed_seconds = std::chrono::duration_cast>(end_time - start_time).count(); - double touched_gib = chunks_touched * cache.chunkSize() * 1. / (1ul << 30); - std::cout << fmt::format("touched {:.1f} GiB in {:.1f} seconds, that's {:.3f} GiB/s", - touched_gib, elapsed_seconds, touched_gib / elapsed_seconds) << std::endl; - - auto & counters = CurrentThread::getProfileEvents(); - - std::cout << "stats:" - << "\nchunk misses: " << counters[ProfileEvents::PageCacheChunkMisses].load() - << "\nchunk shared: " << counters[ProfileEvents::PageCacheChunkShared].load() - << "\nchunk data misses: " << counters[ProfileEvents::PageCacheChunkDataMisses].load() - << "\nchunk data partial hits: " << counters[ProfileEvents::PageCacheChunkDataPartialHits].load() - << "\nchunk data hits: " << counters[ProfileEvents::PageCacheChunkDataHits].load() - << "\nracing page writes: " << total_racing_writes << std::endl; - - /// Check that we at least hit all the cases. - CHECK(counters[ProfileEvents::PageCacheChunkMisses].load() > 0); - CHECK(counters[ProfileEvents::PageCacheChunkShared].load() > 0); - CHECK(counters[ProfileEvents::PageCacheChunkDataMisses].load() > 0); - /// Partial hits are rare enough that sometimes this is zero, so don't check it. - /// That's good news because we don't need to implement downloading parts of a chunk. - /// CHECK(counters[ProfileEvents::PageCacheChunkDataPartialHits].load() > 0); - CHECK(counters[ProfileEvents::PageCacheChunkDataHits].load() > 0); - CHECK(total_racing_writes > 0); - CHECK(cache.getPinnedSize() == 0); - - size_t rss = cache.getResidentSetSize().page_cache_rss; - std::cout << "RSS: " << rss * 1. / (1ul << 30) << " GiB" << std::endl; - /// This can be flaky if the system has < 10% free memory. If this turns out to be a problem, feel free to remove or reduce. - CHECK(rss > ram_size / 10); - - cache.dropCache(); - -#ifdef OS_LINUX - /// MADV_DONTNEED is not synchronous, and we're freeing lots of pages. Let's give Linux a lot of time. - std::this_thread::sleep_for(std::chrono::seconds(10)); - size_t new_rss = cache.getResidentSetSize().page_cache_rss; - std::cout << "RSS after dropping cache: " << new_rss * 1. / (1ul << 30) << " GiB" << std::endl; - CHECK(new_rss < rss / 2); -#endif -} - -/// Benchmark that measures the PageCache overhead for cache hits. Doesn't touch the actual data, so -/// memory bandwidth mostly doesn't factor into this. -/// This measures the overhead of things like madvise(MADV_FREE) and probing the pages (restoreChunkFromLimbo()). -/// Disabled in CI, run manually with --gtest_also_run_disabled_tests --gtest_filter=PageCache.DISABLED_HitsBench -TEST(PageCache, DISABLED_HitsBench) -{ - /// Do a few runs, with and without MADV_FREE. - for (size_t num_threads = 1; num_threads <= 16; num_threads *= 2) - { - for (size_t run = 0; run < 8; ++ run) - { - bool use_madv_free = run % 2 == 1; - bool use_huge_pages = run % 4 / 2 == 1; - - PageCache cache(2 << 20, 1ul << 30, 20ul << 30, use_madv_free, use_huge_pages); - size_t passes = 3; - std::atomic total_misses {0}; - - /// Prepopulate all chunks. 
- for (size_t i = 0; i < cache.maxChunks(); ++i) - { - PageCacheKey key = i * 0xcafebabeb0bad00dul; - PinnedPageChunk chunk = cache.getOrSet(key, /* detache_if_missing */ false, /* inject_eviction */ false); - memset(chunk.getChunk()->data, 42, chunk.getChunk()->size); - chunk.markPrefixPopulated(cache.chunkSize()); - } - - auto thread_func = [&] - { - pcg64 rng(randomSeed()); - size_t misses = 0; - for (size_t i = 0; i < cache.maxChunks() * passes; ++i) - { - PageCacheKey key = rng() % cache.maxChunks() * 0xcafebabeb0bad00dul; - PinnedPageChunk chunk = cache.getOrSet(key, /* detache_if_missing */ false, /* inject_eviction */ false); - if (!chunk.isPrefixPopulated(cache.chunkSize())) - misses += 1; - } - total_misses += misses; - }; - - auto start_time = std::chrono::steady_clock::now(); - - std::vector threads; - for (size_t i = 0; i < num_threads; ++i) - threads.emplace_back(thread_func); - - for (std::thread & t : threads) - t.join(); - - auto end_time = std::chrono::steady_clock::now(); - double elapsed_seconds = std::chrono::duration_cast>(end_time - start_time).count(); - double fetched_gib = cache.chunkSize() * cache.maxChunks() * passes * 1. / (1ul << 30); - std::cout << fmt::format( - "threads {}, run {}, use_madv_free = {}, use_huge_pages = {}\nrequested {:.1f} GiB in {:.1f} seconds\n" - "that's {:.1f} GiB/s, or overhead of {:.3}us/{:.1}MiB\n", - num_threads, run, use_madv_free, use_huge_pages, fetched_gib, elapsed_seconds, fetched_gib / elapsed_seconds, - elapsed_seconds * 1e6 / cache.maxChunks() / passes, cache.chunkSize() * 1. / (1 << 20)) << std::endl; - - if (total_misses != 0) - std::cout << "!got " << total_misses.load() << " misses! perhaps your system doesn't have enough free memory, consider decreasing cache size in the benchmark code" << std::endl; - } - } -} diff --git a/src/Parsers/ASTSystemQuery.h b/src/Parsers/ASTSystemQuery.h index 48be7f6b84f..9aa90f499d0 100644 --- a/src/Parsers/ASTSystemQuery.h +++ b/src/Parsers/ASTSystemQuery.h @@ -31,7 +31,6 @@ public: DROP_COMPILED_EXPRESSION_CACHE, DROP_FILESYSTEM_CACHE, DROP_DISK_METADATA_CACHE, - DROP_PAGE_CACHE, DROP_SCHEMA_CACHE, DROP_FORMAT_SCHEMA_CACHE, DROP_S3_CLIENT_CACHE, diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index a9bdceacef0..39ad28d3dae 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -1637,6 +1637,10 @@ bool IMergeTreeDataPart::assertHasValidVersionMetadata() const size_t file_size = getDataPartStorage().getFileSize(TXN_VERSION_METADATA_FILE_NAME); auto buf = getDataPartStorage().readFile(TXN_VERSION_METADATA_FILE_NAME, ReadSettings().adjustBufferSize(file_size), file_size, std::nullopt); + /// FIXME https://github.com/ClickHouse/ClickHouse/issues/48465 + if (dynamic_cast(buf.get())) + return true; + readStringUntilEOF(content, *buf); ReadBufferFromString str_buf{content}; VersionMetadata file; diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index 53a18d3cc5b..da90dbb4076 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -727,7 +727,7 @@ std::unique_ptr StorageS3Source::createAsyncS3ReadBuffer( auto context = getContext(); auto read_buffer_creator = [this, read_settings, object_size] - (bool restricted_seek, const std::string & path) -> std::unique_ptr + (const std::string & path, size_t read_until_position) -> std::unique_ptr { return std::make_unique( client, @@ -738,25 +738,21 @@ std::unique_ptr 
StorageS3Source::createAsyncS3ReadBuffer( read_settings, /* use_external_buffer */true, /* offset */0, - /* read_until_position */0, - restricted_seek, + read_until_position, + /* restricted_seek */true, object_size); }; - auto modified_settings{read_settings}; - /// User's S3 object may change, don't cache it. - modified_settings.use_page_cache_for_disks_without_file_cache = false; - - /// FIXME: Changing this setting to default value breaks something around parquet reading - modified_settings.remote_read_min_bytes_for_seek = modified_settings.remote_fs_buffer_size; - auto s3_impl = std::make_unique( std::move(read_buffer_creator), StoredObjects{StoredObject{key, /* local_path */ "", object_size}}, - "", read_settings, /* cache_log */nullptr, /* use_external_buffer */true); + auto modified_settings{read_settings}; + /// FIXME: Changing this setting to default value breaks something around parquet reading + modified_settings.remote_read_min_bytes_for_seek = modified_settings.remote_fs_buffer_size; + auto & pool_reader = context->getThreadPoolReader(FilesystemReaderType::ASYNCHRONOUS_REMOTE_FS_READER); auto async_reader = std::make_unique( std::move(s3_impl), pool_reader, modified_settings, diff --git a/tests/clickhouse-test b/tests/clickhouse-test index d44c80bc410..f438c6f4f31 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -699,8 +699,6 @@ class SettingsRandomizer: get_localzone(), ] ), - "use_page_cache_for_disks_without_file_cache": lambda: random.random() < 0.7, - "page_cache_inject_eviction": lambda: random.random() < 0.5, } @staticmethod diff --git a/tests/queries/0_stateless/01271_show_privileges.reference b/tests/queries/0_stateless/01271_show_privileges.reference index 88f18c52536..e1f5213790d 100644 --- a/tests/queries/0_stateless/01271_show_privileges.reference +++ b/tests/queries/0_stateless/01271_show_privileges.reference @@ -112,7 +112,6 @@ SYSTEM DROP QUERY CACHE ['SYSTEM DROP QUERY','DROP QUERY CACHE','DROP QUERY'] GL SYSTEM DROP COMPILED EXPRESSION CACHE ['SYSTEM DROP COMPILED EXPRESSION','DROP COMPILED EXPRESSION CACHE','DROP COMPILED EXPRESSIONS'] GLOBAL SYSTEM DROP CACHE SYSTEM DROP FILESYSTEM CACHE ['SYSTEM DROP FILESYSTEM CACHE','DROP FILESYSTEM CACHE'] GLOBAL SYSTEM DROP CACHE SYSTEM SYNC FILESYSTEM CACHE ['SYSTEM REPAIR FILESYSTEM CACHE','REPAIR FILESYSTEM CACHE','SYNC FILESYSTEM CACHE'] GLOBAL SYSTEM -SYSTEM DROP PAGE CACHE ['SYSTEM DROP PAGE CACHE','DROP PAGE CACHE'] GLOBAL SYSTEM DROP CACHE SYSTEM DROP SCHEMA CACHE ['SYSTEM DROP SCHEMA CACHE','DROP SCHEMA CACHE'] GLOBAL SYSTEM DROP CACHE SYSTEM DROP FORMAT SCHEMA CACHE ['SYSTEM DROP FORMAT SCHEMA CACHE','DROP FORMAT SCHEMA CACHE'] GLOBAL SYSTEM DROP CACHE SYSTEM DROP S3 CLIENT CACHE ['SYSTEM DROP S3 CLIENT','DROP S3 CLIENT CACHE'] GLOBAL SYSTEM DROP CACHE diff --git a/tests/queries/0_stateless/02867_page_cache.reference b/tests/queries/0_stateless/02867_page_cache.reference deleted file mode 100644 index 5502059508a..00000000000 --- a/tests/queries/0_stateless/02867_page_cache.reference +++ /dev/null @@ -1,23 +0,0 @@ -54975576145920 -PageCacheBytesUnpinnedRoundedToHugePages 1 -PageCacheBytesUnpinnedRoundedToPages 1 -PageCacheChunkMisses 1 -ReadBufferFromS3Bytes 1 -54975576145920 -PageCacheBytesUnpinnedRoundedToHugePages 1 -PageCacheBytesUnpinnedRoundedToPages 1 -PageCacheChunkDataHits 1 -54975576145920 -PageCacheBytesUnpinnedRoundedToHugePages 1 -PageCacheBytesUnpinnedRoundedToPages 1 -PageCacheChunkMisses 1 -ReadBufferFromS3Bytes 1 -54975576145920 -PageCacheBytesUnpinnedRoundedToHugePages 1 
-PageCacheBytesUnpinnedRoundedToPages 1 -PageCacheChunkMisses 1 -ReadBufferFromS3Bytes 1 -54975576145920 -PageCacheBytesUnpinnedRoundedToHugePages 1 -PageCacheBytesUnpinnedRoundedToPages 1 -PageCacheChunkDataHits 1 diff --git a/tests/queries/0_stateless/02867_page_cache.sql b/tests/queries/0_stateless/02867_page_cache.sql deleted file mode 100644 index 8765b30ebc3..00000000000 --- a/tests/queries/0_stateless/02867_page_cache.sql +++ /dev/null @@ -1,105 +0,0 @@ --- Tags: no-fasttest, no-parallel --- no-fasttest because we need an S3 storage policy --- no-parallel because we look at server-wide counters about page cache usage - -set use_page_cache_for_disks_without_file_cache = 1; -set page_cache_inject_eviction = 0; -set enable_filesystem_cache = 0; -set use_uncompressed_cache = 0; - -create table events_snapshot engine Memory as select * from system.events; -create view events_diff as - -- round all stats to 70 MiB to leave a lot of leeway for overhead - with if(event like '%Bytes%', 70*1024*1024, 35) as granularity, - -- cache hits counter can vary a lot depending on other settings: - -- e.g. if merge_tree_min_bytes_for_concurrent_read is small, multiple threads will read each chunk - -- so we just check that the value is not too low - if(event in ( - 'PageCacheBytesUnpinnedRoundedToPages', 'PageCacheBytesUnpinnedRoundedToHugePages', - 'PageCacheChunkDataHits'), 1, 1000) as clamp - select event, min2(intDiv(new.value - old.value, granularity), clamp) as diff - from system.events new - left outer join events_snapshot old - on old.event = new.event - where diff != 0 and - event in ( - 'ReadBufferFromS3Bytes', 'PageCacheChunkMisses', 'PageCacheChunkDataMisses', - 'PageCacheChunkDataHits', 'PageCacheChunkDataPartialHits', - 'PageCacheBytesUnpinnedRoundedToPages', 'PageCacheBytesUnpinnedRoundedToHugePages') - order by event; - -drop table if exists page_cache_03055; -create table page_cache_03055 (k Int64 CODEC(NONE)) engine MergeTree order by k settings storage_policy = 's3_cache'; - --- Write an 80 MiB file (40 x 2 MiB chunks), and a few small files. -system stop merges page_cache_03055; -insert into page_cache_03055 select * from numbers(10485760) settings max_block_size=100000000, preferred_block_size_bytes=1000000000; - -select * from events_diff; -truncate table events_snapshot; -insert into events_snapshot select * from system.events; - -system start merges page_cache_03055; -optimize table page_cache_03055 final; -truncate table events_snapshot; -insert into events_snapshot select * from system.events; - --- Cold read, should miss cache. (Populating cache on write is not implemented yet.) - -select sum(k) from page_cache_03055; - -select * from events_diff where event not in ('PageCacheChunkDataHits'); -truncate table events_snapshot; -insert into events_snapshot select * from system.events; - --- Repeat read, should hit cache. - -select sum(k) from page_cache_03055; - -select * from events_diff; -truncate table events_snapshot; -insert into events_snapshot select * from system.events; - --- Drop cache and read again, should miss. Also don't write to cache. - -system drop page cache; - -select sum(k) from page_cache_03055 settings read_from_page_cache_if_exists_otherwise_bypass_cache = 1; - --- Data could be read multiple times because we're not writing to cache. 
-select event, if(event in ('PageCacheChunkMisses', 'ReadBufferFromS3Bytes'), diff >= 1, diff) from events_diff where event not in ('PageCacheChunkDataHits');
-truncate table events_snapshot;
-insert into events_snapshot select * from system.events;
-
--- Repeat read, should still miss, but populate cache.
-
-select sum(k) from page_cache_03055;
-
-select * from events_diff where event not in ('PageCacheChunkDataHits');
-truncate table events_snapshot;
-insert into events_snapshot select * from system.events;
-
--- Read again, hit the cache.
-
-select sum(k) from page_cache_03055 settings read_from_page_cache_if_exists_otherwise_bypass_cache = 1;
-
-select * from events_diff;
-truncate table events_snapshot;
-insert into events_snapshot select * from system.events;
-
-
--- Known limitation: cache is not invalidated if a table is dropped and created again at the same path.
--- set allow_deprecated_database_ordinary=1;
--- create database test_03055 engine = Ordinary;
--- create table test_03055.t (k Int64) engine MergeTree order by k settings storage_policy = 's3_cache';
--- insert into test_03055.t values (1);
--- select * from test_03055.t;
--- drop table test_03055.t;
--- create table test_03055.t (k Int64) engine MergeTree order by k settings storage_policy = 's3_cache';
--- insert into test_03055.t values (2);
--- select * from test_03055.t;
-
-
-drop table events_snapshot;
-drop table page_cache_03055;
-drop view events_diff;

From 28735a4ba48da06bd1c6c0295c2ae8a8cb9d0176 Mon Sep 17 00:00:00 2001
From: Pablo Marcos
Date: Thu, 29 Feb 2024 13:23:36 +0100
Subject: [PATCH 15/19] Fix typo in query_thread_log doc

Fix typo in query_thread_log doc
---
 docs/en/operations/system-tables/query_thread_log.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/en/operations/system-tables/query_thread_log.md b/docs/en/operations/system-tables/query_thread_log.md
index 0420a0392f2..65582b6bf21 100644
--- a/docs/en/operations/system-tables/query_thread_log.md
+++ b/docs/en/operations/system-tables/query_thread_log.md
@@ -21,7 +21,7 @@ Columns:
 - `hostname` ([LowCardinality(String)](../../sql-reference/data-types/string.md)) — Hostname of the server executing the query.
 - `event_date` ([Date](../../sql-reference/data-types/date.md)) — The date when the thread has finished execution of the query.
 - `event_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — The date and time when the thread has finished execution of the query.
-- `event_time_microsecinds` ([DateTime](../../sql-reference/data-types/datetime.md)) — The date and time when the thread has finished execution of the query with microseconds precision.
+- `event_time_microseconds` ([DateTime](../../sql-reference/data-types/datetime.md)) — The date and time when the thread has finished execution of the query with microseconds precision.
 - `query_start_time` ([DateTime](../../sql-reference/data-types/datetime.md)) — Start time of query execution.
 - `query_start_time_microseconds` ([DateTime64](../../sql-reference/data-types/datetime64.md)) — Start time of query execution with microsecond precision.
 - `query_duration_ms` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Duration of query execution.
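A quick way to sanity-check the corrected name is to query the table directly. The sketch below is illustrative only and is not part of any patch in this series; it assumes the server has query_thread_log enabled and flushed at least one entry, and it uses only columns from the documented schema above:

SELECT thread_id, query, event_time_microseconds
FROM system.query_thread_log
WHERE event_date = today()
ORDER BY event_time_microseconds DESC
LIMIT 10;

A query written against the old misspelled name would simply fail to resolve the column, since the table itself has always exposed `event_time_microseconds`.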
From 4db4a82ae08d36c70f3abeb7fd01ed4ac905836d Mon Sep 17 00:00:00 2001
From: Pablo Marcos
Date: Thu, 29 Feb 2024 13:31:36 +0100
Subject: [PATCH 16/19] Update thread_id to sync with Russian doc

---
 docs/en/operations/system-tables/query_thread_log.md | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/docs/en/operations/system-tables/query_thread_log.md b/docs/en/operations/system-tables/query_thread_log.md
index 65582b6bf21..a0712c78409 100644
--- a/docs/en/operations/system-tables/query_thread_log.md
+++ b/docs/en/operations/system-tables/query_thread_log.md
@@ -32,8 +32,7 @@ Columns:
 - `memory_usage` ([Int64](../../sql-reference/data-types/int-uint.md)) — The difference between the amount of allocated and freed memory in context of this thread.
 - `peak_memory_usage` ([Int64](../../sql-reference/data-types/int-uint.md)) — The maximum difference between the amount of allocated and freed memory in context of this thread.
 - `thread_name` ([String](../../sql-reference/data-types/string.md)) — Name of the thread.
-- `thread_number` ([UInt32](../../sql-reference/data-types/int-uint.md)) — Internal thread ID.
-- `thread_id` ([Int32](../../sql-reference/data-types/int-uint.md)) — thread ID.
+- `thread_id` ([UInt64](../../sql-reference/data-types/int-uint.md)) — OS thread ID.
 - `master_thread_id` ([UInt64](../../sql-reference/data-types/int-uint.md#uint-ranges)) — OS initial ID of initial thread.
 - `query` ([String](../../sql-reference/data-types/string.md)) — Query string.
 - `is_initial_query` ([UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges)) — Query type. Possible values:

From 0df8db5c3f3341fc379076c3e31d50cbecc1683d Mon Sep 17 00:00:00 2001
From: Max Kainov
Date: Thu, 29 Feb 2024 12:44:55 +0000
Subject: [PATCH 17/19] CI: configure ci_set_analyzer

 #do_not_test
---
 .gitmessage | 1 +
 tests/ci/ci_config.py | 11 +++++++++++
 2 files changed, 12 insertions(+)

diff --git a/.gitmessage b/.gitmessage
index 760cfec97a4..2ad30596de6 100644
--- a/.gitmessage
+++ b/.gitmessage
@@ -16,6 +16,7 @@
 #ci_set_reduced
 #ci_set_arm
 #ci_set_integration
+#ci_set_analyzer

 ## To run specified job in CI:
 #job_

diff --git a/tests/ci/ci_config.py b/tests/ci/ci_config.py
index 80994f71280..2dc5dbf7669 100644
--- a/tests/ci/ci_config.py
+++ b/tests/ci/ci_config.py
@@ -21,6 +21,7 @@ class Labels(metaclass=WithIter):
     CI_SET_REDUCED = "ci_set_reduced"
     CI_SET_ARM = "ci_set_arm"
     CI_SET_INTEGRATION = "ci_set_integration"
+    CI_SET_ANALYZER = "ci_set_analyzer"

     libFuzzer = "libFuzzer"

@@ -647,6 +648,16 @@ CI_CONFIG = CIConfig(
             JobNames.INTEGRATION_TEST,
         ]
     ),
+    Labels.CI_SET_ANALYZER: LabelConfig(
+        run_jobs=[
+            JobNames.STYLE_CHECK,
+            JobNames.FAST_TEST,
+            Build.PACKAGE_RELEASE,
+            Build.PACKAGE_ASAN,
+            JobNames.STATELESS_TEST_ANALYZER_S3_REPLICATED_RELEASE,
+            JobNames.INTEGRATION_TEST_ASAN_ANALYZER,
+        ]
+    ),
     Labels.CI_SET_REDUCED: LabelConfig(
         run_jobs=[
             job

From 2dedfd6cf9c2d26d572d05ff3b3ec4cf973ef94f Mon Sep 17 00:00:00 2001
From: Alexander Tokmakov
Date: Thu, 29 Feb 2024 14:22:07 +0100
Subject: [PATCH 18/19] Revert "Analyzer: compute ALIAS columns right after
 reading"

---
 src/Analyzer/Passes/QueryAnalysisPass.cpp | 5 +-
 .../getHeaderForProcessingStage.cpp | 7 +-
 src/Planner/CollectTableExpressionData.cpp | 166 +++++++-----
 src/Planner/CollectTableExpressionData.h | 2 +-
 src/Planner/PlannerActionsVisitor.cpp | 5 +-
 src/Planner/PlannerJoinTree.cpp | 96 +++++++---
 src/Planner/TableExpressionData.h | 115 +++++++-----
 src/Planner/Utils.cpp | 11 +-
 src/Planner/Utils.h | 5 -
 .../QueryPlan/ReadFromMergeTree.cpp | 5 -
 src/Storages/StorageDistributed.cpp | 28 ---
 tests/analyzer_integration_broken_tests.txt | 1 +
 .../test_row_policy.py | 2 +-
 tests/integration/test_row_policy/test.py | 3 +-
 .../02514_analyzer_drop_join_on.reference | 12 +-
 ..._support_alias_column_in_indices.reference | 13 +-
 16 files changed, 231 insertions(+), 245 deletions(-)

diff --git a/src/Analyzer/Passes/QueryAnalysisPass.cpp b/src/Analyzer/Passes/QueryAnalysisPass.cpp
index 1f81ac54078..907a732493d 100644
--- a/src/Analyzer/Passes/QueryAnalysisPass.cpp
+++ b/src/Analyzer/Passes/QueryAnalysisPass.cpp
@@ -6651,6 +6651,7 @@ void QueryAnalyzer::initializeTableExpressionData(const QueryTreeNodePtr & table
         if (column_default && column_default->kind == ColumnDefaultKind::Alias)
         {
             auto alias_expression = buildQueryTree(column_default->expression, scope.context);
+            alias_expression = buildCastFunction(alias_expression, column_name_and_type.type, scope.context, false /*resolve*/);
             auto column_node = std::make_shared(column_name_and_type, std::move(alias_expression), table_expression_node);
             column_name_to_column_node.emplace(column_name_and_type.name, column_node);
             alias_columns_to_resolve.emplace_back(column_name_and_type.name, column_node);
@@ -6683,9 +6684,7 @@ void QueryAnalyzer::initializeTableExpressionData(const QueryTreeNodePtr & table
             alias_column_resolve_scope,
             false /*allow_lambda_expression*/,
             false /*allow_table_expression*/);
-        auto & resolved_expression = alias_column_to_resolve->getExpression();
-        if (!resolved_expression->getResultType()->equals(*alias_column_to_resolve->getResultType()))
-            resolved_expression = buildCastFunction(resolved_expression, alias_column_to_resolve->getResultType(), scope.context, true);
+
         column_name_to_column_node = std::move(alias_column_resolve_scope.column_name_to_column_node);
         column_name_to_column_node[alias_column_to_resolve_name] = alias_column_to_resolve;
     }

diff --git a/src/Interpreters/getHeaderForProcessingStage.cpp b/src/Interpreters/getHeaderForProcessingStage.cpp
index 75b0e710fbe..21739298036 100644
--- a/src/Interpreters/getHeaderForProcessingStage.cpp
+++ b/src/Interpreters/getHeaderForProcessingStage.cpp
@@ -121,12 +121,7 @@ Block getHeaderForProcessingStage(
         auto & table_expression_data = query_info.planner_context->getTableExpressionDataOrThrow(left_table_expression);
         const auto & query_context = query_info.planner_context->getQueryContext();
-
-        NamesAndTypes columns;
-        const auto & column_name_to_column = table_expression_data.getColumnNameToColumn();
-        for (const auto & column_name : table_expression_data.getSelectedColumnsNames())
-            columns.push_back(column_name_to_column.at(column_name));
-
+        auto columns = table_expression_data.getColumns();
         auto new_query_node = buildSubqueryToReadColumnsFromTableExpression(columns, left_table_expression, query_context);
         query = new_query_node->toAST();
     }

diff --git a/src/Planner/CollectTableExpressionData.cpp b/src/Planner/CollectTableExpressionData.cpp
index 385381f1355..78a7c7074c3 100644
--- a/src/Planner/CollectTableExpressionData.cpp
+++ b/src/Planner/CollectTableExpressionData.cpp
@@ -29,13 +29,34 @@ namespace

 class CollectSourceColumnsVisitor : public InDepthQueryTreeVisitor
 {
 public:
-    explicit CollectSourceColumnsVisitor(PlannerContextPtr & planner_context_, bool keep_alias_columns_ = true)
+    explicit CollectSourceColumnsVisitor(PlannerContext & planner_context_)
         : planner_context(planner_context_)
-        , keep_alias_columns(keep_alias_columns_)
     {}

     void visitImpl(QueryTreeNodePtr & node)
     {
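        /// Illustrative sketch for the USING branch below (hypothetical schema, not part of
        /// this patch): given
        ///     CREATE TABLE t2 (a UInt64, b UInt64 ALIAS a + 1) ENGINE = MergeTree ORDER BY a;
        /// a query such as
        ///     SELECT * FROM t1 JOIN t2 USING (b)
        /// reaches this code: the ListNode stored under the USING column holds one column node
        /// per joined side, and each side is visited separately so that ALIAS expressions such
        /// as `b` are registered for the corresponding table expression.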
/// Special case for USING clause which contains references to ALIAS columns. + /// We can not modify such ColumnNode. + if (auto * join_node = node->as()) + { + if (!join_node->isUsingJoinExpression()) + return; + + auto & using_list = join_node->getJoinExpression()->as(); + for (auto & using_element : using_list) + { + auto & column_node = using_element->as(); + /// This list contains column nodes from left and right tables. + auto & columns_from_subtrees = column_node.getExpressionOrThrow()->as().getNodes(); + + /// Visit left table column node. + visitUsingColumn(columns_from_subtrees[0]); + /// Visit right table column node. + visitUsingColumn(columns_from_subtrees[1]); + } + return; + } + auto * column_node = node->as(); if (!column_node) return; @@ -51,55 +72,22 @@ public: /// JOIN using expression if (column_node->hasExpression() && column_source_node_type == QueryTreeNodeType::JOIN) - { - auto & columns_from_subtrees = column_node->getExpression()->as().getNodes(); - if (columns_from_subtrees.size() != 2) - throw Exception(ErrorCodes::LOGICAL_ERROR, - "Expected two columns in JOIN using expression for column {}", column_node->dumpTree()); - - visit(columns_from_subtrees[0]); - visit(columns_from_subtrees[1]); return; - } - auto & table_expression_data = planner_context->getOrCreateTableExpressionData(column_source_node); + auto & table_expression_data = planner_context.getOrCreateTableExpressionData(column_source_node); - if (isAliasColumn(node)) + if (column_node->hasExpression() && column_source_node_type != QueryTreeNodeType::ARRAY_JOIN) { - /// Column is an ALIAS column with expression + /// Replace ALIAS column with expression bool column_already_exists = table_expression_data.hasColumn(column_node->getColumnName()); if (!column_already_exists) { - CollectSourceColumnsVisitor visitor_for_alias_column(planner_context); - /// While we are processing expression of ALIAS columns we should not add source columns to selected. - /// See also comment for `select_added_columns` - visitor_for_alias_column.select_added_columns = false; - visitor_for_alias_column.keep_alias_columns = keep_alias_columns; - visitor_for_alias_column.visit(column_node->getExpression()); - - if (!keep_alias_columns) - { - /// For PREWHERE we can just replace ALIAS column with it's expression, - /// because ActionsDAG for PREWHERE applied right on top of table expression - /// and cannot affect subqueries or other table expressions. - node = column_node->getExpression(); - return; - } - - auto column_identifier = planner_context->getGlobalPlannerContext()->createColumnIdentifier(node); - - ActionsDAGPtr alias_column_actions_dag = std::make_shared(); - PlannerActionsVisitor actions_visitor(planner_context, false); - auto outputs = actions_visitor.visit(alias_column_actions_dag, column_node->getExpression()); - if (outputs.size() != 1) - throw Exception(ErrorCodes::LOGICAL_ERROR, - "Expected single output in actions dag for alias column {}. 
Actual {}", column_node->dumpTree(), outputs.size()); - const auto & column_name = column_node->getColumnName(); - const auto & alias_node = alias_column_actions_dag->addAlias(*outputs[0], column_name); - alias_column_actions_dag->addOrReplaceInOutputs(alias_node); - table_expression_data.addAliasColumn(column_node->getColumn(), column_identifier, alias_column_actions_dag, select_added_columns); + auto column_identifier = planner_context.getGlobalPlannerContext()->createColumnIdentifier(node); + table_expression_data.addAliasColumnName(column_node->getColumnName(), column_identifier); } + node = column_node->getExpression(); + visitImpl(node); return; } @@ -114,58 +102,45 @@ public: bool column_already_exists = table_expression_data.hasColumn(column_node->getColumnName()); if (column_already_exists) - { - /// Column may be added when we collected data for ALIAS column - /// But now we see it directly in the query, so make sure it's marked as selected - if (select_added_columns) - table_expression_data.markSelectedColumn(column_node->getColumnName()); return; + + auto column_identifier = planner_context.getGlobalPlannerContext()->createColumnIdentifier(node); + table_expression_data.addColumn(column_node->getColumn(), column_identifier); + } + + static bool needChildVisit(const QueryTreeNodePtr & parent, const QueryTreeNodePtr & child_node) + { + if (auto * join_node = parent->as()) + { + if (join_node->getJoinExpression() == child_node && join_node->isUsingJoinExpression()) + return false; } - - auto column_identifier = planner_context->getGlobalPlannerContext()->createColumnIdentifier(node); - table_expression_data.addColumn(column_node->getColumn(), column_identifier, select_added_columns); - } - - static bool isAliasColumn(const QueryTreeNodePtr & node) - { - const auto * column_node = node->as(); - if (!column_node || !column_node->hasExpression()) - return false; - const auto & column_source = column_node->getColumnSourceOrNull(); - if (!column_source) - return false; - return column_source->getNodeType() != QueryTreeNodeType::JOIN && - column_source->getNodeType() != QueryTreeNodeType::ARRAY_JOIN; - } - - static bool needChildVisit(const QueryTreeNodePtr & parent_node, const QueryTreeNodePtr & child_node) - { auto child_node_type = child_node->getNodeType(); - return !(child_node_type == QueryTreeNodeType::QUERY || - child_node_type == QueryTreeNodeType::UNION || - isAliasColumn(parent_node)); - } - - void setKeepAliasColumns(bool keep_alias_columns_) - { - keep_alias_columns = keep_alias_columns_; + return !(child_node_type == QueryTreeNodeType::QUERY || child_node_type == QueryTreeNodeType::UNION); } private: - PlannerContextPtr & planner_context; - /// Replace ALIAS columns with their expressions or register them in table expression data. - /// Usually we can replace them when we build some "local" actions DAG - /// (for example Row Policy or PREWHERE) that is applied on top of the table expression. - /// In other cases, we keep ALIAS columns as ColumnNode with an expression child node, - /// and handle them in the Planner by inserting ActionsDAG to compute them after reading from storage. 
 
 class CollectPrewhereTableExpressionVisitor : public ConstInDepthQueryTreeVisitor<CollectPrewhereTableExpressionVisitor>
@@ -299,7 +274,7 @@ void collectTableExpressionData(QueryTreeNodePtr & query_node, PlannerContextPtr
         }
     }
 
-    CollectSourceColumnsVisitor collect_source_columns_visitor(planner_context);
+    CollectSourceColumnsVisitor collect_source_columns_visitor(*planner_context);
     for (auto & node : query_node_typed.getChildren())
     {
         if (!node || node == query_node_typed.getPrewhere())
@@ -325,26 +300,21 @@ void collectTableExpressionData(QueryTreeNodePtr & query_node, PlannerContextPtr
         }
 
         auto & table_expression_data = planner_context->getOrCreateTableExpressionData(prewhere_table_expression);
-        const auto & read_column_names = table_expression_data.getColumnNames();
-        NameSet required_column_names_without_prewhere(read_column_names.begin(), read_column_names.end());
-        const auto & selected_column_names = table_expression_data.getSelectedColumnsNames();
-        required_column_names_without_prewhere.insert(selected_column_names.begin(), selected_column_names.end());
+        const auto & column_names = table_expression_data.getColumnNames();
+        NameSet required_column_names_without_prewhere(column_names.begin(), column_names.end());
 
-        collect_source_columns_visitor.setKeepAliasColumns(false);
         collect_source_columns_visitor.visit(query_node_typed.getPrewhere());
 
         auto prewhere_actions_dag = std::make_shared<ActionsDAG>();
 
-        QueryTreeNodePtr query_tree_node = query_node_typed.getPrewhere();
-
         PlannerActionsVisitor visitor(planner_context, false /*use_column_identifier_as_action_node_name*/);
-        auto expression_nodes = visitor.visit(prewhere_actions_dag, query_tree_node);
+        auto expression_nodes = visitor.visit(prewhere_actions_dag, query_node_typed.getPrewhere());
         if (expression_nodes.size() != 1)
             throw Exception(ErrorCodes::ILLEGAL_PREWHERE,
                 "Invalid PREWHERE. Expected single boolean expression. In query {}",
                 query_node->formatASTForErrorMessage());
-        prewhere_actions_dag->getOutputs().push_back(expression_nodes.back());
+        prewhere_actions_dag->getOutputs().push_back(expression_nodes[0]);
 
         for (const auto & prewhere_input_node : prewhere_actions_dag->getInputs())
             if (required_column_names_without_prewhere.contains(prewhere_input_node->result_name))
@@ -354,9 +324,9 @@ void collectTableExpressionData(QueryTreeNodePtr & query_node, PlannerContextPtr
     }
 }
 
-void collectSourceColumns(QueryTreeNodePtr & expression_node, PlannerContextPtr & planner_context, bool keep_alias_columns)
+void collectSourceColumns(QueryTreeNodePtr & expression_node, PlannerContextPtr & planner_context)
 {
-    CollectSourceColumnsVisitor collect_source_columns_visitor(planner_context, keep_alias_columns);
+    CollectSourceColumnsVisitor collect_source_columns_visitor(*planner_context);
     collect_source_columns_visitor.visit(expression_node);
 }
 
diff --git a/src/Planner/CollectTableExpressionData.h b/src/Planner/CollectTableExpressionData.h
index b0cebc15682..ed3f0ff7a47 100644
--- a/src/Planner/CollectTableExpressionData.h
+++ b/src/Planner/CollectTableExpressionData.h
@@ -19,6 +19,6 @@ void collectTableExpressionData(QueryTreeNodePtr & query_node, PlannerContextPtr
  *
  * ALIAS table column nodes are registered in table expression data and replaced in query tree with inner alias expression.
  */
-void collectSourceColumns(QueryTreeNodePtr & expression_node, PlannerContextPtr & planner_context, bool keep_alias_columns = true);
+void collectSourceColumns(QueryTreeNodePtr & expression_node, PlannerContextPtr & planner_context);
 
 }
diff --git a/src/Planner/PlannerActionsVisitor.cpp b/src/Planner/PlannerActionsVisitor.cpp
index c417d463c73..511e9396a35 100644
--- a/src/Planner/PlannerActionsVisitor.cpp
+++ b/src/Planner/PlannerActionsVisitor.cpp
@@ -451,7 +451,6 @@ private:
     std::unordered_map<QueryTreeNodePtr, std::string> node_to_node_name;
     const PlannerContextPtr planner_context;
     ActionNodeNameHelper action_node_name_helper;
-    bool use_column_identifier_as_action_node_name;
 };
 
 PlannerActionsVisitorImpl::PlannerActionsVisitorImpl(ActionsDAGPtr actions_dag,
@@ -459,7 +458,6 @@ PlannerActionsVisitorImpl::PlannerActionsVisitorImpl(ActionsDAGPtr actions_dag,
     bool use_column_identifier_as_action_node_name_)
     : planner_context(planner_context_)
     , action_node_name_helper(node_to_node_name, *planner_context, use_column_identifier_as_action_node_name_)
-    , use_column_identifier_as_action_node_name(use_column_identifier_as_action_node_name_)
 {
     actions_stack.emplace_back(std::move(actions_dag), nullptr);
 }
@@ -505,8 +503,7 @@ PlannerActionsVisitorImpl::NodeNameAndNodeMinLevel PlannerActionsVisitorImpl::vi
 {
     auto column_node_name = action_node_name_helper.calculateActionNodeName(node);
     const auto & column_node = node->as<ColumnNode &>();
-    if (column_node.hasExpression() && !use_column_identifier_as_action_node_name)
-        return visitImpl(column_node.getExpression());
+
     Int64 actions_stack_size = static_cast<Int64>(actions_stack.size() - 1);
     for (Int64 i = actions_stack_size; i >= 0; --i)
     {
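With the keep_alias_columns flag removed, collectSourceColumns no longer distinguishes the PREWHERE and filter paths, where an ALIAS column used to be replaced by its expression on the spot. A sketch of a query that reaches this code, schema invented for illustration:

    CREATE TABLE t (a UInt8, b UInt8 ALIAS a + 1) ENGINE = MergeTree ORDER BY a;
    SELECT a FROM t PREWHERE b = 2; -- the PREWHERE actions are built from the expansion a + 1 = 2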
diff --git a/src/Planner/PlannerJoinTree.cpp b/src/Planner/PlannerJoinTree.cpp
index 7b3fb0c5c91..59da88f4e45 100644
--- a/src/Planner/PlannerJoinTree.cpp
+++ b/src/Planner/PlannerJoinTree.cpp
@@ -86,7 +86,7 @@ namespace
 /// Check if current user has privileges to SELECT columns from table
 /// Throws an exception if access to any column from `column_names` is not granted
 /// If `column_names` is empty, check access to any columns and return names of accessible columns
-NameSet checkAccessRights(const TableNode & table_node, const Names & column_names, const ContextPtr & query_context)
+NameSet checkAccessRights(const TableNode & table_node, Names & column_names, const ContextPtr & query_context)
 {
     /// StorageDummy is created on preliminary stage, ignore access check for it.
     if (typeid_cast<const StorageDummy *>(table_node.getStorage().get()))
@@ -353,7 +353,9 @@ void prepareBuildQueryPlanForTableExpression(const QueryTreeNodePtr & table_expr
     NameSet columns_names_allowed_to_select;
     if (table_node)
     {
-        const auto & column_names_with_aliases = table_expression_data.getSelectedColumnsNames();
+        auto column_names_with_aliases = columns_names;
+        const auto & alias_columns_names = table_expression_data.getAliasColumnsNames();
+        column_names_with_aliases.insert(column_names_with_aliases.end(), alias_columns_names.begin(), alias_columns_names.end());
         columns_names_allowed_to_select = checkAccessRights(*table_node, column_names_with_aliases, query_context);
     }
@@ -862,28 +864,6 @@ JoinTreeQueryPlan buildQueryPlanForTableExpression(QueryTreeNodePtr table_expres
             max_block_size,
             max_streams);
 
-        const auto & alias_column_expressions = table_expression_data.getAliasColumnExpressions();
-        if (!alias_column_expressions.empty() && query_plan.isInitialized() && from_stage == QueryProcessingStage::FetchColumns)
-        {
-            ActionsDAGPtr merged_alias_columns_actions_dag = std::make_shared<ActionsDAG>(query_plan.getCurrentDataStream().header.getColumnsWithTypeAndName());
-            ActionsDAG::NodeRawConstPtrs action_dag_outputs = merged_alias_columns_actions_dag->getInputs();
-
-            for (const auto & [column_name, alias_column_actions_dag] : alias_column_expressions)
-            {
-                const auto & current_outputs = alias_column_actions_dag->getOutputs();
-                action_dag_outputs.insert(action_dag_outputs.end(), current_outputs.begin(), current_outputs.end());
-                merged_alias_columns_actions_dag->mergeNodes(std::move(*alias_column_actions_dag));
-            }
-
-            for (const auto * output_node : action_dag_outputs)
-                merged_alias_columns_actions_dag->addOrReplaceInOutputs(*output_node);
-            merged_alias_columns_actions_dag->removeUnusedActions(false);
-
-            auto alias_column_step = std::make_unique<ExpressionStep>(query_plan.getCurrentDataStream(), std::move(merged_alias_columns_actions_dag));
-            alias_column_step->setStepDescription("Compute alias columns");
-            query_plan.addStep(std::move(alias_column_step));
-        }
-
         for (const auto & filter_info_and_description : where_filters)
         {
             const auto & [filter_info, description] = filter_info_and_description;
@@ -927,8 +907,7 @@ JoinTreeQueryPlan buildQueryPlanForTableExpression(QueryTreeNodePtr table_expres
     else
     {
         /// Create step which reads from empty source if storage has no data.
-        const auto & column_names = table_expression_data.getSelectedColumnsNames();
-        auto source_header = storage_snapshot->getSampleBlockForColumns(column_names);
+        auto source_header = storage_snapshot->getSampleBlockForColumns(table_expression_data.getColumnNames());
         Pipe pipe(std::make_shared<NullSource>(source_header));
         auto read_from_pipe = std::make_unique<ReadFromPreparedSource>(std::move(pipe));
         read_from_pipe->setStepDescription("Read from NullSource");
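The checkAccessRights change works together with getAliasColumnsNames above: the checked name set once again includes alias names alongside the physically read columns. A hypothetical scenario this affects (the broken-tests entry for test_alias_columns later in this series records the related regression):

    CREATE TABLE t (x UInt8, y UInt8, s UInt16 ALIAS x + y) ENGINE = MergeTree ORDER BY x;
    GRANT SELECT(s) ON default.t TO reader;
    SELECT s FROM default.t; -- as reader; whether a grant on s alone suffices depends on this check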
@@ -1045,6 +1024,57 @@ void joinCastPlanColumnsToNullable(QueryPlan & plan_to_add_cast, PlannerContextP
     plan_to_add_cast.addStep(std::move(cast_join_columns_step));
 }
 
+/// Actions to calculate table columns that have a functional representation (ALIASes and subcolumns)
+/// and used in USING clause of JOIN expression.
+struct UsingAliasKeyActions
+{
+    UsingAliasKeyActions(
+        const ColumnsWithTypeAndName & left_plan_output_columns,
+        const ColumnsWithTypeAndName & right_plan_output_columns
+    )
+        : left_alias_columns_keys(std::make_shared<ActionsDAG>(left_plan_output_columns))
+        , right_alias_columns_keys(std::make_shared<ActionsDAG>(right_plan_output_columns))
+    {}
+
+    void addLeftColumn(QueryTreeNodePtr & node, const ColumnsWithTypeAndName & plan_output_columns, const PlannerContextPtr & planner_context)
+    {
+        addColumnImpl(left_alias_columns_keys, node, plan_output_columns, planner_context);
+    }
+
+    void addRightColumn(QueryTreeNodePtr & node, const ColumnsWithTypeAndName & plan_output_columns, const PlannerContextPtr & planner_context)
+    {
+        addColumnImpl(right_alias_columns_keys, node, plan_output_columns, planner_context);
+    }
+
+    ActionsDAGPtr getLeftActions()
+    {
+        left_alias_columns_keys->projectInput();
+        return std::move(left_alias_columns_keys);
+    }
+
+    ActionsDAGPtr getRightActions()
+    {
+        right_alias_columns_keys->projectInput();
+        return std::move(right_alias_columns_keys);
+    }
+
+private:
+    void addColumnImpl(ActionsDAGPtr & alias_columns_keys, QueryTreeNodePtr & node, const ColumnsWithTypeAndName & plan_output_columns, const PlannerContextPtr & planner_context)
+    {
+        auto & column_node = node->as<ColumnNode &>();
+        if (column_node.hasExpression())
+        {
+            auto dag = buildActionsDAGFromExpressionNode(column_node.getExpressionOrThrow(), plan_output_columns, planner_context);
+            const auto & left_inner_column_identifier = planner_context->getColumnNodeIdentifierOrThrow(node);
+            dag->addOrReplaceInOutputs(dag->addAlias(*dag->getOutputs().front(), left_inner_column_identifier));
+            alias_columns_keys->mergeInplace(std::move(*dag));
+        }
+    }
+
+    ActionsDAGPtr left_alias_columns_keys;
+    ActionsDAGPtr right_alias_columns_keys;
+};
+
 JoinTreeQueryPlan buildQueryPlanForJoinNode(const QueryTreeNodePtr & join_table_expression,
     JoinTreeQueryPlan left_join_tree_query_plan,
     JoinTreeQueryPlan right_join_tree_query_plan,
@@ -1113,6 +1143,8 @@ JoinTreeQueryPlan buildQueryPlanForJoinNode(const QueryTreeNodePtr & join_table_
     if (join_node.isUsingJoinExpression())
     {
+        UsingAliasKeyActions using_alias_key_actions{left_plan_output_columns, right_plan_output_columns};
+
         auto & join_node_using_columns_list = join_node.getJoinExpression()->as<ListNode &>();
         for (auto & join_node_using_node : join_node_using_columns_list.getNodes())
         {
@@ -1122,9 +1154,13 @@ JoinTreeQueryPlan buildQueryPlanForJoinNode(const QueryTreeNodePtr & join_table_
             auto & left_inner_column_node = inner_columns_list.getNodes().at(0);
             auto & left_inner_column = left_inner_column_node->as<ColumnNode &>();
 
+            using_alias_key_actions.addLeftColumn(left_inner_column_node, left_plan_output_columns, planner_context);
+
             auto & right_inner_column_node = inner_columns_list.getNodes().at(1);
             auto & right_inner_column = right_inner_column_node->as<ColumnNode &>();
 
+            using_alias_key_actions.addRightColumn(right_inner_column_node, right_plan_output_columns, planner_context);
+
             const auto & join_node_using_column_node_type = join_node_using_column_node.getColumnType();
             if (!left_inner_column.getColumnType()->equals(*join_node_using_column_node_type))
             {
@@ -1138,6 +1174,14 @@ JoinTreeQueryPlan buildQueryPlanForJoinNode(const QueryTreeNodePtr & join_table_
                 right_plan_column_name_to_cast_type.emplace(right_inner_column_identifier, join_node_using_column_node_type);
             }
         }
+
+        auto left_alias_columns_keys_step = std::make_unique<ExpressionStep>(left_plan.getCurrentDataStream(), using_alias_key_actions.getLeftActions());
+        left_alias_columns_keys_step->setStepDescription("Actions for left table alias column keys");
+        left_plan.addStep(std::move(left_alias_columns_keys_step));
+
+        auto right_alias_columns_keys_step = std::make_unique<ExpressionStep>(right_plan.getCurrentDataStream(), using_alias_key_actions.getRightActions());
+        right_alias_columns_keys_step->setStepDescription("Actions for right table alias column keys");
+        right_plan.addStep(std::move(right_alias_columns_keys_step));
     }
 
     auto join_cast_plan_output_nodes = [&](QueryPlan & plan_to_add_cast, std::unordered_map<std::string, DataTypePtr> & plan_column_name_to_cast_type)
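Both steps added above are ordinary ExpressionStep instances, so the descriptions just set become visible in EXPLAIN output; the 02514_analyzer_drop_join_on.reference changes later in this patch show exactly these "Actions for ... table alias column keys" labels. Roughly, reusing the hypothetical lhs/rhs tables sketched earlier:

    EXPLAIN PLAN header = 1
    SELECT * FROM lhs JOIN rhs USING (key); -- key is an ALIAS column on both sides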
diff --git a/src/Planner/TableExpressionData.h b/src/Planner/TableExpressionData.h
index 9ab7a8e64fe..20c4f05ea7e 100644
--- a/src/Planner/TableExpressionData.h
+++ b/src/Planner/TableExpressionData.h
@@ -55,7 +55,7 @@ public:
     /// Return true if column with name exists, false otherwise
     bool hasColumn(const std::string & column_name) const
     {
-        return column_name_to_column.contains(column_name);
+        return alias_columns_names.contains(column_name) || column_name_to_column.contains(column_name);
     }
 
     /** Add column in table expression data.
      *
      * Logical error exception is thrown if column already exists.
      */
-    void addColumn(const NameAndTypePair & column, const ColumnIdentifier & column_identifier, bool is_selected_column = true)
+    void addColumn(const NameAndTypePair & column, const ColumnIdentifier & column_identifier)
     {
         if (hasColumn(column.name))
             throw Exception(ErrorCodes::LOGICAL_ERROR, "Column with name {} already exists", column.name);
 
-        column_names.push_back(column.name);
-        addColumnImpl(column, column_identifier, is_selected_column);
+        addColumnImpl(column, column_identifier);
     }
 
-    /// Add alias column
-    void addAliasColumn(const NameAndTypePair & column, const ColumnIdentifier & column_identifier, ActionsDAGPtr actions_dag, bool is_selected_column = true)
+    /** Add column if it does not exist in table expression data.
+     * Column identifier must be created using global planner context.
+     */
+    void addColumnIfNotExists(const NameAndTypePair & column, const ColumnIdentifier & column_identifier)
     {
-        alias_column_expressions.emplace(column.name, std::move(actions_dag));
-        addColumnImpl(column, column_identifier, is_selected_column);
+        if (hasColumn(column.name))
+            return;
+
+        addColumnImpl(column, column_identifier);
     }
 
-    /// Mark existing column as selected
-    void markSelectedColumn(const std::string & column_name)
+    /// Add alias column name
+    void addAliasColumnName(const std::string & column_name, const ColumnIdentifier & column_identifier)
     {
-        auto [_, inserted] = selected_column_names_set.emplace(column_name);
-        if (inserted)
-            selected_column_names.push_back(column_name);
+        alias_columns_names.insert(column_name);
+
+        column_name_to_column_identifier.emplace(column_name, column_identifier);
     }
 
-    /// Get columns that are requested from table expression, including ALIAS columns
-    const Names & getSelectedColumnsNames() const
+    /// Get alias columns names
+    const NameSet & getAliasColumnsNames() const
     {
-        return selected_column_names;
-    }
-
-    /// Get ALIAS columns names mapped to expressions
-    const std::unordered_map<std::string, ActionsDAGPtr> & getAliasColumnExpressions() const
-    {
-        return alias_column_expressions;
+        return alias_columns_names;
     }
 
     /// Get column name to column map
@@ -105,7 +102,7 @@ public:
         return column_name_to_column;
     }
 
-    /// Get column names that are read from table expression
+    /// Get column names
     const Names & getColumnNames() const
     {
         return column_names;
     }
@@ -122,6 +119,23 @@ public:
         return result;
     }
 
+    ColumnIdentifiers getColumnIdentifiers() const
+    {
+        ColumnIdentifiers result;
+        result.reserve(column_identifier_to_column_name.size());
+
+        for (const auto & [column_identifier, _] : column_identifier_to_column_name)
+            result.push_back(column_identifier);
+
+        return result;
+    }
+
+    /// Get column name to column identifier map
+    const ColumnNameToColumnIdentifier & getColumnNameToIdentifier() const
+    {
+        return column_name_to_column_identifier;
+    }
+
     /// Get column identifier to column name map
     const ColumnIdentifierToColumnName & getColumnIdentifierToColumnName() const
     {
@@ -145,6 +159,18 @@ public:
         return it->second;
     }
 
+    /** Get column for column name.
+     * Null is returned if there are no column for column name.
+     */
+    const NameAndTypePair * getColumnOrNull(const std::string & column_name) const
+    {
+        auto it = column_name_to_column.find(column_name);
+        if (it == column_name_to_column.end())
+            return nullptr;
+
+        return &it->second;
+    }
+
     /** Get column identifier for column name.
      * Exception is thrown if there are no column identifier for column name.
      */
@@ -174,6 +200,24 @@ public:
         return &it->second;
     }
 
+    /** Get column name for column identifier.
+     * Exception is thrown if there are no column name for column identifier.
+     */
+    const std::string & getColumnNameOrThrow(const ColumnIdentifier & column_identifier) const
+    {
+        auto it = column_identifier_to_column_name.find(column_identifier);
+        if (it == column_identifier_to_column_name.end())
+        {
+            auto column_identifiers = getColumnIdentifiers();
+            throw Exception(ErrorCodes::LOGICAL_ERROR,
+                "Column name for column identifier {} does not exists. There are only column identifiers: {}",
+                column_identifier,
+                fmt::join(column_identifiers.begin(), column_identifiers.end(), ", "));
+        }
+
+        return it->second;
+    }
+
     /** Get column name for column identifier.
      * Null is returned if there are no column name for column identifier.
      */
@@ -252,36 +296,23 @@ public:
     }
 
 private:
-    void addColumnImpl(const NameAndTypePair & column, const ColumnIdentifier & column_identifier, bool add_to_selected_columns)
+    void addColumnImpl(const NameAndTypePair & column, const ColumnIdentifier & column_identifier)
     {
-        if (add_to_selected_columns)
-            markSelectedColumn(column.name);
-
+        column_names.push_back(column.name);
         column_name_to_column.emplace(column.name, column);
         column_name_to_column_identifier.emplace(column.name, column_identifier);
         column_identifier_to_column_name.emplace(column_identifier, column.name);
     }
 
-    /// Set of columns that are physically read from table expression
-    /// In case of ALIAS columns it contains source column names that are used to calculate alias
-    /// This source column may be not used by user
+    /// Valid for table, table function, array join, query, union nodes
     Names column_names;
 
-    /// Set of columns that are SELECTed from table expression
-    /// It may contain ALIAS columns.
-    /// Mainly it's used to determine access to which columns to check
-    /// For example user may have an access to column `a ALIAS x + y` but not to `x` and `y`
-    /// In that case we can read `x` and `y` and calculate `a`, but not return `x` and `y` to user
-    Names selected_column_names;
-    /// To deduplicate columns in `selected_column_names`
-    NameSet selected_column_names_set;
-
-    /// Expression to calculate ALIAS columns
-    std::unordered_map<std::string, ActionsDAGPtr> alias_column_expressions;
-
     /// Valid for table, table function, array join, query, union nodes
     ColumnNameToColumn column_name_to_column;
 
+    /// Valid only for table node
+    NameSet alias_columns_names;
+
     /// Valid for table, table function, array join, query, union nodes
     ColumnNameToColumnIdentifier column_name_to_column_identifier;
 
diff --git a/src/Planner/Utils.cpp b/src/Planner/Utils.cpp
index bd0b831ee58..5f5875b8019 100644
--- a/src/Planner/Utils.cpp
+++ b/src/Planner/Utils.cpp
@@ -469,19 +469,12 @@ FilterDAGInfo buildFilterInfo(ASTPtr filter_expression,
     NameSet table_expression_required_names_without_filter)
 {
     const auto & query_context = planner_context->getQueryContext();
+
     auto filter_query_tree = buildQueryTree(filter_expression, query_context);
 
     QueryAnalysisPass query_analysis_pass(table_expression);
     query_analysis_pass.run(filter_query_tree, query_context);
 
-    return buildFilterInfo(std::move(filter_query_tree), table_expression, planner_context, std::move(table_expression_required_names_without_filter));
-}
-
-FilterDAGInfo buildFilterInfo(QueryTreeNodePtr filter_query_tree,
-    const QueryTreeNodePtr & table_expression,
-    PlannerContextPtr & planner_context,
-    NameSet table_expression_required_names_without_filter)
-{
     if (table_expression_required_names_without_filter.empty())
     {
         auto & table_expression_data = planner_context->getTableExpressionDataOrThrow(table_expression);
@@ -489,7 +482,7 @@ FilterDAGInfo buildFilterInfo(QueryTreeNodePtr filter_query_tree,
         table_expression_required_names_without_filter.insert(table_expression_names.begin(), table_expression_names.end());
     }
 
-    collectSourceColumns(filter_query_tree, planner_context, false /*keep_alias_columns*/);
+    collectSourceColumns(filter_query_tree, planner_context);
     collectSets(filter_query_tree, *planner_context);
 
     auto filter_actions_dag = std::make_shared<ActionsDAG>();
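buildFilterInfo is the entry point for row-policy and additional-filter expressions, which is why the query-tree overload could be folded back into it once ALIAS columns stopped needing a separate path. An illustrative policy that flows through this function (the mydb.filtered_table3 schema matches the integration tests further below):

    CREATE ROW POLICY pol ON mydb.filtered_table3 FOR SELECT USING c = 1 TO ALL;
    SELECT a FROM mydb.filtered_table3; -- the filter c = 1 is analyzed against the table expression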
diff --git a/src/Planner/Utils.h b/src/Planner/Utils.h
index bf45770552b..3060b1c2711 100644
--- a/src/Planner/Utils.h
+++ b/src/Planner/Utils.h
@@ -89,11 +89,6 @@ FilterDAGInfo buildFilterInfo(ASTPtr filter_expression,
     PlannerContextPtr & planner_context,
     NameSet table_expression_required_names_without_filter = {});
 
-FilterDAGInfo buildFilterInfo(QueryTreeNodePtr filter_query_tree,
-    const QueryTreeNodePtr & table_expression,
-    PlannerContextPtr & planner_context,
-    NameSet table_expression_required_names_without_filter = {});
-
 ASTPtr parseAdditionalResultFilter(const Settings & settings);
 
 }
diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.cpp b/src/Processors/QueryPlan/ReadFromMergeTree.cpp
index 331bd46f909..c7b9eb72d4d 100644
--- a/src/Processors/QueryPlan/ReadFromMergeTree.cpp
+++ b/src/Processors/QueryPlan/ReadFromMergeTree.cpp
@@ -1434,13 +1434,8 @@ void ReadFromMergeTree::applyFilters(ActionDAGNodes added_filter_nodes)
     if (query_info.planner_context)
     {
         const auto & table_expression_data = query_info.planner_context->getTableExpressionDataOrThrow(query_info.table_expression);
-        const auto & alias_column_expressions = table_expression_data.getAliasColumnExpressions();
         for (const auto & [column_identifier, column_name] : table_expression_data.getColumnIdentifierToColumnName())
         {
-            /// ALIAS columns cannot be used in the filter expression without being calculated in ActionsDAG,
-            /// so they should not be added to the input nodes.
-            if (alias_column_expressions.contains(column_name))
-                continue;
             const auto & column = table_expression_data.getColumnOrThrow(column_name);
             node_name_to_input_node_column.emplace(column_identifier, ColumnWithTypeAndName(column.type, column_name));
         }
diff --git a/src/Storages/StorageDistributed.cpp b/src/Storages/StorageDistributed.cpp
index 7370bd3ab8f..92e7dcdf4f2 100644
--- a/src/Storages/StorageDistributed.cpp
+++ b/src/Storages/StorageDistributed.cpp
@@ -744,32 +744,6 @@ StorageSnapshotPtr StorageDistributed::getStorageSnapshotForQuery(
 namespace
 {
 
-class ReplaseAliasColumnsVisitor : public InDepthQueryTreeVisitor<ReplaseAliasColumnsVisitor>
-{
-    static QueryTreeNodePtr getColumnNodeAliasExpression(const QueryTreeNodePtr & node)
-    {
-        const auto * column_node = node->as<ColumnNode>();
-        if (!column_node || !column_node->hasExpression())
-            return nullptr;
-
-        const auto & column_source = column_node->getColumnSourceOrNull();
-        if (!column_source || column_source->getNodeType() == QueryTreeNodeType::JOIN
-            || column_source->getNodeType() == QueryTreeNodeType::ARRAY_JOIN)
-            return nullptr;
-
-        auto column_expression = column_node->getExpression();
-        column_expression->setAlias(column_node->getColumnName());
-        return column_expression;
-    }
-
-public:
-    void visitImpl(QueryTreeNodePtr & node)
-    {
-        if (auto column_expression = getColumnNodeAliasExpression(node))
-            node = column_expression;
-    }
-};
-
 QueryTreeNodePtr buildQueryTreeDistributed(SelectQueryInfo & query_info,
     const StorageSnapshotPtr & distributed_storage_snapshot,
     const StorageID & remote_storage_id,
@@ -822,8 +796,6 @@ QueryTreeNodePtr buildQueryTreeDistributed(SelectQueryInfo & query_info,
         replacement_table_expression->setAlias(query_info.table_expression->getAlias());
 
     auto query_tree_to_modify = query_info.query_tree->cloneAndReplace(query_info.table_expression, std::move(replacement_table_expression));
-    ReplaseAliasColumnsVisitor replase_alias_columns_visitor;
-    replase_alias_columns_visitor.visit(query_tree_to_modify);
 
     return buildQueryTreeForShard(query_info.planner_context, query_tree_to_modify);
 }
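The removed ReplaseAliasColumnsVisitor used to inline ALIAS expressions into the query tree before sending it to remote shards; with aliases kept as registered columns, that rewrite is no longer performed. The affected shape, sketched with an invented cluster name:

    CREATE TABLE local_t (a UInt8, b UInt8 ALIAS a + 1) ENGINE = MergeTree ORDER BY a;
    CREATE TABLE dist_t AS local_t ENGINE = Distributed(test_cluster, currentDatabase(), local_t);
    SELECT b FROM dist_t; -- previously rewritten to (a + 1) AS b before shipping to shards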
diff --git a/tests/analyzer_integration_broken_tests.txt b/tests/analyzer_integration_broken_tests.txt
index 8f72fcd4050..796ca6bca22 100644
--- a/tests/analyzer_integration_broken_tests.txt
+++ b/tests/analyzer_integration_broken_tests.txt
@@ -4,3 +4,4 @@ test_distributed_type_object/test.py::test_distributed_type_object
 test_merge_table_over_distributed/test.py::test_global_in
 test_merge_table_over_distributed/test.py::test_select_table_name_from_merge_over_distributed
 test_passing_max_partitions_to_read_remotely/test.py::test_default_database_on_cluster
+test_select_access_rights/test_main.py::test_alias_columns
diff --git a/tests/integration/test_disabled_access_control_improvements/test_row_policy.py b/tests/integration/test_disabled_access_control_improvements/test_row_policy.py
index c09a80cea06..b620e88e7eb 100644
--- a/tests/integration/test_disabled_access_control_improvements/test_row_policy.py
+++ b/tests/integration/test_disabled_access_control_improvements/test_row_policy.py
@@ -41,7 +41,7 @@ def started_cluster():
             CREATE TABLE mydb.filtered_table2 (a UInt8, b UInt8, c UInt8, d UInt8) ENGINE MergeTree ORDER BY a;
             INSERT INTO mydb.filtered_table2 values (0, 0, 0, 0), (1, 2, 3, 4), (4, 3, 2, 1), (0, 0, 6, 0);
 
-            CREATE TABLE mydb.filtered_table3 (a UInt8, b UInt8, bb ALIAS b + 1, c UInt16 ALIAS a + bb - 1) ENGINE MergeTree ORDER BY a;
+            CREATE TABLE mydb.filtered_table3 (a UInt8, b UInt8, c UInt16 ALIAS a + b) ENGINE MergeTree ORDER BY a;
             INSERT INTO mydb.filtered_table3 values (0, 0), (0, 1), (1, 0), (1, 1);
 
             CREATE TABLE mydb.`.filtered_table4` (a UInt8, b UInt8, c UInt16 ALIAS a + b) ENGINE MergeTree ORDER BY a;
diff --git a/tests/integration/test_row_policy/test.py b/tests/integration/test_row_policy/test.py
index 8260be78e82..98653bf6106 100644
--- a/tests/integration/test_row_policy/test.py
+++ b/tests/integration/test_row_policy/test.py
@@ -60,7 +60,7 @@ def started_cluster():
             CREATE TABLE mydb.filtered_table2 (a UInt8, b UInt8, c UInt8, d UInt8) ENGINE MergeTree ORDER BY a;
             INSERT INTO mydb.filtered_table2 values (0, 0, 0, 0), (1, 2, 3, 4), (4, 3, 2, 1), (0, 0, 6, 0);
 
-            CREATE TABLE mydb.filtered_table3 (a UInt8, b UInt8, bb ALIAS b + 1, c UInt16 ALIAS a + bb - 1) ENGINE MergeTree ORDER BY a;
+            CREATE TABLE mydb.filtered_table3 (a UInt8, b UInt8, c UInt16 ALIAS a + b) ENGINE MergeTree ORDER BY a;
             INSERT INTO mydb.filtered_table3 values (0, 0), (0, 1), (1, 0), (1, 1);
 
             CREATE TABLE mydb.`.filtered_table4` (a UInt8, b UInt8, c UInt16 ALIAS a + b) ENGINE MergeTree ORDER BY a;
@@ -113,7 +113,6 @@ def test_smoke():
 
     assert node.query("SELECT a FROM mydb.filtered_table3") == TSV([[0], [1]])
     assert node.query("SELECT b FROM mydb.filtered_table3") == TSV([[1], [0]])
-    assert node.query("SELECT bb FROM mydb.filtered_table3") == TSV([[2], [1]])
     assert node.query("SELECT c FROM mydb.filtered_table3") == TSV([[1], [1]])
    assert node.query("SELECT a + b FROM mydb.filtered_table3") == TSV([[1], [1]])
    assert node.query("SELECT a FROM mydb.filtered_table3 WHERE c = 1") == TSV(
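The schema being dropped from the row-policy fixtures is the interesting one: it chained one alias through another. Reconstructed from the deleted lines above:

    CREATE TABLE mydb.filtered_table3
    (
        a UInt8,
        b UInt8,
        bb ALIAS b + 1,
        c UInt16 ALIAS a + bb - 1
    ) ENGINE MergeTree ORDER BY a;
    SELECT bb, c FROM mydb.filtered_table3; -- c is computed through the intermediate alias bb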
diff --git a/tests/queries/0_stateless/02514_analyzer_drop_join_on.reference b/tests/queries/0_stateless/02514_analyzer_drop_join_on.reference
index 2c62e278050..a5a71560d00 100644
--- a/tests/queries/0_stateless/02514_analyzer_drop_join_on.reference
+++ b/tests/queries/0_stateless/02514_analyzer_drop_join_on.reference
@@ -55,33 +55,33 @@ Header: a2 String
   Header: __table1.a2 String
           __table1.k UInt64
           __table4.d2 String
-  Expression (DROP unused columns after JOIN)
+  Expression ((Actions for left table alias column keys + DROP unused columns after JOIN))
   Header: __table1.a2 String
           __table1.k UInt64
     Join (JOIN FillRightFirst)
     Header: __table1.a2 String
             __table1.k UInt64
-      Expression (DROP unused columns after JOIN)
+      Expression ((Actions for left table alias column keys + DROP unused columns after JOIN))
       Header: __table1.a2 String
               __table1.k UInt64
         Join (JOIN FillRightFirst)
         Header: __table1.a2 String
                 __table1.k UInt64
-          Expression (Change column names to column identifiers)
+          Expression ((Actions for left table alias column keys + Change column names to column identifiers))
           Header: __table1.a2 String
                   __table1.k UInt64
            ReadFromMemoryStorage
            Header: a2 String
                    k UInt64
-          Expression (Change column names to column identifiers)
+          Expression ((Actions for right table alias column keys + Change column names to column identifiers))
           Header: __table2.k UInt64
            ReadFromMemoryStorage
            Header: k UInt64
-      Expression (Change column names to column identifiers)
+      Expression ((Actions for right table alias column keys + Change column names to column identifiers))
       Header: __table3.k UInt64
        ReadFromMemoryStorage
        Header: k UInt64
-    Expression (Change column names to column identifiers)
+    Expression ((Actions for right table alias column keys + Change column names to column identifiers))
     Header: __table4.d2 String
             __table4.k UInt64
      ReadFromMemoryStorage
diff --git a/tests/queries/0_stateless/02911_support_alias_column_in_indices.reference b/tests/queries/0_stateless/02911_support_alias_column_in_indices.reference
index b867a31dcc3..883966ce6b5 100644
--- a/tests/queries/0_stateless/02911_support_alias_column_in_indices.reference
+++ b/tests/queries/0_stateless/02911_support_alias_column_in_indices.reference
@@ -14,13 +14,13 @@ Expression ((Projection + Before ORDER BY))
             Parts: 1/1
             Granules: 1/1
 Expression ((Project names + Projection))
-  Filter ((WHERE + (Change column names to column identifiers + Compute alias columns)))
+  Filter ((WHERE + Change column names to column identifiers))
     ReadFromMergeTree (02911_support_alias_column_in_indices.test1)
     Indexes:
       PrimaryKey
         Keys:
           c
-        Condition: (plus(c, 1) in [11, +Inf))
+        Condition: (_CAST(plus(c, 1), \'UInt64\') in [11, +Inf))
         Parts: 1/2
         Granules: 1/2
       Skip
@@ -44,17 +44,12 @@ Expression ((Projection + Before ORDER BY))
             Parts: 1/1
             Granules: 1/1
 Expression ((Project names + Projection))
-  Filter ((WHERE + (Change column names to column identifiers + Compute alias columns)))
+  Filter ((WHERE + Change column names to column identifiers))
     ReadFromMergeTree (02911_support_alias_column_in_indices.test2)
     Indexes:
      PrimaryKey
        Keys:
          c
-        Condition: (plus(plus(c, 1), 1) in [16, +Inf))
+        Condition: (_CAST(plus(_CAST(plus(c, 1), \'UInt64\'), 1), \'UInt64\') in [16, +Inf))
         Parts: 1/2
         Granules: 1/2
-      Skip
-        Name: i
-        Description: minmax GRANULARITY 1
-        Parts: 1/1
-        Granules: 1/1

From 64a2b6551ab5c1a97e2a8022329fb20d2fe3f2fa Mon Sep 17 00:00:00 2001
From: Max Kainov
Date: Thu, 29 Feb 2024 12:32:05 +0000
Subject: [PATCH 19/19] CI: fix docker build job name #do_not_test

---
 tests/ci/docker_images_check.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/ci/docker_images_check.py b/tests/ci/docker_images_check.py
index af0416d83dc..ad497a00eba 100644
--- a/tests/ci/docker_images_check.py
+++ b/tests/ci/docker_images_check.py
@@ -25,7 +25,6 @@ from stopwatch import Stopwatch
 from tee_popen import TeePopen
 from upload_result_helper import upload_results
 
-NAME = "Push to Dockerhub"
 TEMP_PATH = Path(RUNNER_TEMP) / "docker_images_check"
 TEMP_PATH.mkdir(parents=True, exist_ok=True)
 
@@ -177,6 +176,9 @@ def main():
     stopwatch = Stopwatch()
 
     args = parse_args()
+
+    NAME = f"Push to Dockerhub {args.suffix}"
+
     if args.push:
         logging.info("login to docker hub")
         docker_login()