diff --git a/.github/workflows/backport_branches.yml b/.github/workflows/backport_branches.yml
index 6b05f1fe9f4..51670087ffe 100644
--- a/.github/workflows/backport_branches.yml
+++ b/.github/workflows/backport_branches.yml
@@ -11,7 +11,7 @@ on: # yamllint disable-line rule:truthy
- 'backport/**'
jobs:
RunConfig:
- runs-on: [self-hosted, style-checker]
+ runs-on: [self-hosted, style-checker-aarch64]
outputs:
data: ${{ steps.runconfig.outputs.CI_DATA }}
steps:
diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml
index 24daca44da6..7cb5455ed73 100644
--- a/.github/workflows/master.yml
+++ b/.github/workflows/master.yml
@@ -11,7 +11,7 @@ on: # yamllint disable-line rule:truthy
- 'master'
jobs:
RunConfig:
- runs-on: [self-hosted, style-checker]
+ runs-on: [self-hosted, style-checker-aarch64]
outputs:
data: ${{ steps.runconfig.outputs.CI_DATA }}
steps:
diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
index 770e1ec3789..93ac2be19b4 100644
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -14,7 +14,7 @@ jobs:
# The task for having a preserved ENV and event.json for later investigation
uses: ./.github/workflows/debug.yml
RunConfig:
- runs-on: [self-hosted, style-checker]
+ runs-on: [self-hosted, style-checker-aarch64]
outputs:
data: ${{ steps.runconfig.outputs.CI_DATA }}
steps:
diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml
index c9cf5ab90dd..1afcdab938b 100644
--- a/.github/workflows/pull_request.yml
+++ b/.github/workflows/pull_request.yml
@@ -18,7 +18,7 @@ on: # yamllint disable-line rule:truthy
##########################################################################################
jobs:
RunConfig:
- runs-on: [self-hosted, style-checker]
+ runs-on: [self-hosted, style-checker-aarch64]
outputs:
data: ${{ steps.runconfig.outputs.CI_DATA }}
steps:
diff --git a/.github/workflows/release_branches.yml b/.github/workflows/release_branches.yml
index c076c2209ec..57e90d79ebd 100644
--- a/.github/workflows/release_branches.yml
+++ b/.github/workflows/release_branches.yml
@@ -14,7 +14,7 @@ on: # yamllint disable-line rule:truthy
jobs:
RunConfig:
- runs-on: [self-hosted, style-checker]
+ runs-on: [self-hosted, style-checker-aarch64]
outputs:
data: ${{ steps.runconfig.outputs.CI_DATA }}
steps:
diff --git a/base/base/Decimal_fwd.h b/base/base/Decimal_fwd.h
index 589d6224917..beb228cea3c 100644
--- a/base/base/Decimal_fwd.h
+++ b/base/base/Decimal_fwd.h
@@ -1,6 +1,7 @@
#pragma once
#include
+#include
namespace wide
{
@@ -44,3 +45,8 @@ concept is_over_big_int =
|| std::is_same_v
|| std::is_same_v;
}
+
+template <> struct is_signed { static constexpr bool value = true; };
+template <> struct is_signed { static constexpr bool value = true; };
+template <> struct is_signed { static constexpr bool value = true; };
+template <> struct is_signed { static constexpr bool value = true; };
diff --git a/contrib/NuRaft b/contrib/NuRaft
index 1278e32bb0d..5bb3a0e8257 160000
--- a/contrib/NuRaft
+++ b/contrib/NuRaft
@@ -1 +1 @@
-Subproject commit 1278e32bb0d5dc489f947e002bdf8c71b0ddaa63
+Subproject commit 5bb3a0e8257bacd65b099cb1b7239bd6b9a2c477
diff --git a/docker/test/integration/runner/compose/docker_compose_mysql_cluster.yml b/docker/test/integration/runner/compose/docker_compose_mysql_cluster.yml
index 73f9e39f0d6..079c451b9d6 100644
--- a/docker/test/integration/runner/compose/docker_compose_mysql_cluster.yml
+++ b/docker/test/integration/runner/compose/docker_compose_mysql_cluster.yml
@@ -1,7 +1,7 @@
version: '2.3'
services:
mysql2:
- image: mysql:5.7
+ image: mysql:8.0
restart: always
environment:
MYSQL_ROOT_PASSWORD: clickhouse
@@ -23,7 +23,7 @@ services:
source: ${MYSQL_CLUSTER_LOGS:-}
target: /mysql/
mysql3:
- image: mysql:5.7
+ image: mysql:8.0
restart: always
environment:
MYSQL_ROOT_PASSWORD: clickhouse
@@ -45,7 +45,7 @@ services:
source: ${MYSQL_CLUSTER_LOGS:-}
target: /mysql/
mysql4:
- image: mysql:5.7
+ image: mysql:8.0
restart: always
environment:
MYSQL_ROOT_PASSWORD: clickhouse
diff --git a/docker/test/upgrade/run.sh b/docker/test/upgrade/run.sh
index aaba5cc6a8c..1aecc7331cd 100644
--- a/docker/test/upgrade/run.sh
+++ b/docker/test/upgrade/run.sh
@@ -77,6 +77,12 @@ remove_keeper_config "async_replication" "1"
# create_if_not_exists feature flag doesn't exist on some older versions
remove_keeper_config "create_if_not_exists" "[01]"
+# latest_logs_cache_size_threshold setting doesn't exist on some older versions
+remove_keeper_config "latest_logs_cache_size_threshold" "[[:digit:]]\+"
+
+# commit_logs_cache_size_threshold setting doesn't exist on some older versions
+remove_keeper_config "commit_logs_cache_size_threshold" "[[:digit:]]\+"
+
# it contains some new settings, but we can safely remove it
rm /etc/clickhouse-server/config.d/merge_tree.xml
rm /etc/clickhouse-server/config.d/enable_wait_for_shutdown_replicated_tables.xml
@@ -109,6 +115,12 @@ remove_keeper_config "async_replication" "1"
# create_if_not_exists feature flag doesn't exist on some older versions
remove_keeper_config "create_if_not_exists" "[01]"
+# latest_logs_cache_size_threshold setting doesn't exist on some older versions
+remove_keeper_config "latest_logs_cache_size_threshold" "[[:digit:]]\+"
+
+# commit_logs_cache_size_threshold setting doesn't exist on some older versions
+remove_keeper_config "commit_logs_cache_size_threshold" "[[:digit:]]\+"
+
# But we still need default disk because some tables loaded only into it
sudo cat /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml \
| sed "s|s3|s3default|" \
diff --git a/docs/en/operations/configuration-files.md b/docs/en/operations/configuration-files.md
index 81b25a4e897..9f17f4af1e8 100644
--- a/docs/en/operations/configuration-files.md
+++ b/docs/en/operations/configuration-files.md
@@ -10,11 +10,62 @@ The ClickHouse server can be configured with configuration files in XML or YAML
It is possible to mix XML and YAML configuration files, for example you could have a main configuration file `config.xml` and additional configuration files `config.d/network.xml`, `config.d/timezone.yaml` and `config.d/keeper.yaml`. Mixing XML and YAML within a single configuration file is not supported. XML configuration files should use `...` as top-level tag. In YAML configuration files, `clickhouse:` is optional, the parser inserts it implicitly if absent.
-## Overriding Configuration {#override}
+## Merging Configuration {#merging}
-The merge of configuration files behaves as one intuitively expects: The contents of both files are combined recursively, children with the same name are replaced by the element of the more specific configuration file. The merge can be customized using attributes `replace` and `remove`.
-- Attribute `replace` means that the element is replaced by the specified one.
-- Attribute `remove` means that the element is deleted.
+Two configuration files (usually the main configuration file and another configuration files from `config.d/`) are merged as follows:
+
+- If a node (i.e. a path leading to an element) appears in both files and does not have attributes `replace` or `remove`, it is included in the merged configuration file and children from both nodes are included and merged recursively.
+- If one of both nodes contains attribute `replace`, it is included in the merged configuration file but only children from the node with attribute `replace` are included.
+- If one of both nodes contains attribute `remove`, the node is not included in the merged configuration file (if it exists already, it is deleted).
+
+Example:
+
+
+```xml
+
+
+
+ 1
+
+
+ 2
+
+
+ 3
+
+
+```
+
+and
+
+```xml
+
+
+
+ 4
+
+
+ 5
+
+
+ 6
+
+
+```
+
+generates merged configuration file:
+
+```xml
+
+
+ 1
+ 4
+
+
+ 5
+
+
+```
To specify that a value of an element should be replaced by the value of an environment variable, you can use attribute `from_env`.
@@ -125,7 +176,7 @@ Users configuration can be split into separate files similar to `config.xml` and
Directory name is defined as `users_config` setting without `.xml` postfix concatenated with `.d`.
Directory `users.d` is used by default, as `users_config` defaults to `users.xml`.
-Note that configuration files are first merged taking into account [Override](#override) settings and includes are processed after that.
+Note that configuration files are first [merged](#merging) taking into account settings, and includes are processed after that.
## XML example {#example}
diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md
index 1bdec81ae88..b11a04e10ec 100644
--- a/docs/en/operations/settings/settings.md
+++ b/docs/en/operations/settings/settings.md
@@ -4279,41 +4279,6 @@ Result:
└─────┴─────┴───────┘
```
-## enable_order_by_all {#enable-order-by-all}
-
-Enables or disables sorting by `ALL` columns, i.e. [ORDER BY](../../sql-reference/statements/select/order-by.md)
-
-Possible values:
-
-- 0 — Disable ORDER BY ALL.
-- 1 — Enable ORDER BY ALL.
-
-Default value: `1`.
-
-**Example**
-
-Query:
-
-```sql
-CREATE TABLE TAB(C1 Int, C2 Int, ALL Int) ENGINE=Memory();
-
-INSERT INTO TAB VALUES (10, 20, 30), (20, 20, 10), (30, 10, 20);
-
-SELECT * FROM TAB ORDER BY ALL; -- returns an error that ALL is ambiguous
-
-SELECT * FROM TAB ORDER BY ALL SETTINGS enable_order_by_all;
-```
-
-Result:
-
-```text
-┌─C1─┬─C2─┬─ALL─┐
-│ 20 │ 20 │ 10 │
-│ 30 │ 10 │ 20 │
-│ 10 │ 20 │ 30 │
-└────┴────┴─────┘
-```
-
## splitby_max_substrings_includes_remaining_string {#splitby_max_substrings_includes_remaining_string}
Controls whether function [splitBy*()](../../sql-reference/functions/splitting-merging-functions.md) with argument `max_substrings` > 0 will include the remaining string in the last element of the result array.
diff --git a/docs/en/sql-reference/aggregate-functions/reference/simplelinearregression.md b/docs/en/sql-reference/aggregate-functions/reference/simplelinearregression.md
index bcff05ada47..ea3dbff8691 100644
--- a/docs/en/sql-reference/aggregate-functions/reference/simplelinearregression.md
+++ b/docs/en/sql-reference/aggregate-functions/reference/simplelinearregression.md
@@ -13,8 +13,8 @@ simpleLinearRegression(x, y)
Parameters:
-- `x` — Column with dependent variable values.
-- `y` — Column with explanatory variable values.
+- `x` — Column with explanatory variable values.
+- `y` — Column with dependent variable values.
Returned values:
diff --git a/docs/en/sql-reference/functions/distance-functions.md b/docs/en/sql-reference/functions/distance-functions.md
index 1774c22014d..e20c35c6b6f 100644
--- a/docs/en/sql-reference/functions/distance-functions.md
+++ b/docs/en/sql-reference/functions/distance-functions.md
@@ -509,7 +509,7 @@ Result:
## cosineDistance
-Calculates the cosine distance between two vectors (the values of the tuples are the coordinates). The less the returned value is, the more similar are the vectors.
+Calculates the cosine distance between two vectors (the values of the tuples are the coordinates). The smaller the returned value is, the more similar are the vectors.
**Syntax**
diff --git a/docs/en/sql-reference/functions/tuple-functions.md b/docs/en/sql-reference/functions/tuple-functions.md
index 5930239dc56..b089de67e98 100644
--- a/docs/en/sql-reference/functions/tuple-functions.md
+++ b/docs/en/sql-reference/functions/tuple-functions.md
@@ -542,7 +542,7 @@ Alias: `scalarProduct`.
- Scalar product.
-Type: [Int/UInt](../../sql-reference/data-types/int-uint.md), [Float](../../sql-reference/data-types/float.md) or [Decimal](../../sql-reference/data-types/decimal.md).
+Type: [Int/UInt](../../sql-reference/data-types/int-uint.md) or [Float](../../sql-reference/data-types/float.md).
**Example**
diff --git a/docs/en/sql-reference/statements/select/order-by.md b/docs/en/sql-reference/statements/select/order-by.md
index d6432a7b4f8..29aca70762e 100644
--- a/docs/en/sql-reference/statements/select/order-by.md
+++ b/docs/en/sql-reference/statements/select/order-by.md
@@ -9,10 +9,9 @@ The `ORDER BY` clause contains
- a list of expressions, e.g. `ORDER BY visits, search_phrase`,
- a list of numbers referring to columns in the `SELECT` clause, e.g. `ORDER BY 2, 1`, or
-- `ALL` which means all columns of the `SELECT` clause, e.g. `ORDER BY ALL`.
+- `*` (without other expressions or numbers) which means all columns of the `SELECT` clause: `ORDER BY *`.
To disable sorting by column numbers, set setting [enable_positional_arguments](../../../operations/settings/settings.md#enable-positional-arguments) = 0.
-To disable sorting by `ALL`, set setting [enable_order_by_all](../../../operations/settings/settings.md#enable-order-by-all) = 0.
The `ORDER BY` clause can be attributed by a `DESC` (descending) or `ASC` (ascending) modifier which determines the sorting direction.
Unless an explicit sort order is specified, `ASC` is used by default.
diff --git a/docs/zh/sql-reference/statements/select/order-by.md b/docs/zh/sql-reference/statements/select/order-by.md
index 3286fc9f9e7..9540c96a10d 100644
--- a/docs/zh/sql-reference/statements/select/order-by.md
+++ b/docs/zh/sql-reference/statements/select/order-by.md
@@ -61,14 +61,14 @@ sidebar_label: ORDER BY
我们只建议使用 `COLLATE` 对于少量行的最终排序,因为排序与 `COLLATE` 比正常的按字节排序效率低。
-## ORDER BY ALL
+## ORDER BY *
-`ORDER BY ALL` 对所有选定的列进行升序排序。
+`ORDER BY *` 对所有选定的列进行升序排序。
示例:
``` sql
-SELECT a, b, c FROM t ORDER BY ALL
+SELECT a, b, c FROM t ORDER BY *
```
等同于:
diff --git a/programs/benchmark/Benchmark.cpp b/programs/benchmark/Benchmark.cpp
index 961c678b936..fac88c0621f 100644
--- a/programs/benchmark/Benchmark.cpp
+++ b/programs/benchmark/Benchmark.cpp
@@ -2,7 +2,6 @@
#include
#include
#include
-#include
#include
#include
#include
diff --git a/programs/keeper-converter/KeeperConverter.cpp b/programs/keeper-converter/KeeperConverter.cpp
index 92bdea28738..8cd50d0892f 100644
--- a/programs/keeper-converter/KeeperConverter.cpp
+++ b/programs/keeper-converter/KeeperConverter.cpp
@@ -1,6 +1,7 @@
#include
#include
+#include
#include
#include
#include
@@ -39,7 +40,7 @@ int mainEntryClickHouseKeeperConverter(int argc, char ** argv)
try
{
- auto keeper_context = std::make_shared(true);
+ auto keeper_context = std::make_shared(true, std::make_shared());
keeper_context->setDigestEnabled(true);
keeper_context->setSnapshotDisk(std::make_shared("Keeper-snapshots", options["output-dir"].as()));
diff --git a/programs/keeper/CMakeLists.txt b/programs/keeper/CMakeLists.txt
index b8a5d9c9c19..70e0f229fd4 100644
--- a/programs/keeper/CMakeLists.txt
+++ b/programs/keeper/CMakeLists.txt
@@ -41,7 +41,7 @@ if (BUILD_STANDALONE_KEEPER)
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/KeeperStorage.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/KeeperConstants.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/KeeperAsynchronousMetrics.cpp
- ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/pathUtils.cpp
+ ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/KeeperCommon.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/SessionExpiryQueue.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/SummingStateMachine.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/WriteBufferFromNuraftBuffer.cpp
diff --git a/programs/keeper/Keeper.cpp b/programs/keeper/Keeper.cpp
index 5b844e7d650..8972c82eab8 100644
--- a/programs/keeper/Keeper.cpp
+++ b/programs/keeper/Keeper.cpp
@@ -560,7 +560,7 @@ try
auto main_config_reloader = std::make_unique(
config_path,
extra_paths,
- config().getString("path", ""),
+ config().getString("path", KEEPER_DEFAULT_PATH),
std::move(unused_cache),
unused_event,
[&](ConfigurationPtr config, bool /* initial_loading */)
diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp
index 0a3c23d746a..74fcc7326fc 100644
--- a/programs/server/Server.cpp
+++ b/programs/server/Server.cpp
@@ -1292,7 +1292,7 @@ try
auto main_config_reloader = std::make_unique(
config_path,
extra_paths,
- config().getString("path", ""),
+ config().getString("path", DBMS_DEFAULT_PATH),
std::move(main_config_zk_node_cache),
main_config_zk_changed_event,
[&](ConfigurationPtr config, bool initial_loading)
@@ -1391,7 +1391,7 @@ try
global_context->setMaxDatabaseNumToWarn(new_server_settings.max_database_num_to_warn);
global_context->setMaxPartNumToWarn(new_server_settings.max_part_num_to_warn);
- ConcurrencyControl::SlotCount concurrent_threads_soft_limit = ConcurrencyControl::Unlimited;
+ SlotCount concurrent_threads_soft_limit = UnlimitedSlots;
if (new_server_settings.concurrent_threads_soft_limit_num > 0 && new_server_settings.concurrent_threads_soft_limit_num < concurrent_threads_soft_limit)
concurrent_threads_soft_limit = new_server_settings.concurrent_threads_soft_limit_num;
if (new_server_settings.concurrent_threads_soft_limit_ratio_to_cores > 0)
diff --git a/src/AggregateFunctions/AggregateFunctionCount.h b/src/AggregateFunctions/AggregateFunctionCount.h
index f5d6030a777..36a8617ba91 100644
--- a/src/AggregateFunctions/AggregateFunctionCount.h
+++ b/src/AggregateFunctions/AggregateFunctionCount.h
@@ -219,7 +219,7 @@ public:
: IAggregateFunctionDataHelper({argument}, params, createResultType())
{
if (!argument->isNullable())
- throw Exception(ErrorCodes::LOGICAL_ERROR, "Not Nullable data type passed to AggregateFunctionCountNotNullUnary");
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: not Nullable data type passed to AggregateFunctionCountNotNullUnary");
}
String getName() const override { return "count"; }
diff --git a/src/AggregateFunctions/AggregateFunctionFactory.cpp b/src/AggregateFunctions/AggregateFunctionFactory.cpp
index 18edb7c8ce0..b6ba562045d 100644
--- a/src/AggregateFunctions/AggregateFunctionFactory.cpp
+++ b/src/AggregateFunctions/AggregateFunctionFactory.cpp
@@ -100,7 +100,7 @@ AggregateFunctionPtr AggregateFunctionFactory::get(
{
AggregateFunctionCombinatorPtr combinator = AggregateFunctionCombinatorFactory::instance().tryFindSuffix("Null");
if (!combinator)
- throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot find aggregate function combinator "
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: cannot find aggregate function combinator "
"to apply a function to Nullable arguments.");
DataTypes nested_types = combinator->transformArguments(types_without_low_cardinality);
@@ -123,7 +123,7 @@ AggregateFunctionPtr AggregateFunctionFactory::get(
auto with_original_arguments = getImpl(name, action, types_without_low_cardinality, parameters, out_properties, false);
if (!with_original_arguments)
- throw Exception(ErrorCodes::LOGICAL_ERROR, "AggregateFunctionFactory returned nullptr");
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: AggregateFunctionFactory returned nullptr");
return with_original_arguments;
}
diff --git a/src/AggregateFunctions/AggregateFunctionSum.h b/src/AggregateFunctions/AggregateFunctionSum.h
index 5781ab69c6b..58aaddf357a 100644
--- a/src/AggregateFunctions/AggregateFunctionSum.h
+++ b/src/AggregateFunctions/AggregateFunctionSum.h
@@ -146,9 +146,7 @@ struct AggregateFunctionSumData
size_t count = end - start;
const auto * end_ptr = ptr + count;
- if constexpr (
- (is_integer && !is_big_int_v)
- || (is_decimal && !std::is_same_v && !std::is_same_v))
+ if constexpr ((is_integer || is_decimal) && !is_over_big_int)
{
/// For integers we can vectorize the operation if we replace the null check using a multiplication (by 0 for null, 1 for not null)
/// https://quick-bench.com/q/MLTnfTvwC2qZFVeWHfOBR3U7a8I
@@ -163,8 +161,39 @@ struct AggregateFunctionSumData
Impl::add(sum, local_sum);
return;
}
+ else if constexpr (is_over_big_int)
+ {
+ /// Use a mask to discard or keep the value to reduce branch miss.
+ /// Notice that for (U)Int128 or Decimal128, MaskType is Int8 instead of Int64, otherwise extra branches will be introduced by compiler (for unknown reason) and performance will be worse.
+ using MaskType = std::conditional_t;
+ alignas(64) const MaskType masks[2] = {0, -1};
+ T local_sum{};
+ while (ptr < end_ptr)
+ {
+ Value v = *ptr;
+ if constexpr (!add_if_zero)
+ {
+ if constexpr (is_integer)
+ v &= masks[!!*condition_map];
+ else
+ v.value &= masks[!!*condition_map];
+ }
+ else
+ {
+ if constexpr (is_integer)
+ v &= masks[!*condition_map];
+ else
+ v.value &= masks[!*condition_map];
+ }
- if constexpr (std::is_floating_point_v)
+ Impl::add(local_sum, v);
+ ++ptr;
+ ++condition_map;
+ }
+ Impl::add(sum, local_sum);
+ return;
+ }
+ else if constexpr (std::is_floating_point_v)
{
/// For floating point we use a similar trick as above, except that now we reinterpret the floating point number as an unsigned
/// integer of the same size and use a mask instead (0 to discard, 0xFF..FF to keep)
diff --git a/src/AggregateFunctions/Combinators/AggregateFunctionIf.cpp b/src/AggregateFunctions/Combinators/AggregateFunctionIf.cpp
index 9b5ee79a533..20a4bde6bb4 100644
--- a/src/AggregateFunctions/Combinators/AggregateFunctionIf.cpp
+++ b/src/AggregateFunctions/Combinators/AggregateFunctionIf.cpp
@@ -249,7 +249,7 @@ public:
: Base(std::move(nested_function_), arguments, params), number_of_arguments(arguments.size())
{
if (number_of_arguments == 1)
- throw Exception(ErrorCodes::LOGICAL_ERROR, "Single argument is passed to AggregateFunctionIfNullVariadic");
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: single argument is passed to AggregateFunctionIfNullVariadic");
if (number_of_arguments > MAX_ARGS)
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
diff --git a/src/AggregateFunctions/Combinators/AggregateFunctionNull.h b/src/AggregateFunctions/Combinators/AggregateFunctionNull.h
index c8574e82be5..6b6580bf4c4 100644
--- a/src/AggregateFunctions/Combinators/AggregateFunctionNull.h
+++ b/src/AggregateFunctions/Combinators/AggregateFunctionNull.h
@@ -429,7 +429,7 @@ public:
, number_of_arguments(arguments.size())
{
if (number_of_arguments == 1)
- throw Exception(ErrorCodes::LOGICAL_ERROR, "Single argument is passed to AggregateFunctionNullVariadic");
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: single argument is passed to AggregateFunctionNullVariadic");
if (number_of_arguments > MAX_ARGS)
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
diff --git a/src/Analyzer/Passes/ArrayExistsToHasPass.cpp b/src/Analyzer/Passes/ArrayExistsToHasPass.cpp
index 36c3df4d93a..62db502e1dc 100644
--- a/src/Analyzer/Passes/ArrayExistsToHasPass.cpp
+++ b/src/Analyzer/Passes/ArrayExistsToHasPass.cpp
@@ -1,6 +1,7 @@
#include
#include
+#include
#include
@@ -83,7 +84,8 @@ public:
return;
}
- auto has_function = FunctionFactory::instance().get("has", getContext());
+ auto has_function = createInternalFunctionHasOverloadResolver();
+
array_exists_function_arguments_nodes[0] = std::move(array_exists_function_arguments_nodes[1]);
array_exists_function_arguments_nodes[1] = std::move(has_constant_element_argument);
array_exists_function_node->resolveAsFunction(has_function->build(array_exists_function_node->getArgumentColumns()));
diff --git a/src/Analyzer/Passes/CNF.cpp b/src/Analyzer/Passes/CNF.cpp
index aa6ee539934..71549f9e71d 100644
--- a/src/Analyzer/Passes/CNF.cpp
+++ b/src/Analyzer/Passes/CNF.cpp
@@ -10,6 +10,7 @@
#include
#include
+#include
#include
@@ -79,7 +80,7 @@ public:
if (name == "and" || name == "or")
{
- auto function_resolver = FunctionFactory::instance().get(name, current_context);
+ auto function_resolver = name == "and" ? createInternalFunctionAndOverloadResolver() : createInternalFunctionOrOverloadResolver();
const auto & arguments = function_node->getArguments().getNodes();
if (arguments.size() > 2)
@@ -110,10 +111,10 @@ private:
class PushNotVisitor
{
public:
- explicit PushNotVisitor(const ContextPtr & context)
- : not_function_resolver(FunctionFactory::instance().get("not", context))
- , or_function_resolver(FunctionFactory::instance().get("or", context))
- , and_function_resolver(FunctionFactory::instance().get("and", context))
+ explicit PushNotVisitor()
+ : not_function_resolver(createInternalFunctionNotOverloadResolver())
+ , or_function_resolver(createInternalFunctionOrOverloadResolver())
+ , and_function_resolver(createInternalFunctionAndOverloadResolver())
{}
void visit(QueryTreeNodePtr & node, bool add_negation)
@@ -162,10 +163,10 @@ private:
class PushOrVisitor
{
public:
- PushOrVisitor(ContextPtr context, size_t max_atoms_)
+ explicit PushOrVisitor(size_t max_atoms_)
: max_atoms(max_atoms_)
- , and_resolver(FunctionFactory::instance().get("and", context))
- , or_resolver(FunctionFactory::instance().get("or", context))
+ , and_resolver(createInternalFunctionAndOverloadResolver())
+ , or_resolver(createInternalFunctionOrOverloadResolver())
{}
bool visit(QueryTreeNodePtr & node, size_t num_atoms)
@@ -513,11 +514,11 @@ std::optional CNF::tryBuildCNF(const QueryTreeNodePtr & node, ContextPtr co
}
{
- PushNotVisitor visitor(context);
+ PushNotVisitor visitor;
visitor.visit(node_cloned, false);
}
- if (PushOrVisitor visitor(context, max_atoms);
+ if (PushOrVisitor visitor(max_atoms);
!visitor.visit(node_cloned, atom_count))
return std::nullopt;
@@ -542,7 +543,7 @@ CNF CNF::toCNF(const QueryTreeNodePtr & node, ContextPtr context, size_t max_gro
return *cnf;
}
-QueryTreeNodePtr CNF::toQueryTree(ContextPtr context) const
+QueryTreeNodePtr CNF::toQueryTree() const
{
if (statements.empty())
return nullptr;
@@ -550,9 +551,9 @@ QueryTreeNodePtr CNF::toQueryTree(ContextPtr context) const
QueryTreeNodes and_arguments;
and_arguments.reserve(statements.size());
- auto not_resolver = FunctionFactory::instance().get("not", context);
- auto or_resolver = FunctionFactory::instance().get("or", context);
- auto and_resolver = FunctionFactory::instance().get("and", context);
+ auto not_resolver = createInternalFunctionNotOverloadResolver();
+ auto or_resolver = createInternalFunctionOrOverloadResolver();
+ auto and_resolver = createInternalFunctionAndOverloadResolver();
const auto function_node_from_atom = [&](const auto & atom) -> QueryTreeNodePtr
{
diff --git a/src/Analyzer/Passes/CNF.h b/src/Analyzer/Passes/CNF.h
index ec639cd6679..9325d97d2f2 100644
--- a/src/Analyzer/Passes/CNF.h
+++ b/src/Analyzer/Passes/CNF.h
@@ -54,7 +54,7 @@ public:
static std::optional tryBuildCNF(const QueryTreeNodePtr & node, ContextPtr context, size_t max_growth_multiplier = DEFAULT_MAX_GROWTH_MULTIPLIER);
static CNF toCNF(const QueryTreeNodePtr & node, ContextPtr context, size_t max_growth_multiplier = DEFAULT_MAX_GROWTH_MULTIPLIER);
- QueryTreeNodePtr toQueryTree(ContextPtr context) const;
+ QueryTreeNodePtr toQueryTree() const;
const auto & getStatements() const
{
diff --git a/src/Analyzer/Passes/ConvertOrLikeChainPass.cpp b/src/Analyzer/Passes/ConvertOrLikeChainPass.cpp
index 905819bf49f..eb897ef8746 100644
--- a/src/Analyzer/Passes/ConvertOrLikeChainPass.cpp
+++ b/src/Analyzer/Passes/ConvertOrLikeChainPass.cpp
@@ -11,6 +11,8 @@
#include
#include
+#include
+#include
#include
@@ -134,8 +136,10 @@ private:
void ConvertOrLikeChainPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context)
{
- auto or_function_resolver = FunctionFactory::instance().get("or", context);
- auto match_function_resolver = FunctionFactory::instance().get("multiMatchAny", context);
+ const auto & settings = context->getSettingsRef();
+ auto match_function_resolver = createInternalMultiMatchAnyOverloadResolver(settings.allow_hyperscan, settings.max_hyperscan_regexp_length, settings.max_hyperscan_regexp_total_length, settings.reject_expensive_hyperscan_regexps);
+ auto or_function_resolver = createInternalFunctionOrOverloadResolver();
+
ConvertOrLikeChainVisitor visitor(std::move(or_function_resolver), std::move(match_function_resolver), std::move(context));
visitor.visit(query_tree_node);
}
diff --git a/src/Analyzer/Passes/ConvertQueryToCNFPass.cpp b/src/Analyzer/Passes/ConvertQueryToCNFPass.cpp
index 5ce1ea43f2f..96bc62212fd 100644
--- a/src/Analyzer/Passes/ConvertQueryToCNFPass.cpp
+++ b/src/Analyzer/Passes/ConvertQueryToCNFPass.cpp
@@ -339,7 +339,7 @@ void addIndexConstraint(Analyzer::CNF & cnf, const QueryTreeNodes & table_expres
{
Analyzer::CNF::OrGroup new_group;
auto index_hint_node = std::make_shared("indexHint");
- index_hint_node->getArguments().getNodes().push_back(Analyzer::CNF{std::move(and_group)}.toQueryTree(context));
+ index_hint_node->getArguments().getNodes().push_back(Analyzer::CNF{std::move(and_group)}.toQueryTree());
index_hint_node->resolveAsFunction(FunctionFactory::instance().get("indexHint", context));
new_group.insert({false, QueryTreeNodePtrWithHash{std::move(index_hint_node)}});
@@ -676,7 +676,7 @@ void optimizeNode(QueryTreeNodePtr & node, const QueryTreeNodes & table_expressi
if (settings.optimize_using_constraints)
optimizeWithConstraints(*cnf, table_expressions, context);
- auto new_node = cnf->toQueryTree(context);
+ auto new_node = cnf->toQueryTree();
node = std::move(new_node);
}
diff --git a/src/Analyzer/Passes/CrossToInnerJoinPass.cpp b/src/Analyzer/Passes/CrossToInnerJoinPass.cpp
index 154babf3d9a..d0a5656d334 100644
--- a/src/Analyzer/Passes/CrossToInnerJoinPass.cpp
+++ b/src/Analyzer/Passes/CrossToInnerJoinPass.cpp
@@ -12,6 +12,7 @@
#include
#include
+#include
#include
@@ -256,7 +257,7 @@ private:
for (const auto & node : nodes)
function_node->getArguments().getNodes().push_back(node);
- const auto & function = FunctionFactory::instance().get("and", getContext());
+ const auto & function = createInternalFunctionAndOverloadResolver();
function_node->resolveAsFunction(function->build(function_node->getArgumentColumns()));
return function_node;
}
diff --git a/src/Analyzer/Passes/IfChainToMultiIfPass.cpp b/src/Analyzer/Passes/IfChainToMultiIfPass.cpp
index 88e350ffa2e..70b717f3108 100644
--- a/src/Analyzer/Passes/IfChainToMultiIfPass.cpp
+++ b/src/Analyzer/Passes/IfChainToMultiIfPass.cpp
@@ -5,6 +5,7 @@
#include
#include
#include
+#include
namespace DB
{
@@ -75,7 +76,8 @@ private:
void IfChainToMultiIfPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context)
{
- auto multi_if_function_ptr = FunctionFactory::instance().get("multiIf", context);
+ const auto & settings = context->getSettingsRef();
+ auto multi_if_function_ptr = createInternalMultiIfOverloadResolver(settings.allow_execute_multiif_columnar, settings.allow_experimental_variant_type, settings.use_variant_as_common_type);
IfChainToMultiIfPassVisitor visitor(std::move(multi_if_function_ptr), std::move(context));
visitor.visit(query_tree_node);
}
diff --git a/src/Analyzer/Passes/MultiIfToIfPass.cpp b/src/Analyzer/Passes/MultiIfToIfPass.cpp
index 8e09d5cab38..c42ea61b34a 100644
--- a/src/Analyzer/Passes/MultiIfToIfPass.cpp
+++ b/src/Analyzer/Passes/MultiIfToIfPass.cpp
@@ -3,6 +3,7 @@
#include
#include
#include
+#include
namespace DB
{
@@ -54,7 +55,8 @@ private:
void MultiIfToIfPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context)
{
- auto if_function_ptr = FunctionFactory::instance().get("if", context);
+ const auto & settings = context->getSettingsRef();
+ auto if_function_ptr = createInternalFunctionIfOverloadResolver(settings.allow_experimental_variant_type, settings.use_variant_as_common_type);
MultiIfToIfVisitor visitor(std::move(if_function_ptr), std::move(context));
visitor.visit(query_tree_node);
}
diff --git a/src/Analyzer/Passes/QueryAnalysisPass.cpp b/src/Analyzer/Passes/QueryAnalysisPass.cpp
index 3766d1aa6b9..376701f777e 100644
--- a/src/Analyzer/Passes/QueryAnalysisPass.cpp
+++ b/src/Analyzer/Passes/QueryAnalysisPass.cpp
@@ -120,7 +120,6 @@ namespace ErrorCodes
extern const int NUMBER_OF_COLUMNS_DOESNT_MATCH;
extern const int FUNCTION_CANNOT_HAVE_PARAMETERS;
extern const int SYNTAX_ERROR;
- extern const int UNEXPECTED_EXPRESSION;
extern const int INVALID_IDENTIFIER;
}
@@ -1215,7 +1214,7 @@ private:
static void expandGroupByAll(QueryNode & query_tree_node_typed);
- void expandOrderByAll(QueryNode & query_tree_node_typed, const Settings & settings);
+ void expandOrderByAll(QueryNode & query_tree_node_typed);
static std::string
rewriteAggregateFunctionNameIfNeeded(const std::string & aggregate_function_name, NullsAction action, const ContextPtr & context);
@@ -2367,9 +2366,9 @@ void QueryAnalyzer::expandGroupByAll(QueryNode & query_tree_node_typed)
query_tree_node_typed.setIsGroupByAll(false);
}
-void QueryAnalyzer::expandOrderByAll(QueryNode & query_tree_node_typed, const Settings & settings)
+void QueryAnalyzer::expandOrderByAll(QueryNode & query_tree_node_typed)
{
- if (!settings.enable_order_by_all || !query_tree_node_typed.isOrderByAll())
+ if (!query_tree_node_typed.isOrderByAll())
return;
auto * all_node = query_tree_node_typed.getOrderBy().getNodes()[0]->as();
@@ -2390,9 +2389,6 @@ void QueryAnalyzer::expandOrderByAll(QueryNode & query_tree_node_typed, const Se
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Expression nodes list expected 1 projection names. Actual {}",
projection_names.size());
- if (Poco::toUpper(projection_names[0]) == "ALL")
- throw Exception(ErrorCodes::UNEXPECTED_EXPRESSION,
- "Cannot use ORDER BY ALL to sort a column with name 'all', please disable setting `enable_order_by_all` and try again");
}
auto sort_node = std::make_shared(node, all_node->getSortDirection(), all_node->getNullsSortDirection());
@@ -7559,7 +7555,7 @@ void QueryAnalyzer::resolveQuery(const QueryTreeNodePtr & query_node, Identifier
if (settings.enable_positional_arguments)
replaceNodesWithPositionalArguments(query_node_typed.getOrderByNode(), query_node_typed.getProjection().getNodes(), scope);
- expandOrderByAll(query_node_typed, settings);
+ expandOrderByAll(query_node_typed);
resolveSortNodeList(query_node_typed.getOrderByNode(), scope);
}
diff --git a/src/Analyzer/QueryNode.h b/src/Analyzer/QueryNode.h
index d8b8741afb2..1b389572e42 100644
--- a/src/Analyzer/QueryNode.h
+++ b/src/Analyzer/QueryNode.h
@@ -219,13 +219,13 @@ public:
is_group_by_all = is_group_by_all_value;
}
- /// Returns true, if query node has ORDER BY ALL modifier, false otherwise
+ /// Returns true, if query node has ORDER BY * modifier, false otherwise
bool isOrderByAll() const
{
return is_order_by_all;
}
- /// Set query node ORDER BY ALL modifier value
+ /// Set query node ORDER BY * modifier value
void setIsOrderByAll(bool is_order_by_all_value)
{
is_order_by_all = is_order_by_all_value;
diff --git a/src/Backups/BackupIO_AzureBlobStorage.cpp b/src/Backups/BackupIO_AzureBlobStorage.cpp
index 52ce20d5108..b3b92323109 100644
--- a/src/Backups/BackupIO_AzureBlobStorage.cpp
+++ b/src/Backups/BackupIO_AzureBlobStorage.cpp
@@ -2,7 +2,7 @@
#if USE_AZURE_BLOB_STORAGE
#include
-#include
+#include
#include
#include
#include
diff --git a/src/Backups/BackupIO_S3.cpp b/src/Backups/BackupIO_S3.cpp
index fa4c1af3698..9359602a651 100644
--- a/src/Backups/BackupIO_S3.cpp
+++ b/src/Backups/BackupIO_S3.cpp
@@ -2,7 +2,7 @@
#if USE_AWS_S3
#include
-#include
+#include
#include
#include
#include
@@ -127,7 +127,7 @@ BackupReaderS3::BackupReaderS3(
: BackupReaderDefault(read_settings_, write_settings_, getLogger("BackupReaderS3"))
, s3_uri(s3_uri_)
, data_source_description{DataSourceType::ObjectStorage, ObjectStorageType::S3, MetadataStorageType::None, s3_uri.endpoint, false, false}
- , s3_settings(context_->getStorageS3Settings().getSettings(s3_uri.uri.toString()))
+ , s3_settings(context_->getStorageS3Settings().getSettings(s3_uri.uri.toString(), context_->getUserName()))
{
auto & request_settings = s3_settings.request_settings;
request_settings.updateFromSettings(context_->getSettingsRef());
@@ -217,7 +217,7 @@ BackupWriterS3::BackupWriterS3(
: BackupWriterDefault(read_settings_, write_settings_, getLogger("BackupWriterS3"))
, s3_uri(s3_uri_)
, data_source_description{DataSourceType::ObjectStorage, ObjectStorageType::S3, MetadataStorageType::None, s3_uri.endpoint, false, false}
- , s3_settings(context_->getStorageS3Settings().getSettings(s3_uri.uri.toString()))
+ , s3_settings(context_->getStorageS3Settings().getSettings(s3_uri.uri.toString(), context_->getUserName()))
{
auto & request_settings = s3_settings.request_settings;
request_settings.updateFromSettings(context_->getSettingsRef());
diff --git a/src/Backups/BackupOperationInfo.h b/src/Backups/BackupOperationInfo.h
index e57b57d75f1..21b5284458c 100644
--- a/src/Backups/BackupOperationInfo.h
+++ b/src/Backups/BackupOperationInfo.h
@@ -20,6 +20,9 @@ struct BackupOperationInfo
/// Base Backup Operation name, a string like "Disk('backups', 'my_base_backup')"
String base_backup_name;
+ /// Query ID of a query that started backup
+ String query_id;
+
/// This operation is internal and should not be shown in system.backups
bool internal = false;
diff --git a/src/Backups/BackupsWorker.cpp b/src/Backups/BackupsWorker.cpp
index c19be22c749..5905d723800 100644
--- a/src/Backups/BackupsWorker.cpp
+++ b/src/Backups/BackupsWorker.cpp
@@ -440,7 +440,13 @@ OperationID BackupsWorker::startMakingBackup(const ASTPtr & query, const Context
try
{
- addInfo(backup_id, backup_name_for_logging, base_backup_name, backup_settings.internal, context->getProcessListElement(), BackupStatus::CREATING_BACKUP);
+ addInfo(backup_id,
+ backup_name_for_logging,
+ base_backup_name,
+ context->getCurrentQueryId(),
+ backup_settings.internal,
+ context->getProcessListElement(),
+ BackupStatus::CREATING_BACKUP);
/// Prepare context to use.
ContextPtr context_in_use = context;
@@ -823,7 +829,13 @@ OperationID BackupsWorker::startRestoring(const ASTPtr & query, ContextMutablePt
if (restore_settings.base_backup_info)
base_backup_name = restore_settings.base_backup_info->toStringForLogging();
- addInfo(restore_id, backup_name_for_logging, base_backup_name, restore_settings.internal, context->getProcessListElement(), BackupStatus::RESTORING);
+ addInfo(restore_id,
+ backup_name_for_logging,
+ base_backup_name,
+ context->getCurrentQueryId(),
+ restore_settings.internal,
+ context->getProcessListElement(),
+ BackupStatus::RESTORING);
/// Prepare context to use.
ContextMutablePtr context_in_use = context;
@@ -1108,13 +1120,15 @@ void BackupsWorker::restoreTablesData(const OperationID & restore_id, BackupPtr
}
-void BackupsWorker::addInfo(const OperationID & id, const String & name, const String & base_backup_name, bool internal, QueryStatusPtr process_list_element, BackupStatus status)
+void BackupsWorker::addInfo(const OperationID & id, const String & name, const String & base_backup_name, const String & query_id,
+ bool internal, QueryStatusPtr process_list_element, BackupStatus status)
{
ExtendedOperationInfo extended_info;
auto & info = extended_info.info;
info.id = id;
info.name = name;
info.base_backup_name = base_backup_name;
+ info.query_id = query_id;
info.internal = internal;
info.status = status;
info.start_time = std::chrono::system_clock::now();
@@ -1183,7 +1197,7 @@ void BackupsWorker::setStatus(const String & id, BackupStatus status, bool throw
if (isFailedOrCancelled(status))
{
- info.error_message = getCurrentExceptionMessage(false);
+ info.error_message = getCurrentExceptionMessage(true /*with_stacktrace*/);
info.exception = std::current_exception();
}
diff --git a/src/Backups/BackupsWorker.h b/src/Backups/BackupsWorker.h
index 73c8bf19473..ad187552c31 100644
--- a/src/Backups/BackupsWorker.h
+++ b/src/Backups/BackupsWorker.h
@@ -108,7 +108,8 @@ private:
/// Run data restoring tasks which insert data to tables.
void restoreTablesData(const BackupOperationID & restore_id, BackupPtr backup, DataRestoreTasks && tasks, ThreadPool & thread_pool, QueryStatusPtr process_list_element);
- void addInfo(const BackupOperationID & id, const String & name, const String & base_backup_name, bool internal, QueryStatusPtr process_list_element, BackupStatus status);
+ void addInfo(const BackupOperationID & id, const String & name, const String & base_backup_name, const String & query_id,
+ bool internal, QueryStatusPtr process_list_element, BackupStatus status);
void setStatus(const BackupOperationID & id, BackupStatus status, bool throw_if_error = true);
void setStatusSafe(const String & id, BackupStatus status) { setStatus(id, status, false); }
void setNumFilesAndSize(const BackupOperationID & id, size_t num_files, UInt64 total_size, size_t num_entries,
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 08913ed1b5a..dff70e06ce4 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -506,6 +506,10 @@ if (TARGET ch_contrib::s2)
dbms_target_link_libraries (PUBLIC ch_contrib::s2)
endif()
+if (TARGET ch_contrib::vectorscan)
+ dbms_target_link_libraries (PRIVATE ch_contrib::vectorscan)
+endif()
+
if (TARGET ch_contrib::brotli)
target_link_libraries (clickhouse_common_io PRIVATE ch_contrib::brotli)
endif()
diff --git a/src/Client/ConnectionEstablisher.h b/src/Client/ConnectionEstablisher.h
index a3a01e63246..1fa08d435e9 100644
--- a/src/Client/ConnectionEstablisher.h
+++ b/src/Client/ConnectionEstablisher.h
@@ -3,6 +3,7 @@
#include
#include
#include
+#include
#include
#include
#include
diff --git a/src/Client/ConnectionPool.h b/src/Client/ConnectionPool.h
index 8e707e8190f..574c4992d75 100644
--- a/src/Client/ConnectionPool.h
+++ b/src/Client/ConnectionPool.h
@@ -28,7 +28,10 @@ public:
using Entry = PoolBase::Entry;
IConnectionPool() = default;
- IConnectionPool(String host_, UInt16 port_) : host(host_), port(port_), address(host + ":" + toString(port_)) {}
+ IConnectionPool(String host_, UInt16 port_, Priority config_priority_)
+ : host(host_), port(port_), address(host + ":" + toString(port_)), config_priority(config_priority_)
+ {
+ }
virtual ~IConnectionPool() = default;
@@ -42,12 +45,13 @@ public:
const std::string & getHost() const { return host; }
UInt16 getPort() const { return port; }
const String & getAddress() const { return address; }
- virtual Priority getPriority() const { return Priority{1}; }
+ Priority getConfigPriority() const { return config_priority; }
protected:
const String host;
const UInt16 port = 0;
const String address;
+ const Priority config_priority;
};
using ConnectionPoolPtr = std::shared_ptr;
@@ -61,32 +65,31 @@ public:
using Entry = IConnectionPool::Entry;
using Base = PoolBase;
- ConnectionPool(unsigned max_connections_,
- const String & host_,
- UInt16 port_,
- const String & default_database_,
- const String & user_,
- const String & password_,
- const String & quota_key_,
- const String & cluster_,
- const String & cluster_secret_,
- const String & client_name_,
- Protocol::Compression compression_,
- Protocol::Secure secure_,
- Priority priority_ = Priority{1})
- : IConnectionPool(host_, port_),
- Base(max_connections_,
- getLogger("ConnectionPool (" + host_ + ":" + toString(port_) + ")")),
- default_database(default_database_),
- user(user_),
- password(password_),
- quota_key(quota_key_),
- cluster(cluster_),
- cluster_secret(cluster_secret_),
- client_name(client_name_),
- compression(compression_),
- secure(secure_),
- priority(priority_)
+ ConnectionPool(
+ unsigned max_connections_,
+ const String & host_,
+ UInt16 port_,
+ const String & default_database_,
+ const String & user_,
+ const String & password_,
+ const String & quota_key_,
+ const String & cluster_,
+ const String & cluster_secret_,
+ const String & client_name_,
+ Protocol::Compression compression_,
+ Protocol::Secure secure_,
+ Priority config_priority_ = Priority{1})
+ : IConnectionPool(host_, port_, config_priority_)
+ , Base(max_connections_, getLogger("ConnectionPool (" + host_ + ":" + toString(port_) + ")"))
+ , default_database(default_database_)
+ , user(user_)
+ , password(password_)
+ , quota_key(quota_key_)
+ , cluster(cluster_)
+ , cluster_secret(cluster_secret_)
+ , client_name(client_name_)
+ , compression(compression_)
+ , secure(secure_)
{
}
@@ -114,11 +117,6 @@ public:
return host + ":" + toString(port);
}
- Priority getPriority() const override
- {
- return priority;
- }
-
protected:
/** Creates a new object to put in the pool. */
ConnectionPtr allocObject() override
@@ -143,7 +141,6 @@ private:
String client_name;
Protocol::Compression compression; /// Whether to compress data when interacting with the server.
Protocol::Secure secure; /// Whether to encrypt data when interacting with the server.
- Priority priority; /// priority from
};
/**
diff --git a/src/Client/ConnectionPoolWithFailover.cpp b/src/Client/ConnectionPoolWithFailover.cpp
index fdc0a11e533..492fd4ae9e2 100644
--- a/src/Client/ConnectionPoolWithFailover.cpp
+++ b/src/Client/ConnectionPoolWithFailover.cpp
@@ -79,14 +79,6 @@ IConnectionPool::Entry ConnectionPoolWithFailover::get(const ConnectionTimeouts
return Base::get(max_ignored_errors, fallback_to_stale_replicas, try_get_entry, get_priority);
}
-Priority ConnectionPoolWithFailover::getPriority() const
-{
- return (*std::max_element(nested_pools.begin(), nested_pools.end(), [](const auto & a, const auto & b)
- {
- return a->getPriority() < b->getPriority();
- }))->getPriority();
-}
-
ConnectionPoolWithFailover::Status ConnectionPoolWithFailover::getStatus() const
{
const auto [states, pools, error_decrease_time] = getPoolExtendedStates();
@@ -253,13 +245,13 @@ ConnectionPoolWithFailover::tryGetEntry(
}
std::vector
-ConnectionPoolWithFailover::getShuffledPools(const Settings & settings, GetPriorityForLoadBalancing::Func priority_func)
+ConnectionPoolWithFailover::getShuffledPools(const Settings & settings, GetPriorityForLoadBalancing::Func priority_func, bool use_slowdown_count)
{
if (!priority_func)
priority_func = makeGetPriorityFunc(settings);
UInt64 max_ignored_errors = settings.distributed_replica_max_ignored_errors.value;
- return Base::getShuffledPools(max_ignored_errors, priority_func);
+ return Base::getShuffledPools(max_ignored_errors, priority_func, use_slowdown_count);
}
}
diff --git a/src/Client/ConnectionPoolWithFailover.h b/src/Client/ConnectionPoolWithFailover.h
index 7ccdd4787a4..edfcbe6e4df 100644
--- a/src/Client/ConnectionPoolWithFailover.h
+++ b/src/Client/ConnectionPoolWithFailover.h
@@ -49,8 +49,6 @@ public:
const Settings & settings,
bool force_connected) override; /// From IConnectionPool
- Priority getPriority() const override; /// From IConnectionPool
-
/** Allocates up to the specified number of connections to work.
* Connections provide access to different replicas of one shard.
*/
@@ -83,15 +81,15 @@ public:
struct NestedPoolStatus
{
const Base::NestedPoolPtr pool;
- size_t error_count;
- size_t slowdown_count;
+ size_t error_count = 0;
+ size_t slowdown_count = 0;
std::chrono::seconds estimated_recovery_time;
};
using Status = std::vector;
Status getStatus() const;
- std::vector getShuffledPools(const Settings & settings, GetPriorityFunc priority_func = {});
+ std::vector getShuffledPools(const Settings & settings, GetPriorityFunc priority_func = {}, bool use_slowdown_count = false);
size_t getMaxErrorCup() const { return Base::max_error_cap; }
diff --git a/src/Client/HedgedConnectionsFactory.cpp b/src/Client/HedgedConnectionsFactory.cpp
index 82bacece415..f5b074a0257 100644
--- a/src/Client/HedgedConnectionsFactory.cpp
+++ b/src/Client/HedgedConnectionsFactory.cpp
@@ -40,7 +40,8 @@ HedgedConnectionsFactory::HedgedConnectionsFactory(
, max_parallel_replicas(max_parallel_replicas_)
, skip_unavailable_shards(skip_unavailable_shards_)
{
- shuffled_pools = pool->getShuffledPools(settings_, priority_func);
+ shuffled_pools = pool->getShuffledPools(settings_, priority_func, /* use_slowdown_count */ true);
+
for (const auto & shuffled_pool : shuffled_pools)
replicas.emplace_back(
std::make_unique(shuffled_pool.pool, &timeouts, settings_, log, table_to_check.get()));
diff --git a/src/Client/MultiplexedConnections.cpp b/src/Client/MultiplexedConnections.cpp
index 8433c8afe9f..c7d7d0143c8 100644
--- a/src/Client/MultiplexedConnections.cpp
+++ b/src/Client/MultiplexedConnections.cpp
@@ -320,7 +320,7 @@ Packet MultiplexedConnections::receivePacketUnlocked(AsyncCallback async_callbac
ReplicaState & state = getReplicaForReading();
current_connection = state.connection;
if (current_connection == nullptr)
- throw Exception(ErrorCodes::NO_AVAILABLE_REPLICA, "No available replica");
+ throw Exception(ErrorCodes::NO_AVAILABLE_REPLICA, "Logical error: no available replica");
Packet packet;
try
diff --git a/src/Client/PacketReceiver.h b/src/Client/PacketReceiver.h
index 6b3da659290..deedf5cccdc 100644
--- a/src/Client/PacketReceiver.h
+++ b/src/Client/PacketReceiver.h
@@ -5,6 +5,7 @@
#include
#include
+#include
#include
#include
#include
diff --git a/src/Columns/ColumnArray.cpp b/src/Columns/ColumnArray.cpp
index b3376b35b2e..6f60ec0e642 100644
--- a/src/Columns/ColumnArray.cpp
+++ b/src/Columns/ColumnArray.cpp
@@ -810,7 +810,7 @@ ColumnPtr ColumnArray::filterTuple(const Filter & filt, ssize_t result_size_hint
size_t tuple_size = tuple.tupleSize();
if (tuple_size == 0)
- throw Exception(ErrorCodes::LOGICAL_ERROR, "Empty tuple");
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: empty tuple");
Columns temporary_arrays(tuple_size);
for (size_t i = 0; i < tuple_size; ++i)
@@ -1263,7 +1263,7 @@ ColumnPtr ColumnArray::replicateTuple(const Offsets & replicate_offsets) const
size_t tuple_size = tuple.tupleSize();
if (tuple_size == 0)
- throw Exception(ErrorCodes::LOGICAL_ERROR, "Empty tuple");
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: empty tuple");
Columns temporary_arrays(tuple_size);
for (size_t i = 0; i < tuple_size; ++i)
diff --git a/src/Columns/ColumnNullable.cpp b/src/Columns/ColumnNullable.cpp
index ddf5fc696fb..93638371b84 100644
--- a/src/Columns/ColumnNullable.cpp
+++ b/src/Columns/ColumnNullable.cpp
@@ -1,5 +1,7 @@
#include
#include
+#include
+#include
#include
#include
#include
@@ -26,6 +28,7 @@ namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
extern const int ILLEGAL_COLUMN;
+ extern const int SIZES_OF_NESTED_COLUMNS_ARE_INCONSISTENT;
extern const int NOT_IMPLEMENTED;
}
@@ -826,7 +829,8 @@ void ColumnNullable::applyNullMap(const ColumnNullable & other)
void ColumnNullable::checkConsistency() const
{
if (null_map->size() != getNestedColumn().size())
- throw Exception(ErrorCodes::LOGICAL_ERROR, "Sizes of nested column and null map of Nullable column are not equal");
+ throw Exception(ErrorCodes::SIZES_OF_NESTED_COLUMNS_ARE_INCONSISTENT,
+ "Logical error: Sizes of nested column and null map of Nullable column are not equal");
}
ColumnPtr ColumnNullable::createWithOffsets(const IColumn::Offsets & offsets, const ColumnConst & column_with_default_value, size_t total_rows, size_t shift) const
diff --git a/src/Columns/getLeastSuperColumn.cpp b/src/Columns/getLeastSuperColumn.cpp
index 4f4a5f2b9b8..6ec5ca7a9c1 100644
--- a/src/Columns/getLeastSuperColumn.cpp
+++ b/src/Columns/getLeastSuperColumn.cpp
@@ -21,7 +21,7 @@ static bool sameConstants(const IColumn & a, const IColumn & b)
ColumnWithTypeAndName getLeastSuperColumn(const std::vector & columns)
{
if (columns.empty())
- throw Exception(ErrorCodes::LOGICAL_ERROR, "No src columns for supercolumn");
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: no src columns for supercolumn");
ColumnWithTypeAndName result = *columns[0];
diff --git a/src/Common/CPUID.h b/src/Common/CPUID.h
index b47e7e808d7..d7a714ec5af 100644
--- a/src/Common/CPUID.h
+++ b/src/Common/CPUID.h
@@ -57,6 +57,249 @@ inline bool cpuid(UInt32 op, UInt32 * res) noexcept /// NOLINT
#endif
}
+union CPUInfo
+{
+ UInt32 info[4];
+
+ struct Registers
+ {
+ UInt32 eax;
+ UInt32 ebx;
+ UInt32 ecx;
+ UInt32 edx;
+ } registers;
+
+ inline explicit CPUInfo(UInt32 op) noexcept { cpuid(op, info); }
+
+ inline CPUInfo(UInt32 op, UInt32 sub_op) noexcept { cpuid(op, sub_op, info); }
+};
+
+inline bool haveRDTSCP() noexcept
+{
+ return (CPUInfo(0x80000001).registers.edx >> 27) & 1u;
+}
+
+inline bool haveSSE() noexcept
+{
+ return (CPUInfo(0x1).registers.edx >> 25) & 1u;
+}
+
+inline bool haveSSE2() noexcept
+{
+ return (CPUInfo(0x1).registers.edx >> 26) & 1u;
+}
+
+inline bool haveSSE3() noexcept
+{
+ return CPUInfo(0x1).registers.ecx & 1u;
+}
+
+inline bool havePCLMUL() noexcept
+{
+ return (CPUInfo(0x1).registers.ecx >> 1) & 1u;
+}
+
+inline bool haveSSSE3() noexcept
+{
+ return (CPUInfo(0x1).registers.ecx >> 9) & 1u;
+}
+
+inline bool haveSSE41() noexcept
+{
+ return (CPUInfo(0x1).registers.ecx >> 19) & 1u;
+}
+
+inline bool haveSSE42() noexcept
+{
+ return (CPUInfo(0x1).registers.ecx >> 20) & 1u;
+}
+
+inline bool haveF16C() noexcept
+{
+ return (CPUInfo(0x1).registers.ecx >> 29) & 1u;
+}
+
+inline bool havePOPCNT() noexcept
+{
+ return (CPUInfo(0x1).registers.ecx >> 23) & 1u;
+}
+
+inline bool haveAES() noexcept
+{
+ return (CPUInfo(0x1).registers.ecx >> 25) & 1u;
+}
+
+inline bool haveXSAVE() noexcept
+{
+ return (CPUInfo(0x1).registers.ecx >> 26) & 1u;
+}
+
+inline bool haveOSXSAVE() noexcept
+{
+ return (CPUInfo(0x1).registers.ecx >> 27) & 1u;
+}
+
+inline bool haveAVX() noexcept
+{
+#if defined(__x86_64__)
+ // http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
+ // https://bugs.chromium.org/p/chromium/issues/detail?id=375968
+ return haveOSXSAVE() // implies haveXSAVE()
+ && (our_xgetbv(0) & 6u) == 6u // XMM state and YMM state are enabled by OS
+ && ((CPUInfo(0x1).registers.ecx >> 28) & 1u); // AVX bit
+#else
+ return false;
+#endif
+}
+
+inline bool haveFMA() noexcept
+{
+ return haveAVX() && ((CPUInfo(0x1).registers.ecx >> 12) & 1u);
+}
+
+inline bool haveAVX2() noexcept
+{
+ return haveAVX() && ((CPUInfo(0x7, 0).registers.ebx >> 5) & 1u);
+}
+
+inline bool haveBMI1() noexcept
+{
+ return (CPUInfo(0x7, 0).registers.ebx >> 3) & 1u;
+}
+
+inline bool haveBMI2() noexcept
+{
+ return (CPUInfo(0x7, 0).registers.ebx >> 8) & 1u;
+}
+
+inline bool haveAVX512F() noexcept
+{
+#if defined(__x86_64__)
+ // https://software.intel.com/en-us/articles/how-to-detect-knl-instruction-support
+ return haveOSXSAVE() // implies haveXSAVE()
+ && (our_xgetbv(0) & 6u) == 6u // XMM state and YMM state are enabled by OS
+ && ((our_xgetbv(0) >> 5) & 7u) == 7u // ZMM state is enabled by OS
+ && CPUInfo(0x0).registers.eax >= 0x7 // leaf 7 is present
+ && ((CPUInfo(0x7, 0).registers.ebx >> 16) & 1u); // AVX512F bit
+#else
+ return false;
+#endif
+}
+
+inline bool haveAVX512DQ() noexcept
+{
+ return haveAVX512F() && ((CPUInfo(0x7, 0).registers.ebx >> 17) & 1u);
+}
+
+inline bool haveRDSEED() noexcept
+{
+ return CPUInfo(0x0).registers.eax >= 0x7 && ((CPUInfo(0x7, 0).registers.ebx >> 18) & 1u);
+}
+
+inline bool haveADX() noexcept
+{
+ return CPUInfo(0x0).registers.eax >= 0x7 && ((CPUInfo(0x7, 0).registers.ebx >> 19) & 1u);
+}
+
+inline bool haveAVX512IFMA() noexcept
+{
+ return haveAVX512F() && ((CPUInfo(0x7, 0).registers.ebx >> 21) & 1u);
+}
+
+inline bool havePCOMMIT() noexcept
+{
+ return CPUInfo(0x0).registers.eax >= 0x7 && ((CPUInfo(0x7, 0).registers.ebx >> 22) & 1u);
+}
+
+inline bool haveCLFLUSHOPT() noexcept
+{
+ return CPUInfo(0x0).registers.eax >= 0x7 && ((CPUInfo(0x7, 0).registers.ebx >> 23) & 1u);
+}
+
+inline bool haveCLWB() noexcept
+{
+ return CPUInfo(0x0).registers.eax >= 0x7 && ((CPUInfo(0x7, 0).registers.ebx >> 24) & 1u);
+}
+
+inline bool haveAVX512PF() noexcept
+{
+ return haveAVX512F() && ((CPUInfo(0x7, 0).registers.ebx >> 26) & 1u);
+}
+
+inline bool haveAVX512ER() noexcept
+{
+ return haveAVX512F() && ((CPUInfo(0x7, 0).registers.ebx >> 27) & 1u);
+}
+
+inline bool haveAVX512CD() noexcept
+{
+ return haveAVX512F() && ((CPUInfo(0x7, 0).registers.ebx >> 28) & 1u);
+}
+
+inline bool haveSHA() noexcept
+{
+ return CPUInfo(0x0).registers.eax >= 0x7 && ((CPUInfo(0x7, 0).registers.ebx >> 29) & 1u);
+}
+
+inline bool haveAVX512BW() noexcept
+{
+ return haveAVX512F() && ((CPUInfo(0x7, 0).registers.ebx >> 30) & 1u);
+}
+
+inline bool haveAVX512VL() noexcept
+{
+ return haveAVX512F() && ((CPUInfo(0x7, 0).registers.ebx >> 31) & 1u);
+}
+
+inline bool havePREFETCHWT1() noexcept
+{
+ return CPUInfo(0x0).registers.eax >= 0x7 && ((CPUInfo(0x7, 0).registers.ecx >> 0) & 1u);
+}
+
+inline bool haveAVX512VBMI() noexcept
+{
+ return haveAVX512F() && ((CPUInfo(0x7, 0).registers.ecx >> 1) & 1u);
+}
+
+inline bool haveAVX512VBMI2() noexcept
+{
+ return haveAVX512F() && ((CPUInfo(0x7, 0).registers.ecx >> 6) & 1u);
+}
+
+inline bool haveRDRAND() noexcept
+{
+ return CPUInfo(0x0).registers.eax >= 0x7 && ((CPUInfo(0x1).registers.ecx >> 30) & 1u);
+}
+
+inline bool haveAMX() noexcept
+{
+#if defined(__x86_64__)
+ // http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
+ return haveOSXSAVE() // implies haveXSAVE()
+ && ((our_xgetbv(0) >> 17) & 0x3) == 0x3; // AMX state are enabled by OS
+#else
+ return false;
+#endif
+}
+
+inline bool haveAMXBF16() noexcept
+{
+ return haveAMX()
+ && ((CPUInfo(0x7, 0).registers.edx >> 22) & 1u); // AMX-BF16 bit
+}
+
+inline bool haveAMXTILE() noexcept
+{
+ return haveAMX()
+ && ((CPUInfo(0x7, 0).registers.edx >> 24) & 1u); // AMX-TILE bit
+}
+
+inline bool haveAMXINT8() noexcept
+{
+ return haveAMX()
+ && ((CPUInfo(0x7, 0).registers.edx >> 25) & 1u); // AMX-INT8 bit
+}
+
#define CPU_ID_ENUMERATE(OP) \
OP(SSE) \
OP(SSE2) \
@@ -98,253 +341,6 @@ inline bool cpuid(UInt32 op, UInt32 * res) noexcept /// NOLINT
OP(AMXTILE) \
OP(AMXINT8)
-union CPUInfo
-{
- UInt32 info[4];
-
- struct Registers
- {
- UInt32 eax;
- UInt32 ebx;
- UInt32 ecx;
- UInt32 edx;
- } registers;
-
- inline explicit CPUInfo(UInt32 op) noexcept { cpuid(op, info); }
-
- inline CPUInfo(UInt32 op, UInt32 sub_op) noexcept { cpuid(op, sub_op, info); }
-};
-
-#define DEF_NAME(X) inline bool have##X() noexcept;
- CPU_ID_ENUMERATE(DEF_NAME)
-#undef DEF_NAME
-
-bool haveRDTSCP() noexcept
-{
- return (CPUInfo(0x80000001).registers.edx >> 27) & 1u;
-}
-
-bool haveSSE() noexcept
-{
- return (CPUInfo(0x1).registers.edx >> 25) & 1u;
-}
-
-bool haveSSE2() noexcept
-{
- return (CPUInfo(0x1).registers.edx >> 26) & 1u;
-}
-
-bool haveSSE3() noexcept
-{
- return CPUInfo(0x1).registers.ecx & 1u;
-}
-
-bool havePCLMUL() noexcept
-{
- return (CPUInfo(0x1).registers.ecx >> 1) & 1u;
-}
-
-bool haveSSSE3() noexcept
-{
- return (CPUInfo(0x1).registers.ecx >> 9) & 1u;
-}
-
-bool haveSSE41() noexcept
-{
- return (CPUInfo(0x1).registers.ecx >> 19) & 1u;
-}
-
-bool haveSSE42() noexcept
-{
- return (CPUInfo(0x1).registers.ecx >> 20) & 1u;
-}
-
-bool haveF16C() noexcept
-{
- return (CPUInfo(0x1).registers.ecx >> 29) & 1u;
-}
-
-bool havePOPCNT() noexcept
-{
- return (CPUInfo(0x1).registers.ecx >> 23) & 1u;
-}
-
-bool haveAES() noexcept
-{
- return (CPUInfo(0x1).registers.ecx >> 25) & 1u;
-}
-
-bool haveXSAVE() noexcept
-{
- return (CPUInfo(0x1).registers.ecx >> 26) & 1u;
-}
-
-bool haveOSXSAVE() noexcept
-{
- return (CPUInfo(0x1).registers.ecx >> 27) & 1u;
-}
-
-bool haveAVX() noexcept
-{
-#if defined(__x86_64__)
- // http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
- // https://bugs.chromium.org/p/chromium/issues/detail?id=375968
- return haveOSXSAVE() // implies haveXSAVE()
- && (our_xgetbv(0) & 6u) == 6u // XMM state and YMM state are enabled by OS
- && ((CPUInfo(0x1).registers.ecx >> 28) & 1u); // AVX bit
-#else
- return false;
-#endif
-}
-
-bool haveFMA() noexcept
-{
- return haveAVX() && ((CPUInfo(0x1).registers.ecx >> 12) & 1u);
-}
-
-bool haveAVX2() noexcept
-{
- return haveAVX() && ((CPUInfo(0x7, 0).registers.ebx >> 5) & 1u);
-}
-
-bool haveBMI1() noexcept
-{
- return (CPUInfo(0x7, 0).registers.ebx >> 3) & 1u;
-}
-
-bool haveBMI2() noexcept
-{
- return (CPUInfo(0x7, 0).registers.ebx >> 8) & 1u;
-}
-
-bool haveAVX512F() noexcept
-{
-#if defined(__x86_64__)
- // https://software.intel.com/en-us/articles/how-to-detect-knl-instruction-support
- return haveOSXSAVE() // implies haveXSAVE()
- && (our_xgetbv(0) & 6u) == 6u // XMM state and YMM state are enabled by OS
- && ((our_xgetbv(0) >> 5) & 7u) == 7u // ZMM state is enabled by OS
- && CPUInfo(0x0).registers.eax >= 0x7 // leaf 7 is present
- && ((CPUInfo(0x7, 0).registers.ebx >> 16) & 1u); // AVX512F bit
-#else
- return false;
-#endif
-}
-
-bool haveAVX512DQ() noexcept
-{
- return haveAVX512F() && ((CPUInfo(0x7, 0).registers.ebx >> 17) & 1u);
-}
-
-bool haveRDSEED() noexcept
-{
- return CPUInfo(0x0).registers.eax >= 0x7 && ((CPUInfo(0x7, 0).registers.ebx >> 18) & 1u);
-}
-
-bool haveADX() noexcept
-{
- return CPUInfo(0x0).registers.eax >= 0x7 && ((CPUInfo(0x7, 0).registers.ebx >> 19) & 1u);
-}
-
-bool haveAVX512IFMA() noexcept
-{
- return haveAVX512F() && ((CPUInfo(0x7, 0).registers.ebx >> 21) & 1u);
-}
-
-bool havePCOMMIT() noexcept
-{
- return CPUInfo(0x0).registers.eax >= 0x7 && ((CPUInfo(0x7, 0).registers.ebx >> 22) & 1u);
-}
-
-bool haveCLFLUSHOPT() noexcept
-{
- return CPUInfo(0x0).registers.eax >= 0x7 && ((CPUInfo(0x7, 0).registers.ebx >> 23) & 1u);
-}
-
-bool haveCLWB() noexcept
-{
- return CPUInfo(0x0).registers.eax >= 0x7 && ((CPUInfo(0x7, 0).registers.ebx >> 24) & 1u);
-}
-
-bool haveAVX512PF() noexcept
-{
- return haveAVX512F() && ((CPUInfo(0x7, 0).registers.ebx >> 26) & 1u);
-}
-
-bool haveAVX512ER() noexcept
-{
- return haveAVX512F() && ((CPUInfo(0x7, 0).registers.ebx >> 27) & 1u);
-}
-
-bool haveAVX512CD() noexcept
-{
- return haveAVX512F() && ((CPUInfo(0x7, 0).registers.ebx >> 28) & 1u);
-}
-
-bool haveSHA() noexcept
-{
- return CPUInfo(0x0).registers.eax >= 0x7 && ((CPUInfo(0x7, 0).registers.ebx >> 29) & 1u);
-}
-
-bool haveAVX512BW() noexcept
-{
- return haveAVX512F() && ((CPUInfo(0x7, 0).registers.ebx >> 30) & 1u);
-}
-
-bool haveAVX512VL() noexcept
-{
- return haveAVX512F() && ((CPUInfo(0x7, 0).registers.ebx >> 31) & 1u);
-}
-
-bool havePREFETCHWT1() noexcept
-{
- return CPUInfo(0x0).registers.eax >= 0x7 && ((CPUInfo(0x7, 0).registers.ecx >> 0) & 1u);
-}
-
-bool haveAVX512VBMI() noexcept
-{
- return haveAVX512F() && ((CPUInfo(0x7, 0).registers.ecx >> 1) & 1u);
-}
-
-bool haveAVX512VBMI2() noexcept
-{
- return haveAVX512F() && ((CPUInfo(0x7, 0).registers.ecx >> 6) & 1u);
-}
-
-bool haveRDRAND() noexcept
-{
- return CPUInfo(0x0).registers.eax >= 0x7 && ((CPUInfo(0x1).registers.ecx >> 30) & 1u);
-}
-
-inline bool haveAMX() noexcept
-{
-#if defined(__x86_64__)
- // http://www.intel.com/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
- return haveOSXSAVE() // implies haveXSAVE()
- && ((our_xgetbv(0) >> 17) & 0x3) == 0x3; // AMX state are enabled by OS
-#else
- return false;
-#endif
-}
-
-bool haveAMXBF16() noexcept
-{
- return haveAMX()
- && ((CPUInfo(0x7, 0).registers.edx >> 22) & 1u); // AMX-BF16 bit
-}
-
-bool haveAMXTILE() noexcept
-{
- return haveAMX()
- && ((CPUInfo(0x7, 0).registers.edx >> 24) & 1u); // AMX-TILE bit
-}
-
-bool haveAMXINT8() noexcept
-{
- return haveAMX()
- && ((CPUInfo(0x7, 0).registers.edx >> 25) & 1u); // AMX-INT8 bit
-}
-
struct CPUFlagsCache
{
#define DEF_NAME(X) static inline bool have_##X = have##X();
diff --git a/src/Common/ConcurrencyControl.cpp b/src/Common/ConcurrencyControl.cpp
index c9fe51550dc..0893cfce955 100644
--- a/src/Common/ConcurrencyControl.cpp
+++ b/src/Common/ConcurrencyControl.cpp
@@ -12,10 +12,10 @@ namespace ErrorCodes
ConcurrencyControl::Slot::~Slot()
{
- allocation->release();
+ static_cast(*allocation).release();
}
-ConcurrencyControl::Slot::Slot(AllocationPtr && allocation_)
+ConcurrencyControl::Slot::Slot(SlotAllocationPtr && allocation_)
: allocation(std::move(allocation_))
{
}
@@ -27,7 +27,7 @@ ConcurrencyControl::Allocation::~Allocation()
parent.free(this);
}
-[[nodiscard]] ConcurrencyControl::SlotPtr ConcurrencyControl::Allocation::tryAcquire()
+[[nodiscard]] AcquiredSlotPtr ConcurrencyControl::Allocation::tryAcquire()
{
SlotCount value = granted.load();
while (value)
@@ -35,15 +35,21 @@ ConcurrencyControl::Allocation::~Allocation()
if (granted.compare_exchange_strong(value, value - 1))
{
std::unique_lock lock{mutex};
- return SlotPtr(new Slot(shared_from_this())); // can't use std::make_shared due to private ctor
+ return AcquiredSlotPtr(new Slot(shared_from_this())); // can't use std::make_shared due to private ctor
}
}
return {}; // avoid unnecessary locking
}
-ConcurrencyControl::SlotCount ConcurrencyControl::Allocation::grantedCount() const
+SlotCount ConcurrencyControl::Allocation::grantedCount() const
{
- return granted;
+ return granted.load();
+}
+
+SlotCount ConcurrencyControl::Allocation::allocatedCount() const
+{
+ std::unique_lock lock{mutex};
+ return allocated;
}
ConcurrencyControl::Allocation::Allocation(ConcurrencyControl & parent_, SlotCount limit_, SlotCount granted_, Waiters::iterator waiter_)
@@ -87,7 +93,7 @@ ConcurrencyControl::~ConcurrencyControl()
abort();
}
-[[nodiscard]] ConcurrencyControl::AllocationPtr ConcurrencyControl::allocate(SlotCount min, SlotCount max)
+[[nodiscard]] SlotAllocationPtr ConcurrencyControl::allocate(SlotCount min, SlotCount max)
{
if (min > max)
throw Exception(ErrorCodes::LOGICAL_ERROR, "ConcurrencyControl: invalid allocation requirements");
@@ -100,13 +106,13 @@ ConcurrencyControl::~ConcurrencyControl()
// Create allocation and start waiting if more slots are required
if (granted < max)
- return AllocationPtr(new Allocation(*this, max, granted,
+ return SlotAllocationPtr(new Allocation(*this, max, granted,
waiters.insert(cur_waiter, nullptr /* pointer is set by Allocation ctor */)));
else
- return AllocationPtr(new Allocation(*this, max, granted));
+ return SlotAllocationPtr(new Allocation(*this, max, granted));
}
-void ConcurrencyControl::setMaxConcurrency(ConcurrencyControl::SlotCount value)
+void ConcurrencyControl::setMaxConcurrency(SlotCount value)
{
std::unique_lock lock{mutex};
max_concurrency = std::max(1, value); // never allow max_concurrency to be zero
@@ -162,7 +168,7 @@ void ConcurrencyControl::schedule(std::unique_lock &)
}
}
-ConcurrencyControl::SlotCount ConcurrencyControl::available(std::unique_lock &) const
+SlotCount ConcurrencyControl::available(std::unique_lock &) const
{
if (cur_concurrency < max_concurrency)
return max_concurrency - cur_concurrency;
diff --git a/src/Common/ConcurrencyControl.h b/src/Common/ConcurrencyControl.h
index 7e20384aa2a..ba94502962c 100644
--- a/src/Common/ConcurrencyControl.h
+++ b/src/Common/ConcurrencyControl.h
@@ -7,6 +7,7 @@
#include
#include
+#include
namespace DB
{
@@ -34,41 +35,35 @@ namespace DB
* Oversubscription is possible: total amount of allocated slots can exceed `setMaxConcurrency(limit)`
* because `min` amount of slots is allocated for each query unconditionally.
*/
-class ConcurrencyControl : boost::noncopyable
+class ConcurrencyControl : public ISlotControl
{
public:
struct Allocation;
- using AllocationPtr = std::shared_ptr;
- using SlotCount = UInt64;
using Waiters = std::list;
- static constexpr SlotCount Unlimited = std::numeric_limits::max();
-
// Scoped guard for acquired slot, see Allocation::tryAcquire()
- struct Slot : boost::noncopyable
+ struct Slot : public IAcquiredSlot
{
- ~Slot();
+ ~Slot() override;
private:
friend struct Allocation; // for ctor
- explicit Slot(AllocationPtr && allocation_);
+ explicit Slot(SlotAllocationPtr && allocation_);
- AllocationPtr allocation;
+ SlotAllocationPtr allocation;
};
- // FIXME: have to be unique_ptr, but ThreadFromGlobalPool does not support move semantics yet
- using SlotPtr = std::shared_ptr;
-
// Manages group of slots for a single query, see ConcurrencyControl::allocate(min, max)
- struct Allocation : std::enable_shared_from_this, boost::noncopyable
+ struct Allocation : public ISlotAllocation
{
- ~Allocation();
+ ~Allocation() override;
// Take one already granted slot if available. Lock-free iff there is no granted slot.
- [[nodiscard]] SlotPtr tryAcquire();
+ [[nodiscard]] AcquiredSlotPtr tryAcquire() override;
- SlotCount grantedCount() const;
+ SlotCount grantedCount() const override;
+ SlotCount allocatedCount() const override;
private:
friend struct Slot; // for release()
@@ -94,7 +89,7 @@ public:
ConcurrencyControl & parent;
const SlotCount limit;
- std::mutex mutex; // the following values must be accessed under this mutex
+ mutable std::mutex mutex; // the following values must be accessed under this mutex
SlotCount allocated; // allocated total (including already `released`)
SlotCount released = 0;
@@ -103,17 +98,16 @@ public:
const Waiters::iterator waiter; // iterator to itself in Waiters list; valid iff allocated < limit
};
-public:
ConcurrencyControl();
// WARNING: all Allocation objects MUST be destructed before ConcurrencyControl
// NOTE: Recommended way to achieve this is to use `instance()` and do graceful shutdown of queries
- ~ConcurrencyControl();
+ ~ConcurrencyControl() override;
// Allocate at least `min` and at most `max` slots.
// If not all `max` slots were successfully allocated, a subscription for later allocation is created
// Use `Allocation::tryAcquire()` to acquire allocated slot, before running a thread.
- [[nodiscard]] AllocationPtr allocate(SlotCount min, SlotCount max);
+ [[nodiscard]] SlotAllocationPtr allocate(SlotCount min, SlotCount max) override;
void setMaxConcurrency(SlotCount value);
@@ -134,7 +128,7 @@ private:
std::mutex mutex;
Waiters waiters;
Waiters::iterator cur_waiter; // round-robin pointer
- SlotCount max_concurrency = Unlimited;
+ SlotCount max_concurrency = UnlimitedSlots;
SlotCount cur_concurrency = 0;
};
diff --git a/src/Common/CurrentMetrics.cpp b/src/Common/CurrentMetrics.cpp
index c6fbafa8dc3..6931001202d 100644
--- a/src/Common/CurrentMetrics.cpp
+++ b/src/Common/CurrentMetrics.cpp
@@ -262,6 +262,9 @@
M(ActiveTimersInQueryProfiler, "Number of Active thread local timers in QueryProfiler") \
M(RefreshableViews, "Number materialized views with periodic refreshing (REFRESH)") \
M(RefreshingViews, "Number of materialized views currently executing a refresh") \
+ M(StorageBufferFlushThreads, "Number of threads for background flushes in StorageBuffer") \
+ M(StorageBufferFlushThreadsActive, "Number of threads for background flushes in StorageBuffer running a task") \
+ M(StorageBufferFlushThreadsScheduled, "Number of queued or active threads for background flushes in StorageBuffer")
#ifdef APPLY_FOR_EXTERNAL_METRICS
#define APPLY_FOR_METRICS(M) APPLY_FOR_BUILTIN_METRICS(M) APPLY_FOR_EXTERNAL_METRICS(M)
diff --git a/src/Common/Fiber.h b/src/Common/Fiber.h
index 8b88bd323ef..f48ace149f4 100644
--- a/src/Common/Fiber.h
+++ b/src/Common/Fiber.h
@@ -17,7 +17,7 @@ private:
template friend class FiberLocal;
public:
- template
+ template< typename StackAlloc, typename Fn>
Fiber(StackAlloc && salloc, Fn && fn) : impl(std::allocator_arg_t(), std::forward(salloc), RoutineImpl(std::forward(fn)))
{
}
@@ -46,12 +46,6 @@ public:
current_fiber = parent_fiber;
}
- static FiberPtr & getCurrentFiber()
- {
- thread_local static FiberPtr current_fiber;
- return current_fiber;
- }
-
private:
template
struct RoutineImpl
@@ -80,6 +74,12 @@ private:
Fn fn;
};
+ static FiberPtr & getCurrentFiber()
+ {
+ thread_local static FiberPtr current_fiber;
+ return current_fiber;
+ }
+
/// Special wrapper to store data in uniquer_ptr.
struct DataWrapper
{
@@ -146,3 +146,4 @@ private:
T main_instance;
};
+
diff --git a/src/Common/ISlotControl.h b/src/Common/ISlotControl.h
new file mode 100644
index 00000000000..daeb956f5a8
--- /dev/null
+++ b/src/Common/ISlotControl.h
@@ -0,0 +1,76 @@
+#pragma once
+
+#include
+#include
+#include
+#include
+
+
+namespace DB
+{
+
+// Interfaces for abstract "slot" allocation and control.
+// Slot is a virtual entity existing in a limited amount (CPUs or memory chunks, etc).
+//
+// Every slot can be in one of the following states:
+// * free: slot is available to be allocated.
+// * allocated: slot is allocated to a specific ISlotAllocation.
+//
+// Allocated slots can be in one of the following states:
+// * granted: allocated, but not yet acquired.
+// * acquired: a granted slot becomes acquired by using IAcquiredSlot.
+//
+// Example for CPU (see ConcurrencyControl.h). Every slot represents one CPU in the system.
+// Slot allocation is a request to allocate specific number of CPUs for a specific query.
+// Acquired slot is an entity that is held by a thread as long as it is running. This allows
+// total number of threads in the system to be limited and the distribution process to be controlled.
+//
+// TODO:
+// - for preemption - ability to return granted slot back and reacquire it later.
+// - for memory allocations - variable size of slots (in bytes).
+
+/// Number of slots
+using SlotCount = UInt64;
+
+/// Unlimited number of slots
+constexpr SlotCount UnlimitedSlots = std::numeric_limits::max();
+
+/// Acquired slot holder. Slot is considered to be acquired as long as the object exists.
+class IAcquiredSlot : public std::enable_shared_from_this, boost::noncopyable
+{
+public:
+ virtual ~IAcquiredSlot() = default;
+};
+
+using AcquiredSlotPtr = std::shared_ptr;
+
+/// Request for allocation of slots from ISlotControl.
+/// Allows for more slots to be acquired and the whole request to be canceled.
+class ISlotAllocation : public std::enable_shared_from_this, boost::noncopyable
+{
+public:
+ virtual ~ISlotAllocation() = default;
+
+ /// Take one already granted slot if available.
+ [[nodiscard]] virtual AcquiredSlotPtr tryAcquire() = 0;
+
+ /// Returns the number of granted slots for given allocation (i.e. available to be acquired)
+ virtual SlotCount grantedCount() const = 0;
+
+ /// Returns the total number of slots allocated at the moment (acquired and granted)
+ virtual SlotCount allocatedCount() const = 0;
+};
+
+using SlotAllocationPtr = std::shared_ptr;
+
+class ISlotControl : boost::noncopyable
+{
+public:
+ virtual ~ISlotControl() = default;
+
+ // Allocate at least `min` and at most `max` slots.
+ // If not all `max` slots were successfully allocated, a "subscription" for later allocation is created
+ [[nodiscard]] virtual SlotAllocationPtr allocate(SlotCount min, SlotCount max) = 0;
+};
+
+}
diff --git a/src/Common/PoolWithFailoverBase.h b/src/Common/PoolWithFailoverBase.h
index 8fd83300eff..cf270c9dad0 100644
--- a/src/Common/PoolWithFailoverBase.h
+++ b/src/Common/PoolWithFailoverBase.h
@@ -66,7 +66,7 @@ public:
, log(log_)
{
for (size_t i = 0;i < nested_pools.size(); ++i)
- shared_pool_states[i].config_priority = nested_pools[i]->getPriority();
+ shared_pool_states[i].config_priority = nested_pools[i]->getConfigPriority();
}
struct TryResult
@@ -133,7 +133,7 @@ protected:
void updateErrorCounts(PoolStates & states, time_t & last_decrease_time) const;
- std::vector getShuffledPools(size_t max_ignored_errors, const GetPriorityFunc & get_priority);
+ std::vector getShuffledPools(size_t max_ignored_errors, const GetPriorityFunc & get_priority, bool use_slowdown_count = false);
inline void updateSharedErrorCounts(std::vector & shuffled_pools);
@@ -160,7 +160,7 @@ protected:
template
std::vector::ShuffledPool>
PoolWithFailoverBase::getShuffledPools(
- size_t max_ignored_errors, const PoolWithFailoverBase::GetPriorityFunc & get_priority)
+ size_t max_ignored_errors, const PoolWithFailoverBase::GetPriorityFunc & get_priority, bool use_slowdown_count)
{
/// Update random numbers and error counts.
PoolStates pool_states = updatePoolStates(max_ignored_errors);
@@ -175,13 +175,13 @@ PoolWithFailoverBase::getShuffledPools(
std::vector shuffled_pools;
shuffled_pools.reserve(nested_pools.size());
for (size_t i = 0; i < nested_pools.size(); ++i)
- shuffled_pools.push_back(ShuffledPool{nested_pools[i], &pool_states[i], i, /* error_count = */ 0, /* slowdown_count = */ 0});
+ shuffled_pools.emplace_back(ShuffledPool{.pool = nested_pools[i], .state = &pool_states[i], .index = i});
::sort(
shuffled_pools.begin(), shuffled_pools.end(),
- [](const ShuffledPool & lhs, const ShuffledPool & rhs)
+ [use_slowdown_count](const ShuffledPool & lhs, const ShuffledPool & rhs)
{
- return PoolState::compare(*lhs.state, *rhs.state);
+ return PoolState::compare(*lhs.state, *rhs.state, use_slowdown_count);
});
return shuffled_pools;
@@ -344,10 +344,14 @@ struct PoolWithFailoverBase::PoolState
random = rng();
}
- static bool compare(const PoolState & lhs, const PoolState & rhs)
+ static bool compare(const PoolState & lhs, const PoolState & rhs, bool use_slowdown_count)
{
- return std::forward_as_tuple(lhs.error_count, lhs.slowdown_count, lhs.config_priority, lhs.priority, lhs.random)
- < std::forward_as_tuple(rhs.error_count, rhs.slowdown_count, rhs.config_priority, rhs.priority, rhs.random);
+ if (use_slowdown_count)
+ return std::forward_as_tuple(lhs.error_count, lhs.slowdown_count, lhs.config_priority, lhs.priority, lhs.random)
+ < std::forward_as_tuple(rhs.error_count, rhs.slowdown_count, rhs.config_priority, rhs.priority, rhs.random);
+ else
+ return std::forward_as_tuple(lhs.error_count, lhs.config_priority, lhs.priority, lhs.random)
+ < std::forward_as_tuple(rhs.error_count, rhs.config_priority, rhs.priority, rhs.random);
}
private:
diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp
index f14223ec644..d8ca1ab9e93 100644
--- a/src/Common/ProfileEvents.cpp
+++ b/src/Common/ProfileEvents.cpp
@@ -632,6 +632,12 @@ The server successfully detected this situation and will download merged part fr
M(InterfacePostgreSQLReceiveBytes, "Number of bytes received through PostgreSQL interfaces") \
\
M(ParallelReplicasUsedCount, "Number of replicas used to execute a query with task-based parallel replicas") \
+ \
+ M(KeeperLogsEntryReadFromLatestCache, "Number of log entries in Keeper being read from latest logs cache") \
+ M(KeeperLogsEntryReadFromCommitCache, "Number of log entries in Keeper being read from commit logs cache") \
+ M(KeeperLogsEntryReadFromFile, "Number of log entries in Keeper being read directly from the changelog file") \
+ M(KeeperLogsPrefetchedEntries, "Number of log entries in Keeper being prefetched from the changelog file") \
+ \
M(ParallelReplicasAvailableCount, "Number of replicas available to execute a query with task-based parallel replicas") \
M(ParallelReplicasUnavailableCount, "Number of replicas which was chosen, but found to be unavailable during query execution with task-based parallel replicas") \
diff --git a/src/Common/SensitiveDataMasker.cpp b/src/Common/SensitiveDataMasker.cpp
index 28eae6f451d..70346919f65 100644
--- a/src/Common/SensitiveDataMasker.cpp
+++ b/src/Common/SensitiveDataMasker.cpp
@@ -91,7 +91,7 @@ void SensitiveDataMasker::setInstance(std::unique_ptr&& sen
{
if (!sensitive_data_masker_)
- throw Exception(ErrorCodes::LOGICAL_ERROR, "The 'sensitive_data_masker' is not set");
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: the 'sensitive_data_masker' is not set");
if (sensitive_data_masker_->rulesCount() > 0)
{
diff --git a/src/Common/SipHash.h b/src/Common/SipHash.h
index 729fb76a573..5f27fdaa4b6 100644
--- a/src/Common/SipHash.h
+++ b/src/Common/SipHash.h
@@ -209,7 +209,7 @@ public:
{
if (!is_reference_128)
throw DB::Exception(
- DB::ErrorCodes::LOGICAL_ERROR, "Can't call get128Reference when is_reference_128 is not set");
+ DB::ErrorCodes::LOGICAL_ERROR, "Logical error: can't call get128Reference when is_reference_128 is not set");
finalize();
const auto lo = v0 ^ v1 ^ v2 ^ v3;
v1 ^= 0xdd;
diff --git a/src/Common/StackTrace.cpp b/src/Common/StackTrace.cpp
index 436b85ff30b..7e683ae91de 100644
--- a/src/Common/StackTrace.cpp
+++ b/src/Common/StackTrace.cpp
@@ -448,6 +448,9 @@ toStringEveryLineImpl([[maybe_unused]] bool fatal, const StackTraceRefTriple & s
DB::writePointerHex(frame.physical_addr, out);
}
+ if (frame.object.has_value())
+ out << " in " << *frame.object;
+
callback(out.str());
};
#else
diff --git a/src/Common/checkStackSize.cpp b/src/Common/checkStackSize.cpp
index c88554ca8fe..8c2a0aaed7f 100644
--- a/src/Common/checkStackSize.cpp
+++ b/src/Common/checkStackSize.cpp
@@ -1,8 +1,8 @@
-#include
-#include /// THREAD_SANITIZER
#include
#include
-#include
+#include
+#include
+#include /// THREAD_SANITIZER
#include
#include
#include
@@ -114,10 +114,6 @@ __attribute__((__weak__)) void checkStackSize()
{
using namespace DB;
- /// Not implemented for coroutines.
- if (Fiber::getCurrentFiber())
- return;
-
if (!stack_address)
max_stack_size = getStackSize(&stack_address);
@@ -140,7 +136,7 @@ __attribute__((__weak__)) void checkStackSize()
/// We assume that stack grows towards lower addresses. And that it starts to grow from the end of a chunk of memory of max_stack_size.
if (int_frame_address > int_stack_address + max_stack_size)
- throw Exception(ErrorCodes::LOGICAL_ERROR, "Frame address is greater than stack begin address");
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: frame address is greater than stack begin address");
size_t stack_size = int_stack_address + max_stack_size - int_frame_address;
size_t max_stack_size_allowed = static_cast(max_stack_size * STACK_SIZE_FREE_RATIO);
diff --git a/src/Common/tests/gtest_async_loader.cpp b/src/Common/tests/gtest_async_loader.cpp
index 950c7bbab76..fc2537abcfc 100644
--- a/src/Common/tests/gtest_async_loader.cpp
+++ b/src/Common/tests/gtest_async_loader.cpp
@@ -427,9 +427,7 @@ TEST(AsyncLoader, CancelExecutingTask)
}
}
-// This test is disabled due to `MemorySanitizer: use-of-uninitialized-value` issue in `collectSymbolsFromProgramHeaders` function
-// More details: https://github.com/ClickHouse/ClickHouse/pull/48923#issuecomment-1545415482
-TEST(AsyncLoader, DISABLED_JobFailure)
+TEST(AsyncLoader, JobFailure)
{
AsyncLoaderTest t;
t.loader.start();
diff --git a/src/Common/tests/gtest_concurrency_control.cpp b/src/Common/tests/gtest_concurrency_control.cpp
index 8e5b89a72a0..5e579317ade 100644
--- a/src/Common/tests/gtest_concurrency_control.cpp
+++ b/src/Common/tests/gtest_concurrency_control.cpp
@@ -15,7 +15,7 @@ struct ConcurrencyControlTest
{
ConcurrencyControl cc;
- explicit ConcurrencyControlTest(ConcurrencyControl::SlotCount limit = ConcurrencyControl::Unlimited)
+ explicit ConcurrencyControlTest(SlotCount limit = UnlimitedSlots)
{
cc.setMaxConcurrency(limit);
}
@@ -25,7 +25,7 @@ TEST(ConcurrencyControl, Unlimited)
{
ConcurrencyControlTest t; // unlimited number of slots
auto slots = t.cc.allocate(0, 100500);
- std::vector acquired;
+ std::vector acquired;
while (auto slot = slots->tryAcquire())
acquired.emplace_back(std::move(slot));
ASSERT_TRUE(acquired.size() == 100500);
@@ -34,14 +34,14 @@ TEST(ConcurrencyControl, Unlimited)
TEST(ConcurrencyControl, Fifo)
{
ConcurrencyControlTest t(1); // use single slot
- std::vector allocations;
+ std::vector allocations;
constexpr int count = 42;
allocations.reserve(count);
for (int i = 0; i < count; i++)
allocations.emplace_back(t.cc.allocate(0, 1));
for (int i = 0; i < count; i++)
{
- ConcurrencyControl::SlotPtr holder;
+ AcquiredSlotPtr holder;
for (int j = 0; j < count; j++)
{
auto slot = allocations[j]->tryAcquire();
@@ -60,11 +60,11 @@ TEST(ConcurrencyControl, Fifo)
TEST(ConcurrencyControl, Oversubscription)
{
ConcurrencyControlTest t(10);
- std::vector allocations;
+ std::vector allocations;
allocations.reserve(10);
for (int i = 0; i < 10; i++)
allocations.emplace_back(t.cc.allocate(1, 2));
- std::vector slots;
+ std::vector slots;
// Normal allocation using maximum amount of slots
for (int i = 0; i < 5; i++)
{
@@ -90,7 +90,7 @@ TEST(ConcurrencyControl, ReleaseUnacquiredSlots)
{
ConcurrencyControlTest t(10);
{
- std::vector allocations;
+ std::vector allocations;
allocations.reserve(10);
for (int i = 0; i < 10; i++)
allocations.emplace_back(t.cc.allocate(1, 2));
@@ -98,7 +98,7 @@ TEST(ConcurrencyControl, ReleaseUnacquiredSlots)
}
// Check that slots were actually released
auto allocation = t.cc.allocate(0, 20);
- std::vector acquired;
+ std::vector acquired;
while (auto slot = allocation->tryAcquire())
acquired.emplace_back(std::move(slot));
ASSERT_TRUE(acquired.size() == 10);
@@ -110,7 +110,7 @@ TEST(ConcurrencyControl, DestroyNotFullyAllocatedAllocation)
for (int i = 0; i < 3; i++)
{
auto allocation = t.cc.allocate(5, 20);
- std::vector acquired;
+ std::vector acquired;
while (auto slot = allocation->tryAcquire())
acquired.emplace_back(std::move(slot));
ASSERT_TRUE(acquired.size() == 10);
@@ -122,7 +122,7 @@ TEST(ConcurrencyControl, DestroyAllocationBeforeSlots)
ConcurrencyControlTest t(10);
for (int i = 0; i < 3; i++)
{
- std::vector acquired;
+ std::vector acquired;
auto allocation = t.cc.allocate(5, 20);
while (auto slot = allocation->tryAcquire())
acquired.emplace_back(std::move(slot));
@@ -135,7 +135,7 @@ TEST(ConcurrencyControl, GrantReleasedToTheSameAllocation)
{
ConcurrencyControlTest t(3);
auto allocation = t.cc.allocate(0, 10);
- std::list acquired;
+ std::list acquired;
while (auto slot = allocation->tryAcquire())
acquired.emplace_back(std::move(slot));
ASSERT_TRUE(acquired.size() == 3); // 0 1 2
@@ -183,7 +183,7 @@ TEST(ConcurrencyControl, SetSlotCount)
{
ConcurrencyControlTest t(10);
auto allocation = t.cc.allocate(5, 30);
- std::vector acquired;
+ std::vector acquired;
while (auto slot = allocation->tryAcquire())
acquired.emplace_back(std::move(slot));
ASSERT_TRUE(acquired.size() == 10);
@@ -200,7 +200,7 @@ TEST(ConcurrencyControl, SetSlotCount)
ASSERT_TRUE(acquired.size() == 5);
// Check that newly added slots are equally distributed over waiting allocations
- std::vector acquired2;
+ std::vector acquired2;
auto allocation2 = t.cc.allocate(0, 30);
ASSERT_TRUE(!allocation->tryAcquire());
t.cc.setMaxConcurrency(15); // 10 slots added: 5 to the first allocation and 5 to the second one
@@ -224,7 +224,7 @@ TEST(ConcurrencyControl, MultipleThreads)
auto run_query = [&] (size_t max_threads)
{
- ConcurrencyControl::AllocationPtr slots = t.cc.allocate(1, max_threads);
+ SlotAllocationPtr slots = t.cc.allocate(1, max_threads);
std::mutex threads_mutex;
std::vector threads;
threads.reserve(max_threads);
diff --git a/src/Interpreters/threadPoolCallbackRunner.h b/src/Common/threadPoolCallbackRunner.h
similarity index 100%
rename from src/Interpreters/threadPoolCallbackRunner.h
rename to src/Common/threadPoolCallbackRunner.h
diff --git a/src/Compression/CompressionCodecT64.cpp b/src/Compression/CompressionCodecT64.cpp
index bf9a9414bc1..3ddc56fe4f6 100644
--- a/src/Compression/CompressionCodecT64.cpp
+++ b/src/Compression/CompressionCodecT64.cpp
@@ -91,6 +91,7 @@ enum class MagicNumber : uint8_t
Decimal32 = 19,
Decimal64 = 20,
IPv4 = 21,
+ Date32 = 22,
};
MagicNumber serializeTypeId(std::optional type_id)
@@ -109,6 +110,7 @@ MagicNumber serializeTypeId(std::optional type_id)
case TypeIndex::Int32: return MagicNumber::Int32;
case TypeIndex::Int64: return MagicNumber::Int64;
case TypeIndex::Date: return MagicNumber::Date;
+ case TypeIndex::Date32: return MagicNumber::Date32;
case TypeIndex::DateTime: return MagicNumber::DateTime;
case TypeIndex::DateTime64: return MagicNumber::DateTime64;
case TypeIndex::Enum8: return MagicNumber::Enum8;
@@ -137,6 +139,7 @@ TypeIndex deserializeTypeId(uint8_t serialized_type_id)
case MagicNumber::Int32: return TypeIndex::Int32;
case MagicNumber::Int64: return TypeIndex::Int64;
case MagicNumber::Date: return TypeIndex::Date;
+ case MagicNumber::Date32: return TypeIndex::Date32;
case MagicNumber::DateTime: return TypeIndex::DateTime;
case MagicNumber::DateTime64: return TypeIndex::DateTime64;
case MagicNumber::Enum8: return TypeIndex::Enum8;
@@ -165,6 +168,7 @@ TypeIndex baseType(TypeIndex type_idx)
return TypeIndex::Int16;
case TypeIndex::Int32:
case TypeIndex::Decimal32:
+ case TypeIndex::Date32:
return TypeIndex::Int32;
case TypeIndex::Int64:
case TypeIndex::Decimal64:
@@ -205,6 +209,7 @@ TypeIndex typeIdx(const IDataType * data_type)
case TypeIndex::UInt16:
case TypeIndex::Enum16:
case TypeIndex::Date:
+ case TypeIndex::Date32:
case TypeIndex::Int32:
case TypeIndex::UInt32:
case TypeIndex::IPv4:
diff --git a/src/Coordination/Changelog.cpp b/src/Coordination/Changelog.cpp
index 5a58932606e..469244605e0 100644
--- a/src/Coordination/Changelog.cpp
+++ b/src/Coordination/Changelog.cpp
@@ -1,5 +1,12 @@
+#include
#include
+#include
+#include
#include
+#include
+#include
+#include
+#include
#include
#include
#include
@@ -14,8 +21,19 @@
#include
#include
#include
+#include
+#include
#include
+#include
+#include
+namespace ProfileEvents
+{
+ extern const Event KeeperLogsEntryReadFromLatestCache;
+ extern const Event KeeperLogsEntryReadFromCommitCache;
+ extern const Event KeeperLogsEntryReadFromFile;
+ extern const Event KeeperLogsPrefetchedEntries;
+}
namespace DB
{
@@ -33,25 +51,31 @@ namespace ErrorCodes
namespace
{
-constexpr std::string_view tmp_prefix = "tmp_";
-
-void moveFileBetweenDisks(DiskPtr disk_from, ChangelogFileDescriptionPtr description, DiskPtr disk_to, const std::string & path_to)
+void moveChangelogBetweenDisks(
+ DiskPtr disk_from,
+ ChangelogFileDescriptionPtr description,
+ DiskPtr disk_to,
+ const std::string & path_to,
+ const KeeperContextPtr & keeper_context)
{
- /// we use empty file with prefix tmp_ to detect incomplete copies
- /// if a copy is complete we don't care from which disk we use the same file
- /// so it's okay if a failure happens after removing of tmp file but before we remove
- /// the changelog from the source disk
- auto from_path = fs::path(description->path);
- auto tmp_changelog_name = from_path.parent_path() / (std::string{tmp_prefix} + from_path.filename().string());
- {
- auto buf = disk_to->writeFile(tmp_changelog_name);
- buf->finalize();
- }
- disk_from->copyFile(from_path, *disk_to, path_to, {});
- disk_to->removeFile(tmp_changelog_name);
- disk_from->removeFile(description->path);
- description->path = path_to;
- description->disk = disk_to;
+ moveFileBetweenDisks(
+ disk_from,
+ description->path,
+ disk_to,
+ path_to,
+ [&]
+ {
+ /// a different thread could be trying to read from the file
+ /// we should make sure the source disk contains the file while read is in progress
+ description->withLock(
+ [&]
+ {
+ description->disk = disk_to;
+ description->path = path_to;
+ });
+ },
+ getLogger("Changelog"),
+ keeper_context);
}
constexpr auto DEFAULT_PREFIX = "changelog";
@@ -111,9 +135,11 @@ class ChangelogWriter
public:
ChangelogWriter(
std::map & existing_changelogs_,
+ LogEntryStorage & entry_storage_,
KeeperContextPtr keeper_context_,
LogFileSettings log_file_settings_)
: existing_changelogs(existing_changelogs_)
+ , entry_storage(entry_storage_)
, log_file_settings(log_file_settings_)
, keeper_context(std::move(keeper_context_))
, log(getLogger("Changelog"))
@@ -173,15 +199,15 @@ public:
}
else
{
- moveFileBetweenDisks(log_disk, current_file_description, disk, new_path);
+ moveChangelogBetweenDisks(log_disk, current_file_description, disk, new_path, keeper_context);
}
}
}
auto latest_log_disk = getLatestLogDisk();
- assert(file_description->disk == latest_log_disk);
+ chassert(file_description->disk == latest_log_disk);
file_buf = latest_log_disk->writeFile(file_description->path, DBMS_DEFAULT_BUFFER_SIZE, mode);
- assert(file_buf);
+ chassert(file_buf);
last_index_written.reset();
current_file_description = std::move(file_description);
@@ -196,7 +222,7 @@ public:
}
catch (...)
{
- tryLogCurrentException(log);
+ tryLogCurrentException(log, "While setting new changelog file");
throw;
}
}
@@ -238,6 +264,7 @@ public:
}
auto & write_buffer = getBuffer();
+ auto current_position = initial_file_size + write_buffer.count();
writeIntBinary(computeRecordChecksum(record), write_buffer);
writeIntBinary(record.header.version, write_buffer);
@@ -255,6 +282,15 @@ public:
/// Flush compressed data to file buffer
compressed_buffer->next();
}
+ else
+ {
+ unflushed_indices_with_log_location.emplace_back(
+ record.header.index,
+ LogLocation{
+ .file_description = current_file_description,
+ .position = current_position,
+ .size = record.header.blob_size});
+ }
last_index_written = record.header.index;
@@ -272,6 +308,8 @@ public:
else
file_buffer->next();
}
+ entry_storage.addLogLocations(std::move(unflushed_indices_with_log_location));
+ unflushed_indices_with_log_location.clear();
}
uint64_t getStartIndex() const
@@ -314,9 +352,9 @@ public:
private:
void finalizeCurrentFile()
{
- assert(prealloc_done);
+ chassert(prealloc_done);
- assert(current_file_description);
+ chassert(current_file_description);
// compact can delete the file and we don't need to do anything
if (current_file_description->deleted)
{
@@ -400,9 +438,11 @@ private:
{
const auto * file_buffer = tryGetFileBuffer();
+ if (file_buffer)
+ initial_file_size = getSizeFromFileDescriptor(file_buffer->getFD());
+
if (log_file_settings.max_size == 0 || !file_buffer)
{
- initial_file_size = 0;
prealloc_done = true;
return;
}
@@ -428,7 +468,6 @@ private:
}
}
#endif
- initial_file_size = getSizeFromFileDescriptor(file_buffer->getFD());
prealloc_done = true;
}
@@ -441,6 +480,10 @@ private:
std::map & existing_changelogs;
+ LogEntryStorage & entry_storage;
+
+ std::vector> unflushed_indices_with_log_location;
+
ChangelogFileDescriptionPtr current_file_description{nullptr};
std::unique_ptr file_buf;
std::optional last_index_written;
@@ -457,22 +500,25 @@ private:
LoggerPtr const log;
};
+namespace
+{
+
struct ChangelogReadResult
{
/// Total entries read from log including skipped.
/// Useful when we decide to continue to write in the same log and want to know
/// how many entries was already written in it.
- uint64_t total_entries_read_from_log;
+ uint64_t total_entries_read_from_log{0};
/// First index in log
- uint64_t log_start_index;
+ uint64_t log_start_index{0};
/// First entry actually read log (not including skipped)
- uint64_t first_read_index;
+ uint64_t first_read_index{0};
/// Last entry read from log (last entry in log)
/// When we don't skip anything last_read_index - first_read_index = total_entries_read_from_log.
/// But when some entries from the start of log can be skipped because they are not required.
- uint64_t last_read_index;
+ uint64_t last_read_index{0};
/// last offset we were able to read from log
off_t last_position;
@@ -482,69 +528,99 @@ struct ChangelogReadResult
bool error;
};
+ChangelogRecord readChangelogRecord(ReadBuffer & read_buf, const std::string & filepath)
+{
+ /// Read checksum
+ Checksum record_checksum;
+ readIntBinary(record_checksum, read_buf);
+
+ /// Read header
+ ChangelogRecord record;
+ readIntBinary(record.header.version, read_buf);
+ readIntBinary(record.header.index, read_buf);
+ readIntBinary(record.header.term, read_buf);
+ readIntBinary(record.header.value_type, read_buf);
+ readIntBinary(record.header.blob_size, read_buf);
+
+ if (record.header.version > CURRENT_CHANGELOG_VERSION)
+ throw Exception(
+ ErrorCodes::UNKNOWN_FORMAT_VERSION, "Unsupported changelog version {} on path {}", static_cast(record.header.version), filepath);
+
+ /// Read data
+ if (record.header.blob_size != 0)
+ {
+ auto buffer = nuraft::buffer::alloc(record.header.blob_size);
+ auto * buffer_begin = reinterpret_cast(buffer->data_begin());
+ read_buf.readStrict(buffer_begin, record.header.blob_size);
+ record.blob = buffer;
+ }
+ else
+ record.blob = nullptr;
+
+ /// Compare checksums
+ Checksum checksum = computeRecordChecksum(record);
+ if (checksum != record_checksum)
+ {
+ throw Exception(
+ ErrorCodes::CHECKSUM_DOESNT_MATCH,
+ "Checksums doesn't match for log {} (version {}), index {}, blob_size {}",
+ filepath,
+ record.header.version,
+ record.header.index,
+ record.header.blob_size);
+ }
+
+ return record;
+}
+
+LogEntryPtr logEntryFromRecord(const ChangelogRecord & record)
+{
+ return nuraft::cs_new(record.header.term, record.blob, static_cast(record.header.value_type));
+}
+
+size_t logEntrySize(const LogEntryPtr & log_entry)
+{
+ return log_entry->get_buf().size();
+}
+
+LogEntryPtr getLogEntry(const CacheEntry & cache_entry)
+{
+ if (const auto * log_entry = std::get_if(&cache_entry))
+ return *log_entry;
+
+ const auto & prefetched_log_entry = std::get(cache_entry);
+ return prefetched_log_entry.getLogEntry();
+}
+
+}
+
class ChangelogReader
{
public:
- explicit ChangelogReader(DiskPtr disk_, const std::string & filepath_) : disk(disk_), filepath(filepath_)
+ explicit ChangelogReader(ChangelogFileDescriptionPtr changelog_description_) : changelog_description(changelog_description_)
{
- compression_method = chooseCompressionMethod(filepath, "");
- auto read_buffer_from_file = disk->readFile(filepath);
+ compression_method = chooseCompressionMethod(changelog_description->path, "");
+ auto read_buffer_from_file = changelog_description->disk->readFile(changelog_description->path);
read_buf = wrapReadBufferWithCompressionMethod(std::move(read_buffer_from_file), compression_method);
}
/// start_log_index -- all entries with index < start_log_index will be skipped, but accounted into total_entries_read_from_log
- ChangelogReadResult readChangelog(IndexToLogEntry & logs, uint64_t start_log_index, LoggerPtr log)
+ ChangelogReadResult readChangelog(LogEntryStorage & entry_storage, uint64_t start_log_index, LoggerPtr log)
{
ChangelogReadResult result{};
result.compressed_log = compression_method != CompressionMethod::None;
+ const auto & filepath = changelog_description->path;
try
{
while (!read_buf->eof())
{
result.last_position = read_buf->count();
- /// Read checksum
- Checksum record_checksum;
- readIntBinary(record_checksum, *read_buf);
- /// Read header
- ChangelogRecord record;
- readIntBinary(record.header.version, *read_buf);
- readIntBinary(record.header.index, *read_buf);
- readIntBinary(record.header.term, *read_buf);
- readIntBinary(record.header.value_type, *read_buf);
- readIntBinary(record.header.blob_size, *read_buf);
-
- if (record.header.version > CURRENT_CHANGELOG_VERSION)
- throw Exception(
- ErrorCodes::UNKNOWN_FORMAT_VERSION, "Unsupported changelog version {} on path {}", static_cast(record.header.version), filepath);
-
- /// Read data
- if (record.header.blob_size != 0)
- {
- auto buffer = nuraft::buffer::alloc(record.header.blob_size);
- auto * buffer_begin = reinterpret_cast(buffer->data_begin());
- read_buf->readStrict(buffer_begin, record.header.blob_size);
- record.blob = buffer;
- }
- else
- record.blob = nullptr;
-
- /// Compare checksums
- Checksum checksum = computeRecordChecksum(record);
- if (checksum != record_checksum)
- {
- throw Exception(
- ErrorCodes::CHECKSUM_DOESNT_MATCH,
- "Checksums doesn't match for log {} (version {}), index {}, blob_size {}",
- filepath,
- record.header.version,
- record.header.index,
- record.header.blob_size);
- }
+ auto record = readChangelogRecord(*read_buf, filepath);
/// Check for duplicated changelog ids
- if (logs.contains(record.header.index))
- std::erase_if(logs, [&record](const auto & item) { return item.first >= record.header.index; });
+ if (entry_storage.contains(record.header.index))
+ entry_storage.cleanAfter(record.header.index - 1);
result.total_entries_read_from_log += 1;
@@ -553,12 +629,18 @@ public:
continue;
/// Create log entry for read data
- auto log_entry = nuraft::cs_new(record.header.term, record.blob, static_cast(record.header.value_type));
+ auto log_entry = logEntryFromRecord(record);
if (result.first_read_index == 0)
result.first_read_index = record.header.index;
/// Put it into in memory structure
- logs.emplace(record.header.index, log_entry);
+ entry_storage.addEntryWithLocation(
+ record.header.index,
+ log_entry,
+ LogLocation{
+ .file_description = changelog_description,
+ .position = static_cast(result.last_position),
+ .size = record.header.blob_size});
result.last_read_index = record.header.index;
if (result.total_entries_read_from_log % 50000 == 0)
@@ -585,131 +667,971 @@ public:
}
private:
- DiskPtr disk;
- std::string filepath;
+ ChangelogFileDescriptionPtr changelog_description;
CompressionMethod compression_method;
std::unique_ptr read_buf;
};
+PrefetchedCacheEntry::PrefetchedCacheEntry()
+ : log_entry(log_entry_resolver.get_future())
+{}
+
+const LogEntryPtr & PrefetchedCacheEntry::getLogEntry() const
+{
+ return log_entry.get();
+}
+
+void PrefetchedCacheEntry::resolve(std::exception_ptr exception)
+{
+ log_entry_resolver.set_exception(exception);
+}
+
+void PrefetchedCacheEntry::resolve(LogEntryPtr log_entry_)
+{
+ log_entry_resolver.set_value(std::move(log_entry_));
+}
+
+LogEntryStorage::LogEntryStorage(const LogFileSettings & log_settings, KeeperContextPtr keeper_context_)
+ : latest_logs_cache(log_settings.latest_logs_cache_size_threshold)
+ , commit_logs_cache(log_settings.commit_logs_cache_size_threshold)
+ , prefetch_queue(std::numeric_limits::max())
+ , keeper_context(std::move(keeper_context_))
+ , log(getLogger("Changelog"))
+{
+ commit_logs_prefetcher = std::make_unique([this] { prefetchCommitLogs(); });
+}
+
+LogEntryStorage::~LogEntryStorage()
+{
+ shutdown();
+}
+
+void LogEntryStorage::prefetchCommitLogs()
+{
+ std::shared_ptr prefetch_info;
+ while (prefetch_queue.pop(prefetch_info))
+ {
+ if (prefetch_info->cancel)
+ {
+ prefetch_info->done = true;
+ prefetch_info->done.notify_all();
+ continue;
+ }
+
+ auto current_index = prefetch_info->commit_prefetch_index_range.first;
+ try
+ {
+ for (const auto & prefetch_file_info : prefetch_info->file_infos)
+ {
+ prefetch_file_info.file_description->withLock(
+ [&]
+ {
+ const auto & [changelog_description, position, count] = prefetch_file_info;
+ auto file = changelog_description->disk->readFile(changelog_description->path, ReadSettings());
+ file->seek(position, SEEK_SET);
+ LOG_TRACE(
+ log, "Prefetching {} log entries from path {}, from position {}", count, changelog_description->path, position);
+ ProfileEvents::increment(ProfileEvents::KeeperLogsPrefetchedEntries, count);
+
+ for (size_t i = 0; i < count; ++i)
+ {
+ if (prefetch_info->cancel)
+ break;
+
+ auto record = readChangelogRecord(*file, changelog_description->path);
+ auto entry = logEntryFromRecord(record);
+ if (current_index != record.header.index)
+ throw Exception(
+ ErrorCodes::LOGICAL_ERROR,
+ "Invalid index prefetched, expected {}, actual {}",
+ current_index,
+ record.header.index);
+
+ commit_logs_cache.getPrefetchedCacheEntry(record.header.index).resolve(std::move(entry));
+ ++current_index;
+ }
+ });
+
+ if (prefetch_info->cancel)
+ break;
+ }
+ }
+ catch (...)
+ {
+ tryLogCurrentException(log, "While prefetching log entries");
+ auto exception = std::current_exception();
+
+ for (; current_index <= prefetch_info->commit_prefetch_index_range.second; ++current_index)
+ commit_logs_cache.getPrefetchedCacheEntry(current_index).resolve(exception);
+ }
+
+ prefetch_info->done = true;
+ prefetch_info->done.notify_all();
+ }
+}
+
+void LogEntryStorage::startCommitLogsPrefetch(uint64_t last_committed_index) const
+{
+ if (keeper_context->isShutdownCalled())
+ return;
+
+ /// commit logs is not empty and it's not next log
+ if (!commit_logs_cache.empty() && commit_logs_cache.max_index_in_cache != last_committed_index)
+ return;
+
+ if (logs_location.empty())
+ return;
+
+ /// we are already prefetching some logs for commit
+ if (current_prefetch_info && !current_prefetch_info->done)
+ return;
+
+ auto new_prefetch_info = std::make_shared();
+ auto & [prefetch_from, prefetch_to] = new_prefetch_info->commit_prefetch_index_range;
+
+ /// if there are no entries in commit cache we will start from the next log that will be committed
+ /// otherwise we continue appending the commit cache from the latest entry stored in it
+ size_t current_index = commit_logs_cache.empty() ? last_committed_index + 1 : commit_logs_cache.max_index_in_cache + 1;
+
+ prefetch_from = current_index;
+
+ size_t total_size = 0;
+ std::vector file_infos;
+ FileReadInfo * current_file_info = nullptr;
+
+ size_t max_index_for_prefetch = 0;
+ if (!latest_logs_cache.empty())
+ max_index_for_prefetch = latest_logs_cache.min_index_in_cache - 1;
+ else
+ max_index_for_prefetch = max_index_with_location;
+
+ for (; current_index <= max_index_for_prefetch; ++current_index)
+ {
+ const auto & [changelog_description, position, size] = logs_location.at(current_index);
+ if (total_size == 0)
+ current_file_info = &file_infos.emplace_back(changelog_description, position, /* count */ 1);
+ else if (total_size + size > commit_logs_cache.size_threshold)
+ break;
+ else if (changelog_description == current_file_info->file_description)
+ ++current_file_info->count;
+ else
+ current_file_info = &file_infos.emplace_back(changelog_description, position, /* count */ 1);
+
+ total_size += size;
+ commit_logs_cache.addEntry(current_index, size, PrefetchedCacheEntry());
+ }
+
+ if (!file_infos.empty())
+ {
+ current_prefetch_info = std::move(new_prefetch_info);
+ prefetch_to = current_index - 1;
+ LOG_TRACE(log, "Will prefetch {} commit log entries [{} - {}]", prefetch_to - prefetch_from + 1, prefetch_from, prefetch_to);
+
+ current_prefetch_info->file_infos = std::move(file_infos);
+ auto inserted = prefetch_queue.push(current_prefetch_info);
+ chassert(inserted);
+ }
+}
+
+LogEntryStorage::InMemoryCache::InMemoryCache(size_t size_threshold_)
+ : size_threshold(size_threshold_)
+{}
+
+void LogEntryStorage::InMemoryCache::updateStatsWithNewEntry(uint64_t index, size_t size)
+{
+ cache_size += size;
+
+ if (cache.size() == 1)
+ {
+ min_index_in_cache = index;
+ max_index_in_cache = index;
+ }
+ else
+ {
+ chassert(index > max_index_in_cache);
+ max_index_in_cache = index;
+ }
+}
+
+void LogEntryStorage::InMemoryCache::addEntry(uint64_t index, size_t size, CacheEntry log_entry)
+{
+ auto [_, inserted] = cache.emplace(index, std::move(log_entry));
+ if (!inserted)
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "Trying to insert log with index {} which is already present in cache", index);
+
+ updateStatsWithNewEntry(index, size);
+}
+
+void LogEntryStorage::InMemoryCache::addEntry(IndexToCacheEntryNode && node)
+{
+ auto index = node.key();
+ auto entry_size = logEntrySize(getLogEntry(node.mapped()));
+
+ auto result = cache.insert(std::move(node));
+ if (!result.inserted)
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "Trying to insert log with index {} which is already present in cache", index);
+
+ updateStatsWithNewEntry(index, entry_size);
+}
+
+IndexToCacheEntryNode LogEntryStorage::InMemoryCache::popOldestEntry()
+{
+ auto node = cache.extract(min_index_in_cache);
+ if (node.empty())
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "Couldn't find the oldest entry of index {} in logs cache", min_index_in_cache);
+ ++min_index_in_cache;
+ cache_size -= logEntrySize(getLogEntry(node.mapped()));
+ return node;
+}
+
+bool LogEntryStorage::InMemoryCache::containsEntry(uint64_t index) const
+{
+ return !cache.empty() && index >= min_index_in_cache && index <= max_index_in_cache;
+}
+
+CacheEntry * LogEntryStorage::InMemoryCache::getCacheEntry(uint64_t index)
+{
+ if (!containsEntry(index))
+ return nullptr;
+
+ auto it = cache.find(index);
+ if (it == cache.end())
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "Index {} missing from cache while it should be present", index);
+
+ return &it->second;
+}
+
+const CacheEntry * LogEntryStorage::InMemoryCache::getCacheEntry(uint64_t index) const
+{
+ return const_cast(*this).getCacheEntry(index);
+}
+
+PrefetchedCacheEntry & LogEntryStorage::InMemoryCache::getPrefetchedCacheEntry(uint64_t index)
+{
+ auto * cache_entry = getCacheEntry(index);
+ if (cache_entry == nullptr)
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "Missing expected index {} in cache", index);
+
+ return std::get(*cache_entry);
+}
+
+
+LogEntryPtr LogEntryStorage::InMemoryCache::getEntry(uint64_t index) const
+{
+ const auto * cache_entry = getCacheEntry(index);
+ if (cache_entry == nullptr)
+ return nullptr;
+
+ return getLogEntry(*cache_entry);
+}
+
+void LogEntryStorage::InMemoryCache::cleanUpTo(uint64_t index)
+{
+ if (empty() || index <= min_index_in_cache)
+ return;
+
+ if (index > max_index_in_cache)
+ {
+ cache.clear();
+ cache_size = 0;
+ return;
+ }
+
+ for (size_t i = min_index_in_cache; i < index; ++i)
+ {
+ auto it = cache.find(i);
+ if (it == cache.end())
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "Log entry with index {} unexpectedly missing from cache", i);
+
+ cache_size -= logEntrySize(getLogEntry(it->second));
+ cache.erase(it);
+ }
+ min_index_in_cache = index;
+}
+
+void LogEntryStorage::InMemoryCache::cleanAfter(uint64_t index)
+{
+ if (empty() || index >= max_index_in_cache)
+ return;
+
+ if (index < min_index_in_cache)
+ {
+ cache.clear();
+ cache_size = 0;
+ return;
+ }
+
+ for (size_t i = index + 1; i <= max_index_in_cache; ++i)
+ {
+ auto it = cache.find(i);
+ if (it == cache.end())
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "Log entry with index {} unexpectedly missing from cache", i);
+
+ cache_size -= logEntrySize(getLogEntry(it->second));
+ cache.erase(it);
+ }
+
+ max_index_in_cache = index;
+}
+
+void LogEntryStorage::InMemoryCache::clear()
+{
+ cache.clear();
+ cache_size = 0;
+}
+
+bool LogEntryStorage::InMemoryCache::empty() const
+{
+ return cache.empty();
+}
+
+size_t LogEntryStorage::InMemoryCache::numberOfEntries() const
+{
+ return cache.size();
+}
+
+bool LogEntryStorage::InMemoryCache::hasSpaceAvailable(size_t log_entry_size) const
+{
+ return size_threshold == 0 || empty() || cache_size + log_entry_size < size_threshold;
+}
+
+void LogEntryStorage::addEntry(uint64_t index, const LogEntryPtr & log_entry)
+{
+ /// we update the cache for added entries on refreshCache call
+ latest_logs_cache.addEntry(index, logEntrySize(log_entry), log_entry);
+
+ if (log_entry->get_val_type() == nuraft::conf)
+ {
+ latest_config = log_entry;
+ latest_config_index = index;
+ logs_with_config_changes.insert(index);
+ }
+
+ updateTermInfoWithNewEntry(index, log_entry->get_term());
+}
+
+bool LogEntryStorage::shouldMoveLogToCommitCache(uint64_t index, size_t log_entry_size)
+{
+ /// if commit logs cache is empty, we need it only if it's the next log to commit
+ if (commit_logs_cache.empty())
+ return keeper_context->lastCommittedIndex() + 1 == index;
+
+ return commit_logs_cache.max_index_in_cache == index - 1 && commit_logs_cache.hasSpaceAvailable(log_entry_size);
+}
+
+void LogEntryStorage::updateTermInfoWithNewEntry(uint64_t index, uint64_t term)
+{
+ if (!log_term_infos.empty() && log_term_infos.back().term == term)
+ return;
+
+ log_term_infos.push_back(LogTermInfo{.term = term, .first_index = index});
+}
+
+void LogEntryStorage::addEntryWithLocation(uint64_t index, const LogEntryPtr & log_entry, LogLocation log_location)
+{
+ auto entry_size = logEntrySize(log_entry);
+ while (!latest_logs_cache.hasSpaceAvailable(entry_size))
+ {
+ auto entry_handle = latest_logs_cache.popOldestEntry();
+ size_t removed_entry_size = logEntrySize(getLogEntry(entry_handle.mapped()));
+ if (shouldMoveLogToCommitCache(entry_handle.key(), removed_entry_size))
+ commit_logs_cache.addEntry(std::move(entry_handle));
+ }
+ latest_logs_cache.addEntry(index, entry_size, CacheEntry(log_entry));
+
+ logs_location.emplace(index, std::move(log_location));
+
+ if (logs_location.size() == 1)
+ min_index_with_location = index;
+
+ max_index_with_location = index;
+
+ if (log_entry->get_val_type() == nuraft::conf)
+ {
+ latest_config = log_entry;
+ latest_config_index = index;
+ logs_with_config_changes.insert(index);
+ }
+
+ updateTermInfoWithNewEntry(index, log_entry->get_term());
+}
+
+void LogEntryStorage::cleanUpTo(uint64_t index)
+{
+ latest_logs_cache.cleanUpTo(index);
+
+ if (!logs_location.empty() && index > min_index_with_location)
+ {
+ if (index > max_index_with_location)
+ {
+ logs_location.clear();
+ }
+ else
+ {
+ for (size_t i = min_index_with_location; i < index; ++i)
+ {
+ auto it = logs_location.find(i);
+ if (it == logs_location.end())
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "Log entry with index {} unexpectedly missing from logs location", i);
+
+ logs_location.erase(it);
+ }
+
+ min_index_with_location = index;
+
+ }
+ }
+
+ {
+ std::lock_guard lock(logs_location_mutex);
+ if (!unapplied_indices_with_log_locations.empty())
+ {
+ auto last = std::ranges::lower_bound(
+ unapplied_indices_with_log_locations,
+ index,
+ std::ranges::less{},
+ [](const auto & index_with_location) { return index_with_location.first; });
+
+ unapplied_indices_with_log_locations.erase(unapplied_indices_with_log_locations.begin(), last);
+ }
+ }
+
+ /// uncommitted logs should be compacted only if we received snapshot from leader
+ if (current_prefetch_info && !current_prefetch_info->done)
+ {
+ auto [prefetch_from, prefetch_to] = current_prefetch_info->commit_prefetch_index_range;
+ /// if we will clean some logs that are currently prefetched, stop prefetching
+ /// and clean all logs from it
+ if (index > prefetch_from)
+ {
+ current_prefetch_info->cancel = true;
+ current_prefetch_info->done.wait(false);
+ commit_logs_cache.clear();
+ }
+
+ /// start prefetching logs for committing at the current index
+ /// the last log index in the snapshot should be the
+ /// last log we cleaned up
+ startCommitLogsPrefetch(index - 1);
+ }
+ else
+ commit_logs_cache.cleanUpTo(index);
+
+ std::erase_if(logs_with_config_changes, [&](const auto conf_index) { return conf_index < index; });
+ if (auto it = std::max_element(logs_with_config_changes.begin(), logs_with_config_changes.end()); it != logs_with_config_changes.end())
+ {
+ latest_config_index = *it;
+ latest_config = getEntry(latest_config_index);
+ }
+ else
+ latest_config = nullptr;
+
+ if (first_log_index < index)
+ first_log_entry = nullptr;
+
+ /// remove all the term infos we don't need (all terms that start before index)
+ uint64_t last_removed_term = 0;
+ while (!log_term_infos.empty() && log_term_infos.front().first_index < index)
+ {
+ last_removed_term = log_term_infos.front().term;
+ log_term_infos.pop_front();
+ }
+
+ /// the last removed term info could contain terms for some indices we didn't cleanup
+ /// so we add the last removed term info back but with new first index
+ if (last_removed_term != 0 && (log_term_infos.empty() || log_term_infos.front().first_index > index))
+ log_term_infos.push_front(LogTermInfo{.term = last_removed_term, .first_index = index});
+}
+
+void LogEntryStorage::cleanAfter(uint64_t index)
+{
+ latest_logs_cache.cleanAfter(index);
+
+ if (!logs_location.empty() && index < max_index_with_location)
+ {
+ if (index < min_index_with_location)
+ {
+ logs_location.clear();
+ }
+ else
+ {
+ for (size_t i = index + 1; i <= max_index_with_location; ++i)
+ {
+ auto it = logs_location.find(i);
+ if (it == logs_location.end())
+ throw Exception(ErrorCodes::LOGICAL_ERROR, "Log entry with index {} unexpectedly missing from logs location", i);
+
+ logs_location.erase(it);
+ }
+
+ max_index_with_location = index;
+ }
+ }
+
+ {
+ std::lock_guard lock(logs_location_mutex);
+ if (!unapplied_indices_with_log_locations.empty())
+ {
+ auto first = std::ranges::upper_bound(
+ unapplied_indices_with_log_locations,
+ index,
+ std::ranges::less{},
+ [](const auto & index_with_location) { return index_with_location.first; });
+
+ unapplied_indices_with_log_locations.erase(first, unapplied_indices_with_log_locations.end());
+ }
+ }
+
+ /// if we cleared all latest logs, there is a possibility we would need to clear commit logs
+ if (latest_logs_cache.empty())
+ {
+ /// we will clean everything after the index, if there is a prefetch in progress
+ /// wait until we fetch everything until index
+ /// afterwards we can stop prefetching of newer logs because they will be cleaned up
+ commit_logs_cache.getEntry(index);
+ if (current_prefetch_info && !current_prefetch_info->done)
+ {
+ auto [prefetch_from, prefetch_to] = current_prefetch_info->commit_prefetch_index_range;
+ /// if we will clean some logs that are currently prefetched, stop prefetching
+ if (index < prefetch_to)
+ {
+ current_prefetch_info->cancel = true;
+ current_prefetch_info->done.wait(false);
+ }
+ }
+
+ commit_logs_cache.cleanAfter(index);
+ startCommitLogsPrefetch(keeper_context->lastCommittedIndex());
+ }
+
+ if (empty() || first_log_index > index)
+ {
+ /// if we don't store any logs or if the first log index changed, reset first log cache
+ first_log_entry = nullptr;
+ }
+
+ std::erase_if(logs_with_config_changes, [&](const auto conf_index) { return conf_index > index; });
+ if (auto it = std::max_element(logs_with_config_changes.begin(), logs_with_config_changes.end()); it != logs_with_config_changes.end())
+ {
+ latest_config_index = *it;
+ latest_config = getEntry(latest_config_index);
+ }
+ else
+ latest_config = nullptr;
+
+ /// remove all the term infos we don't need (all terms that start after index)
+ while (!log_term_infos.empty() && log_term_infos.back().first_index > index)
+ log_term_infos.pop_back();
+}
+
+bool LogEntryStorage::contains(uint64_t index) const
+{
+ return logs_location.contains(index) || latest_logs_cache.containsEntry(index);
+}
+
+LogEntryPtr LogEntryStorage::getEntry(uint64_t index) const
+{
+ auto last_committed_index = keeper_context->lastCommittedIndex();
+ commit_logs_cache.cleanUpTo(last_committed_index);
+ startCommitLogsPrefetch(last_committed_index);
+
+ LogEntryPtr entry = nullptr;
+
+ if (latest_config != nullptr && index == latest_config_index)
+ return latest_config;
+
+ if (first_log_entry != nullptr && index == first_log_index)
+ return first_log_entry;
+
+ if (auto entry_from_latest_cache = latest_logs_cache.getEntry(index))
+ {
+ ProfileEvents::increment(ProfileEvents::KeeperLogsEntryReadFromLatestCache);
+ return entry_from_latest_cache;
+ }
+
+ if (auto entry_from_commit_cache = commit_logs_cache.getEntry(index))
+ {
+ ProfileEvents::increment(ProfileEvents::KeeperLogsEntryReadFromCommitCache);
+ return entry_from_commit_cache;
+ }
+
+ if (auto it = logs_location.find(index); it != logs_location.end())
+ {
+ it->second.file_description->withLock(
+ [&]
+ {
+ const auto & [changelog_description, position, size] = it->second;
+ auto file = changelog_description->disk->readFile(changelog_description->path, ReadSettings());
+ file->seek(position, SEEK_SET);
+ LOG_TRACE(
+ log,
+ "Reading log entry at index {} from path {}, position {}, size {}",
+ index,
+ changelog_description->path,
+ position,
+ size);
+
+ auto record = readChangelogRecord(*file, changelog_description->path);
+ entry = logEntryFromRecord(record);
+ });
+
+ /// if we fetched the first log entry, we will cache it because it's often accessed
+ if (first_log_entry == nullptr && index == getFirstIndex())
+ {
+ first_log_index = index;
+ first_log_entry = entry;
+ }
+
+ ProfileEvents::increment(ProfileEvents::KeeperLogsEntryReadFromFile);
+ }
+
+ return entry;
+}
+
+void LogEntryStorage::clear()
+{
+ latest_logs_cache.clear();
+ commit_logs_cache.clear();
+ logs_location.clear();
+}
+
+LogEntryPtr LogEntryStorage::getLatestConfigChange() const
+{
+ return latest_config;
+}
+
+uint64_t LogEntryStorage::termAt(uint64_t index) const
+{
+ uint64_t term_for_index = 0;
+ for (const auto [term, first_index] : log_term_infos)
+ {
+ if (index < first_index)
+ return term_for_index;
+
+ term_for_index = term;
+ }
+
+ return term_for_index;
+}
+
+void LogEntryStorage::addLogLocations(std::vector> && indices_with_log_locations)
+{
+ /// if we have unlimited space in latest logs cache we don't need log location
+ if (latest_logs_cache.size_threshold == 0)
+ return;
+
+ std::lock_guard lock(logs_location_mutex);
+ unapplied_indices_with_log_locations.insert(
+ unapplied_indices_with_log_locations.end(),
+ std::make_move_iterator(indices_with_log_locations.begin()),
+ std::make_move_iterator(indices_with_log_locations.end()));
+}
+
+void LogEntryStorage::refreshCache()
+{
+ /// if we have unlimited space in latest logs cache we don't need log location
+ if (latest_logs_cache.size_threshold == 0)
+ return;
+
+ std::vector new_unapplied_indices_with_log_locations;
+ {
+ std::lock_guard lock(logs_location_mutex);
+ new_unapplied_indices_with_log_locations.swap(unapplied_indices_with_log_locations);
+ }
+
+ for (auto & [index, log_location] : new_unapplied_indices_with_log_locations)
+ {
+ if (logs_location.empty())
+ min_index_with_location = index;
+
+ logs_location.emplace(index, std::move(log_location));
+ max_index_with_location = index;
+ }
+
+ if (logs_location.empty())
+ return;
+
+ while (latest_logs_cache.numberOfEntries() > 1 && latest_logs_cache.min_index_in_cache <= max_index_with_location
+ && latest_logs_cache.cache_size > latest_logs_cache.size_threshold)
+ {
+ auto node = latest_logs_cache.popOldestEntry();
+ auto log_entry_size = logEntrySize(getLogEntry(node.mapped()));
+ if (shouldMoveLogToCommitCache(node.key(), log_entry_size))
+ commit_logs_cache.addEntry(std::move(node));
+ }
+}
+
+LogEntriesPtr LogEntryStorage::getLogEntriesBetween(uint64_t start, uint64_t end) const
+{
+ LogEntriesPtr ret = nuraft::cs_new>>();
+ ret->reserve(end - start);
+
+ /// we rely on fact that changelogs need to be written sequentially with
+ /// no other writes between
+ std::optional read_info;
+ const auto set_new_file = [&](const auto & log_location)
+ {
+ read_info.emplace();
+ read_info->file_description = log_location.file_description;
+ read_info->position = log_location.position;
+ read_info->count = 1;
+ };
+
+ const auto flush_file = [&]
+ {
+ if (!read_info)
+ return;
+
+ LOG_TRACE(log, "Reading from path {} {} entries", read_info->file_description->path, read_info->count);
+ read_info->file_description->withLock(
+ [&]
+ {
+ const auto & [file_description, start_position, count] = *read_info;
+ auto file = file_description->disk->readFile(file_description->path);
+ file->seek(start_position, SEEK_SET);
+
+ for (size_t i = 0; i < count; ++i)
+ {
+ auto record = readChangelogRecord(*file, file_description->path);
+ ret->push_back(logEntryFromRecord(record));
+ ProfileEvents::increment(ProfileEvents::KeeperLogsEntryReadFromFile);
+ }
+ });
+
+ read_info.reset();
+ };
+
+ for (size_t i = start; i < end; ++i)
+ {
+ if (auto commit_cache_entry = commit_logs_cache.getEntry(i))
+ {
+ flush_file();
+ ret->push_back(std::move(commit_cache_entry));
+ }
+ else if (auto latest_cache_entry = latest_logs_cache.getEntry(i))
+ {
+ flush_file();
+ ret->push_back(std::move(latest_cache_entry));
+ }
+ else
+ {
+ const auto & log_location = logs_location.at(i);
+
+ if (!read_info)
+ set_new_file(log_location);
+ else if (read_info->file_description == log_location.file_description)
+ ++read_info->count;
+ else
+ {
+ flush_file();
+ set_new_file(log_location);
+ }
+ }
+ }
+
+ flush_file();
+ return ret;
+}
+
+void LogEntryStorage::getKeeperLogInfo(KeeperLogInfo & log_info) const
+{
+ log_info.latest_logs_cache_entries = latest_logs_cache.numberOfEntries();
+ log_info.latest_logs_cache_size = latest_logs_cache.cache_size;
+
+ log_info.commit_logs_cache_entries = commit_logs_cache.numberOfEntries();
+ log_info.commit_logs_cache_size = commit_logs_cache.cache_size;
+}
+
+bool LogEntryStorage::isConfigLog(uint64_t index) const
+{
+ return logs_with_config_changes.contains(index);
+}
+
+size_t LogEntryStorage::empty() const
+{
+ return logs_location.empty() && latest_logs_cache.empty();
+}
+
+size_t LogEntryStorage::size() const
+{
+ if (empty())
+ return 0;
+
+ size_t min_index = 0;
+ size_t max_index = 0;
+
+ if (!logs_location.empty())
+ {
+ min_index = min_index_with_location;
+ max_index = max_index_with_location;
+ }
+ else
+ min_index = latest_logs_cache.min_index_in_cache;
+
+ if (!latest_logs_cache.empty())
+ max_index = latest_logs_cache.max_index_in_cache;
+
+ return max_index - min_index + 1;
+}
+
+size_t LogEntryStorage::getFirstIndex() const
+{
+ if (!logs_location.empty())
+ return min_index_with_location;
+
+ if (!latest_logs_cache.empty())
+ return latest_logs_cache.min_index_in_cache;
+
+ return 0;
+}
+
+void LogEntryStorage::shutdown()
+{
+ if (std::exchange(is_shutdown, true))
+ return;
+
+ if (!prefetch_queue.isFinished())
+ prefetch_queue.finish();
+
+ if (current_prefetch_info)
+ {
+ current_prefetch_info->cancel = true;
+ current_prefetch_info->done.wait(false);
+ }
+
+ if (commit_logs_prefetcher->joinable())
+ commit_logs_prefetcher->join();
+}
+
Changelog::Changelog(
LoggerPtr log_, LogFileSettings log_file_settings, FlushSettings flush_settings_, KeeperContextPtr keeper_context_)
: changelogs_detached_dir("detached")
, rotate_interval(log_file_settings.rotate_interval)
, compress_logs(log_file_settings.compress_logs)
, log(log_)
+ , entry_storage(log_file_settings, keeper_context_)
, write_operations(std::numeric_limits::max())
, append_completion_queue(std::numeric_limits::max())
, keeper_context(std::move(keeper_context_))
, flush_settings(flush_settings_)
{
- if (auto latest_log_disk = getLatestLogDisk();
- log_file_settings.force_sync && dynamic_cast(latest_log_disk.get()) == nullptr)
+ try
{
- throw DB::Exception(
- DB::ErrorCodes::BAD_ARGUMENTS,
- "force_sync is set to true for logs but disk '{}' cannot satisfy such guarantee because it's not of type DiskLocal.\n"
- "If you want to use force_sync and same disk for all logs, please set keeper_server.log_storage_disk to a local disk.\n"
- "If you want to use force_sync and different disk only for old logs, please set 'keeper_server.log_storage_disk' to any "
- "supported disk and 'keeper_server.latest_log_storage_disk' to a local disk.\n"
- "Otherwise, disable force_sync",
- latest_log_disk->getName());
- }
-
- /// Load all files on changelog disks
-
- std::unordered_set read_disks;
-
- const auto load_from_disk = [&](const auto & disk)
- {
- if (read_disks.contains(disk))
- return;
-
- LOG_TRACE(log, "Reading from disk {}", disk->getName());
- std::unordered_map incomplete_files;
-
- const auto clean_incomplete_file = [&](const auto & file_path)
+ if (auto latest_log_disk = getLatestLogDisk();
+ log_file_settings.force_sync && dynamic_cast(latest_log_disk.get()) == nullptr)
{
- if (auto incomplete_it = incomplete_files.find(fs::path(file_path).filename()); incomplete_it != incomplete_files.end())
+ throw DB::Exception(
+ DB::ErrorCodes::BAD_ARGUMENTS,
+ "force_sync is set to true for logs but disk '{}' cannot satisfy such guarantee because it's not of type DiskLocal.\n"
+ "If you want to use force_sync and same disk for all logs, please set keeper_server.log_storage_disk to a local disk.\n"
+ "If you want to use force_sync and different disk only for old logs, please set 'keeper_server.log_storage_disk' to any "
+ "supported disk and 'keeper_server.latest_log_storage_disk' to a local disk.\n"
+ "Otherwise, disable force_sync",
+ latest_log_disk->getName());
+ }
+
+ /// Load all files on changelog disks
+
+ std::unordered_set read_disks;
+
+ const auto load_from_disk = [&](const auto & disk)
+ {
+ if (read_disks.contains(disk))
+ return;
+
+ LOG_TRACE(log, "Reading from disk {}", disk->getName());
+ std::unordered_map incomplete_files;
+
+ const auto clean_incomplete_file = [&](const auto & file_path)
{
- LOG_TRACE(log, "Removing {} from {}", file_path, disk->getName());
- disk->removeFile(file_path);
- disk->removeFile(incomplete_it->second);
- incomplete_files.erase(incomplete_it);
- return true;
+ if (auto incomplete_it = incomplete_files.find(fs::path(file_path).filename()); incomplete_it != incomplete_files.end())
+ {
+ LOG_TRACE(log, "Removing {} from {}", file_path, disk->getName());
+ disk->removeFile(file_path);
+ disk->removeFile(incomplete_it->second);
+ incomplete_files.erase(incomplete_it);
+ return true;
+ }
+
+ return false;
+ };
+
+ std::vector changelog_files;
+ for (auto it = disk->iterateDirectory(""); it->isValid(); it->next())
+ {
+ const auto & file_name = it->name();
+ if (file_name == changelogs_detached_dir)
+ continue;
+
+ if (file_name.starts_with(tmp_keeper_file_prefix))
+ {
+ incomplete_files.emplace(file_name.substr(tmp_keeper_file_prefix.size()), it->path());
+ continue;
+ }
+
+ if (file_name.starts_with(DEFAULT_PREFIX))
+ {
+ if (!clean_incomplete_file(it->path()))
+ changelog_files.push_back(it->path());
+ }
+ else
+ {
+ LOG_WARNING(log, "Unknown file found in log directory: {}", file_name);
+ }
}
- return false;
+ for (const auto & changelog_file : changelog_files)
+ {
+ if (clean_incomplete_file(fs::path(changelog_file).filename()))
+ continue;
+
+ auto file_description = getChangelogFileDescription(changelog_file);
+ file_description->disk = disk;
+
+ LOG_TRACE(log, "Found {} on {}", changelog_file, disk->getName());
+ auto [changelog_it, inserted] = existing_changelogs.insert_or_assign(file_description->from_log_index, std::move(file_description));
+
+ if (!inserted)
+ LOG_WARNING(log, "Found duplicate entries for {}, will use the entry from {}", changelog_it->second->path, disk->getName());
+ }
+
+ for (const auto & [name, path] : incomplete_files)
+ disk->removeFile(path);
+
+ read_disks.insert(disk);
};
- std::vector changelog_files;
- for (auto it = disk->iterateDirectory(""); it->isValid(); it->next())
- {
- const auto & file_name = it->name();
- if (file_name == changelogs_detached_dir)
- continue;
+ /// Load all files from old disks
+ for (const auto & disk : keeper_context->getOldLogDisks())
+ load_from_disk(disk);
- if (file_name.starts_with(tmp_prefix))
- {
- incomplete_files.emplace(file_name.substr(tmp_prefix.size()), it->path());
- continue;
- }
-
- if (file_name.starts_with(DEFAULT_PREFIX))
- {
- if (!clean_incomplete_file(it->path()))
- changelog_files.push_back(it->path());
- }
- else
- {
- LOG_WARNING(log, "Unknown file found in log directory: {}", file_name);
- }
- }
-
- for (const auto & changelog_file : changelog_files)
- {
- if (clean_incomplete_file(fs::path(changelog_file).filename()))
- continue;
-
- auto file_description = getChangelogFileDescription(changelog_file);
- file_description->disk = disk;
-
- LOG_TRACE(log, "Found {} on {}", changelog_file, disk->getName());
- auto [changelog_it, inserted] = existing_changelogs.insert_or_assign(file_description->from_log_index, std::move(file_description));
-
- if (!inserted)
- LOG_WARNING(log, "Found duplicate entries for {}, will use the entry from {}", changelog_it->second->path, disk->getName());
- }
-
- for (const auto & [name, path] : incomplete_files)
- disk->removeFile(path);
-
- read_disks.insert(disk);
- };
-
- /// Load all files from old disks
- for (const auto & disk : keeper_context->getOldLogDisks())
+ auto disk = getDisk();
load_from_disk(disk);
- auto disk = getDisk();
- load_from_disk(disk);
+ auto latest_log_disk = getLatestLogDisk();
+ if (disk != latest_log_disk)
+ load_from_disk(latest_log_disk);
- auto latest_log_disk = getLatestLogDisk();
- if (disk != latest_log_disk)
- load_from_disk(latest_log_disk);
+ if (existing_changelogs.empty())
+ LOG_WARNING(log, "No logs exists in {}. It's Ok if it's the first run of clickhouse-keeper.", disk->getPath());
- if (existing_changelogs.empty())
- LOG_WARNING(log, "No logs exists in {}. It's Ok if it's the first run of clickhouse-keeper.", disk->getPath());
+ clean_log_thread = std::make_unique([this] { cleanLogThread(); });
- clean_log_thread = ThreadFromGlobalPool([this] { cleanLogThread(); });
+ write_thread = std::make_unique([this] { writeThread(); });
- write_thread = ThreadFromGlobalPool([this] { writeThread(); });
+ append_completion_thread = std::make_unique([this] { appendCompletionThread(); });
- append_completion_thread = ThreadFromGlobalPool([this] { appendCompletionThread(); });
-
- current_writer = std::make_unique(existing_changelogs, keeper_context, log_file_settings);
+ current_writer = std::make_unique(existing_changelogs, entry_storage, keeper_context, log_file_settings);
+ }
+ catch (...)
+ {
+ tryLogCurrentException(log);
+ throw;
+ }
}
void Changelog::readChangelogAndInitWriter(uint64_t last_commited_log_index, uint64_t logs_to_keep)
+try
{
std::lock_guard writer_lock(writer_mutex);
std::optional<ChangelogReadResult> last_log_read_result;
@@ -751,7 +1673,6 @@ void Changelog::readChangelogAndInitWriter(uint64_t last_commited_log_index, uin
changelog_description.from_log_index);
/// Nothing to do with our more fresh log, leader will overwrite them, so remove everything and just start from last_commited_index
removeAllLogs();
- min_log_id = last_commited_log_index;
max_log_id = last_commited_log_index == 0 ? 0 : last_commited_log_index - 1;
current_writer->rotate(max_log_id + 1);
initialized = true;
@@ -783,18 +1704,14 @@ void Changelog::readChangelogAndInitWriter(uint64_t last_commited_log_index, uin
break;
}
- ChangelogReader reader(changelog_description.disk, changelog_description.path);
- last_log_read_result = reader.readChangelog(logs, start_to_read_from, log);
+ ChangelogReader reader(changelog_description_ptr);
+ last_log_read_result = reader.readChangelog(entry_storage, start_to_read_from, log);
if (last_log_read_result->last_read_index != 0)
last_read_index = last_log_read_result->last_read_index;
last_log_read_result->log_start_index = changelog_description.from_log_index;
- /// Otherwise we have already initialized it
- if (min_log_id == 0)
- min_log_id = last_log_read_result->first_read_index;
-
if (last_log_read_result->last_read_index != 0)
max_log_id = last_log_read_result->last_read_index;
@@ -813,16 +1730,14 @@ void Changelog::readChangelogAndInitWriter(uint64_t last_commited_log_index, uin
auto disk = getDisk();
if (latest_log_disk != disk && latest_log_disk == description->disk)
- moveFileBetweenDisks(latest_log_disk, description, disk, description->path);
+ moveChangelogBetweenDisks(latest_log_disk, description, disk, description->path, keeper_context);
};
/// we can have empty log (with zero entries) and last_log_read_result will be initialized
- if (!last_log_read_result || min_log_id == 0) /// We just may have no logs (only snapshot or nothing)
+ if (!last_log_read_result || entry_storage.empty()) /// We just may have no logs (only snapshot or nothing)
{
/// Just to be sure they don't exist
removeAllLogs();
-
- min_log_id = last_commited_log_index;
max_log_id = last_commited_log_index == 0 ? 0 : last_commited_log_index - 1;
}
else if (last_commited_log_index != 0 && max_log_id < last_commited_log_index - 1) /// If we have more fresh snapshot than our logs
@@ -834,7 +1749,6 @@ void Changelog::readChangelogAndInitWriter(uint64_t last_commited_log_index, uin
last_commited_log_index - 1);
removeAllLogs();
- min_log_id = last_commited_log_index;
max_log_id = last_commited_log_index - 1;
}
else if (last_log_is_not_complete) /// if it's complete just start new one
@@ -861,13 +1775,13 @@ void Changelog::readChangelogAndInitWriter(uint64_t last_commited_log_index, uin
remove_invalid_logs();
description->disk->removeFile(description->path);
existing_changelogs.erase(last_log_read_result->log_start_index);
- std::erase_if(logs, [last_log_read_result](const auto & item) { return item.first >= last_log_read_result->log_start_index; });
+ entry_storage.cleanAfter(last_log_read_result->log_start_index - 1);
}
else if (last_log_read_result->error)
{
- LOG_INFO(log, "Chagelog {} read finished with error but some logs were read from it, file will not be removed", description->path);
+ LOG_INFO(log, "Changelog {} read finished with error but some logs were read from it, file will not be removed", description->path);
remove_invalid_logs();
- std::erase_if(logs, [last_log_read_result](const auto & item) { return item.first > last_log_read_result->last_read_index; });
+ entry_storage.cleanAfter(last_log_read_result->last_read_index);
move_from_latest_logs_disks(existing_changelogs.at(last_log_read_result->log_start_index));
}
/// don't mix compressed and uncompressed writes
@@ -899,12 +1813,15 @@ void Changelog::readChangelogAndInitWriter(uint64_t last_commited_log_index, uin
}
if (description->disk != disk)
- moveFileBetweenDisks(description->disk, description, disk, description->path);
+ moveChangelogBetweenDisks(description->disk, description, disk, description->path, keeper_context);
}
-
initialized = true;
}
+catch (...)
+{
+ tryLogCurrentException(__PRETTY_FUNCTION__);
+}
void Changelog::initWriter(ChangelogFileDescriptionPtr description)
@@ -921,7 +1838,7 @@ void Changelog::initWriter(ChangelogFileDescriptionPtr description)
auto log_disk = description->disk;
auto latest_log_disk = getLatestLogDisk();
if (log_disk != latest_log_disk)
- moveFileBetweenDisks(log_disk, description, latest_log_disk, description->path);
+ moveChangelogBetweenDisks(log_disk, description, latest_log_disk, description->path, keeper_context);
current_writer->setFile(std::move(description), WriteMode::Append);
}
@@ -984,11 +1901,11 @@ void Changelog::removeExistingLogs(ChangelogIter begin, ChangelogIter end)
catch (const DB::Exception & e)
{
if (e.code() == DB::ErrorCodes::NOT_IMPLEMENTED)
- moveFileBetweenDisks(changelog_disk, changelog_description, disk, new_path);
+ moveChangelogBetweenDisks(changelog_disk, changelog_description, disk, new_path, keeper_context);
}
}
else
- moveFileBetweenDisks(changelog_disk, changelog_description, disk, new_path);
+ moveChangelogBetweenDisks(changelog_disk, changelog_description, disk, new_path, keeper_context);
itr = existing_changelogs.erase(itr);
}
@@ -1006,14 +1923,14 @@ void Changelog::removeAllLogsAfter(uint64_t remove_after_log_start_index)
LOG_WARNING(log, "Removing changelogs that go after broken changelog entry");
removeExistingLogs(start_to_remove_from_itr, existing_changelogs.end());
- std::erase_if(logs, [start_to_remove_from_log_id](const auto & item) { return item.first >= start_to_remove_from_log_id; });
+ entry_storage.cleanAfter(start_to_remove_from_log_id - 1);
}
void Changelog::removeAllLogs()
{
LOG_WARNING(log, "Removing all changelogs");
removeExistingLogs(existing_changelogs.begin(), existing_changelogs.end());
- logs.clear();
+ entry_storage.clear();
}
ChangelogRecord Changelog::buildRecord(uint64_t index, const LogEntryPtr & log_entry)
@@ -1045,7 +1962,7 @@ void Changelog::appendCompletionThread()
if (auto raft_server_locked = raft_server.lock())
raft_server_locked->notify_log_append_completion(append_ok);
else
- LOG_WARNING(log, "Raft server is not set in LogStore.");
+ LOG_INFO(log, "Raft server is not set in LogStore.");
}
}
@@ -1085,70 +2002,78 @@ void Changelog::writeThread()
LOG_WARNING(log, "Changelog is shut down");
};
- /// NuRaft writes a batch of request by first calling multiple store requests, i.e. AppendLog
- /// finished by a flush request
- /// We assume that after some number of appends, we always get flush request
- while (true)
+ try
{
- if (try_batch_flush)
+ /// NuRaft writes a batch of request by first calling multiple store requests, i.e. AppendLog
+ /// finished by a flush request
+ /// We assume that after some number of appends, we always get flush request
+ while (true)
{
- try_batch_flush = false;
- /// we have Flush request stored in write operation
- /// but we try to get new append operations
- /// if there are none, we apply the currently set Flush
- chassert(std::holds_alternative<Flush>(write_operation));
- if (!write_operations.tryPop(write_operation))
+ if (try_batch_flush)
{
- chassert(batch_append_ok);
- const auto & flush = std::get<Flush>(write_operation);
- flush_logs(flush);
- notify_append_completion();
- if (!write_operations.pop(write_operation))
- break;
- }
- }
- else if (!write_operations.pop(write_operation))
- {
- break;
- }
-
- assert(initialized);
-
- if (auto * append_log = std::get_if<AppendLog>(&write_operation))
- {
- if (!batch_append_ok)
- continue;
-
- std::lock_guard writer_lock(writer_mutex);
- assert(current_writer);
-
- batch_append_ok = current_writer->appendRecord(buildRecord(append_log->index, append_log->log_entry));
- ++pending_appends;
- }
- else
- {
- const auto & flush = std::get<Flush>(write_operation);
-
- if (batch_append_ok)
- {
- /// we can try batching more logs for flush
- if (pending_appends < flush_settings.max_flush_batch_size)
+ try_batch_flush = false;
+ /// we have Flush request stored in write operation
+ /// but we try to get new append operations
+ /// if there are none, we apply the currently set Flush
+ chassert(std::holds_alternative<Flush>(write_operation));
+ if (!write_operations.tryPop(write_operation))
{
- try_batch_flush = true;
- continue;
+ chassert(batch_append_ok);
+ const auto & flush = std::get<Flush>(write_operation);
+ flush_logs(flush);
+ notify_append_completion();
+ if (!write_operations.pop(write_operation))
+ break;
}
- /// we need to flush because we have maximum allowed pending records
- flush_logs(flush);
+ }
+ else if (!write_operations.pop(write_operation))
+ {
+ break;
+ }
+
+ assert(initialized);
+
+ if (auto * append_log = std::get_if<AppendLog>(&write_operation))
+ {
+ if (!batch_append_ok)
+ continue;
+
+ std::lock_guard writer_lock(writer_mutex);
+ assert(current_writer);
+
+ batch_append_ok = current_writer->appendRecord(buildRecord(append_log->index, append_log->log_entry));
+ ++pending_appends;
}
else
{
- std::lock_guard lock{durable_idx_mutex};
- *flush.failed = true;
+ const auto & flush = std::get<Flush>(write_operation);
+
+ if (batch_append_ok)
+ {
+ /// we can try batching more logs for flush
+ if (pending_appends < flush_settings.max_flush_batch_size)
+ {
+ try_batch_flush = true;
+ continue;
+ }
+ /// we need to flush because we have maximum allowed pending records
+ flush_logs(flush);
+ }
+ else
+ {
+ std::lock_guard lock{durable_idx_mutex};
+ *flush.failed = true;
+ }
+ notify_append_completion();
+ batch_append_ok = true;
}
- notify_append_completion();
- batch_append_ok = true;
}
}
+ catch (...)
+ {
+ tryLogCurrentException(log, "Write thread failed, aborting");
+ std::abort();
+ }
}
@@ -1157,10 +2082,7 @@ void Changelog::appendEntry(uint64_t index, const LogEntryPtr & log_entry)
if (!initialized)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Changelog must be initialized before appending records");
- if (logs.empty())
- min_log_id = index;
-
- logs[index] = log_entry;
+ entry_storage.addEntry(index, log_entry);
max_log_id = index;
if (!write_operations.push(AppendLog{index, log_entry}))
@@ -1191,7 +2113,7 @@ void Changelog::writeAt(uint64_t index, const LogEntryPtr & log_entry)
auto log_disk = description->disk;
auto latest_log_disk = getLatestLogDisk();
if (log_disk != latest_log_disk)
- moveFileBetweenDisks(log_disk, description, latest_log_disk, description->path);
+ moveChangelogBetweenDisks(log_disk, description, latest_log_disk, description->path, keeper_context);
current_writer->setFile(std::move(description), WriteMode::Append);
@@ -1207,7 +2129,7 @@ void Changelog::writeAt(uint64_t index, const LogEntryPtr & log_entry)
/// Remove redundant logs from memory
/// Everything >= index must be removed
- std::erase_if(logs, [index](const auto & item) { return item.first >= index; });
+ entry_storage.cleanAfter(index - 1);
/// Now we can actually override entry at index
appendEntry(index, log_entry);
@@ -1274,14 +2196,23 @@ void Changelog::compact(uint64_t up_to_log_index)
else /// Files are ordered, so all subsequent should exist
break;
}
- /// Compaction from the past is possible, so don't make our min_log_id smaller.
- min_log_id = std::max(min_log_id, up_to_log_index + 1);
- std::erase_if(logs, [up_to_log_index](const auto & item) { return item.first <= up_to_log_index; });
+
+ entry_storage.cleanUpTo(up_to_log_index + 1);
if (need_rotate)
current_writer->rotate(up_to_log_index + 1);
- LOG_INFO(log, "Compaction up to {} finished new min index {}, new max index {}", up_to_log_index, min_log_id, max_log_id);
+ LOG_INFO(log, "Compaction up to {} finished new min index {}, new max index {}", up_to_log_index, getStartIndex(), max_log_id);
+}
+
+uint64_t Changelog::getNextEntryIndex() const
+{
+ return max_log_id + 1;
+}
+
+uint64_t Changelog::getStartIndex() const
+{
+ return entry_storage.empty() ? max_log_id + 1 : entry_storage.getFirstIndex();
}
LogEntryPtr Changelog::getLastEntry() const
@@ -1289,46 +2220,26 @@ LogEntryPtr Changelog::getLastEntry() const
/// This entry treaded in special way by NuRaft
static LogEntryPtr fake_entry = nuraft::cs_new<nuraft::log_entry>(0, nuraft::buffer::alloc(sizeof(uint64_t)));
- auto entry = logs.find(max_log_id);
- if (entry == logs.end())
- {
+ auto entry = entry_storage.getEntry(max_log_id);
+ if (entry == nullptr)
return fake_entry;
- }
- return entry->second;
+ return entry;
}
LogEntriesPtr Changelog::getLogEntriesBetween(uint64_t start, uint64_t end)
{
- LogEntriesPtr ret = nuraft::cs_new<std::vector<nuraft::ptr<nuraft::log_entry>>>();
-
- ret->resize(end - start);
- uint64_t result_pos = 0;
- for (uint64_t i = start; i < end; ++i)
- {
- (*ret)[result_pos] = entryAt(i);
- result_pos++;
- }
- return ret;
+ return entry_storage.getLogEntriesBetween(start, end);
}
-LogEntryPtr Changelog::entryAt(uint64_t index)
+LogEntryPtr Changelog::entryAt(uint64_t index) const
{
- nuraft::ptr