Merge remote-tracking branch 'rschu1ze/master' into vector-dot-product

This commit is contained in:
Robert Schulze 2024-02-22 10:16:28 +00:00
commit ae597d86dd
No known key found for this signature in database
GPG Key ID: 26703B55FB13728A
290 changed files with 6025 additions and 1578 deletions

View File

@ -11,7 +11,7 @@ on: # yamllint disable-line rule:truthy
- 'backport/**' - 'backport/**'
jobs: jobs:
RunConfig: RunConfig:
runs-on: [self-hosted, style-checker] runs-on: [self-hosted, style-checker-aarch64]
outputs: outputs:
data: ${{ steps.runconfig.outputs.CI_DATA }} data: ${{ steps.runconfig.outputs.CI_DATA }}
steps: steps:

View File

@ -11,7 +11,7 @@ on: # yamllint disable-line rule:truthy
- 'master' - 'master'
jobs: jobs:
RunConfig: RunConfig:
runs-on: [self-hosted, style-checker] runs-on: [self-hosted, style-checker-aarch64]
outputs: outputs:
data: ${{ steps.runconfig.outputs.CI_DATA }} data: ${{ steps.runconfig.outputs.CI_DATA }}
steps: steps:

View File

@ -14,7 +14,7 @@ jobs:
# The task for having a preserved ENV and event.json for later investigation # The task for having a preserved ENV and event.json for later investigation
uses: ./.github/workflows/debug.yml uses: ./.github/workflows/debug.yml
RunConfig: RunConfig:
runs-on: [self-hosted, style-checker] runs-on: [self-hosted, style-checker-aarch64]
outputs: outputs:
data: ${{ steps.runconfig.outputs.CI_DATA }} data: ${{ steps.runconfig.outputs.CI_DATA }}
steps: steps:

View File

@ -18,7 +18,7 @@ on: # yamllint disable-line rule:truthy
########################################################################################## ##########################################################################################
jobs: jobs:
RunConfig: RunConfig:
runs-on: [self-hosted, style-checker] runs-on: [self-hosted, style-checker-aarch64]
outputs: outputs:
data: ${{ steps.runconfig.outputs.CI_DATA }} data: ${{ steps.runconfig.outputs.CI_DATA }}
steps: steps:

View File

@ -14,7 +14,7 @@ on: # yamllint disable-line rule:truthy
jobs: jobs:
RunConfig: RunConfig:
runs-on: [self-hosted, style-checker] runs-on: [self-hosted, style-checker-aarch64]
outputs: outputs:
data: ${{ steps.runconfig.outputs.CI_DATA }} data: ${{ steps.runconfig.outputs.CI_DATA }}
steps: steps:

2
contrib/NuRaft vendored

@ -1 +1 @@
Subproject commit 1278e32bb0d5dc489f947e002bdf8c71b0ddaa63 Subproject commit 5bb3a0e8257bacd65b099cb1b7239bd6b9a2c477

View File

@ -1,7 +1,7 @@
version: '2.3' version: '2.3'
services: services:
mysql2: mysql2:
image: mysql:5.7 image: mysql:8.0
restart: always restart: always
environment: environment:
MYSQL_ROOT_PASSWORD: clickhouse MYSQL_ROOT_PASSWORD: clickhouse
@ -23,7 +23,7 @@ services:
source: ${MYSQL_CLUSTER_LOGS:-} source: ${MYSQL_CLUSTER_LOGS:-}
target: /mysql/ target: /mysql/
mysql3: mysql3:
image: mysql:5.7 image: mysql:8.0
restart: always restart: always
environment: environment:
MYSQL_ROOT_PASSWORD: clickhouse MYSQL_ROOT_PASSWORD: clickhouse
@ -45,7 +45,7 @@ services:
source: ${MYSQL_CLUSTER_LOGS:-} source: ${MYSQL_CLUSTER_LOGS:-}
target: /mysql/ target: /mysql/
mysql4: mysql4:
image: mysql:5.7 image: mysql:8.0
restart: always restart: always
environment: environment:
MYSQL_ROOT_PASSWORD: clickhouse MYSQL_ROOT_PASSWORD: clickhouse

View File

@ -77,6 +77,12 @@ remove_keeper_config "async_replication" "1"
# create_if_not_exists feature flag doesn't exist on some older versions # create_if_not_exists feature flag doesn't exist on some older versions
remove_keeper_config "create_if_not_exists" "[01]" remove_keeper_config "create_if_not_exists" "[01]"
# latest_logs_cache_size_threshold setting doesn't exist on some older versions
remove_keeper_config "latest_logs_cache_size_threshold" "[[:digit:]]\+"
# commit_logs_cache_size_threshold setting doesn't exist on some older versions
remove_keeper_config "commit_logs_cache_size_threshold" "[[:digit:]]\+"
# it contains some new settings, but we can safely remove it # it contains some new settings, but we can safely remove it
rm /etc/clickhouse-server/config.d/merge_tree.xml rm /etc/clickhouse-server/config.d/merge_tree.xml
rm /etc/clickhouse-server/config.d/enable_wait_for_shutdown_replicated_tables.xml rm /etc/clickhouse-server/config.d/enable_wait_for_shutdown_replicated_tables.xml
@ -109,6 +115,12 @@ remove_keeper_config "async_replication" "1"
# create_if_not_exists feature flag doesn't exist on some older versions # create_if_not_exists feature flag doesn't exist on some older versions
remove_keeper_config "create_if_not_exists" "[01]" remove_keeper_config "create_if_not_exists" "[01]"
# latest_logs_cache_size_threshold setting doesn't exist on some older versions
remove_keeper_config "latest_logs_cache_size_threshold" "[[:digit:]]\+"
# commit_logs_cache_size_threshold setting doesn't exist on some older versions
remove_keeper_config "commit_logs_cache_size_threshold" "[[:digit:]]\+"
# But we still need default disk because some tables loaded only into it # But we still need default disk because some tables loaded only into it
sudo cat /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml \ sudo cat /etc/clickhouse-server/config.d/s3_storage_policy_by_default.xml \
| sed "s|<main><disk>s3</disk></main>|<main><disk>s3</disk></main><default><disk>default</disk></default>|" \ | sed "s|<main><disk>s3</disk></main>|<main><disk>s3</disk></main><default><disk>default</disk></default>|" \

View File

@ -18,7 +18,11 @@ Two configuration files (usually the main configuration file and another configu
- If one of both nodes contains attribute `replace`, it is included in the merged configuration file but only children from the node with attribute `replace` are included. - If one of both nodes contains attribute `replace`, it is included in the merged configuration file but only children from the node with attribute `replace` are included.
- If one of both nodes contains attribute `remove`, the node is not included in the merged configuration file (if it exists already, it is deleted). - If one of both nodes contains attribute `remove`, the node is not included in the merged configuration file (if it exists already, it is deleted).
Example:
```xml ```xml
<!-- config.xml -->
<clickhouse> <clickhouse>
<config_a> <config_a>
<setting_1>1</setting_1> <setting_1>1</setting_1>
@ -35,6 +39,7 @@ Two configuration files (usually the main configuration file and another configu
and and
```xml ```xml
<!-- config.d/other_config.xml -->
<clickhouse> <clickhouse>
<config_a> <config_a>
<setting_4>4</setting_4> <setting_4>4</setting_4>
@ -56,7 +61,7 @@ generates merged configuration file:
<setting_1>1</setting_1> <setting_1>1</setting_1>
<setting_4>4</setting_4> <setting_4>4</setting_4>
</config_a> </config_a>
<config_b replace="replace"> <config_b>
<setting_5>5</setting_5> <setting_5>5</setting_5>
</config_b> </config_b>
</clickhouse> </clickhouse>

View File

@ -4279,41 +4279,6 @@ Result:
└─────┴─────┴───────┘ └─────┴─────┴───────┘
``` ```
## enable_order_by_all {#enable-order-by-all}
Enables or disables sorting by `ALL` columns, i.e. [ORDER BY](../../sql-reference/statements/select/order-by.md)
Possible values:
- 0 — Disable ORDER BY ALL.
- 1 — Enable ORDER BY ALL.
Default value: `1`.
**Example**
Query:
```sql
CREATE TABLE TAB(C1 Int, C2 Int, ALL Int) ENGINE=Memory();
INSERT INTO TAB VALUES (10, 20, 30), (20, 20, 10), (30, 10, 20);
SELECT * FROM TAB ORDER BY ALL; -- returns an error that ALL is ambiguous
SELECT * FROM TAB ORDER BY ALL SETTINGS enable_order_by_all;
```
Result:
```text
┌─C1─┬─C2─┬─ALL─┐
│ 20 │ 20 │ 10 │
│ 30 │ 10 │ 20 │
│ 10 │ 20 │ 30 │
└────┴────┴─────┘
```
## splitby_max_substrings_includes_remaining_string {#splitby_max_substrings_includes_remaining_string} ## splitby_max_substrings_includes_remaining_string {#splitby_max_substrings_includes_remaining_string}
Controls whether function [splitBy*()](../../sql-reference/functions/splitting-merging-functions.md) with argument `max_substrings` > 0 will include the remaining string in the last element of the result array. Controls whether function [splitBy*()](../../sql-reference/functions/splitting-merging-functions.md) with argument `max_substrings` > 0 will include the remaining string in the last element of the result array.

View File

@ -13,8 +13,8 @@ simpleLinearRegression(x, y)
Parameters: Parameters:
- `x` — Column with dependent variable values. - `x` — Column with explanatory variable values.
- `y` — Column with explanatory variable values. - `y` — Column with dependent variable values.
Returned values: Returned values:

View File

@ -509,7 +509,7 @@ Result:
## cosineDistance ## cosineDistance
Calculates the cosine distance between two vectors (the values of the tuples are the coordinates). The less the returned value is, the more similar are the vectors. Calculates the cosine distance between two vectors (the values of the tuples are the coordinates). The smaller the returned value is, the more similar are the vectors.
**Syntax** **Syntax**

View File

@ -9,10 +9,9 @@ The `ORDER BY` clause contains
- a list of expressions, e.g. `ORDER BY visits, search_phrase`, - a list of expressions, e.g. `ORDER BY visits, search_phrase`,
- a list of numbers referring to columns in the `SELECT` clause, e.g. `ORDER BY 2, 1`, or - a list of numbers referring to columns in the `SELECT` clause, e.g. `ORDER BY 2, 1`, or
- `ALL` which means all columns of the `SELECT` clause, e.g. `ORDER BY ALL`. - `*` (without other expressions or numbers) which means all columns of the `SELECT` clause: `ORDER BY *`.
To disable sorting by column numbers, set setting [enable_positional_arguments](../../../operations/settings/settings.md#enable-positional-arguments) = 0. To disable sorting by column numbers, set setting [enable_positional_arguments](../../../operations/settings/settings.md#enable-positional-arguments) = 0.
To disable sorting by `ALL`, set setting [enable_order_by_all](../../../operations/settings/settings.md#enable-order-by-all) = 0.
The `ORDER BY` clause can be attributed by a `DESC` (descending) or `ASC` (ascending) modifier which determines the sorting direction. The `ORDER BY` clause can be attributed by a `DESC` (descending) or `ASC` (ascending) modifier which determines the sorting direction.
Unless an explicit sort order is specified, `ASC` is used by default. Unless an explicit sort order is specified, `ASC` is used by default.

View File

@ -61,14 +61,14 @@ sidebar_label: ORDER BY
我们只建议使用 `COLLATE` 对于少量行的最终排序,因为排序与 `COLLATE` 比正常的按字节排序效率低。 我们只建议使用 `COLLATE` 对于少量行的最终排序,因为排序与 `COLLATE` 比正常的按字节排序效率低。
## ORDER BY ALL ## ORDER BY *
`ORDER BY ALL` 对所有选定的列进行升序排序。 `ORDER BY *` 对所有选定的列进行升序排序。
示例: 示例:
``` sql ``` sql
SELECT a, b, c FROM t ORDER BY ALL SELECT a, b, c FROM t ORDER BY *
``` ```
等同于: 等同于:

View File

@ -2,7 +2,6 @@
#include <cstdlib> #include <cstdlib>
#include <csignal> #include <csignal>
#include <iostream> #include <iostream>
#include <fstream>
#include <iomanip> #include <iomanip>
#include <optional> #include <optional>
#include <random> #include <random>

View File

@ -1,6 +1,7 @@
#include <iostream> #include <iostream>
#include <boost/program_options.hpp> #include <boost/program_options.hpp>
#include <Coordination/CoordinationSettings.h>
#include <Coordination/KeeperSnapshotManager.h> #include <Coordination/KeeperSnapshotManager.h>
#include <Coordination/ZooKeeperDataReader.h> #include <Coordination/ZooKeeperDataReader.h>
#include <Common/TerminalSize.h> #include <Common/TerminalSize.h>
@ -39,7 +40,7 @@ int mainEntryClickHouseKeeperConverter(int argc, char ** argv)
try try
{ {
auto keeper_context = std::make_shared<KeeperContext>(true); auto keeper_context = std::make_shared<KeeperContext>(true, std::make_shared<CoordinationSettings>());
keeper_context->setDigestEnabled(true); keeper_context->setDigestEnabled(true);
keeper_context->setSnapshotDisk(std::make_shared<DiskLocal>("Keeper-snapshots", options["output-dir"].as<std::string>())); keeper_context->setSnapshotDisk(std::make_shared<DiskLocal>("Keeper-snapshots", options["output-dir"].as<std::string>()));

View File

@ -41,7 +41,7 @@ if (BUILD_STANDALONE_KEEPER)
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/KeeperStorage.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/KeeperStorage.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/KeeperConstants.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/KeeperConstants.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/KeeperAsynchronousMetrics.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/KeeperAsynchronousMetrics.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/pathUtils.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/KeeperCommon.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/SessionExpiryQueue.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/SessionExpiryQueue.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/SummingStateMachine.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/SummingStateMachine.cpp
${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/WriteBufferFromNuraftBuffer.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/WriteBufferFromNuraftBuffer.cpp

View File

@ -560,7 +560,7 @@ try
auto main_config_reloader = std::make_unique<ConfigReloader>( auto main_config_reloader = std::make_unique<ConfigReloader>(
config_path, config_path,
extra_paths, extra_paths,
config().getString("path", ""), config().getString("path", KEEPER_DEFAULT_PATH),
std::move(unused_cache), std::move(unused_cache),
unused_event, unused_event,
[&](ConfigurationPtr config, bool /* initial_loading */) [&](ConfigurationPtr config, bool /* initial_loading */)

View File

@ -1292,7 +1292,7 @@ try
auto main_config_reloader = std::make_unique<ConfigReloader>( auto main_config_reloader = std::make_unique<ConfigReloader>(
config_path, config_path,
extra_paths, extra_paths,
config().getString("path", ""), config().getString("path", DBMS_DEFAULT_PATH),
std::move(main_config_zk_node_cache), std::move(main_config_zk_node_cache),
main_config_zk_changed_event, main_config_zk_changed_event,
[&](ConfigurationPtr config, bool initial_loading) [&](ConfigurationPtr config, bool initial_loading)
@ -1391,7 +1391,7 @@ try
global_context->setMaxDatabaseNumToWarn(new_server_settings.max_database_num_to_warn); global_context->setMaxDatabaseNumToWarn(new_server_settings.max_database_num_to_warn);
global_context->setMaxPartNumToWarn(new_server_settings.max_part_num_to_warn); global_context->setMaxPartNumToWarn(new_server_settings.max_part_num_to_warn);
ConcurrencyControl::SlotCount concurrent_threads_soft_limit = ConcurrencyControl::Unlimited; SlotCount concurrent_threads_soft_limit = UnlimitedSlots;
if (new_server_settings.concurrent_threads_soft_limit_num > 0 && new_server_settings.concurrent_threads_soft_limit_num < concurrent_threads_soft_limit) if (new_server_settings.concurrent_threads_soft_limit_num > 0 && new_server_settings.concurrent_threads_soft_limit_num < concurrent_threads_soft_limit)
concurrent_threads_soft_limit = new_server_settings.concurrent_threads_soft_limit_num; concurrent_threads_soft_limit = new_server_settings.concurrent_threads_soft_limit_num;
if (new_server_settings.concurrent_threads_soft_limit_ratio_to_cores > 0) if (new_server_settings.concurrent_threads_soft_limit_ratio_to_cores > 0)

View File

@ -219,7 +219,7 @@ public:
: IAggregateFunctionDataHelper<AggregateFunctionCountData, AggregateFunctionCountNotNullUnary>({argument}, params, createResultType()) : IAggregateFunctionDataHelper<AggregateFunctionCountData, AggregateFunctionCountNotNullUnary>({argument}, params, createResultType())
{ {
if (!argument->isNullable()) if (!argument->isNullable())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Not Nullable data type passed to AggregateFunctionCountNotNullUnary"); throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: not Nullable data type passed to AggregateFunctionCountNotNullUnary");
} }
String getName() const override { return "count"; } String getName() const override { return "count"; }

View File

@ -100,7 +100,7 @@ AggregateFunctionPtr AggregateFunctionFactory::get(
{ {
AggregateFunctionCombinatorPtr combinator = AggregateFunctionCombinatorFactory::instance().tryFindSuffix("Null"); AggregateFunctionCombinatorPtr combinator = AggregateFunctionCombinatorFactory::instance().tryFindSuffix("Null");
if (!combinator) if (!combinator)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot find aggregate function combinator " throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: cannot find aggregate function combinator "
"to apply a function to Nullable arguments."); "to apply a function to Nullable arguments.");
DataTypes nested_types = combinator->transformArguments(types_without_low_cardinality); DataTypes nested_types = combinator->transformArguments(types_without_low_cardinality);
@ -123,7 +123,7 @@ AggregateFunctionPtr AggregateFunctionFactory::get(
auto with_original_arguments = getImpl(name, action, types_without_low_cardinality, parameters, out_properties, false); auto with_original_arguments = getImpl(name, action, types_without_low_cardinality, parameters, out_properties, false);
if (!with_original_arguments) if (!with_original_arguments)
throw Exception(ErrorCodes::LOGICAL_ERROR, "AggregateFunctionFactory returned nullptr"); throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: AggregateFunctionFactory returned nullptr");
return with_original_arguments; return with_original_arguments;
} }

View File

@ -146,9 +146,7 @@ struct AggregateFunctionSumData
size_t count = end - start; size_t count = end - start;
const auto * end_ptr = ptr + count; const auto * end_ptr = ptr + count;
if constexpr ( if constexpr ((is_integer<T> || is_decimal<T>) && !is_over_big_int<T>)
(is_integer<T> && !is_big_int_v<T>)
|| (is_decimal<T> && !std::is_same_v<T, Decimal256> && !std::is_same_v<T, Decimal128>))
{ {
/// For integers we can vectorize the operation if we replace the null check using a multiplication (by 0 for null, 1 for not null) /// For integers we can vectorize the operation if we replace the null check using a multiplication (by 0 for null, 1 for not null)
/// https://quick-bench.com/q/MLTnfTvwC2qZFVeWHfOBR3U7a8I /// https://quick-bench.com/q/MLTnfTvwC2qZFVeWHfOBR3U7a8I
@ -163,8 +161,39 @@ struct AggregateFunctionSumData
Impl::add(sum, local_sum); Impl::add(sum, local_sum);
return; return;
} }
else if constexpr (is_over_big_int<T>)
{
/// Use a mask to discard or keep the value to reduce branch miss.
/// Notice that for (U)Int128 or Decimal128, MaskType is Int8 instead of Int64, otherwise extra branches will be introduced by compiler (for unknown reason) and performance will be worse.
using MaskType = std::conditional_t<sizeof(T) == 16, Int8, Int64>;
alignas(64) const MaskType masks[2] = {0, -1};
T local_sum{};
while (ptr < end_ptr)
{
Value v = *ptr;
if constexpr (!add_if_zero)
{
if constexpr (is_integer<T>)
v &= masks[!!*condition_map];
else
v.value &= masks[!!*condition_map];
}
else
{
if constexpr (is_integer<T>)
v &= masks[!*condition_map];
else
v.value &= masks[!*condition_map];
}
if constexpr (std::is_floating_point_v<T>) Impl::add(local_sum, v);
++ptr;
++condition_map;
}
Impl::add(sum, local_sum);
return;
}
else if constexpr (std::is_floating_point_v<T>)
{ {
/// For floating point we use a similar trick as above, except that now we reinterpret the floating point number as an unsigned /// For floating point we use a similar trick as above, except that now we reinterpret the floating point number as an unsigned
/// integer of the same size and use a mask instead (0 to discard, 0xFF..FF to keep) /// integer of the same size and use a mask instead (0 to discard, 0xFF..FF to keep)

View File

@ -249,7 +249,7 @@ public:
: Base(std::move(nested_function_), arguments, params), number_of_arguments(arguments.size()) : Base(std::move(nested_function_), arguments, params), number_of_arguments(arguments.size())
{ {
if (number_of_arguments == 1) if (number_of_arguments == 1)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Single argument is passed to AggregateFunctionIfNullVariadic"); throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: single argument is passed to AggregateFunctionIfNullVariadic");
if (number_of_arguments > MAX_ARGS) if (number_of_arguments > MAX_ARGS)
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,

View File

@ -429,7 +429,7 @@ public:
, number_of_arguments(arguments.size()) , number_of_arguments(arguments.size())
{ {
if (number_of_arguments == 1) if (number_of_arguments == 1)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Single argument is passed to AggregateFunctionNullVariadic"); throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: single argument is passed to AggregateFunctionNullVariadic");
if (number_of_arguments > MAX_ARGS) if (number_of_arguments > MAX_ARGS)
throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,

View File

@ -1,6 +1,7 @@
#include <Analyzer/Passes/ArrayExistsToHasPass.h> #include <Analyzer/Passes/ArrayExistsToHasPass.h>
#include <Functions/FunctionFactory.h> #include <Functions/FunctionFactory.h>
#include <Functions/array/has.h>
#include <Interpreters/Context.h> #include <Interpreters/Context.h>
@ -83,7 +84,8 @@ public:
return; return;
} }
auto has_function = FunctionFactory::instance().get("has", getContext()); auto has_function = createInternalFunctionHasOverloadResolver();
array_exists_function_arguments_nodes[0] = std::move(array_exists_function_arguments_nodes[1]); array_exists_function_arguments_nodes[0] = std::move(array_exists_function_arguments_nodes[1]);
array_exists_function_arguments_nodes[1] = std::move(has_constant_element_argument); array_exists_function_arguments_nodes[1] = std::move(has_constant_element_argument);
array_exists_function_node->resolveAsFunction(has_function->build(array_exists_function_node->getArgumentColumns())); array_exists_function_node->resolveAsFunction(has_function->build(array_exists_function_node->getArgumentColumns()));

View File

@ -10,6 +10,7 @@
#include <IO/Operators.h> #include <IO/Operators.h>
#include <Functions/FunctionFactory.h> #include <Functions/FunctionFactory.h>
#include <Functions/logical.h>
#include <Common/checkStackSize.h> #include <Common/checkStackSize.h>
@ -79,7 +80,7 @@ public:
if (name == "and" || name == "or") if (name == "and" || name == "or")
{ {
auto function_resolver = FunctionFactory::instance().get(name, current_context); auto function_resolver = name == "and" ? createInternalFunctionAndOverloadResolver() : createInternalFunctionOrOverloadResolver();
const auto & arguments = function_node->getArguments().getNodes(); const auto & arguments = function_node->getArguments().getNodes();
if (arguments.size() > 2) if (arguments.size() > 2)
@ -110,10 +111,10 @@ private:
class PushNotVisitor class PushNotVisitor
{ {
public: public:
explicit PushNotVisitor(const ContextPtr & context) explicit PushNotVisitor()
: not_function_resolver(FunctionFactory::instance().get("not", context)) : not_function_resolver(createInternalFunctionNotOverloadResolver())
, or_function_resolver(FunctionFactory::instance().get("or", context)) , or_function_resolver(createInternalFunctionOrOverloadResolver())
, and_function_resolver(FunctionFactory::instance().get("and", context)) , and_function_resolver(createInternalFunctionAndOverloadResolver())
{} {}
void visit(QueryTreeNodePtr & node, bool add_negation) void visit(QueryTreeNodePtr & node, bool add_negation)
@ -162,10 +163,10 @@ private:
class PushOrVisitor class PushOrVisitor
{ {
public: public:
PushOrVisitor(ContextPtr context, size_t max_atoms_) explicit PushOrVisitor(size_t max_atoms_)
: max_atoms(max_atoms_) : max_atoms(max_atoms_)
, and_resolver(FunctionFactory::instance().get("and", context)) , and_resolver(createInternalFunctionAndOverloadResolver())
, or_resolver(FunctionFactory::instance().get("or", context)) , or_resolver(createInternalFunctionOrOverloadResolver())
{} {}
bool visit(QueryTreeNodePtr & node, size_t num_atoms) bool visit(QueryTreeNodePtr & node, size_t num_atoms)
@ -513,11 +514,11 @@ std::optional<CNF> CNF::tryBuildCNF(const QueryTreeNodePtr & node, ContextPtr co
} }
{ {
PushNotVisitor visitor(context); PushNotVisitor visitor;
visitor.visit(node_cloned, false); visitor.visit(node_cloned, false);
} }
if (PushOrVisitor visitor(context, max_atoms); if (PushOrVisitor visitor(max_atoms);
!visitor.visit(node_cloned, atom_count)) !visitor.visit(node_cloned, atom_count))
return std::nullopt; return std::nullopt;
@ -542,7 +543,7 @@ CNF CNF::toCNF(const QueryTreeNodePtr & node, ContextPtr context, size_t max_gro
return *cnf; return *cnf;
} }
QueryTreeNodePtr CNF::toQueryTree(ContextPtr context) const QueryTreeNodePtr CNF::toQueryTree() const
{ {
if (statements.empty()) if (statements.empty())
return nullptr; return nullptr;
@ -550,9 +551,9 @@ QueryTreeNodePtr CNF::toQueryTree(ContextPtr context) const
QueryTreeNodes and_arguments; QueryTreeNodes and_arguments;
and_arguments.reserve(statements.size()); and_arguments.reserve(statements.size());
auto not_resolver = FunctionFactory::instance().get("not", context); auto not_resolver = createInternalFunctionNotOverloadResolver();
auto or_resolver = FunctionFactory::instance().get("or", context); auto or_resolver = createInternalFunctionOrOverloadResolver();
auto and_resolver = FunctionFactory::instance().get("and", context); auto and_resolver = createInternalFunctionAndOverloadResolver();
const auto function_node_from_atom = [&](const auto & atom) -> QueryTreeNodePtr const auto function_node_from_atom = [&](const auto & atom) -> QueryTreeNodePtr
{ {

View File

@ -54,7 +54,7 @@ public:
static std::optional<CNF> tryBuildCNF(const QueryTreeNodePtr & node, ContextPtr context, size_t max_growth_multiplier = DEFAULT_MAX_GROWTH_MULTIPLIER); static std::optional<CNF> tryBuildCNF(const QueryTreeNodePtr & node, ContextPtr context, size_t max_growth_multiplier = DEFAULT_MAX_GROWTH_MULTIPLIER);
static CNF toCNF(const QueryTreeNodePtr & node, ContextPtr context, size_t max_growth_multiplier = DEFAULT_MAX_GROWTH_MULTIPLIER); static CNF toCNF(const QueryTreeNodePtr & node, ContextPtr context, size_t max_growth_multiplier = DEFAULT_MAX_GROWTH_MULTIPLIER);
QueryTreeNodePtr toQueryTree(ContextPtr context) const; QueryTreeNodePtr toQueryTree() const;
const auto & getStatements() const const auto & getStatements() const
{ {

View File

@ -11,6 +11,8 @@
#include <DataTypes/DataTypesNumber.h> #include <DataTypes/DataTypesNumber.h>
#include <Functions/FunctionFactory.h> #include <Functions/FunctionFactory.h>
#include <Functions/multiMatchAny.h>
#include <Functions/logical.h>
#include <Interpreters/Context.h> #include <Interpreters/Context.h>
@ -134,8 +136,10 @@ private:
void ConvertOrLikeChainPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) void ConvertOrLikeChainPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context)
{ {
auto or_function_resolver = FunctionFactory::instance().get("or", context); const auto & settings = context->getSettingsRef();
auto match_function_resolver = FunctionFactory::instance().get("multiMatchAny", context); auto match_function_resolver = createInternalMultiMatchAnyOverloadResolver(settings.allow_hyperscan, settings.max_hyperscan_regexp_length, settings.max_hyperscan_regexp_total_length, settings.reject_expensive_hyperscan_regexps);
auto or_function_resolver = createInternalFunctionOrOverloadResolver();
ConvertOrLikeChainVisitor visitor(std::move(or_function_resolver), std::move(match_function_resolver), std::move(context)); ConvertOrLikeChainVisitor visitor(std::move(or_function_resolver), std::move(match_function_resolver), std::move(context));
visitor.visit(query_tree_node); visitor.visit(query_tree_node);
} }

View File

@ -339,7 +339,7 @@ void addIndexConstraint(Analyzer::CNF & cnf, const QueryTreeNodes & table_expres
{ {
Analyzer::CNF::OrGroup new_group; Analyzer::CNF::OrGroup new_group;
auto index_hint_node = std::make_shared<FunctionNode>("indexHint"); auto index_hint_node = std::make_shared<FunctionNode>("indexHint");
index_hint_node->getArguments().getNodes().push_back(Analyzer::CNF{std::move(and_group)}.toQueryTree(context)); index_hint_node->getArguments().getNodes().push_back(Analyzer::CNF{std::move(and_group)}.toQueryTree());
index_hint_node->resolveAsFunction(FunctionFactory::instance().get("indexHint", context)); index_hint_node->resolveAsFunction(FunctionFactory::instance().get("indexHint", context));
new_group.insert({false, QueryTreeNodePtrWithHash{std::move(index_hint_node)}}); new_group.insert({false, QueryTreeNodePtrWithHash{std::move(index_hint_node)}});
@ -676,7 +676,7 @@ void optimizeNode(QueryTreeNodePtr & node, const QueryTreeNodes & table_expressi
if (settings.optimize_using_constraints) if (settings.optimize_using_constraints)
optimizeWithConstraints(*cnf, table_expressions, context); optimizeWithConstraints(*cnf, table_expressions, context);
auto new_node = cnf->toQueryTree(context); auto new_node = cnf->toQueryTree();
node = std::move(new_node); node = std::move(new_node);
} }

View File

@ -12,6 +12,7 @@
#include <Functions/FunctionFactory.h> #include <Functions/FunctionFactory.h>
#include <Functions/IFunction.h> #include <Functions/IFunction.h>
#include <Functions/logical.h>
#include <Common/logger_useful.h> #include <Common/logger_useful.h>
@ -256,7 +257,7 @@ private:
for (const auto & node : nodes) for (const auto & node : nodes)
function_node->getArguments().getNodes().push_back(node); function_node->getArguments().getNodes().push_back(node);
const auto & function = FunctionFactory::instance().get("and", getContext()); const auto & function = createInternalFunctionAndOverloadResolver();
function_node->resolveAsFunction(function->build(function_node->getArgumentColumns())); function_node->resolveAsFunction(function->build(function_node->getArgumentColumns()));
return function_node; return function_node;
} }

View File

@ -5,6 +5,7 @@
#include <Analyzer/InDepthQueryTreeVisitor.h> #include <Analyzer/InDepthQueryTreeVisitor.h>
#include <Analyzer/FunctionNode.h> #include <Analyzer/FunctionNode.h>
#include <Functions/FunctionFactory.h> #include <Functions/FunctionFactory.h>
#include <Functions/multiIf.h>
namespace DB namespace DB
{ {
@ -75,7 +76,8 @@ private:
void IfChainToMultiIfPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) void IfChainToMultiIfPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context)
{ {
auto multi_if_function_ptr = FunctionFactory::instance().get("multiIf", context); const auto & settings = context->getSettingsRef();
auto multi_if_function_ptr = createInternalMultiIfOverloadResolver(settings.allow_execute_multiif_columnar, settings.allow_experimental_variant_type, settings.use_variant_as_common_type);
IfChainToMultiIfPassVisitor visitor(std::move(multi_if_function_ptr), std::move(context)); IfChainToMultiIfPassVisitor visitor(std::move(multi_if_function_ptr), std::move(context));
visitor.visit(query_tree_node); visitor.visit(query_tree_node);
} }

View File

@ -3,6 +3,7 @@
#include <Analyzer/InDepthQueryTreeVisitor.h> #include <Analyzer/InDepthQueryTreeVisitor.h>
#include <Analyzer/FunctionNode.h> #include <Analyzer/FunctionNode.h>
#include <Functions/FunctionFactory.h> #include <Functions/FunctionFactory.h>
#include <Functions/if.h>
namespace DB namespace DB
{ {
@ -54,7 +55,8 @@ private:
void MultiIfToIfPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) void MultiIfToIfPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context)
{ {
auto if_function_ptr = FunctionFactory::instance().get("if", context); const auto & settings = context->getSettingsRef();
auto if_function_ptr = createInternalFunctionIfOverloadResolver(settings.allow_experimental_variant_type, settings.use_variant_as_common_type);
MultiIfToIfVisitor visitor(std::move(if_function_ptr), std::move(context)); MultiIfToIfVisitor visitor(std::move(if_function_ptr), std::move(context));
visitor.visit(query_tree_node); visitor.visit(query_tree_node);
} }

View File

@ -120,7 +120,6 @@ namespace ErrorCodes
extern const int NUMBER_OF_COLUMNS_DOESNT_MATCH; extern const int NUMBER_OF_COLUMNS_DOESNT_MATCH;
extern const int FUNCTION_CANNOT_HAVE_PARAMETERS; extern const int FUNCTION_CANNOT_HAVE_PARAMETERS;
extern const int SYNTAX_ERROR; extern const int SYNTAX_ERROR;
extern const int UNEXPECTED_EXPRESSION;
extern const int INVALID_IDENTIFIER; extern const int INVALID_IDENTIFIER;
} }
@ -1215,7 +1214,7 @@ private:
static void expandGroupByAll(QueryNode & query_tree_node_typed); static void expandGroupByAll(QueryNode & query_tree_node_typed);
void expandOrderByAll(QueryNode & query_tree_node_typed, const Settings & settings); void expandOrderByAll(QueryNode & query_tree_node_typed);
static std::string static std::string
rewriteAggregateFunctionNameIfNeeded(const std::string & aggregate_function_name, NullsAction action, const ContextPtr & context); rewriteAggregateFunctionNameIfNeeded(const std::string & aggregate_function_name, NullsAction action, const ContextPtr & context);
@ -2367,9 +2366,9 @@ void QueryAnalyzer::expandGroupByAll(QueryNode & query_tree_node_typed)
query_tree_node_typed.setIsGroupByAll(false); query_tree_node_typed.setIsGroupByAll(false);
} }
void QueryAnalyzer::expandOrderByAll(QueryNode & query_tree_node_typed, const Settings & settings) void QueryAnalyzer::expandOrderByAll(QueryNode & query_tree_node_typed)
{ {
if (!settings.enable_order_by_all || !query_tree_node_typed.isOrderByAll()) if (!query_tree_node_typed.isOrderByAll())
return; return;
auto * all_node = query_tree_node_typed.getOrderBy().getNodes()[0]->as<SortNode>(); auto * all_node = query_tree_node_typed.getOrderBy().getNodes()[0]->as<SortNode>();
@ -2390,9 +2389,6 @@ void QueryAnalyzer::expandOrderByAll(QueryNode & query_tree_node_typed, const Se
throw Exception(ErrorCodes::LOGICAL_ERROR, throw Exception(ErrorCodes::LOGICAL_ERROR,
"Expression nodes list expected 1 projection names. Actual {}", "Expression nodes list expected 1 projection names. Actual {}",
projection_names.size()); projection_names.size());
if (Poco::toUpper(projection_names[0]) == "ALL")
throw Exception(ErrorCodes::UNEXPECTED_EXPRESSION,
"Cannot use ORDER BY ALL to sort a column with name 'all', please disable setting `enable_order_by_all` and try again");
} }
auto sort_node = std::make_shared<SortNode>(node, all_node->getSortDirection(), all_node->getNullsSortDirection()); auto sort_node = std::make_shared<SortNode>(node, all_node->getSortDirection(), all_node->getNullsSortDirection());
@ -7559,7 +7555,7 @@ void QueryAnalyzer::resolveQuery(const QueryTreeNodePtr & query_node, Identifier
if (settings.enable_positional_arguments) if (settings.enable_positional_arguments)
replaceNodesWithPositionalArguments(query_node_typed.getOrderByNode(), query_node_typed.getProjection().getNodes(), scope); replaceNodesWithPositionalArguments(query_node_typed.getOrderByNode(), query_node_typed.getProjection().getNodes(), scope);
expandOrderByAll(query_node_typed, settings); expandOrderByAll(query_node_typed);
resolveSortNodeList(query_node_typed.getOrderByNode(), scope); resolveSortNodeList(query_node_typed.getOrderByNode(), scope);
} }

View File

@ -219,13 +219,13 @@ public:
is_group_by_all = is_group_by_all_value; is_group_by_all = is_group_by_all_value;
} }
/// Returns true, if query node has ORDER BY ALL modifier, false otherwise /// Returns true, if query node has ORDER BY * modifier, false otherwise
bool isOrderByAll() const bool isOrderByAll() const
{ {
return is_order_by_all; return is_order_by_all;
} }
/// Set query node ORDER BY ALL modifier value /// Set query node ORDER BY * modifier value
void setIsOrderByAll(bool is_order_by_all_value) void setIsOrderByAll(bool is_order_by_all_value)
{ {
is_order_by_all = is_order_by_all_value; is_order_by_all = is_order_by_all_value;

View File

@ -127,7 +127,7 @@ BackupReaderS3::BackupReaderS3(
: BackupReaderDefault(read_settings_, write_settings_, getLogger("BackupReaderS3")) : BackupReaderDefault(read_settings_, write_settings_, getLogger("BackupReaderS3"))
, s3_uri(s3_uri_) , s3_uri(s3_uri_)
, data_source_description{DataSourceType::ObjectStorage, ObjectStorageType::S3, MetadataStorageType::None, s3_uri.endpoint, false, false} , data_source_description{DataSourceType::ObjectStorage, ObjectStorageType::S3, MetadataStorageType::None, s3_uri.endpoint, false, false}
, s3_settings(context_->getStorageS3Settings().getSettings(s3_uri.uri.toString())) , s3_settings(context_->getStorageS3Settings().getSettings(s3_uri.uri.toString(), context_->getUserName()))
{ {
auto & request_settings = s3_settings.request_settings; auto & request_settings = s3_settings.request_settings;
request_settings.updateFromSettings(context_->getSettingsRef()); request_settings.updateFromSettings(context_->getSettingsRef());
@ -217,7 +217,7 @@ BackupWriterS3::BackupWriterS3(
: BackupWriterDefault(read_settings_, write_settings_, getLogger("BackupWriterS3")) : BackupWriterDefault(read_settings_, write_settings_, getLogger("BackupWriterS3"))
, s3_uri(s3_uri_) , s3_uri(s3_uri_)
, data_source_description{DataSourceType::ObjectStorage, ObjectStorageType::S3, MetadataStorageType::None, s3_uri.endpoint, false, false} , data_source_description{DataSourceType::ObjectStorage, ObjectStorageType::S3, MetadataStorageType::None, s3_uri.endpoint, false, false}
, s3_settings(context_->getStorageS3Settings().getSettings(s3_uri.uri.toString())) , s3_settings(context_->getStorageS3Settings().getSettings(s3_uri.uri.toString(), context_->getUserName()))
{ {
auto & request_settings = s3_settings.request_settings; auto & request_settings = s3_settings.request_settings;
request_settings.updateFromSettings(context_->getSettingsRef()); request_settings.updateFromSettings(context_->getSettingsRef());

View File

@ -506,6 +506,10 @@ if (TARGET ch_contrib::s2)
dbms_target_link_libraries (PUBLIC ch_contrib::s2) dbms_target_link_libraries (PUBLIC ch_contrib::s2)
endif() endif()
if (TARGET ch_contrib::vectorscan)
dbms_target_link_libraries (PRIVATE ch_contrib::vectorscan)
endif()
if (TARGET ch_contrib::brotli) if (TARGET ch_contrib::brotli)
target_link_libraries (clickhouse_common_io PRIVATE ch_contrib::brotli) target_link_libraries (clickhouse_common_io PRIVATE ch_contrib::brotli)
endif() endif()

View File

@ -3,6 +3,7 @@
#include <Common/AsyncTaskExecutor.h> #include <Common/AsyncTaskExecutor.h>
#include <Common/Epoll.h> #include <Common/Epoll.h>
#include <Common/Fiber.h> #include <Common/Fiber.h>
#include <Common/FiberStack.h>
#include <Common/TimerDescriptor.h> #include <Common/TimerDescriptor.h>
#include <Common/PoolWithFailoverBase.h> #include <Common/PoolWithFailoverBase.h>
#include <Client/ConnectionPool.h> #include <Client/ConnectionPool.h>

View File

@ -28,7 +28,10 @@ public:
using Entry = PoolBase<Connection>::Entry; using Entry = PoolBase<Connection>::Entry;
IConnectionPool() = default; IConnectionPool() = default;
IConnectionPool(String host_, UInt16 port_) : host(host_), port(port_), address(host + ":" + toString(port_)) {} IConnectionPool(String host_, UInt16 port_, Priority config_priority_)
: host(host_), port(port_), address(host + ":" + toString(port_)), config_priority(config_priority_)
{
}
virtual ~IConnectionPool() = default; virtual ~IConnectionPool() = default;
@ -42,12 +45,13 @@ public:
const std::string & getHost() const { return host; } const std::string & getHost() const { return host; }
UInt16 getPort() const { return port; } UInt16 getPort() const { return port; }
const String & getAddress() const { return address; } const String & getAddress() const { return address; }
virtual Priority getPriority() const { return Priority{1}; } Priority getConfigPriority() const { return config_priority; }
protected: protected:
const String host; const String host;
const UInt16 port = 0; const UInt16 port = 0;
const String address; const String address;
const Priority config_priority;
}; };
using ConnectionPoolPtr = std::shared_ptr<IConnectionPool>; using ConnectionPoolPtr = std::shared_ptr<IConnectionPool>;
@ -61,32 +65,31 @@ public:
using Entry = IConnectionPool::Entry; using Entry = IConnectionPool::Entry;
using Base = PoolBase<Connection>; using Base = PoolBase<Connection>;
ConnectionPool(unsigned max_connections_, ConnectionPool(
const String & host_, unsigned max_connections_,
UInt16 port_, const String & host_,
const String & default_database_, UInt16 port_,
const String & user_, const String & default_database_,
const String & password_, const String & user_,
const String & quota_key_, const String & password_,
const String & cluster_, const String & quota_key_,
const String & cluster_secret_, const String & cluster_,
const String & client_name_, const String & cluster_secret_,
Protocol::Compression compression_, const String & client_name_,
Protocol::Secure secure_, Protocol::Compression compression_,
Priority priority_ = Priority{1}) Protocol::Secure secure_,
: IConnectionPool(host_, port_), Priority config_priority_ = Priority{1})
Base(max_connections_, : IConnectionPool(host_, port_, config_priority_)
getLogger("ConnectionPool (" + host_ + ":" + toString(port_) + ")")), , Base(max_connections_, getLogger("ConnectionPool (" + host_ + ":" + toString(port_) + ")"))
default_database(default_database_), , default_database(default_database_)
user(user_), , user(user_)
password(password_), , password(password_)
quota_key(quota_key_), , quota_key(quota_key_)
cluster(cluster_), , cluster(cluster_)
cluster_secret(cluster_secret_), , cluster_secret(cluster_secret_)
client_name(client_name_), , client_name(client_name_)
compression(compression_), , compression(compression_)
secure(secure_), , secure(secure_)
priority(priority_)
{ {
} }
@ -114,11 +117,6 @@ public:
return host + ":" + toString(port); return host + ":" + toString(port);
} }
Priority getPriority() const override
{
return priority;
}
protected: protected:
/** Creates a new object to put in the pool. */ /** Creates a new object to put in the pool. */
ConnectionPtr allocObject() override ConnectionPtr allocObject() override
@ -143,7 +141,6 @@ private:
String client_name; String client_name;
Protocol::Compression compression; /// Whether to compress data when interacting with the server. Protocol::Compression compression; /// Whether to compress data when interacting with the server.
Protocol::Secure secure; /// Whether to encrypt data when interacting with the server. Protocol::Secure secure; /// Whether to encrypt data when interacting with the server.
Priority priority; /// priority from <remote_servers>
}; };
/** /**

View File

@ -79,14 +79,6 @@ IConnectionPool::Entry ConnectionPoolWithFailover::get(const ConnectionTimeouts
return Base::get(max_ignored_errors, fallback_to_stale_replicas, try_get_entry, get_priority); return Base::get(max_ignored_errors, fallback_to_stale_replicas, try_get_entry, get_priority);
} }
Priority ConnectionPoolWithFailover::getPriority() const
{
return (*std::max_element(nested_pools.begin(), nested_pools.end(), [](const auto & a, const auto & b)
{
return a->getPriority() < b->getPriority();
}))->getPriority();
}
ConnectionPoolWithFailover::Status ConnectionPoolWithFailover::getStatus() const ConnectionPoolWithFailover::Status ConnectionPoolWithFailover::getStatus() const
{ {
const auto [states, pools, error_decrease_time] = getPoolExtendedStates(); const auto [states, pools, error_decrease_time] = getPoolExtendedStates();
@ -253,13 +245,13 @@ ConnectionPoolWithFailover::tryGetEntry(
} }
std::vector<ConnectionPoolWithFailover::Base::ShuffledPool> std::vector<ConnectionPoolWithFailover::Base::ShuffledPool>
ConnectionPoolWithFailover::getShuffledPools(const Settings & settings, GetPriorityForLoadBalancing::Func priority_func) ConnectionPoolWithFailover::getShuffledPools(const Settings & settings, GetPriorityForLoadBalancing::Func priority_func, bool use_slowdown_count)
{ {
if (!priority_func) if (!priority_func)
priority_func = makeGetPriorityFunc(settings); priority_func = makeGetPriorityFunc(settings);
UInt64 max_ignored_errors = settings.distributed_replica_max_ignored_errors.value; UInt64 max_ignored_errors = settings.distributed_replica_max_ignored_errors.value;
return Base::getShuffledPools(max_ignored_errors, priority_func); return Base::getShuffledPools(max_ignored_errors, priority_func, use_slowdown_count);
} }
} }

View File

@ -49,8 +49,6 @@ public:
const Settings & settings, const Settings & settings,
bool force_connected) override; /// From IConnectionPool bool force_connected) override; /// From IConnectionPool
Priority getPriority() const override; /// From IConnectionPool
/** Allocates up to the specified number of connections to work. /** Allocates up to the specified number of connections to work.
* Connections provide access to different replicas of one shard. * Connections provide access to different replicas of one shard.
*/ */
@ -83,15 +81,15 @@ public:
struct NestedPoolStatus struct NestedPoolStatus
{ {
const Base::NestedPoolPtr pool; const Base::NestedPoolPtr pool;
size_t error_count; size_t error_count = 0;
size_t slowdown_count; size_t slowdown_count = 0;
std::chrono::seconds estimated_recovery_time; std::chrono::seconds estimated_recovery_time;
}; };
using Status = std::vector<NestedPoolStatus>; using Status = std::vector<NestedPoolStatus>;
Status getStatus() const; Status getStatus() const;
std::vector<Base::ShuffledPool> getShuffledPools(const Settings & settings, GetPriorityFunc priority_func = {}); std::vector<Base::ShuffledPool> getShuffledPools(const Settings & settings, GetPriorityFunc priority_func = {}, bool use_slowdown_count = false);
size_t getMaxErrorCup() const { return Base::max_error_cap; } size_t getMaxErrorCup() const { return Base::max_error_cap; }

View File

@ -40,7 +40,8 @@ HedgedConnectionsFactory::HedgedConnectionsFactory(
, max_parallel_replicas(max_parallel_replicas_) , max_parallel_replicas(max_parallel_replicas_)
, skip_unavailable_shards(skip_unavailable_shards_) , skip_unavailable_shards(skip_unavailable_shards_)
{ {
shuffled_pools = pool->getShuffledPools(settings_, priority_func); shuffled_pools = pool->getShuffledPools(settings_, priority_func, /* use_slowdown_count */ true);
for (const auto & shuffled_pool : shuffled_pools) for (const auto & shuffled_pool : shuffled_pools)
replicas.emplace_back( replicas.emplace_back(
std::make_unique<ConnectionEstablisherAsync>(shuffled_pool.pool, &timeouts, settings_, log, table_to_check.get())); std::make_unique<ConnectionEstablisherAsync>(shuffled_pool.pool, &timeouts, settings_, log, table_to_check.get()));

View File

@ -320,7 +320,7 @@ Packet MultiplexedConnections::receivePacketUnlocked(AsyncCallback async_callbac
ReplicaState & state = getReplicaForReading(); ReplicaState & state = getReplicaForReading();
current_connection = state.connection; current_connection = state.connection;
if (current_connection == nullptr) if (current_connection == nullptr)
throw Exception(ErrorCodes::NO_AVAILABLE_REPLICA, "No available replica"); throw Exception(ErrorCodes::NO_AVAILABLE_REPLICA, "Logical error: no available replica");
Packet packet; Packet packet;
try try

View File

@ -5,6 +5,7 @@
#include <variant> #include <variant>
#include <Client/IConnections.h> #include <Client/IConnections.h>
#include <Common/FiberStack.h>
#include <Common/Fiber.h> #include <Common/Fiber.h>
#include <Common/Epoll.h> #include <Common/Epoll.h>
#include <Common/TimerDescriptor.h> #include <Common/TimerDescriptor.h>

View File

@ -810,7 +810,7 @@ ColumnPtr ColumnArray::filterTuple(const Filter & filt, ssize_t result_size_hint
size_t tuple_size = tuple.tupleSize(); size_t tuple_size = tuple.tupleSize();
if (tuple_size == 0) if (tuple_size == 0)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Empty tuple"); throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: empty tuple");
Columns temporary_arrays(tuple_size); Columns temporary_arrays(tuple_size);
for (size_t i = 0; i < tuple_size; ++i) for (size_t i = 0; i < tuple_size; ++i)
@ -1263,7 +1263,7 @@ ColumnPtr ColumnArray::replicateTuple(const Offsets & replicate_offsets) const
size_t tuple_size = tuple.tupleSize(); size_t tuple_size = tuple.tupleSize();
if (tuple_size == 0) if (tuple_size == 0)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Empty tuple"); throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: empty tuple");
Columns temporary_arrays(tuple_size); Columns temporary_arrays(tuple_size);
for (size_t i = 0; i < tuple_size; ++i) for (size_t i = 0; i < tuple_size; ++i)

View File

@ -1,5 +1,7 @@
#include <Common/Arena.h> #include <Common/Arena.h>
#include <Common/SipHash.h> #include <Common/SipHash.h>
#include <Common/NaNUtils.h>
#include <Common/typeid_cast.h>
#include <Common/assert_cast.h> #include <Common/assert_cast.h>
#include <Common/WeakHash.h> #include <Common/WeakHash.h>
#include <Columns/ColumnDecimal.h> #include <Columns/ColumnDecimal.h>
@ -26,6 +28,7 @@ namespace ErrorCodes
{ {
extern const int LOGICAL_ERROR; extern const int LOGICAL_ERROR;
extern const int ILLEGAL_COLUMN; extern const int ILLEGAL_COLUMN;
extern const int SIZES_OF_NESTED_COLUMNS_ARE_INCONSISTENT;
extern const int NOT_IMPLEMENTED; extern const int NOT_IMPLEMENTED;
} }
@ -826,7 +829,8 @@ void ColumnNullable::applyNullMap(const ColumnNullable & other)
void ColumnNullable::checkConsistency() const void ColumnNullable::checkConsistency() const
{ {
if (null_map->size() != getNestedColumn().size()) if (null_map->size() != getNestedColumn().size())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Sizes of nested column and null map of Nullable column are not equal"); throw Exception(ErrorCodes::SIZES_OF_NESTED_COLUMNS_ARE_INCONSISTENT,
"Logical error: Sizes of nested column and null map of Nullable column are not equal");
} }
ColumnPtr ColumnNullable::createWithOffsets(const IColumn::Offsets & offsets, const ColumnConst & column_with_default_value, size_t total_rows, size_t shift) const ColumnPtr ColumnNullable::createWithOffsets(const IColumn::Offsets & offsets, const ColumnConst & column_with_default_value, size_t total_rows, size_t shift) const

View File

@ -21,7 +21,7 @@ static bool sameConstants(const IColumn & a, const IColumn & b)
ColumnWithTypeAndName getLeastSuperColumn(const std::vector<const ColumnWithTypeAndName *> & columns) ColumnWithTypeAndName getLeastSuperColumn(const std::vector<const ColumnWithTypeAndName *> & columns)
{ {
if (columns.empty()) if (columns.empty())
throw Exception(ErrorCodes::LOGICAL_ERROR, "No src columns for supercolumn"); throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: no src columns for supercolumn");
ColumnWithTypeAndName result = *columns[0]; ColumnWithTypeAndName result = *columns[0];

View File

@ -12,10 +12,10 @@ namespace ErrorCodes
ConcurrencyControl::Slot::~Slot() ConcurrencyControl::Slot::~Slot()
{ {
allocation->release(); static_cast<ConcurrencyControl::Allocation&>(*allocation).release();
} }
ConcurrencyControl::Slot::Slot(AllocationPtr && allocation_) ConcurrencyControl::Slot::Slot(SlotAllocationPtr && allocation_)
: allocation(std::move(allocation_)) : allocation(std::move(allocation_))
{ {
} }
@ -27,7 +27,7 @@ ConcurrencyControl::Allocation::~Allocation()
parent.free(this); parent.free(this);
} }
[[nodiscard]] ConcurrencyControl::SlotPtr ConcurrencyControl::Allocation::tryAcquire() [[nodiscard]] AcquiredSlotPtr ConcurrencyControl::Allocation::tryAcquire()
{ {
SlotCount value = granted.load(); SlotCount value = granted.load();
while (value) while (value)
@ -35,15 +35,21 @@ ConcurrencyControl::Allocation::~Allocation()
if (granted.compare_exchange_strong(value, value - 1)) if (granted.compare_exchange_strong(value, value - 1))
{ {
std::unique_lock lock{mutex}; std::unique_lock lock{mutex};
return SlotPtr(new Slot(shared_from_this())); // can't use std::make_shared due to private ctor return AcquiredSlotPtr(new Slot(shared_from_this())); // can't use std::make_shared due to private ctor
} }
} }
return {}; // avoid unnecessary locking return {}; // avoid unnecessary locking
} }
ConcurrencyControl::SlotCount ConcurrencyControl::Allocation::grantedCount() const SlotCount ConcurrencyControl::Allocation::grantedCount() const
{ {
return granted; return granted.load();
}
SlotCount ConcurrencyControl::Allocation::allocatedCount() const
{
std::unique_lock lock{mutex};
return allocated;
} }
ConcurrencyControl::Allocation::Allocation(ConcurrencyControl & parent_, SlotCount limit_, SlotCount granted_, Waiters::iterator waiter_) ConcurrencyControl::Allocation::Allocation(ConcurrencyControl & parent_, SlotCount limit_, SlotCount granted_, Waiters::iterator waiter_)
@ -87,7 +93,7 @@ ConcurrencyControl::~ConcurrencyControl()
abort(); abort();
} }
[[nodiscard]] ConcurrencyControl::AllocationPtr ConcurrencyControl::allocate(SlotCount min, SlotCount max) [[nodiscard]] SlotAllocationPtr ConcurrencyControl::allocate(SlotCount min, SlotCount max)
{ {
if (min > max) if (min > max)
throw Exception(ErrorCodes::LOGICAL_ERROR, "ConcurrencyControl: invalid allocation requirements"); throw Exception(ErrorCodes::LOGICAL_ERROR, "ConcurrencyControl: invalid allocation requirements");
@ -100,13 +106,13 @@ ConcurrencyControl::~ConcurrencyControl()
// Create allocation and start waiting if more slots are required // Create allocation and start waiting if more slots are required
if (granted < max) if (granted < max)
return AllocationPtr(new Allocation(*this, max, granted, return SlotAllocationPtr(new Allocation(*this, max, granted,
waiters.insert(cur_waiter, nullptr /* pointer is set by Allocation ctor */))); waiters.insert(cur_waiter, nullptr /* pointer is set by Allocation ctor */)));
else else
return AllocationPtr(new Allocation(*this, max, granted)); return SlotAllocationPtr(new Allocation(*this, max, granted));
} }
void ConcurrencyControl::setMaxConcurrency(ConcurrencyControl::SlotCount value) void ConcurrencyControl::setMaxConcurrency(SlotCount value)
{ {
std::unique_lock lock{mutex}; std::unique_lock lock{mutex};
max_concurrency = std::max<SlotCount>(1, value); // never allow max_concurrency to be zero max_concurrency = std::max<SlotCount>(1, value); // never allow max_concurrency to be zero
@ -162,7 +168,7 @@ void ConcurrencyControl::schedule(std::unique_lock<std::mutex> &)
} }
} }
ConcurrencyControl::SlotCount ConcurrencyControl::available(std::unique_lock<std::mutex> &) const SlotCount ConcurrencyControl::available(std::unique_lock<std::mutex> &) const
{ {
if (cur_concurrency < max_concurrency) if (cur_concurrency < max_concurrency)
return max_concurrency - cur_concurrency; return max_concurrency - cur_concurrency;

View File

@ -7,6 +7,7 @@
#include <base/types.h> #include <base/types.h>
#include <boost/core/noncopyable.hpp> #include <boost/core/noncopyable.hpp>
#include <Common/ISlotControl.h>
namespace DB namespace DB
{ {
@ -34,41 +35,35 @@ namespace DB
* Oversubscription is possible: total amount of allocated slots can exceed `setMaxConcurrency(limit)` * Oversubscription is possible: total amount of allocated slots can exceed `setMaxConcurrency(limit)`
* because `min` amount of slots is allocated for each query unconditionally. * because `min` amount of slots is allocated for each query unconditionally.
*/ */
class ConcurrencyControl : boost::noncopyable class ConcurrencyControl : public ISlotControl
{ {
public: public:
struct Allocation; struct Allocation;
using AllocationPtr = std::shared_ptr<Allocation>;
using SlotCount = UInt64;
using Waiters = std::list<Allocation *>; using Waiters = std::list<Allocation *>;
static constexpr SlotCount Unlimited = std::numeric_limits<SlotCount>::max();
// Scoped guard for acquired slot, see Allocation::tryAcquire() // Scoped guard for acquired slot, see Allocation::tryAcquire()
struct Slot : boost::noncopyable struct Slot : public IAcquiredSlot
{ {
~Slot(); ~Slot() override;
private: private:
friend struct Allocation; // for ctor friend struct Allocation; // for ctor
explicit Slot(AllocationPtr && allocation_); explicit Slot(SlotAllocationPtr && allocation_);
AllocationPtr allocation; SlotAllocationPtr allocation;
}; };
// FIXME: have to be unique_ptr, but ThreadFromGlobalPool does not support move semantics yet
using SlotPtr = std::shared_ptr<Slot>;
// Manages group of slots for a single query, see ConcurrencyControl::allocate(min, max) // Manages group of slots for a single query, see ConcurrencyControl::allocate(min, max)
struct Allocation : std::enable_shared_from_this<Allocation>, boost::noncopyable struct Allocation : public ISlotAllocation
{ {
~Allocation(); ~Allocation() override;
// Take one already granted slot if available. Lock-free iff there is no granted slot. // Take one already granted slot if available. Lock-free iff there is no granted slot.
[[nodiscard]] SlotPtr tryAcquire(); [[nodiscard]] AcquiredSlotPtr tryAcquire() override;
SlotCount grantedCount() const; SlotCount grantedCount() const override;
SlotCount allocatedCount() const override;
private: private:
friend struct Slot; // for release() friend struct Slot; // for release()
@ -94,7 +89,7 @@ public:
ConcurrencyControl & parent; ConcurrencyControl & parent;
const SlotCount limit; const SlotCount limit;
std::mutex mutex; // the following values must be accessed under this mutex mutable std::mutex mutex; // the following values must be accessed under this mutex
SlotCount allocated; // allocated total (including already `released`) SlotCount allocated; // allocated total (including already `released`)
SlotCount released = 0; SlotCount released = 0;
@ -103,17 +98,16 @@ public:
const Waiters::iterator waiter; // iterator to itself in Waiters list; valid iff allocated < limit const Waiters::iterator waiter; // iterator to itself in Waiters list; valid iff allocated < limit
}; };
public:
ConcurrencyControl(); ConcurrencyControl();
// WARNING: all Allocation objects MUST be destructed before ConcurrencyControl // WARNING: all Allocation objects MUST be destructed before ConcurrencyControl
// NOTE: Recommended way to achieve this is to use `instance()` and do graceful shutdown of queries // NOTE: Recommended way to achieve this is to use `instance()` and do graceful shutdown of queries
~ConcurrencyControl(); ~ConcurrencyControl() override;
// Allocate at least `min` and at most `max` slots. // Allocate at least `min` and at most `max` slots.
// If not all `max` slots were successfully allocated, a subscription for later allocation is created // If not all `max` slots were successfully allocated, a subscription for later allocation is created
// Use `Allocation::tryAcquire()` to acquire allocated slot, before running a thread. // Use `Allocation::tryAcquire()` to acquire allocated slot, before running a thread.
[[nodiscard]] AllocationPtr allocate(SlotCount min, SlotCount max); [[nodiscard]] SlotAllocationPtr allocate(SlotCount min, SlotCount max) override;
void setMaxConcurrency(SlotCount value); void setMaxConcurrency(SlotCount value);
@ -134,7 +128,7 @@ private:
std::mutex mutex; std::mutex mutex;
Waiters waiters; Waiters waiters;
Waiters::iterator cur_waiter; // round-robin pointer Waiters::iterator cur_waiter; // round-robin pointer
SlotCount max_concurrency = Unlimited; SlotCount max_concurrency = UnlimitedSlots;
SlotCount cur_concurrency = 0; SlotCount cur_concurrency = 0;
}; };

View File

@ -17,7 +17,7 @@ private:
template <typename T> friend class FiberLocal; template <typename T> friend class FiberLocal;
public: public:
template <typename StackAlloc, typename Fn> template< typename StackAlloc, typename Fn>
Fiber(StackAlloc && salloc, Fn && fn) : impl(std::allocator_arg_t(), std::forward<StackAlloc>(salloc), RoutineImpl(std::forward<Fn>(fn))) Fiber(StackAlloc && salloc, Fn && fn) : impl(std::allocator_arg_t(), std::forward<StackAlloc>(salloc), RoutineImpl(std::forward<Fn>(fn)))
{ {
} }
@ -46,12 +46,6 @@ public:
current_fiber = parent_fiber; current_fiber = parent_fiber;
} }
static FiberPtr & getCurrentFiber()
{
thread_local static FiberPtr current_fiber;
return current_fiber;
}
private: private:
template <typename Fn> template <typename Fn>
struct RoutineImpl struct RoutineImpl
@ -80,6 +74,12 @@ private:
Fn fn; Fn fn;
}; };
static FiberPtr & getCurrentFiber()
{
thread_local static FiberPtr current_fiber;
return current_fiber;
}
/// Special wrapper to store data in uniquer_ptr. /// Special wrapper to store data in uniquer_ptr.
struct DataWrapper struct DataWrapper
{ {
@ -146,3 +146,4 @@ private:
T main_instance; T main_instance;
}; };

76
src/Common/ISlotControl.h Normal file
View File

@ -0,0 +1,76 @@
#pragma once
#include <limits>
#include <memory>
#include <base/types.h>
#include <boost/core/noncopyable.hpp>
namespace DB
{
// Interfaces for abstract "slot" allocation and control.
// Slot is a virtual entity existing in a limited amount (CPUs or memory chunks, etc).
//
// Every slot can be in one of the following states:
// * free: slot is available to be allocated.
// * allocated: slot is allocated to a specific ISlotAllocation.
//
// Allocated slots can be in one of the following states:
// * granted: allocated, but not yet acquired.
// * acquired: a granted slot becomes acquired by using IAcquiredSlot.
//
// Example for CPU (see ConcurrencyControl.h). Every slot represents one CPU in the system.
// Slot allocation is a request to allocate specific number of CPUs for a specific query.
// Acquired slot is an entity that is held by a thread as long as it is running. This allows
// total number of threads in the system to be limited and the distribution process to be controlled.
//
// TODO:
// - for preemption - ability to return granted slot back and reacquire it later.
// - for memory allocations - variable size of slots (in bytes).
/// Number of slots
using SlotCount = UInt64;
/// Unlimited number of slots
constexpr SlotCount UnlimitedSlots = std::numeric_limits<SlotCount>::max();
/// Acquired slot holder. Slot is considered to be acquired as long as the object exists.
class IAcquiredSlot : public std::enable_shared_from_this<IAcquiredSlot>, boost::noncopyable
{
public:
virtual ~IAcquiredSlot() = default;
};
using AcquiredSlotPtr = std::shared_ptr<IAcquiredSlot>;
/// Request for allocation of slots from ISlotControl.
/// Allows for more slots to be acquired and the whole request to be canceled.
class ISlotAllocation : public std::enable_shared_from_this<ISlotAllocation>, boost::noncopyable
{
public:
virtual ~ISlotAllocation() = default;
/// Take one already granted slot if available.
[[nodiscard]] virtual AcquiredSlotPtr tryAcquire() = 0;
/// Returns the number of granted slots for given allocation (i.e. available to be acquired)
virtual SlotCount grantedCount() const = 0;
/// Returns the total number of slots allocated at the moment (acquired and granted)
virtual SlotCount allocatedCount() const = 0;
};
using SlotAllocationPtr = std::shared_ptr<ISlotAllocation>;
class ISlotControl : boost::noncopyable
{
public:
virtual ~ISlotControl() = default;
// Allocate at least `min` and at most `max` slots.
// If not all `max` slots were successfully allocated, a "subscription" for later allocation is created
[[nodiscard]] virtual SlotAllocationPtr allocate(SlotCount min, SlotCount max) = 0;
};
}

View File

@ -66,7 +66,7 @@ public:
, log(log_) , log(log_)
{ {
for (size_t i = 0;i < nested_pools.size(); ++i) for (size_t i = 0;i < nested_pools.size(); ++i)
shared_pool_states[i].config_priority = nested_pools[i]->getPriority(); shared_pool_states[i].config_priority = nested_pools[i]->getConfigPriority();
} }
struct TryResult struct TryResult
@ -133,7 +133,7 @@ protected:
void updateErrorCounts(PoolStates & states, time_t & last_decrease_time) const; void updateErrorCounts(PoolStates & states, time_t & last_decrease_time) const;
std::vector<ShuffledPool> getShuffledPools(size_t max_ignored_errors, const GetPriorityFunc & get_priority); std::vector<ShuffledPool> getShuffledPools(size_t max_ignored_errors, const GetPriorityFunc & get_priority, bool use_slowdown_count = false);
inline void updateSharedErrorCounts(std::vector<ShuffledPool> & shuffled_pools); inline void updateSharedErrorCounts(std::vector<ShuffledPool> & shuffled_pools);
@ -160,7 +160,7 @@ protected:
template <typename TNestedPool> template <typename TNestedPool>
std::vector<typename PoolWithFailoverBase<TNestedPool>::ShuffledPool> std::vector<typename PoolWithFailoverBase<TNestedPool>::ShuffledPool>
PoolWithFailoverBase<TNestedPool>::getShuffledPools( PoolWithFailoverBase<TNestedPool>::getShuffledPools(
size_t max_ignored_errors, const PoolWithFailoverBase::GetPriorityFunc & get_priority) size_t max_ignored_errors, const PoolWithFailoverBase::GetPriorityFunc & get_priority, bool use_slowdown_count)
{ {
/// Update random numbers and error counts. /// Update random numbers and error counts.
PoolStates pool_states = updatePoolStates(max_ignored_errors); PoolStates pool_states = updatePoolStates(max_ignored_errors);
@ -175,13 +175,13 @@ PoolWithFailoverBase<TNestedPool>::getShuffledPools(
std::vector<ShuffledPool> shuffled_pools; std::vector<ShuffledPool> shuffled_pools;
shuffled_pools.reserve(nested_pools.size()); shuffled_pools.reserve(nested_pools.size());
for (size_t i = 0; i < nested_pools.size(); ++i) for (size_t i = 0; i < nested_pools.size(); ++i)
shuffled_pools.push_back(ShuffledPool{nested_pools[i], &pool_states[i], i, /* error_count = */ 0, /* slowdown_count = */ 0}); shuffled_pools.emplace_back(ShuffledPool{.pool = nested_pools[i], .state = &pool_states[i], .index = i});
::sort( ::sort(
shuffled_pools.begin(), shuffled_pools.end(), shuffled_pools.begin(), shuffled_pools.end(),
[](const ShuffledPool & lhs, const ShuffledPool & rhs) [use_slowdown_count](const ShuffledPool & lhs, const ShuffledPool & rhs)
{ {
return PoolState::compare(*lhs.state, *rhs.state); return PoolState::compare(*lhs.state, *rhs.state, use_slowdown_count);
}); });
return shuffled_pools; return shuffled_pools;
@ -344,10 +344,14 @@ struct PoolWithFailoverBase<TNestedPool>::PoolState
random = rng(); random = rng();
} }
static bool compare(const PoolState & lhs, const PoolState & rhs) static bool compare(const PoolState & lhs, const PoolState & rhs, bool use_slowdown_count)
{ {
return std::forward_as_tuple(lhs.error_count, lhs.slowdown_count, lhs.config_priority, lhs.priority, lhs.random) if (use_slowdown_count)
< std::forward_as_tuple(rhs.error_count, rhs.slowdown_count, rhs.config_priority, rhs.priority, rhs.random); return std::forward_as_tuple(lhs.error_count, lhs.slowdown_count, lhs.config_priority, lhs.priority, lhs.random)
< std::forward_as_tuple(rhs.error_count, rhs.slowdown_count, rhs.config_priority, rhs.priority, rhs.random);
else
return std::forward_as_tuple(lhs.error_count, lhs.config_priority, lhs.priority, lhs.random)
< std::forward_as_tuple(rhs.error_count, rhs.config_priority, rhs.priority, rhs.random);
} }
private: private:

View File

@ -632,6 +632,12 @@ The server successfully detected this situation and will download merged part fr
M(InterfacePostgreSQLReceiveBytes, "Number of bytes received through PostgreSQL interfaces") \ M(InterfacePostgreSQLReceiveBytes, "Number of bytes received through PostgreSQL interfaces") \
\ \
M(ParallelReplicasUsedCount, "Number of replicas used to execute a query with task-based parallel replicas") \ M(ParallelReplicasUsedCount, "Number of replicas used to execute a query with task-based parallel replicas") \
\
M(KeeperLogsEntryReadFromLatestCache, "Number of log entries in Keeper being read from latest logs cache") \
M(KeeperLogsEntryReadFromCommitCache, "Number of log entries in Keeper being read from commit logs cache") \
M(KeeperLogsEntryReadFromFile, "Number of log entries in Keeper being read directly from the changelog file") \
M(KeeperLogsPrefetchedEntries, "Number of log entries in Keeper being prefetched from the changelog file") \
\
M(ParallelReplicasAvailableCount, "Number of replicas available to execute a query with task-based parallel replicas") \ M(ParallelReplicasAvailableCount, "Number of replicas available to execute a query with task-based parallel replicas") \
M(ParallelReplicasUnavailableCount, "Number of replicas which was chosen, but found to be unavailable during query execution with task-based parallel replicas") \ M(ParallelReplicasUnavailableCount, "Number of replicas which was chosen, but found to be unavailable during query execution with task-based parallel replicas") \

View File

@ -91,7 +91,7 @@ void SensitiveDataMasker::setInstance(std::unique_ptr<SensitiveDataMasker>&& sen
{ {
if (!sensitive_data_masker_) if (!sensitive_data_masker_)
throw Exception(ErrorCodes::LOGICAL_ERROR, "The 'sensitive_data_masker' is not set"); throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: the 'sensitive_data_masker' is not set");
if (sensitive_data_masker_->rulesCount() > 0) if (sensitive_data_masker_->rulesCount() > 0)
{ {

View File

@ -209,7 +209,7 @@ public:
{ {
if (!is_reference_128) if (!is_reference_128)
throw DB::Exception( throw DB::Exception(
DB::ErrorCodes::LOGICAL_ERROR, "Can't call get128Reference when is_reference_128 is not set"); DB::ErrorCodes::LOGICAL_ERROR, "Logical error: can't call get128Reference when is_reference_128 is not set");
finalize(); finalize();
const auto lo = v0 ^ v1 ^ v2 ^ v3; const auto lo = v0 ^ v1 ^ v2 ^ v3;
v1 ^= 0xdd; v1 ^= 0xdd;

View File

@ -448,6 +448,9 @@ toStringEveryLineImpl([[maybe_unused]] bool fatal, const StackTraceRefTriple & s
DB::writePointerHex(frame.physical_addr, out); DB::writePointerHex(frame.physical_addr, out);
} }
if (frame.object.has_value())
out << " in " << *frame.object;
callback(out.str()); callback(out.str());
}; };
#else #else

View File

@ -1,8 +1,8 @@
#include <base/getThreadId.h>
#include <base/defines.h> /// THREAD_SANITIZER
#include <Common/checkStackSize.h> #include <Common/checkStackSize.h>
#include <Common/Exception.h> #include <Common/Exception.h>
#include <Common/Fiber.h> #include <base/getThreadId.h>
#include <base/scope_guard.h>
#include <base/defines.h> /// THREAD_SANITIZER
#include <sys/resource.h> #include <sys/resource.h>
#include <pthread.h> #include <pthread.h>
#include <unistd.h> #include <unistd.h>
@ -114,10 +114,6 @@ __attribute__((__weak__)) void checkStackSize()
{ {
using namespace DB; using namespace DB;
/// Not implemented for coroutines.
if (Fiber::getCurrentFiber())
return;
if (!stack_address) if (!stack_address)
max_stack_size = getStackSize(&stack_address); max_stack_size = getStackSize(&stack_address);
@ -140,7 +136,7 @@ __attribute__((__weak__)) void checkStackSize()
/// We assume that stack grows towards lower addresses. And that it starts to grow from the end of a chunk of memory of max_stack_size. /// We assume that stack grows towards lower addresses. And that it starts to grow from the end of a chunk of memory of max_stack_size.
if (int_frame_address > int_stack_address + max_stack_size) if (int_frame_address > int_stack_address + max_stack_size)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Frame address is greater than stack begin address"); throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: frame address is greater than stack begin address");
size_t stack_size = int_stack_address + max_stack_size - int_frame_address; size_t stack_size = int_stack_address + max_stack_size - int_frame_address;
size_t max_stack_size_allowed = static_cast<size_t>(max_stack_size * STACK_SIZE_FREE_RATIO); size_t max_stack_size_allowed = static_cast<size_t>(max_stack_size * STACK_SIZE_FREE_RATIO);

View File

@ -15,7 +15,7 @@ struct ConcurrencyControlTest
{ {
ConcurrencyControl cc; ConcurrencyControl cc;
explicit ConcurrencyControlTest(ConcurrencyControl::SlotCount limit = ConcurrencyControl::Unlimited) explicit ConcurrencyControlTest(SlotCount limit = UnlimitedSlots)
{ {
cc.setMaxConcurrency(limit); cc.setMaxConcurrency(limit);
} }
@ -25,7 +25,7 @@ TEST(ConcurrencyControl, Unlimited)
{ {
ConcurrencyControlTest t; // unlimited number of slots ConcurrencyControlTest t; // unlimited number of slots
auto slots = t.cc.allocate(0, 100500); auto slots = t.cc.allocate(0, 100500);
std::vector<ConcurrencyControl::SlotPtr> acquired; std::vector<AcquiredSlotPtr> acquired;
while (auto slot = slots->tryAcquire()) while (auto slot = slots->tryAcquire())
acquired.emplace_back(std::move(slot)); acquired.emplace_back(std::move(slot));
ASSERT_TRUE(acquired.size() == 100500); ASSERT_TRUE(acquired.size() == 100500);
@ -34,14 +34,14 @@ TEST(ConcurrencyControl, Unlimited)
TEST(ConcurrencyControl, Fifo) TEST(ConcurrencyControl, Fifo)
{ {
ConcurrencyControlTest t(1); // use single slot ConcurrencyControlTest t(1); // use single slot
std::vector<ConcurrencyControl::AllocationPtr> allocations; std::vector<SlotAllocationPtr> allocations;
constexpr int count = 42; constexpr int count = 42;
allocations.reserve(count); allocations.reserve(count);
for (int i = 0; i < count; i++) for (int i = 0; i < count; i++)
allocations.emplace_back(t.cc.allocate(0, 1)); allocations.emplace_back(t.cc.allocate(0, 1));
for (int i = 0; i < count; i++) for (int i = 0; i < count; i++)
{ {
ConcurrencyControl::SlotPtr holder; AcquiredSlotPtr holder;
for (int j = 0; j < count; j++) for (int j = 0; j < count; j++)
{ {
auto slot = allocations[j]->tryAcquire(); auto slot = allocations[j]->tryAcquire();
@ -60,11 +60,11 @@ TEST(ConcurrencyControl, Fifo)
TEST(ConcurrencyControl, Oversubscription) TEST(ConcurrencyControl, Oversubscription)
{ {
ConcurrencyControlTest t(10); ConcurrencyControlTest t(10);
std::vector<ConcurrencyControl::AllocationPtr> allocations; std::vector<SlotAllocationPtr> allocations;
allocations.reserve(10); allocations.reserve(10);
for (int i = 0; i < 10; i++) for (int i = 0; i < 10; i++)
allocations.emplace_back(t.cc.allocate(1, 2)); allocations.emplace_back(t.cc.allocate(1, 2));
std::vector<ConcurrencyControl::SlotPtr> slots; std::vector<AcquiredSlotPtr> slots;
// Normal allocation using maximum amount of slots // Normal allocation using maximum amount of slots
for (int i = 0; i < 5; i++) for (int i = 0; i < 5; i++)
{ {
@ -90,7 +90,7 @@ TEST(ConcurrencyControl, ReleaseUnacquiredSlots)
{ {
ConcurrencyControlTest t(10); ConcurrencyControlTest t(10);
{ {
std::vector<ConcurrencyControl::AllocationPtr> allocations; std::vector<SlotAllocationPtr> allocations;
allocations.reserve(10); allocations.reserve(10);
for (int i = 0; i < 10; i++) for (int i = 0; i < 10; i++)
allocations.emplace_back(t.cc.allocate(1, 2)); allocations.emplace_back(t.cc.allocate(1, 2));
@ -98,7 +98,7 @@ TEST(ConcurrencyControl, ReleaseUnacquiredSlots)
} }
// Check that slots were actually released // Check that slots were actually released
auto allocation = t.cc.allocate(0, 20); auto allocation = t.cc.allocate(0, 20);
std::vector<ConcurrencyControl::SlotPtr> acquired; std::vector<AcquiredSlotPtr> acquired;
while (auto slot = allocation->tryAcquire()) while (auto slot = allocation->tryAcquire())
acquired.emplace_back(std::move(slot)); acquired.emplace_back(std::move(slot));
ASSERT_TRUE(acquired.size() == 10); ASSERT_TRUE(acquired.size() == 10);
@ -110,7 +110,7 @@ TEST(ConcurrencyControl, DestroyNotFullyAllocatedAllocation)
for (int i = 0; i < 3; i++) for (int i = 0; i < 3; i++)
{ {
auto allocation = t.cc.allocate(5, 20); auto allocation = t.cc.allocate(5, 20);
std::vector<ConcurrencyControl::SlotPtr> acquired; std::vector<AcquiredSlotPtr> acquired;
while (auto slot = allocation->tryAcquire()) while (auto slot = allocation->tryAcquire())
acquired.emplace_back(std::move(slot)); acquired.emplace_back(std::move(slot));
ASSERT_TRUE(acquired.size() == 10); ASSERT_TRUE(acquired.size() == 10);
@ -122,7 +122,7 @@ TEST(ConcurrencyControl, DestroyAllocationBeforeSlots)
ConcurrencyControlTest t(10); ConcurrencyControlTest t(10);
for (int i = 0; i < 3; i++) for (int i = 0; i < 3; i++)
{ {
std::vector<ConcurrencyControl::SlotPtr> acquired; std::vector<AcquiredSlotPtr> acquired;
auto allocation = t.cc.allocate(5, 20); auto allocation = t.cc.allocate(5, 20);
while (auto slot = allocation->tryAcquire()) while (auto slot = allocation->tryAcquire())
acquired.emplace_back(std::move(slot)); acquired.emplace_back(std::move(slot));
@ -135,7 +135,7 @@ TEST(ConcurrencyControl, GrantReleasedToTheSameAllocation)
{ {
ConcurrencyControlTest t(3); ConcurrencyControlTest t(3);
auto allocation = t.cc.allocate(0, 10); auto allocation = t.cc.allocate(0, 10);
std::list<ConcurrencyControl::SlotPtr> acquired; std::list<AcquiredSlotPtr> acquired;
while (auto slot = allocation->tryAcquire()) while (auto slot = allocation->tryAcquire())
acquired.emplace_back(std::move(slot)); acquired.emplace_back(std::move(slot));
ASSERT_TRUE(acquired.size() == 3); // 0 1 2 ASSERT_TRUE(acquired.size() == 3); // 0 1 2
@ -183,7 +183,7 @@ TEST(ConcurrencyControl, SetSlotCount)
{ {
ConcurrencyControlTest t(10); ConcurrencyControlTest t(10);
auto allocation = t.cc.allocate(5, 30); auto allocation = t.cc.allocate(5, 30);
std::vector<ConcurrencyControl::SlotPtr> acquired; std::vector<AcquiredSlotPtr> acquired;
while (auto slot = allocation->tryAcquire()) while (auto slot = allocation->tryAcquire())
acquired.emplace_back(std::move(slot)); acquired.emplace_back(std::move(slot));
ASSERT_TRUE(acquired.size() == 10); ASSERT_TRUE(acquired.size() == 10);
@ -200,7 +200,7 @@ TEST(ConcurrencyControl, SetSlotCount)
ASSERT_TRUE(acquired.size() == 5); ASSERT_TRUE(acquired.size() == 5);
// Check that newly added slots are equally distributed over waiting allocations // Check that newly added slots are equally distributed over waiting allocations
std::vector<ConcurrencyControl::SlotPtr> acquired2; std::vector<AcquiredSlotPtr> acquired2;
auto allocation2 = t.cc.allocate(0, 30); auto allocation2 = t.cc.allocate(0, 30);
ASSERT_TRUE(!allocation->tryAcquire()); ASSERT_TRUE(!allocation->tryAcquire());
t.cc.setMaxConcurrency(15); // 10 slots added: 5 to the first allocation and 5 to the second one t.cc.setMaxConcurrency(15); // 10 slots added: 5 to the first allocation and 5 to the second one
@ -224,7 +224,7 @@ TEST(ConcurrencyControl, MultipleThreads)
auto run_query = [&] (size_t max_threads) auto run_query = [&] (size_t max_threads)
{ {
ConcurrencyControl::AllocationPtr slots = t.cc.allocate(1, max_threads); SlotAllocationPtr slots = t.cc.allocate(1, max_threads);
std::mutex threads_mutex; std::mutex threads_mutex;
std::vector<std::thread> threads; std::vector<std::thread> threads;
threads.reserve(max_threads); threads.reserve(max_threads);

File diff suppressed because it is too large Load Diff

View File

@ -1,17 +1,26 @@
#pragma once #pragma once
#include <optional> #include <libnuraft/ptr.hxx>
#include <city.h> #include <Common/ThreadPool_fwd.h>
#include <Disks/IDisk.h>
#include <IO/CompressionMethod.h>
#include <IO/HashingWriteBuffer.h>
#include <IO/WriteBufferFromFile.h>
#include <base/defines.h>
#include <libnuraft/nuraft.hxx>
#include <libnuraft/raft_server.hxx>
#include <Common/ConcurrentBoundedQueue.h> #include <Common/ConcurrentBoundedQueue.h>
#include <Common/ThreadPool.h>
#include <Coordination/KeeperContext.h> #include <map>
#include <unordered_set>
#include <future>
namespace nuraft
{
struct log_entry;
struct buffer;
struct raft_server;
}
namespace Poco
{
class Logger;
}
using LoggerPtr = std::shared_ptr<Poco::Logger>;
namespace DB namespace DB
{ {
@ -23,8 +32,11 @@ using LogEntries = std::vector<LogEntryPtr>;
using LogEntriesPtr = nuraft::ptr<LogEntries>; using LogEntriesPtr = nuraft::ptr<LogEntries>;
using BufferPtr = nuraft::ptr<nuraft::buffer>; using BufferPtr = nuraft::ptr<nuraft::buffer>;
using IndexToOffset = std::unordered_map<uint64_t, off_t>; struct KeeperLogInfo;
using IndexToLogEntry = std::unordered_map<uint64_t, LogEntryPtr>; class KeeperContext;
using KeeperContextPtr = std::shared_ptr<KeeperContext>;
class IDisk;
using DiskPtr = std::shared_ptr<IDisk>;
enum class ChangelogVersion : uint8_t enum class ChangelogVersion : uint8_t
{ {
@ -63,10 +75,19 @@ struct ChangelogFileDescription
DiskPtr disk; DiskPtr disk;
std::string path; std::string path;
std::mutex file_mutex;
bool deleted = false; bool deleted = false;
/// How many entries should be stored in this log /// How many entries should be stored in this log
uint64_t expectedEntriesCountInLog() const { return to_log_index - from_log_index + 1; } uint64_t expectedEntriesCountInLog() const { return to_log_index - from_log_index + 1; }
template <typename TFunction>
void withLock(TFunction && fn)
{
std::lock_guard lock(file_mutex);
fn();
}
}; };
using ChangelogFileDescriptionPtr = std::shared_ptr<ChangelogFileDescription>; using ChangelogFileDescriptionPtr = std::shared_ptr<ChangelogFileDescription>;
@ -80,6 +101,8 @@ struct LogFileSettings
uint64_t rotate_interval = 100000; uint64_t rotate_interval = 100000;
uint64_t max_size = 0; uint64_t max_size = 0;
uint64_t overallocate_size = 0; uint64_t overallocate_size = 0;
uint64_t latest_logs_cache_size_threshold = 0;
uint64_t commit_logs_cache_size_threshold = 0;
}; };
struct FlushSettings struct FlushSettings
@ -87,6 +110,191 @@ struct FlushSettings
uint64_t max_flush_batch_size = 1000; uint64_t max_flush_batch_size = 1000;
}; };
struct LogLocation
{
ChangelogFileDescriptionPtr file_description;
size_t position;
size_t size;
};
struct PrefetchedCacheEntry
{
explicit PrefetchedCacheEntry();
const LogEntryPtr & getLogEntry() const;
void resolve(std::exception_ptr exception);
void resolve(LogEntryPtr log_entry_);
private:
std::promise<LogEntryPtr> log_entry_resolver;
mutable std::shared_future<LogEntryPtr> log_entry;
};
using CacheEntry = std::variant<LogEntryPtr, PrefetchedCacheEntry>;
using IndexToCacheEntry = std::unordered_map<uint64_t, CacheEntry>;
using IndexToCacheEntryNode = typename IndexToCacheEntry::node_type;
/**
* Storage for storing and handling deserialized entries from disk.
* It consists of 2 in-memory caches that rely heavily on the way
* entries are used in Raft.
* Random and repeated access to certain entries is almost never done so we can't implement a solution
* like LRU/SLRU cache because entries would be cached and never read again.
* Entries are often read sequentially for 2 cases:
* - for replication
* - for committing
*
* First cache will store latest logs in memory, limited by the latest_logs_cache_size_threshold coordination setting.
 * Once the log is persisted to the disk, we store its location in the file and allow the storage
* to evict that log from cache if it's needed.
* Latest logs cache should have a high hit rate in "normal" operation for both replication and committing.
*
* As we commit (and read) logs sequentially, we will try to read from latest logs cache.
 * In some cases, latest logs could be ahead of the last committed log by more than latest_logs_cache_size_threshold
* which means that for each commit we would need to read the log from disk.
* In case latest logs cache hits the threshold we have a second cache called commit logs cache limited by commit_logs_cache_size_threshold.
* If a log is evicted from the latest logs cache, we check if we can move it to commit logs cache to avoid re-reading the log from disk.
* If latest logs cache moves ahead of the commit log by a lot or commit log hits the threshold
* we cannot move the entries from latest logs and we will need to refill the commit cache from disk.
* To avoid reading entry by entry (which can have really bad effect on performance because we support disks based on S3),
* we try to prefetch multiple entries ahead of time because we know that they will be read by commit thread
* in the future.
* Commit logs cache should have a high hit rate if we start with a lot of unprocessed logs that cannot fit in the
* latest logs cache.
*/
struct LogEntryStorage
{
LogEntryStorage(const LogFileSettings & log_settings, KeeperContextPtr keeper_context_);
~LogEntryStorage();
void addEntry(uint64_t index, const LogEntryPtr & log_entry);
void addEntryWithLocation(uint64_t index, const LogEntryPtr & log_entry, LogLocation log_location);
/// clean all logs up to (but not including) index
void cleanUpTo(uint64_t index);
/// clean all logs after (but not including) index
void cleanAfter(uint64_t index);
bool contains(uint64_t index) const;
LogEntryPtr getEntry(uint64_t index) const;
void clear();
LogEntryPtr getLatestConfigChange() const;
uint64_t termAt(uint64_t index) const;
using IndexWithLogLocation = std::pair<uint64_t, LogLocation>;
void addLogLocations(std::vector<IndexWithLogLocation> && indices_with_log_locations);
void refreshCache();
LogEntriesPtr getLogEntriesBetween(uint64_t start, uint64_t end) const;
void getKeeperLogInfo(KeeperLogInfo & log_info) const;
bool isConfigLog(uint64_t index) const;
size_t empty() const;
size_t size() const;
size_t getFirstIndex() const;
void shutdown();
private:
void prefetchCommitLogs();
void startCommitLogsPrefetch(uint64_t last_committed_index) const;
bool shouldMoveLogToCommitCache(uint64_t index, size_t log_entry_size);
void updateTermInfoWithNewEntry(uint64_t index, uint64_t term);
struct InMemoryCache
{
explicit InMemoryCache(size_t size_threshold_);
void addEntry(uint64_t index, size_t size, CacheEntry log_entry);
void addEntry(IndexToCacheEntryNode && node);
void updateStatsWithNewEntry(uint64_t index, size_t size);
IndexToCacheEntryNode popOldestEntry();
bool containsEntry(uint64_t index) const;
LogEntryPtr getEntry(uint64_t index) const;
CacheEntry * getCacheEntry(uint64_t index);
const CacheEntry * getCacheEntry(uint64_t index) const;
PrefetchedCacheEntry & getPrefetchedCacheEntry(uint64_t index);
void cleanUpTo(uint64_t index);
void cleanAfter(uint64_t index);
bool empty() const;
size_t numberOfEntries() const;
bool hasSpaceAvailable(size_t log_entry_size) const;
void clear();
/// Mapping log_id -> log_entry
mutable IndexToCacheEntry cache;
size_t cache_size = 0;
size_t min_index_in_cache = 0;
size_t max_index_in_cache = 0;
const size_t size_threshold;
};
InMemoryCache latest_logs_cache;
mutable InMemoryCache commit_logs_cache;
LogEntryPtr latest_config;
uint64_t latest_config_index = 0;
mutable LogEntryPtr first_log_entry;
mutable uint64_t first_log_index = 0;
std::unique_ptr<ThreadFromGlobalPool> commit_logs_prefetcher;
struct FileReadInfo
{
ChangelogFileDescriptionPtr file_description;
size_t position;
size_t count;
};
struct PrefetchInfo
{
std::vector<FileReadInfo> file_infos;
std::pair<uint64_t, uint64_t> commit_prefetch_index_range;
std::atomic<bool> cancel;
std::atomic<bool> done = false;
};
mutable ConcurrentBoundedQueue<std::shared_ptr<PrefetchInfo>> prefetch_queue;
mutable std::shared_ptr<PrefetchInfo> current_prefetch_info;
mutable std::mutex logs_location_mutex;
std::vector<IndexWithLogLocation> unapplied_indices_with_log_locations;
std::unordered_map<uint64_t, LogLocation> logs_location;
size_t max_index_with_location = 0;
size_t min_index_with_location = 0;
/// store indices of logs that contain config changes
std::unordered_set<uint64_t> logs_with_config_changes;
struct LogTermInfo
{
uint64_t term = 0;
uint64_t first_index = 0;
};
/// store first index of each term
/// so we don't have to fetch log to return that information
/// terms are monotonically increasing so first index is enough
std::deque<LogTermInfo> log_term_infos;
bool is_shutdown = false;
KeeperContextPtr keeper_context;
LoggerPtr log;
};
/// Simplest changelog with files rotation. /// Simplest changelog with files rotation.
/// No compression, no metadata, just entries with headers one by one. /// No compression, no metadata, just entries with headers one by one.
/// Able to read broken files/entries and discard them. Not thread safe. /// Able to read broken files/entries and discard them. Not thread safe.
@ -114,9 +322,9 @@ public:
/// Remove log files with to_log_index <= up_to_log_index. /// Remove log files with to_log_index <= up_to_log_index.
void compact(uint64_t up_to_log_index); void compact(uint64_t up_to_log_index);
uint64_t getNextEntryIndex() const { return max_log_id + 1; } uint64_t getNextEntryIndex() const;
uint64_t getStartIndex() const { return min_log_id; } uint64_t getStartIndex() const;
/// Last entry in log, or fake entry with term 0 if log is empty /// Last entry in log, or fake entry with term 0 if log is empty
LogEntryPtr getLastEntry() const; LogEntryPtr getLastEntry() const;
@ -128,7 +336,7 @@ public:
LogEntriesPtr getLogEntriesBetween(uint64_t start_index, uint64_t end_index); LogEntriesPtr getLogEntriesBetween(uint64_t start_index, uint64_t end_index);
/// Return entry at position index /// Return entry at position index
LogEntryPtr entryAt(uint64_t index); LogEntryPtr entryAt(uint64_t index) const;
/// Serialize entries from index into buffer /// Serialize entries from index into buffer
BufferPtr serializeEntriesToBuffer(uint64_t index, int32_t count); BufferPtr serializeEntriesToBuffer(uint64_t index, int32_t count);
@ -136,6 +344,9 @@ public:
/// Apply entries from buffer overriding existing entries /// Apply entries from buffer overriding existing entries
void applyEntriesFromBuffer(uint64_t index, nuraft::buffer & buffer); void applyEntriesFromBuffer(uint64_t index, nuraft::buffer & buffer);
bool isConfigLog(uint64_t index) const;
uint64_t termAt(uint64_t index) const;
/// Fsync latest log to disk and flush buffer /// Fsync latest log to disk and flush buffer
bool flush(); bool flush();
@ -143,7 +354,7 @@ public:
void shutdown(); void shutdown();
uint64_t size() const { return logs.size(); } uint64_t size() const;
uint64_t lastDurableIndex() const uint64_t lastDurableIndex() const
{ {
@ -155,6 +366,8 @@ public:
bool isInitialized() const; bool isInitialized() const;
void getKeeperLogInfo(KeeperLogInfo & log_info) const;
/// Fsync log to disk /// Fsync log to disk
~Changelog(); ~Changelog();
@ -190,16 +403,14 @@ private:
std::mutex writer_mutex; std::mutex writer_mutex;
/// Current writer for changelog file /// Current writer for changelog file
std::unique_ptr<ChangelogWriter> current_writer; std::unique_ptr<ChangelogWriter> current_writer;
/// Mapping log_id -> log_entry
IndexToLogEntry logs; LogEntryStorage entry_storage;
/// Start log_id which exists in all "active" logs
/// min_log_id + 1 == max_log_id means empty log storage for NuRaft
uint64_t min_log_id = 0;
uint64_t max_log_id = 0; uint64_t max_log_id = 0;
/// For compaction, queue of delete not used logs /// For compaction, queue of delete not used logs
/// 128 is enough, even if log is not removed, it's not a problem /// 128 is enough, even if log is not removed, it's not a problem
ConcurrentBoundedQueue<std::pair<std::string, DiskPtr>> log_files_to_delete_queue{128}; ConcurrentBoundedQueue<std::pair<std::string, DiskPtr>> log_files_to_delete_queue{128};
ThreadFromGlobalPool clean_log_thread; std::unique_ptr<ThreadFromGlobalPool> clean_log_thread;
struct AppendLog struct AppendLog
{ {
@ -217,7 +428,7 @@ private:
void writeThread(); void writeThread();
ThreadFromGlobalPool write_thread; std::unique_ptr<ThreadFromGlobalPool> write_thread;
ConcurrentBoundedQueue<WriteOperation> write_operations; ConcurrentBoundedQueue<WriteOperation> write_operations;
/// Append log completion callback tries to acquire NuRaft's global lock /// Append log completion callback tries to acquire NuRaft's global lock
@ -226,7 +437,7 @@ private:
/// For those reasons we call the completion callback in a different thread /// For those reasons we call the completion callback in a different thread
void appendCompletionThread(); void appendCompletionThread();
ThreadFromGlobalPool append_completion_thread; std::unique_ptr<ThreadFromGlobalPool> append_completion_thread;
ConcurrentBoundedQueue<bool> append_completion_queue; ConcurrentBoundedQueue<bool> append_completion_queue;
// last_durable_index needs to be exposed through const getter so we make mutex mutable // last_durable_index needs to be exposed through const getter so we make mutex mutable

View File

@ -34,6 +34,11 @@ void CoordinationSettings::loadFromConfig(const String & config_elem, const Poco
e.addMessage("in Coordination settings config"); e.addMessage("in Coordination settings config");
throw; throw;
} }
/// for backwards compatibility we set max_requests_append_size to max_requests_batch_size
/// if max_requests_append_size was not changed
if (!max_requests_append_size.changed)
max_requests_append_size = max_requests_batch_size;
} }
@ -41,7 +46,7 @@ const String KeeperConfigurationAndSettings::DEFAULT_FOUR_LETTER_WORD_CMD =
#if USE_JEMALLOC #if USE_JEMALLOC
"jmst,jmfp,jmep,jmdp," "jmst,jmfp,jmep,jmdp,"
#endif #endif
"conf,cons,crst,envi,ruok,srst,srvr,stat,wchs,dirs,mntr,isro,rcvr,apiv,csnp,lgif,rqld,rclc,clrs,ftfl,ydld"; "conf,cons,crst,envi,ruok,srst,srvr,stat,wchs,dirs,mntr,isro,rcvr,apiv,csnp,lgif,rqld,rclc,clrs,ftfl,ydld,pfev";
KeeperConfigurationAndSettings::KeeperConfigurationAndSettings() KeeperConfigurationAndSettings::KeeperConfigurationAndSettings()
: server_id(NOT_EXIST) : server_id(NOT_EXIST)

View File

@ -41,6 +41,7 @@ struct Settings;
M(UInt64, max_request_queue_size, 100000, "Maximum number of request that can be in queue for processing", 0) \ M(UInt64, max_request_queue_size, 100000, "Maximum number of request that can be in queue for processing", 0) \
M(UInt64, max_requests_batch_size, 100, "Max size of batch of requests that can be sent to RAFT", 0) \ M(UInt64, max_requests_batch_size, 100, "Max size of batch of requests that can be sent to RAFT", 0) \
M(UInt64, max_requests_batch_bytes_size, 100*1024, "Max size in bytes of batch of requests that can be sent to RAFT", 0) \ M(UInt64, max_requests_batch_bytes_size, 100*1024, "Max size in bytes of batch of requests that can be sent to RAFT", 0) \
M(UInt64, max_requests_append_size, 100, "Max size of batch of requests that can be sent to replica in append request", 0) \
M(UInt64, max_flush_batch_size, 1000, "Max size of batch of requests that can be flushed together", 0) \ M(UInt64, max_flush_batch_size, 1000, "Max size of batch of requests that can be flushed together", 0) \
M(UInt64, max_requests_quick_batch_size, 100, "Max size of batch of requests to try to get before proceeding with RAFT. Keeper will not wait for requests but take only requests that are already in queue" , 0) \ M(UInt64, max_requests_quick_batch_size, 100, "Max size of batch of requests to try to get before proceeding with RAFT. Keeper will not wait for requests but take only requests that are already in queue" , 0) \
M(Bool, quorum_reads, false, "Execute read requests as writes through whole RAFT consesus with similar speed", 0) \ M(Bool, quorum_reads, false, "Execute read requests as writes through whole RAFT consesus with similar speed", 0) \
@ -52,7 +53,11 @@ struct Settings;
M(UInt64, log_file_overallocate_size, 50 * 1024 * 1024, "If max_log_file_size is not set to 0, this value will be added to it for preallocating bytes on disk. If a log record is larger than this value, it could lead to uncaught out-of-space issues so a larger value is preferred", 0) \ M(UInt64, log_file_overallocate_size, 50 * 1024 * 1024, "If max_log_file_size is not set to 0, this value will be added to it for preallocating bytes on disk. If a log record is larger than this value, it could lead to uncaught out-of-space issues so a larger value is preferred", 0) \
M(UInt64, min_request_size_for_cache, 50 * 1024, "Minimal size of the request to cache the deserialization result. Caching can have negative effect on latency for smaller requests, set to 0 to disable", 0) \ M(UInt64, min_request_size_for_cache, 50 * 1024, "Minimal size of the request to cache the deserialization result. Caching can have negative effect on latency for smaller requests, set to 0 to disable", 0) \
M(UInt64, raft_limits_reconnect_limit, 50, "If connection to a peer is silent longer than this limit * (multiplied by heartbeat interval), we re-establish the connection.", 0) \ M(UInt64, raft_limits_reconnect_limit, 50, "If connection to a peer is silent longer than this limit * (multiplied by heartbeat interval), we re-establish the connection.", 0) \
M(Bool, async_replication, false, "Enable async replication. All write and read guarantees are preserved while better performance is achieved. Settings is disabled by default to not break backwards compatibility.", 0) M(Bool, async_replication, false, "Enable async replication. All write and read guarantees are preserved while better performance is achieved. Settings is disabled by default to not break backwards compatibility.", 0) \
M(UInt64, latest_logs_cache_size_threshold, 1 * 1024 * 1024 * 1024, "Maximum total size of in-memory cache of latest log entries.", 0) \
M(UInt64, commit_logs_cache_size_threshold, 500 * 1024 * 1024, "Maximum total size of in-memory cache of log entries needed next for commit.", 0) \
M(UInt64, disk_move_retries_wait_ms, 1000, "How long to wait between retries after a failure which happened while a file was being moved between disks.", 0) \
M(UInt64, disk_move_retries_during_init, 100, "The amount of retries after a failure which happened while a file was being moved between disks during initialization.", 0)
DECLARE_SETTINGS_TRAITS(CoordinationSettingsTraits, LIST_OF_COORDINATION_SETTINGS) DECLARE_SETTINGS_TRAITS(CoordinationSettingsTraits, LIST_OF_COORDINATION_SETTINGS)

View File

@ -9,6 +9,7 @@
#include <Common/getCurrentProcessFDCount.h> #include <Common/getCurrentProcessFDCount.h>
#include <Common/getMaxFileDescriptorCount.h> #include <Common/getMaxFileDescriptorCount.h>
#include <Common/StringUtils/StringUtils.h> #include <Common/StringUtils/StringUtils.h>
#include <Common/config_version.h>
#include "Coordination/KeeperFeatureFlags.h" #include "Coordination/KeeperFeatureFlags.h"
#include <Coordination/Keeper4LWInfo.h> #include <Coordination/Keeper4LWInfo.h>
#include <IO/WriteHelpers.h> #include <IO/WriteHelpers.h>
@ -37,6 +38,12 @@ String formatZxid(int64_t zxid)
} }
#if USE_NURAFT
namespace ProfileEvents
{
extern const std::vector<Event> keeper_profile_events;
}
#endif
namespace DB namespace DB
{ {
@ -193,6 +200,8 @@ void FourLetterCommandFactory::registerCommands(KeeperDispatcher & keeper_dispat
FourLetterCommandPtr jemalloc_disable_profile = std::make_shared<JemallocDisableProfile>(keeper_dispatcher); FourLetterCommandPtr jemalloc_disable_profile = std::make_shared<JemallocDisableProfile>(keeper_dispatcher);
factory.registerCommand(jemalloc_disable_profile); factory.registerCommand(jemalloc_disable_profile);
#endif #endif
FourLetterCommandPtr profile_events_command = std::make_shared<ProfileEventsCommand>(keeper_dispatcher);
factory.registerCommand(profile_events_command);
factory.initializeAllowList(keeper_dispatcher); factory.initializeAllowList(keeper_dispatcher);
factory.setInitialize(true); factory.setInitialize(true);
@ -561,6 +570,12 @@ String LogInfoCommand::run()
append("leader_committed_log_idx", log_info.leader_committed_log_idx); append("leader_committed_log_idx", log_info.leader_committed_log_idx);
append("target_committed_log_idx", log_info.target_committed_log_idx); append("target_committed_log_idx", log_info.target_committed_log_idx);
append("last_snapshot_idx", log_info.last_snapshot_idx); append("last_snapshot_idx", log_info.last_snapshot_idx);
append("latest_logs_cache_entries", log_info.latest_logs_cache_entries);
append("latest_logs_cache_size", log_info.latest_logs_cache_size);
append("commit_logs_cache_entries", log_info.commit_logs_cache_entries);
append("commit_logs_cache_size", log_info.commit_logs_cache_size);
return ret.str(); return ret.str();
} }
@ -644,4 +659,31 @@ String JemallocDisableProfile::run()
} }
#endif #endif
/// Implements the "pfev" four letter command: dumps every Keeper-related
/// profile event as a "<name>\t<value>\t<documentation>" line.
/// Without NuRaft support the command produces an empty response.
String ProfileEventsCommand::run()
{
    StringBuffer ret;

#if USE_NURAFT
    for (auto event_index : ProfileEvents::keeper_profile_events)
    {
        const auto event = static_cast<ProfileEvents::Event>(event_index);
        /// Relaxed load is enough: this is a monitoring snapshot, not a synchronization point.
        const uint64_t current_value = ProfileEvents::global_counters[event_index].load(std::memory_order_relaxed);

        writeText(std::string{ProfileEvents::getName(event)}, ret);
        writeText('\t', ret);
        writeText(std::to_string(current_value), ret);
        writeText('\t', ret);
        writeText(std::string{ProfileEvents::getDocumentation(event)}, ret);
        writeText('\n', ret);
    }
#endif

    return ret.str();
}
} }

View File

@ -1,18 +1,19 @@
#pragma once #pragma once
#include <sstream> #include "config.h"
#include <string>
#include <unordered_map> #include <unordered_map>
#include <string>
#include <Coordination/KeeperDispatcher.h> #include <boost/noncopyable.hpp>
#include <IO/WriteBufferFromString.h>
#include <Common/config_version.h>
namespace DB namespace DB
{ {
class WriteBufferFromOwnString;
class KeeperDispatcher;
using String = std::string;
struct IFourLetterCommand; struct IFourLetterCommand;
using FourLetterCommandPtr = std::shared_ptr<DB::IFourLetterCommand>; using FourLetterCommandPtr = std::shared_ptr<DB::IFourLetterCommand>;
@ -479,4 +480,16 @@ struct JemallocDisableProfile : public IFourLetterCommand
}; };
#endif #endif
/// Four letter command "pfev": prints the current values of all Keeper
/// profile event counters together with their documentation (see run()
/// implementation in FourLetterCommand.cpp).
struct ProfileEventsCommand : public IFourLetterCommand
{
    explicit ProfileEventsCommand(KeeperDispatcher & keeper_dispatcher_)
        : IFourLetterCommand(keeper_dispatcher_)
    {
    }

    String name() override { return "pfev"; }
    String run() override;
    ~ProfileEventsCommand() override = default;
};
} }

View File

@ -191,4 +191,10 @@ bool InMemoryLogStore::compact(uint64_t last_log_index)
return true; return true;
} }
/// nuraft::log_store interface: returns true only when an entry exists at
/// `index` and it is a cluster configuration entry (nuraft::conf).
bool InMemoryLogStore::is_conf(uint64_t index)
{
    if (const auto entry = entry_at(index); entry != nullptr)
        return entry->get_val_type() == nuraft::conf;
    return false;
}
} }

View File

@ -39,6 +39,8 @@ public:
bool flush() override { return true; } bool flush() override { return true; }
bool is_conf(uint64_t index) override;
private: private:
std::map<uint64_t, nuraft::ptr<nuraft::log_entry>> logs TSA_GUARDED_BY(logs_lock); std::map<uint64_t, nuraft::ptr<nuraft::log_entry>> logs TSA_GUARDED_BY(logs_lock);
mutable std::mutex logs_lock; mutable std::mutex logs_lock;

View File

@ -52,16 +52,16 @@ struct Keeper4LWInfo
struct KeeperLogInfo struct KeeperLogInfo
{ {
/// My first log index in log store. /// My first log index in log store.
uint64_t first_log_idx; uint64_t first_log_idx{0};
/// My first log term. /// My first log term.
uint64_t first_log_term; uint64_t first_log_term{0};
/// My last log index in log store. /// My last log index in log store.
uint64_t last_log_idx; uint64_t last_log_idx{0};
/// My last log term. /// My last log term.
uint64_t last_log_term; uint64_t last_log_term{0};
/// My last committed log index in state machine. /// My last committed log index in state machine.
uint64_t last_committed_log_idx; uint64_t last_committed_log_idx;
@ -74,6 +74,12 @@ struct KeeperLogInfo
/// The largest committed log index in last snapshot. /// The largest committed log index in last snapshot.
uint64_t last_snapshot_idx; uint64_t last_snapshot_idx;
uint64_t latest_logs_cache_entries;
uint64_t latest_logs_cache_size;
uint64_t commit_logs_cache_entries;
uint64_t commit_logs_cache_size;
}; };
} }

View File

@ -20,7 +20,6 @@ void updateKeeperInformation(KeeperDispatcher & keeper_dispatcher, AsynchronousM
size_t ephemerals_count = 0; size_t ephemerals_count = 0;
size_t approximate_data_size = 0; size_t approximate_data_size = 0;
size_t key_arena_size = 0; size_t key_arena_size = 0;
size_t latest_snapshot_size = 0;
size_t open_file_descriptor_count = 0; size_t open_file_descriptor_count = 0;
std::optional<size_t> max_file_descriptor_count = 0; std::optional<size_t> max_file_descriptor_count = 0;
size_t followers = 0; size_t followers = 0;
@ -46,11 +45,8 @@ void updateKeeperInformation(KeeperDispatcher & keeper_dispatcher, AsynchronousM
ephemerals_count = state_machine.getTotalEphemeralNodesCount(); ephemerals_count = state_machine.getTotalEphemeralNodesCount();
approximate_data_size = state_machine.getApproximateDataSize(); approximate_data_size = state_machine.getApproximateDataSize();
key_arena_size = state_machine.getKeyArenaSize(); key_arena_size = state_machine.getKeyArenaSize();
latest_snapshot_size = state_machine.getLatestSnapshotBufSize();
session_with_watches = state_machine.getSessionsWithWatchesCount(); session_with_watches = state_machine.getSessionsWithWatchesCount();
paths_watched = state_machine.getWatchedPathsCount(); paths_watched = state_machine.getWatchedPathsCount();
//snapshot_dir_size = keeper_dispatcher.getSnapDirSize();
//log_dir_size = keeper_dispatcher.getLogDirSize();
# if defined(__linux__) || defined(__APPLE__) # if defined(__linux__) || defined(__APPLE__)
open_file_descriptor_count = getCurrentProcessFDCount(); open_file_descriptor_count = getCurrentProcessFDCount();
@ -76,7 +72,9 @@ void updateKeeperInformation(KeeperDispatcher & keeper_dispatcher, AsynchronousM
new_values["KeeperApproximateDataSize"] = { approximate_data_size, "The approximate data size of ClickHouse Keeper, in bytes." }; new_values["KeeperApproximateDataSize"] = { approximate_data_size, "The approximate data size of ClickHouse Keeper, in bytes." };
new_values["KeeperKeyArenaSize"] = { key_arena_size, "The size in bytes of the memory arena for keys in ClickHouse Keeper." }; new_values["KeeperKeyArenaSize"] = { key_arena_size, "The size in bytes of the memory arena for keys in ClickHouse Keeper." };
new_values["KeeperLatestSnapshotSize"] = { latest_snapshot_size, "The uncompressed size in bytes of the latest snapshot created by ClickHouse Keeper." }; /// TODO: value was incorrectly set to 0 previously for local snapshots
/// it needs to be fixed and it needs to be atomic to avoid deadlock
///new_values["KeeperLatestSnapshotSize"] = { latest_snapshot_size, "The uncompressed size in bytes of the latest snapshot created by ClickHouse Keeper." };
new_values["KeeperOpenFileDescriptorCount"] = { open_file_descriptor_count, "The number of open file descriptors in ClickHouse Keeper." }; new_values["KeeperOpenFileDescriptorCount"] = { open_file_descriptor_count, "The number of open file descriptors in ClickHouse Keeper." };
if (max_file_descriptor_count.has_value()) if (max_file_descriptor_count.has_value())
@ -99,6 +97,12 @@ void updateKeeperInformation(KeeperDispatcher & keeper_dispatcher, AsynchronousM
new_values["KeeperTargetCommitLogIdx"] = { keeper_log_info.target_committed_log_idx, "Index until which logs can be committed in ClickHouse Keeper." }; new_values["KeeperTargetCommitLogIdx"] = { keeper_log_info.target_committed_log_idx, "Index until which logs can be committed in ClickHouse Keeper." };
new_values["KeeperLastSnapshotIdx"] = { keeper_log_info.last_snapshot_idx, "Index of the last log present in the last created snapshot." }; new_values["KeeperLastSnapshotIdx"] = { keeper_log_info.last_snapshot_idx, "Index of the last log present in the last created snapshot." };
new_values["KeeperLatestLogsCacheEntries"] = {keeper_log_info.latest_logs_cache_entries, "Number of entries stored in the in-memory cache for latest logs"};
new_values["KeeperLatestLogsCacheSize"] = {keeper_log_info.latest_logs_cache_size, "Total size of in-memory cache for latest logs"};
new_values["KeeperCommitLogsCacheEntries"] = {keeper_log_info.commit_logs_cache_entries, "Number of entries stored in the in-memory cache for next logs to be committed"};
new_values["KeeperCommitLogsCacheSize"] = {keeper_log_info.commit_logs_cache_size, "Total size of in-memory cache for next logs to be committed"};
auto & keeper_connection_stats = keeper_dispatcher.getKeeperConnectionStats(); auto & keeper_connection_stats = keeper_dispatcher.getKeeperConnectionStats();
new_values["KeeperMinLatency"] = { keeper_connection_stats.getMinLatency(), "Minimal request latency of ClickHouse Keeper." }; new_values["KeeperMinLatency"] = { keeper_connection_stats.getMinLatency(), "Minimal request latency of ClickHouse Keeper." };

View File

@ -0,0 +1,122 @@
#include <Coordination/KeeperCommon.h>
#include <string>
#include <filesystem>
#include <Common/logger_useful.h>
#include <Disks/IDisk.h>
#include <Coordination/KeeperContext.h>
#include <Coordination/CoordinationSettings.h>
namespace DB
{
/// Index of the last '/' inside `path`, or std::string::npos when the path
/// is empty or contains no slash at all.
static size_t findLastSlash(StringRef path)
{
    size_t pos = path.size;
    while (pos > 0)
    {
        --pos;
        if (path.data[pos] == '/')
            return pos;
    }
    return std::string::npos;
}
/// Parent path of a Keeper node path: everything before the last '/'.
/// "/a/b" -> "/a", "/a" -> "/", "/" -> "/".
///
/// Fix: the previous check `rslash_pos > 0` was also true for
/// std::string::npos (a path with no slash at all), which produced a
/// StringRef spanning SIZE_MAX bytes. Such inputs now fall back to "/",
/// matching the behavior for top-level nodes.
StringRef parentNodePath(StringRef path)
{
    const auto rslash_pos = findLastSlash(path);
    if (rslash_pos == std::string::npos || rslash_pos == 0)
        return "/";
    return StringRef{path.data, rslash_pos};
}
/// Last component of a Keeper node path: "/a/b" -> "b".
/// NOTE(review): when the path contains no '/', findLastSlash() returns
/// std::string::npos and this relies on unsigned wrap-around
/// (npos + 1 == 0, size - npos - 1 == size) to return the whole path;
/// the intermediate pointer arithmetic is technically out of range —
/// presumably callers always pass absolute paths, TODO confirm.
StringRef getBaseNodeName(StringRef path)
{
    size_t basename_start = findLastSlash(path);
    return StringRef{path.data + basename_start + 1, path.size - basename_start - 1};
}
/// Moves `path_from` on `disk_from` to `path_to` on `disk_to`, surviving
/// transient disk failures by retrying each step.
///
/// Steps, in order: (1) create an empty marker file "tmp_<name>" on the
/// destination disk, (2) copy the file, (3) remove the marker, (4) run
/// `before_file_remove_op` (optional caller hook, executed once the copy is
/// durable but before the source disappears), (5) remove the source file.
///
/// Retry policy: each step is retried with `disk_move_retries_wait_ms` sleeps
/// until it succeeds or shutdown is requested; during server INIT phase the
/// number of retries per step is additionally capped by
/// `disk_move_retries_during_init`. On giving up, the error is logged and the
/// function returns without running the remaining steps.
void moveFileBetweenDisks(
    DiskPtr disk_from,
    const std::string & path_from,
    DiskPtr disk_to,
    const std::string & path_to,
    std::function<void()> before_file_remove_op,
    LoggerPtr logger,
    const KeeperContextPtr & keeper_context)
{
    LOG_TRACE(logger, "Moving {} to {} from disk {} to disk {}", path_from, path_to, disk_from->getName(), disk_to->getName());
    /// we use empty file with prefix tmp_ to detect incomplete copies
    /// if a copy is complete we don't care from which disk we use the same file
    /// so it's okay if a failure happens after removing of tmp file but before we remove
    /// the file from the source disk
    auto from_path = fs::path(path_from);
    auto tmp_file_name = from_path.parent_path() / (std::string{tmp_keeper_file_prefix} + from_path.filename().string());

    const auto & coordination_settings = keeper_context->getCoordinationSettings();
    auto max_retries_on_init = coordination_settings->disk_move_retries_during_init.value;
    auto retries_sleep = std::chrono::milliseconds(coordination_settings->disk_move_retries_wait_ms);
    /// Runs `op`, retrying on any exception; returns true on success,
    /// false when retries were exhausted (INIT phase) or shutdown started.
    auto run_with_retries = [&](const auto & op, std::string_view operation_description)
    {
        size_t retry_num = 0;
        do
        {
            try
            {
                op();
                return true;
            }
            catch (...)
            {
                tryLogCurrentException(
                    logger,
                    fmt::format(
                        "While moving file {} to disk {} and running '{}'", path_from, disk_to->getName(), operation_description));
                std::this_thread::sleep_for(retries_sleep);
            }

            ++retry_num;
            /// During initialization we must not block startup forever, so the
            /// retry count is bounded; after INIT we only stop on shutdown.
            if (keeper_context->getServerState() == KeeperContext::Phase::INIT && retry_num == max_retries_on_init)
            {
                LOG_ERROR(logger, "Operation '{}' failed too many times", operation_description);
                break;
            }
        } while (!keeper_context->isShutdownCalled());

        LOG_ERROR(
            logger,
            "Failed to run '{}' while moving file {} to disk {}",
            operation_description,
            path_from,
            disk_to->getName());
        return false;
    };

    if (!run_with_retries(
            [&]
            {
                auto buf = disk_to->writeFile(tmp_file_name);
                buf->finalize();
            },
            "creating temporary file"))
        return;

    if (!run_with_retries([&] { disk_from->copyFile(from_path, *disk_to, path_to, {}); }, "copying file"))
        return;

    if (!run_with_retries([&] { disk_to->removeFileIfExists(tmp_file_name); }, "removing temporary file"))
        return;

    if (before_file_remove_op)
        before_file_remove_op();

    if (!run_with_retries([&] { disk_from->removeFileIfExists(path_from); }, "removing file from source disk"))
        return;
}
}

View File

@ -0,0 +1,29 @@
#pragma once
#include <functional>
#include <memory>
#include <string>
#include <string_view>

#include <base/StringRef.h>
#include "Common/Logger.h"
namespace DB
{

class IDisk;
using DiskPtr = std::shared_ptr<IDisk>;

class KeeperContext;
using KeeperContextPtr = std::shared_ptr<KeeperContext>;

/// Everything before the last '/' of a Keeper node path ("/a/b" -> "/a").
StringRef parentNodePath(StringRef path);

/// Last component of a Keeper node path ("/a/b" -> "b").
StringRef getBaseNodeName(StringRef path);

/// Prefix of the empty marker files used to detect half-finished file moves
/// between disks. Plain `inline constexpr` (the redundant `static` was
/// dropped: at namespace scope it forced internal linkage, giving every
/// translation unit its own copy and defeating the purpose of `inline`).
inline constexpr std::string_view tmp_keeper_file_prefix = "tmp_";

/// Moves `path_from` on `disk_from` to `path_to` on `disk_to` with retries;
/// `before_file_remove_op` (may be empty) runs after the copy is complete but
/// before the source file is removed. See KeeperCommon.cpp for the retry policy.
void moveFileBetweenDisks(
    DiskPtr disk_from,
    const std::string & path_from,
    DiskPtr disk_to,
    const std::string & path_to,
    std::function<void()> before_file_remove_op,
    LoggerPtr logger,
    const KeeperContextPtr & keeper_context);

}

View File

@ -284,7 +284,12 @@
M(InterfaceMySQLSendBytes) \ M(InterfaceMySQLSendBytes) \
M(InterfaceMySQLReceiveBytes) \ M(InterfaceMySQLReceiveBytes) \
M(InterfacePostgreSQLSendBytes) \ M(InterfacePostgreSQLSendBytes) \
M(InterfacePostgreSQLReceiveBytes) M(InterfacePostgreSQLReceiveBytes) \
\
M(KeeperLogsEntryReadFromLatestCache) \
M(KeeperLogsEntryReadFromCommitCache) \
M(KeeperLogsEntryReadFromFile) \
M(KeeperLogsPrefetchedEntries) \
namespace ProfileEvents namespace ProfileEvents
{ {

View File

@ -1,14 +1,16 @@
#include <Coordination/KeeperContext.h> #include <Coordination/KeeperContext.h>
#include <Coordination/Defines.h> #include <Coordination/Defines.h>
#include <Disks/DiskLocal.h>
#include <Interpreters/Context.h>
#include <IO/S3/Credentials.h>
#include <Poco/Util/AbstractConfiguration.h>
#include <Coordination/KeeperConstants.h> #include <Coordination/KeeperConstants.h>
#include <Common/logger_useful.h>
#include <Server/CloudPlacementInfo.h> #include <Server/CloudPlacementInfo.h>
#include <Coordination/KeeperFeatureFlags.h> #include <Coordination/KeeperFeatureFlags.h>
#include <Disks/DiskLocal.h>
#include <Disks/DiskSelector.h>
#include <IO/S3/Credentials.h>
#include <Interpreters/Context.h>
#include <Poco/Util/AbstractConfiguration.h>
#include <Common/logger_useful.h>
#include <boost/algorithm/string.hpp> #include <boost/algorithm/string.hpp>
namespace DB namespace DB
@ -21,9 +23,10 @@ extern const int BAD_ARGUMENTS;
} }
KeeperContext::KeeperContext(bool standalone_keeper_) KeeperContext::KeeperContext(bool standalone_keeper_, CoordinationSettingsPtr coordination_settings_)
: disk_selector(std::make_shared<DiskSelector>()) : disk_selector(std::make_shared<DiskSelector>())
, standalone_keeper(standalone_keeper_) , standalone_keeper(standalone_keeper_)
, coordination_settings(std::move(coordination_settings_))
{ {
/// enable by default some feature flags /// enable by default some feature flags
feature_flags.enableFeatureFlag(KeeperFeatureFlag::FILTERED_LIST); feature_flags.enableFeatureFlag(KeeperFeatureFlag::FILTERED_LIST);
@ -402,4 +405,9 @@ void KeeperContext::waitLocalLogsPreprocessedOrShutdown()
local_logs_preprocessed_cv.wait(lock, [this]{ return shutdown_called || local_logs_preprocessed; }); local_logs_preprocessed_cv.wait(lock, [this]{ return shutdown_called || local_logs_preprocessed; });
} }
/// Coordination settings captured at construction; the returned reference
/// stays valid for the lifetime of this KeeperContext.
const CoordinationSettingsPtr & KeeperContext::getCoordinationSettings() const
{
    return coordination_settings;
}
} }

View File

@ -1,8 +1,7 @@
#pragma once #pragma once
#include <Coordination/KeeperFeatureFlags.h> #include <Coordination/KeeperFeatureFlags.h>
#include <Disks/DiskSelector.h>
#include <IO/WriteBufferFromString.h>
#include <Poco/Util/AbstractConfiguration.h> #include <Poco/Util/AbstractConfiguration.h>
#include <atomic>
#include <condition_variable> #include <condition_variable>
#include <cstdint> #include <cstdint>
#include <memory> #include <memory>
@ -12,10 +11,19 @@ namespace DB
class KeeperDispatcher; class KeeperDispatcher;
struct CoordinationSettings;
using CoordinationSettingsPtr = std::shared_ptr<CoordinationSettings>;
class DiskSelector;
class IDisk;
using DiskPtr = std::shared_ptr<IDisk>;
class WriteBufferFromOwnString;
class KeeperContext class KeeperContext
{ {
public: public:
explicit KeeperContext(bool standalone_keeper_); KeeperContext(bool standalone_keeper_, CoordinationSettingsPtr coordination_settings_);
enum class Phase : uint8_t enum class Phase : uint8_t
{ {
@ -68,6 +76,24 @@ public:
void waitLocalLogsPreprocessedOrShutdown(); void waitLocalLogsPreprocessedOrShutdown();
/// Index of the last log entry committed to the state machine.
/// Relaxed load: callers only need a monitoring/progress snapshot.
uint64_t lastCommittedIndex() const
{
    return last_committed_log_idx.load(std::memory_order_relaxed);
}
/// Publishes a new last-committed log index and wakes every thread blocked
/// in waitLastCommittedIndexUpdated() (std::atomic notify_all).
void setLastCommitIndex(uint64_t commit_index)
{
    last_committed_log_idx.store(commit_index, std::memory_order_relaxed);
    last_committed_log_idx.notify_all();
}
/// Blocks while last_committed_log_idx still equals
/// current_last_committed_idx, i.e. until another thread publishes a
/// different value via setLastCommitIndex() (std::atomic wait).
void waitLastCommittedIndexUpdated(uint64_t current_last_committed_idx)
{
    last_committed_log_idx.wait(current_last_committed_idx, std::memory_order_relaxed);
}
const CoordinationSettingsPtr & getCoordinationSettings() const;
private: private:
/// local disk defined using path or disk name /// local disk defined using path or disk name
using Storage = std::variant<DiskPtr, std::string>; using Storage = std::variant<DiskPtr, std::string>;
@ -89,7 +115,7 @@ private:
std::atomic<bool> local_logs_preprocessed = false; std::atomic<bool> local_logs_preprocessed = false;
std::atomic<bool> shutdown_called = false; std::atomic<bool> shutdown_called = false;
Phase server_state{Phase::INIT}; std::atomic<Phase> server_state{Phase::INIT};
bool ignore_system_path_on_startup{false}; bool ignore_system_path_on_startup{false};
bool digest_enabled{true}; bool digest_enabled{true};
@ -113,6 +139,10 @@ private:
KeeperDispatcher * dispatcher{nullptr}; KeeperDispatcher * dispatcher{nullptr};
std::atomic<UInt64> memory_soft_limit = 0; std::atomic<UInt64> memory_soft_limit = 0;
std::atomic<UInt64> last_committed_log_idx = 0;
CoordinationSettingsPtr coordination_settings;
}; };
using KeeperContextPtr = std::shared_ptr<KeeperContext>; using KeeperContextPtr = std::shared_ptr<KeeperContext>;

View File

@ -256,11 +256,11 @@ void KeeperDispatcher::requestThread()
if (shutdown_called) if (shutdown_called)
return; return;
auto current_last_committed_idx = our_last_committed_log_idx.load(std::memory_order_relaxed); auto current_last_committed_idx = keeper_context->lastCommittedIndex();
if (current_last_committed_idx >= log_idx) if (current_last_committed_idx >= log_idx)
break; break;
our_last_committed_log_idx.wait(current_last_committed_idx); keeper_context->waitLastCommittedIndexUpdated(current_last_committed_idx);
} }
} }
} }
@ -414,8 +414,8 @@ void KeeperDispatcher::initialize(const Poco::Util::AbstractConfiguration & conf
{ {
LOG_DEBUG(log, "Initializing storage dispatcher"); LOG_DEBUG(log, "Initializing storage dispatcher");
keeper_context = std::make_shared<KeeperContext>(standalone_keeper);
configuration_and_settings = KeeperConfigurationAndSettings::loadFromConfig(config, standalone_keeper); configuration_and_settings = KeeperConfigurationAndSettings::loadFromConfig(config, standalone_keeper);
keeper_context = std::make_shared<KeeperContext>(standalone_keeper, configuration_and_settings->coordination_settings);
keeper_context->initialize(config, this); keeper_context->initialize(config, this);
@ -433,7 +433,7 @@ void KeeperDispatcher::initialize(const Poco::Util::AbstractConfiguration & conf
snapshots_queue, snapshots_queue,
keeper_context, keeper_context,
snapshot_s3, snapshot_s3,
[this](uint64_t log_idx, const KeeperStorage::RequestForSession & request_for_session) [this](uint64_t /*log_idx*/, const KeeperStorage::RequestForSession & request_for_session)
{ {
{ {
/// check if we have queue of read requests depending on this request to be committed /// check if we have queue of read requests depending on this request to be committed
@ -457,9 +457,6 @@ void KeeperDispatcher::initialize(const Poco::Util::AbstractConfiguration & conf
} }
} }
} }
our_last_committed_log_idx.store(log_idx, std::memory_order_relaxed);
our_last_committed_log_idx.notify_all();
}); });
try try
@ -504,8 +501,9 @@ void KeeperDispatcher::shutdown()
LOG_DEBUG(log, "Shutting down storage dispatcher"); LOG_DEBUG(log, "Shutting down storage dispatcher");
our_last_committed_log_idx = std::numeric_limits<uint64_t>::max(); /// some threads can be waiting for certain commits, so we set value
our_last_committed_log_idx.notify_all(); /// of the last commit index to something that will always unblock
keeper_context->setLastCommitIndex(std::numeric_limits<uint64_t>::max());
if (session_cleaner_thread.joinable()) if (session_cleaner_thread.joinable())
session_cleaner_thread.join(); session_cleaner_thread.join();

View File

@ -105,8 +105,6 @@ private:
public: public:
std::mutex read_request_queue_mutex; std::mutex read_request_queue_mutex;
std::atomic<uint64_t> our_last_committed_log_idx = 0;
/// queue of read requests that can be processed after a request with specific session ID and XID is committed /// queue of read requests that can be processed after a request with specific session ID and XID is committed
std::unordered_map<int64_t, std::unordered_map<Coordination::XID, KeeperStorage::RequestsForSessions>> read_request_queue; std::unordered_map<int64_t, std::unordered_map<Coordination::XID, KeeperStorage::RequestsForSessions>> read_request_queue;

View File

@ -66,13 +66,16 @@ nuraft::ptr<nuraft::log_entry> KeeperLogStore::entry_at(uint64_t index)
return changelog.entryAt(index); return changelog.entryAt(index);
} }
/// nuraft::log_store interface: whether the entry at `index` is a cluster
/// configuration entry. Delegates to Changelog under the changelog lock.
bool KeeperLogStore::is_conf(uint64_t index)
{
    std::lock_guard lock(changelog_lock);
    return changelog.isConfigLog(index);
}
uint64_t KeeperLogStore::term_at(uint64_t index) uint64_t KeeperLogStore::term_at(uint64_t index)
{ {
std::lock_guard lock(changelog_lock); std::lock_guard lock(changelog_lock);
auto entry = changelog.entryAt(index); return changelog.termAt(index);
if (entry)
return entry->get_term();
return 0;
} }
nuraft::ptr<nuraft::buffer> KeeperLogStore::pack(uint64_t index, int32_t cnt) nuraft::ptr<nuraft::buffer> KeeperLogStore::pack(uint64_t index, int32_t cnt)
@ -145,4 +148,10 @@ void KeeperLogStore::setRaftServer(const nuraft::ptr<nuraft::raft_server> & raft
return changelog.setRaftServer(raft_server); return changelog.setRaftServer(raft_server);
} }
/// Fills `log_info` with log-store statistics (indices and in-memory log
/// cache counters) taken from Changelog under the changelog lock.
void KeeperLogStore::getKeeperLogInfo(KeeperLogInfo & log_info) const
{
    std::lock_guard lock(changelog_lock);
    changelog.getKeeperLogInfo(log_info);
}
} }

View File

@ -1,10 +1,10 @@
#pragma once #pragma once
#include <libnuraft/log_store.hxx> #include <libnuraft/log_store.hxx>
#include <map>
#include <mutex> #include <mutex>
#include <Core/Types.h> #include <Core/Types.h>
#include <Coordination/Changelog.h> #include <Coordination/Changelog.h>
#include <Coordination/KeeperContext.h> #include <Coordination/KeeperContext.h>
#include <Coordination/Keeper4LWInfo.h>
#include <base/defines.h> #include <base/defines.h>
namespace DB namespace DB
@ -38,6 +38,8 @@ public:
/// Return entry at index /// Return entry at index
nuraft::ptr<nuraft::log_entry> entry_at(uint64_t index) override; nuraft::ptr<nuraft::log_entry> entry_at(uint64_t index) override;
bool is_conf(uint64_t index) override;
/// Term if the index /// Term if the index
uint64_t term_at(uint64_t index) override; uint64_t term_at(uint64_t index) override;
@ -72,6 +74,8 @@ public:
void setRaftServer(const nuraft::ptr<nuraft::raft_server> & raft_server); void setRaftServer(const nuraft::ptr<nuraft::raft_server> & raft_server);
void getKeeperLogInfo(KeeperLogInfo & log_info) const;
private: private:
mutable std::mutex changelog_lock; mutable std::mutex changelog_lock;
LoggerPtr log; LoggerPtr log;

View File

@ -6,6 +6,7 @@
#include <chrono> #include <chrono>
#include <mutex> #include <mutex>
#include <string> #include <string>
#include <Coordination/KeeperLogStore.h>
#include <Coordination/KeeperStateMachine.h> #include <Coordination/KeeperStateMachine.h>
#include <Coordination/KeeperStateManager.h> #include <Coordination/KeeperStateManager.h>
#include <Coordination/KeeperSnapshotManagerS3.h> #include <Coordination/KeeperSnapshotManagerS3.h>
@ -119,22 +120,20 @@ KeeperServer::KeeperServer(
KeeperSnapshotManagerS3 & snapshot_manager_s3, KeeperSnapshotManagerS3 & snapshot_manager_s3,
KeeperStateMachine::CommitCallback commit_callback) KeeperStateMachine::CommitCallback commit_callback)
: server_id(configuration_and_settings_->server_id) : server_id(configuration_and_settings_->server_id)
, coordination_settings(configuration_and_settings_->coordination_settings)
, log(getLogger("KeeperServer")) , log(getLogger("KeeperServer"))
, is_recovering(config.getBool("keeper_server.force_recovery", false)) , is_recovering(config.getBool("keeper_server.force_recovery", false))
, keeper_context{std::move(keeper_context_)} , keeper_context{std::move(keeper_context_)}
, create_snapshot_on_exit(config.getBool("keeper_server.create_snapshot_on_exit", true)) , create_snapshot_on_exit(config.getBool("keeper_server.create_snapshot_on_exit", true))
, enable_reconfiguration(config.getBool("keeper_server.enable_reconfiguration", false)) , enable_reconfiguration(config.getBool("keeper_server.enable_reconfiguration", false))
{ {
if (coordination_settings->quorum_reads) if (keeper_context->getCoordinationSettings()->quorum_reads)
LOG_WARNING(log, "Quorum reads enabled, Keeper will work slower."); LOG_WARNING(log, "Quorum reads enabled, Keeper will work slower.");
state_machine = nuraft::cs_new<KeeperStateMachine>( state_machine = nuraft::cs_new<KeeperStateMachine>(
responses_queue_, responses_queue_,
snapshots_queue_, snapshots_queue_,
coordination_settings,
keeper_context, keeper_context,
config.getBool("keeper_server.upload_snapshot_on_exit", true) ? &snapshot_manager_s3 : nullptr, config.getBool("keeper_server.upload_snapshot_on_exit", false) ? &snapshot_manager_s3 : nullptr,
commit_callback, commit_callback,
checkAndGetSuperdigest(configuration_and_settings_->super_digest)); checkAndGetSuperdigest(configuration_and_settings_->super_digest));
@ -143,7 +142,6 @@ KeeperServer::KeeperServer(
"keeper_server", "keeper_server",
"state", "state",
config, config,
coordination_settings,
keeper_context); keeper_context);
} }
@ -226,7 +224,7 @@ void KeeperServer::loadLatestConfig()
{ {
auto latest_snapshot_config = state_machine->getClusterConfig(); auto latest_snapshot_config = state_machine->getClusterConfig();
auto latest_log_store_config = state_manager->getLatestConfigFromLogStore(); auto latest_log_store_config = state_manager->getLatestConfigFromLogStore();
auto async_replication = coordination_settings->async_replication; auto async_replication = keeper_context->getCoordinationSettings()->async_replication;
if (latest_snapshot_config && latest_log_store_config) if (latest_snapshot_config && latest_log_store_config)
{ {
@ -293,6 +291,8 @@ void KeeperServer::forceRecovery()
void KeeperServer::launchRaftServer(const Poco::Util::AbstractConfiguration & config, bool enable_ipv6) void KeeperServer::launchRaftServer(const Poco::Util::AbstractConfiguration & config, bool enable_ipv6)
{ {
const auto & coordination_settings = keeper_context->getCoordinationSettings();
nuraft::raft_params params; nuraft::raft_params params;
params.parallel_log_appending_ = true; params.parallel_log_appending_ = true;
params.heart_beat_interval_ params.heart_beat_interval_
@ -332,7 +332,7 @@ void KeeperServer::launchRaftServer(const Poco::Util::AbstractConfiguration & co
params.auto_forwarding_req_timeout_ params.auto_forwarding_req_timeout_
= getValueOrMaxInt32AndLogWarning(coordination_settings->operation_timeout_ms.totalMilliseconds() * 2, "operation_timeout_ms", log); = getValueOrMaxInt32AndLogWarning(coordination_settings->operation_timeout_ms.totalMilliseconds() * 2, "operation_timeout_ms", log);
params.max_append_size_ params.max_append_size_
= getValueOrMaxInt32AndLogWarning(coordination_settings->max_requests_batch_size, "max_requests_batch_size", log); = getValueOrMaxInt32AndLogWarning(coordination_settings->max_requests_append_size, "max_requests_append_size", log);
params.return_method_ = nuraft::raft_params::async_handler; params.return_method_ = nuraft::raft_params::async_handler;
@ -427,6 +427,10 @@ void KeeperServer::startup(const Poco::Util::AbstractConfiguration & config, boo
{ {
state_machine->init(); state_machine->init();
keeper_context->setLastCommitIndex(state_machine->last_commit_index());
const auto & coordination_settings = keeper_context->getCoordinationSettings();
state_manager->loadLogStore(state_machine->last_commit_index() + 1, coordination_settings->reserved_log_items); state_manager->loadLogStore(state_machine->last_commit_index() + 1, coordination_settings->reserved_log_items);
auto log_store = state_manager->load_log_store(); auto log_store = state_manager->load_log_store();
@ -446,7 +450,7 @@ void KeeperServer::startup(const Poco::Util::AbstractConfiguration & config, boo
void KeeperServer::shutdownRaftServer() void KeeperServer::shutdownRaftServer()
{ {
size_t timeout = coordination_settings->shutdown_timeout.totalSeconds(); size_t timeout = keeper_context->getCoordinationSettings()->shutdown_timeout.totalSeconds();
if (!raft_instance) if (!raft_instance)
{ {
@ -870,7 +874,7 @@ nuraft::cb_func::ReturnCode KeeperServer::callbackFunc(nuraft::cb_func::Type typ
/// Node first became leader, and after that some other node became leader. /// Node first became leader, and after that some other node became leader.
/// BecameFresh for this node will not be called because it was already fresh /// BecameFresh for this node will not be called because it was already fresh
/// when it was leader. /// when it was leader.
if (leader_index < our_index + coordination_settings->fresh_log_gap) if (leader_index < our_index + keeper_context->getCoordinationSettings()->fresh_log_gap)
set_initialized(); set_initialized();
} }
return nuraft::cb_func::ReturnCode::Ok; return nuraft::cb_func::ReturnCode::Ok;
@ -905,7 +909,7 @@ void KeeperServer::waitInit()
{ {
std::unique_lock lock(initialized_mutex); std::unique_lock lock(initialized_mutex);
int64_t timeout = coordination_settings->startup_timeout.totalMilliseconds(); int64_t timeout = keeper_context->getCoordinationSettings()->startup_timeout.totalMilliseconds();
if (!initialized_cv.wait_for(lock, std::chrono::milliseconds(timeout), [&] { return initialized_flag.load(); })) if (!initialized_cv.wait_for(lock, std::chrono::milliseconds(timeout), [&] { return initialized_flag.load(); }))
LOG_WARNING(log, "Failed to wait for RAFT initialization in {}ms, will continue in background", timeout); LOG_WARNING(log, "Failed to wait for RAFT initialization in {}ms, will continue in background", timeout);
} }
@ -977,6 +981,7 @@ KeeperServer::ConfigUpdateState KeeperServer::applyConfigUpdate(
ClusterUpdateActions KeeperServer::getRaftConfigurationDiff(const Poco::Util::AbstractConfiguration & config) ClusterUpdateActions KeeperServer::getRaftConfigurationDiff(const Poco::Util::AbstractConfiguration & config)
{ {
const auto & coordination_settings = keeper_context->getCoordinationSettings();
auto diff = state_manager->getRaftConfigurationDiff(config, coordination_settings); auto diff = state_manager->getRaftConfigurationDiff(config, coordination_settings);
if (!diff.empty()) if (!diff.empty())
@ -1004,6 +1009,7 @@ void KeeperServer::applyConfigUpdateWithReconfigDisabled(const ClusterUpdateActi
std::this_thread::sleep_for(sleep_time * (i + 1)); std::this_thread::sleep_for(sleep_time * (i + 1));
}; };
const auto & coordination_settings = keeper_context->getCoordinationSettings();
if (const auto * add = std::get_if<AddRaftServer>(&action)) if (const auto * add = std::get_if<AddRaftServer>(&action))
{ {
for (size_t i = 0; i < coordination_settings->configuration_change_tries_count && !is_recovering; ++i) for (size_t i = 0; i < coordination_settings->configuration_change_tries_count && !is_recovering; ++i)
@ -1059,6 +1065,7 @@ bool KeeperServer::waitForConfigUpdateWithReconfigDisabled(const ClusterUpdateAc
auto became_leader = [&] { LOG_INFO(log, "Became leader, aborting"); return false; }; auto became_leader = [&] { LOG_INFO(log, "Became leader, aborting"); return false; };
auto backoff = [&](size_t i) { std::this_thread::sleep_for(sleep_time * (i + 1)); }; auto backoff = [&](size_t i) { std::this_thread::sleep_for(sleep_time * (i + 1)); };
const auto & coordination_settings = keeper_context->getCoordinationSettings();
if (const auto* add = std::get_if<AddRaftServer>(&action)) if (const auto* add = std::get_if<AddRaftServer>(&action))
{ {
for (size_t i = 0; i < coordination_settings->configuration_change_tries_count && !is_recovering; ++i) for (size_t i = 0; i < coordination_settings->configuration_change_tries_count && !is_recovering; ++i)
@ -1125,14 +1132,12 @@ KeeperLogInfo KeeperServer::getKeeperLogInfo()
auto log_store = state_manager->load_log_store(); auto log_store = state_manager->load_log_store();
if (log_store) if (log_store)
{ {
log_info.first_log_idx = log_store->start_index(); const auto & keeper_log_storage = static_cast<const KeeperLogStore &>(*log_store);
log_info.first_log_term = log_store->term_at(log_info.first_log_idx); keeper_log_storage.getKeeperLogInfo(log_info);
} }
if (raft_instance) if (raft_instance)
{ {
log_info.last_log_idx = raft_instance->get_last_log_idx();
log_info.last_log_term = raft_instance->get_last_log_term();
log_info.last_committed_log_idx = raft_instance->get_committed_log_idx(); log_info.last_committed_log_idx = raft_instance->get_committed_log_idx();
log_info.leader_committed_log_idx = raft_instance->get_leader_committed_log_idx(); log_info.leader_committed_log_idx = raft_instance->get_leader_committed_log_idx();
log_info.target_committed_log_idx = raft_instance->get_target_committed_log_idx(); log_info.target_committed_log_idx = raft_instance->get_target_committed_log_idx();

View File

@ -22,8 +22,6 @@ class KeeperServer
private: private:
const int server_id; const int server_id;
CoordinationSettingsPtr coordination_settings;
nuraft::ptr<KeeperStateMachine> state_machine; nuraft::ptr<KeeperStateMachine> state_machine;
nuraft::ptr<KeeperStateManager> state_manager; nuraft::ptr<KeeperStateManager> state_manager;

View File

@ -3,6 +3,7 @@
#include <Coordination/KeeperSnapshotManager.h> #include <Coordination/KeeperSnapshotManager.h>
#include <Coordination/ReadBufferFromNuraftBuffer.h> #include <Coordination/ReadBufferFromNuraftBuffer.h>
#include <Coordination/WriteBufferFromNuraftBuffer.h> #include <Coordination/WriteBufferFromNuraftBuffer.h>
#include <Coordination/CoordinationSettings.h>
#include <IO/ReadBufferFromFile.h> #include <IO/ReadBufferFromFile.h>
#include <IO/ReadHelpers.h> #include <IO/ReadHelpers.h>
#include <IO/WriteBufferFromFile.h> #include <IO/WriteBufferFromFile.h>
@ -13,7 +14,7 @@
#include <memory> #include <memory>
#include <Common/logger_useful.h> #include <Common/logger_useful.h>
#include <Coordination/KeeperContext.h> #include <Coordination/KeeperContext.h>
#include <Coordination/pathUtils.h> #include <Coordination/KeeperCommon.h>
#include <Coordination/KeeperConstants.h> #include <Coordination/KeeperConstants.h>
#include <Common/ZooKeeper/ZooKeeperCommon.h> #include <Common/ZooKeeper/ZooKeeperCommon.h>
#include <Core/Field.h> #include <Core/Field.h>
@ -32,23 +33,21 @@ namespace ErrorCodes
namespace namespace
{ {
constexpr std::string_view tmp_prefix = "tmp_"; void moveSnapshotBetweenDisks(
DiskPtr disk_from,
void moveFileBetweenDisks(DiskPtr disk_from, const std::string & path_from, DiskPtr disk_to, const std::string & path_to) const std::string & path_from,
DiskPtr disk_to,
const std::string & path_to,
const KeeperContextPtr & keeper_context)
{ {
/// we use empty file with prefix tmp_ to detect incomplete copies moveFileBetweenDisks(
/// if a copy is complete we don't care from which disk we use the same file std::move(disk_from),
/// so it's okay if a failure happens after removing of tmp file but before we remove path_from,
/// the snapshot from the source disk std::move(disk_to),
auto from_path = fs::path(path_from); path_to,
auto tmp_snapshot_name = from_path.parent_path() / (std::string{tmp_prefix} + from_path.filename().string()); /*before_file_remove_op=*/{},
{ getLogger("KeeperSnapshotManager"),
auto buf = disk_to->writeFile(tmp_snapshot_name); keeper_context);
buf->finalize();
}
disk_from->copyFile(from_path, *disk_to, path_to, {});
disk_to->removeFile(tmp_snapshot_name);
disk_from->removeFile(path_from);
} }
uint64_t getSnapshotPathUpToLogIdx(const String & snapshot_path) uint64_t getSnapshotPathUpToLogIdx(const String & snapshot_path)
@ -582,9 +581,9 @@ KeeperSnapshotManager::KeeperSnapshotManager(
std::vector<std::string> snapshot_files; std::vector<std::string> snapshot_files;
for (auto it = disk->iterateDirectory(""); it->isValid(); it->next()) for (auto it = disk->iterateDirectory(""); it->isValid(); it->next())
{ {
if (it->name().starts_with(tmp_prefix)) if (it->name().starts_with(tmp_keeper_file_prefix))
{ {
incomplete_files.emplace(it->name().substr(tmp_prefix.size()), it->path()); incomplete_files.emplace(it->name().substr(tmp_keeper_file_prefix.size()), it->path());
continue; continue;
} }
@ -603,7 +602,7 @@ KeeperSnapshotManager::KeeperSnapshotManager(
if (!inserted) if (!inserted)
LOG_WARNING( LOG_WARNING(
getLogger("KeeperSnapshotManager"), log,
"Found another snapshots with last log idx {}, will use snapshot from disk {}", "Found another snapshots with last log idx {}, will use snapshot from disk {}",
snapshot_up_to, snapshot_up_to,
disk->getName()); disk->getName());
@ -612,6 +611,9 @@ KeeperSnapshotManager::KeeperSnapshotManager(
for (const auto & [name, path] : incomplete_files) for (const auto & [name, path] : incomplete_files)
disk->removeFile(path); disk->removeFile(path);
if (snapshot_files.empty())
LOG_TRACE(log, "No snapshots were found on {}", disk->getName());
read_disks.insert(disk); read_disks.insert(disk);
}; };
@ -774,7 +776,7 @@ void KeeperSnapshotManager::moveSnapshotsIfNeeded()
{ {
if (file_info.disk != latest_snapshot_disk) if (file_info.disk != latest_snapshot_disk)
{ {
moveFileBetweenDisks(file_info.disk, file_info.path, latest_snapshot_disk, file_info.path); moveSnapshotBetweenDisks(file_info.disk, file_info.path, latest_snapshot_disk, file_info.path, keeper_context);
file_info.disk = latest_snapshot_disk; file_info.disk = latest_snapshot_disk;
} }
} }
@ -782,7 +784,7 @@ void KeeperSnapshotManager::moveSnapshotsIfNeeded()
{ {
if (file_info.disk != disk) if (file_info.disk != disk)
{ {
moveFileBetweenDisks(file_info.disk, file_info.path, disk, file_info.path); moveSnapshotBetweenDisks(file_info.disk, file_info.path, disk, file_info.path, keeper_context);
file_info.disk = disk; file_info.disk = disk;
} }
} }

View File

@ -11,6 +11,7 @@
#include <base/errnoToString.h> #include <base/errnoToString.h>
#include <base/move_extend.h> #include <base/move_extend.h>
#include <sys/mman.h> #include <sys/mman.h>
#include <Common/Exception.h>
#include <Common/ProfileEvents.h> #include <Common/ProfileEvents.h>
#include <Common/ZooKeeper/ZooKeeperCommon.h> #include <Common/ZooKeeper/ZooKeeperCommon.h>
#include <Common/ZooKeeper/ZooKeeperIO.h> #include <Common/ZooKeeper/ZooKeeperIO.h>
@ -42,23 +43,20 @@ namespace ErrorCodes
KeeperStateMachine::KeeperStateMachine( KeeperStateMachine::KeeperStateMachine(
ResponsesQueue & responses_queue_, ResponsesQueue & responses_queue_,
SnapshotsQueue & snapshots_queue_, SnapshotsQueue & snapshots_queue_,
const CoordinationSettingsPtr & coordination_settings_,
const KeeperContextPtr & keeper_context_, const KeeperContextPtr & keeper_context_,
KeeperSnapshotManagerS3 * snapshot_manager_s3_, KeeperSnapshotManagerS3 * snapshot_manager_s3_,
CommitCallback commit_callback_, CommitCallback commit_callback_,
const std::string & superdigest_) const std::string & superdigest_)
: commit_callback(commit_callback_) : commit_callback(commit_callback_)
, coordination_settings(coordination_settings_)
, snapshot_manager( , snapshot_manager(
coordination_settings->snapshots_to_keep, keeper_context_->getCoordinationSettings()->snapshots_to_keep,
keeper_context_, keeper_context_,
coordination_settings->compress_snapshots_with_zstd_format, keeper_context_->getCoordinationSettings()->compress_snapshots_with_zstd_format,
superdigest_, superdigest_,
coordination_settings->dead_session_check_period_ms.totalMilliseconds()) keeper_context_->getCoordinationSettings()->dead_session_check_period_ms.totalMilliseconds())
, responses_queue(responses_queue_) , responses_queue(responses_queue_)
, snapshots_queue(snapshots_queue_) , snapshots_queue(snapshots_queue_)
, min_request_size_to_cache(coordination_settings_->min_request_size_for_cache) , min_request_size_to_cache(keeper_context_->getCoordinationSettings()->min_request_size_for_cache)
, last_committed_idx(0)
, log(getLogger("KeeperStateMachine")) , log(getLogger("KeeperStateMachine"))
, superdigest(superdigest_) , superdigest(superdigest_)
, keeper_context(keeper_context_) , keeper_context(keeper_context_)
@ -100,7 +98,7 @@ void KeeperStateMachine::init()
storage = std::move(snapshot_deserialization_result.storage); storage = std::move(snapshot_deserialization_result.storage);
latest_snapshot_meta = snapshot_deserialization_result.snapshot_meta; latest_snapshot_meta = snapshot_deserialization_result.snapshot_meta;
cluster_config = snapshot_deserialization_result.cluster_config; cluster_config = snapshot_deserialization_result.cluster_config;
last_committed_idx = latest_snapshot_meta->get_last_log_idx(); keeper_context->setLastCommitIndex(latest_snapshot_meta->get_last_log_idx());
loaded = true; loaded = true;
break; break;
} }
@ -115,6 +113,7 @@ void KeeperStateMachine::init()
} }
} }
auto last_committed_idx = keeper_context->lastCommittedIndex();
if (has_snapshots) if (has_snapshots)
{ {
if (loaded) if (loaded)
@ -129,7 +128,7 @@ void KeeperStateMachine::init()
if (!storage) if (!storage)
storage = std::make_unique<KeeperStorage>( storage = std::make_unique<KeeperStorage>(
coordination_settings->dead_session_check_period_ms.totalMilliseconds(), superdigest, keeper_context); keeper_context->getCoordinationSettings()->dead_session_check_period_ms.totalMilliseconds(), superdigest, keeper_context);
} }
namespace namespace
@ -139,16 +138,18 @@ void assertDigest(
const KeeperStorage::Digest & expected, const KeeperStorage::Digest & expected,
const KeeperStorage::Digest & actual, const KeeperStorage::Digest & actual,
const Coordination::ZooKeeperRequest & request, const Coordination::ZooKeeperRequest & request,
uint64_t log_idx,
bool committing) bool committing)
{ {
if (!KeeperStorage::checkDigest(expected, actual)) if (!KeeperStorage::checkDigest(expected, actual))
{ {
LOG_FATAL( LOG_FATAL(
getLogger("KeeperStateMachine"), getLogger("KeeperStateMachine"),
"Digest for nodes is not matching after {} request of type '{}'.\nExpected digest - {}, actual digest - {} (digest " "Digest for nodes is not matching after {} request of type '{}' at log index {}.\nExpected digest - {}, actual digest - {} "
"{}). Keeper will terminate to avoid inconsistencies.\nExtra information about the request:\n{}", "(digest {}). Keeper will terminate to avoid inconsistencies.\nExtra information about the request:\n{}",
committing ? "committing" : "preprocessing", committing ? "committing" : "preprocessing",
request.getOpNum(), request.getOpNum(),
log_idx,
expected.value, expected.value,
actual.value, actual.value,
expected.version, expected.version,
@ -296,12 +297,12 @@ bool KeeperStateMachine::preprocess(const KeeperStorage::RequestForSession & req
} }
catch (...) catch (...)
{ {
tryLogCurrentException(__PRETTY_FUNCTION__, "Failed to preprocess stored log, aborting to avoid inconsistent state"); tryLogCurrentException(__PRETTY_FUNCTION__, fmt::format("Failed to preprocess stored log at index {}, aborting to avoid inconsistent state", request_for_session.log_idx));
std::abort(); std::abort();
} }
if (keeper_context->digestEnabled() && request_for_session.digest) if (keeper_context->digestEnabled() && request_for_session.digest)
assertDigest(*request_for_session.digest, storage->getNodesDigest(false), *request_for_session.request, false); assertDigest(*request_for_session.digest, storage->getNodesDigest(false), *request_for_session.request, request_for_session.log_idx, false);
return true; return true;
} }
@ -408,48 +409,57 @@ nuraft::ptr<nuraft::buffer> KeeperStateMachine::commit(const uint64_t log_idx, n
} }
}; };
const auto op_num = request_for_session->request->getOpNum(); try
if (op_num == Coordination::OpNum::SessionID)
{ {
const Coordination::ZooKeeperSessionIDRequest & session_id_request const auto op_num = request_for_session->request->getOpNum();
= dynamic_cast<const Coordination::ZooKeeperSessionIDRequest &>(*request_for_session->request); if (op_num == Coordination::OpNum::SessionID)
int64_t session_id;
std::shared_ptr<Coordination::ZooKeeperSessionIDResponse> response = std::make_shared<Coordination::ZooKeeperSessionIDResponse>();
response->internal_id = session_id_request.internal_id;
response->server_id = session_id_request.server_id;
KeeperStorage::ResponseForSession response_for_session;
response_for_session.session_id = -1;
response_for_session.response = response;
std::lock_guard lock(storage_and_responses_lock);
session_id = storage->getSessionID(session_id_request.session_timeout_ms);
LOG_DEBUG(log, "Session ID response {} with timeout {}", session_id, session_id_request.session_timeout_ms);
response->session_id = session_id;
try_push(response_for_session);
}
else
{
if (op_num == Coordination::OpNum::Close)
{ {
std::lock_guard lock(request_cache_mutex); const Coordination::ZooKeeperSessionIDRequest & session_id_request
parsed_request_cache.erase(request_for_session->session_id); = dynamic_cast<const Coordination::ZooKeeperSessionIDRequest &>(*request_for_session->request);
int64_t session_id;
std::shared_ptr<Coordination::ZooKeeperSessionIDResponse> response = std::make_shared<Coordination::ZooKeeperSessionIDResponse>();
response->internal_id = session_id_request.internal_id;
response->server_id = session_id_request.server_id;
KeeperStorage::ResponseForSession response_for_session;
response_for_session.session_id = -1;
response_for_session.response = response;
std::lock_guard lock(storage_and_responses_lock);
session_id = storage->getSessionID(session_id_request.session_timeout_ms);
LOG_DEBUG(log, "Session ID response {} with timeout {}", session_id, session_id_request.session_timeout_ms);
response->session_id = session_id;
try_push(response_for_session);
}
else
{
if (op_num == Coordination::OpNum::Close)
{
std::lock_guard lock(request_cache_mutex);
parsed_request_cache.erase(request_for_session->session_id);
}
std::lock_guard lock(storage_and_responses_lock);
KeeperStorage::ResponsesForSessions responses_for_sessions
= storage->processRequest(request_for_session->request, request_for_session->session_id, request_for_session->zxid);
for (auto & response_for_session : responses_for_sessions)
try_push(response_for_session);
if (keeper_context->digestEnabled() && request_for_session->digest)
assertDigest(*request_for_session->digest, storage->getNodesDigest(true), *request_for_session->request, request_for_session->log_idx, true);
} }
std::lock_guard lock(storage_and_responses_lock); ProfileEvents::increment(ProfileEvents::KeeperCommits);
KeeperStorage::ResponsesForSessions responses_for_sessions keeper_context->setLastCommitIndex(log_idx);
= storage->processRequest(request_for_session->request, request_for_session->session_id, request_for_session->zxid);
for (auto & response_for_session : responses_for_sessions)
try_push(response_for_session);
if (keeper_context->digestEnabled() && request_for_session->digest) if (commit_callback)
assertDigest(*request_for_session->digest, storage->getNodesDigest(true), *request_for_session->request, true); commit_callback(log_idx, *request_for_session);
}
catch (...)
{
tryLogCurrentException(log, fmt::format("Failed to commit stored log at index {}", log_idx));
throw;
} }
ProfileEvents::increment(ProfileEvents::KeeperCommits);
last_committed_idx = log_idx;
if (commit_callback)
commit_callback(log_idx, *request_for_session);
return nullptr; return nullptr;
} }
@ -496,7 +506,7 @@ bool KeeperStateMachine::apply_snapshot(nuraft::snapshot & s)
} }
ProfileEvents::increment(ProfileEvents::KeeperSnapshotApplys); ProfileEvents::increment(ProfileEvents::KeeperSnapshotApplys);
last_committed_idx = s.get_last_log_idx(); keeper_context->setLastCommitIndex(s.get_last_log_idx());
return true; return true;
} }
@ -506,7 +516,7 @@ void KeeperStateMachine::commit_config(const uint64_t log_idx, nuraft::ptr<nuraf
std::lock_guard lock(cluster_config_lock); std::lock_guard lock(cluster_config_lock);
auto tmp = new_conf->serialize(); auto tmp = new_conf->serialize();
cluster_config = ClusterConfig::deserialize(*tmp); cluster_config = ClusterConfig::deserialize(*tmp);
last_committed_idx = log_idx; keeper_context->setLastCommitIndex(log_idx);
} }
void KeeperStateMachine::rollback(uint64_t log_idx, nuraft::buffer & data) void KeeperStateMachine::rollback(uint64_t log_idx, nuraft::buffer & data)

View File

@ -25,7 +25,6 @@ public:
KeeperStateMachine( KeeperStateMachine(
ResponsesQueue & responses_queue_, ResponsesQueue & responses_queue_,
SnapshotsQueue & snapshots_queue_, SnapshotsQueue & snapshots_queue_,
const CoordinationSettingsPtr & coordination_settings_,
const KeeperContextPtr & keeper_context_, const KeeperContextPtr & keeper_context_,
KeeperSnapshotManagerS3 * snapshot_manager_s3_, KeeperSnapshotManagerS3 * snapshot_manager_s3_,
CommitCallback commit_callback_ = {}, CommitCallback commit_callback_ = {},
@ -70,7 +69,7 @@ public:
const KeeperStorage::RequestForSession & request_for_session, const KeeperStorage::RequestForSession & request_for_session,
bool allow_missing) TSA_NO_THREAD_SAFETY_ANALYSIS; bool allow_missing) TSA_NO_THREAD_SAFETY_ANALYSIS;
uint64_t last_commit_index() override { return last_committed_idx; } uint64_t last_commit_index() override { return keeper_context->lastCommittedIndex(); }
/// Apply preliminarily saved (save_logical_snp_obj) snapshot to our state. /// Apply preliminarily saved (save_logical_snp_obj) snapshot to our state.
bool apply_snapshot(nuraft::snapshot & s) override; bool apply_snapshot(nuraft::snapshot & s) override;
@ -139,8 +138,6 @@ private:
SnapshotFileInfo latest_snapshot_info; SnapshotFileInfo latest_snapshot_info;
nuraft::ptr<nuraft::buffer> latest_snapshot_buf = nullptr; nuraft::ptr<nuraft::buffer> latest_snapshot_buf = nullptr;
CoordinationSettingsPtr coordination_settings;
/// Main state machine logic /// Main state machine logic
KeeperStoragePtr storage TSA_PT_GUARDED_BY(storage_and_responses_lock); KeeperStoragePtr storage TSA_PT_GUARDED_BY(storage_and_responses_lock);
@ -170,9 +167,6 @@ private:
/// can be processed only in 1 thread at any point /// can be processed only in 1 thread at any point
std::mutex request_cache_mutex; std::mutex request_cache_mutex;
/// Last committed Raft log number.
std::atomic<uint64_t> last_committed_idx;
LoggerPtr log; LoggerPtr log;
/// Cluster config for our quorum. /// Cluster config for our quorum.

View File

@ -241,23 +241,25 @@ KeeperStateManager::KeeperStateManager(
const std::string & config_prefix_, const std::string & config_prefix_,
const std::string & server_state_file_name_, const std::string & server_state_file_name_,
const Poco::Util::AbstractConfiguration & config, const Poco::Util::AbstractConfiguration & config,
const CoordinationSettingsPtr & coordination_settings,
KeeperContextPtr keeper_context_) KeeperContextPtr keeper_context_)
: my_server_id(my_server_id_) : my_server_id(my_server_id_)
, secure(config.getBool(config_prefix_ + ".raft_configuration.secure", false)) , secure(config.getBool(config_prefix_ + ".raft_configuration.secure", false))
, config_prefix(config_prefix_) , config_prefix(config_prefix_)
, configuration_wrapper(parseServersConfiguration(config, false, coordination_settings->async_replication)) , configuration_wrapper(parseServersConfiguration(config, false, keeper_context_->getCoordinationSettings()->async_replication))
, log_store(nuraft::cs_new<KeeperLogStore>( , log_store(nuraft::cs_new<KeeperLogStore>(
LogFileSettings LogFileSettings
{ {
.force_sync = coordination_settings->force_sync, .force_sync = keeper_context_->getCoordinationSettings()->force_sync,
.compress_logs = coordination_settings->compress_logs, .compress_logs = keeper_context_->getCoordinationSettings()->compress_logs,
.rotate_interval = coordination_settings->rotate_log_storage_interval, .rotate_interval = keeper_context_->getCoordinationSettings()->rotate_log_storage_interval,
.max_size = coordination_settings->max_log_file_size, .max_size = keeper_context_->getCoordinationSettings()->max_log_file_size,
.overallocate_size = coordination_settings->log_file_overallocate_size}, .overallocate_size = keeper_context_->getCoordinationSettings()->log_file_overallocate_size,
.latest_logs_cache_size_threshold = keeper_context_->getCoordinationSettings()->latest_logs_cache_size_threshold,
.commit_logs_cache_size_threshold = keeper_context_->getCoordinationSettings()->commit_logs_cache_size_threshold
},
FlushSettings FlushSettings
{ {
.max_flush_batch_size = coordination_settings->max_flush_batch_size, .max_flush_batch_size = keeper_context_->getCoordinationSettings()->max_flush_batch_size,
}, },
keeper_context_)) keeper_context_))
, server_state_file_name(server_state_file_name_) , server_state_file_name(server_state_file_name_)

View File

@ -23,7 +23,6 @@ public:
const std::string & config_prefix_, const std::string & config_prefix_,
const std::string & server_state_file_name_, const std::string & server_state_file_name_,
const Poco::Util::AbstractConfiguration & config, const Poco::Util::AbstractConfiguration & config,
const CoordinationSettingsPtr & coordination_settings,
KeeperContextPtr keeper_context_); KeeperContextPtr keeper_context_);
/// Constructor for tests /// Constructor for tests

View File

@ -18,7 +18,7 @@
#include <Common/LockMemoryExceptionInThread.h> #include <Common/LockMemoryExceptionInThread.h>
#include <Common/ProfileEvents.h> #include <Common/ProfileEvents.h>
#include <Coordination/pathUtils.h> #include <Coordination/KeeperCommon.h>
#include <Coordination/KeeperConstants.h> #include <Coordination/KeeperConstants.h>
#include <Coordination/KeeperReconfiguration.h> #include <Coordination/KeeperReconfiguration.h>
#include <Coordination/KeeperStorage.h> #include <Coordination/KeeperStorage.h>
@ -26,7 +26,6 @@
#include <functional> #include <functional>
#include <base/defines.h> #include <base/defines.h>
#include <filesystem>
namespace ProfileEvents namespace ProfileEvents
{ {
@ -1583,7 +1582,7 @@ struct KeeperStorageListRequestProcessor final : public KeeperStorageRequestProc
{ {
auto path_prefix = request.path; auto path_prefix = request.path;
if (path_prefix.empty()) if (path_prefix.empty())
throw DB::Exception(ErrorCodes::LOGICAL_ERROR, "Path cannot be empty"); throw DB::Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: path cannot be empty");
const auto & children = node_it->value.getChildren(); const auto & children = node_it->value.getChildren();
response.names.reserve(children.size()); response.names.reserve(children.size());

View File

@ -8,7 +8,7 @@
#include <Common/ZooKeeper/ZooKeeperIO.h> #include <Common/ZooKeeper/ZooKeeperIO.h>
#include <Common/logger_useful.h> #include <Common/logger_useful.h>
#include <IO/ReadBufferFromFile.h> #include <IO/ReadBufferFromFile.h>
#include <Coordination/pathUtils.h> #include <Coordination/KeeperCommon.h>
namespace DB namespace DB

View File

@ -1,37 +0,0 @@
#include <Coordination/pathUtils.h>
namespace DB
{
static size_t findLastSlash(StringRef path)
{
if (path.size == 0)
return std::string::npos;
for (size_t i = path.size - 1; i > 0; --i)
{
if (path.data[i] == '/')
return i;
}
if (path.data[0] == '/')
return 0;
return std::string::npos;
}
StringRef parentNodePath(StringRef path)
{
auto rslash_pos = findLastSlash(path);
if (rslash_pos > 0)
return StringRef{path.data, rslash_pos};
return "/";
}
StringRef getBaseNodeName(StringRef path)
{
size_t basename_start = findLastSlash(path);
return StringRef{path.data + basename_start + 1, path.size - basename_start - 1};
}
}

View File

@ -1,13 +0,0 @@
#pragma once
#include <string>
#include <base/StringRef.h>
namespace DB
{
StringRef parentNodePath(StringRef path);
StringRef getBaseNodeName(StringRef path);
}

View File

@ -1,8 +1,6 @@
#include <chrono> #include <chrono>
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "Common/ZooKeeper/IKeeper.h"
#include "Core/Defines.h"
#include "config.h" #include "config.h"
#if USE_NURAFT #if USE_NURAFT
@ -22,7 +20,7 @@
#include <Coordination/ReadBufferFromNuraftBuffer.h> #include <Coordination/ReadBufferFromNuraftBuffer.h>
#include <Coordination/SummingStateMachine.h> #include <Coordination/SummingStateMachine.h>
#include <Coordination/WriteBufferFromNuraftBuffer.h> #include <Coordination/WriteBufferFromNuraftBuffer.h>
#include <Coordination/pathUtils.h> #include <Coordination/KeeperCommon.h>
#include <IO/ReadBufferFromString.h> #include <IO/ReadBufferFromString.h>
#include <IO/WriteHelpers.h> #include <IO/WriteHelpers.h>
#include <libnuraft/nuraft.hxx> #include <libnuraft/nuraft.hxx>
@ -65,7 +63,7 @@ struct CompressionParam
class CoordinationTest : public ::testing::TestWithParam<CompressionParam> class CoordinationTest : public ::testing::TestWithParam<CompressionParam>
{ {
protected: protected:
DB::KeeperContextPtr keeper_context = std::make_shared<DB::KeeperContext>(true); DB::KeeperContextPtr keeper_context = std::make_shared<DB::KeeperContext>(true, std::make_shared<DB::CoordinationSettings>());
LoggerPtr log{getLogger("CoordinationTest")}; LoggerPtr log{getLogger("CoordinationTest")};
void SetUp() override void SetUp() override
@ -558,6 +556,7 @@ TEST_P(CoordinationTest, ChangelogTestCompaction)
EXPECT_EQ(changelog.size(), 3); EXPECT_EQ(changelog.size(), 3);
keeper_context->setLastCommitIndex(2);
changelog.compact(2); changelog.compact(2);
EXPECT_EQ(changelog.size(), 1); EXPECT_EQ(changelog.size(), 1);
@ -582,6 +581,7 @@ TEST_P(CoordinationTest, ChangelogTestCompaction)
EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin" + params.extension)); EXPECT_TRUE(fs::exists("./logs/changelog_1_5.bin" + params.extension));
EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin" + params.extension)); EXPECT_TRUE(fs::exists("./logs/changelog_6_10.bin" + params.extension));
keeper_context->setLastCommitIndex(6);
changelog.compact(6); changelog.compact(6);
std::this_thread::sleep_for(std::chrono::microseconds(1000)); std::this_thread::sleep_for(std::chrono::microseconds(1000));
@ -1758,22 +1758,30 @@ getLogEntryFromZKRequest(size_t term, int64_t session_id, int64_t zxid, const Co
} }
void testLogAndStateMachine( void testLogAndStateMachine(
Coordination::CoordinationSettingsPtr settings, DB::CoordinationSettingsPtr settings,
uint64_t total_logs, uint64_t total_logs,
bool enable_compression, bool enable_compression)
Coordination::KeeperContextPtr keeper_context)
{ {
using namespace Coordination; using namespace Coordination;
using namespace DB; using namespace DB;
ChangelogDirTest snapshots("./snapshots"); ChangelogDirTest snapshots("./snapshots");
keeper_context->setSnapshotDisk(std::make_shared<DiskLocal>("SnapshotDisk", "./snapshots"));
ChangelogDirTest logs("./logs"); ChangelogDirTest logs("./logs");
keeper_context->setLogDisk(std::make_shared<DiskLocal>("LogDisk", "./logs"));
auto get_keeper_context = [&]
{
auto local_keeper_context = std::make_shared<DB::KeeperContext>(true, settings);
local_keeper_context->setSnapshotDisk(std::make_shared<DiskLocal>("SnapshotDisk", "./snapshots"));
local_keeper_context->setLogDisk(std::make_shared<DiskLocal>("LogDisk", "./logs"));
return local_keeper_context;
};
ResponsesQueue queue(std::numeric_limits<size_t>::max()); ResponsesQueue queue(std::numeric_limits<size_t>::max());
SnapshotsQueue snapshots_queue{1}; SnapshotsQueue snapshots_queue{1};
auto state_machine = std::make_shared<KeeperStateMachine>(queue, snapshots_queue, settings, keeper_context, nullptr);
auto keeper_context = get_keeper_context();
auto state_machine = std::make_shared<KeeperStateMachine>(queue, snapshots_queue, keeper_context, nullptr);
state_machine->init(); state_machine->init();
DB::KeeperLogStore changelog( DB::KeeperLogStore changelog(
DB::LogFileSettings{ DB::LogFileSettings{
@ -1811,12 +1819,14 @@ void testLogAndStateMachine(
snapshot_task.create_snapshot(std::move(snapshot_task.snapshot)); snapshot_task.create_snapshot(std::move(snapshot_task.snapshot));
} }
if (snapshot_created && changelog.size() > settings->reserved_log_items) if (snapshot_created && changelog.size() > settings->reserved_log_items)
changelog.compact(i - settings->reserved_log_items); changelog.compact(i - settings->reserved_log_items);
} }
SnapshotsQueue snapshots_queue1{1}; SnapshotsQueue snapshots_queue1{1};
auto restore_machine = std::make_shared<KeeperStateMachine>(queue, snapshots_queue1, settings, keeper_context, nullptr); keeper_context = get_keeper_context();
auto restore_machine = std::make_shared<KeeperStateMachine>(queue, snapshots_queue1, keeper_context, nullptr);
restore_machine->init(); restore_machine->init();
EXPECT_EQ(restore_machine->last_commit_index(), total_logs - total_logs % settings->snapshot_distance); EXPECT_EQ(restore_machine->last_commit_index(), total_logs - total_logs % settings->snapshot_distance);
@ -1863,63 +1873,64 @@ TEST_P(CoordinationTest, TestStateMachineAndLogStore)
settings->snapshot_distance = 10; settings->snapshot_distance = 10;
settings->reserved_log_items = 10; settings->reserved_log_items = 10;
settings->rotate_log_storage_interval = 10; settings->rotate_log_storage_interval = 10;
testLogAndStateMachine(settings, 37, params.enable_compression, keeper_context);
testLogAndStateMachine(settings, 37, params.enable_compression);
} }
{ {
CoordinationSettingsPtr settings = std::make_shared<CoordinationSettings>(); CoordinationSettingsPtr settings = std::make_shared<CoordinationSettings>();
settings->snapshot_distance = 10; settings->snapshot_distance = 10;
settings->reserved_log_items = 10; settings->reserved_log_items = 10;
settings->rotate_log_storage_interval = 10; settings->rotate_log_storage_interval = 10;
testLogAndStateMachine(settings, 11, params.enable_compression, keeper_context); testLogAndStateMachine(settings, 11, params.enable_compression);
} }
{ {
CoordinationSettingsPtr settings = std::make_shared<CoordinationSettings>(); CoordinationSettingsPtr settings = std::make_shared<CoordinationSettings>();
settings->snapshot_distance = 10; settings->snapshot_distance = 10;
settings->reserved_log_items = 10; settings->reserved_log_items = 10;
settings->rotate_log_storage_interval = 10; settings->rotate_log_storage_interval = 10;
testLogAndStateMachine(settings, 40, params.enable_compression, keeper_context); testLogAndStateMachine(settings, 40, params.enable_compression);
} }
{ {
CoordinationSettingsPtr settings = std::make_shared<CoordinationSettings>(); CoordinationSettingsPtr settings = std::make_shared<CoordinationSettings>();
settings->snapshot_distance = 10; settings->snapshot_distance = 10;
settings->reserved_log_items = 20; settings->reserved_log_items = 20;
settings->rotate_log_storage_interval = 30; settings->rotate_log_storage_interval = 30;
testLogAndStateMachine(settings, 40, params.enable_compression, keeper_context); testLogAndStateMachine(settings, 40, params.enable_compression);
} }
{ {
CoordinationSettingsPtr settings = std::make_shared<CoordinationSettings>(); CoordinationSettingsPtr settings = std::make_shared<CoordinationSettings>();
settings->snapshot_distance = 10; settings->snapshot_distance = 10;
settings->reserved_log_items = 0; settings->reserved_log_items = 0;
settings->rotate_log_storage_interval = 10; settings->rotate_log_storage_interval = 10;
testLogAndStateMachine(settings, 40, params.enable_compression, keeper_context); testLogAndStateMachine(settings, 40, params.enable_compression);
} }
{ {
CoordinationSettingsPtr settings = std::make_shared<CoordinationSettings>(); CoordinationSettingsPtr settings = std::make_shared<CoordinationSettings>();
settings->snapshot_distance = 1; settings->snapshot_distance = 1;
settings->reserved_log_items = 1; settings->reserved_log_items = 1;
settings->rotate_log_storage_interval = 32; settings->rotate_log_storage_interval = 32;
testLogAndStateMachine(settings, 32, params.enable_compression, keeper_context); testLogAndStateMachine(settings, 32, params.enable_compression);
} }
{ {
CoordinationSettingsPtr settings = std::make_shared<CoordinationSettings>(); CoordinationSettingsPtr settings = std::make_shared<CoordinationSettings>();
settings->snapshot_distance = 10; settings->snapshot_distance = 10;
settings->reserved_log_items = 7; settings->reserved_log_items = 7;
settings->rotate_log_storage_interval = 1; settings->rotate_log_storage_interval = 1;
testLogAndStateMachine(settings, 33, params.enable_compression, keeper_context); testLogAndStateMachine(settings, 33, params.enable_compression);
} }
{ {
CoordinationSettingsPtr settings = std::make_shared<CoordinationSettings>(); CoordinationSettingsPtr settings = std::make_shared<CoordinationSettings>();
settings->snapshot_distance = 37; settings->snapshot_distance = 37;
settings->reserved_log_items = 1000; settings->reserved_log_items = 1000;
settings->rotate_log_storage_interval = 5000; settings->rotate_log_storage_interval = 5000;
testLogAndStateMachine(settings, 33, params.enable_compression, keeper_context); testLogAndStateMachine(settings, 33, params.enable_compression);
} }
{ {
CoordinationSettingsPtr settings = std::make_shared<CoordinationSettings>(); CoordinationSettingsPtr settings = std::make_shared<CoordinationSettings>();
settings->snapshot_distance = 37; settings->snapshot_distance = 37;
settings->reserved_log_items = 1000; settings->reserved_log_items = 1000;
settings->rotate_log_storage_interval = 5000; settings->rotate_log_storage_interval = 5000;
testLogAndStateMachine(settings, 45, params.enable_compression, keeper_context); testLogAndStateMachine(settings, 45, params.enable_compression);
} }
} }
@ -1931,11 +1942,10 @@ TEST_P(CoordinationTest, TestEphemeralNodeRemove)
ChangelogDirTest snapshots("./snapshots"); ChangelogDirTest snapshots("./snapshots");
setSnapshotDirectory("./snapshots"); setSnapshotDirectory("./snapshots");
CoordinationSettingsPtr settings = std::make_shared<CoordinationSettings>();
ResponsesQueue queue(std::numeric_limits<size_t>::max()); ResponsesQueue queue(std::numeric_limits<size_t>::max());
SnapshotsQueue snapshots_queue{1}; SnapshotsQueue snapshots_queue{1};
auto state_machine = std::make_shared<KeeperStateMachine>(queue, snapshots_queue, settings, keeper_context, nullptr);
auto state_machine = std::make_shared<KeeperStateMachine>(queue, snapshots_queue, keeper_context, nullptr);
state_machine->init(); state_machine->init();
std::shared_ptr<ZooKeeperCreateRequest> request_c = std::make_shared<ZooKeeperCreateRequest>(); std::shared_ptr<ZooKeeperCreateRequest> request_c = std::make_shared<ZooKeeperCreateRequest>();
@ -1965,11 +1975,10 @@ TEST_P(CoordinationTest, TestCreateNodeWithAuthSchemeForAclWhenAuthIsPrecommitte
ChangelogDirTest snapshots("./snapshots"); ChangelogDirTest snapshots("./snapshots");
setSnapshotDirectory("./snapshots"); setSnapshotDirectory("./snapshots");
CoordinationSettingsPtr settings = std::make_shared<CoordinationSettings>();
ResponsesQueue queue(std::numeric_limits<size_t>::max()); ResponsesQueue queue(std::numeric_limits<size_t>::max());
SnapshotsQueue snapshots_queue{1}; SnapshotsQueue snapshots_queue{1};
auto state_machine = std::make_shared<KeeperStateMachine>(queue, snapshots_queue, settings, keeper_context, nullptr); auto state_machine = std::make_shared<KeeperStateMachine>(queue, snapshots_queue, keeper_context, nullptr);
state_machine->init(); state_machine->init();
String user_auth_data = "test_user:test_password"; String user_auth_data = "test_user:test_password";
@ -2017,11 +2026,10 @@ TEST_P(CoordinationTest, TestSetACLWithAuthSchemeForAclWhenAuthIsPrecommitted)
ChangelogDirTest snapshots("./snapshots"); ChangelogDirTest snapshots("./snapshots");
setSnapshotDirectory("./snapshots"); setSnapshotDirectory("./snapshots");
CoordinationSettingsPtr settings = std::make_shared<CoordinationSettings>();
ResponsesQueue queue(std::numeric_limits<size_t>::max()); ResponsesQueue queue(std::numeric_limits<size_t>::max());
SnapshotsQueue snapshots_queue{1}; SnapshotsQueue snapshots_queue{1};
auto state_machine = std::make_shared<KeeperStateMachine>(queue, snapshots_queue, settings, keeper_context, nullptr); auto state_machine = std::make_shared<KeeperStateMachine>(queue, snapshots_queue, keeper_context, nullptr);
state_machine->init(); state_machine->init();
String user_auth_data = "test_user:test_password"; String user_auth_data = "test_user:test_password";
@ -2132,6 +2140,7 @@ TEST_P(CoordinationTest, TestRotateIntervalChanges)
waitDurableLogs(changelog_2); waitDurableLogs(changelog_2);
keeper_context->setLastCommitIndex(105);
changelog_2.compact(105); changelog_2.compact(105);
std::this_thread::sleep_for(std::chrono::microseconds(1000)); std::this_thread::sleep_for(std::chrono::microseconds(1000));
@ -2157,6 +2166,7 @@ TEST_P(CoordinationTest, TestRotateIntervalChanges)
waitDurableLogs(changelog_3); waitDurableLogs(changelog_3);
keeper_context->setLastCommitIndex(125);
changelog_3.compact(125); changelog_3.compact(125);
std::this_thread::sleep_for(std::chrono::microseconds(1000)); std::this_thread::sleep_for(std::chrono::microseconds(1000));
assertFileDeleted("./logs/changelog_101_110.bin" + params.extension); assertFileDeleted("./logs/changelog_101_110.bin" + params.extension);

View File

@ -40,7 +40,7 @@ bool PacketEndpoint::tryReceivePacket(IMySQLReadPacket & packet, UInt64 millisec
ReadBufferFromPocoSocket * socket_in = typeid_cast<ReadBufferFromPocoSocket *>(in); ReadBufferFromPocoSocket * socket_in = typeid_cast<ReadBufferFromPocoSocket *>(in);
if (!socket_in) if (!socket_in)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Attempt to pull the duration in a non socket stream"); throw Exception(ErrorCodes::LOGICAL_ERROR, "LOGICAL ERROR: Attempt to pull the duration in a non socket stream");
if (!socket_in->poll(millisecond * 1000)) if (!socket_in->poll(millisecond * 1000))
return false; return false;

View File

@ -872,7 +872,6 @@ class IColumn;
M(UInt64, cache_warmer_threads, 4, "Only available in ClickHouse Cloud", 0) \ M(UInt64, cache_warmer_threads, 4, "Only available in ClickHouse Cloud", 0) \
M(Int64, ignore_cold_parts_seconds, 0, "Only available in ClickHouse Cloud", 0) \ M(Int64, ignore_cold_parts_seconds, 0, "Only available in ClickHouse Cloud", 0) \
M(Int64, prefer_warmed_unmerged_parts_seconds, 0, "Only available in ClickHouse Cloud", 0) \ M(Int64, prefer_warmed_unmerged_parts_seconds, 0, "Only available in ClickHouse Cloud", 0) \
M(Bool, enable_order_by_all, true, "Enable sorting expression ORDER BY ALL.", 0) \
M(Bool, iceberg_engine_ignore_schema_evolution, false, "Ignore schema evolution in Iceberg table engine and read all data using latest schema saved on table creation. Note that it can lead to incorrect result", 0) \ M(Bool, iceberg_engine_ignore_schema_evolution, false, "Ignore schema evolution in Iceberg table engine and read all data using latest schema saved on table creation. Note that it can lead to incorrect result", 0) \
// End of COMMON_SETTINGS // End of COMMON_SETTINGS
@ -940,6 +939,7 @@ class IColumn;
MAKE_OBSOLETE(M, Bool, allow_experimental_undrop_table_query, true) \ MAKE_OBSOLETE(M, Bool, allow_experimental_undrop_table_query, true) \
MAKE_OBSOLETE(M, Bool, allow_experimental_s3queue, true) \ MAKE_OBSOLETE(M, Bool, allow_experimental_s3queue, true) \
MAKE_OBSOLETE(M, Bool, query_plan_optimize_primary_key, true) \ MAKE_OBSOLETE(M, Bool, query_plan_optimize_primary_key, true) \
MAKE_OBSOLETE(M, Bool, enable_order_by_all, true) \
/** The section above is for obsolete settings. Do not add anything there. */ /** The section above is for obsolete settings. Do not add anything there. */

View File

@ -78,7 +78,7 @@ void SentryWriter::initialize(Poco::Util::LayeredConfiguration & config)
if (enabled) if (enabled)
{ {
server_data_path = config.getString("path", ""); server_data_path = config.getString("path", DB::DBMS_DEFAULT_PATH);
const std::filesystem::path & default_tmp_path = fs::path(config.getString("tmp_path", fs::temp_directory_path())) / "sentry"; const std::filesystem::path & default_tmp_path = fs::path(config.getString("tmp_path", fs::temp_directory_path())) / "sentry";
const std::string & endpoint const std::string & endpoint
= config.getString("send_crash_reports.endpoint"); = config.getString("send_crash_reports.endpoint");

View File

@ -239,7 +239,7 @@ static DataTypePtr create(const ASTPtr & arguments)
argument_types.push_back(DataTypeFactory::instance().get(arguments->children[i])); argument_types.push_back(DataTypeFactory::instance().get(arguments->children[i]));
if (function_name.empty()) if (function_name.empty())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Empty name of aggregate function passed"); throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: empty name of aggregate function passed");
AggregateFunctionProperties properties; AggregateFunctionProperties properties;
AggregateFunctionPtr function = AggregateFunctionFactory::instance().get(function_name, action, argument_types, params_row, properties); AggregateFunctionPtr function = AggregateFunctionFactory::instance().get(function_name, action, argument_types, params_row, properties);

View File

@ -69,6 +69,11 @@ String DataTypeArray::doGetPrettyName(size_t indent) const
return s.str(); return s.str();
} }
void DataTypeArray::forEachChild(const ChildCallback & callback) const
{
callback(*nested);
nested->forEachChild(callback);
}
static DataTypePtr create(const ASTPtr & arguments) static DataTypePtr create(const ASTPtr & arguments)
{ {

View File

@ -43,6 +43,7 @@ public:
MutableColumnPtr createColumn() const override; MutableColumnPtr createColumn() const override;
void forEachChild(const ChildCallback & callback) const override;
Field getDefault() const override; Field getDefault() const override;

View File

@ -141,7 +141,7 @@ static std::pair<DataTypePtr, DataTypeCustomDescPtr> create(const ASTPtr & argum
argument_types.push_back(DataTypeFactory::instance().get(arguments->children[i])); argument_types.push_back(DataTypeFactory::instance().get(arguments->children[i]));
if (function_name.empty()) if (function_name.empty())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Empty name of aggregate function passed"); throw Exception(ErrorCodes::LOGICAL_ERROR, "Logical error: empty name of aggregate function passed");
AggregateFunctionProperties properties; AggregateFunctionProperties properties;
/// NullsAction is not part of the type definition, instead it will have transformed the function into a different one /// NullsAction is not part of the type definition, instead it will have transformed the function into a different one

View File

@ -153,6 +153,12 @@ SerializationPtr DataTypeLowCardinality::doGetDefaultSerialization() const
return std::make_shared<SerializationLowCardinality>(dictionary_type); return std::make_shared<SerializationLowCardinality>(dictionary_type);
} }
void DataTypeLowCardinality::forEachChild(const ChildCallback & callback) const
{
callback(*dictionary_type);
dictionary_type->forEachChild(callback);
}
static DataTypePtr create(const ASTPtr & arguments) static DataTypePtr create(const ASTPtr & arguments)
{ {

View File

@ -60,6 +60,8 @@ public:
static MutableColumnUniquePtr createColumnUnique(const IDataType & keys_type); static MutableColumnUniquePtr createColumnUnique(const IDataType & keys_type);
static MutableColumnUniquePtr createColumnUnique(const IDataType & keys_type, MutableColumnPtr && keys); static MutableColumnUniquePtr createColumnUnique(const IDataType & keys_type, MutableColumnPtr && keys);
void forEachChild(const ChildCallback & callback) const override;
private: private:
SerializationPtr doGetDefaultSerialization() const override; SerializationPtr doGetDefaultSerialization() const override;

View File

@ -143,6 +143,14 @@ DataTypePtr DataTypeMap::getNestedTypeWithUnnamedTuple() const
return std::make_shared<DataTypeArray>(std::make_shared<DataTypeTuple>(from_tuple.getElements())); return std::make_shared<DataTypeArray>(std::make_shared<DataTypeTuple>(from_tuple.getElements()));
} }
void DataTypeMap::forEachChild(const DB::IDataType::ChildCallback & callback) const
{
callback(*key_type);
key_type->forEachChild(callback);
callback(*value_type);
value_type->forEachChild(callback);
}
static DataTypePtr create(const ASTPtr & arguments) static DataTypePtr create(const ASTPtr & arguments)
{ {
if (!arguments || arguments->children.size() != 2) if (!arguments || arguments->children.size() != 2)

View File

@ -54,6 +54,8 @@ public:
static bool checkKeyType(DataTypePtr key_type); static bool checkKeyType(DataTypePtr key_type);
void forEachChild(const ChildCallback & callback) const override;
private: private:
void assertKeyType() const; void assertKeyType() const;
}; };

View File

@ -61,6 +61,12 @@ SerializationPtr DataTypeNullable::doGetDefaultSerialization() const
return std::make_shared<SerializationNullable>(nested_data_type->getDefaultSerialization()); return std::make_shared<SerializationNullable>(nested_data_type->getDefaultSerialization());
} }
void DataTypeNullable::forEachChild(const ChildCallback & callback) const
{
callback(*nested_data_type);
nested_data_type->forEachChild(callback);
}
static DataTypePtr create(const ASTPtr & arguments) static DataTypePtr create(const ASTPtr & arguments)
{ {

Some files were not shown because too many files have changed in this diff Show More