Merge branch 'master' into read-cgroup-memory-usage-async-metrics

Antonio Andelic 2024-07-21 10:37:01 +01:00 committed by GitHub
commit 1d1dc9c7ad
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
444 changed files with 5267 additions and 6720 deletions

View File

@ -36,10 +36,6 @@ jobs:
cd "$GITHUB_WORKSPACE/tests/ci"
echo "Testing the main ci directory"
python3 -m unittest discover -s . -p 'test_*.py'
for dir in *_lambda/; do
echo "Testing $dir"
python3 -m unittest discover -s "$dir" -p 'test_*.py'
done
- name: PrepareRunConfig
id: runconfig
run: |

View File

@ -33,10 +33,6 @@ jobs:
# cd "$GITHUB_WORKSPACE/tests/ci"
# echo "Testing the main ci directory"
# python3 -m unittest discover -s . -p 'test_*.py'
# for dir in *_lambda/; do
# echo "Testing $dir"
# python3 -m unittest discover -s "$dir" -p 'test_*.py'
# done
- name: PrepareRunConfig
id: runconfig
run: |

View File

@ -30,10 +30,6 @@ jobs:
cd "$GITHUB_WORKSPACE/tests/ci"
echo "Testing the main ci directory"
python3 -m unittest discover -s . -p 'test_*.py'
for dir in *_lambda/; do
echo "Testing $dir"
python3 -m unittest discover -s "$dir" -p 'test_*.py'
done
- name: PrepareRunConfig
id: runconfig
run: |

View File

@ -48,10 +48,6 @@ jobs:
cd "$GITHUB_WORKSPACE/tests/ci"
echo "Testing the main ci directory"
python3 -m unittest discover -s . -p 'test_*.py'
for dir in *_lambda/; do
echo "Testing $dir"
python3 -m unittest discover -s "$dir" -p 'test_*.py'
done
- name: PrepareRunConfig
id: runconfig
run: |

View File

@ -33,10 +33,6 @@ jobs:
cd "$GITHUB_WORKSPACE/tests/ci"
echo "Testing the main ci directory"
python3 -m unittest discover -s . -p 'test_*.py'
for dir in *_lambda/; do
echo "Testing $dir"
python3 -m unittest discover -s "$dir" -p 'test_*.py'
done
- name: PrepareRunConfig
id: runconfig
run: |

View File

@ -1298,7 +1298,6 @@ elseif(ARCH_PPC64LE)
${OPENSSL_SOURCE_DIR}/crypto/camellia/camellia.c
${OPENSSL_SOURCE_DIR}/crypto/camellia/cmll_cbc.c
${OPENSSL_SOURCE_DIR}/crypto/chacha/chacha_enc.c
${OPENSSL_SOURCE_DIR}/crypto/mem_clr.c
${OPENSSL_SOURCE_DIR}/crypto/rc4/rc4_enc.c
${OPENSSL_SOURCE_DIR}/crypto/rc4/rc4_skey.c
${OPENSSL_SOURCE_DIR}/crypto/sha/keccak1600.c

View File

@ -4,6 +4,9 @@
source /setup_export_logs.sh
set -e -x
MAX_RUN_TIME=${MAX_RUN_TIME:-3600}
MAX_RUN_TIME=$((MAX_RUN_TIME == 0 ? 3600 : MAX_RUN_TIME))
# Choose random timezone for this test run
TZ="$(rg -v '#' /usr/share/zoneinfo/zone.tab | awk '{print $3}' | shuf | head -n1)"
echo "Choosen random timezone $TZ"
@ -242,7 +245,22 @@ function run_tests()
}
export -f run_tests
timeout "$MAX_RUN_TIME" bash -c run_tests ||:
function timeout_with_logging() {
local exit_code=0
timeout -s TERM --preserve-status "${@}" || exit_code="${?}"
if [[ "${exit_code}" -eq "124" ]]
then
echo "The command 'timeout ${*}' has been killed by timeout"
fi
return $exit_code
}
TIMEOUT=$((MAX_RUN_TIME - 700))
timeout_with_logging "$TIMEOUT" bash -c run_tests ||:
echo "Files in current directory"
ls -la ./

View File

@ -6,18 +6,12 @@ source /setup_export_logs.sh
# fail on errors, verbose and export all env variables
set -e -x -a
MAX_RUN_TIME=${MAX_RUN_TIME:-7200}
MAX_RUN_TIME=$((MAX_RUN_TIME == 0 ? 7200 : MAX_RUN_TIME))
MAX_RUN_TIME=${MAX_RUN_TIME:-9000}
MAX_RUN_TIME=$((MAX_RUN_TIME == 0 ? 9000 : MAX_RUN_TIME))
USE_DATABASE_REPLICATED=${USE_DATABASE_REPLICATED:=0}
USE_SHARED_CATALOG=${USE_SHARED_CATALOG:=0}
RUN_SEQUENTIAL_TESTS_IN_PARALLEL=0
if [[ "$USE_DATABASE_REPLICATED" -eq 1 ]] || [[ "$USE_SHARED_CATALOG" -eq 1 ]]; then
RUN_SEQUENTIAL_TESTS_IN_PARALLEL=0
fi
# Choose random timezone for this test run.
#
# NOTE: that clickhouse-test will randomize session_timezone by itself as well
@ -101,53 +95,6 @@ if [ "$NUM_TRIES" -gt "1" ]; then
mkdir -p /var/run/clickhouse-server
fi
# Run a CH instance to execute sequential tests on it in parallel with all other tests.
if [[ "$RUN_SEQUENTIAL_TESTS_IN_PARALLEL" -eq 1 ]]; then
mkdir -p /var/run/clickhouse-server3 /etc/clickhouse-server3 /var/lib/clickhouse3
cp -r -L /etc/clickhouse-server/* /etc/clickhouse-server3/
sudo chown clickhouse:clickhouse /var/run/clickhouse-server3 /var/lib/clickhouse3 /etc/clickhouse-server3/
sudo chown -R clickhouse:clickhouse /etc/clickhouse-server3/*
function replace(){
sudo find /etc/clickhouse-server3/ -type f -name '*.xml' -exec sed -i "$1" {} \;
}
replace "s|<port>9000</port>|<port>19000</port>|g"
replace "s|<port>9440</port>|<port>19440</port>|g"
replace "s|<port>9988</port>|<port>19988</port>|g"
replace "s|<port>9234</port>|<port>19234</port>|g"
replace "s|<port>9181</port>|<port>19181</port>|g"
replace "s|<https_port>8443</https_port>|<https_port>18443</https_port>|g"
replace "s|<tcp_port>9000</tcp_port>|<tcp_port>19000</tcp_port>|g"
replace "s|<tcp_port>9181</tcp_port>|<tcp_port>19181</tcp_port>|g"
replace "s|<tcp_port_secure>9440</tcp_port_secure>|<tcp_port_secure>19440</tcp_port_secure>|g"
replace "s|<tcp_with_proxy_port>9010</tcp_with_proxy_port>|<tcp_with_proxy_port>19010</tcp_with_proxy_port>|g"
replace "s|<mysql_port>9004</mysql_port>|<mysql_port>19004</mysql_port>|g"
replace "s|<postgresql_port>9005</postgresql_port>|<postgresql_port>19005</postgresql_port>|g"
replace "s|<interserver_http_port>9009</interserver_http_port>|<interserver_http_port>19009</interserver_http_port>|g"
replace "s|8123|18123|g"
replace "s|/var/lib/clickhouse/|/var/lib/clickhouse3/|g"
replace "s|/etc/clickhouse-server/|/etc/clickhouse-server3/|g"
# distributed cache
replace "s|<tcp_port>10001</tcp_port>|<tcp_port>10003</tcp_port>|g"
replace "s|<tcp_port>10002</tcp_port>|<tcp_port>10004</tcp_port>|g"
sudo -E -u clickhouse /usr/bin/clickhouse server --daemon --config /etc/clickhouse-server3/config.xml \
--pid-file /var/run/clickhouse-server3/clickhouse-server.pid \
-- --path /var/lib/clickhouse3/ --logger.stderr /var/log/clickhouse-server/stderr3.log \
--logger.log /var/log/clickhouse-server/clickhouse-server3.log --logger.errorlog /var/log/clickhouse-server/clickhouse-server3.err.log \
--tcp_port 19000 --tcp_port_secure 19440 --http_port 18123 --https_port 18443 --interserver_http_port 19009 --tcp_with_proxy_port 19010 \
--prometheus.port 19988 --keeper_server.raft_configuration.server.port 19234 --keeper_server.tcp_port 19181 \
--mysql_port 19004 --postgresql_port 19005
for _ in {1..100}
do
clickhouse-client --port 19000 --query "SELECT 1" && break
sleep 1
done
fi
# simplest way to forward env variables to server
sudo -E -u clickhouse /usr/bin/clickhouse-server --config /etc/clickhouse-server/config.xml --daemon --pid-file /var/run/clickhouse-server/clickhouse-server.pid
@ -183,9 +130,6 @@ if [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then
--keeper_server.tcp_port 29181 --keeper_server.server_id 3 \
--prometheus.port 29988 \
--macros.shard s2 # It doesn't work :(
MAX_RUN_TIME=$((MAX_RUN_TIME < 9000 ? MAX_RUN_TIME : 9000)) # min(MAX_RUN_TIME, 2.5 hours)
MAX_RUN_TIME=$((MAX_RUN_TIME != 0 ? MAX_RUN_TIME : 9000)) # set to 2.5 hours if 0 (unlimited)
fi
if [[ "$USE_SHARED_CATALOG" -eq 1 ]]; then
@ -210,9 +154,6 @@ if [[ "$USE_SHARED_CATALOG" -eq 1 ]]; then
--keeper_server.tcp_port 19181 --keeper_server.server_id 2 \
--prometheus.port 19988 \
--macros.replica r2 # It doesn't work :(
MAX_RUN_TIME=$((MAX_RUN_TIME < 9000 ? MAX_RUN_TIME : 9000)) # min(MAX_RUN_TIME, 2.5 hours)
MAX_RUN_TIME=$((MAX_RUN_TIME != 0 ? MAX_RUN_TIME : 9000)) # set to 2.5 hours if 0 (unlimited)
fi
# Wait for the server to start, but not for too long.
@ -223,7 +164,6 @@ do
done
setup_logs_replication
attach_gdb_to_clickhouse || true # FIXME: to not break old builds, clean on 2023-09-01
function fn_exists() {
@ -284,11 +224,7 @@ function run_tests()
else
# All other configurations are OK.
ADDITIONAL_OPTIONS+=('--jobs')
ADDITIONAL_OPTIONS+=('5')
fi
if [[ "$RUN_SEQUENTIAL_TESTS_IN_PARALLEL" -eq 1 ]]; then
ADDITIONAL_OPTIONS+=('--run-sequential-tests-in-parallel')
ADDITIONAL_OPTIONS+=('8')
fi
if [[ -n "$RUN_BY_HASH_NUM" ]] && [[ -n "$RUN_BY_HASH_TOTAL" ]]; then
@ -373,9 +309,6 @@ done
# Because it's the simplest way to read it when server has crashed.
sudo clickhouse stop ||:
if [[ "$RUN_SEQUENTIAL_TESTS_IN_PARALLEL" -eq 1 ]]; then
sudo clickhouse stop --pid-path /var/run/clickhouse-server3 ||:
fi
if [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then
sudo clickhouse stop --pid-path /var/run/clickhouse-server1 ||:
@ -393,12 +326,6 @@ rg -Fa "<Fatal>" /var/log/clickhouse-server/clickhouse-server.log ||:
rg -A50 -Fa "============" /var/log/clickhouse-server/stderr.log ||:
zstd --threads=0 < /var/log/clickhouse-server/clickhouse-server.log > /test_output/clickhouse-server.log.zst &
if [[ "$RUN_SEQUENTIAL_TESTS_IN_PARALLEL" -eq 1 ]]; then
rg -Fa "<Fatal>" /var/log/clickhouse-server3/clickhouse-server.log ||:
rg -A50 -Fa "============" /var/log/clickhouse-server3/stderr.log ||:
zstd --threads=0 < /var/log/clickhouse-server3/clickhouse-server.log > /test_output/clickhouse-server3.log.zst &
fi
data_path_config="--path=/var/lib/clickhouse/"
if [[ -n "$USE_S3_STORAGE_FOR_MERGE_TREE" ]] && [[ "$USE_S3_STORAGE_FOR_MERGE_TREE" -eq 1 ]]; then
# We need s3 storage configuration (but it's more likely that clickhouse-local will fail for some reason)
@ -419,10 +346,6 @@ if [ $failed_to_save_logs -ne 0 ]; then
do
clickhouse-local "$data_path_config" --only-system-tables --stacktrace -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.tsv.zst ||:
if [[ "$RUN_SEQUENTIAL_TESTS_IN_PARALLEL" -eq 1 ]]; then
clickhouse-local --path /var/lib/clickhouse3/ --only-system-tables --stacktrace -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.3.tsv.zst ||:
fi
if [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then
clickhouse-local --path /var/lib/clickhouse1/ --only-system-tables --stacktrace -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.1.tsv.zst ||:
clickhouse-local --path /var/lib/clickhouse2/ --only-system-tables --stacktrace -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.2.tsv.zst ||:
@ -464,12 +387,6 @@ rm -rf /var/lib/clickhouse/data/system/*/
tar -chf /test_output/store.tar /var/lib/clickhouse/store ||:
tar -chf /test_output/metadata.tar /var/lib/clickhouse/metadata/*.sql ||:
if [[ "$RUN_SEQUENTIAL_TESTS_IN_PARALLEL" -eq 1 ]]; then
rm -rf /var/lib/clickhouse3/data/system/*/
tar -chf /test_output/store.tar /var/lib/clickhouse3/store ||:
tar -chf /test_output/metadata.tar /var/lib/clickhouse3/metadata/*.sql ||:
fi
if [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then
rg -Fa "<Fatal>" /var/log/clickhouse-server/clickhouse-server1.log ||:

View File

@ -67,6 +67,7 @@ The supported formats are:
| [Prometheus](#prometheus) | ✗ | ✔ |
| [Protobuf](#protobuf) | ✔ | ✔ |
| [ProtobufSingle](#protobufsingle) | ✔ | ✔ |
| [ProtobufList](#protobuflist) | ✔ | ✔ |
| [Avro](#data-format-avro) | ✔ | ✔ |
| [AvroConfluent](#data-format-avro-confluent) | ✔ | ✗ |
| [Parquet](#data-format-parquet) | ✔ | ✔ |
@ -1952,6 +1953,35 @@ SYSTEM DROP FORMAT SCHEMA CACHE FOR Protobuf
Same as [Protobuf](#protobuf) but for storing/parsing a single Protobuf message without length delimiters.
## ProtobufList {#protobuflist}
Similar to Protobuf, but rows are represented as a sequence of sub-messages contained in a message with the fixed name "Envelope".
Usage example:
``` sql
SELECT * FROM test.table FORMAT ProtobufList SETTINGS format_schema = 'schemafile:MessageType'
```
``` bash
cat protobuflist_messages.bin | clickhouse-client --query "INSERT INTO test.table FORMAT ProtobufList SETTINGS format_schema='schemafile:MessageType'"
```
where the file `schemafile.proto` looks like this:
``` protobuf
syntax = "proto3";
message Envelope {
message MessageType {
string name = 1;
string surname = 2;
uint32 birthDate = 3;
repeated string phoneNumbers = 4;
};
MessageType row = 1;
};
```
## Avro {#data-format-avro}
[Apache Avro](https://avro.apache.org/) is a row-oriented data serialization framework developed within Apache's Hadoop project.

View File

@ -124,7 +124,7 @@ which is equal to
#### Default values for from_env and from_zk attributes
It's possible to set the default value and substitute it only if the environment variable or zookeeper node is set using `replace="1"`.
It's possible to set a default value that is substituted only if the environment variable or ZooKeeper node is set, using `replace="1"` (the attribute must be declared before `from_env`).
With previous example, but `MAX_QUERY_SIZE` is unset:
@ -132,7 +132,7 @@ With previous example, but `MAX_QUERY_SIZE` is unset:
<clickhouse>
<profiles>
<default>
<max_query_size from_env="MAX_QUERY_SIZE" replace="1">150000</max_query_size>
<max_query_size replace="1" from_env="MAX_QUERY_SIZE">150000</max_query_size>
</default>
</profiles>
</clickhouse>

View File

@ -9,7 +9,6 @@ Columns:
- `name` ([String](../../sql-reference/data-types/string.md)) — The name of the function.
- `is_aggregate` ([UInt8](../../sql-reference/data-types/int-uint.md)) — Whether the function is an aggregate function.
- `is_deterministic` ([Nullable](../../sql-reference/data-types/nullable.md)([UInt8](../../sql-reference/data-types/int-uint.md))) — Whether the function is deterministic.
- `case_insensitive` ([UInt8](../../sql-reference/data-types/int-uint.md)) — Whether the function name can be used case-insensitively.
- `alias_to` ([String](../../sql-reference/data-types/string.md)) — The original function name, if the function name is an alias.
- `create_query` ([String](../../sql-reference/data-types/string.md)) — Unused.

View File

@ -1,6 +1,6 @@
---
slug: /en/sql-reference/table-functions/azureBlobStorageCluster
sidebar_position: 55
sidebar_position: 15
sidebar_label: azureBlobStorageCluster
title: "azureBlobStorageCluster Table Function"
---

View File

@ -45,16 +45,17 @@ int mainEntryClickHouseKeeperConverter(int argc, char ** argv)
keeper_context->setDigestEnabled(true);
keeper_context->setSnapshotDisk(std::make_shared<DiskLocal>("Keeper-snapshots", options["output-dir"].as<std::string>()));
DB::KeeperStorage storage(/* tick_time_ms */ 500, /* superdigest */ "", keeper_context, /* initialize_system_nodes */ false);
/// TODO(hanfei): support rocksdb here
DB::KeeperMemoryStorage storage(/* tick_time_ms */ 500, /* superdigest */ "", keeper_context, /* initialize_system_nodes */ false);
DB::deserializeKeeperStorageFromSnapshotsDir(storage, options["zookeeper-snapshots-dir"].as<std::string>(), logger);
storage.initializeSystemNodes();
DB::deserializeLogsAndApplyToStorage(storage, options["zookeeper-logs-dir"].as<std::string>(), logger);
DB::SnapshotMetadataPtr snapshot_meta = std::make_shared<DB::SnapshotMetadata>(storage.getZXID(), 1, std::make_shared<nuraft::cluster_config>());
DB::KeeperStorageSnapshot snapshot(&storage, snapshot_meta);
DB::KeeperStorageSnapshot<DB::KeeperMemoryStorage> snapshot(&storage, snapshot_meta);
DB::KeeperSnapshotManager manager(1, keeper_context);
DB::KeeperSnapshotManager<DB::KeeperMemoryStorage> manager(1, keeper_context);
auto snp = manager.serializeSnapshotToBuffer(snapshot);
auto file_info = manager.serializeSnapshotBufferToDisk(*snp, storage.getZXID());
std::cout << "Snapshot serialized to path:" << fs::path(file_info->disk->getPath()) / file_info->path << std::endl;

View File

@ -53,6 +53,10 @@
# include <Server/CertificateReloader.h>
#endif
#if USE_GWP_ASAN
# include <Common/GWPAsan.h>
#endif
#include <Server/ProtocolServerAdapter.h>
#include <Server/KeeperTCPHandlerFactory.h>
@ -642,6 +646,10 @@ try
tryLogCurrentException(log, "Disabling cgroup memory observer because of an error during initialization");
}
#if USE_GWP_ASAN
GWPAsan::initFinished();
#endif
LOG_INFO(log, "Ready for connections.");
waitForTerminationRequest();

View File

@ -2211,6 +2211,7 @@ try
CannotAllocateThreadFaultInjector::setFaultProbability(server_settings.cannot_allocate_thread_fault_injection_probability);
#if USE_GWP_ASAN
GWPAsan::initFinished();
GWPAsan::setForceSampleProbability(server_settings.gwp_asan_force_sample_probability);
#endif
@ -2729,8 +2730,7 @@ void Server::createInterserverServers(
void Server::stopServers(
std::vector<ProtocolServerAdapter> & servers,
const ServerType & server_type
) const
const ServerType & server_type) const
{
LoggerRawPtr log = &logger();

View File

@ -129,8 +129,7 @@ private:
void stopServers(
std::vector<ProtocolServerAdapter> & servers,
const ServerType & server_type
) const;
const ServerType & server_type) const;
};
}

View File

@ -118,10 +118,10 @@ AggregateFunctionPtr createAggregateFunctionAnalysisOfVariance(const std::string
void registerAggregateFunctionAnalysisOfVariance(AggregateFunctionFactory & factory)
{
AggregateFunctionProperties properties = { .is_order_dependent = false };
factory.registerFunction("analysisOfVariance", {createAggregateFunctionAnalysisOfVariance, properties}, AggregateFunctionFactory::CaseInsensitive);
factory.registerFunction("analysisOfVariance", {createAggregateFunctionAnalysisOfVariance, properties}, AggregateFunctionFactory::Case::Insensitive);
/// This is a widely used term
factory.registerAlias("anova", "analysisOfVariance", AggregateFunctionFactory::CaseInsensitive);
factory.registerAlias("anova", "analysisOfVariance", AggregateFunctionFactory::Case::Insensitive);
}
}

View File

@ -361,9 +361,9 @@ void registerAggregateFunctionsAny(AggregateFunctionFactory & factory)
AggregateFunctionProperties default_properties = {.returns_default_when_only_null = false, .is_order_dependent = true};
factory.registerFunction("any", {createAggregateFunctionAny, default_properties});
factory.registerAlias("any_value", "any", AggregateFunctionFactory::CaseInsensitive);
factory.registerAlias("first_value", "any", AggregateFunctionFactory::CaseInsensitive);
factory.registerAlias("any_value", "any", AggregateFunctionFactory::Case::Insensitive);
factory.registerAlias("first_value", "any", AggregateFunctionFactory::Case::Insensitive);
factory.registerFunction("anyLast", {createAggregateFunctionAnyLast, default_properties});
factory.registerAlias("last_value", "anyLast", AggregateFunctionFactory::CaseInsensitive);
factory.registerAlias("last_value", "anyLast", AggregateFunctionFactory::Case::Insensitive);
}
}

View File

@ -221,11 +221,11 @@ void registerAggregateFunctionsAnyRespectNulls(AggregateFunctionFactory & factor
= {.returns_default_when_only_null = false, .is_order_dependent = true, .is_window_function = true};
factory.registerFunction("any_respect_nulls", {createAggregateFunctionAnyRespectNulls, default_properties_for_respect_nulls});
factory.registerAlias("any_value_respect_nulls", "any_respect_nulls", AggregateFunctionFactory::CaseInsensitive);
factory.registerAlias("first_value_respect_nulls", "any_respect_nulls", AggregateFunctionFactory::CaseInsensitive);
factory.registerAlias("any_value_respect_nulls", "any_respect_nulls", AggregateFunctionFactory::Case::Insensitive);
factory.registerAlias("first_value_respect_nulls", "any_respect_nulls", AggregateFunctionFactory::Case::Insensitive);
factory.registerFunction("anyLast_respect_nulls", {createAggregateFunctionAnyLastRespectNulls, default_properties_for_respect_nulls});
factory.registerAlias("last_value_respect_nulls", "anyLast_respect_nulls", AggregateFunctionFactory::CaseInsensitive);
factory.registerAlias("last_value_respect_nulls", "anyLast_respect_nulls", AggregateFunctionFactory::Case::Insensitive);
/// Must happen after registering any and anyLast
factory.registerNullsActionTransformation("any", "any_respect_nulls");

View File

@ -46,6 +46,6 @@ AggregateFunctionPtr createAggregateFunctionAvg(const std::string & name, const
void registerAggregateFunctionAvg(AggregateFunctionFactory & factory)
{
factory.registerFunction("avg", createAggregateFunctionAvg, AggregateFunctionFactory::CaseInsensitive);
factory.registerFunction("avg", createAggregateFunctionAvg, AggregateFunctionFactory::Case::Insensitive);
}
}

View File

@ -234,9 +234,9 @@ void registerAggregateFunctionsBitwise(AggregateFunctionFactory & factory)
factory.registerFunction("groupBitXor", createAggregateFunctionBitwise<AggregateFunctionGroupBitXorData>);
/// Aliases for compatibility with MySQL.
factory.registerAlias("BIT_OR", "groupBitOr", AggregateFunctionFactory::CaseInsensitive);
factory.registerAlias("BIT_AND", "groupBitAnd", AggregateFunctionFactory::CaseInsensitive);
factory.registerAlias("BIT_XOR", "groupBitXor", AggregateFunctionFactory::CaseInsensitive);
factory.registerAlias("BIT_OR", "groupBitOr", AggregateFunctionFactory::Case::Insensitive);
factory.registerAlias("BIT_AND", "groupBitAnd", AggregateFunctionFactory::Case::Insensitive);
factory.registerAlias("BIT_XOR", "groupBitXor", AggregateFunctionFactory::Case::Insensitive);
}
}

View File

@ -9,7 +9,7 @@ template <typename T1, typename T2> using AggregateFunctionCorr = AggregateFunct
void registerAggregateFunctionsStatisticsCorr(AggregateFunctionFactory & factory)
{
factory.registerFunction("corr", createAggregateFunctionStatisticsBinary<AggregateFunctionCorr, StatisticsFunctionKind::corr>, AggregateFunctionFactory::CaseInsensitive);
factory.registerFunction("corr", createAggregateFunctionStatisticsBinary<AggregateFunctionCorr, StatisticsFunctionKind::corr>, AggregateFunctionFactory::Case::Insensitive);
}
}

View File

@ -37,7 +37,7 @@ AggregateFunctionPtr createAggregateFunctionCount(const std::string & name, cons
void registerAggregateFunctionCount(AggregateFunctionFactory & factory)
{
AggregateFunctionProperties properties = { .returns_default_when_only_null = true, .is_order_dependent = false };
factory.registerFunction("count", {createAggregateFunctionCount, properties}, AggregateFunctionFactory::CaseInsensitive);
factory.registerFunction("count", {createAggregateFunctionCount, properties}, AggregateFunctionFactory::Case::Insensitive);
}
}

View File

@ -13,8 +13,8 @@ void registerAggregateFunctionsStatisticsCovar(AggregateFunctionFactory & factor
factory.registerFunction("covarPop", createAggregateFunctionStatisticsBinary<AggregateFunctionCovar, StatisticsFunctionKind::covarPop>);
/// Synonyms for compatibility.
factory.registerAlias("COVAR_SAMP", "covarSamp", AggregateFunctionFactory::CaseInsensitive);
factory.registerAlias("COVAR_POP", "covarPop", AggregateFunctionFactory::CaseInsensitive);
factory.registerAlias("COVAR_SAMP", "covarSamp", AggregateFunctionFactory::Case::Insensitive);
factory.registerAlias("COVAR_POP", "covarPop", AggregateFunctionFactory::Case::Insensitive);
}
}

View File

@ -29,7 +29,7 @@ const String & getAggregateFunctionCanonicalNameIfAny(const String & name)
return AggregateFunctionFactory::instance().getCanonicalNameIfAny(name);
}
void AggregateFunctionFactory::registerFunction(const String & name, Value creator_with_properties, CaseSensitiveness case_sensitiveness)
void AggregateFunctionFactory::registerFunction(const String & name, Value creator_with_properties, Case case_sensitiveness)
{
if (creator_with_properties.creator == nullptr)
throw Exception(ErrorCodes::LOGICAL_ERROR, "AggregateFunctionFactory: "
@ -39,7 +39,7 @@ void AggregateFunctionFactory::registerFunction(const String & name, Value creat
throw Exception(ErrorCodes::LOGICAL_ERROR, "AggregateFunctionFactory: the aggregate function name '{}' is not unique",
name);
if (case_sensitiveness == CaseInsensitive)
if (case_sensitiveness == Case::Insensitive)
{
auto key = Poco::toLower(name);
if (!case_insensitive_aggregate_functions.emplace(key, creator_with_properties).second)

View File

@ -60,7 +60,7 @@ public:
void registerFunction(
const String & name,
Value creator,
CaseSensitiveness case_sensitiveness = CaseSensitive);
Case case_sensitiveness = Case::Sensitive);
/// Register how to transform from one aggregate function to another based on NullsAction
/// Registers them both ways:
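For context, a minimal standalone sketch (assumed names, not the ClickHouse declarations) of the unscoped-to-scoped enum change behind the `Case::Insensitive` spellings used throughout this commit:
``` cpp
// Sketch only: `Factory` is a hypothetical stand-in for AggregateFunctionFactory.
// Before the rename, an unscoped enum leaked CaseSensitive/CaseInsensitive into
// the factory's scope; a scoped enum makes call sites read Case::Insensitive
// and removes the implicit conversion to int.
#include <string>

struct Factory
{
    enum class Case { Sensitive, Insensitive };

    void registerFunction(const std::string & /*name*/, Case /*c*/ = Case::Sensitive) {}
};

int main()
{
    Factory factory;
    factory.registerFunction("avg", Factory::Case::Insensitive); // new spelling
}
```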

View File

@ -840,8 +840,8 @@ void registerAggregateFunctionGroupArray(AggregateFunctionFactory & factory)
AggregateFunctionProperties properties = { .returns_default_when_only_null = false, .is_order_dependent = true };
factory.registerFunction("groupArray", { createAggregateFunctionGroupArray<false>, properties });
factory.registerAlias("array_agg", "groupArray", AggregateFunctionFactory::CaseInsensitive);
factory.registerAliasUnchecked("array_concat_agg", "groupArrayArray", AggregateFunctionFactory::CaseInsensitive);
factory.registerAlias("array_agg", "groupArray", AggregateFunctionFactory::Case::Insensitive);
factory.registerAliasUnchecked("array_concat_agg", "groupArrayArray", AggregateFunctionFactory::Case::Insensitive);
factory.registerFunction("groupArraySample", { createAggregateFunctionGroupArraySample, properties });
factory.registerFunction("groupArrayLast", { createAggregateFunctionGroupArray<true>, properties });
}

View File

@ -150,8 +150,18 @@ public:
void deserialize(AggregateDataPtr __restrict place, ReadBuffer & buf, std::optional<size_t> /* version */, Arena *) const override
{
readVarUInt(this->data(place).version, buf);
this->data(place).value.read(buf);
auto & set = this->data(place).value;
auto & version = this->data(place).version;
size_t size;
readVarUInt(version, buf);
readVarUInt(size, buf);
set.reserve(size);
for (size_t i = 0; i < size; ++i)
{
int key;
readIntBinary(key, buf);
set.insert(key);
}
}
void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
@ -292,7 +302,7 @@ public:
}
return new_map;
};
auto new_map = rhs_value.size() < set.size() ? create_new_map(rhs_value, set) : create_new_map(set, rhs_value);
auto new_map = create_new_map(set, rhs_value);
set = std::move(new_map);
}
}
@ -316,11 +326,9 @@ public:
readVarUInt(version, buf);
readVarUInt(size, buf);
set.reserve(size);
UInt64 elem_version;
for (size_t i = 0; i < size; ++i)
{
auto key = readStringBinaryInto(*arena, buf);
readVarUInt(elem_version, buf);
set.insert(key);
}
}

View File

@ -277,7 +277,7 @@ void registerAggregateFunctionGroupConcat(AggregateFunctionFactory & factory)
AggregateFunctionProperties properties = { .returns_default_when_only_null = false, .is_order_dependent = true };
factory.registerFunction("groupConcat", { createAggregateFunctionGroupConcat, properties });
factory.registerAlias("group_concat", "groupConcat", AggregateFunctionFactory::CaseInsensitive);
factory.registerAlias("group_concat", "groupConcat", AggregateFunctionFactory::Case::Insensitive);
}
}

View File

@ -350,7 +350,7 @@ AggregateFunctionPtr createAggregateFunctionKolmogorovSmirnovTest(
void registerAggregateFunctionKolmogorovSmirnovTest(AggregateFunctionFactory & factory)
{
factory.registerFunction("kolmogorovSmirnovTest", createAggregateFunctionKolmogorovSmirnovTest, AggregateFunctionFactory::CaseInsensitive);
factory.registerFunction("kolmogorovSmirnovTest", createAggregateFunctionKolmogorovSmirnovTest, AggregateFunctionFactory::Case::Insensitive);
}
}

View File

@ -15,11 +15,11 @@ void registerAggregateFunctionsStatisticsSecondMoment(AggregateFunctionFactory &
factory.registerFunction("stddevPop", createAggregateFunctionStatisticsUnary<AggregateFunctionSecondMoment, StatisticsFunctionKind::stddevPop>);
/// Synonyms for compatibility.
factory.registerAlias("VAR_SAMP", "varSamp", AggregateFunctionFactory::CaseInsensitive);
factory.registerAlias("VAR_POP", "varPop", AggregateFunctionFactory::CaseInsensitive);
factory.registerAlias("STDDEV_SAMP", "stddevSamp", AggregateFunctionFactory::CaseInsensitive);
factory.registerAlias("STDDEV_POP", "stddevPop", AggregateFunctionFactory::CaseInsensitive);
factory.registerAlias("STD", "stddevPop", AggregateFunctionFactory::CaseInsensitive);
factory.registerAlias("VAR_SAMP", "varSamp", AggregateFunctionFactory::Case::Insensitive);
factory.registerAlias("VAR_POP", "varPop", AggregateFunctionFactory::Case::Insensitive);
factory.registerAlias("STDDEV_SAMP", "stddevSamp", AggregateFunctionFactory::Case::Insensitive);
factory.registerAlias("STDDEV_POP", "stddevPop", AggregateFunctionFactory::Case::Insensitive);
factory.registerAlias("STD", "stddevPop", AggregateFunctionFactory::Case::Insensitive);
}
}

View File

@ -72,7 +72,7 @@ AggregateFunctionPtr createAggregateFunctionSum(const std::string & name, const
void registerAggregateFunctionSum(AggregateFunctionFactory & factory)
{
factory.registerFunction("sum", createAggregateFunctionSum<AggregateFunctionSumSimple>, AggregateFunctionFactory::CaseInsensitive);
factory.registerFunction("sum", createAggregateFunctionSum<AggregateFunctionSumSimple>, AggregateFunctionFactory::Case::Insensitive);
factory.registerFunction("sumWithOverflow", createAggregateFunctionSum<AggregateFunctionSumWithOverflow>);
factory.registerFunction("sumKahan", createAggregateFunctionSum<AggregateFunctionSumKahan>);
}

View File

@ -535,9 +535,9 @@ void registerAggregateFunctionTopK(AggregateFunctionFactory & factory)
factory.registerFunction("topK", { createAggregateFunctionTopK<false, false>, properties });
factory.registerFunction("topKWeighted", { createAggregateFunctionTopK<true, false>, properties });
factory.registerFunction("approx_top_k", { createAggregateFunctionTopK<false, true>, properties }, AggregateFunctionFactory::CaseInsensitive);
factory.registerFunction("approx_top_sum", { createAggregateFunctionTopK<true, true>, properties }, AggregateFunctionFactory::CaseInsensitive);
factory.registerAlias("approx_top_count", "approx_top_k", AggregateFunctionFactory::CaseInsensitive);
factory.registerFunction("approx_top_k", { createAggregateFunctionTopK<false, true>, properties }, AggregateFunctionFactory::Case::Insensitive);
factory.registerFunction("approx_top_sum", { createAggregateFunctionTopK<true, true>, properties }, AggregateFunctionFactory::Case::Insensitive);
factory.registerAlias("approx_top_count", "approx_top_k", AggregateFunctionFactory::Case::Insensitive);
}
}

View File

@ -195,8 +195,8 @@ AggregateFunctionPtr createAggregateFunctionMinMax(
void registerAggregateFunctionsMinMax(AggregateFunctionFactory & factory)
{
factory.registerFunction("min", createAggregateFunctionMinMax<true>, AggregateFunctionFactory::CaseInsensitive);
factory.registerFunction("max", createAggregateFunctionMinMax<false>, AggregateFunctionFactory::CaseInsensitive);
factory.registerFunction("min", createAggregateFunctionMinMax<true>, AggregateFunctionFactory::Case::Insensitive);
factory.registerFunction("max", createAggregateFunctionMinMax<false>, AggregateFunctionFactory::Case::Insensitive);
}
}

View File

@ -2919,6 +2919,17 @@ ProjectionNames QueryAnalyzer::resolveFunction(QueryTreeNodePtr & node, Identifi
resolveExpressionNode(in_second_argument, scope, false /*allow_lambda_expression*/, true /*allow_table_expression*/);
}
/// Edge case when the first argument of IN is a scalar subquery.
auto & in_first_argument = function_in_arguments_nodes[0];
auto first_argument_type = in_first_argument->getNodeType();
if (first_argument_type == QueryTreeNodeType::QUERY || first_argument_type == QueryTreeNodeType::UNION)
{
IdentifierResolveScope subquery_scope(in_first_argument, &scope /*parent_scope*/);
subquery_scope.subquery_depth = scope.subquery_depth + 1;
evaluateScalarSubqueryIfNeeded(in_first_argument, subquery_scope);
}
}
/// Initialize function argument columns

View File

@ -419,9 +419,6 @@ dbms_target_link_libraries (
boost::circular_buffer
boost::heap)
target_include_directories(clickhouse_common_io PUBLIC "${CMAKE_CURRENT_BINARY_DIR}/Core/include") # uses some includes from core
dbms_target_include_directories(PUBLIC "${CMAKE_CURRENT_BINARY_DIR}/Core/include")
target_link_libraries(clickhouse_common_io PUBLIC
ch_contrib::miniselect
ch_contrib::pdqsort)

View File

@ -21,6 +21,7 @@
#include <Common/StringUtils.h>
#include <Common/filesystemHelpers.h>
#include <Common/NetException.h>
#include <Common/SignalHandlers.h>
#include <Common/tryGetFileNameByFileDescriptor.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnsNumber.h>
@ -80,6 +81,10 @@
#include <Common/config_version.h>
#include "config.h"
#if USE_GWP_ASAN
# include <Common/GWPAsan.h>
#endif
namespace fs = std::filesystem;
using namespace std::literals;
@ -302,7 +307,13 @@ public:
};
ClientBase::~ClientBase() = default;
ClientBase::~ClientBase()
{
writeSignalIDtoSignalPipe(SignalListener::StopThread);
signal_listener_thread.join();
HandledSignals::instance().reset();
}
ClientBase::ClientBase(
int in_fd_,
int out_fd_,
@ -3072,6 +3083,8 @@ void ClientBase::init(int argc, char ** argv)
("max_memory_usage_in_client", po::value<std::string>(), "Set memory limit in client/local server")
("fuzzer-args", po::value<std::string>(), "Command line arguments for the LLVM's libFuzzer driver. Only relevant if the application is compiled with libFuzzer.")
("client_logs_file", po::value<std::string>(), "Path to a file for writing client logs. Currently we only have fatal logs (when the client crashes)")
;
addOptions(options_description);
@ -3236,6 +3249,30 @@ void ClientBase::init(int argc, char ** argv)
total_memory_tracker.setDescription("(total)");
total_memory_tracker.setMetric(CurrentMetrics::MemoryTracking);
}
/// Print stacktrace in case of crash
HandledSignals::instance().setupTerminateHandler();
HandledSignals::instance().setupCommonDeadlySignalHandlers();
/// We don't set up signal handlers for SIGINT, SIGQUIT, SIGTERM because we don't
/// have an option for the client to shut down gracefully.
fatal_channel_ptr = new Poco::SplitterChannel;
fatal_console_channel_ptr = new Poco::ConsoleChannel;
fatal_channel_ptr->addChannel(fatal_console_channel_ptr);
if (options.count("client_logs_file"))
{
fatal_file_channel_ptr = new Poco::SimpleFileChannel(options["client_logs_file"].as<std::string>());
fatal_channel_ptr->addChannel(fatal_file_channel_ptr);
}
fatal_log = createLogger("ClientBase", fatal_channel_ptr.get(), Poco::Message::PRIO_FATAL);
signal_listener = std::make_unique<SignalListener>(nullptr, fatal_log);
signal_listener_thread.start(*signal_listener);
#if USE_GWP_ASAN
GWPAsan::initFinished();
#endif
}
}

View File

@ -12,6 +12,9 @@
#include <Core/ExternalTable.h>
#include <Core/Settings.h>
#include <Poco/Util/Application.h>
#include <Poco/ConsoleChannel.h>
#include <Poco/SimpleFileChannel.h>
#include <Poco/SplitterChannel.h>
#include <Interpreters/Context.h>
#include <Client/Suggest.h>
#include <boost/program_options.hpp>
@ -212,6 +215,13 @@ protected:
SharedContextHolder shared_context;
ContextMutablePtr global_context;
LoggerPtr fatal_log;
Poco::AutoPtr<Poco::SplitterChannel> fatal_channel_ptr;
Poco::AutoPtr<Poco::Channel> fatal_console_channel_ptr;
Poco::AutoPtr<Poco::Channel> fatal_file_channel_ptr;
Poco::Thread signal_listener_thread;
std::unique_ptr<Poco::Runnable> signal_listener;
bool is_interactive = false; /// Use either interactive line editing interface or batch mode.
bool is_multiquery = false;
bool delayed_interactive = false;

View File

@ -366,13 +366,10 @@ void ColumnAggregateFunction::updateHashWithValue(size_t n, SipHash & hash) cons
hash.update(wbuf.str().c_str(), wbuf.str().size());
}
void ColumnAggregateFunction::updateWeakHash32(WeakHash32 & hash) const
WeakHash32 ColumnAggregateFunction::getWeakHash32() const
{
auto s = data.size();
if (hash.getData().size() != data.size())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Size of WeakHash32 does not match size of column: "
"column size is {}, hash size is {}", std::to_string(s), hash.getData().size());
WeakHash32 hash(s);
auto & hash_data = hash.getData();
std::vector<UInt8> v;
@ -383,6 +380,8 @@ void ColumnAggregateFunction::updateWeakHash32(WeakHash32 & hash) const
wbuf.finalize();
hash_data[i] = ::updateWeakHash32(v.data(), v.size(), hash_data[i]);
}
return hash;
}
void ColumnAggregateFunction::updateHashFast(SipHash & hash) const

View File

@ -177,7 +177,7 @@ public:
void updateHashWithValue(size_t n, SipHash & hash) const override;
void updateWeakHash32(WeakHash32 & hash) const override;
WeakHash32 getWeakHash32() const override;
void updateHashFast(SipHash & hash) const override;

View File

@ -271,15 +271,12 @@ void ColumnArray::updateHashWithValue(size_t n, SipHash & hash) const
getData().updateHashWithValue(offset + i, hash);
}
void ColumnArray::updateWeakHash32(WeakHash32 & hash) const
WeakHash32 ColumnArray::getWeakHash32() const
{
auto s = offsets->size();
if (hash.getData().size() != s)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Size of WeakHash32 does not match size of column: "
"column size is {}, hash size is {}", s, hash.getData().size());
WeakHash32 hash(s);
WeakHash32 internal_hash(data->size());
data->updateWeakHash32(internal_hash);
WeakHash32 internal_hash = data->getWeakHash32();
Offset prev_offset = 0;
const auto & offsets_data = getOffsets();
@ -300,6 +297,8 @@ void ColumnArray::updateWeakHash32(WeakHash32 & hash) const
prev_offset = offsets_data[i];
}
return hash;
}
void ColumnArray::updateHashFast(SipHash & hash) const

View File

@ -82,7 +82,7 @@ public:
const char * deserializeAndInsertFromArena(const char * pos) override;
const char * skipSerializedInArena(const char * pos) const override;
void updateHashWithValue(size_t n, SipHash & hash) const override;
void updateWeakHash32(WeakHash32 & hash) const override;
WeakHash32 getWeakHash32() const override;
void updateHashFast(SipHash & hash) const override;
#if !defined(ABORT_ON_LOGICAL_ERROR)
void insertRangeFrom(const IColumn & src, size_t start, size_t length) override;

View File

@ -3,6 +3,7 @@
#include <optional>
#include <Core/Field.h>
#include <Columns/IColumn.h>
#include <Common/WeakHash.h>
#include <IO/BufferWithOwnMemory.h>
@ -98,7 +99,7 @@ public:
const char * deserializeAndInsertFromArena(const char *) override { throwMustBeDecompressed(); }
const char * skipSerializedInArena(const char *) const override { throwMustBeDecompressed(); }
void updateHashWithValue(size_t, SipHash &) const override { throwMustBeDecompressed(); }
void updateWeakHash32(WeakHash32 &) const override { throwMustBeDecompressed(); }
WeakHash32 getWeakHash32() const override { throwMustBeDecompressed(); }
void updateHashFast(SipHash &) const override { throwMustBeDecompressed(); }
ColumnPtr filter(const Filter &, ssize_t) const override { throwMustBeDecompressed(); }
void expand(const Filter &, bool) override { throwMustBeDecompressed(); }

View File

@ -137,18 +137,10 @@ void ColumnConst::updatePermutation(PermutationSortDirection /*direction*/, Perm
{
}
void ColumnConst::updateWeakHash32(WeakHash32 & hash) const
WeakHash32 ColumnConst::getWeakHash32() const
{
if (hash.getData().size() != s)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Size of WeakHash32 does not match size of column: "
"column size is {}, hash size is {}", std::to_string(s), std::to_string(hash.getData().size()));
WeakHash32 element_hash(1);
data->updateWeakHash32(element_hash);
size_t data_hash = element_hash.getData()[0];
for (auto & value : hash.getData())
value = static_cast<UInt32>(intHashCRC32(data_hash, value));
WeakHash32 element_hash = data->getWeakHash32();
return WeakHash32(s, element_hash.getData()[0]);
}
void ColumnConst::compareColumn(

View File

@ -204,7 +204,7 @@ public:
data->updateHashWithValue(0, hash);
}
void updateWeakHash32(WeakHash32 & hash) const override;
WeakHash32 getWeakHash32() const override;
void updateHashFast(SipHash & hash) const override
{

View File

@ -28,7 +28,6 @@ namespace ErrorCodes
extern const int PARAMETER_OUT_OF_BOUND;
extern const int SIZES_OF_COLUMNS_DOESNT_MATCH;
extern const int NOT_IMPLEMENTED;
extern const int LOGICAL_ERROR;
}
template <is_decimal T>
@ -76,13 +75,10 @@ void ColumnDecimal<T>::updateHashWithValue(size_t n, SipHash & hash) const
}
template <is_decimal T>
void ColumnDecimal<T>::updateWeakHash32(WeakHash32 & hash) const
WeakHash32 ColumnDecimal<T>::getWeakHash32() const
{
auto s = data.size();
if (hash.getData().size() != s)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Size of WeakHash32 does not match size of column: "
"column size is {}, hash size is {}", std::to_string(s), std::to_string(hash.getData().size()));
WeakHash32 hash(s);
const T * begin = data.data();
const T * end = begin + s;
@ -94,6 +90,8 @@ void ColumnDecimal<T>::updateWeakHash32(WeakHash32 & hash) const
++begin;
++hash_data;
}
return hash;
}
template <is_decimal T>

View File

@ -102,7 +102,7 @@ public:
const char * deserializeAndInsertFromArena(const char * pos) override;
const char * skipSerializedInArena(const char * pos) const override;
void updateHashWithValue(size_t n, SipHash & hash) const override;
void updateWeakHash32(WeakHash32 & hash) const override;
WeakHash32 getWeakHash32() const override;
void updateHashFast(SipHash & hash) const override;
#if !defined(ABORT_ON_LOGICAL_ERROR)
int compareAt(size_t n, size_t m, const IColumn & rhs_, int nan_direction_hint) const override;

View File

@ -4,6 +4,7 @@
#include <Columns/ColumnVector.h>
#include <Columns/ColumnVariant.h>
#include <DataTypes/IDataType.h>
#include <Common/WeakHash.h>
namespace DB
@ -174,9 +175,9 @@ public:
void updateHashWithValue(size_t n, SipHash & hash) const override;
void updateWeakHash32(WeakHash32 & hash) const override
WeakHash32 getWeakHash32() const override
{
variant_column->updateWeakHash32(hash);
return variant_column->getWeakHash32();
}
void updateHashFast(SipHash & hash) const override

View File

@ -137,14 +137,10 @@ void ColumnFixedString::updateHashWithValue(size_t index, SipHash & hash) const
hash.update(reinterpret_cast<const char *>(&chars[n * index]), n);
}
void ColumnFixedString::updateWeakHash32(WeakHash32 & hash) const
WeakHash32 ColumnFixedString::getWeakHash32() const
{
auto s = size();
if (hash.getData().size() != s)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Size of WeakHash32 does not match size of column: "
"column size is {}, "
"hash size is {}", std::to_string(s), std::to_string(hash.getData().size()));
WeakHash32 hash(s);
const UInt8 * pos = chars.data();
UInt32 * hash_data = hash.getData().data();
@ -156,6 +152,8 @@ void ColumnFixedString::updateWeakHash32(WeakHash32 & hash) const
pos += n;
++hash_data;
}
return hash;
}
void ColumnFixedString::updateHashFast(SipHash & hash) const

View File

@ -133,7 +133,7 @@ public:
void updateHashWithValue(size_t index, SipHash & hash) const override;
void updateWeakHash32(WeakHash32 & hash) const override;
WeakHash32 getWeakHash32() const override;
void updateHashFast(SipHash & hash) const override;

View File

@ -4,6 +4,7 @@
#include <Core/NamesAndTypes.h>
#include <Core/ColumnsWithTypeAndName.h>
#include <Columns/IColumn.h>
#include <Common/WeakHash.h>
namespace DB
@ -130,9 +131,9 @@ public:
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "updateHashWithValue is not implemented for {}", getName());
}
void updateWeakHash32(WeakHash32 &) const override
WeakHash32 getWeakHash32() const override
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "updateWeakHash32 is not implemented for {}", getName());
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "getWeakHash32 is not implemented for {}", getName());
}
void updateHashFast(SipHash &) const override

View File

@ -7,8 +7,7 @@
#include <Common/HashTable/HashMap.h>
#include <Common/WeakHash.h>
#include <Common/assert_cast.h>
#include "Storages/IndicesDescription.h"
#include "base/types.h"
#include <base/types.h>
#include <base/sort.h>
#include <base/scope_guard.h>
@ -320,19 +319,10 @@ const char * ColumnLowCardinality::skipSerializedInArena(const char * pos) const
return getDictionary().skipSerializedInArena(pos);
}
void ColumnLowCardinality::updateWeakHash32(WeakHash32 & hash) const
WeakHash32 ColumnLowCardinality::getWeakHash32() const
{
auto s = size();
if (hash.getData().size() != s)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Size of WeakHash32 does not match size of column: "
"column size is {}, hash size is {}", std::to_string(s), std::to_string(hash.getData().size()));
const auto & dict = getDictionary().getNestedColumn();
WeakHash32 dict_hash(dict->size());
dict->updateWeakHash32(dict_hash);
idx.updateWeakHash(hash, dict_hash);
WeakHash32 dict_hash = getDictionary().getNestedColumn()->getWeakHash32();
return idx.getWeakHash(dict_hash);
}
void ColumnLowCardinality::updateHashFast(SipHash & hash) const
@ -832,10 +822,11 @@ bool ColumnLowCardinality::Index::containsDefault() const
return contains;
}
void ColumnLowCardinality::Index::updateWeakHash(WeakHash32 & hash, WeakHash32 & dict_hash) const
WeakHash32 ColumnLowCardinality::Index::getWeakHash(const WeakHash32 & dict_hash) const
{
WeakHash32 hash(positions->size());
auto & hash_data = hash.getData();
auto & dict_hash_data = dict_hash.getData();
const auto & dict_hash_data = dict_hash.getData();
auto update_weak_hash = [&](auto x)
{
@ -844,10 +835,11 @@ void ColumnLowCardinality::Index::updateWeakHash(WeakHash32 & hash, WeakHash32 &
auto size = data.size();
for (size_t i = 0; i < size; ++i)
hash_data[i] = static_cast<UInt32>(intHashCRC32(dict_hash_data[data[i]], hash_data[i]));
hash_data[i] = dict_hash_data[data[i]];
};
callForType(std::move(update_weak_hash), size_of_type);
return hash;
}
void ColumnLowCardinality::Index::collectSerializedValueSizes(

View File

@ -111,7 +111,7 @@ public:
getDictionary().updateHashWithValue(getIndexes().getUInt(n), hash);
}
void updateWeakHash32(WeakHash32 & hash) const override;
WeakHash32 getWeakHash32() const override;
void updateHashFast(SipHash &) const override;
@ -325,7 +325,7 @@ public:
bool containsDefault() const;
void updateWeakHash(WeakHash32 & hash, WeakHash32 & dict_hash) const;
WeakHash32 getWeakHash(const WeakHash32 & dict_hash) const;
void collectSerializedValueSizes(PaddedPODArray<UInt64> & sizes, const PaddedPODArray<UInt64> & dict_sizes) const;

View File

@ -143,9 +143,9 @@ void ColumnMap::updateHashWithValue(size_t n, SipHash & hash) const
nested->updateHashWithValue(n, hash);
}
void ColumnMap::updateWeakHash32(WeakHash32 & hash) const
WeakHash32 ColumnMap::getWeakHash32() const
{
nested->updateWeakHash32(hash);
return nested->getWeakHash32();
}
void ColumnMap::updateHashFast(SipHash & hash) const

View File

@ -64,7 +64,7 @@ public:
const char * deserializeAndInsertFromArena(const char * pos) override;
const char * skipSerializedInArena(const char * pos) const override;
void updateHashWithValue(size_t n, SipHash & hash) const override;
void updateWeakHash32(WeakHash32 & hash) const override;
WeakHash32 getWeakHash32() const override;
void updateHashFast(SipHash & hash) const override;
#if !defined(ABORT_ON_LOGICAL_ERROR)

View File

@ -56,25 +56,21 @@ void ColumnNullable::updateHashWithValue(size_t n, SipHash & hash) const
getNestedColumn().updateHashWithValue(n, hash);
}
void ColumnNullable::updateWeakHash32(WeakHash32 & hash) const
WeakHash32 ColumnNullable::getWeakHash32() const
{
auto s = size();
if (hash.getData().size() != s)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Size of WeakHash32 does not match size of column: "
"column size is {}, hash size is {}", std::to_string(s), std::to_string(hash.getData().size()));
WeakHash32 old_hash = hash;
nested_column->updateWeakHash32(hash);
WeakHash32 hash = nested_column->getWeakHash32();
const auto & null_map_data = getNullMapData();
auto & hash_data = hash.getData();
auto & old_hash_data = old_hash.getData();
/// Use old data for nulls.
/// Use default for nulls.
for (size_t row = 0; row < s; ++row)
if (null_map_data[row])
hash_data[row] = old_hash_data[row];
hash_data[row] = WeakHash32::kDefaultInitialValue;
return hash;
}
void ColumnNullable::updateHashFast(SipHash & hash) const

View File

@ -133,7 +133,7 @@ public:
void protect() override;
ColumnPtr replicate(const Offsets & replicate_offsets) const override;
void updateHashWithValue(size_t n, SipHash & hash) const override;
void updateWeakHash32(WeakHash32 & hash) const override;
WeakHash32 getWeakHash32() const override;
void updateHashFast(SipHash & hash) const override;
void getExtremes(Field & min, Field & max) const override;
// Special function for nullable minmax index

View File

@ -5,6 +5,7 @@
#include <Core/Names.h>
#include <DataTypes/Serializations/SubcolumnsTree.h>
#include <Common/PODArray.h>
#include <Common/WeakHash.h>
#include <DataTypes/IDataType.h>
@ -252,7 +253,7 @@ public:
const char * deserializeAndInsertFromArena(const char *) override { throwMustBeConcrete(); }
const char * skipSerializedInArena(const char *) const override { throwMustBeConcrete(); }
void updateHashWithValue(size_t, SipHash &) const override { throwMustBeConcrete(); }
void updateWeakHash32(WeakHash32 &) const override { throwMustBeConcrete(); }
WeakHash32 getWeakHash32() const override { throwMustBeConcrete(); }
void updateHashFast(SipHash & hash) const override;
void expand(const Filter &, bool) override { throwMustBeConcrete(); }
bool hasEqualValues() const override { throwMustBeConcrete(); }

View File

@ -678,20 +678,22 @@ void ColumnSparse::updateHashWithValue(size_t n, SipHash & hash) const
values->updateHashWithValue(getValueIndex(n), hash);
}
void ColumnSparse::updateWeakHash32(WeakHash32 & hash) const
WeakHash32 ColumnSparse::getWeakHash32() const
{
if (hash.getData().size() != _size)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Size of WeakHash32 does not match size of column: "
"column size is {}, hash size is {}", _size, hash.getData().size());
WeakHash32 values_hash = values->getWeakHash32();
WeakHash32 hash(size());
auto & hash_data = hash.getData();
auto & values_hash_data = values_hash.getData();
auto offset_it = begin();
auto & hash_data = hash.getData();
for (size_t i = 0; i < _size; ++i, ++offset_it)
{
size_t value_index = offset_it.getValueIndex();
auto data_ref = values->getDataAt(value_index);
hash_data[i] = ::updateWeakHash32(reinterpret_cast<const UInt8 *>(data_ref.data), data_ref.size, hash_data[i]);
hash_data[i] = values_hash_data[value_index];
}
return hash;
}
void ColumnSparse::updateHashFast(SipHash & hash) const

View File

@ -139,7 +139,7 @@ public:
void protect() override;
ColumnPtr replicate(const Offsets & replicate_offsets) const override;
void updateHashWithValue(size_t n, SipHash & hash) const override;
void updateWeakHash32(WeakHash32 & hash) const override;
WeakHash32 getWeakHash32() const override;
void updateHashFast(SipHash & hash) const override;
void getExtremes(Field & min, Field & max) const override;

View File

@ -108,13 +108,10 @@ MutableColumnPtr ColumnString::cloneResized(size_t to_size) const
return res;
}
void ColumnString::updateWeakHash32(WeakHash32 & hash) const
WeakHash32 ColumnString::getWeakHash32() const
{
auto s = offsets.size();
if (hash.getData().size() != s)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Size of WeakHash32 does not match size of column: "
"column size is {}, hash size is {}", std::to_string(s), std::to_string(hash.getData().size()));
WeakHash32 hash(s);
const UInt8 * pos = chars.data();
UInt32 * hash_data = hash.getData().data();
@ -130,6 +127,8 @@ void ColumnString::updateWeakHash32(WeakHash32 & hash) const
prev_offset = offset;
++hash_data;
}
return hash;
}

View File

@ -212,7 +212,7 @@ public:
hash.update(reinterpret_cast<const char *>(&chars[offset]), string_size);
}
void updateWeakHash32(WeakHash32 & hash) const override;
WeakHash32 getWeakHash32() const override;
void updateHashFast(SipHash & hash) const override
{

View File

@ -201,6 +201,7 @@ bool ColumnTuple::tryInsert(const Field & x)
return false;
}
}
++column_length;
return true;
}
@ -236,6 +237,7 @@ void ColumnTuple::doInsertManyFrom(const IColumn & src, size_t position, size_t
for (size_t i = 0; i < tuple_size; ++i)
columns[i]->insertManyFrom(*src_tuple.columns[i], position, length);
column_length += length;
}
void ColumnTuple::insertDefault()
@ -308,16 +310,15 @@ void ColumnTuple::updateHashWithValue(size_t n, SipHash & hash) const
column->updateHashWithValue(n, hash);
}
void ColumnTuple::updateWeakHash32(WeakHash32 & hash) const
WeakHash32 ColumnTuple::getWeakHash32() const
{
auto s = size();
if (hash.getData().size() != s)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Size of WeakHash32 does not match size of column: "
"column size is {}, hash size is {}", std::to_string(s), std::to_string(hash.getData().size()));
WeakHash32 hash(s);
for (const auto & column : columns)
column->updateWeakHash32(hash);
hash.update(column->getWeakHash32());
return hash;
}
void ColumnTuple::updateHashFast(SipHash & hash) const

View File

@ -81,7 +81,7 @@ public:
const char * deserializeAndInsertFromArena(const char * pos) override;
const char * skipSerializedInArena(const char * pos) const override;
void updateHashWithValue(size_t n, SipHash & hash) const override;
void updateWeakHash32(WeakHash32 & hash) const override;
WeakHash32 getWeakHash32() const override;
void updateHashFast(SipHash & hash) const override;
#if !defined(ABORT_ON_LOGICAL_ERROR)
void insertRangeFrom(const IColumn & src, size_t start, size_t length) override;

View File

@ -789,36 +789,26 @@ void ColumnVariant::updateHashWithValue(size_t n, SipHash & hash) const
variants[localDiscriminatorByGlobal(global_discr)]->updateHashWithValue(offsetAt(n), hash);
}
void ColumnVariant::updateWeakHash32(WeakHash32 & hash) const
WeakHash32 ColumnVariant::getWeakHash32() const
{
auto s = size();
if (hash.getData().size() != s)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Size of WeakHash32 does not match size of column: "
"column size is {}, hash size is {}", std::to_string(s), std::to_string(hash.getData().size()));
/// If we have only NULLs, return a hash filled with default values.
if (hasOnlyNulls())
return;
return WeakHash32(s);
/// Optimization for the case when there is only one non-empty variant and no NULLs.
/// In this case we can just calculate the weak hash for this variant.
if (auto non_empty_local_discr = getLocalDiscriminatorOfOneNoneEmptyVariantNoNulls())
{
variants[*non_empty_local_discr]->updateWeakHash32(hash);
return;
}
return variants[*non_empty_local_discr]->getWeakHash32();
/// Calculate weak hash for all variants.
std::vector<WeakHash32> nested_hashes;
for (const auto & variant : variants)
{
WeakHash32 nested_hash(variant->size());
variant->updateWeakHash32(nested_hash);
nested_hashes.emplace_back(std::move(nested_hash));
}
nested_hashes.emplace_back(variant->getWeakHash32());
/// For each row hash is a hash of corresponding row from corresponding variant.
WeakHash32 hash(s);
auto & hash_data = hash.getData();
const auto & local_discriminators_data = getLocalDiscriminators();
const auto & offsets_data = getOffsets();
@ -827,11 +817,10 @@ void ColumnVariant::updateWeakHash32(WeakHash32 & hash) const
Discriminator discr = local_discriminators_data[i];
/// Update hash only for non-NULL values
if (discr != NULL_DISCRIMINATOR)
{
auto nested_hash = nested_hashes[local_discriminators_data[i]].getData()[offsets_data[i]];
hash_data[i] = static_cast<UInt32>(hashCRC32(nested_hash, hash_data[i]));
}
hash_data[i] = nested_hashes[discr].getData()[offsets_data[i]];
}
return hash;
}
void ColumnVariant::updateHashFast(SipHash & hash) const

View File

@ -213,7 +213,7 @@ public:
const char * deserializeVariantAndInsertFromArena(Discriminator global_discr, const char * pos);
const char * skipSerializedInArena(const char * pos) const override;
void updateHashWithValue(size_t n, SipHash & hash) const override;
void updateWeakHash32(WeakHash32 & hash) const override;
WeakHash32 getWeakHash32() const override;
void updateHashFast(SipHash & hash) const override;
ColumnPtr filter(const Filter & filt, ssize_t result_size_hint) const override;
void expand(const Filter & mask, bool inverted) override;

View File

@ -73,13 +73,10 @@ void ColumnVector<T>::updateHashWithValue(size_t n, SipHash & hash) const
}
template <typename T>
void ColumnVector<T>::updateWeakHash32(WeakHash32 & hash) const
WeakHash32 ColumnVector<T>::getWeakHash32() const
{
auto s = data.size();
if (hash.getData().size() != s)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Size of WeakHash32 does not match size of column: "
"column size is {}, hash size is {}", std::to_string(s), std::to_string(hash.getData().size()));
WeakHash32 hash(s);
const T * begin = data.data();
const T * end = begin + s;
@ -91,6 +88,8 @@ void ColumnVector<T>::updateWeakHash32(WeakHash32 & hash) const
++begin;
++hash_data;
}
return hash;
}
template <typename T>

View File

@ -114,7 +114,7 @@ public:
void updateHashWithValue(size_t n, SipHash & hash) const override;
void updateWeakHash32(WeakHash32 & hash) const override;
WeakHash32 getWeakHash32() const override;
void updateHashFast(SipHash & hash) const override;

View File

@ -300,10 +300,10 @@ public:
/// passed bytes to hash must identify sequence of values unambiguously.
virtual void updateHashWithValue(size_t n, SipHash & hash) const = 0;
/// Update hash function value. Hash is calculated for each element.
/// Get hash function value. Hash is calculated for each element.
/// It's a fast weak hash function. Mainly needed to scatter data between threads.
/// WeakHash32 must have the same size as the column.
virtual void updateWeakHash32(WeakHash32 & hash) const = 0;
virtual WeakHash32 getWeakHash32() const = 0;
/// Update state of hash with all column.
virtual void updateHashFast(SipHash & hash) const = 0;
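To illustrate the contract above, a self-contained sketch (plain standard C++ with stand-in types, not ClickHouse code) of how a per-row weak hash returned by `getWeakHash32()` is typically used to scatter rows between threads:
``` cpp
// Stand-in for WeakHash32::getData(): one 32-bit hash value per row.
#include <cstddef>
#include <cstdint>
#include <vector>

using WeakHashData = std::vector<uint32_t>;

// Assign each row to one of `num_buckets` worker threads by its weak hash.
// The hash only needs to spread rows evenly; it is not cryptographic.
std::vector<size_t> scatterRows(const WeakHashData & hash, size_t num_buckets)
{
    std::vector<size_t> bucket_of_row(hash.size());
    for (size_t row = 0; row < hash.size(); ++row)
        bucket_of_row[row] = hash[row] % num_buckets;
    return bucket_of_row;
}
```
Returning the hash by value instead of updating a caller-allocated one also lets composite columns combine child hashes explicitly, as in the `hash.update(column->getWeakHash32())` call in the ColumnTuple hunk above.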

View File

@ -1,6 +1,7 @@
#pragma once
#include <Columns/IColumn.h>
#include <Common/WeakHash.h>
namespace DB
@ -63,8 +64,9 @@ public:
{
}
void updateWeakHash32(WeakHash32 & /*hash*/) const override
WeakHash32 getWeakHash32() const override
{
return WeakHash32(s);
}
void updateHashFast(SipHash & /*hash*/) const override

View File

@ -1,6 +1,7 @@
#pragma once
#include <optional>
#include <Columns/IColumn.h>
#include <Common/WeakHash.h>
namespace DB
{
@ -166,9 +167,9 @@ public:
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method scatter is not supported for ColumnUnique.");
}
void updateWeakHash32(WeakHash32 &) const override
WeakHash32 getWeakHash32() const override
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method updateWeakHash32 is not supported for ColumnUnique.");
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method getWeakHash32 is not supported for ColumnUnique.");
}
void updateHashFast(SipHash &) const override

View File

@ -60,8 +60,7 @@ TEST(WeakHash32, ColumnVectorU8)
data.push_back(i);
}
WeakHash32 hash(col->size());
col->updateWeakHash32(hash);
WeakHash32 hash = col->getWeakHash32();
checkColumn(hash.getData(), col->getData());
}
@ -77,8 +76,7 @@ TEST(WeakHash32, ColumnVectorI8)
data.push_back(i);
}
WeakHash32 hash(col->size());
col->updateWeakHash32(hash);
WeakHash32 hash = col->getWeakHash32();
checkColumn(hash.getData(), col->getData());
}
@ -94,8 +92,7 @@ TEST(WeakHash32, ColumnVectorU16)
data.push_back(i);
}
WeakHash32 hash(col->size());
col->updateWeakHash32(hash);
WeakHash32 hash = col->getWeakHash32();
checkColumn(hash.getData(), col->getData());
}
@ -111,8 +108,7 @@ TEST(WeakHash32, ColumnVectorI16)
data.push_back(i);
}
WeakHash32 hash(col->size());
col->updateWeakHash32(hash);
WeakHash32 hash = col->getWeakHash32();
checkColumn(hash.getData(), col->getData());
}
@ -128,8 +124,7 @@ TEST(WeakHash32, ColumnVectorU32)
data.push_back(i << 16u);
}
WeakHash32 hash(col->size());
col->updateWeakHash32(hash);
WeakHash32 hash = col->getWeakHash32();
checkColumn(hash.getData(), col->getData());
}
@ -145,8 +140,7 @@ TEST(WeakHash32, ColumnVectorI32)
data.push_back(i << 16);
}
WeakHash32 hash(col->size());
col->updateWeakHash32(hash);
WeakHash32 hash = col->getWeakHash32();
checkColumn(hash.getData(), col->getData());
}
@ -162,8 +156,7 @@ TEST(WeakHash32, ColumnVectorU64)
data.push_back(i << 32u);
}
WeakHash32 hash(col->size());
col->updateWeakHash32(hash);
WeakHash32 hash = col->getWeakHash32();
checkColumn(hash.getData(), col->getData());
}
@ -179,8 +172,7 @@ TEST(WeakHash32, ColumnVectorI64)
data.push_back(i << 32);
}
WeakHash32 hash(col->size());
col->updateWeakHash32(hash);
WeakHash32 hash = col->getWeakHash32();
checkColumn(hash.getData(), col->getData());
}
@ -204,8 +196,7 @@ TEST(WeakHash32, ColumnVectorU128)
}
}
WeakHash32 hash(col->size());
col->updateWeakHash32(hash);
WeakHash32 hash = col->getWeakHash32();
checkColumn(hash.getData(), eq_data);
}
@ -221,8 +212,7 @@ TEST(WeakHash32, ColumnVectorI128)
data.push_back(i << 32);
}
WeakHash32 hash(col->size());
col->updateWeakHash32(hash);
WeakHash32 hash = col->getWeakHash32();
checkColumn(hash.getData(), col->getData());
}
@ -238,8 +228,7 @@ TEST(WeakHash32, ColumnDecimal32)
data.push_back(i << 16);
}
WeakHash32 hash(col->size());
col->updateWeakHash32(hash);
WeakHash32 hash = col->getWeakHash32();
checkColumn(hash.getData(), col->getData());
}
@ -255,8 +244,7 @@ TEST(WeakHash32, ColumnDecimal64)
data.push_back(i << 32);
}
WeakHash32 hash(col->size());
col->updateWeakHash32(hash);
WeakHash32 hash = col->getWeakHash32();
checkColumn(hash.getData(), col->getData());
}
@ -272,8 +260,7 @@ TEST(WeakHash32, ColumnDecimal128)
data.push_back(i << 32);
}
WeakHash32 hash(col->size());
col->updateWeakHash32(hash);
WeakHash32 hash = col->getWeakHash32();
checkColumn(hash.getData(), col->getData());
}
@ -294,8 +281,7 @@ TEST(WeakHash32, ColumnString1)
}
}
WeakHash32 hash(col->size());
col->updateWeakHash32(hash);
WeakHash32 hash = col->getWeakHash32();
checkColumn(hash.getData(), data);
}
@ -331,8 +317,7 @@ TEST(WeakHash32, ColumnString2)
}
}
WeakHash32 hash(col->size());
col->updateWeakHash32(hash);
WeakHash32 hash = col->getWeakHash32();
checkColumn(hash.getData(), data);
}
@ -369,8 +354,7 @@ TEST(WeakHash32, ColumnString3)
}
}
WeakHash32 hash(col->size());
col->updateWeakHash32(hash);
WeakHash32 hash = col->getWeakHash32();
checkColumn(hash.getData(), data);
}
@ -397,8 +381,7 @@ TEST(WeakHash32, ColumnFixedString)
}
}
WeakHash32 hash(col->size());
col->updateWeakHash32(hash);
WeakHash32 hash = col->getWeakHash32();
checkColumn(hash.getData(), data);
}
@ -444,8 +427,7 @@ TEST(WeakHash32, ColumnArray)
auto col_arr = ColumnArray::create(std::move(val), std::move(off));
WeakHash32 hash(col_arr->size());
col_arr->updateWeakHash32(hash);
WeakHash32 hash = col_arr->getWeakHash32();
checkColumn(hash.getData(), eq_data);
}
@ -479,8 +461,7 @@ TEST(WeakHash32, ColumnArray2)
auto col_arr = ColumnArray::create(std::move(val), std::move(off));
WeakHash32 hash(col_arr->size());
col_arr->updateWeakHash32(hash);
WeakHash32 hash = col_arr->getWeakHash32();
checkColumn(hash.getData(), eq_data);
}
@ -536,8 +517,7 @@ TEST(WeakHash32, ColumnArrayArray)
auto col_arr = ColumnArray::create(std::move(val), std::move(off));
auto col_arr_arr = ColumnArray::create(std::move(col_arr), std::move(off2));
WeakHash32 hash(col_arr_arr->size());
col_arr_arr->updateWeakHash32(hash);
WeakHash32 hash = col_arr_arr->getWeakHash32();
checkColumn(hash.getData(), eq_data);
}
@ -555,8 +535,7 @@ TEST(WeakHash32, ColumnConst)
auto col_const = ColumnConst::create(std::move(inner_col), 256);
WeakHash32 hash(col_const->size());
col_const->updateWeakHash32(hash);
WeakHash32 hash = col_const->getWeakHash32();
checkColumn(hash.getData(), data);
}
@ -576,8 +555,7 @@ TEST(WeakHash32, ColumnLowcardinality)
}
}
WeakHash32 hash(col->size());
col->updateWeakHash32(hash);
WeakHash32 hash = col->getWeakHash32();
checkColumn(hash.getData(), data);
}
@ -602,8 +580,7 @@ TEST(WeakHash32, ColumnNullable)
auto col_null = ColumnNullable::create(std::move(col), std::move(mask));
WeakHash32 hash(col_null->size());
col_null->updateWeakHash32(hash);
WeakHash32 hash = col_null->getWeakHash32();
checkColumn(hash.getData(), eq);
}
@ -633,8 +610,7 @@ TEST(WeakHash32, ColumnTupleUInt64UInt64)
columns.emplace_back(std::move(col2));
auto col_tuple = ColumnTuple::create(std::move(columns));
WeakHash32 hash(col_tuple->size());
col_tuple->updateWeakHash32(hash);
WeakHash32 hash = col_tuple->getWeakHash32();
checkColumn(hash.getData(), eq);
}
@ -671,8 +647,7 @@ TEST(WeakHash32, ColumnTupleUInt64String)
columns.emplace_back(std::move(col2));
auto col_tuple = ColumnTuple::create(std::move(columns));
WeakHash32 hash(col_tuple->size());
col_tuple->updateWeakHash32(hash);
WeakHash32 hash = col_tuple->getWeakHash32();
checkColumn(hash.getData(), eq);
}
@ -709,8 +684,7 @@ TEST(WeakHash32, ColumnTupleUInt64FixedString)
columns.emplace_back(std::move(col2));
auto col_tuple = ColumnTuple::create(std::move(columns));
WeakHash32 hash(col_tuple->size());
col_tuple->updateWeakHash32(hash);
WeakHash32 hash = col_tuple->getWeakHash32();
checkColumn(hash.getData(), eq);
}
@ -756,8 +730,7 @@ TEST(WeakHash32, ColumnTupleUInt64Array)
columns.emplace_back(ColumnArray::create(std::move(val), std::move(off)));
auto col_tuple = ColumnTuple::create(std::move(columns));
WeakHash32 hash(col_tuple->size());
col_tuple->updateWeakHash32(hash);
WeakHash32 hash = col_tuple->getWeakHash32();
checkColumn(hash.getData(), eq_data);
}

View File

@ -68,7 +68,7 @@ void * allocNoTrack(size_t size, size_t alignment)
{
void * buf;
#if USE_GWP_ASAN
if (unlikely(GWPAsan::GuardedAlloc.shouldSample()))
if (unlikely(GWPAsan::shouldSample()))
{
if (void * ptr = GWPAsan::GuardedAlloc.allocate(size, alignment))
{
@ -185,7 +185,7 @@ void * Allocator<clear_memory_, populate>::realloc(void * buf, size_t old_size,
}
#if USE_GWP_ASAN
if (unlikely(GWPAsan::GuardedAlloc.shouldSample()))
if (unlikely(GWPAsan::shouldSample()))
{
auto trace_alloc = CurrentMemoryTracker::alloc(new_size);
if (void * ptr = GWPAsan::GuardedAlloc.allocate(new_size, alignment))

View File

@ -84,12 +84,18 @@ public:
return result;
}
void append(Self && other)
// append items from the other instance only if there is no such item in the current instance
void appendIfUniq(Self && other)
{
auto middle_idx = records.size();
std::move(other.records.begin(), other.records.end(), std::back_inserter(records));
// merge is stable
std::inplace_merge(records.begin(), records.begin() + middle_idx, records.end());
chassert(isUniqTypes());
// remove duplicates
records.erase(std::unique(records.begin(), records.end()), records.end());
assert(std::is_sorted(records.begin(), records.end()));
assert(isUniqTypes());
}
template <class T>
@ -142,7 +148,6 @@ private:
bool isUniqTypes() const
{
auto uniq_it = std::adjacent_find(records.begin(), records.end());
return uniq_it == records.end();
}
@ -161,8 +166,6 @@ private:
records.emplace(it, type_idx, item);
chassert(isUniqTypes());
}
Records::const_iterator getImpl(std::type_index type_idx) const
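A standalone sketch of the stable merge-then-dedup idiom used by appendIfUniq above, on plain integers (a simplification, assuming both containers are already sorted, as the records here are):

#include <algorithm>
#include <cassert>
#include <iterator>
#include <vector>

void append_if_uniq(std::vector<int> & lhs, std::vector<int> && rhs)
{
    std::size_t middle_idx = lhs.size();
    std::move(rhs.begin(), rhs.end(), std::back_inserter(lhs));
    /// inplace_merge is stable: for equal keys, elements already in lhs come first,
    /// so the subsequent std::unique keeps the item from the current instance.
    std::inplace_merge(lhs.begin(), lhs.begin() + middle_idx, lhs.end());
    lhs.erase(std::unique(lhs.begin(), lhs.end()), lhs.end());
    assert(std::is_sorted(lhs.begin(), lhs.end()));
}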

View File

@ -244,6 +244,15 @@ private:
const char * className() const noexcept override { return "DB::ErrnoException"; }
};
/// An exception to use in unit tests to test interfaces.
/// It is distinguished from others, so it does not have to be logged.
class TestException : public Exception
{
public:
using Exception::Exception;
};
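/// A hedged usage sketch (the error code and the stubbed call are illustrative assumptions):
///
///     try
///     {
///         throw TestException(ErrorCodes::LOGICAL_ERROR, "injected failure for the test");
///     }
///     catch (const TestException &)
///     {
///         /// Expected in the test; intentionally not logged.
///     }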
using Exceptions = std::vector<std::exception_ptr>;
/** Try to write an exception to the log (and forget about it).

View File

@ -217,6 +217,13 @@ void printReport([[maybe_unused]] uintptr_t fault_address)
reinterpret_cast<void **>(trace.data()), 0, trace_length, [&](const auto line) { LOG_FATAL(logger, fmt::runtime(line)); });
}
std::atomic<bool> init_finished = false;
void initFinished()
{
init_finished.store(true, std::memory_order_relaxed);
}
std::atomic<double> force_sample_probability = 0.0;
void setForceSampleProbability(double value)

View File

@ -19,12 +19,30 @@ bool isGWPAsanError(uintptr_t fault_address);
void printReport(uintptr_t fault_address);
extern std::atomic<bool> init_finished;
void initFinished();
extern std::atomic<double> force_sample_probability;
void setForceSampleProbability(double value);
/**
* We'd like to postpone sampling allocations until the startup is finished. There are mainly
* two reasons for that:
*
* - To avoid complex issues with initialization order
* - Don't waste MaxSimultaneousAllocations on global objects as it's not useful
*/
inline bool shouldSample()
{
return init_finished.load(std::memory_order_relaxed) && GuardedAlloc.shouldSample();
}
inline bool shouldForceSample()
{
if (!init_finished.load(std::memory_order_relaxed))
return false;
std::bernoulli_distribution dist(force_sample_probability.load(std::memory_order_relaxed));
return dist(thread_local_rng);
}
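A sketch of how the two halves are meant to be used together; the startup call site is an assumption, while the allocation-path check mirrors the Allocator and new_delete changes in this commit:

/// Once startup is done (somewhere late in server initialization), enable sampling:
GWPAsan::initFinished();

/// On the allocation hot path, as in Allocator.cpp above:
if (unlikely(GWPAsan::shouldSample()))
{
    if (void * ptr = GWPAsan::GuardedAlloc.allocate(size, alignment))
        return ptr;
}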

View File

@ -39,16 +39,16 @@ protected:
public:
/// For compatibility with SQL, it's possible to specify that certain function name is case insensitive.
enum CaseSensitiveness
enum Case
{
CaseSensitive,
CaseInsensitive
Sensitive,
Insensitive
};
/** Register an additional name for a value.
* real_name has to be registered already.
*/
void registerAlias(const String & alias_name, const String & real_name, CaseSensitiveness case_sensitiveness = CaseSensitive)
void registerAlias(const String & alias_name, const String & real_name, Case case_sensitiveness = Sensitive)
{
const auto & creator_map = getMap();
const auto & case_insensitive_creator_map = getCaseInsensitiveMap();
@ -66,12 +66,12 @@ public:
}
/// We need to be sure the real_name actually exists when calling the function directly.
void registerAliasUnchecked(const String & alias_name, const String & real_name, CaseSensitiveness case_sensitiveness = CaseSensitive)
void registerAliasUnchecked(const String & alias_name, const String & real_name, Case case_sensitiveness = Sensitive)
{
String alias_name_lowercase = Poco::toLower(alias_name);
const String factory_name = getFactoryName();
if (case_sensitiveness == CaseInsensitive)
if (case_sensitiveness == Insensitive)
{
if (!case_insensitive_aliases.emplace(alias_name_lowercase, real_name).second)
throw Exception(ErrorCodes::LOGICAL_ERROR, "{}: case insensitive alias name '{}' is not unique", factory_name, alias_name);

View File

@ -116,32 +116,32 @@ public:
return elements;
}
bool exists(const std::string & path) const override
bool exists(const std::string & file_name) const override
{
return fs::exists(getPath(path));
return fs::exists(getPath(file_name));
}
std::string read(const std::string & path) const override
std::string read(const std::string & file_name) const override
{
ReadBufferFromFile in(getPath(path));
ReadBufferFromFile in(getPath(file_name));
std::string data;
readStringUntilEOF(data, in);
return data;
}
void write(const std::string & path, const std::string & data, bool replace) override
void write(const std::string & file_name, const std::string & data, bool replace) override
{
if (!replace && fs::exists(path))
if (!replace && fs::exists(file_name))
{
throw Exception(
ErrorCodes::NAMED_COLLECTION_ALREADY_EXISTS,
"Metadata file {} for named collection already exists",
path);
file_name);
}
fs::create_directories(root_path);
auto tmp_path = getPath(path + ".tmp");
auto tmp_path = getPath(file_name + ".tmp");
WriteBufferFromFile out(tmp_path, data.size(), O_WRONLY | O_CREAT | O_EXCL);
writeString(data, out);
@ -150,28 +150,32 @@ public:
out.sync();
out.close();
fs::rename(tmp_path, getPath(path));
fs::rename(tmp_path, getPath(file_name));
}
void remove(const std::string & path) override
void remove(const std::string & file_name) override
{
if (!removeIfExists(getPath(path)))
if (!removeIfExists(file_name))
{
throw Exception(
ErrorCodes::NAMED_COLLECTION_DOESNT_EXIST,
"Cannot remove `{}`, because it doesn't exist", path);
"Cannot remove `{}`, because it doesn't exist", file_name);
}
}
bool removeIfExists(const std::string & path) override
bool removeIfExists(const std::string & file_name) override
{
return fs::remove(getPath(path));
return fs::remove(getPath(file_name));
}
private:
std::string getPath(const std::string & path) const
std::string getPath(const std::string & file_name) const
{
return fs::path(root_path) / path;
const auto file_name_as_path = fs::path(file_name);
if (file_name_as_path.is_absolute())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Filename {} cannot be an absolute path", file_name);
return fs::path(root_path) / file_name_as_path;
}
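/// Illustrative effect of the new check, assuming root_path is "/etc/clickhouse/named_collections":
///   getPath("mycoll.sql")  -> "/etc/clickhouse/named_collections/mycoll.sql"
///   getPath("/etc/passwd") -> previously returned "/etc/passwd" verbatim, because
///   fs::path::operator/ discards the left side when the right side is absolute;
///   now it throws LOGICAL_ERROR instead of escaping root_path.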
/// Delete .tmp files. They could be left undeleted in case of
@ -264,49 +268,49 @@ public:
return children;
}
bool exists(const std::string & path) const override
bool exists(const std::string & file_name) const override
{
return getClient()->exists(getPath(path));
return getClient()->exists(getPath(file_name));
}
std::string read(const std::string & path) const override
std::string read(const std::string & file_name) const override
{
return getClient()->get(getPath(path));
return getClient()->get(getPath(file_name));
}
void write(const std::string & path, const std::string & data, bool replace) override
void write(const std::string & file_name, const std::string & data, bool replace) override
{
if (replace)
{
getClient()->createOrUpdate(getPath(path), data, zkutil::CreateMode::Persistent);
getClient()->createOrUpdate(getPath(file_name), data, zkutil::CreateMode::Persistent);
}
else
{
auto code = getClient()->tryCreate(getPath(path), data, zkutil::CreateMode::Persistent);
auto code = getClient()->tryCreate(getPath(file_name), data, zkutil::CreateMode::Persistent);
if (code == Coordination::Error::ZNODEEXISTS)
{
throw Exception(
ErrorCodes::NAMED_COLLECTION_ALREADY_EXISTS,
"Metadata file {} for named collection already exists",
path);
file_name);
}
}
}
void remove(const std::string & path) override
void remove(const std::string & file_name) override
{
getClient()->remove(getPath(path));
getClient()->remove(getPath(file_name));
}
bool removeIfExists(const std::string & path) override
bool removeIfExists(const std::string & file_name) override
{
auto code = getClient()->tryRemove(getPath(path));
auto code = getClient()->tryRemove(getPath(file_name));
if (code == Coordination::Error::ZOK)
return true;
if (code == Coordination::Error::ZNONODE)
return false;
throw Coordination::Exception::fromPath(code, getPath(path));
throw Coordination::Exception::fromPath(code, getPath(file_name));
}
private:
@ -320,9 +324,13 @@ private:
return zookeeper_client;
}
std::string getPath(const std::string & path) const
std::string getPath(const std::string & file_name) const
{
return fs::path(root_path) / path;
const auto file_name_as_path = fs::path(file_name);
if (file_name_as_path.is_absolute())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Filename {} cannot be an absolute path", file_name);
return fs::path(root_path) / file_name_as_path;
}
};

View File

@ -442,8 +442,6 @@ The server successfully detected this situation and will download merged part fr
M(ReadBufferFromS3InitMicroseconds, "Time spent initializing connection to S3.") \
M(ReadBufferFromS3Bytes, "Bytes read from S3.") \
M(ReadBufferFromS3RequestsErrors, "Number of exceptions while reading from S3.") \
M(ReadBufferFromS3ResetSessions, "Number of HTTP sessions that were reset in ReadBufferFromS3.") \
M(ReadBufferFromS3PreservedSessions, "Number of HTTP sessions that were preserved in ReadBufferFromS3.") \
\
M(WriteBufferFromS3Microseconds, "Time spent on writing to S3.") \
M(WriteBufferFromS3Bytes, "Bytes written to S3.") \

View File

@ -0,0 +1,635 @@
#include <Common/SignalHandlers.h>
#include <Common/config_version.h>
#include <Common/getHashOfLoadedBinary.h>
#include <Common/CurrentThread.h>
#include <Daemon/BaseDaemon.h>
#include <Daemon/SentryWriter.h>
#include <base/sleep.h>
#include <base/getThreadId.h>
#include <IO/WriteBufferFromFileDescriptor.h>
#include <IO/ReadBufferFromFileDescriptor.h>
#include <IO/WriteBufferFromFileDescriptorDiscardOnFailure.h>
#include <IO/WriteHelpers.h>
#include <IO/ReadHelpers.h>
#include <Interpreters/Context.h>
#include <Core/Settings.h>
#pragma clang diagnostic ignored "-Wreserved-identifier"
namespace DB
{
namespace ErrorCodes
{
extern const int CANNOT_SET_SIGNAL_HANDLER;
extern const int CANNOT_SEND_SIGNAL;
}
}
using namespace DB;
void call_default_signal_handler(int sig)
{
if (SIG_ERR == signal(sig, SIG_DFL))
throw ErrnoException(ErrorCodes::CANNOT_SET_SIGNAL_HANDLER, "Cannot set signal handler");
if (0 != raise(sig))
throw ErrnoException(ErrorCodes::CANNOT_SEND_SIGNAL, "Cannot send signal");
}
void writeSignalIDtoSignalPipe(int sig)
{
auto saved_errno = errno; /// We must restore previous value of errno in signal handler.
char buf[signal_pipe_buf_size];
auto & signal_pipe = HandledSignals::instance().signal_pipe;
WriteBufferFromFileDescriptor out(signal_pipe.fds_rw[1], signal_pipe_buf_size, buf);
writeBinary(sig, out);
out.next();
errno = saved_errno;
}
void closeLogsSignalHandler(int sig, siginfo_t *, void *)
{
DENY_ALLOCATIONS_IN_SCOPE;
writeSignalIDtoSignalPipe(sig);
}
void terminateRequestedSignalHandler(int sig, siginfo_t *, void *)
{
DENY_ALLOCATIONS_IN_SCOPE;
writeSignalIDtoSignalPipe(sig);
}
void signalHandler(int sig, siginfo_t * info, void * context)
{
if (asynchronous_stack_unwinding && sig == SIGSEGV)
siglongjmp(asynchronous_stack_unwinding_signal_jump_buffer, 1);
DENY_ALLOCATIONS_IN_SCOPE;
auto saved_errno = errno; /// We must restore previous value of errno in signal handler.
char buf[signal_pipe_buf_size];
auto & signal_pipe = HandledSignals::instance().signal_pipe;
WriteBufferFromFileDescriptorDiscardOnFailure out(signal_pipe.fds_rw[1], signal_pipe_buf_size, buf);
const ucontext_t * signal_context = reinterpret_cast<ucontext_t *>(context);
const StackTrace stack_trace(*signal_context);
#if USE_GWP_ASAN
if (const auto fault_address = reinterpret_cast<uintptr_t>(info->si_addr);
GWPAsan::isGWPAsanError(fault_address))
GWPAsan::printReport(fault_address);
#endif
writeBinary(sig, out);
writePODBinary(*info, out);
writePODBinary(signal_context, out);
writePODBinary(stack_trace, out);
writeVectorBinary(Exception::enable_job_stack_trace ? Exception::thread_frame_pointers : std::vector<StackTrace::FramePointers>{}, out);
writeBinary(static_cast<UInt32>(getThreadId()), out);
writePODBinary(current_thread, out);
out.next();
if (sig != SIGTSTP) /// This signal is used for debugging.
{
/// The time that is usually enough for a separate thread to print info into the log.
/// Under MSan full stack unwinding with DWARF info about inline functions takes 101 seconds in one case.
for (size_t i = 0; i < 300; ++i)
{
/// We will synchronize with the thread printing the messages with an atomic variable to finish earlier.
if (HandledSignals::instance().fatal_error_printed.test())
break;
/// This coarse method of synchronization is perfectly ok for fatal signals.
sleepForSeconds(1);
}
/// Wait for all logs flush operations
sleepForSeconds(3);
call_default_signal_handler(sig);
}
errno = saved_errno;
}
[[noreturn]] void terminate_handler()
{
static thread_local bool terminating = false;
if (terminating)
abort();
terminating = true;
std::string log_message;
if (std::current_exception())
log_message = "Terminate called for uncaught exception:\n" + getCurrentExceptionMessage(true);
else
log_message = "Terminate called without an active exception";
/// POSIX.1 says that write(2)s of less than PIPE_BUF bytes must be atomic - man 7 pipe
/// And the buffer should not be too small because our exception messages can be large.
static constexpr size_t buf_size = PIPE_BUF;
if (log_message.size() > buf_size - 16)
log_message.resize(buf_size - 16);
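/// Presumably the 16 reserved bytes leave room for the other fields written below
/// alongside the message: the StdTerminate marker (int), the thread id (UInt32),
/// and the string size header that writeBinary prepends.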
char buf[buf_size];
auto & signal_pipe = HandledSignals::instance().signal_pipe;
WriteBufferFromFileDescriptor out(signal_pipe.fds_rw[1], buf_size, buf);
writeBinary(static_cast<int>(SignalListener::StdTerminate), out);
writeBinary(static_cast<UInt32>(getThreadId()), out);
writeBinary(log_message, out);
out.next();
abort();
}
#if defined(SANITIZER)
template <typename T>
struct ValueHolder
{
ValueHolder(T value_) : value(value_)
{}
T value;
};
extern "C" void __sanitizer_set_death_callback(void (*)());
/// Sanitizers may not expect some function calls from death callback.
/// Let's try to disable instrumentation to avoid possible issues.
/// However, this callback may call other functions that are still instrumented.
/// We can try [[clang::always_inline]] attribute for statements in future (available in clang-15)
/// See https://github.com/google/sanitizers/issues/1543 and https://github.com/google/sanitizers/issues/1549.
static DISABLE_SANITIZER_INSTRUMENTATION void sanitizerDeathCallback()
{
DENY_ALLOCATIONS_IN_SCOPE;
/// Also need to send data via pipe. Otherwise it may lead to deadlocks or failures in printing diagnostic info.
char buf[signal_pipe_buf_size];
auto & signal_pipe = HandledSignals::instance().signal_pipe;
WriteBufferFromFileDescriptorDiscardOnFailure out(signal_pipe.fds_rw[1], signal_pipe_buf_size, buf);
const StackTrace stack_trace;
writeBinary(SignalListener::SanitizerTrap, out);
writePODBinary(stack_trace, out);
/// We create a dummy struct with a constructor so DISABLE_SANITIZER_INSTRUMENTATION is not applied to it.
/// Otherwise, MemorySanitizer cannot know that values initialized inside this function are actually initialized,
/// because instrumentation is disabled, leading to false positives later on.
ValueHolder<UInt32> thread_id{static_cast<UInt32>(getThreadId())};
writeBinary(thread_id.value, out);
writePODBinary(current_thread, out);
out.next();
/// The time that is usually enough for a separate thread to print info into the log.
sleepForSeconds(20);
}
#endif
void HandledSignals::addSignalHandler(const std::vector<int> & signals, signal_function handler, bool register_signal)
{
struct sigaction sa;
memset(&sa, 0, sizeof(sa));
sa.sa_sigaction = handler;
sa.sa_flags = SA_SIGINFO;
#if defined(OS_DARWIN)
sigemptyset(&sa.sa_mask);
for (auto signal : signals)
sigaddset(&sa.sa_mask, signal);
#else
if (sigemptyset(&sa.sa_mask))
throw Poco::Exception("Cannot set signal handler.");
for (auto signal : signals)
if (sigaddset(&sa.sa_mask, signal))
throw Poco::Exception("Cannot set signal handler.");
#endif
for (auto signal : signals)
if (sigaction(signal, &sa, nullptr))
throw Poco::Exception("Cannot set signal handler.");
if (register_signal)
std::copy(signals.begin(), signals.end(), std::back_inserter(handled_signals));
}
void blockSignals(const std::vector<int> & signals)
{
sigset_t sig_set;
#if defined(OS_DARWIN)
sigemptyset(&sig_set);
for (auto signal : signals)
sigaddset(&sig_set, signal);
#else
if (sigemptyset(&sig_set))
throw Poco::Exception("Cannot block signal.");
for (auto signal : signals)
if (sigaddset(&sig_set, signal))
throw Poco::Exception("Cannot block signal.");
#endif
if (pthread_sigmask(SIG_BLOCK, &sig_set, nullptr))
throw Poco::Exception("Cannot block signal.");
}
void SignalListener::run()
{
static_assert(PIPE_BUF >= 512);
static_assert(signal_pipe_buf_size <= PIPE_BUF, "Only write of PIPE_BUF to pipe is atomic and the minimal known PIPE_BUF across supported platforms is 512");
char buf[signal_pipe_buf_size];
auto & signal_pipe = HandledSignals::instance().signal_pipe;
ReadBufferFromFileDescriptor in(signal_pipe.fds_rw[0], signal_pipe_buf_size, buf);
while (!in.eof())
{
int sig = 0;
readBinary(sig, in);
// We may log some specific signals afterwards, with different log
// levels and more info, but for completeness we log all signals
// here at trace level.
// Don't use strsignal here, because it's not thread-safe.
LOG_TRACE(log, "Received signal {}", sig);
if (sig == StopThread)
{
LOG_INFO(log, "Stop SignalListener thread");
break;
}
else if (sig == SIGHUP)
{
LOG_DEBUG(log, "Received signal to close logs.");
BaseDaemon::instance().closeLogs(BaseDaemon::instance().logger());
LOG_INFO(log, "Opened new log file after received signal.");
}
else if (sig == StdTerminate)
{
UInt32 thread_num;
std::string message;
readBinary(thread_num, in);
readBinary(message, in);
onTerminate(message, thread_num);
}
else if (sig == SIGINT ||
sig == SIGQUIT ||
sig == SIGTERM)
{
if (daemon)
daemon->handleSignal(sig);
}
else
{
siginfo_t info{};
ucontext_t * context{};
StackTrace stack_trace(NoCapture{});
std::vector<StackTrace::FramePointers> thread_frame_pointers;
UInt32 thread_num{};
ThreadStatus * thread_ptr{};
if (sig != SanitizerTrap)
{
readPODBinary(info, in);
readPODBinary(context, in);
}
readPODBinary(stack_trace, in);
if (sig != SanitizerTrap)
readVectorBinary(thread_frame_pointers, in);
readBinary(thread_num, in);
readPODBinary(thread_ptr, in);
/// This allows receiving more signals if a failure happens inside the onFault function.
/// Example: segfault while symbolizing stack trace.
try
{
std::thread([=, this] { onFault(sig, info, context, stack_trace, thread_frame_pointers, thread_num, thread_ptr); }).detach();
}
catch (...)
{
/// Likely cannot allocate thread
onFault(sig, info, context, stack_trace, thread_frame_pointers, thread_num, thread_ptr);
}
}
}
}
void SignalListener::onTerminate(std::string_view message, UInt32 thread_num) const
{
size_t pos = message.find('\n');
LOG_FATAL(log, "(version {}{}, build id: {}, git hash: {}) (from thread {}) {}",
VERSION_STRING, VERSION_OFFICIAL, daemon ? daemon->build_id : "", daemon ? daemon->git_hash : "", thread_num, message.substr(0, pos));
/// Print trace from std::terminate exception line-by-line to make it easy for grep.
while (pos != std::string_view::npos)
{
++pos;
size_t next_pos = message.find('\n', pos);
size_t size = next_pos;
if (next_pos != std::string_view::npos)
size = next_pos - pos;
LOG_FATAL(log, fmt::runtime(message.substr(pos, size)));
pos = next_pos;
}
}
void SignalListener::onFault(
int sig,
const siginfo_t & info,
ucontext_t * context,
const StackTrace & stack_trace,
const std::vector<StackTrace::FramePointers> & thread_frame_pointers,
UInt32 thread_num,
DB::ThreadStatus * thread_ptr) const
try
{
ThreadStatus thread_status;
/// First log those fields that are safe to access and that should not cause a new fault.
/// That way we will have some duplicated info in the log, but we don't lose important info
/// in case of a double fault.
LOG_FATAL(log, "########## Short fault info ############");
LOG_FATAL(log, "(version {}{}, build id: {}, git hash: {}) (from thread {}) Received signal {}",
VERSION_STRING, VERSION_OFFICIAL, daemon ? daemon->build_id : "", daemon ? daemon->git_hash : "",
thread_num, sig);
std::string signal_description = "Unknown signal";
/// Some of these are not really signals, but our own indications of the failure reason.
if (sig == StdTerminate)
signal_description = "std::terminate";
else if (sig == SanitizerTrap)
signal_description = "sanitizer trap";
else if (sig >= 0)
signal_description = strsignal(sig); // NOLINT(concurrency-mt-unsafe) // it is not thread-safe but ok in this context
LOG_FATAL(log, "Signal description: {}", signal_description);
String error_message;
if (sig != SanitizerTrap)
error_message = signalToErrorMessage(sig, info, *context);
else
error_message = "Sanitizer trap.";
LOG_FATAL(log, fmt::runtime(error_message));
String bare_stacktrace_str;
if (stack_trace.getSize())
{
/// Write the bare stack trace (addresses) just in case we fail to print the symbolized stack trace.
/// NOTE: This still requires memory allocations and a mutex lock inside the logger.
/// BTW we can also print it to stderr using write syscalls.
WriteBufferFromOwnString bare_stacktrace;
writeString("Stack trace:", bare_stacktrace);
for (size_t i = stack_trace.getOffset(); i < stack_trace.getSize(); ++i)
{
writeChar(' ', bare_stacktrace);
writePointerHex(stack_trace.getFramePointers()[i], bare_stacktrace);
}
LOG_FATAL(log, fmt::runtime(bare_stacktrace.str()));
bare_stacktrace_str = bare_stacktrace.str();
}
/// Now try to access potentially unsafe data in thread_ptr.
String query_id;
String query;
/// Send logs from this thread to the client if possible.
/// It will allow the client to see failure messages directly.
if (thread_ptr)
{
query_id = thread_ptr->getQueryId();
query = thread_ptr->getQueryForLog();
if (auto logs_queue = thread_ptr->getInternalTextLogsQueue())
{
CurrentThread::attachInternalTextLogsQueue(logs_queue, LogsLevel::trace);
}
}
LOG_FATAL(log, "########################################");
if (query_id.empty())
{
LOG_FATAL(log, "(version {}{}, build id: {}, git hash: {}) (from thread {}) (no query) Received signal {} ({})",
VERSION_STRING, VERSION_OFFICIAL, daemon ? daemon->build_id : "", daemon ? daemon->git_hash : "",
thread_num, signal_description, sig);
}
else
{
LOG_FATAL(log, "(version {}{}, build id: {}, git hash: {}) (from thread {}) (query_id: {}) (query: {}) Received signal {} ({})",
VERSION_STRING, VERSION_OFFICIAL, daemon ? daemon->build_id : "", daemon ? daemon->git_hash : "",
thread_num, query_id, query, signal_description, sig);
}
LOG_FATAL(log, fmt::runtime(error_message));
if (!bare_stacktrace_str.empty())
{
LOG_FATAL(log, fmt::runtime(bare_stacktrace_str));
}
/// Write symbolized stack trace line by line for better grep-ability.
stack_trace.toStringEveryLine([&](std::string_view s) { LOG_FATAL(log, fmt::runtime(s)); });
/// In case it's a scheduled job, write the origin call stacks of all previous jobs
std::for_each(thread_frame_pointers.rbegin(), thread_frame_pointers.rend(),
[this](const StackTrace::FramePointers & frame_pointers)
{
if (size_t size = std::ranges::find(frame_pointers, nullptr) - frame_pointers.begin())
{
LOG_FATAL(log, "========================================");
WriteBufferFromOwnString bare_stacktrace;
writeString("Job's origin stack trace:", bare_stacktrace);
std::for_each_n(frame_pointers.begin(), size,
[&bare_stacktrace](const void * ptr)
{
writeChar(' ', bare_stacktrace);
writePointerHex(ptr, bare_stacktrace);
}
);
LOG_FATAL(log, fmt::runtime(bare_stacktrace.str()));
StackTrace::toStringEveryLine(const_cast<void **>(frame_pointers.data()), 0, size, [this](std::string_view s) { LOG_FATAL(log, fmt::runtime(s)); });
}
}
);
#if defined(OS_LINUX)
/// Write information about binary checksum. It can be difficult to calculate, so do it only after printing stack trace.
/// Please keep the below log messages in-sync with the ones in programs/server/Server.cpp
if (daemon && daemon->stored_binary_hash.empty())
{
LOG_FATAL(log, "Integrity check of the executable skipped because the reference checksum could not be read.");
}
else if (daemon)
{
String calculated_binary_hash = getHashOfLoadedBinaryHex();
if (calculated_binary_hash == daemon->stored_binary_hash)
{
LOG_FATAL(log, "Integrity check of the executable successfully passed (checksum: {})", calculated_binary_hash);
}
else
{
LOG_FATAL(
log,
"Calculated checksum of the executable ({0}) does not correspond"
" to the reference checksum stored in the executable ({1})."
" This may indicate one of the following:"
" - the executable was changed just after startup;"
" - the executable was corrupted on disk due to faulty hardware;"
" - the loaded executable was corrupted in memory due to faulty hardware;"
" - the file was intentionally modified;"
" - a logical error in the code.",
calculated_binary_hash,
daemon->stored_binary_hash);
}
}
#endif
/// Write crash to system.crash_log table if available.
if (collectCrashLog)
collectCrashLog(sig, thread_num, query_id, stack_trace);
Context::getGlobalContextInstance()->handleCrash();
/// Send crash report to developers (if configured)
if (sig != SanitizerTrap)
{
if (daemon)
{
if (auto * sentry = SentryWriter::getInstance())
sentry->onSignal(sig, error_message, stack_trace.getFramePointers(), stack_trace.getOffset(), stack_trace.getSize());
}
/// Advise the user to send it manually.
if (std::string_view(VERSION_OFFICIAL).contains("official build"))
{
const auto & date_lut = DateLUT::instance();
/// Approximate support period, upper bound.
if (time(nullptr) - date_lut.makeDate(2000 + VERSION_MAJOR, VERSION_MINOR, 1) < (365 + 30) * 86400)
{
LOG_FATAL(log, "Report this error to https://github.com/ClickHouse/ClickHouse/issues");
}
else
{
LOG_FATAL(log, "ClickHouse version {} is old and should be upgraded to the latest version.", VERSION_STRING);
}
}
else
{
LOG_FATAL(log, "This ClickHouse version is not official and should be upgraded to the official build.");
}
}
/// List changed settings.
if (!query_id.empty())
{
ContextPtr query_context = thread_ptr->getQueryContext();
if (query_context)
{
String changed_settings = query_context->getSettingsRef().toString();
if (changed_settings.empty())
LOG_FATAL(log, "No settings were changed");
else
LOG_FATAL(log, "Changed settings: {}", changed_settings);
}
}
/// When everything is done, we will try to send these error messages to the client.
if (thread_ptr)
thread_ptr->onFatalError();
HandledSignals::instance().fatal_error_printed.test_and_set();
}
catch (...)
{
/// onFault is called from the std::thread, and it should catch all exceptions; otherwise, you can get unrelated fatal errors.
PreformattedMessage message = getCurrentExceptionMessageAndPattern(true);
LOG_FATAL(log, message);
}
HandledSignals::HandledSignals()
{
signal_pipe.setNonBlockingWrite();
signal_pipe.tryIncreaseSize(1 << 20);
}
void HandledSignals::reset()
{
/// Reset signals to SIG_DFL to avoid trying to write to the signal_pipe that will be closed after.
for (int sig : handled_signals)
{
if (SIG_ERR == signal(sig, SIG_DFL))
{
try
{
throw ErrnoException(ErrorCodes::CANNOT_SET_SIGNAL_HANDLER, "Cannot set signal handler");
}
catch (ErrnoException &)
{
tryLogCurrentException(__PRETTY_FUNCTION__);
}
}
}
signal_pipe.close();
}
HandledSignals::~HandledSignals()
{
reset();
};
HandledSignals & HandledSignals::instance()
{
static HandledSignals res;
return res;
}
void HandledSignals::setupTerminateHandler()
{
std::set_terminate(terminate_handler);
}
void HandledSignals::setupCommonDeadlySignalHandlers()
{
/// SIGTSTP is added for debugging purposes, to output a stack trace of any running thread at any time.
addSignalHandler({SIGABRT, SIGSEGV, SIGILL, SIGBUS, SIGSYS, SIGFPE, SIGPIPE, SIGTSTP, SIGTRAP}, signalHandler, true);
#if defined(SANITIZER)
__sanitizer_set_death_callback(sanitizerDeathCallback);
#endif
}
void HandledSignals::setupCommonTerminateRequestSignalHandlers()
{
addSignalHandler({SIGINT, SIGQUIT, SIGTERM}, terminateRequestedSignalHandler, true);
}

src/Common/SignalHandlers.h
View File

@ -0,0 +1,110 @@
#pragma once
#include <csignal>
#include <base/defines.h>
#include <Common/PipeFDs.h>
#include <Common/StackTrace.h>
#include <Common/ThreadStatus.h>
#include <Core/Types.h>
#include <Poco/Runnable.h>
class BaseDaemon;
/** Reset the signal handler to the default and send the signal to itself.
* It's called from a user signal handler to write a core dump.
*/
void call_default_signal_handler(int sig);
const size_t signal_pipe_buf_size =
sizeof(int)
+ sizeof(siginfo_t)
+ sizeof(ucontext_t*)
+ sizeof(StackTrace)
+ sizeof(UInt64)
+ sizeof(UInt32)
+ sizeof(void*);
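/// A presumed field-by-field reading of the sum above, matching what signalHandler writes:
///   sizeof(int)          - signal number
///   sizeof(siginfo_t)    - *info
///   sizeof(ucontext_t*)  - pointer to the signal context
///   sizeof(StackTrace)   - captured stack trace
///   sizeof(UInt64)       - size header of the thread frame-pointers vector
///   sizeof(UInt32)       - thread id
///   sizeof(void*)        - current_thread pointer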
using signal_function = void(int, siginfo_t*, void*);
void writeSignalIDtoSignalPipe(int sig);
/** Signal handler for HUP */
void closeLogsSignalHandler(int sig, siginfo_t *, void *);
void terminateRequestedSignalHandler(int sig, siginfo_t *, void *);
/** Handler for "fault" or diagnostic signals. Send data about fault to separate thread to write into log.
*/
void signalHandler(int sig, siginfo_t * info, void * context);
/** To use with std::set_terminate.
* Collects slightly more info than __gnu_cxx::__verbose_terminate_handler,
* and sends it to the pipe. Another thread will read this info from the pipe and asynchronously write it to the log.
* Look at libstdc++-v3/libsupc++/vterminate.cc for example.
*/
[[noreturn]] void terminate_handler();
/// Avoid link time dependency on DB/Interpreters - will use this function only when linked.
__attribute__((__weak__)) void collectCrashLog(
Int32 signal, UInt64 thread_id, const String & query_id, const StackTrace & stack_trace);
void blockSignals(const std::vector<int> & signals);
/** The thread that reads info about a signal or std::terminate from the pipe.
* On HUP, close log files (for new files to be opened later).
* On information about std::terminate, write it to log.
* On other signals, write info to log.
*/
class SignalListener : public Poco::Runnable
{
public:
static constexpr int StdTerminate = -1;
static constexpr int StopThread = -2;
static constexpr int SanitizerTrap = -3;
explicit SignalListener(BaseDaemon * daemon_, LoggerPtr log_)
: daemon(daemon_), log(log_)
{
}
void run() override;
private:
BaseDaemon * daemon;
LoggerPtr log;
void onTerminate(std::string_view message, UInt32 thread_num) const;
void onFault(
int sig,
const siginfo_t & info,
ucontext_t * context,
const StackTrace & stack_trace,
const std::vector<StackTrace::FramePointers> & thread_frame_pointers,
UInt32 thread_num,
DB::ThreadStatus * thread_ptr) const;
};
struct HandledSignals
{
std::vector<int> handled_signals;
DB::PipeFDs signal_pipe;
std::atomic_flag fatal_error_printed;
HandledSignals();
~HandledSignals();
void setupTerminateHandler();
void setupCommonDeadlySignalHandlers();
void setupCommonTerminateRequestSignalHandlers();
void addSignalHandler(const std::vector<int> & signals, signal_function handler, bool register_signal);
void reset();
static HandledSignals & instance();
};
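A hedged sketch of how a daemon is expected to wire these pieces together (the thread handling and logger name are assumptions; the setup functions and SignalListener are declared above):

auto & handled_signals = HandledSignals::instance();
handled_signals.setupTerminateHandler();
handled_signals.setupCommonDeadlySignalHandlers();
handled_signals.setupCommonTerminateRequestSignalHandlers();

/// SignalListener is a Poco::Runnable: run it on a dedicated thread that drains
/// the signal pipe and performs the actual logging outside of signal context.
SignalListener listener(&daemon, getLogger("BaseDaemon"));
Poco::Thread listener_thread;
listener_thread.start(listener);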

View File

@ -1,2 +1,24 @@
#include <Common/WeakHash.h>
#include <Common/Exception.h>
#include <Common/HashTable/Hash.h>
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
}
void WeakHash32::update(const WeakHash32 & other)
{
size_t size = data.size();
if (size != other.data.size())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Size of WeakHash32 does not match:"
"left size is {}, right size is {}", size, other.data.size());
for (size_t i = 0; i < size; ++i)
data[i] = static_cast<UInt32>(intHashCRC32(other.data[i], data[i]));
}
}
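A small usage sketch of the new combine step, e.g. how a two-column container can fold per-column hashes into one (the column names are illustrative):

WeakHash32 hash = key_column->getWeakHash32();
hash.update(value_column->getWeakHash32()); /// throws LOGICAL_ERROR if the sizes differ
/// hash.getData()[i] now mixes row i of both columns via intHashCRC32.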

View File

@ -11,9 +11,8 @@ namespace DB
/// The main purpose why this class needed is to support data initialization. Initially, every bit is 1.
class WeakHash32
{
static constexpr UInt32 kDefaultInitialValue = ~UInt32(0);
public:
static constexpr UInt32 kDefaultInitialValue = ~UInt32(0);
using Container = PaddedPODArray<UInt32>;
@ -22,6 +21,8 @@ public:
void reset(size_t size, UInt32 initial_value = kDefaultInitialValue) { data.assign(size, initial_value); }
void update(const WeakHash32 & other);
const Container & getData() const { return data; }
Container & getData() { return data; }

View File

@ -13,14 +13,14 @@
#include <Common/ZooKeeper/Types.h>
#include <Common/ZooKeeper/ZooKeeperCommon.h>
#include <Common/randomSeed.h>
#include <base/find_symbols.h>
#include <base/sort.h>
#include <base/map.h>
#include <base/getFQDNOrHostName.h>
#include <Core/ServerUUID.h>
#include <Core/BackgroundSchedulePool.h>
#include "Common/ZooKeeper/IKeeper.h"
#include <Common/DNSResolver.h>
#include <Common/ZooKeeper/IKeeper.h>
#include <Common/StringUtils.h>
#include <Common/quoteString.h>
#include <Common/Exception.h>
#include <Interpreters/Context.h>
@ -114,7 +114,11 @@ void ZooKeeper::init(ZooKeeperArgs args_, std::unique_ptr<Coordination::IKeeper>
/// availability_zones is empty on server startup or after config reloading
/// We will keep the az info when starting new sessions
availability_zones = args.availability_zones;
LOG_TEST(log, "Availability zones from config: [{}], client: {}", fmt::join(availability_zones, ", "), args.client_availability_zone);
LOG_TEST(log, "Availability zones from config: [{}], client: {}",
fmt::join(collections::map(availability_zones, [](auto s){ return DB::quoteString(s); }), ", "),
DB::quoteString(args.client_availability_zone));
if (args.availability_zone_autodetect)
updateAvailabilityZones();
}

View File

@ -37,7 +37,7 @@ requires DB::OptionalArgument<TAlign...>
inline ALWAYS_INLINE void * newImpl(std::size_t size, TAlign... align)
{
#if USE_GWP_ASAN
if (unlikely(GWPAsan::GuardedAlloc.shouldSample()))
if (unlikely(GWPAsan::shouldSample()))
{
if constexpr (sizeof...(TAlign) == 1)
{
@ -83,7 +83,7 @@ inline ALWAYS_INLINE void * newImpl(std::size_t size, TAlign... align)
inline ALWAYS_INLINE void * newNoExcept(std::size_t size) noexcept
{
#if USE_GWP_ASAN
if (unlikely(GWPAsan::GuardedAlloc.shouldSample()))
if (unlikely(GWPAsan::shouldSample()))
{
if (void * ptr = GWPAsan::GuardedAlloc.allocate(size))
{
@ -102,7 +102,7 @@ inline ALWAYS_INLINE void * newNoExcept(std::size_t size) noexcept
inline ALWAYS_INLINE void * newNoExcept(std::size_t size, std::align_val_t align) noexcept
{
#if USE_GWP_ASAN
if (unlikely(GWPAsan::GuardedAlloc.shouldSample()))
if (unlikely(GWPAsan::shouldSample()))
{
if (void * ptr = GWPAsan::GuardedAlloc.allocate(size, alignToSizeT(align)))
{

View File

@ -54,16 +54,3 @@ TEST(ShellCommand, ExecuteWithInput)
EXPECT_EQ(res, "Hello, world!\n");
}
TEST(ShellCommand, AutoWait)
{
// <defunct> hunting:
for (int i = 0; i < 1000; ++i)
{
auto command = ShellCommand::execute("echo " + std::to_string(i));
//command->wait(); // now automatic
}
// std::cerr << "inspect me: ps auxwwf\n";
// std::this_thread::sleep_for(std::chrono::seconds(100));
}

View File

@ -55,6 +55,7 @@ struct Settings;
M(UInt64, min_request_size_for_cache, 50 * 1024, "Minimal size of the request to cache the deserialization result. Caching can have negative effect on latency for smaller requests, set to 0 to disable", 0) \
M(UInt64, raft_limits_reconnect_limit, 50, "If connection to a peer is silent longer than this limit * (multiplied by heartbeat interval), we re-establish the connection.", 0) \
M(Bool, async_replication, false, "Enable async replication. All write and read guarantees are preserved while better performance is achieved. The setting is disabled by default to not break backwards compatibility.", 0) \
M(Bool, experimental_use_rocksdb, false, "Use RocksDB as the backend storage", 0) \
M(UInt64, latest_logs_cache_size_threshold, 1 * 1024 * 1024 * 1024, "Maximum total size of in-memory cache of latest log entries.", 0) \
M(UInt64, commit_logs_cache_size_threshold, 500 * 1024 * 1024, "Maximum total size of in-memory cache of log entries needed next for commit.", 0) \
M(UInt64, disk_move_retries_wait_ms, 1000, "How long to wait between retries after a failure which happened while a file was being moved between disks.", 0) \

View File

@ -183,8 +183,6 @@
M(ReadBufferFromS3InitMicroseconds) \
M(ReadBufferFromS3Bytes) \
M(ReadBufferFromS3RequestsErrors) \
M(ReadBufferFromS3ResetSessions) \
M(ReadBufferFromS3PreservedSessions) \
\
M(WriteBufferFromS3Microseconds) \
M(WriteBufferFromS3Bytes) \

View File

@ -5,18 +5,27 @@
#include <Coordination/CoordinationSettings.h>
#include <Coordination/Defines.h>
#include <Disks/DiskLocal.h>
#include <Interpreters/Context.h>
#include <IO/S3/Credentials.h>
#include <IO/WriteHelpers.h>
#include <Poco/Util/AbstractConfiguration.h>
#include <Poco/Util/JSONConfiguration.h>
#include <Coordination/KeeperConstants.h>
#include <Server/CloudPlacementInfo.h>
#include <Coordination/KeeperFeatureFlags.h>
#include <Disks/DiskLocal.h>
#include <Disks/DiskSelector.h>
#include <IO/S3/Credentials.h>
#include <Interpreters/Context.h>
#include <Poco/Util/AbstractConfiguration.h>
#include <Common/logger_useful.h>
#include <boost/algorithm/string.hpp>
#include "config.h"
#if USE_ROCKSDB
#include <rocksdb/table.h>
#include <rocksdb/convenience.h>
#include <rocksdb/utilities/db_ttl.h>
#endif
namespace DB
{
@ -24,6 +33,8 @@ namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
extern const int LOGICAL_ERROR;
extern const int ROCKSDB_ERROR;
}
@ -41,6 +52,95 @@ KeeperContext::KeeperContext(bool standalone_keeper_, CoordinationSettingsPtr co
system_nodes_with_data[keeper_api_version_path] = toString(static_cast<uint8_t>(KeeperApiVersion::WITH_MULTI_READ));
}
#if USE_ROCKSDB
using RocksDBOptions = std::unordered_map<std::string, std::string>;
static RocksDBOptions getOptionsFromConfig(const Poco::Util::AbstractConfiguration & config, const std::string & path)
{
RocksDBOptions options;
Poco::Util::AbstractConfiguration::Keys keys;
config.keys(path, keys);
for (const auto & key : keys)
{
const String key_path = path + "." + key;
options[key] = config.getString(key_path);
}
return options;
}
static rocksdb::Options getRocksDBOptionsFromConfig(const Poco::Util::AbstractConfiguration & config)
{
rocksdb::Status status;
rocksdb::Options base;
base.create_if_missing = true;
base.compression = rocksdb::CompressionType::kZSTD;
base.statistics = rocksdb::CreateDBStatistics();
/// It is too verbose by default, and in fact we don't care about rocksdb logs at all.
base.info_log_level = rocksdb::ERROR_LEVEL;
rocksdb::Options merged = base;
rocksdb::BlockBasedTableOptions table_options;
if (config.has("keeper_server.rocksdb.options"))
{
auto config_options = getOptionsFromConfig(config, "keeper_server.rocksdb.options");
status = rocksdb::GetDBOptionsFromMap(merged, config_options, &merged);
if (!status.ok())
{
throw Exception(ErrorCodes::ROCKSDB_ERROR, "Fail to merge rocksdb options from 'rocksdb.options' : {}",
status.ToString());
}
}
if (config.has("rocksdb.column_family_options"))
{
auto column_family_options = getOptionsFromConfig(config, "rocksdb.column_family_options");
status = rocksdb::GetColumnFamilyOptionsFromMap(merged, column_family_options, &merged);
if (!status.ok())
{
throw Exception(ErrorCodes::ROCKSDB_ERROR, "Fail to merge rocksdb options from 'rocksdb.column_family_options' at: {}", status.ToString());
}
}
if (config.has("rocksdb.block_based_table_options"))
{
auto block_based_table_options = getOptionsFromConfig(config, "rocksdb.block_based_table_options");
status = rocksdb::GetBlockBasedTableOptionsFromMap(table_options, block_based_table_options, &table_options);
if (!status.ok())
{
throw Exception(ErrorCodes::ROCKSDB_ERROR, "Fail to merge rocksdb options from 'rocksdb.block_based_table_options' at: {}", status.ToString());
}
}
merged.table_factory.reset(rocksdb::NewBlockBasedTableFactory(table_options));
return merged;
}
#endif
KeeperContext::Storage KeeperContext::getRocksDBPathFromConfig(const Poco::Util::AbstractConfiguration & config) const
{
const auto create_local_disk = [](const auto & path)
{
if (fs::exists(path))
fs::remove_all(path);
fs::create_directories(path);
return std::make_shared<DiskLocal>("LocalRocksDBDisk", path);
};
if (config.has("keeper_server.rocksdb_path"))
return create_local_disk(config.getString("keeper_server.rocksdb_path"));
if (config.has("keeper_server.storage_path"))
return create_local_disk(std::filesystem::path{config.getString("keeper_server.storage_path")} / "rocksdb");
if (standalone_keeper)
return create_local_disk(std::filesystem::path{config.getString("path", KEEPER_DEFAULT_PATH)} / "rocksdb");
else
return create_local_disk(std::filesystem::path{config.getString("path", DBMS_DEFAULT_PATH)} / "coordination/rocksdb");
}
void KeeperContext::initialize(const Poco::Util::AbstractConfiguration & config, KeeperDispatcher * dispatcher_)
{
dispatcher = dispatcher_;
@ -59,6 +159,14 @@ void KeeperContext::initialize(const Poco::Util::AbstractConfiguration & config,
initializeFeatureFlags(config);
initializeDisks(config);
#if USE_ROCKSDB
if (config.getBool("keeper_server.coordination_settings.experimental_use_rocksdb", false))
{
rocksdb_options = std::make_shared<rocksdb::Options>(getRocksDBOptionsFromConfig(config));
digest_enabled = false; /// TODO: support digest
}
#endif
}
namespace
@ -94,6 +202,8 @@ void KeeperContext::initializeDisks(const Poco::Util::AbstractConfiguration & co
{
disk_selector->initialize(config, "storage_configuration.disks", Context::getGlobalContextInstance(), diskValidator);
rocksdb_storage = getRocksDBPathFromConfig(config);
log_storage = getLogsPathFromConfig(config);
if (config.has("keeper_server.latest_log_storage_disk"))
@ -262,6 +372,37 @@ void KeeperContext::dumpConfiguration(WriteBufferFromOwnString & buf) const
}
}
void KeeperContext::setRocksDBDisk(DiskPtr disk)
{
rocksdb_storage = std::move(disk);
}
DiskPtr KeeperContext::getTemporaryRocksDBDisk() const
{
DiskPtr rocksdb_disk = getDisk(rocksdb_storage);
if (!rocksdb_disk)
{
throw Exception(ErrorCodes::LOGICAL_ERROR, "rocksdb storage is not initialized");
}
auto uuid_str = formatUUID(UUIDHelpers::generateV4());
String path_to_create = "rocks_" + std::string(uuid_str.data(), uuid_str.size());
rocksdb_disk->createDirectory(path_to_create);
return std::make_shared<DiskLocal>("LocalTmpRocksDBDisk", fullPath(rocksdb_disk, path_to_create));
}
void KeeperContext::setRocksDBOptions(std::shared_ptr<rocksdb::Options> rocksdb_options_)
{
if (rocksdb_options_ != nullptr)
rocksdb_options = rocksdb_options_;
else
{
#if USE_ROCKSDB
rocksdb_options = std::make_shared<rocksdb::Options>(getRocksDBOptionsFromConfig(Poco::Util::JSONConfiguration()));
#endif
}
}
KeeperContext::Storage KeeperContext::getLogsPathFromConfig(const Poco::Util::AbstractConfiguration & config) const
{
const auto create_local_disk = [](const auto & path)

View File

@ -6,6 +6,11 @@
#include <cstdint>
#include <memory>
namespace rocksdb
{
struct Options;
}
namespace DB
{
@ -62,6 +67,12 @@ public:
constexpr KeeperDispatcher * getDispatcher() const { return dispatcher; }
void setRocksDBDisk(DiskPtr disk);
DiskPtr getTemporaryRocksDBDisk() const;
void setRocksDBOptions(std::shared_ptr<rocksdb::Options> rocksdb_options_ = nullptr);
std::shared_ptr<rocksdb::Options> getRocksDBOptions() const { return rocksdb_options; }
UInt64 getKeeperMemorySoftLimit() const { return memory_soft_limit; }
void updateKeeperMemorySoftLimit(const Poco::Util::AbstractConfiguration & config);
@ -90,6 +101,7 @@ private:
void initializeFeatureFlags(const Poco::Util::AbstractConfiguration & config);
void initializeDisks(const Poco::Util::AbstractConfiguration & config);
Storage getRocksDBPathFromConfig(const Poco::Util::AbstractConfiguration & config) const;
Storage getLogsPathFromConfig(const Poco::Util::AbstractConfiguration & config) const;
Storage getSnapshotsPathFromConfig(const Poco::Util::AbstractConfiguration & config) const;
Storage getStatePathFromConfig(const Poco::Util::AbstractConfiguration & config) const;
@ -111,12 +123,15 @@ private:
std::shared_ptr<DiskSelector> disk_selector;
Storage rocksdb_storage;
Storage log_storage;
Storage latest_log_storage;
Storage snapshot_storage;
Storage latest_snapshot_storage;
Storage state_file_storage;
std::shared_ptr<rocksdb::Options> rocksdb_options;
std::vector<std::string> old_log_disk_names;
std::vector<std::string> old_snapshot_disk_names;

View File

@ -117,13 +117,13 @@ void KeeperDispatcher::requestThread()
RaftAppendResult prev_result = nullptr;
/// Requests from previous iteration. We store them to be able
/// to send errors to the client.
KeeperStorage::RequestsForSessions prev_batch;
KeeperStorageBase::RequestsForSessions prev_batch;
const auto & shutdown_called = keeper_context->isShutdownCalled();
while (!shutdown_called)
{
KeeperStorage::RequestForSession request;
KeeperStorageBase::RequestForSession request;
auto coordination_settings = configuration_and_settings->coordination_settings;
uint64_t max_wait = coordination_settings->operation_timeout_ms.totalMilliseconds();
@ -159,7 +159,7 @@ void KeeperDispatcher::requestThread()
continue;
}
KeeperStorage::RequestsForSessions current_batch;
KeeperStorageBase::RequestsForSessions current_batch;
size_t current_batch_bytes_size = 0;
bool has_read_request = false;
@ -317,7 +317,7 @@ void KeeperDispatcher::responseThread()
const auto & shutdown_called = keeper_context->isShutdownCalled();
while (!shutdown_called)
{
KeeperStorage::ResponseForSession response_for_session;
KeeperStorageBase::ResponseForSession response_for_session;
uint64_t max_wait = configuration_and_settings->coordination_settings->operation_timeout_ms.totalMilliseconds();
@ -408,7 +408,7 @@ bool KeeperDispatcher::putRequest(const Coordination::ZooKeeperRequestPtr & requ
return false;
}
KeeperStorage::RequestForSession request_info;
KeeperStorageBase::RequestForSession request_info;
request_info.request = request;
using namespace std::chrono;
request_info.time = duration_cast<milliseconds>(system_clock::now().time_since_epoch()).count();
@ -454,7 +454,7 @@ void KeeperDispatcher::initialize(const Poco::Util::AbstractConfiguration & conf
snapshots_queue,
keeper_context,
snapshot_s3,
[this](uint64_t /*log_idx*/, const KeeperStorage::RequestForSession & request_for_session)
[this](uint64_t /*log_idx*/, const KeeperStorageBase::RequestForSession & request_for_session)
{
{
/// check if we have queue of read requests depending on this request to be committed
@ -546,7 +546,7 @@ void KeeperDispatcher::shutdown()
update_configuration_thread.join();
}
KeeperStorage::RequestForSession request_for_session;
KeeperStorageBase::RequestForSession request_for_session;
/// Set session expired for all pending requests
while (requests_queue && requests_queue->tryPop(request_for_session))
@ -557,7 +557,7 @@ void KeeperDispatcher::shutdown()
setResponse(request_for_session.session_id, response);
}
KeeperStorage::RequestsForSessions close_requests;
KeeperStorageBase::RequestsForSessions close_requests;
{
/// Clear all registered sessions
std::lock_guard lock(session_to_response_callback_mutex);
@ -571,7 +571,7 @@ void KeeperDispatcher::shutdown()
auto request = Coordination::ZooKeeperRequestFactory::instance().get(Coordination::OpNum::Close);
request->xid = Coordination::CLOSE_XID;
using namespace std::chrono;
KeeperStorage::RequestForSession request_info
KeeperStorageBase::RequestForSession request_info
{
.session_id = session,
.time = duration_cast<milliseconds>(system_clock::now().time_since_epoch()).count(),
@ -669,7 +669,7 @@ void KeeperDispatcher::sessionCleanerTask()
auto request = Coordination::ZooKeeperRequestFactory::instance().get(Coordination::OpNum::Close);
request->xid = Coordination::CLOSE_XID;
using namespace std::chrono;
KeeperStorage::RequestForSession request_info
KeeperStorageBase::RequestForSession request_info
{
.session_id = dead_session,
.time = duration_cast<milliseconds>(system_clock::now().time_since_epoch()).count(),
@ -717,16 +717,16 @@ void KeeperDispatcher::finishSession(int64_t session_id)
}
}
void KeeperDispatcher::addErrorResponses(const KeeperStorage::RequestsForSessions & requests_for_sessions, Coordination::Error error)
void KeeperDispatcher::addErrorResponses(const KeeperStorageBase::RequestsForSessions & requests_for_sessions, Coordination::Error error)
{
for (const auto & request_for_session : requests_for_sessions)
{
KeeperStorage::ResponsesForSessions responses;
KeeperStorageBase::ResponsesForSessions responses;
auto response = request_for_session.request->makeResponse();
response->xid = request_for_session.request->xid;
response->zxid = 0;
response->error = error;
if (!responses_queue.push(DB::KeeperStorage::ResponseForSession{request_for_session.session_id, response}))
if (!responses_queue.push(DB::KeeperStorageBase::ResponseForSession{request_for_session.session_id, response}))
throw Exception(ErrorCodes::SYSTEM_ERROR,
"Could not push error response xid {} zxid {} error message {} to responses queue",
response->xid,
@ -736,7 +736,7 @@ void KeeperDispatcher::addErrorResponses(const KeeperStorage::RequestsForSession
}
nuraft::ptr<nuraft::buffer> KeeperDispatcher::forceWaitAndProcessResult(
RaftAppendResult & result, KeeperStorage::RequestsForSessions & requests_for_sessions, bool clear_requests_on_success)
RaftAppendResult & result, KeeperStorageBase::RequestsForSessions & requests_for_sessions, bool clear_requests_on_success)
{
if (!result->has_result())
result->get();
@ -761,7 +761,7 @@ int64_t KeeperDispatcher::getSessionID(int64_t session_timeout_ms)
{
/// New session id allocation is a special request, because we cannot process it in the normal
/// way: get request -> put to raft -> set response for registered callback.
KeeperStorage::RequestForSession request_info;
KeeperStorageBase::RequestForSession request_info;
std::shared_ptr<Coordination::ZooKeeperSessionIDRequest> request = std::make_shared<Coordination::ZooKeeperSessionIDRequest>();
/// Internal session id. It's a temporary number which is unique for each client on this server
/// but can be the same on different servers.
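
The comment above contrasts session-ID allocation with the normal request flow, in which a response callback is registered per session and invoked once Raft commits the request. A minimal sketch of that callback registry, using hypothetical ToyDispatcher/ToyResponse names rather than the real dispatcher API:

#include <cstdint>
#include <functional>
#include <iostream>
#include <mutex>
#include <unordered_map>

struct ToyResponse { int64_t session_id = 0; };
using ResponseCallback = std::function<void(const ToyResponse &)>;

class ToyDispatcher
{
    std::mutex mutex;
    std::unordered_map<int64_t, ResponseCallback> callbacks;

public:
    /// Register a callback before the request is pushed towards Raft.
    void registerSession(int64_t session_id, ResponseCallback cb)
    {
        std::lock_guard lock(mutex);
        callbacks[session_id] = std::move(cb);
    }

    /// Called from the responses thread once the request is committed.
    void setResponse(int64_t session_id, const ToyResponse & response)
    {
        std::lock_guard lock(mutex);
        if (auto it = callbacks.find(session_id); it != callbacks.end())
            it->second(response);
    }
};

int main()
{
    ToyDispatcher dispatcher;
    dispatcher.registerSession(7, [](const ToyResponse & r) { std::cout << "session " << r.session_id << " got a response\n"; });
    dispatcher.setResponse(7, ToyResponse{.session_id = 7});
}

Session-ID allocation cannot be keyed this way because the session id does not exist yet, which is why the code above tracks the request by a temporary internal id instead.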


@ -26,7 +26,7 @@ using ZooKeeperResponseCallback = std::function<void(const Coordination::ZooKeep
class KeeperDispatcher
{
private:
using RequestsQueue = ConcurrentBoundedQueue<KeeperStorage::RequestForSession>;
using RequestsQueue = ConcurrentBoundedQueue<KeeperStorageBase::RequestForSession>;
using SessionToResponseCallback = std::unordered_map<int64_t, ZooKeeperResponseCallback>;
using ClusterUpdateQueue = ConcurrentBoundedQueue<ClusterUpdateAction>;
@ -95,18 +95,18 @@ private:
/// Add error responses for requests to responses queue.
/// Clears requests.
void addErrorResponses(const KeeperStorage::RequestsForSessions & requests_for_sessions, Coordination::Error error);
void addErrorResponses(const KeeperStorageBase::RequestsForSessions & requests_for_sessions, Coordination::Error error);
/// Forcefully wait for the result and set errors if something went wrong.
/// Clears both arguments
nuraft::ptr<nuraft::buffer> forceWaitAndProcessResult(
RaftAppendResult & result, KeeperStorage::RequestsForSessions & requests_for_sessions, bool clear_requests_on_success);
RaftAppendResult & result, KeeperStorageBase::RequestsForSessions & requests_for_sessions, bool clear_requests_on_success);
public:
std::mutex read_request_queue_mutex;
/// queue of read requests that can be processed after a request with a specific session ID and XID is committed
std::unordered_map<int64_t, std::unordered_map<Coordination::XID, KeeperStorage::RequestsForSessions>> read_request_queue;
std::unordered_map<int64_t, std::unordered_map<Coordination::XID, KeeperStorageBase::RequestsForSessions>> read_request_queue;
/// Just allocate some objects, real initialization is done by the `initialize` method
KeeperDispatcher();
@ -192,7 +192,7 @@ public:
Keeper4LWInfo getKeeper4LWInfo() const;
const KeeperStateMachine & getStateMachine() const
const IKeeperStateMachine & getStateMachine() const
{
return *server->getKeeperStateMachine();
}


@ -123,7 +123,7 @@ KeeperServer::KeeperServer(
SnapshotsQueue & snapshots_queue_,
KeeperContextPtr keeper_context_,
KeeperSnapshotManagerS3 & snapshot_manager_s3,
KeeperStateMachine::CommitCallback commit_callback)
IKeeperStateMachine::CommitCallback commit_callback)
: server_id(configuration_and_settings_->server_id)
, log(getLogger("KeeperServer"))
, is_recovering(config.getBool("keeper_server.force_recovery", false))
@ -134,13 +134,28 @@ KeeperServer::KeeperServer(
if (keeper_context->getCoordinationSettings()->quorum_reads)
LOG_WARNING(log, "Quorum reads enabled, Keeper will work slower.");
state_machine = nuraft::cs_new<KeeperStateMachine>(
responses_queue_,
snapshots_queue_,
keeper_context,
config.getBool("keeper_server.upload_snapshot_on_exit", false) ? &snapshot_manager_s3 : nullptr,
commit_callback,
checkAndGetSuperdigest(configuration_and_settings_->super_digest));
#if USE_ROCKSDB
const auto & coordination_settings = keeper_context->getCoordinationSettings();
if (coordination_settings->experimental_use_rocksdb)
{
state_machine = nuraft::cs_new<KeeperStateMachine<KeeperRocksStorage>>(
responses_queue_,
snapshots_queue_,
keeper_context,
config.getBool("keeper_server.upload_snapshot_on_exit", false) ? &snapshot_manager_s3 : nullptr,
commit_callback,
checkAndGetSuperdigest(configuration_and_settings_->super_digest));
LOG_WARNING(log, "Use RocksDB as Keeper backend storage.");
}
else
#endif
state_machine = nuraft::cs_new<KeeperStateMachine<KeeperMemoryStorage>>(
responses_queue_,
snapshots_queue_,
keeper_context,
config.getBool("keeper_server.upload_snapshot_on_exit", false) ? &snapshot_manager_s3 : nullptr,
commit_callback,
checkAndGetSuperdigest(configuration_and_settings_->super_digest));
state_manager = nuraft::cs_new<KeeperStateManager>(
server_id,
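
Since the state machine is now a class template, the constructor above has to pick one concrete instantiation at runtime from the `experimental_use_rocksdb` setting, while callers keep holding a non-template IKeeperStateMachine pointer. A minimal sketch of the same pattern, with hypothetical Toy* stand-ins for the real classes:

#include <iostream>
#include <memory>
#include <string>

/// Hypothetical stand-ins for KeeperMemoryStorage / KeeperRocksStorage.
struct ToyMemoryStorage { static constexpr const char * name = "memory"; };
struct ToyRocksStorage  { static constexpr const char * name = "rocksdb"; };

/// Non-template interface, mirroring the IKeeperStateMachine split.
struct IToyStateMachine
{
    virtual ~IToyStateMachine() = default;
    virtual std::string backendName() const = 0;
};

/// Templated implementation: one instantiation per backend.
template <typename Storage>
struct ToyStateMachine : IToyStateMachine
{
    std::string backendName() const override { return Storage::name; }
};

/// A runtime config flag decides which compile-time instantiation to
/// construct, like the `experimental_use_rocksdb` branch above.
std::unique_ptr<IToyStateMachine> makeStateMachine(bool use_rocksdb)
{
    if (use_rocksdb)
        return std::make_unique<ToyStateMachine<ToyRocksStorage>>();
    return std::make_unique<ToyStateMachine<ToyMemoryStorage>>();
}

int main()
{
    std::cout << makeStateMachine(false)->backendName() << '\n'; /// memory
    std::cout << makeStateMachine(true)->backendName() << '\n';  /// rocksdb
}

Both instantiations are compiled into the binary (see the explicit instantiations at the end of the state machine's translation unit); the flag only chooses which one to construct.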
@ -522,7 +537,7 @@ namespace
{
// Serialize the request for the log entry
nuraft::ptr<nuraft::buffer> getZooKeeperLogEntry(const KeeperStorage::RequestForSession & request_for_session)
nuraft::ptr<nuraft::buffer> getZooKeeperLogEntry(const KeeperStorageBase::RequestForSession & request_for_session)
{
DB::WriteBufferFromNuraftBuffer write_buf;
DB::writeIntBinary(request_for_session.session_id, write_buf);
@ -530,7 +545,7 @@ nuraft::ptr<nuraft::buffer> getZooKeeperLogEntry(const KeeperStorage::RequestFor
DB::writeIntBinary(request_for_session.time, write_buf);
/// we fill with dummy values to eliminate an unnecessary copy later on when we write the correct values
DB::writeIntBinary(static_cast<int64_t>(0), write_buf); /// zxid
DB::writeIntBinary(KeeperStorage::DigestVersion::NO_DIGEST, write_buf); /// digest version or NO_DIGEST flag
DB::writeIntBinary(KeeperStorageBase::DigestVersion::NO_DIGEST, write_buf); /// digest version or NO_DIGEST flag
DB::writeIntBinary(static_cast<uint64_t>(0), write_buf); /// digest value
/// if new fields are added, update KeeperStateMachine::ZooKeeperLogSerializationVersion along with parseRequest function and PreAppendLog callback handler
return write_buf.getBuffer();
@ -538,7 +553,7 @@ nuraft::ptr<nuraft::buffer> getZooKeeperLogEntry(const KeeperStorage::RequestFor
}
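
getZooKeeperLogEntry above writes the fields in a fixed order and pads the tail (zxid, digest version, digest value) with dummies so that a later pre-append hook can overwrite them in place. A hedged sketch of that layout with standalone helpers; the exact integer widths of the real serialization are assumptions here:

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

template <typename T>
void writeIntBinary(T v, std::vector<char> & out)
{
    const char * p = reinterpret_cast<const char *>(&v);
    out.insert(out.end(), p, p + sizeof(v));
}

template <typename T>
T readIntBinary(const std::vector<char> & in, size_t & pos)
{
    T v;
    std::memcpy(&v, in.data() + pos, sizeof(v));
    pos += sizeof(v);
    return v;
}

int main()
{
    std::vector<char> buf;
    writeIntBinary<int64_t>(42, buf);       /// session_id
    /// ... the serialized request body would follow here ...
    writeIntBinary<int64_t>(1234567, buf);  /// time
    writeIntBinary<int64_t>(0, buf);        /// zxid, dummy, patched in place later
    writeIntBinary<uint8_t>(0, buf);        /// digest version, NO_DIGEST (width assumed)
    writeIntBinary<uint64_t>(0, buf);       /// digest value, dummy

    size_t pos = 0;
    assert(readIntBinary<int64_t>(buf, pos) == 42);
    assert(readIntBinary<int64_t>(buf, pos) == 1234567);
    assert(readIntBinary<int64_t>(buf, pos) == 0);
}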
void KeeperServer::putLocalReadRequest(const KeeperStorage::RequestForSession & request_for_session)
void KeeperServer::putLocalReadRequest(const KeeperStorageBase::RequestForSession & request_for_session)
{
if (!request_for_session.request->isReadRequest())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot process non-read request locally");
@ -546,7 +561,7 @@ void KeeperServer::putLocalReadRequest(const KeeperStorage::RequestForSession &
state_machine->processReadRequest(request_for_session);
}
RaftAppendResult KeeperServer::putRequestBatch(const KeeperStorage::RequestsForSessions & requests_for_sessions)
RaftAppendResult KeeperServer::putRequestBatch(const KeeperStorageBase::RequestsForSessions & requests_for_sessions)
{
std::vector<nuraft::ptr<nuraft::buffer>> entries;
entries.reserve(requests_for_sessions.size());
@ -789,7 +804,7 @@ nuraft::cb_func::ReturnCode KeeperServer::callbackFunc(nuraft::cb_func::Type typ
auto entry_buf = entry->get_buf_ptr();
KeeperStateMachine::ZooKeeperLogSerializationVersion serialization_version;
IKeeperStateMachine::ZooKeeperLogSerializationVersion serialization_version;
auto request_for_session = state_machine->parseRequest(*entry_buf, /*final=*/false, &serialization_version);
request_for_session->zxid = next_zxid;
if (!state_machine->preprocess(*request_for_session))
@ -799,10 +814,10 @@ nuraft::cb_func::ReturnCode KeeperServer::callbackFunc(nuraft::cb_func::Type typ
/// older versions of Keeper can send logs that are missing some fields
size_t bytes_missing = 0;
if (serialization_version < KeeperStateMachine::ZooKeeperLogSerializationVersion::WITH_TIME)
if (serialization_version < IKeeperStateMachine::ZooKeeperLogSerializationVersion::WITH_TIME)
bytes_missing += sizeof(request_for_session->time);
if (serialization_version < KeeperStateMachine::ZooKeeperLogSerializationVersion::WITH_ZXID_DIGEST)
if (serialization_version < IKeeperStateMachine::ZooKeeperLogSerializationVersion::WITH_ZXID_DIGEST)
bytes_missing += sizeof(request_for_session->zxid) + sizeof(request_for_session->digest->version) + sizeof(request_for_session->digest->value);
if (bytes_missing != 0)
@ -816,19 +831,19 @@ nuraft::cb_func::ReturnCode KeeperServer::callbackFunc(nuraft::cb_func::Type typ
size_t write_buffer_header_size
= sizeof(request_for_session->zxid) + sizeof(request_for_session->digest->version) + sizeof(request_for_session->digest->value);
if (serialization_version < KeeperStateMachine::ZooKeeperLogSerializationVersion::WITH_TIME)
if (serialization_version < IKeeperStateMachine::ZooKeeperLogSerializationVersion::WITH_TIME)
write_buffer_header_size += sizeof(request_for_session->time);
auto * buffer_start = reinterpret_cast<BufferBase::Position>(entry_buf->data_begin() + entry_buf->size() - write_buffer_header_size);
WriteBufferFromPointer write_buf(buffer_start, write_buffer_header_size);
if (serialization_version < KeeperStateMachine::ZooKeeperLogSerializationVersion::WITH_TIME)
if (serialization_version < IKeeperStateMachine::ZooKeeperLogSerializationVersion::WITH_TIME)
writeIntBinary(request_for_session->time, write_buf);
writeIntBinary(request_for_session->zxid, write_buf);
writeIntBinary(request_for_session->digest->version, write_buf);
if (request_for_session->digest->version != KeeperStorage::NO_DIGEST)
if (request_for_session->digest->version != KeeperStorageBase::NO_DIGEST)
writeIntBinary(request_for_session->digest->value, write_buf);
write_buf.finalize();
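
The in-place patching above only works because the missing fields form a fixed-size tail, so their total size can be computed from the serialization version alone. A sketch of that arithmetic, assuming int64_t time/zxid, a 1-byte digest version and an 8-byte digest value (the real widths may differ):

#include <cstddef>
#include <cstdint>
#include <iostream>

enum class SerializationVersion : uint8_t { INITIAL = 0, WITH_TIME = 1, WITH_ZXID_DIGEST = 2 };

size_t bytesMissing(SerializationVersion v)
{
    size_t missing = 0;
    if (v < SerializationVersion::WITH_TIME)
        missing += sizeof(int64_t);                        /// time
    if (v < SerializationVersion::WITH_ZXID_DIGEST)
        missing += sizeof(int64_t) + 1 + sizeof(uint64_t); /// zxid + digest version + digest value
    return missing;
}

int main()
{
    std::cout << bytesMissing(SerializationVersion::INITIAL) << '\n';          /// 25: grow by time + trailer
    std::cout << bytesMissing(SerializationVersion::WITH_ZXID_DIGEST) << '\n'; /// 0: nothing to patch
}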


@ -24,7 +24,7 @@ class KeeperServer
private:
const int server_id;
nuraft::ptr<KeeperStateMachine> state_machine;
nuraft::ptr<IKeeperStateMachine> state_machine;
nuraft::ptr<KeeperStateManager> state_manager;
@ -79,26 +79,26 @@ public:
SnapshotsQueue & snapshots_queue_,
KeeperContextPtr keeper_context_,
KeeperSnapshotManagerS3 & snapshot_manager_s3,
KeeperStateMachine::CommitCallback commit_callback);
IKeeperStateMachine::CommitCallback commit_callback);
/// Load state machine from the latest snapshot and load log storage. Start NuRaft with required settings.
void startup(const Poco::Util::AbstractConfiguration & config, bool enable_ipv6 = true);
/// Put a local read request, execute it in the state machine directly and put the response into
/// the responses queue
void putLocalReadRequest(const KeeperStorage::RequestForSession & request);
void putLocalReadRequest(const KeeperStorageBase::RequestForSession & request);
bool isRecovering() const { return is_recovering; }
bool reconfigEnabled() const { return enable_reconfiguration; }
/// Put batch of requests into Raft and get result of put. Responses will be set separately into
/// responses_queue.
RaftAppendResult putRequestBatch(const KeeperStorage::RequestsForSessions & requests);
RaftAppendResult putRequestBatch(const KeeperStorageBase::RequestsForSessions & requests);
/// Return set of the non-active sessions
std::vector<int64_t> getDeadSessions();
nuraft::ptr<KeeperStateMachine> getKeeperStateMachine() const { return state_machine; }
nuraft::ptr<IKeeperStateMachine> getKeeperStateMachine() const { return state_machine; }
void forceRecovery();


@ -66,7 +66,8 @@ namespace
return base;
}
void writeNode(const KeeperStorage::Node & node, SnapshotVersion version, WriteBuffer & out)
template<typename Node>
void writeNode(const Node & node, SnapshotVersion version, WriteBuffer & out)
{
writeBinary(node.getData(), out);
@ -86,7 +87,7 @@ namespace
writeBinary(node.aversion, out);
writeBinary(node.ephemeralOwner(), out);
if (version < SnapshotVersion::V6)
writeBinary(static_cast<int32_t>(node.data_size), out);
writeBinary(static_cast<int32_t>(node.getData().size()), out);
writeBinary(node.numChildren(), out);
writeBinary(node.pzxid, out);
@ -96,7 +97,8 @@ namespace
writeBinary(node.sizeInBytes(), out);
}
void readNode(KeeperStorage::Node & node, ReadBuffer & in, SnapshotVersion version, ACLMap & acl_map)
template<typename Node>
void readNode(Node & node, ReadBuffer & in, SnapshotVersion version, ACLMap & acl_map)
{
readVarUInt(node.data_size, in);
if (node.data_size != 0)
@ -195,7 +197,8 @@ namespace
}
}
void KeeperStorageSnapshot::serialize(const KeeperStorageSnapshot & snapshot, WriteBuffer & out, KeeperContextPtr keeper_context)
template<typename Storage>
void KeeperStorageSnapshot<Storage>::serialize(const KeeperStorageSnapshot<Storage> & snapshot, WriteBuffer & out, KeeperContextPtr keeper_context)
{
writeBinary(static_cast<uint8_t>(snapshot.version), out);
serializeSnapshotMetadata(snapshot.snapshot_meta, out);
@ -205,11 +208,11 @@ void KeeperStorageSnapshot::serialize(const KeeperStorageSnapshot & snapshot, Wr
writeBinary(snapshot.zxid, out);
if (keeper_context->digestEnabled())
{
writeBinary(static_cast<uint8_t>(KeeperStorage::CURRENT_DIGEST_VERSION), out);
writeBinary(static_cast<uint8_t>(Storage::CURRENT_DIGEST_VERSION), out);
writeBinary(snapshot.nodes_digest, out);
}
else
writeBinary(static_cast<uint8_t>(KeeperStorage::NO_DIGEST), out);
writeBinary(static_cast<uint8_t>(Storage::NO_DIGEST), out);
}
writeBinary(snapshot.session_id, out);
@ -255,7 +258,6 @@ void KeeperStorageSnapshot::serialize(const KeeperStorageSnapshot & snapshot, Wr
/// slightly bigger than required.
if (node.mzxid > snapshot.zxid)
break;
writeBinary(path, out);
writeNode(node, snapshot.version, out);
@ -282,7 +284,7 @@ void KeeperStorageSnapshot::serialize(const KeeperStorageSnapshot & snapshot, Wr
writeBinary(session_id, out);
writeBinary(timeout, out);
KeeperStorage::AuthIDs ids;
KeeperStorageBase::AuthIDs ids;
if (snapshot.session_and_auth.contains(session_id))
ids = snapshot.session_and_auth.at(session_id);
@ -303,7 +305,8 @@ void KeeperStorageSnapshot::serialize(const KeeperStorageSnapshot & snapshot, Wr
}
}
void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserialization_result, ReadBuffer & in, KeeperContextPtr keeper_context)
template<typename Storage>
void KeeperStorageSnapshot<Storage>::deserialize(SnapshotDeserializationResult<Storage> & deserialization_result, ReadBuffer & in, KeeperContextPtr keeper_context)
{
uint8_t version;
readBinary(version, in);
@ -312,7 +315,7 @@ void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserial
throw Exception(ErrorCodes::UNKNOWN_FORMAT_VERSION, "Unsupported snapshot version {}", version);
deserialization_result.snapshot_meta = deserializeSnapshotMetadata(in);
KeeperStorage & storage = *deserialization_result.storage;
Storage & storage = *deserialization_result.storage;
bool recalculate_digest = keeper_context->digestEnabled();
if (version >= SnapshotVersion::V5)
@ -320,11 +323,11 @@ void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserial
readBinary(storage.zxid, in);
uint8_t digest_version;
readBinary(digest_version, in);
if (digest_version != KeeperStorage::DigestVersion::NO_DIGEST)
if (digest_version != Storage::DigestVersion::NO_DIGEST)
{
uint64_t nodes_digest;
readBinary(nodes_digest, in);
if (digest_version == KeeperStorage::CURRENT_DIGEST_VERSION)
if (digest_version == Storage::CURRENT_DIGEST_VERSION)
{
storage.nodes_digest = nodes_digest;
recalculate_digest = false;
@ -374,8 +377,8 @@ void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserial
size_t snapshot_container_size;
readBinary(snapshot_container_size, in);
storage.container.reserve(snapshot_container_size);
if constexpr (!use_rocksdb)
storage.container.reserve(snapshot_container_size);
if (recalculate_digest)
storage.nodes_digest = 0;
@ -389,7 +392,7 @@ void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserial
in.readStrict(path_data.get(), path_size);
std::string_view path{path_data.get(), path_size};
KeeperStorage::Node node{};
typename Storage::Node node{};
readNode(node, in, current_version, storage.acl_map);
using enum Coordination::PathMatchResult;
@ -421,7 +424,7 @@ void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserial
if (keeper_context->ignoreSystemPathOnStartup() || keeper_context->getServerState() != KeeperContext::Phase::INIT)
{
LOG_ERROR(getLogger("KeeperSnapshotManager"), "{}. Ignoring it", get_error_msg());
node = KeeperStorage::Node{};
node = typename Storage::Node{};
}
else
throw Exception(
@ -433,8 +436,9 @@ void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserial
}
auto ephemeral_owner = node.ephemeralOwner();
if (!node.isEphemeral() && node.numChildren() > 0)
node.getChildren().reserve(node.numChildren());
if constexpr (!use_rocksdb)
if (!node.isEphemeral() && node.numChildren() > 0)
node.getChildren().reserve(node.numChildren());
if (ephemeral_owner != 0)
storage.ephemerals[node.ephemeralOwner()].insert(std::string{path});
@ -447,36 +451,38 @@ void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserial
LOG_TRACE(getLogger("KeeperSnapshotManager"), "Building structure for children nodes");
for (const auto & itr : storage.container)
if constexpr (!use_rocksdb)
{
if (itr.key != "/")
for (const auto & itr : storage.container)
{
auto parent_path = parentNodePath(itr.key);
storage.container.updateValue(
parent_path, [path = itr.key](KeeperStorage::Node & value) { value.addChild(getBaseNodeName(path)); });
}
}
for (const auto & itr : storage.container)
{
if (itr.key != "/")
{
if (itr.value.numChildren() != static_cast<int32_t>(itr.value.getChildren().size()))
if (itr.key != "/")
{
auto parent_path = parentNodePath(itr.key);
storage.container.updateValue(
parent_path, [path = itr.key](typename Storage::Node & value) { value.addChild(getBaseNodeName(path)); });
}
}
for (const auto & itr : storage.container)
{
if (itr.key != "/")
{
if (itr.value.numChildren() != static_cast<int32_t>(itr.value.getChildren().size()))
{
#ifdef NDEBUG
/// TODO (alesapin) remove this, it should always be CORRUPTED_DATA.
LOG_ERROR(getLogger("KeeperSnapshotManager"), "Children counter in stat.numChildren {}"
" is different from actual children size {} for node {}", itr.value.numChildren(), itr.value.getChildren().size(), itr.key);
/// TODO (alesapin) remove this, it should always be CORRUPTED_DATA.
LOG_ERROR(getLogger("KeeperSnapshotManager"), "Children counter in stat.numChildren {}"
" is different from actual children size {} for node {}", itr.value.numChildren(), itr.value.getChildren().size(), itr.key);
#else
throw Exception(ErrorCodes::LOGICAL_ERROR, "Children counter in stat.numChildren {}"
" is different from actual children size {} for node {}",
itr.value.numChildren(), itr.value.getChildren().size(), itr.key);
throw Exception(ErrorCodes::LOGICAL_ERROR, "Children counter in stat.numChildren {}"
" is different from actual children size {} for node {}",
itr.value.numChildren(), itr.value.getChildren().size(), itr.key);
#endif
}
}
}
}
size_t active_sessions_size;
readBinary(active_sessions_size, in);
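
The `if constexpr (!use_rocksdb)` guards above rely on the discarded branch of a template not being instantiated, so calls like `reserve()` do not even have to compile for the RocksDB-backed container. A small self-contained illustration, using std::unordered_map and std::map as stand-ins for the in-memory and disk-backed containers:

#include <cstddef>
#include <map>
#include <string>
#include <unordered_map>

template <typename Container, bool is_rocksdb>
void loadNodes(Container & container, size_t expected_size)
{
    /// The discarded branch is never instantiated, so `reserve()`
    /// does not need to exist on the rocksdb-like container.
    if constexpr (!is_rocksdb)
        container.reserve(expected_size);

    container.emplace("/", 0); /// ... real code would insert the deserialized nodes here ...
}

int main()
{
    std::unordered_map<std::string, int> memory_like;
    loadNodes<decltype(memory_like), /*is_rocksdb=*/false>(memory_like, 1024);

    std::map<std::string, int> rocksdb_like; /// std::map has no reserve(), like a disk-backed store
    loadNodes<decltype(rocksdb_like), /*is_rocksdb=*/true>(rocksdb_like, 1024);
}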
@ -493,14 +499,14 @@ void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserial
size_t session_auths_size;
readBinary(session_auths_size, in);
KeeperStorage::AuthIDs ids;
typename Storage::AuthIDs ids;
size_t session_auth_counter = 0;
while (session_auth_counter < session_auths_size)
{
String scheme, id;
readBinary(scheme, in);
readBinary(id, in);
ids.emplace_back(KeeperStorage::AuthID{scheme, id});
ids.emplace_back(typename Storage::AuthID{scheme, id});
session_auth_counter++;
}
@ -523,7 +529,8 @@ void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserial
}
}
KeeperStorageSnapshot::KeeperStorageSnapshot(KeeperStorage * storage_, uint64_t up_to_log_idx_, const ClusterConfigPtr & cluster_config_)
template<typename Storage>
KeeperStorageSnapshot<Storage>::KeeperStorageSnapshot(Storage * storage_, uint64_t up_to_log_idx_, const ClusterConfigPtr & cluster_config_)
: storage(storage_)
, snapshot_meta(std::make_shared<SnapshotMetadata>(up_to_log_idx_, 0, std::make_shared<nuraft::cluster_config>()))
, session_id(storage->session_id_counter)
@ -540,8 +547,9 @@ KeeperStorageSnapshot::KeeperStorageSnapshot(KeeperStorage * storage_, uint64_t
session_and_auth = storage->session_and_auth;
}
KeeperStorageSnapshot::KeeperStorageSnapshot(
KeeperStorage * storage_, const SnapshotMetadataPtr & snapshot_meta_, const ClusterConfigPtr & cluster_config_)
template<typename Storage>
KeeperStorageSnapshot<Storage>::KeeperStorageSnapshot(
Storage * storage_, const SnapshotMetadataPtr & snapshot_meta_, const ClusterConfigPtr & cluster_config_)
: storage(storage_)
, snapshot_meta(snapshot_meta_)
, session_id(storage->session_id_counter)
@ -558,12 +566,14 @@ KeeperStorageSnapshot::KeeperStorageSnapshot(
session_and_auth = storage->session_and_auth;
}
KeeperStorageSnapshot::~KeeperStorageSnapshot()
template<typename Storage>
KeeperStorageSnapshot<Storage>::~KeeperStorageSnapshot()
{
storage->disableSnapshotMode();
}
KeeperSnapshotManager::KeeperSnapshotManager(
template<typename Storage>
KeeperSnapshotManager<Storage>::KeeperSnapshotManager(
size_t snapshots_to_keep_,
const KeeperContextPtr & keeper_context_,
bool compress_snapshots_zstd_,
@ -651,7 +661,8 @@ KeeperSnapshotManager::KeeperSnapshotManager(
moveSnapshotsIfNeeded();
}
SnapshotFileInfoPtr KeeperSnapshotManager::serializeSnapshotBufferToDisk(nuraft::buffer & buffer, uint64_t up_to_log_idx)
template<typename Storage>
SnapshotFileInfoPtr KeeperSnapshotManager<Storage>::serializeSnapshotBufferToDisk(nuraft::buffer & buffer, uint64_t up_to_log_idx)
{
ReadBufferFromNuraftBuffer reader(buffer);
@ -680,7 +691,8 @@ SnapshotFileInfoPtr KeeperSnapshotManager::serializeSnapshotBufferToDisk(nuraft:
return snapshot_file_info;
}
nuraft::ptr<nuraft::buffer> KeeperSnapshotManager::deserializeLatestSnapshotBufferFromDisk()
template<typename Storage>
nuraft::ptr<nuraft::buffer> KeeperSnapshotManager<Storage>::deserializeLatestSnapshotBufferFromDisk()
{
while (!existing_snapshots.empty())
{
@ -701,7 +713,8 @@ nuraft::ptr<nuraft::buffer> KeeperSnapshotManager::deserializeLatestSnapshotBuff
return nullptr;
}
nuraft::ptr<nuraft::buffer> KeeperSnapshotManager::deserializeSnapshotBufferFromDisk(uint64_t up_to_log_idx) const
template<typename Storage>
nuraft::ptr<nuraft::buffer> KeeperSnapshotManager<Storage>::deserializeSnapshotBufferFromDisk(uint64_t up_to_log_idx) const
{
const auto & [snapshot_path, snapshot_disk, size] = *existing_snapshots.at(up_to_log_idx);
WriteBufferFromNuraftBuffer writer;
@ -710,7 +723,8 @@ nuraft::ptr<nuraft::buffer> KeeperSnapshotManager::deserializeSnapshotBufferFrom
return writer.getBuffer();
}
nuraft::ptr<nuraft::buffer> KeeperSnapshotManager::serializeSnapshotToBuffer(const KeeperStorageSnapshot & snapshot) const
template<typename Storage>
nuraft::ptr<nuraft::buffer> KeeperSnapshotManager<Storage>::serializeSnapshotToBuffer(const KeeperStorageSnapshot<Storage> & snapshot) const
{
std::unique_ptr<WriteBufferFromNuraftBuffer> writer = std::make_unique<WriteBufferFromNuraftBuffer>();
auto * buffer_raw_ptr = writer.get();
@ -720,13 +734,13 @@ nuraft::ptr<nuraft::buffer> KeeperSnapshotManager::serializeSnapshotToBuffer(con
else
compressed_writer = std::make_unique<CompressedWriteBuffer>(*writer);
KeeperStorageSnapshot::serialize(snapshot, *compressed_writer, keeper_context);
KeeperStorageSnapshot<Storage>::serialize(snapshot, *compressed_writer, keeper_context);
compressed_writer->finalize();
return buffer_raw_ptr->getBuffer();
}
bool KeeperSnapshotManager::isZstdCompressed(nuraft::ptr<nuraft::buffer> buffer)
template<typename Storage>
bool KeeperSnapshotManager<Storage>::isZstdCompressed(nuraft::ptr<nuraft::buffer> buffer)
{
static constexpr unsigned char ZSTD_COMPRESSED_MAGIC[4] = {0x28, 0xB5, 0x2F, 0xFD};
@ -737,7 +751,8 @@ bool KeeperSnapshotManager::isZstdCompressed(nuraft::ptr<nuraft::buffer> buffer)
return memcmp(magic_from_buffer, ZSTD_COMPRESSED_MAGIC, 4) == 0;
}
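
isZstdCompressed relies on the zstd frame magic 0x28 0xB5 0x2F 0xFD (the constant visible above). A standalone sketch of the same check:

#include <cassert>
#include <cstddef>
#include <cstring>

bool looksZstdCompressed(const unsigned char * data, size_t size)
{
    /// First four bytes of every zstd frame, same constant as in the code above.
    static constexpr unsigned char ZSTD_MAGIC[4] = {0x28, 0xB5, 0x2F, 0xFD};
    return size >= 4 && std::memcmp(data, ZSTD_MAGIC, 4) == 0;
}

int main()
{
    const unsigned char frame[] = {0x28, 0xB5, 0x2F, 0xFD, 0x00};
    const unsigned char other[] = {0x00, 0x01, 0x02, 0x03};
    assert(looksZstdCompressed(frame, sizeof(frame)));
    assert(!looksZstdCompressed(other, sizeof(other)));
}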
SnapshotDeserializationResult KeeperSnapshotManager::deserializeSnapshotFromBuffer(nuraft::ptr<nuraft::buffer> buffer) const
template<typename Storage>
SnapshotDeserializationResult<Storage> KeeperSnapshotManager<Storage>::deserializeSnapshotFromBuffer(nuraft::ptr<nuraft::buffer> buffer) const
{
bool is_zstd_compressed = isZstdCompressed(buffer);
@ -749,14 +764,15 @@ SnapshotDeserializationResult KeeperSnapshotManager::deserializeSnapshotFromBuff
else
compressed_reader = std::make_unique<CompressedReadBuffer>(*reader);
SnapshotDeserializationResult result;
result.storage = std::make_unique<KeeperStorage>(storage_tick_time, superdigest, keeper_context, /* initialize_system_nodes */ false);
KeeperStorageSnapshot::deserialize(result, *compressed_reader, keeper_context);
SnapshotDeserializationResult<Storage> result;
result.storage = std::make_unique<Storage>(storage_tick_time, superdigest, keeper_context, /* initialize_system_nodes */ false);
KeeperStorageSnapshot<Storage>::deserialize(result, *compressed_reader, keeper_context);
result.storage->initializeSystemNodes();
return result;
}
SnapshotDeserializationResult KeeperSnapshotManager::restoreFromLatestSnapshot()
template<typename Storage>
SnapshotDeserializationResult<Storage> KeeperSnapshotManager<Storage>::restoreFromLatestSnapshot()
{
if (existing_snapshots.empty())
return {};
@ -767,23 +783,27 @@ SnapshotDeserializationResult KeeperSnapshotManager::restoreFromLatestSnapshot()
return deserializeSnapshotFromBuffer(buffer);
}
DiskPtr KeeperSnapshotManager::getDisk() const
template<typename Storage>
DiskPtr KeeperSnapshotManager<Storage>::getDisk() const
{
return keeper_context->getSnapshotDisk();
}
DiskPtr KeeperSnapshotManager::getLatestSnapshotDisk() const
template<typename Storage>
DiskPtr KeeperSnapshotManager<Storage>::getLatestSnapshotDisk() const
{
return keeper_context->getLatestSnapshotDisk();
}
void KeeperSnapshotManager::removeOutdatedSnapshotsIfNeeded()
template<typename Storage>
void KeeperSnapshotManager<Storage>::removeOutdatedSnapshotsIfNeeded()
{
while (existing_snapshots.size() > snapshots_to_keep)
removeSnapshot(existing_snapshots.begin()->first);
}
void KeeperSnapshotManager::moveSnapshotsIfNeeded()
template<typename Storage>
void KeeperSnapshotManager<Storage>::moveSnapshotsIfNeeded()
{
/// move snapshots to correct disks
@ -813,7 +833,8 @@ void KeeperSnapshotManager::moveSnapshotsIfNeeded()
}
void KeeperSnapshotManager::removeSnapshot(uint64_t log_idx)
template<typename Storage>
void KeeperSnapshotManager<Storage>::removeSnapshot(uint64_t log_idx)
{
auto itr = existing_snapshots.find(log_idx);
if (itr == existing_snapshots.end())
@ -823,7 +844,8 @@ void KeeperSnapshotManager::removeSnapshot(uint64_t log_idx)
existing_snapshots.erase(itr);
}
SnapshotFileInfoPtr KeeperSnapshotManager::serializeSnapshotToDisk(const KeeperStorageSnapshot & snapshot)
template<typename Storage>
SnapshotFileInfoPtr KeeperSnapshotManager<Storage>::serializeSnapshotToDisk(const KeeperStorageSnapshot<Storage> & snapshot)
{
auto up_to_log_idx = snapshot.snapshot_meta->get_last_log_idx();
auto snapshot_file_name = getSnapshotFileName(up_to_log_idx, compress_snapshots_zstd);
@ -842,7 +864,7 @@ SnapshotFileInfoPtr KeeperSnapshotManager::serializeSnapshotToDisk(const KeeperS
else
compressed_writer = std::make_unique<CompressedWriteBuffer>(*writer);
KeeperStorageSnapshot::serialize(snapshot, *compressed_writer, keeper_context);
KeeperStorageSnapshot<Storage>::serialize(snapshot, *compressed_writer, keeper_context);
compressed_writer->finalize();
compressed_writer->sync();
@ -864,14 +886,16 @@ SnapshotFileInfoPtr KeeperSnapshotManager::serializeSnapshotToDisk(const KeeperS
return snapshot_file_info;
}
size_t KeeperSnapshotManager::getLatestSnapshotIndex() const
template<typename Storage>
size_t KeeperSnapshotManager<Storage>::getLatestSnapshotIndex() const
{
if (!existing_snapshots.empty())
return existing_snapshots.rbegin()->first;
return 0;
}
SnapshotFileInfoPtr KeeperSnapshotManager::getLatestSnapshotInfo() const
template<typename Storage>
SnapshotFileInfoPtr KeeperSnapshotManager<Storage>::getLatestSnapshotInfo() const
{
if (!existing_snapshots.empty())
{
@ -890,4 +914,10 @@ SnapshotFileInfoPtr KeeperSnapshotManager::getLatestSnapshotInfo() const
return nullptr;
}
template struct KeeperStorageSnapshot<KeeperMemoryStorage>;
template class KeeperSnapshotManager<KeeperMemoryStorage>;
#if USE_ROCKSDB
template struct KeeperStorageSnapshot<KeeperRocksStorage>;
template class KeeperSnapshotManager<KeeperRocksStorage>;
#endif
}
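
These explicit instantiations let the template's member definitions stay in this .cpp file: other translation units compile against the declarations only and link against the instantiations emitted here. A single-file sketch of the pattern:

#include <iostream>

template <typename Storage>
class ToyManager
{
public:
    void flush(); /// in the real code, defined in the .cpp only
};

struct MemStorage   { static constexpr const char * name = "memory"; };
struct RocksStorage { static constexpr const char * name = "rocksdb"; };

template <typename Storage>
void ToyManager<Storage>::flush() { std::cout << "flushing " << Storage::name << '\n'; }

/// Explicit instantiations: without these, code that only sees the class
/// declaration could not link against flush().
template class ToyManager<MemStorage>;
template class ToyManager<RocksStorage>;

int main()
{
    ToyManager<MemStorage>{}.flush();
    ToyManager<RocksStorage>{}.flush();
}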


@ -34,10 +34,11 @@ enum SnapshotVersion : uint8_t
static constexpr auto CURRENT_SNAPSHOT_VERSION = SnapshotVersion::V6;
/// What is stored in binary snapshot
template<typename Storage>
struct SnapshotDeserializationResult
{
/// Storage
KeeperStoragePtr storage;
std::unique_ptr<Storage> storage;
/// Snapshot metadata (up_to_log_idx and so on)
SnapshotMetadataPtr snapshot_meta;
/// Cluster config
@ -52,21 +53,31 @@ struct SnapshotDeserializationResult
///
/// This representation of a snapshot has to be serialized into a NuRaft
/// buffer and sent over the network or saved to a file.
template<typename Storage>
struct KeeperStorageSnapshot
{
#if USE_ROCKSDB
static constexpr bool use_rocksdb = std::is_same_v<Storage, KeeperRocksStorage>;
#else
static constexpr bool use_rocksdb = false;
#endif
public:
KeeperStorageSnapshot(KeeperStorage * storage_, uint64_t up_to_log_idx_, const ClusterConfigPtr & cluster_config_ = nullptr);
KeeperStorageSnapshot(Storage * storage_, uint64_t up_to_log_idx_, const ClusterConfigPtr & cluster_config_ = nullptr);
KeeperStorageSnapshot(
KeeperStorage * storage_, const SnapshotMetadataPtr & snapshot_meta_, const ClusterConfigPtr & cluster_config_ = nullptr);
Storage * storage_, const SnapshotMetadataPtr & snapshot_meta_, const ClusterConfigPtr & cluster_config_ = nullptr);
KeeperStorageSnapshot(const KeeperStorageSnapshot<Storage>&) = delete;
KeeperStorageSnapshot(KeeperStorageSnapshot<Storage>&&) = default;
~KeeperStorageSnapshot();
static void serialize(const KeeperStorageSnapshot & snapshot, WriteBuffer & out, KeeperContextPtr keeper_context);
static void serialize(const KeeperStorageSnapshot<Storage> & snapshot, WriteBuffer & out, KeeperContextPtr keeper_context);
static void deserialize(SnapshotDeserializationResult & deserialization_result, ReadBuffer & in, KeeperContextPtr keeper_context);
static void deserialize(SnapshotDeserializationResult<Storage> & deserialization_result, ReadBuffer & in, KeeperContextPtr keeper_context);
KeeperStorage * storage;
Storage * storage;
SnapshotVersion version = CURRENT_SNAPSHOT_VERSION;
/// Snapshot metadata
@ -77,11 +88,11 @@ public:
/// so we have for loop for (i = 0; i < snapshot_container_size; ++i) { doSmth(begin + i); }
size_t snapshot_container_size;
/// Iterator to the start of the storage
KeeperStorage::Container::const_iterator begin;
Storage::Container::const_iterator begin;
/// Active sessions and their timeouts
SessionAndTimeout session_and_timeout;
/// Sessions credentials
KeeperStorage::SessionAndAuth session_and_auth;
Storage::SessionAndAuth session_and_auth;
/// ACLs cache for better performance. Without it we cannot deserialize storage.
std::unordered_map<uint64_t, Coordination::ACLs> acl_map;
/// Cluster config from snapshot, can be empty
@ -105,14 +116,16 @@ struct SnapshotFileInfo
};
using SnapshotFileInfoPtr = std::shared_ptr<SnapshotFileInfo>;
using KeeperStorageSnapshotPtr = std::shared_ptr<KeeperStorageSnapshot>;
using CreateSnapshotCallback = std::function<std::shared_ptr<SnapshotFileInfo>(KeeperStorageSnapshotPtr &&, bool)>;
using SnapshotMetaAndStorage = std::pair<SnapshotMetadataPtr, KeeperStoragePtr>;
#if USE_ROCKSDB
using KeeperStorageSnapshotPtr = std::variant<std::shared_ptr<KeeperStorageSnapshot<KeeperMemoryStorage>>, std::shared_ptr<KeeperStorageSnapshot<KeeperRocksStorage>>>;
#else
using KeeperStorageSnapshotPtr = std::variant<std::shared_ptr<KeeperStorageSnapshot<KeeperMemoryStorage>>>;
#endif
using CreateSnapshotCallback = std::function<SnapshotFileInfoPtr(KeeperStorageSnapshotPtr &&, bool)>;
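
KeeperStorageSnapshotPtr is now a std::variant over the per-storage snapshot types: code templated on Storage unwraps the exact alternative it expects with std::get (as the create_snapshot implementation shown earlier does), while generic code can std::visit. A minimal sketch with hypothetical ToySnapshot types:

#include <cstdint>
#include <iostream>
#include <memory>
#include <variant>

struct Mem {};
struct Rocks {};

template <typename Storage>
struct ToySnapshot { uint64_t up_to_log_idx = 0; };

using ToySnapshotPtr = std::variant<
    std::shared_ptr<ToySnapshot<Mem>>,
    std::shared_ptr<ToySnapshot<Rocks>>>;

int main()
{
    ToySnapshotPtr snapshot = std::make_shared<ToySnapshot<Mem>>();

    /// Code that knows its Storage unwraps the expected alternative:
    auto mem_snapshot = std::get<std::shared_ptr<ToySnapshot<Mem>>>(snapshot);
    mem_snapshot->up_to_log_idx = 100;

    /// Generic code dispatches without knowing the backend:
    std::visit([](const auto & s) { std::cout << s->up_to_log_idx << '\n'; }, snapshot);
}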
/// Class responsible for snapshot serialization and deserialization. Each snapshot
/// has its path on disk and log index.
template<typename Storage>
class KeeperSnapshotManager
{
public:
@ -124,18 +137,18 @@ public:
size_t storage_tick_time_ = 500);
/// Restore storage from latest available snapshot
SnapshotDeserializationResult restoreFromLatestSnapshot();
SnapshotDeserializationResult<Storage> restoreFromLatestSnapshot();
/// Compress snapshot and serialize it to buffer
nuraft::ptr<nuraft::buffer> serializeSnapshotToBuffer(const KeeperStorageSnapshot & snapshot) const;
nuraft::ptr<nuraft::buffer> serializeSnapshotToBuffer(const KeeperStorageSnapshot<Storage> & snapshot) const;
/// Serialize already compressed snapshot to disk (return path)
SnapshotFileInfoPtr serializeSnapshotBufferToDisk(nuraft::buffer & buffer, uint64_t up_to_log_idx);
/// Serialize snapshot directly to disk
SnapshotFileInfoPtr serializeSnapshotToDisk(const KeeperStorageSnapshot & snapshot);
SnapshotFileInfoPtr serializeSnapshotToDisk(const KeeperStorageSnapshot<Storage> & snapshot);
SnapshotDeserializationResult deserializeSnapshotFromBuffer(nuraft::ptr<nuraft::buffer> buffer) const;
SnapshotDeserializationResult<Storage> deserializeSnapshotFromBuffer(nuraft::ptr<nuraft::buffer> buffer) const;
/// Deserialize snapshot with log index up_to_log_idx from disk into compressed nuraft buffer.
nuraft::ptr<nuraft::buffer> deserializeSnapshotBufferFromDisk(uint64_t up_to_log_idx) const;


@ -44,7 +44,7 @@ namespace ErrorCodes
extern const int LOGICAL_ERROR;
}
KeeperStateMachine::KeeperStateMachine(
IKeeperStateMachine::IKeeperStateMachine(
ResponsesQueue & responses_queue_,
SnapshotsQueue & snapshots_queue_,
const KeeperContextPtr & keeper_context_,
@ -52,12 +52,6 @@ KeeperStateMachine::KeeperStateMachine(
CommitCallback commit_callback_,
const std::string & superdigest_)
: commit_callback(commit_callback_)
, snapshot_manager(
keeper_context_->getCoordinationSettings()->snapshots_to_keep,
keeper_context_,
keeper_context_->getCoordinationSettings()->compress_snapshots_with_zstd_format,
superdigest_,
keeper_context_->getCoordinationSettings()->dead_session_check_period_ms.totalMilliseconds())
, responses_queue(responses_queue_)
, snapshots_queue(snapshots_queue_)
, min_request_size_to_cache(keeper_context_->getCoordinationSettings()->min_request_size_for_cache)
@ -68,6 +62,32 @@ KeeperStateMachine::KeeperStateMachine(
{
}
template<typename Storage>
KeeperStateMachine<Storage>::KeeperStateMachine(
ResponsesQueue & responses_queue_,
SnapshotsQueue & snapshots_queue_,
// const CoordinationSettingsPtr & coordination_settings_,
const KeeperContextPtr & keeper_context_,
KeeperSnapshotManagerS3 * snapshot_manager_s3_,
IKeeperStateMachine::CommitCallback commit_callback_,
const std::string & superdigest_)
: IKeeperStateMachine(
responses_queue_,
snapshots_queue_,
/// coordination_settings_,
keeper_context_,
snapshot_manager_s3_,
commit_callback_,
superdigest_),
snapshot_manager(
keeper_context_->getCoordinationSettings()->snapshots_to_keep,
keeper_context_,
keeper_context_->getCoordinationSettings()->compress_snapshots_with_zstd_format,
superdigest_,
keeper_context_->getCoordinationSettings()->dead_session_check_period_ms.totalMilliseconds())
{
}
namespace
{
@ -78,7 +98,8 @@ bool isLocalDisk(const IDisk & disk)
}
void KeeperStateMachine::init()
template<typename Storage>
void KeeperStateMachine<Storage>::init()
{
/// Do everything without mutexes, no other threads exist.
LOG_DEBUG(log, "Totally have {} snapshots", snapshot_manager.totalSnapshots());
@ -123,7 +144,7 @@ void KeeperStateMachine::init()
LOG_DEBUG(log, "No existing snapshots, last committed log index {}", last_committed_idx);
if (!storage)
storage = std::make_unique<KeeperStorage>(
storage = std::make_unique<Storage>(
keeper_context->getCoordinationSettings()->dead_session_check_period_ms.totalMilliseconds(), superdigest, keeper_context);
}
@ -131,13 +152,13 @@ namespace
{
void assertDigest(
const KeeperStorage::Digest & expected,
const KeeperStorage::Digest & actual,
const KeeperStorageBase::Digest & expected,
const KeeperStorageBase::Digest & actual,
const Coordination::ZooKeeperRequest & request,
uint64_t log_idx,
bool committing)
{
if (!KeeperStorage::checkDigest(expected, actual))
if (!KeeperStorageBase::checkDigest(expected, actual))
{
LOG_FATAL(
getLogger("KeeperStateMachine"),
@ -170,7 +191,8 @@ struct TSA_SCOPED_LOCKABLE LockGuardWithStats final
}
nuraft::ptr<nuraft::buffer> KeeperStateMachine::pre_commit(uint64_t log_idx, nuraft::buffer & data)
template<typename Storage>
nuraft::ptr<nuraft::buffer> KeeperStateMachine<Storage>::pre_commit(uint64_t log_idx, nuraft::buffer & data)
{
auto result = nuraft::buffer::alloc(sizeof(log_idx));
nuraft::buffer_serializer ss(result);
@ -191,10 +213,10 @@ nuraft::ptr<nuraft::buffer> KeeperStateMachine::pre_commit(uint64_t log_idx, nur
return result;
}
std::shared_ptr<KeeperStorage::RequestForSession> KeeperStateMachine::parseRequest(nuraft::buffer & data, bool final, ZooKeeperLogSerializationVersion * serialization_version)
std::shared_ptr<KeeperStorageBase::RequestForSession> IKeeperStateMachine::parseRequest(nuraft::buffer & data, bool final, ZooKeeperLogSerializationVersion * serialization_version)
{
ReadBufferFromNuraftBuffer buffer(data);
auto request_for_session = std::make_shared<KeeperStorage::RequestForSession>();
auto request_for_session = std::make_shared<KeeperStorageBase::RequestForSession>();
readIntBinary(request_for_session->session_id, buffer);
int32_t length;
@ -267,7 +289,7 @@ std::shared_ptr<KeeperStorage::RequestForSession> KeeperStateMachine::parseReque
request_for_session->digest.emplace();
readIntBinary(request_for_session->digest->version, buffer);
if (request_for_session->digest->version != KeeperStorage::DigestVersion::NO_DIGEST || !buffer.eof())
if (request_for_session->digest->version != KeeperStorageBase::DigestVersion::NO_DIGEST || !buffer.eof())
readIntBinary(request_for_session->digest->value, buffer);
}
@ -283,7 +305,8 @@ std::shared_ptr<KeeperStorage::RequestForSession> KeeperStateMachine::parseReque
return request_for_session;
}
bool KeeperStateMachine::preprocess(const KeeperStorage::RequestForSession & request_for_session)
template<typename Storage>
bool KeeperStateMachine<Storage>::preprocess(const KeeperStorageBase::RequestForSession & request_for_session)
{
const auto op_num = request_for_session.request->getOpNum();
if (op_num == Coordination::OpNum::SessionID || op_num == Coordination::OpNum::Reconfig)
@ -317,10 +340,11 @@ bool KeeperStateMachine::preprocess(const KeeperStorage::RequestForSession & req
return true;
}
void KeeperStateMachine::reconfigure(const KeeperStorage::RequestForSession& request_for_session)
template<typename Storage>
void KeeperStateMachine<Storage>::reconfigure(const KeeperStorageBase::RequestForSession& request_for_session)
{
LockGuardWithStats lock(storage_and_responses_lock);
KeeperStorage::ResponseForSession response = processReconfiguration(request_for_session);
KeeperStorageBase::ResponseForSession response = processReconfiguration(request_for_session);
if (!responses_queue.push(response))
{
ProfileEvents::increment(ProfileEvents::KeeperCommitsFailed);
@ -330,8 +354,9 @@ void KeeperStateMachine::reconfigure(const KeeperStorage::RequestForSession& req
}
}
KeeperStorage::ResponseForSession KeeperStateMachine::processReconfiguration(
const KeeperStorage::RequestForSession & request_for_session)
template<typename Storage>
KeeperStorageBase::ResponseForSession KeeperStateMachine<Storage>::processReconfiguration(
const KeeperStorageBase::RequestForSession & request_for_session)
{
ProfileEvents::increment(ProfileEvents::KeeperReconfigRequest);
@ -340,7 +365,7 @@ KeeperStorage::ResponseForSession KeeperStateMachine::processReconfiguration(
const int64_t zxid = request_for_session.zxid;
using enum Coordination::Error;
auto bad_request = [&](Coordination::Error code = ZBADARGUMENTS) -> KeeperStorage::ResponseForSession
auto bad_request = [&](Coordination::Error code = ZBADARGUMENTS) -> KeeperStorageBase::ResponseForSession
{
auto res = std::make_shared<Coordination::ZooKeeperReconfigResponse>();
res->xid = request.xid;
@ -397,7 +422,8 @@ KeeperStorage::ResponseForSession KeeperStateMachine::processReconfiguration(
return { session_id, std::move(response) };
}
nuraft::ptr<nuraft::buffer> KeeperStateMachine::commit(const uint64_t log_idx, nuraft::buffer & data)
template<typename Storage>
nuraft::ptr<nuraft::buffer> KeeperStateMachine<Storage>::commit(const uint64_t log_idx, nuraft::buffer & data)
{
auto request_for_session = parseRequest(data, true);
if (!request_for_session->zxid)
@ -408,7 +434,7 @@ nuraft::ptr<nuraft::buffer> KeeperStateMachine::commit(const uint64_t log_idx, n
if (!keeper_context->localLogsPreprocessed() && !preprocess(*request_for_session))
return nullptr;
auto try_push = [&](const KeeperStorage::ResponseForSession & response)
auto try_push = [&](const KeeperStorageBase::ResponseForSession & response)
{
if (!responses_queue.push(response))
{
@ -430,7 +456,7 @@ nuraft::ptr<nuraft::buffer> KeeperStateMachine::commit(const uint64_t log_idx, n
std::shared_ptr<Coordination::ZooKeeperSessionIDResponse> response = std::make_shared<Coordination::ZooKeeperSessionIDResponse>();
response->internal_id = session_id_request.internal_id;
response->server_id = session_id_request.server_id;
KeeperStorage::ResponseForSession response_for_session;
KeeperStorageBase::ResponseForSession response_for_session;
response_for_session.session_id = -1;
response_for_session.response = response;
response_for_session.request = request_for_session->request;
@ -451,7 +477,7 @@ nuraft::ptr<nuraft::buffer> KeeperStateMachine::commit(const uint64_t log_idx, n
}
LockGuardWithStats lock(storage_and_responses_lock);
KeeperStorage::ResponsesForSessions responses_for_sessions
KeeperStorageBase::ResponsesForSessions responses_for_sessions
= storage->processRequest(request_for_session->request, request_for_session->session_id, request_for_session->zxid);
for (auto & response_for_session : responses_for_sessions)
@ -482,7 +508,8 @@ nuraft::ptr<nuraft::buffer> KeeperStateMachine::commit(const uint64_t log_idx, n
return nullptr;
}
bool KeeperStateMachine::apply_snapshot(nuraft::snapshot & s)
template<typename Storage>
bool KeeperStateMachine<Storage>::apply_snapshot(nuraft::snapshot & s)
{
LOG_DEBUG(log, "Applying snapshot {}", s.get_last_log_idx());
nuraft::ptr<nuraft::buffer> latest_snapshot_ptr;
@ -509,7 +536,7 @@ bool KeeperStateMachine::apply_snapshot(nuraft::snapshot & s)
{ /// deserialize and apply snapshot to storage
LockGuardWithStats lock(storage_and_responses_lock);
SnapshotDeserializationResult snapshot_deserialization_result;
SnapshotDeserializationResult<Storage> snapshot_deserialization_result;
if (latest_snapshot_ptr)
snapshot_deserialization_result = snapshot_manager.deserializeSnapshotFromBuffer(latest_snapshot_ptr);
else
@ -530,7 +557,7 @@ bool KeeperStateMachine::apply_snapshot(nuraft::snapshot & s)
}
void KeeperStateMachine::commit_config(const uint64_t log_idx, nuraft::ptr<nuraft::cluster_config> & new_conf)
void IKeeperStateMachine::commit_config(const uint64_t log_idx, nuraft::ptr<nuraft::cluster_config> & new_conf)
{
std::lock_guard lock(cluster_config_lock);
auto tmp = new_conf->serialize();
@ -538,7 +565,7 @@ void KeeperStateMachine::commit_config(const uint64_t log_idx, nuraft::ptr<nuraf
keeper_context->setLastCommitIndex(log_idx);
}
void KeeperStateMachine::rollback(uint64_t log_idx, nuraft::buffer & data)
void IKeeperStateMachine::rollback(uint64_t log_idx, nuraft::buffer & data)
{
/// Don't rollback anything until the first commit because nothing was preprocessed
if (!keeper_context->localLogsPreprocessed())
@ -554,7 +581,8 @@ void KeeperStateMachine::rollback(uint64_t log_idx, nuraft::buffer & data)
rollbackRequest(*request_for_session, false);
}
void KeeperStateMachine::rollbackRequest(const KeeperStorage::RequestForSession & request_for_session, bool allow_missing)
template<typename Storage>
void KeeperStateMachine<Storage>::rollbackRequest(const KeeperStorageBase::RequestForSession & request_for_session, bool allow_missing)
{
if (request_for_session.request->getOpNum() == Coordination::OpNum::SessionID)
return;
@ -563,7 +591,8 @@ void KeeperStateMachine::rollbackRequest(const KeeperStorage::RequestForSession
storage->rollbackRequest(request_for_session.zxid, allow_missing);
}
void KeeperStateMachine::rollbackRequestNoLock(const KeeperStorage::RequestForSession & request_for_session, bool allow_missing)
template<typename Storage>
void KeeperStateMachine<Storage>::rollbackRequestNoLock(const KeeperStorageBase::RequestForSession & request_for_session, bool allow_missing)
{
if (request_for_session.request->getOpNum() == Coordination::OpNum::SessionID)
return;
@ -571,14 +600,15 @@ void KeeperStateMachine::rollbackRequestNoLock(const KeeperStorage::RequestForSe
storage->rollbackRequest(request_for_session.zxid, allow_missing);
}
nuraft::ptr<nuraft::snapshot> KeeperStateMachine::last_snapshot()
nuraft::ptr<nuraft::snapshot> IKeeperStateMachine::last_snapshot()
{
/// Just return the latest snapshot.
std::lock_guard lock(snapshots_lock);
return latest_snapshot_meta;
}
void KeeperStateMachine::create_snapshot(nuraft::snapshot & s, nuraft::async_result<bool>::handler_type & when_done)
template<typename Storage>
void KeeperStateMachine<Storage>::create_snapshot(nuraft::snapshot & s, nuraft::async_result<bool>::handler_type & when_done)
{
LOG_DEBUG(log, "Creating snapshot {}", s.get_last_log_idx());
@ -587,14 +617,15 @@ void KeeperStateMachine::create_snapshot(nuraft::snapshot & s, nuraft::async_res
CreateSnapshotTask snapshot_task;
{ /// lock storage for a short period of time to turn on "snapshot mode". After that we can read a consistent storage state without locking.
LockGuardWithStats lock(storage_and_responses_lock);
snapshot_task.snapshot = std::make_shared<KeeperStorageSnapshot>(storage.get(), snapshot_meta_copy, getClusterConfig());
snapshot_task.snapshot = std::make_shared<KeeperStorageSnapshot<Storage>>(storage.get(), snapshot_meta_copy, getClusterConfig());
}
/// create snapshot task for background execution (in snapshot thread)
snapshot_task.create_snapshot = [this, when_done](KeeperStorageSnapshotPtr && snapshot, bool execute_only_cleanup)
snapshot_task.create_snapshot = [this, when_done](KeeperStorageSnapshotPtr && snapshot_, bool execute_only_cleanup)
{
nuraft::ptr<std::exception> exception(nullptr);
bool ret = false;
auto && snapshot = std::get<std::shared_ptr<KeeperStorageSnapshot<Storage>>>(std::move(snapshot_));
if (!execute_only_cleanup)
{
try
@ -683,7 +714,8 @@ void KeeperStateMachine::create_snapshot(nuraft::snapshot & s, nuraft::async_res
LOG_WARNING(log, "Cannot push snapshot task into queue");
}
void KeeperStateMachine::save_logical_snp_obj(
template<typename Storage>
void KeeperStateMachine<Storage>::save_logical_snp_obj(
nuraft::snapshot & s, uint64_t & obj_id, nuraft::buffer & data, bool /*is_first_obj*/, bool /*is_last_obj*/)
{
LOG_DEBUG(log, "Saving snapshot {} obj_id {}", s.get_last_log_idx(), obj_id);
@ -748,7 +780,7 @@ static int bufferFromFile(LoggerPtr log, const std::string & path, nuraft::ptr<n
return 0;
}
int KeeperStateMachine::read_logical_snp_obj(
int IKeeperStateMachine::read_logical_snp_obj(
nuraft::snapshot & s, void *& /*user_snp_ctx*/, uint64_t obj_id, nuraft::ptr<nuraft::buffer> & data_out, bool & is_last_obj)
{
LOG_DEBUG(log, "Reading snapshot {} obj_id {}", s.get_last_log_idx(), obj_id);
@ -788,7 +820,8 @@ int KeeperStateMachine::read_logical_snp_obj(
return 1;
}
void KeeperStateMachine::processReadRequest(const KeeperStorage::RequestForSession & request_for_session)
template<typename Storage>
void KeeperStateMachine<Storage>::processReadRequest(const KeeperStorageBase::RequestForSession & request_for_session)
{
/// Pure local request, just process it with storage
LockGuardWithStats lock(storage_and_responses_lock);
@ -804,103 +837,120 @@ void KeeperStateMachine::processReadRequest(const KeeperStorage::RequestForSessi
}
}
void KeeperStateMachine::shutdownStorage()
template<typename Storage>
void KeeperStateMachine<Storage>::shutdownStorage()
{
LockGuardWithStats lock(storage_and_responses_lock);
storage->finalize();
}
std::vector<int64_t> KeeperStateMachine::getDeadSessions()
template<typename Storage>
std::vector<int64_t> KeeperStateMachine<Storage>::getDeadSessions()
{
LockGuardWithStats lock(storage_and_responses_lock);
return storage->getDeadSessions();
}
int64_t KeeperStateMachine::getNextZxid() const
template<typename Storage>
int64_t KeeperStateMachine<Storage>::getNextZxid() const
{
LockGuardWithStats lock(storage_and_responses_lock);
return storage->getNextZXID();
}
KeeperStorage::Digest KeeperStateMachine::getNodesDigest() const
template<typename Storage>
KeeperStorageBase::Digest KeeperStateMachine<Storage>::getNodesDigest() const
{
LockGuardWithStats lock(storage_and_responses_lock);
return storage->getNodesDigest(false);
}
uint64_t KeeperStateMachine::getLastProcessedZxid() const
template<typename Storage>
uint64_t KeeperStateMachine<Storage>::getLastProcessedZxid() const
{
LockGuardWithStats lock(storage_and_responses_lock);
return storage->getZXID();
}
uint64_t KeeperStateMachine::getNodesCount() const
template<typename Storage>
uint64_t KeeperStateMachine<Storage>::getNodesCount() const
{
LockGuardWithStats lock(storage_and_responses_lock);
return storage->getNodesCount();
}
uint64_t KeeperStateMachine::getTotalWatchesCount() const
template<typename Storage>
uint64_t KeeperStateMachine<Storage>::getTotalWatchesCount() const
{
LockGuardWithStats lock(storage_and_responses_lock);
return storage->getTotalWatchesCount();
}
uint64_t KeeperStateMachine::getWatchedPathsCount() const
template<typename Storage>
uint64_t KeeperStateMachine<Storage>::getWatchedPathsCount() const
{
LockGuardWithStats lock(storage_and_responses_lock);
return storage->getWatchedPathsCount();
}
uint64_t KeeperStateMachine::getSessionsWithWatchesCount() const
template<typename Storage>
uint64_t KeeperStateMachine<Storage>::getSessionsWithWatchesCount() const
{
LockGuardWithStats lock(storage_and_responses_lock);
return storage->getSessionsWithWatchesCount();
}
uint64_t KeeperStateMachine::getTotalEphemeralNodesCount() const
template<typename Storage>
uint64_t KeeperStateMachine<Storage>::getTotalEphemeralNodesCount() const
{
LockGuardWithStats lock(storage_and_responses_lock);
return storage->getTotalEphemeralNodesCount();
}
uint64_t KeeperStateMachine::getSessionWithEphemeralNodesCount() const
template<typename Storage>
uint64_t KeeperStateMachine<Storage>::getSessionWithEphemeralNodesCount() const
{
LockGuardWithStats lock(storage_and_responses_lock);
return storage->getSessionWithEphemeralNodesCount();
}
void KeeperStateMachine::dumpWatches(WriteBufferFromOwnString & buf) const
template<typename Storage>
void KeeperStateMachine<Storage>::dumpWatches(WriteBufferFromOwnString & buf) const
{
LockGuardWithStats lock(storage_and_responses_lock);
storage->dumpWatches(buf);
}
void KeeperStateMachine::dumpWatchesByPath(WriteBufferFromOwnString & buf) const
template<typename Storage>
void KeeperStateMachine<Storage>::dumpWatchesByPath(WriteBufferFromOwnString & buf) const
{
LockGuardWithStats lock(storage_and_responses_lock);
storage->dumpWatchesByPath(buf);
}
void KeeperStateMachine::dumpSessionsAndEphemerals(WriteBufferFromOwnString & buf) const
template<typename Storage>
void KeeperStateMachine<Storage>::dumpSessionsAndEphemerals(WriteBufferFromOwnString & buf) const
{
LockGuardWithStats lock(storage_and_responses_lock);
storage->dumpSessionsAndEphemerals(buf);
}
uint64_t KeeperStateMachine::getApproximateDataSize() const
template<typename Storage>
uint64_t KeeperStateMachine<Storage>::getApproximateDataSize() const
{
LockGuardWithStats lock(storage_and_responses_lock);
return storage->getApproximateDataSize();
}
uint64_t KeeperStateMachine::getKeyArenaSize() const
template<typename Storage>
uint64_t KeeperStateMachine<Storage>::getKeyArenaSize() const
{
LockGuardWithStats lock(storage_and_responses_lock);
return storage->getArenaDataSize();
}
uint64_t KeeperStateMachine::getLatestSnapshotSize() const
template<typename Storage>
uint64_t KeeperStateMachine<Storage>::getLatestSnapshotSize() const
{
auto snapshot_info = [&]
{
@ -923,7 +973,7 @@ uint64_t KeeperStateMachine::getLatestSnapshotSize() const
return size;
}
ClusterConfigPtr KeeperStateMachine::getClusterConfig() const
ClusterConfigPtr IKeeperStateMachine::getClusterConfig() const
{
std::lock_guard lock(cluster_config_lock);
if (cluster_config)
@ -935,11 +985,18 @@ ClusterConfigPtr KeeperStateMachine::getClusterConfig() const
return nullptr;
}
void KeeperStateMachine::recalculateStorageStats()
template<typename Storage>
void KeeperStateMachine<Storage>::recalculateStorageStats()
{
LockGuardWithStats lock(storage_and_responses_lock);
LOG_INFO(log, "Recalculating storage stats");
storage->recalculateStats();
LOG_INFO(log, "Done recalculating storage stats");
}
template class KeeperStateMachine<KeeperMemoryStorage>;
#if USE_ROCKSDB
template class KeeperStateMachine<KeeperRocksStorage>;
#endif
}


@ -11,26 +11,24 @@
namespace DB
{
using ResponsesQueue = ConcurrentBoundedQueue<KeeperStorage::ResponseForSession>;
using ResponsesQueue = ConcurrentBoundedQueue<KeeperStorageBase::ResponseForSession>;
using SnapshotsQueue = ConcurrentBoundedQueue<CreateSnapshotTask>;
/// ClickHouse Keeper state machine. Wrapper for KeeperStorage.
/// Responsible for entry commits, snapshot creation and so on.
class KeeperStateMachine : public nuraft::state_machine
class IKeeperStateMachine : public nuraft::state_machine
{
public:
using CommitCallback = std::function<void(uint64_t, const KeeperStorage::RequestForSession &)>;
using CommitCallback = std::function<void(uint64_t, const KeeperStorageBase::RequestForSession &)>;
KeeperStateMachine(
IKeeperStateMachine(
ResponsesQueue & responses_queue_,
SnapshotsQueue & snapshots_queue_,
const KeeperContextPtr & keeper_context_,
KeeperSnapshotManagerS3 * snapshot_manager_s3_,
CommitCallback commit_callback_ = {},
const std::string & superdigest_ = "");
CommitCallback commit_callback_,
const std::string & superdigest_);
/// Read state from the latest snapshot
void init();
virtual void init() = 0;
enum ZooKeeperLogSerializationVersion
{
@ -47,89 +45,66 @@ public:
///
/// final - whether it's the final time we will fetch the request so we can safely remove it from the cache
/// serialization_version - information about which fields were parsed from the buffer so we can modify the buffer accordingly
std::shared_ptr<KeeperStorage::RequestForSession> parseRequest(nuraft::buffer & data, bool final, ZooKeeperLogSerializationVersion * serialization_version = nullptr);
std::shared_ptr<KeeperStorageBase::RequestForSession> parseRequest(nuraft::buffer & data, bool final, ZooKeeperLogSerializationVersion * serialization_version = nullptr);
bool preprocess(const KeeperStorage::RequestForSession & request_for_session);
virtual bool preprocess(const KeeperStorageBase::RequestForSession & request_for_session) = 0;
nuraft::ptr<nuraft::buffer> pre_commit(uint64_t log_idx, nuraft::buffer & data) override;
nuraft::ptr<nuraft::buffer> commit(const uint64_t log_idx, nuraft::buffer & data) override; /// NOLINT
/// Save new cluster config to our snapshot (copy of the config stored in StateManager)
void commit_config(const uint64_t log_idx, nuraft::ptr<nuraft::cluster_config> & new_conf) override; /// NOLINT
void rollback(uint64_t log_idx, nuraft::buffer & data) override;
// allow_missing - whether the transaction we want to rollback can be missing from storage
// (can happen in case of exception during preprocessing)
void rollbackRequest(const KeeperStorage::RequestForSession & request_for_session, bool allow_missing);
void rollbackRequestNoLock(
const KeeperStorage::RequestForSession & request_for_session,
bool allow_missing) TSA_NO_THREAD_SAFETY_ANALYSIS;
virtual void rollbackRequest(const KeeperStorageBase::RequestForSession & request_for_session, bool allow_missing) = 0;
uint64_t last_commit_index() override { return keeper_context->lastCommittedIndex(); }
/// Apply preliminarily saved (save_logical_snp_obj) snapshot to our state.
bool apply_snapshot(nuraft::snapshot & s) override;
nuraft::ptr<nuraft::snapshot> last_snapshot() override;
/// Create new snapshot from current state.
void create_snapshot(nuraft::snapshot & s, nuraft::async_result<bool>::handler_type & when_done) override;
void create_snapshot(nuraft::snapshot & s, nuraft::async_result<bool>::handler_type & when_done) override = 0;
/// Save snapshot which was sent by the leader to us. After that we will apply it in apply_snapshot.
void save_logical_snp_obj(nuraft::snapshot & s, uint64_t & obj_id, nuraft::buffer & data, bool is_first_obj, bool is_last_obj) override;
void save_logical_snp_obj(nuraft::snapshot & s, uint64_t & obj_id, nuraft::buffer & data, bool is_first_obj, bool is_last_obj) override = 0;
/// A better name would be `serialize snapshot` -- save an existing snapshot (created by create_snapshot) into
/// the in-memory buffer data_out.
int read_logical_snp_obj(
nuraft::snapshot & s, void *& user_snp_ctx, uint64_t obj_id, nuraft::ptr<nuraft::buffer> & data_out, bool & is_last_obj) override;
// This should be used only for tests or keeper-data-dumper because it violates
// TSA -- we can't acquire the lock outside of this class or return a storage under lock
// in a reasonable way.
KeeperStorage & getStorageUnsafe() TSA_NO_THREAD_SAFETY_ANALYSIS
{
return *storage;
}
void shutdownStorage();
virtual void shutdownStorage() = 0;
ClusterConfigPtr getClusterConfig() const;
/// Process local read request
void processReadRequest(const KeeperStorage::RequestForSession & request_for_session);
virtual void processReadRequest(const KeeperStorageBase::RequestForSession & request_for_session) = 0;
std::vector<int64_t> getDeadSessions();
virtual std::vector<int64_t> getDeadSessions() = 0;
int64_t getNextZxid() const;
virtual int64_t getNextZxid() const = 0;
KeeperStorage::Digest getNodesDigest() const;
virtual KeeperStorageBase::Digest getNodesDigest() const = 0;
/// Introspection functions for 4lw commands
uint64_t getLastProcessedZxid() const;
virtual uint64_t getLastProcessedZxid() const = 0;
uint64_t getNodesCount() const;
uint64_t getTotalWatchesCount() const;
uint64_t getWatchedPathsCount() const;
uint64_t getSessionsWithWatchesCount() const;
virtual uint64_t getNodesCount() const = 0;
virtual uint64_t getTotalWatchesCount() const = 0;
virtual uint64_t getWatchedPathsCount() const = 0;
virtual uint64_t getSessionsWithWatchesCount() const = 0;
void dumpWatches(WriteBufferFromOwnString & buf) const;
void dumpWatchesByPath(WriteBufferFromOwnString & buf) const;
void dumpSessionsAndEphemerals(WriteBufferFromOwnString & buf) const;
virtual void dumpWatches(WriteBufferFromOwnString & buf) const = 0;
virtual void dumpWatchesByPath(WriteBufferFromOwnString & buf) const = 0;
virtual void dumpSessionsAndEphemerals(WriteBufferFromOwnString & buf) const = 0;
uint64_t getSessionWithEphemeralNodesCount() const;
uint64_t getTotalEphemeralNodesCount() const;
uint64_t getApproximateDataSize() const;
uint64_t getKeyArenaSize() const;
uint64_t getLatestSnapshotSize() const;
virtual uint64_t getSessionWithEphemeralNodesCount() const = 0;
virtual uint64_t getTotalEphemeralNodesCount() const = 0;
virtual uint64_t getApproximateDataSize() const = 0;
virtual uint64_t getKeyArenaSize() const = 0;
virtual uint64_t getLatestSnapshotSize() const = 0;
void recalculateStorageStats();
virtual void recalculateStorageStats() = 0;
void reconfigure(const KeeperStorage::RequestForSession& request_for_session);
virtual void reconfigure(const KeeperStorageBase::RequestForSession& request_for_session) = 0;
private:
protected:
CommitCallback commit_callback;
/// In our state machine we always have a single snapshot which is stored
/// in memory in compressed (serialized) format.
@@ -137,12 +112,9 @@ private:
std::shared_ptr<SnapshotFileInfo> latest_snapshot_info;
nuraft::ptr<nuraft::buffer> latest_snapshot_buf = nullptr;
/// Main state machine logic
KeeperStoragePtr storage TSA_PT_GUARDED_BY(storage_and_responses_lock);
CoordinationSettingsPtr coordination_settings;
/// Save/Load and Serialize/Deserialize logic for snapshots.
KeeperSnapshotManager snapshot_manager;
/// Put processed responses into this queue
ResponsesQueue & responses_queue;
@@ -159,7 +131,7 @@ private:
/// for request.
mutable std::mutex storage_and_responses_lock;
std::unordered_map<int64_t, std::unordered_map<Coordination::XID, std::shared_ptr<KeeperStorage::RequestForSession>>> parsed_request_cache;
std::unordered_map<int64_t, std::unordered_map<Coordination::XID, std::shared_ptr<KeeperStorageBase::RequestForSession>>> parsed_request_cache;
uint64_t min_request_size_to_cache{0};
/// we only need to protect the access to the map itself
/// requests can be modified from anywhere without lock because a single request
@@ -181,7 +153,104 @@ private:
KeeperSnapshotManagerS3 * snapshot_manager_s3;
KeeperStorage::ResponseForSession processReconfiguration(const KeeperStorage::RequestForSession & request_for_session)
TSA_REQUIRES(storage_and_responses_lock);
virtual KeeperStorageBase::ResponseForSession processReconfiguration(
const KeeperStorageBase::RequestForSession& request_for_session)
TSA_REQUIRES(storage_and_responses_lock) = 0;
};
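Taken together, the hunks above turn the former concrete state machine into an abstract base: every storage-specific operation becomes pure virtual, while storage-independent state (commit callback, snapshot buffer, queues, lock) stays in the base. A minimal sketch of that pattern, assuming hypothetical IStateMachine/StateMachine names and a much-simplified interface -- none of this is the real Keeper code:

#include <cstdint>
#include <memory>
#include <vector>

// Abstract base: everything that touches the concrete storage is pure virtual.
struct IStateMachine
{
    virtual ~IStateMachine() = default;
    virtual void init() = 0;                              // read state from the latest snapshot
    virtual std::vector<int64_t> getDeadSessions() = 0;   // storage-specific introspection
};

// A single template implements the interface for any storage backend
// that exposes the expected member functions.
template <typename Storage>
class StateMachine : public IStateMachine
{
public:
    void init() override { storage = std::make_unique<Storage>(); }
    std::vector<int64_t> getDeadSessions() override { return storage->getDeadSessions(); }

private:
    std::unique_ptr<Storage> storage;  // memory- or disk-backed, fixed at compile time
};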
/// ClickHouse Keeper state machine. Wrapper for the given Storage implementation.
/// Responsible for committing entries, creating snapshots, and so on.
template<typename Storage>
class KeeperStateMachine : public IKeeperStateMachine
{
public:
KeeperStateMachine(
ResponsesQueue & responses_queue_,
SnapshotsQueue & snapshots_queue_,
const KeeperContextPtr & keeper_context_,
KeeperSnapshotManagerS3 * snapshot_manager_s3_,
CommitCallback commit_callback_ = {},
const std::string & superdigest_ = "");
/// Read state from the latest snapshot
void init() override;
bool preprocess(const KeeperStorageBase::RequestForSession & request_for_session) override;
nuraft::ptr<nuraft::buffer> pre_commit(uint64_t log_idx, nuraft::buffer & data) override;
nuraft::ptr<nuraft::buffer> commit(const uint64_t log_idx, nuraft::buffer & data) override; /// NOLINT
// allow_missing - whether the transaction we want to rollback can be missing from storage
// (can happen in case of exception during preprocessing)
void rollbackRequest(const KeeperStorageBase::RequestForSession & request_for_session, bool allow_missing) override;
void rollbackRequestNoLock(
const KeeperStorageBase::RequestForSession & request_for_session,
bool allow_missing) TSA_NO_THREAD_SAFETY_ANALYSIS;
/// Apply preliminarily saved (save_logical_snp_obj) snapshot to our state.
bool apply_snapshot(nuraft::snapshot & s) override;
/// Create new snapshot from current state.
void create_snapshot(nuraft::snapshot & s, nuraft::async_result<bool>::handler_type & when_done) override;
/// Save snapshot which was sent to us by the leader. After that we will apply it in apply_snapshot.
void save_logical_snp_obj(nuraft::snapshot & s, uint64_t & obj_id, nuraft::buffer & data, bool is_first_obj, bool is_last_obj) override;
// This should be used only for tests or keeper-data-dumper because it violates
// TSA -- we can't acquire the lock outside of this class or return a storage under lock
// in a reasonable way.
Storage & getStorageUnsafe() TSA_NO_THREAD_SAFETY_ANALYSIS
{
return *storage;
}
void shutdownStorage() override;
/// Process local read request
void processReadRequest(const KeeperStorageBase::RequestForSession & request_for_session) override;
std::vector<int64_t> getDeadSessions() override;
int64_t getNextZxid() const override;
KeeperStorageBase::Digest getNodesDigest() const override;
/// Introspection functions for 4lw commands
uint64_t getLastProcessedZxid() const override;
uint64_t getNodesCount() const override;
uint64_t getTotalWatchesCount() const override;
uint64_t getWatchedPathsCount() const override;
uint64_t getSessionsWithWatchesCount() const override;
void dumpWatches(WriteBufferFromOwnString & buf) const override;
void dumpWatchesByPath(WriteBufferFromOwnString & buf) const override;
void dumpSessionsAndEphemerals(WriteBufferFromOwnString & buf) const override;
uint64_t getSessionWithEphemeralNodesCount() const override;
uint64_t getTotalEphemeralNodesCount() const override;
uint64_t getApproximateDataSize() const override;
uint64_t getKeyArenaSize() const override;
uint64_t getLatestSnapshotSize() const override;
void recalculateStorageStats() override;
void reconfigure(const KeeperStorageBase::RequestForSession& request_for_session) override;
private:
/// Main state machine logic
std::unique_ptr<Storage> storage; // TSA_PT_GUARDED_BY(storage_and_responses_lock);
/// Save/Load and Serialize/Deserialize logic for snapshots.
KeeperSnapshotManager<Storage> snapshot_manager;
KeeperStorageBase::ResponseForSession processReconfiguration(const KeeperStorageBase::RequestForSession & request_for_session)
TSA_REQUIRES(storage_and_responses_lock) override;
};
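The create_snapshot/when_done pair declared above follows NuRaft's asynchronous snapshot contract. A hedged sketch of how a minimal implementation typically satisfies it, modelled on NuRaft's example state machines rather than on the Keeper code itself (real serialization, locking, and error handling omitted; the SketchStateMachine name and latest_snapshot member are illustrative assumptions):

#include <libnuraft/nuraft.hxx>

class SketchStateMachine : public nuraft::state_machine
{
    // ... the remaining state_machine overrides are elided, so this class stays abstract ...
    nuraft::ptr<nuraft::snapshot> latest_snapshot;

public:
    void create_snapshot(nuraft::snapshot & s, nuraft::async_result<bool>::handler_type & when_done) override
    {
        // `s` is owned by the caller, so keep our own copy of the snapshot metadata.
        nuraft::ptr<nuraft::buffer> snp_buf = s.serialize();
        latest_snapshot = nuraft::snapshot::deserialize(*snp_buf);

        // ... serialize the current storage state alongside it here ...

        // Signal completion back to NuRaft through the async handler.
        bool ret = true;
        nuraft::ptr<std::exception> err = nullptr;
        when_done(ret, err);
    }
};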
}
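Under this split, call sites would hold the abstract base and pick the storage backend only when constructing the machine. A sketch of that dispatch, reusing the hypothetical IStateMachine/StateMachine types from the earlier sketch; MemoryStorage, DiskStorage, and the boolean switch are all illustrative assumptions, and the real constructor takes queues, context, and more (see above):

#include <memory>
#include <vector>

// Hypothetical backends exposing the interface the template expects.
struct MemoryStorage { std::vector<int64_t> getDeadSessions() { return {}; } };
struct DiskStorage   { std::vector<int64_t> getDeadSessions() { return {}; } };

// Pick the template instantiation behind the abstract interface at startup.
std::shared_ptr<IStateMachine> makeStateMachine(bool keeper_uses_disk)
{
    if (keeper_uses_disk)
        return std::make_shared<StateMachine<DiskStorage>>();
    return std::make_shared<StateMachine<MemoryStorage>>();
}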

File diff suppressed because it is too large

Some files were not shown because too many files have changed in this diff