add fault injection in ZooKeeper client

This commit is contained in:
Alexander Tokmakov 2021-10-21 16:59:13 +03:00
parent 6e296d0342
commit 27ca943863
7 changed files with 57 additions and 5 deletions

View File

@ -34,6 +34,7 @@ dpkg -i package_folder/clickhouse-test_*.deb
function configure()
{
export ZOOKEEPER_FAULT_INJECTION=1
# install test configs
/usr/share/clickhouse-test/config/install.sh

View File

@ -31,9 +31,13 @@ ZooKeeperArgs::ZooKeeperArgs(const Poco::Util::AbstractConfiguration & config, c
{
connection_timeout_ms = config.getInt(config_name + "." + key);
}
else if (key == "session_fault_probability")
else if (key == "send_fault_probability")
{
session_fault_probability = config.getDouble(config_name + "." + key);
send_fault_probability = config.getDouble(config_name + "." + key);
}
else if (key == "recv_fault_probability")
{
recv_fault_probability = config.getDouble(config_name + "." + key);
}
else if (key == "identity")
{

View File

@ -23,7 +23,7 @@ struct ZooKeeperArgs
bool operator == (const ZooKeeperArgs &) const = default;
bool operator != (const ZooKeeperArgs &) const = default;
String implementation;
String implementation = "zookeeper";
Strings hosts;
String auth_scheme;
String identity;
@ -31,7 +31,8 @@ struct ZooKeeperArgs
int32_t connection_timeout_ms = Coordination::DEFAULT_CONNECTION_TIMEOUT_MS;
int32_t session_timeout_ms = Coordination::DEFAULT_SESSION_TIMEOUT_MS;
int32_t operation_timeout_ms = Coordination::DEFAULT_OPERATION_TIMEOUT_MS;
float session_fault_probability = 0;
float send_fault_probability = 0;
float recv_fault_probability = 0;
};
}

View File

@ -336,6 +336,18 @@ ZooKeeper::ZooKeeper(
default_acls.emplace_back(std::move(acl));
}
/// It makes sense (especially for async requests) to inject a fault in two places:
/// pushRequest (before the request is sent) and receiveEvent (after the request was executed).
if (0 < args.send_fault_probability && args.send_fault_probability <= 1)
{
send_inject_fault.emplace(args.send_fault_probability);
}
if (0 < args.recv_fault_probability && args.recv_fault_probability <= 1)
{
recv_inject_fault.emplace(args.recv_fault_probability);
}
connect(nodes, args.connection_timeout_ms * 1000);
if (!args.auth_scheme.empty())
@ -683,6 +695,9 @@ void ZooKeeper::receiveEvent()
RequestInfo request_info;
ZooKeeperResponsePtr response;
if (unlikely(recv_inject_fault) && recv_inject_fault.value()(thread_local_rng))
throw Exception("Session expired (fault injected)", Error::ZSESSIONEXPIRED);
if (xid == PING_XID)
{
if (err != Error::ZOK)
@ -1019,6 +1034,9 @@ void ZooKeeper::pushRequest(RequestInfo && info)
}
}
if (unlikely(send_inject_fault) && send_inject_fault.value()(thread_local_rng))
throw Exception("Session expired (fault injected)", Error::ZSESSIONEXPIRED);
if (!requests_queue.tryPush(std::move(info), args.operation_timeout_ms))
{
if (requests_queue.isFinished())

View File

@ -26,6 +26,7 @@
#include <cstdint>
#include <optional>
#include <functional>
#include <random>
/** ZooKeeper C++ library, a replacement for libzookeeper.
@ -192,6 +193,8 @@ private:
zkutil::ZooKeeperArgs args;
std::optional<std::bernoulli_distribution> send_inject_fault;
std::optional<std::bernoulli_distribution> recv_inject_fault;
Poco::Net::StreamSocket socket;
/// To avoid excessive getpeername(2) calls.

View File

@ -0,0 +1,19 @@
<clickhouse>
<zookeeper>
<node index="1">
<host>localhost</host>
<port>9181</port>
</node>
<!-- Settings for fault injection.
Each probability is applied independently per request:
send_fault_probability - before the request is queued for sending (pushRequest),
recv_fault_probability - when a response is being processed (receiveEvent).
An injected fault is reported as "Session expired (fault injected)".

Approximate probability that a request succeeds:
(1 - send_fault_probability) * (1 - recv_fault_probability) = 0.99998 * 0.99998 ~= 0.99996
In practice it is lower, because when one request fails due to fault injection,
all requests that are in the queue at that moment fail as well.
In other words, the session expires about 4 times per 100000 requests
(99996 of which succeed), i.e. approximately once every 25000 requests on average.
-->
<send_fault_probability>0.00002</send_fault_probability>
<recv_fault_probability>0.00002</recv_fault_probability>
</zookeeper>
</clickhouse>

View File

@ -15,7 +15,6 @@ mkdir -p $DEST_SERVER_PATH/config.d/
mkdir -p $DEST_SERVER_PATH/users.d/
mkdir -p $DEST_CLIENT_PATH
ln -sf $SRC_PATH/config.d/zookeeper.xml $DEST_SERVER_PATH/config.d/
ln -sf $SRC_PATH/config.d/listen.xml $DEST_SERVER_PATH/config.d/
ln -sf $SRC_PATH/config.d/part_log.xml $DEST_SERVER_PATH/config.d/
ln -sf $SRC_PATH/config.d/text_log.xml $DEST_SERVER_PATH/config.d/
@ -72,6 +71,13 @@ ln -sf $SRC_PATH/dhparam.pem $DEST_SERVER_PATH/
ln -sf --backup=simple --suffix=_original.xml \
$SRC_PATH/config.d/query_masking_rules.xml $DEST_SERVER_PATH/config.d/
# Install exactly one ZooKeeper client config: the fault-injecting variant when
# ZOOKEEPER_FAULT_INJECTION is set to 1, otherwise the plain zookeeper.xml.
if [[ -n "$ZOOKEEPER_FAULT_INJECTION" ]] && [[ "$ZOOKEEPER_FAULT_INJECTION" -eq 1 ]]; then
ln -sf $SRC_PATH/config.d/zookeeper_fault_injection.xml $DEST_SERVER_PATH/config.d/
else
ln -sf $SRC_PATH/config.d/zookeeper.xml $DEST_SERVER_PATH/config.d/
fi
if [[ -n "$USE_POLYMORPHIC_PARTS" ]] && [[ "$USE_POLYMORPHIC_PARTS" -eq 1 ]]; then
ln -sf $SRC_PATH/config.d/polymorphic_parts.xml $DEST_SERVER_PATH/config.d/
fi