Merge pull request #21318 from azat/servers-pool-wait

Fix SIGSEGV by waiting for the servers thread pool
commit f8ce5547aa
alexey-milovidov 2021-04-14 14:15:07 +03:00, committed by GitHub
8 changed files with 165 additions and 20 deletions
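The crash being fixed is a shutdown-ordering use-after-free: Server::main() destroyed the global Context while handler threads in the server thread pool could still be running and dereferencing it. The following minimal sketch (plain standard C++ with hypothetical types, not ClickHouse code) illustrates the hazard class and why joining the pool before destruction is sufficient:

    #include <memory>
    #include <thread>
    #include <vector>

    struct Context { int data = 42; }; // stand-in for the shared server context

    int main()
    {
        auto context = std::make_shared<Context>();

        // Stand-in for the server thread pool: handlers capture the context.
        std::vector<std::thread> server_pool;
        for (int i = 0; i < 4; ++i)
            server_pool.emplace_back([raw = context.get()]
            {
                // A handler that uses the context. If main() destroys the
                // context before this thread finishes, this read is a
                // use-after-free (the SIGSEGV in the commit title).
                (void)raw->data;
            });

        // The fix: join every worker *before* tearing the context down.
        for (auto & t : server_pool)
            t.join();

        context.reset(); // safe: no handler can still be running
    }

In the real server the same ordering is enforced by calling server_pool.joinAll() before the explicit Context destruction, as the Server.cpp hunk below shows.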

programs/server/Server.cpp

@@ -173,18 +173,24 @@ int waitServersToFinish(std::vector<DB::ProtocolServerAdapter> & servers, size_t
     const int sleep_one_ms = 100;
     int sleep_current_ms = 0;
     int current_connections = 0;
-    while (sleep_current_ms < sleep_max_ms)
+    for (;;)
     {
         current_connections = 0;
+
         for (auto & server : servers)
         {
             server.stop();
             current_connections += server.currentConnections();
         }
+
         if (!current_connections)
             break;
+
         sleep_current_ms += sleep_one_ms;
-        std::this_thread::sleep_for(std::chrono::milliseconds(sleep_one_ms));
+        if (sleep_current_ms < sleep_max_ms)
+            std::this_thread::sleep_for(std::chrono::milliseconds(sleep_one_ms));
+        else
+            break;
     }
     return current_connections;
 }
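One behavioral change in this hunk is worth spelling out: when the wait budget is zero, the old `while (sleep_current_ms < sleep_max_ms)` loop never enters its body, so server.stop() is never called and 0 is returned without ever counting connections; the `for (;;)` form always performs at least one stop-and-count pass and only sleeps while budget remains. A distilled, runnable comparison (hypothetical `passes_*` helpers, not ClickHouse code):

    #include <cassert>

    int passes_while(int wait_ms)
    {
        int elapsed = 0, passes = 0;
        while (elapsed < wait_ms) // old shape: condition checked first
        {
            ++passes;
            elapsed += 100;
        }
        return passes;
    }

    int passes_for(int wait_ms)
    {
        int elapsed = 0, passes = 0;
        for (;;) // new shape: body always runs at least once
        {
            ++passes;
            elapsed += 100;
            if (elapsed >= wait_ms)
                break;
        }
        return passes;
    }

    int main()
    {
        assert(passes_while(0) == 0); // old: zero passes, stop() never called
        assert(passes_for(0) == 1);   // new: always at least one pass
    }

This is presumably why the regression test added below can run the server with --shutdown_wait_unfinished=0 and still rely on the servers being stopped and the remaining connections counted.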
@@ -951,6 +957,9 @@ int Server::main(const std::vector<std::string> & /*args*/)
             global_context->shutdownKeeperStorageDispatcher();
         }

+        /// Wait for the server pool to avoid a use-after-free of the destroyed context in the handlers
+        server_pool.joinAll();
+
         /** Explicitly destroy Context. It is more convenient than in destructor of Server, because logger is still available.
           * At this moment, no one could own shared part of Context.
           */

src/Interpreters/Context.cpp

@@ -373,7 +373,7 @@ struct ContextSharedPart
     std::atomic_size_t max_partition_size_to_drop = 50000000000lu; /// Protects MergeTree partitions from accidental DROP (50GB by default)
     String format_schema_path;                      /// Path to a directory that contains schema files used by input formats.
     ActionLocksManagerPtr action_locks_manager;     /// Set of storages' action lockers
-    std::optional<SystemLogs> system_logs;          /// Used to log queries and operations on parts
+    std::unique_ptr<SystemLogs> system_logs;        /// Used to log queries and operations on parts
     std::optional<StorageS3Settings> storage_s3_settings; /// Settings of S3 storage
     RemoteHostFilter remote_host_filter;            /// Allowed URL from config.xml
@@ -442,10 +442,14 @@ struct ContextSharedPart
         DatabaseCatalog::shutdown();
+
+        std::unique_ptr<SystemLogs> delete_system_logs;
+
         {
             auto lock = std::lock_guard(mutex);

             /// Preemptive destruction is important, because these objects may have a refcount to ContextShared (cyclic reference).
             /// TODO: Get rid of this.
-            system_logs.reset();
+            delete_system_logs = std::move(system_logs);
+
             embedded_dictionaries.reset();
             external_dictionaries_loader.reset();
             models_repository_guard.reset();
@@ -462,6 +466,10 @@ struct ContextSharedPart
             zookeeper.reset();
         }
+
+        /// Can be removed w/o context lock
+        delete_system_logs.reset();
     }

     bool hasTraceCollector() const
     {
         return trace_collector.has_value();
@@ -1910,7 +1918,7 @@ void Context::setCluster(const String & cluster_name, const std::shared_ptr<Clus
 void Context::initializeSystemLogs()
 {
     auto lock = getLock();
-    shared->system_logs.emplace(getGlobalContext(), getConfigRef());
+    shared->system_logs = std::make_unique<SystemLogs>(getGlobalContext(), getConfigRef());
 }

 void Context::initializeTraceCollector()
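The switch from std::optional<SystemLogs> to std::unique_ptr<SystemLogs> is what enables the pattern used in shutdown() above: detach the object from the shared state while holding the context mutex, then run its potentially heavyweight, lock-taking destructor only after the mutex is released. A minimal sketch of that pattern, with hypothetical types:

    #include <memory>
    #include <mutex>

    struct SystemLogsLike
    {
        ~SystemLogsLike() { /* may flush to tables and take other locks */ }
    };

    struct SharedState
    {
        std::mutex mutex;
        std::unique_ptr<SystemLogsLike> system_logs;

        void shutdown()
        {
            std::unique_ptr<SystemLogsLike> delete_system_logs;
            {
                std::lock_guard lock(mutex);
                delete_system_logs = std::move(system_logs); // detach under the lock
            }
            delete_system_logs.reset(); // destroy outside: cannot deadlock on `mutex`
        }
    };

    int main()
    {
        SharedState state;
        state.system_logs = std::make_unique<SystemLogsLike>();
        state.shutdown();
    }

A std::optional does not hand the object off this cleanly: moving the contained value out would require SystemLogs to be movable, whereas moving a unique_ptr just transfers ownership of the same object.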

src/Interpreters/SystemLog.h

@@ -152,6 +152,8 @@ public:
     void shutdown() override
     {
         stopFlushThread();
+        if (table)
+            table->shutdown();
     }

     String getName() override
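The two added lines enforce a teardown order: first stop the thread that produces flushes, then shut down the table that receives them, so no background activity on the log table outlives shutdown. A minimal sketch of that ordering, with hypothetical types:

    #include <memory>

    struct StorageLike
    {
        void shutdown() { /* stop this table's background jobs */ }
    };

    struct SystemLogLike
    {
        std::shared_ptr<StorageLike> table;

        void stopFlushThread() { /* join the flushing thread; no writes after this */ }

        void shutdown()
        {
            stopFlushThread();     // 1. stop producing writes
            if (table)
                table->shutdown(); // 2. then stop the table's background work
        }
    };

    int main()
    {
        SystemLogLike log;
        log.table = std::make_shared<StorageLike>();
        log.shutdown();
    }

Without the second step, the table's background threads could still reference shared state after the log itself has stopped flushing.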

tests/queries/0_stateless/01737_clickhouse_server_wait_server_pool_long.config.xml

@@ -0,0 +1,35 @@
<?xml version="1.0"?>
<yandex>
    <logger>
        <level>trace</level>
        <console>true</console>
    </logger>

    <tcp_port>9000</tcp_port>

    <path>./</path>

    <mark_cache_size>0</mark_cache_size>

    <users>
        <default>
            <password></password>

            <networks>
                <ip>::/0</ip>
            </networks>

            <profile>default</profile>
            <quota>default</quota>
            <access_management>1</access_management>
        </default>
    </users>

    <profiles>
        <default/>
    </profiles>

    <quotas>
        <default />
    </quotas>
</yandex>

tests/queries/0_stateless/01737_clickhouse_server_wait_server_pool_long.sh

@@ -0,0 +1,83 @@
#!/usr/bin/env bash

CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh

server_opts=(
    "--config-file=$CUR_DIR/$(basename "${BASH_SOURCE[0]}" .sh).config.xml"
    "--"
    # to avoid multiple listen sockets (complexity for port discovering)
    "--listen_host=127.1"
    # we will discover the real port later
    "--tcp_port=0"
    "--shutdown_wait_unfinished=0"
)
CLICKHOUSE_WATCHDOG_ENABLE=0 $CLICKHOUSE_SERVER_BINARY "${server_opts[@]}" >& clickhouse-server.log &
server_pid=$!

trap cleanup EXIT
function cleanup()
{
    kill -9 $server_pid
    kill -9 $client_pid

    echo "Test failed. Server log:"
    cat clickhouse-server.log
    rm -f clickhouse-server.log
    exit 1
}

server_port=
i=0 retries=300
# wait until the server starts to listen (max 30 seconds)
while [[ -z $server_port ]] && [[ $i -lt $retries ]]; do
    server_port=$(lsof -n -a -P -i tcp -s tcp:LISTEN -p $server_pid 2>/dev/null | awk -F'[ :]' '/LISTEN/ { print $(NF-1) }')
    ((++i))
    sleep 0.1
done
if [[ -z $server_port ]]; then
    echo "Cannot wait for LISTEN socket" >&2
    exit 1
fi

# wait until the server starts accepting TCP connections (max 30 seconds)
i=0 retries=300
while ! $CLICKHOUSE_CLIENT_BINARY --host 127.1 --port "$server_port" --format Null -q 'select 1' 2>/dev/null && [[ $i -lt $retries ]]; do
    ((++i))
    sleep 0.1
done
if ! $CLICKHOUSE_CLIENT_BINARY --host 127.1 --port "$server_port" --format Null -q 'select 1'; then
    echo "Cannot wait until the server starts accepting connections on <tcp_port>" >&2
    exit 1
fi

query_id="$CLICKHOUSE_DATABASE-$SECONDS"
$CLICKHOUSE_CLIENT_BINARY --query_id "$query_id" --host 127.1 --port "$server_port" --format Null -q 'select sleepEachRow(1) from numbers(10)' 2>/dev/null &
client_pid=$!

# wait until the query appears in the processlist (max 10 seconds)
# (this is enough to trigger the problem)
i=0 retries=1000
while [[ $($CLICKHOUSE_CLIENT_BINARY --host 127.1 --port "$server_port" -q "select count() from system.processes where query_id = '$query_id'") != "1" ]] && [[ $i -lt $retries ]]; do
    ((++i))
    sleep 0.01
done
if [[ $($CLICKHOUSE_CLIENT_BINARY --host 127.1 --port "$server_port" -q "select count() from system.processes where query_id = '$query_id'") != "1" ]]; then
    echo "Cannot wait until the query starts" >&2
    exit 1
fi

# send TERM and check that the exit code is 0 (EXIT_SUCCESS)
kill $server_pid
wait $server_pid
return_code=$?

wait $client_pid

trap '' EXIT
if [ $return_code != 0 ]; then
    cat clickhouse-server.log
fi
rm -f clickhouse-server.log

exit $return_code

tests/queries/shell_config.sh

@@ -23,14 +23,21 @@ export CLICKHOUSE_TEST_ZOOKEEPER_PREFIX="${CLICKHOUSE_TEST_NAME}_${CLICKHOUSE_DA
 [ -v CLICKHOUSE_LOG_COMMENT ] && CLICKHOUSE_BENCHMARK_OPT0+=" --log_comment='${CLICKHOUSE_LOG_COMMENT}' "
 export CLICKHOUSE_BINARY=${CLICKHOUSE_BINARY:="clickhouse"}

 # client
+[ -x "$CLICKHOUSE_BINARY-client" ] && CLICKHOUSE_CLIENT_BINARY=${CLICKHOUSE_CLIENT_BINARY:=$CLICKHOUSE_BINARY-client}
+[ -x "$CLICKHOUSE_BINARY" ] && CLICKHOUSE_CLIENT_BINARY=${CLICKHOUSE_CLIENT_BINARY:=$CLICKHOUSE_BINARY client}
 export CLICKHOUSE_CLIENT_BINARY=${CLICKHOUSE_CLIENT_BINARY:=$CLICKHOUSE_BINARY-client}
 export CLICKHOUSE_CLIENT_OPT="${CLICKHOUSE_CLIENT_OPT0:-} ${CLICKHOUSE_CLIENT_OPT:-}"
 export CLICKHOUSE_CLIENT=${CLICKHOUSE_CLIENT:="$CLICKHOUSE_CLIENT_BINARY ${CLICKHOUSE_CLIENT_OPT:-}"}

 # local
+[ -x "${CLICKHOUSE_BINARY}-local" ] && CLICKHOUSE_LOCAL=${CLICKHOUSE_LOCAL:="${CLICKHOUSE_BINARY}-local"}
+[ -x "${CLICKHOUSE_BINARY}" ] && CLICKHOUSE_LOCAL=${CLICKHOUSE_LOCAL:="${CLICKHOUSE_BINARY} local"}
 export CLICKHOUSE_LOCAL=${CLICKHOUSE_LOCAL:="${CLICKHOUSE_BINARY}-local"}

 # server
+[ -x "${CLICKHOUSE_BINARY}-server" ] && CLICKHOUSE_SERVER_BINARY=${CLICKHOUSE_SERVER_BINARY:="${CLICKHOUSE_BINARY}-server"}
+[ -x "${CLICKHOUSE_BINARY}" ] && CLICKHOUSE_SERVER_BINARY=${CLICKHOUSE_SERVER_BINARY:="${CLICKHOUSE_BINARY} server"}
+export CLICKHOUSE_SERVER_BINARY=${CLICKHOUSE_SERVER_BINARY:="${CLICKHOUSE_BINARY}-server"}

 # others
 export CLICKHOUSE_OBFUSCATOR=${CLICKHOUSE_OBFUSCATOR:="${CLICKHOUSE_BINARY}-obfuscator"}
 export CLICKHOUSE_COMPRESSOR=${CLICKHOUSE_COMPRESSOR:="${CLICKHOUSE_BINARY}-compressor"}
 export CLICKHOUSE_BENCHMARK=${CLICKHOUSE_BENCHMARK:="${CLICKHOUSE_BINARY}-benchmark ${CLICKHOUSE_BENCHMARK_OPT0:-}"}

tests/queries/skip_list.json

@@ -696,6 +696,7 @@
     "01682_cache_dictionary_complex_key",
     "01684_ssd_cache_dictionary_simple_key",
     "01685_ssd_cache_dictionary_complex_key",
+    "01737_clickhouse_server_wait_server_pool_long", // This test is fully compatible with running in parallel, however under ASAN the processes are pretty heavy and it may fail under the flaky address check.
     "01760_system_dictionaries",
     "01760_polygon_dictionaries",
     "01778_hierarchical_dictionaries",