This commit is contained in:
Nikita Fomichev 2024-08-27 23:15:41 +02:00 committed by GitHub
commit 86a5e18ef2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 238 additions and 104 deletions

View File

@ -110,6 +110,8 @@ function setup_logs_replication
# The function is launched in a separate shell instance to not expose the
# exported values from CLICKHOUSE_CI_LOGS_CREDENTIALS
set +x
PORT=${1:-"9000"}
# disable output
if ! [ -r "${CLICKHOUSE_CI_LOGS_CREDENTIALS}" ]; then
echo "File $CLICKHOUSE_CI_LOGS_CREDENTIALS does not exist, do not setup"
@ -125,16 +127,16 @@ function setup_logs_replication
__set_connection_args
echo 'Create all configured system logs'
clickhouse-client --query "SYSTEM FLUSH LOGS"
clickhouse-client --port "$PORT" --query "SYSTEM FLUSH LOGS"
debug_or_sanitizer_build=$(clickhouse-client -q "WITH ((SELECT value FROM system.build_options WHERE name='BUILD_TYPE') AS build, (SELECT value FROM system.build_options WHERE name='CXX_FLAGS') as flags) SELECT build='Debug' OR flags LIKE '%fsanitize%'")
debug_or_sanitizer_build=$(clickhouse-client --port "$PORT" -q "WITH ((SELECT value FROM system.build_options WHERE name='BUILD_TYPE') AS build, (SELECT value FROM system.build_options WHERE name='CXX_FLAGS') as flags) SELECT build='Debug' OR flags LIKE '%fsanitize%'")
echo "Build is debug or sanitizer: $debug_or_sanitizer_build"
# We will pre-create a table system.coverage_log.
# It is normally created by clickhouse-test rather than the server,
# so we will create it in advance to make it be picked up by the next commands:
clickhouse-client --query "
clickhouse-client --port "$PORT" --query "
CREATE TABLE IF NOT EXISTS system.coverage_log
(
time DateTime COMMENT 'The time of test run',
@ -145,7 +147,7 @@ function setup_logs_replication
# For each system log table:
echo 'Create %_log tables'
clickhouse-client --query "SHOW TABLES FROM system LIKE '%\\_log'" | while read -r table
clickhouse-client --port "$PORT" --query "SHOW TABLES FROM system LIKE '%\\_log'" | while read -r table
do
if [[ "$table" = "trace_log" ]]
then
@ -169,7 +171,7 @@ function setup_logs_replication
fi
# Calculate hash of its structure. Note: the first argument of sipHash64 below (currently 9) is the version of extra columns - increment it if extra columns are changed:
hash=$(clickhouse-client --query "
hash=$(clickhouse-client --port "$PORT" --query "
SELECT sipHash64(9, groupArray((name, type)))
FROM (SELECT name, type FROM system.columns
WHERE database = 'system' AND table = '$table'
@ -177,7 +179,7 @@ function setup_logs_replication
")
# Create the destination table with adapted name and structure:
statement=$(clickhouse-client --format TSVRaw --query "SHOW CREATE TABLE system.${table}" | sed -r -e '
statement=$(clickhouse-client --port "$PORT" --format TSVRaw --query "SHOW CREATE TABLE system.${table}" | sed -r -e '
s/^\($/('"$EXTRA_COLUMNS_FOR_TABLE"'/;
s/^ORDER BY (([^\(].+?)|\((.+?)\))$/ORDER BY ('"$EXTRA_ORDER_BY_COLUMNS"', \2\3)/;
s/^CREATE TABLE system\.\w+_log$/CREATE TABLE IF NOT EXISTS '"$table"'_'"$hash"'/;
@ -193,7 +195,7 @@ function setup_logs_replication
echo "Creating table system.${table}_sender" >&2
# Create Distributed table and materialized view to watch on the original table:
clickhouse-client --query "
clickhouse-client --port "$PORT" --query "
CREATE TABLE system.${table}_sender
ENGINE = Distributed(${CLICKHOUSE_CI_LOGS_CLUSTER}, default, ${table}_${hash})
SETTINGS flush_on_detach=0
@ -204,7 +206,7 @@ function setup_logs_replication
echo "Creating materialized view system.${table}_watcher" >&2
clickhouse-client --query "
clickhouse-client --port "$PORT" --query "
CREATE MATERIALIZED VIEW system.${table}_watcher TO system.${table}_sender AS
SELECT ${EXTRA_COLUMNS_EXPRESSION_FOR_TABLE}, *
FROM system.${table}
@ -215,9 +217,10 @@ function setup_logs_replication
function stop_logs_replication
{
echo "Detach all logs replication"
clickhouse-client --query "select database||'.'||table from system.tables where database = 'system' and (table like '%_sender' or table like '%_watcher')" | {
PORT=${1:-"9000"}
clickhouse-client --port "$PORT" --query "select database||'.'||table from system.tables where database = 'system' and (table like '%_sender' or table like '%_watcher')" | {
tee /dev/stderr
} | {
timeout --preserve-status --signal TERM --kill-after 5m 15m xargs -n1 -r -i clickhouse-client --query "drop table {}"
timeout --preserve-status --signal TERM --kill-after 5m 15m xargs -n1 -r -i clickhouse-client --port "$PORT" --query "drop table {}"
}
}

View File

@ -1574,8 +1574,11 @@ class TestCase:
print("Cannot insert coverage data: ", str(e))
# Check for dumped coverage files
file_pattern = "coverage.*"
coverage_prefix = os.getenv("CLICKHOUSE_WRITE_COVERAGE", "coverage")
file_pattern = coverage_prefix + ".*"
matching_files = glob.glob(file_pattern)
# TODO remove before merge
print(result.case_name, coverage_prefix, matching_files)
for file_path in matching_files:
try:
body = read_file_as_binary_string(file_path)
@ -1722,6 +1725,11 @@ class TestCase:
f"localhost {os.environ['CLICKHOUSE_PORT_TCP']}",
"localhost 9000",
)
replace_in_file(
self.stdout_file,
f"127.0.0.1:{os.environ['CLICKHOUSE_PORT_TCP']}",
"127.0.0.1:9000",
)
if os.environ.get("CLICKHOUSE_PORT_TCP_SECURE"):
replace_in_file(
@ -2138,6 +2146,11 @@ class TestSuite:
else:
self.parallel_tests.append(test_name)
if args.run_parallel_only:
self.sequential_tests = []
if args.run_no_parallel_only:
self.parallel_tests = []
def is_sequential_test(self, test_name):
if args.sequential:
if any(s in test_name for s in args.sequential):
@ -2246,13 +2259,9 @@ class GlobalTimeout(Exception):
pass
def run_tests_array(all_tests_with_params: Tuple[List[str], int, TestSuite, bool]):
(
all_tests,
num_tests,
test_suite,
is_concurrent,
) = all_tests_with_params
def run_tests_array(
all_tests: List[str], num_tests: int, test_suite: TestSuite, is_concurrent: bool
):
global stop_time
global exit_code
global server_died
@ -2598,94 +2607,81 @@ def run_tests_process(*args, **kwargs):
def do_run_tests(jobs, test_suite: TestSuite):
if jobs > 1 and len(test_suite.parallel_tests) > 0:
print(
"Found",
len(test_suite.parallel_tests),
"parallel tests and",
len(test_suite.sequential_tests),
"sequential tests",
print(
"Found",
len(test_suite.parallel_tests),
"parallel tests and",
len(test_suite.sequential_tests),
"sequential tests",
)
tests_n = max(len(test_suite.parallel_tests), 1)
jobs = min(jobs, tests_n)
# If we don't do random shuffling then there will be always
# nearly the same groups of test suites running concurrently.
# Thus, if there is a test within group which appears to be broken
# then it will affect all other tests in a non-random form.
# So each time a bad test fails - other tests from the group will also fail
# and this process will be more or less stable.
# It makes it more difficult to detect real flaky tests,
# because the distribution and the amount
# of failures will be nearly the same for all tests from the group.
random.shuffle(test_suite.parallel_tests)
batch_size = len(test_suite.parallel_tests) // jobs
manager = multiprocessing.Manager()
parallel_tests = manager.list()
parallel_tests.extend(test_suite.parallel_tests)
is_concurrent = jobs > 1
processes = []
for _ in range(jobs):
process = multiprocessing.Process(
target=run_tests_process,
args=(parallel_tests, batch_size, test_suite, is_concurrent),
)
tests_n = len(test_suite.parallel_tests)
jobs = min(jobs, tests_n)
processes.append(process)
process.start()
# If we don't do random shuffling then there will be always
# nearly the same groups of test suites running concurrently.
# Thus, if there is a test within group which appears to be broken
# then it will affect all other tests in a non-random form.
# So each time a bad test fails - other tests from the group will also fail
# and this process will be more or less stable.
# It makes it more difficult to detect real flaky tests,
# because the distribution and the amount
# of failures will be nearly the same for all tests from the group.
random.shuffle(test_suite.parallel_tests)
batch_size = len(test_suite.parallel_tests) // jobs
manager = multiprocessing.Manager()
parallel_tests = manager.list()
parallel_tests.extend(test_suite.parallel_tests)
processes = []
for _ in range(jobs):
process = multiprocessing.Process(
target=run_tests_process,
args=((parallel_tests, batch_size, test_suite, True),),
while processes:
sys.stdout.flush()
# Periodically check the server for hangs
# and stop all processes in this case
try:
clickhouse_execute(
args,
query="SELECT 1 /*hung check*/",
max_http_retries=20,
timeout=10,
)
processes.append(process)
process.start()
while processes:
sys.stdout.flush()
# Periodically check the server for hangs
# and stop all processes in this case
try:
clickhouse_execute(
args,
query="SELECT 1 /*hung check*/",
max_http_retries=20,
timeout=10,
)
except Exception:
print("Hung check failed")
server_died.set()
if server_died.is_set():
print("Server died, terminating all processes...")
kill_gdb_if_any()
# Wait for test results
sleep(args.timeout)
for p in processes:
if p.is_alive():
p.terminate()
break
for p in processes[:]:
if not p.is_alive():
processes.remove(p)
sleep(5)
run_tests_array(
(
test_suite.sequential_tests,
len(test_suite.sequential_tests),
test_suite,
False,
)
)
except Exception:
print("Hung check failed")
server_died.set()
return len(test_suite.sequential_tests) + len(test_suite.parallel_tests)
else:
num_tests = len(test_suite.all_tests)
run_tests_array(
(
test_suite.all_tests,
num_tests,
test_suite,
False,
)
)
return num_tests
if server_died.is_set():
print("Server died, terminating all processes...")
kill_gdb_if_any()
# Wait for test results
sleep(args.timeout)
for p in processes:
if p.is_alive():
p.terminate()
break
for p in processes[:]:
if not p.is_alive():
processes.remove(p)
run_tests_array(
test_suite.sequential_tests,
len(test_suite.sequential_tests),
test_suite,
is_concurrent,
)
return len(test_suite.sequential_tests) + len(test_suite.parallel_tests)
def is_test_from_dir(suite_dir, case):
@ -3527,6 +3523,19 @@ def parse_args():
help="Capture stacktraces from clickhouse-client/local on errors",
)
parser.add_argument(
"--run-parallel-only",
action="store_true",
default=False,
help="",
)
parser.add_argument(
"--run-no-parallel-only",
action="store_true",
default=False,
help="",
)
return parser.parse_args()

View File

@ -17,7 +17,11 @@ set -e -x -a
USE_DATABASE_REPLICATED=${USE_DATABASE_REPLICATED:=0}
USE_SHARED_CATALOG=${USE_SHARED_CATALOG:=0}
RUN_SEQUENTIAL_TESTS_IN_PARALLEL=1
if [[ "$USE_DATABASE_REPLICATED" -eq 1 ]] || [[ "$USE_SHARED_CATALOG" -eq 1 ]]; then
RUN_SEQUENTIAL_TESTS_IN_PARALLEL=0
fi
# Choose random timezone for this test run.
#
# NOTE: that clickhouse-test will randomize session_timezone by itself as well
@ -108,6 +112,60 @@ if [ "$NUM_TRIES" -gt "1" ]; then
mkdir -p /var/run/clickhouse-server
fi
# Run a CH instance to execute sequential tests on it in parallel with all other tests.
if [[ "$RUN_SEQUENTIAL_TESTS_IN_PARALLEL" -eq 1 ]]; then
mkdir -p /var/run/clickhouse-server3 /etc/clickhouse-server3 /var/lib/clickhouse3
cp -r -L /etc/clickhouse-server/* /etc/clickhouse-server3/
sudo chown clickhouse:clickhouse /var/run/clickhouse-server3 /var/lib/clickhouse3 /etc/clickhouse-server3/
sudo chown -R clickhouse:clickhouse /etc/clickhouse-server3/*
# Apply the sed expression passed as $1, in place, to every *.xml file under
# /etc/clickhouse-server3/ (the copy of the server config made just above for
# the second server instance). sed is invoked once per file.
function replace(){
sudo find /etc/clickhouse-server3/ -type f -name '*.xml' -exec sed -i "$1" {} \;
}
replace "s|<port>9000</port>|<port>19000</port>|g"
replace "s|<port>9440</port>|<port>19440</port>|g"
replace "s|<port>9988</port>|<port>19988</port>|g"
replace "s|<port>9234</port>|<port>19234</port>|g"
replace "s|<port>9181</port>|<port>19181</port>|g"
replace "s|<https_port>8443</https_port>|<https_port>18443</https_port>|g"
replace "s|<tcp_port>9000</tcp_port>|<tcp_port>19000</tcp_port>|g"
replace "s|<tcp_port>9181</tcp_port>|<tcp_port>19181</tcp_port>|g"
replace "s|<tcp_port_secure>9440</tcp_port_secure>|<tcp_port_secure>19440</tcp_port_secure>|g"
replace "s|<tcp_with_proxy_port>9010</tcp_with_proxy_port>|<tcp_with_proxy_port>19010</tcp_with_proxy_port>|g"
replace "s|<mysql_port>9004</mysql_port>|<mysql_port>19004</mysql_port>|g"
replace "s|<postgresql_port>9005</postgresql_port>|<postgresql_port>19005</postgresql_port>|g"
replace "s|<interserver_http_port>9009</interserver_http_port>|<interserver_http_port>19009</interserver_http_port>|g"
replace "s|8123|18123|g"
replace "s|/var/lib/clickhouse/|/var/lib/clickhouse3/|g"
replace "s|/etc/clickhouse-server/|/etc/clickhouse-server3/|g"
# distributed cache
replace "s|<tcp_port>10001</tcp_port>|<tcp_port>10004</tcp_port>|g"
replace "s|<tcp_port>10002</tcp_port>|<tcp_port>10005</tcp_port>|g"
replace "s|<tcp_port>10003</tcp_port>|<tcp_port>10006</tcp_port>|g"
# split the available memory between the two servers: ratio 0.4 for the main server, 0.55 for the second one
sudo find /etc/clickhouse-server/ -type f -name '*.xml' -exec sed -i "s|<max_server_memory_usage_to_ram_ratio>0.9</max_server_memory_usage_to_ram_ratio>|<max_server_memory_usage_to_ram_ratio>0.4</max_server_memory_usage_to_ram_ratio>|g" {} \;
replace "s|<max_server_memory_usage_to_ram_ratio>0.9</max_server_memory_usage_to_ram_ratio>|<max_server_memory_usage_to_ram_ratio>0.55</max_server_memory_usage_to_ram_ratio>|g"
replace "s|<path>/var/lib/clickhouse/access/</path>|<path>/var/lib/clickhouse3/access/</path>|g"
sudo -E -u clickhouse /usr/bin/clickhouse server --daemon --config /etc/clickhouse-server3/config.xml \
--pid-file /var/run/clickhouse-server3/clickhouse-server.pid \
-- --path /var/lib/clickhouse3/ --logger.stderr /var/log/clickhouse-server/stderr-no-parallel.log \
--logger.log /var/log/clickhouse-server/clickhouse-server-no-parallel.log \
--logger.errorlog /var/log/clickhouse-server/clickhouse-server-no-parallel.err.log \
--tcp_port 19000 --tcp_port_secure 19440 --http_port 18123 --https_port 18443 --interserver_http_port 19009 --tcp_with_proxy_port 19010 \
--prometheus.port 19988 --keeper_server.raft_configuration.server.port 19234 --keeper_server.tcp_port 19181 \
--mysql_port 19004 --postgresql_port 19005
for _ in {1..100}
do
clickhouse-client --port 19000 --query "SELECT 1" && break
sleep 1
done
fi
# simplest way to forward env variables to server
sudo -E -u clickhouse /usr/bin/clickhouse-server --config /etc/clickhouse-server/config.xml --daemon --pid-file /var/run/clickhouse-server/clickhouse-server.pid
@ -177,6 +235,10 @@ do
done
setup_logs_replication
if [[ "$RUN_SEQUENTIAL_TESTS_IN_PARALLEL" -eq 1 ]]; then
setup_logs_replication 19000
fi
attach_gdb_to_clickhouse
# create tables for minio log webhooks
@ -290,7 +352,7 @@ function run_tests()
else
# All other configurations are OK.
ADDITIONAL_OPTIONS+=('--jobs')
ADDITIONAL_OPTIONS+=('8')
ADDITIONAL_OPTIONS+=('5')
fi
if [[ -n "$RUN_BY_HASH_NUM" ]] && [[ -n "$RUN_BY_HASH_TOTAL" ]]; then
@ -343,7 +405,44 @@ if [ "$NUM_TRIES" -gt "1" ]; then
| sed 's/All tests have finished/Redacted: a message about tests finish is deleted/' | sed 's/No tests were run/Redacted: a message about no tests run is deleted/' ||:
fi
run_tests ||:
# Run the test suite against the second server instance.
# Points the test environment at the /etc/clickhouse-server3 config, the
# /var/lib/clickhouse3 data path and the ports that instance was started with
# (19000, 19440, 18123, ...), then calls run_tests.
# The caller starts this function with `&`, so these exports take effect in
# that background subshell only.
function run_no_parallel_test()
{
# Client options and config locations of the second instance
export CLICKHOUSE_CLIENT_OPT=" --port 19000 "
export CLICKHOUSE_CONFIG="/etc/clickhouse-server3/config.xml"
export CLICKHOUSE_CONFIG_DIR="/etc/clickhouse-server3"
export CLICKHOUSE_CONFIG_GREP="/etc/clickhouse-server3/preprocessed/config.xml"
# Data directories of the second instance
export CLICKHOUSE_USER_FILES="/var/lib/clickhouse3/user_files"
export CLICKHOUSE_SCHEMA_FILES="/var/lib/clickhouse3/format_schemas"
export CLICKHOUSE_PATH="/var/lib/clickhouse3"
# Ports matching the values passed on the second server's command line
export CLICKHOUSE_PORT_TCP="19000"
export CLICKHOUSE_PORT_TCP_SECURE="19440"
export CLICKHOUSE_PORT_TCP_WITH_PROXY="19010"
export CLICKHOUSE_PORT_HTTP="18123"
export CLICKHOUSE_PORT_HTTPS="18443"
export CLICKHOUSE_PORT_INTERSERVER="19009"
export CLICKHOUSE_PORT_KEEPER="19181"
# NOTE(review): the name spells "PROMTHEUS"; confirm the consumer uses the same
# spelling before renaming it.
export CLICKHOUSE_PORT_PROMTHEUS_PORT="19988"
export CLICKHOUSE_PORT_MYSQL="19004"
export CLICKHOUSE_PORT_POSTGRESQL="19005"
# Separate coverage file prefix for this run; clickhouse-test globs "<prefix>.*"
export CLICKHOUSE_WRITE_COVERAGE="coverage_no_parallel"
# NOTE(review): --run-parallel-only makes clickhouse-test clear its list of
# sequential tests, while this function is named "no parallel" - confirm that
# this is the intended flag for the second instance.
# bash does not pass array variables to child processes; presumably run_tests
# reads this array as a function in the same shell - TODO confirm.
export ADDITIONAL_OPTIONS=("--run-parallel-only")
run_tests
}
if [[ "$RUN_SEQUENTIAL_TESTS_IN_PARALLEL" -eq 1 ]]; then
run_no_parallel_test &
PID1=$!
export ADDITIONAL_OPTIONS=("--run-no-parallel-only")
run_tests &
PID2=$!
wait $PID1 $PID2 ||:
else
run_tests ||:
fi
echo "Files in current directory"
ls -la ./
@ -356,6 +455,9 @@ clickhouse-client -q "system flush logs" ||:
# stop logs replication to make it possible to dump logs tables via clickhouse-local
stop_logs_replication
if [[ "$RUN_SEQUENTIAL_TESTS_IN_PARALLEL" -eq 1 ]]; then
stop_logs_replication 19000
fi
# Try to get logs while server is running
failed_to_save_logs=0
@ -364,6 +466,12 @@ do
if ! clickhouse-client -q "select * from system.$table into outfile '/test_output/$table.tsv.zst' format TSVWithNamesAndTypes"; then
failed_to_save_logs=1
fi
if [[ "$RUN_SEQUENTIAL_TESTS_IN_PARALLEL" -eq 1 ]]; then
if ! clickhouse-client --port 19000 -q "select * from system.$table into outfile '/test_output/$table.3.tsv.zst' format TSVWithNamesAndTypes"; then
failed_to_save_logs=1
fi
fi
if [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then
if ! clickhouse-client --port 19000 -q "select * from system.$table into outfile '/test_output/$table.1.tsv.zst' format TSVWithNamesAndTypes"; then
failed_to_save_logs=1
@ -393,6 +501,9 @@ clickhouse-client --max_block_size 8192 --max_memory_usage 10G --max_threads 1 -
# Because it's the simplest way to read it when server has crashed.
sudo clickhouse stop ||:
if [[ "$RUN_SEQUENTIAL_TESTS_IN_PARALLEL" -eq 1 ]]; then
sudo clickhouse stop --pid-path /var/run/clickhouse-server3 ||:
fi
if [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then
sudo clickhouse stop --pid-path /var/run/clickhouse-server1 ||:
@ -430,7 +541,12 @@ if [ $failed_to_save_logs -ne 0 ]; then
do
clickhouse-local "$data_path_config" --only-system-tables --stacktrace -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.tsv.zst ||:
if [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then
if [[ "$RUN_SEQUENTIAL_TESTS_IN_PARALLEL" -eq 1 ]]; then
clickhouse-local --path /var/lib/clickhouse3/ --only-system-tables --stacktrace -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.3.tsv.zst ||:
fi
if
[[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then
clickhouse-local --path /var/lib/clickhouse1/ --only-system-tables --stacktrace -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.1.tsv.zst ||:
clickhouse-local --path /var/lib/clickhouse2/ --only-system-tables --stacktrace -q "select * from system.$table format TSVWithNamesAndTypes" | zstd --threads=0 > /test_output/$table.2.tsv.zst ||:
fi
@ -473,6 +589,12 @@ rm -rf /var/lib/clickhouse/data/system/*/
tar -chf /test_output/store.tar /var/lib/clickhouse/store ||:
tar -chf /test_output/metadata.tar /var/lib/clickhouse/metadata/*.sql ||:
if [[ "$RUN_SEQUENTIAL_TESTS_IN_PARALLEL" -eq 1 ]]; then
rg -Fa "<Fatal>" /var/log/clickhouse-server/clickhouse-server-no-parallel.log ||:
zstd --threads=0 < /var/log/clickhouse-server/clickhouse-server-no-parallel.log > /test_output/clickhouse-server-no-parallel.log.zst ||:
mv /var/log/clickhouse-server/stderr-no-parallel.log /test_output/ ||:
tar -chf /test_output/coordination-no-parallel.tar /var/lib/clickhouse3/coordination ||:
fi
if [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then
rg -Fa "<Fatal>" /var/log/clickhouse-server/clickhouse-server1.log ||:

View File

@ -3,7 +3,7 @@
# If ClickHouse was built with coverage - dump the coverage information at exit
# (in other cases this environment variable has no effect)
export CLICKHOUSE_WRITE_COVERAGE="coverage"
export CLICKHOUSE_WRITE_COVERAGE=${CLICKHOUSE_WRITE_COVERAGE:="coverage"}
export CLICKHOUSE_DATABASE=${CLICKHOUSE_DATABASE:="test"}
export CLICKHOUSE_CLIENT_SERVER_LOGS_LEVEL=${CLICKHOUSE_CLIENT_SERVER_LOGS_LEVEL:="warning"}