ClickHouse/tests/perf_drafts/vert_merge/test_merges
2021-09-25 07:08:34 +03:00

184 lines
6.0 KiB
Bash
Executable File

#!/usr/bin/env bash
set -e

# This script keeps its own comma-separated column list in COLUMNS. With the
# `checkwinsize` shell option on (the default since bash 5.0) bash updates
# LINES/COLUMNS with the terminal size after external commands, which would
# silently replace that list. Turn the option off so the value is preserved.
shopt -u checkwinsize

# Raw ontime dataset (CSV; its first line is skipped when the id column is added).
SOURCE_RAW=/opt/ontime/ontime.csv
#SOURCE=/opt/ontime/ontime9M_id.csv
#SOURCE=/opt/ontime/ontime9M_id.Native
# Dataset actually read by the benchmark; generated from SOURCE_RAW on first run.
SOURCE=/opt/ontime/ontime_id.csv
if [[ ! -f $SOURCE ]]; then
    echo "Inserting id field from $SOURCE_RAW to $SOURCE ..."
    tail -n +2 "$SOURCE_RAW" | ./add_id_to_csv > "$SOURCE"
fi

# Table structure: an extra leading id column followed by the contents of ontime.struct.
STRUCT="id UInt32, $(cat ontime.struct)"
# Column names only: every odd space-separated token of STRUCT, comma-joined,
# with the trailing comma removed by `head -c -1`.
COLUMNS=$(echo "$STRUCT" | tr " " "\n" | awk 'NR % 2 == 1' | tr "\n" "," | head -c -1)
# Number of leading columns of STRUCT/COLUMNS used by the test tables.
ACTIVE_COLUMNS=11
# Extra clickhouse-client options (expanded unquoted, so it may hold several flags).
ST_OPTIONS=""
#ST_OPTIONS="--max_threads=1 --background_pool_size=1" # increase std. dev. of measurements
db="test"
table_name="ontime"
table="test.ontime"
# Run a clickhouse-local query over the $SOURCE CSV file (structure $STRUCT)
# and write the result to stdout in Native format.
# Arguments: "$@" - the query text, followed by any extra clickhouse-local arguments.
function read_src_data {
    local -a reader_args=(
        -s
        --file "$SOURCE"
        --input-format CSV
        --structure "$STRUCT"
        -of Native
    )
    clickhouse-local "${reader_args[@]}" --query "$@"
    #clickhouse-local -s --file "$SOURCE" --input-format Native --structure "$STRUCT" -of Native --query "$@"
}
# Write two server config overrides under /etc/clickhouse-server/conf.d (via sudo tee):
#   - enable_vertical_merge_algorithm set to $1
#   - vertical_merge_algorithm_min_rows_to_activate set to 0
# Arguments: $1 - value placed into <enable_vertical_merge_algorithm>
function set_vertical_alg {
    local conf_dir=/etc/clickhouse-server/conf.d
    local enable_xml="<clickhouse><merge_tree><enable_vertical_merge_algorithm>$1</enable_vertical_merge_algorithm></merge_tree></clickhouse>"
    local min_rows_xml="<clickhouse><merge_tree><vertical_merge_algorithm_min_rows_to_activate>0</vertical_merge_algorithm_min_rows_to_activate></merge_tree></clickhouse>"

    printf '%s\n' "$enable_xml" \
        | sudo tee "$conf_dir/enable_vertical_merge_algorithm.xml" >/dev/null
    printf '%s\n' "$min_rows_xml" \
        | sudo tee "$conf_dir/vertical_merge_algorithm_min_rows_to_activate.xml" >/dev/null
}
# Stop clickhouse-server, rewrite the vertical-merge config overrides, start the
# server again and block until ./wait_clickhouse_server returns.
# Arguments: $1 - value forwarded to set_vertical_alg
function set_and_restart {
    local vertical_enabled=$1

    sudo service clickhouse-server stop >/dev/null
    set_vertical_alg "$vertical_enabled"
    sudo service clickhouse-server start >/dev/null
    ./wait_clickhouse_server
}
# Print the first $2 comma-separated fields of the list in $1.
# Runs of whitespace (including newlines) in $1 are collapsed to single spaces
# and leading/trailing whitespace is dropped before cutting.
# Arguments: $1 - comma-separated list (e.g. "name Type, name Type, ...")
#            $2 - number of leading fields to keep
# Outputs:   the truncated list on stdout
# NOTE: fields are split on every comma, so an entry that itself contains a
# comma (e.g. a type with several arguments) would be cut in the middle.
function get_n_columns {
    local list=$1 count=$2
    local IFS=$' \t\n'
    local -a words=()

    # Split on whitespace with `read -a` instead of an unquoted expansion:
    # the words are not subject to pathname expansion, and the text is never
    # parsed as an `echo` option. read returns non-zero at EOF, hence `|| true`.
    read -r -d '' -a words <<<"$list" || true

    printf '%s\n' "${words[*]}" | cut -d ',' -f "-${count}"
}
# Print a TSKV summary of the active parts of table $table_name in database $db:
# number of parts plus average (rounded to 2 digits), minimum and maximum marks.
function parts_stat {
    local stat_query="SELECT count() as parts, round(avg(marks), 2) AS marks_avg, min(marks) AS marks_min, max(marks) AS marks_max FROM system.parts WHERE active AND table='$table_name' AND database='$db' FORMAT TSKV"
    clickhouse-client --query "$stat_query"
}
# Print the number of active parts of table $table_name in database $db.
function parts_count {
    local count_query="SELECT count() FROM system.parts WHERE active AND table='$table_name' AND database='$db'"
    clickhouse-client --query "$count_query"
}
# Print the number of rows in system.merges for table $table_name in database $db,
# i.e. how many merges of that table the server currently reports.
function merges_count {
    local count_query="SELECT count() FROM system.merges WHERE table='$table_name' AND database='$db'"
    clickhouse-client --query "$count_query"
}
# Block until merges_count reports 0, polling once per second.
# merges_count prints the result of a count() query, which is "0" (not an empty
# string) when nothing is merging, so the loop must compare against "0".
# Returns: 0 once no merges are reported; non-zero if merges_count fails or
#          prints nothing (instead of polling forever).
function wait_merges {
    local active_merges
    while :; do
        active_merges=$(merges_count) || return
        if [[ -z $active_merges ]]; then
            echo "wait_merges: merges_count returned no output" >&2
            return 1
        fi
        if [[ $active_merges == "0" ]]; then
            break
        fi
        sleep 1
    done
}
# Drop the kernel page cache, dentries and inodes (writes 3 to
# /proc/sys/vm/drop_caches as root) so that following measurements start cold.
function drop_cache {
    # drop_caches only frees clean objects; flush dirty pages first so the
    # freshly written data is actually evicted as well.
    sync
    sudo sh -c 'echo 3 >/proc/sys/vm/drop_caches'
}
# Print space-separated fields 12 and onward of the last server-log line
# that contains "(Merger): Merging".
function get_last_merge_info {
    local server_log=/var/log/clickhouse-server/clickhouse-server.log
    grep "(Merger): Merging" "$server_log" \
        | tail -n 1 \
        | cut -d " " -f 12-
}
# Print space-separated field 21 of the last server-log line that contains
# "(Merger): Merge sorted" (presumably the elapsed seconds of that merge —
# depends on the server's log line layout).
function get_last_merge_time {
    local server_log=/var/log/clickhouse-server/clickhouse-server.log
    grep "(Merger): Merge sorted" "$server_log" \
        | tail -n 1 \
        | cut -d " " -f 21
}
# Sum field 21 of every server-log line containing "(Merger): Merge sorted"
# and print the total rounded to 3 digits. The summation is done by
# clickhouse-local, reading the values as a single Float64 column `d`.
function total_merge_time_from_log {
    local server_log=/var/log/clickhouse-server/clickhouse-server.log
    local sum_query="SELECT round(sum(d), 3) FROM table"
    grep "(Merger): Merge sorted" "$server_log" \
        | cut -d " " -f 21 \
        | clickhouse-local -s -S "d Float64" --query "$sum_query"
}
# Print the VmPeak value of the clickhouse-server process divided by 1024 (MiB).
# The PID is read from /var/run/clickhouse-server/clickhouse-server.pid and the
# value from the matching /proc/<pid>/status file.
function get_max_clickhouse_server_memory {
    local pid_file=/var/run/clickhouse-server/clickhouse-server.pid
    grep VmPeak "/proc/$(cat "$pid_file")/status" \
        | awk '{ print $2/1024 }' #MiB
}
# Run OPTIMIZE TABLE on $table several times and report the accumulated time.
# Arguments: $1 - number of rounds (optional, default 29)
# Globals:   table, ST_OPTIONS (read); NUM_ROUNDS, t_optimize_total (written)
# Outputs:   part statistics before/after and the summed OPTIMIZE time on stdout
# Returns:   non-zero as soon as one OPTIMIZE query fails
function optimize_rounds {
    NUM_ROUNDS=${1:-29}
    local round elapsed
    drop_cache
    echo "OPTIMIZE before: $(parts_stat)"
    t_optimize_total=0
    for ((round = 1; round <= NUM_ROUNDS; round++)); do
        # With --time the client reports the elapsed seconds on stderr, hence 2>&1.
        # ST_OPTIONS is expanded unquoted on purpose: it holds zero or more flags.
        # Declaration and assignment are separate so that a failed query is
        # detected instead of its error text being handed to bc.
        elapsed=$(clickhouse-client --time ${ST_OPTIONS} --query "OPTIMIZE TABLE $table" 2>&1) || {
            echo "OPTIMIZE TABLE $table failed: $elapsed" >&2
            return 1
        }
        #echo "$elapsed $(get_last_merge_time) $(get_last_merge_info)"
        t_optimize_total=$(echo "$t_optimize_total + $elapsed" | bc -l)
    done
    echo "OPTIMIZE after : $(parts_stat)"
    echo "OPTIMIZE time : $t_optimize_total"
}
# Run one benchmark case end to end.
# Arguments: $1 - case number; the function case_<$1> creates and fills $table
# Globals:   table, t_insert (read); case_func, t_merges (written);
#            insert_times, merges_times (appended to)
# Outputs:   part statistics after the insert on stdout; everything the case
#            function prints goes to stderr
function run_case {
    case_func="case_$1"

    # Restart the server with an empty log so that merge times read from the
    # log afterwards belong to this case only.
    sudo service clickhouse-server stop >/dev/null
    sudo rm -f /var/log/clickhouse-server/clickhouse-server.log
    sudo service clickhouse-server start >/dev/null
    ./wait_clickhouse_server

    clickhouse-client --query "DROP TABLE IF EXISTS $table"
    drop_cache

    "$case_func" >&2
    wait_merges
    t_merges=$(total_merge_time_from_log)
    echo "After INSERT: $(parts_stat)"

    #echo "Merges time : $t_merges"
    #optimize_rounds
    #optimize_times="$optimize_times $t_optimize_total"
    insert_times+=" $t_insert"
    merges_times+=" $t_merges"

    clickhouse-client --query "DROP TABLE IF EXISTS $table"
}
# Run cases 1, 2 and 3 on the first $ACTIVE_COLUMNS columns and print the
# collected INSERT and merge times.
# Globals: STRUCT, COLUMNS, ACTIVE_COLUMNS (read);
#          cur_struct, cur_columns, t_insert, insert_times, merges_times,
#          optimize_times (reset here; the *_times lists are filled by run_case)
function run_cases {
    local case_id

    cur_struct=$(get_n_columns "$STRUCT" "$ACTIVE_COLUMNS")
    cur_columns=$(get_n_columns "$COLUMNS" "$ACTIVE_COLUMNS")

    t_insert=0
    insert_times=""
    merges_times=""
    optimize_times=""

    for case_id in 1 2 3; do
        run_case "$case_id"
    done

    echo "INSERT times : $insert_times"
    echo "Merges times : $merges_times"
    #echo "OPTIMIZE times: $optimize_times"
}
# Case #1: table keyed by (FlightDate) only.
# Creates $table with structure $cur_struct, inserts columns $cur_columns from
# the source data and stores the client's --time output in the global t_insert.
function case_1 {
    local create_query="CREATE TABLE $table ($cur_struct) ENGINE = MergeTree(FlightDate, (FlightDate), 8192)"
    local select_query="SELECT $cur_columns FROM table"
    local insert_query="INSERT INTO $table FORMAT Native"

    echo "Case #1. Trivial. All parts not intersected by PK."
    clickhouse-client --query "$create_query"
    # 2>&1 folds the client's stderr (where --time reports) into the captured value;
    # ST_OPTIONS stays unquoted because it may hold several flags.
    t_insert=$(read_src_data "$select_query" \
        | clickhouse-client --time ${ST_OPTIONS} --query "$insert_query" 2>&1)
}
# Case #2: table keyed by (intHash32(id), FlightDate).
# Creates $table with structure $cur_struct, inserts columns $cur_columns from
# the source data and stores the client's --time output in the global t_insert.
function case_2 {
    local create_query="CREATE TABLE $table ($cur_struct) ENGINE = MergeTree(FlightDate, (intHash32(id), FlightDate), 8192)"
    local select_query="SELECT $cur_columns FROM table"
    local insert_query="INSERT INTO $table FORMAT Native"

    echo "Case #2. Strong mixture. Each new (merged) row comes from new part."
    clickhouse-client --query "$create_query"
    # 2>&1 folds the client's stderr (where --time reports) into the captured value;
    # ST_OPTIONS stays unquoted because it may hold several flags.
    t_insert=$(read_src_data "$select_query" \
        | clickhouse-client --time ${ST_OPTIONS} --query "$insert_query" 2>&1)
}
# Case #3: table keyed by (bitAnd(id, 15), intHash32(bitShiftRight(id, 4))).
# Creates $table with structure $cur_struct, inserts columns $cur_columns from
# the source data and stores the client's --time output in the global t_insert.
function case_3 {
    local create_query="CREATE TABLE $table ($cur_struct) ENGINE = MergeTree(FlightDate, (bitAnd(id, 15), intHash32(bitShiftRight(id, 4))), 8192)"
    local select_query="SELECT $cur_columns FROM table"
    local insert_query="INSERT INTO $table FORMAT Native"

    echo "Case #3. Chunked mixture. Merged row with dozens of its neighbors come from the same part."
    clickhouse-client --query "$create_query"
    # 2>&1 folds the client's stderr (where --time reports) into the captured value;
    # ST_OPTIONS stays unquoted because it may hold several flags.
    t_insert=$(read_src_data "$select_query" \
        | clickhouse-client --time ${ST_OPTIONS} --query "$insert_query" 2>&1)
}
# Warn when not running as root. The previous `[[ $(whoami) -ne "root" ]]` was
# an arithmetic comparison: both sides were evaluated as (unset) variable names,
# i.e. 0 -ne 0, so the warning could never be printed. The script keeps going
# either way, as before; privileged steps are run through sudo.
if (( EUID != 0 )); then
    echo "Run script as root" >&2
fi

# Pass 1: all cases with the vertical merge algorithm enabled.
echo "### Vertical ###"
set_and_restart 1
run_cases
vertical_optimize_times="$optimize_times"
vertical_merges_times="$merges_times"
echo

# Pass 2: the same cases with the vertical merge algorithm disabled.
echo "### Horizontal ###"
set_and_restart 0
run_cases
horizontal_optimize_times="$optimize_times"
horizontal_merges_times="$merges_times"
echo

# Side-by-side summary. The *_merges_times expansions are left unquoted on
# purpose so the leading space of each list is dropped by word splitting.
echo "#V" "Merges:" ${vertical_merges_times} #"Optimize:" ${vertical_optimize_times}
echo "#H" "Merges:" ${horizontal_merges_times} #"Optimize:" ${horizontal_optimize_times}