#!/usr/bin/env bash
#
# Compares INSERT and background-merge timings of a ClickHouse MergeTree table
# with the vertical merge algorithm switched on and off.
#
# For each algorithm setting the server is restarted, three tables with
# different primary keys (see case_1 .. case_3) are filled from the "ontime"
# CSV, and the merge times are summed up from the server log.
#
# Requirements visible in this script:
#   - ./add_id_to_csv and ./wait_clickhouse_server in the current directory
#   - ontime.struct (column list) in the current directory
#   - sudo rights (service restarts, config files, drop_caches, log removal)

# NOTE: `pipefail` is deliberately NOT enabled: the log-scraping pipelines
# below start with a `grep` that may legitimately match nothing.
set -e

SOURCE_RAW=/opt/ontime/ontime.csv
#SOURCE=/opt/ontime/ontime9M_id.csv
#SOURCE=/opt/ontime/ontime9M_id.Native
SOURCE=/opt/ontime/ontime_id.csv

SERVER_LOG=/var/log/clickhouse-server/clickhouse-server.log

if [[ ! -f "$SOURCE" ]]; then
  echo "Inserting id field from $SOURCE_RAW to $SOURCE ..."
  # Skip the first line of the raw file (presumably the CSV header) and write
  # to a temporary name first, so that a failed run does not leave a partial
  # "$SOURCE" behind that the next run would silently accept.
  tail -n +2 "$SOURCE_RAW" | ./add_id_to_csv > "${SOURCE}.tmp"
  mv -- "${SOURCE}.tmp" "$SOURCE"
fi

# Full table structure: "name type, name type, ...".
STRUCT="id UInt32, $(cat ontime.struct)"

# Comma-separated column names: every odd whitespace-separated token of STRUCT.
# This must not be called COLUMNS: Bash may overwrite COLUMNS with the
# terminal width after external commands when `checkwinsize` is enabled.
COLUMN_NAMES=$(echo "$STRUCT" | tr " " "\n" | awk 'NR % 2 == 1' | tr "\n" "," | head -c -1)

# Number of leading columns actually used in the test tables.
ACTIVE_COLUMNS=11

# Extra clickhouse-client options; expanded unquoted on purpose so that an
# empty value disappears and several options split into separate words.
ST_OPTIONS=""
#ST_OPTIONS="--max_threads=1 --background_pool_size=1" # increase std. dev. of measurements

db="test"
table_name="ontime"
table="test.ontime"

# Runs a query over the source file with clickhouse-local and writes the
# result to stdout in Native format.
# Arguments: the query text (forwarded after --query).
read_src_data() {
  clickhouse-local -s --file "$SOURCE" --input-format CSV --structure "$STRUCT" -of Native --query "$@"
  #clickhouse-local -s --file "$SOURCE" --input-format Native --structure "$STRUCT" -of Native --query "$@"
}

# Writes the server-side settings files controlling the merge algorithm.
# Arguments: $1 - value written to enable_vertical_merge_algorithm.xml
# NOTE(review): the bare value is written as the whole file content; confirm
# that this is the format the server expects for conf.d files.
set_vertical_alg() {
  echo "$1" | sudo tee /etc/clickhouse-server/conf.d/enable_vertical_merge_algorithm.xml >/dev/null
  echo "0" | sudo tee /etc/clickhouse-server/conf.d/vertical_merge_algorithm_min_rows_to_activate.xml >/dev/null
}

# Stops the server, applies the merge algorithm setting and starts it again.
# Arguments: $1 - value passed on to set_vertical_alg
set_and_restart() {
  sudo service clickhouse-server stop 1>/dev/null
  set_vertical_alg "$1"
  sudo service clickhouse-server start 1>/dev/null
  ./wait_clickhouse_server
}

# Prints the first N comma-separated fields of a list.
# Arguments: $1 - comma-separated list, $2 - number of fields to keep
get_n_columns() {
  # $1 is intentionally left unquoted: word splitting flattens a multi-line
  # list onto a single line before `cut` sees it.
  # shellcheck disable=SC2086
  echo $1 | cut -d ',' -f "-$2"
}

# Prints count and min/avg/max marks of the active parts of the test table.
parts_stat() {
  clickhouse-client --query "SELECT count() as parts, round(avg(marks), 2) AS marks_avg, min(marks) AS marks_min, max(marks) AS marks_max FROM system.parts WHERE active AND table='$table_name' AND database='$db' FORMAT TSKV"
}

# Prints the number of active parts of the test table.
parts_count() {
  clickhouse-client --query "SELECT count() FROM system.parts WHERE active AND table='$table_name' AND database='$db'"
}

# Prints the number of rows in system.merges for the test table.
merges_count() {
  clickhouse-client --query "SELECT count() FROM system.merges WHERE table='$table_name' AND database='$db'"
}

# Blocks until merges_count reports 0.
# The count query always prints a number, so the result has to be compared
# with "0"; testing it for non-emptiness would loop forever.
wait_merges() {
  local running
  while true; do
    running=$(merges_count)
    if [[ "$running" == "0" ]]; then
      break
    fi
    sleep 1
  done
}

# Asks the kernel to drop the page cache, dentries and inodes.
drop_cache() {
  sudo sh -c 'echo 3 >/proc/sys/vm/drop_caches'
}

# Prints the tail (space-separated field 12 onwards) of the last
# "(Merger): Merging" line of the server log.
get_last_merge_info() {
  grep "(Merger): Merging" "$SERVER_LOG" | tail -1 | cut -d " " -f 12-
}

# Prints field 21 of the last "(Merger): Merge sorted" line of the server log
# (used by this script as the merge duration).
get_last_merge_time() {
  grep "(Merger): Merge sorted" "$SERVER_LOG" | tail -1 | cut -d " " -f 21
}

# Prints the sum, rounded to 3 digits, of field 21 over all
# "(Merger): Merge sorted" lines of the server log.
total_merge_time_from_log() {
  grep "(Merger): Merge sorted" "$SERVER_LOG" \
    | cut -d " " -f 21 \
    | clickhouse-local -s -S "d Float64" --query "SELECT round(sum(d), 3) FROM table"
}

# Prints the VmPeak value of the running server divided by 1024.
get_max_clickhouse_server_memory() {
  local server_pid
  server_pid=$(cat /var/run/clickhouse-server/clickhouse-server.pid)
  awk '/VmPeak/ { print $2/1024 }' "/proc/${server_pid}/status" #MiB
}

# Runs OPTIMIZE TABLE several times and accumulates the reported times.
# Arguments: $1 - number of rounds (optional, default 29)
# Globals:   NUM_ROUNDS, t_optimize_total (written)
optimize_rounds() {
  NUM_ROUNDS=${1:-29}
  local i t

  drop_cache
  echo "OPTIMIZE before: $(parts_stat)"

  t_optimize_total=0
  for ((i = 1; i <= NUM_ROUNDS; i++)); do
    # Declared separately above so that a failing client is not masked by
    # the exit status of `local`.
    # shellcheck disable=SC2086
    t=$(clickhouse-client --time ${ST_OPTIONS} --query "OPTIMIZE TABLE $table" 2>&1)
    #echo "$t $(get_last_merge_time) $(get_last_merge_info)"
    t_optimize_total=$(echo "$t_optimize_total + $t" | bc -l)
  done

  echo "OPTIMIZE after : $(parts_stat)"
  echo "OPTIMIZE time  : $t_optimize_total"
}

# Runs one test case on a freshly restarted server with an empty log.
# Arguments: $1 - case number; the function case_<N> is invoked
# Globals:   insert_times, merges_times (appended), t_merges (written)
run_case() {
  case_func="case_$1"

  # Start from an empty server log so that total_merge_time_from_log only
  # sees merges that belong to this case.
  sudo service clickhouse-server stop 1>/dev/null
  sudo rm -f "$SERVER_LOG"
  sudo service clickhouse-server start 1>/dev/null
  ./wait_clickhouse_server
  clickhouse-client --query "DROP TABLE IF EXISTS $table"
  drop_cache

  # Case output goes to stderr; stdout carries the report lines.
  "$case_func" 1>&2
  wait_merges
  t_merges=$(total_merge_time_from_log)

  echo "After INSERT: $(parts_stat)"
  #echo "Merges time : $t_merges"
  #optimize_rounds
  #optimize_times="$optimize_times $t_optimize_total"

  insert_times="$insert_times $t_insert"
  merges_times="$merges_times $t_merges"

  clickhouse-client --query "DROP TABLE IF EXISTS $table"
}

# Runs all three cases and prints the collected timings.
# Globals: cur_struct, cur_columns, t_insert, insert_times, merges_times,
#          optimize_times (written)
run_cases() {
  cur_struct=$(get_n_columns "$STRUCT" "$ACTIVE_COLUMNS")
  cur_columns=$(get_n_columns "$COLUMN_NAMES" "$ACTIVE_COLUMNS")

  t_insert=0
  insert_times=""
  merges_times=""
  optimize_times=""

  run_case 1
  run_case 2
  run_case 3

  echo "INSERT times  : $insert_times"
  echo "Merges times  : $merges_times"
  #echo "OPTIMIZE times: $optimize_times"
}

# Streams the active columns of the source file into the test table and
# prints whatever the client writes to stdout/stderr (its --time output).
insert_src_into_table() {
  # shellcheck disable=SC2086
  read_src_data "SELECT $cur_columns FROM table" \
    | clickhouse-client --time ${ST_OPTIONS} --query "INSERT INTO $table FORMAT Native" 2>&1
}

case_1() {
  echo "Case #1. Trivial. All parts not intersected by PK."
  clickhouse-client --query "CREATE TABLE $table ($cur_struct) ENGINE = MergeTree(FlightDate, (FlightDate), 8192)"
  t_insert=$(insert_src_into_table)
}

case_2() {
  echo "Case #2. Strong mixture. Each new (merged) row comes from new part."
  clickhouse-client --query "CREATE TABLE $table ($cur_struct) ENGINE = MergeTree(FlightDate, (intHash32(id), FlightDate), 8192)"
  t_insert=$(insert_src_into_table)
}

case_3() {
  echo "Case #3. Chunked mixture. Merged row with dozens of its neighbors come from the same part."
  clickhouse-client --query "CREATE TABLE $table ($cur_struct) ENGINE = MergeTree(FlightDate, (bitAnd(id, 15), intHash32(bitShiftRight(id, 4))), 8192)"
  t_insert=$(insert_src_into_table)
}

# String comparison is required here: `-ne` compares arithmetically, which
# evaluates both names to 0 and therefore never reports a mismatch.
# This stays a warning only; privileged steps below go through sudo.
if [[ "$(whoami)" != "root" ]]; then
  echo "Run script as root" >&2
fi

echo "### Vertical ###"
set_and_restart 1
run_cases
vertical_optimize_times="$optimize_times"
vertical_merges_times="$merges_times"

echo
echo "### Horizontal ###"
set_and_restart 0
run_cases
horizontal_optimize_times="$optimize_times"
horizontal_merges_times="$merges_times"

echo
# The *_times lists are left unquoted so that their leading space collapses.
# shellcheck disable=SC2086
echo "#V" "Merges:" ${vertical_merges_times} #"Optimitze:" ${vertical_optimize_times}
# shellcheck disable=SC2086
echo "#H" "Merges:" ${horizontal_merges_times} #"Optimitze:" ${horizontal_optimize_times}