Merge pull request #22373 from ClickHouse/jepsen_for_ci

Add image to run jepsen tests
This commit is contained in:
alesapin 2021-04-04 18:35:21 +03:00 committed by GitHub
commit caaa19edb8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 157 additions and 54 deletions

2
contrib/NuRaft vendored

@ -1 +1 @@
Subproject commit 70468326ad5d72e9497944838484c591dae054ea
Subproject commit 241fd3754a1eb4d82ab68a9a875dc99391ec9f02

View File

@ -138,7 +138,8 @@
"docker/test/stateless_unbundled",
"docker/test/stateless_pytest",
"docker/test/integration/base",
"docker/test/fuzzer"
"docker/test/fuzzer",
"docker/test/keeper-jepsen"
]
},
"docker/packager/unbundled": {
@ -159,5 +160,9 @@
"docker/test/sqlancer": {
"name": "yandex/clickhouse-sqlancer-test",
"dependent": []
},
"docker/test/keeper-jepsen": {
"name": "yandex/clickhouse-keeper-jepsen-test",
"dependent": []
}
}

View File

@ -0,0 +1,39 @@
# docker build -t yandex/clickhouse-keeper-jepsen-test .
#
# Image used to run the ClickHouse Keeper Jepsen tests. The container runs
# /run.sh, which is configured through the environment variables below.
FROM yandex/clickhouse-test-base

ENV DEBIAN_FRONTEND=noninteractive
ENV CLOJURE_VERSION=1.10.3.814

# arguments
# PR_TO_TEST / SHA_TO_TEST are substituted into the download URLs built by run.sh.
ENV PR_TO_TEST=""
ENV SHA_TO_TEST=""
# Credentials passed to the test runner as --username / --password.
ENV NODES_USERNAME="root"
ENV NODES_PASSWORD=""
# Passed to the test runner as --test-count and --time-limit respectively.
ENV TESTS_TO_RUN="30"
ENV TIME_LIMIT="30"

# volumes
ENV NODES_FILE_PATH="/nodes.txt"
ENV TEST_OUTPUT="/test_output"

# -p keeps this step from failing if the base image already ships /root/.ssh.
RUN mkdir -p "/root/.ssh" && \
    touch "/root/.ssh/known_hosts"

# install java
RUN apt-get update && \
    apt-get install default-jre default-jdk libjna-java libjna-jni ssh gnuplot graphviz --yes --no-install-recommends && \
    rm -rf /var/lib/apt/lists/*

# install clojure
# --fail makes curl exit non-zero on an HTTP error instead of saving the error
# page, which would otherwise be executed as the installer script.
RUN curl --fail --silent --show-error --location -O "https://download.clojure.org/install/linux-install-${CLOJURE_VERSION}.sh" && \
    chmod +x "linux-install-${CLOJURE_VERSION}.sh" && \
    bash "./linux-install-${CLOJURE_VERSION}.sh"

# install leiningen
# Same reasoning for --fail: never install an HTTP error page as /usr/bin/lein.
RUN curl --fail --silent --show-error --location -O "https://raw.githubusercontent.com/technomancy/leiningen/stable/bin/lein" && \
    chmod +x ./lein && \
    mv ./lein /usr/bin

COPY run.sh /

CMD ["/bin/bash", "/run.sh"]

View File

@ -0,0 +1,22 @@
#!/usr/bin/env bash
# Runs the ClickHouse Keeper Jepsen test suite and stores its results in $TEST_OUTPUT.
#
# Inputs (environment):
#   PR_TO_TEST, SHA_TO_TEST   - used to build the default download URLs
#   CLICKHOUSE_PACKAGE        - optional; source of the clickhouse binary under test
#   CLICKHOUSE_REPO_PATH      - optional; existing checkout to use instead of downloading one
#   NODES_FILE_PATH, NODES_USERNAME, NODES_PASSWORD, TIME_LIMIT, TESTS_TO_RUN
#                             - forwarded to `lein run test-all`
#   TEST_OUTPUT               - directory receiving the run log and the `store` directory
set -euo pipefail

CLICKHOUSE_PACKAGE=${CLICKHOUSE_PACKAGE:="https://clickhouse-builds.s3.yandex.net/$PR_TO_TEST/$SHA_TO_TEST/clickhouse_build_check/clang-11_relwithdebuginfo_none_bundled_unsplitted_disable_False_binary/clickhouse"}
CLICKHOUSE_REPO_PATH=${CLICKHOUSE_REPO_PATH:=""}

# Without a user-provided checkout, download the source archive and unpack it into ./ch.
if [ -z "$CLICKHOUSE_REPO_PATH" ]; then
    CLICKHOUSE_REPO_PATH=ch
    rm -rf ch ||:
    mkdir ch ||:
    wget -nv -nd -c "https://clickhouse-test-reports.s3.yandex.net/$PR_TO_TEST/$SHA_TO_TEST/repo/clickhouse_no_subs.tar.gz"
    tar -C ch --strip-components=1 -xf clickhouse_no_subs.tar.gz
    ls -lath ||:
fi

cd "$CLICKHOUSE_REPO_PATH/tests/jepsen.clickhouse-keeper"

# tee fails when its target directory is missing; with pipefail + errexit that
# would abort the script before the `store` directory is moved to the output.
# Created after `cd` so a relative TEST_OUTPUT resolves the same way for
# mkdir, tee and mv.
mkdir -p "$TEST_OUTPUT"

# `|| true`: a failing test run must not stop the script, so the results are still saved.
(lein run test-all \
    --nodes-file "$NODES_FILE_PATH" \
    --username "$NODES_USERNAME" \
    --logging-json \
    --password "$NODES_PASSWORD" \
    --time-limit "$TIME_LIMIT" \
    --concurrency 50 \
    -r 50 \
    --snapshot-distance 100 \
    --stale-log-gap 100 \
    --reserved-log-items 10 \
    --lightweight-run \
    --clickhouse-source "$CLICKHOUSE_PACKAGE" \
    -q \
    --test-count "$TESTS_TO_RUN" || true) | tee "$TEST_OUTPUT/jepsen_run_all_tests.log"

mv store "$TEST_OUTPUT/"

View File

@ -150,6 +150,13 @@ void KeeperServer::putRequest(const KeeperStorage::RequestForSession & request_f
int64_t KeeperServer::getSessionID(int64_t session_timeout_ms)
{
/// Just some sanity check. We don't want to make a lot of clients wait with lock.
if (active_session_id_requests > 10)
throw Exception(ErrorCodes::RAFT_ERROR, "Too many concurrent SessionID requests already in flight");
++active_session_id_requests;
SCOPE_EXIT({ --active_session_id_requests; });
auto entry = nuraft::buffer::alloc(sizeof(int64_t));
/// Just special session request
nuraft::buffer_serializer bs(entry);

View File

@ -34,6 +34,7 @@ private:
std::atomic<bool> initialized_flag = false;
std::condition_variable initialized_cv;
std::atomic<bool> initial_batch_committed = false;
std::atomic<size_t> active_session_id_requests = 0;
nuraft::cb_func::ReturnCode callbackFunc(nuraft::cb_func::Type type, nuraft::cb_func::Param * param);

View File

@ -169,15 +169,15 @@ void KeeperStateMachine::create_snapshot(
bool ret = true;
try
{
auto snapshot_buf = snapshot_manager.serializeSnapshotToBuffer(*snapshot);
auto result_path = snapshot_manager.serializeSnapshotBufferToDisk(*snapshot_buf, snapshot->snapshot_meta->get_last_log_idx());
{
std::lock_guard lock(snapshots_lock);
auto snapshot_buf = snapshot_manager.serializeSnapshotToBuffer(*snapshot);
auto result_path = snapshot_manager.serializeSnapshotBufferToDisk(*snapshot_buf, snapshot->snapshot_meta->get_last_log_idx());
latest_snapshot_buf = snapshot_buf;
latest_snapshot_meta = snapshot->snapshot_meta;
}
LOG_DEBUG(log, "Created persistent snapshot {} with path {}", latest_snapshot_meta->get_last_log_idx(), result_path);
LOG_DEBUG(log, "Created persistent snapshot {} with path {}", latest_snapshot_meta->get_last_log_idx(), result_path);
}
{
/// Must do it with lock (clearing elements from list)
@ -228,37 +228,19 @@ void KeeperStateMachine::save_logical_snp_obj(
nuraft::ptr<nuraft::buffer> snp_buf = s.serialize();
cloned_meta = nuraft::snapshot::deserialize(*snp_buf);
/// Sometimes NuRaft can call save and create snapshots from different threads
/// at once. To avoid race conditions we serialize snapshots through snapshots_queue
/// TODO: make something better
CreateSnapshotTask snapshot_task;
std::shared_ptr<std::promise<void>> waiter = std::make_shared<std::promise<void>>();
auto future = waiter->get_future();
snapshot_task.snapshot = nullptr;
snapshot_task.create_snapshot = [this, waiter, cloned_buffer, log_idx = s.get_last_log_idx()] (KeeperStorageSnapshotPtr &&)
{
try
{
auto result_path = snapshot_manager.serializeSnapshotBufferToDisk(*cloned_buffer, log_idx);
LOG_DEBUG(log, "Saved snapshot {} to path {}", log_idx, result_path);
}
catch (...)
{
tryLogCurrentException(log);
}
waiter->set_value();
};
snapshots_queue.push(std::move(snapshot_task));
future.wait();
try
{
std::lock_guard lock(snapshots_lock);
auto result_path = snapshot_manager.serializeSnapshotBufferToDisk(*cloned_buffer, s.get_last_log_idx());
latest_snapshot_buf = cloned_buffer;
latest_snapshot_meta = cloned_meta;
LOG_DEBUG(log, "Saved snapshot {} to path {}", s.get_last_log_idx(), result_path);
obj_id++;
}
catch (...)
{
tryLogCurrentException(log);
}
obj_id++;
}
int KeeperStateMachine::read_logical_snp_obj(

View File

@ -9,6 +9,9 @@
<force_sync>false</force_sync>
<startup_timeout>120000</startup_timeout>
<raft_logs_level>trace</raft_logs_level>
<heart_beat_interval_ms>1000</heart_beat_interval_ms>
<election_timeout_lower_bound_ms>2000</election_timeout_lower_bound_ms>
<election_timeout_upper_bound_ms>4000</election_timeout_upper_bound_ms>
<quorum_reads>{quorum_reads}</quorum_reads>
<snapshot_distance>{snapshot_distance}</snapshot_distance>
<stale_log_gap>{stale_log_gap}</stale_log_gap>

View File

@ -16,3 +16,5 @@
;; "logs" subdirectory of coordination-data-dir.
(def coordination-logs-dir (str coordination-data-dir "/logs"))
;; Path of stderr.log inside logs-dir.
(def stderr-file (str logs-dir "/stderr.log"))
;; "binaries" subdirectory of common-prefix.
(def binaries-cache-dir (str common-prefix "/binaries"))

View File

@ -17,9 +17,7 @@
(defn get-clickhouse-url
[url]
(let [download-result (cu/wget! url)]
(do (c/exec :mv download-result common-prefix)
(str common-prefix "/" download-result))))
(non-precise-cached-wget! url))
(defn download-clickhouse
[source]
@ -49,6 +47,7 @@
(defn chmod-binary
  "Logs `path` and then runs `chmod +x` on it through c/exec."
  [path]
  (info "Binary path chmod" path)
  (c/exec :chmod :+x path))
(defn install-downloaded-clickhouse
@ -90,6 +89,13 @@
(c/exec :echo (slurp (io/resource "listen.xml")) :> (str sub-configs-dir "/listen.xml"))
(c/exec :echo (cluster-config test node (slurp (io/resource "keeper_config.xml"))) :> (str sub-configs-dir "/keeper_config.xml")))
(defn collect-traces
  "Attaches gdb to the process reported by `pidof clickhouse` and makes it log
  backtraces (current thread and all threads) to gdb.log inside logs-dir.

  gdb runs under `timeout -s KILL 60`, and the command line ends with
  `|| true`. The `test` and `node` arguments are not used."
  [test node]
  (let [clickhouse-pid (c/exec :pidof "clickhouse")
        logging-cmd    (str "set logging file " logs-dir "/gdb.log")]
    (c/exec :timeout :-s "KILL" "60"
            :gdb
            :-ex "set pagination off"
            :-ex logging-cmd
            :-ex "set logging on"
            :-ex "backtrace"
            :-ex "thread apply all backtrace"
            :-ex "backtrace"
            :-ex "detach"
            :-ex "quit"
            :--pid clickhouse-pid
            :|| :true)))
(defn db
[version reuse-binary]
(reify db/DB
@ -110,19 +116,31 @@
(teardown! [_ test node]
(info node "Tearing down clickhouse")
(kill-clickhouse! node test)
(c/su
(kill-clickhouse! node test)
(if (not reuse-binary)
(c/exec :rm :-rf binary-path))
(c/exec :rm :-rf pid-file-path)
(c/exec :rm :-rf data-dir)
;(c/exec :rm :-rf logs-dir)
(c/exec :rm :-rf logs-dir)
(c/exec :rm :-rf configs-dir)))
db/LogFiles
(log-files [_ test node]
(c/su
(if (cu/exists? pid-file-path)
(do
(info node "Collecting traces")
(collect-traces test node))
(info node "Pid files doesn't exists"))
(kill-clickhouse! node test)
(c/cd data-dir
(c/exec :tar :czf "coordination.tar.gz" "coordination")))
[stderr-file (str logs-dir "/clickhouse-server.log") (str data-dir "/coordination.tar.gz")])))
(if (cu/exists? coordination-data-dir)
(do
(info node "Coordination files exists, going to compress")
(c/cd data-dir
(c/exec :tar :czf "coordination.tar.gz" "coordination")))))
(let [common-logs [stderr-file (str logs-dir "/clickhouse-server.log") (str data-dir "/coordination.tar.gz")]
gdb-log (str logs-dir "/gdb.log")]
(if (cu/exists? (str logs-dir "/gdb.log"))
(conj common-logs gdb-log)
common-logs)))))

View File

@ -18,7 +18,8 @@
:nodename node))
(setup! [this test]
(zk-create-if-not-exists conn k "#{}"))
(exec-with-retries 30 (fn []
(zk-create-if-not-exists conn k "#{}"))))
(invoke! [this test op]
(case (:f op)

View File

@ -6,11 +6,24 @@
[jepsen.control.util :as cu]
[jepsen.clickhouse-keeper.constants :refer :all]
[jepsen.control :as c]
[clojure.tools.logging :refer :all])
[clojure.tools.logging :refer :all]
[clojure.java.io :as io])
(:import (org.apache.zookeeper.data Stat)
(org.apache.zookeeper CreateMode
ZooKeeper)
(org.apache.zookeeper ZooKeeper KeeperException KeeperException$BadVersionException)))
(org.apache.zookeeper ZooKeeper KeeperException KeeperException$BadVersionException)
(java.security MessageDigest)))
(defn exec-with-retries
  "Calls `(apply f args)` and returns its result.

  If the call throws an Exception while `retries` is non-zero, sleeps for one
  second and tries again with the counter decremented. When the counter is zero
  the exception is rethrown to the caller."
  [retries f & args]
  (loop [attempts-left retries]
    (let [outcome (try
                    {:value (apply f args)}
                    (catch Exception e
                      (when (zero? attempts-left)
                        (throw e))
                      {:exception e}))]
      (if-not (:exception outcome)
        (:value outcome)
        (do
          (Thread/sleep 1000)
          (recur (dec attempts-left)))))))
(defn parse-long
"Parses a string to a Long. Passes through `nil` and empty strings."
@ -32,7 +45,7 @@
(defn zk-connect
[host port timeout]
(zk/connect (str host ":" port) :timeout-msec timeout))
(exec-with-retries 15 (fn [] (zk/connect (str host ":" port) :timeout-msec timeout))))
(defn zk-create-range
[conn n]
@ -168,13 +181,23 @@
:--keeper_server.logs_storage_path coordination-logs-dir)
(wait-clickhouse-alive! node test)))
(defn exec-with-retries
[retries f & args]
(let [res (try {:value (apply f args)}
(catch Exception e
(if (zero? retries)
(throw e)
{:exception e})))]
(if (:exception res)
(do (Thread/sleep 1000) (recur (dec retries) f args))
(:value res))))
(defn md5
  "Returns the MD5 digest of `s` as a 32-character, zero-padded lowercase hex string.

  The string is encoded as UTF-8 before hashing so that the digest does not
  depend on the JVM's default charset."
  [^String s]
  (let [algorithm (MessageDigest/getInstance "MD5")
        raw (.digest algorithm (.getBytes s "UTF-8"))]
    ;; BigInteger with signum 1 treats the digest as an unsigned number;
    ;; %032x restores any leading zeros it drops.
    (format "%032x" (BigInteger. 1 raw))))
(defn non-precise-cached-wget!
  "Downloads `url` into binaries-cache-dir unless a file for it already exists
  there, then recreates a symlink to the cached file inside common-prefix and
  returns the symlink path.

  The cached file is named after the MD5 of the URL; the symlink is named after
  the last path component of the URL. An existing cached file is reused as-is,
  without checking its contents."
  [url]
  (let [url-hash    (md5 url)
        cached-file (str binaries-cache-dir "/" url-hash)
        link-name   (.getName (io/file url))
        link-path   (str common-prefix "/" link-name)]
    (when-not (cu/exists? cached-file)
      (info "Downloading" url)
      (c/exec :mkdir :-p binaries-cache-dir)
      (c/cd binaries-cache-dir
            (cu/wget-helper! (concat cu/std-wget-opts [:-O cached-file]) url)))
    ;; Always refresh the symlink so it points at the cache entry for this URL.
    (c/exec :rm :-rf link-path)
    (c/exec :ln :-s cached-file link-path)
    link-path))