mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-20 08:40:50 +00:00
Merge pull request #22373 from ClickHouse/jepsen_for_ci
Add image to run jepsen tests
This commit is contained in:
commit
caaa19edb8
2
contrib/NuRaft
vendored
2
contrib/NuRaft
vendored
@ -1 +1 @@
|
||||
Subproject commit 70468326ad5d72e9497944838484c591dae054ea
|
||||
Subproject commit 241fd3754a1eb4d82ab68a9a875dc99391ec9f02
|
@ -138,7 +138,8 @@
|
||||
"docker/test/stateless_unbundled",
|
||||
"docker/test/stateless_pytest",
|
||||
"docker/test/integration/base",
|
||||
"docker/test/fuzzer"
|
||||
"docker/test/fuzzer",
|
||||
"docker/test/keeper-jepsen"
|
||||
]
|
||||
},
|
||||
"docker/packager/unbundled": {
|
||||
@ -159,5 +160,9 @@
|
||||
"docker/test/sqlancer": {
|
||||
"name": "yandex/clickhouse-sqlancer-test",
|
||||
"dependent": []
|
||||
},
|
||||
"docker/test/keeper-jepsen": {
|
||||
"name": "yandex/clickhouse-keeper-jepsen-test",
|
||||
"dependent": []
|
||||
}
|
||||
}
|
||||
|
39
docker/test/keeper-jepsen/Dockerfile
Normal file
39
docker/test/keeper-jepsen/Dockerfile
Normal file
@ -0,0 +1,39 @@
|
||||
# docker build -t yandex/clickhouse-keeper-jepsen-test .
|
||||
FROM yandex/clickhouse-test-base
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
ENV CLOJURE_VERSION=1.10.3.814
|
||||
|
||||
# arguments
|
||||
ENV PR_TO_TEST=""
|
||||
ENV SHA_TO_TEST=""
|
||||
|
||||
ENV NODES_USERNAME="root"
|
||||
ENV NODES_PASSWORD=""
|
||||
ENV TESTS_TO_RUN="30"
|
||||
ENV TIME_LIMIT="30"
|
||||
|
||||
|
||||
# volumes
|
||||
ENV NODES_FILE_PATH="/nodes.txt"
|
||||
ENV TEST_OUTPUT="/test_output"
|
||||
|
||||
RUN mkdir "/root/.ssh"
|
||||
RUN touch "/root/.ssh/known_hosts"
|
||||
|
||||
# install java
|
||||
RUN apt-get update && apt-get install default-jre default-jdk libjna-java libjna-jni ssh gnuplot graphviz --yes --no-install-recommends
|
||||
|
||||
# install clojure
|
||||
RUN curl -O "https://download.clojure.org/install/linux-install-${CLOJURE_VERSION}.sh" && \
|
||||
chmod +x "linux-install-${CLOJURE_VERSION}.sh" && \
|
||||
bash "./linux-install-${CLOJURE_VERSION}.sh"
|
||||
|
||||
# install leiningen
|
||||
RUN curl -O "https://raw.githubusercontent.com/technomancy/leiningen/stable/bin/lein" && \
|
||||
chmod +x ./lein && \
|
||||
mv ./lein /usr/bin
|
||||
|
||||
COPY run.sh /
|
||||
|
||||
CMD ["/bin/bash", "/run.sh"]
|
22
docker/test/keeper-jepsen/run.sh
Normal file
22
docker/test/keeper-jepsen/run.sh
Normal file
@ -0,0 +1,22 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
|
||||
CLICKHOUSE_PACKAGE=${CLICKHOUSE_PACKAGE:="https://clickhouse-builds.s3.yandex.net/$PR_TO_TEST/$SHA_TO_TEST/clickhouse_build_check/clang-11_relwithdebuginfo_none_bundled_unsplitted_disable_False_binary/clickhouse"}
|
||||
CLICKHOUSE_REPO_PATH=${CLICKHOUSE_REPO_PATH:=""}
|
||||
|
||||
|
||||
if [ -z "$CLICKHOUSE_REPO_PATH" ]; then
|
||||
CLICKHOUSE_REPO_PATH=ch
|
||||
rm -rf ch ||:
|
||||
mkdir ch ||:
|
||||
wget -nv -nd -c "https://clickhouse-test-reports.s3.yandex.net/$PR_TO_TEST/$SHA_TO_TEST/repo/clickhouse_no_subs.tar.gz"
|
||||
tar -C ch --strip-components=1 -xf clickhouse_no_subs.tar.gz
|
||||
ls -lath ||:
|
||||
fi
|
||||
|
||||
cd "$CLICKHOUSE_REPO_PATH/tests/jepsen.clickhouse-keeper"
|
||||
|
||||
(lein run test-all --nodes-file "$NODES_FILE_PATH" --username "$NODES_USERNAME" --logging-json --password "$NODES_PASSWORD" --time-limit "$TIME_LIMIT" --concurrency 50 -r 50 --snapshot-distance 100 --stale-log-gap 100 --reserved-log-items 10 --lightweight-run --clickhouse-source "$CLICKHOUSE_PACKAGE" -q --test-count "$TESTS_TO_RUN" || true) | tee "$TEST_OUTPUT/jepsen_run_all_tests.log"
|
||||
|
||||
mv store "$TEST_OUTPUT/"
|
@ -150,6 +150,13 @@ void KeeperServer::putRequest(const KeeperStorage::RequestForSession & request_f
|
||||
|
||||
int64_t KeeperServer::getSessionID(int64_t session_timeout_ms)
|
||||
{
|
||||
/// Just some sanity check. We don't want to make a lot of clients wait with lock.
|
||||
if (active_session_id_requests > 10)
|
||||
throw Exception(ErrorCodes::RAFT_ERROR, "Too many concurrent SessionID requests already in flight");
|
||||
|
||||
++active_session_id_requests;
|
||||
SCOPE_EXIT({ --active_session_id_requests; });
|
||||
|
||||
auto entry = nuraft::buffer::alloc(sizeof(int64_t));
|
||||
/// Just special session request
|
||||
nuraft::buffer_serializer bs(entry);
|
||||
|
@ -34,6 +34,7 @@ private:
|
||||
std::atomic<bool> initialized_flag = false;
|
||||
std::condition_variable initialized_cv;
|
||||
std::atomic<bool> initial_batch_committed = false;
|
||||
std::atomic<size_t> active_session_id_requests = 0;
|
||||
|
||||
nuraft::cb_func::ReturnCode callbackFunc(nuraft::cb_func::Type type, nuraft::cb_func::Param * param);
|
||||
|
||||
|
@ -169,15 +169,15 @@ void KeeperStateMachine::create_snapshot(
|
||||
bool ret = true;
|
||||
try
|
||||
{
|
||||
auto snapshot_buf = snapshot_manager.serializeSnapshotToBuffer(*snapshot);
|
||||
auto result_path = snapshot_manager.serializeSnapshotBufferToDisk(*snapshot_buf, snapshot->snapshot_meta->get_last_log_idx());
|
||||
{
|
||||
std::lock_guard lock(snapshots_lock);
|
||||
auto snapshot_buf = snapshot_manager.serializeSnapshotToBuffer(*snapshot);
|
||||
auto result_path = snapshot_manager.serializeSnapshotBufferToDisk(*snapshot_buf, snapshot->snapshot_meta->get_last_log_idx());
|
||||
latest_snapshot_buf = snapshot_buf;
|
||||
latest_snapshot_meta = snapshot->snapshot_meta;
|
||||
}
|
||||
|
||||
LOG_DEBUG(log, "Created persistent snapshot {} with path {}", latest_snapshot_meta->get_last_log_idx(), result_path);
|
||||
LOG_DEBUG(log, "Created persistent snapshot {} with path {}", latest_snapshot_meta->get_last_log_idx(), result_path);
|
||||
}
|
||||
|
||||
{
|
||||
/// Must do it with lock (clearing elements from list)
|
||||
@ -228,37 +228,19 @@ void KeeperStateMachine::save_logical_snp_obj(
|
||||
nuraft::ptr<nuraft::buffer> snp_buf = s.serialize();
|
||||
cloned_meta = nuraft::snapshot::deserialize(*snp_buf);
|
||||
|
||||
/// Sometimes NuRaft can call save and create snapshots from different threads
|
||||
/// at once. To avoid race conditions we serialize snapshots through snapshots_queue
|
||||
/// TODO: make something better
|
||||
CreateSnapshotTask snapshot_task;
|
||||
std::shared_ptr<std::promise<void>> waiter = std::make_shared<std::promise<void>>();
|
||||
auto future = waiter->get_future();
|
||||
snapshot_task.snapshot = nullptr;
|
||||
snapshot_task.create_snapshot = [this, waiter, cloned_buffer, log_idx = s.get_last_log_idx()] (KeeperStorageSnapshotPtr &&)
|
||||
{
|
||||
try
|
||||
{
|
||||
auto result_path = snapshot_manager.serializeSnapshotBufferToDisk(*cloned_buffer, log_idx);
|
||||
LOG_DEBUG(log, "Saved snapshot {} to path {}", log_idx, result_path);
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
tryLogCurrentException(log);
|
||||
}
|
||||
waiter->set_value();
|
||||
};
|
||||
snapshots_queue.push(std::move(snapshot_task));
|
||||
future.wait();
|
||||
|
||||
try
|
||||
{
|
||||
std::lock_guard lock(snapshots_lock);
|
||||
auto result_path = snapshot_manager.serializeSnapshotBufferToDisk(*cloned_buffer, s.get_last_log_idx());
|
||||
latest_snapshot_buf = cloned_buffer;
|
||||
latest_snapshot_meta = cloned_meta;
|
||||
LOG_DEBUG(log, "Saved snapshot {} to path {}", s.get_last_log_idx(), result_path);
|
||||
obj_id++;
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
tryLogCurrentException(log);
|
||||
}
|
||||
|
||||
|
||||
obj_id++;
|
||||
}
|
||||
|
||||
int KeeperStateMachine::read_logical_snp_obj(
|
||||
|
@ -9,6 +9,9 @@
|
||||
<force_sync>false</force_sync>
|
||||
<startup_timeout>120000</startup_timeout>
|
||||
<raft_logs_level>trace</raft_logs_level>
|
||||
<heart_beat_interval_ms>1000</heart_beat_interval_ms>
|
||||
<election_timeout_lower_bound_ms>2000</election_timeout_lower_bound_ms>
|
||||
<election_timeout_upper_bound_ms>4000</election_timeout_upper_bound_ms>
|
||||
<quorum_reads>{quorum_reads}</quorum_reads>
|
||||
<snapshot_distance>{snapshot_distance}</snapshot_distance>
|
||||
<stale_log_gap>{stale_log_gap}</stale_log_gap>
|
||||
|
@ -16,3 +16,5 @@
|
||||
(def coordination-logs-dir (str coordination-data-dir "/logs"))
|
||||
|
||||
(def stderr-file (str logs-dir "/stderr.log"))
|
||||
|
||||
(def binaries-cache-dir (str common-prefix "/binaries"))
|
||||
|
@ -17,9 +17,7 @@
|
||||
|
||||
(defn get-clickhouse-url
|
||||
[url]
|
||||
(let [download-result (cu/wget! url)]
|
||||
(do (c/exec :mv download-result common-prefix)
|
||||
(str common-prefix "/" download-result))))
|
||||
(non-precise-cached-wget! url))
|
||||
|
||||
(defn download-clickhouse
|
||||
[source]
|
||||
@ -49,6 +47,7 @@
|
||||
|
||||
(defn chmod-binary
|
||||
[path]
|
||||
(info "Binary path chmod" path)
|
||||
(c/exec :chmod :+x path))
|
||||
|
||||
(defn install-downloaded-clickhouse
|
||||
@ -90,6 +89,13 @@
|
||||
(c/exec :echo (slurp (io/resource "listen.xml")) :> (str sub-configs-dir "/listen.xml"))
|
||||
(c/exec :echo (cluster-config test node (slurp (io/resource "keeper_config.xml"))) :> (str sub-configs-dir "/keeper_config.xml")))
|
||||
|
||||
(defn collect-traces
|
||||
[test node]
|
||||
(let [pid (c/exec :pidof "clickhouse")]
|
||||
(c/exec :timeout :-s "KILL" "60" :gdb :-ex "set pagination off" :-ex (str "set logging file " logs-dir "/gdb.log") :-ex
|
||||
"set logging on" :-ex "backtrace" :-ex "thread apply all backtrace"
|
||||
:-ex "backtrace" :-ex "detach" :-ex "quit" :--pid pid :|| :true)))
|
||||
|
||||
(defn db
|
||||
[version reuse-binary]
|
||||
(reify db/DB
|
||||
@ -110,19 +116,31 @@
|
||||
|
||||
(teardown! [_ test node]
|
||||
(info node "Tearing down clickhouse")
|
||||
(kill-clickhouse! node test)
|
||||
(c/su
|
||||
(kill-clickhouse! node test)
|
||||
(if (not reuse-binary)
|
||||
(c/exec :rm :-rf binary-path))
|
||||
(c/exec :rm :-rf pid-file-path)
|
||||
(c/exec :rm :-rf data-dir)
|
||||
;(c/exec :rm :-rf logs-dir)
|
||||
(c/exec :rm :-rf logs-dir)
|
||||
(c/exec :rm :-rf configs-dir)))
|
||||
|
||||
db/LogFiles
|
||||
(log-files [_ test node]
|
||||
(c/su
|
||||
(if (cu/exists? pid-file-path)
|
||||
(do
|
||||
(info node "Collecting traces")
|
||||
(collect-traces test node))
|
||||
(info node "Pid files doesn't exists"))
|
||||
(kill-clickhouse! node test)
|
||||
(c/cd data-dir
|
||||
(c/exec :tar :czf "coordination.tar.gz" "coordination")))
|
||||
[stderr-file (str logs-dir "/clickhouse-server.log") (str data-dir "/coordination.tar.gz")])))
|
||||
(if (cu/exists? coordination-data-dir)
|
||||
(do
|
||||
(info node "Coordination files exists, going to compress")
|
||||
(c/cd data-dir
|
||||
(c/exec :tar :czf "coordination.tar.gz" "coordination")))))
|
||||
(let [common-logs [stderr-file (str logs-dir "/clickhouse-server.log") (str data-dir "/coordination.tar.gz")]
|
||||
gdb-log (str logs-dir "/gdb.log")]
|
||||
(if (cu/exists? (str logs-dir "/gdb.log"))
|
||||
(conj common-logs gdb-log)
|
||||
common-logs)))))
|
||||
|
@ -18,7 +18,8 @@
|
||||
:nodename node))
|
||||
|
||||
(setup! [this test]
|
||||
(zk-create-if-not-exists conn k "#{}"))
|
||||
(exec-with-retries 30 (fn []
|
||||
(zk-create-if-not-exists conn k "#{}"))))
|
||||
|
||||
(invoke! [this test op]
|
||||
(case (:f op)
|
||||
|
@ -6,11 +6,24 @@
|
||||
[jepsen.control.util :as cu]
|
||||
[jepsen.clickhouse-keeper.constants :refer :all]
|
||||
[jepsen.control :as c]
|
||||
[clojure.tools.logging :refer :all])
|
||||
[clojure.tools.logging :refer :all]
|
||||
[clojure.java.io :as io])
|
||||
(:import (org.apache.zookeeper.data Stat)
|
||||
(org.apache.zookeeper CreateMode
|
||||
ZooKeeper)
|
||||
(org.apache.zookeeper ZooKeeper KeeperException KeeperException$BadVersionException)))
|
||||
(org.apache.zookeeper ZooKeeper KeeperException KeeperException$BadVersionException)
|
||||
(java.security MessageDigest)))
|
||||
|
||||
(defn exec-with-retries
|
||||
[retries f & args]
|
||||
(let [res (try {:value (apply f args)}
|
||||
(catch Exception e
|
||||
(if (zero? retries)
|
||||
(throw e)
|
||||
{:exception e})))]
|
||||
(if (:exception res)
|
||||
(do (Thread/sleep 1000) (recur (dec retries) f args))
|
||||
(:value res))))
|
||||
|
||||
(defn parse-long
|
||||
"Parses a string to a Long. Passes through `nil` and empty strings."
|
||||
@ -32,7 +45,7 @@
|
||||
|
||||
(defn zk-connect
|
||||
[host port timeout]
|
||||
(zk/connect (str host ":" port) :timeout-msec timeout))
|
||||
(exec-with-retries 15 (fn [] (zk/connect (str host ":" port) :timeout-msec timeout))))
|
||||
|
||||
(defn zk-create-range
|
||||
[conn n]
|
||||
@ -168,13 +181,23 @@
|
||||
:--keeper_server.logs_storage_path coordination-logs-dir)
|
||||
(wait-clickhouse-alive! node test)))
|
||||
|
||||
(defn exec-with-retries
|
||||
[retries f & args]
|
||||
(let [res (try {:value (apply f args)}
|
||||
(catch Exception e
|
||||
(if (zero? retries)
|
||||
(throw e)
|
||||
{:exception e})))]
|
||||
(if (:exception res)
|
||||
(do (Thread/sleep 1000) (recur (dec retries) f args))
|
||||
(:value res))))
|
||||
(defn md5 [^String s]
|
||||
(let [algorithm (MessageDigest/getInstance "MD5")
|
||||
raw (.digest algorithm (.getBytes s))]
|
||||
(format "%032x" (BigInteger. 1 raw))))
|
||||
|
||||
(defn non-precise-cached-wget!
|
||||
[url]
|
||||
(let [encoded-url (md5 url)
|
||||
expected-file-name (.getName (io/file url))
|
||||
dest-file (str binaries-cache-dir "/" encoded-url)
|
||||
dest-symlink (str common-prefix "/" expected-file-name)
|
||||
wget-opts (concat cu/std-wget-opts [:-O dest-file])]
|
||||
(when-not (cu/exists? dest-file)
|
||||
(info "Downloading" url)
|
||||
(do (c/exec :mkdir :-p binaries-cache-dir)
|
||||
(c/cd binaries-cache-dir
|
||||
(cu/wget-helper! wget-opts url))))
|
||||
(c/exec :rm :-rf dest-symlink)
|
||||
(c/exec :ln :-s dest-file dest-symlink)
|
||||
dest-symlink))
|
||||
|
Loading…
Reference in New Issue
Block a user