aboutsummaryrefslogtreecommitdiff
path: root/script
diff options
context:
space:
mode:
authorAlex Auvolat <alex@adnab.me>2023-10-25 11:41:34 +0200
committerAlex Auvolat <alex@adnab.me>2023-10-25 11:41:34 +0200
commitdb921cc05f8bcfccd0d0ba1d90b6dcd77f06dcdd (patch)
treee2dadcb33d8cdc6eae93cbf7819fae7693df35b5 /script
parent4fa2646a75ed9b4823bf36ae6218a18cca11c471 (diff)
downloadgarage-db921cc05f8bcfccd0d0ba1d90b6dcd77f06dcdd.tar.gz
garage-db921cc05f8bcfccd0d0ba1d90b6dcd77f06dcdd.zip
jepsen: reconfigure nemesis + add db nemesis
Diffstat (limited to 'script')
-rw-r--r--script/jepsen.garage/README.md2
-rw-r--r--script/jepsen.garage/src/jepsen/garage.clj14
-rw-r--r--script/jepsen.garage/src/jepsen/garage/daemon.clj18
-rw-r--r--script/jepsen.garage/src/jepsen/garage/nemesis.clj121
-rw-r--r--script/jepsen.garage/src/jepsen/garage/reg.clj37
-rw-r--r--script/jepsen.garage/src/jepsen/garage/set.clj49
6 files changed, 134 insertions, 107 deletions
diff --git a/script/jepsen.garage/README.md b/script/jepsen.garage/README.md
index 5d407b6a..ced8ebb5 100644
--- a/script/jepsen.garage/README.md
+++ b/script/jepsen.garage/README.md
@@ -97,6 +97,8 @@ Results:
- For now, no failures with clock-scramble nemesis + partition nemesis -> TODO long test run
+- Does not seem to fail with partition + layout reconfiguration nemesis (>100 runs)
+
- Does not seem to fail with the clock scrambler + partition + layout reconfiguration nemesis (>10 runs), although theoretically it could
TODO: make it fail!!!
diff --git a/script/jepsen.garage/src/jepsen/garage.clj b/script/jepsen.garage/src/jepsen/garage.clj
index a67399e0..3fe527a6 100644
--- a/script/jepsen.garage/src/jepsen/garage.clj
+++ b/script/jepsen.garage/src/jepsen/garage.clj
@@ -27,7 +27,8 @@
"cp" grgNemesis/scenario-cp
"r" grgNemesis/scenario-r
"pr" grgNemesis/scenario-pr
- "cpr" grgNemesis/scenario-cpr})
+ "cpr" grgNemesis/scenario-cpr
+ "dpr" grgNemesis/scenario-dpr})
(def patches
"A map of patch names to Garage builds"
@@ -59,15 +60,16 @@
"Given an options map from the command line runner (e.g. :nodes, :ssh,
:concurrency, ...), constructs a test map."
[opts]
- (let [workload ((get workloads (:workload opts)) opts)
- scenario ((get scenari (:scenario opts)) opts)
- garage-version (get patches (:patch opts))]
+ (let [garage-version (get patches (:patch opts))
+ db (grg/db garage-version)
+ workload ((get workloads (:workload opts)) opts)
+ scenario ((get scenari (:scenario opts)) (assoc opts :db db))]
(merge tests/noop-test
opts
{:pure-generators true
:name (str "garage " (name (:workload opts)))
:os debian/os
- :db (grg/db garage-version)
+ :db db
:client (:client workload)
:generator (gen/phases
(->>
@@ -82,7 +84,7 @@
(gen/clients (:final-generator workload)))
:nemesis (:nemesis scenario)
:checker (checker/compose
- {:perf (checker/perf)
+ {:perf (checker/perf {:nemeses (:perf scenario)})
:workload (:checker workload)})
})))
diff --git a/script/jepsen.garage/src/jepsen/garage/daemon.clj b/script/jepsen.garage/src/jepsen/garage/daemon.clj
index 7c581ba1..d407dd29 100644
--- a/script/jepsen.garage/src/jepsen/garage/daemon.clj
+++ b/script/jepsen.garage/src/jepsen/garage/daemon.clj
@@ -119,6 +119,24 @@
(c/exec :rm :-rf data-dir)
(c/exec :rm :-rf meta-dir)))
+ db/Pause
+ (pause! [_ test node]
+ (cu/grepkill! :stop binary))
+ (resume! [_ test node]
+ (cu/grepkill! :cont binary))
+
+ db/Kill
+ (kill! [_ test node]
+ (cu/stop-daemon! binary pidfile))
+ (start! [_ test node]
+ (cu/start-daemon!
+ {:logfile logfile
+ :pidfile pidfile
+ :chdir base-dir
+ :env {:RUST_LOG "garage=debug,garage_api=trace"}}
+ binary
+ :server))
+
db/LogFiles
(log-files [_ test node]
[logfile])))
diff --git a/script/jepsen.garage/src/jepsen/garage/nemesis.clj b/script/jepsen.garage/src/jepsen/garage/nemesis.clj
index 6a2e1935..0222e463 100644
--- a/script/jepsen.garage/src/jepsen/garage/nemesis.clj
+++ b/script/jepsen.garage/src/jepsen/garage/nemesis.clj
@@ -4,6 +4,7 @@
[core :as jepsen]
[generator :as gen]
[nemesis :as nemesis]]
+ [jepsen.nemesis.combined :as combined]
[jepsen.garage.daemon :as grg]
[jepsen.control.util :as cu]))
@@ -11,21 +12,23 @@
(defn configure-present!
"Configure node to be active in new cluster layout"
- [test node]
- (info "configure-present!" node)
- (let [node-id (c/on node (c/exec grg/binary :node :id :-q))]
- (c/on
- (jepsen/primary test)
- (c/exec grg/binary :layout :assign (subs node-id 0 16) :-c :1G))))
+ [test nodes]
+ (info "configure-present!" nodes)
+ (let [node-ids (c/on-many nodes (c/exec grg/binary :node :id :-q))
+ node-id-strs (map (fn [[_ v]] (subs v 0 16)) node-ids)]
+ (c/on
+ (jepsen/primary test)
+ (apply c/exec (concat [grg/binary :layout :assign :-c :1G] node-id-strs)))))
(defn configure-absent!
- "Configure node to be active in new cluster layout"
- [test node]
- (info "configure-absent!" node)
- (let [node-id (c/on node (c/exec grg/binary :node :id :-q))]
- (c/on
- (jepsen/primary test)
- (c/exec grg/binary :layout :assign (subs node-id 0 16) :-g))))
+ "Configure nodes to be absent from new cluster layout"
+ [test nodes]
+ (info "configure-absent!" nodes)
+ (let [node-ids (c/on-many nodes (c/exec grg/binary :node :id :-q))
+ node-id-strs (map (fn [[_ v]] (subs v 0 16)) node-ids)]
+ (c/on
+ (jepsen/primary test)
+ (apply c/exec (concat [grg/binary :layout :assign :-g] node-id-strs)))))
(defn finalize-config!
"Apply the proposed cluster layout"
@@ -53,14 +56,14 @@
shuffle
(split-at cnt))]
(info "layout split: keep " keep-nodes ", remove " remove-nodes)
- (run! #(configure-present! test %) keep-nodes)
- (run! #(configure-absent! test %) remove-nodes)
+ (configure-present! test keep-nodes)
+ (configure-absent! test remove-nodes)
(finalize-config! test)
(assoc op :value keep-nodes))
:stop
(do
(info "layout un-split: all nodes=" (:nodes test))
- (run! #(configure-present! test %) (:nodes test))
+ (configure-present! test (:nodes test))
(finalize-config! test)
(assoc op :value (:nodes test)))))
@@ -73,70 +76,58 @@
[op]
(fn [_ _] {:type :info, :f op}))
-(defn scenario-c
- "Clock scramble scenario"
+(defn reconfiguration-package
+ "Cluster reconfiguration nemesis package"
[opts]
{:generator (->>
- (nemesis-op :clock-scramble)
- (gen/stagger 5))
+ (gen/mix [(nemesis-op :reconfigure-start)
+ (nemesis-op :reconfigure-stop)])
+ (gen/stagger (:interval opts 5)))
+ :final-generator {:type :info, :f :reconfigure-stop}
:nemesis (nemesis/compose
- {{:clock-scramble :scramble} (nemesis/clock-scrambler 20.0)})})
+ {{:reconfigure-start :start
+ :reconfigure-stop :stop} (reconfigure-subset 3)})
+ :perf #{{:name "reconfigure"
+ :start #{:reconfigure-start}
+ :stop #{:reconfigure-stop}
+ :color "#A197E9"}}})
+
+(defn scenario-c
+ "Clock modifying scenario"
+ [opts]
+ (combined/clock-package {:db (:db opts), :interval 1, :faults #{:clock}}))
(defn scenario-cp
- "Clock scramble + partition scenario"
+ "Clock modifying + partition scenario"
[opts]
- {:generator (->>
- (gen/mix [(nemesis-op :clock-scramble)
- (nemesis-op :partition-stop)
- (nemesis-op :partition-start)])
- (gen/stagger 5))
- :final-generator (gen/once {:type :info, :f :partition-stop})
- :nemesis (nemesis/compose
- {{:clock-scramble :scramble} (nemesis/clock-scrambler 20.0)
- {:partition-start :start
- :partition-stop :stop} (nemesis/partition-random-halves)})})
+ (combined/compose-packages
+ [(combined/clock-package {:db (:db opts), :interval 1, :faults #{:clock}})
+ (combined/partition-package {:db (:db opts), :interval 1, :faults #{:partition}})]))
(defn scenario-r
"Cluster reconfiguration scenario"
[opts]
- {:generator (->>
- (gen/mix [(nemesis-op :reconfigure-start)
- (nemesis-op :reconfigure-stop)])
- (gen/stagger 5))
- :nemesis (nemesis/compose
- {{:reconfigure-start :start
- :reconfigure-stop :stop} (reconfigure-subset 3)})})
+ (reconfiguration-package {:interval 1}))
(defn scenario-pr
"Partition + cluster reconfiguration scenario"
[opts]
- {:generator (->>
- (gen/mix [(nemesis-op :partition-start)
- (nemesis-op :partition-stop)
- (nemesis-op :reconfigure-start)
- (nemesis-op :reconfigure-stop)])
- (gen/stagger 5))
- :final-generator (gen/once {:type :info, :f :partition-stop})
- :nemesis (nemesis/compose
- {{:partition-start :start
- :partition-stop :stop} (nemesis/partition-random-halves)
- {:reconfigure-start :start
- :reconfigure-stop :stop} (reconfigure-subset 3)})})
+ (combined/compose-packages
+ [(combined/partition-package {:db (:db opts), :interval 1, :faults #{:partition}})
+ (reconfiguration-package {:interval 1})]))
(defn scenario-cpr
"Clock scramble + partition + cluster reconfiguration scenario"
[opts]
- {:generator (->>
- (gen/mix [(nemesis-op :clock-scramble)
- (nemesis-op :partition-start)
- (nemesis-op :partition-stop)
- (nemesis-op :reconfigure-start)
- (nemesis-op :reconfigure-stop)])
- (gen/stagger 5))
- :final-generator (gen/once {:type :info, :f :partition-stop})
- :nemesis (nemesis/compose
- {{:clock-scramble :scramble} (nemesis/clock-scrambler 20.0)
- {:partition-start :start
- :partition-stop :stop} (nemesis/partition-random-halves)
- {:reconfigure-start :start
- :reconfigure-stop :stop} (reconfigure-subset 3)})})
+ (combined/compose-packages
+ [(combined/clock-package {:db (:db opts), :interval 1, :faults #{:clock}})
+ (combined/partition-package {:db (:db opts), :interval 1, :faults #{:partition}})
+ (reconfiguration-package {:interval 1})]))
+
+(defn scenario-dpr
+ "Db + partition + cluster reconfiguration scenario"
+ [opts]
+ (combined/compose-packages
+ [(combined/db-package {:db (:db opts), :interval 1, :faults #{:db :pause :kill}})
+ (combined/partition-package {:db (:db opts), :interval 1, :faults #{:partition}})
+ (reconfiguration-package {:interval 1})]))
diff --git a/script/jepsen.garage/src/jepsen/garage/reg.clj b/script/jepsen.garage/src/jepsen/garage/reg.clj
index 6772abfe..39708c0b 100644
--- a/script/jepsen.garage/src/jepsen/garage/reg.clj
+++ b/script/jepsen.garage/src/jepsen/garage/reg.clj
@@ -30,21 +30,28 @@
(assoc this :creds (grg/creds node)))
(setup! [this test])
(invoke! [this test op]
- (let [[k v] (:value op)]
- (case (:f op)
- :read
- (util/timeout
- 10000
- (assoc op :type :fail, :error ::timeout)
- (let [value (s3/get (:creds this) k)]
- (assoc op :type :ok, :value (independent/tuple k value))))
- :write
- (util/timeout
- 10000
- (assoc op :type :info, :error ::timeout)
- (do
- (s3/put (:creds this) k v)
- (assoc op :type :ok))))))
+ (try+
+ (let [[k v] (:value op)]
+ (case (:f op)
+ :read
+ (util/timeout
+ 10000
+ (assoc op :type :fail, :error ::timeout)
+ (let [value (s3/get (:creds this) k)]
+ (assoc op :type :ok, :value (independent/tuple k value))))
+ :write
+ (util/timeout
+ 10000
+ (assoc op :type :info, :error ::timeout)
+ (do
+ (s3/put (:creds this) k v)
+ (assoc op :type :ok)))))
+ (catch (re-find #"Unavailable" (.getMessage %)) ex
+ (assoc op :type :info, :error ::unavailable))
+ (catch (re-find #"Broken pipe" (.getMessage %)) ex
+ (assoc op :type :info, :error ::broken-pipe))
+ (catch (re-find #"Connection refused" (.getMessage %)) ex
+ (assoc op :type :info, :error ::connection-refused))))
(teardown! [this test])
(close! [this test]))
diff --git a/script/jepsen.garage/src/jepsen/garage/set.clj b/script/jepsen.garage/src/jepsen/garage/set.clj
index f625e672..670c73f2 100644
--- a/script/jepsen.garage/src/jepsen/garage/set.clj
+++ b/script/jepsen.garage/src/jepsen/garage/set.clj
@@ -30,27 +30,34 @@
(assoc this :creds (grg/creds node)))
(setup! [this test])
(invoke! [this test op]
- (let [[k v] (:value op)
- prefix (str "set" k "/")]
- (case (:f op)
- :add
- (util/timeout
- 10000
- (assoc op :type :info, :error ::timeout)
- (do
- (s3/put (:creds this) (str prefix v) "present")
- (assoc op :type :ok)))
- :read
- (util/timeout
- 10000
- (assoc op :type :fail, :error ::timeout)
- (do
- (let [items (s3/list (:creds this) prefix)]
- (let [items-stripped (map (fn [o]
- (assert (str/starts-with? o prefix))
- (str/replace-first o prefix "")) items)
- items-set (set (map parse-long items-stripped))]
- (assoc op :type :ok, :value (independent/tuple k items-set)))))))))
+ (try+
+ (let [[k v] (:value op)
+ prefix (str "set" k "/")]
+ (case (:f op)
+ :add
+ (util/timeout
+ 10000
+ (assoc op :type :info, :error ::timeout)
+ (do
+ (s3/put (:creds this) (str prefix v) "present")
+ (assoc op :type :ok)))
+ :read
+ (util/timeout
+ 10000
+ (assoc op :type :fail, :error ::timeout)
+ (do
+ (let [items (s3/list (:creds this) prefix)]
+ (let [items-stripped (map (fn [o]
+ (assert (str/starts-with? o prefix))
+ (str/replace-first o prefix "")) items)
+ items-set (set (map parse-long items-stripped))]
+ (assoc op :type :ok, :value (independent/tuple k items-set))))))))
+ (catch (re-find #"Unavailable" (.getMessage %)) ex
+ (assoc op :type :info, :error ::unavailable))
+ (catch (re-find #"Broken pipe" (.getMessage %)) ex
+ (assoc op :type :info, :error ::broken-pipe))
+ (catch (re-find #"Connection refused" (.getMessage %)) ex
+ (assoc op :type :info, :error ::connection-refused))))
(teardown! [this test])
(close! [this test]))