From db921cc05f8bcfccd0d0ba1d90b6dcd77f06dcdd Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Wed, 25 Oct 2023 11:41:34 +0200 Subject: [PATCH] jepsen: reconfigure nemesis + add db nemesis --- script/jepsen.garage/README.md | 2 + script/jepsen.garage/src/jepsen/garage.clj | 14 +- .../src/jepsen/garage/daemon.clj | 18 +++ .../src/jepsen/garage/nemesis.clj | 129 ++++++++---------- .../jepsen.garage/src/jepsen/garage/reg.clj | 37 +++-- .../jepsen.garage/src/jepsen/garage/set.clj | 49 ++++--- 6 files changed, 138 insertions(+), 111 deletions(-) diff --git a/script/jepsen.garage/README.md b/script/jepsen.garage/README.md index 5d407b6a..ced8ebb5 100644 --- a/script/jepsen.garage/README.md +++ b/script/jepsen.garage/README.md @@ -97,6 +97,8 @@ Results: - For now, no failures with clock-scramble nemesis + partition nemesis -> TODO long test run +- Does not seem to fail with partition + layout reconfiguration nemesis (>100 runs) + - Does not seem to fail with the clock scrambler + partition + layout reconfiguation nemesis (>10 runs), although theoretically it could TODO: make it fail!!! diff --git a/script/jepsen.garage/src/jepsen/garage.clj b/script/jepsen.garage/src/jepsen/garage.clj index a67399e0..3fe527a6 100644 --- a/script/jepsen.garage/src/jepsen/garage.clj +++ b/script/jepsen.garage/src/jepsen/garage.clj @@ -27,7 +27,8 @@ "cp" grgNemesis/scenario-cp "r" grgNemesis/scenario-r "pr" grgNemesis/scenario-pr - "cpr" grgNemesis/scenario-cpr}) + "cpr" grgNemesis/scenario-cpr + "dpr" grgNemesis/scenario-dpr}) (def patches "A map of patch names to Garage builds" @@ -59,15 +60,16 @@ "Given an options map from the command line runner (e.g. :nodes, :ssh, :concurrency, ...), constructs a test map." 
[opts] - (let [workload ((get workloads (:workload opts)) opts) - scenario ((get scenari (:scenario opts)) opts) - garage-version (get patches (:patch opts))] + (let [garage-version (get patches (:patch opts)) + db (grg/db garage-version) + workload ((get workloads (:workload opts)) opts) + scenario ((get scenari (:scenario opts)) (assoc opts :db db))] (merge tests/noop-test opts {:pure-generators true :name (str "garage " (name (:workload opts))) :os debian/os - :db (grg/db garage-version) + :db db :client (:client workload) :generator (gen/phases (->> @@ -82,7 +84,7 @@ (gen/clients (:final-generator workload))) :nemesis (:nemesis scenario) :checker (checker/compose - {:perf (checker/perf) + {:perf (checker/perf {:nemeses (:perf scenario)}) :workload (:checker workload)}) }))) diff --git a/script/jepsen.garage/src/jepsen/garage/daemon.clj b/script/jepsen.garage/src/jepsen/garage/daemon.clj index 7c581ba1..d407dd29 100644 --- a/script/jepsen.garage/src/jepsen/garage/daemon.clj +++ b/script/jepsen.garage/src/jepsen/garage/daemon.clj @@ -119,6 +119,24 @@ (c/exec :rm :-rf data-dir) (c/exec :rm :-rf meta-dir))) + db/Pause + (pause! [_ test node] + (cu/grepkill! :stop binary)) + (resume! [_ test node] + (cu/grepkill! :cont binary)) + + db/Kill + (kill! [_ test node] + (cu/stop-daemon! binary pidfile)) + (start! [_ test node] + (cu/start-daemon! 
+ {:logfile logfile :pidfile pidfile :chdir base-dir :env {:RUST_LOG "garage=debug,garage_api=trace"}} binary :server)) + db/LogFiles (log-files [_ test node] [logfile]))) diff --git a/script/jepsen.garage/src/jepsen/garage/nemesis.clj b/script/jepsen.garage/src/jepsen/garage/nemesis.clj index 6a2e1935..0222e463 100644 --- a/script/jepsen.garage/src/jepsen/garage/nemesis.clj +++ b/script/jepsen.garage/src/jepsen/garage/nemesis.clj @@ -4,6 +4,7 @@ [core :as jepsen] [generator :as gen] [nemesis :as nemesis]] + [jepsen.nemesis.combined :as combined] [jepsen.garage.daemon :as grg] [jepsen.control.util :as cu])) @@ -11,21 +12,23 @@ (defn configure-present! "Configure node to be active in new cluster layout" - [test node] - (info "configure-present!" node) - (let [node-id (c/on node (c/exec grg/binary :node :id :-q))] - (c/on - (jepsen/primary test) - (c/exec grg/binary :layout :assign (subs node-id 0 16) :-c :1G)))) + [test nodes] + (info "configure-present!" nodes) + (let [node-ids (c/on-many nodes (c/exec grg/binary :node :id :-q)) + node-id-strs (map (fn [[_ v]] (subs v 0 16)) node-ids)] + (c/on + (jepsen/primary test) + (apply c/exec (concat [grg/binary :layout :assign :-c :1G] node-id-strs))))) (defn configure-absent! - "Configure node to be active in new cluster layout" - [test node] - (info "configure-absent!" node) - (let [node-id (c/on node (c/exec grg/binary :node :id :-q))] - (c/on - (jepsen/primary test) - (c/exec grg/binary :layout :assign (subs node-id 0 16) :-g)))) + "Configure nodes to be absent (assigned with -g) in new cluster layout" + [test nodes] + (info "configure-absent!" nodes) + (let [node-ids (c/on-many nodes (c/exec grg/binary :node :id :-q)) + node-id-strs (map (fn [[_ v]] (subs v 0 16)) node-ids)] + (c/on + (jepsen/primary test) + (apply c/exec (concat [grg/binary :layout :assign :-g] node-id-strs))))) (defn finalize-config! 
"Apply the proposed cluster layout" @@ -53,14 +56,14 @@ shuffle (split-at cnt))] (info "layout split: keep " keep-nodes ", remove " remove-nodes) - (run! #(configure-present! test %) keep-nodes) - (run! #(configure-absent! test %) remove-nodes) + (configure-present! test keep-nodes) + (configure-absent! test remove-nodes) (finalize-config! test) (assoc op :value keep-nodes)) :stop (do (info "layout un-split: all nodes=" (:nodes test)) - (run! #(configure-present! test %) (:nodes test)) + (configure-present! test (:nodes test)) (finalize-config! test) (assoc op :value (:nodes test))))) @@ -73,70 +76,58 @@ [op] (fn [_ _] {:type :info, :f op})) -(defn scenario-c - "Clock scramble scenario" - [opts] - {:generator (->> - (nemesis-op :clock-scramble) - (gen/stagger 5)) - :nemesis (nemesis/compose - {{:clock-scramble :scramble} (nemesis/clock-scrambler 20.0)})}) - -(defn scenario-cp - "Clock scramble + partition scenario" - [opts] - {:generator (->> - (gen/mix [(nemesis-op :clock-scramble) - (nemesis-op :partition-stop) - (nemesis-op :partition-start)]) - (gen/stagger 5)) - :final-generator (gen/once {:type :info, :f :partition-stop}) - :nemesis (nemesis/compose - {{:clock-scramble :scramble} (nemesis/clock-scrambler 20.0) - {:partition-start :start - :partition-stop :stop} (nemesis/partition-random-halves)})}) - -(defn scenario-r - "Cluster reconfiguration scenario" +(defn reconfiguration-package + "Cluster reconfiguration nemesis package" [opts] {:generator (->> (gen/mix [(nemesis-op :reconfigure-start) (nemesis-op :reconfigure-stop)]) - (gen/stagger 5)) + (gen/stagger (:interval opts 5))) + :final-generator {:type :info, :f :reconfigure-stop} :nemesis (nemesis/compose {{:reconfigure-start :start - :reconfigure-stop :stop} (reconfigure-subset 3)})}) + :reconfigure-stop :stop} (reconfigure-subset 3)}) + :perf #{{:name "reconfigure" + :start #{:reconfigure-start} + :stop #{:reconfigure-stop} + :color "#A197E9"}}}) + +(defn scenario-c + "Clock modifying scenario" + [opts] + 
(combined/clock-package {:db (:db opts), :interval 1, :faults #{:clock}})) + +(defn scenario-cp + "Clock modifying + partition scenario" + [opts] + (combined/compose-packages + [(combined/clock-package {:db (:db opts), :interval 1, :faults #{:clock}}) + (combined/partition-package {:db (:db opts), :interval 1, :faults #{:partition}})])) + +(defn scenario-r + "Cluster reconfiguration scenario" + [opts] + (reconfiguration-package {:interval 1})) (defn scenario-pr "Partition + cluster reconfiguration scenario" [opts] - {:generator (->> - (gen/mix [(nemesis-op :partition-start) - (nemesis-op :partition-stop) - (nemesis-op :reconfigure-start) - (nemesis-op :reconfigure-stop)]) - (gen/stagger 5)) - :final-generator (gen/once {:type :info, :f :partition-stop}) - :nemesis (nemesis/compose - {{:partition-start :start - :partition-stop :stop} (nemesis/partition-random-halves) - {:reconfigure-start :start - :reconfigure-stop :stop} (reconfigure-subset 3)})}) + (combined/compose-packages + [(combined/partition-package {:db (:db opts), :interval 1, :faults #{:partition}}) + (reconfiguration-package {:interval 1})])) (defn scenario-cpr "Clock scramble + partition + cluster reconfiguration scenario" [opts] - {:generator (->> - (gen/mix [(nemesis-op :clock-scramble) - (nemesis-op :partition-start) - (nemesis-op :partition-stop) - (nemesis-op :reconfigure-start) - (nemesis-op :reconfigure-stop)]) - (gen/stagger 5)) - :final-generator (gen/once {:type :info, :f :partition-stop}) - :nemesis (nemesis/compose - {{:clock-scramble :scramble} (nemesis/clock-scrambler 20.0) - {:partition-start :start - :partition-stop :stop} (nemesis/partition-random-halves) - {:reconfigure-start :start - :reconfigure-stop :stop} (reconfigure-subset 3)})}) + (combined/compose-packages + [(combined/clock-package {:db (:db opts), :interval 1, :faults #{:clock}}) + (combined/partition-package {:db (:db opts), :interval 1, :faults #{:partition}}) + (reconfiguration-package {:interval 1})])) + +(defn 
scenario-dpr + "Db + partition + cluster reconfiguration scenario" + [opts] + (combined/compose-packages + [(combined/db-package {:db (:db opts), :interval 1, :faults #{:db :pause :kill}}) + (combined/partition-package {:db (:db opts), :interval 1, :faults #{:partition}}) + (reconfiguration-package {:interval 1})])) diff --git a/script/jepsen.garage/src/jepsen/garage/reg.clj b/script/jepsen.garage/src/jepsen/garage/reg.clj index 6772abfe..39708c0b 100644 --- a/script/jepsen.garage/src/jepsen/garage/reg.clj +++ b/script/jepsen.garage/src/jepsen/garage/reg.clj @@ -30,21 +30,28 @@ (assoc this :creds (grg/creds node))) (setup! [this test]) (invoke! [this test op] - (let [[k v] (:value op)] - (case (:f op) - :read - (util/timeout - 10000 - (assoc op :type :fail, :error ::timeout) - (let [value (s3/get (:creds this) k)] - (assoc op :type :ok, :value (independent/tuple k value)))) - :write - (util/timeout - 10000 - (assoc op :type :info, :error ::timeout) - (do - (s3/put (:creds this) k v) - (assoc op :type :ok)))))) + (try+ + (let [[k v] (:value op)] + (case (:f op) + :read + (util/timeout + 10000 + (assoc op :type :fail, :error ::timeout) + (let [value (s3/get (:creds this) k)] + (assoc op :type :ok, :value (independent/tuple k value)))) + :write + (util/timeout + 10000 + (assoc op :type :info, :error ::timeout) + (do + (s3/put (:creds this) k v) + (assoc op :type :ok))))) + (catch (re-find #"Unavailable" (.getMessage %)) ex + (assoc op :type :info, :error ::unavailable)) + (catch (re-find #"Broken pipe" (.getMessage %)) ex + (assoc op :type :info, :error ::broken-pipe)) + (catch (re-find #"Connection refused" (.getMessage %)) ex + (assoc op :type :info, :error ::connection-refused)))) (teardown! [this test]) (close! 
[this test])) diff --git a/script/jepsen.garage/src/jepsen/garage/set.clj b/script/jepsen.garage/src/jepsen/garage/set.clj index f625e672..670c73f2 100644 --- a/script/jepsen.garage/src/jepsen/garage/set.clj +++ b/script/jepsen.garage/src/jepsen/garage/set.clj @@ -30,27 +30,34 @@ (assoc this :creds (grg/creds node))) (setup! [this test]) (invoke! [this test op] - (let [[k v] (:value op) - prefix (str "set" k "/")] - (case (:f op) - :add - (util/timeout - 10000 - (assoc op :type :info, :error ::timeout) - (do - (s3/put (:creds this) (str prefix v) "present") - (assoc op :type :ok))) - :read - (util/timeout - 10000 - (assoc op :type :fail, :error ::timeout) - (do - (let [items (s3/list (:creds this) prefix)] - (let [items-stripped (map (fn [o] - (assert (str/starts-with? o prefix)) - (str/replace-first o prefix "")) items) - items-set (set (map parse-long items-stripped))] - (assoc op :type :ok, :value (independent/tuple k items-set))))))))) + (try+ + (let [[k v] (:value op) + prefix (str "set" k "/")] + (case (:f op) + :add + (util/timeout + 10000 + (assoc op :type :info, :error ::timeout) + (do + (s3/put (:creds this) (str prefix v) "present") + (assoc op :type :ok))) + :read + (util/timeout + 10000 + (assoc op :type :fail, :error ::timeout) + (do + (let [items (s3/list (:creds this) prefix)] + (let [items-stripped (map (fn [o] + (assert (str/starts-with? o prefix)) + (str/replace-first o prefix "")) items) + items-set (set (map parse-long items-stripped))] + (assoc op :type :ok, :value (independent/tuple k items-set)))))))) + (catch (re-find #"Unavailable" (.getMessage %)) ex + (assoc op :type :info, :error ::unavailable)) + (catch (re-find #"Broken pipe" (.getMessage %)) ex + (assoc op :type :info, :error ::broken-pipe)) + (catch (re-find #"Connection refused" (.getMessage %)) ex + (assoc op :type :info, :error ::connection-refused)))) (teardown! [this test]) (close! [this test]))