Compare commits

..

No commits in common. "fd85010a403775bbb18030ae2d9d3689b34f3e8a" and "4fa2646a75ed9b4823bf36ae6218a18cca11c471" have entirely different histories.

6 changed files with 112 additions and 138 deletions

View file

@ -86,7 +86,7 @@ Results:
- Does not seem to fail with only the layout reconfiguration nemesis (<10 runs), although theoretically it could - Does not seem to fail with only the layout reconfiguration nemesis (<10 runs), although theoretically it could
- **Fails with the partition + layout reconfiguration nemesis** (`--scenario pr`). - **Fails with the partition + layout reconfiguration nemesis** (`--scenario pr`).
Example of a failed run: `garage set1/20231024T172214.488+0200` (1 failure in 4 runs). EXample of a failed run: `garage set1/20231024T172214.488+0200` (1 failure in 4 runs).
### Set, continuous test (interspersed reads and writes) ### Set, continuous test (interspersed reads and writes)
@ -97,10 +97,9 @@ Results:
- For now, no failures with clock-scramble nemesis + partition nemesis -> TODO long test run - For now, no failures with clock-scramble nemesis + partition nemesis -> TODO long test run
- **Fails with layout reconfiguration nemesis** (`--scenario r`). - Does not seem to fail with the clock scrambler + partition + layout reconfiguration nemesis (>10 runs), although theoretically it could
Example of a failed run: `garage set2/20231025T115033.553+0200` (2 failures in 2 runs).
TODO: investigate. TODO: make it fail!!!
This is the failure mode we are looking for and trying to fix for NLnet task 3.
## Investigating (and fixing) errors ## Investigating (and fixing) errors

View file

@ -27,8 +27,7 @@
"cp" grgNemesis/scenario-cp "cp" grgNemesis/scenario-cp
"r" grgNemesis/scenario-r "r" grgNemesis/scenario-r
"pr" grgNemesis/scenario-pr "pr" grgNemesis/scenario-pr
"cpr" grgNemesis/scenario-cpr "cpr" grgNemesis/scenario-cpr})
"dpr" grgNemesis/scenario-dpr})
(def patches (def patches
"A map of patch names to Garage builds" "A map of patch names to Garage builds"
@ -60,16 +59,15 @@
"Given an options map from the command line runner (e.g. :nodes, :ssh, "Given an options map from the command line runner (e.g. :nodes, :ssh,
:concurrency, ...), constructs a test map." :concurrency, ...), constructs a test map."
[opts] [opts]
(let [garage-version (get patches (:patch opts)) (let [workload ((get workloads (:workload opts)) opts)
db (grg/db garage-version) scenario ((get scenari (:scenario opts)) opts)
workload ((get workloads (:workload opts)) opts) garage-version (get patches (:patch opts))]
scenario ((get scenari (:scenario opts)) (assoc opts :db db))]
(merge tests/noop-test (merge tests/noop-test
opts opts
{:pure-generators true {:pure-generators true
:name (str "garage " (name (:workload opts))) :name (str "garage " (name (:workload opts)))
:os debian/os :os debian/os
:db db :db (grg/db garage-version)
:client (:client workload) :client (:client workload)
:generator (gen/phases :generator (gen/phases
(->> (->>
@ -84,7 +82,7 @@
(gen/clients (:final-generator workload))) (gen/clients (:final-generator workload)))
:nemesis (:nemesis scenario) :nemesis (:nemesis scenario)
:checker (checker/compose :checker (checker/compose
{:perf (checker/perf (:perf scenario)) {:perf (checker/perf)
:workload (:checker workload)}) :workload (:checker workload)})
}))) })))

View file

@ -119,24 +119,6 @@
(c/exec :rm :-rf data-dir) (c/exec :rm :-rf data-dir)
(c/exec :rm :-rf meta-dir))) (c/exec :rm :-rf meta-dir)))
db/Pause
(pause! [_ test node]
(cu/grepkill! :stop binary))
(resume! [_ test node]
(cu/grepkill! :cont binary))
db/Kill
(kill! [_ test node]
(cu/stop-daemon! binary pidfile))
(start! [_ test node]
(cu/start-daemon!
{:logfile logfile
:pidfile pidfile
:chdir base-dir
:env {:RUST_LOG "garage=debug,garage_api=trace"}}
binary
:server))
db/LogFiles db/LogFiles
(log-files [_ test node] (log-files [_ test node]
[logfile]))) [logfile])))

View file

@ -4,7 +4,6 @@
[core :as jepsen] [core :as jepsen]
[generator :as gen] [generator :as gen]
[nemesis :as nemesis]] [nemesis :as nemesis]]
[jepsen.nemesis.combined :as combined]
[jepsen.garage.daemon :as grg] [jepsen.garage.daemon :as grg]
[jepsen.control.util :as cu])) [jepsen.control.util :as cu]))
@ -12,23 +11,21 @@
(defn configure-present! (defn configure-present!
"Configure node to be active in new cluster layout" "Configure node to be active in new cluster layout"
[test nodes] [test node]
(info "configure-present!" nodes) (info "configure-present!" node)
(let [node-ids (c/on-many nodes (c/exec grg/binary :node :id :-q)) (let [node-id (c/on node (c/exec grg/binary :node :id :-q))]
node-id-strs (map (fn [[_ v]] (subs v 0 16)) node-ids)]
(c/on (c/on
(jepsen/primary test) (jepsen/primary test)
(apply c/exec (concat [grg/binary :layout :assign :-c :1G] node-id-strs))))) (c/exec grg/binary :layout :assign (subs node-id 0 16) :-c :1G))))
(defn configure-absent! (defn configure-absent!
"Configure nodes to be active in new cluster layout" "Configure node to be active in new cluster layout"
[test nodes] [test node]
(info "configure-absent!" nodes) (info "configure-absent!" node)
(let [node-ids (c/on-many nodes (c/exec grg/binary :node :id :-q)) (let [node-id (c/on node (c/exec grg/binary :node :id :-q))]
node-id-strs (map (fn [[_ v]] (subs v 0 16)) node-ids)]
(c/on (c/on
(jepsen/primary test) (jepsen/primary test)
(apply c/exec (concat [grg/binary :layout :assign :-g] node-id-strs))))) (c/exec grg/binary :layout :assign (subs node-id 0 16) :-g))))
(defn finalize-config! (defn finalize-config!
"Apply the proposed cluster layout" "Apply the proposed cluster layout"
@ -56,14 +53,14 @@
shuffle shuffle
(split-at cnt))] (split-at cnt))]
(info "layout split: keep " keep-nodes ", remove " remove-nodes) (info "layout split: keep " keep-nodes ", remove " remove-nodes)
(configure-present! test keep-nodes) (run! #(configure-present! test %) keep-nodes)
(configure-absent! test remove-nodes) (run! #(configure-absent! test %) remove-nodes)
(finalize-config! test) (finalize-config! test)
(assoc op :value keep-nodes)) (assoc op :value keep-nodes))
:stop :stop
(do (do
(info "layout un-split: all nodes=" (:nodes test)) (info "layout un-split: all nodes=" (:nodes test))
(configure-present! test (:nodes test)) (run! #(configure-present! test %) (:nodes test))
(finalize-config! test) (finalize-config! test)
(assoc op :value (:nodes test))))) (assoc op :value (:nodes test)))))
@ -76,58 +73,70 @@
[op] [op]
(fn [_ _] {:type :info, :f op})) (fn [_ _] {:type :info, :f op}))
(defn reconfiguration-package (defn scenario-c
"Cluster reconfiguration nemesis package" "Clock scramble scenario"
[opts] [opts]
{:generator (->> {:generator (->>
(gen/mix [(nemesis-op :reconfigure-start) (nemesis-op :clock-scramble)
(nemesis-op :reconfigure-stop)]) (gen/stagger 5))
(gen/stagger (:interval opts 5)))
:final-generator {:type :info, :f :reconfigure-stop}
:nemesis (nemesis/compose :nemesis (nemesis/compose
{{:reconfigure-start :start {{:clock-scramble :scramble} (nemesis/clock-scrambler 20.0)})})
:reconfigure-stop :stop} (reconfigure-subset 3)})
:perf #{{:name "reconfigure"
:start #{:reconfigure-start}
:stop #{:reconfigur-stop}
:color "#A197E9"}}})
(defn scenario-c
"Clock modifying scenario"
[opts]
(combined/clock-package {:db (:db opts), :interval 1, :faults #{:clock}}))
(defn scenario-cp (defn scenario-cp
"Clock modifying + partition scenario" "Clock scramble + partition scenario"
[opts] [opts]
(combined/compose-packages {:generator (->>
[(combined/clock-package {:db (:db opts), :interval 1, :faults #{:clock}}) (gen/mix [(nemesis-op :clock-scramble)
(combined/partition-package {:db (:db opts), :interval 1, :faults #{:partition}})])) (nemesis-op :partition-stop)
(nemesis-op :partition-start)])
(gen/stagger 5))
:final-generator (gen/once {:type :info, :f :partition-stop})
:nemesis (nemesis/compose
{{:clock-scramble :scramble} (nemesis/clock-scrambler 20.0)
{:partition-start :start
:partition-stop :stop} (nemesis/partition-random-halves)})})
(defn scenario-r (defn scenario-r
"Cluster reconfiguration scenario" "Cluster reconfiguration scenario"
[opts] [opts]
(reconfiguration-package {:interval 1})) {:generator (->>
(gen/mix [(nemesis-op :reconfigure-start)
(nemesis-op :reconfigure-stop)])
(gen/stagger 5))
:nemesis (nemesis/compose
{{:reconfigure-start :start
:reconfigure-stop :stop} (reconfigure-subset 3)})})
(defn scenario-pr (defn scenario-pr
"Partition + cluster reconfiguration scenario" "Partition + cluster reconfiguration scenario"
[opts] [opts]
(combined/compose-packages {:generator (->>
[(combined/partition-package {:db (:db opts), :interval 1, :faults #{:partition}}) (gen/mix [(nemesis-op :partition-start)
(reconfiguration-package {:interval 1})])) (nemesis-op :partition-stop)
(nemesis-op :reconfigure-start)
(nemesis-op :reconfigure-stop)])
(gen/stagger 5))
:final-generator (gen/once {:type :info, :f :partition-stop})
:nemesis (nemesis/compose
{{:partition-start :start
:partition-stop :stop} (nemesis/partition-random-halves)
{:reconfigure-start :start
:reconfigure-stop :stop} (reconfigure-subset 3)})})
(defn scenario-cpr (defn scenario-cpr
"Clock scramble + partition + cluster reconfiguration scenario" "Clock scramble + partition + cluster reconfiguration scenario"
[opts] [opts]
(combined/compose-packages {:generator (->>
[(combined/clock-package {:db (:db opts), :interval 1, :faults #{:clock}}) (gen/mix [(nemesis-op :clock-scramble)
(combined/partition-package {:db (:db opts), :interval 1, :faults #{:partition}}) (nemesis-op :partition-start)
(reconfiguration-package {:interval 1})])) (nemesis-op :partition-stop)
(nemesis-op :reconfigure-start)
(defn scenario-dpr (nemesis-op :reconfigure-stop)])
"Db + partition + cluster reconfiguration scenario" (gen/stagger 5))
[opts] :final-generator (gen/once {:type :info, :f :partition-stop})
(combined/compose-packages :nemesis (nemesis/compose
[(combined/db-package {:db (:db opts), :interval 1, :faults #{:db :pause :kill}}) {{:clock-scramble :scramble} (nemesis/clock-scrambler 20.0)
(combined/partition-package {:db (:db opts), :interval 1, :faults #{:partition}}) {:partition-start :start
(reconfiguration-package {:interval 1})])) :partition-stop :stop} (nemesis/partition-random-halves)
{:reconfigure-start :start
:reconfigure-stop :stop} (reconfigure-subset 3)})})

View file

@ -30,7 +30,6 @@
(assoc this :creds (grg/creds node))) (assoc this :creds (grg/creds node)))
(setup! [this test]) (setup! [this test])
(invoke! [this test op] (invoke! [this test op]
(try+
(let [[k v] (:value op)] (let [[k v] (:value op)]
(case (:f op) (case (:f op)
:read :read
@ -45,13 +44,7 @@
(assoc op :type :info, :error ::timeout) (assoc op :type :info, :error ::timeout)
(do (do
(s3/put (:creds this) k v) (s3/put (:creds this) k v)
(assoc op :type :ok))))) (assoc op :type :ok))))))
(catch (re-find #"Unavailable" (.getMessage %)) ex
(assoc op :type :info, :error ::unavailable))
(catch (re-find #"Broken pipe" (.getMessage %)) ex
(assoc op :type :info, :error ::broken-pipe))
(catch (re-find #"Connection refused" (.getMessage %)) ex
(assoc op :type :info, :error ::connection-refused))))
(teardown! [this test]) (teardown! [this test])
(close! [this test])) (close! [this test]))

View file

@ -30,7 +30,6 @@
(assoc this :creds (grg/creds node))) (assoc this :creds (grg/creds node)))
(setup! [this test]) (setup! [this test])
(invoke! [this test op] (invoke! [this test op]
(try+
(let [[k v] (:value op) (let [[k v] (:value op)
prefix (str "set" k "/")] prefix (str "set" k "/")]
(case (:f op) (case (:f op)
@ -51,13 +50,7 @@
(assert (str/starts-with? o prefix)) (assert (str/starts-with? o prefix))
(str/replace-first o prefix "")) items) (str/replace-first o prefix "")) items)
items-set (set (map parse-long items-stripped))] items-set (set (map parse-long items-stripped))]
(assoc op :type :ok, :value (independent/tuple k items-set)))))))) (assoc op :type :ok, :value (independent/tuple k items-set)))))))))
(catch (re-find #"Unavailable" (.getMessage %)) ex
(assoc op :type :info, :error ::unavailable))
(catch (re-find #"Broken pipe" (.getMessage %)) ex
(assoc op :type :info, :error ::broken-pipe))
(catch (re-find #"Connection refused" (.getMessage %)) ex
(assoc op :type :info, :error ::connection-refused))))
(teardown! [this test]) (teardown! [this test])
(close! [this test])) (close! [this test]))
@ -81,7 +74,7 @@
([:invoke :read]) ([:invoke :read])
(assoc-in state [:read-must-contain (:process op)] (:add-done state)) (assoc-in state [:read-must-contain (:process op)] (:add-done state))
([:ok :read]) ([:ok :read])
(let [read-must-contain (get (:read-must-contain state) (:process op)) (let [read-must-contain (get (:process op) (:read-must-contain state))
new-missed (set/difference read-must-contain (:value op)) new-missed (set/difference read-must-contain (:value op))
new-unexpected (set/difference (:value op) (:add-started state))] new-unexpected (set/difference (:value op) (:add-started state))]
(assoc state (assoc state