Compare commits
No commits in common. "fd85010a403775bbb18030ae2d9d3689b34f3e8a" and "4fa2646a75ed9b4823bf36ae6218a18cca11c471" have entirely different histories.
fd85010a40
...
4fa2646a75
6 changed files with 112 additions and 138 deletions
|
@ -86,7 +86,7 @@ Results:
|
||||||
- Does not seem to fail with only the layout reconfiguration nemesis (<10 runs), although theoretically it could
|
- Does not seem to fail with only the layout reconfiguration nemesis (<10 runs), although theoretically it could
|
||||||
|
|
||||||
- **Fails with the partition + layout reconfiguration nemesis** (`--scenario pr`).
|
- **Fails with the partition + layout reconfiguration nemesis** (`--scenario pr`).
|
||||||
Example of a failed run: `garage set1/20231024T172214.488+0200` (1 failure in 4 runs).
|
Example of a failed run: `garage set1/20231024T172214.488+0200` (1 failure in 4 runs).
|
||||||
|
|
||||||
|
|
||||||
### Set, continuous test (interspersed reads and writes)
|
### Set, continuous test (interspersed reads and writes)
|
||||||
|
@ -97,10 +97,9 @@ Results:
|
||||||
|
|
||||||
- For now, no failures with clock-scramble nemesis + partition nemesis -> TODO long test run
|
- For now, no failures with clock-scramble nemesis + partition nemesis -> TODO long test run
|
||||||
|
|
||||||
- **Fails with layout reconfiguration nemesis** (`--scenario r`).
|
- Does not seem to fail with the clock scrambler + partition + layout reconfiguration nemesis (>10 runs), although theoretically it could
|
||||||
Example of a failed run: `garage set2/20231025T115033.553+0200` (2 failures in 2 runs).
|
|
||||||
TODO: investigate.
|
TODO: make it fail!!!
|
||||||
This is the failure mode we are looking for and trying to fix for NLnet task 3.
|
|
||||||
|
|
||||||
|
|
||||||
## Investigating (and fixing) errors
|
## Investigating (and fixing) errors
|
||||||
|
|
|
@ -27,8 +27,7 @@
|
||||||
"cp" grgNemesis/scenario-cp
|
"cp" grgNemesis/scenario-cp
|
||||||
"r" grgNemesis/scenario-r
|
"r" grgNemesis/scenario-r
|
||||||
"pr" grgNemesis/scenario-pr
|
"pr" grgNemesis/scenario-pr
|
||||||
"cpr" grgNemesis/scenario-cpr
|
"cpr" grgNemesis/scenario-cpr})
|
||||||
"dpr" grgNemesis/scenario-dpr})
|
|
||||||
|
|
||||||
(def patches
|
(def patches
|
||||||
"A map of patch names to Garage builds"
|
"A map of patch names to Garage builds"
|
||||||
|
@ -60,16 +59,15 @@
|
||||||
"Given an options map from the command line runner (e.g. :nodes, :ssh,
|
"Given an options map from the command line runner (e.g. :nodes, :ssh,
|
||||||
:concurrency, ...), constructs a test map."
|
:concurrency, ...), constructs a test map."
|
||||||
[opts]
|
[opts]
|
||||||
(let [garage-version (get patches (:patch opts))
|
(let [workload ((get workloads (:workload opts)) opts)
|
||||||
db (grg/db garage-version)
|
scenario ((get scenari (:scenario opts)) opts)
|
||||||
workload ((get workloads (:workload opts)) opts)
|
garage-version (get patches (:patch opts))]
|
||||||
scenario ((get scenari (:scenario opts)) (assoc opts :db db))]
|
|
||||||
(merge tests/noop-test
|
(merge tests/noop-test
|
||||||
opts
|
opts
|
||||||
{:pure-generators true
|
{:pure-generators true
|
||||||
:name (str "garage " (name (:workload opts)))
|
:name (str "garage " (name (:workload opts)))
|
||||||
:os debian/os
|
:os debian/os
|
||||||
:db db
|
:db (grg/db garage-version)
|
||||||
:client (:client workload)
|
:client (:client workload)
|
||||||
:generator (gen/phases
|
:generator (gen/phases
|
||||||
(->>
|
(->>
|
||||||
|
@ -84,7 +82,7 @@
|
||||||
(gen/clients (:final-generator workload)))
|
(gen/clients (:final-generator workload)))
|
||||||
:nemesis (:nemesis scenario)
|
:nemesis (:nemesis scenario)
|
||||||
:checker (checker/compose
|
:checker (checker/compose
|
||||||
{:perf (checker/perf (:perf scenario))
|
{:perf (checker/perf)
|
||||||
:workload (:checker workload)})
|
:workload (:checker workload)})
|
||||||
})))
|
})))
|
||||||
|
|
||||||
|
|
|
@ -119,24 +119,6 @@
|
||||||
(c/exec :rm :-rf data-dir)
|
(c/exec :rm :-rf data-dir)
|
||||||
(c/exec :rm :-rf meta-dir)))
|
(c/exec :rm :-rf meta-dir)))
|
||||||
|
|
||||||
db/Pause
|
|
||||||
(pause! [_ test node]
|
|
||||||
(cu/grepkill! :stop binary))
|
|
||||||
(resume! [_ test node]
|
|
||||||
(cu/grepkill! :cont binary))
|
|
||||||
|
|
||||||
db/Kill
|
|
||||||
(kill! [_ test node]
|
|
||||||
(cu/stop-daemon! binary pidfile))
|
|
||||||
(start! [_ test node]
|
|
||||||
(cu/start-daemon!
|
|
||||||
{:logfile logfile
|
|
||||||
:pidfile pidfile
|
|
||||||
:chdir base-dir
|
|
||||||
:env {:RUST_LOG "garage=debug,garage_api=trace"}}
|
|
||||||
binary
|
|
||||||
:server))
|
|
||||||
|
|
||||||
db/LogFiles
|
db/LogFiles
|
||||||
(log-files [_ test node]
|
(log-files [_ test node]
|
||||||
[logfile])))
|
[logfile])))
|
||||||
|
|
|
@ -4,7 +4,6 @@
|
||||||
[core :as jepsen]
|
[core :as jepsen]
|
||||||
[generator :as gen]
|
[generator :as gen]
|
||||||
[nemesis :as nemesis]]
|
[nemesis :as nemesis]]
|
||||||
[jepsen.nemesis.combined :as combined]
|
|
||||||
[jepsen.garage.daemon :as grg]
|
[jepsen.garage.daemon :as grg]
|
||||||
[jepsen.control.util :as cu]))
|
[jepsen.control.util :as cu]))
|
||||||
|
|
||||||
|
@ -12,23 +11,21 @@
|
||||||
|
|
||||||
(defn configure-present!
|
(defn configure-present!
|
||||||
"Configure node to be active in new cluster layout"
|
"Configure node to be active in new cluster layout"
|
||||||
[test nodes]
|
[test node]
|
||||||
(info "configure-present!" nodes)
|
(info "configure-present!" node)
|
||||||
(let [node-ids (c/on-many nodes (c/exec grg/binary :node :id :-q))
|
(let [node-id (c/on node (c/exec grg/binary :node :id :-q))]
|
||||||
node-id-strs (map (fn [[_ v]] (subs v 0 16)) node-ids)]
|
(c/on
|
||||||
(c/on
|
(jepsen/primary test)
|
||||||
(jepsen/primary test)
|
(c/exec grg/binary :layout :assign (subs node-id 0 16) :-c :1G))))
|
||||||
(apply c/exec (concat [grg/binary :layout :assign :-c :1G] node-id-strs)))))
|
|
||||||
|
|
||||||
(defn configure-absent!
|
(defn configure-absent!
|
||||||
"Configure nodes to be active in new cluster layout"
|
"Configure node to be active in new cluster layout"
|
||||||
[test nodes]
|
[test node]
|
||||||
(info "configure-absent!" nodes)
|
(info "configure-absent!" node)
|
||||||
(let [node-ids (c/on-many nodes (c/exec grg/binary :node :id :-q))
|
(let [node-id (c/on node (c/exec grg/binary :node :id :-q))]
|
||||||
node-id-strs (map (fn [[_ v]] (subs v 0 16)) node-ids)]
|
(c/on
|
||||||
(c/on
|
(jepsen/primary test)
|
||||||
(jepsen/primary test)
|
(c/exec grg/binary :layout :assign (subs node-id 0 16) :-g))))
|
||||||
(apply c/exec (concat [grg/binary :layout :assign :-g] node-id-strs)))))
|
|
||||||
|
|
||||||
(defn finalize-config!
|
(defn finalize-config!
|
||||||
"Apply the proposed cluster layout"
|
"Apply the proposed cluster layout"
|
||||||
|
@ -56,14 +53,14 @@
|
||||||
shuffle
|
shuffle
|
||||||
(split-at cnt))]
|
(split-at cnt))]
|
||||||
(info "layout split: keep " keep-nodes ", remove " remove-nodes)
|
(info "layout split: keep " keep-nodes ", remove " remove-nodes)
|
||||||
(configure-present! test keep-nodes)
|
(run! #(configure-present! test %) keep-nodes)
|
||||||
(configure-absent! test remove-nodes)
|
(run! #(configure-absent! test %) remove-nodes)
|
||||||
(finalize-config! test)
|
(finalize-config! test)
|
||||||
(assoc op :value keep-nodes))
|
(assoc op :value keep-nodes))
|
||||||
:stop
|
:stop
|
||||||
(do
|
(do
|
||||||
(info "layout un-split: all nodes=" (:nodes test))
|
(info "layout un-split: all nodes=" (:nodes test))
|
||||||
(configure-present! test (:nodes test))
|
(run! #(configure-present! test %) (:nodes test))
|
||||||
(finalize-config! test)
|
(finalize-config! test)
|
||||||
(assoc op :value (:nodes test)))))
|
(assoc op :value (:nodes test)))))
|
||||||
|
|
||||||
|
@ -76,58 +73,70 @@
|
||||||
[op]
|
[op]
|
||||||
(fn [_ _] {:type :info, :f op}))
|
(fn [_ _] {:type :info, :f op}))
|
||||||
|
|
||||||
(defn reconfiguration-package
|
(defn scenario-c
|
||||||
"Cluster reconfiguration nemesis package"
|
"Clock scramble scenario"
|
||||||
[opts]
|
[opts]
|
||||||
{:generator (->>
|
{:generator (->>
|
||||||
(gen/mix [(nemesis-op :reconfigure-start)
|
(nemesis-op :clock-scramble)
|
||||||
(nemesis-op :reconfigure-stop)])
|
(gen/stagger 5))
|
||||||
(gen/stagger (:interval opts 5)))
|
|
||||||
:final-generator {:type :info, :f :reconfigure-stop}
|
|
||||||
:nemesis (nemesis/compose
|
:nemesis (nemesis/compose
|
||||||
{{:reconfigure-start :start
|
{{:clock-scramble :scramble} (nemesis/clock-scrambler 20.0)})})
|
||||||
:reconfigure-stop :stop} (reconfigure-subset 3)})
|
|
||||||
:perf #{{:name "reconfigure"
|
|
||||||
:start #{:reconfigure-start}
|
|
||||||
:stop #{:reconfigur-stop}
|
|
||||||
:color "#A197E9"}}})
|
|
||||||
|
|
||||||
(defn scenario-c
|
|
||||||
"Clock modifying scenario"
|
|
||||||
[opts]
|
|
||||||
(combined/clock-package {:db (:db opts), :interval 1, :faults #{:clock}}))
|
|
||||||
|
|
||||||
(defn scenario-cp
|
(defn scenario-cp
|
||||||
"Clock modifying + partition scenario"
|
"Clock scramble + partition scenario"
|
||||||
[opts]
|
[opts]
|
||||||
(combined/compose-packages
|
{:generator (->>
|
||||||
[(combined/clock-package {:db (:db opts), :interval 1, :faults #{:clock}})
|
(gen/mix [(nemesis-op :clock-scramble)
|
||||||
(combined/partition-package {:db (:db opts), :interval 1, :faults #{:partition}})]))
|
(nemesis-op :partition-stop)
|
||||||
|
(nemesis-op :partition-start)])
|
||||||
|
(gen/stagger 5))
|
||||||
|
:final-generator (gen/once {:type :info, :f :partition-stop})
|
||||||
|
:nemesis (nemesis/compose
|
||||||
|
{{:clock-scramble :scramble} (nemesis/clock-scrambler 20.0)
|
||||||
|
{:partition-start :start
|
||||||
|
:partition-stop :stop} (nemesis/partition-random-halves)})})
|
||||||
|
|
||||||
(defn scenario-r
|
(defn scenario-r
|
||||||
"Cluster reconfiguration scenario"
|
"Cluster reconfiguration scenario"
|
||||||
[opts]
|
[opts]
|
||||||
(reconfiguration-package {:interval 1}))
|
{:generator (->>
|
||||||
|
(gen/mix [(nemesis-op :reconfigure-start)
|
||||||
|
(nemesis-op :reconfigure-stop)])
|
||||||
|
(gen/stagger 5))
|
||||||
|
:nemesis (nemesis/compose
|
||||||
|
{{:reconfigure-start :start
|
||||||
|
:reconfigure-stop :stop} (reconfigure-subset 3)})})
|
||||||
|
|
||||||
(defn scenario-pr
|
(defn scenario-pr
|
||||||
"Partition + cluster reconfiguration scenario"
|
"Partition + cluster reconfiguration scenario"
|
||||||
[opts]
|
[opts]
|
||||||
(combined/compose-packages
|
{:generator (->>
|
||||||
[(combined/partition-package {:db (:db opts), :interval 1, :faults #{:partition}})
|
(gen/mix [(nemesis-op :partition-start)
|
||||||
(reconfiguration-package {:interval 1})]))
|
(nemesis-op :partition-stop)
|
||||||
|
(nemesis-op :reconfigure-start)
|
||||||
|
(nemesis-op :reconfigure-stop)])
|
||||||
|
(gen/stagger 5))
|
||||||
|
:final-generator (gen/once {:type :info, :f :partition-stop})
|
||||||
|
:nemesis (nemesis/compose
|
||||||
|
{{:partition-start :start
|
||||||
|
:partition-stop :stop} (nemesis/partition-random-halves)
|
||||||
|
{:reconfigure-start :start
|
||||||
|
:reconfigure-stop :stop} (reconfigure-subset 3)})})
|
||||||
|
|
||||||
(defn scenario-cpr
|
(defn scenario-cpr
|
||||||
"Clock scramble + partition + cluster reconfiguration scenario"
|
"Clock scramble + partition + cluster reconfiguration scenario"
|
||||||
[opts]
|
[opts]
|
||||||
(combined/compose-packages
|
{:generator (->>
|
||||||
[(combined/clock-package {:db (:db opts), :interval 1, :faults #{:clock}})
|
(gen/mix [(nemesis-op :clock-scramble)
|
||||||
(combined/partition-package {:db (:db opts), :interval 1, :faults #{:partition}})
|
(nemesis-op :partition-start)
|
||||||
(reconfiguration-package {:interval 1})]))
|
(nemesis-op :partition-stop)
|
||||||
|
(nemesis-op :reconfigure-start)
|
||||||
(defn scenario-dpr
|
(nemesis-op :reconfigure-stop)])
|
||||||
"Db + partition + cluster reconfiguration scenario"
|
(gen/stagger 5))
|
||||||
[opts]
|
:final-generator (gen/once {:type :info, :f :partition-stop})
|
||||||
(combined/compose-packages
|
:nemesis (nemesis/compose
|
||||||
[(combined/db-package {:db (:db opts), :interval 1, :faults #{:db :pause :kill}})
|
{{:clock-scramble :scramble} (nemesis/clock-scrambler 20.0)
|
||||||
(combined/partition-package {:db (:db opts), :interval 1, :faults #{:partition}})
|
{:partition-start :start
|
||||||
(reconfiguration-package {:interval 1})]))
|
:partition-stop :stop} (nemesis/partition-random-halves)
|
||||||
|
{:reconfigure-start :start
|
||||||
|
:reconfigure-stop :stop} (reconfigure-subset 3)})})
|
||||||
|
|
|
@ -30,28 +30,21 @@
|
||||||
(assoc this :creds (grg/creds node)))
|
(assoc this :creds (grg/creds node)))
|
||||||
(setup! [this test])
|
(setup! [this test])
|
||||||
(invoke! [this test op]
|
(invoke! [this test op]
|
||||||
(try+
|
(let [[k v] (:value op)]
|
||||||
(let [[k v] (:value op)]
|
(case (:f op)
|
||||||
(case (:f op)
|
:read
|
||||||
:read
|
(util/timeout
|
||||||
(util/timeout
|
10000
|
||||||
10000
|
(assoc op :type :fail, :error ::timeout)
|
||||||
(assoc op :type :fail, :error ::timeout)
|
(let [value (s3/get (:creds this) k)]
|
||||||
(let [value (s3/get (:creds this) k)]
|
(assoc op :type :ok, :value (independent/tuple k value))))
|
||||||
(assoc op :type :ok, :value (independent/tuple k value))))
|
:write
|
||||||
:write
|
(util/timeout
|
||||||
(util/timeout
|
10000
|
||||||
10000
|
(assoc op :type :info, :error ::timeout)
|
||||||
(assoc op :type :info, :error ::timeout)
|
(do
|
||||||
(do
|
(s3/put (:creds this) k v)
|
||||||
(s3/put (:creds this) k v)
|
(assoc op :type :ok))))))
|
||||||
(assoc op :type :ok)))))
|
|
||||||
(catch (re-find #"Unavailable" (.getMessage %)) ex
|
|
||||||
(assoc op :type :info, :error ::unavailable))
|
|
||||||
(catch (re-find #"Broken pipe" (.getMessage %)) ex
|
|
||||||
(assoc op :type :info, :error ::broken-pipe))
|
|
||||||
(catch (re-find #"Connection refused" (.getMessage %)) ex
|
|
||||||
(assoc op :type :info, :error ::connection-refused))))
|
|
||||||
(teardown! [this test])
|
(teardown! [this test])
|
||||||
(close! [this test]))
|
(close! [this test]))
|
||||||
|
|
||||||
|
|
|
@ -30,34 +30,27 @@
|
||||||
(assoc this :creds (grg/creds node)))
|
(assoc this :creds (grg/creds node)))
|
||||||
(setup! [this test])
|
(setup! [this test])
|
||||||
(invoke! [this test op]
|
(invoke! [this test op]
|
||||||
(try+
|
(let [[k v] (:value op)
|
||||||
(let [[k v] (:value op)
|
prefix (str "set" k "/")]
|
||||||
prefix (str "set" k "/")]
|
(case (:f op)
|
||||||
(case (:f op)
|
:add
|
||||||
:add
|
(util/timeout
|
||||||
(util/timeout
|
10000
|
||||||
10000
|
(assoc op :type :info, :error ::timeout)
|
||||||
(assoc op :type :info, :error ::timeout)
|
(do
|
||||||
(do
|
(s3/put (:creds this) (str prefix v) "present")
|
||||||
(s3/put (:creds this) (str prefix v) "present")
|
(assoc op :type :ok)))
|
||||||
(assoc op :type :ok)))
|
:read
|
||||||
:read
|
(util/timeout
|
||||||
(util/timeout
|
10000
|
||||||
10000
|
(assoc op :type :fail, :error ::timeout)
|
||||||
(assoc op :type :fail, :error ::timeout)
|
(do
|
||||||
(do
|
(let [items (s3/list (:creds this) prefix)]
|
||||||
(let [items (s3/list (:creds this) prefix)]
|
(let [items-stripped (map (fn [o]
|
||||||
(let [items-stripped (map (fn [o]
|
(assert (str/starts-with? o prefix))
|
||||||
(assert (str/starts-with? o prefix))
|
(str/replace-first o prefix "")) items)
|
||||||
(str/replace-first o prefix "")) items)
|
items-set (set (map parse-long items-stripped))]
|
||||||
items-set (set (map parse-long items-stripped))]
|
(assoc op :type :ok, :value (independent/tuple k items-set)))))))))
|
||||||
(assoc op :type :ok, :value (independent/tuple k items-set))))))))
|
|
||||||
(catch (re-find #"Unavailable" (.getMessage %)) ex
|
|
||||||
(assoc op :type :info, :error ::unavailable))
|
|
||||||
(catch (re-find #"Broken pipe" (.getMessage %)) ex
|
|
||||||
(assoc op :type :info, :error ::broken-pipe))
|
|
||||||
(catch (re-find #"Connection refused" (.getMessage %)) ex
|
|
||||||
(assoc op :type :info, :error ::connection-refused))))
|
|
||||||
(teardown! [this test])
|
(teardown! [this test])
|
||||||
(close! [this test]))
|
(close! [this test]))
|
||||||
|
|
||||||
|
@ -81,7 +74,7 @@
|
||||||
([:invoke :read])
|
([:invoke :read])
|
||||||
(assoc-in state [:read-must-contain (:process op)] (:add-done state))
|
(assoc-in state [:read-must-contain (:process op)] (:add-done state))
|
||||||
([:ok :read])
|
([:ok :read])
|
||||||
(let [read-must-contain (get (:read-must-contain state) (:process op))
|
(let [read-must-contain (get (:process op) (:read-must-contain state))
|
||||||
new-missed (set/difference read-must-contain (:value op))
|
new-missed (set/difference read-must-contain (:value op))
|
||||||
new-unexpected (set/difference (:value op) (:add-started state))]
|
new-unexpected (set/difference (:value op) (:add-started state))]
|
||||||
(assoc state
|
(assoc state
|
||||||
|
|
Loading…
Add table
Reference in a new issue