Jepsen testing (NLnet task 3 subtask 1) #544

Merged
lx merged 41 commits from jepsen into main 2024-01-11 10:52:13 +00:00
3 changed files with 47 additions and 40 deletions
Showing only changes of commit d13bde5e26 - Show all commits

View file

@ -69,9 +69,9 @@ Results with timestamp patch (`--patch tsfix2`):
- No failures with clock-scramble nemesis + partition nemesis (`--scenario cp`). - No failures with clock-scramble nemesis + partition nemesis (`--scenario cp`).
This proves that `tsfix2` (PR#543) does improve consistency. This proves that `tsfix2` (PR#543) does improve consistency.
- **Fails with layout reconfiguration nemesis** (`--scenario r`) - **Fails with layout reconfiguration nemesis** (`--scenario r`).
(TODO: note down the run id of a failed run) Example of a failed run: `garage reg2/20231024T120806.899+0200`.
(TODO: test more and investigate). TODO: investigate.
This is the failure mode we are looking for and trying to fix for NLnet task 3. This is the failure mode we are looking for and trying to fix for NLnet task 3.
@ -83,12 +83,11 @@ Results:
- For now, no failures with clock-scramble nemesis + partition nemesis -> TODO long test run - For now, no failures with clock-scramble nemesis + partition nemesis -> TODO long test run
- Failures were not yet achieved with only the layout reconfiguration nemesis, although they should be. - Does not seem to fail with only the layout reconfiguration nemesis (>20 runs), although theoretically it could
- **Fails with partition + layout reconfiguration nemesis** (`--scenario pr`) - Does not seem to fail with the layout reconfiguration + partition nemesis (<10 runs), although theoretically it could
(TODO: note down the run id of a failed run)
(TODO: test more and investigate). TODO: make it fail!!!
This is the failure mode we are looking for and trying to fix for NLnet task 3.
### Set, continuous test (interspersed reads and writes) ### Set, continuous test (interspersed reads and writes)
@ -99,10 +98,9 @@ Results:
- For now, no failures with clock-scramble nemesis + partition nemesis -> TODO long test run - For now, no failures with clock-scramble nemesis + partition nemesis -> TODO long test run
- Failures were not yet achieved with only the layout reconfiguration nemesis, although they should be. - Does not seem to fail with the clock scrambler + partition + layout reconfiguration nemesis (>10 runs), although theoretically it could
- TODO: failures should be achieved with `--scenario pr`? Even with 4 or 5 consecutive test runs, no failures were achieved, why? TODO: make it fail!!!
(TODO: note down the run id of a failed run)
## Investigating (and fixing) errors ## Investigating (and fixing) errors

View file

@ -26,7 +26,8 @@
{"c" grgNemesis/scenario-c {"c" grgNemesis/scenario-c
"cp" grgNemesis/scenario-cp "cp" grgNemesis/scenario-cp
"r" grgNemesis/scenario-r "r" grgNemesis/scenario-r
"pr" grgNemesis/scenario-pr}) "pr" grgNemesis/scenario-pr
"cpr" grgNemesis/scenario-cpr})
(def patches (def patches
"A map of patch names to Garage builds" "A map of patch names to Garage builds"

View file

@ -76,30 +76,24 @@
(defn scenario-cp (defn scenario-cp
"Clock scramble + partition scenario" "Clock scramble + partition scenario"
[opts] [opts]
{:generator (cycle [(gen/sleep 5) {:generator (->>
{:type :info, :f :partition-start} (gen/mix [{:type :info, :f :clock-scramble}
(gen/sleep 5) {:type :info, :f :partition-stop}
{:type :info, :f :clock-scramble} {:type :info, :f :partition-start}])
(gen/sleep 5) (gen/stagger 3))
{:type :info, :f :partition-stop}
(gen/sleep 5)
{:type :info, :f :clock-scramble}])
:final-generator (gen/once {:type :info, :f :partition-stop}) :final-generator (gen/once {:type :info, :f :partition-stop})
:nemesis (nemesis/compose :nemesis (nemesis/compose
{{:partition-start :start {{:clock-scramble :scramble} (nemesis/clock-scrambler 20.0)
:partition-stop :stop} (nemesis/partition-random-halves) {:partition-start :start
{:clock-scramble :scramble} (nemesis/clock-scrambler 20.0)})}) :partition-stop :stop} (nemesis/partition-random-halves)})})
(defn scenario-r (defn scenario-r
"Cluster reconfiguration scenario" "Cluster reconfiguration scenario"
[opts] [opts]
{:generator (cycle [(gen/sleep 5) {:generator (->>
{:type :info, :f :reconfigure-start} (gen/mix [{:type :info, :f :reconfigure-start}
(gen/sleep 5) {:type :info, :f :reconfigure-stop}])
{:type :info, :f :reconfigure-start} (gen/stagger 3))
(gen/sleep 5)
{:type :info, :f :reconfigure-stop}])
:final-generator (gen/once {:type :info, :f :reconfigure-stop})
:nemesis (nemesis/compose :nemesis (nemesis/compose
{{:reconfigure-start :start {{:reconfigure-start :start
:reconfigure-stop :stop} (reconfigure-subset 3)})}) :reconfigure-stop :stop} (reconfigure-subset 3)})})
@ -107,19 +101,33 @@
(defn scenario-pr (defn scenario-pr
"Partition + cluster reconfiguration scenario" "Partition + cluster reconfiguration scenario"
[opts] [opts]
{:generator (cycle [(gen/sleep 3) {:generator (->>
{:type :info, :f :reconfigure-start} (gen/mix [{:type :info, :f :partition-start}
(gen/sleep 3) {:type :info, :f :partition-stop}
{:type :info, :f :partition-start} {:type :info, :f :reconfigure-start}
(gen/sleep 3) {:type :info, :f :reconfigure-stop}])
{:type :info, :f :reconfigure-start} (gen/stagger 3))
(gen/sleep 3)
{:type :info, :f :partition-stop}
(gen/sleep 3)
{:type :info, :f :reconfigure-stop}])
:final-generator (gen/once {:type :info, :f :partition-stop}) :final-generator (gen/once {:type :info, :f :partition-stop})
:nemesis (nemesis/compose :nemesis (nemesis/compose
{{:partition-start :start {{:partition-start :start
:partition-stop :stop} (nemesis/partition-random-halves) :partition-stop :stop} (nemesis/partition-random-halves)
{:reconfigure-start :start {:reconfigure-start :start
:reconfigure-stop :stop} (reconfigure-subset 3)})}) :reconfigure-stop :stop} (reconfigure-subset 3)})})
(defn scenario-cpr
  "Nemesis scenario combining clock scrambling, network partitions and
  cluster layout reconfiguration.

  Returns a map with :generator, :final-generator and :nemesis keys.
  `opts` is accepted but not used by this scenario."
  [opts]
  (let [info-op      (fn [f] {:type :info, :f f})
        ;; Candidate nemesis operations fed to the mixed generator.
        fault-ops    (mapv info-op [:clock-scramble
                                    :partition-start
                                    :partition-stop
                                    :reconfigure-start
                                    :reconfigure-stop])
        scrambler    (nemesis/clock-scrambler 20.0)
        partitioner  (nemesis/partition-random-halves)
        reconfigurer (reconfigure-subset 3)]
    {:generator       (gen/stagger 3 (gen/mix fault-ops))
     ;; Only the partition is explicitly stopped by the final generator.
     :final-generator (gen/once (info-op :partition-stop))
     ;; Each key map translates this scenario's :f names into the :f names
     ;; passed to the paired nemesis.
     :nemesis         (nemesis/compose
                        {{:clock-scramble :scramble}  scrambler
                         {:partition-start :start
                          :partition-stop  :stop}     partitioner
                         {:reconfigure-start :start
                          :reconfigure-stop  :stop}   reconfigurer})}))