diff --git a/script/jepsen.garage/.gitignore b/script/jepsen.garage/.gitignore index 6eb8c209..31842a96 100644 --- a/script/jepsen.garage/.gitignore +++ b/script/jepsen.garage/.gitignore @@ -13,4 +13,5 @@ pom.xml.asc .hg/ .direnv /store +/store.* .vagrant diff --git a/script/jepsen.garage/README.md b/script/jepsen.garage/README.md index f7479a3d..50c7eb38 100644 --- a/script/jepsen.garage/README.md +++ b/script/jepsen.garage/README.md @@ -7,29 +7,19 @@ Jepsen checking of Garage consistency properties. Requirements: - vagrant -- VirtualBox, configured so that nodes can take an IP in a private network `192.168.56.0/24` +- VirtualBox, configured so that nodes can take an IP in a private network `192.168.56.0/24` (it's the default) - a user that can create VirtualBox VMs - leiningen - gnuplot -Set up VMs: +Set up VMs before running tests: ``` vagrant up ``` -Run tests (this one should fail): +Run tests: see commands below. -``` -lein run test --nodes-file nodes.vagrant --time-limit 64 --concurrency 50 --rate 50 --workload reg -``` - -These ones are working: - -``` -lein run test --nodes-file nodes.vagrant --time-limit 64 --rate 50 --concurrency 50 --workload set1 -lein run test --nodes-file nodes.vagrant --time-limit 64 --rate 50 --concurrency 50 --workload set2 -``` ## Results @@ -73,16 +63,19 @@ Results with timestamp patch (`--patch tsfix2`): Example of a failed run: `garage reg2/20231024T120806.899+0200`. This is the failure mode we are looking for and trying to fix for NLnet task 3. -- Changes brought by NLnet task 3 code (commit 707442f5de): - no failures with `--scenario r` (0 of 10 runs), `--scenario pr` (0 of 10 runs), +Results with NLnet task 3 code (commit 707442f5de, `--patch task3a`): + +- No failures with `--scenario r` (0 of 10 runs), `--scenario pr` (0 of 10 runs), `--scenario cpr` (0 of 10 runs) and `--scenario dpr` (0 of 10 runs). +- Same with `--patch task3c` (commit `0041b013`, the final version). 
+ ### Set, basic test (write some items, then read) -Command: `lein run test --nodes-file nodes.vagrant --time-limit 60 --rate 200 --concurrency 200 --workload set1 --ops-per-key 100 --patch tsfix2` +Command: `lein run test --nodes-file nodes.vagrant --time-limit 60 --rate 200 --concurrency 200 --workload set1 --ops-per-key 100` -Results: +Results without NLnet task3 code (`--patch tsfix2`): - For now, no failures with clock-scramble nemesis + partition nemesis -> TODO long test run @@ -90,15 +83,22 @@ Results: - **Fails with the partition + layout reconfiguration nemesis** (`--scenario pr`). Example of a failed run: `garage set1/20231024T172214.488+0200` (1 failure in 4 runs). - TODO: investigate. This is the failure mode we are looking for and trying to fix for NLnet task 3. +Results with NLnet task 3 code (commit 707442f5de, `--patch task3a`): + +- The tests are buggy and often result in an "unknown" validity status, which + is caused by some requests not returning results during network partitions or + other nemesis-induced broken cluster states. However, when the tests were + able to finish, there were no failures with scenarios `r`, `pr`, `cpr`, + `dpr`. + ### Set, continuous test (interspersed reads and writes) -Command: `lein run test --nodes-file nodes.vagrant --time-limit 60 --rate 100 --concurrency 100 --workload set2 --ops-per-key 100 --patch tsfix2` +Command: `lein run test --nodes-file nodes.vagrant --time-limit 60 --rate 100 --concurrency 100 --workload set2 --ops-per-key 100` -Results: +Results without NLnet task3 code (`--patch tsfix2`): - No failures with clock-scramble nemesis + db nemesis + partition nemesis (`--scenario cdp`) (0 failures in 10 runs). @@ -106,17 +106,26 @@ Results: Example of a failed run: `garage set2/20231025T141940.198+0200` (10 failures in 10 runs). This is the failure mode we are looking for and trying to fix for NLnet task 3. 
-- Changes brought by NLnet task 3 code (commit 707442f5de): - no failures with `--scenario r` (0 of 10 runs), `--scenario pr` (0 of 10 runs). +Results with NLnet task3 code (commit 707442f5de, `--patch task3a`): + +- No failures with `--scenario r` (0 of 10 runs), `--scenario pr` (0 of 10 runs), `--scenario cpr` (0 of 10 runs) and `--scenario dpr` (0 of 10 runs). +- Same with `--patch task3c` (commit `0041b013`, the final version). + + +## NLnet task 3 final results + +- With code from task3 (`--patch task3c`): [reg2 and set2](results/Results-2023-12-13-task3c.png), [set1](results/Results-2023-12-14-task3-set1.png). +- Without (`--patch tsfix2`): [reg2 and set2](results/Results-2023-12-13-tsfix2.png), set1 TBD. ## Investigating (and fixing) errors ### Segfaults They are due to the download being interrupted in the middle (^C during first launch on clean VMs), the `garage` binary is truncated. -Add `:force?` to the `cached-wget!` call in `daemon.clj` to re-download the binary. +Add `:force?` to the `cached-wget!` call in `daemon.clj` to re-download the binary, +or restart the VMs to clear temporary files. 
### In `jepsen.garage`: prefix wierdness diff --git a/script/jepsen.garage/Vagrantfile b/script/jepsen.garage/Vagrantfile index 4d02397d..b54c2426 100644 --- a/script/jepsen.garage/Vagrantfile +++ b/script/jepsen.garage/Vagrantfile @@ -29,4 +29,12 @@ Vagrant.configure("2") do |config| config.vm.define "n5" do |config| vm(config, "n5", "192.168.56.25") end config.vm.define "n6" do |config| vm(config, "n6", "192.168.56.26") end config.vm.define "n7" do |config| vm(config, "n7", "192.168.56.27") end + + config.vm.define "n8" do |config| vm(config, "n8", "192.168.56.28") end + config.vm.define "n9" do |config| vm(config, "n9", "192.168.56.29") end + config.vm.define "n10" do |config| vm(config, "n10", "192.168.56.30") end + config.vm.define "n11" do |config| vm(config, "n11", "192.168.56.31") end + config.vm.define "n12" do |config| vm(config, "n12", "192.168.56.32") end + config.vm.define "n13" do |config| vm(config, "n13", "192.168.56.33") end + config.vm.define "n14" do |config| vm(config, "n14", "192.168.56.34") end end diff --git a/script/jepsen.garage/all_tests_1.sh b/script/jepsen.garage/all_tests_1.sh new file mode 100755 index 00000000..b5397d13 --- /dev/null +++ b/script/jepsen.garage/all_tests_1.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +set -x + +#for ppatch in task3c task3a tsfix2; do +for ppatch in tsfix2; do + #for psc in c cp cdp r pr cpr dpr; do + for psc in cdp r pr cpr dpr; do + #for ptsk in reg2 set1 set2; do + for ptsk in set1; do + for irun in $(seq 10); do + lein run test --nodes-file nodes.vagrant \ + --time-limit 60 --rate 100 --concurrency 100 --ops-per-key 100 \ + --workload $ptsk --patch $ppatch --scenario $psc + done + done + done +done diff --git a/script/jepsen.garage/all_tests_2.sh b/script/jepsen.garage/all_tests_2.sh new file mode 100755 index 00000000..641643ed --- /dev/null +++ b/script/jepsen.garage/all_tests_2.sh @@ -0,0 +1,16 @@ +#!/usr/bin/env bash + +set -x + +#for ppatch in task3c tsfix2; do +for ppatch in tsfix2; do + for psc 
in cdp r pr cpr dpr; do + for ptsk in set1; do + for irun in $(seq 10); do + lein run test --nodes-file nodes2.vagrant \ + --time-limit 60 --rate 100 --concurrency 100 --ops-per-key 100 \ + --workload $ptsk --patch $ppatch --scenario $psc + done + done + done +done diff --git a/script/jepsen.garage/nodes2.vagrant b/script/jepsen.garage/nodes2.vagrant new file mode 100644 index 00000000..842bf276 --- /dev/null +++ b/script/jepsen.garage/nodes2.vagrant @@ -0,0 +1,7 @@ +192.168.56.28 +192.168.56.29 +192.168.56.30 +192.168.56.31 +192.168.56.32 +192.168.56.33 +192.168.56.34 diff --git a/script/jepsen.garage/results/Results-2023-11-16.png b/script/jepsen.garage/results/Results-2023-11-16.png new file mode 100644 index 00000000..26dac833 Binary files /dev/null and b/script/jepsen.garage/results/Results-2023-11-16.png differ diff --git a/script/jepsen.garage/results/Results-2023-12-13-task3c.png b/script/jepsen.garage/results/Results-2023-12-13-task3c.png new file mode 100644 index 00000000..216043c3 Binary files /dev/null and b/script/jepsen.garage/results/Results-2023-12-13-task3c.png differ diff --git a/script/jepsen.garage/results/Results-2023-12-13-tsfix2.png b/script/jepsen.garage/results/Results-2023-12-13-tsfix2.png new file mode 100644 index 00000000..147d25e9 Binary files /dev/null and b/script/jepsen.garage/results/Results-2023-12-13-tsfix2.png differ diff --git a/script/jepsen.garage/results/Results-2023-12-14-task3-set1.png b/script/jepsen.garage/results/Results-2023-12-14-task3-set1.png new file mode 100644 index 00000000..dbff3a95 Binary files /dev/null and b/script/jepsen.garage/results/Results-2023-12-14-task3-set1.png differ diff --git a/script/jepsen.garage/src/jepsen/garage.clj b/script/jepsen.garage/src/jepsen/garage.clj index 174e8df0..446b81de 100644 --- a/script/jepsen.garage/src/jepsen/garage.clj +++ b/script/jepsen.garage/src/jepsen/garage.clj @@ -36,7 +36,9 @@ {"default" "v0.9.0" "tsfix1" "d146cdd5b66ca1d3ed65ce93ca42c6db22defc09" "tsfix2" 
"c82d91c6bccf307186332b6c5c6fc0b128b1b2b1" - "task3a" "707442f5de416fdbed4681a33b739f0a787b7834"}) + "task3a" "707442f5de416fdbed4681a33b739f0a787b7834" + "task3b" "431b28e0cfdc9cac6c649193cf602108a8b02997" + "task3c" "0041b013a473e3ae72f50209d8f79db75a72848b"}) (def cli-opts "Additional command line options." @@ -69,7 +71,7 @@ (merge tests/noop-test opts {:pure-generators true - :name (str "garage " (name (:workload opts)) " " (name (:scenario opts)) " " (name (:patch opts))) + :name (str "garage-" (name (:patch opts)) " " (name (:workload opts)) " " (name (:scenario opts))) :os debian/os :db db :client (:client workload) @@ -83,7 +85,9 @@ (gen/nemesis (:final-generator scenario)) (gen/log "Waiting for recovery") (gen/sleep 10) - (gen/clients (:final-generator workload))) + (gen/log "Running final generator") + (gen/clients (:final-generator workload)) + (gen/log "Generators all done")) :nemesis (:nemesis scenario) :checker (checker/compose {:perf (checker/perf (:perf scenario)) diff --git a/script/jepsen.garage/src/jepsen/garage/set.clj b/script/jepsen.garage/src/jepsen/garage/set.clj index a73b8efc..2c7a2ccd 100644 --- a/script/jepsen.garage/src/jepsen/garage/set.clj +++ b/script/jepsen.garage/src/jepsen/garage/set.clj @@ -108,11 +108,13 @@ (->> (range) (map (fn [x] {:type :invoke, :f :add, :value x})) (gen/limit (:ops-per-key opts))))) - :final-generator (gen/phases - (independent/sequential-generator - (range 100) - (fn [k] (gen/once op-read))) - (gen/sleep 5))}) + :final-generator (independent/concurrent-generator + 10 + (range 100) + (fn [k] + (gen/phases + (gen/once op-read) + (gen/sleep 5))))}) (defn workload2 "Tests insertions and deletions"