Improvements to garbage collector (fix #39) #135

Merged
lx merged 4 commits from cleanups into main 2021-11-09 13:03:35 +00:00
3 changed files with 76 additions and 44 deletions
Showing only changes of commit ad7ab31411

View File

@@ -1,3 +1,4 @@
use std::convert::TryInto;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::time::Duration;
@@ -422,7 +423,7 @@ impl BlockManager {
async fn resync_iter(&self, must_exit: &mut watch::Receiver<bool>) -> Result<bool, Error> {
if let Some((time_bytes, hash_bytes)) = self.resync_queue.pop_min()? {
let time_msec = u64_from_be_bytes(&time_bytes[0..8]);
let time_msec = u64::from_be_bytes(time_bytes[0..8].try_into().unwrap());
let now = now_msec();
if now >= time_msec {
let hash = Hash::try_from(&hash_bytes[..]).unwrap();
@@ -705,13 +706,6 @@ impl BlockManagerLocked {
}
}
fn u64_from_be_bytes<T: AsRef<[u8]>>(bytes: T) -> u64 {
assert!(bytes.as_ref().len() == 8);
let mut x8 = [0u8; 8];
x8.copy_from_slice(bytes.as_ref());
u64::from_be_bytes(x8)
}
/// Describes the state of the reference counter for a block
#[derive(Clone, Copy, Debug)]
enum RcEntry {
Review

Just me nitpicking, but we created this enum because we want a bit more than a Reference Counter, something like a Reference Counter with Delay. We could rename it to `RcDelayEntry` or `RcDelay`. But feel free to keep this name too if you are more comfortable with the current name :)
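For what it's worth, here is a purely hypothetical sketch of what a "reference counter with delay" entry could look like; the variant and field names are invented for illustration and are not taken from this PR.

```rust
use std::time::Duration;

// Hypothetical sketch only: an RC entry that is not removed the moment its
// count drops to zero, but instead becomes deletable after a grace delay.
#[derive(Clone, Copy, Debug)]
enum RcDelayEntry {
    /// The block is still referenced `count` times.
    Present { count: u64 },
    /// The count dropped to zero at `since_msec`; the block may only be
    /// garbage-collected once the grace delay has elapsed.
    Deletable { since_msec: u64 },
}

impl RcDelayEntry {
    fn can_delete(&self, now_msec: u64, delay: Duration) -> bool {
        match *self {
            RcDelayEntry::Present { .. } => false,
            RcDelayEntry::Deletable { since_msec } => {
                now_msec >= since_msec + delay.as_millis() as u64
            }
        }
    }
}
```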

View File

@@ -55,7 +55,7 @@ where
.expect("Unable to open DB Merkle TODO tree");
let gc_todo = db
.open_tree(&format!("{}:gc_todo", name))
.open_tree(&format!("{}:gc_todo_v2", name))
Review

Here, do we create a new db/table/key format because the previous one had a different format that is now incompatible? Does it mean that we will have a "dead" gc_todo table on already deployed nodes? I do not see that as a blocking problem either but it is good to know.
Review

Yes it's a dead table. If the migration is executed properly, i.e. Garage is read-only for enough time for the GC in the old version to process everything, `gc_todo` should be empty.
.expect("Unable to open DB tree");
Arc::new(Self {
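As an aside on the "dead table" question above: a minimal sketch, not part of this PR, of how a later migration step could drop the stale tree, assuming direct access to the `sled::Db` handle.

```rust
// Hypothetical cleanup step (not in this PR): drop the now-unused
// "<name>:gc_todo" tree once the old GC has drained it, so it does not
// linger forever on already-deployed nodes.
fn drop_stale_gc_todo(db: &sled::Db, name: &str) -> sled::Result<()> {
    // drop_tree returns Ok(true) if the tree existed and was removed.
    if db.drop_tree(format!("{}:gc_todo", name))? {
        eprintln!("dropped stale tree {}:gc_todo", name);
    }
    Ok(())
}
```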

View File

@@ -1,4 +1,5 @@
use std::collections::HashMap;
use std::convert::TryInto;
use std::sync::Arc;
use std::time::Duration;
@@ -13,6 +14,7 @@ use tokio::sync::watch;
use garage_util::data::*;
use garage_util::error::*;
use garage_util::time::*;
use garage_rpc::system::System;
use garage_rpc::*;
@@ -24,6 +26,11 @@ use crate::schema::*;
const TABLE_GC_BATCH_SIZE: usize = 1024;
const TABLE_GC_RPC_TIMEOUT: Duration = Duration::from_secs(30);
// GC delay for table entries: 1 day (24 hours)
// (the delay between the moment the entry is added to the GC todo list
// and the moment the garbage collection actually happens)
const TABLE_GC_DELAY: Duration = Duration::from_secs(24 * 3600);
pub(crate) struct TableGc<F: TableSchema + 'static, R: TableReplication + 'static> {
system: Arc<System>,
data: Arc<TableData<F, R>>,
@@ -72,35 +79,49 @@ where
async fn gc_loop(self: Arc<Self>, mut must_exit: watch::Receiver<bool>) {
while !*must_exit.borrow() {
match self.gc_loop_iter().await {
Ok(true) => {
Ok(None) => {
// Stuff was done, loop immediately
continue;
}
Ok(false) => {
// Nothing was done, sleep for some time (below)
Ok(Some(wait_delay)) => {
// Nothing was done, wait specified delay.
select! {
_ = tokio::time::sleep(wait_delay).fuse() => {},
_ = must_exit.changed().fuse() => {},
}
}
Err(e) => {
warn!("({}) Error doing GC: {}", self.data.name, e);
}
}
select! {
_ = tokio::time::sleep(Duration::from_secs(10)).fuse() => {},
_ = must_exit.changed().fuse() => {},
}
}
}
async fn gc_loop_iter(&self) -> Result<bool, Error> {
async fn gc_loop_iter(&self) -> Result<Option<Duration>, Error> {
let now = now_msec();
let mut entries = vec![];
let mut excluded = vec![];
// List entries in the GC todo list
// These entries are put there when a tombstone is inserted in the table
// This is detected and done in data.rs in update_entry
// (see update_entry in data.rs)
for entry_kv in self.data.gc_todo.iter() {
let (k, vhash) = entry_kv?;
let mut todo_entry = GcTodoEntry::parse(&k, &vhash);
if todo_entry.deletion_time() > now {
if entries.is_empty() && excluded.is_empty() {
// If the earliest entry in the todo list shouldn't yet be processed,
// return a duration to wait in the loop
return Ok(Some(Duration::from_millis(
todo_entry.deletion_time() - now,
)));
} else {
// Otherwise we have some entries to process, do a normal iteration.
break;
}
}
let vhash = Hash::try_from(&vhash[..]).unwrap();
// Check if the tombstone is still the current value of the entry.
Review

Can you confirm that one reason to invalidate a tombstone is a case where I upload a file, delete it, and then re-upload it?
Review

Yes! For the object tables, an entry can take the following sequence of values:

  1. an old version (not a tombstone)
  2. a deletion marker (a tombstone which we want to GC if possible)
  3. a new version (not a tombstone)
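To make the guard concrete: a simplified, self-contained illustration (not Garage's actual code) of why checking the value hash before deleting makes case 3 safe.

```rust
use std::collections::hash_map::DefaultHasher;
use std::collections::HashMap;
use std::hash::{Hash, Hasher};

// Toy stand-in for hashing a serialized table entry.
fn hash_value(v: &[u8]) -> u64 {
    let mut h = DefaultHasher::new();
    v.hash(&mut h);
    h.finish()
}

// Delete the entry only if its current value still hashes to the tombstone
// that was queued for GC. If the object was re-uploaded in the meantime
// (case 3 above), the hash no longer matches and nothing is deleted.
fn delete_if_equal_hash(table: &mut HashMap<Vec<u8>, Vec<u8>>, key: &[u8], expected_hash: u64) {
    let still_tombstone = table
        .get(key)
        .map(|v| hash_value(v) == expected_hash)
        .unwrap_or(false);
    if still_tombstone {
        table.remove(key);
    }
}
```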
@@ -134,8 +155,9 @@ where
// and for which they are still currently tombstones in the table.
if entries.is_empty() {
// Nothing to do in this iteration
return Ok(false);
// Nothing to do in this iteration (no entries present)
// Wait for a default delay of 60 seconds
return Ok(Some(Duration::from_secs(60)));
}
debug!("({}) GC: doing {} items", self.data.name, entries.len());
@@ -181,7 +203,7 @@ where
}
if errs.is_empty() {
Ok(true)
Ok(None)
} else {
Err(Error::Message(
errs.into_iter()
@@ -189,19 +211,20 @@ where
.collect::<Vec<_>>()
.join(", "),
))
.err_context("in try_send_and_delete:")
.err_context("in try_send_and_delete in table GC:")
}
}
async fn try_send_and_delete(
&self,
nodes: Vec<Uuid>,
items: Vec<GcTodoEntry>,
mut items: Vec<GcTodoEntry>,
) -> Result<(), Error> {
let n_items = items.len();
// Strategy: we first send all of the values to the remote nodes,
Review

Does it mean that deleting a tombstone is a particularly expensive operation? Could we create a denial of service simply by deleting a lot of data in a row, or by deleting a lot of data in a row while a node is down?

But it might be totally ok as it seems we batch our requests, can you confirm?

Edit: I just understood that `nodes` does not map to all nodes of the cluster but to all nodes of the same "partition".
Review

It's not that expensive (tombstones are small and can be batched), and it's the cost of a correct algorithm :)

At least we are not doing a full Raft or Paxos

// to ensure that they are aware of the tombstone state.
// to ensure that they are aware of the tombstone state,
// and that the previous state was correctly overwritten
// (if they have a newer state that overrides the tombstone, that's fine).
// Second, once everyone is at least at the tombstone state,
// we instruct everyone to delete the tombstone IF that is still their current state.
@@ -209,13 +232,14 @@ where
// tombstone in the CRDT lattice, and it will be propagated back to us at some point
// (either just a regular update that hasn't reached us yet, or later when the
// table is synced).
// Here, we store in updates all of the tombstones to send for step 1,
// and in deletes the list of keys and hashes of value for step 2.
let mut updates = vec![];
let mut deletes = vec![];
for item in items {
updates.push(ByteBuf::from(item.value.unwrap()));
deletes.push((ByteBuf::from(item.key), item.value_hash));
for item in items.iter_mut() {
updates.push(ByteBuf::from(item.value.take().unwrap()));
deletes.push((ByteBuf::from(item.key.clone()), item.value_hash));
}
// Step 1: ensure everyone is at least at tombstone in CRDT lattice
@@ -250,7 +274,7 @@ where
.try_call_many(
&self.endpoint,
&nodes[..],
GcRpc::DeleteIfEqualHash(deletes.clone()),
GcRpc::DeleteIfEqualHash(deletes),
RequestStrategy::with_priority(PRIO_BACKGROUND)
.with_quorum(nodes.len())
.with_timeout(TABLE_GC_RPC_TIMEOUT),
@@ -260,24 +284,16 @@ where
// GC has been successfull for all of these entries.
// We now remove them all from our local table and from the GC todo list.
for (k, vhash) in deletes {
for item in items {
self.data
.delete_if_equal_hash(&k[..], vhash)
.delete_if_equal_hash(&item.key[..], item.value_hash)
.err_context("GC: local delete tombstones")?;
self.todo_remove_if_equal(&k[..], vhash)
item.remove_if_equal(&self.data.gc_todo)
.err_context("GC: remove from todo list after successfull GC")?;
}
Ok(())
}
fn todo_remove_if_equal(&self, key: &[u8], vhash: Hash) -> Result<(), Error> {
let _ = self
.data
.gc_todo
.compare_and_swap::<_, _, Vec<u8>>(key, Some(vhash), None)?;
Ok(())
}
}
#[async_trait]
@@ -295,7 +311,6 @@ where
GcRpc::DeleteIfEqualHash(items) => {
for (key, vhash) in items.iter() {
self.data.delete_if_equal_hash(&key[..], *vhash)?;
self.todo_remove_if_equal(&key[..], *vhash)?;
}
Ok(GcRpc::Ok)
}
@@ -307,7 +322,16 @@ where
/// An entry stored in the gc_todo Sled tree associated with the table
/// Contains helper function for parsing, saving, and removing
/// such entry in Sled
///
/// Format of an entry:
/// - key = 8 bytes: timestamp of tombstone
Review

Here we also have some byte manipulations; this is even the purpose of this object. Are we doing them because of a Sled limitation (it needs a primitive type as a key) or for backward-compatibility reasons (we were storing a byte array before, so we keep it)? Are you not afraid that it could introduce bugs over time, as we are losing some checks done by the compiler?
Review

We are limited by Sled, which basically acts as a `BTreeMap<Vec<u8>, Vec<u8>>`: we have to manage stuff as byte arrays for both keys and values. It's true we have to be extra careful with this: that's the reason why I extracted the serialization and parsing logic into a separate struct. This would typically be a good place for unit testing if complexity gets out of hand (for now I don't think it has).
/// (used to implement GC delay)
/// n bytes: key in the main data table
/// - value = hash of the table entry to delete (the tombstone)
/// for verification purpose, because we don't want to delete
/// things that aren't tombstones
pub(crate) struct GcTodoEntry {
tombstone_timestamp: u64,
key: Vec<u8>,
value_hash: Hash,
value: Option<Vec<u8>>,
@@ -319,6 +343,7 @@ impl GcTodoEntry {
/// serialized value
pub(crate) fn new(key: Vec<u8>, value_hash: Hash) -> Self {
Self {
tombstone_timestamp: now_msec(),
key,
value_hash,
value: None,
@@ -328,7 +353,8 @@ impl GcTodoEntry {
/// Parses a GcTodoEntry from a (k, v) pair stored in the gc_todo tree
pub(crate) fn parse(sled_k: &[u8], sled_v: &[u8]) -> Self {
Self {
key: sled_k.to_vec(),
tombstone_timestamp: u64::from_be_bytes(sled_k[0..8].try_into().unwrap()),
key: sled_k[8..].to_vec(),
value_hash: Hash::try_from(sled_v).unwrap(),
value: None,
}
@@ -336,7 +362,7 @@ impl GcTodoEntry {
/// Saves the GcTodoEntry in the gc_todo tree
pub(crate) fn save(&self, gc_todo_tree: &sled::Tree) -> Result<(), Error> {
gc_todo_tree.insert(&self.key[..], self.value_hash.as_slice())?;
gc_todo_tree.insert(self.todo_table_key(), self.value_hash.as_slice())?;
Ok(())
}
@@ -347,10 +373,22 @@ impl GcTodoEntry {
/// what we have to do is still the same
pub(crate) fn remove_if_equal(&self, gc_todo_tree: &sled::Tree) -> Result<(), Error> {
let _ = gc_todo_tree.compare_and_swap::<_, _, Vec<u8>>(
&self.key[..],
&self.todo_table_key()[..],
Some(self.value_hash),
None,
)?;
Ok(())
}
fn todo_table_key(&self) -> Vec<u8> {
[
&u64::to_be_bytes(self.tombstone_timestamp)[..],
&self.key[..],
]
.concat()
}
fn deletion_time(&self) -> u64 {
self.tombstone_timestamp + TABLE_GC_DELAY.as_millis() as u64
}
}
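Following up on the review note above about unit testing the byte-level format: a minimal sketch of a round-trip test for the `gc_todo` key layout (8-byte big-endian timestamp followed by the table key). It uses local helpers so it stays self-contained; a real test would exercise `GcTodoEntry::parse` and `todo_table_key` directly.

```rust
use std::convert::TryInto;

fn encode_todo_key(timestamp: u64, key: &[u8]) -> Vec<u8> {
    [&u64::to_be_bytes(timestamp)[..], key].concat()
}

fn decode_todo_key(sled_k: &[u8]) -> (u64, Vec<u8>) {
    let ts = u64::from_be_bytes(sled_k[0..8].try_into().unwrap());
    (ts, sled_k[8..].to_vec())
}

#[test]
fn gc_todo_key_roundtrip() {
    let ts = 1_636_459_415_000u64; // arbitrary timestamp in milliseconds
    let key = b"partition/sort_key".to_vec();

    let encoded = encode_todo_key(ts, &key);
    assert_eq!(encoded.len(), 8 + key.len());

    // Big-endian timestamps keep gc_todo sorted by time, so iterating the
    // tree in key order visits the soonest-to-collect entries first.
    let (ts2, key2) = decode_todo_key(&encoded);
    assert_eq!(ts2, ts);
    assert_eq!(key2, key);
}
```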