Move block RC code to separate `rc.rs`

Author: Alex
Date: 2022-03-15 12:23:33 +01:00
Parent: c3982a90b6
Commit: 8fd6745745
Signed by: lx (GPG Key ID: 0E496D15096376BE)

3 changed files with 175 additions and 136 deletions


@@ -5,3 +5,4 @@ pub mod manager;
 mod block;
 mod metrics;
+mod rc;


@@ -31,6 +31,7 @@ use garage_table::replication::{TableReplication, TableShardedReplication};
 use crate::metrics::*;
 use crate::block::*;
+use crate::rc::*;

 /// Size under which data will be stored inlined in database instead of as files
 pub const INLINE_THRESHOLD: usize = 3072;
@@ -51,7 +52,7 @@ const RESYNC_RETRY_DELAY: Duration = Duration::from_secs(60);
 // The delay between the moment when the reference counter
 // drops to zero, and the moment where we allow ourselves
 // to delete the block locally.
-const BLOCK_GC_DELAY: Duration = Duration::from_secs(600);
+pub(crate) const BLOCK_GC_DELAY: Duration = Duration::from_secs(600);

 /// RPC messages used to share blocks of data between nodes
 #[derive(Debug, Serialize, Deserialize)]
@@ -86,7 +87,7 @@ pub struct BlockManager {
     mutation_lock: Mutex<BlockManagerLocked>,

-    rc: sled::Tree,
+    pub rc: BlockRc,

     resync_queue: SledCountedTree,
     resync_notify: Notify,
@@ -114,6 +115,7 @@ impl BlockManager {
         let rc = db
             .open_tree("block_local_rc")
             .expect("Unable to open block_local_rc tree");
+        let rc = BlockRc::new(rc);

         let resync_queue = db
             .open_tree("block_local_resync_queue")
@@ -213,7 +215,7 @@ impl BlockManager {
     /// to fix any mismatch between the two.
     pub async fn repair_data_store(&self, must_exit: &watch::Receiver<bool>) -> Result<(), Error> {
         // 1. Repair blocks from RC table.
-        for (i, entry) in self.rc.iter().enumerate() {
+        for (i, entry) in self.rc.rc.iter().enumerate() {
             let (hash, _) = entry?;
             let hash = Hash::try_from(&hash[..]).unwrap();
             self.put_to_resync(&hash, Duration::from_secs(0))?;
@@ -261,7 +263,7 @@
     /// Get number of items in the refcount table
     pub fn rc_len(&self) -> usize {
-        self.rc.len()
+        self.rc.rc.len()
     }

     //// ----- Managing the reference counter ----
@@ -269,11 +271,7 @@
     /// Increment the number of time a block is used, putting it to resynchronization if it is
     /// required, but not known
     pub fn block_incref(&self, hash: &Hash) -> Result<(), Error> {
-        let old_rc = self
-            .rc
-            .fetch_and_update(&hash, |old| RcEntry::parse_opt(old).increment().serialize())?;
-        let old_rc = RcEntry::parse_opt(old_rc);
-        if old_rc.is_zero() {
+        if self.rc.block_incref(hash)? {
             // When the reference counter is incremented, there is
             // normally a node that is responsible for sending us the
             // data of the block. However that operation may fail,
@@ -287,35 +285,17 @@
     /// Decrement the number of time a block is used
     pub fn block_decref(&self, hash: &Hash) -> Result<(), Error> {
-        let new_rc = self
-            .rc
-            .update_and_fetch(&hash, |old| RcEntry::parse_opt(old).decrement().serialize())?;
-        let new_rc = RcEntry::parse_opt(new_rc);
-        if let RcEntry::Deletable { .. } = new_rc {
+        if self.rc.block_decref(hash)? {
+            // When the RC is decremented, it might drop to zero,
+            // indicating that we don't need the block.
+            // There is a delay before we garbage collect it;
+            // make sure that it is handled in the resync loop
+            // after that delay has passed.
             self.put_to_resync(hash, BLOCK_GC_DELAY + Duration::from_secs(10))?;
         }
         Ok(())
     }
-
-    /// Read a block's reference count
-    fn get_block_rc(&self, hash: &Hash) -> Result<RcEntry, Error> {
-        Ok(RcEntry::parse_opt(self.rc.get(hash.as_ref())?))
-    }
-
-    /// Delete an entry in the RC table if it is deletable and the
-    /// deletion time has passed
-    fn clear_deleted_block_rc(&self, hash: &Hash) -> Result<(), Error> {
-        let now = now_msec();
-        self.rc.update_and_fetch(&hash, |rcval| {
-            let updated = match RcEntry::parse_opt(rcval) {
-                RcEntry::Deletable { at_time } if now > at_time => RcEntry::Absent,
-                v => v,
-            };
-            updated.serialize()
-        })?;
-        Ok(())
-    }

     // ---- Reading and writing blocks locally ----

     /// Write a block to disk
@@ -659,7 +639,7 @@ impl BlockManager {
                 .delete_if_unneeded(hash, self)
                 .await?;

-            self.clear_deleted_block_rc(hash)?;
+            self.rc.clear_deleted_block_rc(hash)?;
         }

         if needed.is_nonzero() && !exists {
@@ -773,7 +753,7 @@ impl BlockManagerLocked {
         mgr: &BlockManager,
     ) -> Result<BlockStatus, Error> {
         let exists = mgr.is_block_compressed(hash).await.is_ok();
-        let needed = mgr.get_block_rc(hash)?;
+        let needed = mgr.rc.get_block_rc(hash)?;

         Ok(BlockStatus { exists, needed })
     }
@@ -869,107 +849,6 @@
     }
 }

-/// Describes the state of the reference counter for a block
-#[derive(Clone, Copy, Debug)]
-enum RcEntry {
-    /// Present: the block has `count` references, with `count` > 0.
-    ///
-    /// This is stored as u64::to_be_bytes(count)
-    Present { count: u64 },
-
-    /// Deletable: the block has zero references, and can be deleted
-    /// once time (returned by now_msec) is larger than at_time
-    /// (in millis since Unix epoch)
-    ///
-    /// This is stored as [0u8; 8] followed by u64::to_be_bytes(at_time),
-    /// (this allows for the data format to be backwards compatible with
-    /// previous Garage versions that didn't have this intermediate state)
-    Deletable { at_time: u64 },
-
-    /// Absent: the block has zero references, and can be deleted
-    /// immediately
-    Absent,
-}
-
-impl RcEntry {
-    fn parse(bytes: &[u8]) -> Self {
-        if bytes.len() == 8 {
-            RcEntry::Present {
-                count: u64::from_be_bytes(bytes.try_into().unwrap()),
-            }
-        } else if bytes.len() == 16 {
-            RcEntry::Deletable {
-                at_time: u64::from_be_bytes(bytes[8..16].try_into().unwrap()),
-            }
-        } else {
-            panic!("Invalid RC entry: {:?}, database is corrupted. This is an error Garage is currently unable to recover from. Sorry, and also please report a bug.",
-                bytes
-            )
-        }
-    }
-
-    fn parse_opt<V: AsRef<[u8]>>(bytes: Option<V>) -> Self {
-        bytes
-            .map(|b| Self::parse(b.as_ref()))
-            .unwrap_or(Self::Absent)
-    }
-
-    fn serialize(self) -> Option<Vec<u8>> {
-        match self {
-            RcEntry::Present { count } => Some(u64::to_be_bytes(count).to_vec()),
-            RcEntry::Deletable { at_time } => {
-                Some([u64::to_be_bytes(0), u64::to_be_bytes(at_time)].concat())
-            }
-            RcEntry::Absent => None,
-        }
-    }
-
-    fn increment(self) -> Self {
-        let old_count = match self {
-            RcEntry::Present { count } => count,
-            _ => 0,
-        };
-        RcEntry::Present {
-            count: old_count + 1,
-        }
-    }
-
-    fn decrement(self) -> Self {
-        match self {
-            RcEntry::Present { count } => {
-                if count > 1 {
-                    RcEntry::Present { count: count - 1 }
-                } else {
-                    RcEntry::Deletable {
-                        at_time: now_msec() + BLOCK_GC_DELAY.as_millis() as u64,
-                    }
-                }
-            }
-            del => del,
-        }
-    }
-
-    fn is_zero(&self) -> bool {
-        matches!(self, RcEntry::Deletable { .. } | RcEntry::Absent)
-    }
-
-    fn is_nonzero(&self) -> bool {
-        !self.is_zero()
-    }
-
-    fn is_deletable(&self) -> bool {
-        match self {
-            RcEntry::Present { .. } => false,
-            RcEntry::Deletable { at_time } => now_msec() > *at_time,
-            RcEntry::Absent => true,
-        }
-    }
-
-    fn is_needed(&self) -> bool {
-        !self.is_deletable()
-    }
-}

 /// Counts the number of errors when resyncing a block,
 /// and the time of the last try.
 /// Used to implement exponential backoff.

src/block/rc.rs (new file, 159 lines)

@@ -0,0 +1,159 @@
+use std::convert::TryInto;
+
+use garage_util::error::*;
+use garage_util::data::*;
+use garage_util::time::*;
+
+use crate::manager::BLOCK_GC_DELAY;
+
+pub struct BlockRc {
+    pub(crate) rc: sled::Tree,
+}
+
+impl BlockRc {
+    pub(crate) fn new(rc: sled::Tree) -> Self {
+        Self { rc }
+    }
+
+    /// Increment the reference counter associated to a hash.
+    /// Returns true if the RC goes from zero to nonzero.
+    pub(crate) fn block_incref(&self, hash: &Hash) -> Result<bool, Error> {
+        let old_rc = self
+            .rc
+            .fetch_and_update(&hash, |old| RcEntry::parse_opt(old).increment().serialize())?;
+        let old_rc = RcEntry::parse_opt(old_rc);
+        Ok(old_rc.is_zero())
+    }
+
+    /// Decrement the reference counter associated to a hash.
+    /// Returns true if the RC is now zero.
+    pub(crate) fn block_decref(&self, hash: &Hash) -> Result<bool, Error> {
+        let new_rc = self
+            .rc
+            .update_and_fetch(&hash, |old| RcEntry::parse_opt(old).decrement().serialize())?;
+        let new_rc = RcEntry::parse_opt(new_rc);
+        Ok(matches!(new_rc, RcEntry::Deletable { .. }))
+    }
+
+    /// Read a block's reference count
+    pub(crate) fn get_block_rc(&self, hash: &Hash) -> Result<RcEntry, Error> {
+        Ok(RcEntry::parse_opt(self.rc.get(hash.as_ref())?))
+    }
+
+    /// Delete an entry in the RC table if it is deletable and the
+    /// deletion time has passed
+    pub(crate) fn clear_deleted_block_rc(&self, hash: &Hash) -> Result<(), Error> {
+        let now = now_msec();
+        self.rc.update_and_fetch(&hash, |rcval| {
+            let updated = match RcEntry::parse_opt(rcval) {
+                RcEntry::Deletable { at_time } if now > at_time => RcEntry::Absent,
+                v => v,
+            };
+            updated.serialize()
+        })?;
+        Ok(())
+    }
+}
+
+/// Describes the state of the reference counter for a block
+#[derive(Clone, Copy, Debug)]
+pub(crate) enum RcEntry {
+    /// Present: the block has `count` references, with `count` > 0.
+    ///
+    /// This is stored as u64::to_be_bytes(count)
+    Present { count: u64 },
+
+    /// Deletable: the block has zero references, and can be deleted
+    /// once time (returned by now_msec) is larger than at_time
+    /// (in millis since Unix epoch)
+    ///
+    /// This is stored as [0u8; 8] followed by u64::to_be_bytes(at_time),
+    /// (this allows for the data format to be backwards compatible with
+    /// previous Garage versions that didn't have this intermediate state)
+    Deletable { at_time: u64 },
+
+    /// Absent: the block has zero references, and can be deleted
+    /// immediately
+    Absent,
+}
+
+impl RcEntry {
+    fn parse(bytes: &[u8]) -> Self {
+        if bytes.len() == 8 {
+            RcEntry::Present {
+                count: u64::from_be_bytes(bytes.try_into().unwrap()),
+            }
+        } else if bytes.len() == 16 {
+            RcEntry::Deletable {
+                at_time: u64::from_be_bytes(bytes[8..16].try_into().unwrap()),
+            }
+        } else {
+            panic!("Invalid RC entry: {:?}, database is corrupted. This is an error Garage is currently unable to recover from. Sorry, and also please report a bug.",
+                bytes
+            )
+        }
+    }
+
+    fn parse_opt<V: AsRef<[u8]>>(bytes: Option<V>) -> Self {
+        bytes
+            .map(|b| Self::parse(b.as_ref()))
+            .unwrap_or(Self::Absent)
+    }
+
+    fn serialize(self) -> Option<Vec<u8>> {
+        match self {
+            RcEntry::Present { count } => Some(u64::to_be_bytes(count).to_vec()),
+            RcEntry::Deletable { at_time } => {
+                Some([u64::to_be_bytes(0), u64::to_be_bytes(at_time)].concat())
+            }
+            RcEntry::Absent => None,
+        }
+    }
+
+    fn increment(self) -> Self {
+        let old_count = match self {
+            RcEntry::Present { count } => count,
+            _ => 0,
+        };
+        RcEntry::Present {
+            count: old_count + 1,
+        }
+    }
+
+    fn decrement(self) -> Self {
+        match self {
+            RcEntry::Present { count } => {
+                if count > 1 {
+                    RcEntry::Present { count: count - 1 }
+                } else {
+                    RcEntry::Deletable {
+                        at_time: now_msec() + BLOCK_GC_DELAY.as_millis() as u64,
+                    }
+                }
+            }
+            del => del,
+        }
+    }
+
+    pub(crate) fn is_zero(&self) -> bool {
+        matches!(self, RcEntry::Deletable { .. } | RcEntry::Absent)
+    }
+
+    pub(crate) fn is_nonzero(&self) -> bool {
+        !self.is_zero()
+    }
+
+    pub(crate) fn is_deletable(&self) -> bool {
+        match self {
+            RcEntry::Present { .. } => false,
+            RcEntry::Deletable { at_time } => now_msec() > *at_time,
+            RcEntry::Absent => true,
+        }
+    }
+
+    pub(crate) fn is_needed(&self) -> bool {
+        !self.is_deletable()
+    }
+}
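
Aside: the on-disk format documented on RcEntry above can be exercised with a small standalone sketch. The helper names below (encode_present, encode_deletable, describe) are illustrative stand-ins for RcEntry::serialize and RcEntry::parse and are not part of this commit; the byte layout they mirror (an 8-byte big-endian counter for Present, 8 zero bytes followed by a big-endian deletion timestamp for Deletable, no stored value at all for Absent) is taken directly from the doc comments above.

use std::convert::TryInto;

// Hypothetical standalone helpers mirroring the RcEntry encoding described above.
fn encode_present(count: u64) -> Vec<u8> {
    // Present { count }: the reference count as 8 big-endian bytes.
    u64::to_be_bytes(count).to_vec()
}

fn encode_deletable(at_time_ms: u64) -> Vec<u8> {
    // Deletable { at_time }: 8 zero bytes, then the deletion time (ms) as 8 big-endian bytes.
    [u64::to_be_bytes(0), u64::to_be_bytes(at_time_ms)].concat()
}

fn describe(bytes: &[u8]) -> String {
    // Mirror of RcEntry::parse: the entry length distinguishes the two stored states.
    match bytes.len() {
        8 => format!(
            "Present {{ count: {} }}",
            u64::from_be_bytes(bytes.try_into().unwrap())
        ),
        16 => format!(
            "Deletable {{ at_time: {} }}",
            u64::from_be_bytes(bytes[8..16].try_into().unwrap())
        ),
        _ => "corrupted entry".to_string(),
    }
}

fn main() {
    println!("{}", describe(&encode_present(3)));
    println!("{}", describe(&encode_deletable(1_647_348_213_000)));
    // Prints:
    //   Present { count: 3 }
    //   Deletable { at_time: 1647348213000 }
}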