forked from Deuxfleurs/garage
Move block RC code to separate rc.rs
This commit is contained in:
parent
12fd12fd6a
commit
a70ef456e4
3 changed files with 175 additions and 136 deletions
|
@ -5,3 +5,4 @@ pub mod manager;
|
||||||
|
|
||||||
mod block;
|
mod block;
|
||||||
mod metrics;
|
mod metrics;
|
||||||
|
mod rc;
|
||||||
|
|
|
@ -31,6 +31,7 @@ use garage_table::replication::{TableReplication, TableShardedReplication};
|
||||||
|
|
||||||
use crate::metrics::*;
|
use crate::metrics::*;
|
||||||
use crate::block::*;
|
use crate::block::*;
|
||||||
|
use crate::rc::*;
|
||||||
|
|
||||||
/// Size under which data will be stored inlined in database instead of as files
|
/// Size under which data will be stored inlined in database instead of as files
|
||||||
pub const INLINE_THRESHOLD: usize = 3072;
|
pub const INLINE_THRESHOLD: usize = 3072;
|
||||||
|
@ -51,7 +52,7 @@ const RESYNC_RETRY_DELAY: Duration = Duration::from_secs(60);
|
||||||
// The delay between the moment when the reference counter
|
// The delay between the moment when the reference counter
|
||||||
// drops to zero, and the moment where we allow ourselves
|
// drops to zero, and the moment where we allow ourselves
|
||||||
// to delete the block locally.
|
// to delete the block locally.
|
||||||
const BLOCK_GC_DELAY: Duration = Duration::from_secs(600);
|
pub(crate) const BLOCK_GC_DELAY: Duration = Duration::from_secs(600);
|
||||||
|
|
||||||
/// RPC messages used to share blocks of data between nodes
|
/// RPC messages used to share blocks of data between nodes
|
||||||
#[derive(Debug, Serialize, Deserialize)]
|
#[derive(Debug, Serialize, Deserialize)]
|
||||||
|
@ -86,7 +87,7 @@ pub struct BlockManager {
|
||||||
|
|
||||||
mutation_lock: Mutex<BlockManagerLocked>,
|
mutation_lock: Mutex<BlockManagerLocked>,
|
||||||
|
|
||||||
rc: sled::Tree,
|
pub rc: BlockRc,
|
||||||
|
|
||||||
resync_queue: SledCountedTree,
|
resync_queue: SledCountedTree,
|
||||||
resync_notify: Notify,
|
resync_notify: Notify,
|
||||||
|
@ -114,6 +115,7 @@ impl BlockManager {
|
||||||
let rc = db
|
let rc = db
|
||||||
.open_tree("block_local_rc")
|
.open_tree("block_local_rc")
|
||||||
.expect("Unable to open block_local_rc tree");
|
.expect("Unable to open block_local_rc tree");
|
||||||
|
let rc = BlockRc::new(rc);
|
||||||
|
|
||||||
let resync_queue = db
|
let resync_queue = db
|
||||||
.open_tree("block_local_resync_queue")
|
.open_tree("block_local_resync_queue")
|
||||||
|
@ -213,7 +215,7 @@ impl BlockManager {
|
||||||
/// to fix any mismatch between the two.
|
/// to fix any mismatch between the two.
|
||||||
pub async fn repair_data_store(&self, must_exit: &watch::Receiver<bool>) -> Result<(), Error> {
|
pub async fn repair_data_store(&self, must_exit: &watch::Receiver<bool>) -> Result<(), Error> {
|
||||||
// 1. Repair blocks from RC table.
|
// 1. Repair blocks from RC table.
|
||||||
for (i, entry) in self.rc.iter().enumerate() {
|
for (i, entry) in self.rc.rc.iter().enumerate() {
|
||||||
let (hash, _) = entry?;
|
let (hash, _) = entry?;
|
||||||
let hash = Hash::try_from(&hash[..]).unwrap();
|
let hash = Hash::try_from(&hash[..]).unwrap();
|
||||||
self.put_to_resync(&hash, Duration::from_secs(0))?;
|
self.put_to_resync(&hash, Duration::from_secs(0))?;
|
||||||
|
@ -261,7 +263,7 @@ impl BlockManager {
|
||||||
|
|
||||||
/// Get number of items in the refcount table
|
/// Get number of items in the refcount table
|
||||||
pub fn rc_len(&self) -> usize {
|
pub fn rc_len(&self) -> usize {
|
||||||
self.rc.len()
|
self.rc.rc.len()
|
||||||
}
|
}
|
||||||
|
|
||||||
//// ----- Managing the reference counter ----
|
//// ----- Managing the reference counter ----
|
||||||
|
@ -269,11 +271,7 @@ impl BlockManager {
|
||||||
/// Increment the number of time a block is used, putting it to resynchronization if it is
|
/// Increment the number of time a block is used, putting it to resynchronization if it is
|
||||||
/// required, but not known
|
/// required, but not known
|
||||||
pub fn block_incref(&self, hash: &Hash) -> Result<(), Error> {
|
pub fn block_incref(&self, hash: &Hash) -> Result<(), Error> {
|
||||||
let old_rc = self
|
if self.rc.block_incref(hash)? {
|
||||||
.rc
|
|
||||||
.fetch_and_update(&hash, |old| RcEntry::parse_opt(old).increment().serialize())?;
|
|
||||||
let old_rc = RcEntry::parse_opt(old_rc);
|
|
||||||
if old_rc.is_zero() {
|
|
||||||
// When the reference counter is incremented, there is
|
// When the reference counter is incremented, there is
|
||||||
// normally a node that is responsible for sending us the
|
// normally a node that is responsible for sending us the
|
||||||
// data of the block. However that operation may fail,
|
// data of the block. However that operation may fail,
|
||||||
|
@ -287,35 +285,17 @@ impl BlockManager {
|
||||||
|
|
||||||
/// Decrement the number of time a block is used
|
/// Decrement the number of time a block is used
|
||||||
pub fn block_decref(&self, hash: &Hash) -> Result<(), Error> {
|
pub fn block_decref(&self, hash: &Hash) -> Result<(), Error> {
|
||||||
let new_rc = self
|
if self.rc.block_decref(hash)? {
|
||||||
.rc
|
// When the RC is decremented, it might drop to zero,
|
||||||
.update_and_fetch(&hash, |old| RcEntry::parse_opt(old).decrement().serialize())?;
|
// indicating that we don't need the block.
|
||||||
let new_rc = RcEntry::parse_opt(new_rc);
|
// There is a delay before we garbage collect it;
|
||||||
if let RcEntry::Deletable { .. } = new_rc {
|
// make sure that it is handled in the resync loop
|
||||||
|
// after that delay has passed.
|
||||||
self.put_to_resync(hash, BLOCK_GC_DELAY + Duration::from_secs(10))?;
|
self.put_to_resync(hash, BLOCK_GC_DELAY + Duration::from_secs(10))?;
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Read a block's reference count
|
|
||||||
fn get_block_rc(&self, hash: &Hash) -> Result<RcEntry, Error> {
|
|
||||||
Ok(RcEntry::parse_opt(self.rc.get(hash.as_ref())?))
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Delete an entry in the RC table if it is deletable and the
|
|
||||||
/// deletion time has passed
|
|
||||||
fn clear_deleted_block_rc(&self, hash: &Hash) -> Result<(), Error> {
|
|
||||||
let now = now_msec();
|
|
||||||
self.rc.update_and_fetch(&hash, |rcval| {
|
|
||||||
let updated = match RcEntry::parse_opt(rcval) {
|
|
||||||
RcEntry::Deletable { at_time } if now > at_time => RcEntry::Absent,
|
|
||||||
v => v,
|
|
||||||
};
|
|
||||||
updated.serialize()
|
|
||||||
})?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
// ---- Reading and writing blocks locally ----
|
// ---- Reading and writing blocks locally ----
|
||||||
|
|
||||||
/// Write a block to disk
|
/// Write a block to disk
|
||||||
|
@ -659,7 +639,7 @@ impl BlockManager {
|
||||||
.delete_if_unneeded(hash, self)
|
.delete_if_unneeded(hash, self)
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
self.clear_deleted_block_rc(hash)?;
|
self.rc.clear_deleted_block_rc(hash)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
if needed.is_nonzero() && !exists {
|
if needed.is_nonzero() && !exists {
|
||||||
|
@ -773,7 +753,7 @@ impl BlockManagerLocked {
|
||||||
mgr: &BlockManager,
|
mgr: &BlockManager,
|
||||||
) -> Result<BlockStatus, Error> {
|
) -> Result<BlockStatus, Error> {
|
||||||
let exists = mgr.is_block_compressed(hash).await.is_ok();
|
let exists = mgr.is_block_compressed(hash).await.is_ok();
|
||||||
let needed = mgr.get_block_rc(hash)?;
|
let needed = mgr.rc.get_block_rc(hash)?;
|
||||||
|
|
||||||
Ok(BlockStatus { exists, needed })
|
Ok(BlockStatus { exists, needed })
|
||||||
}
|
}
|
||||||
|
@ -869,107 +849,6 @@ impl BlockManagerLocked {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Describes the state of the reference counter for a block
|
|
||||||
#[derive(Clone, Copy, Debug)]
|
|
||||||
enum RcEntry {
|
|
||||||
/// Present: the block has `count` references, with `count` > 0.
|
|
||||||
///
|
|
||||||
/// This is stored as u64::to_be_bytes(count)
|
|
||||||
Present { count: u64 },
|
|
||||||
|
|
||||||
/// Deletable: the block has zero references, and can be deleted
|
|
||||||
/// once time (returned by now_msec) is larger than at_time
|
|
||||||
/// (in millis since Unix epoch)
|
|
||||||
///
|
|
||||||
/// This is stored as [0u8; 8] followed by u64::to_be_bytes(at_time),
|
|
||||||
/// (this allows for the data format to be backwards compatible with
|
|
||||||
/// previous Garage versions that didn't have this intermediate state)
|
|
||||||
Deletable { at_time: u64 },
|
|
||||||
|
|
||||||
/// Absent: the block has zero references, and can be deleted
|
|
||||||
/// immediately
|
|
||||||
Absent,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl RcEntry {
|
|
||||||
fn parse(bytes: &[u8]) -> Self {
|
|
||||||
if bytes.len() == 8 {
|
|
||||||
RcEntry::Present {
|
|
||||||
count: u64::from_be_bytes(bytes.try_into().unwrap()),
|
|
||||||
}
|
|
||||||
} else if bytes.len() == 16 {
|
|
||||||
RcEntry::Deletable {
|
|
||||||
at_time: u64::from_be_bytes(bytes[8..16].try_into().unwrap()),
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
panic!("Invalid RC entry: {:?}, database is corrupted. This is an error Garage is currently unable to recover from. Sorry, and also please report a bug.",
|
|
||||||
bytes
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn parse_opt<V: AsRef<[u8]>>(bytes: Option<V>) -> Self {
|
|
||||||
bytes
|
|
||||||
.map(|b| Self::parse(b.as_ref()))
|
|
||||||
.unwrap_or(Self::Absent)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn serialize(self) -> Option<Vec<u8>> {
|
|
||||||
match self {
|
|
||||||
RcEntry::Present { count } => Some(u64::to_be_bytes(count).to_vec()),
|
|
||||||
RcEntry::Deletable { at_time } => {
|
|
||||||
Some([u64::to_be_bytes(0), u64::to_be_bytes(at_time)].concat())
|
|
||||||
}
|
|
||||||
RcEntry::Absent => None,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn increment(self) -> Self {
|
|
||||||
let old_count = match self {
|
|
||||||
RcEntry::Present { count } => count,
|
|
||||||
_ => 0,
|
|
||||||
};
|
|
||||||
RcEntry::Present {
|
|
||||||
count: old_count + 1,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn decrement(self) -> Self {
|
|
||||||
match self {
|
|
||||||
RcEntry::Present { count } => {
|
|
||||||
if count > 1 {
|
|
||||||
RcEntry::Present { count: count - 1 }
|
|
||||||
} else {
|
|
||||||
RcEntry::Deletable {
|
|
||||||
at_time: now_msec() + BLOCK_GC_DELAY.as_millis() as u64,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
del => del,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn is_zero(&self) -> bool {
|
|
||||||
matches!(self, RcEntry::Deletable { .. } | RcEntry::Absent)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn is_nonzero(&self) -> bool {
|
|
||||||
!self.is_zero()
|
|
||||||
}
|
|
||||||
|
|
||||||
fn is_deletable(&self) -> bool {
|
|
||||||
match self {
|
|
||||||
RcEntry::Present { .. } => false,
|
|
||||||
RcEntry::Deletable { at_time } => now_msec() > *at_time,
|
|
||||||
RcEntry::Absent => true,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fn is_needed(&self) -> bool {
|
|
||||||
!self.is_deletable()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Counts the number of errors when resyncing a block,
|
/// Counts the number of errors when resyncing a block,
|
||||||
/// and the time of the last try.
|
/// and the time of the last try.
|
||||||
/// Used to implement exponential backoff.
|
/// Used to implement exponential backoff.
|
||||||
|
|
159
src/block/rc.rs
Normal file
159
src/block/rc.rs
Normal file
|
@ -0,0 +1,159 @@
|
||||||
|
use std::convert::TryInto;
|
||||||
|
|
||||||
|
use garage_util::error::*;
|
||||||
|
use garage_util::data::*;
|
||||||
|
use garage_util::time::*;
|
||||||
|
|
||||||
|
use crate::manager::BLOCK_GC_DELAY;
|
||||||
|
|
||||||
|
pub struct BlockRc {
|
||||||
|
pub(crate) rc: sled::Tree,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl BlockRc {
|
||||||
|
pub(crate) fn new(rc: sled::Tree) -> Self {
|
||||||
|
Self {
|
||||||
|
rc
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Increment the reference counter associated to a hash.
|
||||||
|
/// Returns true if the RC goes from zero to nonzero.
|
||||||
|
pub(crate) fn block_incref(&self, hash: &Hash) -> Result<bool, Error> {
|
||||||
|
let old_rc = self
|
||||||
|
.rc
|
||||||
|
.fetch_and_update(&hash, |old| RcEntry::parse_opt(old).increment().serialize())?;
|
||||||
|
let old_rc = RcEntry::parse_opt(old_rc);
|
||||||
|
Ok(old_rc.is_zero())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Decrement the reference counter associated to a hash.
|
||||||
|
/// Returns true if the RC is now zero.
|
||||||
|
pub(crate) fn block_decref(&self, hash: &Hash) -> Result<bool, Error> {
|
||||||
|
let new_rc = self
|
||||||
|
.rc
|
||||||
|
.update_and_fetch(&hash, |old| RcEntry::parse_opt(old).decrement().serialize())?;
|
||||||
|
let new_rc = RcEntry::parse_opt(new_rc);
|
||||||
|
Ok(matches!(new_rc, RcEntry::Deletable {..}))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Read a block's reference count
|
||||||
|
pub(crate) fn get_block_rc(&self, hash: &Hash) -> Result<RcEntry, Error> {
|
||||||
|
Ok(RcEntry::parse_opt(self.rc.get(hash.as_ref())?))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Delete an entry in the RC table if it is deletable and the
|
||||||
|
/// deletion time has passed
|
||||||
|
pub(crate) fn clear_deleted_block_rc(&self, hash: &Hash) -> Result<(), Error> {
|
||||||
|
let now = now_msec();
|
||||||
|
self.rc.update_and_fetch(&hash, |rcval| {
|
||||||
|
let updated = match RcEntry::parse_opt(rcval) {
|
||||||
|
RcEntry::Deletable { at_time } if now > at_time => RcEntry::Absent,
|
||||||
|
v => v,
|
||||||
|
};
|
||||||
|
updated.serialize()
|
||||||
|
})?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Describes the state of the reference counter for a block
|
||||||
|
#[derive(Clone, Copy, Debug)]
|
||||||
|
pub(crate) enum RcEntry {
|
||||||
|
/// Present: the block has `count` references, with `count` > 0.
|
||||||
|
///
|
||||||
|
/// This is stored as u64::to_be_bytes(count)
|
||||||
|
Present { count: u64 },
|
||||||
|
|
||||||
|
/// Deletable: the block has zero references, and can be deleted
|
||||||
|
/// once time (returned by now_msec) is larger than at_time
|
||||||
|
/// (in millis since Unix epoch)
|
||||||
|
///
|
||||||
|
/// This is stored as [0u8; 8] followed by u64::to_be_bytes(at_time),
|
||||||
|
/// (this allows for the data format to be backwards compatible with
|
||||||
|
/// previous Garage versions that didn't have this intermediate state)
|
||||||
|
Deletable { at_time: u64 },
|
||||||
|
|
||||||
|
/// Absent: the block has zero references, and can be deleted
|
||||||
|
/// immediately
|
||||||
|
Absent,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl RcEntry {
|
||||||
|
fn parse(bytes: &[u8]) -> Self {
|
||||||
|
if bytes.len() == 8 {
|
||||||
|
RcEntry::Present {
|
||||||
|
count: u64::from_be_bytes(bytes.try_into().unwrap()),
|
||||||
|
}
|
||||||
|
} else if bytes.len() == 16 {
|
||||||
|
RcEntry::Deletable {
|
||||||
|
at_time: u64::from_be_bytes(bytes[8..16].try_into().unwrap()),
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
panic!("Invalid RC entry: {:?}, database is corrupted. This is an error Garage is currently unable to recover from. Sorry, and also please report a bug.",
|
||||||
|
bytes
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn parse_opt<V: AsRef<[u8]>>(bytes: Option<V>) -> Self {
|
||||||
|
bytes
|
||||||
|
.map(|b| Self::parse(b.as_ref()))
|
||||||
|
.unwrap_or(Self::Absent)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn serialize(self) -> Option<Vec<u8>> {
|
||||||
|
match self {
|
||||||
|
RcEntry::Present { count } => Some(u64::to_be_bytes(count).to_vec()),
|
||||||
|
RcEntry::Deletable { at_time } => {
|
||||||
|
Some([u64::to_be_bytes(0), u64::to_be_bytes(at_time)].concat())
|
||||||
|
}
|
||||||
|
RcEntry::Absent => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn increment(self) -> Self {
|
||||||
|
let old_count = match self {
|
||||||
|
RcEntry::Present { count } => count,
|
||||||
|
_ => 0,
|
||||||
|
};
|
||||||
|
RcEntry::Present {
|
||||||
|
count: old_count + 1,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn decrement(self) -> Self {
|
||||||
|
match self {
|
||||||
|
RcEntry::Present { count } => {
|
||||||
|
if count > 1 {
|
||||||
|
RcEntry::Present { count: count - 1 }
|
||||||
|
} else {
|
||||||
|
RcEntry::Deletable {
|
||||||
|
at_time: now_msec() + BLOCK_GC_DELAY.as_millis() as u64,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
del => del,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn is_zero(&self) -> bool {
|
||||||
|
matches!(self, RcEntry::Deletable { .. } | RcEntry::Absent)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn is_nonzero(&self) -> bool {
|
||||||
|
!self.is_zero()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn is_deletable(&self) -> bool {
|
||||||
|
match self {
|
||||||
|
RcEntry::Present { .. } => false,
|
||||||
|
RcEntry::Deletable { at_time } => now_msec() > *at_time,
|
||||||
|
RcEntry::Absent => true,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn is_needed(&self) -> bool {
|
||||||
|
!self.is_deletable()
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in a new issue