garage/src/block/layout.rs

374 lines
11 KiB
Rust

use std::collections::HashMap;
use std::path::PathBuf;
use serde::{Deserialize, Serialize};
use garage_util::config::DataDirEnum;
use garage_util::data::Hash;
use garage_util::error::{Error, OkOrMessage};
use garage_util::migrate::*;
type Idx = u16;
const DRIVE_NPART: usize = 1024;
const HASH_DRIVE_BYTES: (usize, usize) = (2, 3);
const MARKER_FILE_NAME: &str = "garage-marker";
#[derive(Serialize, Deserialize, Debug, Clone)]
pub(crate) struct DataLayout {
pub(crate) data_dirs: Vec<DataDir>,
markers: HashMap<PathBuf, String>,
/// Primary storage location (index in data_dirs) for each partition
/// = the location where the data is supposed to be, blocks are always
/// written there (copies in other dirs may be deleted if they exist)
pub(crate) part_prim: Vec<Idx>,
/// Secondary storage locations for each partition = locations
/// where data blocks might be, we check from these dirs when reading
pub(crate) part_sec: Vec<Vec<Idx>>,
}
#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
pub(crate) struct DataDir {
pub(crate) path: PathBuf,
pub(crate) state: DataDirState,
}
#[derive(Serialize, Deserialize, Debug, Clone, Copy, Eq, PartialEq)]
pub(crate) enum DataDirState {
Active { capacity: u64 },
ReadOnly,
}
impl DataLayout {
pub(crate) fn initialize(dirs: &DataDirEnum) -> Result<Self, Error> {
let data_dirs = make_data_dirs(dirs)?;
// Split partitions proportionnally to capacity for all drives
// to affect primary storage location
let total_cap = data_dirs.iter().filter_map(|x| x.capacity()).sum::<u64>();
assert!(total_cap > 0);
let mut part_prim = Vec::with_capacity(DRIVE_NPART);
let mut cum_cap = 0;
for (i, dd) in data_dirs.iter().enumerate() {
if let DataDirState::Active { capacity } = dd.state {
cum_cap += capacity;
let n_total = (cum_cap * DRIVE_NPART as u64) / total_cap;
part_prim.resize(n_total as usize, i as Idx);
}
}
assert_eq!(cum_cap, total_cap);
assert_eq!(part_prim.len(), DRIVE_NPART);
// If any of the storage locations is non-empty, it probably existed before
// this algorithm was added, so add it as a secondary storage location for all partitions
// to make sure existing files are not lost
let mut part_sec = vec![vec![]; DRIVE_NPART];
for (i, dd) in data_dirs.iter().enumerate() {
if dir_not_empty(&dd.path)? {
for (sec, prim) in part_sec.iter_mut().zip(part_prim.iter()) {
if *prim != i as Idx {
sec.push(i as Idx);
}
}
}
}
Ok(Self {
data_dirs,
markers: HashMap::new(),
part_prim,
part_sec,
})
}
pub(crate) fn update(self, dirs: &DataDirEnum) -> Result<Self, Error> {
// Make list of new data directories, exit if nothing changed
let data_dirs = make_data_dirs(dirs)?;
if data_dirs == self.data_dirs {
return Ok(self);
}
let total_cap = data_dirs.iter().filter_map(|x| x.capacity()).sum::<u64>();
assert!(total_cap > 0);
// Compute mapping of old indices to new indices
let old2new = self
.data_dirs
.iter()
.map(|x| {
data_dirs
.iter()
.position(|y| y.path == x.path)
.map(|x| x as Idx)
})
.collect::<Vec<_>>();
// Compute secondary location list for partitions based on existing
// folders, translating indices from old to new
let mut part_sec = self
.part_sec
.iter()
.map(|dl| {
dl.iter()
.filter_map(|old| old2new.get(*old as usize).copied().flatten())
.collect::<Vec<_>>()
})
.collect::<Vec<_>>();
// Compute a vector that, for each data dir,
// contains the list of partitions primarily stored on that drive
let mut dir_prim = vec![vec![]; data_dirs.len()];
for (ipart, prim) in self.part_prim.iter().enumerate() {
if let Some(new) = old2new.get(*prim as usize).copied().flatten() {
dir_prim[new as usize].push(ipart);
}
}
// Compute the target number of partitions per data directory
let mut cum_cap = 0;
let mut npart_per_dir = vec![0; data_dirs.len()];
for (idir, dd) in data_dirs.iter().enumerate() {
if let DataDirState::Active { capacity } = dd.state {
let begin = (cum_cap * DRIVE_NPART as u64) / total_cap;
cum_cap += capacity;
let end = (cum_cap * DRIVE_NPART as u64) / total_cap;
npart_per_dir[idir] = (end - begin) as usize;
}
}
assert_eq!(cum_cap, total_cap);
assert_eq!(npart_per_dir.iter().sum::<usize>(), DRIVE_NPART);
// For all directories that have too many primary partitions,
// move that partition to secondary
for (idir, (parts, tgt_npart)) in dir_prim.iter_mut().zip(npart_per_dir.iter()).enumerate()
{
while parts.len() > *tgt_npart {
let part = parts.pop().unwrap();
if !part_sec[part].contains(&(idir as Idx)) {
part_sec[part].push(idir as Idx);
}
}
}
// Calculate the vector of primary partition dir index
let mut part_prim = vec![None; DRIVE_NPART];
for (idir, parts) in dir_prim.iter().enumerate() {
for part in parts.iter() {
assert!(part_prim[*part].is_none());
part_prim[*part] = Some(idir as Idx)
}
}
// Calculate a vector of unassigned partitions
let mut unassigned = part_prim
.iter()
.enumerate()
.filter(|(_, dir)| dir.is_none())
.map(|(ipart, _)| ipart)
.collect::<Vec<_>>();
// For all directories that don't have enough primary partitions,
// add partitions from unassigned
for (idir, (parts, tgt_npart)) in dir_prim.iter_mut().zip(npart_per_dir.iter()).enumerate()
{
if parts.len() < *tgt_npart {
let required = *tgt_npart - parts.len();
assert!(unassigned.len() >= required);
for _ in 0..required {
let new_part = unassigned.pop().unwrap();
part_prim[new_part] = Some(idir as Idx);
part_sec[new_part].retain(|x| *x != idir as Idx);
}
}
}
// Sanity checks
assert!(part_prim.iter().all(|x| x.is_some()));
assert!(unassigned.is_empty());
// Transform part_prim from vec of Option<Idx> to vec of Idx
let part_prim = part_prim
.into_iter()
.map(|x| x.unwrap())
.collect::<Vec<_>>();
assert!(part_prim.iter().all(|p| data_dirs
.get(*p as usize)
.and_then(|x| x.capacity())
.unwrap_or(0)
> 0));
// If any of the newly added storage locations is non-empty,
// it might have been removed and added again and might contain data,
// so add it as a secondary storage location for all partitions
// to make sure existing files are not lost
for (i, dd) in data_dirs.iter().enumerate() {
if self.data_dirs.iter().any(|ed| ed.path == dd.path) {
continue;
}
if dir_not_empty(&dd.path)? {
for (sec, prim) in part_sec.iter_mut().zip(part_prim.iter()) {
if *prim != i as Idx && !sec.contains(&(i as Idx)) {
sec.push(i as Idx);
}
}
}
}
// Apply newly generated config
Ok(Self {
data_dirs,
markers: self.markers,
part_prim,
part_sec,
})
}
pub(crate) fn check_markers(&mut self) -> Result<(), Error> {
let data_dirs = &self.data_dirs;
self.markers
.retain(|k, _| data_dirs.iter().any(|x| x.path == *k));
for dir in self.data_dirs.iter() {
let mut marker_path = dir.path.clone();
marker_path.push(MARKER_FILE_NAME);
let existing_marker = std::fs::read_to_string(&marker_path).ok();
match (existing_marker, self.markers.get(&dir.path)) {
(Some(m1), Some(m2)) => {
if m1 != *m2 {
return Err(Error::Message(format!("Mismatched content for marker file `{}` in data directory `{}`. If you moved data directories or changed their mountpoints, you should remove the `data_layout` file in Garage's metadata directory and restart Garage.", MARKER_FILE_NAME, dir.path.display())));
}
}
(None, Some(_)) => {
return Err(Error::Message(format!("Could not find expected marker file `{}` in data directory `{}`, make sure this data directory is mounted correctly.", MARKER_FILE_NAME, dir.path.display())));
}
(Some(mkr), None) => {
self.markers.insert(dir.path.clone(), mkr);
}
(None, None) => {
let mkr = hex::encode(garage_util::data::gen_uuid().as_slice());
std::fs::write(&marker_path, &mkr)?;
self.markers.insert(dir.path.clone(), mkr);
}
}
}
Ok(())
}
pub(crate) fn primary_block_dir(&self, hash: &Hash) -> PathBuf {
let ipart = self.partition_from(hash);
let idir = self.part_prim[ipart] as usize;
self.block_dir_from(hash, &self.data_dirs[idir].path)
}
pub(crate) fn secondary_block_dirs<'a>(
&'a self,
hash: &'a Hash,
) -> impl Iterator<Item = PathBuf> + 'a {
let ipart = self.partition_from(hash);
self.part_sec[ipart]
.iter()
.map(move |idir| self.block_dir_from(hash, &self.data_dirs[*idir as usize].path))
}
fn partition_from(&self, hash: &Hash) -> usize {
u16::from_be_bytes([
hash.as_slice()[HASH_DRIVE_BYTES.0],
hash.as_slice()[HASH_DRIVE_BYTES.1],
]) as usize % DRIVE_NPART
}
fn block_dir_from(&self, hash: &Hash, dir: &PathBuf) -> PathBuf {
let mut path = dir.clone();
path.push(hex::encode(&hash.as_slice()[0..1]));
path.push(hex::encode(&hash.as_slice()[1..2]));
path
}
pub(crate) fn without_secondary_locations(&self) -> Self {
Self {
data_dirs: self.data_dirs.clone(),
markers: self.markers.clone(),
part_prim: self.part_prim.clone(),
part_sec: self.part_sec.iter().map(|_| vec![]).collect::<Vec<_>>(),
}
}
}
impl InitialFormat for DataLayout {
const VERSION_MARKER: &'static [u8] = b"G09bmdl";
}
impl DataDir {
pub fn capacity(&self) -> Option<u64> {
match self.state {
DataDirState::Active { capacity } => Some(capacity),
_ => None,
}
}
}
fn make_data_dirs(dirs: &DataDirEnum) -> Result<Vec<DataDir>, Error> {
let mut data_dirs = vec![];
match dirs {
DataDirEnum::Single(path) => data_dirs.push(DataDir {
path: path.clone(),
state: DataDirState::Active {
capacity: 1_000_000_000, // whatever, doesn't matter
},
}),
DataDirEnum::Multiple(dirs) => {
let mut ok = false;
for dir in dirs.iter() {
let state = match &dir.capacity {
Some(cap) if dir.read_only == false => {
let capacity = cap.parse::<bytesize::ByteSize>()
.ok_or_message("invalid capacity value")?.as_u64();
if capacity == 0 {
return Err(Error::Message(format!("data directory {} should have non-zero capacity", dir.path.to_string_lossy())));
}
ok = true;
DataDirState::Active {
capacity,
}
}
None if dir.read_only == true => {
DataDirState::ReadOnly
}
_ => return Err(Error::Message(format!("data directories in data_dir should have a capacity value or be marked read_only, not the case for {}", dir.path.to_string_lossy()))),
};
data_dirs.push(DataDir {
path: dir.path.clone(),
state,
});
}
if !ok {
return Err(Error::Message(
"incorrect data_dir configuration, no primary writable directory specified"
.into(),
));
}
}
}
Ok(data_dirs)
}
fn dir_not_empty(path: &PathBuf) -> Result<bool, Error> {
for entry in std::fs::read_dir(&path)? {
let dir = entry?;
let ft = dir.file_type()?;
let name = dir.file_name().into_string().ok();
if ft.is_file() && name.as_deref() == Some(MARKER_FILE_NAME) {
return Ok(true);
}
if ft.is_dir() && name.and_then(|hex| hex::decode(&hex).ok()).is_some() {
return Ok(true);
}
}
Ok(false)
}