From 2f111e6b3d772b10c8ed6279ce0c82d22852afd1 Mon Sep 17 00:00:00 2001 From: Alex Auvolat Date: Mon, 18 Jul 2022 18:40:57 +0200 Subject: [PATCH] Performance improvements: - reduce contention on mutation_lock by having 256 of them - better lmdb defaults --- src/api/s3/put.rs | 30 +++++++++++++++++++++++++----- src/block/manager.rs | 27 +++++++++++++++------------ src/model/garage.rs | 15 ++++++++++----- src/util/async_hash.rs | 15 +++++++++------ 4 files changed, 59 insertions(+), 28 deletions(-) diff --git a/src/api/s3/put.rs b/src/api/s3/put.rs index fbfa6f0d..a182f04d 100644 --- a/src/api/s3/put.rs +++ b/src/api/s3/put.rs @@ -8,6 +8,11 @@ use hyper::{Request, Response}; use md5::{digest::generic_array::*, Digest as Md5Digest, Md5}; use sha2::Sha256; +use opentelemetry::{ + trace::{FutureExt as OtelFutureExt, TraceContextExt, Tracer}, + Context, +}; + use garage_table::*; use garage_util::async_hash::*; use garage_util::data::*; @@ -279,12 +284,21 @@ async fn read_and_put_blocks> + Unpin>( first_block_hash: Hash, chunker: &mut StreamChunker, ) -> Result<(u64, GenericArray, Hash), Error> { + let tracer = opentelemetry::global::tracer("garage"); + let first_block = Bytes::from(first_block); let md5hasher = AsyncHasher::::new(); let sha256hasher = AsyncHasher::::new(); - md5hasher.update(first_block.clone()); - sha256hasher.update(first_block.clone()); + + futures::future::join( + md5hasher.update(first_block.clone()), + sha256hasher.update(first_block.clone()), + ) + .with_context(Context::current_with_span( + tracer.start("Hash first block (md5, sha256)"), + )) + .await; let mut next_offset = first_block.len(); let mut put_curr_version_block = put_block_meta( @@ -307,9 +321,15 @@ async fn read_and_put_blocks> + Unpin>( )?; if let Some(block) = next_block { let block = Bytes::from(block); - md5hasher.update(block.clone()); - sha256hasher.update(block.clone()); - let block_hash = async_blake2sum(block.clone()).await; + let (_, _, block_hash) = futures::future::join3( + md5hasher.update(block.clone()), + sha256hasher.update(block.clone()), + async_blake2sum(block.clone()), + ) + .with_context(Context::current_with_span( + tracer.start("Hash block (md5, sha256, blake2)"), + )) + .await; let block_len = block.len(); put_curr_version_block = put_block_meta( garage, diff --git a/src/block/manager.rs b/src/block/manager.rs index 890c247d..be53ec6e 100644 --- a/src/block/manager.rs +++ b/src/block/manager.rs @@ -93,7 +93,7 @@ pub struct BlockManager { compression_level: Option, background_tranquility: u32, - mutation_lock: Mutex, + mutation_lock: [Mutex; 256], pub(crate) rc: BlockRc, @@ -150,8 +150,6 @@ impl BlockManager { .netapp .endpoint("garage_block/manager.rs/Rpc".to_string()); - let manager_locked = BlockManagerLocked(); - let metrics = BlockManagerMetrics::new(resync_queue.clone(), resync_errors.clone()); let block_manager = Arc::new(Self { @@ -159,7 +157,7 @@ impl BlockManager { data_dir, compression_level, background_tranquility, - mutation_lock: Mutex::new(manager_locked), + mutation_lock: [(); 256].map(|_| Mutex::new(BlockManagerLocked())), rc, resync_queue, resync_notify: Notify::new(), @@ -313,14 +311,21 @@ impl BlockManager { /// Write a block to disk async fn write_block(&self, hash: &Hash, data: &DataBlock) -> Result { + let tracer = opentelemetry::global::tracer("garage"); + let write_size = data.inner_buffer().len() as u64; - let res = self - .mutation_lock + let res = self.mutation_lock[hash.as_slice()[0] as usize] .lock() + .with_context(Context::current_with_span( + tracer.start("Acquire mutation_lock"), + )) .await .write_block(hash, data, self) .bound_record_duration(&self.metrics.block_write_duration) + .with_context(Context::current_with_span( + tracer.start("BlockManagerLocked::write_block"), + )) .await?; self.metrics.bytes_written.add(write_size); @@ -370,7 +375,7 @@ impl BlockManager { if data.verify(*hash).is_err() { self.metrics.corruption_counter.add(1); - self.mutation_lock + self.mutation_lock[hash.as_slice()[0] as usize] .lock() .await .move_block_to_corrupted(hash, self) @@ -384,8 +389,7 @@ impl BlockManager { /// Check if this node should have a block, but don't actually have it async fn need_block(&self, hash: &Hash) -> Result { - let BlockStatus { exists, needed } = self - .mutation_lock + let BlockStatus { exists, needed } = self.mutation_lock[hash.as_slice()[0] as usize] .lock() .await .check_block_status(hash, self) @@ -608,8 +612,7 @@ impl BlockManager { } async fn resync_block(&self, hash: &Hash) -> Result<(), Error> { - let BlockStatus { exists, needed } = self - .mutation_lock + let BlockStatus { exists, needed } = self.mutation_lock[hash.as_slice()[0] as usize] .lock() .await .check_block_status(hash, self) @@ -694,7 +697,7 @@ impl BlockManager { who.len() ); - self.mutation_lock + self.mutation_lock[hash.as_slice()[0] as usize] .lock() .await .delete_if_unneeded(hash, self) diff --git a/src/model/garage.rs b/src/model/garage.rs index 15769a17..0d239df6 100644 --- a/src/model/garage.rs +++ b/src/model/garage.rs @@ -104,11 +104,16 @@ impl Garage { std::fs::create_dir_all(&db_path).expect("Unable to create LMDB data directory"); let map_size = garage_db::lmdb_adapter::recommended_map_size(); - let db = db::lmdb_adapter::heed::EnvOpenOptions::new() - .max_dbs(100) - .map_size(map_size) - .open(&db_path) - .expect("Unable to open LMDB DB"); + use db::lmdb_adapter::heed; + let mut env_builder = heed::EnvOpenOptions::new(); + env_builder.max_dbs(100); + env_builder.max_readers(500); + env_builder.map_size(map_size); + unsafe { + env_builder.flag(heed::flags::Flags::MdbNoSync); + env_builder.flag(heed::flags::Flags::MdbNoMetaSync); + } + let db = env_builder.open(&db_path).expect("Unable to open LMDB DB"); db::lmdb_adapter::LmdbDb::init(db) } e => { diff --git a/src/util/async_hash.rs b/src/util/async_hash.rs index 67776eb9..be0535de 100644 --- a/src/util/async_hash.rs +++ b/src/util/async_hash.rs @@ -1,7 +1,7 @@ use bytes::Bytes; use digest::Digest; -use tokio::sync::mpsc; +use tokio::sync::{mpsc, oneshot}; use tokio::task::JoinHandle; use crate::data::*; @@ -27,25 +27,28 @@ pub async fn async_blake2sum(data: Bytes) -> Hash { // ---- pub struct AsyncHasher { - sendblk: mpsc::UnboundedSender, + sendblk: mpsc::UnboundedSender<(Bytes, oneshot::Sender<()>)>, task: JoinHandle>, } impl AsyncHasher { pub fn new() -> Self { - let (sendblk, mut recvblk) = mpsc::unbounded_channel::(); + let (sendblk, mut recvblk) = mpsc::unbounded_channel::<(Bytes, oneshot::Sender<()>)>(); let task = tokio::task::spawn_blocking(move || { let mut digest = D::new(); - while let Some(blk) = recvblk.blocking_recv() { + while let Some((blk, ch)) = recvblk.blocking_recv() { digest.update(&blk[..]); + let _ = ch.send(()); } digest.finalize() }); Self { sendblk, task } } - pub fn update(&self, b: Bytes) { - self.sendblk.send(b).unwrap() + pub async fn update(&self, b: Bytes) { + let (tx, rx) = oneshot::channel(); + self.sendblk.send((b, tx)).unwrap(); + let _ = rx.await; } pub async fn finalize(self) -> digest::Output {