forked from Deuxfleurs/garage
layout/sync: fix bugs and add tracing
This commit is contained in:
parent
ce89d1ddab
commit
df24bb806d
3 changed files with 48 additions and 25 deletions
|
@ -131,7 +131,8 @@ impl LayoutHistory {
|
|||
pub(crate) fn cleanup_old_versions(&mut self) {
|
||||
let min_sync_ack = self.calculate_global_min(&self.update_trackers.sync_ack_map);
|
||||
while self.versions.first().as_ref().unwrap().version < min_sync_ack {
|
||||
self.versions.remove(0);
|
||||
let removed = self.versions.remove(0);
|
||||
info!("Layout history: pruning old version {}", removed.version);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -133,7 +133,7 @@ impl LayoutManager {
|
|||
pub fn sync_table_until(self: &Arc<Self>, table_name: &'static str, version: u64) {
|
||||
let mut table_sync_version = self.table_sync_version.lock().unwrap();
|
||||
*table_sync_version.get_mut(table_name).unwrap() = version;
|
||||
let sync_until = table_sync_version.iter().map(|(_, v)| *v).max().unwrap();
|
||||
let sync_until = table_sync_version.iter().map(|(_, v)| *v).min().unwrap();
|
||||
drop(table_sync_version);
|
||||
|
||||
let mut layout = self.layout.write().unwrap();
|
||||
|
@ -142,6 +142,7 @@ impl LayoutManager {
|
|||
.sync_map
|
||||
.set_max(self.node_id, sync_until)
|
||||
{
|
||||
debug!("sync_until updated to {}", sync_until);
|
||||
layout.update_hashes();
|
||||
self.broadcast_update(SystemRpc::AdvertiseClusterLayoutTrackers(
|
||||
layout.update_trackers.clone(),
|
||||
|
@ -277,7 +278,12 @@ impl LayoutManager {
|
|||
self: &Arc<Self>,
|
||||
adv: &LayoutHistory,
|
||||
) -> Result<SystemRpc, Error> {
|
||||
debug!("handle_advertise_cluster_layout: {:?}", adv);
|
||||
debug!(
|
||||
"handle_advertise_cluster_layout: {} versions, last={}, trackers={:?}",
|
||||
adv.versions.len(),
|
||||
adv.current().version,
|
||||
adv.update_trackers
|
||||
);
|
||||
|
||||
if adv.current().replication_factor != self.replication_factor {
|
||||
let msg = format!(
|
||||
|
|
|
@ -488,8 +488,29 @@ struct SyncWorker<F: TableSchema, R: TableReplication> {
|
|||
}
|
||||
|
||||
impl<F: TableSchema, R: TableReplication> SyncWorker<F, R> {
|
||||
fn check_add_full_sync(&mut self) {
|
||||
let layout_versions = self.syncer.system.cluster_layout().sync_versions();
|
||||
if layout_versions != self.layout_versions {
|
||||
self.layout_versions = layout_versions;
|
||||
info!(
|
||||
"({}) Layout versions changed (max={}, ack={}, min stored={}), adding full sync to syncer todo list",
|
||||
F::TABLE_NAME,
|
||||
layout_versions.0,
|
||||
layout_versions.1,
|
||||
layout_versions.2
|
||||
);
|
||||
self.add_full_sync();
|
||||
}
|
||||
}
|
||||
|
||||
fn add_full_sync(&mut self) {
|
||||
let mut partitions = self.syncer.data.replication.sync_partitions();
|
||||
info!(
|
||||
"{}: Adding full sync for ack layout version {}",
|
||||
F::TABLE_NAME,
|
||||
partitions.layout_version
|
||||
);
|
||||
|
||||
partitions.partitions.shuffle(&mut thread_rng());
|
||||
self.todo = Some(partitions);
|
||||
self.next_full_sync = Instant::now() + ANTI_ENTROPY_INTERVAL;
|
||||
|
@ -510,6 +531,8 @@ impl<F: TableSchema, R: TableReplication> Worker for SyncWorker<F, R> {
|
|||
}
|
||||
|
||||
async fn work(&mut self, must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> {
|
||||
self.check_add_full_sync();
|
||||
|
||||
if let Some(todo) = &mut self.todo {
|
||||
let partition = todo.partitions.pop().unwrap();
|
||||
|
||||
|
@ -531,20 +554,24 @@ impl<F: TableSchema, R: TableReplication> Worker for SyncWorker<F, R> {
|
|||
return Err(e);
|
||||
}
|
||||
|
||||
// done
|
||||
if !todo.partitions.is_empty() {
|
||||
return Ok(WorkerState::Busy);
|
||||
}
|
||||
|
||||
if todo.partitions.is_empty() {
|
||||
info!(
|
||||
"{}: Completed full sync for ack layout version {}",
|
||||
F::TABLE_NAME,
|
||||
todo.layout_version
|
||||
);
|
||||
self.syncer
|
||||
.system
|
||||
.layout_manager
|
||||
.sync_table_until(F::TABLE_NAME, todo.layout_version);
|
||||
self.todo = None;
|
||||
}
|
||||
|
||||
self.todo = None;
|
||||
Ok(WorkerState::Busy)
|
||||
} else {
|
||||
Ok(WorkerState::Idle)
|
||||
}
|
||||
}
|
||||
|
||||
async fn wait_for_work(&mut self) -> WorkerState {
|
||||
select! {
|
||||
|
@ -554,18 +581,7 @@ impl<F: TableSchema, R: TableReplication> Worker for SyncWorker<F, R> {
|
|||
}
|
||||
},
|
||||
_ = self.layout_notify.notified() => {
|
||||
let layout_versions = self.syncer.system.cluster_layout().sync_versions();
|
||||
if layout_versions != self.layout_versions {
|
||||
self.layout_versions = layout_versions;
|
||||
debug!(
|
||||
"({}) Layout versions changed (max={}, ack={}, min stored={}), adding full sync to syncer todo list",
|
||||
F::TABLE_NAME,
|
||||
layout_versions.0,
|
||||
layout_versions.1,
|
||||
layout_versions.2
|
||||
);
|
||||
self.add_full_sync();
|
||||
}
|
||||
self.check_add_full_sync();
|
||||
},
|
||||
_ = tokio::time::sleep_until(self.next_full_sync.into()) => {
|
||||
self.add_full_sync();
|
||||
|
|
Loading…
Reference in a new issue