layout/sync: fix bugs and add tracing
Some checks failed
continuous-integration/drone/pr Build is failing
continuous-integration/drone/push Build is failing

This commit is contained in:
Alex 2023-11-11 12:37:33 +01:00
parent ce89d1ddab
commit df24bb806d
Signed by: lx
GPG key ID: 0E496D15096376BE
3 changed files with 48 additions and 25 deletions

View file

@ -131,7 +131,8 @@ impl LayoutHistory {
pub(crate) fn cleanup_old_versions(&mut self) { pub(crate) fn cleanup_old_versions(&mut self) {
let min_sync_ack = self.calculate_global_min(&self.update_trackers.sync_ack_map); let min_sync_ack = self.calculate_global_min(&self.update_trackers.sync_ack_map);
while self.versions.first().as_ref().unwrap().version < min_sync_ack { while self.versions.first().as_ref().unwrap().version < min_sync_ack {
self.versions.remove(0); let removed = self.versions.remove(0);
info!("Layout history: pruning old version {}", removed.version);
} }
} }

View file

@ -133,7 +133,7 @@ impl LayoutManager {
pub fn sync_table_until(self: &Arc<Self>, table_name: &'static str, version: u64) { pub fn sync_table_until(self: &Arc<Self>, table_name: &'static str, version: u64) {
let mut table_sync_version = self.table_sync_version.lock().unwrap(); let mut table_sync_version = self.table_sync_version.lock().unwrap();
*table_sync_version.get_mut(table_name).unwrap() = version; *table_sync_version.get_mut(table_name).unwrap() = version;
let sync_until = table_sync_version.iter().map(|(_, v)| *v).max().unwrap(); let sync_until = table_sync_version.iter().map(|(_, v)| *v).min().unwrap();
drop(table_sync_version); drop(table_sync_version);
let mut layout = self.layout.write().unwrap(); let mut layout = self.layout.write().unwrap();
@ -142,6 +142,7 @@ impl LayoutManager {
.sync_map .sync_map
.set_max(self.node_id, sync_until) .set_max(self.node_id, sync_until)
{ {
debug!("sync_until updated to {}", sync_until);
layout.update_hashes(); layout.update_hashes();
self.broadcast_update(SystemRpc::AdvertiseClusterLayoutTrackers( self.broadcast_update(SystemRpc::AdvertiseClusterLayoutTrackers(
layout.update_trackers.clone(), layout.update_trackers.clone(),
@ -277,7 +278,12 @@ impl LayoutManager {
self: &Arc<Self>, self: &Arc<Self>,
adv: &LayoutHistory, adv: &LayoutHistory,
) -> Result<SystemRpc, Error> { ) -> Result<SystemRpc, Error> {
debug!("handle_advertise_cluster_layout: {:?}", adv); debug!(
"handle_advertise_cluster_layout: {} versions, last={}, trackers={:?}",
adv.versions.len(),
adv.current().version,
adv.update_trackers
);
if adv.current().replication_factor != self.replication_factor { if adv.current().replication_factor != self.replication_factor {
let msg = format!( let msg = format!(

View file

@ -488,8 +488,29 @@ struct SyncWorker<F: TableSchema, R: TableReplication> {
} }
impl<F: TableSchema, R: TableReplication> SyncWorker<F, R> { impl<F: TableSchema, R: TableReplication> SyncWorker<F, R> {
/// Compares the cluster layout's current sync versions with the value cached
/// in `self.layout_versions`. When they differ, the cached value is replaced,
/// the change is logged at info level, and `add_full_sync` is called.
///
/// Does nothing when the versions are unchanged.
fn check_add_full_sync(&mut self) {
let layout_versions = self.syncer.system.cluster_layout().sync_versions();
// Only react when the value differs from the one last recorded by this worker.
if layout_versions != self.layout_versions {
self.layout_versions = layout_versions;
// The three fields are labelled max, ack and min stored by the log message below.
info!(
"({}) Layout versions changed (max={}, ack={}, min stored={}), adding full sync to syncer todo list",
F::TABLE_NAME,
layout_versions.0,
layout_versions.1,
layout_versions.2
);
self.add_full_sync();
}
}
fn add_full_sync(&mut self) { fn add_full_sync(&mut self) {
let mut partitions = self.syncer.data.replication.sync_partitions(); let mut partitions = self.syncer.data.replication.sync_partitions();
info!(
"{}: Adding full sync for ack layout version {}",
F::TABLE_NAME,
partitions.layout_version
);
partitions.partitions.shuffle(&mut thread_rng()); partitions.partitions.shuffle(&mut thread_rng());
self.todo = Some(partitions); self.todo = Some(partitions);
self.next_full_sync = Instant::now() + ANTI_ENTROPY_INTERVAL; self.next_full_sync = Instant::now() + ANTI_ENTROPY_INTERVAL;
@ -510,6 +531,8 @@ impl<F: TableSchema, R: TableReplication> Worker for SyncWorker<F, R> {
} }
async fn work(&mut self, must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> { async fn work(&mut self, must_exit: &mut watch::Receiver<bool>) -> Result<WorkerState, Error> {
self.check_add_full_sync();
if let Some(todo) = &mut self.todo { if let Some(todo) = &mut self.todo {
let partition = todo.partitions.pop().unwrap(); let partition = todo.partitions.pop().unwrap();
@ -531,20 +554,24 @@ impl<F: TableSchema, R: TableReplication> Worker for SyncWorker<F, R> {
return Err(e); return Err(e);
} }
// done if todo.partitions.is_empty() {
if !todo.partitions.is_empty() { info!(
return Ok(WorkerState::Busy); "{}: Completed full sync for ack layout version {}",
} F::TABLE_NAME,
todo.layout_version
);
self.syncer self.syncer
.system .system
.layout_manager .layout_manager
.sync_table_until(F::TABLE_NAME, todo.layout_version); .sync_table_until(F::TABLE_NAME, todo.layout_version);
self.todo = None;
} }
self.todo = None; Ok(WorkerState::Busy)
} else {
Ok(WorkerState::Idle) Ok(WorkerState::Idle)
} }
}
async fn wait_for_work(&mut self) -> WorkerState { async fn wait_for_work(&mut self) -> WorkerState {
select! { select! {
@ -554,18 +581,7 @@ impl<F: TableSchema, R: TableReplication> Worker for SyncWorker<F, R> {
} }
}, },
_ = self.layout_notify.notified() => { _ = self.layout_notify.notified() => {
let layout_versions = self.syncer.system.cluster_layout().sync_versions(); self.check_add_full_sync();
if layout_versions != self.layout_versions {
self.layout_versions = layout_versions;
debug!(
"({}) Layout versions changed (max={}, ack={}, min stored={}), adding full sync to syncer todo list",
F::TABLE_NAME,
layout_versions.0,
layout_versions.1,
layout_versions.2
);
self.add_full_sync();
}
}, },
_ = tokio::time::sleep_until(self.next_full_sync.into()) => { _ = tokio::time::sleep_until(self.next_full_sync.into()) => {
self.add_full_sync(); self.add_full_sync();