From fcf094d0ba1daee9d8bdf1919c2ca4bb34703b53 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Mon, 27 Oct 2025 17:01:37 -0400 Subject: [PATCH 1/2] sled-agent: Move `UnderlayInfo` into `SwitchZoneConfig` --- sled-agent/src/services.rs | 75 ++++++++++++++++++++------------------ 1 file changed, 39 insertions(+), 36 deletions(-) diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index daadb57008..75d187dc56 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -443,6 +443,7 @@ struct SwitchZoneConfig { id: Uuid, addresses: Vec, services: Vec, + underlay_info: Option, } /// Describes one of several services that may be deployed in a switch zone @@ -2444,7 +2445,7 @@ impl ServiceManager { bootstrap_name_and_address: Option<(String, Ipv6Addr)>, device_names: &[String], ) -> Result { - let SwitchZoneConfig { id, services, addresses } = config; + let SwitchZoneConfig { id, services, addresses, .. } = config; let disabled_dns_client_service = ServiceBuilder::new("network/dns/client") @@ -3265,16 +3266,14 @@ impl ServiceManager { }; addresses.push(Ipv6Addr::LOCALHOST); - let request = - SwitchZoneConfig { id: Uuid::new_v4(), addresses, services }; - - self.ensure_switch_zone( - Some(request), - filesystems, - data_links, + let request = SwitchZoneConfig { + id: Uuid::new_v4(), + addresses, + services, underlay_info, - ) - .await?; + }; + + self.ensure_switch_zone(Some(request), filesystems, data_links).await?; Ok(()) } @@ -3494,8 +3493,6 @@ impl ServiceManager { vec![], // data_links= vec![], - // underlay_info= - None, ) .await } @@ -3514,7 +3511,6 @@ impl ServiceManager { request: SwitchZoneConfig, filesystems: Vec, data_links: Vec, - underlay_info: Option, ) { let (exit_tx, exit_rx) = oneshot::channel(); *zone = SwitchZoneState::Initializing { @@ -3524,8 +3520,7 @@ impl ServiceManager { worker: Some(Task { exit_tx, initializer: tokio::task::spawn(async move { - self.initialize_switch_zone_loop(underlay_info, exit_rx) - .await + self.initialize_switch_zone_loop(exit_rx).await }), }), }; @@ -3537,7 +3532,6 @@ impl ServiceManager { request: Option, filesystems: Vec, data_links: Vec, - underlay_info: Option, ) -> Result<(), Error> { let log = &self.inner.log; @@ -3552,7 +3546,6 @@ impl ServiceManager { request, filesystems, data_links, - underlay_info, ); } ( @@ -3918,7 +3911,7 @@ impl ServiceManager { // We also need to ensure any uplinks are configured. Spawn a // task that goes into an infinite retry loop until it succeeds. - if let Some(underlay_info) = underlay_info { + if let Some(underlay_info) = request.underlay_info.clone() { if let Some(old_worker) = worker.take() { old_worker.stop().await; } @@ -3978,7 +3971,7 @@ impl ServiceManager { async fn try_initialize_switch_zone( &self, sled_zone: &mut SwitchZoneState, - ) -> Result<(), Error> { + ) -> Result, Error> { let SwitchZoneState::Initializing { request, filesystems, @@ -3986,7 +3979,7 @@ impl ServiceManager { .. } = &*sled_zone else { - return Ok(()); + return Ok(None); }; // The switch zone must use the ramdisk in order to receive requests @@ -4003,19 +3996,19 @@ impl ServiceManager { let zone = self .initialize_zone(zone_args, zone_root_path, filesystems, data_links) .await?; + let underlay_info = request.underlay_info.clone(); *sled_zone = SwitchZoneState::Running { request: request.clone(), zone: Box::new(zone), worker: None, }; - Ok(()) + Ok(underlay_info) } // Body of a tokio task responsible for running until the switch zone is // inititalized, or it has been told to stop. async fn initialize_switch_zone_loop( &self, - underlay_info: Option, mut exit_rx: oneshot::Receiver<()>, ) { // We don't really expect failures trying to initialize the switch zone @@ -4025,13 +4018,25 @@ impl ServiceManager { // First, go into a loop to bring up the switch zone; retry until we // succeed or are told to give up via `exit_rx`. - loop { + let underlay_info = loop { { let mut sled_zone = self.inner.switch_zone.lock().await; match self.try_initialize_switch_zone(&mut sled_zone).await { - Ok(()) => { - info!(self.inner.log, "initialized switch zone"); - break; + Ok(None) => { + info!( + self.inner.log, + "initialized switch zone \ + (no underlay info available yet)", + ); + return; + } + Ok(Some(underlay_info)) => { + info!( + self.inner.log, + "initialized switch zone (underlay info \ + available: will attempt uplink configuration)", + ); + break underlay_info; } Err(e) => { warn!( @@ -4060,17 +4065,15 @@ impl ServiceManager { continue; } }; - } + }; - // Then, if we have underlay info, go into a loop trying to configure - // our uplinks. As above, retry until we succeed or are told to stop. - if let Some(underlay_info) = underlay_info { - self.ensure_switch_zone_uplinks_configured_loop( - &underlay_info, - exit_rx, - ) - .await; - } + // Then go into a loop trying to configure our uplinks. As above, retry + // until we succeed or are told to stop. + self.ensure_switch_zone_uplinks_configured_loop( + &underlay_info, + exit_rx, + ) + .await; } } From 07aa525f9781f5fdb163b3d2c619790f01ab2e16 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Mon, 27 Oct 2025 17:56:32 -0400 Subject: [PATCH 2/2] don't drop worker task on transition --- sled-agent/src/services.rs | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index 75d187dc56..80833805d1 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -3976,8 +3976,8 @@ impl ServiceManager { request, filesystems, data_links, - .. - } = &*sled_zone + worker, + } = sled_zone else { return Ok(None); }; @@ -3997,10 +3997,19 @@ impl ServiceManager { .initialize_zone(zone_args, zone_root_path, filesystems, data_links) .await?; let underlay_info = request.underlay_info.clone(); + + // Even though we've initialized the zone, the `worker` task may still + // be running to configure uplinks. If we drop `worker` now it will + // cause that task to exit before it gets a chance to do so. This is all + // very unsatisfying and needs some serious rework: + // https://github.com/oxidecomputer/omicron/issues/8970 and + // https://github.com/oxidecomputer/omicron/issues/9182 are strongly + // related. + let worker = worker.take(); *sled_zone = SwitchZoneState::Running { request: request.clone(), zone: Box::new(zone), - worker: None, + worker, }; Ok(underlay_info) }