@@ -52,6 +52,32 @@ pub use self::rng::SledPlannerRng;
5252mod omicron_zone_placement;
5353pub ( crate ) mod rng;
5454
55+ /// Maximum number of MGS-managed updates (updates to SP, RoT, RoT bootloader,
56+ /// or host OS) that we allow to be pending across the whole system at one time
57+ ///
58+ /// For now, we limit this to 1 for safety. That's for a few reasons:
59+ ///
60+ /// - SP updates reboot the corresponding host. Thus, if we have one of these
61+ /// updates outstanding, we should assume that host may be offline. Most
62+ /// control plane services are designed to survive multiple failures (e.g.,
63+ /// the Cockroach cluster can sustain two failures and stay online), but
64+ /// having one sled offline eats into that margin. And some services like
65+ /// Crucible volumes can only sustain one failure. Taking down two sleds
66+ /// would render unavailable any Crucible volumes with regions on those two
67+ /// sleds.
68+ ///
69+ /// - There is unfortunately some risk in updating the RoT bootloader, in that
70+ /// there's a window where a failure could render the device unbootable. See
71+ /// oxidecomputer/omicron#7819 for more on this. Updating only one at a time
72+ /// helps mitigate this risk.
73+ ///
74+ /// More sophisticated schemes are certainly possible (e.g., allocate Crucible
75+ /// regions in such a way that there are at least pairs of sleds we could update
76+ /// concurrently without taking volumes down; and/or be willing to update
77+ /// multiple sleds as long as they don't have overlapping control plane
78+ /// services, etc.).
79+ const NUM_CONCURRENT_MGS_UPDATES : usize = 1 ;
80+
5581enum UpdateStepResult {
5682 ContinueToNextStep ,
5783 Waiting ,
@@ -122,8 +148,7 @@ impl<'a> Planner<'a> {
122148 self . do_plan_expunge ( ) ?;
123149 self . do_plan_add ( ) ?;
124150 self . do_plan_decommission ( ) ?;
125- if let UpdateStepResult :: ContinueToNextStep =
126- self . do_plan_mgs_updates ( ) ?
151+ if let UpdateStepResult :: ContinueToNextStep = self . do_plan_mgs_updates ( )
127152 {
128153 self . do_plan_zone_updates ( ) ?;
129154 }
@@ -914,7 +939,7 @@ impl<'a> Planner<'a> {
914939
915940 /// Update at most one MGS-managed device (SP, RoT, etc.), if any are out of
916941 /// date.
917- fn do_plan_mgs_updates ( & mut self ) -> Result < UpdateStepResult , Error > {
942+ fn do_plan_mgs_updates ( & mut self ) -> UpdateStepResult {
918943 // Determine which baseboards we will consider updating.
919944 //
920945 // Sleds may be present but not adopted as part of the control plane.
@@ -956,7 +981,7 @@ impl<'a> Planner<'a> {
956981 & included_baseboards,
957982 & current_updates,
958983 current_artifacts,
959- 1 ,
984+ NUM_CONCURRENT_MGS_UPDATES ,
960985 ) ;
961986
962987 // TODO This is not quite right. See oxidecomputer/omicron#8285.
@@ -966,7 +991,7 @@ impl<'a> Planner<'a> {
966991 UpdateStepResult :: Waiting
967992 } ;
968993 self . blueprint . pending_mgs_updates_replace_all ( next) ;
969- Ok ( rv )
994+ rv
970995 }
971996
972997 /// Update at most one existing zone to use a new image source.
0 commit comments