diff --git a/common/src/api/external/error.rs b/common/src/api/external/error.rs
index dc1459f46aa..e508b7ecba2 100644
--- a/common/src/api/external/error.rs
+++ b/common/src/api/external/error.rs
@@ -61,6 +61,9 @@ pub enum Error {
     #[error("Type version mismatch! {internal_message}")]
     TypeVersionMismatch { internal_message: String },
+
+    #[error("Conflict: {internal_message}")]
+    Conflict { internal_message: String },
 }
 
 /// Indicates how an object was looked up (for an `ObjectNotFound` error)
@@ -118,7 +121,8 @@ impl Error {
             | Error::Forbidden
             | Error::MethodNotAllowed { .. }
             | Error::InternalError { .. }
-            | Error::TypeVersionMismatch { .. } => false,
+            | Error::TypeVersionMismatch { .. }
+            | Error::Conflict { .. } => false,
         }
     }
 
@@ -174,6 +178,18 @@ impl Error {
         Error::TypeVersionMismatch { internal_message: message.to_owned() }
     }
 
+    /// Generates an [`Error::Conflict`] with a specific message.
+    ///
+    /// This is used in cases where a request cannot proceed because the target
+    /// resource is currently in a state that's incompatible with that request,
+    /// but where the request might succeed if it is retried or modified and
+    /// retried. The internal message should provide more information about the
+    /// source of the conflict and possible actions the caller can take to
+    /// resolve it (if any).
+    pub fn conflict(message: &str) -> Error {
+        Error::Conflict { internal_message: message.to_owned() }
+    }
+
     /// Given an [`Error`] with an internal message, return the same error with
     /// `context` prepended to it to provide more context
     ///
@@ -223,6 +239,9 @@ impl Error {
                     ),
                 }
             }
+            Error::Conflict { internal_message } => Error::Conflict {
+                internal_message: format!("{}: {}", context, internal_message),
+            },
         }
     }
 }
@@ -317,6 +336,14 @@ impl From<Error> for HttpError {
             Error::TypeVersionMismatch { internal_message } => {
                 HttpError::for_internal_error(internal_message)
             }
+
+            Error::Conflict { internal_message } => {
+                HttpError::for_client_error(
+                    Some(String::from("Conflict")),
+                    http::StatusCode::CONFLICT,
+                    internal_message,
+                )
+            }
         }
     }
 }
diff --git a/nexus/db-model/src/instance.rs b/nexus/db-model/src/instance.rs
index 1ee25ec8b6e..5d722912da6 100644
--- a/nexus/db-model/src/instance.rs
+++ b/nexus/db-model/src/instance.rs
@@ -98,7 +98,10 @@ impl DatastoreAttachTargetConfig for Instance {
     Serialize,
     Deserialize,
 )]
-#[diesel(table_name = instance)]
+// N.B. Setting `treat_none_as_null` is required for these fields to be cleared
+// properly during live migrations. See the documentation for
+// `diesel::prelude::AsChangeset`.
+#[diesel(table_name = instance, treat_none_as_null = true)]
 pub struct InstanceRuntimeState {
     /// The instance's current user-visible instance state.
     ///
diff --git a/nexus/db-model/src/sled.rs b/nexus/db-model/src/sled.rs
index 68387625431..22d1b1ddd35 100644
--- a/nexus/db-model/src/sled.rs
+++ b/nexus/db-model/src/sled.rs
@@ -143,3 +143,54 @@ impl DatastoreCollectionConfig for Sled {
     type CollectionTimeDeletedColumn = sled::dsl::time_deleted;
     type CollectionIdColumn = service::dsl::sled_id;
 }
+
+/// A set of constraints that can be placed on operations that select a sled.
+#[derive(Debug)]
+pub struct SledReservationConstraints {
+    must_select_from: Vec<Uuid>,
+}
+
+impl SledReservationConstraints {
+    /// Creates a constraint set with no constraints in it.
+    pub fn none() -> Self {
+        Self { must_select_from: Vec::new() }
+    }
+
+    /// If the constraints include a set of sleds that the caller must select
+    /// from, returns `Some` and a slice containing the members of that set.
+ /// + /// If no "must select from these" constraint exists, returns None. + pub fn must_select_from(&self) -> Option<&[Uuid]> { + if self.must_select_from.is_empty() { + None + } else { + Some(&self.must_select_from) + } + } +} + +#[derive(Debug)] +pub struct SledReservationConstraintBuilder { + constraints: SledReservationConstraints, +} + +impl SledReservationConstraintBuilder { + pub fn new() -> Self { + SledReservationConstraintBuilder { + constraints: SledReservationConstraints::none(), + } + } + + /// Adds a "must select from the following sled IDs" constraint. If such a + /// constraint already exists, appends the supplied sled IDs to the "must + /// select from" list. + pub fn must_select_from(mut self, sled_ids: &[Uuid]) -> Self { + self.constraints.must_select_from.extend(sled_ids); + self + } + + /// Builds a set of constraints from this builder's current state. + pub fn build(self) -> SledReservationConstraints { + self.constraints + } +} diff --git a/nexus/db-queries/src/db/datastore/sled.rs b/nexus/db-queries/src/db/datastore/sled.rs index 94a1334f19c..9ab3e026d18 100644 --- a/nexus/db-queries/src/db/datastore/sled.rs +++ b/nexus/db-queries/src/db/datastore/sled.rs @@ -76,6 +76,7 @@ impl DataStore { resource_id: Uuid, resource_kind: db::model::SledResourceKind, resources: db::model::Resources, + constraints: db::model::SledReservationConstraints, ) -> CreateResult { #[derive(Debug)] enum SledReservationError { @@ -120,10 +121,10 @@ impl DataStore { resource_dsl::rss_ram::NAME )) + resources.rss_ram) .le(sled_dsl::usable_physical_ram); - sql_function!(fn random() -> diesel::sql_types::Float); - let sled_targets = sled_dsl::sled - // LEFT JOIN so we can observe sleds with no - // currently-allocated resources as potential targets + + // Generate a query describing all of the sleds that have space + // for this reservation. + let mut sled_targets = sled_dsl::sled .left_join( resource_dsl::sled_resource .on(resource_dsl::sled_id.eq(sled_dsl::id)), @@ -135,6 +136,17 @@ impl DataStore { ) .filter(sled_dsl::time_deleted.is_null()) .select(sled_dsl::id) + .into_boxed(); + + // Further constrain the sled IDs according to any caller- + // supplied constraints. 
+ if let Some(must_select_from) = constraints.must_select_from() { + sled_targets = sled_targets + .filter(sled_dsl::id.eq_any(must_select_from.to_vec())); + } + + sql_function!(fn random() -> diesel::sql_types::Float); + let sled_targets = sled_targets .order(random()) .limit(1) .get_results_async::(&conn) diff --git a/nexus/src/app/instance.rs b/nexus/src/app/instance.rs index 1248dbf0853..803269b311b 100644 --- a/nexus/src/app/instance.rs +++ b/nexus/src/app/instance.rs @@ -35,6 +35,8 @@ use omicron_common::api::external::NameOrId; use omicron_common::api::external::UpdateResult; use omicron_common::api::external::Vni; use omicron_common::api::internal::nexus; +use sled_agent_client::types::InstanceMigrationSourceParams; +use sled_agent_client::types::InstancePutMigrationIdsBody; use sled_agent_client::types::InstancePutStateBody; use sled_agent_client::types::InstanceStateRequested; use sled_agent_client::types::SourceNatConfig; @@ -51,6 +53,11 @@ use uuid::Uuid; const MAX_KEYS_PER_INSTANCE: u32 = 8; +pub(crate) enum WriteBackUpdatedInstance { + WriteBack, + Drop, +} + impl super::Nexus { pub fn instance_lookup<'a>( &'a self, @@ -267,13 +274,29 @@ impl super::Nexus { instance_lookup: &lookup::Instance<'_>, params: params::InstanceMigrate, ) -> UpdateResult { - let (.., authz_instance) = - instance_lookup.lookup_for(authz::Action::Modify).await?; + let (.., authz_instance, db_instance) = + instance_lookup.fetch_for(authz::Action::Modify).await?; + + if db_instance.runtime().state.0 != InstanceState::Running { + return Err(Error::invalid_request( + "instance must be running before it can migrate", + )); + } + + if db_instance.runtime().sled_id == params.dst_sled_id { + return Err(Error::invalid_request( + "instance is already running on destination sled", + )); + } + + if db_instance.runtime().migration_id.is_some() { + return Err(Error::unavail("instance is already migrating")); + } // Kick off the migration saga let saga_params = sagas::instance_migrate::Params { serialized_authn: authn::saga::Serialized::for_opctx(opctx), - instance_id: authz_instance.id(), + instance: db_instance, migrate_params: params, }; self.execute_saga::( @@ -287,39 +310,107 @@ impl super::Nexus { self.db_datastore.instance_refetch(opctx, &authz_instance).await } - /// Idempotently place the instance in a 'Migrating' state. - pub async fn instance_start_migrate( + /// Attempts to set the migration IDs for the supplied instance via the + /// sled specified in `db_instance`. + /// + /// The caller is assumed to have fetched the current instance record from + /// the DB and verified that the record has no migration IDs. + /// + /// Returns `Ok` and the updated instance record if this call successfully + /// updated the instance with the sled agent and that update was + /// successfully reflected into CRDB. Returns `Err` with an appropriate + /// error otherwise. + /// + /// # Panics + /// + /// Asserts that `db_instance` has no migration ID or destination Propolis + /// ID set. 
+ pub async fn instance_set_migration_ids( &self, - _opctx: &OpContext, - _instance_id: Uuid, - _migration_id: Uuid, - _dst_propolis_id: Uuid, + opctx: &OpContext, + instance_id: Uuid, + db_instance: &db::model::Instance, + migration_params: InstanceMigrationSourceParams, ) -> UpdateResult { - todo!("Migration endpoint not yet implemented in sled agent"); + assert!(db_instance.runtime().migration_id.is_none()); + assert!(db_instance.runtime().dst_propolis_id.is_none()); - /* - let (.., authz_instance, db_instance) = - LookupPath::new(opctx, &self.db_datastore) - .instance_id(instance_id) - .fetch() - .await - .unwrap(); - let requested = InstanceRuntimeStateRequested { - run_state: InstanceStateRequested::Migrating, - migration_params: Some(InstanceRuntimeStateMigrateParams { - migration_id, - dst_propolis_id, - }), - }; - self.instance_set_runtime( - opctx, - &authz_instance, - &db_instance, - requested, - ) - .await?; - self.db_datastore.instance_refetch(opctx, &authz_instance).await - */ + let (.., authz_instance) = LookupPath::new(opctx, &self.db_datastore) + .instance_id(instance_id) + .lookup_for(authz::Action::Modify) + .await?; + + let sa = self.instance_sled(&db_instance).await?; + let instance_put_result = sa + .instance_put_migration_ids( + &instance_id, + &InstancePutMigrationIdsBody { + old_runtime: db_instance.runtime().clone().into(), + migration_params: Some(migration_params), + }, + ) + .await + .map(|res| Some(res.into_inner())); + + // Write the updated instance runtime state back to CRDB. If this + // outright fails, this operation fails. If the operation nominally + // succeeds but nothing was updated, this action is outdated and the + // caller should not proceed with migration. + let updated = self + .handle_instance_put_result(&db_instance, instance_put_result) + .await?; + + if updated { + Ok(self + .db_datastore + .instance_refetch(opctx, &authz_instance) + .await?) + } else { + Err(Error::conflict( + "instance is already migrating, or underwent an operation that \ + prevented this migration from proceeding" + )) + } + } + + /// Attempts to clear the migration IDs for the supplied instance via the + /// sled specified in `db_instance`. + /// + /// The supplied instance record must contain valid migration IDs. + /// + /// Returns `Ok` if sled agent accepted the request to clear migration IDs + /// and the resulting attempt to write instance runtime state back to CRDB + /// succeeded. This routine returns `Ok` even if the update was not actually + /// applied (due to a separate generation number change). + /// + /// # Panics + /// + /// Asserts that `db_instance` has a migration ID and destination Propolis + /// ID set. + pub async fn instance_clear_migration_ids( + &self, + instance_id: Uuid, + db_instance: &db::model::Instance, + ) -> Result<(), Error> { + assert!(db_instance.runtime().migration_id.is_some()); + assert!(db_instance.runtime().dst_propolis_id.is_some()); + + let sa = self.instance_sled(&db_instance).await?; + let instance_put_result = sa + .instance_put_migration_ids( + &instance_id, + &InstancePutMigrationIdsBody { + old_runtime: db_instance.runtime().clone().into(), + migration_params: None, + }, + ) + .await + .map(|res| Some(res.into_inner())); + + self.handle_instance_put_result(&db_instance, instance_put_result) + .await?; + + Ok(()) } /// Reboot the specified instance. 
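// Illustrative sketch (not part of this patch): the calling convention
// described in the doc comments above is that a caller fetches the instance
// record, verifies that no migration IDs are set, asks sled agent to set them
// via `instance_set_migration_ids`, and treats `Error::Conflict` as "another
// operation raced with this one and won". Roughly, assuming a Nexus method
// context with `opctx` and a freshly fetched `db_instance` in scope, and
// `dst_propolis_id`/`migration_id` as freshly generated UUIDs:
//
//     if db_instance.runtime().migration_id.is_some() {
//         return Err(Error::unavail("instance is already migrating"));
//     }
//     match self
//         .instance_set_migration_ids(
//             opctx,
//             db_instance.id(),
//             &db_instance,
//             InstanceMigrationSourceParams { dst_propolis_id, migration_id },
//         )
//         .await
//     {
//         Ok(updated) => { /* proceed with migration using `updated` */ }
//         Err(Error::Conflict { .. }) => { /* IDs not applied; bail out */ }
//         Err(e) => return Err(e),
//     }
//
// This mirrors what `sim_set_migration_ids` in the migration saga (later in
// this patch) does.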
@@ -402,6 +493,7 @@ impl super::Nexus { opctx: &OpContext, authz_instance: &authz::Instance, db_instance: &db::model::Instance, + write_back: WriteBackUpdatedInstance, ) -> Result<(), Error> { opctx.authorize(authz::Action::Modify, authz_instance).await?; let sa = self.instance_sled(&db_instance).await?; @@ -409,7 +501,17 @@ impl super::Nexus { .instance_unregister(&db_instance.id()) .await .map(|res| res.into_inner().updated_runtime); - self.handle_instance_put_result(db_instance, result).await.map(|_| ()) + + match write_back { + WriteBackUpdatedInstance::WriteBack => self + .handle_instance_put_result(db_instance, result) + .await + .map(|_| ()), + WriteBackUpdatedInstance::Drop => { + result?; + Ok(()) + } + } } /// Returns the SledAgentClient for the host where this Instance is running. @@ -726,9 +828,11 @@ impl super::Nexus { // what to do with status codes. error!(self.log, "saw {} from instance_put!", e); - // this is unfortunate, but sled_agent_client::Error doesn't - // implement Copy, and can't be match'ed upon below without this - // line. + // Convert to the Omicron API error type. + // + // N.B. The match below assumes that this conversion will turn + // any 400-level error status from sled agent into an + // `Error::InvalidRequest`. let e = e.into(); match &e { @@ -888,6 +992,10 @@ impl super::Nexus { ) -> Result<(), Error> { let log = &self.log; + slog::debug!(log, "received new runtime state from sled agent"; + "instance_id" => %id, + "runtime_state" => ?new_runtime_state); + let result = self .db_datastore .instance_update_runtime(id, &(new_runtime_state.clone().into())) diff --git a/nexus/src/app/sagas/instance_create.rs b/nexus/src/app/sagas/instance_create.rs index 58a75958e70..a3b8a4eae14 100644 --- a/nexus/src/app/sagas/instance_create.rs +++ b/nexus/src/app/sagas/instance_create.rs @@ -3,6 +3,7 @@ // file, You can obtain one at https://mozilla.org/MPL/2.0/. 
use super::{NexusActionContext, NexusSaga, SagaInitError, ACTION_GENERATE_ID}; +use crate::app::instance::WriteBackUpdatedInstance; use crate::app::sagas::declare_saga_actions; use crate::app::sagas::disk_create::{self, SagaDiskCreate}; use crate::app::{ @@ -615,6 +616,7 @@ async fn sic_alloc_server( propolis_id, db::model::SledResourceKind::Instance, resources, + db::model::SledReservationConstraints::none(), ) .await .map_err(ActionError::action_failed)?; @@ -1075,10 +1077,9 @@ async fn ensure_instance_disk_attach_state( pub(super) async fn allocate_sled_ipv6( opctx: &OpContext, sagactx: NexusActionContext, - sled_id_name: &str, + sled_uuid: Uuid, ) -> Result { let osagactx = sagactx.user_data(); - let sled_uuid = sagactx.lookup::(sled_id_name)?; osagactx .datastore() .next_ipv6_address(opctx, sled_uuid) @@ -1145,7 +1146,8 @@ async fn sic_allocate_propolis_ip( &sagactx, ¶ms.serialized_authn, ); - allocate_sled_ipv6(&opctx, sagactx, "server_id").await + let sled_uuid = sagactx.lookup::("server_id")?; + allocate_sled_ipv6(&opctx, sagactx, sled_uuid).await } async fn sic_create_instance_record( @@ -1400,7 +1402,12 @@ async fn sic_instance_ensure_registered_undo( osagactx .nexus() - .instance_ensure_unregistered(&opctx, &authz_instance, &db_instance) + .instance_ensure_unregistered( + &opctx, + &authz_instance, + &db_instance, + WriteBackUpdatedInstance::WriteBack, + ) .await .map_err(ActionError::action_failed)?; diff --git a/nexus/src/app/sagas/instance_migrate.rs b/nexus/src/app/sagas/instance_migrate.rs index 7148a1b9764..b6df0554546 100644 --- a/nexus/src/app/sagas/instance_migrate.rs +++ b/nexus/src/app/sagas/instance_migrate.rs @@ -4,13 +4,20 @@ use super::instance_create::allocate_sled_ipv6; use super::{NexusActionContext, NexusSaga, ACTION_GENERATE_ID}; +use crate::app::instance::WriteBackUpdatedInstance; use crate::app::sagas::declare_saga_actions; -use crate::authn; -use crate::db::identity::Resource; +use crate::db::{identity::Resource, lookup::LookupPath}; use crate::external_api::params; +use crate::{authn, authz, db}; +use omicron_common::api::external::InstanceState; use omicron_common::api::internal::nexus::InstanceRuntimeState; use serde::Deserialize; use serde::Serialize; +use sled_agent_client::types::{ + InstanceMigrationSourceParams, InstanceMigrationTargetParams, + InstanceStateRequested, +}; +use slog::warn; use std::net::Ipv6Addr; use steno::ActionError; use steno::Node; @@ -21,37 +28,148 @@ use uuid::Uuid; #[derive(Debug, Deserialize, Serialize)] pub struct Params { pub serialized_authn: authn::saga::Serialized, - pub instance_id: Uuid, + pub instance: db::model::Instance, pub migrate_params: params::InstanceMigrate, } -// instance migrate saga: actions - +// The migration saga is similar to the instance creation saga: get a +// destination sled, allocate a Propolis process on it, and send it a request to +// initialize via migration, then wait (outside the saga) for this to resolve. +// +// Most of the complexity in this saga comes from the fact that during +// migration, there are two sleds with their own instance runtime states, and +// both the saga and the work that happen after it have to specify carefully +// which of the two participating VMMs is actually running the VM once the +// migration is over. +// +// Only active instances can migrate. While an instance is active on some sled +// (and isn't migrating), that sled's sled agent maintains the instance's +// runtime state and sends updated state to Nexus when it changes. 
At the start +// of this saga, the participating sled agents and CRDB have the following +// runtime states (note that some fields, like the actual Propolis state, are +// not relevant to migration and are omitted here): +// +// | Item | Source | Dest | CRDB | +// |--------------|--------|------|------| +// | Propolis gen | G | None | G | +// | Propolis ID | P1 | None | P1 | +// | Sled ID | S1 | None | S1 | +// | Dst Prop. ID | None | None | None | +// | Migration ID | None | None | None | declare_saga_actions! { instance_migrate; + + RESERVE_RESOURCES -> "server_id" { + + sim_reserve_sled_resources + - sim_release_sled_resources + } + ALLOCATE_PROPOLIS_IP -> "dst_propolis_ip" { + sim_allocate_propolis_ip } - MIGRATE_PREP -> "migrate_instance" { - + sim_migrate_prep + + // This step sets the instance's migration ID and destination Propolis ID + // fields. Because the instance is active, its current sled agent maintains + // the most recent runtime state, so to update it, the saga calls into the + // sled and asks it to produce an updated record with the appropriate + // migration IDs and a new generation number. + // + // Sled agent provides the synchronization here: while this operation is + // idempotent for any single transition between IDs, sled agent ensures that + // if multiple concurrent sagas try to set migration IDs at the same + // Propolis generation, then only one will win and get to proceed through + // the saga. + // + // Once this update completes, the sleds have the following states, and the + // source sled's state will be stored in CRDB: + // + // | Item | Source | Dest | CRDB | + // |--------------|--------|------|------| + // | Propolis gen | G+1 | None | G+1 | + // | Propolis ID | P1 | None | P1 | + // | Sled ID | S1 | None | S1 | + // | Dst Prop. ID | P2 | None | P2 | + // | Migration ID | M | None | M | + // + // Unwinding this step clears the migration IDs using the source sled: + // + // | Item | Source | Dest | CRDB | + // |--------------|--------|------|------| + // | Propolis gen | G+2 | None | G+2 | + // | Propolis ID | P1 | None | P1 | + // | Sled ID | S1 | None | S1 | + // | Dst Prop. ID | None | None | None | + // | Migration ID | None | None | None | + SET_MIGRATION_IDS -> "set_migration_ids" { + + sim_set_migration_ids + - sim_clear_migration_ids } - INSTANCE_MIGRATE -> "instance_migrate" { - // TODO robustness: This needs an undo action - + sim_instance_migrate + + // The instance state on the destination looks like the instance state on + // the source, except that it bears all of the destination's "location" + // information--its Propolis ID, sled ID, and Propolis IP--with the same + // Propolis generation number as the source set in the previous step. + CREATE_DESTINATION_STATE -> "dst_runtime_state" { + + sim_create_destination_state } - V2P_ENSURE -> "v2p_ensure" { - // TODO robustness: This needs an undo action - + sim_v2p_ensure + + // Instantiate the new Propolis on the destination sled. This uses the + // record created in the previous step, so the sleds end up with the + // following state: + // + // | Item | Source | Dest | CRDB | + // |--------------|--------|------|------| + // | Propolis gen | G+1 | G+1 | G+1 | + // | Propolis ID | P1 | P2 | P1 | + // | Sled ID | S1 | S2 | S1 | + // | Dst Prop. ID | P2 | P2 | P2 | + // | Migration ID | M | M | M | + // + // Note that, because the source and destination have the same Propolis + // generation, the destination's record will not be written back to CRDB. 
+ // + // Once the migration completes (whether successfully or not), the sled that + // ends up with the instance will publish an update that clears the + // generation numbers and (on success) updates the Propolis ID pointer. If + // migration succeeds, this produces the following: + // + // | Item | Source | Dest | CRDB | + // |--------------|--------|------|------| + // | Propolis gen | G+1 | G+2 | G+2 | + // | Propolis ID | P1 | P2 | P2 | + // | Sled ID | S1 | S2 | S2 | + // | Dst Prop. ID | P2 | None | None | + // | Migration ID | M | None | None | + // + // The undo step for this node requires special care. Unregistering a + // Propolis from a sled typically increments its Propolis generation number. + // (This is so that Nexus can rudely terminate a Propolis via unregistration + // and end up with the state it would have gotten if the Propolis had shut + // down normally.) If this step unwinds, this will produce the same state + // on the destination as in the previous table, even though no migration + // has started yet. If that update gets written back, then it will write + // Propolis generation G+2 to CRDB (as in the table above) with the wrong + // Propolis ID, and the subsequent request to clear migration IDs will not + // fix it (because the source sled's generation number is still at G+1 and + // will move to G+2, which is not recent enough to push another update). + // + // To avoid this problem, this undo step takes special care not to write + // back the updated record the destination sled returns to it. + ENSURE_DESTINATION_PROPOLIS -> "ensure_destination" { + + sim_ensure_destination_propolis + - sim_ensure_destination_propolis_undo } - CLEANUP_SOURCE -> "cleanup_source" { - // TODO robustness: This needs an undo action. Is it even possible - // to undo at this point? - + sim_cleanup_source + + // Note that this step only requests migration by sending a "migrate in" + // request to the destination sled. It does not wait for migration to + // finish. It cannot be unwound, either, because there is no way to cancel + // an in-progress migration (indeed, a requested migration might have + // finished entirely by the time the undo step runs). + INSTANCE_MIGRATE -> "instance_migrate" { + + sim_instance_migrate } } -// instance migrate saga: definition - #[derive(Debug)] pub struct SagaInstanceMigrate; impl NexusSaga for SagaInstanceMigrate { @@ -78,49 +196,63 @@ impl NexusSaga for SagaInstanceMigrate { ACTION_GENERATE_ID.as_ref(), )); + builder.append(reserve_resources_action()); builder.append(allocate_propolis_ip_action()); - builder.append(migrate_prep_action()); + builder.append(set_migration_ids_action()); + builder.append(create_destination_state_action()); + builder.append(ensure_destination_propolis_action()); builder.append(instance_migrate_action()); - builder.append(v2p_ensure_action()); - builder.append(cleanup_source_action()); Ok(builder.build()?) } } -async fn sim_migrate_prep( +/// Reserves resources for the destination on the specified target sled. +async fn sim_reserve_sled_resources( sagactx: NexusActionContext, -) -> Result<(Uuid, InstanceRuntimeState), ActionError> { +) -> Result { let osagactx = sagactx.user_data(); let params = sagactx.saga_params::()?; - let opctx = crate::context::op_context_for_saga_action( - &sagactx, - ¶ms.serialized_authn, + + // N.B. This assumes that the instance's shape (CPU/memory allotment) is + // immutable despite being in the instance's "runtime" state. 
+ let resources = db::model::Resources::new( + params.instance.runtime_state.ncpus.0 .0.into(), + params.instance.runtime_state.memory, + // TODO(#2804): Properly specify reservoir size. + omicron_common::api::external::ByteCount::from(0).into(), ); - let migrate_uuid = sagactx.lookup::("migrate_id")?; - let dst_propolis_uuid = sagactx.lookup::("dst_propolis_id")?; + // Add a constraint that the only allowed sled is the one specified in the + // parameters. + let constraints = db::model::SledReservationConstraintBuilder::new() + .must_select_from(&[params.migrate_params.dst_sled_id]) + .build(); - // We have sled-agent (via Nexus) attempt to place - // the instance in a "Migrating" state w/ the given - // migration id. This will also update the instance - // state in the db - let instance = osagactx + let propolis_id = sagactx.lookup::("dst_propolis_id")?; + let resource = osagactx .nexus() - .instance_start_migrate( - &opctx, - params.instance_id, - migrate_uuid, - dst_propolis_uuid, + .reserve_on_random_sled( + propolis_id, + db::model::SledResourceKind::Instance, + resources, + constraints, ) .await .map_err(ActionError::action_failed)?; - let instance_id = instance.id(); + Ok(resource.sled_id) +} - Ok((instance_id, instance.runtime_state.into())) +async fn sim_release_sled_resources( + sagactx: NexusActionContext, +) -> Result<(), anyhow::Error> { + let osagactx = sagactx.user_data(); + let propolis_id = sagactx.lookup::("dst_propolis_id")?; + osagactx.nexus().delete_sled_reservation(propolis_id).await?; + Ok(()) } -// Allocate an IP address on the destination sled for the Propolis server. +/// Allocates an IP address on the destination sled for the Propolis server. async fn sim_allocate_propolis_ip( sagactx: NexusActionContext, ) -> Result { @@ -129,15 +261,12 @@ async fn sim_allocate_propolis_ip( &sagactx, ¶ms.serialized_authn, ); - allocate_sled_ipv6(&opctx, sagactx, "dst_sled_uuid").await + allocate_sled_ipv6(&opctx, sagactx, params.migrate_params.dst_sled_id).await } -async fn sim_instance_migrate( - _sagactx: NexusActionContext, -) -> Result<(), ActionError> { - todo!("Migration action not yet implemented"); - - /* +async fn sim_set_migration_ids( + sagactx: NexusActionContext, +) -> Result { let osagactx = sagactx.user_data(); let params = sagactx.saga_params::()?; let opctx = crate::context::op_context_for_saga_action( @@ -145,129 +274,142 @@ async fn sim_instance_migrate( ¶ms.serialized_authn, ); + let db_instance = ¶ms.instance; let migration_id = sagactx.lookup::("migrate_id")?; - let dst_sled_id = params.migrate_params.dst_sled_id; let dst_propolis_id = sagactx.lookup::("dst_propolis_id")?; - let (instance_id, old_runtime) = - sagactx.lookup::<(Uuid, InstanceRuntimeState)>("migrate_instance")?; - - // Allocate an IP address the destination sled for the new Propolis server. - let propolis_addr = osagactx - .datastore() - .next_ipv6_address(&opctx, dst_sled_id) + let updated_record = osagactx + .nexus() + .instance_set_migration_ids( + &opctx, + db_instance.id(), + db_instance, + InstanceMigrationSourceParams { dst_propolis_id, migration_id }, + ) .await .map_err(ActionError::action_failed)?; - let runtime = InstanceRuntimeState { - sled_id: dst_sled_id, - propolis_id: dst_propolis_id, - propolis_addr: Some(std::net::SocketAddr::new( - propolis_addr.into(), - 12400, - )), - ..old_runtime - }; + Ok(updated_record) +} - // Collect the external IPs for the instance. 
- // https://github.com/oxidecomputer/omicron/issues/1467 - // TODO-correctness: Handle Floating IPs, see - // https://github.com/oxidecomputer/omicron/issues/1334 - let (snat_ip, external_ips): (Vec<_>, Vec<_>) = osagactx - .datastore() - .instance_lookup_external_ips(&opctx, instance_id) +async fn sim_clear_migration_ids( + sagactx: NexusActionContext, +) -> Result<(), anyhow::Error> { + let osagactx = sagactx.user_data(); + let db_instance = + sagactx.lookup::("set_migration_ids")?; + + // Because the migration never actually started (and thus didn't finish), + // the instance should be at the same Propolis generation as it was when + // migration IDs were set, which means sled agent should accept a request to + // clear them. The only exception is if the instance stopped, but that also + // clears its migration IDs; in that case there is no work to do here. + // + // Other failures to clear migration IDs are handled like any other failure + // to update an instance's state: the callee attempts to mark the instance + // as failed; if the failure occurred because the instance changed state + // such that sled agent could not fulfill the request, the callee will + // produce a stale generation number and will not actually mark the instance + // as failed. + if let Err(e) = osagactx + .nexus() + .instance_clear_migration_ids(db_instance.id(), &db_instance) .await - .map_err(ActionError::action_failed)? - .into_iter() - .partition(|ip| ip.kind == IpKind::SNat); - - // Sanity checks on the number and kind of each IP address. - if external_ips.len() > crate::app::MAX_EXTERNAL_IPS_PER_INSTANCE { - return Err(ActionError::action_failed(Error::internal_error( - format!( - "Expected the number of external IPs to be limited to \ - {}, but found {}", - crate::app::MAX_EXTERNAL_IPS_PER_INSTANCE, - external_ips.len(), - ) - .as_str(), - ))); + { + warn!(osagactx.log(), + "Error clearing migration IDs during rollback"; + "instance_id" => %db_instance.id(), + "error" => ?e); } - let external_ips = - external_ips.into_iter().map(|model| model.ip.ip()).collect(); - if snat_ip.len() != 1 { - return Err(ActionError::action_failed(Error::internal_error( - "Expected exactly one SNAT IP address for an instance", - ))); - } - let source_nat = SourceNatConfig::from(snat_ip.into_iter().next().unwrap()); - - // The TODO items below are tracked in - // https://github.com/oxidecomputer/omicron/issues/1783 - let instance_hardware = InstanceHardware { - runtime: runtime.into(), - // TODO: populate NICs - nics: vec![], - source_nat, - external_ips, - // TODO: populate firewall rules - firewall_rules: vec![], - // TODO: populate disks - disks: vec![], - // TODO: populate cloud init bytes - cloud_init_bytes: None, - }; - let target = InstanceRuntimeStateRequested { - run_state: InstanceStateRequested::Migrating, - migration_params: Some(InstanceRuntimeStateMigrateParams { - migration_id, - dst_propolis_id, - }), + + Ok(()) +} + +async fn sim_create_destination_state( + sagactx: NexusActionContext, +) -> Result { + let params = sagactx.saga_params::()?; + let mut db_instance = + sagactx.lookup::("set_migration_ids")?; + let dst_propolis_id = sagactx.lookup::("dst_propolis_id")?; + let dst_propolis_ip = sagactx.lookup::("dst_propolis_ip")?; + + // Update the runtime state to refer to the new Propolis. 
+ let new_runtime = db::model::InstanceRuntimeState { + state: db::model::InstanceState::new(InstanceState::Creating), + sled_id: params.migrate_params.dst_sled_id, + propolis_id: dst_propolis_id, + propolis_ip: Some(ipnetwork::Ipv6Network::from(dst_propolis_ip).into()), + ..db_instance.runtime_state }; - let src_propolis_id = old_runtime.propolis_id; - let src_propolis_addr = old_runtime.propolis_addr.ok_or_else(|| { - ActionError::action_failed(Error::invalid_request( - "expected source propolis-addr", - )) - })?; + db_instance.runtime_state = new_runtime; + Ok(db_instance) +} - let dst_sa = osagactx - .sled_client(&dst_sled_id) +async fn sim_ensure_destination_propolis( + sagactx: NexusActionContext, +) -> Result<(), ActionError> { + let osagactx = sagactx.user_data(); + let params = sagactx.saga_params::()?; + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); + let db_instance = + sagactx.lookup::("dst_runtime_state")?; + let (.., authz_instance) = LookupPath::new(&opctx, &osagactx.datastore()) + .instance_id(db_instance.id()) + .lookup_for(authz::Action::Modify) .await .map_err(ActionError::action_failed)?; - let new_runtime_state: InstanceRuntimeState = dst_sa - .instance_put( - &instance_id, - &InstanceEnsureBody { - initial: instance_hardware, - target, - migrate: Some(InstanceMigrationTargetParams { - src_propolis_addr: src_propolis_addr.to_string(), - src_propolis_id, - }), - }, - ) + osagactx + .nexus() + .instance_ensure_registered(&opctx, &authz_instance, &db_instance) .await - .map_err(omicron_common::api::external::Error::from) - .map_err(ActionError::action_failed)? - .into_inner() - .into(); + .map_err(ActionError::action_failed)?; + + Ok(()) +} + +async fn sim_ensure_destination_propolis_undo( + sagactx: NexusActionContext, +) -> Result<(), anyhow::Error> { + let osagactx = sagactx.user_data(); + let params = sagactx.saga_params::()?; + let opctx = crate::context::op_context_for_saga_action( + &sagactx, + ¶ms.serialized_authn, + ); + let db_instance = + sagactx.lookup::("dst_runtime_state")?; + let (.., authz_instance) = LookupPath::new(&opctx, &osagactx.datastore()) + .instance_id(db_instance.id()) + .lookup_for(authz::Action::Modify) + .await + .map_err(ActionError::action_failed)?; + // Ensure that the destination sled has no Propolis matching the description + // the saga previously generated. + // + // The updated instance record from this undo action must be dropped so + // that a later undo action (clearing migration IDs) can update the record + // instead. See the saga definition for more details. osagactx - .datastore() - .instance_update_runtime(&instance_id, &new_runtime_state.into()) + .nexus() + .instance_ensure_unregistered( + &opctx, + &authz_instance, + &db_instance, + WriteBackUpdatedInstance::Drop, + ) .await .map_err(ActionError::action_failed)?; Ok(()) - */ } -/// Add V2P mappings for the destination instance -// Note this must run after sled_id of the instance is set to the destination -// sled! -async fn sim_v2p_ensure( +async fn sim_instance_migrate( sagactx: NexusActionContext, ) -> Result<(), ActionError> { let osagactx = sagactx.user_data(); @@ -276,35 +418,348 @@ async fn sim_v2p_ensure( &sagactx, ¶ms.serialized_authn, ); - let (instance_id, _) = - sagactx.lookup::<(Uuid, InstanceRuntimeState)>("migrate_instance")?; - - // TODO-performance the instance_put in sim_instance_migrate will *start* a - // migration, but the source and destination propolis servers will perform - // it asynchronously. 
If this step occurs before the source instance vCPUs - // are paused, updating the mappings here will briefly "disconnect" the - // source instance in the sense that it will be able to send packets out (as - // other instance's V2P mappings will be untouched) but will not be able to - // receive any packets (other instances will send packets to the destination - // propolis' sled but the destination instance vCPUs may not have started - // yet). Until the destination propolis takes over, there will be a inbound - // network outage for the instance. + let src_runtime: InstanceRuntimeState = sagactx + .lookup::("set_migration_ids")? + .runtime() + .clone() + .into(); + let dst_db_instance = + sagactx.lookup::("dst_runtime_state")?; + let (.., authz_instance) = LookupPath::new(&opctx, &osagactx.datastore()) + .instance_id(dst_db_instance.id()) + .lookup_for(authz::Action::Modify) + .await + .map_err(ActionError::action_failed)?; + + // TODO-correctness: This needs to be retried if a transient error occurs to + // avoid a problem like the following: // - // TODO-correctness if the migration fails, there's nothing that will unwind - // this and restore the original V2P mappings + // 1. The saga executor runs this step and successfully starts migration. + // 2. The executor crashes. + // 3. Migration completes. + // 4. The executor restarts, runs this step, encounters a transient error, + // and then tries to unwind the saga. + // + // Now the "ensure destination" undo step will tear down the (running) + // migration target. + // + // Possibly sled agent can help with this by using state or Propolis + // generation numbers to filter out stale destruction requests. osagactx .nexus() - .create_instance_v2p_mappings(&opctx, instance_id) + .instance_request_state( + &opctx, + &authz_instance, + &dst_db_instance, + InstanceStateRequested::MigrationTarget( + InstanceMigrationTargetParams { + src_propolis_addr: src_runtime + .propolis_addr + .unwrap() + .to_string(), + src_propolis_id: src_runtime.propolis_id, + }, + ), + ) .await .map_err(ActionError::action_failed)?; Ok(()) } -async fn sim_cleanup_source( - _sagactx: NexusActionContext, -) -> Result<(), ActionError> { - // TODO: clean up the previous instance whether it's on the same sled or a - // different one - Ok(()) +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use crate::{ + app::{saga::create_saga_dag, sagas::instance_create}, + Nexus, TestInterfaces as _, + }; + + use dropshot::test_util::ClientTestContext; + use http::{method::Method, StatusCode}; + use nexus_test_interface::NexusServer; + use nexus_test_utils::{ + http_testing::{AuthnMode, NexusRequest, RequestBuilder}, + resource_helpers::{create_project, object_create, populate_ip_pool}, + start_sled_agent, + }; + use nexus_test_utils_macros::nexus_test; + use omicron_common::api::external::{ + ByteCount, IdentityMetadataCreateParams, InstanceCpuCount, + }; + use omicron_sled_agent::sim::Server; + use sled_agent_client::TestInterfaces as _; + + use super::*; + + type ControlPlaneTestContext = + nexus_test_utils::ControlPlaneTestContext; + + const PROJECT_NAME: &str = "test-project"; + const INSTANCE_NAME: &str = "test-instance"; + + async fn setup_test_project(client: &ClientTestContext) -> Uuid { + populate_ip_pool(&client, "default", None).await; + let project = create_project(&client, PROJECT_NAME).await; + project.identity.id + } + + async fn add_sleds( + cptestctx: &ControlPlaneTestContext, + num_sleds: usize, + ) -> Vec<(Uuid, Server)> { + let mut sas = 
Vec::with_capacity(num_sleds); + for _ in 0..num_sleds { + let sa_id = Uuid::new_v4(); + let log = + cptestctx.logctx.log.new(o!("sled_id" => sa_id.to_string())); + let addr = + cptestctx.server.get_http_server_internal_address().await; + + info!(&cptestctx.logctx.log, "Adding simulated sled"; "sled_id" => %sa_id); + let update_dir = std::path::Path::new("/should/be/unused"); + let sa = start_sled_agent( + log, + addr, + sa_id, + &update_dir, + omicron_sled_agent::sim::SimMode::Explicit, + ) + .await + .unwrap(); + sas.push((sa_id, sa)); + } + + sas + } + + async fn create_instance( + client: &ClientTestContext, + ) -> omicron_common::api::external::Instance { + let instances_url = format!("/v1/instances?project={}", PROJECT_NAME); + object_create( + client, + &instances_url, + ¶ms::InstanceCreate { + identity: IdentityMetadataCreateParams { + name: INSTANCE_NAME.parse().unwrap(), + description: format!("instance {:?}", INSTANCE_NAME), + }, + ncpus: InstanceCpuCount(2), + memory: ByteCount::from_gibibytes_u32(2), + hostname: String::from(INSTANCE_NAME), + user_data: b"#cloud-config".to_vec(), + network_interfaces: + params::InstanceNetworkInterfaceAttachment::None, + external_ips: vec![], + disks: vec![], + start: true, + }, + ) + .await + } + + async fn instance_simulate( + cptestctx: &ControlPlaneTestContext, + nexus: &Arc, + instance_id: &Uuid, + ) { + info!(&cptestctx.logctx.log, "Poking simulated instance"; + "instance_id" => %instance_id); + let sa = nexus.instance_sled_by_id(instance_id).await.unwrap(); + sa.instance_finish_transition(*instance_id).await; + } + + async fn fetch_db_instance( + cptestctx: &ControlPlaneTestContext, + opctx: &nexus_db_queries::context::OpContext, + id: Uuid, + ) -> nexus_db_model::Instance { + let datastore = cptestctx.server.apictx().nexus.datastore().clone(); + let (.., db_instance) = LookupPath::new(&opctx, &datastore) + .instance_id(id) + .fetch() + .await + .expect("test instance should be present in datastore"); + + info!(&cptestctx.logctx.log, "refetched instance from db"; + "instance" => ?db_instance); + + db_instance + } + + async fn instance_start(cptestctx: &ControlPlaneTestContext, id: &Uuid) { + let client = &cptestctx.external_client; + let instance_stop_url = format!("/v1/instances/{}/start", id); + NexusRequest::new( + RequestBuilder::new(client, Method::POST, &instance_stop_url) + .body(None as Option<&serde_json::Value>) + .expect_status(Some(StatusCode::ACCEPTED)), + ) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("Failed to start instance"); + } + + async fn instance_stop(cptestctx: &ControlPlaneTestContext, id: &Uuid) { + let client = &cptestctx.external_client; + let instance_stop_url = format!("/v1/instances/{}/stop", id); + NexusRequest::new( + RequestBuilder::new(client, Method::POST, &instance_stop_url) + .body(None as Option<&serde_json::Value>) + .expect_status(Some(StatusCode::ACCEPTED)), + ) + .authn_as(AuthnMode::PrivilegedUser) + .execute() + .await + .expect("Failed to stop instance"); + } + + fn select_first_alternate_sled( + db_instance: &db::model::Instance, + other_sleds: &[(Uuid, Server)], + ) -> Uuid { + let default_sled_uuid = + Uuid::parse_str(nexus_test_utils::SLED_AGENT_UUID).unwrap(); + if other_sleds.is_empty() { + panic!("need at least one other sled"); + } + + if other_sleds.iter().any(|sled| sled.0 == default_sled_uuid) { + panic!("default test sled agent was in other_sleds"); + } + + if db_instance.runtime().sled_id == default_sled_uuid { + other_sleds[0].0 + } else { + 
default_sled_uuid + } + } + + #[nexus_test(server = crate::Server)] + async fn test_saga_basic_usage_succeeds( + cptestctx: &ControlPlaneTestContext, + ) { + let other_sleds = add_sleds(cptestctx, 1).await; + let client = &cptestctx.external_client; + let nexus = &cptestctx.server.apictx().nexus; + let _project_id = setup_test_project(&client).await; + + let opctx = instance_create::test::test_opctx(cptestctx); + let instance = create_instance(client).await; + + // Poke the instance to get it into the Running state. + instance_simulate(cptestctx, nexus, &instance.identity.id).await; + + let db_instance = + fetch_db_instance(cptestctx, &opctx, instance.identity.id).await; + let old_runtime = db_instance.runtime().clone(); + let dst_sled_id = + select_first_alternate_sled(&db_instance, &other_sleds); + let params = Params { + serialized_authn: authn::saga::Serialized::for_opctx(&opctx), + instance: db_instance, + migrate_params: params::InstanceMigrate { dst_sled_id }, + }; + + let dag = create_saga_dag::(params).unwrap(); + let saga = nexus.create_runnable_saga(dag).await.unwrap(); + nexus.run_saga(saga).await.expect("Migration saga should succeed"); + + // Merely running the migration saga (without simulating any completion + // steps in the simulated agents) should not change where the instance + // is running. + let new_db_instance = + fetch_db_instance(cptestctx, &opctx, instance.identity.id).await; + assert_eq!(new_db_instance.runtime().sled_id, old_runtime.sled_id); + assert_eq!( + new_db_instance.runtime().propolis_id, + old_runtime.propolis_id + ); + } + + #[nexus_test(server = crate::Server)] + async fn test_action_failure_can_unwind( + cptestctx: &ControlPlaneTestContext, + ) { + let log = &cptestctx.logctx.log; + let other_sleds = add_sleds(cptestctx, 1).await; + let client = &cptestctx.external_client; + let nexus = &cptestctx.server.apictx().nexus; + let _project_id = setup_test_project(&client).await; + + let opctx = instance_create::test::test_opctx(cptestctx); + let instance = create_instance(client).await; + + // Poke the instance to get it into the Running state. + instance_simulate(cptestctx, nexus, &instance.identity.id).await; + + let db_instance = + fetch_db_instance(cptestctx, &opctx, instance.identity.id).await; + let old_runtime = db_instance.runtime().clone(); + let dst_sled_id = + select_first_alternate_sled(&db_instance, &other_sleds); + let params = Params { + serialized_authn: authn::saga::Serialized::for_opctx(&opctx), + instance: db_instance, + migrate_params: params::InstanceMigrate { dst_sled_id }, + }; + + let dag = create_saga_dag::(params).unwrap(); + for node in dag.get_nodes() { + info!( + log, + "Creating new saga which will fail at index {:?}", node.index(); + "node_name" => node.name().as_ref(), + "label" => node.label(), + ); + + let runnable_saga = + nexus.create_runnable_saga(dag.clone()).await.unwrap(); + nexus + .sec() + .saga_inject_error(runnable_saga.id(), node.index()) + .await + .unwrap(); + nexus + .run_saga(runnable_saga) + .await + .expect_err("Saga should have failed"); + + // Unwinding at any step should clear the migration IDs from the + // instance record and leave the instance's location otherwise + // untouched. 
+ let new_db_instance = + fetch_db_instance(cptestctx, &opctx, instance.identity.id) + .await; + + assert!(new_db_instance.runtime().migration_id.is_none()); + assert!(new_db_instance.runtime().dst_propolis_id.is_none()); + assert_eq!(new_db_instance.runtime().sled_id, old_runtime.sled_id); + assert_eq!( + new_db_instance.runtime().propolis_id, + old_runtime.propolis_id + ); + + // Ensure the instance can stop. This helps to check that destroying + // the migration destination (if one was ensured) doesn't advance + // the Propolis ID generation in a way that prevents the source from + // issuing further state updates. + instance_stop(cptestctx, &instance.identity.id).await; + instance_simulate(cptestctx, nexus, &instance.identity.id).await; + let new_db_instance = + fetch_db_instance(cptestctx, &opctx, instance.identity.id) + .await; + assert_eq!( + new_db_instance.runtime().state.0, + InstanceState::Stopped + ); + + // Restart the instance for the next iteration. + instance_start(cptestctx, &instance.identity.id).await; + instance_simulate(cptestctx, nexus, &instance.identity.id).await; + } + } } diff --git a/nexus/src/app/session.rs b/nexus/src/app/session.rs index 00de241321d..04a9170c7d8 100644 --- a/nexus/src/app/session.rs +++ b/nexus/src/app/session.rs @@ -155,7 +155,8 @@ impl super::Nexus { | Error::InternalError { .. } | Error::ServiceUnavailable { .. } | Error::MethodNotAllowed { .. } - | Error::TypeVersionMismatch { .. } => { + | Error::TypeVersionMismatch { .. } + | Error::Conflict { .. } => { Reason::UnknownError { source: error } } })?; diff --git a/nexus/src/app/sled.rs b/nexus/src/app/sled.rs index 3ba687fc912..8188a118c7c 100644 --- a/nexus/src/app/sled.rs +++ b/nexus/src/app/sled.rs @@ -113,6 +113,7 @@ impl super::Nexus { resource_id: Uuid, resource_kind: db::model::SledResourceKind, resources: db::model::Resources, + constraints: db::model::SledReservationConstraints, ) -> Result { self.db_datastore .sled_reservation_create( @@ -120,6 +121,7 @@ impl super::Nexus { resource_id, resource_kind, resources, + constraints, ) .await } diff --git a/sled-agent/src/instance.rs b/sled-agent/src/instance.rs index 18a408241b1..0f0977f8fab 100644 --- a/sled-agent/src/instance.rs +++ b/sled-agent/src/instance.rs @@ -729,8 +729,8 @@ impl Instance { } return Err(Error::Transition( - omicron_common::api::external::Error::InvalidRequest { - message: format!( + omicron_common::api::external::Error::Conflict { + internal_message: format!( "wrong Propolis ID generation: expected {}, got {}", inner.state.current().propolis_gen, old_runtime.propolis_gen diff --git a/sled-agent/src/sim/sled_agent.rs b/sled-agent/src/sim/sled_agent.rs index a05f68e2fdf..b8b50c0122e 100644 --- a/sled-agent/src/sim/sled_agent.rs +++ b/sled-agent/src/sim/sled_agent.rs @@ -325,7 +325,15 @@ impl SledAgent { instance_id: Uuid, ) -> Result { let instance = - self.instances.sim_get_cloned_object(&instance_id).await?; + match self.instances.sim_get_cloned_object(&instance_id).await { + Ok(instance) => instance, + Err(Error::ObjectNotFound { .. 
}) => {
+                    return Ok(InstanceUnregisterResponse {
+                        updated_runtime: None,
+                    })
+                }
+                Err(e) => return Err(e),
+            };
 
         self.detach_disks_from_instance(instance_id).await?;
         Ok(InstanceUnregisterResponse {
diff --git a/sled-agent/src/sled_agent.rs b/sled-agent/src/sled_agent.rs
index 9ae0a2a2b99..efd7a8c576d 100644
--- a/sled-agent/src/sled_agent.rs
+++ b/sled-agent/src/sled_agent.rs
@@ -119,10 +119,14 @@ impl From<Error> for dropshot::HttpError {
                         }
                     }
                 }
-
+                crate::instance::Error::Transition(omicron_error) => {
+                    // Preserve the status associated with the wrapped
+                    // Omicron error so that Nexus will see it in the
+                    // Progenitor client error it gets back.
+                    HttpError::from(omicron_error)
+                }
                 e => HttpError::for_internal_error(e.to_string()),
             },
-            e => HttpError::for_internal_error(e.to_string()),
         }
     }
 }
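// Illustrative sketch (not part of the patch above): taken together, these
// changes route sled agent's "stale Propolis generation" rejection through the
// new `Error::Conflict` variant, which Nexus maps to HTTP 409 instead of a
// 400-level invalid-request error. A simplified, self-contained version of
// that generation check, using the `Generation` type from `omicron_common`;
// the function name and parameters are stand-ins, not items from the patch:
//
//     use omicron_common::api::external::{Error, Generation};
//
//     fn check_propolis_gen(
//         current: Generation,
//         requested: Generation,
//     ) -> Result<(), Error> {
//         if requested != current {
//             // Same shape as the error now returned from
//             // sled-agent/src/instance.rs when the generations disagree.
//             return Err(Error::Conflict {
//                 internal_message: format!(
//                     "wrong Propolis ID generation: expected {}, got {}",
//                     current, requested
//                 ),
//             });
//         }
//         Ok(())
//     }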