Skip to content

Commit e2bd1e8

Browse files
authored
Adds concept of a primary network interface (#1143)
* Adds concept of a primary network interface
  - Adds the `is_primary` column to the `network_interface` table, and a corresponding field `primary` in the database model and external `NetworkInterface` objects. Primary interfaces are used for NAT and appear in DNS records.
  - Updates the `InsertNetworkInterfaceQuery` to automatically decide if this interface should be considered the primary. It considers the new NIC primary iff there are zero existing NICs for the instance it's to be attached to. That means that the first NIC added to an instance, either during a provision or later, is the primary. Future work could allow changing which NIC is the primary.
  - Adds a new query for deleting a network interface from an instance, with improved validation. This now checks that the instance is stopped inside the query, fixing a TOCTOU bug. It also verifies that the instance either has exactly 1 interface (which must be the primary) or that the instance has 2 or more (we're deleting a secondary). This means that the primary interface cannot be deleted until all secondary interfaces are deleted. The reason for this restriction is that instances _must_ have a primary interface, and it's not clear how to pick a new primary from the remaining secondaries if we allow deletion of the primary. We force the client to make the choice.
  - Adds a special error type for handling the above validation failures.
  - Adds tests for this deletion behavior to the instance integration tests.
* Review feedback
  - Clean up comments
  - Simplify NIC query/error type names
  - Remove stale test
1 parent 1593b8b commit e2bd1e8

File tree

14 files changed

+648
-174
lines changed

14 files changed

+648
-174
lines changed

common/src/api/external/mod.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1811,6 +1811,9 @@ pub struct NetworkInterface {
18111811
pub ip: IpAddr,
18121812
// TODO-correctness: We need to split this into an optional V4 and optional
18131813
// V6 address, at least one of which must be specified.
1814+
/// True if this interface is the primary for the instance to which it's
1815+
/// attached.
1816+
pub primary: bool,
18141817
}
18151818

18161819
#[derive(

common/src/sql/dbinit.sql

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -675,7 +675,14 @@ CREATE TABLE omicron.public.network_interface (
675675
* Limited to 8 NICs per instance. This value must be kept in sync with
676676
* `crate::nexus::MAX_NICS_PER_INSTANCE`.
677677
*/
678-
slot INT2 NOT NULL CHECK (slot >= 0 AND slot < 8)
678+
slot INT2 NOT NULL CHECK (slot >= 0 AND slot < 8),
679+
680+
/* True if this interface is the primary interface for the instance.
681+
*
682+
* The primary interface appears in DNS and its address is used for external
683+
* connectivity for the instance.
684+
*/
685+
is_primary BOOL NOT NULL
679686
);
680687

681688
/* TODO-completeness

nexus/src/app/instance.rs

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ use crate::db;
1313
use crate::db::identity::Resource;
1414
use crate::db::lookup::LookupPath;
1515
use crate::db::model::Name;
16-
use crate::db::queries::network_interface::NetworkInterfaceError;
16+
use crate::db::queries::network_interface;
1717
use crate::external_api::params;
1818
use omicron_common::api::external;
1919
use omicron_common::api::external::CreateResult;
@@ -642,7 +642,8 @@ impl super::Nexus {
642642
// instance between this check and when we actually create the NIC
643643
// record. One solution is to place the state verification in the query
644644
// to create the NIC. Unfortunately, that query is already very
645-
// complicated.
645+
// complicated. See
646+
// https://github.com/oxidecomputer/omicron/issues/1134.
646647
let stopped =
647648
db::model::InstanceState::new(external::InstanceState::Stopped);
648649
if db_instance.runtime_state.state != stopped {
@@ -680,7 +681,7 @@ impl super::Nexus {
680681
interface,
681682
)
682683
.await
683-
.map_err(NetworkInterfaceError::into_external)?;
684+
.map_err(network_interface::InsertError::into_external)?;
684685
Ok(interface)
685686
}
686687

@@ -724,6 +725,9 @@ impl super::Nexus {
724725
}
725726

726727
/// Delete a network interface from the provided instance.
728+
///
729+
/// Note that the primary interface for an instance cannot be deleted if
730+
/// there are any secondary interfaces.
727731
pub async fn instance_delete_network_interface(
728732
&self,
729733
opctx: &OpContext,
@@ -746,6 +750,8 @@ impl super::Nexus {
746750
.await?;
747751

748752
// TODO-completeness: We'd like to relax this once hot-plug is supported
753+
// TODO-correctness: There's a race condition here. Someone may start
754+
// the instance after this check but before we actually delete the NIC.
749755
let stopped =
750756
db::model::InstanceState::new(external::InstanceState::Stopped);
751757
if db_instance.runtime_state.state != stopped {
@@ -754,8 +760,13 @@ impl super::Nexus {
754760
));
755761
}
756762
self.db_datastore
757-
.instance_delete_network_interface(opctx, &authz_interface)
763+
.instance_delete_network_interface(
764+
opctx,
765+
&authz_instance,
766+
&authz_interface,
767+
)
758768
.await
769+
.map_err(network_interface::DeleteError::into_external)
759770
}
760771

761772
/// Invoked by a sled agent to publish an updated runtime state for an

nexus/src/app/sagas/instance_create.rs

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@ use crate::app::{MAX_DISKS_PER_INSTANCE, MAX_NICS_PER_INSTANCE};
99
use crate::context::OpContext;
1010
use crate::db::identity::Resource;
1111
use crate::db::lookup::LookupPath;
12-
use crate::db::queries::network_interface::NetworkInterfaceError;
12+
use crate::db::queries::network_interface::InsertError as InsertNicError;
13+
use crate::defaults::DEFAULT_PRIMARY_NIC_NAME;
1314
use crate::external_api::params;
1415
use crate::saga_interface::SagaContext;
1516
use crate::{authn, authz, db};
@@ -242,7 +243,7 @@ async fn sic_create_network_interfaces(
242243
match sagactx.saga_params().create_params.network_interfaces {
243244
params::InstanceNetworkInterfaceAttachment::None => Ok(()),
244245
params::InstanceNetworkInterfaceAttachment::Default => {
245-
sic_create_default_network_interface(&sagactx).await
246+
sic_create_default_primary_network_interface(&sagactx).await
246247
}
247248
params::InstanceNetworkInterfaceAttachment::Create(
248249
ref create_params,
@@ -340,14 +341,14 @@ async fn sic_create_custom_network_interfaces(
340341
// insert that record if it exists, which obviously fails with a
341342
// primary key violation. (If the record does _not_ exist, one will
342343
// be inserted as usual, see
343-
// `db::subnet_name::InsertNetworkInterfaceQuery` for details).
344+
// `db::queries::network_interface::InsertQuery` for details).
344345
//
345346
// In this one specific case, we're asserting that any primary key
346347
// duplicate arises because this saga node ran partway and then
347348
// crashed. The saga recovery machinery will replay just this node,
348349
// without first unwinding it, so any previously-inserted interfaces
349350
// will still exist. This is expected.
350-
Err(NetworkInterfaceError::DuplicatePrimaryKey(_)) => {
351+
Err(InsertNicError::DuplicatePrimaryKey(_)) => {
351352
// TODO-observability: We should bump a counter here.
352353
let log = osagactx.log();
353354
warn!(
@@ -369,8 +370,9 @@ async fn sic_create_custom_network_interfaces(
369370
Ok(())
370371
}
371372

372-
/// Create the default network interface for an instance during the create saga
373-
async fn sic_create_default_network_interface(
373+
/// Create a default primary network interface for an instance during the create
374+
/// saga.
375+
async fn sic_create_default_primary_network_interface(
374376
sagactx: &ActionContext<SagaInstanceCreate>,
375377
) -> Result<(), ActionError> {
376378
let osagactx = sagactx.user_data();
@@ -379,13 +381,23 @@ async fn sic_create_default_network_interface(
379381
let opctx =
380382
OpContext::for_saga_action(&sagactx, &saga_params.serialized_authn);
381383
let instance_id = sagactx.lookup::<Uuid>("instance_id")?;
384+
385+
// The literal name "default" is currently used for the VPC and VPC Subnet,
386+
// when not specified in the client request.
387+
// TODO-completeness: We'd like to select these from Project-level defaults.
388+
// See https://github.com/oxidecomputer/omicron/issues/1015.
382389
let default_name = Name::try_from("default".to_string()).unwrap();
383390
let internal_default_name = db::model::Name::from(default_name.clone());
391+
392+
// The name of the default primary interface.
393+
let iface_name =
394+
Name::try_from(DEFAULT_PRIMARY_NIC_NAME.to_string()).unwrap();
395+
384396
let interface_params = params::NetworkInterfaceCreate {
385397
identity: IdentityMetadataCreateParams {
386-
name: default_name.clone(),
398+
name: iface_name.clone(),
387399
description: format!(
388-
"default interface for {}",
400+
"default primary interface for {}",
389401
saga_params.create_params.identity.name,
390402
),
391403
},
@@ -427,7 +439,7 @@ async fn sic_create_default_network_interface(
427439
interface,
428440
)
429441
.await
430-
.map_err(NetworkInterfaceError::into_external)
442+
.map_err(InsertNicError::into_external)
431443
.map_err(ActionError::action_failed)?;
432444
Ok(())
433445
}

nexus/src/db/datastore.rs

Lines changed: 31 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,7 @@ use crate::db::lookup::LookupPath;
4040
use crate::db::model::DatabaseString;
4141
use crate::db::model::IncompleteVpc;
4242
use crate::db::model::Vpc;
43-
use crate::db::queries::network_interface::InsertNetworkInterfaceQuery;
44-
use crate::db::queries::network_interface::NetworkInterfaceError;
43+
use crate::db::queries::network_interface;
4544
use crate::db::queries::vpc::InsertVpcQuery;
4645
use crate::db::queries::vpc_subnet::FilterConflictingVpcSubnetRangesQuery;
4746
use crate::db::queries::vpc_subnet::SubnetError;
@@ -1575,35 +1574,37 @@ impl DataStore {
15751574
authz_subnet: &authz::VpcSubnet,
15761575
authz_instance: &authz::Instance,
15771576
interface: IncompleteNetworkInterface,
1578-
) -> Result<NetworkInterface, NetworkInterfaceError> {
1577+
) -> Result<NetworkInterface, network_interface::InsertError> {
15791578
opctx
15801579
.authorize(authz::Action::CreateChild, authz_instance)
15811580
.await
1582-
.map_err(NetworkInterfaceError::External)?;
1581+
.map_err(network_interface::InsertError::External)?;
15831582
opctx
15841583
.authorize(authz::Action::CreateChild, authz_subnet)
15851584
.await
1586-
.map_err(NetworkInterfaceError::External)?;
1585+
.map_err(network_interface::InsertError::External)?;
15871586
self.instance_create_network_interface_raw(&opctx, interface).await
15881587
}
15891588

15901589
pub(super) async fn instance_create_network_interface_raw(
15911590
&self,
15921591
opctx: &OpContext,
15931592
interface: IncompleteNetworkInterface,
1594-
) -> Result<NetworkInterface, NetworkInterfaceError> {
1593+
) -> Result<NetworkInterface, network_interface::InsertError> {
15951594
use db::schema::network_interface::dsl;
1596-
let query = InsertNetworkInterfaceQuery::new(interface.clone());
1595+
let query = network_interface::InsertQuery::new(interface.clone());
15971596
diesel::insert_into(dsl::network_interface)
15981597
.values(query)
15991598
.returning(NetworkInterface::as_returning())
16001599
.get_result_async(
16011600
self.pool_authorized(opctx)
16021601
.await
1603-
.map_err(NetworkInterfaceError::External)?,
1602+
.map_err(network_interface::InsertError::External)?,
16041603
)
16051604
.await
1606-
.map_err(|e| NetworkInterfaceError::from_pool(e, &interface))
1605+
.map_err(|e| {
1606+
network_interface::InsertError::from_pool(e, &interface)
1607+
})
16071608
}
16081609

16091610
/// Delete all network interfaces attached to the given instance.
@@ -1634,27 +1635,33 @@ impl DataStore {
16341635
}
16351636

16361637
/// Delete a `NetworkInterface` attached to a provided instance.
1638+
///
1639+
/// Note that the primary interface for an instance cannot be deleted if
1640+
/// there are any secondary interfaces.
16371641
pub async fn instance_delete_network_interface(
16381642
&self,
16391643
opctx: &OpContext,
1644+
authz_instance: &authz::Instance,
16401645
authz_interface: &authz::NetworkInterface,
1641-
) -> DeleteResult {
1642-
opctx.authorize(authz::Action::Delete, authz_interface).await?;
1643-
1644-
use db::schema::network_interface::dsl;
1645-
let now = Utc::now();
1646-
let interface_id = authz_interface.id();
1647-
diesel::update(dsl::network_interface)
1648-
.filter(dsl::id.eq(interface_id))
1649-
.filter(dsl::time_deleted.is_null())
1650-
.set((dsl::time_deleted.eq(now),))
1651-
.execute_async(self.pool_authorized(opctx).await?)
1646+
) -> Result<(), network_interface::DeleteError> {
1647+
opctx
1648+
.authorize(authz::Action::Delete, authz_interface)
1649+
.await
1650+
.map_err(network_interface::DeleteError::External)?;
1651+
let query = network_interface::DeleteQuery::new(
1652+
authz_instance.id(),
1653+
authz_interface.id(),
1654+
);
1655+
query
1656+
.clone()
1657+
.execute_async(
1658+
self.pool_authorized(opctx)
1659+
.await
1660+
.map_err(network_interface::DeleteError::External)?,
1661+
)
16521662
.await
16531663
.map_err(|e| {
1654-
public_error_from_diesel_pool(
1655-
e,
1656-
ErrorHandler::NotFoundByResource(authz_interface),
1657-
)
1664+
network_interface::DeleteError::from_pool(e, &query)
16581665
})?;
16591666
Ok(())
16601667
}

nexus/src/db/model/network_interface.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ pub struct NetworkInterface {
2626
// If neither is specified, auto-assign one of each?
2727
pub ip: ipnetwork::IpNetwork,
2828
pub slot: i16,
29+
#[diesel(column_name = is_primary)]
30+
pub primary: bool,
2931
}
3032

3133
impl From<NetworkInterface> for external::NetworkInterface {
@@ -37,6 +39,7 @@ impl From<NetworkInterface> for external::NetworkInterface {
3739
subnet_id: iface.subnet_id,
3840
ip: iface.ip.ip(),
3941
mac: *iface.mac,
42+
primary: iface.primary,
4043
}
4144
}
4245
}

0 commit comments

Comments
 (0)