Skip to content

Commit 05ed790

Browse files
internet-diglett and Levon Tarver authored
RPWs for all networking (#4822)
Overview --- This PR ensures the rest of our `dpd` configuration is covered by an RPW (reliable persistent workflow) to help recover state in the event of `dendrite` crashing, the switch zone restarting / being replaced, or the sled restarting. This is accomplished via a background task in Nexus that periodically ensures `dpd` is up to date with the tables in Nexus. The tradeoff of this design is that we don't track versioning and instead reconcile the entire state every time, but since the actual number of ports will never be that high (relative to something like NAT entries) the tradeoff of less efficiency for much greater simplicity seems to make sense today, and it requires much less rework in Nexus and Dendrite should we choose to replace this strategy down the road. Tasks --- - [x] Ensure that Service Zones configured during RSS, cold boot, and Nexus have their NAT entries added to the NAT RPW table (extracted into #4857) - [x] Create background task that periodically reconciles switch port configuration for dendrite instances - [x] Move switch zone uplink SMF property updates to RPW - [x] Move routing updates (via mg) to RPW - [x] Static Routing - [x] BGP - [x] Move bootstore updates to RPW - [x] Move loopback address management to RPW - [x] Move Nexus-side switch zone service on-demand lookups as outlined in #5092 Verifications Performed --- - [x] Basic instance deployment - [x] Loopback Address Creation - [x] BGP configuration (a4x2) - [ ] BGP configuration modification (a4x2) - [x] Static routing - [x] Static routing configuration modification Related --- Closes #4715 Closes #4650 Depends on https://github.com/oxidecomputer/dendrite/pull/838 --------- Co-authored-by: Levon Tarver <[email protected]>
1 parent 255b14a commit 05ed790

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

52 files changed

+2434
-3028
lines changed

.github/buildomat/jobs/deploy.sh

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -231,12 +231,10 @@ first = \"$SERVICE_IP_POOL_START\"
231231
/^last/c\\
232232
last = \"$SERVICE_IP_POOL_END\"
233233
}
234-
/^\\[rack_network_config/,/^$/ {
235-
/^infra_ip_first/c\\
234+
/^infra_ip_first/c\\
236235
infra_ip_first = \"$UPLINK_IP\"
237-
/^infra_ip_last/c\\
236+
/^infra_ip_last/c\\
238237
infra_ip_last = \"$UPLINK_IP\"
239-
}
240238
/^\\[\\[rack_network_config.ports/,/^\$/ {
241239
/^routes/c\\
242240
routes = \\[{nexthop = \"$GATEWAY_IP\", destination = \"0.0.0.0/0\"}\\]
@@ -335,6 +333,18 @@ while [[ $(pfexec svcs -z $(zoneadm list -n | grep oxz_ntp) \
335333
done
336334
echo "Waited for chrony: ${retry}s"
337335

336+
# Wait for at least one nexus zone to become available
337+
retry=0
338+
until zoneadm list | grep nexus; do
339+
if [[ $retry -gt 300 ]]; then
340+
echo "Failed to start at least one nexus zone after 300 seconds"
341+
exit 1
342+
fi
343+
sleep 1
344+
retry=$((retry + 1))
345+
done
346+
echo "Waited for nexus: ${retry}s"
347+
338348
export RUST_BACKTRACE=1
339349
export E2E_TLS_CERT IPPOOL_START IPPOOL_END
340350
eval "$(./tests/bootstrap)"

clients/mg-admin-client/src/lib.rs

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,13 @@ mod inner {
1717
}
1818

1919
pub use inner::types;
20+
use inner::types::Prefix4;
2021
pub use inner::Error;
2122

2223
use inner::Client as InnerClient;
2324
use omicron_common::api::external::BgpPeerState;
2425
use slog::Logger;
26+
use std::hash::Hash;
2527
use std::net::Ipv6Addr;
2628
use std::net::SocketAddr;
2729
use thiserror::Error;
@@ -81,3 +83,18 @@ impl Client {
8183
Ok(Self { inner, log })
8284
}
8385
}
86+
87+
impl Eq for Prefix4 {}
88+
89+
impl PartialEq for Prefix4 {
90+
fn eq(&self, other: &Self) -> bool {
91+
self.value == other.value && self.length == other.length
92+
}
93+
}
94+
95+
impl Hash for Prefix4 {
96+
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
97+
self.value.hash(state);
98+
self.length.hash(state);
99+
}
100+
}

dev-tools/omdb/tests/env.out

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,10 @@ task: "service_zone_nat_tracker"
9292
ensures service zone nat records are recorded in NAT RPW table
9393

9494

95+
task: "switch_port_config_manager"
96+
manages switch port settings for rack switches
97+
98+
9599
---------------------------------------------
96100
stderr:
97101
note: using Nexus URL http://127.0.0.1:REDACTED_PORT
@@ -182,6 +186,10 @@ task: "service_zone_nat_tracker"
182186
ensures service zone nat records are recorded in NAT RPW table
183187

184188

189+
task: "switch_port_config_manager"
190+
manages switch port settings for rack switches
191+
192+
185193
---------------------------------------------
186194
stderr:
187195
note: Nexus URL not specified. Will pick one from DNS.
@@ -259,6 +267,10 @@ task: "service_zone_nat_tracker"
259267
ensures service zone nat records are recorded in NAT RPW table
260268

261269

270+
task: "switch_port_config_manager"
271+
manages switch port settings for rack switches
272+
273+
262274
---------------------------------------------
263275
stderr:
264276
note: Nexus URL not specified. Will pick one from DNS.

dev-tools/omdb/tests/successes.out

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -299,6 +299,10 @@ task: "service_zone_nat_tracker"
299299
ensures service zone nat records are recorded in NAT RPW table
300300

301301

302+
task: "switch_port_config_manager"
303+
manages switch port settings for rack switches
304+
305+
302306
---------------------------------------------
303307
stderr:
304308
note: using Nexus URL http://127.0.0.1:REDACTED_PORT/
@@ -368,7 +372,7 @@ task: "nat_v4_garbage_collector"
368372
currently executing: no
369373
last completed activation: iter 2, triggered by an explicit signal
370374
started at <REDACTED TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
371-
warning: unknown background task: "nat_v4_garbage_collector" (don't know how to interpret details: Null)
375+
last completion reported error: failed to resolve addresses for Dendrite services: no record found for Query { name: Name("_dendrite._tcp.control-plane.oxide.internal."), query_type: SRV, query_class: IN }
372376

373377
task: "blueprint_loader"
374378
configured period: every 1m 40s
@@ -389,7 +393,7 @@ task: "bfd_manager"
389393
currently executing: no
390394
last completed activation: iter 2, triggered by an explicit signal
391395
started at <REDACTED TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
392-
warning: unknown background task: "bfd_manager" (don't know how to interpret details: Object {})
396+
last completion reported error: failed to resolve addresses for Dendrite services: no record found for Query { name: Name("_dendrite._tcp.control-plane.oxide.internal."), query_type: SRV, query_class: IN }
393397

394398
task: "external_endpoints"
395399
configured period: every 1m
@@ -440,6 +444,13 @@ task: "service_zone_nat_tracker"
440444
started at <REDACTED TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
441445
last completion reported error: inventory collection is None
442446

447+
task: "switch_port_config_manager"
448+
configured period: every 30s
449+
currently executing: no
450+
last completed activation: iter 2, triggered by an explicit signal
451+
started at <REDACTED TIMESTAMP> (<REDACTED DURATION>s ago) and ran for <REDACTED DURATION>ms
452+
warning: unknown background task: "switch_port_config_manager" (don't know how to interpret details: Object {})
453+
443454
---------------------------------------------
444455
stderr:
445456
note: using Nexus URL http://127.0.0.1:REDACTED_PORT/

nexus-config/src/nexus_config.rs

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -367,6 +367,8 @@ pub struct BackgroundTaskConfig {
367367
pub sync_service_zone_nat: SyncServiceZoneNatConfig,
368368
/// configuration for the bfd manager task
369369
pub bfd_manager: BfdManagerConfig,
370+
/// configuration for the switch port settings manager task
371+
pub switch_port_settings_manager: SwitchPortSettingsManagerConfig,
370372
/// configuration for region replacement task
371373
pub region_replacement: RegionReplacementConfig,
372374
}
@@ -427,6 +429,15 @@ pub struct SyncServiceZoneNatConfig {
427429
pub period_secs: Duration,
428430
}
429431

432+
#[serde_as]
433+
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
434+
pub struct SwitchPortSettingsManagerConfig {
435+
/// Interval (in seconds) for periodic activations of this background task.
436+
/// This task is also activated on-demand when any of the switch port settings
437+
/// api endpoints are called.
438+
#[serde_as(as = "DurationSeconds<u64>")]
439+
pub period_secs: Duration,
440+
}
430441
#[serde_as]
431442
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
432443
pub struct InventoryConfig {
@@ -713,6 +724,7 @@ mod test {
713724
blueprints.period_secs_load = 10
714725
blueprints.period_secs_execute = 60
715726
sync_service_zone_nat.period_secs = 30
727+
switch_port_settings_manager.period_secs = 30
716728
region_replacement.period_secs = 30
717729
[default_region_allocation_strategy]
718730
type = "random"
@@ -828,6 +840,10 @@ mod test {
828840
sync_service_zone_nat: SyncServiceZoneNatConfig {
829841
period_secs: Duration::from_secs(30)
830842
},
843+
switch_port_settings_manager:
844+
SwitchPortSettingsManagerConfig {
845+
period_secs: Duration::from_secs(30),
846+
},
831847
region_replacement: RegionReplacementConfig {
832848
period_secs: Duration::from_secs(30),
833849
},
@@ -893,6 +909,7 @@ mod test {
893909
blueprints.period_secs_load = 10
894910
blueprints.period_secs_execute = 60
895911
sync_service_zone_nat.period_secs = 30
912+
switch_port_settings_manager.period_secs = 30
896913
region_replacement.period_secs = 30
897914
[default_region_allocation_strategy]
898915
type = "random"

nexus/db-model/src/address_lot.rs

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,8 +11,10 @@ use omicron_common::api::external;
1111
use serde::{Deserialize, Serialize};
1212
use uuid::Uuid;
1313

14+
pub const INFRA_LOT: &str = "initial-infra";
15+
1416
impl_enum_type!(
15-
#[derive(SqlType, Debug, Clone, Copy)]
17+
#[derive(SqlType, Debug, Clone, Copy, QueryId)]
1618
#[diesel(postgres_type(name = "address_lot_kind", schema = "public"))]
1719
pub struct AddressLotKindEnum;
1820

@@ -24,7 +26,7 @@ impl_enum_type!(
2426
FromSqlRow,
2527
PartialEq,
2628
Serialize,
27-
Deserialize
29+
Deserialize,
2830
)]
2931
#[diesel(sql_type = AddressLotKindEnum)]
3032
pub enum AddressLotKind;

nexus/db-model/src/bootstore.rs

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
use crate::schema::bootstore_keys;
1+
use crate::schema::{bootstore_config, bootstore_keys};
2+
use chrono::{DateTime, Utc};
23
use serde::{Deserialize, Serialize};
34

45
pub const NETWORK_KEY: &str = "network_key";
@@ -11,3 +12,18 @@ pub struct BootstoreKeys {
1112
pub key: String,
1213
pub generation: i64,
1314
}
15+
16+
/// BootstoreConfig is a key-value store for bootstrapping data.
17+
/// We serialize the data as json because it is inherently polymorphic and it
18+
/// is not intended to be queried directly.
19+
#[derive(
20+
Queryable, Insertable, Selectable, Clone, Debug, Serialize, Deserialize,
21+
)]
22+
#[diesel(table_name = bootstore_config)]
23+
pub struct BootstoreConfig {
24+
pub key: String,
25+
pub generation: i64,
26+
pub data: serde_json::Value,
27+
pub time_created: DateTime<Utc>,
28+
pub time_deleted: Option<DateTime<Utc>>,
29+
}

nexus/db-model/src/schema.rs

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ use omicron_common::api::external::SemverVersion;
1313
///
1414
/// This should be updated whenever the schema is changed. For more details,
1515
/// refer to: schema/crdb/README.adoc
16-
pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(44, 0, 0);
16+
pub const SCHEMA_VERSION: SemverVersion = SemverVersion::new(45, 0, 0);
1717

1818
table! {
1919
disk (id) {
@@ -1529,6 +1529,16 @@ table! {
15291529
}
15301530
}
15311531

1532+
table! {
1533+
bootstore_config (key, generation) {
1534+
key -> Text,
1535+
generation -> Int8,
1536+
data -> Jsonb,
1537+
time_created -> Timestamptz,
1538+
time_deleted -> Nullable<Timestamptz>,
1539+
}
1540+
}
1541+
15321542
table! {
15331543
bfd_session (remote, switch) {
15341544
id -> Uuid,

nexus/db-model/src/unsigned.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,7 @@ where
130130
FromSqlRow,
131131
Serialize,
132132
Deserialize,
133+
QueryId,
133134
)]
134135
#[diesel(sql_type = sql_types::BigInt)]
135136
#[repr(transparent)]

0 commit comments

Comments
 (0)