Skip to content

Commit 68bfc4b

Browse files
authored
(5/N) Read database access records on boot (#8925)
Split off of #8845 Reads records on boot, validates access. Depends on queries from #8932 Fixes #8501
1 parent e44ad55 commit 68bfc4b

File tree

7 files changed

+227
-48
lines changed

7 files changed

+227
-48
lines changed

nexus/db-queries/src/db/datastore/db_metadata.rs

Lines changed: 67 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -443,32 +443,35 @@ impl DataStore {
443443
})
444444
}
445445

446-
// Ensures that the database schema matches "desired_version".
447-
//
448-
// - Updating the schema makes the database incompatible with older
449-
// versions of Nexus, which are not running "desired_version".
450-
// - This is a one-way operation that cannot be undone.
451-
// - The caller is responsible for ensuring that the new version is valid,
452-
// and that all running Nexus instances can understand the new schema
453-
// version.
454-
//
455-
// TODO: This function assumes that all concurrently executing Nexus
456-
// instances on the rack are operating on the same version of software.
457-
// If that assumption is broken, nothing would stop a "new deployment"
458-
// from making a change that invalidates the queries used by an "old
459-
// deployment".
460-
pub async fn ensure_schema(
446+
/// Ensures that the database schema matches `desired_version`.
447+
///
448+
/// - `validated_action`: A [ValidatedDatastoreSetupAction], indicating that
449+
/// [Self::check_schema_and_access] has already been called.
450+
/// - `all_versions`: A description of all schema versions between
451+
/// "whatever is in the DB" and `desired_version`, instructing
452+
/// how to perform an update.
453+
pub async fn update_schema(
461454
&self,
462-
log: &Logger,
463-
desired_version: Version,
455+
validated_action: ValidatedDatastoreSetupAction,
464456
all_versions: Option<&AllSchemaVersions>,
465457
) -> Result<(), anyhow::Error> {
458+
let action = validated_action.action();
459+
460+
match action {
461+
DatastoreSetupAction::Ready => {
462+
bail!("No schema update is necessary")
463+
}
464+
DatastoreSetupAction::Update => (),
465+
_ => bail!("Not ready for schema update"),
466+
}
467+
468+
let desired_version = validated_action.desired_version().clone();
466469
let (found_version, found_target_version) = self
467470
.database_schema_version()
468471
.await
469472
.context("Cannot read database schema version")?;
470473

471-
let log = log.new(o!(
474+
let log = self.log.new(o!(
472475
"found_version" => found_version.to_string(),
473476
"desired_version" => desired_version.to_string(),
474477
));
@@ -1166,15 +1169,34 @@ mod test {
11661169
// Confirms that calling the internal "ensure_schema" function can succeed
11671170
// when the database is already at that version.
11681171
#[tokio::test]
1169-
async fn ensure_schema_is_current_version() {
1170-
let logctx = dev::test_setup_log("ensure_schema_is_current_version");
1172+
async fn check_schema_is_current_version() {
1173+
let logctx = dev::test_setup_log("check_schema_is_current_version");
11711174
let db = TestDatabase::new_with_raw_datastore(&logctx.log).await;
11721175
let datastore = db.datastore();
11731176

1174-
datastore
1175-
.ensure_schema(&logctx.log, SCHEMA_VERSION, None)
1177+
let checked_action = datastore
1178+
.check_schema_and_access(
1179+
IdentityCheckPolicy::DontCare,
1180+
SCHEMA_VERSION,
1181+
)
11761182
.await
1177-
.expect("Failed to ensure schema");
1183+
.expect("Failed to check schema and access");
1184+
1185+
assert!(
1186+
matches!(checked_action.action(), DatastoreSetupAction::Ready),
1187+
"Unexpected action: {:?}",
1188+
checked_action.action(),
1189+
);
1190+
assert_eq!(
1191+
checked_action.desired_version(),
1192+
&SCHEMA_VERSION,
1193+
"Unexpected desired version: {}",
1194+
checked_action.desired_version()
1195+
);
1196+
1197+
datastore.update_schema(checked_action, None).await.expect_err(
1198+
"Should not be able to update schema that's already up-to-date",
1199+
);
11781200

11791201
db.terminate().await;
11801202
logctx.cleanup_successful();
@@ -1277,8 +1299,13 @@ mod test {
12771299
let log = log.clone();
12781300
let pool = pool.clone();
12791301
tokio::task::spawn(async move {
1280-
let datastore =
1281-
DataStore::new(&log, pool, Some(&all_versions)).await?;
1302+
let datastore = DataStore::new(
1303+
&log,
1304+
pool,
1305+
Some(&all_versions),
1306+
IdentityCheckPolicy::DontCare,
1307+
)
1308+
.await?;
12821309

12831310
// This is the crux of this test: confirm that, as each
12841311
// migration completes, it's not possible to see any artifacts
@@ -1405,9 +1432,23 @@ mod test {
14051432

14061433
// Manually construct the datastore to avoid the backoff timeout.
14071434
// We want to trigger errors, but have no need to wait.
1435+
14081436
let datastore = DataStore::new_unchecked(log.clone(), pool.clone());
1437+
let checked_action = datastore
1438+
.check_schema_and_access(
1439+
IdentityCheckPolicy::DontCare,
1440+
SCHEMA_VERSION,
1441+
)
1442+
.await
1443+
.expect("Failed to check schema and access");
1444+
1445+
// This needs to be in a loop because we constructed a schema change
1446+
// that will intentionally fail sometimes when doing this work.
1447+
//
1448+
// This isn't a normal behavior! But we're trying to test the
1449+
// intermediate steps of a schema change here.
14091450
while let Err(e) = datastore
1410-
.ensure_schema(&log, SCHEMA_VERSION, Some(&all_versions))
1451+
.update_schema(checked_action.clone(), Some(&all_versions))
14111452
.await
14121453
{
14131454
warn!(log, "Failed to ensure schema"; "err" => %e);

nexus/db-queries/src/db/datastore/mod.rs

Lines changed: 89 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ use omicron_uuid_kinds::GenericUuid;
4343
use omicron_uuid_kinds::OmicronZoneUuid;
4444
use omicron_uuid_kinds::SledUuid;
4545
use slog::Logger;
46+
use slog_error_chain::InlineErrorChain;
4647
use std::net::Ipv6Addr;
4748
use std::num::NonZeroU32;
4849
use std::sync::Arc;
@@ -123,6 +124,8 @@ pub mod webhook_delivery;
123124
mod zpool;
124125

125126
pub use address_lot::AddressLotCreateResult;
127+
pub use db_metadata::DatastoreSetupAction;
128+
pub use db_metadata::ValidatedDatastoreSetupAction;
126129
pub use dns::DataStoreDnsTest;
127130
pub use dns::DnsVersionUpdateBuilder;
128131
pub use ereport::EreportFilters;
@@ -241,16 +244,19 @@ impl DataStore {
241244
log: &Logger,
242245
pool: Arc<Pool>,
243246
config: Option<&AllSchemaVersions>,
247+
identity_check: IdentityCheckPolicy,
244248
) -> Result<Self, String> {
245-
Self::new_with_timeout(log, pool, config, None).await
249+
Self::new_with_timeout(log, pool, config, None, identity_check).await
246250
}
247251

248252
pub async fn new_with_timeout(
249253
log: &Logger,
250254
pool: Arc<Pool>,
251255
config: Option<&AllSchemaVersions>,
252256
try_for: Option<std::time::Duration>,
257+
identity_check: IdentityCheckPolicy,
253258
) -> Result<Self, String> {
259+
use db_metadata::DatastoreSetupAction;
254260
use nexus_db_model::SCHEMA_VERSION as EXPECTED_VERSION;
255261

256262
let datastore =
@@ -264,25 +270,96 @@ impl DataStore {
264270
|| async {
265271
if let Some(try_for) = try_for {
266272
if std::time::Instant::now() > start + try_for {
267-
return Err(BackoffError::permanent(()));
273+
return Err(BackoffError::permanent(
274+
"Timeout waiting for DataStore::new_with_timeout",
275+
));
268276
}
269277
}
270278

271-
match datastore
272-
.ensure_schema(&log, EXPECTED_VERSION, config)
273-
.await
274-
{
275-
Ok(()) => return Ok(()),
276-
Err(e) => {
277-
warn!(log, "Failed to ensure schema version"; "error" => #%e);
279+
loop {
280+
let checked_action = datastore
281+
.check_schema_and_access(
282+
identity_check,
283+
EXPECTED_VERSION,
284+
)
285+
.await
286+
.map_err(|err| {
287+
warn!(
288+
log,
289+
"Cannot check schema version / Nexus access";
290+
InlineErrorChain::new(err.as_ref()),
291+
);
292+
BackoffError::transient(
293+
"Cannot check schema version / Nexus access",
294+
)
295+
})?;
296+
297+
match checked_action.action() {
298+
DatastoreSetupAction::Ready => {
299+
info!(log, "Datastore is ready for usage");
300+
return Ok(());
301+
}
302+
DatastoreSetupAction::NeedsHandoff { nexus_id } => {
303+
info!(log, "Datastore is awaiting handoff");
304+
305+
datastore
306+
.attempt_handoff(*nexus_id)
307+
.await
308+
.map_err(|err| {
309+
warn!(
310+
log,
311+
"Could not handoff to new nexus";
312+
err
313+
);
314+
BackoffError::transient(
315+
"Could not handoff to new nexus",
316+
)
317+
})?;
318+
319+
// If the handoff was successful, immediately
320+
// re-evaluate the schema and access policies to see
321+
// if we should update or not.
322+
continue;
323+
}
324+
DatastoreSetupAction::TryLater => {
325+
error!(log, "Waiting for metadata; trying later");
326+
return Err(BackoffError::permanent(
327+
"Waiting for metadata; trying later",
328+
));
329+
}
330+
DatastoreSetupAction::Update => {
331+
info!(
332+
log,
333+
"Datastore should be updated before usage"
334+
);
335+
datastore
336+
.update_schema(checked_action, config)
337+
.await
338+
.map_err(|err| {
339+
warn!(
340+
log,
341+
"Failed to update schema version";
342+
InlineErrorChain::new(err.as_ref())
343+
);
344+
BackoffError::transient(
345+
"Failed to update schema version",
346+
)
347+
})?;
348+
return Ok(());
349+
}
350+
DatastoreSetupAction::Refuse => {
351+
error!(log, "Datastore should not be used");
352+
return Err(BackoffError::permanent(
353+
"Datastore should not be used",
354+
));
355+
}
278356
}
279-
};
280-
return Err(BackoffError::transient(()));
357+
}
281358
},
282359
|_, _| {},
283360
)
284361
.await
285-
.map_err(|_| "Failed to read valid DB schema".to_string())?;
362+
.map_err(|err| err.to_string())?;
286363

287364
Ok(datastore)
288365
}

nexus/db-queries/src/db/pub_test_utils/mod.rs

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ use crate::authz;
1212
use crate::context::OpContext;
1313
use crate::db;
1414
use crate::db::DataStore;
15+
use crate::db::datastore::IdentityCheckPolicy;
1516
use omicron_test_utils::dev::db::CockroachInstance;
1617
use slog::Logger;
1718
use std::sync::Arc;
@@ -114,7 +115,14 @@ impl TestDatabaseBuilder {
114115
Interface::Datastore => {
115116
let pool = new_pool(log, &db);
116117
let datastore = Arc::new(
117-
DataStore::new(&log, pool, None).await.unwrap(),
118+
DataStore::new(
119+
&log,
120+
pool,
121+
None,
122+
IdentityCheckPolicy::DontCare,
123+
)
124+
.await
125+
.unwrap(),
118126
);
119127
TestDatabase {
120128
db,
@@ -300,7 +308,11 @@ async fn datastore_test(
300308

301309
let cfg = db::Config { url: db.pg_config().clone() };
302310
let pool = Arc::new(db::Pool::new_single_host(&log, &cfg));
303-
let datastore = Arc::new(DataStore::new(&log, pool, None).await.unwrap());
311+
let datastore = Arc::new(
312+
DataStore::new(&log, pool, None, IdentityCheckPolicy::DontCare)
313+
.await
314+
.unwrap(),
315+
);
304316

305317
// Create an OpContext with the credentials of "db-init" just for the
306318
// purpose of loading the built-in users, roles, and assignments.

nexus/src/app/mod.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ use nexus_db_queries::authn;
2424
use nexus_db_queries::authz;
2525
use nexus_db_queries::context::OpContext;
2626
use nexus_db_queries::db;
27+
use nexus_db_queries::db::datastore::IdentityCheckPolicy;
2728
use nexus_mgs_updates::ArtifactCache;
2829
use nexus_mgs_updates::MgsUpdateDriver;
2930
use nexus_types::deployment::PendingMgsUpdates;
@@ -308,12 +309,14 @@ impl Nexus {
308309
.map(|s| AllSchemaVersions::load(&s.schema_dir))
309310
.transpose()
310311
.map_err(|error| format!("{error:#}"))?;
312+
let nexus_id = config.deployment.id;
311313
let db_datastore = Arc::new(
312314
db::DataStore::new_with_timeout(
313315
&log,
314316
Arc::clone(&pool),
315317
all_versions.as_ref(),
316318
config.pkg.tunables.load_timeout,
319+
IdentityCheckPolicy::CheckAndTakeover { nexus_id },
317320
)
318321
.await?,
319322
);

nexus/src/bin/schema-updater.rs

Lines changed: 36 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@ use nexus_db_model::AllSchemaVersions;
1414
use nexus_db_model::SCHEMA_VERSION;
1515
use nexus_db_queries::db;
1616
use nexus_db_queries::db::DataStore;
17+
use nexus_db_queries::db::datastore::DatastoreSetupAction;
18+
use nexus_db_queries::db::datastore::IdentityCheckPolicy;
1719
use semver::Version;
1820
use slog::Drain;
1921
use slog::Level;
@@ -108,11 +110,40 @@ async fn main_impl() -> anyhow::Result<()> {
108110
}
109111
Cmd::Upgrade { version } => {
110112
println!("Upgrading to {version}");
111-
datastore
112-
.ensure_schema(&log, version.clone(), Some(&all_versions))
113-
.await
114-
.map_err(|e| anyhow!(e))?;
115-
println!("Upgrade to {version} complete");
113+
let checked_action = datastore
114+
.check_schema_and_access(
115+
IdentityCheckPolicy::DontCare,
116+
version.clone(),
117+
)
118+
.await?;
119+
120+
match checked_action.action() {
121+
DatastoreSetupAction::Ready => {
122+
println!("Already at version {version}")
123+
}
124+
DatastoreSetupAction::Update => {
125+
datastore
126+
.update_schema(checked_action, Some(&all_versions))
127+
.await
128+
.map_err(|e| anyhow!(e))?;
129+
println!("Update to {version} complete");
130+
}
131+
DatastoreSetupAction::Refuse => {
132+
println!("Refusing to update to version {version}")
133+
}
134+
DatastoreSetupAction::TryLater
135+
| DatastoreSetupAction::NeedsHandoff { .. } => {
136+
// This case should not happen - we supplied
137+
// IdentityCheckPolicy::DontCare, so we should not be told
138+
// to attempt a takeover by a specific Nexus.
139+
println!(
140+
"Refusing to update to version {version}. \
141+
The schema updater tried to ignore the identity check, \
142+
but got a response indicating handoff is needed. \
143+
This is unexpected, and probably a bug"
144+
)
145+
}
146+
}
116147
}
117148
}
118149
datastore.terminate().await;

0 commit comments

Comments
 (0)