From c9a71c7344054cc958b44254e925528715ce8423 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Thu, 23 Nov 2023 12:08:38 +0000 Subject: [PATCH 1/4] fix: Eliminate MainError::MicrovmStoppedWithoutError This error variant was used to encode that no error actually happened, which does not make much sense conceptually. What made this worse is that is contained a FcExitCode, which is itself just a fake Result<(), non-zero-exit code>. This means it was possible to get Firecracker to exit with status "error, but not actually error, but actually there is an error after all", or: "Firecracker exited with an error: Microvm stopped without an error: GenericError". The underlying problem here is the fact that we are using `FcExitCode` as an error variant for `Result`. Since `FcExitCode::Ok` exists, `FcExitCode` is a kind of `Result` itself, meaning we are dealing with `Result<_, Result<_, _>>` as a type, which has no well-defined interpretation. Signed-off-by: Patrick Roy --- src/firecracker/src/api_server_adapter.rs | 14 ++++---------- src/firecracker/src/main.rs | 1 - 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/src/firecracker/src/api_server_adapter.rs b/src/firecracker/src/api_server_adapter.rs index b2cad171f3c..3f008658a22 100644 --- a/src/firecracker/src/api_server_adapter.rs +++ b/src/firecracker/src/api_server_adapter.rs @@ -22,8 +22,6 @@ use vmm::{EventManager, FcExitCode, Vmm}; #[derive(Debug, thiserror::Error, displaydoc::Display)] pub enum ApiServerError { - /// MicroVMStopped without an error: {0:?} - MicroVMStoppedWithoutError(FcExitCode), /// MicroVMStopped with an error: {0:?} MicroVMStoppedWithError(FcExitCode), /// Failed to open the API socket at: {0}. Check that it is not already used. @@ -52,7 +50,7 @@ impl ApiServerAdapter { vm_resources: VmResources, vmm: Arc>, event_manager: &mut EventManager, - ) -> Result<(), FcExitCode> { + ) -> Result<(), ApiServerError> { let api_adapter = Arc::new(Mutex::new(Self { api_event_fd, from_api, @@ -67,7 +65,7 @@ impl ApiServerAdapter { match vmm.lock().unwrap().shutdown_exit_code() { Some(FcExitCode::Ok) => break, - Some(exit_code) => return Err(exit_code), + Some(exit_code) => return Err(ApiServerError::MicroVMStoppedWithError(exit_code)), None => continue, } } @@ -227,7 +225,7 @@ pub(crate) fn run_with_api( .map_err(ApiServerError::MicroVMStoppedWithError), }; - let result = build_result.map(|(vm_resources, vmm)| { + let result = build_result.and_then(|(vm_resources, vmm)| { firecracker_metrics .lock() .expect("Poisoned lock") @@ -248,9 +246,5 @@ pub(crate) fn run_with_api( // shutdown-internal and returns from its function. api_thread.join().expect("Api thread should join"); - match result { - Ok(Ok(())) => Ok(()), - Ok(Err(exit_code)) => Err(ApiServerError::MicroVMStoppedWithoutError(exit_code)), - Err(exit_error) => Err(exit_error), - } + result } diff --git a/src/firecracker/src/main.rs b/src/firecracker/src/main.rs index 1aef9ce77c7..e26e7ef8eee 100644 --- a/src/firecracker/src/main.rs +++ b/src/firecracker/src/main.rs @@ -81,7 +81,6 @@ impl From for ExitCode { let exit_code = match value { MainError::ParseArguments(_) => FcExitCode::ArgParsing, MainError::InvalidLogLevel(_) => FcExitCode::BadConfiguration, - MainError::RunWithApi(ApiServerError::MicroVMStoppedWithoutError(code)) => code, MainError::RunWithApi(ApiServerError::MicroVMStoppedWithError(code)) => code, MainError::RunWithoutApiError(RunWithoutApiError::Shutdown(code)) => code, _ => FcExitCode::GenericError, From 131c5469b0a41d1b3cc8ebd832eb9e06ff231096 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Thu, 23 Nov 2023 12:30:13 +0000 Subject: [PATCH 2/4] fix: Give build_microvm_from_requests a proper error type Using FcExitCode as an error type is undesirable, as it allows us to construct Err(FcExitCode::Ok), e.g. an object that says "error: everything's okay!". This is confusing and has caused problems in different contexts before, so replace FcExitCode with a proper error type here. Signed-off-by: Patrick Roy --- src/firecracker/src/api_server_adapter.rs | 9 +++-- src/vmm/src/rpc_interface.rs | 40 +++++++++++++---------- 2 files changed, 28 insertions(+), 21 deletions(-) diff --git a/src/firecracker/src/api_server_adapter.rs b/src/firecracker/src/api_server_adapter.rs index 3f008658a22..f529bb60038 100644 --- a/src/firecracker/src/api_server_adapter.rs +++ b/src/firecracker/src/api_server_adapter.rs @@ -15,14 +15,17 @@ use utils::eventfd::EventFd; use vmm::logger::{error, warn, ProcessTimeReporter}; use vmm::resources::VmResources; use vmm::rpc_interface::{ - ApiRequest, ApiResponse, PrebootApiController, RuntimeApiController, VmmAction, + ApiRequest, ApiResponse, BuildMicrovmFromRequestsError, PrebootApiController, + RuntimeApiController, VmmAction, }; use vmm::vmm_config::instance_info::InstanceInfo; use vmm::{EventManager, FcExitCode, Vmm}; #[derive(Debug, thiserror::Error, displaydoc::Display)] pub enum ApiServerError { - /// MicroVMStopped with an error: {0:?} + /// Failed to build MicroVM: {0}. + BuildMicroVmError(BuildMicrovmFromRequestsError), + /// MicroVM stopped with an error: {0:?} MicroVMStoppedWithError(FcExitCode), /// Failed to open the API socket at: {0}. Check that it is not already used. FailedToBindSocket(String), @@ -222,7 +225,7 @@ pub(crate) fn run_with_api( mmds_size_limit, metadata_json, ) - .map_err(ApiServerError::MicroVMStoppedWithError), + .map_err(ApiServerError::BuildMicroVmError), }; let result = build_result.and_then(|(vm_resources, vmm)| { diff --git a/src/vmm/src/rpc_interface.rs b/src/vmm/src/rpc_interface.rs index d7ee989d4ff..9b52bfe6fd9 100644 --- a/src/vmm/src/rpc_interface.rs +++ b/src/vmm/src/rpc_interface.rs @@ -20,7 +20,7 @@ use super::{ }; use crate::builder::StartMicrovmError; use crate::cpu_config::templates::{CustomCpuTemplate, GuestConfigError}; -use crate::logger::{error, info, warn, LoggerConfig, *}; +use crate::logger::{info, warn, LoggerConfig, *}; use crate::mmds::data_store::{self, Mmds}; use crate::persist::{CreateSnapshotError, RestoreFromSnapshotError, VmInfo}; use crate::resources::VmmConfig; @@ -42,7 +42,7 @@ use crate::vmm_config::net::{ use crate::vmm_config::snapshot::{CreateSnapshotParams, LoadSnapshotParams, SnapshotType}; use crate::vmm_config::vsock::{VsockConfigError, VsockDeviceConfig}; use crate::vmm_config::{self, RateLimiterUpdate}; -use crate::{EventManager, FcExitCode}; +use crate::EventManager; /// This enum represents the public interface of the VMM. Each action contains various /// bits of information (ids, paths, etc.). @@ -247,7 +247,7 @@ pub struct PrebootApiController<'a> { boot_path: bool, // Some PrebootApiRequest errors are irrecoverable and Firecracker // should cleanly teardown if they occur. - fatal_error: Option, + fatal_error: Option, } // TODO Remove when `EventManager` implements `std::fmt::Debug`. @@ -287,6 +287,17 @@ pub type ApiRequest = Box; /// Shorthand type for a response containing a boxed Result. pub type ApiResponse = Box>; +/// Error type for `PrebootApiController::build_microvm_from_requests`. +#[derive(Debug, thiserror::Error, displaydoc::Display, derive_more::From)] +pub enum BuildMicrovmFromRequestsError { + /// Populating MMDS from file failed: {0:?}. + Mmds(data_store::Error), + /// Loading snapshot failed. + Restore, + /// Resuming MicroVM after loading snapshot failed. + Resume, +} + impl<'a> PrebootApiController<'a> { /// Constructor for the PrebootApiController. pub fn new( @@ -320,7 +331,7 @@ impl<'a> PrebootApiController<'a> { boot_timer_enabled: bool, mmds_size_limit: usize, metadata_json: Option<&str>, - ) -> Result<(VmResources, Arc>), FcExitCode> { + ) -> Result<(VmResources, Arc>), BuildMicrovmFromRequestsError> { let mut vm_resources = VmResources::default(); // Silence false clippy warning. Clippy suggests using // VmResources { boot_timer: boot_timer_enabled, ..Default::default() }; but this will @@ -333,16 +344,9 @@ impl<'a> PrebootApiController<'a> { // Init the data store from file, if present. if let Some(data) = metadata_json { - vm_resources - .locked_mmds_or_default() - .put_data( - serde_json::from_str(data) - .expect("MMDS error: metadata provided not valid json"), - ) - .map_err(|err| { - error!("Populating MMDS from file failed: {:?}", err); - crate::FcExitCode::GenericError - })?; + vm_resources.locked_mmds_or_default().put_data( + serde_json::from_str(data).expect("MMDS error: metadata provided not valid json"), + )?; info!("Successfully added metadata to mmds from file"); } @@ -376,8 +380,8 @@ impl<'a> PrebootApiController<'a> { to_api.send(Box::new(res)).expect("one-shot channel closed"); // If any fatal errors were encountered, break the loop. - if let Some(exit_code) = preboot_controller.fatal_error { - return Err(exit_code); + if let Some(preboot_error) = preboot_controller.fatal_error { + return Err(preboot_error); } } @@ -577,7 +581,7 @@ impl<'a> PrebootApiController<'a> { ) .map_err(|err| { // If restore fails, we consider the process is too dirty to recover. - self.fatal_error = Some(FcExitCode::BadConfiguration); + self.fatal_error = Some(BuildMicrovmFromRequestsError::Restore); err })?; // Resume VM @@ -587,7 +591,7 @@ impl<'a> PrebootApiController<'a> { .resume_vm() .map_err(|err| { // If resume fails, we consider the process is too dirty to recover. - self.fatal_error = Some(FcExitCode::BadConfiguration); + self.fatal_error = Some(BuildMicrovmFromRequestsError::Resume); err })?; } From 65cb91ffdcc0b5c2fd5b0ff34e994adcbfd26fb7 Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Thu, 23 Nov 2023 14:11:27 +0000 Subject: [PATCH 3/4] fix: Ensure to exit with error if any vcpu exits with error Previously, when a VM exited, we looked for the first vcpu that reported an exit status, and then indiscriminately reported that back. However, it is possible to one vcpu to exit successfully while another exits with an error, and this could lead us to report "Firecracker Exited Successfully" even though it did not. Now, we explicitly look for the status code of all vcpus. If any of them report an error, this takes precedence over non-error status codes. Signed-off-by: Patrick Roy --- src/vmm/src/lib.rs | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index 3aec20fa36e..e67b31823a7 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -110,7 +110,7 @@ pub mod vstate; use std::collections::HashMap; use std::io; use std::os::unix::io::AsRawFd; -use std::sync::mpsc::{RecvTimeoutError, TryRecvError}; +use std::sync::mpsc::RecvTimeoutError; use std::sync::{Arc, Barrier, Mutex}; use std::time::Duration; @@ -903,23 +903,27 @@ impl MutEventSubscriber for Vmm { // Exit event handling should never do anything more than call 'self.stop()'. let _ = self.vcpus_exit_evt.read(); - let mut exit_code = None; - // Query each vcpu for their exit_code. - for handle in &self.vcpus_handles { - match handle.response_receiver().try_recv() { - Ok(VcpuResponse::Exited(status)) => { - exit_code = Some(status); - // Just use the first encountered exit-code. - break; - } - Ok(_response) => {} // Don't care about these, we are exiting. - Err(TryRecvError::Empty) => {} // Nothing pending in channel - Err(err) => { - panic!("Error while looking for VCPU exit status: {}", err); + let exit_code = 'exit_code: { + // Query each vcpu for their exit_code. + for handle in &self.vcpus_handles { + // Drain all vcpu responses that are pending from this vcpu until we find an + // exit status. + for response in handle.response_receiver().try_iter() { + if let VcpuResponse::Exited(status) = response { + // It could be that some vcpus exited successfully while others + // errored out. Thus make sure that error exits from one vcpu always + // takes precedence over "ok" exits + if status != FcExitCode::Ok { + break 'exit_code status; + } + } } } - } - self.stop(exit_code.unwrap_or(FcExitCode::Ok)); + + // No CPUs exited with error status code, report "Ok" + FcExitCode::Ok + }; + self.stop(exit_code); } else { error!("Spurious EventManager event for handler: Vmm"); } From f8f446a4d9acdfa299958b78a1f8802d7f8a767d Mon Sep 17 00:00:00 2001 From: Patrick Roy Date: Fri, 24 Nov 2023 16:34:39 +0000 Subject: [PATCH 4/4] doc: Add changelog entry for #4271 Backported commit with PR number chnaged. Signed-off-by: Patrick Roy --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 4a762f30553..95d857b6c33 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,11 @@ - [#4179](https://github.com/firecracker-microvm/firecracker/pull/4179): Fixed a bug reporting a non-zero exit code on successful shutdown when starting Firecracker with `--no-api`. +- [#4271](https://github.com/firecracker-microvm/firecracker/pull/4271): Fixed + a bug where Firecracker would log "RunWithApiError error: MicroVMStopped + without an error: GenericError" when exiting after encountering an emulation + error. It now correctly prints "RunWithApiError error: MicroVMStopped *with* an + error: GenericError". - [#4270](https://github.com/firecracker-microvm/firecracker/pull/4270): Fixed a bug introduced in #4047 that limited the `--level` option of logger to Pascal-cased values (e.g. accepting "Info", but not "info"). It now