From 955980ba5ad54fedd1f385b93cf06bad1335fb26 Mon Sep 17 00:00:00 2001 From: Jonathan Lebon Date: Thu, 24 Jun 2021 11:23:30 -0400 Subject: [PATCH] Add `coreos-installer install --fetch-retries` We're hitting issues in RHCOS right now where it's possible that `coreos-installer.service` is racing with networking being fully up, even though we're ordered after `network-online.target` and `systemd-resolved.service` (though RHCOS doesn't use the latter). The issue may not be a race in the end, but misconfigured networking. But anyway, we really should be retrying fetches. I gated this behind a switch instead of doing it by default for all fetches, because e.g. interactively I think it makes more sense not to retry. (And similarly for other commands like `download` and `list-stream`). Related: https://bugzilla.redhat.com/show_bug.cgi?id=1974411 Related: https://bugzilla.redhat.com/show_bug.cgi?id=1967483 Closes: #283 --- docs/cmd/download.md | 1 + docs/cmd/install.md | 1 + scripts/coreos-installer-service | 3 ++ src/cmdline.rs | 59 +++++++++++++++++++-- src/download.rs | 9 +--- src/source.rs | 88 ++++++++++++++++++++++++-------- 6 files changed, 130 insertions(+), 31 deletions(-) diff --git a/docs/cmd/download.md b/docs/cmd/download.md index 615d81ca2..a0bdd256f 100644 --- a/docs/cmd/download.md +++ b/docs/cmd/download.md @@ -25,3 +25,4 @@ Download a CoreOS image | **--decompress**, **-d** | Decompress image and don't save signature | | **--insecure** | Skip signature verification | | **--stream-base-url** *URL* | Base URL for Fedora CoreOS stream metadata | +| **--fetch-retries** *N* | Fetch retries, or string "infinite" | diff --git a/docs/cmd/install.md b/docs/cmd/install.md index ebb74c881..e9f9f247d 100644 --- a/docs/cmd/install.md +++ b/docs/cmd/install.md @@ -39,3 +39,4 @@ Install Fedora CoreOS or RHEL CoreOS | **--stream-base-url** *URL* | Base URL for Fedora CoreOS stream metadata | | **--architecture** *name* | Target CPU architecture [default: x86_64] | | **--preserve-on-error** | Don't clear partition table on error | +| **--fetch-retries** *N* | Fetch retries, or string "infinite" | diff --git a/scripts/coreos-installer-service b/scripts/coreos-installer-service index 347b5f56c..4870d6e0f 100755 --- a/scripts/coreos-installer-service +++ b/scripts/coreos-installer-service @@ -97,6 +97,9 @@ if karg_bool coreos.inst.insecure; then args+=("--insecure") fi +# Always retry HTTP requests; we've got nothing to lose since we fail anyway. +args+=("--fetch-retries" "infinite") + # Ensure device nodes have been created udevadm settle diff --git a/src/cmdline.rs b/src/cmdline.rs index 92d931b0e..208d9852f 100644 --- a/src/cmdline.rs +++ b/src/cmdline.rs @@ -62,6 +62,14 @@ pub struct InstallConfig { pub preserve_on_error: bool, pub network_config: Option, pub save_partitions: Vec, + pub fetch_retries: FetchRetries, +} + +#[derive(Debug, Clone, Copy)] +pub enum FetchRetries { + Infinite, + Finite(NonZeroU32), + None, } #[derive(Debug, PartialEq, Eq)] @@ -334,6 +342,13 @@ pub fn parse_args() -> Result { .long("preserve-on-error") .help("Don't clear partition table on error"), ) + .arg( + Arg::with_name("fetch-retries") + .long("fetch-retries") + .value_name("N") + .help("Fetch retries, or string \"infinite\"") + .takes_value(true), + ) // positional args .arg( Arg::with_name("device") @@ -414,6 +429,13 @@ pub fn parse_args() -> Result { .value_name("URL") .help("Base URL for Fedora CoreOS stream metadata") .takes_value(true), + ) + .arg( + Arg::with_name("fetch-retries") + .long("fetch-retries") + .value_name("N") + .help("Fetch retries, or string \"infinite\"") + .takes_value(true), ), ) .subcommand( @@ -855,6 +877,20 @@ fn parse_install(matches: &ArgMatches) -> Result { .value_of("architecture") .expect("architecture missing"); + let fetch_retries: FetchRetries = matches + .value_of("fetch-retries") + .map(|s| match s { + "infinite" => Ok(FetchRetries::Infinite), + num => num.parse::().map(|num| { + NonZeroU32::new(num) + .map(FetchRetries::Finite) + .unwrap_or(FetchRetries::None) + }), + }) + .transpose() + .context("parsing --fetch-retries")? + .unwrap_or(FetchRetries::None); + // Uninitialized ECKD DASD's blocksize is 512, but after formatting // it changes to the recommended 4096 // https://bugzilla.redhat.com/show_bug.cgi?id=1905159 @@ -878,7 +914,7 @@ fn parse_install(matches: &ArgMatches) -> Result { } else if matches.is_present("image-url") { let image_url = Url::parse(matches.value_of("image-url").expect("image-url missing")) .context("parsing image URL")?; - Box::new(UrlLocation::new(&image_url)) + Box::new(UrlLocation::new(&image_url, fetch_retries)) } else if matches.is_present("offline") { match OsmetLocation::new(architecture, sector_size)? { Some(osmet) => Box::new(osmet), @@ -921,6 +957,7 @@ fn parse_install(matches: &ArgMatches) -> Result { "metal", format, base_url.as_ref(), + fetch_retries, )?) } }; @@ -944,7 +981,7 @@ fn parse_install(matches: &ArgMatches) -> Result { } else if !url.starts_with("https://") { bail!("unknown protocol for URL '{}'", url); } - download_to_tempfile(url) + download_to_tempfile(url, fetch_retries) .with_context(|| format!("downloading source Ignition config {}", url)) }).transpose()? } else { @@ -968,6 +1005,7 @@ fn parse_install(matches: &ArgMatches) -> Result { device, location, ignition, + fetch_retries, ignition_hash: matches .value_of("ignition-hash") .map(IgnitionHash::try_parse) @@ -1051,10 +1089,24 @@ fn parse_download(matches: &ArgMatches) -> Result { // Build image location. Ideally we'd use conflicts_with (and an // ArgGroup for streams), but that doesn't play well with default // arguments, so we manually prioritize modes. + let fetch_retries: FetchRetries = matches + .value_of("fetch-retries") + .map(|s| match s { + "infinite" => Ok(FetchRetries::Infinite), + num => num.parse::().map(|num| { + NonZeroU32::new(num) + .map(FetchRetries::Finite) + .unwrap_or(FetchRetries::None) + }), + }) + .transpose() + .context("parsing --fetch-retries")? + .unwrap_or(FetchRetries::None); + let location: Box = if matches.is_present("image-url") { let image_url = Url::parse(matches.value_of("image-url").expect("image-url missing")) .context("parsing image URL")?; - Box::new(UrlLocation::new(&image_url)) + Box::new(UrlLocation::new(&image_url, fetch_retries)) } else { let base_url = if let Some(stream_base_url) = matches.value_of("stream-base-url") { Some(Url::parse(stream_base_url).context("parsing stream base URL")?) @@ -1069,6 +1121,7 @@ fn parse_download(matches: &ArgMatches) -> Result { matches.value_of("platform").expect("platform missing"), matches.value_of("format").expect("format missing"), base_url.as_ref(), + fetch_retries, )?) }; diff --git a/src/download.rs b/src/download.rs index 3eb5c67cf..e60713363 100644 --- a/src/download.rs +++ b/src/download.rs @@ -327,16 +327,11 @@ pub fn image_copy_default( Ok(()) } -pub fn download_to_tempfile(url: &str) -> Result { +pub fn download_to_tempfile(url: &str, retries: FetchRetries) -> Result { let mut f = tempfile::tempfile()?; let client = new_http_client()?; - let mut resp = client - .get(url) - .send() - .with_context(|| format!("sending request for '{}'", url))? - .error_for_status() - .with_context(|| format!("fetching '{}'", url))?; + let mut resp = http_get(client, url, retries)?; let mut writer = BufWriter::with_capacity(BUFFER_SIZE, &mut f); copy( diff --git a/src/source.rs b/src/source.rs index 28d0d1e74..ea1a49e74 100644 --- a/src/source.rs +++ b/src/source.rs @@ -20,6 +20,7 @@ use std::fmt::{Display, Formatter}; use std::fs::OpenOptions; use std::io::{Read, Seek, SeekFrom}; use std::path::{Path, PathBuf}; +use std::thread::sleep; use std::time::Duration; use crate::cmdline::*; @@ -65,6 +66,7 @@ pub struct UrlLocation { image_url: Url, sig_url: Url, artifact_type: String, + retries: FetchRetries, } // Remote image source specified by Fedora CoreOS stream metadata @@ -76,6 +78,7 @@ pub struct StreamLocation { architecture: String, platform: String, format: String, + retries: FetchRetries, } pub struct ImageSource { @@ -150,28 +153,25 @@ impl ImageLocation for FileLocation { } impl UrlLocation { - pub fn new(url: &Url) -> Self { + pub fn new(url: &Url, retries: FetchRetries) -> Self { let mut sig_url = url.clone(); sig_url.set_path(&format!("{}.sig", sig_url.path())); - Self::new_with_sig_and_type(url, &sig_url, "disk") + Self::new_full(url, &sig_url, "disk", retries) } - fn new_with_sig_and_type(url: &Url, sig_url: &Url, artifact_type: &str) -> Self { + fn new_full(url: &Url, sig_url: &Url, artifact_type: &str, retries: FetchRetries) -> Self { Self { image_url: url.clone(), sig_url: sig_url.clone(), artifact_type: artifact_type.to_string(), + retries, } } /// Fetch signature content from URL. - fn fetch_signature(sig_url: &Url) -> Result> { + fn fetch_signature(&self) -> Result> { let client = new_http_client()?; - let mut resp = client - .get(sig_url.clone()) - .send() - .context("sending signature request")? - .error_for_status() + let mut resp = http_get(client, self.sig_url.as_str(), self.retries) .context("fetching signature URL")?; let mut sig_bytes = Vec::new(); @@ -193,15 +193,14 @@ impl Display for UrlLocation { impl ImageLocation for UrlLocation { fn sources(&self) -> Result> { - let signature = Self::fetch_signature(&self.sig_url) + let signature = self + .fetch_signature() .map_err(|e| eprintln!("Failed to fetch signature: {}", e)) .ok(); // start fetch, get length let client = new_http_client()?; - let resp = client - .get(self.image_url.clone()) - .send() + let resp = http_get(client, self.image_url.as_str(), self.retries) .context("fetching image URL")?; match resp.status() { StatusCode::OK => (), @@ -234,6 +233,7 @@ impl StreamLocation { platform: &str, format: &str, base_url: Option<&Url>, + retries: FetchRetries, ) -> Result { Ok(Self { stream_base_url: base_url.cloned(), @@ -242,6 +242,7 @@ impl StreamLocation { architecture: architecture.to_string(), platform: platform.to_string(), format: format.to_string(), + retries, }) } } @@ -268,7 +269,7 @@ impl ImageLocation for StreamLocation { fn sources(&self) -> Result> { // fetch and parse stream metadata let client = new_http_client()?; - let stream = fetch_stream(client, &self.stream_url)?; + let stream = fetch_stream(client, &self.stream_url, self.retries)?; // descend it let artifacts = stream @@ -293,7 +294,7 @@ impl ImageLocation for StreamLocation { let signature_url = Url::parse(&artifact.signature) .context("parsing signature URL from stream metadata")?; let mut artifact_sources = - UrlLocation::new_with_sig_and_type(&artifact_url, &signature_url, &artifact_type) + UrlLocation::new_full(&artifact_url, &signature_url, &artifact_type, self.retries) .sources()?; sources.append(&mut artifact_sources); } @@ -385,7 +386,7 @@ pub fn list_stream(config: &ListStreamConfig) -> Result<()> { // fetch stream metadata let client = new_http_client()?; let stream_url = build_stream_url(&config.stream, config.stream_base_url.as_ref())?; - let stream = fetch_stream(client, &stream_url)?; + let stream = fetch_stream(client, &stream_url, FetchRetries::None)?; // walk formats let mut rows: Vec = Vec::new(); @@ -439,12 +440,9 @@ fn build_stream_url(stream: &str, base_url: Option<&Url>) -> Result { } /// Fetch and parse stream metadata. -fn fetch_stream(client: blocking::Client, url: &Url) -> Result { +fn fetch_stream(client: blocking::Client, url: &Url, retries: FetchRetries) -> Result { // fetch stream metadata - let resp = client - .get(url.clone()) - .send() - .context("fetching stream metadata")?; + let resp = http_get(client, url.as_str(), retries).context("fetching stream metadata")?; match resp.status() { StatusCode::OK => (), s => bail!("stream metadata fetch from {} failed: {}", url, s), @@ -463,6 +461,54 @@ pub fn new_http_client() -> Result { .context("building HTTP client") } +/// Wrapper around Client::get() with error handling based on HTTP return code and optionally basic +/// exponential backoff retries for transient errors. +pub fn http_get( + client: blocking::Client, + url: &str, + retries: FetchRetries, +) -> Result { + // this matches `curl --retry` semantics -- see list in `curl(1)` + const RETRY_STATUS_CODES: [u16; 6] = [408, 429, 500, 502, 503, 504]; + + let mut delay = 1; + let (infinite, mut tries) = match retries { + FetchRetries::Infinite => (true, 0), + FetchRetries::Finite(n) => (false, n.get() + 1), + FetchRetries::None => (false, 1), + }; + + loop { + let err: anyhow::Error = match client.get(url).send() { + Err(err) => err.into(), + Ok(resp) => match resp.status().as_u16() { + code if RETRY_STATUS_CODES.contains(&code) => anyhow!( + "HTTP {} {}", + code, + resp.status().canonical_reason().unwrap_or("") + ), + _ => { + return resp + .error_for_status() + .with_context(|| format!("fetching '{}'", url)); + } + }, + }; + + if !infinite { + tries -= 1; + if tries == 0 { + return Err(err).with_context(|| format!("fetching '{}'", url)); + } + } + + eprintln!("Error fetching '{}': {}", url, err); + eprintln!("Sleeping {}s and retrying...", delay); + sleep(Duration::from_secs(delay)); + delay = std::cmp::min(delay * 2, 10 * 60); // cap to 10 mins; matches curl + } +} + #[derive(Debug, Deserialize)] struct Stream { architectures: HashMap,