From c17df5ff37a91e23ded012bcb7a05b75642bfe00 Mon Sep 17 00:00:00 2001 From: jdx <216188+jdx@users.noreply.github.com> Date: Sun, 26 Apr 2026 16:03:14 -0500 Subject: [PATCH 01/10] fix(http): retry transient HTTP failures with backoff and warn on rescue MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 5xx responses (e.g. GitHub's occasional 502 on release downloads) and network-layer drops were failing installs immediately because http_retries defaulted to 0 and the wrapped retry layer never fired. Turn retries on by default and add a transient-vs-permanent classifier so 4xx still fails fast. Cover the chunk-streaming path in download_file_with_headers too — the original retry only wrapped the initial GET, so a connection drop mid-tarball would still fail. Mirror the same behavior in the vfox crate's downloads. When a retry rescues a request, log a warn! with the original error so flaky infrastructure doesn't silently mask itself. Tighten the http→https fallback to fire only on connection-level errors, not on HTTP status errors — falling back to https after the server already returned a 4xx makes no sense and was silently double-querying on every non-2xx response. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/vfox/src/http.rs | 71 ++++++- crates/vfox/src/lua_mod/http.rs | 100 +++++----- crates/vfox/src/vfox.rs | 12 +- schema/mise.json | 4 +- settings.toml | 15 +- src/http.rs | 325 +++++++++++++++++++++++++++----- 6 files changed, 425 insertions(+), 102 deletions(-) diff --git a/crates/vfox/src/http.rs b/crates/vfox/src/http.rs index 22b51c54d4..2e6107bacb 100644 --- a/crates/vfox/src/http.rs +++ b/crates/vfox/src/http.rs @@ -1,5 +1,6 @@ -use reqwest::{Client, ClientBuilder}; +use reqwest::{Client, ClientBuilder, StatusCode}; use std::sync::LazyLock; +use std::time::Duration; pub static CLIENT: LazyLock = LazyLock::new(|| { ClientBuilder::new() @@ -7,3 +8,71 @@ pub static CLIENT: LazyLock = LazyLock::new(|| { .build() .expect("Failed to create reqwest client") }); + +pub(crate) const HTTP_RETRY_ATTEMPTS: usize = 3; + +pub(crate) fn should_retry_status(status: StatusCode) -> bool { + let code = status.as_u16(); + code == 408 || code == 429 || (500..600).contains(&code) +} + +pub(crate) fn is_transient(err: &reqwest::Error) -> bool { + if err.is_timeout() || err.is_connect() || err.is_body() { + return true; + } + if let Some(status) = err.status() { + return should_retry_status(status); + } + false +} + +pub(crate) fn retry_delay(attempt: usize) -> Duration { + Duration::from_millis(200 * (attempt as u64 + 1)) +} + +/// Retry an async operation that issues a request AND extracts the body. +/// Use for download/text/bytes flows where mid-stream failures (is_body()) need +/// to restart the whole request. Emits a warn! on a successful retry. +pub(crate) async fn retry_async( + url: &str, + mut f: F, +) -> std::result::Result +where + F: FnMut() -> Fut, + Fut: std::future::Future>, +{ + let mut last_err_msg: Option = None; + let mut last_err: Option = None; + for attempt in 0..HTTP_RETRY_ATTEMPTS { + match f().await { + Ok(value) => { + if let Some(prev) = last_err_msg { + log::warn!( + "HTTP {} succeeded on attempt {} after transient error: {}", + url, + attempt + 1, + prev + ); + } + return Ok(value); + } + Err(err) => { + if !is_transient(&err) || attempt + 1 >= HTTP_RETRY_ATTEMPTS { + return Err(err); + } + let delay = retry_delay(attempt); + log::debug!( + "HTTP {} attempt {} failed (transient): {}; retrying in {:?}", + url, + attempt + 1, + err, + delay + ); + last_err_msg = Some(err.to_string()); + last_err = Some(err); + tokio::time::sleep(delay).await; + } + } + } + Err(last_err.expect("retry loop should always return")) +} diff --git a/crates/vfox/src/lua_mod/http.rs b/crates/vfox/src/lua_mod/http.rs index 50dc51bf46..9af805e384 100644 --- a/crates/vfox/src/lua_mod/http.rs +++ b/crates/vfox/src/lua_mod/http.rs @@ -1,33 +1,23 @@ use mlua::{BorrowedStr, ExternalResult, Lua, MultiValue, Result, Table, Value}; use reqwest::header::{AUTHORIZATION, HeaderMap, HeaderName, HeaderValue}; -use reqwest::{RequestBuilder, Response, StatusCode}; -use std::time::Duration; +use reqwest::{RequestBuilder, Response}; use url::Url; -use crate::http::CLIENT; - -const HTTP_RETRY_ATTEMPTS: usize = 3; - -fn should_retry_status(status: StatusCode) -> bool { - matches!( - status, - StatusCode::REQUEST_TIMEOUT - | StatusCode::TOO_MANY_REQUESTS - | StatusCode::BAD_GATEWAY - | StatusCode::SERVICE_UNAVAILABLE - | StatusCode::GATEWAY_TIMEOUT - ) -} - -fn retry_delay(attempt: usize) -> Duration { - Duration::from_millis(200 * (attempt as u64 + 1)) -} +use crate::http::{ + CLIENT, HTTP_RETRY_ATTEMPTS, is_transient, retry_async, retry_delay, should_retry_status, +}; async fn send_with_retry(builder: RequestBuilder) -> std::result::Result { + let url = builder + .try_clone() + .and_then(|b| b.build().ok()) + .map(|r| r.url().to_string()) + .unwrap_or_default(); let Some(template) = builder.try_clone() else { return builder.send().await; }; + let mut last_err_msg: Option = None; for attempt in 0..HTTP_RETRY_ATTEMPTS { let response = template .try_clone() @@ -35,15 +25,38 @@ async fn send_with_retry(builder: RequestBuilder) -> std::result::Result = match response { Ok(resp) if should_retry_status(resp.status()) && attempt + 1 < HTTP_RETRY_ATTEMPTS => { - tokio::time::sleep(retry_delay(attempt)).await; + Some(format!("HTTP {}", resp.status())) } - Ok(resp) => return Ok(resp), - Err(err) if err.is_timeout() && attempt + 1 < HTTP_RETRY_ATTEMPTS => { - tokio::time::sleep(retry_delay(attempt)).await; + Ok(resp) => { + if let Some(prev) = last_err_msg { + log::warn!( + "HTTP {} succeeded on attempt {} after transient error: {}", + url, + attempt + 1, + prev + ); + } + return Ok(resp); + } + Err(err) if is_transient(&err) && attempt + 1 < HTTP_RETRY_ATTEMPTS => { + Some(err.to_string()) } Err(err) => return Err(err), + }; + + if let Some(msg) = transient_err { + let delay = retry_delay(attempt); + log::debug!( + "HTTP {} attempt {} failed (transient): {}; retrying in {:?}", + url, + attempt + 1, + msg, + delay + ); + last_err_msg = Some(msg); + tokio::time::sleep(delay).await; } } @@ -186,12 +199,16 @@ async fn download_file(lua: &Lua, input: MultiValue) -> Result<()> { }; let headers = add_default_headers(lua, &url, headers); let path: String = input.iter().nth(1).unwrap().to_string()?; - let resp = send_with_retry(CLIENT.get(&url).headers(headers)) - .await - .into_lua_err()?; - resp.error_for_status_ref().into_lua_err()?; + // Retry the whole flow (request + body) so a mid-stream drop restarts the + // download instead of failing. + let bytes = retry_async(&url, || async { + let resp = CLIENT.get(&url).headers(headers.clone()).send().await?; + let resp = resp.error_for_status()?; + resp.bytes().await + }) + .await + .into_lua_err()?; let mut file = tokio::fs::File::create(&path).await.into_lua_err()?; - let bytes = resp.bytes().await.into_lua_err()?; tokio::io::AsyncWriteExt::write_all(&mut file, &bytes) .await .into_lua_err()?; @@ -292,22 +309,13 @@ async fn try_download_file(lua: &Lua, input: MultiValue) -> Result { ])); } }; - let resp = match send_with_retry(CLIENT.get(&url).headers(headers)).await { - Ok(resp) => resp, - Err(e) => { - return Ok(MultiValue::from_vec(vec![ - Value::Nil, - Value::String(lua.create_string(e.to_string())?), - ])); - } - }; - if let Err(e) = resp.error_for_status_ref() { - return Ok(MultiValue::from_vec(vec![ - Value::Nil, - Value::String(lua.create_string(e.to_string())?), - ])); - } - let bytes = match resp.bytes().await { + let bytes = match retry_async(&url, || async { + let resp = CLIENT.get(&url).headers(headers.clone()).send().await?; + let resp = resp.error_for_status()?; + resp.bytes().await + }) + .await + { Ok(bytes) => bytes, Err(e) => { return Ok(MultiValue::from_vec(vec![ diff --git a/crates/vfox/src/vfox.rs b/crates/vfox/src/vfox.rs index e0316a34ae..d806b6db4d 100644 --- a/crates/vfox/src/vfox.rs +++ b/crates/vfox/src/vfox.rs @@ -19,7 +19,7 @@ use crate::hooks::mise_path::MisePathContext; use crate::hooks::parse_legacy_file::ParseLegacyFileResponse; use crate::hooks::post_install::PostInstallContext; use crate::hooks::pre_install::{PreInstall, PreInstallAttestation, VerifiedAttestation}; -use crate::http::CLIENT; +use crate::http::{CLIENT, retry_async}; use crate::metadata::Metadata; use crate::plugin::Plugin; use crate::registry; @@ -414,11 +414,15 @@ impl Vfox { .download_dir .join(format!("{sdk}-{version}")) .join(filename); - let resp = CLIENT.get(url.clone()).send().await?; - resp.error_for_status_ref()?; + let url_str = url.to_string(); + let bytes = retry_async(&url_str, || async { + let resp = CLIENT.get(url.clone()).send().await?; + let resp = resp.error_for_status()?; + resp.bytes().await + }) + .await?; file::mkdirp(path.parent().unwrap())?; let mut file = tokio::fs::File::create(&path).await?; - let bytes = resp.bytes().await?; tokio::io::AsyncWriteExt::write_all(&mut file, &bytes).await?; file.sync_all().await?; Ok(path) diff --git a/schema/mise.json b/schema/mise.json index cfb8360638..397c5d6ed1 100644 --- a/schema/mise.json +++ b/schema/mise.json @@ -971,8 +971,8 @@ } }, "http_retries": { - "default": 0, - "description": "Number of retries for HTTP requests in mise.", + "default": 3, + "description": "Number of retries for transient HTTP failures in mise.", "type": "number" }, "http_timeout": { diff --git a/settings.toml b/settings.toml index f73a7fc43e..e9997e3459 100644 --- a/settings.toml +++ b/settings.toml @@ -947,10 +947,19 @@ env = "MISE_HOOK_ENV_CHPWD_ONLY" type = "Bool" [http_retries] -default = 0 -description = "Number of retries for HTTP requests in mise." +default = 3 +description = "Number of retries for transient HTTP failures in mise." docs = """ -Uses an exponential backoff strategy. The duration is calculated by taking the base (10ms) to the n-th power. +Retries are attempted only on transient errors: HTTP 5xx (server errors), 408 +(Request Timeout), 429 (Too Many Requests), and network-layer failures (connect +refused, timeout, mid-stream body drops). Other 4xx responses (e.g. 404) are +treated as permanent and not retried. + +Backoff schedule with jitter: ~200ms / ~1s / ~4s / ~15s. Set to 0 to disable +retries entirely. + +When a retry rescues a request, a warning is logged with the original error so +flaky infrastructure doesn't silently mask itself. """ env = "MISE_HTTP_RETRIES" type = "Integer" diff --git a/src/http.rs b/src/http.rs index 14d5fc5c69..213806a4e5 100644 --- a/src/http.rs +++ b/src/http.rs @@ -12,8 +12,7 @@ use reqwest::header::{HeaderMap, HeaderValue}; use reqwest::{ClientBuilder, IntoUrl, Method, Response}; use std::sync::LazyLock as Lazy; use tokio::sync::OnceCell; -use tokio_retry::Retry; -use tokio_retry::strategy::{ExponentialBackoff, jitter}; +use tokio_retry::strategy::jitter; use url::Url; use crate::cli::version; @@ -322,29 +321,37 @@ impl Client { ) -> Result<()> { let url = url.into_url()?; debug!("GET Downloading {} to {}", &url, display_path(path)); - let mut resp = self.get_async_with_headers(url.clone(), headers).await?; - if let Some(length) = resp.content_length() - && let Some(pr) = pr - { - // Reset progress on each attempt - pr.set_length(length); - pr.set_position(0); - } - let parent = path.parent().unwrap(); file::create_dir_all(parent)?; - let mut file = tempfile::NamedTempFile::with_prefix_in(path, parent)?; - while let Some(chunk) = resp.chunk().await? { - if crate::ui::ctrlc::is_cancelled() { - bail!("download cancelled by user"); + + // Retry the whole download so a mid-stream chunk failure restarts from + // byte 0 instead of failing the install. send_once_with_https_fallback + // (not send_with_https_fallback) is used inside to avoid retry-on-retry. + retry_async("GET", &url, || async { + let mut resp = self + .send_once_with_https_fallback(Method::GET, url.clone(), headers, "GET") + .await?; + if let Some(length) = resp.content_length() + && let Some(pr) = pr + { + // Reset progress on each attempt + pr.set_length(length); + pr.set_position(0); } - file.write_all(&chunk)?; - if let Some(pr) = pr { - pr.inc(chunk.len() as u64); + let mut file = tempfile::NamedTempFile::with_prefix_in(path, parent)?; + while let Some(chunk) = resp.chunk().await? { + if crate::ui::ctrlc::is_cancelled() { + bail!("download cancelled by user"); + } + file.write_all(&chunk)?; + if let Some(pr) = pr { + pr.inc(chunk.len() as u64); + } } - } - file.persist(path)?; - Ok(()) + file.persist(path)?; + Ok(()) + }) + .await } async fn send_with_https_fallback( @@ -354,31 +361,41 @@ impl Client { headers: &HeaderMap, verb_label: &str, ) -> Result { - Retry::spawn( - default_backoff_strategy(Settings::get().http_retries), - || { - let method = method.clone(); - let url = url.clone(); - let headers = headers.clone(); - async move { - match self - .send_once(method.clone(), url.clone(), &headers, verb_label) - .await - { - Ok(resp) => Ok(resp), - Err(_err) if url.scheme() == "http" => { - let mut url = url; - url.set_scheme("https").unwrap(); - self.send_once(method, url, &headers, verb_label).await - } - Err(err) => Err(err), - } - } - }, - ) + retry_async(verb_label, &url, || async { + self.send_once_with_https_fallback(method.clone(), url.clone(), headers, verb_label) + .await + }) .await } + /// One attempt with http→https fallback, no retry. Used as the inner step + /// for both `send_with_https_fallback` (which adds retry) and + /// `download_file_with_headers` (which has its own outer retry covering the + /// chunk stream). Splitting this out avoids retry × retry blowup. + /// The fallback only fires on connection-level errors (corporate proxy + /// blocking plain http), not on HTTP status errors — falling back to https + /// after the server already returned a 4xx/5xx makes no sense. + async fn send_once_with_https_fallback( + &self, + method: Method, + url: Url, + headers: &HeaderMap, + verb_label: &str, + ) -> Result { + match self + .send_once(method.clone(), url.clone(), headers, verb_label) + .await + { + Ok(resp) => Ok(resp), + Err(err) if url.scheme() == "http" && is_connection_error(&err) => { + let mut url = url; + url.set_scheme("https").unwrap(); + self.send_once(method, url, headers, verb_label).await + } + Err(err) => Err(err), + } + } + async fn send_once( &self, method: Method, @@ -422,7 +439,9 @@ impl Client { env_var ) }; - bail!(hint); + // wrap_err preserves the underlying reqwest::Error in the chain so + // is_transient() can still classify this as a retryable timeout. + return Err(Report::new(err).wrap_err(hint)); } return Err(err.into()); } @@ -576,12 +595,93 @@ fn display_github_rate_limit(resp: &Response) { } } -fn default_backoff_strategy(retries: i64) -> impl Iterator { - ExponentialBackoff::from_millis(10) +fn default_backoff_strategy(retries: i64) -> impl Iterator { + // Hand-rolled schedule (with jitter): ~200ms / ~1s / ~4s / ~15s. Enough for + // transient server blips (5xx, brief network drops) without making install + // commands wait minutes. Note: tokio_retry's ExponentialBackoff::from_millis + // is geometric in the base value (base, base*base, ...), so picking a base + // that gives nice human-scale delays is awkward — explicit sequence is clearer. + [200u64, 1_000, 4_000, 15_000] + .into_iter() + .map(Duration::from_millis) .map(jitter) .take(retries.max(0) as usize) } +/// True if the error is a network-layer connection problem (no status received). +/// Used to decide when http→https fallback makes sense: only when the http +/// attempt never reached the server, not when the server returned a status. +fn is_connection_error(err: &Report) -> bool { + err.chain().any(|e| { + let Some(reqwest_err) = e.downcast_ref::() else { + return false; + }; + (reqwest_err.is_connect() || reqwest_err.is_timeout()) && reqwest_err.status().is_none() + }) +} + +/// Classifies an error as transient (should retry) vs permanent. +/// Walks the error chain so wrapped errors (e.g. our timeout hint) still match. +pub(crate) fn is_transient(err: &Report) -> bool { + err.chain().any(|e| { + let Some(reqwest_err) = e.downcast_ref::() else { + return false; + }; + // Network-layer failures: connect refused, timeout, mid-stream body drop. + if reqwest_err.is_timeout() || reqwest_err.is_connect() || reqwest_err.is_body() { + return true; + } + // Status errors: 5xx server errors plus 408 (Request Timeout) and + // 429 (Too Many Requests). Other 4xx are deterministic — don't retry. + if let Some(status) = reqwest_err.status() { + let code = status.as_u16(); + return code == 408 || code == 429 || (500..600).contains(&code); + } + false + }) +} + +/// Retry an async operation on transient errors using `default_backoff_strategy`. +/// On a successful retry (attempt > 1), emits a warn! noting which transient +/// error was rescued, so flaky infrastructure doesn't silently mask itself. +pub(crate) async fn retry_async(verb_label: &str, url: &Url, mut f: F) -> Result +where + F: FnMut() -> Fut, + Fut: std::future::Future>, +{ + let mut backoff = default_backoff_strategy(Settings::get().http_retries); + let mut attempt: usize = 1; + let mut last_err: Option = None; + loop { + match f().await { + Ok(value) => { + if let Some(prev) = last_err { + warn!( + "HTTP {} {} succeeded on attempt {} after transient error: {}", + verb_label, url, attempt, prev + ); + } + return Ok(value); + } + Err(err) => { + if !is_transient(&err) { + return Err(err); + } + let Some(delay) = backoff.next() else { + return Err(err); + }; + debug!( + "HTTP {} {} attempt {} failed (transient): {}; retrying in {:?}", + verb_label, url, attempt, err, delay + ); + last_err = Some(err); + tokio::time::sleep(delay).await; + attempt += 1; + } + } + } +} + #[cfg(test)] mod tests { use super::*; @@ -616,6 +716,139 @@ mod tests { result } + // RAII guard that holds the global test lock and resets settings on drop. + // Use this in async tests so the mutex stays held across .await points + // without sync/async closure shenanigans. + struct SettingsGuard { + _lock: std::sync::MutexGuard<'static, ()>, + } + impl Drop for SettingsGuard { + fn drop(&mut self) { + crate::config::Settings::reset(None); + } + } + fn set_test_http_retries(retries: i64) -> SettingsGuard { + let lock = TEST_SETTINGS_LOCK.lock().unwrap(); + let mut settings = crate::config::settings::SettingsPartial::empty(); + settings.http_retries = Some(retries); + crate::config::Settings::reset(Some(settings)); + SettingsGuard { _lock: lock } + } + + // A tiny in-process HTTP/1.1 responder. Each accepted connection consumes + // the next response from `responses` and writes it back. Returns the bound + // port and an Arc counter of connections actually served. + async fn spawn_canned_server( + responses: Vec<&'static str>, + ) -> (u16, std::sync::Arc) { + use std::sync::Arc; + use std::sync::atomic::{AtomicUsize, Ordering}; + use tokio::io::{AsyncReadExt, AsyncWriteExt}; + + let listener = tokio::net::TcpListener::bind("127.0.0.1:0").await.unwrap(); + let port = listener.local_addr().unwrap().port(); + let count = Arc::new(AtomicUsize::new(0)); + let count_inner = count.clone(); + tokio::spawn(async move { + let mut iter = responses.into_iter(); + while let Some(resp) = iter.next() { + let Ok((mut sock, _)) = listener.accept().await else { + return; + }; + count_inner.fetch_add(1, Ordering::SeqCst); + // Drain request headers (read until \r\n\r\n or EOF). + let mut buf = [0u8; 4096]; + let mut total = Vec::new(); + loop { + match sock.read(&mut buf).await { + Ok(0) => break, + Ok(n) => { + total.extend_from_slice(&buf[..n]); + if total.windows(4).any(|w| w == b"\r\n\r\n") { + break; + } + } + Err(_) => break, + } + } + let _ = sock.write_all(resp.as_bytes()).await; + let _ = sock.shutdown().await; + } + }); + (port, count) + } + + fn ok_response() -> &'static str { + "HTTP/1.1 200 OK\r\nContent-Length: 2\r\nConnection: close\r\n\r\nOK" + } + fn bad_gateway_response() -> &'static str { + "HTTP/1.1 502 Bad Gateway\r\nContent-Length: 0\r\nConnection: close\r\n\r\n" + } + fn not_found_response() -> &'static str { + "HTTP/1.1 404 Not Found\r\nContent-Length: 0\r\nConnection: close\r\n\r\n" + } + fn server_error_response() -> &'static str { + "HTTP/1.1 500 Internal Server Error\r\nContent-Length: 0\r\nConnection: close\r\n\r\n" + } + + #[tokio::test(flavor = "current_thread")] + async fn test_retry_succeeds_after_two_502s() { + // 2 retries is enough to verify the rescue path (2 failures + 1 success) + // without paying the third backoff (~12.5s). + let _guard = set_test_http_retries(2); + let (port, count) = spawn_canned_server(vec![ + bad_gateway_response(), + bad_gateway_response(), + ok_response(), + ]) + .await; + let url: Url = format!("http://127.0.0.1:{}/", port).parse().unwrap(); + let client = Client::new(Duration::from_secs(2), ClientKind::Http).unwrap(); + let resp = client.get_async(url).await.unwrap(); + assert!(resp.status().is_success()); + // Should have served 3 connections: two 502s + one 200. + assert_eq!(count.load(std::sync::atomic::Ordering::SeqCst), 3); + } + + #[tokio::test(flavor = "current_thread")] + async fn test_no_retry_on_404() { + let _guard = set_test_http_retries(3); + let (port, count) = spawn_canned_server(vec![not_found_response()]).await; + let url: Url = format!("http://127.0.0.1:{}/", port).parse().unwrap(); + let client = Client::new(Duration::from_secs(2), ClientKind::Http).unwrap(); + let err = client.get_async(url).await.unwrap_err(); + let msg = format!("{err:?}"); + assert!(msg.contains("404"), "expected 404 in error: {msg}"); + // Should not have retried — only one connection. + assert_eq!(count.load(std::sync::atomic::Ordering::SeqCst), 1); + } + + #[tokio::test(flavor = "current_thread")] + async fn test_retry_exhausted_on_persistent_500() { + // Use 1 retry so the test doesn't pay the full backoff schedule; + // the behavior under test (exhaustion → final error) is the same. + let _guard = set_test_http_retries(1); + // 2 connections: initial + 1 retry. + let (port, count) = + spawn_canned_server(vec![server_error_response(), server_error_response()]).await; + let url: Url = format!("http://127.0.0.1:{}/", port).parse().unwrap(); + let client = Client::new(Duration::from_secs(2), ClientKind::Http).unwrap(); + let err = client.get_async(url).await.unwrap_err(); + assert!(format!("{err:?}").contains("500")); + assert_eq!(count.load(std::sync::atomic::Ordering::SeqCst), 2); + } + + #[tokio::test(flavor = "current_thread")] + async fn test_retries_disabled_fails_immediately() { + let _guard = set_test_http_retries(0); + let (port, count) = spawn_canned_server(vec![bad_gateway_response()]).await; + let url: Url = format!("http://127.0.0.1:{}/", port).parse().unwrap(); + let client = Client::new(Duration::from_secs(2), ClientKind::Http).unwrap(); + let err = client.get_async(url).await.unwrap_err(); + assert!(format!("{err:?}").contains("502")); + assert_eq!(count.load(std::sync::atomic::Ordering::SeqCst), 1); + } + #[test] fn test_simple_string_replacement() { let mut replacements = IndexMap::new(); From 043fc6864ad23ae6b1546705a508e054c0c72945 Mon Sep 17 00:00:00 2001 From: jdx <216188+jdx@users.noreply.github.com> Date: Sun, 26 Apr 2026 17:03:22 -0500 Subject: [PATCH 02/10] fix(http): vfox honors MISE_HTTP_RETRIES, fix clippy lint - vfox crate now reads MISE_HTTP_RETRIES at runtime instead of using a hardcoded retry count, so the documented opt-out works for vfox-driven downloads too. - Backoff schedule in vfox now matches the main crate (~200ms / ~1s / ~4s / ~15s) rather than a linear 200/400/600ms. - Replace `while let Some(_) = iter.next()` with `for _ in iter` to satisfy clippy::while_let_on_iterator (only enforced in CI, not local lint). Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/vfox/src/http.rs | 46 ++++++++++++++++++++++++++++++--- crates/vfox/src/lua_mod/http.rs | 11 ++++---- src/http.rs | 3 +-- 3 files changed, 48 insertions(+), 12 deletions(-) diff --git a/crates/vfox/src/http.rs b/crates/vfox/src/http.rs index 2e6107bacb..1c07880621 100644 --- a/crates/vfox/src/http.rs +++ b/crates/vfox/src/http.rs @@ -9,7 +9,29 @@ pub static CLIENT: LazyLock = LazyLock::new(|| { .expect("Failed to create reqwest client") }); -pub(crate) const HTTP_RETRY_ATTEMPTS: usize = 3; +/// Default retry attempts when MISE_HTTP_RETRIES is unset. Mirrors the +/// `http_retries` setting default in the main mise crate. +const DEFAULT_HTTP_RETRIES: usize = 3; + +/// Backoff schedule (ms) shared with the main mise crate. Hand-rolled rather +/// than using ExponentialBackoff::from_millis (which is geometric in the base +/// value) so the human-readable cadence is obvious. Jitter is applied per delay. +const BACKOFF_SCHEDULE_MS: &[u64] = &[200, 1_000, 4_000, 15_000]; + +/// Read MISE_HTTP_RETRIES so vfox honors the same opt-out as the rest of mise. +/// vfox is a separate crate without access to mise's Settings layer, so the env +/// var is the only shared signal. +fn http_retries() -> usize { + std::env::var("MISE_HTTP_RETRIES") + .ok() + .and_then(|s| s.parse::().ok()) + .unwrap_or(DEFAULT_HTTP_RETRIES) +} + +/// Total attempts = retries + initial attempt. +pub(crate) fn http_retry_attempts() -> usize { + http_retries().saturating_add(1) +} pub(crate) fn should_retry_status(status: StatusCode) -> bool { let code = status.as_u16(); @@ -26,8 +48,23 @@ pub(crate) fn is_transient(err: &reqwest::Error) -> bool { false } +/// Backoff for the `n`-th retry (0-indexed). Falls back to the longest delay +/// in the schedule for retries beyond it. A small uniform jitter in [50%, 100%] +/// of the base avoids thundering herd while keeping delays at least half the +/// nominal value. pub(crate) fn retry_delay(attempt: usize) -> Duration { - Duration::from_millis(200 * (attempt as u64 + 1)) + let base_ms = BACKOFF_SCHEDULE_MS + .get(attempt) + .copied() + .unwrap_or_else(|| *BACKOFF_SCHEDULE_MS.last().unwrap()); + // Cheap deterministic-ish jitter from the system clock — vfox is a small + // crate and pulling in `rand` just for this isn't worth it. + let jitter_pct = 50 + + (std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map(|d| d.subsec_nanos() % 50) + .unwrap_or(0)) as u64; + Duration::from_millis(base_ms * jitter_pct / 100) } /// Retry an async operation that issues a request AND extracts the body. @@ -41,9 +78,10 @@ where F: FnMut() -> Fut, Fut: std::future::Future>, { + let attempts = http_retry_attempts().max(1); let mut last_err_msg: Option = None; let mut last_err: Option = None; - for attempt in 0..HTTP_RETRY_ATTEMPTS { + for attempt in 0..attempts { match f().await { Ok(value) => { if let Some(prev) = last_err_msg { @@ -57,7 +95,7 @@ where return Ok(value); } Err(err) => { - if !is_transient(&err) || attempt + 1 >= HTTP_RETRY_ATTEMPTS { + if !is_transient(&err) || attempt + 1 >= attempts { return Err(err); } let delay = retry_delay(attempt); diff --git a/crates/vfox/src/lua_mod/http.rs b/crates/vfox/src/lua_mod/http.rs index 9af805e384..21ddb72853 100644 --- a/crates/vfox/src/lua_mod/http.rs +++ b/crates/vfox/src/lua_mod/http.rs @@ -4,7 +4,7 @@ use reqwest::{RequestBuilder, Response}; use url::Url; use crate::http::{ - CLIENT, HTTP_RETRY_ATTEMPTS, is_transient, retry_async, retry_delay, should_retry_status, + CLIENT, http_retry_attempts, is_transient, retry_async, retry_delay, should_retry_status, }; async fn send_with_retry(builder: RequestBuilder) -> std::result::Result { @@ -17,8 +17,9 @@ async fn send_with_retry(builder: RequestBuilder) -> std::result::Result = None; - for attempt in 0..HTTP_RETRY_ATTEMPTS { + for attempt in 0..attempts { let response = template .try_clone() .expect("cloned request builder should remain cloneable") @@ -26,7 +27,7 @@ async fn send_with_retry(builder: RequestBuilder) -> std::result::Result = match response { - Ok(resp) if should_retry_status(resp.status()) && attempt + 1 < HTTP_RETRY_ATTEMPTS => { + Ok(resp) if should_retry_status(resp.status()) && attempt + 1 < attempts => { Some(format!("HTTP {}", resp.status())) } Ok(resp) => { @@ -40,9 +41,7 @@ async fn send_with_retry(builder: RequestBuilder) -> std::result::Result { - Some(err.to_string()) - } + Err(err) if is_transient(&err) && attempt + 1 < attempts => Some(err.to_string()), Err(err) => return Err(err), }; diff --git a/src/http.rs b/src/http.rs index 213806a4e5..00fdcc463f 100644 --- a/src/http.rs +++ b/src/http.rs @@ -750,8 +750,7 @@ mod tests { let count = Arc::new(AtomicUsize::new(0)); let count_inner = count.clone(); tokio::spawn(async move { - let mut iter = responses.into_iter(); - while let Some(resp) = iter.next() { + for resp in responses { let Ok((mut sock, _)) = listener.accept().await else { return; }; From 5be4c19b9f8fa538d91846175532b7a6dbdb255c Mon Sep 17 00:00:00 2001 From: jdx <216188+jdx@users.noreply.github.com> Date: Sun, 26 Apr 2026 17:07:04 -0500 Subject: [PATCH 03/10] fix(http): don't silently cap retries at the backoff schedule length MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The fixed 4-element schedule with `.take(retries)` capped any `MISE_HTTP_RETRIES > 4` at 4 retries — a regression vs. the previous unbounded `ExponentialBackoff::from_millis(10)`. Chain the schedule with a repeat of the longest delay (15s) so any retry count is honored. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/http.rs | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/src/http.rs b/src/http.rs index 00fdcc463f..c7b9b05e99 100644 --- a/src/http.rs +++ b/src/http.rs @@ -596,13 +596,15 @@ fn display_github_rate_limit(resp: &Response) { } fn default_backoff_strategy(retries: i64) -> impl Iterator { - // Hand-rolled schedule (with jitter): ~200ms / ~1s / ~4s / ~15s. Enough for - // transient server blips (5xx, brief network drops) without making install - // commands wait minutes. Note: tokio_retry's ExponentialBackoff::from_millis - // is geometric in the base value (base, base*base, ...), so picking a base - // that gives nice human-scale delays is awkward — explicit sequence is clearer. + // Hand-rolled schedule (with jitter): ~200ms / ~1s / ~4s / ~15s, then 15s + // for every retry beyond the schedule. The trailing repeat matters because + // `MISE_HTTP_RETRIES` can be set arbitrarily high — a fixed-length array + // would silently cap retries at its length. tokio_retry's ExponentialBackoff + // ::from_millis is geometric in the base (base, base*base, …) so picking a + // base that gives nice human-scale delays is awkward; explicit is clearer. [200u64, 1_000, 4_000, 15_000] .into_iter() + .chain(std::iter::repeat(15_000)) .map(Duration::from_millis) .map(jitter) .take(retries.max(0) as usize) @@ -837,6 +839,14 @@ mod tests { assert_eq!(count.load(std::sync::atomic::Ordering::SeqCst), 2); } + #[test] + fn test_backoff_strategy_yields_requested_count_beyond_schedule() { + // Regression: a fixed-length schedule used to silently cap retries at 4. + // Now extra retries should fall back to the longest delay. + let delays: Vec<_> = default_backoff_strategy(7).collect(); + assert_eq!(delays.len(), 7); + } + #[tokio::test(flavor = "current_thread")] async fn test_retries_disabled_fails_immediately() { let _guard = set_test_http_retries(0); From bd8f74da46a012137611a8dddbfdfb4ad270dab3 Mon Sep 17 00:00:00 2001 From: jdx <216188+jdx@users.noreply.github.com> Date: Sun, 26 Apr 2026 17:14:27 -0500 Subject: [PATCH 04/10] fix(http): warn immediately on transient failure, not just on rescue Previously a transient failure was silent until the retry sequence either succeeded (warn on rescue) or exhausted (raw error propagated). With the new ~15s tail of the backoff schedule, the user would sit through long delays with no feedback about what was going wrong. Now every transient failure emits a warn! the moment it happens, with the attempt number and next-retry delay. Successful rescues still warn ("succeeded on attempt N"), and exhausted retries also warn before propagating the final error so the attempt count is visible. Mirror the same behavior in the vfox crate. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/vfox/src/http.rs | 30 ++++++++++++++++++------------ crates/vfox/src/lua_mod/http.rs | 15 +++++---------- src/http.rs | 22 ++++++++++++++-------- 3 files changed, 37 insertions(+), 30 deletions(-) diff --git a/crates/vfox/src/http.rs b/crates/vfox/src/http.rs index 1c07880621..d80eaed7cb 100644 --- a/crates/vfox/src/http.rs +++ b/crates/vfox/src/http.rs @@ -69,7 +69,9 @@ pub(crate) fn retry_delay(attempt: usize) -> Duration { /// Retry an async operation that issues a request AND extracts the body. /// Use for download/text/bytes flows where mid-stream failures (is_body()) need -/// to restart the whole request. Emits a warn! on a successful retry. +/// to restart the whole request. Warns immediately on each transient failure +/// (so users see flakiness without waiting through the backoff) and again on +/// eventual success or final exhaustion. pub(crate) async fn retry_async( url: &str, mut f: F, @@ -79,34 +81,38 @@ where Fut: std::future::Future>, { let attempts = http_retry_attempts().max(1); - let mut last_err_msg: Option = None; + let mut had_transient_failure = false; let mut last_err: Option = None; for attempt in 0..attempts { match f().await { Ok(value) => { - if let Some(prev) = last_err_msg { - log::warn!( - "HTTP {} succeeded on attempt {} after transient error: {}", - url, - attempt + 1, - prev - ); + if had_transient_failure { + log::warn!("HTTP {} succeeded on attempt {}", url, attempt + 1); } return Ok(value); } Err(err) => { - if !is_transient(&err) || attempt + 1 >= attempts { + if !is_transient(&err) { + return Err(err); + } + if attempt + 1 >= attempts { + log::warn!( + "HTTP {} failed after {} attempts: {}", + url, + attempt + 1, + err + ); return Err(err); } let delay = retry_delay(attempt); - log::debug!( + log::warn!( "HTTP {} attempt {} failed (transient): {}; retrying in {:?}", url, attempt + 1, err, delay ); - last_err_msg = Some(err.to_string()); + had_transient_failure = true; last_err = Some(err); tokio::time::sleep(delay).await; } diff --git a/crates/vfox/src/lua_mod/http.rs b/crates/vfox/src/lua_mod/http.rs index 21ddb72853..810510ae0e 100644 --- a/crates/vfox/src/lua_mod/http.rs +++ b/crates/vfox/src/lua_mod/http.rs @@ -18,7 +18,7 @@ async fn send_with_retry(builder: RequestBuilder) -> std::result::Result = None; + let mut had_transient_failure = false; for attempt in 0..attempts { let response = template .try_clone() @@ -31,13 +31,8 @@ async fn send_with_retry(builder: RequestBuilder) -> std::result::Result { - if let Some(prev) = last_err_msg { - log::warn!( - "HTTP {} succeeded on attempt {} after transient error: {}", - url, - attempt + 1, - prev - ); + if had_transient_failure { + log::warn!("HTTP {} succeeded on attempt {}", url, attempt + 1); } return Ok(resp); } @@ -47,14 +42,14 @@ async fn send_with_retry(builder: RequestBuilder) -> std::result::Result bool { } /// Retry an async operation on transient errors using `default_backoff_strategy`. -/// On a successful retry (attempt > 1), emits a warn! noting which transient -/// error was rescued, so flaky infrastructure doesn't silently mask itself. +/// Emits a warn! immediately on each transient failure (so the user sees what's +/// happening without waiting through the backoff schedule) and again on the +/// eventual successful rescue, so flaky infrastructure doesn't silently mask +/// itself either way. pub(crate) async fn retry_async(verb_label: &str, url: &Url, mut f: F) -> Result where F: FnMut() -> Fut, @@ -653,14 +655,14 @@ where { let mut backoff = default_backoff_strategy(Settings::get().http_retries); let mut attempt: usize = 1; - let mut last_err: Option = None; + let mut had_transient_failure = false; loop { match f().await { Ok(value) => { - if let Some(prev) = last_err { + if had_transient_failure { warn!( - "HTTP {} {} succeeded on attempt {} after transient error: {}", - verb_label, url, attempt, prev + "HTTP {} {} succeeded on attempt {}", + verb_label, url, attempt ); } return Ok(value); @@ -670,13 +672,17 @@ where return Err(err); } let Some(delay) = backoff.next() else { + warn!( + "HTTP {} {} failed after {} attempts: {}", + verb_label, url, attempt, err + ); return Err(err); }; - debug!( + warn!( "HTTP {} {} attempt {} failed (transient): {}; retrying in {:?}", verb_label, url, attempt, err, delay ); - last_err = Some(err); + had_transient_failure = true; tokio::time::sleep(delay).await; attempt += 1; } From faa856242776927175350a9bea6aa3fd3e2c01cf Mon Sep 17 00:00:00 2001 From: jdx <216188+jdx@users.noreply.github.com> Date: Sun, 26 Apr 2026 17:21:50 -0500 Subject: [PATCH 05/10] fix(vfox): don't log "succeeded" when last retry returns a 5xx MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The retry-status arm in vfox's send_with_retry only fires while attempts remain (`attempt + 1 < attempts`), so the final attempt's 5xx falls through to the success arm. With prior transient failures, that arm was warning "succeeded on attempt N" while returning a 5xx response — exactly the silent-mask the warnings were added to prevent. Distinguish real success from "ran out of retries with a bad status" by checking `resp.status().is_success()` and logging "failed after N attempts: HTTP " otherwise. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/vfox/src/lua_mod/http.rs | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/crates/vfox/src/lua_mod/http.rs b/crates/vfox/src/lua_mod/http.rs index 810510ae0e..0abdd468e7 100644 --- a/crates/vfox/src/lua_mod/http.rs +++ b/crates/vfox/src/lua_mod/http.rs @@ -31,8 +31,20 @@ async fn send_with_retry(builder: RequestBuilder) -> std::result::Result { + // The retry-status arm above only fires while attempts remain, + // so the final attempt's 5xx (if any) lands here. Distinguish + // real success from "ran out of retries with a bad status." if had_transient_failure { - log::warn!("HTTP {} succeeded on attempt {}", url, attempt + 1); + if resp.status().is_success() { + log::warn!("HTTP {} succeeded on attempt {}", url, attempt + 1); + } else { + log::warn!( + "HTTP {} failed after {} attempts: HTTP {}", + url, + attempt + 1, + resp.status() + ); + } } return Ok(resp); } From 8e3590ffdd87739775e5cc767d99e941493e97b0 Mon Sep 17 00:00:00 2001 From: jdx <216188+jdx@users.noreply.github.com> Date: Sun, 26 Apr 2026 17:26:02 -0500 Subject: [PATCH 06/10] refactor(http): drop redundant rescue/exhaustion warnings Per-attempt warnings already announce each transient failure as it happens. The post-loop "succeeded on attempt N" / "failed after N attempts" warnings just duplicated information the user already saw or will see via the propagated response/error. Keep the per-attempt warns (they're the live signal), drop the rest. Same simplification across all three retry paths. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/vfox/src/http.rs | 25 +++++-------------------- crates/vfox/src/lua_mod/http.rs | 21 +-------------------- src/http.rs | 24 +++++------------------- 3 files changed, 11 insertions(+), 59 deletions(-) diff --git a/crates/vfox/src/http.rs b/crates/vfox/src/http.rs index d80eaed7cb..1ca5bcfac1 100644 --- a/crates/vfox/src/http.rs +++ b/crates/vfox/src/http.rs @@ -70,8 +70,9 @@ pub(crate) fn retry_delay(attempt: usize) -> Duration { /// Retry an async operation that issues a request AND extracts the body. /// Use for download/text/bytes flows where mid-stream failures (is_body()) need /// to restart the whole request. Warns immediately on each transient failure -/// (so users see flakiness without waiting through the backoff) and again on -/// eventual success or final exhaustion. +/// (so users see flakiness without waiting through the backoff). Successful +/// rescues and final exhaustion don't get extra warnings — the caller surfaces +/// the outcome. pub(crate) async fn retry_async( url: &str, mut f: F, @@ -81,27 +82,12 @@ where Fut: std::future::Future>, { let attempts = http_retry_attempts().max(1); - let mut had_transient_failure = false; let mut last_err: Option = None; for attempt in 0..attempts { match f().await { - Ok(value) => { - if had_transient_failure { - log::warn!("HTTP {} succeeded on attempt {}", url, attempt + 1); - } - return Ok(value); - } + Ok(value) => return Ok(value), Err(err) => { - if !is_transient(&err) { - return Err(err); - } - if attempt + 1 >= attempts { - log::warn!( - "HTTP {} failed after {} attempts: {}", - url, - attempt + 1, - err - ); + if !is_transient(&err) || attempt + 1 >= attempts { return Err(err); } let delay = retry_delay(attempt); @@ -112,7 +98,6 @@ where err, delay ); - had_transient_failure = true; last_err = Some(err); tokio::time::sleep(delay).await; } diff --git a/crates/vfox/src/lua_mod/http.rs b/crates/vfox/src/lua_mod/http.rs index 0abdd468e7..6dcf362950 100644 --- a/crates/vfox/src/lua_mod/http.rs +++ b/crates/vfox/src/lua_mod/http.rs @@ -18,7 +18,6 @@ async fn send_with_retry(builder: RequestBuilder) -> std::result::Result std::result::Result { Some(format!("HTTP {}", resp.status())) } - Ok(resp) => { - // The retry-status arm above only fires while attempts remain, - // so the final attempt's 5xx (if any) lands here. Distinguish - // real success from "ran out of retries with a bad status." - if had_transient_failure { - if resp.status().is_success() { - log::warn!("HTTP {} succeeded on attempt {}", url, attempt + 1); - } else { - log::warn!( - "HTTP {} failed after {} attempts: HTTP {}", - url, - attempt + 1, - resp.status() - ); - } - } - return Ok(resp); - } + Ok(resp) => return Ok(resp), Err(err) if is_transient(&err) && attempt + 1 < attempts => Some(err.to_string()), Err(err) => return Err(err), }; @@ -61,7 +43,6 @@ async fn send_with_retry(builder: RequestBuilder) -> std::result::Result bool { } /// Retry an async operation on transient errors using `default_backoff_strategy`. -/// Emits a warn! immediately on each transient failure (so the user sees what's -/// happening without waiting through the backoff schedule) and again on the -/// eventual successful rescue, so flaky infrastructure doesn't silently mask -/// itself either way. +/// Emits a warn! immediately on each transient failure so the user sees flaky +/// infrastructure as it's happening, instead of waiting through the backoff +/// schedule. Successful rescues and final exhaustion don't get extra warnings +/// — the caller surfaces the outcome. pub(crate) async fn retry_async(verb_label: &str, url: &Url, mut f: F) -> Result where F: FnMut() -> Fut, @@ -655,34 +655,20 @@ where { let mut backoff = default_backoff_strategy(Settings::get().http_retries); let mut attempt: usize = 1; - let mut had_transient_failure = false; loop { match f().await { - Ok(value) => { - if had_transient_failure { - warn!( - "HTTP {} {} succeeded on attempt {}", - verb_label, url, attempt - ); - } - return Ok(value); - } + Ok(value) => return Ok(value), Err(err) => { if !is_transient(&err) { return Err(err); } let Some(delay) = backoff.next() else { - warn!( - "HTTP {} {} failed after {} attempts: {}", - verb_label, url, attempt, err - ); return Err(err); }; warn!( "HTTP {} {} attempt {} failed (transient): {}; retrying in {:?}", verb_label, url, attempt, err, delay ); - had_transient_failure = true; tokio::time::sleep(delay).await; attempt += 1; } From 2cd5b8f5aab67195894ca4ed55e08498f863b098 Mon Sep 17 00:00:00 2001 From: jdx <216188+jdx@users.noreply.github.com> Date: Sun, 26 Apr 2026 17:33:47 -0500 Subject: [PATCH 07/10] fix(http): restore offline-mode check in download_file_with_headers The retry refactor swapped get_async_with_headers for the lower-level send_once_with_https_fallback to avoid retry-on-retry, which dropped the offline-mode guard that get_async_with_headers does. Add it back at the top of download_file_with_headers so downloads honor MISE_OFFLINE again. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/http.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/http.rs b/src/http.rs index f13effc067..e1629c2d28 100644 --- a/src/http.rs +++ b/src/http.rs @@ -319,6 +319,7 @@ impl Client { headers: &HeaderMap, pr: Option<&dyn SingleReport>, ) -> Result<()> { + ensure!(!Settings::get().offline(), "offline mode is enabled"); let url = url.into_url()?; debug!("GET Downloading {} to {}", &url, display_path(path)); let parent = path.parent().unwrap(); From 30125b6b690582d90c2fbcf2975ad902f74650d4 Mon Sep 17 00:00:00 2001 From: jdx <216188+jdx@users.noreply.github.com> Date: Sun, 26 Apr 2026 17:48:16 -0500 Subject: [PATCH 08/10] fix(http): use equal jitter so backoff can't drop near zero MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit tokio_retry's `jitter` returns a value in [0, d), so a 200ms base could become 2ms — effectively no backoff. Replace with an "equal jitter" helper (random in [d/2, d)) matching what the vfox crate already does. This keeps the documented schedule (~200ms / ~1s / ~4s / ~15s) honest at the lower bound. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/http.rs | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/http.rs b/src/http.rs index e1629c2d28..cad7c1e5cf 100644 --- a/src/http.rs +++ b/src/http.rs @@ -12,7 +12,6 @@ use reqwest::header::{HeaderMap, HeaderValue}; use reqwest::{ClientBuilder, IntoUrl, Method, Response}; use std::sync::LazyLock as Lazy; use tokio::sync::OnceCell; -use tokio_retry::strategy::jitter; use url::Url; use crate::cli::version; @@ -607,10 +606,18 @@ fn default_backoff_strategy(retries: i64) -> impl Iterator { .into_iter() .chain(std::iter::repeat(15_000)) .map(Duration::from_millis) - .map(jitter) + .map(equal_jitter) .take(retries.max(0) as usize) } +/// Jitter the duration to a random value in `[d/2, d)` — "equal jitter" per +/// AWS's backoff guidance. Avoids tokio_retry's `jitter` which can return +/// near-zero (its range is `[0, d)`), defeating the point of backoff. +fn equal_jitter(d: Duration) -> Duration { + let factor = 0.5 + rand::random::() * 0.5; + Duration::from_secs_f64(d.as_secs_f64() * factor) +} + /// True if the error is a network-layer connection problem (no status received). /// Used to decide when http→https fallback makes sense: only when the http /// attempt never reached the server, not when the server returned a status. From a76e8257ada68911d77dc29dedef2eb9149b4b30 Mon Sep 17 00:00:00 2001 From: jdx <216188+jdx@users.noreply.github.com> Date: Sun, 26 Apr 2026 17:57:50 -0500 Subject: [PATCH 09/10] chore(deps): drop tokio-retry, no longer used Replaced its `jitter` helper with a local equal-jitter implementation in the previous commit; the crate's `Retry` driver was already removed when the manual loop landed. cargo machete flagged it. Co-Authored-By: Claude Opus 4.7 (1M context) --- Cargo.toml | 1 - 1 file changed, 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 74a8039d44..4f1c3e30d7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -183,7 +183,6 @@ tera = "1" terminal_size = "0.4" thiserror = "2" tokio = { version = "1", features = ["full"] } -tokio-retry = "0.3" toml = { version = "1.0", features = ["parse", "preserve_order"] } toml_edit = { version = "0.25", features = ["parse"] } ubi = { version = "0.9", default-features = false } From ccb2d067b9f836a94beb6b26238883ff5f9c165a Mon Sep 17 00:00:00 2001 From: jdx <216188+jdx@users.noreply.github.com> Date: Sun, 26 Apr 2026 17:59:04 -0500 Subject: [PATCH 10/10] chore(deps): regenerate Cargo.lock after dropping tokio-retry Co-Authored-By: Claude Opus 4.7 (1M context) --- Cargo.lock | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 309e79b342..221b6bec01 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5761,7 +5761,6 @@ dependencies = [ "test-log", "thiserror 2.0.18", "tokio", - "tokio-retry", "toml 1.1.2+spec-1.1.0", "toml_edit", "ubi", @@ -9607,17 +9606,6 @@ dependencies = [ "tokio", ] -[[package]] -name = "tokio-retry" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40f644c762e9d396831ae2f8935c954b0d758c4532e924bead0f666d0c1c8640" -dependencies = [ - "pin-project-lite", - "rand 0.10.1", - "tokio", -] - [[package]] name = "tokio-rustls" version = "0.26.4"