diff --git a/lychee-bin/src/commands/check.rs b/lychee-bin/src/commands/check.rs
index 38f8ab0dec..4451a2240d 100644
--- a/lychee-bin/src/commands/check.rs
+++ b/lychee-bin/src/commands/check.rs
@@ -265,14 +265,13 @@ async fn handle(
             // `accepted` status codes might have changed from the previous run
             // and they may have an impact on the interpretation of the status
             // code.
+            client.host_pool().record_persistent_cache_hit(&uri);
             Status::from_cache_status(v.value().status, &accept)
         };
-        client.host_pool().record_cache_hit(&uri);
         return Ok(Response::new(uri.clone(), status, request.source.into()));
     }
 
-    client.host_pool().record_cache_miss(&uri);
     let response = check_url(client, request).await;
 
     // - Never cache filesystem access as it is fast already so caching has no benefit.
diff --git a/lychee-bin/src/formatters/host_stats/compact.rs b/lychee-bin/src/formatters/host_stats/compact.rs
index 121230e259..d312e0c60f 100644
--- a/lychee-bin/src/formatters/host_stats/compact.rs
+++ b/lychee-bin/src/formatters/host_stats/compact.rs
@@ -70,12 +70,8 @@ impl Compact {
 }
 
 impl HostStatsFormatter for Compact {
-    fn format(&self, host_stats: HashMap) -> Result> {
-        if host_stats.is_empty() {
-            return Ok(None);
-        }
-
+    fn format(&self, host_stats: HashMap) -> Result {
         let compact = CompactHostStats { host_stats };
-        Ok(Some(compact.to_string()))
+        Ok(compact.to_string())
     }
 }
diff --git a/lychee-bin/src/formatters/host_stats/detailed.rs b/lychee-bin/src/formatters/host_stats/detailed.rs
index 01bfd42bc8..f62d248138 100644
--- a/lychee-bin/src/formatters/host_stats/detailed.rs
+++ b/lychee-bin/src/formatters/host_stats/detailed.rs
@@ -79,12 +79,8 @@ impl Detailed {
 }
 
 impl HostStatsFormatter for Detailed {
-    fn format(&self, host_stats: HashMap) -> Result> {
-        if host_stats.is_empty() {
-            return Ok(None);
-        }
-
+    fn format(&self, host_stats: HashMap) -> Result {
         let detailed = DetailedHostStats { host_stats };
-        Ok(Some(detailed.to_string()))
+        Ok(detailed.to_string())
     }
 }
diff --git a/lychee-bin/src/formatters/host_stats/json.rs b/lychee-bin/src/formatters/host_stats/json.rs
index 24f7fe0d2e..fcb66b89df 100644
--- a/lychee-bin/src/formatters/host_stats/json.rs
+++ b/lychee-bin/src/formatters/host_stats/json.rs
@@ -9,17 +9,13 @@ pub(crate) struct Json;
 
 impl Json {
     pub(crate) const fn new() -> Self {
-        Self {}
+        Self
     }
 }
 
 impl HostStatsFormatter for Json {
     /// Format host stats as JSON object
-    fn format(&self, host_stats: HashMap) -> Result> {
-        if host_stats.is_empty() {
-            return Ok(None);
-        }
-
+    fn format(&self, host_stats: HashMap) -> Result {
         // Convert HostStats to a more JSON-friendly format
         let json_stats: HashMap = host_stats
             .into_iter()
@@ -50,8 +46,6 @@ impl HostStatsFormatter for Json {
             "host_statistics": json_stats
         });
 
-        serde_json::to_string_pretty(&output)
-            .map(Some)
-            .context("Cannot format host stats as JSON")
+        serde_json::to_string_pretty(&output).context("Cannot format host stats as JSON")
     }
 }
diff --git a/lychee-bin/src/formatters/host_stats/markdown.rs b/lychee-bin/src/formatters/host_stats/markdown.rs
index 8980066107..3543911a9e 100644
--- a/lychee-bin/src/formatters/host_stats/markdown.rs
+++ b/lychee-bin/src/formatters/host_stats/markdown.rs
@@ -81,12 +81,8 @@ impl Markdown {
 }
 
 impl HostStatsFormatter for Markdown {
-    fn format(&self, host_stats: HashMap) -> Result> {
-        if host_stats.is_empty() {
-            return Ok(None);
-        }
-
+    fn format(&self, host_stats: HashMap) -> Result {
         let markdown = MarkdownHostStats(host_stats);
-        Ok(Some(markdown.to_string()))
+        Ok(markdown.to_string())
     }
 }
diff --git a/lychee-bin/src/formatters/host_stats/mod.rs b/lychee-bin/src/formatters/host_stats/mod.rs
index 8c312bfdd5..205f5581d4 100644
--- a/lychee-bin/src/formatters/host_stats/mod.rs
+++ b/lychee-bin/src/formatters/host_stats/mod.rs
@@ -15,7 +15,7 @@ use std::collections::HashMap;
 /// Trait for formatting per-host statistics in different output formats
 pub(crate) trait HostStatsFormatter {
     /// Format the host statistics and return them as a string
-    fn format(&self, host_stats: HashMap) -> Result>;
+    fn format(&self, host_stats: HashMap) -> Result;
 }
 
 /// Sort host statistics by request count (descending order)
diff --git a/lychee-bin/src/host_stats.rs b/lychee-bin/src/host_stats.rs
index 5ba26d9995..a5ddbf118f 100644
--- a/lychee-bin/src/host_stats.rs
+++ b/lychee-bin/src/host_stats.rs
@@ -11,17 +11,16 @@ pub(crate) fn output_per_host_statistics(host_pool: &HostPool, config: &Config)
     let host_stats = host_pool.all_host_stats();
     let host_stats_formatter = get_host_stats_formatter(&config.format, &config.mode);
+    let formatted_host_stats = host_stats_formatter.format(host_stats)?;
 
-    if let Some(formatted_host_stats) = host_stats_formatter.format(host_stats)? {
-        if let Some(output) = &config.output {
-            // For file output, append to the existing output
-            let mut file_content = std::fs::read_to_string(output).unwrap_or_default();
-            file_content.push_str(&formatted_host_stats);
-            std::fs::write(output, file_content)
-                .context("Cannot write host stats to output file")?;
-        } else {
-            print!("{formatted_host_stats}");
-        }
+    if let Some(output) = &config.output {
+        // For file output, append to the existing output
+        let mut file_content = std::fs::read_to_string(output).unwrap_or_default();
+        file_content.push_str(&formatted_host_stats);
+        std::fs::write(output, file_content).context("Cannot write host stats to output file")?;
+    } else {
+        print!("{formatted_host_stats}");
     }
+
     Ok(())
 }
diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs
index c0cae571d0..8fa5b95542 100644
--- a/lychee-bin/tests/cli.rs
+++ b/lychee-bin/tests/cli.rs
@@ -1482,6 +1482,51 @@ The config file should contain every possible key for documentation purposes."
         Ok(())
     }
 
+    #[tokio::test]
+    async fn test_process_internal_host_caching() -> Result<()> {
+        // Note that this process-internal per-host caching
+        // has no direct relation to the lychee cache file
+        // where state can be persisted between multiple invocations.
+        let server = wiremock::MockServer::start().await;
+
+        // Return one rate-limited response to make sure that
+        // such a response isn't cached.
+        wiremock::Mock::given(wiremock::matchers::method("GET"))
+            .respond_with(ResponseTemplate::new(429))
+            .up_to_n_times(1)
+            .mount(&server)
+            .await;
+
+        wiremock::Mock::given(wiremock::matchers::method("GET"))
+            .respond_with(ResponseTemplate::new(200))
+            .expect(1)
+            .mount(&server)
+            .await;
+
+        let temp_dir = tempfile::tempdir()?;
+        for i in 0..9 {
+            let test_md1 = temp_dir.path().join(format!("test{i}.md"));
+            fs::write(&test_md1, server.uri())?;
+        }
+
+        cargo_bin_cmd!()
+            .arg(temp_dir.path())
+            .arg("--host-stats")
+            .assert()
+            .success()
+            .stdout(contains("9 Total"))
+            .stdout(contains("9 OK"))
+            .stdout(contains("0 Errors"))
+            // Per-host statistics
+            // 1 rate limited + 9 OK
+            .stdout(contains("10 reqs"))
+            // 1 rate limited, 1 OK, 8 cached
+            .stdout(contains("80.0% cached"));
+
+        server.verify().await;
+        Ok(())
+    }
+
     #[test]
     fn test_verbatim_skipped_by_default() {
         let input = fixtures_path!().join("TEST_CODE_BLOCKS.md");
diff --git a/lychee-lib/src/checker/website.rs b/lychee-lib/src/checker/website.rs
index 6a00915e46..c490fc22b3 100644
--- a/lychee-lib/src/checker/website.rs
+++ b/lychee-lib/src/checker/website.rs
@@ -2,7 +2,7 @@ use crate::{
     BasicAuthCredentials, ErrorKind, FileType, Status, Uri,
     chain::{Chain, ChainResult, ClientRequestChains, Handler, RequestChain},
     quirks::Quirks,
-    ratelimit::HostPool,
+    ratelimit::{CacheableResponse, HostPool},
     retry::RetryExt,
     types::{redirect_history::RedirectHistory, uri::github::GithubUri},
     utils::fragment_checker::{FragmentChecker, FragmentInput},
@@ -10,7 +10,7 @@ use crate::{
 use async_trait::async_trait;
 use http::{Method, StatusCode};
 use octocrab::Octocrab;
-use reqwest::{Request, Response, header::CONTENT_TYPE};
+use reqwest::{Request, header::CONTENT_TYPE};
 use std::{collections::HashSet, path::Path, sync::Arc, time::Duration};
 use url::Url;
 
@@ -127,12 +127,12 @@ impl WebsiteChecker {
         // when `accept=200,429`, `status_code=429` will be treated as success
         // but we are not able the check the fragment since it's inapplicable.
         if self.include_fragments
-            && response.status().is_success()
+            && response.status.is_success()
             && method == Method::GET
             && request_url.fragment().is_some_and(|x| !x.is_empty())
         {
             let Some(content_type) = response
-                .headers()
+                .headers
                 .get(CONTENT_TYPE)
                 .and_then(|header| header.to_str().ok())
             else {
@@ -143,7 +143,7 @@ impl WebsiteChecker {
             ct if ct.starts_with("text/html") => FileType::Html,
             ct if ct.starts_with("text/markdown") => FileType::Markdown,
             ct if ct.starts_with("text/plain") => {
-                let path = Path::new(response.url().path());
+                let path = Path::new(response.url.path());
                 match path.extension() {
                     Some(ext) if ext.eq_ignore_ascii_case("md") => FileType::Markdown,
                     _ => return status,
@@ -169,22 +169,18 @@ impl WebsiteChecker {
         &self,
         url: Url,
         status: Status,
-        response: Response,
+        response: CacheableResponse,
         file_type: FileType,
     ) -> Status {
-        match response.text().await {
-            Ok(content) => {
-                match self
-                    .fragment_checker
-                    .check(FragmentInput { content, file_type }, &url)
-                    .await
-                {
-                    Ok(true) => status,
-                    Ok(false) => Status::Error(ErrorKind::InvalidFragment(url.into())),
-                    Err(e) => Status::Error(e),
-                }
-            }
-            Err(e) => Status::Error(ErrorKind::ReadResponseBody(e)),
+        let content = response.text;
+        match self
+            .fragment_checker
+            .check(FragmentInput { content, file_type }, &url)
+            .await
+        {
+            Ok(true) => status,
+            Ok(false) => Status::Error(ErrorKind::InvalidFragment(url.into())),
+            Err(e) => Status::Error(e),
         }
     }
diff --git a/lychee-lib/src/ratelimit/host/host.rs b/lychee-lib/src/ratelimit/host/host.rs
index 8e1c1977ac..e5cbe8ff96 100644
--- a/lychee-lib/src/ratelimit/host/host.rs
+++ b/lychee-lib/src/ratelimit/host/host.rs
@@ -1,21 +1,25 @@
-use crate::ratelimit::headers;
+use crate::{
+    ratelimit::{CacheableResponse, headers},
+    retry::RetryExt,
+};
 use dashmap::DashMap;
 use governor::{
     Quota, RateLimiter,
     clock::DefaultClock,
     state::{InMemoryState, NotKeyed},
 };
+use http::StatusCode;
 use humantime_serde::re::humantime::format_duration;
 use log::warn;
-use reqwest::{Client as ReqwestClient, Request, Response};
+use reqwest::{Client as ReqwestClient, Request, Response as ReqwestResponse};
 use std::time::{Duration, Instant};
 use std::{num::NonZeroU32, sync::Mutex};
 use tokio::sync::Semaphore;
 
 use super::key::HostKey;
 use super::stats::HostStats;
+use crate::Uri;
 use crate::types::Result;
-use crate::{CacheStatus, Status, Uri};
 use crate::{
     ErrorKind,
     ratelimit::{HostConfig, RateLimitConfig},
@@ -24,22 +28,8 @@ use crate::{
 /// Cap maximum backoff duration to reasonable limits
 const MAXIMUM_BACKOFF: Duration = Duration::from_secs(60);
 
-/// Cache value for per-host caching
-#[derive(Debug, Clone)]
-struct HostCacheValue {
-    status: CacheStatus,
-}
-
-impl From<&Status> for HostCacheValue {
-    fn from(status: &Status) -> Self {
-        Self {
-            status: status.into(),
-        }
-    }
-}
-
 /// Per-host cache for storing request results
-type HostCache = DashMap;
+type HostCache = DashMap;
 
 /// Represents a single host with its own rate limiting, concurrency control,
 /// HTTP client configuration, and request cache.
@@ -70,7 +60,8 @@ pub struct Host {
     /// Current backoff duration for adaptive rate limiting
     backoff_duration: Mutex,
 
-    /// Per-host cache to prevent duplicate requests
+    /// Per-host cache to prevent duplicate requests during a single link check invocation.
+    /// Note that this cache has no direct relation to the inter-process persistable [`crate::CacheStatus`].
     cache: HostCache,
 }
 
@@ -108,40 +99,28 @@ impl Host {
     /// # Panics
     ///
     /// Panics if the statistics mutex is poisoned
-    pub fn get_cached_status(&self, uri: &Uri) -> Option {
-        if let Some(entry) = self.cache.get(uri) {
-            // Cache hit
-            self.stats.lock().unwrap().record_cache_hit();
-            return Some(entry.status);
-        }
+    fn get_cached_status(&self, uri: &Uri) -> Option {
+        self.cache.get(uri).map(|v| v.clone())
+    }
+
+    fn record_cache_hit(&self) {
+        self.stats.lock().unwrap().record_cache_hit();
+    }
 
-        // Cache miss
+    fn record_cache_miss(&self) {
         self.stats.lock().unwrap().record_cache_miss();
-        None
     }
 
     /// Cache a request result
-    pub fn cache_result(&self, uri: &Uri, status: &Status) {
-        let cache_value = HostCacheValue::from(status);
-        self.cache.insert(uri.clone(), cache_value);
+    fn cache_result(&self, uri: &Uri, response: CacheableResponse) {
+        // Do not cache responses that are potentially retried
+        if !response.status.should_retry() {
+            self.cache.insert(uri.clone(), response);
+        }
     }
 
     /// Execute a request with rate limiting, concurrency control, and caching
     ///
-    /// This method:
-    /// 1. Checks the per-host cache for existing results
-    /// 2. If not cached, acquires a semaphore permit for concurrency control
-    /// 3. Waits for rate limiter permission
-    /// 4. Applies adaptive backoff if needed
-    /// 5. Executes the request
-    /// 6. Updates statistics based on response
-    /// 7. Parses rate limit headers to adjust future behavior
-    /// 8. Caches the result for future use
-    ///
-    /// # Arguments
-    ///
-    /// * `request` - The HTTP request to execute
-    ///
     /// # Errors
     ///
     /// Returns an error if the request fails or rate limiting is exceeded
@@ -149,39 +128,35 @@ impl Host {
     /// # Panics
     ///
     /// Panics if the statistics mutex is poisoned
-    pub async fn execute_request(&self, request: Request) -> Result {
+    pub(crate) async fn execute_request(&self, request: Request) -> Result {
         let uri = Uri::from(request.url().clone());
+        let _permit = self.acquire_semaphore().await;
 
-        // Note: Cache checking is handled at the HostPool level
-        // This method focuses on executing the actual HTTP request
-
-        // Acquire semaphore permit for concurrency control
-        let _permit = self
-            .semaphore
-            .acquire()
-            .await
-            // SAFETY: this should not panic as we never close the semaphore
-            .expect("Semaphore was closed unexpectedly");
-
-        // Apply adaptive backoff if needed
-        let backoff_duration = {
-            let backoff = self.backoff_duration.lock().unwrap();
-            *backoff
-        };
-        if !backoff_duration.is_zero() {
-            log::debug!(
-                "Host {} applying backoff delay of {}ms due to previous rate limiting or errors",
-                self.key,
-                backoff_duration.as_millis()
-            );
-            tokio::time::sleep(backoff_duration).await;
+        if let Some(cached) = self.get_cached_status(&uri) {
+            self.record_cache_hit();
+            return Ok(cached);
         }
 
+        self.await_backoff().await;
+
         if let Some(rate_limiter) = &self.rate_limiter {
             rate_limiter.until_ready().await;
         }
 
-        // Execute the request and track timing
+        if let Some(cached) = self.get_cached_status(&uri) {
+            self.record_cache_hit();
+            return Ok(cached);
+        }
+
+        self.record_cache_miss();
+        self.perform_request(request, uri).await
+    }
+
+    pub(crate) const fn get_client(&self) -> &ReqwestClient {
+        &self.client
+    }
+
+    async fn perform_request(&self, request: Request, uri: Uri) -> Result {
         let start_time = Instant::now();
         let response = match self.client.execute(request).await {
             Ok(response) => response,
             Err(e) => {
                 return Err(ErrorKind::NetworkRequest(e));
             }
         };
-        let request_time = start_time.elapsed();
 
-        // Update statistics based on response
-        let status_code = response.status().as_u16();
-        self.update_stats_and_backoff(status_code, request_time);
-
-        // Parse rate limit headers to adjust behavior
+        self.update_stats(response.status(), start_time.elapsed());
+        self.update_backoff(response.status());
         self.handle_rate_limit_headers(&response);
 
-        // Cache the result
-        let status = Status::Ok(response.status());
-        self.cache_result(&uri, &status);
-
+        let response = CacheableResponse::try_from(response).await?;
+        self.cache_result(&uri, response.clone());
         Ok(response)
     }
 
-    pub(crate) const fn get_client(&self) -> &ReqwestClient {
-        &self.client
+    /// Await adaptive backoff if needed
+    async fn await_backoff(&self) {
+        let backoff_duration = {
+            let backoff = self.backoff_duration.lock().unwrap();
+            *backoff
+        };
+        if !backoff_duration.is_zero() {
+            log::debug!(
+                "Host {} applying backoff delay of {}ms due to previous rate limiting or errors",
+                self.key,
+                backoff_duration.as_millis()
+            );
+            tokio::time::sleep(backoff_duration).await;
+        }
     }
 
-    /// Update internal statistics and backoff based on the response
-    fn update_stats_and_backoff(&self, status_code: u16, request_time: Duration) {
-        // Update statistics
-        {
-            let mut stats = self.stats.lock().unwrap();
-            stats.record_response(status_code, request_time);
-        }
+    async fn acquire_semaphore(&self) -> tokio::sync::SemaphorePermit<'_> {
+        self.semaphore
+            .acquire()
+            .await
+            // SAFETY: this should not panic as we never close the semaphore
+            .expect("Semaphore was closed unexpectedly")
+    }
 
-        // Update backoff duration based on response
-        {
-            let mut backoff = self.backoff_duration.lock().unwrap();
-            match status_code {
-                200..=299 => {
-                    // Reset backoff on success
-                    *backoff = Duration::from_millis(0);
-                }
-                429 => {
-                    // Exponential backoff on rate limit, capped at 30 seconds
-                    let new_backoff = std::cmp::min(
-                        if backoff.is_zero() {
-                            Duration::from_millis(500)
-                        } else {
-                            *backoff * 2
-                        },
-                        Duration::from_secs(30),
-                    );
-                    log::debug!(
-                        "Host {} hit rate limit (429), increasing backoff from {}ms to {}ms",
-                        self.key,
-                        backoff.as_millis(),
-                        new_backoff.as_millis()
-                    );
-                    *backoff = new_backoff;
-                }
-                500..=599 => {
-                    // Moderate backoff increase on server errors, capped at 10 seconds
-                    *backoff = std::cmp::min(
-                        *backoff + Duration::from_millis(200),
-                        Duration::from_secs(10),
-                    );
-                }
-                _ => {} // No backoff change for other status codes
+    fn update_backoff(&self, status: StatusCode) {
+        let mut backoff = self.backoff_duration.lock().unwrap();
+        match status.as_u16() {
+            200..=299 => {
+                // Reset backoff on success
+                *backoff = Duration::from_millis(0);
             }
+            429 => {
+                // Exponential backoff on rate limit, capped at 30 seconds
+                let new_backoff = std::cmp::min(
+                    if backoff.is_zero() {
+                        Duration::from_millis(500)
+                    } else {
+                        *backoff * 2
+                    },
+                    Duration::from_secs(30),
+                );
+                log::debug!(
+                    "Host {} hit rate limit (429), increasing backoff from {}ms to {}ms",
+                    self.key,
+                    backoff.as_millis(),
+                    new_backoff.as_millis()
+                );
+                *backoff = new_backoff;
+            }
+            500..=599 => {
+                // Moderate backoff increase on server errors, capped at 10 seconds
+                *backoff = std::cmp::min(
+                    *backoff + Duration::from_millis(200),
+                    Duration::from_secs(10),
+                );
+            }
+            _ => {} // No backoff change for other status codes
         }
     }
 
+    fn update_stats(&self, status: StatusCode, request_time: Duration) {
+        self.stats
+            .lock()
+            .unwrap()
+            .record_response(status.as_u16(), request_time);
+    }
+
     /// Parse rate limit headers from response and adjust behavior
-    fn handle_rate_limit_headers(&self, response: &Response) {
+    fn handle_rate_limit_headers(&self, response: &ReqwestResponse) {
         // Implement basic parsing here rather than using the rate-limits crate to keep dependencies minimal
         let headers = response.headers();
         self.handle_retry_after_header(headers);
@@ -321,27 +307,10 @@ impl Host {
         self.stats.lock().unwrap().clone()
     }
 
-    /// Record a cache hit from the persistent disk cache
-    ///
-    /// # Panics
-    ///
-    /// Panics if the statistics mutex is poisoned
-    pub fn record_persistent_cache_hit(&self) {
-        self.stats.lock().unwrap().record_cache_hit();
-    }
-
-    /// Record a cache miss from the persistent disk cache
-    ///
-    /// # Panics
-    ///
-    /// Panics if the statistics mutex is poisoned
-    pub fn record_persistent_cache_miss(&self) {
-        self.stats.lock().unwrap().record_cache_miss();
-    }
-
-    /// Get the current number of available permits (concurrent request slots)
-    pub fn available_permits(&self) -> usize {
-        self.semaphore.available_permits()
+    /// Record a cache hit from the persistent disk cache.
+    /// Cache misses are tracked internally, so we don't expose such a method.
+    pub(crate) fn record_persistent_cache_hit(&self) {
+        self.record_cache_hit();
     }
 
     /// Get the current cache size (number of cached entries)
@@ -365,7 +334,7 @@ mod tests {
         let host = Host::new(key.clone(), &host_config, &global_config, Client::default());
 
         assert_eq!(host.key, key);
-        assert_eq!(host.available_permits(), 10); // Default concurrency
+        assert_eq!(host.semaphore.available_permits(), 10); // Default concurrency
         assert!((host.stats().success_rate() - 1.0).abs() < f64::EPSILON);
         assert_eq!(host.cache_size(), 0);
     }
diff --git a/lychee-lib/src/ratelimit/mod.rs b/lychee-lib/src/ratelimit/mod.rs
index ad4cb48551..fac500207b 100644
--- a/lychee-lib/src/ratelimit/mod.rs
+++ b/lychee-lib/src/ratelimit/mod.rs
@@ -19,4 +19,36 @@ mod pool;
 
 pub use config::{HostConfig, HostConfigs, RateLimitConfig};
 pub use host::{Host, HostKey, HostStats};
+use http::HeaderMap;
 pub use pool::{ClientMap, HostPool};
+use reqwest::Response;
+use url::Url;
+
+use crate::{ErrorKind, Result};
+
+/// The result of a HTTP request, used for internal per-host caching.
+/// This abstraction exists, because [`Response`] cannot easily be cached
+/// since it does not implement [`Clone`].
+#[derive(Debug, Clone)]
+pub(crate) struct CacheableResponse {
+    pub(crate) status: reqwest::StatusCode,
+    pub(crate) text: String,
+    pub(crate) headers: HeaderMap,
+    pub(crate) url: Url,
+}
+
+impl CacheableResponse {
+    async fn try_from(response: Response) -> Result {
+        let status = response.status();
+        let headers = response.headers().clone();
+        let url = response.url().clone();
+        let text = response.text().await.map_err(ErrorKind::ReadResponseBody)?;
+
+        Ok(Self {
+            status,
+            text,
+            headers,
+            url,
+        })
+    }
+}
diff --git a/lychee-lib/src/ratelimit/pool.rs b/lychee-lib/src/ratelimit/pool.rs
index 9ea7700803..df6d5027dc 100644
--- a/lychee-lib/src/ratelimit/pool.rs
+++ b/lychee-lib/src/ratelimit/pool.rs
@@ -1,12 +1,12 @@
 use dashmap::DashMap;
 use http::Method;
-use reqwest::{Client, Request, Response};
+use reqwest::{Client, Request};
 use std::collections::HashMap;
 use std::sync::Arc;
 
-use crate::ratelimit::{self, Host, HostConfigs, HostKey, HostStats, RateLimitConfig};
+use crate::ratelimit::{CacheableResponse, Host, HostConfigs, HostKey, HostStats, RateLimitConfig};
 use crate::types::Result;
-use crate::{CacheStatus, ErrorKind, Status, Uri};
+use crate::{ErrorKind, Uri};
 
 /// Keep track of host-specific [`reqwest::Client`]s
 pub type ClientMap = HashMap;
@@ -65,23 +65,7 @@ impl HostPool {
     /// Fails if:
     /// - The request URL has no valid hostname
     /// - The underlying HTTP request fails
-    ///
-    /// # Examples
-    ///
-    /// ```no_run
-    /// # use lychee_lib::ratelimit::{HostPool, RateLimitConfig};
-    /// # use std::collections::HashMap;
-    /// # use reqwest::{Request, header::HeaderMap};
-    /// # use std::time::Duration;
-    /// # #[tokio::main]
-    /// # async fn main() -> Result<(), Box> {
-    /// let pool = HostPool::default();
-    /// let request = reqwest::Request::new(reqwest::Method::GET, "https://example.com".parse()?);
-    /// let response = pool.execute_request(request).await?;
-    /// # Ok(())
-    /// # }
-    /// ```
-    pub async fn execute_request(&self, request: Request) -> Result {
+    pub(crate) async fn execute_request(&self, request: Request) -> Result {
         let url = request.url();
         let host_key = HostKey::try_from(url)?;
         let host = self.get_or_create_host(host_key);
@@ -187,33 +171,6 @@ impl HostPool {
         self.hosts.remove(&host_key).is_some()
     }
 
-    /// Check if a URI is cached in the appropriate host's cache
-    ///
-    /// # Returns
-    ///
-    /// Returns the cached status if found and valid, `None` otherwise
-    #[must_use]
-    pub fn get_cached_status(&self, uri: &Uri) -> Option {
-        let host_key = HostKey::try_from(uri).ok()?;
-
-        if let Some(host) = self.hosts.get(&host_key) {
-            host.get_cached_status(uri)
-        } else {
-            None
-        }
-    }
-
-    /// Cache a result for a URI in the appropriate host's cache
-    pub fn cache_result(&self, uri: &Uri, status: &Status) {
-        if let Ok(host_key) = HostKey::try_from(uri)
-            && let Some(host) = self.hosts.get(&host_key)
-        {
-            host.cache_result(uri, status);
-        }
-        // If host doesn't exist yet, we don't cache
-        // The result will be cached when the host is created and the request is made
-    }
-
     /// Get cache statistics across all hosts
     #[must_use]
     pub fn cache_stats(&self) -> HashMap {
@@ -228,34 +185,16 @@ impl HostPool {
             .collect()
     }
 
-    /// Try to record a cache hit for the given URI in host statistics.
-    /// File and mail URIs are ignored.
-    ///
-    /// This tracks that a request was served from the persistent disk cache
-    /// rather than going through the rate-limited HTTP request flow.
-    /// This method will create a host instance if one doesn't exist yet.
-    pub fn record_cache_hit(&self, uri: &crate::Uri) {
-        self.record_cache_event(uri, |host| host.record_persistent_cache_hit());
-    }
-
-    /// Try to record a cache miss for the given URI in host statistics
-    /// File and mail URIs are ignored.
-    ///
-    /// This tracks that a request could not be served from the persistent disk cache
-    /// and will need to go through the rate-limited HTTP request flow.
-    /// This method will create a Host instance if one doesn't exist yet.
-    pub fn record_cache_miss(&self, uri: &Uri) {
-        self.record_cache_event(uri, |host| host.record_persistent_cache_miss());
-    }
-
-    /// File and mail URIs are ignored.
-    /// Errors are logged and not returned.
-    fn record_cache_event)>(&self, uri: &Uri, action: F) {
+    /// Record a cache hit for the given URI in host statistics.
+    /// This tracks that a request was served from the persistent disk cache.
+    /// Note that no equivalent function for tracking cache misses is exposed,
+    /// since this is handled internally.
+    pub fn record_persistent_cache_hit(&self, uri: &crate::Uri) {
         if !uri.is_file() && !uri.is_mail() {
-            match ratelimit::HostKey::try_from(uri) {
+            match crate::ratelimit::HostKey::try_from(uri) {
                 Ok(key) => {
                     let host = self.get_or_create_host(key);
-                    action(host);
+                    host.record_persistent_cache_hit();
                 }
                 Err(e) => {
                     log::debug!("Failed to record cache hit for {uri}: {e}");
diff --git a/lychee-lib/src/retry.rs b/lychee-lib/src/retry.rs
index 8bf0ec7d10..595cd08df5 100644
--- a/lychee-lib/src/retry.rs
+++ b/lychee-lib/src/retry.rs
@@ -18,21 +18,10 @@ pub(crate) trait RetryExt {
 
 impl RetryExt for reqwest::StatusCode {
     /// Try to map a `reqwest` response into `Retryable`.
-    #[allow(clippy::if_same_then_else)]
     fn should_retry(&self) -> bool {
-        let status = *self;
-        if status.is_server_error() {
-            true
-        } else if status.is_client_error()
-            && status != StatusCode::REQUEST_TIMEOUT
-            && status != StatusCode::TOO_MANY_REQUESTS
-        {
-            false
-        } else if status.is_success() {
-            false
-        } else {
-            status == StatusCode::REQUEST_TIMEOUT || status == StatusCode::TOO_MANY_REQUESTS
-        }
+        self.is_server_error()
+            || self == &StatusCode::REQUEST_TIMEOUT
+            || self == &StatusCode::TOO_MANY_REQUESTS
     }
 }
diff --git a/lychee-lib/src/types/status.rs b/lychee-lib/src/types/status.rs
index f246c0bd34..35577ec4f3 100644
--- a/lychee-lib/src/types/status.rs
+++ b/lychee-lib/src/types/status.rs
@@ -4,8 +4,8 @@ use super::CacheStatus;
 use super::redirect_history::Redirects;
 use crate::ErrorKind;
 use crate::RequestError;
+use crate::ratelimit::CacheableResponse;
 use http::StatusCode;
-use reqwest::Response;
 use serde::ser::SerializeStruct;
 use serde::{Serialize, Serializer};
 
@@ -92,13 +92,12 @@ impl Serialize for Status {
 impl Status {
     #[must_use]
     /// Create a status object from a response and the set of accepted status codes
-    pub fn new(response: &Response, accepted: &HashSet) -> Self {
-        let code = response.status();
-
-        if accepted.contains(&code) {
-            Self::Ok(code)
+    pub(crate) fn new(response: &CacheableResponse, accepted: &HashSet) -> Self {
+        let status = response.status;
+        if accepted.contains(&status) {
+            Self::Ok(status)
         } else {
-            Self::Error(ErrorKind::RejectedStatusCode(code))
+            Self::Error(ErrorKind::RejectedStatusCode(status))
         }
     }
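
For reviewers: the rewritten `RetryExt::should_retry` in `lychee-lib/src/retry.rs` collapses the old branching into a single predicate, retrying on any server error (5xx), on 408 (Request Timeout), and on 429 (Too Many Requests). Below is a minimal standalone sketch of that logic as a free function (hypothetical name, assuming only the `http` crate), useful for sanity-checking the simplification against the old branches:

```rust
use http::StatusCode;

// Sketch of the simplified retry predicate: retry on server errors (5xx),
// request timeouts (408), and rate limiting (429); everything else is final.
fn should_retry(status: StatusCode) -> bool {
    status.is_server_error()
        || status == StatusCode::REQUEST_TIMEOUT
        || status == StatusCode::TOO_MANY_REQUESTS
}

fn main() {
    // Retried: these are also the responses that the new `Host::cache_result`
    // refuses to put into the per-host cache.
    assert!(should_retry(StatusCode::TOO_MANY_REQUESTS));
    assert!(should_retry(StatusCode::REQUEST_TIMEOUT));
    assert!(should_retry(StatusCode::BAD_GATEWAY));
    // Not retried: success and ordinary client errors are final.
    assert!(!should_retry(StatusCode::OK));
    assert!(!should_retry(StatusCode::NOT_FOUND));
}
```

Because 429 responses are retryable and therefore never cached, the single rate-limited response in `test_process_internal_host_caching` is followed by a real second request, which is what the `10 reqs` / `80.0% cached` assertions exercise.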