From fb8c58439ced2f2839ab272d087517c1610c49b0 Mon Sep 17 00:00:00 2001 From: Charlotte Hartmann Paludo Date: Fri, 12 Sep 2025 08:14:01 +0200 Subject: [PATCH] fix: do not reuse excluded status from cache --- lychee-bin/src/cache.rs | 61 ++++++++++++++++++++++++++++++++--- lychee-bin/src/main.rs | 6 +++- lychee-lib/src/types/cache.rs | 23 ++++++++++++- 3 files changed, 83 insertions(+), 7 deletions(-) diff --git a/lychee-bin/src/cache.rs b/lychee-bin/src/cache.rs index 4e872d4567..644526ce81 100644 --- a/lychee-bin/src/cache.rs +++ b/lychee-bin/src/cache.rs @@ -1,7 +1,7 @@ use crate::time::{self, Timestamp, timestamp}; use anyhow::Result; use dashmap::DashMap; -use lychee_lib::{CacheStatus, Status, Uri}; +use lychee_lib::{CacheStatus, Status, StatusCodeExcluder, Uri}; use serde::{Deserialize, Serialize}; use std::path::Path; @@ -33,7 +33,11 @@ pub(crate) trait StoreExt { fn store>(&self, path: T) -> Result<()>; /// Load cache from path. Discard entries older than `max_age_secs` - fn load>(path: T, max_age_secs: u64) -> Result; + fn load>( + path: T, + max_age_secs: u64, + excluder: &StatusCodeExcluder, + ) -> Result; } impl StoreExt for Cache { @@ -47,7 +51,11 @@ impl StoreExt for Cache { Ok(()) } - fn load>(path: T, max_age_secs: u64) -> Result { + fn load>( + path: T, + max_age_secs: u64, + excluder: &StatusCodeExcluder, + ) -> Result { let mut rdr = csv::ReaderBuilder::new() .has_headers(false) .from_path(path)?; @@ -58,10 +66,53 @@ impl StoreExt for Cache { let (uri, value): (Uri, CacheValue) = result?; // Discard entries older than `max_age_secs`. // This allows gradually updating the cache over multiple runs. - if current_ts - value.timestamp < max_age_secs { - map.insert(uri, value); + if current_ts - value.timestamp >= max_age_secs { + continue; } + + // Discard entries for status codes which have been excluded. + // Without this check, an entry might be cached, then its status code is configured as + // excluded, and in subsequent runs the cached value is still reused. + if value.status.is_excluded(excluder) { + continue; + } + + map.insert(uri, value); } Ok(map) } } + +#[cfg(test)] +mod tests { + use dashmap::DashMap; + use lychee_lib::{AcceptRange, CacheStatus, StatusCodeExcluder, Uri}; + + use crate::{ + cache::{Cache, CacheValue, StoreExt}, + time::timestamp, + }; + + #[test] + fn test_excluded_status_not_reused_from_cache() { + let uri: Uri = "https://example.com".try_into().unwrap(); + + let cache: Cache = DashMap::::new(); + cache.insert( + uri.clone(), + CacheValue { + status: CacheStatus::Ok(429), + timestamp: timestamp(), + }, + ); + + let tmp = tempfile::NamedTempFile::new().unwrap(); + cache.store(tmp.path()).unwrap(); + + let mut excluder = StatusCodeExcluder::new(); + excluder.add_range(AcceptRange::new_from(400, 500).unwrap()); + + let cache = Cache::load(tmp.path(), u64::MAX, &excluder).unwrap(); + assert!(cache.get(&uri).is_none()); + } +} diff --git a/lychee-bin/src/main.rs b/lychee-bin/src/main.rs index c814b0aa7c..1321f0a926 100644 --- a/lychee-bin/src/main.rs +++ b/lychee-bin/src/main.rs @@ -246,7 +246,11 @@ fn load_cache(cfg: &Config) -> Option { } } - let cache = Cache::load(LYCHEE_CACHE_FILE, cfg.max_cache_age.as_secs()); + let cache = Cache::load( + LYCHEE_CACHE_FILE, + cfg.max_cache_age.as_secs(), + &cfg.cache_exclude_status, + ); match cache { Ok(cache) => Some(cache), Err(e) => { diff --git a/lychee-lib/src/types/cache.rs b/lychee-lib/src/types/cache.rs index aa42a50a9d..41f7f1529e 100644 --- a/lychee-lib/src/types/cache.rs +++ b/lychee-lib/src/types/cache.rs @@ -2,7 +2,7 @@ use std::fmt::Display; use serde::{Deserialize, Deserializer, Serialize}; -use crate::{ErrorKind, Status}; +use crate::{ErrorKind, Status, StatusCodeExcluder}; /// Representation of the status of a cached request. This is kept simple on /// purpose because the type gets serialized to a cache file and might need to @@ -89,6 +89,27 @@ impl From<&Status> for CacheStatus { } } +impl From for Option { + fn from(val: CacheStatus) -> Self { + match val { + CacheStatus::Ok(status) => Some(status), + CacheStatus::Error(status) => status, + _ => None, + } + } +} + +impl CacheStatus { + /// Returns `true` if the cache status is excluded by the given [`StatusCodeExcluder`]. + #[must_use] + pub fn is_excluded(&self, excluder: &StatusCodeExcluder) -> bool { + match Option::::from(*self) { + Some(status) => excluder.contains(status), + _ => false, + } + } +} + #[cfg(test)] mod tests { use serde::Deserialize;