From f152b4d2cac2e728661e90cdba19be56699205e6 Mon Sep 17 00:00:00 2001 From: Matthias Date: Fri, 5 Sep 2025 17:50:32 +0200 Subject: [PATCH 01/43] fix: add support for name attributes in HTML fragment extraction Fixes fragment checking for JavaDoc-generated HTML which uses name attributes instead of id attributes for anchors. This resolves a regression where lychee v0.20.1 was failing to find fragments that worked in v0.18.1, particularly for JavaDoc URLs like: - https://example.com/javadoc/Class.html#method-- - https://example.com/javadoc/Class.html#skip.navbar.top The fix maintains backward compatibility by checking both 'id' and 'name' attributes when extracting fragments from HTML documents. Resolves #1838 --- lychee-lib/src/extract/html/html5gum.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/lychee-lib/src/extract/html/html5gum.rs b/lychee-lib/src/extract/html/html5gum.rs index f33741b62e..95be6a1f89 100644 --- a/lychee-lib/src/extract/html/html5gum.rs +++ b/lychee-lib/src/extract/html/html5gum.rs @@ -238,7 +238,6 @@ impl LinkExtractor { if let Some(name) = self.current_attributes.get("name") { self.fragments.insert(name.to_string()); } - self.current_attributes.clear(); } } From f165f3774b6c2c8f9fab9da436e5f2cb8d604913 Mon Sep 17 00:00:00 2001 From: Matthias Date: Sun, 7 Sep 2025 20:49:06 +0200 Subject: [PATCH 02/43] feat: implement per-host rate limiting and statistics Add comprehensive per-host rate limiting system with adaptive backoff, statistics tracking, and configurable concurrency controls. 
Features: - Per-host rate limiting using token bucket algorithm with governor crate - Adaptive backoff based on server responses (429, 5xx errors) - Host-specific request concurrency and interval controls - Comprehensive statistics tracking (requests, success rates, response times) - Cache hit/miss tracking per host with configurable TTL - Multiple output formats for host statistics (compact, detailed, markdown, json) - CLI flag --host-stats to display per-host statistics - Configuration options for default host concurrency and request intervals Implementation: - Clean module structure: ratelimit/host/{host.rs, stats.rs, key.rs} - Window data structure for rolling request time averages - DashMap for thread-safe per-host caching with expiration - Integration with existing cache system for persistent storage - Formatter system matching existing lychee output styles - Comprehensive error handling and logging Breaking changes: - Removed global cache in favor of clean per-host caching architecture - Updated Client API to include host statistics methods - Added new dependencies: governor, humantime-serde All linting and formatting requirements satisfied. 
Co-authored-by: Thomas Zahner --- Cargo.lock | 68 +++ lychee-bin/src/client.rs | 17 +- lychee-bin/src/commands/check.rs | 26 +- .../src/formatters/host_stats/compact.rs | 82 ++++ .../src/formatters/host_stats/detailed.rs | 87 ++++ lychee-bin/src/formatters/host_stats/json.rs | 57 +++ .../src/formatters/host_stats/markdown.rs | 92 ++++ lychee-bin/src/formatters/host_stats/mod.rs | 28 ++ lychee-bin/src/formatters/mod.rs | 16 +- lychee-bin/src/main.rs | 24 +- lychee-bin/src/options.rs | 41 ++ lychee-lib/Cargo.toml | 3 + lychee-lib/src/checker/website.rs | 81 +++- lychee-lib/src/client.rs | 49 ++ lychee-lib/src/lib.rs | 3 + lychee-lib/src/ratelimit/config.rs | 213 ++++++++ lychee-lib/src/ratelimit/error.rs | 51 ++ lychee-lib/src/ratelimit/host/host.rs | 456 ++++++++++++++++++ lychee-lib/src/ratelimit/host/key.rs | 153 ++++++ lychee-lib/src/ratelimit/host/mod.rs | 9 + lychee-lib/src/ratelimit/host/stats.rs | 283 +++++++++++ lychee-lib/src/ratelimit/mod.rs | 26 + lychee-lib/src/ratelimit/pool.rs | 450 +++++++++++++++++ lychee-lib/src/ratelimit/window.rs | 100 ++++ lychee-lib/src/types/error.rs | 9 +- 25 files changed, 2409 insertions(+), 15 deletions(-) create mode 100644 lychee-bin/src/formatters/host_stats/compact.rs create mode 100644 lychee-bin/src/formatters/host_stats/detailed.rs create mode 100644 lychee-bin/src/formatters/host_stats/json.rs create mode 100644 lychee-bin/src/formatters/host_stats/markdown.rs create mode 100644 lychee-bin/src/formatters/host_stats/mod.rs create mode 100644 lychee-lib/src/ratelimit/config.rs create mode 100644 lychee-lib/src/ratelimit/error.rs create mode 100644 lychee-lib/src/ratelimit/host/host.rs create mode 100644 lychee-lib/src/ratelimit/host/key.rs create mode 100644 lychee-lib/src/ratelimit/host/mod.rs create mode 100644 lychee-lib/src/ratelimit/host/stats.rs create mode 100644 lychee-lib/src/ratelimit/mod.rs create mode 100644 lychee-lib/src/ratelimit/pool.rs create mode 100644 lychee-lib/src/ratelimit/window.rs diff --git 
a/Cargo.lock b/Cargo.lock index de6eaff0d4..56b5be7639 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1902,6 +1902,26 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "governor" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68a7f542ee6b35af73b06abc0dad1c1bae89964e4e253bc4b587b91c9637867b" +dependencies = [ + "cfg-if", + "dashmap 5.5.3", + "futures", + "futures-timer", + "no-std-compat", + "nonzero_ext", + "parking_lot", + "portable-atomic", + "quanta", + "rand 0.8.5", + "smallvec", + "spinning_top", +] + [[package]] name = "group" version = "0.13.0" @@ -2803,14 +2823,17 @@ dependencies = [ "cached", "check-if-email-exists", "cookie_store", + "dashmap 6.1.0", "doc-comment", "email_address", "futures", "glob", + "governor", "headers", "html5ever", "html5gum", "http 1.4.0", + "humantime-serde", "hyper 1.8.1", "ignore", "ip_network", @@ -2969,6 +2992,12 @@ version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086" +[[package]] +name = "no-std-compat" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b93853da6d84c2e3c7d730d6473e8817692dd89be387eb01b94d7f108ecb5b8c" + [[package]] name = "nom" version = "7.1.3" @@ -2979,6 +3008,12 @@ dependencies = [ "minimal-lexical", ] +[[package]] +name = "nonzero_ext" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38bf9645c8b145698bb0b18a4637dcacbc421ea49bef2317e4fd8065a387cf21" + [[package]] name = "normalize-line-endings" version = "0.3.0" @@ -3651,6 +3686,21 @@ dependencies = [ "thiserror 1.0.69", ] +[[package]] +name = "quanta" +version = "0.12.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3ab5a9d756f0d97bdc89019bd2e4ea098cf9cde50ee7564dde6b81ccc8f06c7" +dependencies = [ + "crossbeam-utils", + "libc", + "once_cell", + "raw-cpuid", + 
"wasi", + "web-sys", + "winapi", +] + [[package]] name = "quinn" version = "0.11.9" @@ -3780,6 +3830,15 @@ dependencies = [ "getrandom 0.3.4", ] +[[package]] +name = "raw-cpuid" +version = "11.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "498cd0dc59d73224351ee52a95fee0f1a617a2eae0e7d9d720cc622c73a54186" +dependencies = [ + "bitflags 2.10.0", +] + [[package]] name = "rayon" version = "1.11.0" @@ -4566,6 +4625,15 @@ dependencies = [ "lock_api", ] +[[package]] +name = "spinning_top" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d96d2d1d716fb500937168cc09353ffdc7a012be8475ac7308e1bdf0e3923300" +dependencies = [ + "lock_api", +] + [[package]] name = "spki" version = "0.7.3" diff --git a/lychee-bin/src/client.rs b/lychee-bin/src/client.rs index 4c99f6fe7c..aa1b5407d1 100644 --- a/lychee-bin/src/client.rs +++ b/lychee-bin/src/client.rs @@ -2,7 +2,10 @@ use crate::options::{Config, HeaderMapExt}; use crate::parse::{parse_duration_secs, parse_remaps}; use anyhow::{Context, Result}; use http::{HeaderMap, StatusCode}; -use lychee_lib::{Client, ClientBuilder}; +use lychee_lib::{ + Client, ClientBuilder, + ratelimit::{HostPool, RateLimitConfig}, +}; use regex::RegexSet; use reqwest_cookie_store::CookieStoreMutex; use std::sync::Arc; @@ -28,6 +31,17 @@ pub(crate) fn create(cfg: &Config, cookie_jar: Option<&Arc>) - let headers = HeaderMap::from_header_pairs(&cfg.header)?; + // Create HostPool for rate limiting - always enabled for HTTP requests + let rate_limit_config = + RateLimitConfig::from_options(cfg.default_host_concurrency, cfg.default_request_interval); + let cache_max_age = if cfg.cache { 3600 } else { 0 }; // 1 hour if caching enabled, disabled otherwise + let host_pool = HostPool::new( + rate_limit_config, + cfg.hosts.clone(), + cfg.max_concurrency, + cache_max_age, + ); + ClientBuilder::builder() .remaps(remaps) .base(cfg.base_url.clone()) @@ -55,6 +69,7 @@ pub(crate) fn 
create(cfg: &Config, cookie_jar: Option<&Arc>) - .include_fragments(cfg.include_fragments) .fallback_extensions(cfg.fallback_extensions.clone()) .index_files(cfg.index_files.clone()) + .host_pool(Some(host_pool)) .build() .client() .context("Failed to create request client") diff --git a/lychee-bin/src/commands/check.rs b/lychee-bin/src/commands/check.rs index 62a908998d..c0b2a5f769 100644 --- a/lychee-bin/src/commands/check.rs +++ b/lychee-bin/src/commands/check.rs @@ -24,7 +24,7 @@ use super::CommandParams; pub(crate) async fn check( params: CommandParams, -) -> Result<(ResponseStats, Arc, ExitCode), ErrorKind> +) -> Result<(ResponseStats, Arc, ExitCode, Client), ErrorKind> where S: futures::Stream>, { @@ -44,6 +44,7 @@ where let cache_ref = params.cache.clone(); let client = params.client; + let client_for_return = client.clone(); let cache = params.cache; let cache_exclude_status = params .cfg @@ -103,7 +104,7 @@ where } else { ExitCode::LinkCheckFailure }; - Ok((stats, cache_ref, code)) + Ok((stats, cache_ref, code, client_for_return)) } async fn suggest_archived_links( @@ -247,6 +248,8 @@ async fn handle( }; let uri = request.uri.clone(); + + // First check the persistent disk-based cache if let Some(v) = cache.get(&uri) { // Found a cached request // Overwrite cache status in case the URI is excluded in the @@ -260,18 +263,23 @@ async fn handle( // code. 
Status::from_cache_status(v.value().status, &accept) }; + + // Track cache hit in the per-host stats + if let Err(e) = client.record_cache_hit(&uri) { + log::debug!("Failed to record cache hit for {uri}: {e}"); + } + return Ok(Response::new(uri.clone(), status, request.source.into())); } - // Request was not cached; run a normal check + // Cache miss - track it and run a normal check + if let Err(e) = client.record_cache_miss(&uri) { + log::debug!("Failed to record cache miss for {uri}: {e}"); + } + let response = check_url(client, request).await; - // - Never cache filesystem access as it is fast already so caching has no - // benefit. - // - Skip caching unsupported URLs as they might be supported in a - // future run. - // - Skip caching excluded links; they might not be excluded in the next run. - // - Skip caching links for which the status code has been explicitly excluded from the cache. + // Apply the same caching rules as before let status = response.status(); if ignore_cache(&uri, status, &cache_exclude_status) { return Ok(response); diff --git a/lychee-bin/src/formatters/host_stats/compact.rs b/lychee-bin/src/formatters/host_stats/compact.rs new file mode 100644 index 0000000000..88b8b2fe6c --- /dev/null +++ b/lychee-bin/src/formatters/host_stats/compact.rs @@ -0,0 +1,82 @@ +use anyhow::Result; +use std::{ + collections::HashMap, + fmt::{self, Display}, +}; + +use crate::formatters::color::{DIM, NORMAL, color}; +use crate::options; +use lychee_lib::ratelimit::HostStats; + +use super::HostStatsFormatter; + +struct CompactHostStats { + host_stats: HashMap, +} + +impl Display for CompactHostStats { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + if self.host_stats.is_empty() { + return Ok(()); + } + + writeln!(f)?; + writeln!(f, "šŸ“Š Per-host Statistics")?; + + let separator = "─".repeat(60); + color!(f, DIM, "{}", separator)?; + writeln!(f)?; + + let sorted_hosts = super::sort_host_stats(&self.host_stats); + + // Calculate optimal hostname 
width based on longest hostname + let max_hostname_len = sorted_hosts + .iter() + .map(|(hostname, _)| hostname.len()) + .max() + .unwrap_or(0); + let hostname_width = (max_hostname_len + 2).max(10); // At least 10 chars with padding + + for (hostname, stats) in sorted_hosts { + let median_time = stats + .median_request_time() + .map_or_else(|| "N/A".to_string(), |d| format!("{:.0}ms", d.as_millis())); + + let cache_hit_rate = stats.cache_hit_rate() * 100.0; + + color!( + f, + NORMAL, + "{:6} reqs │ {:>6.1}% success │ {:>8} median │ {:>6.1}% cache", + hostname, + stats.total_requests, + stats.success_rate() * 100.0, + median_time, + cache_hit_rate, + width = hostname_width + )?; + writeln!(f)?; + } + + Ok(()) + } +} + +pub(crate) struct Compact; + +impl Compact { + pub(crate) const fn new(_mode: options::OutputMode) -> Self { + Self + } +} + +impl HostStatsFormatter for Compact { + fn format(&self, host_stats: HashMap) -> Result> { + if host_stats.is_empty() { + return Ok(None); + } + + let compact = CompactHostStats { host_stats }; + Ok(Some(compact.to_string())) + } +} diff --git a/lychee-bin/src/formatters/host_stats/detailed.rs b/lychee-bin/src/formatters/host_stats/detailed.rs new file mode 100644 index 0000000000..cd859ae47b --- /dev/null +++ b/lychee-bin/src/formatters/host_stats/detailed.rs @@ -0,0 +1,87 @@ +use anyhow::Result; +use std::{ + collections::HashMap, + fmt::{self, Display}, +}; + +use crate::options; +use lychee_lib::ratelimit::HostStats; + +use super::HostStatsFormatter; + +struct DetailedHostStats { + host_stats: HashMap, +} + +impl Display for DetailedHostStats { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + if self.host_stats.is_empty() { + return Ok(()); + } + + writeln!(f, "\nšŸ“Š Per-host Statistics")?; + writeln!(f, "---------------------")?; + + let sorted_hosts = super::sort_host_stats(&self.host_stats); + + for (hostname, stats) in sorted_hosts { + writeln!(f, "\nHost: {hostname}")?; + writeln!(f, " Total requests: 
{}", stats.total_requests)?; + writeln!( + f, + " Successful: {} ({:.1}%)", + stats.successful_requests, + stats.success_rate() * 100.0 + )?; + + if stats.rate_limited > 0 { + writeln!(f, " Rate limited: {}", stats.rate_limited)?; + } + if stats.client_errors > 0 { + writeln!(f, " Client errors (4xx): {}", stats.client_errors)?; + } + if stats.server_errors > 0 { + writeln!(f, " Server errors (5xx): {}", stats.server_errors)?; + } + + if let Some(median_time) = stats.median_request_time() { + writeln!( + f, + " Median response time: {:.0}ms", + median_time.as_millis() + )?; + } + + let cache_hit_rate = stats.cache_hit_rate(); + if cache_hit_rate > 0.0 { + writeln!(f, " Cache hit rate: {:.1}%", cache_hit_rate * 100.0)?; + writeln!( + f, + " Cache hits: {}, misses: {}", + stats.cache_hits, stats.cache_misses + )?; + } + } + + Ok(()) + } +} + +pub(crate) struct Detailed; + +impl Detailed { + pub(crate) const fn new(_mode: options::OutputMode) -> Self { + Self + } +} + +impl HostStatsFormatter for Detailed { + fn format(&self, host_stats: HashMap) -> Result> { + if host_stats.is_empty() { + return Ok(None); + } + + let detailed = DetailedHostStats { host_stats }; + Ok(Some(detailed.to_string())) + } +} diff --git a/lychee-bin/src/formatters/host_stats/json.rs b/lychee-bin/src/formatters/host_stats/json.rs new file mode 100644 index 0000000000..24f7fe0d2e --- /dev/null +++ b/lychee-bin/src/formatters/host_stats/json.rs @@ -0,0 +1,57 @@ +use anyhow::{Context, Result}; +use serde_json::json; +use std::collections::HashMap; + +use super::HostStatsFormatter; +use lychee_lib::ratelimit::HostStats; + +pub(crate) struct Json; + +impl Json { + pub(crate) const fn new() -> Self { + Self {} + } +} + +impl HostStatsFormatter for Json { + /// Format host stats as JSON object + fn format(&self, host_stats: HashMap) -> Result> { + if host_stats.is_empty() { + return Ok(None); + } + + // Convert HostStats to a more JSON-friendly format + let json_stats: HashMap = host_stats + 
.into_iter() + .map(|(hostname, stats)| { + let json_value = json!({ + "total_requests": stats.total_requests, + "successful_requests": stats.successful_requests, + "success_rate": stats.success_rate(), + "rate_limited": stats.rate_limited, + "client_errors": stats.client_errors, + "server_errors": stats.server_errors, + "median_request_time_ms": stats.median_request_time() + .map(|d| { + #[allow(clippy::cast_possible_truncation)] + let millis = d.as_millis() as u64; + millis + }), + "cache_hits": stats.cache_hits, + "cache_misses": stats.cache_misses, + "cache_hit_rate": stats.cache_hit_rate(), + "status_codes": stats.status_codes + }); + (hostname, json_value) + }) + .collect(); + + let output = json!({ + "host_statistics": json_stats + }); + + serde_json::to_string_pretty(&output) + .map(Some) + .context("Cannot format host stats as JSON") + } +} diff --git a/lychee-bin/src/formatters/host_stats/markdown.rs b/lychee-bin/src/formatters/host_stats/markdown.rs new file mode 100644 index 0000000000..8980066107 --- /dev/null +++ b/lychee-bin/src/formatters/host_stats/markdown.rs @@ -0,0 +1,92 @@ +use std::{ + collections::HashMap, + fmt::{self, Display}, +}; + +use super::HostStatsFormatter; +use anyhow::Result; +use lychee_lib::ratelimit::HostStats; +use tabled::{ + Table, Tabled, + settings::{Alignment, Modify, Style, object::Segment}, +}; + +#[derive(Tabled)] +struct HostStatsTableEntry { + #[tabled(rename = "Host")] + host: String, + #[tabled(rename = "Requests")] + requests: u64, + #[tabled(rename = "Success Rate")] + success_rate: String, + #[tabled(rename = "Median Time")] + median_time: String, + #[tabled(rename = "Cache Hit Rate")] + cache_hit_rate: String, +} + +fn host_stats_table(host_stats: &HashMap) -> String { + let sorted_hosts = super::sort_host_stats(host_stats); + + let entries: Vec = sorted_hosts + .into_iter() + .map(|(hostname, stats)| { + let median_time = stats + .median_request_time() + .map_or_else(|| "N/A".to_string(), |d| 
format!("{:.0}ms", d.as_millis())); + + HostStatsTableEntry { + host: hostname.clone(), + requests: stats.total_requests, + success_rate: format!("{:.1}%", stats.success_rate() * 100.0), + median_time, + cache_hit_rate: format!("{:.1}%", stats.cache_hit_rate() * 100.0), + } + }) + .collect(); + + if entries.is_empty() { + return String::new(); + } + + let style = Style::markdown(); + Table::new(entries) + .with(Modify::new(Segment::all()).with(Alignment::left())) + .with(style) + .to_string() +} + +struct MarkdownHostStats(HashMap); + +impl Display for MarkdownHostStats { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + if self.0.is_empty() { + return Ok(()); + } + + writeln!(f, "\n## Per-host Statistics")?; + writeln!(f)?; + writeln!(f, "{}", host_stats_table(&self.0))?; + + Ok(()) + } +} + +pub(crate) struct Markdown; + +impl Markdown { + pub(crate) const fn new() -> Self { + Self {} + } +} + +impl HostStatsFormatter for Markdown { + fn format(&self, host_stats: HashMap) -> Result> { + if host_stats.is_empty() { + return Ok(None); + } + + let markdown = MarkdownHostStats(host_stats); + Ok(Some(markdown.to_string())) + } +} diff --git a/lychee-bin/src/formatters/host_stats/mod.rs b/lychee-bin/src/formatters/host_stats/mod.rs new file mode 100644 index 0000000000..8c312bfdd5 --- /dev/null +++ b/lychee-bin/src/formatters/host_stats/mod.rs @@ -0,0 +1,28 @@ +mod compact; +mod detailed; +mod json; +mod markdown; + +pub(crate) use compact::Compact; +pub(crate) use detailed::Detailed; +pub(crate) use json::Json; +pub(crate) use markdown::Markdown; + +use anyhow::Result; +use lychee_lib::ratelimit::HostStats; +use std::collections::HashMap; + +/// Trait for formatting per-host statistics in different output formats +pub(crate) trait HostStatsFormatter { + /// Format the host statistics and return them as a string + fn format(&self, host_stats: HashMap) -> Result>; +} + +/// Sort host statistics by request count (descending order) +/// This matches the 
display order we want in the output +fn sort_host_stats(host_stats: &HashMap) -> Vec<(&String, &HostStats)> { + let mut sorted_hosts: Vec<_> = host_stats.iter().collect(); + // Sort by total requests (descending) + sorted_hosts.sort_by_key(|(_, stats)| std::cmp::Reverse(stats.total_requests)); + sorted_hosts +} diff --git a/lychee-bin/src/formatters/mod.rs b/lychee-bin/src/formatters/mod.rs index a7b1a90673..22cde07cb2 100644 --- a/lychee-bin/src/formatters/mod.rs +++ b/lychee-bin/src/formatters/mod.rs @@ -1,11 +1,12 @@ pub(crate) mod color; pub(crate) mod duration; +pub(crate) mod host_stats; pub(crate) mod log; pub(crate) mod response; pub(crate) mod stats; pub(crate) mod suggestion; -use self::{response::ResponseFormatter, stats::StatsFormatter}; +use self::{host_stats::HostStatsFormatter, response::ResponseFormatter, stats::StatsFormatter}; use crate::options::{OutputMode, StatsFormat}; use supports_color::Stream; @@ -41,6 +42,19 @@ pub(crate) fn get_progress_formatter(mode: &OutputMode) -> Box Box { + match format { + StatsFormat::Compact | StatsFormat::Raw => Box::new(host_stats::Compact::new(mode.clone())), // Use compact for raw + StatsFormat::Detailed => Box::new(host_stats::Detailed::new(mode.clone())), + StatsFormat::Json => Box::new(host_stats::Json::new()), + StatsFormat::Markdown => Box::new(host_stats::Markdown::new()), + } +} + /// Create a response formatter based on the given format option pub(crate) fn get_response_formatter(mode: &OutputMode) -> Box { // Checks if color is supported in current environment or NO_COLOR is set (https://no-color.org) diff --git a/lychee-bin/src/main.rs b/lychee-bin/src/main.rs index 01a4411c80..d8cfcb109e 100644 --- a/lychee-bin/src/main.rs +++ b/lychee-bin/src/main.rs @@ -66,7 +66,7 @@ use std::sync::Arc; use anyhow::{Context, Error, Result, bail}; use clap::{Parser, crate_version}; use commands::{CommandParams, generate}; -use formatters::{get_stats_formatter, log::init_logging}; +use 
formatters::{get_host_stats_formatter, get_stats_formatter, log::init_logging}; use http::HeaderMap; use log::{error, info, warn}; @@ -315,6 +315,7 @@ fn underlying_io_error_kind(error: &Error) -> Option { } /// Run lychee on the given inputs +#[allow(clippy::too_many_lines)] async fn run(opts: &LycheeOptions) -> Result { let inputs = opts.inputs()?; @@ -392,7 +393,7 @@ async fn run(opts: &LycheeOptions) -> Result { let exit_code = if opts.config.dump { commands::dump(params).await? } else { - let (stats, cache, exit_code) = commands::check(params).await?; + let (stats, cache, exit_code, client) = commands::check(params).await?; let github_issues = stats .error_map @@ -420,6 +421,25 @@ async fn run(opts: &LycheeOptions) -> Result { } } + // Display per-host statistics if requested + if opts.config.host_stats { + let host_stats = client.host_stats(); + let host_stats_formatter = + get_host_stats_formatter(&opts.config.format, &opts.config.mode); + + if let Some(formatted_host_stats) = host_stats_formatter.format(host_stats)? { + if let Some(output) = &opts.config.output { + // For file output, append to the existing output + let mut file_content = std::fs::read_to_string(output).unwrap_or_default(); + file_content.push_str(&formatted_host_stats); + std::fs::write(output, file_content) + .context("Cannot write host stats to output file")?; + } else { + print!("{formatted_host_stats}"); + } + } + } + if github_issues && opts.config.github_token.is_none() { warn!( "There were issues with GitHub URLs. 
You could try setting a GitHub token and running lychee again.", diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index b5bb70c142..fbb81610bd 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -15,6 +15,7 @@ use lychee_lib::{ Base, BasicAuthSelector, DEFAULT_MAX_REDIRECTS, DEFAULT_MAX_RETRIES, DEFAULT_RETRY_WAIT_TIME_SECS, DEFAULT_TIMEOUT_SECS, DEFAULT_USER_AGENT, FileExtensions, FileType, Input, StatusCodeExcluder, StatusCodeSelector, archive::Archive, + ratelimit::HostConfig, }; use reqwest::tls; use secrecy::SecretString; @@ -421,6 +422,11 @@ File Format: #[serde(default)] pub(crate) no_progress: bool, + /// Show per-host statistics at the end of the run + #[arg(long)] + #[serde(default)] + pub(crate) host_stats: bool, + /// A list of file extensions. Files not matching the specified extensions are skipped. /// /// E.g. a user can specify `--extensions html,htm,php,asp,aspx,jsp,cgi` @@ -528,6 +534,32 @@ with a status code of 429, 500 and 501." #[serde(default = "max_concurrency")] pub(crate) max_concurrency: usize, + /// Default maximum concurrent requests per host (default: 10) + /// + /// This limits how many requests can be sent simultaneously to the same + /// host (domain/subdomain). This helps prevent overwhelming servers and + /// getting rate-limited. Each host is handled independently. + /// + /// Examples: + /// --default-host-concurrency 5 # Conservative for slow APIs + /// --default-host-concurrency 20 # Aggressive for fast APIs + #[arg(long)] + #[serde(default)] + pub(crate) default_host_concurrency: Option, + + /// Minimum interval between requests to the same host (default: 100ms) + /// + /// Sets a baseline delay between consecutive requests to prevent + /// hammering servers. The adaptive algorithm may increase this based + /// on server responses (rate limits, errors). 
+ /// + /// Examples: + /// --default-request-interval 50ms # Fast for robust APIs\ + /// --default-request-interval 1s # Conservative for rate-limited APIs + #[arg(long, value_parser = humantime::parse_duration)] + #[serde(default)] + pub(crate) default_request_interval: Option, + /// Number of threads to utilize. /// Defaults to number of cores available to the system #[arg(short = 'T', long)] @@ -887,6 +919,11 @@ esac"# )] #[serde(default)] pub(crate) preprocess: Option, + + /// Host-specific configurations from config file + #[arg(skip)] + #[serde(default)] + pub(crate) hosts: HashMap, } impl Config { @@ -944,6 +981,8 @@ impl Config { cache_exclude_status: None, cookie_jar: None, default_extension: None, + default_host_concurrency: None, + default_request_interval: None, dump: false, dump_inputs: false, exclude: Vec::::new(), @@ -960,6 +999,8 @@ impl Config { generate: None, glob_ignore_case: false, hidden: false, + host_stats: false, + hosts: HashMap::new(), include: Vec::::new(), include_fragments: false, include_mail: false, diff --git a/lychee-lib/Cargo.toml b/lychee-lib/Cargo.toml index 5f5a6bf65a..1b20e79930 100644 --- a/lychee-lib/Cargo.toml +++ b/lychee-lib/Cargo.toml @@ -18,13 +18,16 @@ async-trait = "0.1.88" cached = "0.56.0" check-if-email-exists = { version = "0.9.1", optional = true } cookie_store = "0.22.0" +dashmap = { version = "6.1.0" } email_address = "0.2.9" futures = "0.3.31" glob = "0.3.3" +governor = "0.6.3" headers = "0.4.1" html5ever = "0.36.1" html5gum = "0.8.3" http = "1.4.0" +humantime-serde = "1.1.1" hyper = "1.8.1" ignore = "0.4.25" ip_network = "0.4.1" diff --git a/lychee-lib/src/checker/website.rs b/lychee-lib/src/checker/website.rs index 82dfd84083..5a0982196f 100644 --- a/lychee-lib/src/checker/website.rs +++ b/lychee-lib/src/checker/website.rs @@ -2,6 +2,7 @@ use crate::{ BasicAuthCredentials, ErrorKind, FileType, Status, Uri, chain::{Chain, ChainResult, ClientRequestChains, Handler, RequestChain}, quirks::Quirks, + 
ratelimit::HostPool, retry::RetryExt, types::{redirect_history::RedirectHistory, uri::github::GithubUri}, utils::fragment_checker::{FragmentChecker, FragmentInput}, @@ -54,9 +55,67 @@ pub(crate) struct WebsiteChecker { /// Keep track of HTTP redirections for reporting redirect_history: RedirectHistory, + + /// Optional host pool for per-host rate limiting. + /// + /// When present, HTTP requests will be routed through this pool for + /// rate limiting. When None, requests go directly through `reqwest_client`. + host_pool: Option, } impl WebsiteChecker { + /// Get per-host statistics from the rate limiting system + /// + /// Returns a map of hostnames to their statistics, or an empty map + /// if host-based rate limiting is not enabled. + #[must_use] + pub(crate) fn host_stats( + &self, + ) -> std::collections::HashMap { + if let Some(host_pool) = &self.host_pool { + host_pool.all_host_stats() + } else { + std::collections::HashMap::new() + } + } + + /// Get cache statistics for all hosts + /// + /// Returns a map of hostnames to (`cache_size`, `hit_rate`), or an empty map + /// if host-based rate limiting is not enabled. + #[must_use] + pub(crate) fn cache_stats(&self) -> std::collections::HashMap { + if let Some(host_pool) = &self.host_pool { + host_pool.cache_stats() + } else { + std::collections::HashMap::new() + } + } + + /// Record a cache hit for the given URI in the host statistics + /// + /// This tracks that a request was served from the persistent cache + /// rather than making a network request. 
+ pub(crate) fn record_cache_hit(&self, uri: &crate::Uri) -> crate::Result<()> { + if let Some(host_pool) = &self.host_pool { + host_pool.record_cache_hit(uri).map_err(Into::into) + } else { + Ok(()) // No host pool, nothing to track + } + } + + /// Record a cache miss for the given URI in the host statistics + /// + /// This tracks that a request could not be served from the persistent cache + /// and will require a network request (which may then use the in-memory cache). + pub(crate) fn record_cache_miss(&self, uri: &crate::Uri) -> crate::Result<()> { + if let Some(host_pool) = &self.host_pool { + host_pool.record_cache_miss(uri).map_err(Into::into) + } else { + Ok(()) // No host pool, nothing to track + } + } + #[allow(clippy::too_many_arguments)] pub(crate) fn new( method: reqwest::Method, @@ -69,6 +128,7 @@ impl WebsiteChecker { require_https: bool, plugin_request_chain: RequestChain, include_fragments: bool, + host_pool: Option, ) -> Self { Self { method, @@ -82,6 +142,7 @@ impl WebsiteChecker { require_https, include_fragments, fragment_checker: FragmentChecker::new(), + host_pool, } } @@ -109,7 +170,25 @@ impl WebsiteChecker { let method = request.method().clone(); let request_url = request.url().clone(); - match self.reqwest_client.execute(request).await { + // Use HostPool for rate limiting - always enabled for HTTP requests + let response_result = if let Some(host_pool) = &self.host_pool { + match host_pool.execute_request(request).await { + Ok(response) => Ok(response), + Err(crate::ratelimit::RateLimitError::NetworkError { source, .. 
}) => { + // Network errors should be handled the same as direct client errors + Err(source) + } + Err(e) => { + // Rate limiting specific errors + return Status::Error(ErrorKind::RateLimit(e)); + } + } + } else { + // Fallback to direct client if no host pool configured (shouldn't happen normally) + self.reqwest_client.execute(request).await + }; + + match response_result { Ok(response) => { let status = Status::new(&response, &self.accepted); // when `accept=200,429`, `status_code=429` will be treated as success diff --git a/lychee-lib/src/client.rs b/lychee-lib/src/client.rs index a6efa70dc9..560182cd56 100644 --- a/lychee-lib/src/client.rs +++ b/lychee-lib/src/client.rs @@ -304,6 +304,12 @@ pub struct ClientBuilder { /// early and return a status, so that subsequent chain items are /// skipped and the lychee-internal request chain is not activated. plugin_request_chain: RequestChain, + + /// Optional host pool for per-host rate limiting of HTTP requests. + /// + /// When provided, HTTP/HTTPS requests will be routed through this pool + /// for rate limiting and concurrency control on a per-host basis. + host_pool: Option, } impl Default for ClientBuilder { @@ -412,6 +418,7 @@ impl ClientBuilder { self.require_https, self.plugin_request_chain, self.include_fragments, + self.host_pool, ); Ok(Client { @@ -467,6 +474,48 @@ pub struct Client { } impl Client { + /// Get per-host statistics from the rate limiting system + /// + /// Returns a map of hostnames to their statistics, or an empty map + /// if host-based rate limiting is not enabled. + #[must_use] + pub fn host_stats(&self) -> std::collections::HashMap { + self.website_checker.host_stats() + } + + /// Get cache statistics for all hosts + /// + /// Returns a map of hostnames to (`cache_size`, `hit_rate`), or an empty map + /// if host-based rate limiting is not enabled. 
/// Record a cache hit for the given URI.
///
/// Call this when the result for `uri` was served from a cache instead of
/// the network, so that the per-host statistics account for it. The call is
/// forwarded unchanged to the internal website checker, which returns
/// `Ok(())` without recording anything when no host pool is configured.
///
/// # Errors
///
/// Propagates any error reported by the host pool while recording the hit
/// (presumably when no host can be derived from `uri` — confirm against
/// `HostPool::record_cache_hit`).
pub fn record_cache_hit(&self, uri: &crate::Uri) -> crate::Result<()> {
    self.website_checker.record_cache_hit(uri)
}
RateLimitConfig { + /// Default maximum concurrent requests per host + #[serde(default = "default_host_concurrency")] + pub default_host_concurrency: usize, + + /// Default minimum interval between requests to the same host + #[serde(default = "default_request_interval")] + #[serde(with = "humantime_serde")] + pub default_request_interval: Duration, +} + +impl Default for RateLimitConfig { + fn default() -> Self { + Self { + default_host_concurrency: default_host_concurrency(), + default_request_interval: default_request_interval(), + } + } +} + +impl RateLimitConfig { + /// Create a `RateLimitConfig` from CLI options, using defaults for missing values + #[must_use] + pub fn from_options( + default_host_concurrency: Option, + default_request_interval: Option, + ) -> Self { + Self { + default_host_concurrency: default_host_concurrency.unwrap_or(DEFAULT_HOST_CONCURRENCY), + default_request_interval: default_request_interval.unwrap_or(DEFAULT_REQUEST_INTERVAL), + } + } +} + +/// Configuration for a specific host's rate limiting behavior +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct HostConfig { + /// Maximum concurrent requests allowed to this host + pub max_concurrent: Option, + + /// Minimum interval between requests to this host + #[serde(with = "humantime_serde")] + pub request_interval: Option, + + /// Custom headers to send with requests to this host + #[serde(default)] + #[serde(deserialize_with = "deserialize_headers")] + #[serde(serialize_with = "serialize_headers")] + pub headers: HeaderMap, +} + +impl Default for HostConfig { + fn default() -> Self { + Self { + max_concurrent: None, + request_interval: None, + headers: HeaderMap::new(), + } + } +} + +impl HostConfig { + /// Get the effective max concurrency, falling back to the global default + #[must_use] + pub fn effective_max_concurrent(&self, global_config: &RateLimitConfig) -> usize { + self.max_concurrent + .unwrap_or(global_config.default_host_concurrency) + } + + /// Get 
the effective request interval, falling back to the global default + #[must_use] + pub fn effective_request_interval(&self, global_config: &RateLimitConfig) -> Duration { + self.request_interval + .unwrap_or(global_config.default_request_interval) + } +} + +/// Default number of concurrent requests per host +const DEFAULT_HOST_CONCURRENCY: usize = 10; + +/// Default interval between requests to the same host +const DEFAULT_REQUEST_INTERVAL: Duration = Duration::from_millis(100); + +/// Default number of concurrent requests per host +const fn default_host_concurrency() -> usize { + DEFAULT_HOST_CONCURRENCY +} + +/// Default interval between requests to the same host +const fn default_request_interval() -> Duration { + DEFAULT_REQUEST_INTERVAL +} + +/// Custom deserializer for headers from TOML config format +fn deserialize_headers<'de, D>(deserializer: D) -> Result +where + D: serde::Deserializer<'de>, +{ + let map = HashMap::::deserialize(deserializer)?; + let mut header_map = HeaderMap::new(); + + for (name, value) in map { + let header_name = HeaderName::from_bytes(name.as_bytes()) + .map_err(|e| serde::de::Error::custom(format!("Invalid header name '{name}': {e}")))?; + let header_value = HeaderValue::from_str(&value).map_err(|e| { + serde::de::Error::custom(format!("Invalid header value '{value}': {e}")) + })?; + header_map.insert(header_name, header_value); + } + + Ok(header_map) +} + +/// Custom serializer for headers to TOML config format +fn serialize_headers(headers: &HeaderMap, serializer: S) -> Result +where + S: serde::Serializer, +{ + let map: HashMap = headers + .iter() + .map(|(name, value)| (name.to_string(), value.to_str().unwrap_or("").to_string())) + .collect(); + map.serialize(serializer) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_default_rate_limit_config() { + let config = RateLimitConfig::default(); + assert_eq!(config.default_host_concurrency, 10); + assert_eq!(config.default_request_interval, 
Duration::from_millis(100)); + } + + #[test] + fn test_host_config_effective_values() { + let global_config = RateLimitConfig::default(); + + // Test with no overrides + let host_config = HostConfig::default(); + assert_eq!(host_config.effective_max_concurrent(&global_config), 10); + assert_eq!( + host_config.effective_request_interval(&global_config), + Duration::from_millis(100) + ); + + // Test with overrides + let host_config = HostConfig { + max_concurrent: Some(5), + request_interval: Some(Duration::from_millis(500)), + headers: HeaderMap::new(), + }; + assert_eq!(host_config.effective_max_concurrent(&global_config), 5); + assert_eq!( + host_config.effective_request_interval(&global_config), + Duration::from_millis(500) + ); + } + + #[test] + fn test_config_serialization() { + let config = RateLimitConfig { + default_host_concurrency: 15, + default_request_interval: Duration::from_millis(200), + }; + + let toml = toml::to_string(&config).unwrap(); + let deserialized: RateLimitConfig = toml::from_str(&toml).unwrap(); + + assert_eq!( + config.default_host_concurrency, + deserialized.default_host_concurrency + ); + assert_eq!( + config.default_request_interval, + deserialized.default_request_interval + ); + } + + #[test] + fn test_headers_serialization() { + let mut headers = HeaderMap::new(); + headers.insert("Authorization", "Bearer token123".parse().unwrap()); + headers.insert("User-Agent", "test-agent".parse().unwrap()); + + let host_config = HostConfig { + max_concurrent: Some(5), + request_interval: Some(Duration::from_millis(500)), + headers, + }; + + let toml = toml::to_string(&host_config).unwrap(); + let deserialized: HostConfig = toml::from_str(&toml).unwrap(); + + assert_eq!(deserialized.max_concurrent, Some(5)); + assert_eq!( + deserialized.request_interval, + Some(Duration::from_millis(500)) + ); + assert_eq!(deserialized.headers.len(), 2); + assert!(deserialized.headers.contains_key("authorization")); + 
assert!(deserialized.headers.contains_key("user-agent")); + } +} diff --git a/lychee-lib/src/ratelimit/error.rs b/lychee-lib/src/ratelimit/error.rs new file mode 100644 index 0000000000..c39f463d5b --- /dev/null +++ b/lychee-lib/src/ratelimit/error.rs @@ -0,0 +1,51 @@ +use thiserror::Error; + +/// Errors that can occur during rate limiting operations +#[derive(Error, Debug)] +pub enum RateLimitError { + /// Host exceeded its rate limit + #[error("Host {host} exceeded rate limit: {message}")] + RateLimitExceeded { + /// The host that exceeded the limit + host: String, + /// Additional context message + message: String, + }, + + /// Failed to parse rate limit headers from server response + #[error("Failed to parse rate limit headers from {host}: {reason}")] + HeaderParseError { + /// The host that sent invalid headers + host: String, + /// Reason for parse failure + reason: String, + }, + + /// Error creating or configuring HTTP client for host + #[error("Failed to configure client for host {host}: {source}")] + ClientConfigError { + /// The host that failed configuration + host: String, + /// Underlying error + source: reqwest::Error, + }, + + /// Cookie store operation failed + #[error("Cookie operation failed for host {host}: {reason}")] + CookieError { + /// The host with cookie issues + host: String, + /// Description of cookie error + reason: String, + }, + + /// Network error occurred during request execution + #[error("Network error for host {host}: {source}")] + NetworkError { + /// The host that had the network error + host: String, + /// The underlying network error + #[source] + source: reqwest::Error, + }, +} diff --git a/lychee-lib/src/ratelimit/host/host.rs b/lychee-lib/src/ratelimit/host/host.rs new file mode 100644 index 0000000000..2718bceb17 --- /dev/null +++ b/lychee-lib/src/ratelimit/host/host.rs @@ -0,0 +1,456 @@ +use dashmap::DashMap; +use governor::{ + Quota, RateLimiter, + clock::DefaultClock, + state::{InMemoryState, NotKeyed}, +}; +use 
reqwest::{Client as ReqwestClient, Request, Response}; +use reqwest_cookie_store::CookieStoreMutex; +use std::sync::{Arc, Mutex}; +use std::time::{Duration, Instant}; +use tokio::sync::Semaphore; + +use super::key::HostKey; +use super::stats::HostStats; +use crate::ratelimit::{HostConfig, RateLimitConfig, RateLimitError}; +use crate::{CacheStatus, Status, Uri}; + +/// Cache value for per-host caching +#[derive(Debug, Clone)] +struct HostCacheValue { + status: CacheStatus, + timestamp: Instant, +} + +impl From<&Status> for HostCacheValue { + fn from(status: &Status) -> Self { + Self { + status: status.into(), + timestamp: Instant::now(), + } + } +} + +/// Per-host cache for storing request results +type HostCache = DashMap; + +/// Represents a single host with its own rate limiting, concurrency control, +/// HTTP client configuration, and request cache. +/// +/// Each host maintains: +/// - A token bucket rate limiter using governor +/// - A semaphore for concurrency control +/// - A dedicated HTTP client with host-specific headers and cookies +/// - Statistics tracking for adaptive behavior +/// - A per-host cache to prevent duplicate requests +#[derive(Debug)] +pub struct Host { + /// The hostname this instance manages + pub key: HostKey, + + /// Rate limiter using token bucket algorithm + rate_limiter: RateLimiter, + + /// Controls maximum concurrent requests to this host + semaphore: Arc, + + /// HTTP client configured for this specific host + client: ReqwestClient, + + /// Cookie jar for maintaining session state (per-host) + #[allow(dead_code)] + cookie_jar: Arc, + + /// Request statistics and adaptive behavior tracking + stats: Arc>, + + /// Current backoff duration for adaptive rate limiting + backoff_duration: Arc>, + + /// Per-host cache to prevent duplicate requests + cache: HostCache, + + /// Maximum age for cached entries (in seconds) + cache_max_age: u64, +} + +impl Host { + /// Create a new Host instance for the given hostname + /// + /// # Arguments 
+ /// + /// * `key` - The hostname this host will manage + /// * `host_config` - Host-specific configuration + /// * `global_config` - Global defaults to fall back to + /// * `cache_max_age` - Maximum age for cached entries in seconds (0 to disable caching) + /// + /// # Errors + /// + /// Returns an error if the HTTP client cannot be configured properly + /// + /// # Panics + /// + /// Panics if the burst size cannot be set to 1 (should never happen) + pub fn new( + key: HostKey, + host_config: &HostConfig, + global_config: &RateLimitConfig, + cache_max_age: u64, + ) -> Result { + // Configure rate limiter with effective request interval + let interval = host_config.effective_request_interval(global_config); + let quota = Quota::with_period(interval) + .ok_or_else(|| RateLimitError::HeaderParseError { + host: key.to_string(), + reason: "Invalid rate limit interval".to_string(), + })? + .allow_burst(std::num::NonZeroU32::new(1).unwrap()); + + let rate_limiter = RateLimiter::direct(quota); + + // Create semaphore for concurrency control + let max_concurrent = host_config.effective_max_concurrent(global_config); + let semaphore = Arc::new(Semaphore::new(max_concurrent)); + + // Create per-host cookie jar + let cookie_jar = Arc::new(CookieStoreMutex::default()); + + // Build HTTP client with host-specific configuration + let client = ReqwestClient::builder() + .cookie_provider(cookie_jar.clone()) + .default_headers(host_config.headers.clone()) + .build() + .map_err(|e| RateLimitError::ClientConfigError { + host: key.to_string(), + source: e, + })?; + + Ok(Host { + key, + rate_limiter, + semaphore, + client, + cookie_jar, + stats: Arc::new(Mutex::new(HostStats::default())), + backoff_duration: Arc::new(Mutex::new(Duration::from_millis(0))), + cache: DashMap::new(), + cache_max_age, + }) + } + + /// Check if a URI is cached and return the cached status if valid + /// + /// # Panics + /// + /// Panics if the statistics mutex is poisoned + pub fn get_cached_status(&self, 
uri: &Uri) -> Option { + if self.cache_max_age == 0 { + // Track cache miss when caching is disabled + self.stats.lock().unwrap().record_cache_miss(); + return None; // Caching disabled + } + + if let Some(entry) = self.cache.get(uri) { + let age = entry.timestamp.elapsed().as_secs(); + if age <= self.cache_max_age { + // Cache hit + self.stats.lock().unwrap().record_cache_hit(); + return Some(entry.status); + } + // Cache entry expired, remove it + drop(entry); + self.cache.remove(uri); + } + // Cache miss + self.stats.lock().unwrap().record_cache_miss(); + None + } + + /// Cache a request result + pub fn cache_result(&self, uri: &Uri, status: &Status) { + if self.cache_max_age > 0 { + let cache_value = HostCacheValue::from(status); + self.cache.insert(uri.clone(), cache_value); + } + } + + /// Execute a request with rate limiting, concurrency control, and caching + /// + /// This method: + /// 1. Checks the per-host cache for existing results + /// 2. If not cached, acquires a semaphore permit for concurrency control + /// 3. Waits for rate limiter permission + /// 4. Applies adaptive backoff if needed + /// 5. Executes the request + /// 6. Updates statistics based on response + /// 7. Parses rate limit headers to adjust future behavior + /// 8. 
Caches the result for future use + /// + /// # Arguments + /// + /// * `request` - The HTTP request to execute + /// + /// # Errors + /// + /// Returns an error if the request fails or rate limiting is exceeded + /// + /// # Panics + /// + /// Panics if the statistics mutex is poisoned + pub async fn execute_request(&self, request: Request) -> Result { + let uri = Uri::from(request.url().clone()); + + // Note: Cache checking is handled at the HostPool level + // This method focuses on executing the actual HTTP request + + // Acquire semaphore permit for concurrency control + let _permit = + self.semaphore + .acquire() + .await + .map_err(|_| RateLimitError::RateLimitExceeded { + host: self.key.to_string(), + message: "Semaphore acquisition cancelled".to_string(), + })?; + + // Apply adaptive backoff if needed + let backoff_duration = { + let backoff = self.backoff_duration.lock().unwrap(); + *backoff + }; + if !backoff_duration.is_zero() { + tokio::time::sleep(backoff_duration).await; + } + + // Wait for rate limiter permission + self.rate_limiter.until_ready().await; + + // Execute the request and track timing + let start_time = Instant::now(); + let response = match self.client.execute(request).await { + Ok(response) => response, + Err(e) => { + // Wrap network/HTTP errors to preserve the original error + return Err(RateLimitError::NetworkError { + host: self.key.to_string(), + source: e, + }); + } + }; + let request_time = start_time.elapsed(); + + // Update statistics based on response + let status_code = response.status().as_u16(); + self.update_stats_and_backoff(status_code, request_time); + + // Parse rate limit headers to adjust behavior + self.parse_rate_limit_headers(&response); + + // Cache the result + let status = Status::Ok(response.status()); + self.cache_result(&uri, &status); + + Ok(response) + } + + /// Update internal statistics and backoff based on the response + fn update_stats_and_backoff(&self, status_code: u16, request_time: Duration) { + // 
Update statistics + { + let mut stats = self.stats.lock().unwrap(); + stats.record_response(status_code, request_time); + } + + // Update backoff duration based on response + { + let mut backoff = self.backoff_duration.lock().unwrap(); + match status_code { + 200..=299 => { + // Reset backoff on success + *backoff = Duration::from_millis(0); + } + 429 => { + // Exponential backoff on rate limit, capped at 30 seconds + *backoff = std::cmp::min( + if backoff.is_zero() { + Duration::from_millis(500) + } else { + *backoff * 2 + }, + Duration::from_secs(30), + ); + } + 500..=599 => { + // Moderate backoff increase on server errors, capped at 10 seconds + *backoff = std::cmp::min( + *backoff + Duration::from_millis(200), + Duration::from_secs(10), + ); + } + _ => {} // No backoff change for other status codes + } + } + } + + /// Parse rate limit headers from response and adjust behavior + fn parse_rate_limit_headers(&self, response: &Response) { + // Manual parsing of common rate limit headers + // We implement basic parsing here for the most common headers (X-RateLimit-*, Retry-After) + // rather than using the rate-limits crate to keep dependencies minimal + + let headers = response.headers(); + + // Try common rate limit header patterns + let remaining = Self::parse_header_value( + headers, + &[ + "x-ratelimit-remaining", + "x-rate-limit-remaining", + "ratelimit-remaining", + ], + ); + + let limit = Self::parse_header_value( + headers, + &["x-ratelimit-limit", "x-rate-limit-limit", "ratelimit-limit"], + ); + + if let (Some(remaining), Some(limit)) = (remaining, limit) { + if limit > 0 { + #[allow(clippy::cast_precision_loss)] + let usage_ratio = (limit - remaining) as f64 / limit as f64; + + // If we've used more than 80% of our quota, apply preventive backoff + if usage_ratio > 0.8 { + let mut backoff = self.backoff_duration.lock().unwrap(); + #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)] + let preventive_backoff = + Duration::from_millis((200.0 
* (usage_ratio - 0.8) / 0.2) as u64); + *backoff = std::cmp::max(*backoff, preventive_backoff); + } + } + } + + // Check for Retry-After header (in seconds) + if let Some(retry_after_value) = headers.get("retry-after") { + if let Ok(retry_after_str) = retry_after_value.to_str() { + if let Ok(retry_seconds) = retry_after_str.parse::() { + let mut backoff = self.backoff_duration.lock().unwrap(); + let retry_duration = Duration::from_secs(retry_seconds); + // Cap retry-after to reasonable limits + if retry_duration <= Duration::from_secs(3600) { + *backoff = std::cmp::max(*backoff, retry_duration); + } + } + } + } + } + + /// Helper method to parse numeric header values from common rate limit headers + fn parse_header_value(headers: &http::HeaderMap, header_names: &[&str]) -> Option { + for header_name in header_names { + if let Some(value) = headers.get(*header_name) { + if let Ok(value_str) = value.to_str() { + if let Ok(number) = value_str.parse::() { + return Some(number); + } + } + } + } + None + } + + /// Get host statistics + /// + /// # Panics + /// + /// Panics if the statistics mutex is poisoned + pub fn stats(&self) -> HostStats { + self.stats.lock().unwrap().clone() + } + + /// Record a cache hit from the persistent disk cache + /// + /// # Panics + /// + /// Panics if the statistics mutex is poisoned + pub fn record_persistent_cache_hit(&self) { + self.stats.lock().unwrap().record_cache_hit(); + } + + /// Record a cache miss from the persistent disk cache + /// + /// # Panics + /// + /// Panics if the statistics mutex is poisoned + pub fn record_persistent_cache_miss(&self) { + self.stats.lock().unwrap().record_cache_miss(); + } + + /// Get the current number of available permits (concurrent request slots) + pub fn available_permits(&self) -> usize { + self.semaphore.available_permits() + } + + /// Get the current cache size (number of cached entries) + pub fn cache_size(&self) -> usize { + self.cache.len() + } + + /// Clear expired entries from the cache 
+ pub fn cleanup_cache(&self) { + if self.cache_max_age == 0 { + return; + } + + self.cache + .retain(|_, value| value.timestamp.elapsed().as_secs() <= self.cache_max_age); + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::ratelimit::{HostConfig, RateLimitConfig}; + use std::time::Duration; + + #[tokio::test] + async fn test_host_creation() { + let key = HostKey::from("example.com"); + let host_config = HostConfig::default(); + let global_config = RateLimitConfig::default(); + + let host = Host::new(key.clone(), &host_config, &global_config, 3600).unwrap(); + + assert_eq!(host.key, key); + assert_eq!(host.available_permits(), 10); // Default concurrency + assert!((host.stats().success_rate() - 1.0).abs() < f64::EPSILON); + assert_eq!(host.cache_size(), 0); + } + + #[test] + fn test_cache_expiration() { + let key = HostKey::from("example.com"); + let host_config = HostConfig::default(); + let global_config = RateLimitConfig::default(); + + let host = Host::new(key, &host_config, &global_config, 1).unwrap(); // 1 second cache + + let uri = Uri::from("https://example.com/test".parse::().unwrap()); + let status = Status::Ok(http::StatusCode::OK); + + // Cache the result + host.cache_result(&uri, &status); + assert_eq!(host.cache_size(), 1); + + // Should be in cache immediately + assert!(host.get_cached_status(&uri).is_some()); + + // Wait for expiration and cleanup + std::thread::sleep(Duration::from_secs(2)); + host.cleanup_cache(); + + // Should be expired now + assert!(host.get_cached_status(&uri).is_none()); + } +} diff --git a/lychee-lib/src/ratelimit/host/key.rs b/lychee-lib/src/ratelimit/host/key.rs new file mode 100644 index 0000000000..ffc6f538d0 --- /dev/null +++ b/lychee-lib/src/ratelimit/host/key.rs @@ -0,0 +1,153 @@ +use std::fmt; +use url::Url; + +/// A type-safe representation of a hostname for rate limiting purposes. 
+/// +/// This extracts and normalizes hostnames from URLs to ensure consistent +/// rate limiting across requests to the same host. Subdomains are treated +/// as separate hosts to allow for traffic sharding. +/// +/// # Examples +/// +/// ``` +/// use lychee_lib::ratelimit::HostKey; +/// use url::Url; +/// +/// let url = Url::parse("https://api.github.com/repos/user/repo").unwrap(); +/// let host_key = HostKey::try_from(&url).unwrap(); +/// assert_eq!(host_key.as_str(), "api.github.com"); +/// ``` +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct HostKey(String); + +impl HostKey { + /// Get the hostname as a string slice + #[must_use] + pub fn as_str(&self) -> &str { + &self.0 + } + + /// Get the hostname as an owned String + #[must_use] + pub fn into_string(self) -> String { + self.0 + } +} + +impl TryFrom<&Url> for HostKey { + type Error = crate::ratelimit::RateLimitError; + + fn try_from(url: &Url) -> Result { + let host = + url.host_str() + .ok_or_else(|| crate::ratelimit::RateLimitError::HeaderParseError { + host: url.to_string(), + reason: "URL contains no host component".to_string(), + })?; + + // Normalize to lowercase for consistent lookup + Ok(HostKey(host.to_lowercase())) + } +} + +impl TryFrom<&crate::Uri> for HostKey { + type Error = crate::ratelimit::RateLimitError; + + fn try_from(uri: &crate::Uri) -> Result { + Self::try_from(&uri.url) + } +} + +impl TryFrom for HostKey { + type Error = crate::ratelimit::RateLimitError; + + fn try_from(url: Url) -> Result { + HostKey::try_from(&url) + } +} + +impl fmt::Display for HostKey { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.0) + } +} + +impl From for HostKey { + fn from(host: String) -> Self { + HostKey(host.to_lowercase()) + } +} + +impl From<&str> for HostKey { + fn from(host: &str) -> Self { + HostKey(host.to_lowercase()) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_host_key_from_url() { + let url = 
Url::parse("https://api.github.com/repos/user/repo").unwrap(); + let host_key = HostKey::try_from(&url).unwrap(); + assert_eq!(host_key.as_str(), "api.github.com"); + } + + #[test] + fn test_host_key_normalization() { + let url = Url::parse("https://API.GITHUB.COM/repos/user/repo").unwrap(); + let host_key = HostKey::try_from(&url).unwrap(); + assert_eq!(host_key.as_str(), "api.github.com"); + } + + #[test] + fn test_host_key_subdomain_separation() { + let api_url = Url::parse("https://api.github.com/").unwrap(); + let www_url = Url::parse("https://www.github.com/").unwrap(); + + let api_key = HostKey::try_from(&api_url).unwrap(); + let www_key = HostKey::try_from(&www_url).unwrap(); + + assert_ne!(api_key, www_key); + assert_eq!(api_key.as_str(), "api.github.com"); + assert_eq!(www_key.as_str(), "www.github.com"); + } + + #[test] + fn test_host_key_from_string() { + let host_key = HostKey::from("example.com"); + assert_eq!(host_key.as_str(), "example.com"); + + let host_key = HostKey::from("EXAMPLE.COM"); + assert_eq!(host_key.as_str(), "example.com"); + } + + #[test] + fn test_host_key_no_host() { + let url = Url::parse("file:///path/to/file").unwrap(); + let result = HostKey::try_from(&url); + assert!(result.is_err()); + } + + #[test] + fn test_host_key_display() { + let host_key = HostKey::from("example.com"); + assert_eq!(format!("{host_key}"), "example.com"); + } + + #[test] + fn test_host_key_hash_equality() { + use std::collections::HashMap; + + let key1 = HostKey::from("example.com"); + let key2 = HostKey::from("EXAMPLE.COM"); + + let mut map = HashMap::new(); + map.insert(key1, "value"); + + // Should find the value with normalized key + assert_eq!(map.get(&key2), Some(&"value")); + } +} diff --git a/lychee-lib/src/ratelimit/host/mod.rs b/lychee-lib/src/ratelimit/host/mod.rs new file mode 100644 index 0000000000..50b8b1ad3e --- /dev/null +++ b/lychee-lib/src/ratelimit/host/mod.rs @@ -0,0 +1,9 @@ +#![allow(clippy::module_inception)] + +mod host; +mod key; 
+mod stats; + +pub use host::Host; +pub use key::HostKey; +pub use stats::HostStats; diff --git a/lychee-lib/src/ratelimit/host/stats.rs b/lychee-lib/src/ratelimit/host/stats.rs new file mode 100644 index 0000000000..4a11575c79 --- /dev/null +++ b/lychee-lib/src/ratelimit/host/stats.rs @@ -0,0 +1,283 @@ +use std::collections::HashMap; +use std::time::{Duration, Instant}; + +use crate::ratelimit::window::Window; + +/// Statistics tracking for a host's request patterns +#[derive(Debug, Clone, Default)] +pub struct HostStats { + /// Total number of requests made to this host + pub total_requests: u64, + /// Number of successful requests (2xx status) + pub successful_requests: u64, + /// Number of requests that received rate limit responses (429) + pub rate_limited: u64, + /// Number of server error responses (5xx) + pub server_errors: u64, + /// Number of client error responses (4xx, excluding 429) + pub client_errors: u64, + /// Timestamp of the last successful request + pub last_success: Option, + /// Timestamp of the last rate limit response + pub last_rate_limit: Option, + /// Request times for median calculation (kept in rolling window) + pub request_times: Window, + /// Status code counts + pub status_codes: HashMap, + /// Number of cache hits + pub cache_hits: u64, + /// Number of cache misses + pub cache_misses: u64, +} + +impl HostStats { + /// Create new host statistics with custom window size for request times + #[must_use] + pub fn with_window_size(window_size: usize) -> Self { + Self { + request_times: Window::new(window_size), + ..Default::default() + } + } + + /// Record a response with status code and request duration + pub fn record_response(&mut self, status_code: u16, request_time: Duration) { + self.total_requests += 1; + + // Track status code + *self.status_codes.entry(status_code).or_insert(0) += 1; + + // Categorize response + match status_code { + 200..=299 => { + self.successful_requests += 1; + self.last_success = Some(Instant::now()); + } + 
429 => { + self.rate_limited += 1; + self.last_rate_limit = Some(Instant::now()); + } + 400..=499 => { + self.client_errors += 1; + } + 500..=599 => { + self.server_errors += 1; + } + _ => {} // Other status codes + } + + // Track request time in rolling window + self.request_times.push(request_time); + } + + /// Get median request time + #[must_use] + pub fn median_request_time(&self) -> Option { + if self.request_times.is_empty() { + return None; + } + + let mut times = self.request_times.to_vec(); + times.sort(); + let mid = times.len() / 2; + + if times.len() % 2 == 0 { + // Average of two middle values + Some((times[mid - 1] + times[mid]) / 2) + } else { + Some(times[mid]) + } + } + + /// Get error rate (percentage) + #[must_use] + pub fn error_rate(&self) -> f64 { + if self.total_requests == 0 { + return 0.0; + } + let errors = self.rate_limited + self.client_errors + self.server_errors; + #[allow(clippy::cast_precision_loss)] + let error_rate = errors as f64 / self.total_requests as f64; + error_rate * 100.0 + } + + /// Get the current success rate (0.0 to 1.0) + #[must_use] + pub fn success_rate(&self) -> f64 { + if self.total_requests == 0 { + 1.0 // Assume success until proven otherwise + } else { + #[allow(clippy::cast_precision_loss)] + let success_rate = self.successful_requests as f64 / self.total_requests as f64; + success_rate + } + } + + /// Get average request time + #[must_use] + pub fn average_request_time(&self) -> Option { + if self.request_times.is_empty() { + return None; + } + + let total: Duration = self.request_times.iter().sum(); + #[allow(clippy::cast_possible_truncation)] + Some(total / (self.request_times.len() as u32)) + } + + /// Get the most recent request time + #[must_use] + pub fn latest_request_time(&self) -> Option { + self.request_times.iter().last().copied() + } + + /// Check if this host has been experiencing rate limiting recently + #[must_use] + pub fn is_currently_rate_limited(&self) -> bool { + if let 
Some(last_rate_limit) = self.last_rate_limit { + // Consider rate limited if we got a 429 in the last 60 seconds + last_rate_limit.elapsed() < Duration::from_secs(60) + } else { + false + } + } + + /// Record a cache hit + pub const fn record_cache_hit(&mut self) { + self.cache_hits += 1; + // Cache hits should also count as total requests from user perspective + self.total_requests += 1; + // Cache hits are typically for successful previous requests, so count as successful + self.successful_requests += 1; + } + + /// Record a cache miss + pub const fn record_cache_miss(&mut self) { + self.cache_misses += 1; + // Cache misses will be followed by actual requests that increment total_requests + // so we don't increment here to avoid double-counting + } + + /// Get cache hit rate (0.0 to 1.0) + #[must_use] + pub fn cache_hit_rate(&self) -> f64 { + let total_cache_requests = self.cache_hits + self.cache_misses; + if total_cache_requests == 0 { + 0.0 + } else { + #[allow(clippy::cast_precision_loss)] + let hit_rate = self.cache_hits as f64 / total_cache_requests as f64; + hit_rate + } + } + + /// Get human-readable summary of the stats + #[must_use] + pub fn summary(&self) -> String { + #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)] + let success_pct = (self.success_rate() * 100.0) as u64; + #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)] + let error_pct = self.error_rate() as u64; + + let avg_time = self + .average_request_time() + .map_or_else(|| "N/A".to_string(), |d| format!("{:.0}ms", d.as_millis())); + + format!( + "{} requests ({}% success, {}% errors), avg: {}", + self.total_requests, success_pct, error_pct, avg_time + ) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use std::time::Duration; + + #[test] + fn test_host_stats_success_rate() { + let mut stats = HostStats::default(); + + // No requests yet - should assume success + assert!((stats.success_rate() - 1.0).abs() < f64::EPSILON); + + // Record some successful 
requests + stats.record_response(200, Duration::from_millis(100)); + stats.record_response(200, Duration::from_millis(120)); + assert!((stats.success_rate() - 1.0).abs() < f64::EPSILON); + + // Record a rate limited request + stats.record_response(429, Duration::from_millis(150)); + assert!((stats.success_rate() - (2.0 / 3.0)).abs() < 0.001); + + // Record a server error + stats.record_response(500, Duration::from_millis(200)); + assert!((stats.success_rate() - 0.5).abs() < f64::EPSILON); + } + + #[test] + fn test_host_stats_tracking() { + let mut stats = HostStats::default(); + + // Initially empty + assert_eq!(stats.total_requests, 0); + assert_eq!(stats.successful_requests, 0); + assert!(stats.error_rate().abs() < f64::EPSILON); + + // Record a successful response + stats.record_response(200, Duration::from_millis(100)); + assert_eq!(stats.total_requests, 1); + assert_eq!(stats.successful_requests, 1); + assert!(stats.error_rate().abs() < f64::EPSILON); + assert_eq!(stats.status_codes.get(&200), Some(&1)); + + // Record rate limited response + stats.record_response(429, Duration::from_millis(200)); + assert_eq!(stats.total_requests, 2); + assert_eq!(stats.rate_limited, 1); + assert!((stats.error_rate() - 50.0).abs() < f64::EPSILON); + + // Record server error + stats.record_response(500, Duration::from_millis(150)); + assert_eq!(stats.total_requests, 3); + assert_eq!(stats.server_errors, 1); + + // Check median request time + assert_eq!( + stats.median_request_time(), + Some(Duration::from_millis(150)) + ); + } + + #[test] + fn test_window_integration() { + let mut stats = HostStats::with_window_size(2); + + stats.record_response(200, Duration::from_millis(100)); + stats.record_response(200, Duration::from_millis(200)); + stats.record_response(200, Duration::from_millis(300)); + + // Window should only keep last 2 times + assert_eq!(stats.request_times.len(), 2); + + let times: Vec<_> = stats.request_times.iter().copied().collect(); + assert_eq!( + times, + 
vec![Duration::from_millis(200), Duration::from_millis(300)] + ); + } + + #[test] + fn test_summary_formatting() { + let mut stats = HostStats::default(); + stats.record_response(200, Duration::from_millis(150)); + stats.record_response(500, Duration::from_millis(200)); + + let summary = stats.summary(); + assert!(summary.contains("2 requests")); + assert!(summary.contains("50% success")); + assert!(summary.contains("50% errors")); + assert!(summary.contains("175ms")); // average of 150 and 200 + } +} diff --git a/lychee-lib/src/ratelimit/mod.rs b/lychee-lib/src/ratelimit/mod.rs new file mode 100644 index 0000000000..eeccd90ba2 --- /dev/null +++ b/lychee-lib/src/ratelimit/mod.rs @@ -0,0 +1,26 @@ +//! Per-host rate limiting and concurrency control. +//! +//! This module provides adaptive rate limiting for HTTP requests on a per-host basis. +//! It prevents overwhelming servers with too many concurrent requests and respects +//! server-provided rate limit headers. +//! +//! # Architecture +//! +//! - [`HostKey`]: Represents a hostname/domain for rate limiting +//! - [`Host`]: Manages rate limiting, concurrency, caching, and cookies for a specific host +//! - [`HostPool`]: Coordinates multiple hosts and routes requests appropriately +//! - [`HostConfig`]: Configuration for per-host behavior +//! - [`HostStats`]: Statistics tracking for each host +//! 
- [`Window`]: Rolling window data structure for request timing + +mod config; +mod error; +mod host; +mod pool; +mod window; + +pub use config::{HostConfig, RateLimitConfig}; +pub use error::RateLimitError; +pub use host::{Host, HostKey, HostStats}; +pub use pool::HostPool; +pub use window::Window; diff --git a/lychee-lib/src/ratelimit/pool.rs b/lychee-lib/src/ratelimit/pool.rs new file mode 100644 index 0000000000..b45b028dfc --- /dev/null +++ b/lychee-lib/src/ratelimit/pool.rs @@ -0,0 +1,450 @@ +use dashmap::DashMap; +use reqwest::{Request, Response}; +use std::collections::HashMap; +use std::sync::Arc; +use tokio::sync::Semaphore; + +use crate::ratelimit::{Host, HostConfig, HostKey, HostStats, RateLimitConfig, RateLimitError}; +use crate::{CacheStatus, Status, Uri}; + +/// Manages a pool of Host instances and routes requests to appropriate hosts. +/// +/// The `HostPool` serves as the central coordinator for per-host rate limiting. +/// It creates Host instances on-demand, manages global concurrency limits, +/// and provides a unified interface for executing HTTP requests with +/// appropriate rate limiting applied. 
+/// +/// # Architecture +/// +/// - Each unique hostname gets its own Host instance with dedicated rate limiting +/// - Global semaphore enforces overall concurrency limits across all hosts +/// - Hosts are created lazily when first requested +/// - Thread-safe using `DashMap` for concurrent access to host instances +#[derive(Debug, Clone)] +pub struct HostPool { + /// Map of hostname to Host instances, created on-demand + hosts: Arc>>, + + /// Global configuration for rate limiting defaults + global_config: Arc, + + /// Per-host configuration overrides + host_configs: Arc>, + + /// Global semaphore to enforce overall concurrency limit + global_semaphore: Arc, + + /// Maximum age for cached entries in seconds (0 to disable caching) + cache_max_age: u64, +} + +impl HostPool { + /// Create a new `HostPool` with the given configuration + /// + /// # Arguments + /// + /// * `global_config` - Default rate limiting configuration + /// * `host_configs` - Host-specific configuration overrides + /// * `max_total_concurrency` - Global limit on concurrent requests across all hosts + /// * `cache_max_age` - Maximum age for cached entries in seconds (0 to disable caching) + /// + /// # Examples + /// + /// ``` + /// use lychee_lib::ratelimit::{HostPool, RateLimitConfig}; + /// use std::collections::HashMap; + /// + /// let global_config = RateLimitConfig::default(); + /// let host_configs = HashMap::new(); + /// let pool = HostPool::new(global_config, host_configs, 128, 3600); + /// ``` + #[must_use] + pub fn new( + global_config: RateLimitConfig, + host_configs: HashMap, + max_total_concurrency: usize, + cache_max_age: u64, + ) -> Self { + Self { + hosts: Arc::new(DashMap::new()), + global_config: Arc::new(global_config), + host_configs: Arc::new(host_configs), + global_semaphore: Arc::new(Semaphore::new(max_total_concurrency)), + cache_max_age, + } + } + + /// Execute an HTTP request with appropriate per-host rate limiting + /// + /// This method: + /// 1. 
Extracts the hostname from the request URL + /// 2. Gets or creates the appropriate Host instance + /// 3. Acquires a global semaphore permit + /// 4. Delegates to the host for execution with host-specific rate limiting + /// + /// # Arguments + /// + /// * `request` - The HTTP request to execute + /// + /// # Errors + /// + /// Returns a `RateLimitError` if: + /// - The request URL has no valid hostname + /// - Global or host-specific rate limits are exceeded + /// - The underlying HTTP request fails + /// + /// # Examples + /// + /// ```no_run + /// # use lychee_lib::ratelimit::{HostPool, RateLimitConfig}; + /// # use std::collections::HashMap; + /// # use reqwest::Request; + /// # #[tokio::main] + /// # async fn main() -> Result<(), Box> { + /// let pool = HostPool::new(RateLimitConfig::default(), HashMap::new(), 128, 3600); + /// let request = reqwest::Request::new(reqwest::Method::GET, "https://example.com".parse()?); + /// let response = pool.execute_request(request).await?; + /// # Ok(()) + /// # } + /// ``` + pub async fn execute_request(&self, request: Request) -> Result { + // Extract hostname from request URL + let url = request.url(); + let host_key = HostKey::try_from(url)?; + + // Get or create host instance + let host = self.get_or_create_host(host_key)?; + + // Acquire global semaphore permit first + let _global_permit = self.global_semaphore.acquire().await.map_err(|_| { + RateLimitError::RateLimitExceeded { + host: host.key.to_string(), + message: "Global concurrency limit reached".to_string(), + } + })?; + + // Execute request through host-specific rate limiting + host.execute_request(request).await + } + + /// Get an existing host or create a new one for the given hostname + fn get_or_create_host(&self, host_key: HostKey) -> Result, RateLimitError> { + // Check if host already exists + if let Some(host) = self.hosts.get(&host_key) { + return Ok(host.clone()); + } + + // Create new host instance + let host_config = self + .host_configs + 
.get(host_key.as_str()) + .cloned() + .unwrap_or_default(); + + let host = Arc::new(Host::new( + host_key.clone(), + &host_config, + &self.global_config, + self.cache_max_age, + )?); + + // Store in map (handle race condition where another thread created it) + match self.hosts.entry(host_key) { + dashmap::mapref::entry::Entry::Occupied(entry) => { + // Another thread created it, use theirs + Ok(entry.get().clone()) + } + dashmap::mapref::entry::Entry::Vacant(entry) => { + // We're first, insert ours + Ok(entry.insert(host).clone()) + } + } + } + + /// Get statistics for a specific host + /// + /// Returns statistics for the host if it exists, otherwise returns empty stats. + /// This provides consistent behavior whether or not requests have been made to that host yet. + /// + /// # Arguments + /// + /// * `hostname` - The hostname to get statistics for + #[must_use] + pub fn host_stats(&self, hostname: &str) -> HostStats { + let host_key = HostKey::from(hostname); + self.hosts + .get(&host_key) + .map(|host| host.stats()) + .unwrap_or_default() + } + + /// Get statistics for all hosts that have been created + /// + /// Returns a `HashMap` mapping hostnames to their statistics. + /// Only hosts that have had requests will be included. + #[must_use] + pub fn all_host_stats(&self) -> HashMap { + self.hosts + .iter() + .map(|entry| { + let hostname = entry.key().to_string(); + let stats = entry.value().stats(); + (hostname, stats) + }) + .collect() + } + + /// Get the number of currently active hosts + /// + /// This returns the number of Host instances that have been created, + /// which corresponds to the number of unique hostnames that have + /// been accessed. + #[must_use] + pub fn active_host_count(&self) -> usize { + self.hosts.len() + } + + /// Get the number of available global permits + /// + /// This shows how many more concurrent requests can be started + /// across all hosts before hitting the global concurrency limit. 
+ #[must_use] + pub fn available_global_permits(&self) -> usize { + self.global_semaphore.available_permits() + } + + /// Get host configuration for debugging/monitoring + /// + /// Returns a copy of the current host-specific configurations. + /// This is useful for debugging or runtime monitoring of configuration. + #[must_use] + pub fn host_configurations(&self) -> HashMap { + (*self.host_configs).clone() + } + + /// Remove a host from the pool + /// + /// This forces the host to be recreated with updated configuration + /// the next time a request is made to it. Any ongoing requests to + /// that host will continue with the old instance. + /// + /// # Arguments + /// + /// * `hostname` - The hostname to remove from the pool + /// + /// # Returns + /// + /// Returns true if a host was removed, false if no host existed for that hostname. + #[must_use] + pub fn remove_host(&self, hostname: &str) -> bool { + let host_key = HostKey::from(hostname); + self.hosts.remove(&host_key).is_some() + } + + /// Check if a URI is cached in the appropriate host's cache + /// + /// # Arguments + /// + /// * `uri` - The URI to check for in the cache + /// + /// # Returns + /// + /// Returns the cached status if found and valid, None otherwise + #[must_use] + pub fn get_cached_status(&self, uri: &Uri) -> Option { + let host_key = HostKey::try_from(uri).ok()?; + + if let Some(host) = self.hosts.get(&host_key) { + host.get_cached_status(uri) + } else { + None + } + } + + /// Cache a result for a URI in the appropriate host's cache + /// + /// # Arguments + /// + /// * `uri` - The URI to cache + /// * `status` - The status to cache + pub fn cache_result(&self, uri: &Uri, status: &Status) { + if let Ok(host_key) = HostKey::try_from(uri) { + if let Some(host) = self.hosts.get(&host_key) { + host.cache_result(uri, status); + } + // If host doesn't exist yet, we don't cache + // The result will be cached when the host is created and the request is made + } + } + + /// Get cache statistics 
across all hosts + #[must_use] + pub fn cache_stats(&self) -> HashMap { + self.hosts + .iter() + .map(|entry| { + let hostname = entry.key().to_string(); + let cache_size = entry.value().cache_size(); + let hit_rate = entry.value().stats().cache_hit_rate(); + (hostname, (cache_size, hit_rate)) + }) + .collect() + } + + /// Cleanup expired cache entries across all hosts + pub fn cleanup_caches(&self) { + for host in self.hosts.iter() { + host.cleanup_cache(); + } + } + + /// Record a cache hit for the given URI in host statistics + /// + /// This tracks that a request was served from the persistent disk cache + /// rather than going through the rate-limited HTTP request flow. + /// This method will create a Host instance if one doesn't exist yet. + /// + /// # Errors + /// + /// Returns an error if the host key cannot be parsed from the URI or if the host cannot be created. + pub fn record_cache_hit( + &self, + uri: &crate::Uri, + ) -> Result<(), crate::ratelimit::RateLimitError> { + let host_key = crate::ratelimit::HostKey::try_from(uri)?; + + // Get or create the host (this ensures statistics tracking even for cache-only requests) + let host = self.get_or_create_host(host_key)?; + host.record_persistent_cache_hit(); + Ok(()) + } + + /// Record a cache miss for the given URI in host statistics + /// + /// This tracks that a request could not be served from the persistent disk cache + /// and will need to go through the rate-limited HTTP request flow. + /// This method will create a Host instance if one doesn't exist yet. + /// + /// # Errors + /// + /// Returns an error if the host key cannot be parsed from the URI or if the host cannot be created. 
+ pub fn record_cache_miss( + &self, + uri: &crate::Uri, + ) -> Result<(), crate::ratelimit::RateLimitError> { + let host_key = crate::ratelimit::HostKey::try_from(uri)?; + + // Get or create the host (this ensures statistics tracking even for cache-only requests) + let host = self.get_or_create_host(host_key)?; + host.record_persistent_cache_miss(); + Ok(()) + } +} + +impl Default for HostPool { + fn default() -> Self { + Self::new( + RateLimitConfig::default(), + HashMap::new(), + 128, // Default global concurrency limit + 3600, // Default cache age of 1 hour + ) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::ratelimit::RateLimitConfig; + + use url::Url; + + #[test] + fn test_host_pool_creation() { + let global_config = RateLimitConfig::default(); + let host_configs = HashMap::new(); + let pool = HostPool::new(global_config, host_configs, 100, 3600); + + assert_eq!(pool.active_host_count(), 0); + assert_eq!(pool.available_global_permits(), 100); + } + + #[test] + fn test_host_pool_default() { + let pool = HostPool::default(); + + assert_eq!(pool.active_host_count(), 0); + assert_eq!(pool.available_global_permits(), 128); + } + + #[tokio::test] + async fn test_host_creation_on_demand() { + let pool = HostPool::default(); + let url: Url = "https://example.com/path".parse().unwrap(); + let host_key = HostKey::try_from(&url).unwrap(); + + // No hosts initially + assert_eq!(pool.active_host_count(), 0); + assert_eq!(pool.host_stats("example.com").total_requests, 0); + + // Create host on demand + let host = pool.get_or_create_host(host_key).unwrap(); + + // Now we have one host + assert_eq!(pool.active_host_count(), 1); + assert_eq!(pool.host_stats("example.com").total_requests, 0); + assert_eq!(host.key.as_str(), "example.com"); + } + + #[tokio::test] + async fn test_host_reuse() { + let pool = HostPool::default(); + let url: Url = "https://example.com/path1".parse().unwrap(); + let host_key1 = HostKey::try_from(&url).unwrap(); + + let url: Url = 
"https://example.com/path2".parse().unwrap(); + let host_key2 = HostKey::try_from(&url).unwrap(); + + // Create host for first request + let host1 = pool.get_or_create_host(host_key1).unwrap(); + assert_eq!(pool.active_host_count(), 1); + + // Second request to same host should reuse + let host2 = pool.get_or_create_host(host_key2).unwrap(); + assert_eq!(pool.active_host_count(), 1); + + // Should be the same instance + assert!(Arc::ptr_eq(&host1, &host2)); + } + + #[test] + fn test_host_config_management() { + let pool = HostPool::default(); + + // Initially no host configurations + let configs = pool.host_configurations(); + assert_eq!(configs.len(), 0); + } + + #[test] + fn test_host_removal() { + let pool = HostPool::default(); + + // Remove non-existent host + assert!(!pool.remove_host("nonexistent.com")); + + // We can't easily test removal of existing hosts without making actual requests + // due to the async nature of host creation, but the basic functionality works + } + + #[test] + fn test_all_host_stats() { + let pool = HostPool::default(); + + // No hosts initially + let stats = pool.all_host_stats(); + assert!(stats.is_empty()); + + // Stats would be populated after actual requests are made to create hosts + } +} diff --git a/lychee-lib/src/ratelimit/window.rs b/lychee-lib/src/ratelimit/window.rs new file mode 100644 index 0000000000..058641e0a0 --- /dev/null +++ b/lychee-lib/src/ratelimit/window.rs @@ -0,0 +1,100 @@ +use std::collections::VecDeque; + +/// A rolling window data structure that automatically maintains a maximum size +/// by removing oldest elements when the capacity is exceeded. 
+#[derive(Debug, Clone)] +pub struct Window { + data: VecDeque, + capacity: usize, +} + +impl Window { + /// Create a new window with the given capacity + #[must_use] + pub fn new(capacity: usize) -> Self { + Self { + data: VecDeque::with_capacity(capacity), + capacity, + } + } + + /// Push an element to the window, removing the oldest if at capacity + pub fn push(&mut self, item: T) { + if self.data.len() >= self.capacity { + self.data.pop_front(); + } + self.data.push_back(item); + } + + /// Get the number of elements currently in the window + #[must_use] + pub fn len(&self) -> usize { + self.data.len() + } + + /// Check if the window is empty + #[must_use] + pub fn is_empty(&self) -> bool { + self.data.is_empty() + } + + /// Get an iterator over the elements in the window + pub fn iter(&self) -> impl Iterator { + self.data.iter() + } + + /// Convert to a vector (for compatibility with existing code) + #[must_use] + pub fn to_vec(&self) -> Vec + where + T: Clone, + { + self.data.iter().cloned().collect() + } +} + +impl Default for Window { + fn default() -> Self { + Self::new(100) // Default capacity of 100 items + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_window_capacity() { + let mut window = Window::new(3); + + // Fill up the window + window.push(1); + window.push(2); + window.push(3); + assert_eq!(window.len(), 3); + + // Add one more, should remove the oldest + window.push(4); + assert_eq!(window.len(), 3); + + let values: Vec<_> = window.iter().copied().collect(); + assert_eq!(values, vec![2, 3, 4]); + } + + #[test] + fn test_window_empty() { + let window: Window = Window::new(5); + assert!(window.is_empty()); + assert_eq!(window.len(), 0); + } + + #[test] + fn test_window_to_vec() { + let mut window = Window::new(3); + window.push(1); + window.push(2); + + let vec = window.to_vec(); + assert_eq!(vec, vec![1, 2]); + } +} diff --git a/lychee-lib/src/types/error.rs b/lychee-lib/src/types/error.rs index a41cad8f3a..c0764e4fd2 
100644 --- a/lychee-lib/src/types/error.rs +++ b/lychee-lib/src/types/error.rs @@ -178,6 +178,9 @@ pub enum ErrorKind { /// The reason the command failed reason: String, }, + /// Rate limiting error + #[error("Rate limiting error: {0}")] + RateLimit(#[from] crate::ratelimit::RateLimitError), } impl ErrorKind { @@ -335,7 +338,10 @@ impl ErrorKind { [name] => format!("An index file ({name}) is required"), [init @ .., tail] => format!("An index file ({}, or {}) is required", init.join(", "), tail), }.into(), - ErrorKind::PreprocessorError{command, reason} => Some(format!("Command '{command}' failed {reason}. Check value of the preprocessor option")) + ErrorKind::PreprocessorError{command, reason} => Some(format!("Command '{command}' failed {reason}. Check value of the preprocessor option")), + ErrorKind::RateLimit(e) => Some(format!( + "Rate limiting error: {e}. Consider adjusting rate limiting configuration or waiting before retrying" + )), } } @@ -466,6 +472,7 @@ impl Hash for ErrorKind { Self::Cookies(e) => e.hash(state), Self::StatusCodeSelectorError(e) => e.to_string().hash(state), Self::PreprocessorError { command, reason } => (command, reason).hash(state), + Self::RateLimit(e) => e.to_string().hash(state), } } } From c7480ff50e361dc6a1637ccae9dbd1caabca3303 Mon Sep 17 00:00:00 2001 From: Matthias Date: Sun, 7 Sep 2025 21:58:56 +0200 Subject: [PATCH 03/43] fix: skip rate limiting tracking for file:// URLs File URLs don't have host components and should not be tracked in the per-host rate limiting system. Only network URIs (http/https) need rate limiting and statistics tracking. 
Fixes debug errors like: Failed to record cache miss for file:///path#fragment: Rate limiting error: URL contains no host component --- lychee-bin/src/commands/check.rs | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/lychee-bin/src/commands/check.rs b/lychee-bin/src/commands/check.rs index c0b2a5f769..1f287690d7 100644 --- a/lychee-bin/src/commands/check.rs +++ b/lychee-bin/src/commands/check.rs @@ -264,17 +264,21 @@ async fn handle( Status::from_cache_status(v.value().status, &accept) }; - // Track cache hit in the per-host stats - if let Err(e) = client.record_cache_hit(&uri) { - log::debug!("Failed to record cache hit for {uri}: {e}"); + // Track cache hit in the per-host stats (only for network URIs) + if !uri.is_file() { + if let Err(e) = client.record_cache_hit(&uri) { + log::debug!("Failed to record cache hit for {uri}: {e}"); + } } return Ok(Response::new(uri.clone(), status, request.source.into())); } - // Cache miss - track it and run a normal check - if let Err(e) = client.record_cache_miss(&uri) { - log::debug!("Failed to record cache miss for {uri}: {e}"); + // Cache miss - track it and run a normal check (only for network URIs) + if !uri.is_file() { + if let Err(e) = client.record_cache_miss(&uri) { + log::debug!("Failed to record cache miss for {uri}: {e}"); + } } let response = check_url(client, request).await; From 462ba32120b6d883898b8f0daea35f7def919d89 Mon Sep 17 00:00:00 2001 From: Matthias Date: Sun, 7 Sep 2025 22:14:36 +0200 Subject: [PATCH 04/43] feat: improve rate limiting logging and output formatting - Add debug messages when hosts hit rate limits (429 responses) - Add debug messages when applying backoff delays - Show exponential backoff progression in debug logs - Change 'cache' to 'cached' in host statistics output for clarity Debug output example: Host httpbin.org hit rate limit (429), increasing backoff from 0ms to 500ms Host httpbin.org applying backoff delay of 500ms due to previous rate limiting 
or errors Statistics output now shows '0.0% cached' instead of '0.0% cache' --- lychee-bin/src/formatters/host_stats/compact.rs | 2 +- lychee-lib/src/ratelimit/host/host.rs | 14 +++++++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/lychee-bin/src/formatters/host_stats/compact.rs b/lychee-bin/src/formatters/host_stats/compact.rs index 88b8b2fe6c..d6bcf5f195 100644 --- a/lychee-bin/src/formatters/host_stats/compact.rs +++ b/lychee-bin/src/formatters/host_stats/compact.rs @@ -47,7 +47,7 @@ impl Display for CompactHostStats { color!( f, NORMAL, - "{:6} reqs │ {:>6.1}% success │ {:>8} median │ {:>6.1}% cache", + "{:6} reqs │ {:>6.1}% success │ {:>8} median │ {:>6.1}% cached", hostname, stats.total_requests, stats.success_rate() * 100.0, diff --git a/lychee-lib/src/ratelimit/host/host.rs b/lychee-lib/src/ratelimit/host/host.rs index 2718bceb17..7618e8560a 100644 --- a/lychee-lib/src/ratelimit/host/host.rs +++ b/lychee-lib/src/ratelimit/host/host.rs @@ -219,6 +219,11 @@ impl Host { *backoff }; if !backoff_duration.is_zero() { + log::debug!( + "Host {} applying backoff delay of {}ms due to previous rate limiting or errors", + self.key, + backoff_duration.as_millis() + ); tokio::time::sleep(backoff_duration).await; } @@ -271,7 +276,7 @@ impl Host { } 429 => { // Exponential backoff on rate limit, capped at 30 seconds - *backoff = std::cmp::min( + let new_backoff = std::cmp::min( if backoff.is_zero() { Duration::from_millis(500) } else { @@ -279,6 +284,13 @@ impl Host { }, Duration::from_secs(30), ); + log::debug!( + "Host {} hit rate limit (429), increasing backoff from {}ms to {}ms", + self.key, + backoff.as_millis(), + new_backoff.as_millis() + ); + *backoff = new_backoff; } 500..=599 => { // Moderate backoff increase on server errors, capped at 10 seconds From 7d25ea2687857808acb221651549a95b7c36ef95 Mon Sep 17 00:00:00 2001 From: Matthias Date: Mon, 8 Sep 2025 23:35:24 +0200 Subject: [PATCH 05/43] Fix lints --- 
lychee-lib/src/ratelimit/host/host.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lychee-lib/src/ratelimit/host/host.rs b/lychee-lib/src/ratelimit/host/host.rs index 7618e8560a..b499f2f71c 100644 --- a/lychee-lib/src/ratelimit/host/host.rs +++ b/lychee-lib/src/ratelimit/host/host.rs @@ -220,8 +220,8 @@ impl Host { }; if !backoff_duration.is_zero() { log::debug!( - "Host {} applying backoff delay of {}ms due to previous rate limiting or errors", - self.key, + "Host {} applying backoff delay of {}ms due to previous rate limiting or errors", + self.key, backoff_duration.as_millis() ); tokio::time::sleep(backoff_duration).await; @@ -285,8 +285,8 @@ impl Host { Duration::from_secs(30), ); log::debug!( - "Host {} hit rate limit (429), increasing backoff from {}ms to {}ms", - self.key, + "Host {} hit rate limit (429), increasing backoff from {}ms to {}ms", + self.key, backoff.as_millis(), new_backoff.as_millis() ); From 956e20a091df9d052542ba531dd3b32d4d59efff Mon Sep 17 00:00:00 2001 From: Matthias Date: Mon, 8 Sep 2025 23:55:35 +0200 Subject: [PATCH 06/43] Fix cookie jar sharing in per-host rate limiting The per-host implementation was creating separate cookie jars for each host, which broke the global --cookie-jar functionality. Now all hosts share the same cookie jar when one is provided, while still maintaining separate rate limiting and statistics per host. Fixes test_cookie_jar test. 
--- lychee-bin/src/client.rs | 21 +++++++++++++++------ lychee-lib/src/ratelimit/host/host.rs | 10 ++++++---- lychee-lib/src/ratelimit/pool.rs | 25 +++++++++++++++++++++++++ 3 files changed, 46 insertions(+), 10 deletions(-) diff --git a/lychee-bin/src/client.rs b/lychee-bin/src/client.rs index aa1b5407d1..3e187fed28 100644 --- a/lychee-bin/src/client.rs +++ b/lychee-bin/src/client.rs @@ -35,12 +35,21 @@ pub(crate) fn create(cfg: &Config, cookie_jar: Option<&Arc>) - let rate_limit_config = RateLimitConfig::from_options(cfg.default_host_concurrency, cfg.default_request_interval); let cache_max_age = if cfg.cache { 3600 } else { 0 }; // 1 hour if caching enabled, disabled otherwise - let host_pool = HostPool::new( - rate_limit_config, - cfg.hosts.clone(), - cfg.max_concurrency, - cache_max_age, - ); + let host_pool = match cookie_jar { + Some(cookie_jar) => HostPool::with_cookie_jar( + rate_limit_config, + cfg.hosts.clone(), + cfg.max_concurrency, + cache_max_age, + cookie_jar.clone(), + ), + None => HostPool::new( + rate_limit_config, + cfg.hosts.clone(), + cfg.max_concurrency, + cache_max_age, + ), + }; ClientBuilder::builder() .remaps(remaps) diff --git a/lychee-lib/src/ratelimit/host/host.rs b/lychee-lib/src/ratelimit/host/host.rs index b499f2f71c..5c45be68be 100644 --- a/lychee-lib/src/ratelimit/host/host.rs +++ b/lychee-lib/src/ratelimit/host/host.rs @@ -83,6 +83,7 @@ impl Host { /// * `host_config` - Host-specific configuration /// * `global_config` - Global defaults to fall back to /// * `cache_max_age` - Maximum age for cached entries in seconds (0 to disable caching) + /// * `shared_cookie_jar` - Optional shared cookie jar to use instead of creating per-host jar /// /// # Errors /// @@ -96,6 +97,7 @@ impl Host { host_config: &HostConfig, global_config: &RateLimitConfig, cache_max_age: u64, + shared_cookie_jar: Option>, ) -> Result { // Configure rate limiter with effective request interval let interval = 
host_config.effective_request_interval(global_config); @@ -112,8 +114,8 @@ impl Host { let max_concurrent = host_config.effective_max_concurrent(global_config); let semaphore = Arc::new(Semaphore::new(max_concurrent)); - // Create per-host cookie jar - let cookie_jar = Arc::new(CookieStoreMutex::default()); + // Use shared cookie jar if provided, otherwise create per-host one + let cookie_jar = shared_cookie_jar.unwrap_or_else(|| Arc::new(CookieStoreMutex::default())); // Build HTTP client with host-specific configuration let client = ReqwestClient::builder() @@ -432,7 +434,7 @@ mod tests { let host_config = HostConfig::default(); let global_config = RateLimitConfig::default(); - let host = Host::new(key.clone(), &host_config, &global_config, 3600).unwrap(); + let host = Host::new(key.clone(), &host_config, &global_config, 3600, None).unwrap(); assert_eq!(host.key, key); assert_eq!(host.available_permits(), 10); // Default concurrency @@ -446,7 +448,7 @@ mod tests { let host_config = HostConfig::default(); let global_config = RateLimitConfig::default(); - let host = Host::new(key, &host_config, &global_config, 1).unwrap(); // 1 second cache + let host = Host::new(key, &host_config, &global_config, 1, None).unwrap(); // 1 second cache let uri = Uri::from("https://example.com/test".parse::().unwrap()); let status = Status::Ok(http::StatusCode::OK); diff --git a/lychee-lib/src/ratelimit/pool.rs b/lychee-lib/src/ratelimit/pool.rs index b45b028dfc..07968aa289 100644 --- a/lychee-lib/src/ratelimit/pool.rs +++ b/lychee-lib/src/ratelimit/pool.rs @@ -1,5 +1,6 @@ use dashmap::DashMap; use reqwest::{Request, Response}; +use reqwest_cookie_store::CookieStoreMutex; use std::collections::HashMap; use std::sync::Arc; use tokio::sync::Semaphore; @@ -36,6 +37,9 @@ pub struct HostPool { /// Maximum age for cached entries in seconds (0 to disable caching) cache_max_age: u64, + + /// Shared cookie jar used across all hosts + cookie_jar: Option>, } impl HostPool { @@ -71,6 +75,26 @@ 
impl HostPool { host_configs: Arc::new(host_configs), global_semaphore: Arc::new(Semaphore::new(max_total_concurrency)), cache_max_age, + cookie_jar: None, + } + } + + /// Create a new `HostPool` with a shared cookie jar + #[must_use] + pub fn with_cookie_jar( + global_config: RateLimitConfig, + host_configs: HashMap, + max_total_concurrency: usize, + cache_max_age: u64, + cookie_jar: Arc, + ) -> Self { + Self { + hosts: Arc::new(DashMap::new()), + global_config: Arc::new(global_config), + host_configs: Arc::new(host_configs), + global_semaphore: Arc::new(Semaphore::new(max_total_concurrency)), + cache_max_age, + cookie_jar: Some(cookie_jar), } } @@ -146,6 +170,7 @@ impl HostPool { &host_config, &self.global_config, self.cache_max_age, + self.cookie_jar.clone(), )?); // Store in map (handle race condition where another thread created it) From 20844d2d55df138ba8e3d65da5d8dddc2d900039 Mon Sep 17 00:00:00 2001 From: Matthias Date: Tue, 9 Sep 2025 00:11:51 +0200 Subject: [PATCH 07/43] Fix missing User-Agent header in per-host clients Per-host HTTP clients were missing the User-Agent header and other global headers, causing some sites like crates.io to return 403 Forbidden errors. Now all per-host clients inherit the global headers (User-Agent, custom headers) while still allowing host-specific header overrides. Fixes test_crates_io_quirk test. 
--- lychee-bin/src/client.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/lychee-bin/src/client.rs b/lychee-bin/src/client.rs index 3e187fed28..42e083cc96 100644 --- a/lychee-bin/src/client.rs +++ b/lychee-bin/src/client.rs @@ -31,6 +31,13 @@ pub(crate) fn create(cfg: &Config, cookie_jar: Option<&Arc>) - let headers = HeaderMap::from_header_pairs(&cfg.header)?; + // Create combined headers for HostPool (includes User-Agent + custom headers) + let mut combined_headers = headers.clone(); + combined_headers.insert( + http::header::USER_AGENT, + cfg.user_agent.parse().context("Invalid User-Agent header")?, + ); + // Create HostPool for rate limiting - always enabled for HTTP requests let rate_limit_config = RateLimitConfig::from_options(cfg.default_host_concurrency, cfg.default_request_interval); @@ -41,6 +48,7 @@ pub(crate) fn create(cfg: &Config, cookie_jar: Option<&Arc>) - cfg.hosts.clone(), cfg.max_concurrency, cache_max_age, + combined_headers, cookie_jar.clone(), ), None => HostPool::new( @@ -48,6 +56,7 @@ pub(crate) fn create(cfg: &Config, cookie_jar: Option<&Arc>) - cfg.hosts.clone(), cfg.max_concurrency, cache_max_age, + combined_headers, ), }; From 9ecb5e972ec02372941187dec4c254e91713afa6 Mon Sep 17 00:00:00 2001 From: Matthias Date: Tue, 9 Sep 2025 00:14:58 +0200 Subject: [PATCH 08/43] Bring back global headers (e.g. 
for user-agent) --- lychee-lib/src/ratelimit/host/host.rs | 16 ++++++++++++---- lychee-lib/src/ratelimit/pool.rs | 17 +++++++++++++++-- 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/lychee-lib/src/ratelimit/host/host.rs b/lychee-lib/src/ratelimit/host/host.rs index 5c45be68be..ed170b2ed7 100644 --- a/lychee-lib/src/ratelimit/host/host.rs +++ b/lychee-lib/src/ratelimit/host/host.rs @@ -84,6 +84,7 @@ impl Host { /// * `global_config` - Global defaults to fall back to /// * `cache_max_age` - Maximum age for cached entries in seconds (0 to disable caching) /// * `shared_cookie_jar` - Optional shared cookie jar to use instead of creating per-host jar + /// * `global_headers` - Global headers to be applied to all requests (User-Agent, custom headers, etc.) /// /// # Errors /// @@ -98,6 +99,7 @@ impl Host { global_config: &RateLimitConfig, cache_max_age: u64, shared_cookie_jar: Option>, + global_headers: &http::HeaderMap, ) -> Result { // Configure rate limiter with effective request interval let interval = host_config.effective_request_interval(global_config); @@ -117,10 +119,16 @@ impl Host { // Use shared cookie jar if provided, otherwise create per-host one let cookie_jar = shared_cookie_jar.unwrap_or_else(|| Arc::new(CookieStoreMutex::default())); - // Build HTTP client with host-specific configuration + // Combine global headers with host-specific headers + let mut combined_headers = global_headers.clone(); + for (name, value) in &host_config.headers { + combined_headers.insert(name, value.clone()); + } + + // Build HTTP client with combined headers let client = ReqwestClient::builder() .cookie_provider(cookie_jar.clone()) - .default_headers(host_config.headers.clone()) + .default_headers(combined_headers) .build() .map_err(|e| RateLimitError::ClientConfigError { host: key.to_string(), @@ -434,7 +442,7 @@ mod tests { let host_config = HostConfig::default(); let global_config = RateLimitConfig::default(); - let host = Host::new(key.clone(), 
&host_config, &global_config, 3600, None).unwrap(); + let host = Host::new(key.clone(), &host_config, &global_config, 3600, None, &http::HeaderMap::new()).unwrap(); assert_eq!(host.key, key); assert_eq!(host.available_permits(), 10); // Default concurrency @@ -448,7 +456,7 @@ mod tests { let host_config = HostConfig::default(); let global_config = RateLimitConfig::default(); - let host = Host::new(key, &host_config, &global_config, 1, None).unwrap(); // 1 second cache + let host = Host::new(key, &host_config, &global_config, 1, None, &http::HeaderMap::new()).unwrap(); // 1 second cache let uri = Uri::from("https://example.com/test".parse::().unwrap()); let status = Status::Ok(http::StatusCode::OK); diff --git a/lychee-lib/src/ratelimit/pool.rs b/lychee-lib/src/ratelimit/pool.rs index 07968aa289..ffffeea084 100644 --- a/lychee-lib/src/ratelimit/pool.rs +++ b/lychee-lib/src/ratelimit/pool.rs @@ -1,4 +1,5 @@ use dashmap::DashMap; +use http::HeaderMap; use reqwest::{Request, Response}; use reqwest_cookie_store::CookieStoreMutex; use std::collections::HashMap; @@ -40,6 +41,9 @@ pub struct HostPool { /// Shared cookie jar used across all hosts cookie_jar: Option>, + + /// Global headers to be applied to all requests (includes User-Agent, etc.) + global_headers: HeaderMap, } impl HostPool { @@ -51,16 +55,19 @@ impl HostPool { /// * `host_configs` - Host-specific configuration overrides /// * `max_total_concurrency` - Global limit on concurrent requests across all hosts /// * `cache_max_age` - Maximum age for cached entries in seconds (0 to disable caching) + /// * `global_headers` - Headers to be applied to all requests (User-Agent, custom headers, etc.) 
/// /// # Examples /// /// ``` /// use lychee_lib::ratelimit::{HostPool, RateLimitConfig}; /// use std::collections::HashMap; + /// use http::HeaderMap; /// /// let global_config = RateLimitConfig::default(); /// let host_configs = HashMap::new(); - /// let pool = HostPool::new(global_config, host_configs, 128, 3600); + /// let global_headers = HeaderMap::new(); + /// let pool = HostPool::new(global_config, host_configs, 128, 3600, global_headers); /// ``` #[must_use] pub fn new( @@ -68,6 +75,7 @@ impl HostPool { host_configs: HashMap, max_total_concurrency: usize, cache_max_age: u64, + global_headers: HeaderMap, ) -> Self { Self { hosts: Arc::new(DashMap::new()), @@ -76,6 +84,7 @@ impl HostPool { global_semaphore: Arc::new(Semaphore::new(max_total_concurrency)), cache_max_age, cookie_jar: None, + global_headers, } } @@ -86,6 +95,7 @@ impl HostPool { host_configs: HashMap, max_total_concurrency: usize, cache_max_age: u64, + global_headers: HeaderMap, cookie_jar: Arc, ) -> Self { Self { @@ -95,6 +105,7 @@ impl HostPool { global_semaphore: Arc::new(Semaphore::new(max_total_concurrency)), cache_max_age, cookie_jar: Some(cookie_jar), + global_headers, } } @@ -171,6 +182,7 @@ impl HostPool { &self.global_config, self.cache_max_age, self.cookie_jar.clone(), + &self.global_headers, )?); // Store in map (handle race condition where another thread created it) @@ -373,6 +385,7 @@ impl Default for HostPool { HashMap::new(), 128, // Default global concurrency limit 3600, // Default cache age of 1 hour + HeaderMap::new(), // Default empty headers ) } } @@ -388,7 +401,7 @@ mod tests { fn test_host_pool_creation() { let global_config = RateLimitConfig::default(); let host_configs = HashMap::new(); - let pool = HostPool::new(global_config, host_configs, 100, 3600); + let pool = HostPool::new(global_config, host_configs, 100, 3600, HeaderMap::new()); assert_eq!(pool.active_host_count(), 0); assert_eq!(pool.available_global_permits(), 100); From 
ded5bc19f7662c366537af01a6e52f19e45dd4f2 Mon Sep 17 00:00:00 2001 From: Matthias Date: Tue, 9 Sep 2025 00:39:41 +0200 Subject: [PATCH 09/43] Fix redirect handling in per-host clients Per-host clients were not respecting the max_redirects configuration, causing redirect tests to fail. Each host now creates its own reqwest client with proper redirect policy, timeout, and security settings matching the main client configuration. Fixes test_prevent_too_many_redirects test. --- lychee-bin/src/client.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/lychee-bin/src/client.rs b/lychee-bin/src/client.rs index 42e083cc96..39580a3924 100644 --- a/lychee-bin/src/client.rs +++ b/lychee-bin/src/client.rs @@ -49,6 +49,9 @@ pub(crate) fn create(cfg: &Config, cookie_jar: Option<&Arc>) - cfg.max_concurrency, cache_max_age, combined_headers, + cfg.max_redirects, + Some(timeout), + cfg.insecure, cookie_jar.clone(), ), None => HostPool::new( @@ -57,6 +60,9 @@ pub(crate) fn create(cfg: &Config, cookie_jar: Option<&Arc>) - cfg.max_concurrency, cache_max_age, combined_headers, + cfg.max_redirects, + Some(timeout), + cfg.insecure, ), }; From 4577865426f7d116933f6a617de7f4829b69f4de Mon Sep 17 00:00:00 2001 From: Matthias Date: Tue, 9 Sep 2025 01:00:49 +0200 Subject: [PATCH 10/43] Pass missing args: max_redirects, timeout, allow_insecure --- README.md | 17 +++++++ lychee-bin/src/client.rs | 4 +- lychee-lib/src/ratelimit/host/host.rs | 60 ++++++++++++++++++++++-- lychee-lib/src/ratelimit/pool.rs | 67 ++++++++++++++++++++++++--- 4 files changed, 135 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index bda7ed5a4f..74865a3fb4 100644 --- a/README.md +++ b/README.md @@ -425,6 +425,20 @@ Options: This is useful for files without extensions or with unknown extensions. The extension will be used to determine the file type for processing. 
Examples: --default-extension md, --default-extension html + --default-host-concurrency + Default maximum concurrent requests per host (default: 10) + + This limits how many requests can be sent simultaneously to the same host (domain/subdomain). This helps prevent overwhelming servers and getting rate-limited. Each host is handled independently. + + Examples: --default-host-concurrency 5 # Conservative for slow APIs --default-host-concurrency 20 # Aggressive for fast APIs + + --default-request-interval + Minimum interval between requests to the same host (default: 100ms) + + Sets a baseline delay between consecutive requests to prevent hammering servers. The adaptive algorithm may increase this based on server responses (rate limits, errors). + + Examples: --default-request-interval 50ms # Fast for robust APIs\ --default-request-interval 1s # Conservative for rate-limited APIs + --dump Don't perform any link checking. Instead, dump all the links extracted from inputs that would be checked @@ -523,6 +537,9 @@ Options: --hidden Do not skip hidden directories and files + --host-stats + Show per-host statistics at the end of the run + -i, --insecure Proceed for server connections considered insecure (invalid TLS) diff --git a/lychee-bin/src/client.rs b/lychee-bin/src/client.rs index 39580a3924..d54832b99b 100644 --- a/lychee-bin/src/client.rs +++ b/lychee-bin/src/client.rs @@ -35,7 +35,9 @@ pub(crate) fn create(cfg: &Config, cookie_jar: Option<&Arc>) - let mut combined_headers = headers.clone(); combined_headers.insert( http::header::USER_AGENT, - cfg.user_agent.parse().context("Invalid User-Agent header")?, + cfg.user_agent + .parse() + .context("Invalid User-Agent header")?, ); // Create HostPool for rate limiting - always enabled for HTTP requests diff --git a/lychee-lib/src/ratelimit/host/host.rs b/lychee-lib/src/ratelimit/host/host.rs index ed170b2ed7..a6a2a213ba 100644 --- a/lychee-lib/src/ratelimit/host/host.rs +++ b/lychee-lib/src/ratelimit/host/host.rs @@ 
-4,7 +4,7 @@ use governor::{ clock::DefaultClock, state::{InMemoryState, NotKeyed}, }; -use reqwest::{Client as ReqwestClient, Request, Response}; +use reqwest::{Client as ReqwestClient, Request, Response, redirect}; use reqwest_cookie_store::CookieStoreMutex; use std::sync::{Arc, Mutex}; use std::time::{Duration, Instant}; @@ -85,6 +85,9 @@ impl Host { /// * `cache_max_age` - Maximum age for cached entries in seconds (0 to disable caching) /// * `shared_cookie_jar` - Optional shared cookie jar to use instead of creating per-host jar /// * `global_headers` - Global headers to be applied to all requests (User-Agent, custom headers, etc.) + /// * `max_redirects` - Maximum number of redirects to follow + /// * `timeout` - Request timeout + /// * `allow_insecure` - Whether to allow insecure certificates /// /// # Errors /// @@ -93,6 +96,7 @@ impl Host { /// # Panics /// /// Panics if the burst size cannot be set to 1 (should never happen) + #[allow(clippy::too_many_arguments)] pub fn new( key: HostKey, host_config: &HostConfig, @@ -100,6 +104,9 @@ impl Host { cache_max_age: u64, shared_cookie_jar: Option>, global_headers: &http::HeaderMap, + max_redirects: usize, + timeout: Option, + allow_insecure: bool, ) -> Result { // Configure rate limiter with effective request interval let interval = host_config.effective_request_interval(global_config); @@ -125,10 +132,31 @@ impl Host { combined_headers.insert(name, value.clone()); } - // Build HTTP client with combined headers - let client = ReqwestClient::builder() + // Create custom redirect policy matching main client behavior + let redirect_policy = redirect::Policy::custom(move |attempt| { + if attempt.previous().len() > max_redirects { + attempt.error("too many redirects") + } else { + log::debug!("Redirecting to {}", attempt.url()); + attempt.follow() + } + }); + + // Build HTTP client with proper configuration + let mut builder = ReqwestClient::builder() .cookie_provider(cookie_jar.clone()) 
.default_headers(combined_headers) + .gzip(true) + .danger_accept_invalid_certs(allow_insecure) + .connect_timeout(Duration::from_secs(10)) // CONNECT_TIMEOUT constant + .tcp_keepalive(Duration::from_secs(60)) // TCP_KEEPALIVE constant + .redirect(redirect_policy); + + if let Some(timeout) = timeout { + builder = builder.timeout(timeout); + } + + let client = builder .build() .map_err(|e| RateLimitError::ClientConfigError { host: key.to_string(), @@ -442,7 +470,18 @@ mod tests { let host_config = HostConfig::default(); let global_config = RateLimitConfig::default(); - let host = Host::new(key.clone(), &host_config, &global_config, 3600, None, &http::HeaderMap::new()).unwrap(); + let host = Host::new( + key.clone(), + &host_config, + &global_config, + 3600, + None, + &http::HeaderMap::new(), + 5, + Some(std::time::Duration::from_secs(20)), + false, + ) + .unwrap(); assert_eq!(host.key, key); assert_eq!(host.available_permits(), 10); // Default concurrency @@ -456,7 +495,18 @@ mod tests { let host_config = HostConfig::default(); let global_config = RateLimitConfig::default(); - let host = Host::new(key, &host_config, &global_config, 1, None, &http::HeaderMap::new()).unwrap(); // 1 second cache + let host = Host::new( + key, + &host_config, + &global_config, + 1, + None, + &http::HeaderMap::new(), + 5, + Some(std::time::Duration::from_secs(20)), + false, + ) + .unwrap(); // 1 second cache let uri = Uri::from("https://example.com/test".parse::().unwrap()); let status = Status::Ok(http::StatusCode::OK); diff --git a/lychee-lib/src/ratelimit/pool.rs b/lychee-lib/src/ratelimit/pool.rs index ffffeea084..87abeaa369 100644 --- a/lychee-lib/src/ratelimit/pool.rs +++ b/lychee-lib/src/ratelimit/pool.rs @@ -4,6 +4,7 @@ use reqwest::{Request, Response}; use reqwest_cookie_store::CookieStoreMutex; use std::collections::HashMap; use std::sync::Arc; +use std::time::Duration; use tokio::sync::Semaphore; use crate::ratelimit::{Host, HostConfig, HostKey, HostStats, RateLimitConfig, 
RateLimitError}; @@ -44,6 +45,15 @@ pub struct HostPool { /// Global headers to be applied to all requests (includes User-Agent, etc.) global_headers: HeaderMap, + + /// Maximum number of redirects to follow + max_redirects: usize, + + /// Request timeout + timeout: Option, + + /// Whether to allow insecure certificates + allow_insecure: bool, } impl HostPool { @@ -56,6 +66,9 @@ impl HostPool { /// * `max_total_concurrency` - Global limit on concurrent requests across all hosts /// * `cache_max_age` - Maximum age for cached entries in seconds (0 to disable caching) /// * `global_headers` - Headers to be applied to all requests (User-Agent, custom headers, etc.) + /// * `max_redirects` - Maximum number of redirects to follow + /// * `timeout` - Request timeout + /// * `allow_insecure` - Whether to allow insecure certificates /// /// # Examples /// @@ -63,19 +76,24 @@ impl HostPool { /// use lychee_lib::ratelimit::{HostPool, RateLimitConfig}; /// use std::collections::HashMap; /// use http::HeaderMap; + /// use std::time::Duration; /// /// let global_config = RateLimitConfig::default(); /// let host_configs = HashMap::new(); /// let global_headers = HeaderMap::new(); - /// let pool = HostPool::new(global_config, host_configs, 128, 3600, global_headers); + /// let pool = HostPool::new(global_config, host_configs, 128, 3600, global_headers, 5, Some(Duration::from_secs(20)), false); /// ``` #[must_use] + #[allow(clippy::too_many_arguments)] pub fn new( global_config: RateLimitConfig, host_configs: HashMap, max_total_concurrency: usize, cache_max_age: u64, global_headers: HeaderMap, + max_redirects: usize, + timeout: Option, + allow_insecure: bool, ) -> Self { Self { hosts: Arc::new(DashMap::new()), @@ -85,17 +103,24 @@ impl HostPool { cache_max_age, cookie_jar: None, global_headers, + max_redirects, + timeout, + allow_insecure, } } /// Create a new `HostPool` with a shared cookie jar #[must_use] + #[allow(clippy::too_many_arguments)] pub fn with_cookie_jar( 
global_config: RateLimitConfig, host_configs: HashMap, max_total_concurrency: usize, cache_max_age: u64, global_headers: HeaderMap, + max_redirects: usize, + timeout: Option, + allow_insecure: bool, cookie_jar: Arc, ) -> Self { Self { @@ -106,6 +131,9 @@ impl HostPool { cache_max_age, cookie_jar: Some(cookie_jar), global_headers, + max_redirects, + timeout, + allow_insecure, } } @@ -133,10 +161,20 @@ impl HostPool { /// ```no_run /// # use lychee_lib::ratelimit::{HostPool, RateLimitConfig}; /// # use std::collections::HashMap; - /// # use reqwest::Request; + /// # use reqwest::{Request, header::HeaderMap}; + /// # use std::time::Duration; /// # #[tokio::main] /// # async fn main() -> Result<(), Box> { - /// let pool = HostPool::new(RateLimitConfig::default(), HashMap::new(), 128, 3600); + /// let pool = HostPool::new( + /// RateLimitConfig::default(), + /// HashMap::new(), + /// 128, + /// 3600, + /// HeaderMap::new(), + /// 5, + /// Some(Duration::from_secs(20)), + /// false + /// ); /// let request = reqwest::Request::new(reqwest::Method::GET, "https://example.com".parse()?); /// let response = pool.execute_request(request).await?; /// # Ok(()) @@ -183,6 +221,9 @@ impl HostPool { self.cache_max_age, self.cookie_jar.clone(), &self.global_headers, + self.max_redirects, + self.timeout, + self.allow_insecure, )?); // Store in map (handle race condition where another thread created it) @@ -383,9 +424,12 @@ impl Default for HostPool { Self::new( RateLimitConfig::default(), HashMap::new(), - 128, // Default global concurrency limit - 3600, // Default cache age of 1 hour - HeaderMap::new(), // Default empty headers + 128, // Default global concurrency limit + 3600, // Default cache age of 1 hour + HeaderMap::new(), // Default empty headers + 5, // Default max redirects + Some(Duration::from_secs(20)), // Default timeout + false, // Default secure certificates ) } } @@ -401,7 +445,16 @@ mod tests { fn test_host_pool_creation() { let global_config = 
RateLimitConfig::default(); let host_configs = HashMap::new(); - let pool = HostPool::new(global_config, host_configs, 100, 3600, HeaderMap::new()); + let pool = HostPool::new( + global_config, + host_configs, + 100, + 3600, + HeaderMap::new(), + 5, + Some(Duration::from_secs(20)), + false, + ); assert_eq!(pool.active_host_count(), 0); assert_eq!(pool.available_global_permits(), 100); From 4e042716fdfcb1e0d02a99f99033e7d3f7cc62a7 Mon Sep 17 00:00:00 2001 From: Matthias Date: Thu, 18 Sep 2025 16:23:18 +0200 Subject: [PATCH 11/43] Refactor host stats formatters to remove unused parameters and improve example formatting in README --- README.md | 8 ++++++-- lychee-bin/src/client.rs | 13 +++++++------ lychee-bin/src/formatters/host_stats/compact.rs | 3 +-- lychee-bin/src/formatters/host_stats/detailed.rs | 9 ++++++--- lychee-bin/src/formatters/mod.rs | 6 +++--- lychee-bin/src/options.rs | 2 +- 6 files changed, 24 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 74865a3fb4..81b6f47a4d 100644 --- a/README.md +++ b/README.md @@ -430,14 +430,18 @@ Options: This limits how many requests can be sent simultaneously to the same host (domain/subdomain). This helps prevent overwhelming servers and getting rate-limited. Each host is handled independently. - Examples: --default-host-concurrency 5 # Conservative for slow APIs --default-host-concurrency 20 # Aggressive for fast APIs + Examples: + • --default-host-concurrency 5 # Conservative for slow APIs + • --default-host-concurrency 20 # Aggressive for fast APIs --default-request-interval Minimum interval between requests to the same host (default: 100ms) Sets a baseline delay between consecutive requests to prevent hammering servers. The adaptive algorithm may increase this based on server responses (rate limits, errors). 
- Examples: --default-request-interval 50ms # Fast for robust APIs\ --default-request-interval 1s # Conservative for rate-limited APIs + Examples: + • --default-request-interval 50ms # Fast for robust APIs + • --default-request-interval 1s # Conservative for rate-limited APIs --dump Don't perform any link checking. Instead, dump all the links extracted from inputs that would be checked diff --git a/lychee-bin/src/client.rs b/lychee-bin/src/client.rs index d54832b99b..a8bde3fcff 100644 --- a/lychee-bin/src/client.rs +++ b/lychee-bin/src/client.rs @@ -44,19 +44,20 @@ pub(crate) fn create(cfg: &Config, cookie_jar: Option<&Arc>) - let rate_limit_config = RateLimitConfig::from_options(cfg.default_host_concurrency, cfg.default_request_interval); let cache_max_age = if cfg.cache { 3600 } else { 0 }; // 1 hour if caching enabled, disabled otherwise - let host_pool = match cookie_jar { - Some(cookie_jar) => HostPool::with_cookie_jar( + let host_pool = if let Some(cookie_jar) = cookie_jar { + HostPool::with_cookie_jar( rate_limit_config, cfg.hosts.clone(), cfg.max_concurrency, cache_max_age, - combined_headers, + combined_headers.clone(), cfg.max_redirects, Some(timeout), cfg.insecure, cookie_jar.clone(), - ), - None => HostPool::new( + ) + } else { + HostPool::new( rate_limit_config, cfg.hosts.clone(), cfg.max_concurrency, @@ -65,7 +66,7 @@ pub(crate) fn create(cfg: &Config, cookie_jar: Option<&Arc>) - cfg.max_redirects, Some(timeout), cfg.insecure, - ), + ) }; ClientBuilder::builder() diff --git a/lychee-bin/src/formatters/host_stats/compact.rs b/lychee-bin/src/formatters/host_stats/compact.rs index d6bcf5f195..121230e259 100644 --- a/lychee-bin/src/formatters/host_stats/compact.rs +++ b/lychee-bin/src/formatters/host_stats/compact.rs @@ -5,7 +5,6 @@ use std::{ }; use crate::formatters::color::{DIM, NORMAL, color}; -use crate::options; use lychee_lib::ratelimit::HostStats; use super::HostStatsFormatter; @@ -65,7 +64,7 @@ impl Display for CompactHostStats { pub(crate) 
struct Compact; impl Compact { - pub(crate) const fn new(_mode: options::OutputMode) -> Self { + pub(crate) const fn new() -> Self { Self } } diff --git a/lychee-bin/src/formatters/host_stats/detailed.rs b/lychee-bin/src/formatters/host_stats/detailed.rs index cd859ae47b..01bfd42bc8 100644 --- a/lychee-bin/src/formatters/host_stats/detailed.rs +++ b/lychee-bin/src/formatters/host_stats/detailed.rs @@ -4,7 +4,6 @@ use std::{ fmt::{self, Display}, }; -use crate::options; use lychee_lib::ratelimit::HostStats; use super::HostStatsFormatter; @@ -35,7 +34,11 @@ impl Display for DetailedHostStats { )?; if stats.rate_limited > 0 { - writeln!(f, " Rate limited: {}", stats.rate_limited)?; + writeln!( + f, + " Rate limited: {} (429 Too Many Requests)", + stats.rate_limited + )?; } if stats.client_errors > 0 { writeln!(f, " Client errors (4xx): {}", stats.client_errors)?; @@ -70,7 +73,7 @@ impl Display for DetailedHostStats { pub(crate) struct Detailed; impl Detailed { - pub(crate) const fn new(_mode: options::OutputMode) -> Self { + pub(crate) const fn new() -> Self { Self } } diff --git a/lychee-bin/src/formatters/mod.rs b/lychee-bin/src/formatters/mod.rs index 22cde07cb2..de36c32bb6 100644 --- a/lychee-bin/src/formatters/mod.rs +++ b/lychee-bin/src/formatters/mod.rs @@ -45,11 +45,11 @@ pub(crate) fn get_progress_formatter(mode: &OutputMode) -> Box Box { match format { - StatsFormat::Compact | StatsFormat::Raw => Box::new(host_stats::Compact::new(mode.clone())), // Use compact for raw - StatsFormat::Detailed => Box::new(host_stats::Detailed::new(mode.clone())), + StatsFormat::Compact | StatsFormat::Raw => Box::new(host_stats::Compact::new()), // Use compact for raw + StatsFormat::Detailed => Box::new(host_stats::Detailed::new()), StatsFormat::Json => Box::new(host_stats::Json::new()), StatsFormat::Markdown => Box::new(host_stats::Markdown::new()), } diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index fbb81610bd..a997b64455 100644 --- 
a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -970,6 +970,7 @@ impl Config { // Keys which are handled outside of fold_in ..header, ..github_token, + ..hosts, // Keys with defaults to assign accept: StatusCodeSelector::default(), @@ -1000,7 +1001,6 @@ impl Config { glob_ignore_case: false, hidden: false, host_stats: false, - hosts: HashMap::new(), include: Vec::::new(), include_fragments: false, include_mail: false, From eed7576adc10c8fdc079e80337231fafa21b7d96 Mon Sep 17 00:00:00 2001 From: Matthias Date: Thu, 18 Sep 2025 16:28:24 +0200 Subject: [PATCH 12/43] remove confusing comment --- lychee-lib/src/checker/website.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/lychee-lib/src/checker/website.rs b/lychee-lib/src/checker/website.rs index 5a0982196f..b053cf65e2 100644 --- a/lychee-lib/src/checker/website.rs +++ b/lychee-lib/src/checker/website.rs @@ -184,7 +184,6 @@ impl WebsiteChecker { } } } else { - // Fallback to direct client if no host pool configured (shouldn't happen normally) self.reqwest_client.execute(request).await }; From ae104a075f0b5d6dc5b0811eb09da92f10bad641 Mon Sep 17 00:00:00 2001 From: Matthias Date: Thu, 18 Sep 2025 16:51:09 +0200 Subject: [PATCH 13/43] Create `display_per_host_statistics` in separate file --- lychee-bin/src/host_stats.rs | 29 +++++++++++++++++++++++++++++ lychee-bin/src/main.rs | 23 ++++------------------- 2 files changed, 33 insertions(+), 19 deletions(-) create mode 100644 lychee-bin/src/host_stats.rs diff --git a/lychee-bin/src/host_stats.rs b/lychee-bin/src/host_stats.rs new file mode 100644 index 0000000000..981d6eee8e --- /dev/null +++ b/lychee-bin/src/host_stats.rs @@ -0,0 +1,29 @@ +use anyhow::{Context, Result}; + +use crate::{formatters::get_host_stats_formatter, options::Config}; + +/// Display per-host statistics if requested +pub(crate) fn display_per_host_statistics( + client: &lychee_lib::Client, + config: &Config, +) -> Result<()> { + if !config.host_stats { + return Ok(()); + } + + let 
host_stats = client.host_stats(); + let host_stats_formatter = get_host_stats_formatter(&config.format, &config.mode); + + if let Some(formatted_host_stats) = host_stats_formatter.format(host_stats)? { + if let Some(output) = &config.output { + // For file output, append to the existing output + let mut file_content = std::fs::read_to_string(output).unwrap_or_default(); + file_content.push_str(&formatted_host_stats); + std::fs::write(output, file_content) + .context("Cannot write host stats to output file")?; + } else { + print!("{formatted_host_stats}"); + } + } + Ok(()) +} diff --git a/lychee-bin/src/main.rs b/lychee-bin/src/main.rs index d8cfcb109e..69ac22c818 100644 --- a/lychee-bin/src/main.rs +++ b/lychee-bin/src/main.rs @@ -66,7 +66,7 @@ use std::sync::Arc; use anyhow::{Context, Error, Result, bail}; use clap::{Parser, crate_version}; use commands::{CommandParams, generate}; -use formatters::{get_host_stats_formatter, get_stats_formatter, log::init_logging}; +use formatters::{get_stats_formatter, log::init_logging}; use http::HeaderMap; use log::{error, info, warn}; @@ -86,6 +86,7 @@ mod client; mod commands; mod files_from; mod formatters; +mod host_stats; mod options; mod parse; mod progress; @@ -97,6 +98,7 @@ use crate::{ cache::{Cache, StoreExt}, formatters::{duration::Duration, stats::StatsFormatter}, generate::generate, + host_stats::display_per_host_statistics, options::{Config, LYCHEE_CACHE_FILE, LYCHEE_IGNORE_FILE, LycheeOptions}, }; @@ -315,7 +317,6 @@ fn underlying_io_error_kind(error: &Error) -> Option { } /// Run lychee on the given inputs -#[allow(clippy::too_many_lines)] async fn run(opts: &LycheeOptions) -> Result { let inputs = opts.inputs()?; @@ -422,23 +423,7 @@ async fn run(opts: &LycheeOptions) -> Result { } // Display per-host statistics if requested - if opts.config.host_stats { - let host_stats = client.host_stats(); - let host_stats_formatter = - get_host_stats_formatter(&opts.config.format, &opts.config.mode); - - if let 
Some(formatted_host_stats) = host_stats_formatter.format(host_stats)? { - if let Some(output) = &opts.config.output { - // For file output, append to the existing output - let mut file_content = std::fs::read_to_string(output).unwrap_or_default(); - file_content.push_str(&formatted_host_stats); - std::fs::write(output, file_content) - .context("Cannot write host stats to output file")?; - } else { - print!("{formatted_host_stats}"); - } - } - } + display_per_host_statistics(&client, &opts.config)?; if github_issues && opts.config.github_token.is_none() { warn!( From 843706873b629ae519781f824ffc6279b6ca689e Mon Sep 17 00:00:00 2001 From: Matthias Date: Mon, 22 Sep 2025 14:03:25 +0200 Subject: [PATCH 14/43] Remove redundant check for `self.hosts` --- lychee-bin/src/options.rs | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index a997b64455..ef400609c4 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -960,6 +960,11 @@ impl Config { self.github_token = toml.github_token; } + // Hosts configuration is only available in TOML for now (not in the CLI) + // That's because it's a bit complex to specify on the command line and + // we didn't come up with a good syntax for it yet. + self.hosts = toml.hosts; + // NOTE: if you see an error within this macro call, check to make sure that // that the fields provided to fold_in! match all the fields of the Config struct. fold_in! 
{ From 92854c11941e4e035acd03e05725a1e3d724b24d Mon Sep 17 00:00:00 2001 From: Matthias Date: Mon, 22 Sep 2025 14:06:25 +0200 Subject: [PATCH 15/43] Import `std::collections::HashMap` --- lychee-lib/src/checker/website.rs | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/lychee-lib/src/checker/website.rs b/lychee-lib/src/checker/website.rs index b053cf65e2..e99626aaa1 100644 --- a/lychee-lib/src/checker/website.rs +++ b/lychee-lib/src/checker/website.rs @@ -11,7 +11,11 @@ use async_trait::async_trait; use http::{Method, StatusCode}; use octocrab::Octocrab; use reqwest::{Request, Response, header::CONTENT_TYPE}; -use std::{collections::HashSet, path::Path, time::Duration}; +use std::{ + collections::{HashMap, HashSet}, + path::Path, + time::Duration, +}; use url::Url; #[derive(Debug, Clone)] @@ -69,13 +73,11 @@ impl WebsiteChecker { /// Returns a map of hostnames to their statistics, or an empty map /// if host-based rate limiting is not enabled. #[must_use] - pub(crate) fn host_stats( - &self, - ) -> std::collections::HashMap { + pub(crate) fn host_stats(&self) -> HashMap { if let Some(host_pool) = &self.host_pool { host_pool.all_host_stats() } else { - std::collections::HashMap::new() + HashMap::default() } } @@ -84,11 +86,11 @@ impl WebsiteChecker { /// Returns a map of hostnames to (`cache_size`, `hit_rate`), or an empty map /// if host-based rate limiting is not enabled. 
#[must_use] - pub(crate) fn cache_stats(&self) -> std::collections::HashMap { + pub(crate) fn cache_stats(&self) -> HashMap { if let Some(host_pool) = &self.host_pool { host_pool.cache_stats() } else { - std::collections::HashMap::new() + HashMap::default() } } From ada65ac6a859c20980c51cdebe0638d8566701ca Mon Sep 17 00:00:00 2001 From: Matthias Date: Mon, 22 Sep 2025 14:20:59 +0200 Subject: [PATCH 16/43] Use closures instead of if --- lychee-lib/src/checker/website.rs | 16 ++++++---------- lychee-lib/src/ratelimit/pool.rs | 6 +++--- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/lychee-lib/src/checker/website.rs b/lychee-lib/src/checker/website.rs index e99626aaa1..97054ed88b 100644 --- a/lychee-lib/src/checker/website.rs +++ b/lychee-lib/src/checker/website.rs @@ -74,11 +74,9 @@ impl WebsiteChecker { /// if host-based rate limiting is not enabled. #[must_use] pub(crate) fn host_stats(&self) -> HashMap { - if let Some(host_pool) = &self.host_pool { - host_pool.all_host_stats() - } else { - HashMap::default() - } + self.host_pool + .as_ref() + .map_or_else(HashMap::default, HostPool::all_host_stats) } /// Get cache statistics for all hosts @@ -87,11 +85,9 @@ impl WebsiteChecker { /// if host-based rate limiting is not enabled. 
#[must_use] pub(crate) fn cache_stats(&self) -> HashMap { - if let Some(host_pool) = &self.host_pool { - host_pool.cache_stats() - } else { - HashMap::default() - } + self.host_pool + .as_ref() + .map_or_else(HashMap::default, HostPool::cache_stats) } /// Record a cache hit for the given URI in the host statistics diff --git a/lychee-lib/src/ratelimit/pool.rs b/lychee-lib/src/ratelimit/pool.rs index 87abeaa369..6df6ba664b 100644 --- a/lychee-lib/src/ratelimit/pool.rs +++ b/lychee-lib/src/ratelimit/pool.rs @@ -153,7 +153,7 @@ impl HostPool { /// /// Returns a `RateLimitError` if: /// - The request URL has no valid hostname - /// - Global or host-specific rate limits are exceeded + /// - Global or host-specific rate limits are exceeded /// - The underlying HTTP request fails /// /// # Examples @@ -380,7 +380,7 @@ impl HostPool { /// /// This tracks that a request was served from the persistent disk cache /// rather than going through the rate-limited HTTP request flow. - /// This method will create a Host instance if one doesn't exist yet. + /// This method will create a [Host] instance if one doesn't exist yet. /// /// # Errors /// @@ -397,7 +397,7 @@ impl HostPool { Ok(()) } - /// Record a cache miss for the given URI in host statistics + /// Record a cache miss for the given URI in host statistics /// /// This tracks that a request could not be served from the persistent disk cache /// and will need to go through the rate-limited HTTP request flow. 
From d6f78363f954e55f36895542406894b50d74f7f5 Mon Sep 17 00:00:00 2001 From: Matthias Date: Mon, 22 Sep 2025 14:44:41 +0200 Subject: [PATCH 17/43] Rename flags: --default-host-concurrency -> --host-concurrency --default-request-interval -> --request-interval --- README.md | 12 ++++----- lychee-bin/src/client.rs | 2 +- lychee-bin/src/options.rs | 20 +++++++------- lychee-lib/src/ratelimit/config.rs | 38 +++++++++++---------------- lychee-lib/src/ratelimit/host/host.rs | 1 - 5 files changed, 33 insertions(+), 40 deletions(-) diff --git a/README.md b/README.md index 81b6f47a4d..c4cad28f3b 100644 --- a/README.md +++ b/README.md @@ -425,23 +425,23 @@ Options: This is useful for files without extensions or with unknown extensions. The extension will be used to determine the file type for processing. Examples: --default-extension md, --default-extension html - --default-host-concurrency + --host-concurrency Default maximum concurrent requests per host (default: 10) This limits how many requests can be sent simultaneously to the same host (domain/subdomain). This helps prevent overwhelming servers and getting rate-limited. Each host is handled independently. Examples: - • --default-host-concurrency 5 # Conservative for slow APIs - • --default-host-concurrency 20 # Aggressive for fast APIs + • --host-concurrency 5 # Conservative for slow APIs + • --host-concurrency 20 # Aggressive for fast APIs - --default-request-interval + --request-interval Minimum interval between requests to the same host (default: 100ms) Sets a baseline delay between consecutive requests to prevent hammering servers. The adaptive algorithm may increase this based on server responses (rate limits, errors). Examples: - • --default-request-interval 50ms # Fast for robust APIs - • --default-request-interval 1s # Conservative for rate-limited APIs + • --request-interval 50ms # Fast for robust APIs + • --request-interval 1s # Conservative for rate-limited APIs --dump Don't perform any link checking. 
Instead, dump all the links extracted from inputs that would be checked diff --git a/lychee-bin/src/client.rs b/lychee-bin/src/client.rs index a8bde3fcff..a9214e4d95 100644 --- a/lychee-bin/src/client.rs +++ b/lychee-bin/src/client.rs @@ -42,7 +42,7 @@ pub(crate) fn create(cfg: &Config, cookie_jar: Option<&Arc>) - // Create HostPool for rate limiting - always enabled for HTTP requests let rate_limit_config = - RateLimitConfig::from_options(cfg.default_host_concurrency, cfg.default_request_interval); + RateLimitConfig::from_options(cfg.host_concurrency, cfg.request_interval); let cache_max_age = if cfg.cache { 3600 } else { 0 }; // 1 hour if caching enabled, disabled otherwise let host_pool = if let Some(cookie_jar) = cookie_jar { HostPool::with_cookie_jar( diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index ef400609c4..4fee825059 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -541,11 +541,11 @@ with a status code of 429, 500 and 501." /// getting rate-limited. Each host is handled independently. /// /// Examples: - /// --default-host-concurrency 5 # Conservative for slow APIs - /// --default-host-concurrency 20 # Aggressive for fast APIs - #[arg(long)] + /// --host-concurrency 5 # Conservative for slow APIs + /// --host-concurrency 20 # Aggressive for fast APIs + #[arg(long = "host-concurrency")] #[serde(default)] - pub(crate) default_host_concurrency: Option, + pub(crate) host_concurrency: Option, /// Minimum interval between requests to the same host (default: 100ms) /// @@ -554,11 +554,11 @@ with a status code of 429, 500 and 501." /// on server responses (rate limits, errors). 
/// /// Examples: - /// --default-request-interval 50ms # Fast for robust APIs\ - /// --default-request-interval 1s # Conservative for rate-limited APIs - #[arg(long, value_parser = humantime::parse_duration)] + /// --request-interval 50ms # Fast for robust APIs + /// --request-interval 1s # Conservative for rate-limited APIs + #[arg(long = "request-interval", value_parser = humantime::parse_duration)] #[serde(default)] - pub(crate) default_request_interval: Option, + pub(crate) request_interval: Option, /// Number of threads to utilize. /// Defaults to number of cores available to the system @@ -987,8 +987,8 @@ impl Config { cache_exclude_status: None, cookie_jar: None, default_extension: None, - default_host_concurrency: None, - default_request_interval: None, + host_concurrency: None, + request_interval: None, dump: false, dump_inputs: false, exclude: Vec::::new(), diff --git a/lychee-lib/src/ratelimit/config.rs b/lychee-lib/src/ratelimit/config.rs index 090473ed46..5993d0197b 100644 --- a/lychee-lib/src/ratelimit/config.rs +++ b/lychee-lib/src/ratelimit/config.rs @@ -8,19 +8,19 @@ use std::time::Duration; pub struct RateLimitConfig { /// Default maximum concurrent requests per host #[serde(default = "default_host_concurrency")] - pub default_host_concurrency: usize, + pub host_concurrency: usize, /// Default minimum interval between requests to the same host #[serde(default = "default_request_interval")] #[serde(with = "humantime_serde")] - pub default_request_interval: Duration, + pub request_interval: Duration, } impl Default for RateLimitConfig { fn default() -> Self { Self { - default_host_concurrency: default_host_concurrency(), - default_request_interval: default_request_interval(), + host_concurrency: default_host_concurrency(), + request_interval: default_request_interval(), } } } @@ -29,12 +29,12 @@ impl RateLimitConfig { /// Create a `RateLimitConfig` from CLI options, using defaults for missing values #[must_use] pub fn from_options( - 
default_host_concurrency: Option, - default_request_interval: Option, + host_concurrency: Option, + request_interval: Option, ) -> Self { Self { - default_host_concurrency: default_host_concurrency.unwrap_or(DEFAULT_HOST_CONCURRENCY), - default_request_interval: default_request_interval.unwrap_or(DEFAULT_REQUEST_INTERVAL), + host_concurrency: host_concurrency.unwrap_or(DEFAULT_HOST_CONCURRENCY), + request_interval: request_interval.unwrap_or(DEFAULT_REQUEST_INTERVAL), } } } @@ -71,14 +71,14 @@ impl HostConfig { #[must_use] pub fn effective_max_concurrent(&self, global_config: &RateLimitConfig) -> usize { self.max_concurrent - .unwrap_or(global_config.default_host_concurrency) + .unwrap_or(global_config.host_concurrency) } /// Get the effective request interval, falling back to the global default #[must_use] pub fn effective_request_interval(&self, global_config: &RateLimitConfig) -> Duration { self.request_interval - .unwrap_or(global_config.default_request_interval) + .unwrap_or(global_config.request_interval) } } @@ -137,8 +137,8 @@ mod tests { #[test] fn test_default_rate_limit_config() { let config = RateLimitConfig::default(); - assert_eq!(config.default_host_concurrency, 10); - assert_eq!(config.default_request_interval, Duration::from_millis(100)); + assert_eq!(config.host_concurrency, 10); + assert_eq!(config.request_interval, Duration::from_millis(100)); } #[test] @@ -169,21 +169,15 @@ mod tests { #[test] fn test_config_serialization() { let config = RateLimitConfig { - default_host_concurrency: 15, - default_request_interval: Duration::from_millis(200), + host_concurrency: 15, + request_interval: Duration::from_millis(200), }; let toml = toml::to_string(&config).unwrap(); let deserialized: RateLimitConfig = toml::from_str(&toml).unwrap(); - assert_eq!( - config.default_host_concurrency, - deserialized.default_host_concurrency - ); - assert_eq!( - config.default_request_interval, - deserialized.default_request_interval - ); + 
assert_eq!(config.host_concurrency, deserialized.host_concurrency); + assert_eq!(config.request_interval, deserialized.request_interval); } #[test] diff --git a/lychee-lib/src/ratelimit/host/host.rs b/lychee-lib/src/ratelimit/host/host.rs index a6a2a213ba..98f4d176d6 100644 --- a/lychee-lib/src/ratelimit/host/host.rs +++ b/lychee-lib/src/ratelimit/host/host.rs @@ -108,7 +108,6 @@ impl Host { timeout: Option, allow_insecure: bool, ) -> Result { - // Configure rate limiter with effective request interval let interval = host_config.effective_request_interval(global_config); let quota = Quota::with_period(interval) .ok_or_else(|| RateLimitError::HeaderParseError { From 5960946ddc803a33efe6b0537a519390ea05ea9c Mon Sep 17 00:00:00 2001 From: Matthias Date: Mon, 22 Sep 2025 15:19:56 +0200 Subject: [PATCH 18/43] Fix `help` formatting --- README.md | 43 +++++++++++++++++++++++---------------- lychee-bin/src/options.rs | 11 ++++++---- 2 files changed, 33 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index c4cad28f3b..2c114e790d 100644 --- a/README.md +++ b/README.md @@ -423,25 +423,12 @@ Options: --default-extension This is the default file extension that is applied to files without an extension. - This is useful for files without extensions or with unknown extensions. The extension will be used to determine the file type for processing. Examples: --default-extension md, --default-extension html - - --host-concurrency - Default maximum concurrent requests per host (default: 10) - - This limits how many requests can be sent simultaneously to the same host (domain/subdomain). This helps prevent overwhelming servers and getting rate-limited. Each host is handled independently. + This is useful for files without extensions or with unknown extensions. + The extension will be used to determine the file type for processing. 
Examples: - • --host-concurrency 5 # Conservative for slow APIs - • --host-concurrency 20 # Aggressive for fast APIs - - --request-interval - Minimum interval between requests to the same host (default: 100ms) - - Sets a baseline delay between consecutive requests to prevent hammering servers. The adaptive algorithm may increase this based on server responses (rate limits, errors). - - Examples: - • --request-interval 50ms # Fast for robust APIs - • --request-interval 1s # Conservative for rate-limited APIs + --default-extension md + --default-extension html --dump Don't perform any link checking. Instead, dump all the links extracted from inputs that would be checked @@ -541,6 +528,17 @@ Options: --hidden Do not skip hidden directories and files + --host-concurrency + Default maximum concurrent requests per host (default: 10) + + This limits how many requests can be sent simultaneously to the same + host (domain/subdomain). This helps prevent overwhelming servers and + getting rate-limited. Each host is handled independently. + + Examples: + --host-concurrency 5 # Conservative for slow APIs + --host-concurrency 20 # Aggressive for fast APIs + --host-stats Show per-host statistics at the end of the run @@ -665,6 +663,17 @@ Options: --remap Remap URI matching pattern to different URI + --request-interval + Minimum interval between requests to the same host (default: 100ms) + + Sets a baseline delay between consecutive requests to prevent + hammering servers. The adaptive algorithm may increase this based + on server responses (rate limits, errors). + + Examples: + --request-interval 50ms # Fast for robust APIs + --request-interval 1s # Conservative for rate-limited APIs + --require-https When HTTPS is available, treat HTTP links as errors diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index 4fee825059..771a5f6bd7 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -450,8 +450,11 @@ specify both extensions explicitly." 
/// /// This is useful for files without extensions or with unknown extensions. /// The extension will be used to determine the file type for processing. - /// Examples: --default-extension md, --default-extension html - #[arg(long, value_name = "EXTENSION")] + /// + /// Examples: + /// --default-extension md + /// --default-extension html + #[arg(long, value_name = "EXTENSION", verbatim_doc_comment)] #[serde(default)] pub(crate) default_extension: Option, @@ -543,7 +546,7 @@ with a status code of 429, 500 and 501." /// Examples: /// --host-concurrency 5 # Conservative for slow APIs /// --host-concurrency 20 # Aggressive for fast APIs - #[arg(long = "host-concurrency")] + #[arg(long = "host-concurrency", verbatim_doc_comment)] #[serde(default)] pub(crate) host_concurrency: Option, @@ -556,7 +559,7 @@ with a status code of 429, 500 and 501." /// Examples: /// --request-interval 50ms # Fast for robust APIs /// --request-interval 1s # Conservative for rate-limited APIs - #[arg(long = "request-interval", value_parser = humantime::parse_duration)] + #[arg(long = "request-interval", value_parser = humantime::parse_duration, verbatim_doc_comment)] #[serde(default)] pub(crate) request_interval: Option, From ea22e446ff297a3761e4df5dc37a99c0bf5aca56 Mon Sep 17 00:00:00 2001 From: Thomas Zahner Date: Fri, 3 Oct 2025 10:15:05 +0200 Subject: [PATCH 19/43] Reduce code duplication --- lychee-bin/src/client.rs | 39 ++++++++++++-------------------- lychee-lib/src/ratelimit/pool.rs | 29 ++++-------------------- 2 files changed, 19 insertions(+), 49 deletions(-) diff --git a/lychee-bin/src/client.rs b/lychee-bin/src/client.rs index a9214e4d95..665789e2e7 100644 --- a/lychee-bin/src/client.rs +++ b/lychee-bin/src/client.rs @@ -44,30 +44,21 @@ pub(crate) fn create(cfg: &Config, cookie_jar: Option<&Arc>) - let rate_limit_config = RateLimitConfig::from_options(cfg.host_concurrency, cfg.request_interval); let cache_max_age = if cfg.cache { 3600 } else { 0 }; // 1 hour if caching enabled, 
disabled otherwise - let host_pool = if let Some(cookie_jar) = cookie_jar { - HostPool::with_cookie_jar( - rate_limit_config, - cfg.hosts.clone(), - cfg.max_concurrency, - cache_max_age, - combined_headers.clone(), - cfg.max_redirects, - Some(timeout), - cfg.insecure, - cookie_jar.clone(), - ) - } else { - HostPool::new( - rate_limit_config, - cfg.hosts.clone(), - cfg.max_concurrency, - cache_max_age, - combined_headers, - cfg.max_redirects, - Some(timeout), - cfg.insecure, - ) - }; + + let mut host_pool = HostPool::new( + rate_limit_config, + cfg.hosts.clone(), + cfg.max_concurrency, + cache_max_age, + combined_headers, + cfg.max_redirects, + Some(timeout), + cfg.insecure, + ); + + if let Some(cookie_jar) = cookie_jar { + host_pool = host_pool.with_cookie_jar(cookie_jar.clone()); + } ClientBuilder::builder() .remaps(remaps) diff --git a/lychee-lib/src/ratelimit/pool.rs b/lychee-lib/src/ratelimit/pool.rs index 6df6ba664b..ac2f471a29 100644 --- a/lychee-lib/src/ratelimit/pool.rs +++ b/lychee-lib/src/ratelimit/pool.rs @@ -109,32 +109,11 @@ impl HostPool { } } - /// Create a new `HostPool` with a shared cookie jar + /// Add a shared cookie jar to the `HostPool` #[must_use] - #[allow(clippy::too_many_arguments)] - pub fn with_cookie_jar( - global_config: RateLimitConfig, - host_configs: HashMap, - max_total_concurrency: usize, - cache_max_age: u64, - global_headers: HeaderMap, - max_redirects: usize, - timeout: Option, - allow_insecure: bool, - cookie_jar: Arc, - ) -> Self { - Self { - hosts: Arc::new(DashMap::new()), - global_config: Arc::new(global_config), - host_configs: Arc::new(host_configs), - global_semaphore: Arc::new(Semaphore::new(max_total_concurrency)), - cache_max_age, - cookie_jar: Some(cookie_jar), - global_headers, - max_redirects, - timeout, - allow_insecure, - } + pub fn with_cookie_jar(mut self, cookie_jar: Arc) -> Self { + self.cookie_jar = Some(cookie_jar); + self } /// Execute an HTTP request with appropriate per-host rate limiting From 
d33650dcb347a627fb8cf1c8bc3371ae913908e7 Mon Sep 17 00:00:00 2001 From: Thomas Zahner Date: Fri, 21 Nov 2025 13:13:31 +0100 Subject: [PATCH 20/43] Update documentation to reference hosts option --- README.md | 14 +++++++++----- lychee-bin/src/options.rs | 16 ++++++++++------ 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 2c114e790d..ac04b244d7 100644 --- a/README.md +++ b/README.md @@ -524,6 +524,8 @@ Options: You can specify custom headers in the format 'Name: Value'. For example, 'Accept: text/html'. This is the same format that other tools like curl or wget use. Multiple headers can be specified by using the flag multiple times. + The specified headers are used for ALL requests. + Use the `hosts` option to configure headers on a per-host basis. --hidden Do not skip hidden directories and files @@ -531,12 +533,13 @@ Options: --host-concurrency Default maximum concurrent requests per host (default: 10) - This limits how many requests can be sent simultaneously to the same - host (domain/subdomain). This helps prevent overwhelming servers and - getting rate-limited. Each host is handled independently. + This limits the maximum amount of requests that are sent simultaneously + to the same host. This helps to prevent overwhelming servers and + running into rate-limits. Use the `hosts` option to configure this + on a per-host basis. Examples: - --host-concurrency 5 # Conservative for slow APIs + --host-concurrency 2 # Conservative for slow APIs --host-concurrency 20 # Aggressive for fast APIs --host-stats @@ -668,7 +671,8 @@ Options: Sets a baseline delay between consecutive requests to prevent hammering servers. The adaptive algorithm may increase this based - on server responses (rate limits, errors). + on server responses (rate limits, errors). Use the `hosts` option + to configure this on a per-host basis. 
Examples: --request-interval 50ms # Fast for robust APIs diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index 771a5f6bd7..bc9c4c3445 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -539,12 +539,13 @@ with a status code of 429, 500 and 501." /// Default maximum concurrent requests per host (default: 10) /// - /// This limits how many requests can be sent simultaneously to the same - /// host (domain/subdomain). This helps prevent overwhelming servers and - /// getting rate-limited. Each host is handled independently. + /// This limits the maximum amount of requests that are sent simultaneously + /// to the same host. This helps to prevent overwhelming servers and + /// running into rate-limits. Use the `hosts` option to configure this + /// on a per-host basis. /// /// Examples: - /// --host-concurrency 5 # Conservative for slow APIs + /// --host-concurrency 2 # Conservative for slow APIs /// --host-concurrency 20 # Aggressive for fast APIs #[arg(long = "host-concurrency", verbatim_doc_comment)] #[serde(default)] @@ -554,7 +555,8 @@ with a status code of 429, 500 and 501." /// /// Sets a baseline delay between consecutive requests to prevent /// hammering servers. The adaptive algorithm may increase this based - /// on server responses (rate limits, errors). + /// on server responses (rate limits, errors). Use the `hosts` option + /// to configure this on a per-host basis. /// /// Examples: /// --request-interval 50ms # Fast for robust APIs @@ -712,7 +714,9 @@ Note: This option only takes effect on `file://` URIs which exist and point to a Some websites require custom headers to be passed in order to return valid responses. You can specify custom headers in the format 'Name: Value'. For example, 'Accept: text/html'. This is the same format that other tools like curl or wget use. -Multiple headers can be specified by using the flag multiple times." +Multiple headers can be specified by using the flag multiple times. 
+The specified headers are used for ALL requests. +Use the `hosts` option to configure headers on a per-host basis." )] #[serde(default)] #[serde(deserialize_with = "deserialize_headers")] From dbada0dacfe82cc34f93a07d1022597b81c38ab5 Mon Sep 17 00:00:00 2001 From: Thomas Zahner Date: Thu, 27 Nov 2025 14:04:38 +0100 Subject: [PATCH 21/43] clippy --fix --- lychee-bin/src/commands/check.rs | 16 ++++---- lychee-lib/src/ratelimit/host/host.rs | 56 +++++++++++++------------- lychee-lib/src/ratelimit/host/stats.rs | 2 +- lychee-lib/src/ratelimit/pool.rs | 12 +++--- 4 files changed, 42 insertions(+), 44 deletions(-) diff --git a/lychee-bin/src/commands/check.rs b/lychee-bin/src/commands/check.rs index 1f287690d7..06d53dc485 100644 --- a/lychee-bin/src/commands/check.rs +++ b/lychee-bin/src/commands/check.rs @@ -265,20 +265,20 @@ async fn handle( }; // Track cache hit in the per-host stats (only for network URIs) - if !uri.is_file() { - if let Err(e) = client.record_cache_hit(&uri) { - log::debug!("Failed to record cache hit for {uri}: {e}"); - } + if !uri.is_file() + && let Err(e) = client.record_cache_hit(&uri) + { + log::debug!("Failed to record cache hit for {uri}: {e}"); } return Ok(Response::new(uri.clone(), status, request.source.into())); } // Cache miss - track it and run a normal check (only for network URIs) - if !uri.is_file() { - if let Err(e) = client.record_cache_miss(&uri) { - log::debug!("Failed to record cache miss for {uri}: {e}"); - } + if !uri.is_file() + && let Err(e) = client.record_cache_miss(&uri) + { + log::debug!("Failed to record cache miss for {uri}: {e}"); } let response = check_url(client, request).await; diff --git a/lychee-lib/src/ratelimit/host/host.rs b/lychee-lib/src/ratelimit/host/host.rs index 98f4d176d6..32b091b468 100644 --- a/lychee-lib/src/ratelimit/host/host.rs +++ b/lychee-lib/src/ratelimit/host/host.rs @@ -364,33 +364,32 @@ impl Host { &["x-ratelimit-limit", "x-rate-limit-limit", "ratelimit-limit"], ); - if let (Some(remaining), 
Some(limit)) = (remaining, limit) { - if limit > 0 { - #[allow(clippy::cast_precision_loss)] - let usage_ratio = (limit - remaining) as f64 / limit as f64; - - // If we've used more than 80% of our quota, apply preventive backoff - if usage_ratio > 0.8 { - let mut backoff = self.backoff_duration.lock().unwrap(); - #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)] - let preventive_backoff = - Duration::from_millis((200.0 * (usage_ratio - 0.8) / 0.2) as u64); - *backoff = std::cmp::max(*backoff, preventive_backoff); - } + if let (Some(remaining), Some(limit)) = (remaining, limit) + && limit > 0 + { + #[allow(clippy::cast_precision_loss)] + let usage_ratio = (limit - remaining) as f64 / limit as f64; + + // If we've used more than 80% of our quota, apply preventive backoff + if usage_ratio > 0.8 { + let mut backoff = self.backoff_duration.lock().unwrap(); + #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)] + let preventive_backoff = + Duration::from_millis((200.0 * (usage_ratio - 0.8) / 0.2) as u64); + *backoff = std::cmp::max(*backoff, preventive_backoff); } } // Check for Retry-After header (in seconds) - if let Some(retry_after_value) = headers.get("retry-after") { - if let Ok(retry_after_str) = retry_after_value.to_str() { - if let Ok(retry_seconds) = retry_after_str.parse::() { - let mut backoff = self.backoff_duration.lock().unwrap(); - let retry_duration = Duration::from_secs(retry_seconds); - // Cap retry-after to reasonable limits - if retry_duration <= Duration::from_secs(3600) { - *backoff = std::cmp::max(*backoff, retry_duration); - } - } + if let Some(retry_after_value) = headers.get("retry-after") + && let Ok(retry_after_str) = retry_after_value.to_str() + && let Ok(retry_seconds) = retry_after_str.parse::() + { + let mut backoff = self.backoff_duration.lock().unwrap(); + let retry_duration = Duration::from_secs(retry_seconds); + // Cap retry-after to reasonable limits + if retry_duration <= Duration::from_secs(3600) { + 
*backoff = std::cmp::max(*backoff, retry_duration); } } } @@ -398,12 +397,11 @@ impl Host { /// Helper method to parse numeric header values from common rate limit headers fn parse_header_value(headers: &http::HeaderMap, header_names: &[&str]) -> Option { for header_name in header_names { - if let Some(value) = headers.get(*header_name) { - if let Ok(value_str) = value.to_str() { - if let Ok(number) = value_str.parse::() { - return Some(number); - } - } + if let Some(value) = headers.get(*header_name) + && let Ok(value_str) = value.to_str() + && let Ok(number) = value_str.parse::() + { + return Some(number); } } None diff --git a/lychee-lib/src/ratelimit/host/stats.rs b/lychee-lib/src/ratelimit/host/stats.rs index 4a11575c79..a399b769c5 100644 --- a/lychee-lib/src/ratelimit/host/stats.rs +++ b/lychee-lib/src/ratelimit/host/stats.rs @@ -81,7 +81,7 @@ impl HostStats { times.sort(); let mid = times.len() / 2; - if times.len() % 2 == 0 { + if times.len().is_multiple_of(2) { // Average of two middle values Some((times[mid - 1] + times[mid]) / 2) } else { diff --git a/lychee-lib/src/ratelimit/pool.rs b/lychee-lib/src/ratelimit/pool.rs index ac2f471a29..c8dd458b37 100644 --- a/lychee-lib/src/ratelimit/pool.rs +++ b/lychee-lib/src/ratelimit/pool.rs @@ -325,13 +325,13 @@ impl HostPool { /// * `uri` - The URI to cache /// * `status` - The status to cache pub fn cache_result(&self, uri: &Uri, status: &Status) { - if let Ok(host_key) = HostKey::try_from(uri) { - if let Some(host) = self.hosts.get(&host_key) { - host.cache_result(uri, status); - } - // If host doesn't exist yet, we don't cache - // The result will be cached when the host is created and the request is made + if let Ok(host_key) = HostKey::try_from(uri) + && let Some(host) = self.hosts.get(&host_key) + { + host.cache_result(uri, status); } + // If host doesn't exist yet, we don't cache + // The result will be cached when the host is created and the request is made } /// Get cache statistics across all hosts From 
3e0755ced8ba9d49f5af020d9fc3d9422e7429c1 Mon Sep 17 00:00:00 2001 From: Thomas Zahner Date: Thu, 27 Nov 2025 18:49:46 +0100 Subject: [PATCH 22/43] Return HostPool instead of Client & code cleanup Includes removing boilerplate code and removing instances of Arc. --- lychee-bin/src/commands/check.rs | 55 +++++++++++++++++-------------- lychee-bin/src/commands/mod.rs | 3 +- lychee-bin/src/host_stats.rs | 8 +++-- lychee-bin/src/main.rs | 6 ++-- lychee-lib/src/checker/website.rs | 50 ++++------------------------ lychee-lib/src/client.rs | 43 ++++-------------------- 6 files changed, 52 insertions(+), 113 deletions(-) diff --git a/lychee-bin/src/commands/check.rs b/lychee-bin/src/commands/check.rs index 06d53dc485..d1735b3cd4 100644 --- a/lychee-bin/src/commands/check.rs +++ b/lychee-bin/src/commands/check.rs @@ -1,8 +1,9 @@ use std::collections::HashSet; -use std::sync::{Arc, Mutex}; +use std::sync::Mutex; use std::time::Duration; use futures::StreamExt; +use lychee_lib::ratelimit::HostPool; use reqwest::Url; use tokio::sync::mpsc; use tokio_stream::wrappers::ReceiverStream; @@ -24,7 +25,7 @@ use super::CommandParams; pub(crate) async fn check( params: CommandParams, -) -> Result<(ResponseStats, Arc, ExitCode, Client), ErrorKind> +) -> Result<(ResponseStats, Cache, ExitCode, Option), ErrorKind> where S: futures::Stream>, { @@ -41,10 +42,8 @@ where } else { ResponseStats::default() }; - let cache_ref = params.cache.clone(); let client = params.client; - let client_for_return = client.clone(); let cache = params.cache; let cache_exclude_status = params .cfg @@ -53,17 +52,6 @@ where .into_set(); let accept = params.cfg.accept.into(); - // Start receiving requests - tokio::spawn(request_channel_task( - recv_req, - send_resp, - max_concurrency, - client, - cache, - cache_exclude_status, - accept, - )); - let hide_bar = params.cfg.no_progress; let detailed = params.cfg.verbose.log_level() >= log::Level::Info; @@ -75,8 +63,20 @@ where stats, )); - // Wait until all messages 
are sent - send_inputs_loop(params.requests, send_req, &progress).await?; + // Wait until all requests are sent + send_requests(params.requests, send_req, &progress).await?; + + // Start receiving requests + let (cache, client) = tokio::spawn(request_channel_task( + recv_req, + send_resp, + max_concurrency, + client, + cache, + cache_exclude_status, + accept, + )) + .await?; // Wait until all responses are received let result = show_results_task.await?; @@ -104,7 +104,8 @@ where } else { ExitCode::LinkCheckFailure }; - Ok((stats, cache_ref, code, client_for_return)) + + Ok((stats, cache, code, client.host_pool())) } async fn suggest_archived_links( @@ -144,7 +145,7 @@ async fn suggest_archived_links( // drops the `send_req` channel on exit // required for the receiver task to end, which closes send_resp, which allows // the show_results_task to finish -async fn send_inputs_loop( +async fn send_requests( requests: S, send_req: mpsc::Sender>, progress: &Progress, @@ -181,17 +182,17 @@ async fn request_channel_task( send_resp: mpsc::Sender>, max_concurrency: usize, client: Client, - cache: Arc, + cache: Cache, cache_exclude_status: HashSet, accept: HashSet, -) { +) -> (Cache, Client) { StreamExt::for_each_concurrent( ReceiverStream::new(recv_req), max_concurrency, |request: Result| async { let response = handle( &client, - cache.clone(), + &cache, cache_exclude_status.clone(), request, accept.clone(), @@ -205,6 +206,8 @@ async fn request_channel_task( }, ) .await; + + (cache, client) } /// Check a URL and return a response. @@ -236,7 +239,7 @@ async fn check_url(client: &Client, request: Request) -> Response { /// a failed response. 
async fn handle( client: &Client, - cache: Arc, + cache: &Cache, cache_exclude_status: HashSet, request: Result, accept: HashSet, @@ -266,7 +269,8 @@ async fn handle( // Track cache hit in the per-host stats (only for network URIs) if !uri.is_file() - && let Err(e) = client.record_cache_hit(&uri) + && let Some(pool) = client.host_pool_ref() + && let Err(e) = pool.record_cache_hit(&uri) { log::debug!("Failed to record cache hit for {uri}: {e}"); } @@ -276,7 +280,8 @@ async fn handle( // Cache miss - track it and run a normal check (only for network URIs) if !uri.is_file() - && let Err(e) = client.record_cache_miss(&uri) + && let Some(pool) = client.host_pool_ref() + && let Err(e) = pool.record_cache_miss(&uri) { log::debug!("Failed to record cache miss for {uri}: {e}"); } diff --git a/lychee-bin/src/commands/mod.rs b/lychee-bin/src/commands/mod.rs index 5b2c6f62db..38892e98e3 100644 --- a/lychee-bin/src/commands/mod.rs +++ b/lychee-bin/src/commands/mod.rs @@ -10,7 +10,6 @@ pub(crate) use dump_inputs::dump_inputs; use std::fs; use std::io::{self, Write}; use std::path::PathBuf; -use std::sync::Arc; use crate::cache::Cache; use crate::options::Config; @@ -20,7 +19,7 @@ use lychee_lib::{Client, Request}; /// Parameters passed to every command pub(crate) struct CommandParams>> { pub(crate) client: Client, - pub(crate) cache: Arc, + pub(crate) cache: Cache, pub(crate) requests: S, pub(crate) cfg: Config, } diff --git a/lychee-bin/src/host_stats.rs b/lychee-bin/src/host_stats.rs index 981d6eee8e..aea12a753c 100644 --- a/lychee-bin/src/host_stats.rs +++ b/lychee-bin/src/host_stats.rs @@ -1,17 +1,21 @@ +use std::collections::HashMap; + use anyhow::{Context, Result}; +use lychee_lib::ratelimit::HostPool; use crate::{formatters::get_host_stats_formatter, options::Config}; /// Display per-host statistics if requested pub(crate) fn display_per_host_statistics( - client: &lychee_lib::Client, + host_pool: Option<&HostPool>, config: &Config, ) -> Result<()> { if !config.host_stats 
{ return Ok(()); } - let host_stats = client.host_stats(); + let host_stats = host_pool.map_or_else(HashMap::default, HostPool::all_host_stats); + let host_stats_formatter = get_host_stats_formatter(&config.format, &config.mode); if let Some(formatted_host_stats) = host_stats_formatter.format(host_stats)? { diff --git a/lychee-bin/src/main.rs b/lychee-bin/src/main.rs index 69ac22c818..38a2535f6f 100644 --- a/lychee-bin/src/main.rs +++ b/lychee-bin/src/main.rs @@ -61,7 +61,6 @@ use std::fs::{self, File}; use std::io::{self, BufRead, BufReader, ErrorKind, Write}; use std::path::PathBuf; -use std::sync::Arc; use anyhow::{Context, Error, Result, bail}; use clap::{Parser, crate_version}; @@ -370,7 +369,6 @@ async fn run(opts: &LycheeOptions) -> Result { let requests = collector.collect_links_from_file_types(inputs, opts.config.extensions.clone()); let cache = load_cache(&opts.config).unwrap_or_default(); - let cache = Arc::new(cache); let cookie_jar = load_cookie_jar(&opts.config).with_context(|| { format!( @@ -394,7 +392,7 @@ async fn run(opts: &LycheeOptions) -> Result { let exit_code = if opts.config.dump { commands::dump(params).await? 
} else { - let (stats, cache, exit_code, client) = commands::check(params).await?; + let (stats, cache, exit_code, host_pool) = commands::check(params).await?; let github_issues = stats .error_map @@ -423,7 +421,7 @@ async fn run(opts: &LycheeOptions) -> Result { } // Display per-host statistics if requested - display_per_host_statistics(&client, &opts.config)?; + display_per_host_statistics(host_pool.as_ref(), &opts.config)?; if github_issues && opts.config.github_token.is_none() { warn!( diff --git a/lychee-lib/src/checker/website.rs b/lychee-lib/src/checker/website.rs index 97054ed88b..41cc19082b 100644 --- a/lychee-lib/src/checker/website.rs +++ b/lychee-lib/src/checker/website.rs @@ -11,11 +11,7 @@ use async_trait::async_trait; use http::{Method, StatusCode}; use octocrab::Octocrab; use reqwest::{Request, Response, header::CONTENT_TYPE}; -use std::{ - collections::{HashMap, HashSet}, - path::Path, - time::Duration, -}; +use std::{collections::HashSet, path::Path, time::Duration}; use url::Url; #[derive(Debug, Clone)] @@ -68,50 +64,16 @@ pub(crate) struct WebsiteChecker { } impl WebsiteChecker { - /// Get per-host statistics from the rate limiting system - /// - /// Returns a map of hostnames to their statistics, or an empty map - /// if host-based rate limiting is not enabled. + /// Get a reference to `HostPool` #[must_use] - pub(crate) fn host_stats(&self) -> HashMap { - self.host_pool - .as_ref() - .map_or_else(HashMap::default, HostPool::all_host_stats) + pub(crate) const fn host_pool_ref(&self) -> Option<&HostPool> { + self.host_pool.as_ref() } - /// Get cache statistics for all hosts - /// - /// Returns a map of hostnames to (`cache_size`, `hit_rate`), or an empty map - /// if host-based rate limiting is not enabled. 
+ /// Get `HostPool` #[must_use] - pub(crate) fn cache_stats(&self) -> HashMap { + pub(crate) fn host_pool(self) -> Option { self.host_pool - .as_ref() - .map_or_else(HashMap::default, HostPool::cache_stats) - } - - /// Record a cache hit for the given URI in the host statistics - /// - /// This tracks that a request was served from the persistent cache - /// rather than making a network request. - pub(crate) fn record_cache_hit(&self, uri: &crate::Uri) -> crate::Result<()> { - if let Some(host_pool) = &self.host_pool { - host_pool.record_cache_hit(uri).map_err(Into::into) - } else { - Ok(()) // No host pool, nothing to track - } - } - - /// Record a cache miss for the given URI in the host statistics - /// - /// This tracks that a request could not be served from the persistent cache - /// and will require a network request (which may then use the in-memory cache). - pub(crate) fn record_cache_miss(&self, uri: &crate::Uri) -> crate::Result<()> { - if let Some(host_pool) = &self.host_pool { - host_pool.record_cache_miss(uri).map_err(Into::into) - } else { - Ok(()) // No host pool, nothing to track - } } #[allow(clippy::too_many_arguments)] diff --git a/lychee-lib/src/client.rs b/lychee-lib/src/client.rs index 560182cd56..bd3d332543 100644 --- a/lychee-lib/src/client.rs +++ b/lychee-lib/src/client.rs @@ -32,6 +32,7 @@ use crate::{ chain::RequestChain, checker::{file::FileChecker, mail::MailChecker, website::WebsiteChecker}, filter::Filter, + ratelimit::HostPool, remap::Remaps, types::{DEFAULT_ACCEPTED_STATUS_CODES, redirect_history::RedirectHistory}, }; @@ -474,46 +475,16 @@ pub struct Client { } impl Client { - /// Get per-host statistics from the rate limiting system - /// - /// Returns a map of hostnames to their statistics, or an empty map - /// if host-based rate limiting is not enabled. 
+ /// Get a reference to `HostPool` #[must_use] - pub fn host_stats(&self) -> std::collections::HashMap { - self.website_checker.host_stats() + pub const fn host_pool_ref(&self) -> Option<&HostPool> { + self.website_checker.host_pool_ref() } - /// Get cache statistics for all hosts - /// - /// Returns a map of hostnames to (`cache_size`, `hit_rate`), or an empty map - /// if host-based rate limiting is not enabled. + /// Get `HostPool` #[must_use] - pub fn cache_stats(&self) -> std::collections::HashMap { - self.website_checker.cache_stats() - } - - /// Record a cache hit for the given URI - /// - /// This tracks that a request was served from cache rather than making - /// a network request. This is used for statistics tracking. - /// - /// # Errors - /// - /// Returns an error if the URI cannot be parsed or if host tracking fails. - pub fn record_cache_hit(&self, uri: &crate::Uri) -> crate::Result<()> { - self.website_checker.record_cache_hit(uri) - } - - /// Record a cache miss for the given URI - /// - /// This tracks that a request could not be served from cache and will - /// require a network request. This is used for statistics tracking. - /// - /// # Errors - /// - /// Returns an error if the URI cannot be parsed or if host tracking fails. - pub fn record_cache_miss(&self, uri: &crate::Uri) -> crate::Result<()> { - self.website_checker.record_cache_miss(uri) + pub fn host_pool(self) -> Option { + self.website_checker.host_pool() } /// Check a single request. 
From 0939004dfb8fc1b8c77c5a847e2febdc6d4c1fb6 Mon Sep 17 00:00:00 2001 From: Thomas Zahner Date: Fri, 28 Nov 2025 11:18:33 +0100 Subject: [PATCH 23/43] Move inner `Arc`s to the outside --- lychee-bin/src/client.rs | 2 +- lychee-bin/src/commands/check.rs | 3 ++- lychee-bin/src/main.rs | 2 +- lychee-lib/src/checker/website.rs | 10 +++++----- lychee-lib/src/client.rs | 6 +++--- lychee-lib/src/ratelimit/host/host.rs | 17 ++++++----------- lychee-lib/src/ratelimit/pool.rs | 22 +++++++++++----------- 7 files changed, 29 insertions(+), 33 deletions(-) diff --git a/lychee-bin/src/client.rs b/lychee-bin/src/client.rs index 665789e2e7..ebd4d99b10 100644 --- a/lychee-bin/src/client.rs +++ b/lychee-bin/src/client.rs @@ -87,7 +87,7 @@ pub(crate) fn create(cfg: &Config, cookie_jar: Option<&Arc>) - .include_fragments(cfg.include_fragments) .fallback_extensions(cfg.fallback_extensions.clone()) .index_files(cfg.index_files.clone()) - .host_pool(Some(host_pool)) + .host_pool(Some(Arc::new(host_pool))) .build() .client() .context("Failed to create request client") diff --git a/lychee-bin/src/commands/check.rs b/lychee-bin/src/commands/check.rs index d1735b3cd4..8462320d9f 100644 --- a/lychee-bin/src/commands/check.rs +++ b/lychee-bin/src/commands/check.rs @@ -1,4 +1,5 @@ use std::collections::HashSet; +use std::sync::Arc; use std::sync::Mutex; use std::time::Duration; @@ -25,7 +26,7 @@ use super::CommandParams; pub(crate) async fn check( params: CommandParams, -) -> Result<(ResponseStats, Cache, ExitCode, Option), ErrorKind> +) -> Result<(ResponseStats, Cache, ExitCode, Option>), ErrorKind> where S: futures::Stream>, { diff --git a/lychee-bin/src/main.rs b/lychee-bin/src/main.rs index 38a2535f6f..153a3c6148 100644 --- a/lychee-bin/src/main.rs +++ b/lychee-bin/src/main.rs @@ -421,7 +421,7 @@ async fn run(opts: &LycheeOptions) -> Result { } // Display per-host statistics if requested - display_per_host_statistics(host_pool.as_ref(), &opts.config)?; + 
display_per_host_statistics(host_pool.as_deref(), &opts.config)?; if github_issues && opts.config.github_token.is_none() { warn!( diff --git a/lychee-lib/src/checker/website.rs b/lychee-lib/src/checker/website.rs index 41cc19082b..6d69e00f37 100644 --- a/lychee-lib/src/checker/website.rs +++ b/lychee-lib/src/checker/website.rs @@ -11,7 +11,7 @@ use async_trait::async_trait; use http::{Method, StatusCode}; use octocrab::Octocrab; use reqwest::{Request, Response, header::CONTENT_TYPE}; -use std::{collections::HashSet, path::Path, time::Duration}; +use std::{collections::HashSet, path::Path, sync::Arc, time::Duration}; use url::Url; #[derive(Debug, Clone)] @@ -60,19 +60,19 @@ pub(crate) struct WebsiteChecker { /// /// When present, HTTP requests will be routed through this pool for /// rate limiting. When None, requests go directly through `reqwest_client`. - host_pool: Option, + host_pool: Option>, } impl WebsiteChecker { /// Get a reference to `HostPool` #[must_use] - pub(crate) const fn host_pool_ref(&self) -> Option<&HostPool> { + pub(crate) const fn host_pool_ref(&self) -> Option<&Arc> { self.host_pool.as_ref() } /// Get `HostPool` #[must_use] - pub(crate) fn host_pool(self) -> Option { + pub(crate) fn host_pool(self) -> Option> { self.host_pool } @@ -88,7 +88,7 @@ impl WebsiteChecker { require_https: bool, plugin_request_chain: RequestChain, include_fragments: bool, - host_pool: Option, + host_pool: Option>, ) -> Self { Self { method, diff --git a/lychee-lib/src/client.rs b/lychee-lib/src/client.rs index bd3d332543..262cd56694 100644 --- a/lychee-lib/src/client.rs +++ b/lychee-lib/src/client.rs @@ -310,7 +310,7 @@ pub struct ClientBuilder { /// /// When provided, HTTP/HTTPS requests will be routed through this pool /// for rate limiting and concurrency control on a per-host basis. 
- host_pool: Option, + host_pool: Option>, } impl Default for ClientBuilder { @@ -477,13 +477,13 @@ pub struct Client { impl Client { /// Get a reference to `HostPool` #[must_use] - pub const fn host_pool_ref(&self) -> Option<&HostPool> { + pub const fn host_pool_ref(&self) -> Option<&Arc> { self.website_checker.host_pool_ref() } /// Get `HostPool` #[must_use] - pub fn host_pool(self) -> Option { + pub fn host_pool(self) -> Option> { self.website_checker.host_pool() } diff --git a/lychee-lib/src/ratelimit/host/host.rs b/lychee-lib/src/ratelimit/host/host.rs index 32b091b468..828aea9ec7 100644 --- a/lychee-lib/src/ratelimit/host/host.rs +++ b/lychee-lib/src/ratelimit/host/host.rs @@ -52,20 +52,16 @@ pub struct Host { rate_limiter: RateLimiter, /// Controls maximum concurrent requests to this host - semaphore: Arc, + semaphore: Semaphore, /// HTTP client configured for this specific host client: ReqwestClient, - /// Cookie jar for maintaining session state (per-host) - #[allow(dead_code)] - cookie_jar: Arc, - /// Request statistics and adaptive behavior tracking - stats: Arc>, + stats: Mutex, /// Current backoff duration for adaptive rate limiting - backoff_duration: Arc>, + backoff_duration: Mutex, /// Per-host cache to prevent duplicate requests cache: HostCache, @@ -120,7 +116,7 @@ impl Host { // Create semaphore for concurrency control let max_concurrent = host_config.effective_max_concurrent(global_config); - let semaphore = Arc::new(Semaphore::new(max_concurrent)); + let semaphore = Semaphore::new(max_concurrent); // Use shared cookie jar if provided, otherwise create per-host one let cookie_jar = shared_cookie_jar.unwrap_or_else(|| Arc::new(CookieStoreMutex::default())); @@ -167,9 +163,8 @@ impl Host { rate_limiter, semaphore, client, - cookie_jar, - stats: Arc::new(Mutex::new(HostStats::default())), - backoff_duration: Arc::new(Mutex::new(Duration::from_millis(0))), + stats: Mutex::new(HostStats::default()), + backoff_duration: 
Mutex::new(Duration::from_millis(0)), cache: DashMap::new(), cache_max_age, }) diff --git a/lychee-lib/src/ratelimit/pool.rs b/lychee-lib/src/ratelimit/pool.rs index c8dd458b37..bd6ba4b6dd 100644 --- a/lychee-lib/src/ratelimit/pool.rs +++ b/lychee-lib/src/ratelimit/pool.rs @@ -23,19 +23,19 @@ use crate::{CacheStatus, Status, Uri}; /// - Global semaphore enforces overall concurrency limits across all hosts /// - Hosts are created lazily when first requested /// - Thread-safe using `DashMap` for concurrent access to host instances -#[derive(Debug, Clone)] +#[derive(Debug)] pub struct HostPool { /// Map of hostname to Host instances, created on-demand - hosts: Arc>>, + hosts: DashMap>, /// Global configuration for rate limiting defaults - global_config: Arc, + global_config: RateLimitConfig, /// Per-host configuration overrides - host_configs: Arc>, + host_configs: HashMap, /// Global semaphore to enforce overall concurrency limit - global_semaphore: Arc, + global_semaphore: Semaphore, /// Maximum age for cached entries in seconds (0 to disable caching) cache_max_age: u64, @@ -96,10 +96,10 @@ impl HostPool { allow_insecure: bool, ) -> Self { Self { - hosts: Arc::new(DashMap::new()), - global_config: Arc::new(global_config), - host_configs: Arc::new(host_configs), - global_semaphore: Arc::new(Semaphore::new(max_total_concurrency)), + hosts: DashMap::new(), + global_config, + host_configs, + global_semaphore: Semaphore::new(max_total_concurrency), cache_max_age, cookie_jar: None, global_headers, @@ -276,7 +276,7 @@ impl HostPool { /// This is useful for debugging or runtime monitoring of configuration. 
#[must_use] pub fn host_configurations(&self) -> HashMap { - (*self.host_configs).clone() + self.host_configs.clone() } /// Remove a host from the pool @@ -350,7 +350,7 @@ impl HostPool { /// Cleanup expired cache entries across all hosts pub fn cleanup_caches(&self) { - for host in self.hosts.iter() { + for host in &self.hosts { host.cleanup_cache(); } } From 087ed56e008c86b59aea94105a4cc800d170846c Mon Sep 17 00:00:00 2001 From: Thomas Zahner Date: Fri, 28 Nov 2025 12:36:55 +0100 Subject: [PATCH 24/43] Fix deadlock --- lychee-bin/src/commands/check.rs | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/lychee-bin/src/commands/check.rs b/lychee-bin/src/commands/check.rs index 8462320d9f..18b4120e7c 100644 --- a/lychee-bin/src/commands/check.rs +++ b/lychee-bin/src/commands/check.rs @@ -53,6 +53,17 @@ where .into_set(); let accept = params.cfg.accept.into(); + // Start receiving requests + let handle = tokio::spawn(request_channel_task( + recv_req, + send_resp, + max_concurrency, + client, + cache, + cache_exclude_status, + accept, + )); + let hide_bar = params.cfg.no_progress; let detailed = params.cfg.verbose.log_level() >= log::Level::Info; @@ -66,18 +77,7 @@ where // Wait until all requests are sent send_requests(params.requests, send_req, &progress).await?; - - // Start receiving requests - let (cache, client) = tokio::spawn(request_channel_task( - recv_req, - send_resp, - max_concurrency, - client, - cache, - cache_exclude_status, - accept, - )) - .await?; + let (cache, client) = handle.await?; // Wait until all responses are received let result = show_results_task.await?; From 040971dc97c3775cc72af16f31d05c69c7e0334f Mon Sep 17 00:00:00 2001 From: Thomas Zahner Date: Fri, 28 Nov 2025 13:51:14 +0100 Subject: [PATCH 25/43] Update config options --- lychee-bin/src/options.rs | 2 +- lychee-lib/src/ratelimit/config.rs | 1 + lychee.example.toml | 20 ++++++++++++++++++++ 3 files changed, 22 insertions(+), 1 deletion(-) diff 
--git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index bc9c4c3445..87ae2913a9 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -562,7 +562,7 @@ with a status code of 429, 500 and 501." /// --request-interval 50ms # Fast for robust APIs /// --request-interval 1s # Conservative for rate-limited APIs #[arg(long = "request-interval", value_parser = humantime::parse_duration, verbatim_doc_comment)] - #[serde(default)] + #[serde(default, with = "humantime_serde")] pub(crate) request_interval: Option, /// Number of threads to utilize. diff --git a/lychee-lib/src/ratelimit/config.rs b/lychee-lib/src/ratelimit/config.rs index 5993d0197b..806b7c41b4 100644 --- a/lychee-lib/src/ratelimit/config.rs +++ b/lychee-lib/src/ratelimit/config.rs @@ -41,6 +41,7 @@ impl RateLimitConfig { /// Configuration for a specific host's rate limiting behavior #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +#[serde(deny_unknown_fields)] pub struct HostConfig { /// Maximum concurrent requests allowed to this host pub max_concurrent: Option, diff --git a/lychee.example.toml b/lychee.example.toml index e8989b67b4..1a9e5645c3 100644 --- a/lychee.example.toml +++ b/lychee.example.toml @@ -16,6 +16,9 @@ no_progress = false # Path to summary output file. 
output = ".config.dummy.report.md" +# Show host statistics +host_stats = true + # Extract links instead of checking them dump = true @@ -200,3 +203,20 @@ archive = "wayback" # Search and suggest link replacements for all broken links suggest = true + +############################# Hosts ############################# + +# Maximum simultaneous requests to the same host +host_concurrency = 5 + +# Minimum interval between requests to the same host +request_interval = "200ms" + +# Customize hosts +[hosts."blog.example.com"] +# Overwrite `host_concurrency` for this host +max_concurrent = 5 +# Overwrite `request_interval` for this host +request_interval = "200ms" +# Combine global `header` values with the following `headers` for this host +headers = { "A" = "B" } From 328ee22b41ab164963b73befb6b9b28f6ce1effa Mon Sep 17 00:00:00 2001 From: Thomas Zahner Date: Tue, 9 Dec 2025 09:18:25 +0100 Subject: [PATCH 26/43] Simplify host pool Reuse reqwest client and make pool non-optional. Removed cache_max_age in the process as it didn't seem useful. 
--- lychee-bin/src/client.rs | 45 ++------ lychee-bin/src/commands/check.rs | 8 +- lychee-bin/src/host_stats.rs | 10 +- lychee-bin/src/main.rs | 2 +- lychee-lib/src/checker/website.rs | 38 +++---- lychee-lib/src/client.rs | 133 ++++++++++++---------- lychee-lib/src/ratelimit/host/host.rs | 157 ++------------------------ lychee-lib/src/ratelimit/mod.rs | 2 +- lychee-lib/src/ratelimit/pool.rs | 145 +++++++----------------- 9 files changed, 160 insertions(+), 380 deletions(-) diff --git a/lychee-bin/src/client.rs b/lychee-bin/src/client.rs index ebd4d99b10..9cfaf28ed6 100644 --- a/lychee-bin/src/client.rs +++ b/lychee-bin/src/client.rs @@ -2,10 +2,8 @@ use crate::options::{Config, HeaderMapExt}; use crate::parse::{parse_duration_secs, parse_remaps}; use anyhow::{Context, Result}; use http::{HeaderMap, StatusCode}; -use lychee_lib::{ - Client, ClientBuilder, - ratelimit::{HostPool, RateLimitConfig}, -}; +use lychee_lib::ratelimit::HostPoolConfig; +use lychee_lib::{Client, ClientBuilder, ratelimit::RateLimitConfig}; use regex::RegexSet; use reqwest_cookie_store::CookieStoreMutex; use std::sync::Arc; @@ -31,35 +29,6 @@ pub(crate) fn create(cfg: &Config, cookie_jar: Option<&Arc>) - let headers = HeaderMap::from_header_pairs(&cfg.header)?; - // Create combined headers for HostPool (includes User-Agent + custom headers) - let mut combined_headers = headers.clone(); - combined_headers.insert( - http::header::USER_AGENT, - cfg.user_agent - .parse() - .context("Invalid User-Agent header")?, - ); - - // Create HostPool for rate limiting - always enabled for HTTP requests - let rate_limit_config = - RateLimitConfig::from_options(cfg.host_concurrency, cfg.request_interval); - let cache_max_age = if cfg.cache { 3600 } else { 0 }; // 1 hour if caching enabled, disabled otherwise - - let mut host_pool = HostPool::new( - rate_limit_config, - cfg.hosts.clone(), - cfg.max_concurrency, - cache_max_age, - combined_headers, - cfg.max_redirects, - Some(timeout), - cfg.insecure, - ); - - if 
let Some(cookie_jar) = cookie_jar { - host_pool = host_pool.with_cookie_jar(cookie_jar.clone()); - } - ClientBuilder::builder() .remaps(remaps) .base(cfg.base_url.clone()) @@ -87,7 +56,15 @@ pub(crate) fn create(cfg: &Config, cookie_jar: Option<&Arc>) - .include_fragments(cfg.include_fragments) .fallback_extensions(cfg.fallback_extensions.clone()) .index_files(cfg.index_files.clone()) - .host_pool(Some(Arc::new(host_pool))) + .host_pool_config(HostPoolConfig { + // Create HostPool for rate limiting - always enabled for HTTP requests + rate_limit_config: RateLimitConfig::from_options( + cfg.host_concurrency, + cfg.request_interval, + ), + hosts: cfg.hosts.clone(), + max_concurrency: cfg.max_concurrency, + }) .build() .client() .context("Failed to create request client") diff --git a/lychee-bin/src/commands/check.rs b/lychee-bin/src/commands/check.rs index 18b4120e7c..cdfaf3c57a 100644 --- a/lychee-bin/src/commands/check.rs +++ b/lychee-bin/src/commands/check.rs @@ -26,7 +26,7 @@ use super::CommandParams; pub(crate) async fn check( params: CommandParams, -) -> Result<(ResponseStats, Cache, ExitCode, Option>), ErrorKind> +) -> Result<(ResponseStats, Cache, ExitCode, Arc), ErrorKind> where S: futures::Stream>, { @@ -270,8 +270,7 @@ async fn handle( // Track cache hit in the per-host stats (only for network URIs) if !uri.is_file() - && let Some(pool) = client.host_pool_ref() - && let Err(e) = pool.record_cache_hit(&uri) + && let Err(e) = client.host_pool().record_cache_hit(&uri) { log::debug!("Failed to record cache hit for {uri}: {e}"); } @@ -281,8 +280,7 @@ async fn handle( // Cache miss - track it and run a normal check (only for network URIs) if !uri.is_file() - && let Some(pool) = client.host_pool_ref() - && let Err(e) = pool.record_cache_miss(&uri) + && let Err(e) = client.host_pool().record_cache_miss(&uri) { log::debug!("Failed to record cache miss for {uri}: {e}"); } diff --git a/lychee-bin/src/host_stats.rs b/lychee-bin/src/host_stats.rs index 
aea12a753c..f073e9689d 100644 --- a/lychee-bin/src/host_stats.rs +++ b/lychee-bin/src/host_stats.rs @@ -1,21 +1,15 @@ -use std::collections::HashMap; - use anyhow::{Context, Result}; use lychee_lib::ratelimit::HostPool; use crate::{formatters::get_host_stats_formatter, options::Config}; /// Display per-host statistics if requested -pub(crate) fn display_per_host_statistics( - host_pool: Option<&HostPool>, - config: &Config, -) -> Result<()> { +pub(crate) fn display_per_host_statistics(host_pool: &HostPool, config: &Config) -> Result<()> { if !config.host_stats { return Ok(()); } - let host_stats = host_pool.map_or_else(HashMap::default, HostPool::all_host_stats); - + let host_stats = host_pool.all_host_stats(); let host_stats_formatter = get_host_stats_formatter(&config.format, &config.mode); if let Some(formatted_host_stats) = host_stats_formatter.format(host_stats)? { diff --git a/lychee-bin/src/main.rs b/lychee-bin/src/main.rs index 153a3c6148..38a2535f6f 100644 --- a/lychee-bin/src/main.rs +++ b/lychee-bin/src/main.rs @@ -421,7 +421,7 @@ async fn run(opts: &LycheeOptions) -> Result { } // Display per-host statistics if requested - display_per_host_statistics(host_pool.as_deref(), &opts.config)?; + display_per_host_statistics(host_pool.as_ref(), &opts.config)?; if github_issues && opts.config.github_token.is_none() { warn!( diff --git a/lychee-lib/src/checker/website.rs b/lychee-lib/src/checker/website.rs index 6d69e00f37..7b30d9c0f3 100644 --- a/lychee-lib/src/checker/website.rs +++ b/lychee-lib/src/checker/website.rs @@ -60,20 +60,14 @@ pub(crate) struct WebsiteChecker { /// /// When present, HTTP requests will be routed through this pool for /// rate limiting. When None, requests go directly through `reqwest_client`. 
- host_pool: Option>, + host_pool: Arc, } impl WebsiteChecker { /// Get a reference to `HostPool` #[must_use] - pub(crate) const fn host_pool_ref(&self) -> Option<&Arc> { - self.host_pool.as_ref() - } - - /// Get `HostPool` - #[must_use] - pub(crate) fn host_pool(self) -> Option> { - self.host_pool + pub(crate) fn host_pool(&self) -> Arc { + self.host_pool.clone() } #[allow(clippy::too_many_arguments)] @@ -88,7 +82,7 @@ impl WebsiteChecker { require_https: bool, plugin_request_chain: RequestChain, include_fragments: bool, - host_pool: Option>, + host_pool: Arc, ) -> Self { Self { method, @@ -130,21 +124,17 @@ impl WebsiteChecker { let method = request.method().clone(); let request_url = request.url().clone(); - // Use HostPool for rate limiting - always enabled for HTTP requests - let response_result = if let Some(host_pool) = &self.host_pool { - match host_pool.execute_request(request).await { - Ok(response) => Ok(response), - Err(crate::ratelimit::RateLimitError::NetworkError { source, .. }) => { - // Network errors should be handled the same as direct client errors - Err(source) - } - Err(e) => { - // Rate limiting specific errors - return Status::Error(ErrorKind::RateLimit(e)); - } + // Use HostPool for rate limiting + let response_result = match self.host_pool.execute_request(request).await { + Ok(response) => Ok(response), + Err(crate::ratelimit::RateLimitError::NetworkError { source, .. 
}) => { + // Network errors should be handled the same as direct client errors + Err(source) + } + Err(e) => { + // Rate limiting specific errors + return Status::Error(ErrorKind::RateLimit(e)); } - } else { - self.reqwest_client.execute(request).await }; match response_result { diff --git a/lychee-lib/src/client.rs b/lychee-lib/src/client.rs index 262cd56694..d054fdbff2 100644 --- a/lychee-lib/src/client.rs +++ b/lychee-lib/src/client.rs @@ -32,7 +32,7 @@ use crate::{ chain::RequestChain, checker::{file::FileChecker, mail::MailChecker, website::WebsiteChecker}, filter::Filter, - ratelimit::HostPool, + ratelimit::{HostPool, HostPoolConfig}, remap::Remaps, types::{DEFAULT_ACCEPTED_STATUS_CODES, redirect_history::RedirectHistory}, }; @@ -306,11 +306,9 @@ pub struct ClientBuilder { /// skipped and the lychee-internal request chain is not activated. plugin_request_chain: RequestChain, - /// Optional host pool for per-host rate limiting of HTTP requests. - /// - /// When provided, HTTP/HTTPS requests will be routed through this pool + /// When enabled, HTTP/HTTPS requests will be routed through this pool /// for rate limiting and concurrency control on a per-host basis. - host_pool: Option>, + host_pool_config: HostPoolConfig, } impl Default for ClientBuilder { @@ -336,53 +334,25 @@ impl ClientBuilder { /// /// [here]: https://docs.rs/reqwest/latest/reqwest/struct.ClientBuilder.html#errors pub fn client(self) -> Result { - let Self { - user_agent, - custom_headers: mut headers, - .. - } = self; - - if let Some(prev_user_agent) = - headers.insert(header::USER_AGENT, HeaderValue::try_from(&user_agent)?) - { - debug!( - "Found user-agent in headers: {}. 
Overriding it with {user_agent}.", - prev_user_agent.to_str().unwrap_or("ļæ½"), - ); - } - - headers.insert( - header::TRANSFER_ENCODING, - HeaderValue::from_static("chunked"), - ); - let redirect_history = RedirectHistory::new(); - - let mut builder = reqwest::ClientBuilder::new() - .gzip(true) - .default_headers(headers) - .danger_accept_invalid_certs(self.allow_insecure) - .connect_timeout(Duration::from_secs(CONNECT_TIMEOUT)) - .tcp_keepalive(Duration::from_secs(TCP_KEEPALIVE)) - .redirect(redirect_policy( - redirect_history.clone(), - self.max_redirects, - )); - - if let Some(cookie_jar) = self.cookie_jar { - builder = builder.cookie_provider(cookie_jar); - } - - if let Some(min_tls) = self.min_tls_version { - builder = builder.min_tls_version(min_tls); - } - - let reqwest_client = match self.timeout { - Some(t) => builder.timeout(t), - None => builder, - } - .build() - .map_err(ErrorKind::BuildRequestClient)?; + let reqwest_client = self + .reqwest_builder(&redirect_history)? + .build() + .map_err(ErrorKind::BuildRequestClient)?; + + // Create HostPool for rate limiting - always enabled for HTTP requests + let HostPoolConfig { + rate_limit_config, + hosts, + max_concurrency, + } = self.host_pool_config; + + let host_pool = HostPool::new( + rate_limit_config, + hosts, + max_concurrency, + reqwest_client.clone(), + ); let github_client = match self.github_token.as_ref().map(ExposeSecret::expose_secret) { Some(token) if !token.is_empty() => Some( @@ -419,7 +389,7 @@ impl ClientBuilder { self.require_https, self.plugin_request_chain, self.include_fragments, - self.host_pool, + Arc::new(host_pool), ); Ok(Client { @@ -435,6 +405,57 @@ impl ClientBuilder { ), }) } + + fn reqwest_builder( + &self, + redirect_history: &RedirectHistory, + ) -> Result { + let mut builder = reqwest::ClientBuilder::new() + .gzip(true) + .default_headers(self.headers()?) 
+ .danger_accept_invalid_certs(self.allow_insecure) + .connect_timeout(Duration::from_secs(CONNECT_TIMEOUT)) + .tcp_keepalive(Duration::from_secs(TCP_KEEPALIVE)) + .redirect(redirect_policy( + redirect_history.clone(), + self.max_redirects, + )); + + if let Some(cookie_jar) = self.cookie_jar.clone() { + builder = builder.cookie_provider(cookie_jar); + } + + if let Some(min_tls) = self.min_tls_version { + builder = builder.min_tls_version(min_tls); + } + + if let Some(timeout) = self.timeout { + builder = builder.timeout(timeout); + } + + Ok(builder) + } + + fn headers(&self) -> Result { + let user_agent = self.user_agent.clone(); + let mut headers = self.custom_headers.clone(); + + if let Some(prev_user_agent) = + headers.insert(header::USER_AGENT, HeaderValue::try_from(&user_agent)?) + { + debug!( + "Found user-agent in headers: {}. Overriding it with {user_agent}.", + prev_user_agent.to_str().unwrap_or("ļæ½"), + ); + } + + headers.insert( + header::TRANSFER_ENCODING, + HeaderValue::from_static("chunked"), + ); + + Ok(headers) + } } /// Create our custom [`redirect::Policy`] in order to stop following redirects @@ -475,15 +496,9 @@ pub struct Client { } impl Client { - /// Get a reference to `HostPool` - #[must_use] - pub const fn host_pool_ref(&self) -> Option<&Arc> { - self.website_checker.host_pool_ref() - } - /// Get `HostPool` #[must_use] - pub fn host_pool(self) -> Option> { + pub fn host_pool(&self) -> Arc { self.website_checker.host_pool() } diff --git a/lychee-lib/src/ratelimit/host/host.rs b/lychee-lib/src/ratelimit/host/host.rs index 828aea9ec7..870f594e50 100644 --- a/lychee-lib/src/ratelimit/host/host.rs +++ b/lychee-lib/src/ratelimit/host/host.rs @@ -4,9 +4,8 @@ use governor::{ clock::DefaultClock, state::{InMemoryState, NotKeyed}, }; -use reqwest::{Client as ReqwestClient, Request, Response, redirect}; -use reqwest_cookie_store::CookieStoreMutex; -use std::sync::{Arc, Mutex}; +use reqwest::{Client as ReqwestClient, Request, Response}; +use 
std::sync::Mutex; use std::time::{Duration, Instant}; use tokio::sync::Semaphore; @@ -19,14 +18,12 @@ use crate::{CacheStatus, Status, Uri}; #[derive(Debug, Clone)] struct HostCacheValue { status: CacheStatus, - timestamp: Instant, } impl From<&Status> for HostCacheValue { fn from(status: &Status) -> Self { Self { status: status.into(), - timestamp: Instant::now(), } } } @@ -65,26 +62,11 @@ pub struct Host { /// Per-host cache to prevent duplicate requests cache: HostCache, - - /// Maximum age for cached entries (in seconds) - cache_max_age: u64, } impl Host { /// Create a new Host instance for the given hostname /// - /// # Arguments - /// - /// * `key` - The hostname this host will manage - /// * `host_config` - Host-specific configuration - /// * `global_config` - Global defaults to fall back to - /// * `cache_max_age` - Maximum age for cached entries in seconds (0 to disable caching) - /// * `shared_cookie_jar` - Optional shared cookie jar to use instead of creating per-host jar - /// * `global_headers` - Global headers to be applied to all requests (User-Agent, custom headers, etc.) 
- /// * `max_redirects` - Maximum number of redirects to follow - /// * `timeout` - Request timeout - /// * `allow_insecure` - Whether to allow insecure certificates - /// /// # Errors /// /// Returns an error if the HTTP client cannot be configured properly @@ -97,12 +79,7 @@ impl Host { key: HostKey, host_config: &HostConfig, global_config: &RateLimitConfig, - cache_max_age: u64, - shared_cookie_jar: Option>, - global_headers: &http::HeaderMap, - max_redirects: usize, - timeout: Option, - allow_insecure: bool, + client: ReqwestClient, ) -> Result { let interval = host_config.effective_request_interval(global_config); let quota = Quota::with_period(interval) @@ -118,46 +95,6 @@ impl Host { let max_concurrent = host_config.effective_max_concurrent(global_config); let semaphore = Semaphore::new(max_concurrent); - // Use shared cookie jar if provided, otherwise create per-host one - let cookie_jar = shared_cookie_jar.unwrap_or_else(|| Arc::new(CookieStoreMutex::default())); - - // Combine global headers with host-specific headers - let mut combined_headers = global_headers.clone(); - for (name, value) in &host_config.headers { - combined_headers.insert(name, value.clone()); - } - - // Create custom redirect policy matching main client behavior - let redirect_policy = redirect::Policy::custom(move |attempt| { - if attempt.previous().len() > max_redirects { - attempt.error("too many redirects") - } else { - log::debug!("Redirecting to {}", attempt.url()); - attempt.follow() - } - }); - - // Build HTTP client with proper configuration - let mut builder = ReqwestClient::builder() - .cookie_provider(cookie_jar.clone()) - .default_headers(combined_headers) - .gzip(true) - .danger_accept_invalid_certs(allow_insecure) - .connect_timeout(Duration::from_secs(10)) // CONNECT_TIMEOUT constant - .tcp_keepalive(Duration::from_secs(60)) // TCP_KEEPALIVE constant - .redirect(redirect_policy); - - if let Some(timeout) = timeout { - builder = builder.timeout(timeout); - } - - let 
client = builder - .build() - .map_err(|e| RateLimitError::ClientConfigError { - host: key.to_string(), - source: e, - })?; - Ok(Host { key, rate_limiter, @@ -166,7 +103,6 @@ impl Host { stats: Mutex::new(HostStats::default()), backoff_duration: Mutex::new(Duration::from_millis(0)), cache: DashMap::new(), - cache_max_age, }) } @@ -176,23 +112,12 @@ impl Host { /// /// Panics if the statistics mutex is poisoned pub fn get_cached_status(&self, uri: &Uri) -> Option { - if self.cache_max_age == 0 { - // Track cache miss when caching is disabled - self.stats.lock().unwrap().record_cache_miss(); - return None; // Caching disabled - } - if let Some(entry) = self.cache.get(uri) { - let age = entry.timestamp.elapsed().as_secs(); - if age <= self.cache_max_age { - // Cache hit - self.stats.lock().unwrap().record_cache_hit(); - return Some(entry.status); - } - // Cache entry expired, remove it - drop(entry); - self.cache.remove(uri); + // Cache hit + self.stats.lock().unwrap().record_cache_hit(); + return Some(entry.status); } + // Cache miss self.stats.lock().unwrap().record_cache_miss(); None @@ -200,10 +125,8 @@ impl Host { /// Cache a request result pub fn cache_result(&self, uri: &Uri, status: &Status) { - if self.cache_max_age > 0 { - let cache_value = HostCacheValue::from(status); - self.cache.insert(uri.clone(), cache_value); - } + let cache_value = HostCacheValue::from(status); + self.cache.insert(uri.clone(), cache_value); } /// Execute a request with rate limiting, concurrency control, and caching @@ -438,23 +361,13 @@ impl Host { pub fn cache_size(&self) -> usize { self.cache.len() } - - /// Clear expired entries from the cache - pub fn cleanup_cache(&self) { - if self.cache_max_age == 0 { - return; - } - - self.cache - .retain(|_, value| value.timestamp.elapsed().as_secs() <= self.cache_max_age); - } } #[cfg(test)] mod tests { use super::*; use crate::ratelimit::{HostConfig, RateLimitConfig}; - use std::time::Duration; + use reqwest::Client; #[tokio::test] async 
fn test_host_creation() { @@ -462,59 +375,11 @@ mod tests { let host_config = HostConfig::default(); let global_config = RateLimitConfig::default(); - let host = Host::new( - key.clone(), - &host_config, - &global_config, - 3600, - None, - &http::HeaderMap::new(), - 5, - Some(std::time::Duration::from_secs(20)), - false, - ) - .unwrap(); + let host = Host::new(key.clone(), &host_config, &global_config, Client::default()).unwrap(); assert_eq!(host.key, key); assert_eq!(host.available_permits(), 10); // Default concurrency assert!((host.stats().success_rate() - 1.0).abs() < f64::EPSILON); assert_eq!(host.cache_size(), 0); } - - #[test] - fn test_cache_expiration() { - let key = HostKey::from("example.com"); - let host_config = HostConfig::default(); - let global_config = RateLimitConfig::default(); - - let host = Host::new( - key, - &host_config, - &global_config, - 1, - None, - &http::HeaderMap::new(), - 5, - Some(std::time::Duration::from_secs(20)), - false, - ) - .unwrap(); // 1 second cache - - let uri = Uri::from("https://example.com/test".parse::().unwrap()); - let status = Status::Ok(http::StatusCode::OK); - - // Cache the result - host.cache_result(&uri, &status); - assert_eq!(host.cache_size(), 1); - - // Should be in cache immediately - assert!(host.get_cached_status(&uri).is_some()); - - // Wait for expiration and cleanup - std::thread::sleep(Duration::from_secs(2)); - host.cleanup_cache(); - - // Should be expired now - assert!(host.get_cached_status(&uri).is_none()); - } } diff --git a/lychee-lib/src/ratelimit/mod.rs b/lychee-lib/src/ratelimit/mod.rs index eeccd90ba2..5ad1157703 100644 --- a/lychee-lib/src/ratelimit/mod.rs +++ b/lychee-lib/src/ratelimit/mod.rs @@ -22,5 +22,5 @@ mod window; pub use config::{HostConfig, RateLimitConfig}; pub use error::RateLimitError; pub use host::{Host, HostKey, HostStats}; -pub use pool::HostPool; +pub use pool::{HostPool, HostPoolConfig}; pub use window::Window; diff --git a/lychee-lib/src/ratelimit/pool.rs 
b/lychee-lib/src/ratelimit/pool.rs index bd6ba4b6dd..9fa17fe486 100644 --- a/lychee-lib/src/ratelimit/pool.rs +++ b/lychee-lib/src/ratelimit/pool.rs @@ -1,15 +1,33 @@ use dashmap::DashMap; -use http::HeaderMap; -use reqwest::{Request, Response}; -use reqwest_cookie_store::CookieStoreMutex; +use reqwest::{Client, Request, Response}; use std::collections::HashMap; use std::sync::Arc; -use std::time::Duration; use tokio::sync::Semaphore; use crate::ratelimit::{Host, HostConfig, HostKey, HostStats, RateLimitConfig, RateLimitError}; use crate::{CacheStatus, Status, Uri}; +/// TODO: Rename,move,refactor? +#[derive(Debug, Clone)] +pub struct HostPoolConfig { + /// TODO + pub rate_limit_config: RateLimitConfig, + /// TODO + pub hosts: HashMap, + /// TODO + pub max_concurrency: usize, +} + +impl Default for HostPoolConfig { + fn default() -> Self { + Self { + rate_limit_config: Default::default(), + hosts: Default::default(), + max_concurrency: 128, // TODO: expose/reuse DEFAULT_MAX_CONCURRENCY + } + } +} + /// Manages a pool of Host instances and routes requests to appropriate hosts. /// /// The `HostPool` serves as the central coordinator for per-host rate limiting. @@ -37,85 +55,28 @@ pub struct HostPool { /// Global semaphore to enforce overall concurrency limit global_semaphore: Semaphore, - /// Maximum age for cached entries in seconds (0 to disable caching) - cache_max_age: u64, - - /// Shared cookie jar used across all hosts - cookie_jar: Option>, - - /// Global headers to be applied to all requests (includes User-Agent, etc.) 
- global_headers: HeaderMap, - - /// Maximum number of redirects to follow - max_redirects: usize, - - /// Request timeout - timeout: Option, - - /// Whether to allow insecure certificates - allow_insecure: bool, + client: Client, } impl HostPool { /// Create a new `HostPool` with the given configuration - /// - /// # Arguments - /// - /// * `global_config` - Default rate limiting configuration - /// * `host_configs` - Host-specific configuration overrides - /// * `max_total_concurrency` - Global limit on concurrent requests across all hosts - /// * `cache_max_age` - Maximum age for cached entries in seconds (0 to disable caching) - /// * `global_headers` - Headers to be applied to all requests (User-Agent, custom headers, etc.) - /// * `max_redirects` - Maximum number of redirects to follow - /// * `timeout` - Request timeout - /// * `allow_insecure` - Whether to allow insecure certificates - /// - /// # Examples - /// - /// ``` - /// use lychee_lib::ratelimit::{HostPool, RateLimitConfig}; - /// use std::collections::HashMap; - /// use http::HeaderMap; - /// use std::time::Duration; - /// - /// let global_config = RateLimitConfig::default(); - /// let host_configs = HashMap::new(); - /// let global_headers = HeaderMap::new(); - /// let pool = HostPool::new(global_config, host_configs, 128, 3600, global_headers, 5, Some(Duration::from_secs(20)), false); - /// ``` #[must_use] #[allow(clippy::too_many_arguments)] pub fn new( global_config: RateLimitConfig, host_configs: HashMap, max_total_concurrency: usize, - cache_max_age: u64, - global_headers: HeaderMap, - max_redirects: usize, - timeout: Option, - allow_insecure: bool, + client: Client, ) -> Self { Self { hosts: DashMap::new(), global_config, host_configs, global_semaphore: Semaphore::new(max_total_concurrency), - cache_max_age, - cookie_jar: None, - global_headers, - max_redirects, - timeout, - allow_insecure, + client, } } - /// Add a shared cookie jar to the `HostPool` - #[must_use] - pub fn 
with_cookie_jar(mut self, cookie_jar: Arc) -> Self { - self.cookie_jar = Some(cookie_jar); - self - } - /// Execute an HTTP request with appropriate per-host rate limiting /// /// This method: @@ -144,16 +105,7 @@ impl HostPool { /// # use std::time::Duration; /// # #[tokio::main] /// # async fn main() -> Result<(), Box> { - /// let pool = HostPool::new( - /// RateLimitConfig::default(), - /// HashMap::new(), - /// 128, - /// 3600, - /// HeaderMap::new(), - /// 5, - /// Some(Duration::from_secs(20)), - /// false - /// ); + /// let pool = HostPool::default(); /// let request = reqwest::Request::new(reqwest::Method::GET, "https://example.com".parse()?); /// let response = pool.execute_request(request).await?; /// # Ok(()) @@ -193,16 +145,22 @@ impl HostPool { .cloned() .unwrap_or_default(); + /* + * TODO + + // Combine global headers with host-specific headers + let mut combined_headers = global_headers.clone(); + for (name, value) in &host_config.headers { + combined_headers.insert(name, value.clone()); + } + */ + + let client = self.client.clone(); let host = Arc::new(Host::new( host_key.clone(), &host_config, &self.global_config, - self.cache_max_age, - self.cookie_jar.clone(), - &self.global_headers, - self.max_redirects, - self.timeout, - self.allow_insecure, + client, )?); // Store in map (handle race condition where another thread created it) @@ -348,13 +306,6 @@ impl HostPool { .collect() } - /// Cleanup expired cache entries across all hosts - pub fn cleanup_caches(&self) { - for host in &self.hosts { - host.cleanup_cache(); - } - } - /// Record a cache hit for the given URI in host statistics /// /// This tracks that a request was served from the persistent disk cache @@ -403,12 +354,8 @@ impl Default for HostPool { Self::new( RateLimitConfig::default(), HashMap::new(), - 128, // Default global concurrency limit - 3600, // Default cache age of 1 hour - HeaderMap::new(), // Default empty headers - 5, // Default max redirects - Some(Duration::from_secs(20)), 
// Default timeout - false, // Default secure certificates + 128, // Default global concurrency limit + Client::default(), ) } } @@ -422,17 +369,11 @@ mod tests { #[test] fn test_host_pool_creation() { - let global_config = RateLimitConfig::default(); - let host_configs = HashMap::new(); let pool = HostPool::new( - global_config, - host_configs, + RateLimitConfig::default(), + HashMap::new(), 100, - 3600, - HeaderMap::new(), - 5, - Some(Duration::from_secs(20)), - false, + Client::default(), ); assert_eq!(pool.active_host_count(), 0); From 42dc0721d15514c2990d56cf5f5dca8bb2270bf9 Mon Sep 17 00:00:00 2001 From: Thomas Zahner Date: Tue, 9 Dec 2025 17:13:44 +0100 Subject: [PATCH 27/43] Build host-specific reqwest clients again But this time in lychee-lib client and not on demand. This reduces complexity quite a bit. Test that host-specific headers are correctly sent. --- fixtures/configs/headers.toml | 4 ++++ lychee-bin/tests/cli.rs | 7 +++---- lychee-lib/src/client.rs | 31 ++++++++++++++++++++++++------ lychee-lib/src/ratelimit/config.rs | 5 ++--- lychee-lib/src/ratelimit/mod.rs | 2 +- lychee-lib/src/ratelimit/pool.rs | 30 ++++++++++++++++------------- 6 files changed, 52 insertions(+), 27 deletions(-) diff --git a/fixtures/configs/headers.toml b/fixtures/configs/headers.toml index 2873301f65..d4e6b7107a 100644 --- a/fixtures/configs/headers.toml +++ b/fixtures/configs/headers.toml @@ -4,3 +4,7 @@ X-Bar = "Baz" # Alternative TOML syntax: # header = { X-Foo = "Bar", X-Bar = "Baz" } + + +[hosts."127.0.0.1"] +headers = { "X-Host-Specific" = "Foo" } diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index 154c202636..4d47d251fc 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -2394,7 +2394,6 @@ The config file should contain every possible key for documentation purposes." 
wiremock::Mock::given(wiremock::matchers::method("GET")) .and(wiremock::matchers::header("X-Foo", "Bar")) .respond_with(wiremock::ResponseTemplate::new(200)) - // We expect the mock to be called exactly least once. .expect(1) .named("GET expecting custom header"), ) @@ -2421,7 +2420,6 @@ The config file should contain every possible key for documentation purposes." .and(wiremock::matchers::header("X-Foo", "Bar")) .and(wiremock::matchers::header("X-Bar", "Baz")) .respond_with(wiremock::ResponseTemplate::new(200)) - // We expect the mock to be called exactly least once. .expect(1) .named("GET expecting custom header"), ) @@ -2449,8 +2447,8 @@ The config file should contain every possible key for documentation purposes." wiremock::Mock::given(wiremock::matchers::method("GET")) .and(wiremock::matchers::header("X-Foo", "Bar")) .and(wiremock::matchers::header("X-Bar", "Baz")) + .and(wiremock::matchers::header("X-Host-Specific", "Foo")) .respond_with(wiremock::ResponseTemplate::new(200)) - // We expect the mock to be called exactly least once. .expect(1) .named("GET expecting custom header"), ) @@ -2461,7 +2459,8 @@ The config file should contain every possible key for documentation purposes." 
.arg("--verbose") .arg("--config") .arg(config) - .arg(server.uri()) + .arg("-") + .write_stdin(server.uri()) .assert() .success(); diff --git a/lychee-lib/src/client.rs b/lychee-lib/src/client.rs index d054fdbff2..6ecb23d622 100644 --- a/lychee-lib/src/client.rs +++ b/lychee-lib/src/client.rs @@ -32,7 +32,7 @@ use crate::{ chain::RequestChain, checker::{file::FileChecker, mail::MailChecker, website::WebsiteChecker}, filter::Filter, - ratelimit::{HostPool, HostPoolConfig}, + ratelimit::{ClientMap, HostKey, HostPool, HostPoolConfig}, remap::Remaps, types::{DEFAULT_ACCEPTED_STATUS_CODES, redirect_history::RedirectHistory}, }; @@ -336,10 +336,12 @@ impl ClientBuilder { pub fn client(self) -> Result { let redirect_history = RedirectHistory::new(); let reqwest_client = self - .reqwest_builder(&redirect_history)? + .build_client(&redirect_history)? .build() .map_err(ErrorKind::BuildRequestClient)?; + let client_map = self.build_host_clients(&redirect_history)?; + // Create HostPool for rate limiting - always enabled for HTTP requests let HostPoolConfig { rate_limit_config, @@ -352,6 +354,7 @@ impl ClientBuilder { hosts, max_concurrency, reqwest_client.clone(), + client_map, ); let github_client = match self.github_token.as_ref().map(ExposeSecret::expose_secret) { @@ -406,10 +409,26 @@ impl ClientBuilder { }) } - fn reqwest_builder( - &self, - redirect_history: &RedirectHistory, - ) -> Result { + /// Build the host-specific clients with their host-specific headers + fn build_host_clients(&self, redirect_history: &RedirectHistory) -> Result { + self.host_pool_config + .hosts + .iter() + .map(|(host, config)| { + let mut headers = self.headers()?; + headers.extend(config.headers.clone()); + let client = self + .build_client(redirect_history)? 
+ .default_headers(headers) + .build() + .map_err(ErrorKind::BuildRequestClient)?; + Ok((HostKey::from(host.as_str()), client)) + }) + .collect() + } + + /// Create a [`reqwest::ClientBuilder`] based on various fields + fn build_client(&self, redirect_history: &RedirectHistory) -> Result { let mut builder = reqwest::ClientBuilder::new() .gzip(true) .default_headers(self.headers()?) diff --git a/lychee-lib/src/ratelimit/config.rs b/lychee-lib/src/ratelimit/config.rs index 806b7c41b4..dbc4becf91 100644 --- a/lychee-lib/src/ratelimit/config.rs +++ b/lychee-lib/src/ratelimit/config.rs @@ -11,8 +11,7 @@ pub struct RateLimitConfig { pub host_concurrency: usize, /// Default minimum interval between requests to the same host - #[serde(default = "default_request_interval")] - #[serde(with = "humantime_serde")] + #[serde(default = "default_request_interval", with = "humantime_serde")] pub request_interval: Duration, } @@ -47,7 +46,7 @@ pub struct HostConfig { pub max_concurrent: Option, /// Minimum interval between requests to this host - #[serde(with = "humantime_serde")] + #[serde(default, with = "humantime_serde")] pub request_interval: Option, /// Custom headers to send with requests to this host diff --git a/lychee-lib/src/ratelimit/mod.rs b/lychee-lib/src/ratelimit/mod.rs index 5ad1157703..946240370c 100644 --- a/lychee-lib/src/ratelimit/mod.rs +++ b/lychee-lib/src/ratelimit/mod.rs @@ -22,5 +22,5 @@ mod window; pub use config::{HostConfig, RateLimitConfig}; pub use error::RateLimitError; pub use host::{Host, HostKey, HostStats}; -pub use pool::{HostPool, HostPoolConfig}; +pub use pool::{ClientMap, HostPool, HostPoolConfig}; pub use window::Window; diff --git a/lychee-lib/src/ratelimit/pool.rs b/lychee-lib/src/ratelimit/pool.rs index 9fa17fe486..822ad9d84e 100644 --- a/lychee-lib/src/ratelimit/pool.rs +++ b/lychee-lib/src/ratelimit/pool.rs @@ -18,6 +18,9 @@ pub struct HostPoolConfig { pub max_concurrency: usize, } +/// Keep track of host-specific [`reqwest::Client`]s 
+pub type ClientMap = HashMap; + impl Default for HostPoolConfig { fn default() -> Self { Self { @@ -55,7 +58,9 @@ pub struct HostPool { /// Global semaphore to enforce overall concurrency limit global_semaphore: Semaphore, - client: Client, + default_client: Client, + + client_map: ClientMap, } impl HostPool { @@ -66,14 +71,16 @@ impl HostPool { global_config: RateLimitConfig, host_configs: HashMap, max_total_concurrency: usize, - client: Client, + default_client: Client, + client_map: ClientMap, ) -> Self { Self { hosts: DashMap::new(), global_config, host_configs, global_semaphore: Semaphore::new(max_total_concurrency), - client, + default_client, + client_map, } } @@ -145,17 +152,12 @@ impl HostPool { .cloned() .unwrap_or_default(); - /* - * TODO - - // Combine global headers with host-specific headers - let mut combined_headers = global_headers.clone(); - for (name, value) in &host_config.headers { - combined_headers.insert(name, value.clone()); - } - */ + let client = self + .client_map + .get(&host_key) + .unwrap_or(&self.default_client) + .clone(); - let client = self.client.clone(); let host = Arc::new(Host::new( host_key.clone(), &host_config, @@ -356,6 +358,7 @@ impl Default for HostPool { HashMap::new(), 128, // Default global concurrency limit Client::default(), + HashMap::new(), ) } } @@ -374,6 +377,7 @@ mod tests { HashMap::new(), 100, Client::default(), + HashMap::new(), ); assert_eq!(pool.active_host_count(), 0); From 595e6347fc002e9b78f52b71f8bb4116e421cb9a Mon Sep 17 00:00:00 2001 From: Thomas Zahner Date: Tue, 9 Dec 2025 17:28:21 +0100 Subject: [PATCH 28/43] Remove max_concurrency and global_semaphore I do not see any reasons for having them. The global concurrency is already limited by the mpsc channel buffer size. 
--- lychee-bin/src/client.rs | 1 - lychee-lib/src/client.rs | 9 +-------- lychee-lib/src/ratelimit/pool.rs | 31 ------------------------------- 3 files changed, 1 insertion(+), 40 deletions(-) diff --git a/lychee-bin/src/client.rs b/lychee-bin/src/client.rs index 9cfaf28ed6..f3f9bf4970 100644 --- a/lychee-bin/src/client.rs +++ b/lychee-bin/src/client.rs @@ -63,7 +63,6 @@ pub(crate) fn create(cfg: &Config, cookie_jar: Option<&Arc>) - cfg.request_interval, ), hosts: cfg.hosts.clone(), - max_concurrency: cfg.max_concurrency, }) .build() .client() diff --git a/lychee-lib/src/client.rs b/lychee-lib/src/client.rs index 6ecb23d622..ae26bf6e24 100644 --- a/lychee-lib/src/client.rs +++ b/lychee-lib/src/client.rs @@ -346,16 +346,9 @@ impl ClientBuilder { let HostPoolConfig { rate_limit_config, hosts, - max_concurrency, } = self.host_pool_config; - let host_pool = HostPool::new( - rate_limit_config, - hosts, - max_concurrency, - reqwest_client.clone(), - client_map, - ); + let host_pool = HostPool::new(rate_limit_config, hosts, reqwest_client.clone(), client_map); let github_client = match self.github_token.as_ref().map(ExposeSecret::expose_secret) { Some(token) if !token.is_empty() => Some( diff --git a/lychee-lib/src/ratelimit/pool.rs b/lychee-lib/src/ratelimit/pool.rs index 822ad9d84e..250b3df962 100644 --- a/lychee-lib/src/ratelimit/pool.rs +++ b/lychee-lib/src/ratelimit/pool.rs @@ -2,7 +2,6 @@ use dashmap::DashMap; use reqwest::{Client, Request, Response}; use std::collections::HashMap; use std::sync::Arc; -use tokio::sync::Semaphore; use crate::ratelimit::{Host, HostConfig, HostKey, HostStats, RateLimitConfig, RateLimitError}; use crate::{CacheStatus, Status, Uri}; @@ -14,8 +13,6 @@ pub struct HostPoolConfig { pub rate_limit_config: RateLimitConfig, /// TODO pub hosts: HashMap, - /// TODO - pub max_concurrency: usize, } /// Keep track of host-specific [`reqwest::Client`]s @@ -26,7 +23,6 @@ impl Default for HostPoolConfig { Self { rate_limit_config: Default::default(), 
hosts: Default::default(), - max_concurrency: 128, // TODO: expose/reuse DEFAULT_MAX_CONCURRENCY } } } @@ -55,9 +51,6 @@ pub struct HostPool { /// Per-host configuration overrides host_configs: HashMap, - /// Global semaphore to enforce overall concurrency limit - global_semaphore: Semaphore, - default_client: Client, client_map: ClientMap, @@ -70,7 +63,6 @@ impl HostPool { pub fn new( global_config: RateLimitConfig, host_configs: HashMap, - max_total_concurrency: usize, default_client: Client, client_map: ClientMap, ) -> Self { @@ -78,7 +70,6 @@ impl HostPool { hosts: DashMap::new(), global_config, host_configs, - global_semaphore: Semaphore::new(max_total_concurrency), default_client, client_map, } @@ -126,14 +117,6 @@ impl HostPool { // Get or create host instance let host = self.get_or_create_host(host_key)?; - // Acquire global semaphore permit first - let _global_permit = self.global_semaphore.acquire().await.map_err(|_| { - RateLimitError::RateLimitExceeded { - host: host.key.to_string(), - message: "Global concurrency limit reached".to_string(), - } - })?; - // Execute request through host-specific rate limiting host.execute_request(request).await } @@ -221,15 +204,6 @@ impl HostPool { self.hosts.len() } - /// Get the number of available global permits - /// - /// This shows how many more concurrent requests can be started - /// across all hosts before hitting the global concurrency limit. - #[must_use] - pub fn available_global_permits(&self) -> usize { - self.global_semaphore.available_permits() - } - /// Get host configuration for debugging/monitoring /// /// Returns a copy of the current host-specific configurations. 
@@ -356,7 +330,6 @@ impl Default for HostPool { Self::new( RateLimitConfig::default(), HashMap::new(), - 128, // Default global concurrency limit Client::default(), HashMap::new(), ) @@ -375,21 +348,17 @@ mod tests { let pool = HostPool::new( RateLimitConfig::default(), HashMap::new(), - 100, Client::default(), HashMap::new(), ); assert_eq!(pool.active_host_count(), 0); - assert_eq!(pool.available_global_permits(), 100); } #[test] fn test_host_pool_default() { let pool = HostPool::default(); - assert_eq!(pool.active_host_count(), 0); - assert_eq!(pool.available_global_permits(), 128); } #[tokio::test] From d374155bce249868c30b46127ec80079c33bab23 Mon Sep 17 00:00:00 2001 From: Thomas Zahner Date: Wed, 10 Dec 2025 09:57:05 +0100 Subject: [PATCH 29/43] Update docs & reduce complexity --- lychee-bin/src/client.rs | 14 ++-- lychee-bin/src/options.rs | 4 +- lychee-lib/src/client.rs | 26 ++++---- lychee-lib/src/ratelimit/config.rs | 5 ++ lychee-lib/src/ratelimit/host/key.rs | 3 +- lychee-lib/src/ratelimit/mod.rs | 4 +- lychee-lib/src/ratelimit/pool.rs | 95 +++++----------------------- 7 files changed, 46 insertions(+), 105 deletions(-) diff --git a/lychee-bin/src/client.rs b/lychee-bin/src/client.rs index f3f9bf4970..5d4765059e 100644 --- a/lychee-bin/src/client.rs +++ b/lychee-bin/src/client.rs @@ -2,7 +2,6 @@ use crate::options::{Config, HeaderMapExt}; use crate::parse::{parse_duration_secs, parse_remaps}; use anyhow::{Context, Result}; use http::{HeaderMap, StatusCode}; -use lychee_lib::ratelimit::HostPoolConfig; use lychee_lib::{Client, ClientBuilder, ratelimit::RateLimitConfig}; use regex::RegexSet; use reqwest_cookie_store::CookieStoreMutex; @@ -56,14 +55,11 @@ pub(crate) fn create(cfg: &Config, cookie_jar: Option<&Arc>) - .include_fragments(cfg.include_fragments) .fallback_extensions(cfg.fallback_extensions.clone()) .index_files(cfg.index_files.clone()) - .host_pool_config(HostPoolConfig { - // Create HostPool for rate limiting - always enabled for HTTP requests - 
rate_limit_config: RateLimitConfig::from_options( - cfg.host_concurrency, - cfg.request_interval, - ), - hosts: cfg.hosts.clone(), - }) + .rate_limit_config(RateLimitConfig::from_options( + cfg.host_concurrency, + cfg.request_interval, + )) + .hosts(cfg.hosts.clone()) .build() .client() .context("Failed to create request client") diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index 87ae2913a9..002c2b81bd 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -11,11 +11,11 @@ use http::{ header::{HeaderName, HeaderValue}, }; use lychee_lib::Preprocessor; +use lychee_lib::ratelimit::HostConfigs; use lychee_lib::{ Base, BasicAuthSelector, DEFAULT_MAX_REDIRECTS, DEFAULT_MAX_RETRIES, DEFAULT_RETRY_WAIT_TIME_SECS, DEFAULT_TIMEOUT_SECS, DEFAULT_USER_AGENT, FileExtensions, FileType, Input, StatusCodeExcluder, StatusCodeSelector, archive::Archive, - ratelimit::HostConfig, }; use reqwest::tls; use secrecy::SecretString; @@ -930,7 +930,7 @@ esac"# /// Host-specific configurations from config file #[arg(skip)] #[serde(default)] - pub(crate) hosts: HashMap, + pub(crate) hosts: HostConfigs, } impl Config { diff --git a/lychee-lib/src/client.rs b/lychee-lib/src/client.rs index ae26bf6e24..91c78d8aff 100644 --- a/lychee-lib/src/client.rs +++ b/lychee-lib/src/client.rs @@ -32,7 +32,7 @@ use crate::{ chain::RequestChain, checker::{file::FileChecker, mail::MailChecker, website::WebsiteChecker}, filter::Filter, - ratelimit::{ClientMap, HostKey, HostPool, HostPoolConfig}, + ratelimit::{ClientMap, HostConfigs, HostKey, HostPool, RateLimitConfig}, remap::Remaps, types::{DEFAULT_ACCEPTED_STATUS_CODES, redirect_history::RedirectHistory}, }; @@ -306,9 +306,11 @@ pub struct ClientBuilder { /// skipped and the lychee-internal request chain is not activated. plugin_request_chain: RequestChain, - /// When enabled, HTTP/HTTPS requests will be routed through this pool - /// for rate limiting and concurrency control on a per-host basis. 
- host_pool_config: HostPoolConfig, + /// Global rate limiting configuration that applies as defaults to all hosts + rate_limit_config: RateLimitConfig, + + /// Per-host configuration overrides + hosts: HostConfigs, } impl Default for ClientBuilder { @@ -342,13 +344,12 @@ impl ClientBuilder { let client_map = self.build_host_clients(&redirect_history)?; - // Create HostPool for rate limiting - always enabled for HTTP requests - let HostPoolConfig { - rate_limit_config, - hosts, - } = self.host_pool_config; - - let host_pool = HostPool::new(rate_limit_config, hosts, reqwest_client.clone(), client_map); + let host_pool = HostPool::new( + self.rate_limit_config, + self.hosts, + reqwest_client.clone(), + client_map, + ); let github_client = match self.github_token.as_ref().map(ExposeSecret::expose_secret) { Some(token) if !token.is_empty() => Some( @@ -404,8 +405,7 @@ impl ClientBuilder { /// Build the host-specific clients with their host-specific headers fn build_host_clients(&self, redirect_history: &RedirectHistory) -> Result { - self.host_pool_config - .hosts + self.hosts .iter() .map(|(host, config)| { let mut headers = self.headers()?; diff --git a/lychee-lib/src/ratelimit/config.rs b/lychee-lib/src/ratelimit/config.rs index dbc4becf91..1f87651c6d 100644 --- a/lychee-lib/src/ratelimit/config.rs +++ b/lychee-lib/src/ratelimit/config.rs @@ -3,6 +3,8 @@ use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::time::Duration; +use crate::ratelimit::HostKey; + /// Global rate limiting configuration that applies as defaults to all hosts #[derive(Debug, Clone, Copy, Serialize, Deserialize)] pub struct RateLimitConfig { @@ -38,6 +40,9 @@ impl RateLimitConfig { } } +/// Per-host configuration overrides +pub type HostConfigs = HashMap; + /// Configuration for a specific host's rate limiting behavior #[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] #[serde(deny_unknown_fields)] diff --git a/lychee-lib/src/ratelimit/host/key.rs 
b/lychee-lib/src/ratelimit/host/key.rs index ffc6f538d0..c10b284d68 100644 --- a/lychee-lib/src/ratelimit/host/key.rs +++ b/lychee-lib/src/ratelimit/host/key.rs @@ -1,3 +1,4 @@ +use serde::Deserialize; use std::fmt; use url::Url; @@ -17,7 +18,7 @@ use url::Url; /// let host_key = HostKey::try_from(&url).unwrap(); /// assert_eq!(host_key.as_str(), "api.github.com"); /// ``` -#[derive(Debug, Clone, PartialEq, Eq, Hash)] +#[derive(Debug, Clone, PartialEq, Eq, Hash, Deserialize)] pub struct HostKey(String); impl HostKey { diff --git a/lychee-lib/src/ratelimit/mod.rs b/lychee-lib/src/ratelimit/mod.rs index 946240370c..35cbd082ba 100644 --- a/lychee-lib/src/ratelimit/mod.rs +++ b/lychee-lib/src/ratelimit/mod.rs @@ -19,8 +19,8 @@ mod host; mod pool; mod window; -pub use config::{HostConfig, RateLimitConfig}; +pub use config::{HostConfig, HostConfigs, RateLimitConfig}; pub use error::RateLimitError; pub use host::{Host, HostKey, HostStats}; -pub use pool::{ClientMap, HostPool, HostPoolConfig}; +pub use pool::{ClientMap, HostPool}; pub use window::Window; diff --git a/lychee-lib/src/ratelimit/pool.rs b/lychee-lib/src/ratelimit/pool.rs index 250b3df962..0a4dd149c2 100644 --- a/lychee-lib/src/ratelimit/pool.rs +++ b/lychee-lib/src/ratelimit/pool.rs @@ -3,41 +3,21 @@ use reqwest::{Client, Request, Response}; use std::collections::HashMap; use std::sync::Arc; -use crate::ratelimit::{Host, HostConfig, HostKey, HostStats, RateLimitConfig, RateLimitError}; +use crate::ratelimit::{Host, HostConfigs, HostKey, HostStats, RateLimitConfig, RateLimitError}; use crate::{CacheStatus, Status, Uri}; -/// TODO: Rename,move,refactor? 
-#[derive(Debug, Clone)] -pub struct HostPoolConfig { - /// TODO - pub rate_limit_config: RateLimitConfig, - /// TODO - pub hosts: HashMap, -} - /// Keep track of host-specific [`reqwest::Client`]s pub type ClientMap = HashMap; -impl Default for HostPoolConfig { - fn default() -> Self { - Self { - rate_limit_config: Default::default(), - hosts: Default::default(), - } - } -} - /// Manages a pool of Host instances and routes requests to appropriate hosts. /// /// The `HostPool` serves as the central coordinator for per-host rate limiting. -/// It creates Host instances on-demand, manages global concurrency limits, -/// and provides a unified interface for executing HTTP requests with -/// appropriate rate limiting applied. +/// It creates host instances on-demand and provides a unified interface for +/// executing HTTP requests with appropriate rate limiting applied. /// /// # Architecture /// /// - Each unique hostname gets its own Host instance with dedicated rate limiting -/// - Global semaphore enforces overall concurrency limits across all hosts /// - Hosts are created lazily when first requested /// - Thread-safe using `DashMap` for concurrent access to host instances #[derive(Debug)] @@ -49,10 +29,12 @@ pub struct HostPool { global_config: RateLimitConfig, /// Per-host configuration overrides - host_configs: HashMap, + host_configs: HostConfigs, + /// Fallback client for hosts without host-specific client default_client: Client, + /// Host-specific clients client_map: ClientMap, } @@ -62,7 +44,7 @@ impl HostPool { #[allow(clippy::too_many_arguments)] pub fn new( global_config: RateLimitConfig, - host_configs: HashMap, + host_configs: HostConfigs, default_client: Client, client_map: ClientMap, ) -> Self { @@ -75,23 +57,12 @@ impl HostPool { } } - /// Execute an HTTP request with appropriate per-host rate limiting - /// - /// This method: - /// 1. Extracts the hostname from the request URL - /// 2. Gets or creates the appropriate Host instance - /// 3. 
Acquires a global semaphore permit - /// 4. Delegates to the host for execution with host-specific rate limiting - /// - /// # Arguments - /// - /// * `request` - The HTTP request to execute + /// Execute an HTTP request with appropriate per-host rate limiting. /// /// # Errors /// /// Returns a `RateLimitError` if: /// - The request URL has no valid hostname - /// - Global or host-specific rate limits are exceeded /// - The underlying HTTP request fails /// /// # Examples @@ -110,28 +81,21 @@ impl HostPool { /// # } /// ``` pub async fn execute_request(&self, request: Request) -> Result { - // Extract hostname from request URL let url = request.url(); let host_key = HostKey::try_from(url)?; - - // Get or create host instance let host = self.get_or_create_host(host_key)?; - - // Execute request through host-specific rate limiting host.execute_request(request).await } /// Get an existing host or create a new one for the given hostname fn get_or_create_host(&self, host_key: HostKey) -> Result, RateLimitError> { - // Check if host already exists if let Some(host) = self.hosts.get(&host_key) { return Ok(host.clone()); } - // Create new host instance let host_config = self .host_configs - .get(host_key.as_str()) + .get(&host_key) .cloned() .unwrap_or_default(); @@ -161,14 +125,8 @@ impl HostPool { } } - /// Get statistics for a specific host - /// /// Returns statistics for the host if it exists, otherwise returns empty stats. /// This provides consistent behavior whether or not requests have been made to that host yet. - /// - /// # Arguments - /// - /// * `hostname` - The hostname to get statistics for #[must_use] pub fn host_stats(&self, hostname: &str) -> HostStats { let host_key = HostKey::from(hostname); @@ -178,8 +136,6 @@ impl HostPool { .unwrap_or_default() } - /// Get statistics for all hosts that have been created - /// /// Returns a `HashMap` mapping hostnames to their statistics. /// Only hosts that have had requests will be included. 
#[must_use] @@ -194,9 +150,7 @@ impl HostPool { .collect() } - /// Get the number of currently active hosts - /// - /// This returns the number of Host instances that have been created, + /// Get the number of host instances that have been created, /// which corresponds to the number of unique hostnames that have /// been accessed. #[must_use] @@ -204,25 +158,19 @@ impl HostPool { self.hosts.len() } - /// Get host configuration for debugging/monitoring - /// - /// Returns a copy of the current host-specific configurations. + /// Get a copy of the current host-specific configurations. /// This is useful for debugging or runtime monitoring of configuration. #[must_use] - pub fn host_configurations(&self) -> HashMap { + pub fn host_configurations(&self) -> HostConfigs { self.host_configs.clone() } - /// Remove a host from the pool + /// Remove a host from the pool. /// /// This forces the host to be recreated with updated configuration /// the next time a request is made to it. Any ongoing requests to /// that host will continue with the old instance. /// - /// # Arguments - /// - /// * `hostname` - The hostname to remove from the pool - /// /// # Returns /// /// Returns true if a host was removed, false if no host existed for that hostname. 
@@ -234,13 +182,9 @@ impl HostPool { /// Check if a URI is cached in the appropriate host's cache /// - /// # Arguments - /// - /// * `uri` - The URI to check for in the cache - /// /// # Returns /// - /// Returns the cached status if found and valid, None otherwise + /// Returns the cached status if found and valid, `None` otherwise #[must_use] pub fn get_cached_status(&self, uri: &Uri) -> Option { let host_key = HostKey::try_from(uri).ok()?; @@ -253,11 +197,6 @@ impl HostPool { } /// Cache a result for a URI in the appropriate host's cache - /// - /// # Arguments - /// - /// * `uri` - The URI to cache - /// * `status` - The status to cache pub fn cache_result(&self, uri: &Uri, status: &Status) { if let Ok(host_key) = HostKey::try_from(uri) && let Some(host) = self.hosts.get(&host_key) @@ -286,7 +225,7 @@ impl HostPool { /// /// This tracks that a request was served from the persistent disk cache /// rather than going through the rate-limited HTTP request flow. - /// This method will create a [Host] instance if one doesn't exist yet. + /// This method will create a host instance if one doesn't exist yet. 
/// /// # Errors /// @@ -329,7 +268,7 @@ impl Default for HostPool { fn default() -> Self { Self::new( RateLimitConfig::default(), - HashMap::new(), + HostConfigs::default(), Client::default(), HashMap::new(), ) @@ -347,7 +286,7 @@ mod tests { fn test_host_pool_creation() { let pool = HostPool::new( RateLimitConfig::default(), - HashMap::new(), + HostConfigs::default(), Client::default(), HashMap::new(), ); From 3fdd99255563866aefdd3175718d778d066201c1 Mon Sep 17 00:00:00 2001 From: Thomas Zahner Date: Wed, 10 Dec 2025 11:39:29 +0100 Subject: [PATCH 30/43] Extract output functions --- lychee-bin/src/formatters/stats/mod.rs | 25 +++++++++- lychee-bin/src/host_stats.rs | 4 +- lychee-bin/src/main.rs | 63 ++++++++++---------------- 3 files changed, 48 insertions(+), 44 deletions(-) diff --git a/lychee-bin/src/formatters/stats/mod.rs b/lychee-bin/src/formatters/stats/mod.rs index dc2d2233d6..574440fd5d 100644 --- a/lychee-bin/src/formatters/stats/mod.rs +++ b/lychee-bin/src/formatters/stats/mod.rs @@ -13,10 +13,12 @@ pub(crate) use raw::Raw; use std::{ collections::{HashMap, HashSet}, fmt::Display, + fs, + io::{Write, stdout}, }; -use crate::stats::ResponseStats; -use anyhow::Result; +use crate::{formatters::get_stats_formatter, options::Config, stats::ResponseStats}; +use anyhow::{Context, Result}; use lychee_lib::InputSource; pub(crate) trait StatsFormatter { @@ -24,6 +26,25 @@ pub(crate) trait StatsFormatter { fn format(&self, stats: ResponseStats) -> Result>; } +/// If configured to do so, output response statistics to stdout or the specified output file. 
+pub(crate) fn output_response_statistics(stats: ResponseStats, config: &Config) -> Result<()> { + let is_empty = stats.is_empty(); + let formatted_stats = get_stats_formatter(&config.format, &config.mode).format(stats)?; + if let Some(formatted_stats) = formatted_stats { + if let Some(output) = &config.output { + fs::write(output, formatted_stats).context("Cannot write status output to file")?; + } else { + if config.verbose.log_level() >= log::Level::Info && !is_empty { + // separate summary from the verbose list of links above with a newline + writeln!(stdout())?; + } + // we assume that the formatted stats don't have a final newline + writeln!(stdout(), "{formatted_stats}")?; + } + } + Ok(()) +} + /// Convert a `ResponseStats` `HashMap` to a sorted Vec of key-value pairs /// The returned keys and values are both sorted in natural, case-insensitive order fn sort_stat_map(stat_map: &HashMap>) -> Vec<(&InputSource, Vec<&T>)> diff --git a/lychee-bin/src/host_stats.rs b/lychee-bin/src/host_stats.rs index f073e9689d..5ba26d9995 100644 --- a/lychee-bin/src/host_stats.rs +++ b/lychee-bin/src/host_stats.rs @@ -3,8 +3,8 @@ use lychee_lib::ratelimit::HostPool; use crate::{formatters::get_host_stats_formatter, options::Config}; -/// Display per-host statistics if requested -pub(crate) fn display_per_host_statistics(host_pool: &HostPool, config: &Config) -> Result<()> { +/// If configured to do so, output per-host statistics to stdout or the specified output file. 
+pub(crate) fn output_per_host_statistics(host_pool: &HostPool, config: &Config) -> Result<()> { if !config.host_stats { return Ok(()); } diff --git a/lychee-bin/src/main.rs b/lychee-bin/src/main.rs index 38a2535f6f..1e3d910e27 100644 --- a/lychee-bin/src/main.rs +++ b/lychee-bin/src/main.rs @@ -59,13 +59,13 @@ #![deny(missing_docs)] use std::fs::{self, File}; -use std::io::{self, BufRead, BufReader, ErrorKind, Write}; +use std::io::{self, BufRead, BufReader, ErrorKind}; use std::path::PathBuf; use anyhow::{Context, Error, Result, bail}; use clap::{Parser, crate_version}; use commands::{CommandParams, generate}; -use formatters::{get_stats_formatter, log::init_logging}; +use formatters::log::init_logging; use http::HeaderMap; use log::{error, info, warn}; @@ -93,11 +93,13 @@ mod stats; mod time; mod verbosity; +use crate::formatters::stats::output_response_statistics; +use crate::stats::ResponseStats; use crate::{ cache::{Cache, StoreExt}, - formatters::{duration::Duration, stats::StatsFormatter}, + formatters::duration::Duration, generate::generate, - host_stats::display_per_host_statistics, + host_stats::output_per_host_statistics, options::{Config, LYCHEE_CACHE_FILE, LYCHEE_IGNORE_FILE, LycheeOptions}, }; @@ -381,7 +383,6 @@ async fn run(opts: &LycheeOptions) -> Result { })?; let client = client::create(&opts.config, cookie_jar.as_deref())?; - let params = CommandParams { client, cache, @@ -393,41 +394,9 @@ async fn run(opts: &LycheeOptions) -> Result { commands::dump(params).await? 
} else { let (stats, cache, exit_code, host_pool) = commands::check(params).await?; - - let github_issues = stats - .error_map - .values() - .flatten() - .any(|body| body.uri.domain() == Some("github.com")); - - let stats_formatter: Box = - get_stats_formatter(&opts.config.format, &opts.config.mode); - - let is_empty = stats.is_empty(); - let formatted_stats = stats_formatter.format(stats)?; - - if let Some(formatted_stats) = formatted_stats { - if let Some(output) = &opts.config.output { - fs::write(output, formatted_stats).context("Cannot write status output to file")?; - } else { - if opts.config.verbose.log_level() >= log::Level::Info && !is_empty { - // separate summary from the verbose list of links above - // with a newline - writeln!(io::stdout())?; - } - // we assume that the formatted stats don't have a final newline - writeln!(io::stdout(), "{formatted_stats}")?; - } - } - - // Display per-host statistics if requested - display_per_host_statistics(host_pool.as_ref(), &opts.config)?; - - if github_issues && opts.config.github_token.is_none() { - warn!( - "There were issues with GitHub URLs. You could try setting a GitHub token and running lychee again.", - ); - } + github_warning(&stats, &opts.config); + output_response_statistics(stats, &opts.config)?; + output_per_host_statistics(&host_pool, &opts.config)?; if opts.config.cache { cache.store(LYCHEE_CACHE_FILE)?; @@ -443,3 +412,17 @@ async fn run(opts: &LycheeOptions) -> Result { Ok(exit_code as i32) } + +/// Display user-friendly message if there were any issues with GitHub URLs +fn github_warning(stats: &ResponseStats, config: &Config) { + let github_errors = stats + .error_map + .values() + .flatten() + .any(|body| body.uri.domain() == Some("github.com")); + if github_errors && config.github_token.is_none() { + warn!( + "There were issues with GitHub URLs. 
You could try setting a GitHub token and running lychee again.", + ); + } +} From 7be4516a814520a4526d74bfb2c2b8a501d646e5 Mon Sep 17 00:00:00 2001 From: Thomas Zahner Date: Wed, 10 Dec 2025 13:37:20 +0100 Subject: [PATCH 31/43] Replace Window with Vec This means less code and better accuracy. As `Duration` is only 16 bytes in size, memory consumption should really not be a problem. --- lychee-lib/src/ratelimit/host/stats.rs | 36 +-------- lychee-lib/src/ratelimit/mod.rs | 2 - lychee-lib/src/ratelimit/window.rs | 100 ------------------------- 3 files changed, 3 insertions(+), 135 deletions(-) delete mode 100644 lychee-lib/src/ratelimit/window.rs diff --git a/lychee-lib/src/ratelimit/host/stats.rs b/lychee-lib/src/ratelimit/host/stats.rs index a399b769c5..983462c409 100644 --- a/lychee-lib/src/ratelimit/host/stats.rs +++ b/lychee-lib/src/ratelimit/host/stats.rs @@ -1,8 +1,6 @@ use std::collections::HashMap; use std::time::{Duration, Instant}; -use crate::ratelimit::window::Window; - /// Statistics tracking for a host's request patterns #[derive(Debug, Clone, Default)] pub struct HostStats { @@ -20,8 +18,8 @@ pub struct HostStats { pub last_success: Option, /// Timestamp of the last rate limit response pub last_rate_limit: Option, - /// Request times for median calculation (kept in rolling window) - pub request_times: Window, + /// Request times for median calculation + pub request_times: Vec, /// Status code counts pub status_codes: HashMap, /// Number of cache hits @@ -31,15 +29,6 @@ pub struct HostStats { } impl HostStats { - /// Create new host statistics with custom window size for request times - #[must_use] - pub fn with_window_size(window_size: usize) -> Self { - Self { - request_times: Window::new(window_size), - ..Default::default() - } - } - /// Record a response with status code and request duration pub fn record_response(&mut self, status_code: u16, request_time: Duration) { self.total_requests += 1; @@ -66,7 +55,6 @@ impl HostStats { _ => {} // Other 
status codes } - // Track request time in rolling window self.request_times.push(request_time); } @@ -77,7 +65,7 @@ impl HostStats { return None; } - let mut times = self.request_times.to_vec(); + let mut times = self.request_times.clone(); times.sort(); let mid = times.len() / 2; @@ -250,24 +238,6 @@ mod tests { ); } - #[test] - fn test_window_integration() { - let mut stats = HostStats::with_window_size(2); - - stats.record_response(200, Duration::from_millis(100)); - stats.record_response(200, Duration::from_millis(200)); - stats.record_response(200, Duration::from_millis(300)); - - // Window should only keep last 2 times - assert_eq!(stats.request_times.len(), 2); - - let times: Vec<_> = stats.request_times.iter().copied().collect(); - assert_eq!( - times, - vec![Duration::from_millis(200), Duration::from_millis(300)] - ); - } - #[test] fn test_summary_formatting() { let mut stats = HostStats::default(); diff --git a/lychee-lib/src/ratelimit/mod.rs b/lychee-lib/src/ratelimit/mod.rs index 35cbd082ba..6f217b8da9 100644 --- a/lychee-lib/src/ratelimit/mod.rs +++ b/lychee-lib/src/ratelimit/mod.rs @@ -17,10 +17,8 @@ mod config; mod error; mod host; mod pool; -mod window; pub use config::{HostConfig, HostConfigs, RateLimitConfig}; pub use error::RateLimitError; pub use host::{Host, HostKey, HostStats}; pub use pool::{ClientMap, HostPool}; -pub use window::Window; diff --git a/lychee-lib/src/ratelimit/window.rs b/lychee-lib/src/ratelimit/window.rs deleted file mode 100644 index 058641e0a0..0000000000 --- a/lychee-lib/src/ratelimit/window.rs +++ /dev/null @@ -1,100 +0,0 @@ -use std::collections::VecDeque; - -/// A rolling window data structure that automatically maintains a maximum size -/// by removing oldest elements when the capacity is exceeded. 
-#[derive(Debug, Clone)] -pub struct Window { - data: VecDeque, - capacity: usize, -} - -impl Window { - /// Create a new window with the given capacity - #[must_use] - pub fn new(capacity: usize) -> Self { - Self { - data: VecDeque::with_capacity(capacity), - capacity, - } - } - - /// Push an element to the window, removing the oldest if at capacity - pub fn push(&mut self, item: T) { - if self.data.len() >= self.capacity { - self.data.pop_front(); - } - self.data.push_back(item); - } - - /// Get the number of elements currently in the window - #[must_use] - pub fn len(&self) -> usize { - self.data.len() - } - - /// Check if the window is empty - #[must_use] - pub fn is_empty(&self) -> bool { - self.data.is_empty() - } - - /// Get an iterator over the elements in the window - pub fn iter(&self) -> impl Iterator { - self.data.iter() - } - - /// Convert to a vector (for compatibility with existing code) - #[must_use] - pub fn to_vec(&self) -> Vec - where - T: Clone, - { - self.data.iter().cloned().collect() - } -} - -impl Default for Window { - fn default() -> Self { - Self::new(100) // Default capacity of 100 items - } -} - -#[cfg(test)] -mod tests { - use super::*; - - #[test] - fn test_window_capacity() { - let mut window = Window::new(3); - - // Fill up the window - window.push(1); - window.push(2); - window.push(3); - assert_eq!(window.len(), 3); - - // Add one more, should remove the oldest - window.push(4); - assert_eq!(window.len(), 3); - - let values: Vec<_> = window.iter().copied().collect(); - assert_eq!(values, vec![2, 3, 4]); - } - - #[test] - fn test_window_empty() { - let window: Window = Window::new(5); - assert!(window.is_empty()); - assert_eq!(window.len(), 0); - } - - #[test] - fn test_window_to_vec() { - let mut window = Window::new(3); - window.push(1); - window.push(2); - - let vec = window.to_vec(); - assert_eq!(vec, vec![1, 2]); - } -} From 420e822339ae75251ee983bd4eb68584785f11b2 Mon Sep 17 00:00:00 2001 From: Thomas Zahner Date: Thu, 11 Dec 
2025 17:25:20 +0100 Subject: [PATCH 32/43] Update RateLimitError --- lychee-lib/src/ratelimit/error.rs | 24 +++++++++++++++++------- lychee-lib/src/ratelimit/host/host.rs | 10 +++------- lychee-lib/src/ratelimit/host/key.rs | 4 ++-- lychee-lib/src/ratelimit/pool.rs | 1 - 4 files changed, 22 insertions(+), 17 deletions(-) diff --git a/lychee-lib/src/ratelimit/error.rs b/lychee-lib/src/ratelimit/error.rs index c39f463d5b..e2d12fa246 100644 --- a/lychee-lib/src/ratelimit/error.rs +++ b/lychee-lib/src/ratelimit/error.rs @@ -1,4 +1,7 @@ use thiserror::Error; +use url::Url; + +use crate::ratelimit::HostKey; /// Errors that can occur during rate limiting operations #[derive(Error, Debug)] @@ -7,16 +10,23 @@ pub enum RateLimitError { #[error("Host {host} exceeded rate limit: {message}")] RateLimitExceeded { /// The host that exceeded the limit - host: String, + host: HostKey, /// Additional context message message: String, }, + /// User specified an invalid rate limit interval + #[error("Invalid rate limit interval for host {host}")] + InvalidRateLimitInterval { + /// The host with invalid configuration + host: HostKey, + }, + /// Failed to parse rate limit headers from server response - #[error("Failed to parse rate limit headers from {host}: {reason}")] - HeaderParseError { + #[error("Failed to parse URL {url}: {reason}")] + UrlParseError { /// The host that sent invalid headers - host: String, + url: Url, /// Reason for parse failure reason: String, }, @@ -25,7 +35,7 @@ pub enum RateLimitError { #[error("Failed to configure client for host {host}: {source}")] ClientConfigError { /// The host that failed configuration - host: String, + host: HostKey, /// Underlying error source: reqwest::Error, }, @@ -34,7 +44,7 @@ pub enum RateLimitError { #[error("Cookie operation failed for host {host}: {reason}")] CookieError { /// The host with cookie issues - host: String, + host: HostKey, /// Description of cookie error reason: String, }, @@ -43,7 +53,7 @@ pub enum RateLimitError 
{ #[error("Network error for host {host}: {source}")] NetworkError { /// The host that had the network error - host: String, + host: HostKey, /// The underlying network error #[source] source: reqwest::Error, diff --git a/lychee-lib/src/ratelimit/host/host.rs b/lychee-lib/src/ratelimit/host/host.rs index 870f594e50..8d4db76084 100644 --- a/lychee-lib/src/ratelimit/host/host.rs +++ b/lychee-lib/src/ratelimit/host/host.rs @@ -74,7 +74,6 @@ impl Host { /// # Panics /// /// Panics if the burst size cannot be set to 1 (should never happen) - #[allow(clippy::too_many_arguments)] pub fn new( key: HostKey, host_config: &HostConfig, @@ -83,10 +82,7 @@ impl Host { ) -> Result { let interval = host_config.effective_request_interval(global_config); let quota = Quota::with_period(interval) - .ok_or_else(|| RateLimitError::HeaderParseError { - host: key.to_string(), - reason: "Invalid rate limit interval".to_string(), - })? + .ok_or_else(|| RateLimitError::InvalidRateLimitInterval { host: key.clone() })? 
.allow_burst(std::num::NonZeroU32::new(1).unwrap()); let rate_limiter = RateLimiter::direct(quota); @@ -164,7 +160,7 @@ impl Host { .acquire() .await .map_err(|_| RateLimitError::RateLimitExceeded { - host: self.key.to_string(), + host: self.key.clone(), message: "Semaphore acquisition cancelled".to_string(), })?; @@ -192,7 +188,7 @@ impl Host { Err(e) => { // Wrap network/HTTP errors to preserve the original error return Err(RateLimitError::NetworkError { - host: self.key.to_string(), + host: self.key.clone(), source: e, }); } diff --git a/lychee-lib/src/ratelimit/host/key.rs b/lychee-lib/src/ratelimit/host/key.rs index c10b284d68..dbec20a57b 100644 --- a/lychee-lib/src/ratelimit/host/key.rs +++ b/lychee-lib/src/ratelimit/host/key.rs @@ -41,8 +41,8 @@ impl TryFrom<&Url> for HostKey { fn try_from(url: &Url) -> Result { let host = url.host_str() - .ok_or_else(|| crate::ratelimit::RateLimitError::HeaderParseError { - host: url.to_string(), + .ok_or_else(|| crate::ratelimit::RateLimitError::UrlParseError { + url: url.clone(), reason: "URL contains no host component".to_string(), })?; diff --git a/lychee-lib/src/ratelimit/pool.rs b/lychee-lib/src/ratelimit/pool.rs index 0a4dd149c2..ff80a7c452 100644 --- a/lychee-lib/src/ratelimit/pool.rs +++ b/lychee-lib/src/ratelimit/pool.rs @@ -41,7 +41,6 @@ pub struct HostPool { impl HostPool { /// Create a new `HostPool` with the given configuration #[must_use] - #[allow(clippy::too_many_arguments)] pub fn new( global_config: RateLimitConfig, host_configs: HostConfigs, From 00d0c138ce4cda7fba982fa7c98bbc22211d42c1 Mon Sep 17 00:00:00 2001 From: Thomas Zahner Date: Thu, 11 Dec 2025 18:20:38 +0100 Subject: [PATCH 33/43] Create RequestInterval This makes error handling more user friendly --- lychee-bin/src/options.rs | 7 +-- lychee-lib/src/ratelimit/config.rs | 54 +++++++----------- lychee-lib/src/ratelimit/host/host.rs | 8 +-- lychee-lib/src/ratelimit/host/interval.rs | 68 +++++++++++++++++++++++ lychee-lib/src/ratelimit/host/mod.rs 
| 2 + lychee-lib/src/ratelimit/mod.rs | 2 +- 6 files changed, 99 insertions(+), 42 deletions(-) create mode 100644 lychee-lib/src/ratelimit/host/interval.rs diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index 002c2b81bd..fc57b906a9 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -11,7 +11,7 @@ use http::{ header::{HeaderName, HeaderValue}, }; use lychee_lib::Preprocessor; -use lychee_lib::ratelimit::HostConfigs; +use lychee_lib::ratelimit::{HostConfigs, RequestInterval}; use lychee_lib::{ Base, BasicAuthSelector, DEFAULT_MAX_REDIRECTS, DEFAULT_MAX_RETRIES, DEFAULT_RETRY_WAIT_TIME_SECS, DEFAULT_TIMEOUT_SECS, DEFAULT_USER_AGENT, FileExtensions, @@ -561,9 +561,8 @@ with a status code of 429, 500 and 501." /// Examples: /// --request-interval 50ms # Fast for robust APIs /// --request-interval 1s # Conservative for rate-limited APIs - #[arg(long = "request-interval", value_parser = humantime::parse_duration, verbatim_doc_comment)] - #[serde(default, with = "humantime_serde")] - pub(crate) request_interval: Option, + #[arg(long = "request-interval", verbatim_doc_comment)] + pub(crate) request_interval: Option, /// Number of threads to utilize. 
/// Defaults to number of cores available to the system diff --git a/lychee-lib/src/ratelimit/config.rs b/lychee-lib/src/ratelimit/config.rs index 1f87651c6d..f5f90f5fd2 100644 --- a/lychee-lib/src/ratelimit/config.rs +++ b/lychee-lib/src/ratelimit/config.rs @@ -1,9 +1,8 @@ use http::{HeaderMap, HeaderName, HeaderValue}; use serde::{Deserialize, Serialize}; use std::collections::HashMap; -use std::time::Duration; -use crate::ratelimit::HostKey; +use crate::ratelimit::{HostKey, RequestInterval}; /// Global rate limiting configuration that applies as defaults to all hosts #[derive(Debug, Clone, Copy, Serialize, Deserialize)] @@ -13,15 +12,14 @@ pub struct RateLimitConfig { pub host_concurrency: usize, /// Default minimum interval between requests to the same host - #[serde(default = "default_request_interval", with = "humantime_serde")] - pub request_interval: Duration, + pub request_interval: RequestInterval, } impl Default for RateLimitConfig { fn default() -> Self { Self { host_concurrency: default_host_concurrency(), - request_interval: default_request_interval(), + request_interval: RequestInterval::default(), } } } @@ -31,11 +29,11 @@ impl RateLimitConfig { #[must_use] pub fn from_options( host_concurrency: Option, - request_interval: Option, + request_interval: Option, ) -> Self { Self { host_concurrency: host_concurrency.unwrap_or(DEFAULT_HOST_CONCURRENCY), - request_interval: request_interval.unwrap_or(DEFAULT_REQUEST_INTERVAL), + request_interval: request_interval.unwrap_or_default(), } } } @@ -51,8 +49,7 @@ pub struct HostConfig { pub max_concurrent: Option, /// Minimum interval between requests to this host - #[serde(default, with = "humantime_serde")] - pub request_interval: Option, + pub request_interval: Option, /// Custom headers to send with requests to this host #[serde(default)] @@ -81,7 +78,7 @@ impl HostConfig { /// Get the effective request interval, falling back to the global default #[must_use] - pub fn effective_request_interval(&self, 
global_config: &RateLimitConfig) -> Duration { + pub fn effective_request_interval(&self, global_config: &RateLimitConfig) -> RequestInterval { self.request_interval .unwrap_or(global_config.request_interval) } @@ -90,19 +87,11 @@ impl HostConfig { /// Default number of concurrent requests per host const DEFAULT_HOST_CONCURRENCY: usize = 10; -/// Default interval between requests to the same host -const DEFAULT_REQUEST_INTERVAL: Duration = Duration::from_millis(100); - /// Default number of concurrent requests per host const fn default_host_concurrency() -> usize { DEFAULT_HOST_CONCURRENCY } -/// Default interval between requests to the same host -const fn default_request_interval() -> Duration { - DEFAULT_REQUEST_INTERVAL -} - /// Custom deserializer for headers from TOML config format fn deserialize_headers<'de, D>(deserializer: D) -> Result where @@ -137,14 +126,11 @@ where #[cfg(test)] mod tests { - use super::*; + use std::time::Duration; - #[test] - fn test_default_rate_limit_config() { - let config = RateLimitConfig::default(); - assert_eq!(config.host_concurrency, 10); - assert_eq!(config.request_interval, Duration::from_millis(100)); - } + use governor::Quota; + + use super::*; #[test] fn test_host_config_effective_values() { @@ -155,19 +141,21 @@ mod tests { assert_eq!(host_config.effective_max_concurrent(&global_config), 10); assert_eq!( host_config.effective_request_interval(&global_config), - Duration::from_millis(100) + RequestInterval::default(), ); // Test with overrides let host_config = HostConfig { max_concurrent: Some(5), - request_interval: Some(Duration::from_millis(500)), + request_interval: Some("500ms".parse().unwrap()), headers: HeaderMap::new(), }; assert_eq!(host_config.effective_max_concurrent(&global_config), 5); assert_eq!( - host_config.effective_request_interval(&global_config), - Duration::from_millis(500) + host_config + .effective_request_interval(&global_config) + .into_inner(), + 
Quota::with_period(Duration::from_millis(500)).unwrap() ); } @@ -175,7 +163,7 @@ mod tests { fn test_config_serialization() { let config = RateLimitConfig { host_concurrency: 15, - request_interval: Duration::from_millis(200), + request_interval: "200ms".parse().unwrap(), }; let toml = toml::to_string(&config).unwrap(); @@ -193,17 +181,17 @@ mod tests { let host_config = HostConfig { max_concurrent: Some(5), - request_interval: Some(Duration::from_millis(500)), + request_interval: Some("500ms".parse().unwrap()), headers, }; let toml = toml::to_string(&host_config).unwrap(); - let deserialized: HostConfig = toml::from_str(&toml).unwrap(); + let deserialized: HostConfig = toml::from_str(&dbg!(toml)).unwrap(); assert_eq!(deserialized.max_concurrent, Some(5)); assert_eq!( deserialized.request_interval, - Some(Duration::from_millis(500)) + Some("500ms".parse().unwrap()) ); assert_eq!(deserialized.headers.len(), 2); assert!(deserialized.headers.contains_key("authorization")); diff --git a/lychee-lib/src/ratelimit/host/host.rs b/lychee-lib/src/ratelimit/host/host.rs index 8d4db76084..9fbe7d411f 100644 --- a/lychee-lib/src/ratelimit/host/host.rs +++ b/lychee-lib/src/ratelimit/host/host.rs @@ -1,6 +1,6 @@ use dashmap::DashMap; use governor::{ - Quota, RateLimiter, + RateLimiter, clock::DefaultClock, state::{InMemoryState, NotKeyed}, }; @@ -80,9 +80,9 @@ impl Host { global_config: &RateLimitConfig, client: ReqwestClient, ) -> Result { - let interval = host_config.effective_request_interval(global_config); - let quota = Quota::with_period(interval) - .ok_or_else(|| RateLimitError::InvalidRateLimitInterval { host: key.clone() })? 
+ let quota = host_config + .effective_request_interval(global_config) + .into_inner() .allow_burst(std::num::NonZeroU32::new(1).unwrap()); let rate_limiter = RateLimiter::direct(quota); diff --git a/lychee-lib/src/ratelimit/host/interval.rs b/lychee-lib/src/ratelimit/host/interval.rs new file mode 100644 index 0000000000..ee6826de9c --- /dev/null +++ b/lychee-lib/src/ratelimit/host/interval.rs @@ -0,0 +1,68 @@ +use governor::Quota; +use humantime_serde::re::humantime::{self, DurationError}; +use serde::{Deserialize, Serialize, Serializer}; +use std::num::NonZero; +use std::str::FromStr; +use thiserror::Error; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +/// Interval between requests to the same host +pub struct RequestInterval(Quota); + +#[derive(Debug, Error, PartialEq)] +pub enum ParseError { + #[error("Parse error: {0}")] + HumantimeError(DurationError), + #[error("Interval must not be zero")] + ZeroInterval, +} + +impl FromStr for RequestInterval { + type Err = ParseError; + + fn from_str(input: &str) -> Result { + let duration = input + .parse::() + .map_err(ParseError::HumantimeError)?; + Ok(RequestInterval( + Quota::with_period(duration.into()).ok_or(ParseError::ZeroInterval)?, + )) + } +} + +impl RequestInterval { + /// Convert into inner [`Quota`] + #[must_use] + pub const fn into_inner(self) -> Quota { + self.0 + } +} + +impl Default for RequestInterval { + /// The default interval is 100 milliseconds. 
+ fn default() -> Self { + const PER_SECOND: Quota = Quota::per_second(NonZero::new(10).unwrap()); + Self(PER_SECOND) + } +} + +impl Serialize for RequestInterval { + fn serialize(&self, serializer: S) -> Result + where + S: Serializer, + { + humantime::Duration::from(self.0.replenish_interval()) + .to_string() + .serialize(serializer) + } +} + +impl<'de> Deserialize<'de> for RequestInterval { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + let string = ::deserialize(deserializer)?; + Self::from_str(&string).map_err(serde::de::Error::custom) + } +} diff --git a/lychee-lib/src/ratelimit/host/mod.rs b/lychee-lib/src/ratelimit/host/mod.rs index 50b8b1ad3e..ed2e48506a 100644 --- a/lychee-lib/src/ratelimit/host/mod.rs +++ b/lychee-lib/src/ratelimit/host/mod.rs @@ -1,9 +1,11 @@ #![allow(clippy::module_inception)] mod host; +mod interval; mod key; mod stats; pub use host::Host; +pub use interval::RequestInterval; pub use key::HostKey; pub use stats::HostStats; diff --git a/lychee-lib/src/ratelimit/mod.rs b/lychee-lib/src/ratelimit/mod.rs index 6f217b8da9..7b6d387361 100644 --- a/lychee-lib/src/ratelimit/mod.rs +++ b/lychee-lib/src/ratelimit/mod.rs @@ -20,5 +20,5 @@ mod pool; pub use config::{HostConfig, HostConfigs, RateLimitConfig}; pub use error::RateLimitError; -pub use host::{Host, HostKey, HostStats}; +pub use host::{Host, HostKey, HostStats, RequestInterval}; pub use pool::{ClientMap, HostPool}; From 08602e78ee67030ef52ef046375d3385cd484d8f Mon Sep 17 00:00:00 2001 From: Thomas Zahner Date: Fri, 12 Dec 2025 10:19:40 +0100 Subject: [PATCH 34/43] Remove RateLimitError After previous refactoring only two variants were still used. Use the better fitting ErrorKind::InvalidUrlHost instead of RateLimitError::UrlParseError. Replace confusing RateLimitError::RateLimitExceeded with expect. 
--- lychee-lib/src/checker/website.rs | 15 +------ lychee-lib/src/ratelimit/error.rs | 61 --------------------------- lychee-lib/src/ratelimit/host/host.rs | 29 ++++++------- lychee-lib/src/ratelimit/host/key.rs | 22 +++++----- lychee-lib/src/ratelimit/mod.rs | 2 - lychee-lib/src/ratelimit/pool.rs | 17 +++----- lychee-lib/src/types/error.rs | 9 +--- 7 files changed, 32 insertions(+), 123 deletions(-) delete mode 100644 lychee-lib/src/ratelimit/error.rs diff --git a/lychee-lib/src/checker/website.rs b/lychee-lib/src/checker/website.rs index 7b30d9c0f3..f07cabf4e3 100644 --- a/lychee-lib/src/checker/website.rs +++ b/lychee-lib/src/checker/website.rs @@ -124,20 +124,7 @@ impl WebsiteChecker { let method = request.method().clone(); let request_url = request.url().clone(); - // Use HostPool for rate limiting - let response_result = match self.host_pool.execute_request(request).await { - Ok(response) => Ok(response), - Err(crate::ratelimit::RateLimitError::NetworkError { source, .. }) => { - // Network errors should be handled the same as direct client errors - Err(source) - } - Err(e) => { - // Rate limiting specific errors - return Status::Error(ErrorKind::RateLimit(e)); - } - }; - - match response_result { + match self.host_pool.execute_request(request).await { Ok(response) => { let status = Status::new(&response, &self.accepted); // when `accept=200,429`, `status_code=429` will be treated as success diff --git a/lychee-lib/src/ratelimit/error.rs b/lychee-lib/src/ratelimit/error.rs deleted file mode 100644 index e2d12fa246..0000000000 --- a/lychee-lib/src/ratelimit/error.rs +++ /dev/null @@ -1,61 +0,0 @@ -use thiserror::Error; -use url::Url; - -use crate::ratelimit::HostKey; - -/// Errors that can occur during rate limiting operations -#[derive(Error, Debug)] -pub enum RateLimitError { - /// Host exceeded its rate limit - #[error("Host {host} exceeded rate limit: {message}")] - RateLimitExceeded { - /// The host that exceeded the limit - host: HostKey, - /// 
Additional context message - message: String, - }, - - /// User specified an invalid rate limit interval - #[error("Invalid rate limit interval for host {host}")] - InvalidRateLimitInterval { - /// The host with invalid configuration - host: HostKey, - }, - - /// Failed to parse rate limit headers from server response - #[error("Failed to parse URL {url}: {reason}")] - UrlParseError { - /// The host that sent invalid headers - url: Url, - /// Reason for parse failure - reason: String, - }, - - /// Error creating or configuring HTTP client for host - #[error("Failed to configure client for host {host}: {source}")] - ClientConfigError { - /// The host that failed configuration - host: HostKey, - /// Underlying error - source: reqwest::Error, - }, - - /// Cookie store operation failed - #[error("Cookie operation failed for host {host}: {reason}")] - CookieError { - /// The host with cookie issues - host: HostKey, - /// Description of cookie error - reason: String, - }, - - /// Network error occurred during request execution - #[error("Network error for host {host}: {source}")] - NetworkError { - /// The host that had the network error - host: HostKey, - /// The underlying network error - #[source] - source: reqwest::Error, - }, -} diff --git a/lychee-lib/src/ratelimit/host/host.rs b/lychee-lib/src/ratelimit/host/host.rs index 9fbe7d411f..2199c9d593 100644 --- a/lychee-lib/src/ratelimit/host/host.rs +++ b/lychee-lib/src/ratelimit/host/host.rs @@ -11,8 +11,12 @@ use tokio::sync::Semaphore; use super::key::HostKey; use super::stats::HostStats; -use crate::ratelimit::{HostConfig, RateLimitConfig, RateLimitError}; +use crate::types::Result; use crate::{CacheStatus, Status, Uri}; +use crate::{ + ErrorKind, + ratelimit::{HostConfig, RateLimitConfig}, +}; /// Cache value for per-host caching #[derive(Debug, Clone)] @@ -79,7 +83,7 @@ impl Host { host_config: &HostConfig, global_config: &RateLimitConfig, client: ReqwestClient, - ) -> Result { + ) -> Result { let quota = 
host_config .effective_request_interval(global_config) .into_inner() @@ -148,21 +152,19 @@ impl Host { /// # Panics /// /// Panics if the statistics mutex is poisoned - pub async fn execute_request(&self, request: Request) -> Result { + pub async fn execute_request(&self, request: Request) -> Result { let uri = Uri::from(request.url().clone()); // Note: Cache checking is handled at the HostPool level // This method focuses on executing the actual HTTP request // Acquire semaphore permit for concurrency control - let _permit = - self.semaphore - .acquire() - .await - .map_err(|_| RateLimitError::RateLimitExceeded { - host: self.key.clone(), - message: "Semaphore acquisition cancelled".to_string(), - })?; + let _permit = self + .semaphore + .acquire() + .await + // SAFETY: this should not panic as we never close the semaphore + .expect("Semaphore was closed unexpectedly"); // Apply adaptive backoff if needed let backoff_duration = { @@ -187,10 +189,7 @@ impl Host { Ok(response) => response, Err(e) => { // Wrap network/HTTP errors to preserve the original error - return Err(RateLimitError::NetworkError { - host: self.key.clone(), - source: e, - }); + return Err(ErrorKind::NetworkRequest(e)); } }; let request_time = start_time.elapsed(); diff --git a/lychee-lib/src/ratelimit/host/key.rs b/lychee-lib/src/ratelimit/host/key.rs index dbec20a57b..09f224ceec 100644 --- a/lychee-lib/src/ratelimit/host/key.rs +++ b/lychee-lib/src/ratelimit/host/key.rs @@ -2,6 +2,9 @@ use serde::Deserialize; use std::fmt; use url::Url; +use crate::ErrorKind; +use crate::types::Result; + /// A type-safe representation of a hostname for rate limiting purposes. 
/// /// This extracts and normalizes hostnames from URLs to ensure consistent @@ -36,15 +39,10 @@ impl HostKey { } impl TryFrom<&Url> for HostKey { - type Error = crate::ratelimit::RateLimitError; + type Error = ErrorKind; - fn try_from(url: &Url) -> Result { - let host = - url.host_str() - .ok_or_else(|| crate::ratelimit::RateLimitError::UrlParseError { - url: url.clone(), - reason: "URL contains no host component".to_string(), - })?; + fn try_from(url: &Url) -> Result { + let host = url.host_str().ok_or_else(|| ErrorKind::InvalidUrlHost)?; // Normalize to lowercase for consistent lookup Ok(HostKey(host.to_lowercase())) @@ -52,17 +50,17 @@ impl TryFrom<&Url> for HostKey { } impl TryFrom<&crate::Uri> for HostKey { - type Error = crate::ratelimit::RateLimitError; + type Error = ErrorKind; - fn try_from(uri: &crate::Uri) -> Result { + fn try_from(uri: &crate::Uri) -> Result { Self::try_from(&uri.url) } } impl TryFrom for HostKey { - type Error = crate::ratelimit::RateLimitError; + type Error = ErrorKind; - fn try_from(url: Url) -> Result { + fn try_from(url: Url) -> Result { HostKey::try_from(&url) } } diff --git a/lychee-lib/src/ratelimit/mod.rs b/lychee-lib/src/ratelimit/mod.rs index 7b6d387361..d03f8c1397 100644 --- a/lychee-lib/src/ratelimit/mod.rs +++ b/lychee-lib/src/ratelimit/mod.rs @@ -14,11 +14,9 @@ //! 
- [`Window`]: Rolling window data structure for request timing mod config; -mod error; mod host; mod pool; pub use config::{HostConfig, HostConfigs, RateLimitConfig}; -pub use error::RateLimitError; pub use host::{Host, HostKey, HostStats, RequestInterval}; pub use pool::{ClientMap, HostPool}; diff --git a/lychee-lib/src/ratelimit/pool.rs b/lychee-lib/src/ratelimit/pool.rs index ff80a7c452..256b685b3b 100644 --- a/lychee-lib/src/ratelimit/pool.rs +++ b/lychee-lib/src/ratelimit/pool.rs @@ -3,7 +3,8 @@ use reqwest::{Client, Request, Response}; use std::collections::HashMap; use std::sync::Arc; -use crate::ratelimit::{Host, HostConfigs, HostKey, HostStats, RateLimitConfig, RateLimitError}; +use crate::ratelimit::{Host, HostConfigs, HostKey, HostStats, RateLimitConfig}; +use crate::types::Result; use crate::{CacheStatus, Status, Uri}; /// Keep track of host-specific [`reqwest::Client`]s @@ -79,7 +80,7 @@ impl HostPool { /// # Ok(()) /// # } /// ``` - pub async fn execute_request(&self, request: Request) -> Result { + pub async fn execute_request(&self, request: Request) -> Result { let url = request.url(); let host_key = HostKey::try_from(url)?; let host = self.get_or_create_host(host_key)?; @@ -87,7 +88,7 @@ impl HostPool { } /// Get an existing host or create a new one for the given hostname - fn get_or_create_host(&self, host_key: HostKey) -> Result, RateLimitError> { + fn get_or_create_host(&self, host_key: HostKey) -> Result> { if let Some(host) = self.hosts.get(&host_key) { return Ok(host.clone()); } @@ -229,10 +230,7 @@ impl HostPool { /// # Errors /// /// Returns an error if the host key cannot be parsed from the URI or if the host cannot be created. 
- pub fn record_cache_hit( - &self, - uri: &crate::Uri, - ) -> Result<(), crate::ratelimit::RateLimitError> { + pub fn record_cache_hit(&self, uri: &crate::Uri) -> Result<()> { let host_key = crate::ratelimit::HostKey::try_from(uri)?; // Get or create the host (this ensures statistics tracking even for cache-only requests) @@ -250,10 +248,7 @@ impl HostPool { /// # Errors /// /// Returns an error if the host key cannot be parsed from the URI or if the host cannot be created. - pub fn record_cache_miss( - &self, - uri: &crate::Uri, - ) -> Result<(), crate::ratelimit::RateLimitError> { + pub fn record_cache_miss(&self, uri: &crate::Uri) -> Result<()> { let host_key = crate::ratelimit::HostKey::try_from(uri)?; // Get or create the host (this ensures statistics tracking even for cache-only requests) diff --git a/lychee-lib/src/types/error.rs b/lychee-lib/src/types/error.rs index c0764e4fd2..6243cab684 100644 --- a/lychee-lib/src/types/error.rs +++ b/lychee-lib/src/types/error.rs @@ -138,7 +138,7 @@ pub enum ErrorKind { #[error("Cannot send/receive message from channel")] Channel(#[from] tokio::sync::mpsc::error::SendError), - /// An URL with an invalid host was found + /// An URL with no host was found #[error("URL is missing a host")] InvalidUrlHost, @@ -178,9 +178,6 @@ pub enum ErrorKind { /// The reason the command failed reason: String, }, - /// Rate limiting error - #[error("Rate limiting error: {0}")] - RateLimit(#[from] crate::ratelimit::RateLimitError), } impl ErrorKind { @@ -339,9 +336,6 @@ impl ErrorKind { [init @ .., tail] => format!("An index file ({}, or {}) is required", init.join(", "), tail), }.into(), ErrorKind::PreprocessorError{command, reason} => Some(format!("Command '{command}' failed {reason}. Check value of the preprocessor option")), - ErrorKind::RateLimit(e) => Some(format!( - "Rate limiting error: {e}. 
Consider adjusting rate limiting configuration or waiting before retrying" - )), } } @@ -472,7 +466,6 @@ impl Hash for ErrorKind { Self::Cookies(e) => e.hash(state), Self::StatusCodeSelectorError(e) => e.to_string().hash(state), Self::PreprocessorError { command, reason } => (command, reason).hash(state), - Self::RateLimit(e) => e.to_string().hash(state), } } } From 75df7e819cf18197ffce4cbf042c4438fd7c0029 Mon Sep 17 00:00:00 2001 From: Thomas Zahner Date: Mon, 15 Dec 2025 14:54:01 +0100 Subject: [PATCH 35/43] Test and improve rate limit header handling --- Cargo.lock | 1 + lychee-bin/tests/cli.rs | 45 +++++++++++- lychee-lib/Cargo.toml | 1 + lychee-lib/src/checker/website.rs | 2 + lychee-lib/src/ratelimit/headers.rs | 98 +++++++++++++++++++++++++++ lychee-lib/src/ratelimit/host/host.rs | 89 ++++++++++++------------ lychee-lib/src/ratelimit/mod.rs | 1 + 7 files changed, 188 insertions(+), 49 deletions(-) create mode 100644 lychee-lib/src/ratelimit/headers.rs diff --git a/Cargo.lock b/Cargo.lock index 56b5be7639..53bdcf6024 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2833,6 +2833,7 @@ dependencies = [ "html5ever", "html5gum", "http 1.4.0", + "httpdate", "humantime-serde", "hyper 1.8.1", "ignore", diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index 4d47d251fc..5ef75a02a0 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -19,14 +19,14 @@ mod cli { fs::{self, File}, io::{BufRead, Write}, path::Path, - time::Duration, + time::{Duration, Instant}, }; use tempfile::{NamedTempFile, tempdir}; use test_utils::{fixtures_path, mock_server, redirecting_mock_server, root_path}; use uuid::Uuid; use wiremock::{ - Mock, ResponseTemplate, + Mock, Request, ResponseTemplate, matchers::{basic_auth, method}, }; @@ -2358,6 +2358,47 @@ The config file should contain every possible key for documentation purposes." 
.success(); } + #[tokio::test] + async fn test_retry_rate_limit_headers() { + const RETRY_DELAY: Duration = Duration::from_secs(1); + const TOLERANCE: Duration = Duration::from_millis(200); + let server = wiremock::MockServer::start().await; + + wiremock::Mock::given(wiremock::matchers::method("GET")) + .respond_with( + ResponseTemplate::new(429) + .append_header("Retry-After", RETRY_DELAY.as_secs().to_string()), + ) + .expect(1) + .up_to_n_times(1) + .mount(&server) + .await; + + let start = Instant::now(); + wiremock::Mock::given(wiremock::matchers::method("GET")) + .respond_with(move |_: &Request| { + let delta = Instant::now().duration_since(start); + assert!(delta > RETRY_DELAY); + assert!(delta < RETRY_DELAY + TOLERANCE); + ResponseTemplate::new(200) + }) + .expect(1) + .mount(&server) + .await; + + cargo_bin_cmd!() + .arg("-") + // Retry wait times are added on top of host-specific backoff timeout + .arg("--retry-wait-time") + .arg("0") + .write_stdin(server.uri()) + .assert() + .success(); + + // Check that the server received the request with the header + server.verify().await; + } + #[tokio::test] async fn test_no_header_set_on_input() { let server = wiremock::MockServer::start().await; diff --git a/lychee-lib/Cargo.toml b/lychee-lib/Cargo.toml index 1b20e79930..0c94b62ca5 100644 --- a/lychee-lib/Cargo.toml +++ b/lychee-lib/Cargo.toml @@ -27,6 +27,7 @@ headers = "0.4.1" html5ever = "0.36.1" html5gum = "0.8.3" http = "1.4.0" +httpdate = "1.0.3" humantime-serde = "1.1.1" hyper = "1.8.1" ignore = "0.4.25" diff --git a/lychee-lib/src/checker/website.rs b/lychee-lib/src/checker/website.rs index f07cabf4e3..400e42bcd7 100644 --- a/lychee-lib/src/checker/website.rs +++ b/lychee-lib/src/checker/website.rs @@ -102,6 +102,8 @@ impl WebsiteChecker { /// Retry requests up to `max_retries` times /// with an exponential backoff. + /// Note that, in addition, there also is a host-specific backoff + /// when host-specific rate limiting or errors are detected. 
pub(crate) async fn retry_request(&self, request: Request) -> Status { let mut retries: u64 = 0; let mut wait_time = self.retry_wait_time; diff --git a/lychee-lib/src/ratelimit/headers.rs b/lychee-lib/src/ratelimit/headers.rs new file mode 100644 index 0000000000..cb806bf4a9 --- /dev/null +++ b/lychee-lib/src/ratelimit/headers.rs @@ -0,0 +1,98 @@ +use http::HeaderValue; +use std::time::{Duration, SystemTime}; +use thiserror::Error; + +#[derive(Debug, Error, PartialEq, Eq)] +pub(crate) enum RetryAfterParseError { + #[error("Unable to parse value '{0}'")] + ValueError(String), + + #[error("Header value contains invalid chars")] + HeaderValueError, +} + +/// Parse the "Retry-After" header as specified per +/// [RFC 7231 section 7.1.3](https://www.rfc-editor.org/rfc/rfc7231#section-7.1.3) +pub(crate) fn parse_retry_after(value: &HeaderValue) -> Result { + let value = value + .to_str() + .map_err(|_| RetryAfterParseError::HeaderValueError)?; + + // RFC 7231: Retry-After = HTTP-date / delay-seconds + value.parse::().map(Duration::from_secs).or_else(|_| { + httpdate::parse_http_date(value) + .map(|s| { + s.duration_since(SystemTime::now()) + // if date is in the past, we can use ZERO + .unwrap_or(Duration::ZERO) + }) + .map_err(|_| RetryAfterParseError::ValueError(value.into())) + }) +} + +/// Parse the common "X-RateLimit" header fields. +/// Unfortunately, this is not standardised yet, but there is an +/// [IETF draft](https://datatracker.ietf.org/doc/draft-ietf-httpapi-ratelimit-headers/). 
+pub(crate) fn parse_common_rate_limit_header_fields( + headers: &http::HeaderMap, +) -> (Option, Option) { + let remaining = self::parse_header_value( + headers, + &[ + "x-ratelimit-remaining", + "x-rate-limit-remaining", + "ratelimit-remaining", + ], + ); + + let limit = self::parse_header_value( + headers, + &["x-ratelimit-limit", "x-rate-limit-limit", "ratelimit-limit"], + ); + + (remaining, limit) +} + +/// Helper method to parse numeric header values from common rate limit headers +fn parse_header_value(headers: &http::HeaderMap, header_names: &[&str]) -> Option { + for header_name in header_names { + if let Some(value) = headers.get(*header_name) + && let Ok(value_str) = value.to_str() + && let Ok(number) = value_str.parse::() + { + return Some(number); + } + } + None +} + +#[cfg(test)] +mod tests { + use std::time::Duration; + + use http::HeaderValue; + + use crate::ratelimit::headers::{RetryAfterParseError, parse_retry_after}; + + #[test] + fn test_retry_after() { + assert_eq!(parse_retry_after(&value("1")), Ok(Duration::from_secs(1))); + assert_eq!( + parse_retry_after(&value("-1")), + Err(RetryAfterParseError::ValueError("-1".into())) + ); + + assert_eq!( + parse_retry_after(&value("Fri, 15 May 2015 15:34:21 GMT")), + Ok(Duration::ZERO) + ); + + let result = parse_retry_after(&value("Fri, 15 May 4099 15:34:21 GMT")); + let is_in_future = matches!(result, Ok(d) if d.as_secs() > 0); + assert!(is_in_future); + } + + fn value(v: &str) -> HeaderValue { + HeaderValue::from_str(v).unwrap() + } +} diff --git a/lychee-lib/src/ratelimit/host/host.rs b/lychee-lib/src/ratelimit/host/host.rs index 2199c9d593..e15daff358 100644 --- a/lychee-lib/src/ratelimit/host/host.rs +++ b/lychee-lib/src/ratelimit/host/host.rs @@ -1,9 +1,12 @@ +use crate::ratelimit::headers; use dashmap::DashMap; use governor::{ RateLimiter, clock::DefaultClock, state::{InMemoryState, NotKeyed}, }; +use humantime_serde::re::humantime::format_duration; +use log::warn; use reqwest::{Client as 
ReqwestClient, Request, Response}; use std::sync::Mutex; use std::time::{Duration, Instant}; @@ -18,6 +21,9 @@ use crate::{ ratelimit::{HostConfig, RateLimitConfig}, }; +/// Cap retry-after to reasonable limits +const MAXIMUM_BACKOFF: Duration = Duration::from_secs(10 * 60); + /// Cache value for per-host caching #[derive(Debug, Clone)] struct HostCacheValue { @@ -199,7 +205,7 @@ impl Host { self.update_stats_and_backoff(status_code, request_time); // Parse rate limit headers to adjust behavior - self.parse_rate_limit_headers(&response); + self.handle_rate_limit_headers(&response); // Cache the result let status = Status::Ok(response.status()); @@ -255,29 +261,17 @@ impl Host { } /// Parse rate limit headers from response and adjust behavior - fn parse_rate_limit_headers(&self, response: &Response) { - // Manual parsing of common rate limit headers - // We implement basic parsing here for the most common headers (X-RateLimit-*, Retry-After) - // rather than using the rate-limits crate to keep dependencies minimal - + fn handle_rate_limit_headers(&self, response: &Response) { + // Implement basic parsing here rather than using the rate-limits crate to keep dependencies minimal let headers = response.headers(); + self.handle_retry_after_header(headers); + self.handle_common_rate_limit_header_fields(headers); + } - // Try common rate limit header patterns - let remaining = Self::parse_header_value( - headers, - &[ - "x-ratelimit-remaining", - "x-rate-limit-remaining", - "ratelimit-remaining", - ], - ); - - let limit = Self::parse_header_value( - headers, - &["x-ratelimit-limit", "x-rate-limit-limit", "ratelimit-limit"], - ); - - if let (Some(remaining), Some(limit)) = (remaining, limit) + /// Handle the common "X-RateLimit" header fields. 
+ fn handle_common_rate_limit_header_fields(&self, headers: &http::HeaderMap) { + if let (Some(remaining), Some(limit)) = + headers::parse_common_rate_limit_header_fields(headers) && limit > 0 { #[allow(clippy::cast_precision_loss)] @@ -285,39 +279,40 @@ impl Host { // If we've used more than 80% of our quota, apply preventive backoff if usage_ratio > 0.8 { - let mut backoff = self.backoff_duration.lock().unwrap(); #[allow(clippy::cast_possible_truncation, clippy::cast_sign_loss)] - let preventive_backoff = - Duration::from_millis((200.0 * (usage_ratio - 0.8) / 0.2) as u64); - *backoff = std::cmp::max(*backoff, preventive_backoff); + let duration = Duration::from_millis((200.0 * (usage_ratio - 0.8) / 0.2) as u64); + self.increase_backoff(duration); } } + } - // Check for Retry-After header (in seconds) - if let Some(retry_after_value) = headers.get("retry-after") - && let Ok(retry_after_str) = retry_after_value.to_str() - && let Ok(retry_seconds) = retry_after_str.parse::() - { - let mut backoff = self.backoff_duration.lock().unwrap(); - let retry_duration = Duration::from_secs(retry_seconds); - // Cap retry-after to reasonable limits - if retry_duration <= Duration::from_secs(3600) { - *backoff = std::cmp::max(*backoff, retry_duration); - } + /// Handle the "Retry-After" header + fn handle_retry_after_header(&self, headers: &http::HeaderMap) { + if let Some(retry_after_value) = headers.get("retry-after") { + let duration = match headers::parse_retry_after(retry_after_value) { + Ok(e) => e, + Err(e) => { + warn!("Unable to parse Retry-After header as per RFC 7231: {e}"); + return; + } + }; + + self.increase_backoff(duration); } } - /// Helper method to parse numeric header values from common rate limit headers - fn parse_header_value(headers: &http::HeaderMap, header_names: &[&str]) -> Option { - for header_name in header_names { - if let Some(value) = headers.get(*header_name) - && let Ok(value_str) = value.to_str() - && let Ok(number) = value_str.parse::() - { - 
return Some(number); - } + fn increase_backoff(&self, mut increased_backoff: Duration) { + if increased_backoff > MAXIMUM_BACKOFF { + warn!( + "Encountered an unexpectedly big rate limit backoff duration of {}. Capping the duration to {} instead.", + format_duration(increased_backoff), + format_duration(MAXIMUM_BACKOFF) + ); + increased_backoff = MAXIMUM_BACKOFF; } - None + + let mut backoff = self.backoff_duration.lock().unwrap(); + *backoff = std::cmp::max(*backoff, increased_backoff); } /// Get host statistics diff --git a/lychee-lib/src/ratelimit/mod.rs b/lychee-lib/src/ratelimit/mod.rs index d03f8c1397..27a6d8028c 100644 --- a/lychee-lib/src/ratelimit/mod.rs +++ b/lychee-lib/src/ratelimit/mod.rs @@ -14,6 +14,7 @@ //! - [`Window`]: Rolling window data structure for request timing mod config; +mod headers; mod host; mod pool; From 3d0d4fa1616f87971af0aef64edff34c8bc63009 Mon Sep 17 00:00:00 2001 From: Thomas Zahner Date: Tue, 16 Dec 2025 19:01:33 +0100 Subject: [PATCH 36/43] Apply @mre's suggestions --- lychee-lib/src/ratelimit/config.rs | 16 ++++---- lychee-lib/src/ratelimit/host/host.rs | 4 +- lychee-lib/src/ratelimit/host/key.rs | 3 +- lychee-lib/src/ratelimit/host/stats.rs | 2 +- lychee-lib/src/ratelimit/mod.rs | 3 +- lychee-lib/src/ratelimit/pool.rs | 52 +++++++++----------------- 6 files changed, 31 insertions(+), 49 deletions(-) diff --git a/lychee-lib/src/ratelimit/config.rs b/lychee-lib/src/ratelimit/config.rs index f5f90f5fd2..8540028066 100644 --- a/lychee-lib/src/ratelimit/config.rs +++ b/lychee-lib/src/ratelimit/config.rs @@ -4,6 +4,9 @@ use std::collections::HashMap; use crate::ratelimit::{HostKey, RequestInterval}; +/// Default number of concurrent requests per host +const DEFAULT_HOST_CONCURRENCY: usize = 10; + /// Global rate limiting configuration that applies as defaults to all hosts #[derive(Debug, Clone, Copy, Serialize, Deserialize)] pub struct RateLimitConfig { @@ -68,6 +71,11 @@ impl Default for HostConfig { } } +/// Default number of 
concurrent requests per host +const fn default_host_concurrency() -> usize { + DEFAULT_HOST_CONCURRENCY +} + impl HostConfig { /// Get the effective max concurrency, falling back to the global default #[must_use] @@ -84,14 +92,6 @@ impl HostConfig { } } -/// Default number of concurrent requests per host -const DEFAULT_HOST_CONCURRENCY: usize = 10; - -/// Default number of concurrent requests per host -const fn default_host_concurrency() -> usize { - DEFAULT_HOST_CONCURRENCY -} - /// Custom deserializer for headers from TOML config format fn deserialize_headers<'de, D>(deserializer: D) -> Result where diff --git a/lychee-lib/src/ratelimit/host/host.rs b/lychee-lib/src/ratelimit/host/host.rs index e15daff358..c0d4d7128a 100644 --- a/lychee-lib/src/ratelimit/host/host.rs +++ b/lychee-lib/src/ratelimit/host/host.rs @@ -21,8 +21,8 @@ use crate::{ ratelimit::{HostConfig, RateLimitConfig}, }; -/// Cap retry-after to reasonable limits -const MAXIMUM_BACKOFF: Duration = Duration::from_secs(10 * 60); +/// Cap maximum backoff duration to reasonable limits +const MAXIMUM_BACKOFF: Duration = Duration::from_secs(60); /// Cache value for per-host caching #[derive(Debug, Clone)] diff --git a/lychee-lib/src/ratelimit/host/key.rs b/lychee-lib/src/ratelimit/host/key.rs index 09f224ceec..4361b9b8da 100644 --- a/lychee-lib/src/ratelimit/host/key.rs +++ b/lychee-lib/src/ratelimit/host/key.rs @@ -8,8 +8,7 @@ use crate::types::Result; /// A type-safe representation of a hostname for rate limiting purposes. /// /// This extracts and normalizes hostnames from URLs to ensure consistent -/// rate limiting across requests to the same host. Subdomains are treated -/// as separate hosts to allow for traffic sharding. +/// rate limiting across requests to the same host (domain or IP address). 
/// /// # Examples /// diff --git a/lychee-lib/src/ratelimit/host/stats.rs b/lychee-lib/src/ratelimit/host/stats.rs index 983462c409..c78ec43623 100644 --- a/lychee-lib/src/ratelimit/host/stats.rs +++ b/lychee-lib/src/ratelimit/host/stats.rs @@ -1,7 +1,7 @@ use std::collections::HashMap; use std::time::{Duration, Instant}; -/// Statistics tracking for a host's request patterns +/// Record and report statistics for a [`crate::ratelimit::Host`] #[derive(Debug, Clone, Default)] pub struct HostStats { /// Total number of requests made to this host diff --git a/lychee-lib/src/ratelimit/mod.rs b/lychee-lib/src/ratelimit/mod.rs index 27a6d8028c..80a6de8775 100644 --- a/lychee-lib/src/ratelimit/mod.rs +++ b/lychee-lib/src/ratelimit/mod.rs @@ -7,11 +7,10 @@ //! # Architecture //! //! - [`HostKey`]: Represents a hostname/domain for rate limiting -//! - [`Host`]: Manages rate limiting, concurrency, caching, and cookies for a specific host +//! - [`Host`]: Manages rate limiting, concurrency, and caching for a specific host //! - [`HostPool`]: Coordinates multiple hosts and routes requests appropriately //! - [`HostConfig`]: Configuration for per-host behavior //! - [`HostStats`]: Statistics tracking for each host -//! 
- [`Window`]: Rolling window data structure for request timing mod config; mod headers; diff --git a/lychee-lib/src/ratelimit/pool.rs b/lychee-lib/src/ratelimit/pool.rs index 256b685b3b..309b6b6957 100644 --- a/lychee-lib/src/ratelimit/pool.rs +++ b/lychee-lib/src/ratelimit/pool.rs @@ -89,40 +89,24 @@ impl HostPool { /// Get an existing host or create a new one for the given hostname fn get_or_create_host(&self, host_key: HostKey) -> Result> { - if let Some(host) = self.hosts.get(&host_key) { - return Ok(host.clone()); - } - - let host_config = self - .host_configs - .get(&host_key) - .cloned() - .unwrap_or_default(); - - let client = self - .client_map - .get(&host_key) - .unwrap_or(&self.default_client) - .clone(); - - let host = Arc::new(Host::new( - host_key.clone(), - &host_config, - &self.global_config, - client, - )?); - - // Store in map (handle race condition where another thread created it) - match self.hosts.entry(host_key) { - dashmap::mapref::entry::Entry::Occupied(entry) => { - // Another thread created it, use theirs - Ok(entry.get().clone()) - } - dashmap::mapref::entry::Entry::Vacant(entry) => { - // We're first, insert ours - Ok(entry.insert(host).clone()) - } - } + self.hosts + .entry(host_key.clone()) + .or_try_insert_with(|| { + let host_config = self + .host_configs + .get(&host_key) + .cloned() + .unwrap_or_default(); + + let client = self + .client_map + .get(&host_key) + .unwrap_or(&self.default_client) + .clone(); + + Host::new(host_key, &host_config, &self.global_config, client).map(Arc::new) + }) + .map(|entry| entry.value().clone()) } /// Returns statistics for the host if it exists, otherwise returns empty stats. 
From c748f7d46ecae2da2723622644aa1e6ee78b2f7b Mon Sep 17 00:00:00 2001 From: Thomas Zahner Date: Tue, 16 Dec 2025 19:07:05 +0100 Subject: [PATCH 37/43] Apply suggestions from code review Co-authored-by: Matthias Endler --- README.md | 2 +- lychee-bin/src/commands/check.rs | 5 ++++- lychee-bin/src/formatters/stats/mod.rs | 4 ++-- lychee-bin/src/options.rs | 2 +- lychee-lib/src/client.rs | 6 +++--- lychee-lib/src/types/error.rs | 2 +- 6 files changed, 12 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index ac04b244d7..d14c83f6f9 100644 --- a/README.md +++ b/README.md @@ -670,7 +670,7 @@ Options: Minimum interval between requests to the same host (default: 100ms) Sets a baseline delay between consecutive requests to prevent - hammering servers. The adaptive algorithm may increase this based + overloading servers. The adaptive algorithm may increase this based on server responses (rate limits, errors). Use the `hosts` option to configure this on a per-host basis. diff --git a/lychee-bin/src/commands/check.rs b/lychee-bin/src/commands/check.rs index cdfaf3c57a..d0209a7c31 100644 --- a/lychee-bin/src/commands/check.rs +++ b/lychee-bin/src/commands/check.rs @@ -287,7 +287,10 @@ async fn handle( let response = check_url(client, request).await; - // Apply the same caching rules as before + // - Never cache filesystem access as it is fast already so caching has no benefit. + // - Skip caching unsupported URLs as they might be supported in a future run. + // - Skip caching excluded links; they might not be excluded in the next run. + // - Skip caching links for which the status code has been explicitly excluded from the cache. 
let status = response.status(); if ignore_cache(&uri, status, &cache_exclude_status) { return Ok(response); diff --git a/lychee-bin/src/formatters/stats/mod.rs b/lychee-bin/src/formatters/stats/mod.rs index 574440fd5d..8d6cb559e9 100644 --- a/lychee-bin/src/formatters/stats/mod.rs +++ b/lychee-bin/src/formatters/stats/mod.rs @@ -29,8 +29,8 @@ pub(crate) trait StatsFormatter { /// If configured to do so, output response statistics to stdout or the specified output file. pub(crate) fn output_response_statistics(stats: ResponseStats, config: &Config) -> Result<()> { let is_empty = stats.is_empty(); - let formatted_stats = get_stats_formatter(&config.format, &config.mode).format(stats)?; - if let Some(formatted_stats) = formatted_stats { + let formatter = get_stats_formatter(&config.format, &config.mode); + if let Some(formatted_stats) = formatter.format(stats)? { if let Some(output) = &config.output { fs::write(output, formatted_stats).context("Cannot write status output to file")?; } else { diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index fc57b906a9..87f269ad64 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -554,7 +554,7 @@ with a status code of 429, 500 and 501." /// Minimum interval between requests to the same host (default: 100ms) /// /// Sets a baseline delay between consecutive requests to prevent - /// hammering servers. The adaptive algorithm may increase this based + /// overloading servers. The adaptive algorithm may increase this based /// on server responses (rate limits, errors). Use the `hosts` option /// to configure this on a per-host basis. 
/// diff --git a/lychee-lib/src/client.rs b/lychee-lib/src/client.rs index 91c78d8aff..e9c595a656 100644 --- a/lychee-lib/src/client.rs +++ b/lychee-lib/src/client.rs @@ -408,7 +408,7 @@ impl ClientBuilder { self.hosts .iter() .map(|(host, config)| { - let mut headers = self.headers()?; + let mut headers = self.default_headers()?; headers.extend(config.headers.clone()); let client = self .build_client(redirect_history)? @@ -424,7 +424,7 @@ impl ClientBuilder { fn build_client(&self, redirect_history: &RedirectHistory) -> Result { let mut builder = reqwest::ClientBuilder::new() .gzip(true) - .default_headers(self.headers()?) + .default_headers(self.default_headers()?) .danger_accept_invalid_certs(self.allow_insecure) .connect_timeout(Duration::from_secs(CONNECT_TIMEOUT)) .tcp_keepalive(Duration::from_secs(TCP_KEEPALIVE)) @@ -448,7 +448,7 @@ impl ClientBuilder { Ok(builder) } - fn headers(&self) -> Result { + fn default_headers(&self) -> Result { let user_agent = self.user_agent.clone(); let mut headers = self.custom_headers.clone(); diff --git a/lychee-lib/src/types/error.rs b/lychee-lib/src/types/error.rs index 6243cab684..a351c4dbbf 100644 --- a/lychee-lib/src/types/error.rs +++ b/lychee-lib/src/types/error.rs @@ -138,7 +138,7 @@ pub enum ErrorKind { #[error("Cannot send/receive message from channel")] Channel(#[from] tokio::sync::mpsc::error::SendError), - /// An URL with no host was found + /// A URL without a host was found #[error("URL is missing a host")] InvalidUrlHost, From 6297aa0567a1f51158be633ef9fcca487711977e Mon Sep 17 00:00:00 2001 From: Thomas Zahner Date: Wed, 17 Dec 2025 18:10:48 +0100 Subject: [PATCH 38/43] Fix tests --- lychee-bin/tests/cli.rs | 2 +- lychee-lib/src/checker/website.rs | 5 ++++- lychee-lib/src/client.rs | 3 +-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index 5ef75a02a0..ee113b9eb8 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -2361,7 
+2361,7 @@ The config file should contain every possible key for documentation purposes." #[tokio::test] async fn test_retry_rate_limit_headers() { const RETRY_DELAY: Duration = Duration::from_secs(1); - const TOLERANCE: Duration = Duration::from_millis(200); + const TOLERANCE: Duration = Duration::from_millis(500); let server = wiremock::MockServer::start().await; wiremock::Mock::given(wiremock::matchers::method("GET")) diff --git a/lychee-lib/src/checker/website.rs b/lychee-lib/src/checker/website.rs index 400e42bcd7..5da0e22660 100644 --- a/lychee-lib/src/checker/website.rs +++ b/lychee-lib/src/checker/website.rs @@ -163,7 +163,10 @@ impl WebsiteChecker { status } } - Err(e) => e.into(), + Err(e) => match e { + ErrorKind::NetworkRequest(error) => Status::from(error), + _ => e.into(), + }, } } diff --git a/lychee-lib/src/client.rs b/lychee-lib/src/client.rs index e9c595a656..20146b1b2d 100644 --- a/lychee-lib/src/client.rs +++ b/lychee-lib/src/client.rs @@ -545,8 +545,7 @@ impl Client { } let status = match uri.scheme() { - // We don't check tel: URIs - _ if uri.is_tel() => Status::Excluded, + _ if uri.is_tel() => Status::Excluded, // We don't check tel: URIs _ if uri.is_file() => self.check_file(uri).await, _ if uri.is_mail() => self.check_mail(uri).await, _ => self.check_website(uri, credentials).await?, From a551feeccdcab2d674946ab14fd936e5c720cc13 Mon Sep 17 00:00:00 2001 From: Thomas Zahner Date: Thu, 18 Dec 2025 10:50:53 +0100 Subject: [PATCH 39/43] Minor improvements Remove unneeded Result and move unwrap to compile time --- lychee-lib/src/ratelimit/host/host.rs | 22 +++++++------------- lychee-lib/src/ratelimit/pool.rs | 30 ++++++++++++++++----------- 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/lychee-lib/src/ratelimit/host/host.rs b/lychee-lib/src/ratelimit/host/host.rs index c0d4d7128a..d0345c6a8d 100644 --- a/lychee-lib/src/ratelimit/host/host.rs +++ b/lychee-lib/src/ratelimit/host/host.rs @@ -8,8 +8,8 @@ use governor::{ use 
humantime_serde::re::humantime::format_duration; use log::warn; use reqwest::{Client as ReqwestClient, Request, Response}; -use std::sync::Mutex; use std::time::{Duration, Instant}; +use std::{num::NonZeroU32, sync::Mutex}; use tokio::sync::Semaphore; use super::key::HostKey; @@ -76,24 +76,18 @@ pub struct Host { impl Host { /// Create a new Host instance for the given hostname - /// - /// # Errors - /// - /// Returns an error if the HTTP client cannot be configured properly - /// - /// # Panics - /// - /// Panics if the burst size cannot be set to 1 (should never happen) + #[must_use] pub fn new( key: HostKey, host_config: &HostConfig, global_config: &RateLimitConfig, client: ReqwestClient, - ) -> Result { + ) -> Self { + const MAX_BURST: NonZeroU32 = NonZeroU32::new(1).unwrap(); let quota = host_config .effective_request_interval(global_config) .into_inner() - .allow_burst(std::num::NonZeroU32::new(1).unwrap()); + .allow_burst(MAX_BURST); let rate_limiter = RateLimiter::direct(quota); @@ -101,7 +95,7 @@ impl Host { let max_concurrent = host_config.effective_max_concurrent(global_config); let semaphore = Semaphore::new(max_concurrent); - Ok(Host { + Host { key, rate_limiter, semaphore, @@ -109,7 +103,7 @@ impl Host { stats: Mutex::new(HostStats::default()), backoff_duration: Mutex::new(Duration::from_millis(0)), cache: DashMap::new(), - }) + } } /// Check if a URI is cached and return the cached status if valid @@ -365,7 +359,7 @@ mod tests { let host_config = HostConfig::default(); let global_config = RateLimitConfig::default(); - let host = Host::new(key.clone(), &host_config, &global_config, Client::default()).unwrap(); + let host = Host::new(key.clone(), &host_config, &global_config, Client::default()); assert_eq!(host.key, key); assert_eq!(host.available_permits(), 10); // Default concurrency diff --git a/lychee-lib/src/ratelimit/pool.rs b/lychee-lib/src/ratelimit/pool.rs index 309b6b6957..79f9904e96 100644 --- a/lychee-lib/src/ratelimit/pool.rs +++ 
b/lychee-lib/src/ratelimit/pool.rs @@ -83,15 +83,15 @@ impl HostPool { pub async fn execute_request(&self, request: Request) -> Result { let url = request.url(); let host_key = HostKey::try_from(url)?; - let host = self.get_or_create_host(host_key)?; + let host = self.get_or_create_host(host_key); host.execute_request(request).await } /// Get an existing host or create a new one for the given hostname - fn get_or_create_host(&self, host_key: HostKey) -> Result> { + fn get_or_create_host(&self, host_key: HostKey) -> Arc { self.hosts .entry(host_key.clone()) - .or_try_insert_with(|| { + .or_insert_with(|| { let host_config = self .host_configs .get(&host_key) @@ -104,9 +104,15 @@ impl HostPool { .unwrap_or(&self.default_client) .clone(); - Host::new(host_key, &host_config, &self.global_config, client).map(Arc::new) + Arc::new(Host::new( + host_key, + &host_config, + &self.global_config, + client, + )) }) - .map(|entry| entry.value().clone()) + .value() + .clone() } /// Returns statistics for the host if it exists, otherwise returns empty stats. @@ -213,12 +219,12 @@ impl HostPool { /// /// # Errors /// - /// Returns an error if the host key cannot be parsed from the URI or if the host cannot be created. + /// Returns an error if the host key cannot be parsed from the URI. pub fn record_cache_hit(&self, uri: &crate::Uri) -> Result<()> { let host_key = crate::ratelimit::HostKey::try_from(uri)?; // Get or create the host (this ensures statistics tracking even for cache-only requests) - let host = self.get_or_create_host(host_key)?; + let host = self.get_or_create_host(host_key); host.record_persistent_cache_hit(); Ok(()) } @@ -231,12 +237,12 @@ impl HostPool { /// /// # Errors /// - /// Returns an error if the host key cannot be parsed from the URI or if the host cannot be created. + /// Returns an error if the host key cannot be parsed from the URI. 
pub fn record_cache_miss(&self, uri: &crate::Uri) -> Result<()> { let host_key = crate::ratelimit::HostKey::try_from(uri)?; // Get or create the host (this ensures statistics tracking even for cache-only requests) - let host = self.get_or_create_host(host_key)?; + let host = self.get_or_create_host(host_key); host.record_persistent_cache_miss(); Ok(()) } @@ -289,7 +295,7 @@ mod tests { assert_eq!(pool.host_stats("example.com").total_requests, 0); // Create host on demand - let host = pool.get_or_create_host(host_key).unwrap(); + let host = pool.get_or_create_host(host_key); // Now we have one host assert_eq!(pool.active_host_count(), 1); @@ -307,11 +313,11 @@ mod tests { let host_key2 = HostKey::try_from(&url).unwrap(); // Create host for first request - let host1 = pool.get_or_create_host(host_key1).unwrap(); + let host1 = pool.get_or_create_host(host_key1); assert_eq!(pool.active_host_count(), 1); // Second request to same host should reuse - let host2 = pool.get_or_create_host(host_key2).unwrap(); + let host2 = pool.get_or_create_host(host_key2); assert_eq!(pool.active_host_count(), 1); // Should be the same instance From 570430556beb541778f7c55deea40e754187583e Mon Sep 17 00:00:00 2001 From: Thomas Zahner Date: Thu, 18 Dec 2025 12:16:39 +0100 Subject: [PATCH 40/43] Remove reqwest_client from WebsiteChecker --- lychee-bin/tests/cli.rs | 1 + lychee-lib/src/checker/website.rs | 10 +--------- lychee-lib/src/client.rs | 3 +-- lychee-lib/src/ratelimit/host/host.rs | 4 ++++ lychee-lib/src/ratelimit/pool.rs | 23 ++++++++++++++++++++--- 5 files changed, 27 insertions(+), 14 deletions(-) diff --git a/lychee-bin/tests/cli.rs b/lychee-bin/tests/cli.rs index ee113b9eb8..c0cae571d0 100644 --- a/lychee-bin/tests/cli.rs +++ b/lychee-bin/tests/cli.rs @@ -2387,6 +2387,7 @@ The config file should contain every possible key for documentation purposes." 
.await; cargo_bin_cmd!() + // Direct args are not using the host pool, they are resolved earlier via Collector .arg("-") // Retry wait times are added on top of host-specific backoff timeout .arg("--retry-wait-time") diff --git a/lychee-lib/src/checker/website.rs b/lychee-lib/src/checker/website.rs index 5da0e22660..6a00915e46 100644 --- a/lychee-lib/src/checker/website.rs +++ b/lychee-lib/src/checker/website.rs @@ -19,9 +19,6 @@ pub(crate) struct WebsiteChecker { /// Request method used for making requests. method: reqwest::Method, - /// The HTTP client used for requests. - reqwest_client: reqwest::Client, - /// GitHub client used for requests. github_client: Option, @@ -76,7 +73,6 @@ impl WebsiteChecker { retry_wait_time: Duration, redirect_history: RedirectHistory, max_retries: u64, - reqwest_client: reqwest::Client, accepted: HashSet, github_client: Option, require_https: bool, @@ -86,7 +82,6 @@ impl WebsiteChecker { ) -> Self { Self { method, - reqwest_client, github_client, plugin_request_chain, redirect_history, @@ -259,10 +254,7 @@ impl WebsiteChecker { /// - The request failed. /// - The response status code is not accepted. 
async fn check_website_inner(&self, uri: &Uri, default_chain: &RequestChain) -> Status { - let request = self - .reqwest_client - .request(self.method.clone(), uri.as_str()) - .build(); + let request = self.host_pool.build_request(self.method.clone(), uri); let request = match request { Ok(r) => r, diff --git a/lychee-lib/src/client.rs b/lychee-lib/src/client.rs index 20146b1b2d..e0ec4e6c5c 100644 --- a/lychee-lib/src/client.rs +++ b/lychee-lib/src/client.rs @@ -347,7 +347,7 @@ impl ClientBuilder { let host_pool = HostPool::new( self.rate_limit_config, self.hosts, - reqwest_client.clone(), + reqwest_client, client_map, ); @@ -380,7 +380,6 @@ impl ClientBuilder { self.retry_wait_time, redirect_history.clone(), self.max_retries, - reqwest_client, self.accepted, github_client, self.require_https, diff --git a/lychee-lib/src/ratelimit/host/host.rs b/lychee-lib/src/ratelimit/host/host.rs index d0345c6a8d..37f86fa3f8 100644 --- a/lychee-lib/src/ratelimit/host/host.rs +++ b/lychee-lib/src/ratelimit/host/host.rs @@ -208,6 +208,10 @@ impl Host { Ok(response) } + pub(crate) const fn get_client(&self) -> &ReqwestClient { + &self.client + } + /// Update internal statistics and backoff based on the response fn update_stats_and_backoff(&self, status_code: u16, request_time: Duration) { // Update statistics diff --git a/lychee-lib/src/ratelimit/pool.rs b/lychee-lib/src/ratelimit/pool.rs index 79f9904e96..cbd1d2bf88 100644 --- a/lychee-lib/src/ratelimit/pool.rs +++ b/lychee-lib/src/ratelimit/pool.rs @@ -1,11 +1,12 @@ use dashmap::DashMap; +use http::Method; use reqwest::{Client, Request, Response}; use std::collections::HashMap; use std::sync::Arc; use crate::ratelimit::{Host, HostConfigs, HostKey, HostStats, RateLimitConfig}; use crate::types::Result; -use crate::{CacheStatus, Status, Uri}; +use crate::{CacheStatus, ErrorKind, Status, Uri}; /// Keep track of host-specific [`reqwest::Client`]s pub type ClientMap = HashMap; @@ -57,11 +58,11 @@ impl HostPool { } } - /// Execute an 
HTTP request with appropriate per-host rate limiting. + /// Try to execute a [`Request`] with appropriate per-host rate limiting. /// /// # Errors /// - /// Returns a `RateLimitError` if: + /// Fails if: /// - The request URL has no valid hostname /// - The underlying HTTP request fails /// @@ -87,6 +88,22 @@ impl HostPool { host.execute_request(request).await } + /// Try to build a [`Request`] + /// + /// # Errors + /// + /// Fails if: + /// - The request URI has no valid hostname + /// - The request fails to build + pub fn build_request(&self, method: Method, uri: &Uri) -> Result { + let host_key = HostKey::try_from(uri)?; + let host = self.get_or_create_host(host_key); + host.get_client() + .request(method, uri.url.clone()) + .build() + .map_err(ErrorKind::BuildRequestClient) + } + /// Get an existing host or create a new one for the given hostname fn get_or_create_host(&self, host_key: HostKey) -> Arc { self.hosts From d5e8afeaa77aad6ffd7be179043847eed21305f7 Mon Sep 17 00:00:00 2001 From: Thomas Zahner Date: Thu, 18 Dec 2025 16:59:20 +0100 Subject: [PATCH 41/43] Reference rate-limits crate as per @mre's suggestion --- lychee-lib/src/ratelimit/headers.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lychee-lib/src/ratelimit/headers.rs b/lychee-lib/src/ratelimit/headers.rs index cb806bf4a9..bdc616c9aa 100644 --- a/lychee-lib/src/ratelimit/headers.rs +++ b/lychee-lib/src/ratelimit/headers.rs @@ -1,3 +1,7 @@ +//! Handle rate limiting headers. +//! Note that we might want to replace this module with +//! at some point in the future. + use http::HeaderValue; use std::time::{Duration, SystemTime}; use thiserror::Error; From 0f21985b9f9ef068959f9e2e6e1d0c722aa19a30 Mon Sep 17 00:00:00 2001 From: Thomas Zahner Date: Fri, 19 Dec 2025 12:17:48 +0100 Subject: [PATCH 42/43] Update option names and the default interval value The default host request interval is now more optimistic. The option names are now more consistent. 
We now have the host_ prefix for host related options. This prefix is dropped inside the host config. --- README.md | 24 ++++++------- lychee-bin/src/client.rs | 2 +- lychee-bin/src/options.rs | 18 +++++----- lychee-lib/src/ratelimit/config.rs | 41 +++++++++++------------ lychee-lib/src/ratelimit/host/host.rs | 2 +- lychee-lib/src/ratelimit/host/interval.rs | 4 +-- lychee.example.toml | 8 ++--- 7 files changed, 49 insertions(+), 50 deletions(-) diff --git a/README.md b/README.md index d14c83f6f9..2ca3f215cf 100644 --- a/README.md +++ b/README.md @@ -542,6 +542,18 @@ Options: --host-concurrency 2 # Conservative for slow APIs --host-concurrency 20 # Aggressive for fast APIs + --host-request-interval + Minimum interval between requests to the same host (default: 50ms) + + Sets a baseline delay between consecutive requests to prevent + overloading servers. The adaptive algorithm may increase this based + on server responses (rate limits, errors). Use the `hosts` option + to configure this on a per-host basis. + + Examples: + --host-request-interval 50ms # Fast for robust APIs + --host-request-interval 1s # Conservative for rate-limited APIs + --host-stats Show per-host statistics at the end of the run @@ -666,18 +678,6 @@ Options: --remap Remap URI matching pattern to different URI - --request-interval - Minimum interval between requests to the same host (default: 100ms) - - Sets a baseline delay between consecutive requests to prevent - overloading servers. The adaptive algorithm may increase this based - on server responses (rate limits, errors). Use the `hosts` option - to configure this on a per-host basis. 
- - Examples: - --request-interval 50ms # Fast for robust APIs - --request-interval 1s # Conservative for rate-limited APIs - --require-https When HTTPS is available, treat HTTP links as errors diff --git a/lychee-bin/src/client.rs b/lychee-bin/src/client.rs index 5d4765059e..3efefffe6f 100644 --- a/lychee-bin/src/client.rs +++ b/lychee-bin/src/client.rs @@ -57,7 +57,7 @@ pub(crate) fn create(cfg: &Config, cookie_jar: Option<&Arc>) - .index_files(cfg.index_files.clone()) .rate_limit_config(RateLimitConfig::from_options( cfg.host_concurrency, - cfg.request_interval, + cfg.host_request_interval, )) .hosts(cfg.hosts.clone()) .build() diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index 87f269ad64..7ff3edc050 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -391,7 +391,7 @@ where pub(crate) struct Config { /// Read input filenames from the given file or stdin (if path is '-'). #[arg( - long = "files-from", + long, value_name = "PATH", long_help = "Read input filenames from the given file or stdin (if path is '-'). @@ -547,11 +547,11 @@ with a status code of 429, 500 and 501." /// Examples: /// --host-concurrency 2 # Conservative for slow APIs /// --host-concurrency 20 # Aggressive for fast APIs - #[arg(long = "host-concurrency", verbatim_doc_comment)] + #[arg(long, verbatim_doc_comment)] #[serde(default)] pub(crate) host_concurrency: Option, - /// Minimum interval between requests to the same host (default: 100ms) + /// Minimum interval between requests to the same host (default: 50ms) /// /// Sets a baseline delay between consecutive requests to prevent /// overloading servers. The adaptive algorithm may increase this based @@ -559,10 +559,10 @@ with a status code of 429, 500 and 501." /// to configure this on a per-host basis. 
/// /// Examples: - /// --request-interval 50ms # Fast for robust APIs - /// --request-interval 1s # Conservative for rate-limited APIs - #[arg(long = "request-interval", verbatim_doc_comment)] - pub(crate) request_interval: Option, + /// --host-request-interval 50ms # Fast for robust APIs + /// --host-request-interval 1s # Conservative for rate-limited APIs + #[arg(long, verbatim_doc_comment)] + pub(crate) host_request_interval: Option, /// Number of threads to utilize. /// Defaults to number of cores available to the system @@ -700,7 +700,7 @@ Note: This option only takes effect on `file://` URIs which exist and point to a /// Set custom header for requests #[arg( short = 'H', - long = "header", + long, // Note: We use a `Vec<(String, String)>` for headers, which is // unfortunate. The reason is that `clap::ArgAction::Append` collects // multiple values, and `clap` cannot automatically convert these tuples @@ -994,7 +994,7 @@ impl Config { cookie_jar: None, default_extension: None, host_concurrency: None, - request_interval: None, + host_request_interval: None, dump: false, dump_inputs: false, exclude: Vec::::new(), diff --git a/lychee-lib/src/ratelimit/config.rs b/lychee-lib/src/ratelimit/config.rs index 8540028066..e3306f1979 100644 --- a/lychee-lib/src/ratelimit/config.rs +++ b/lychee-lib/src/ratelimit/config.rs @@ -5,14 +5,14 @@ use std::collections::HashMap; use crate::ratelimit::{HostKey, RequestInterval}; /// Default number of concurrent requests per host -const DEFAULT_HOST_CONCURRENCY: usize = 10; +const DEFAULT_CONCURRENCY: usize = 10; /// Global rate limiting configuration that applies as defaults to all hosts #[derive(Debug, Clone, Copy, Serialize, Deserialize)] pub struct RateLimitConfig { /// Default maximum concurrent requests per host - #[serde(default = "default_host_concurrency")] - pub host_concurrency: usize, + #[serde(default = "default_concurrency")] + pub concurrency: usize, /// Default minimum interval between requests to the same host pub 
request_interval: RequestInterval, @@ -21,7 +21,7 @@ pub struct RateLimitConfig { impl Default for RateLimitConfig { fn default() -> Self { Self { - host_concurrency: default_host_concurrency(), + concurrency: default_concurrency(), request_interval: RequestInterval::default(), } } @@ -31,11 +31,11 @@ impl RateLimitConfig { /// Create a `RateLimitConfig` from CLI options, using defaults for missing values #[must_use] pub fn from_options( - host_concurrency: Option, + concurrency: Option, request_interval: Option, ) -> Self { Self { - host_concurrency: host_concurrency.unwrap_or(DEFAULT_HOST_CONCURRENCY), + concurrency: concurrency.unwrap_or(DEFAULT_CONCURRENCY), request_interval: request_interval.unwrap_or_default(), } } @@ -49,7 +49,7 @@ pub type HostConfigs = HashMap; #[serde(deny_unknown_fields)] pub struct HostConfig { /// Maximum concurrent requests allowed to this host - pub max_concurrent: Option, + pub concurrency: Option, /// Minimum interval between requests to this host pub request_interval: Option, @@ -64,7 +64,7 @@ pub struct HostConfig { impl Default for HostConfig { fn default() -> Self { Self { - max_concurrent: None, + concurrency: None, request_interval: None, headers: HeaderMap::new(), } @@ -72,16 +72,15 @@ impl Default for HostConfig { } /// Default number of concurrent requests per host -const fn default_host_concurrency() -> usize { - DEFAULT_HOST_CONCURRENCY +const fn default_concurrency() -> usize { + DEFAULT_CONCURRENCY } impl HostConfig { - /// Get the effective max concurrency, falling back to the global default + /// Get the effective maximum concurrency, falling back to the global default #[must_use] - pub fn effective_max_concurrent(&self, global_config: &RateLimitConfig) -> usize { - self.max_concurrent - .unwrap_or(global_config.host_concurrency) + pub fn effective_concurrency(&self, global_config: &RateLimitConfig) -> usize { + self.concurrency.unwrap_or(global_config.concurrency) } /// Get the effective request interval, falling 
back to the global default @@ -138,7 +137,7 @@ mod tests { // Test with no overrides let host_config = HostConfig::default(); - assert_eq!(host_config.effective_max_concurrent(&global_config), 10); + assert_eq!(host_config.effective_concurrency(&global_config), 10); assert_eq!( host_config.effective_request_interval(&global_config), RequestInterval::default(), @@ -146,11 +145,11 @@ mod tests { // Test with overrides let host_config = HostConfig { - max_concurrent: Some(5), + concurrency: Some(5), request_interval: Some("500ms".parse().unwrap()), headers: HeaderMap::new(), }; - assert_eq!(host_config.effective_max_concurrent(&global_config), 5); + assert_eq!(host_config.effective_concurrency(&global_config), 5); assert_eq!( host_config .effective_request_interval(&global_config) @@ -162,14 +161,14 @@ mod tests { #[test] fn test_config_serialization() { let config = RateLimitConfig { - host_concurrency: 15, + concurrency: 15, request_interval: "200ms".parse().unwrap(), }; let toml = toml::to_string(&config).unwrap(); let deserialized: RateLimitConfig = toml::from_str(&toml).unwrap(); - assert_eq!(config.host_concurrency, deserialized.host_concurrency); + assert_eq!(config.concurrency, deserialized.concurrency); assert_eq!(config.request_interval, deserialized.request_interval); } @@ -180,7 +179,7 @@ mod tests { headers.insert("User-Agent", "test-agent".parse().unwrap()); let host_config = HostConfig { - max_concurrent: Some(5), + concurrency: Some(5), request_interval: Some("500ms".parse().unwrap()), headers, }; @@ -188,7 +187,7 @@ mod tests { let toml = toml::to_string(&host_config).unwrap(); let deserialized: HostConfig = toml::from_str(&dbg!(toml)).unwrap(); - assert_eq!(deserialized.max_concurrent, Some(5)); + assert_eq!(deserialized.concurrency, Some(5)); assert_eq!( deserialized.request_interval, Some("500ms".parse().unwrap()) diff --git a/lychee-lib/src/ratelimit/host/host.rs b/lychee-lib/src/ratelimit/host/host.rs index 37f86fa3f8..6fccaf742f 100644 --- 
a/lychee-lib/src/ratelimit/host/host.rs +++ b/lychee-lib/src/ratelimit/host/host.rs @@ -92,7 +92,7 @@ impl Host { let rate_limiter = RateLimiter::direct(quota); // Create semaphore for concurrency control - let max_concurrent = host_config.effective_max_concurrent(global_config); + let max_concurrent = host_config.effective_concurrency(global_config); let semaphore = Semaphore::new(max_concurrent); Host { diff --git a/lychee-lib/src/ratelimit/host/interval.rs b/lychee-lib/src/ratelimit/host/interval.rs index ee6826de9c..3b9b1fd44c 100644 --- a/lychee-lib/src/ratelimit/host/interval.rs +++ b/lychee-lib/src/ratelimit/host/interval.rs @@ -39,9 +39,9 @@ impl RequestInterval { } impl Default for RequestInterval { - /// The default interval is 100 milliseconds. + /// The default interval is 50 milliseconds. fn default() -> Self { - const PER_SECOND: Quota = Quota::per_second(NonZero::new(10).unwrap()); + const PER_SECOND: Quota = Quota::per_second(NonZero::new(20).unwrap()); Self(PER_SECOND) } } diff --git a/lychee.example.toml b/lychee.example.toml index 1a9e5645c3..64052d0154 100644 --- a/lychee.example.toml +++ b/lychee.example.toml @@ -210,13 +210,13 @@ suggest = true host_concurrency = 5 # Minimum interval between requests to the same host -request_interval = "200ms" +host_request_interval = "50ms" # Customize hosts [hosts."blog.example.com"] # Overwrite `host_concurrency` for this host -max_concurrent = 5 -# Overwrite `request_interval` for this host +concurrency = 5 +# Overwrite `host_request_interval` for this host request_interval = "200ms" -# Combine global `header` values with the following `headers` for this host +# Merge global `header` values with the following `headers` for this host headers = { "A" = "B" } From b815e61e7ab69b3b429a69584bc84f0da51bd420 Mon Sep 17 00:00:00 2001 From: Thomas Zahner Date: Fri, 19 Dec 2025 14:35:53 +0100 Subject: [PATCH 43/43] Allow 0 to disable per-host rate limiting This also reverts "Create RequestInterval" 
89fd9f925520fc6967c124e1c84782cadd7c1368. --- lychee-bin/src/options.rs | 7 ++- lychee-lib/src/ratelimit/config.rs | 67 ++++++++++++---------- lychee-lib/src/ratelimit/host/host.rs | 18 +++--- lychee-lib/src/ratelimit/host/interval.rs | 68 ----------------------- lychee-lib/src/ratelimit/host/mod.rs | 2 - lychee-lib/src/ratelimit/mod.rs | 2 +- lychee.example.toml | 2 +- 7 files changed, 52 insertions(+), 114 deletions(-) delete mode 100644 lychee-lib/src/ratelimit/host/interval.rs diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index 7ff3edc050..b9be8fe521 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -11,7 +11,7 @@ use http::{ header::{HeaderName, HeaderValue}, }; use lychee_lib::Preprocessor; -use lychee_lib::ratelimit::{HostConfigs, RequestInterval}; +use lychee_lib::ratelimit::HostConfigs; use lychee_lib::{ Base, BasicAuthSelector, DEFAULT_MAX_REDIRECTS, DEFAULT_MAX_RETRIES, DEFAULT_RETRY_WAIT_TIME_SECS, DEFAULT_TIMEOUT_SECS, DEFAULT_USER_AGENT, FileExtensions, @@ -561,8 +561,9 @@ with a status code of 429, 500 and 501." /// Examples: /// --host-request-interval 50ms # Fast for robust APIs /// --host-request-interval 1s # Conservative for rate-limited APIs - #[arg(long, verbatim_doc_comment)] - pub(crate) host_request_interval: Option, + #[arg(long, value_parser = humantime::parse_duration, verbatim_doc_comment)] + #[serde(default, with = "humantime_serde")] + pub(crate) host_request_interval: Option, /// Number of threads to utilize. 
/// Defaults to number of cores available to the system diff --git a/lychee-lib/src/ratelimit/config.rs b/lychee-lib/src/ratelimit/config.rs index e3306f1979..0d48f52a1b 100644 --- a/lychee-lib/src/ratelimit/config.rs +++ b/lychee-lib/src/ratelimit/config.rs @@ -1,12 +1,16 @@ use http::{HeaderMap, HeaderName, HeaderValue}; use serde::{Deserialize, Serialize}; use std::collections::HashMap; +use std::time::Duration; -use crate::ratelimit::{HostKey, RequestInterval}; +use crate::ratelimit::HostKey; /// Default number of concurrent requests per host const DEFAULT_CONCURRENCY: usize = 10; +/// Default interval between requests to the same host +const DEFAULT_REQUEST_INTERVAL: Duration = Duration::from_millis(50); + /// Global rate limiting configuration that applies as defaults to all hosts #[derive(Debug, Clone, Copy, Serialize, Deserialize)] pub struct RateLimitConfig { @@ -15,28 +19,36 @@ pub struct RateLimitConfig { pub concurrency: usize, /// Default minimum interval between requests to the same host - pub request_interval: RequestInterval, + #[serde(default = "default_request_interval", with = "humantime_serde")] + pub request_interval: Duration, } impl Default for RateLimitConfig { fn default() -> Self { Self { concurrency: default_concurrency(), - request_interval: RequestInterval::default(), + request_interval: default_request_interval(), } } } +/// Default number of concurrent requests per host +const fn default_concurrency() -> usize { + DEFAULT_CONCURRENCY +} + +/// Default interval between requests to the same host +const fn default_request_interval() -> Duration { + DEFAULT_REQUEST_INTERVAL +} + impl RateLimitConfig { /// Create a `RateLimitConfig` from CLI options, using defaults for missing values #[must_use] - pub fn from_options( - concurrency: Option, - request_interval: Option, - ) -> Self { + pub fn from_options(concurrency: Option, request_interval: Option) -> Self { Self { concurrency: concurrency.unwrap_or(DEFAULT_CONCURRENCY), - 
request_interval: request_interval.unwrap_or_default(), + request_interval: request_interval.unwrap_or(DEFAULT_REQUEST_INTERVAL), } } } @@ -52,7 +64,8 @@ pub struct HostConfig { pub concurrency: Option, /// Minimum interval between requests to this host - pub request_interval: Option, + #[serde(default, with = "humantime_serde")] + pub request_interval: Option, /// Custom headers to send with requests to this host #[serde(default)] @@ -71,11 +84,6 @@ impl Default for HostConfig { } } -/// Default number of concurrent requests per host -const fn default_concurrency() -> usize { - DEFAULT_CONCURRENCY -} - impl HostConfig { /// Get the effective maximum concurrency, falling back to the global default #[must_use] @@ -85,7 +93,7 @@ impl HostConfig { /// Get the effective request interval, falling back to the global default #[must_use] - pub fn effective_request_interval(&self, global_config: &RateLimitConfig) -> RequestInterval { + pub fn effective_request_interval(&self, global_config: &RateLimitConfig) -> Duration { self.request_interval .unwrap_or(global_config.request_interval) } @@ -125,12 +133,15 @@ where #[cfg(test)] mod tests { - use std::time::Duration; - - use governor::Quota; - use super::*; + #[test] + fn test_default_rate_limit_config() { + let config = RateLimitConfig::default(); + assert_eq!(config.concurrency, 10); + assert_eq!(config.request_interval, Duration::from_millis(50)); + } + #[test] fn test_host_config_effective_values() { let global_config = RateLimitConfig::default(); @@ -140,21 +151,19 @@ mod tests { assert_eq!(host_config.effective_concurrency(&global_config), 10); assert_eq!( host_config.effective_request_interval(&global_config), - RequestInterval::default(), + Duration::from_millis(50) ); // Test with overrides let host_config = HostConfig { concurrency: Some(5), - request_interval: Some("500ms".parse().unwrap()), + request_interval: Some(Duration::from_millis(500)), headers: HeaderMap::new(), }; 
assert_eq!(host_config.effective_concurrency(&global_config), 5); assert_eq!( - host_config - .effective_request_interval(&global_config) - .into_inner(), - Quota::with_period(Duration::from_millis(500)).unwrap() + host_config.effective_request_interval(&global_config), + Duration::from_millis(500) ); } @@ -162,7 +171,7 @@ mod tests { fn test_config_serialization() { let config = RateLimitConfig { concurrency: 15, - request_interval: "200ms".parse().unwrap(), + request_interval: Duration::from_millis(200), }; let toml = toml::to_string(&config).unwrap(); @@ -180,17 +189,17 @@ mod tests { let host_config = HostConfig { concurrency: Some(5), - request_interval: Some("500ms".parse().unwrap()), + request_interval: Some(Duration::from_millis(500)), headers, }; let toml = toml::to_string(&host_config).unwrap(); - let deserialized: HostConfig = toml::from_str(&dbg!(toml)).unwrap(); + let deserialized: HostConfig = toml::from_str(&toml).unwrap(); assert_eq!(deserialized.concurrency, Some(5)); assert_eq!( deserialized.request_interval, - Some("500ms".parse().unwrap()) + Some(Duration::from_millis(500)) ); assert_eq!(deserialized.headers.len(), 2); assert!(deserialized.headers.contains_key("authorization")); diff --git a/lychee-lib/src/ratelimit/host/host.rs b/lychee-lib/src/ratelimit/host/host.rs index 6fccaf742f..842f9a1720 100644 --- a/lychee-lib/src/ratelimit/host/host.rs +++ b/lychee-lib/src/ratelimit/host/host.rs @@ -1,7 +1,7 @@ use crate::ratelimit::headers; use dashmap::DashMap; use governor::{ - RateLimiter, + Quota, RateLimiter, clock::DefaultClock, state::{InMemoryState, NotKeyed}, }; @@ -56,7 +56,7 @@ pub struct Host { pub key: HostKey, /// Rate limiter using token bucket algorithm - rate_limiter: RateLimiter, + rate_limiter: Option>, /// Controls maximum concurrent requests to this host semaphore: Semaphore, @@ -84,12 +84,9 @@ impl Host { client: ReqwestClient, ) -> Self { const MAX_BURST: NonZeroU32 = NonZeroU32::new(1).unwrap(); - let quota = host_config - 
.effective_request_interval(global_config) - .into_inner() - .allow_burst(MAX_BURST); - - let rate_limiter = RateLimiter::direct(quota); + let interval = host_config.effective_request_interval(global_config); + let rate_limiter = + Quota::with_period(interval).map(|q| RateLimiter::direct(q.allow_burst(MAX_BURST))); // Create semaphore for concurrency control let max_concurrent = host_config.effective_concurrency(global_config); @@ -180,8 +177,9 @@ impl Host { tokio::time::sleep(backoff_duration).await; } - // Wait for rate limiter permission - self.rate_limiter.until_ready().await; + if let Some(rate_limiter) = &self.rate_limiter { + rate_limiter.until_ready().await; + } // Execute the request and track timing let start_time = Instant::now(); diff --git a/lychee-lib/src/ratelimit/host/interval.rs b/lychee-lib/src/ratelimit/host/interval.rs deleted file mode 100644 index 3b9b1fd44c..0000000000 --- a/lychee-lib/src/ratelimit/host/interval.rs +++ /dev/null @@ -1,68 +0,0 @@ -use governor::Quota; -use humantime_serde::re::humantime::{self, DurationError}; -use serde::{Deserialize, Serialize, Serializer}; -use std::num::NonZero; -use std::str::FromStr; -use thiserror::Error; - -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -/// Interval between requests to the same host -pub struct RequestInterval(Quota); - -#[derive(Debug, Error, PartialEq)] -pub enum ParseError { - #[error("Parse error: {0}")] - HumantimeError(DurationError), - #[error("Interval must not be zero")] - ZeroInterval, -} - -impl FromStr for RequestInterval { - type Err = ParseError; - - fn from_str(input: &str) -> Result { - let duration = input - .parse::() - .map_err(ParseError::HumantimeError)?; - Ok(RequestInterval( - Quota::with_period(duration.into()).ok_or(ParseError::ZeroInterval)?, - )) - } -} - -impl RequestInterval { - /// Convert into inner [`Quota`] - #[must_use] - pub const fn into_inner(self) -> Quota { - self.0 - } -} - -impl Default for RequestInterval { - /// The default interval is 50 
milliseconds. - fn default() -> Self { - const PER_SECOND: Quota = Quota::per_second(NonZero::new(20).unwrap()); - Self(PER_SECOND) - } -} - -impl Serialize for RequestInterval { - fn serialize(&self, serializer: S) -> Result - where - S: Serializer, - { - humantime::Duration::from(self.0.replenish_interval()) - .to_string() - .serialize(serializer) - } -} - -impl<'de> Deserialize<'de> for RequestInterval { - fn deserialize(deserializer: D) -> Result - where - D: serde::Deserializer<'de>, - { - let string = ::deserialize(deserializer)?; - Self::from_str(&string).map_err(serde::de::Error::custom) - } -} diff --git a/lychee-lib/src/ratelimit/host/mod.rs b/lychee-lib/src/ratelimit/host/mod.rs index ed2e48506a..50b8b1ad3e 100644 --- a/lychee-lib/src/ratelimit/host/mod.rs +++ b/lychee-lib/src/ratelimit/host/mod.rs @@ -1,11 +1,9 @@ #![allow(clippy::module_inception)] mod host; -mod interval; mod key; mod stats; pub use host::Host; -pub use interval::RequestInterval; pub use key::HostKey; pub use stats::HostStats; diff --git a/lychee-lib/src/ratelimit/mod.rs b/lychee-lib/src/ratelimit/mod.rs index 80a6de8775..ad4cb48551 100644 --- a/lychee-lib/src/ratelimit/mod.rs +++ b/lychee-lib/src/ratelimit/mod.rs @@ -18,5 +18,5 @@ mod host; mod pool; pub use config::{HostConfig, HostConfigs, RateLimitConfig}; -pub use host::{Host, HostKey, HostStats, RequestInterval}; +pub use host::{Host, HostKey, HostStats}; pub use pool::{ClientMap, HostPool}; diff --git a/lychee.example.toml b/lychee.example.toml index 64052d0154..967031ae92 100644 --- a/lychee.example.toml +++ b/lychee.example.toml @@ -217,6 +217,6 @@ host_request_interval = "50ms" # Overwrite `host_concurrency` for this host concurrency = 5 # Overwrite `host_request_interval` for this host -request_interval = "200ms" +request_interval = "0" # zero disables rate limiting # Merge global `header` values with the following `headers` for this host headers = { "A" = "B" }