From 6ca7ace1b13a8231c270f421ac4177b6526c9d7b Mon Sep 17 00:00:00 2001 From: Keming Date: Mon, 1 Dec 2025 00:53:54 +0800 Subject: [PATCH 1/4] feat: add github markdown fragment quirk Signed-off-by: Keming --- lychee-lib/src/quirks/mod.rs | 38 ++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/lychee-lib/src/quirks/mod.rs b/lychee-lib/src/quirks/mod.rs index a712aab8c6..299c75d288 100644 --- a/lychee-lib/src/quirks/mod.rs +++ b/lychee-lib/src/quirks/mod.rs @@ -15,6 +15,8 @@ static YOUTUBE_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(https?://)?(www\.)?youtube(-nocookie)?\.com").unwrap()); static YOUTUBE_SHORT_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(https?://)?(www\.)?(youtu\.?be)").unwrap()); +static GITHUB_MARKDOWN_FRAGMENT_PATTERN: LazyLock<Regex> = + LazyLock::new(|| Regex::new(r"^https://github.com/(.*?)/(.*?)/blob/(.*?)/(.*#.*)$").unwrap()); // Retrieve a map of query params for the given request fn query(request: &Request) -> HashMap { @@ -78,6 +80,25 @@ impl Default for Quirks { request }, }, + Quirk { + pattern: &GITHUB_MARKDOWN_FRAGMENT_PATTERN, + rewrite: |mut request| { + let matches = GITHUB_MARKDOWN_FRAGMENT_PATTERN + .captures(request.url().as_str()) + .expect("should be always true as `is_match` is true"); + let raw_url = format!( + "https://raw.githubusercontent.com/{}", + matches + .iter() + .skip(1) + .map(|c| c.expect("match GitHub markdown pattern").as_str()) + .collect::<Vec<_>>() + .join("/"), + ); + *request.url_mut() = Url::parse(&raw_url).unwrap(); + request + }, + }, ]; Self { quirks } } @@ -188,6 +209,23 @@ mod tests { assert_eq!(MockRequest(modified), MockRequest::new(Method::GET, url)); } + #[test] + fn test_github_markdown_fragment_request() { + let url = + Url::parse("https://github.com/moby/docker-image-spec/blob/main/spec.md#terminology") + .unwrap(); + let request = Request::new(Method::GET, url); + let modified = Quirks::default().apply(request); + + assert_eq!( + MockRequest(modified), 
MockRequest::new( + Method::GET, + Url::parse("https://raw.githubusercontent.com/moby/docker-image-spec/main/spec.md#terminology").unwrap() + ) + ); + } + #[test] fn test_no_quirk_applied() { let url = Url::parse("https://endler.dev").unwrap(); From 5709ae3e1c6866acfe4fbdf67e28d73fc4bb8228 Mon Sep 17 00:00:00 2001 From: Keming Date: Thu, 4 Dec 2025 21:33:25 +0800 Subject: [PATCH 2/4] address comments Signed-off-by: Keming --- lychee-lib/src/quirks/mod.rs | 41 ++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 23 deletions(-) diff --git a/lychee-lib/src/quirks/mod.rs b/lychee-lib/src/quirks/mod.rs index 299c75d288..297c4dc100 100644 --- a/lychee-lib/src/quirks/mod.rs +++ b/lychee-lib/src/quirks/mod.rs @@ -5,7 +5,7 @@ use crate::{ use async_trait::async_trait; use header::HeaderValue; use http::header; -use regex::Regex; +use regex::{Captures, Regex}; use reqwest::{Request, Url}; use std::{collections::HashMap, sync::LazyLock}; @@ -15,8 +15,10 @@ static YOUTUBE_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(https?://)?(www\.)?youtube(-nocookie)?\.com").unwrap()); static YOUTUBE_SHORT_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(https?://)?(www\.)?(youtu\.?be)").unwrap()); -static GITHUB_MARKDOWN_FRAGMENT_PATTERN: LazyLock<Regex> = - LazyLock::new(|| Regex::new(r"^https://github.com/(.*?)/(.*?)/blob/(.*?)/(.*#.*)$").unwrap()); +static GITHUB_BLOB_MARKDOWN_FRAGMENT_PATTERN: LazyLock<Regex> = LazyLock::new(|| { + Regex::new(r"^https://github.com/(?<user>.*?)/(?<repo>.*?)/blob/(?<path>.*?)/(?<file>.*md#.*)$") + .unwrap() +}); // Retrieve a map of query params for the given request fn query(request: &Request) -> HashMap { @@ -26,7 +28,7 @@ fn query(request: &Request) -> HashMap { #[derive(Debug, Clone)] pub(crate) struct Quirk { pub(crate) pattern: &'static LazyLock<Regex>, - pub(crate) rewrite: fn(Request) -> Request, + pub(crate) rewrite: fn(Request, Captures) -> Request, } #[derive(Debug, Clone)] @@ -39,7 +41,7 @@ impl Default for Quirks { let quirks = vec![ Quirk { pattern: 
&CRATES_PATTERN, - rewrite: |mut request| { + rewrite: |mut request, _| { request .headers_mut() .insert(header::ACCEPT, HeaderValue::from_static("text/html")); @@ -48,7 +50,7 @@ impl Default for Quirks { }, Quirk { pattern: &YOUTUBE_PATTERN, - rewrite: |mut request| { + rewrite: |mut request, _| { // Extract video id if it's a video page let video_id = match request.url().path() { "/watch" => query(&request).get("v").map(ToOwned::to_owned), @@ -69,7 +71,7 @@ impl Default for Quirks { }, Quirk { pattern: &YOUTUBE_SHORT_PATTERN, - rewrite: |mut request| { + rewrite: |mut request, _| { // Short links use the path as video id let id = request.url().path().trim_start_matches('/'); if id.is_empty() { @@ -81,19 +83,12 @@ impl Default for Quirks { }, }, Quirk { - pattern: &GITHUB_MARKDOWN_FRAGMENT_PATTERN, - rewrite: |mut request| { - let matches = GITHUB_MARKDOWN_FRAGMENT_PATTERN - .captures(request.url().as_str()) - .expect("should be always true as `is_match` is true"); - let raw_url = format!( - "https://raw.githubusercontent.com/{}", - matches - .iter() - .skip(1) - .map(|c| c.expect("match GitHub markdown pattern").as_str()) - .collect::>() - .join("/"), + pattern: &GITHUB_BLOB_MARKDOWN_FRAGMENT_PATTERN, + rewrite: |mut request, captures| { + let mut raw_url = String::new(); + captures.expand( + "https://raw.githubusercontent.com/$user/$repo/$path/$file", + &mut raw_url, ); *request.url_mut() = Url::parse(&raw_url).unwrap(); request @@ -110,8 +105,8 @@ impl Quirks { /// simplicity reasons. This limitation might be lifted in the future. 
pub(crate) fn apply(&self, request: Request) -> Request { for quirk in &self.quirks { - if quirk.pattern.is_match(request.url().as_str()) { - return (quirk.rewrite)(request); + if let Some(captures) = quirk.pattern.captures(request.url().clone().as_str()) { + return (quirk.rewrite)(request, captures); } } // Request was not modified @@ -210,7 +205,7 @@ mod tests { } #[test] - fn test_github_markdown_fragment_request() { + fn test_github_blob_markdown_fragment_request() { let url = Url::parse("https://github.com/moby/docker-image-spec/blob/main/spec.md#terminology") .unwrap(); From 2e8a7eca9cdd358d48982fb4ab80038578c6c431 Mon Sep 17 00:00:00 2001 From: Keming Date: Thu, 4 Dec 2025 22:00:09 +0800 Subject: [PATCH 3/4] address comments Signed-off-by: Keming --- lychee-lib/src/quirks/mod.rs | 45 +++++++++++++++++++++++++----------- 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/lychee-lib/src/quirks/mod.rs b/lychee-lib/src/quirks/mod.rs index 297c4dc100..e1885f640f 100644 --- a/lychee-lib/src/quirks/mod.rs +++ b/lychee-lib/src/quirks/mod.rs @@ -16,7 +16,7 @@ static YOUTUBE_PATTERN: LazyLock<Regex> = static YOUTUBE_SHORT_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(https?://)?(www\.)?(youtu\.?be)").unwrap()); static GITHUB_BLOB_MARKDOWN_FRAGMENT_PATTERN: LazyLock<Regex> = LazyLock::new(|| { - Regex::new(r"^https://github.com/(?<user>.*?)/(?<repo>.*?)/blob/(?<path>.*?)/(?<file>.*md#.*)$") + Regex::new(r"^https://github\.com/(?<user>.*?)/(?<repo>.*?)/blob/(?<path>.*?)/(?<file>.*\.(md|markdown)#.*)$") .unwrap() }); @@ -206,19 +206,38 @@ mod tests { #[test] fn test_github_blob_markdown_fragment_request() { - let url = - Url::parse("https://github.com/moby/docker-image-spec/blob/main/spec.md#terminology") - .unwrap(); - let request = Request::new(Method::GET, url); - let modified = Quirks::default().apply(request); - - assert_eq!( - MockRequest(modified), - MockRequest::new( - Method::GET, - Url::parse("https://raw.githubusercontent.com/moby/docker-image-spec/main/spec.md#terminology").unwrap() + let cases = [ + 
( + "https://github.com/moby/docker-image-spec/blob/main/spec.md#terminology", + "https://raw.githubusercontent.com/moby/docker-image-spec/main/spec.md#terminology", + ), + ( + "https://github.com/moby/docker-image-spec/blob/main/spec.markdown#terminology", + "https://raw.githubusercontent.com/moby/docker-image-spec/main/spec.markdown#terminology", + ), + ( + "https://github.com/moby/docker-image-spec/blob/main/spec.md", + "https://github.com/moby/docker-image-spec/blob/main/spec.md", + ), + ( + "https://github.com/lycheeverse/lychee/blob/master/.gitignore#section", + "https://github.com/lycheeverse/lychee/blob/master/.gitignore#section", + ), + ( + "https://github.com/lycheeverse/lychee/blob/v0.15.0/README.md#features", + "https://raw.githubusercontent.com/lycheeverse/lychee/v0.15.0/README.md#features", ) - ); + ]; + for (origin, expect) in cases.iter() { + let url = Url::parse(&origin).unwrap(); + let request = Request::new(Method::GET, url); + let modified = Quirks::default().apply(request); + + assert_eq!( + MockRequest(modified), + MockRequest::new(Method::GET, Url::parse(expect).unwrap()) + ); + } } #[test] From 3b94ebf9c6041f2a0d51f0644a1149ede0d9e939 Mon Sep 17 00:00:00 2001 From: Keming Date: Thu, 4 Dec 2025 22:02:19 +0800 Subject: [PATCH 4/4] fix lint Signed-off-by: Keming --- lychee-lib/src/quirks/mod.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lychee-lib/src/quirks/mod.rs b/lychee-lib/src/quirks/mod.rs index e1885f640f..07d5ef195a 100644 --- a/lychee-lib/src/quirks/mod.rs +++ b/lychee-lib/src/quirks/mod.rs @@ -226,10 +226,10 @@ mod tests { ( "https://github.com/lycheeverse/lychee/blob/v0.15.0/README.md#features", "https://raw.githubusercontent.com/lycheeverse/lychee/v0.15.0/README.md#features", - ) + ), ]; - for (origin, expect) in cases.iter() { - let url = Url::parse(&origin).unwrap(); + for (origin, expect) in &cases { + let url = Url::parse(origin).unwrap(); let request = Request::new(Method::GET, url); let modified = 
Quirks::default().apply(request);