diff --git a/lychee-lib/src/quirks/mod.rs b/lychee-lib/src/quirks/mod.rs index a712aab8c6..07d5ef195a 100644 --- a/lychee-lib/src/quirks/mod.rs +++ b/lychee-lib/src/quirks/mod.rs @@ -5,7 +5,7 @@ use crate::{ use async_trait::async_trait; use header::HeaderValue; use http::header; -use regex::Regex; +use regex::{Captures, Regex}; use reqwest::{Request, Url}; use std::{collections::HashMap, sync::LazyLock}; @@ -15,6 +15,10 @@ static YOUTUBE_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(https?://)?(www\.)?youtube(-nocookie)?\.com").unwrap()); static YOUTUBE_SHORT_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(https?://)?(www\.)?(youtu\.?be)").unwrap()); +static GITHUB_BLOB_MARKDOWN_FRAGMENT_PATTERN: LazyLock<Regex> = LazyLock::new(|| { + Regex::new(r"^https://github\.com/(?<user>.*?)/(?<repo>.*?)/blob/(?<path>.*?)/(?<file>.*\.(md|markdown)#.*)$") + .unwrap() +}); // Retrieve a map of query params for the given request fn query(request: &Request) -> HashMap<String, String> { @@ -24,7 +28,7 @@ fn query(request: &Request) -> HashMap<String, String> { #[derive(Debug, Clone)] pub(crate) struct Quirk { pub(crate) pattern: &'static LazyLock<Regex>, - pub(crate) rewrite: fn(Request) -> Request, + pub(crate) rewrite: fn(Request, Captures) -> Request, } #[derive(Debug, Clone)] @@ -37,7 +41,7 @@ impl Default for Quirks { let quirks = vec![ Quirk { pattern: &CRATES_PATTERN, - rewrite: |mut request| { + rewrite: |mut request, _| { request .headers_mut() .insert(header::ACCEPT, HeaderValue::from_static("text/html")); @@ -46,7 +50,7 @@ impl Default for Quirks { }, Quirk { pattern: &YOUTUBE_PATTERN, - rewrite: |mut request| { + rewrite: |mut request, _| { // Extract video id if it's a video page let video_id = match request.url().path() { "/watch" => query(&request).get("v").map(ToOwned::to_owned), @@ -67,7 +71,7 @@ impl Default for Quirks { }, Quirk { pattern: &YOUTUBE_SHORT_PATTERN, - rewrite: |mut request| { + rewrite: |mut request, _| { // Short links use the path as video id let id = request.url().path().trim_start_matches('/'); if
id.is_empty() { @@ -78,6 +82,18 @@ impl Default for Quirks { request }, }, + Quirk { + pattern: &GITHUB_BLOB_MARKDOWN_FRAGMENT_PATTERN, + rewrite: |mut request, captures| { + let mut raw_url = String::new(); + captures.expand( + "https://raw.githubusercontent.com/$user/$repo/$path/$file", + &mut raw_url, + ); + *request.url_mut() = Url::parse(&raw_url).unwrap(); + request + }, + }, ]; Self { quirks } } @@ -89,8 +105,8 @@ impl Quirks { /// simplicity reasons. This limitation might be lifted in the future. pub(crate) fn apply(&self, request: Request) -> Request { for quirk in &self.quirks { - if quirk.pattern.is_match(request.url().as_str()) { - return (quirk.rewrite)(request); + if let Some(captures) = quirk.pattern.captures(request.url().clone().as_str()) { + return (quirk.rewrite)(request, captures); } } // Request was not modified @@ -188,6 +204,42 @@ mod tests { assert_eq!(MockRequest(modified), MockRequest::new(Method::GET, url)); } + #[test] + fn test_github_blob_markdown_fragment_request() { + let cases = [ + ( + "https://github.com/moby/docker-image-spec/blob/main/spec.md#terminology", + "https://raw.githubusercontent.com/moby/docker-image-spec/main/spec.md#terminology", + ), + ( + "https://github.com/moby/docker-image-spec/blob/main/spec.markdown#terminology", + "https://raw.githubusercontent.com/moby/docker-image-spec/main/spec.markdown#terminology", + ), + ( + "https://github.com/moby/docker-image-spec/blob/main/spec.md", + "https://github.com/moby/docker-image-spec/blob/main/spec.md", + ), + ( + "https://github.com/lycheeverse/lychee/blob/master/.gitignore#section", + "https://github.com/lycheeverse/lychee/blob/master/.gitignore#section", + ), + ( + "https://github.com/lycheeverse/lychee/blob/v0.15.0/README.md#features", + "https://raw.githubusercontent.com/lycheeverse/lychee/v0.15.0/README.md#features", + ), + ]; + for (origin, expect) in &cases { + let url = Url::parse(origin).unwrap(); + let request = Request::new(Method::GET, url); + let modified = 
Quirks::default().apply(request); + + assert_eq!( + MockRequest(modified), + MockRequest::new(Method::GET, Url::parse(expect).unwrap()) + ); + } + } + #[test] fn test_no_quirk_applied() { let url = Url::parse("https://endler.dev").unwrap();