Skip to content
12 changes: 12 additions & 0 deletions lychee-bin/tests/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3022,6 +3022,18 @@ mod cli {
Ok(())
}

/// A malformed glob pattern must be rejected while the command line is being
/// parsed, i.e. before any link checking happens, and must be reported as a
/// CLI parsing error.
#[test]
fn test_invalid_glob_fails_parse() {
    let mut cmd = main_command!();
    let outcome = cmd.arg("invalid-unmatched-brackets[").assert();

    outcome
        .stderr(contains("Cannot parse input"))
        .failure()
        // Exit code 1 is the cli parsing error code.
        .code(1);
}

/// Preprocessing with `cat` is like an identity function because it
/// outputs its input without any changes.
#[test]
Expand Down
2 changes: 1 addition & 1 deletion lychee-lib/src/collector.rs
Original file line number Diff line number Diff line change
Expand Up @@ -384,7 +384,7 @@ mod tests {
))),
Input::from_input_source(InputSource::FsPath(file_path)),
Input::from_input_source(InputSource::FsGlob {
pattern: temp_dir_path.join("glob*").to_str().unwrap().to_owned(),
pattern: glob::Pattern::new(&temp_dir_path.join("glob*").to_string_lossy())?,
ignore_case: true,
}),
]);
Expand Down
79 changes: 2 additions & 77 deletions lychee-lib/src/types/input/input.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,10 @@ use crate::{ErrorKind, Result};
use async_stream::try_stream;
use futures::stream::{Stream, StreamExt};
use glob::glob_with;
use reqwest::Url;
use shellexpand::tilde;
use std::path::{Path, PathBuf};
use tokio::io::{AsyncReadExt, stdin};

const STDIN: &str = "-";

/// Lychee Input with optional file hint for parsing
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub struct Input {
Expand Down Expand Up @@ -48,79 +45,7 @@ impl Input {
file_type_hint: Option<FileType>,
glob_ignore_case: bool,
) -> Result<Self> {
let source = if input == STDIN {
InputSource::Stdin
} else {
// We use [`reqwest::Url::parse`] because it catches some other edge cases that [`http::Request:builder`] does not
match Url::parse(input) {
// Weed out non-HTTP schemes, including Windows drive
// specifiers, which can be parsed by the
// [url](https://crates.io/crates/url) crate
Ok(url) if url.scheme() == "http" || url.scheme() == "https" => {
InputSource::RemoteUrl(Box::new(url))
}
Ok(_) => {
// URL parsed successfully, but it's not HTTP or HTTPS
return Err(ErrorKind::InvalidFile(PathBuf::from(input)));
}
_ => {
// This seems to be the only way to determine if this is a glob pattern
let is_glob = glob::Pattern::escape(input) != input;

if is_glob {
InputSource::FsGlob {
pattern: input.to_owned(),
ignore_case: glob_ignore_case,
}
} else {
// It might be a file path; check if it exists
let path = PathBuf::from(input);

// On Windows, a filepath can never be mistaken for a
// URL, because Windows filepaths use `\` and URLs use
// `/`
#[cfg(windows)]
if path.exists() {
// The file exists, so we return the path
InputSource::FsPath(path)
} else {
// We have a valid filepath, but the file does not
// exist so we return an error
return Err(ErrorKind::InvalidFile(path));
}

#[cfg(unix)]
if path.exists() {
InputSource::FsPath(path)
} else if input.starts_with('~') || input.starts_with('.') {
// The path is not valid, but it might still be a
// valid URL.
//
// Check if the path starts with a tilde (`~`) or a
// dot and exit early if it does.
//
// This check might not be sufficient to cover all cases
// but it catches the most common ones
return Err(ErrorKind::InvalidFile(path));
} else {
// Invalid path; check if a valid URL can be constructed from the input
// by prefixing it with a `http://` scheme.
//
// Curl also uses http (i.e. not https), see
// https://github.com/curl/curl/blob/70ac27604a2abfa809a7b2736506af0da8c3c8a9/lib/urlapi.c#L1104-L1124
//
// TODO: We should get rid of this heuristic and
// require users to provide a full URL with scheme.
// This is a big source of confusion to users.
let url = Url::parse(&format!("http://{input}")).map_err(|e| {
ErrorKind::ParseUrl(e, "Input is not a valid URL".to_string())
})?;
InputSource::RemoteUrl(Box::new(url))
}
}
}
}
};
let source = InputSource::new(input, glob_ignore_case)?;
Ok(Self {
source,
file_type_hint,
Expand Down Expand Up @@ -285,7 +210,7 @@ impl Input {
ref pattern,
ignore_case,
} => {
let glob_expanded = tilde(&pattern).to_string();
let glob_expanded = tilde(pattern.as_str()).to_string();
let mut match_opts = glob::MatchOptions::new();
match_opts.case_sensitive = !ignore_case;
for entry in glob_with(&glob_expanded, match_opts)? {
Expand Down
4 changes: 3 additions & 1 deletion lychee-lib/src/types/input/resolver.rs
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,9 @@ impl InputResolver {
InputSource::FsGlob { pattern, ignore_case } => {
// For glob patterns, we expand the pattern and yield
// matching paths as ResolvedInputSource::FsPath items.
let glob_expanded = tilde(pattern).to_string();
// NOTE: we convert the glob::Pattern back to str because
// `glob_with` only takes string arguments.
let glob_expanded = tilde(pattern.as_str()).to_string();
let mut match_opts = glob::MatchOptions::new();
match_opts.case_sensitive = !ignore_case;

Expand Down
123 changes: 120 additions & 3 deletions lychee-lib/src/types/input/source.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,15 @@
//! and filtered by extension
//! - URLs, raw strings, and standard input (`stdin`) are read directly

use crate::ErrorKind;

use glob::Pattern;
use reqwest::Url;
use serde::{Deserialize, Serialize};
use serde::{Deserialize, Deserializer, Serialize};
use std::borrow::Cow;
use std::fmt::Display;
use std::path::PathBuf;
use std::result::Result;

/// Input types which lychee supports
#[derive(Debug, Clone, PartialEq, Eq, Hash, Deserialize)]
Expand All @@ -29,7 +33,8 @@ pub enum InputSource {
/// Unix shell-style glob pattern.
FsGlob {
/// The glob pattern matching all input files
pattern: String,
#[serde(deserialize_with = "InputSource::deserialize_pattern")]
pattern: Pattern,
/// Don't be case sensitive when matching files against a glob pattern
ignore_case: bool,
},
Expand All @@ -41,6 +46,98 @@ pub enum InputSource {
String(Cow<'static, str>),
}

impl InputSource {
const STDIN: &str = "-";

/// Parses a [`InputSource`] from the given string. The kind of input source will be
/// automatically detected according to certain rules and precedences.
///
/// # Errors
///
/// Returns an error if:
/// - the input does not exist (i.e. the path is invalid)
/// - the input cannot be parsed as a URL
pub fn new(input: &str, glob_ignore_case: bool) -> Result<Self, ErrorKind> {
if input == Self::STDIN {
return Ok(InputSource::Stdin);
}

// We use [`reqwest::Url::parse`] because it catches some other edge cases that [`http::Request:builder`] does not
if let Ok(url) = Url::parse(input) {
// Weed out non-HTTP schemes, including Windows drive
// specifiers, which can be parsed by the
// [url](https://crates.io/crates/url) crate
return match url.scheme() {
"http" | "https" => Ok(InputSource::RemoteUrl(Box::new(url))),
_ => Err(ErrorKind::InvalidFile(PathBuf::from(input))),
};
}

// This seems to be the only way to determine if this is a glob pattern
let is_glob = glob::Pattern::escape(input) != input;

if is_glob {
return Ok(InputSource::FsGlob {
pattern: Pattern::new(input)?,
ignore_case: glob_ignore_case,
});
}

// It might be a file path; check if it exists
let path = PathBuf::from(input);

// On Windows, a filepath can never be mistaken for a
// URL, because Windows filepaths use `\` and URLs use
// `/`
#[cfg(windows)]
if path.exists() {
// The file exists, so we return the path
Ok(InputSource::FsPath(path))
} else {
// We have a valid filepath, but the file does not
// exist so we return an error
Err(ErrorKind::InvalidFile(path))
}

#[cfg(unix)]
if path.exists() {
Ok(InputSource::FsPath(path))
} else if input.starts_with('~') || input.starts_with('.') {
// The path is not valid, but it might still be a
// valid URL.
//
// Check if the path starts with a tilde (`~`) or a
// dot and exit early if it does.
//
// This check might not be sufficient to cover all cases
// but it catches the most common ones
Err(ErrorKind::InvalidFile(path))
} else {
// Invalid path; check if a valid URL can be constructed from the input
// by prefixing it with a `http://` scheme.
//
// Curl also uses http (i.e. not https), see
// https://github.com/curl/curl/blob/70ac27604a2abfa809a7b2736506af0da8c3c8a9/lib/urlapi.c#L1104-L1124
//
// TODO: We should get rid of this heuristic and
// require users to provide a full URL with scheme.
// This is a big source of confusion to users.
let url = Url::parse(&format!("http://{input}"))
.map_err(|e| ErrorKind::ParseUrl(e, "Input is not a valid URL".to_string()))?;
Ok(InputSource::RemoteUrl(Box::new(url)))
}
}

fn deserialize_pattern<'de, D>(deserializer: D) -> Result<Pattern, D::Error>
where
D: Deserializer<'de>,
{
use serde::de::Error;
let s = String::deserialize(deserializer)?;
Pattern::new(&s).map_err(D::Error::custom)
}
}

/// Resolved input sources that can be processed for content.
///
/// This represents input sources after glob pattern expansion.
Expand Down Expand Up @@ -105,10 +202,30 @@ impl Display for InputSource {
/// Writes the textual form of this input source.
///
/// - `RemoteUrl`: the URL as a string
/// - `FsGlob`: the text of the glob pattern
/// - `FsPath`: the path, or an empty string if it is not valid UTF-8
/// - `Stdin`: the literal `stdin`
/// - `String`: the string itself
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
    f.write_str(match self {
        Self::RemoteUrl(url) => url.as_str(),
        // `pattern` is a compiled `glob::Pattern`, not a `String`, so its
        // text has to be borrowed explicitly. There must be exactly one
        // `FsGlob` arm: an earlier arm yielding `pattern` itself would not
        // produce a `&str` and would shadow this one.
        Self::FsGlob { pattern, .. } => pattern.as_str(),
        Self::FsPath(path) => path.to_str().unwrap_or_default(),
        Self::Stdin => "stdin",
        Self::String(s) => s.as_ref(),
    })
}
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Serialization of `FsGlob` relies on [`glob::Pattern::to_string`].
    /// This test checks that an `FsGlob` serializes to exactly the same JSON
    /// as the plain pattern string it was built from.
    #[test]
    fn test_pattern_serialization_is_original_pattern() {
        let pat = "asd[f]*";
        let source = InputSource::FsGlob {
            pattern: Pattern::new(pat).unwrap(),
            ignore_case: false,
        };

        let actual = serde_json::to_string(&source).unwrap();
        let expected = serde_json::to_string(pat).unwrap();

        assert_eq!(actual, expected);
    }
}