Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 3 additions & 3 deletions examples/collect_links/collect_links.rs
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
use lychee_lib::{Collector, Input, InputSource, Result};
use reqwest::Url;
use std::path::PathBuf;
use std::{collections::HashSet, path::PathBuf};
use tokio_stream::StreamExt;

#[tokio::main]
async fn main() -> Result<()> {
// Collect all links from the following inputs
let inputs = vec![
let inputs = HashSet::from_iter([
Input::from_input_source(InputSource::RemoteUrl(Box::new(
Url::parse("https://github.com/lycheeverse/lychee").unwrap(),
))),
Input::from_input_source(InputSource::FsPath(PathBuf::from("fixtures/TEST.md"))),
];
]);

let links = Collector::default()
.skip_missing_inputs(false) // don't skip missing inputs? (default=false)
Expand Down
4 changes: 2 additions & 2 deletions lychee-bin/src/options.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ use lychee_lib::{
use reqwest::tls;
use secrecy::{ExposeSecret, SecretString};
use serde::{Deserialize, Deserializer};
use std::collections::HashMap;
use std::collections::{HashMap, HashSet};
use std::path::Path;
use std::{fs, path::PathBuf, str::FromStr, time::Duration};
use strum::{Display, EnumIter, EnumString, VariantNames};
Expand Down Expand Up @@ -330,7 +330,7 @@ impl LycheeOptions {
// This depends on the config, which is why a method is required (we could
// accept a `Vec<Input>` in `LycheeOptions` and do the conversion there, but
// we wouldn't get access to `glob_ignore_case`.
pub(crate) fn inputs(&self) -> Result<Vec<Input>> {
pub(crate) fn inputs(&self) -> Result<HashSet<Input>> {
self.raw_inputs
.iter()
.map(|s| Input::new(s, None, self.config.glob_ignore_case))
Expand Down
32 changes: 32 additions & 0 deletions lychee-bin/tests/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1784,6 +1784,38 @@ mod cli {
assert!(all_cookies.iter().all(|c| c.domain() == Some("google.com")));
Ok(())
}

/// Passing the exact same path twice on the command line must yield the
/// input only once in the `--dump-inputs` output.
#[test]
fn test_dump_inputs_does_not_include_duplicates() -> Result<()> {
    let input_path = fixtures_path().join("dump_inputs/markdown.md");

    let mut cmd = main_command();
    cmd.arg("--dump-inputs").arg(&input_path).arg(&input_path);
    cmd.assert()
        .success()
        .stdout(contains("fixtures/dump_inputs/markdown.md").count(1));

    Ok(())
}

/// Two overlapping glob patterns that both match the same file must not
/// produce a duplicate entry in the `--dump-inputs` output.
#[test]
fn test_dump_inputs_glob_does_not_include_duplicates() -> Result<()> {
    let by_stem = fixtures_path().join("**/markdown.*");
    let by_extension = fixtures_path().join("**/*.md");

    let mut cmd = main_command();
    cmd.arg("--dump-inputs").arg(by_stem).arg(by_extension);
    cmd.assert()
        .success()
        .stdout(contains("fixtures/dump_inputs/markdown.md").count(1));

    Ok(())
}

#[test]
fn test_dump_inputs_glob_md() -> Result<()> {
let pattern = fixtures_path().join("**/*.md");
Expand Down
1 change: 1 addition & 0 deletions lychee-lib/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ async-trait = "0.1.88"
cached = "0.55.1"
check-if-email-exists = { version = "0.9.1", optional = true }
cookie_store = "0.21.1"
dashmap = { version = "6.1.0", features = ["serde"] }
email_address = "0.2.9"
futures = "0.3.31"
glob = "0.3.2"
Expand Down
101 changes: 84 additions & 17 deletions lychee-lib/src/collector.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ use crate::{
Base, Input, Request, Result, basic_auth::BasicAuthExtractor, extract::Extractor,
types::FileExtensions, types::uri::raw::RawUri, utils::request,
};
use dashmap::DashSet;
use futures::TryStreamExt;
use futures::{
StreamExt,
Expand All @@ -14,7 +15,9 @@ use futures::{
use http::HeaderMap;
use par_stream::ParStreamExt;
use reqwest::Client;
use std::collections::HashSet;
use std::path::PathBuf;
use std::sync::Arc;

/// Collector keeps the state of link collection
/// It drives the link extraction from inputs
Expand Down Expand Up @@ -154,15 +157,30 @@ impl Collector {

/// Collect all sources from a list of [`Input`]s. For further details,
/// see also [`Input::get_sources`](crate::Input#method.get_sources).
pub fn collect_sources(self, inputs: Vec<Input>) -> impl Stream<Item = Result<String>> {
pub fn collect_sources(self, inputs: HashSet<Input>) -> impl Stream<Item = Result<String>> {
    // Sources already emitted. Shared across the unordered parallel
    // workers, hence `Arc` + the concurrent `DashSet` (no explicit lock).
    let seen = Arc::new(DashSet::new());

    stream::iter(inputs)
        // Expand each input into its sources concurrently; completion
        // order is not guaranteed.
        .par_then_unordered(None, move |input| async move { input.get_sources() })
        .flatten()
        .filter_map({
            move |source: Result<String>| {
                // Clone the handle per item so the async block below can
                // take ownership of its own reference.
                let seen = Arc::clone(&seen);
                async move {
                    // Deduplicate successful sources: `DashSet::insert`
                    // returns `false` when the value was already present,
                    // in which case the item is dropped. Errors are always
                    // passed through unfiltered.
                    if let Ok(s) = &source {
                        if !seen.insert(s.clone()) {
                            return None;
                        }
                    }
                    Some(source)
                }
            }
        })
}

/// Convenience method to fetch all unique links from inputs
/// with the default extensions.
pub fn collect_links(self, inputs: Vec<Input>) -> impl Stream<Item = Result<Request>> {
pub fn collect_links(self, inputs: HashSet<Input>) -> impl Stream<Item = Result<Request>> {
self.collect_links_from_file_types(inputs, crate::types::FileType::default_extensions())
}

Expand All @@ -175,7 +193,7 @@ impl Collector {
/// Will return `Err` if links cannot be extracted from an input
pub fn collect_links_from_file_types(
self,
inputs: Vec<Input>,
inputs: HashSet<Input>,
extensions: FileExtensions,
) -> impl Stream<Item = Result<Request>> {
let skip_missing_inputs = self.skip_missing_inputs;
Expand Down Expand Up @@ -255,7 +273,7 @@ mod tests {

// Helper function to run the collector on the given inputs
async fn collect(
inputs: Vec<Input>,
inputs: HashSet<Input>,
root_dir: Option<PathBuf>,
base: Option<Base>,
) -> Result<HashSet<Uri>> {
Expand All @@ -268,7 +286,7 @@ mod tests {
/// A verbatim link is a link that is not parsed by the HTML parser.
/// For example, a link in a code block or a script tag.
async fn collect_verbatim(
inputs: Vec<Input>,
inputs: HashSet<Input>,
root_dir: Option<PathBuf>,
base: Option<Base>,
extensions: FileExtensions,
Expand Down Expand Up @@ -329,6 +347,40 @@ mod tests {
Ok(())
}

/// Two overlapping `FsGlob` inputs that resolve to the same file must
/// yield that source exactly once from `collect_sources`.
#[tokio::test]
async fn test_collect_sources() -> Result<()> {
    let temp_dir = tempfile::tempdir().unwrap();
    let temp_dir_path = temp_dir.path();

    // The glob patterns below are relative, so the temp dir must become
    // the working directory first.
    // NOTE(review): `set_current_dir` is process-wide; tests that change
    // the cwd can race when run in parallel — confirm this is acceptable.
    std::env::set_current_dir(temp_dir_path)?;

    // Matched by both glob patterns below.
    let markdown_path = temp_dir_path.join("markdown.md");
    File::create(&markdown_path).unwrap();

    // Matched by neither pattern (no extension).
    let readme_path = temp_dir_path.join("README");
    File::create(&readme_path).unwrap();

    let inputs = HashSet::from_iter([
        Input::from_input_source(InputSource::FsGlob {
            pattern: "*.md".to_string(),
            ignore_case: true,
        }),
        Input::from_input_source(InputSource::FsGlob {
            pattern: "markdown.*".to_string(),
            ignore_case: true,
        }),
    ]);

    let collector = Collector::new(Some(temp_dir_path.to_path_buf()), None)?;

    let sources: Vec<_> = collector.collect_sources(inputs).collect().await;

    // The duplicate must have been filtered out, leaving a single source.
    assert_eq!(sources.len(), 1);
    assert_eq!(sources[0], Ok("markdown.md".to_string()));

    Ok(())
}

#[tokio::test]
async fn test_collect_links() -> Result<()> {
let temp_dir = tempfile::tempdir().unwrap();
Expand All @@ -348,7 +400,7 @@ mod tests {

let mock_server = mock_server!(StatusCode::OK, set_body_string(TEST_URL));

let inputs = vec![
let inputs = HashSet::from_iter([
Input::from_input_source(InputSource::String(TEST_STRING.to_owned())),
Input::from_input_source(InputSource::RemoteUrl(Box::new(
Url::parse(&mock_server.uri())
Expand All @@ -360,7 +412,7 @@ mod tests {
pattern: temp_dir_path.join("glob*").to_str().unwrap().to_owned(),
ignore_case: true,
}),
];
]);

let links = collect_verbatim(inputs, None, None, FileType::default_extensions())
.await
Expand All @@ -387,7 +439,9 @@ mod tests {
source: InputSource::String("This is [a test](https://endler.dev). This is a relative link test [Relative Link Test](relative_link)".to_string()),
file_type_hint: Some(FileType::Markdown),
};
let links = collect(vec![input], None, Some(base)).await.ok().unwrap();
let inputs = HashSet::from_iter([input]);

let links = collect(inputs, None, Some(base)).await.ok().unwrap();

let expected_links = HashSet::from_iter([
website("https://endler.dev"),
Expand All @@ -412,7 +466,9 @@ mod tests {
),
file_type_hint: Some(FileType::Html),
};
let links = collect(vec![input], None, Some(base)).await.ok().unwrap();
let inputs = HashSet::from_iter([input]);

let links = collect(inputs, None, Some(base)).await.ok().unwrap();

let expected_links = HashSet::from_iter([
website("https://github.com/lycheeverse/lychee/"),
Expand Down Expand Up @@ -440,7 +496,9 @@ mod tests {
),
file_type_hint: Some(FileType::Html),
};
let links = collect(vec![input], None, Some(base)).await.ok().unwrap();
let inputs = HashSet::from_iter([input]);

let links = collect(inputs, None, Some(base)).await.ok().unwrap();

let expected_links = HashSet::from_iter([
website("https://example.com/static/image.png"),
Expand All @@ -465,8 +523,9 @@ mod tests {
),
file_type_hint: Some(FileType::Markdown),
};
let inputs = HashSet::from_iter([input]);

let links = collect(vec![input], None, Some(base)).await.ok().unwrap();
let links = collect(inputs, None, Some(base)).await.ok().unwrap();

let expected = HashSet::from_iter([
website("https://localhost.com/@/internal.md"),
Expand All @@ -487,7 +546,9 @@ mod tests {
source: InputSource::String(input),
file_type_hint: Some(FileType::Html),
};
let links = collect(vec![input], None, Some(base)).await.ok().unwrap();
let inputs = HashSet::from_iter([input]);

let links = collect(inputs, None, Some(base)).await.ok().unwrap();

let expected_links = HashSet::from_iter([
// the body links wouldn't be present if the file was parsed strictly as XML
Expand Down Expand Up @@ -516,7 +577,9 @@ mod tests {

let input = Input::from_input_source(InputSource::RemoteUrl(Box::new(server_uri.clone())));

let links = collect(vec![input], None, None).await.ok().unwrap();
let inputs = HashSet::from_iter([input]);

let links = collect(inputs, None, None).await.ok().unwrap();

let expected_urls = HashSet::from_iter([
website("https://github.com/lycheeverse/lychee/"),
Expand All @@ -532,7 +595,9 @@ mod tests {
"This is a mailto:user@example.com?subject=Hello link".to_string(),
));

let links = collect(vec![input], None, None).await.ok().unwrap();
let inputs = HashSet::from_iter([input]);

let links = collect(inputs, None, None).await.ok().unwrap();

let expected_links = HashSet::from_iter([mail("user@example.com")]);

Expand All @@ -550,7 +615,7 @@ mod tests {
set_body_string(r#"<a href="relative.html">Link</a>"#)
);

let inputs = vec![
let inputs = HashSet::from_iter([
Input {
source: InputSource::RemoteUrl(Box::new(
Url::parse(&format!(
Expand All @@ -571,7 +636,7 @@ mod tests {
)),
file_type_hint: Some(FileType::Html),
},
];
]);

let links = collect(inputs, None, None).await.ok().unwrap();

Expand Down Expand Up @@ -606,7 +671,9 @@ mod tests {
file_type_hint: Some(FileType::Html),
};

let links = collect(vec![input], None, Some(base)).await.ok().unwrap();
let inputs = HashSet::from_iter([input]);

let links = collect(inputs, None, Some(base)).await.ok().unwrap();

let expected_links = HashSet::from_iter([
path("/path/to/root/index.html"),
Expand Down
2 changes: 1 addition & 1 deletion lychee-lib/src/types/input.rs
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ impl Display for InputSource {
}

/// Lychee Input with optional file hint for parsing
#[derive(Clone, Debug, PartialEq)]
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub struct Input {
/// Origin of input
pub source: InputSource,
Expand Down