From 0098877056322d5c8cc0672bdf41876dc39c2503 Mon Sep 17 00:00:00 2001 From: Andrew Langmeier Date: Sun, 4 Feb 2024 14:59:30 -0500 Subject: [PATCH] Allow ignoring files when link checking (#2264) * Allow ignoring files when link checking * cargo fmt * Fix tests * Remove mystery duplicate function..? * Add in some mysterious missing code..? * Simple tests for link checker file globs in config * cargo fmt * Remove comment * convert expect to error propagation * Address comments * cargo fmt --- components/config/src/config/link_checker.rs | 18 ++- components/config/src/config/mod.rs | 110 ++++++++++++------- components/site/src/link_checking.rs | 10 +- components/utils/src/globs.rs | 20 ++++ components/utils/src/lib.rs | 1 + 5 files changed, 117 insertions(+), 42 deletions(-) create mode 100644 components/utils/src/globs.rs diff --git a/components/config/src/config/link_checker.rs b/components/config/src/config/link_checker.rs index 8501287981..7668c71b1f 100644 --- a/components/config/src/config/link_checker.rs +++ b/components/config/src/config/link_checker.rs @@ -1,5 +1,9 @@ +use libs::globset::GlobSet; use serde::{Deserialize, Serialize}; +use errors::Result; +use utils::globs::build_ignore_glob_set; + #[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)] pub enum LinkCheckerLevel { #[serde(rename = "error")] @@ -14,7 +18,7 @@ impl Default for LinkCheckerLevel { } } -#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Clone, Debug, Default, Serialize, Deserialize)] #[serde(default)] pub struct LinkChecker { /// Skip link checking for these URL prefixes @@ -25,4 +29,16 @@ pub struct LinkChecker { pub internal_level: LinkCheckerLevel, /// Emit either "error" or "warn" for broken external links (including anchor links). 
pub external_level: LinkCheckerLevel, + /// A list of file glob patterns to skip link checking on + pub ignored_files: Vec<String>, + #[serde(skip_serializing, skip_deserializing)] // not a typo, 2 are needed + pub ignored_files_globset: Option<GlobSet>, +} + +impl LinkChecker { + pub fn resolve_globset(&mut self) -> Result<()> { + let glob_set = build_ignore_glob_set(&self.ignored_files, "files")?; + self.ignored_files_globset = Some(glob_set); + Ok(()) + } } diff --git a/components/config/src/config/mod.rs b/components/config/src/config/mod.rs index a79542e85e..f2a40675ae 100644 --- a/components/config/src/config/mod.rs +++ b/components/config/src/config/mod.rs @@ -8,13 +8,14 @@ pub mod taxonomies; use std::collections::HashMap; use std::path::{Path, PathBuf}; -use libs::globset::{Glob, GlobSet, GlobSetBuilder}; +use libs::globset::GlobSet; use libs::toml::Value as Toml; use serde::{Deserialize, Serialize}; use crate::theme::Theme; use errors::{anyhow, bail, Result}; use utils::fs::read_file; +use utils::globs::build_ignore_glob_set; use utils::slugs::slugify_paths; // We want a default base url for tests @@ -28,18 +29,6 @@ pub enum Mode { Check, } -fn build_ignore_glob_set(ignore: &Vec<String>, name: &str) -> Result<GlobSet> { - let mut glob_set_builder = GlobSetBuilder::new(); - for pat in ignore { - let glob = match Glob::new(pat) { - Ok(g) => g, - Err(e) => bail!("Invalid ignored_{} glob pattern: {}, error = {}", name, pat, e), - }; - glob_set_builder.add(glob); - } - Ok(glob_set_builder.build().unwrap_or_else(|_| panic!("Bad ignored_{} in config file.", name))) -} - #[derive(Clone, Debug, Deserialize)] #[serde(default)] pub struct Config { @@ -150,21 +139,13 @@ impl Config { config.add_default_language()?; config.slugify_taxonomies(); + config.link_checker.resolve_globset()?; - if !config.ignored_content.is_empty() { - // Convert the file glob strings into a compiled glob set matcher. We want to do this once, - // at program initialization, rather than for every page, for example. 
We arrange for the - // globset matcher to always exist (even though it has to be an inside an Option at the - // moment because of the TOML serializer); if the glob set is empty the `is_match` function - // of the globber always returns false. - let glob_set = build_ignore_glob_set(&config.ignored_content, "content")?; - config.ignored_content_globset = Some(glob_set); - } + let content_glob_set = build_ignore_glob_set(&config.ignored_content, "content")?; + config.ignored_content_globset = Some(content_glob_set); - if !config.ignored_static.is_empty() { - let glob_set = build_ignore_glob_set(&config.ignored_static, "static")?; - config.ignored_static_globset = Some(glob_set); - } + let static_glob_set = build_ignore_glob_set(&config.ignored_static, "static")?; + config.ignored_static_globset = Some(static_glob_set); Ok(config) } @@ -652,45 +633,41 @@ title = "A title" } #[test] - fn missing_ignored_content_results_in_empty_vector_and_empty_globset() { + fn missing_ignored_content_results_in_empty_vector() { let config_str = r#" title = "My site" base_url = "example.com" "#; let config = Config::parse(config_str).unwrap(); - let v = config.ignored_content; - assert_eq!(v.len(), 0); - assert!(config.ignored_content_globset.is_none()); + assert_eq!(config.ignored_content.len(), 0); } #[test] - fn missing_ignored_static_results_in_empty_vector_and_empty_globset() { + fn empty_ignored_content_results_in_empty_vector() { let config_str = r#" title = "My site" base_url = "example.com" +ignored_content = [] "#; + let config = Config::parse(config_str).unwrap(); - let v = config.ignored_static; - assert_eq!(v.len(), 0); - assert!(config.ignored_static_globset.is_none()); + assert_eq!(config.ignored_content.len(), 0); } #[test] - fn empty_ignored_content_results_in_empty_vector_and_empty_globset() { + fn missing_ignored_static_results_in_empty_vector() { let config_str = r#" title = "My site" base_url = "example.com" -ignored_content = [] "#; let config = 
Config::parse(config_str).unwrap(); - assert_eq!(config.ignored_content.len(), 0); - assert!(config.ignored_content_globset.is_none()); + assert_eq!(config.ignored_static.len(), 0); } #[test] - fn empty_ignored_static_results_in_empty_vector_and_empty_globset() { + fn empty_ignored_static_results_in_empty_vector() { let config_str = r#" title = "My site" base_url = "example.com" @@ -699,7 +676,30 @@ ignored_static = [] let config = Config::parse(config_str).unwrap(); assert_eq!(config.ignored_static.len(), 0); - assert!(config.ignored_static_globset.is_none()); + } + + #[test] + fn missing_link_checker_ignored_files_results_in_empty_vector() { + let config_str = r#" +title = "My site" +base_url = "example.com" + "#; + + let config = Config::parse(config_str).unwrap(); + assert_eq!(config.link_checker.ignored_files.len(), 0); + } + + #[test] + fn empty_link_checker_ignored_files_results_in_empty_vector() { + let config_str = r#" +title = "My site" +base_url = "example.com" +[link_checker] +ignored_files = [] + "#; + + let config = Config::parse(config_str).unwrap(); + assert_eq!(config.link_checker.ignored_files.len(), 0); } #[test] @@ -760,6 +760,36 @@ ignored_static = ["*.{graphml,iso}", "*.py?", "**/{target,temp_folder}"] assert!(g.is_match("content/poetry/zen.py2")); } + #[test] + fn non_empty_link_checker_ignored_pages_results_in_vector_of_patterns_and_configured_globset() { + let config_str = r#" +title = "My site" +base_url = "example.com" +[link_checker] +ignored_files = ["*.{graphml,iso}", "*.py?", "**/{target,temp_folder}"] + "#; + + let config = Config::parse(config_str).unwrap(); + let v = config.link_checker.ignored_files; + assert_eq!(v, vec!["*.{graphml,iso}", "*.py?", "**/{target,temp_folder}"]); + + let g = config.link_checker.ignored_files_globset.unwrap(); + assert_eq!(g.len(), 3); + assert!(g.is_match("foo.graphml")); + assert!(g.is_match("foo/bar/foo.graphml")); + assert!(g.is_match("foo.iso")); + assert!(!g.is_match("foo.png")); + 
assert!(g.is_match("foo.py2")); + assert!(g.is_match("foo.py3")); + assert!(!g.is_match("foo.py")); + assert!(g.is_match("foo/bar/target")); + assert!(g.is_match("foo/bar/baz/temp_folder")); + assert!(g.is_match("foo/bar/baz/temp_folder/target")); + assert!(g.is_match("temp_folder")); + assert!(g.is_match("my/isos/foo.iso")); + assert!(g.is_match("content/poetry/zen.py2")); + } + #[test] fn link_checker_skip_anchor_prefixes() { let config_str = r#" diff --git a/components/site/src/link_checking.rs b/components/site/src/link_checking.rs index b5c1ed2186..1bb1a66941 100644 --- a/components/site/src/link_checking.rs +++ b/components/site/src/link_checking.rs @@ -3,6 +3,7 @@ use std::path::{Path, PathBuf}; use std::{cmp, collections::HashMap, collections::HashSet, iter::FromIterator, thread}; use config::LinkCheckerLevel; +use libs::globset::GlobSet; use libs::rayon::prelude::*; use crate::Site; @@ -105,6 +106,10 @@ fn should_skip_by_prefix(link: &str, skip_prefixes: &[String]) -> bool { skip_prefixes.iter().any(|prefix| link.starts_with(prefix)) } +fn should_skip_by_file(file_path: &Path, glob_set: &GlobSet) -> bool { + glob_set.is_match(file_path) +} + fn get_link_domain(link: &str) -> Result { return match Url::parse(link) { Ok(url) => match url.host_str().map(String::from) { @@ -150,9 +155,12 @@ pub fn check_external_links(site: &Site) -> Vec { let mut invalid_url_links: u32 = 0; // First we look at all the external links, skip those the user wants to skip and record // the ones that have invalid URLs + let ignored_files_globset = site.config.link_checker.ignored_files_globset.as_ref().unwrap(); for (file_path, links) in external_links { for link in links { - if should_skip_by_prefix(link, &site.config.link_checker.skip_prefixes) { + if should_skip_by_prefix(link, &site.config.link_checker.skip_prefixes) + || should_skip_by_file(file_path, ignored_files_globset) + { skipped_link_count += 1; } else { match get_link_domain(link) { diff --git 
a/components/utils/src/globs.rs b/components/utils/src/globs.rs new file mode 100644 index 0000000000..2377964b17 --- /dev/null +++ b/components/utils/src/globs.rs @@ -0,0 +1,20 @@ +use libs::globset::{Glob, GlobSet, GlobSetBuilder}; + +use errors::{bail, Result}; + +pub fn build_ignore_glob_set(ignore: &Vec<String>, name: &str) -> Result<GlobSet> { + // Convert the file glob strings into a compiled glob set matcher. We want to do this once, + // at program initialization, rather than for every page, for example. We arrange for the + // globset matcher to always exist (even though it has to be inside an Option at the + // moment because of the TOML serializer); if the glob set is empty the `is_match` function + // of the globber always returns false. + let mut glob_set_builder = GlobSetBuilder::new(); + for pat in ignore { + let glob = match Glob::new(pat) { + Ok(g) => g, + Err(e) => bail!("Invalid ignored_{} glob pattern: {}, error = {}", name, pat, e), + }; + glob_set_builder.add(glob); + } + Ok(glob_set_builder.build()?) +} diff --git a/components/utils/src/lib.rs b/components/utils/src/lib.rs index 3b2ab6022d..619636b8f7 100644 --- a/components/utils/src/lib.rs +++ b/components/utils/src/lib.rs @@ -1,6 +1,7 @@ pub mod anchors; pub mod de; pub mod fs; +pub mod globs; pub mod net; pub mod site; pub mod slugs;