From 0098877056322d5c8cc0672bdf41876dc39c2503 Mon Sep 17 00:00:00 2001
From: Andrew Langmeier <raymi306@gmail.com>
Date: Sun, 4 Feb 2024 14:59:30 -0500
Subject: [PATCH] Allow ignoring files when link checking (#2264)

* Allow ignoring files when link checking

* cargo fmt

* Fix tests

* Remove mystery duplicate function..?

* Add in some mysterious missing code..?

* Simple tests for link checker file globs in config

* cargo fmt

* Remove comment

* convert expect to error propagation

* Address comments

* cargo fmt
---
 components/config/src/config/link_checker.rs |  18 ++-
 components/config/src/config/mod.rs          | 110 ++++++++++++-------
 components/site/src/link_checking.rs         |  10 +-
 components/utils/src/globs.rs                |  20 ++++
 components/utils/src/lib.rs                  |   1 +
 5 files changed, 117 insertions(+), 42 deletions(-)
 create mode 100644 components/utils/src/globs.rs
diff --git a/components/config/src/config/link_checker.rs b/components/config/src/config/link_checker.rs
index 8501287981..7668c71b1f 100644
--- a/components/config/src/config/link_checker.rs
+++ b/components/config/src/config/link_checker.rs
@@ -1,5 +1,9 @@
+use libs::globset::GlobSet;
 use serde::{Deserialize, Serialize};
 
+use errors::Result;
+use utils::globs::build_ignore_glob_set;
+
 #[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
 pub enum LinkCheckerLevel {
     #[serde(rename = "error")]
@@ -14,7 +18,7 @@ impl Default for LinkCheckerLevel {
     }
 }
 
-#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
+#[derive(Clone, Debug, Default, Serialize, Deserialize)]
 #[serde(default)]
 pub struct LinkChecker {
     /// Skip link checking for these URL prefixes
@@ -25,4 +29,16 @@ pub struct LinkChecker {
     pub internal_level: LinkCheckerLevel,
     /// Emit either "error" or "warn" for broken external links (including anchor links).
     pub external_level: LinkCheckerLevel,
+    /// A list of file glob patterns to skip link checking on
+    pub ignored_files: Vec<String>,
+    #[serde(skip_serializing, skip_deserializing)] // not a typo, 2 are needed
+    pub ignored_files_globset: Option<GlobSet>,
+}
+
+impl LinkChecker {
+    /// Compiles `ignored_files` into `ignored_files_globset`, failing on an invalid glob.
+    pub fn resolve_globset(&mut self) -> Result<()> {
+        self.ignored_files_globset = Some(build_ignore_glob_set(&self.ignored_files, "files")?);
+        Ok(())
+    }
 }
diff --git a/components/config/src/config/mod.rs b/components/config/src/config/mod.rs
index a79542e85e..f2a40675ae 100644
--- a/components/config/src/config/mod.rs
+++ b/components/config/src/config/mod.rs
@@ -8,13 +8,14 @@ pub mod taxonomies;
 use std::collections::HashMap;
 use std::path::{Path, PathBuf};
 
-use libs::globset::{Glob, GlobSet, GlobSetBuilder};
+use libs::globset::GlobSet;
 use libs::toml::Value as Toml;
 use serde::{Deserialize, Serialize};
 
 use crate::theme::Theme;
 use errors::{anyhow, bail, Result};
 use utils::fs::read_file;
+use utils::globs::build_ignore_glob_set;
 use utils::slugs::slugify_paths;
 
 // We want a default base url for tests
@@ -28,18 +29,6 @@ pub enum Mode {
     Check,
 }
 
-fn build_ignore_glob_set(ignore: &Vec<String>, name: &str) -> Result<GlobSet> {
-    let mut glob_set_builder = GlobSetBuilder::new();
-    for pat in ignore {
-        let glob = match Glob::new(pat) {
-            Ok(g) => g,
-            Err(e) => bail!("Invalid ignored_{} glob pattern: {}, error = {}", name, pat, e),
-        };
-        glob_set_builder.add(glob);
-    }
-    Ok(glob_set_builder.build().unwrap_or_else(|_| panic!("Bad ignored_{} in config file.", name)))
-}
-
 #[derive(Clone, Debug, Deserialize)]
 #[serde(default)]
 pub struct Config {
@@ -150,21 +139,13 @@ impl Config {
 
         config.add_default_language()?;
         config.slugify_taxonomies();
+        config.link_checker.resolve_globset()?;
 
-        if !config.ignored_content.is_empty() {
-            // Convert the file glob strings into a compiled glob set matcher. We want to do this once,
-            // at program initialization, rather than for every page, for example. We arrange for the
-            // globset matcher to always exist (even though it has to be an inside an Option at the
-            // moment because of the TOML serializer); if the glob set is empty the `is_match` function
-            // of the globber always returns false.
-            let glob_set = build_ignore_glob_set(&config.ignored_content, "content")?;
-            config.ignored_content_globset = Some(glob_set);
-        }
+        let content_glob_set = build_ignore_glob_set(&config.ignored_content, "content")?;
+        config.ignored_content_globset = Some(content_glob_set);
 
-        if !config.ignored_static.is_empty() {
-            let glob_set = build_ignore_glob_set(&config.ignored_static, "static")?;
-            config.ignored_static_globset = Some(glob_set);
-        }
+        let static_glob_set = build_ignore_glob_set(&config.ignored_static, "static")?;
+        config.ignored_static_globset = Some(static_glob_set);
 
         Ok(config)
     }
@@ -652,45 +633,41 @@ title = "A title"
     }
 
     #[test]
-    fn missing_ignored_content_results_in_empty_vector_and_empty_globset() {
+    fn missing_ignored_content_results_in_empty_vector() {
         let config_str = r#"
 title = "My site"
 base_url = "example.com"
         "#;
 
         let config = Config::parse(config_str).unwrap();
-        let v = config.ignored_content;
-        assert_eq!(v.len(), 0);
-        assert!(config.ignored_content_globset.is_none());
+        assert_eq!(config.ignored_content.len(), 0);
     }
 
     #[test]
-    fn missing_ignored_static_results_in_empty_vector_and_empty_globset() {
+    fn empty_ignored_content_results_in_empty_vector() {
         let config_str = r#"
 title = "My site"
 base_url = "example.com"
+ignored_content = []
         "#;
+
         let config = Config::parse(config_str).unwrap();
-        let v = config.ignored_static;
-        assert_eq!(v.len(), 0);
-        assert!(config.ignored_static_globset.is_none());
+        assert_eq!(config.ignored_content.len(), 0);
     }
 
     #[test]
-    fn empty_ignored_content_results_in_empty_vector_and_empty_globset() {
+    fn missing_ignored_static_results_in_empty_vector() {
         let config_str = r#"
 title = "My site"
 base_url = "example.com"
-ignored_content = []
         "#;
 
         let config = Config::parse(config_str).unwrap();
-        assert_eq!(config.ignored_content.len(), 0);
-        assert!(config.ignored_content_globset.is_none());
+        assert_eq!(config.ignored_static.len(), 0);
     }
 
     #[test]
-    fn empty_ignored_static_results_in_empty_vector_and_empty_globset() {
+    fn empty_ignored_static_results_in_empty_vector() {
         let config_str = r#"
 title = "My site"
 base_url = "example.com"
@@ -699,7 +676,30 @@ ignored_static = []
 
         let config = Config::parse(config_str).unwrap();
         assert_eq!(config.ignored_static.len(), 0);
-        assert!(config.ignored_static_globset.is_none());
+    }
+
+    #[test]
+    fn missing_link_checker_ignored_files_results_in_empty_vector() {
+        let config_str = r#"
+title = "My site"
+base_url = "example.com"
+        "#;
+        // No `[link_checker]` table at all: `ignored_files` is expected to come back empty.
+        let config = Config::parse(config_str).unwrap();
+        assert_eq!(config.link_checker.ignored_files.len(), 0);
+    }
+
+    #[test]
+    fn empty_link_checker_ignored_files_results_in_empty_vector() {
+        let config_str = r#"
+title = "My site"
+base_url = "example.com"
+[link_checker]
+ignored_files = []
+        "#;
+        // An explicit empty list under `[link_checker]` parses and stays empty.
+        let config = Config::parse(config_str).unwrap();
+        assert_eq!(config.link_checker.ignored_files.len(), 0);
     }
 
     #[test]
@@ -760,6 +760,36 @@ ignored_static = ["*.{graphml,iso}", "*.py?", "**/{target,temp_folder}"]
         assert!(g.is_match("content/poetry/zen.py2"));
     }
 
+    #[test]
+    fn non_empty_link_checker_ignored_files_results_in_vector_of_patterns_and_configured_globset() {
+        let config_str = r#"
+title = "My site"
+base_url = "example.com"
+[link_checker]
+ignored_files = ["*.{graphml,iso}", "*.py?", "**/{target,temp_folder}"]
+        "#;
+
+        let config = Config::parse(config_str).unwrap();
+        let v = config.link_checker.ignored_files;
+        assert_eq!(v, vec!["*.{graphml,iso}", "*.py?", "**/{target,temp_folder}"]);
+        // Parsing must also have compiled those patterns into the globset used for matching.
+        let g = config.link_checker.ignored_files_globset.unwrap();
+        assert_eq!(g.len(), 3);
+        assert!(g.is_match("foo.graphml"));
+        assert!(g.is_match("foo/bar/foo.graphml"));
+        assert!(g.is_match("foo.iso"));
+        assert!(!g.is_match("foo.png"));
+        assert!(g.is_match("foo.py2"));
+        assert!(g.is_match("foo.py3"));
+        assert!(!g.is_match("foo.py"));
+        assert!(g.is_match("foo/bar/target"));
+        assert!(g.is_match("foo/bar/baz/temp_folder"));
+        assert!(g.is_match("foo/bar/baz/temp_folder/target"));
+        assert!(g.is_match("temp_folder"));
+        assert!(g.is_match("my/isos/foo.iso"));
+        assert!(g.is_match("content/poetry/zen.py2"));
+    }
+
     #[test]
     fn link_checker_skip_anchor_prefixes() {
         let config_str = r#"
diff --git a/components/site/src/link_checking.rs b/components/site/src/link_checking.rs
index b5c1ed2186..1bb1a66941 100644
--- a/components/site/src/link_checking.rs
+++ b/components/site/src/link_checking.rs
@@ -3,6 +3,7 @@ use std::path::{Path, PathBuf};
 use std::{cmp, collections::HashMap, collections::HashSet, iter::FromIterator, thread};
 
 use config::LinkCheckerLevel;
+use libs::globset::GlobSet;
 use libs::rayon::prelude::*;
 
 use crate::Site;
@@ -105,6 +106,10 @@ fn should_skip_by_prefix(link: &str, skip_prefixes: &[String]) -> bool {
     skip_prefixes.iter().any(|prefix| link.starts_with(prefix))
 }
 
+fn should_skip_by_file(file_path: &Path, glob_set: &GlobSet) -> bool {
+    glob_set.is_match(file_path) // skip when the ignored-files glob set matches this path
+}
+
 fn get_link_domain(link: &str) -> Result<String> {
     return match Url::parse(link) {
         Ok(url) => match url.host_str().map(String::from) {
@@ -150,9 +155,12 @@ pub fn check_external_links(site: &Site) -> Vec<String> {
     let mut invalid_url_links: u32 = 0;
     // First we look at all the external links, skip those the user wants to skip and record
     // the ones that have invalid URLs
+    let ignored_files_globset = site.config.link_checker.ignored_files_globset.as_ref().unwrap();
     for (file_path, links) in external_links {
         for link in links {
-            if should_skip_by_prefix(link, &site.config.link_checker.skip_prefixes) {
+            if should_skip_by_prefix(link, &site.config.link_checker.skip_prefixes)
+                || should_skip_by_file(file_path, ignored_files_globset)
+            {
                 skipped_link_count += 1;
             } else {
                 match get_link_domain(link) {
diff --git a/components/utils/src/globs.rs b/components/utils/src/globs.rs
new file mode 100644
index 0000000000..2377964b17
--- /dev/null
+++ b/components/utils/src/globs.rs
@@ -0,0 +1,20 @@
+use libs::globset::{Glob, GlobSet, GlobSetBuilder};
+
+use errors::{bail, Result};
+
+/// Compiles the `ignored_*` glob patterns into a single glob set matcher. We want to do
+/// this once, at program initialization, rather than for every page, for example. The
+/// matcher always exists, even for an empty list (callers keep it inside an `Option`
+/// because of the TOML serializer); an empty glob set's `is_match` always returns false.
+/// Returns an error naming `ignored_{name}` if a pattern is not a valid glob.
+pub fn build_ignore_glob_set(ignore: &[String], name: &str) -> Result<GlobSet> {
+    let mut glob_set_builder = GlobSetBuilder::new();
+    for pat in ignore {
+        let glob = match Glob::new(pat) {
+            Ok(g) => g,
+            Err(e) => bail!("Invalid ignored_{} glob pattern: {}, error = {}", name, pat, e),
+        };
+        glob_set_builder.add(glob);
+    }
+    Ok(glob_set_builder.build()?)
+}
diff --git a/components/utils/src/lib.rs b/components/utils/src/lib.rs
index 3b2ab6022d..619636b8f7 100644
--- a/components/utils/src/lib.rs
+++ b/components/utils/src/lib.rs
@@ -1,6 +1,7 @@
 pub mod anchors;
 pub mod de;
 pub mod fs;
+pub mod globs;
 pub mod net;
 pub mod site;
 pub mod slugs;