Skip to content

Commit

Permalink
Allow ignoring files when link checking (#2264)
Browse files Browse the repository at this point in the history
* Allow ignoring files when link checking

* cargo fmt

* Fix tests

* Remove mystery duplicate function..?

* Add in some mysterious missing code..?

* Simple tests for link checker file globs in config

* cargo fmt

* Remove comment

* convert expect to error propagation

* Address comments

* cargo fmt
  • Loading branch information
Raymi306 authored Feb 4, 2024
1 parent 0122845 commit 0098877
Show file tree
Hide file tree
Showing 5 changed files with 117 additions and 42 deletions.
18 changes: 17 additions & 1 deletion components/config/src/config/link_checker.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
use libs::globset::GlobSet;
use serde::{Deserialize, Serialize};

use errors::Result;
use utils::globs::build_ignore_glob_set;

#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
pub enum LinkCheckerLevel {
#[serde(rename = "error")]
Expand All @@ -14,7 +18,7 @@ impl Default for LinkCheckerLevel {
}
}

#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)]
#[derive(Clone, Debug, Default, Serialize, Deserialize)]
#[serde(default)]
pub struct LinkChecker {
/// Skip link checking for these URL prefixes
Expand All @@ -25,4 +29,16 @@ pub struct LinkChecker {
pub internal_level: LinkCheckerLevel,
/// Emit either "error" or "warn" for broken external links (including anchor links).
pub external_level: LinkCheckerLevel,
/// A list of file glob patterns to skip link checking on
pub ignored_files: Vec<String>,
#[serde(skip_serializing, skip_deserializing)] // not a typo, 2 are needed
pub ignored_files_globset: Option<GlobSet>,
}

impl LinkChecker {
    /// Compiles the `ignored_files` patterns and stores the resulting matcher
    /// in `ignored_files_globset`.
    ///
    /// # Errors
    ///
    /// Propagates whatever error `build_ignore_glob_set` returns; in that case
    /// `ignored_files_globset` is left unchanged.
    pub fn resolve_globset(&mut self) -> Result<()> {
        self.ignored_files_globset =
            Some(build_ignore_glob_set(&self.ignored_files, "files")?);
        Ok(())
    }
}
110 changes: 70 additions & 40 deletions components/config/src/config/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,14 @@ pub mod taxonomies;
use std::collections::HashMap;
use std::path::{Path, PathBuf};

use libs::globset::{Glob, GlobSet, GlobSetBuilder};
use libs::globset::GlobSet;
use libs::toml::Value as Toml;
use serde::{Deserialize, Serialize};

use crate::theme::Theme;
use errors::{anyhow, bail, Result};
use utils::fs::read_file;
use utils::globs::build_ignore_glob_set;
use utils::slugs::slugify_paths;

// We want a default base url for tests
Expand All @@ -28,18 +29,6 @@ pub enum Mode {
Check,
}

/// Compiles every pattern in `ignore` into a single `GlobSet`.
///
/// `name` is only interpolated into the messages below as `ignored_{name}`.
/// A pattern rejected by `Glob::new` is reported through `bail!`, whereas a
/// failure while building the final set panics instead of returning an error.
fn build_ignore_glob_set(ignore: &Vec<String>, name: &str) -> Result<GlobSet> {
let mut glob_set_builder = GlobSetBuilder::new();
for pat in ignore {
let glob = match Glob::new(pat) {
Ok(g) => g,
Err(e) => bail!("Invalid ignored_{} glob pattern: {}, error = {}", name, pat, e),
};
glob_set_builder.add(glob);
}
Ok(glob_set_builder.build().unwrap_or_else(|_| panic!("Bad ignored_{} in config file.", name)))
}

#[derive(Clone, Debug, Deserialize)]
#[serde(default)]
pub struct Config {
Expand Down Expand Up @@ -150,21 +139,13 @@ impl Config {

config.add_default_language()?;
config.slugify_taxonomies();
config.link_checker.resolve_globset()?;

if !config.ignored_content.is_empty() {
// Convert the file glob strings into a compiled glob set matcher. We want to do this once,
// at program initialization, rather than for every page, for example. We arrange for the
// globset matcher to always exist (even though it has to be an inside an Option at the
// moment because of the TOML serializer); if the glob set is empty the `is_match` function
// of the globber always returns false.
let glob_set = build_ignore_glob_set(&config.ignored_content, "content")?;
config.ignored_content_globset = Some(glob_set);
}
let content_glob_set = build_ignore_glob_set(&config.ignored_content, "content")?;
config.ignored_content_globset = Some(content_glob_set);

if !config.ignored_static.is_empty() {
let glob_set = build_ignore_glob_set(&config.ignored_static, "static")?;
config.ignored_static_globset = Some(glob_set);
}
let static_glob_set = build_ignore_glob_set(&config.ignored_static, "static")?;
config.ignored_static_globset = Some(static_glob_set);

Ok(config)
}
Expand Down Expand Up @@ -652,45 +633,41 @@ title = "A title"
}

#[test]
fn missing_ignored_content_results_in_empty_vector_and_empty_globset() {
fn missing_ignored_content_results_in_empty_vector() {
let config_str = r#"
title = "My site"
base_url = "example.com"
"#;

let config = Config::parse(config_str).unwrap();
let v = config.ignored_content;
assert_eq!(v.len(), 0);
assert!(config.ignored_content_globset.is_none());
assert_eq!(config.ignored_content.len(), 0);
}

#[test]
fn missing_ignored_static_results_in_empty_vector_and_empty_globset() {
fn empty_ignored_content_results_in_empty_vector() {
let config_str = r#"
title = "My site"
base_url = "example.com"
ignored_content = []
"#;

let config = Config::parse(config_str).unwrap();
let v = config.ignored_static;
assert_eq!(v.len(), 0);
assert!(config.ignored_static_globset.is_none());
assert_eq!(config.ignored_content.len(), 0);
}

#[test]
fn empty_ignored_content_results_in_empty_vector_and_empty_globset() {
fn missing_ignored_static_results_in_empty_vector() {
let config_str = r#"
title = "My site"
base_url = "example.com"
ignored_content = []
"#;

let config = Config::parse(config_str).unwrap();
assert_eq!(config.ignored_content.len(), 0);
assert!(config.ignored_content_globset.is_none());
assert_eq!(config.ignored_static.len(), 0);
}

#[test]
fn empty_ignored_static_results_in_empty_vector_and_empty_globset() {
fn empty_ignored_static_results_in_empty_vector() {
let config_str = r#"
title = "My site"
base_url = "example.com"
Expand All @@ -699,7 +676,30 @@ ignored_static = []

let config = Config::parse(config_str).unwrap();
assert_eq!(config.ignored_static.len(), 0);
assert!(config.ignored_static_globset.is_none());
}

#[test]
fn missing_link_checker_ignored_files_results_in_empty_vector() {
    // No `[link_checker]` table is present in this config.
    let config_str = r#"
title = "My site"
base_url = "example.com"
"#;

    let parsed = Config::parse(config_str).unwrap();
    assert!(parsed.link_checker.ignored_files.is_empty());
}

#[test]
fn empty_link_checker_ignored_files_results_in_empty_vector() {
    // `ignored_files` is given explicitly as an empty array.
    let config_str = r#"
title = "My site"
base_url = "example.com"
[link_checker]
ignored_files = []
"#;

    let parsed = Config::parse(config_str).unwrap();
    assert!(parsed.link_checker.ignored_files.is_empty());
}

#[test]
Expand Down Expand Up @@ -760,6 +760,36 @@ ignored_static = ["*.{graphml,iso}", "*.py?", "**/{target,temp_folder}"]
assert!(g.is_match("content/poetry/zen.py2"));
}

#[test]
fn non_empty_link_checker_ignored_pages_results_in_vector_of_patterns_and_configured_globset() {
    let config_str = r#"
title = "My site"
base_url = "example.com"
[link_checker]
ignored_files = ["*.{graphml,iso}", "*.py?", "**/{target,temp_folder}"]
"#;

    let parsed = Config::parse(config_str).unwrap();

    // The raw patterns are kept as written in the config.
    assert_eq!(
        parsed.link_checker.ignored_files,
        vec!["*.{graphml,iso}", "*.py?", "**/{target,temp_folder}"]
    );

    // The compiled matcher holds one glob per pattern.
    let globset = parsed.link_checker.ignored_files_globset.unwrap();
    assert_eq!(globset.len(), 3);

    // Paths that at least one of the patterns must match.
    for path in [
        "foo.graphml",
        "foo/bar/foo.graphml",
        "foo.iso",
        "foo.py2",
        "foo.py3",
        "foo/bar/target",
        "foo/bar/baz/temp_folder",
        "foo/bar/baz/temp_folder/target",
        "temp_folder",
        "my/isos/foo.iso",
        "content/poetry/zen.py2",
    ] {
        assert!(globset.is_match(path), "expected `{}` to match", path);
    }

    // Paths that none of the patterns may match.
    for path in ["foo.png", "foo.py"] {
        assert!(!globset.is_match(path), "expected `{}` not to match", path);
    }
}

#[test]
fn link_checker_skip_anchor_prefixes() {
let config_str = r#"
Expand Down
10 changes: 9 additions & 1 deletion components/site/src/link_checking.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ use std::path::{Path, PathBuf};
use std::{cmp, collections::HashMap, collections::HashSet, iter::FromIterator, thread};

use config::LinkCheckerLevel;
use libs::globset::GlobSet;
use libs::rayon::prelude::*;

use crate::Site;
Expand Down Expand Up @@ -105,6 +106,10 @@ fn should_skip_by_prefix(link: &str, skip_prefixes: &[String]) -> bool {
skip_prefixes.iter().any(|prefix| link.starts_with(prefix))
}

/// Reports whether `glob_set` matches `file_path`; a `true` result means the
/// links found in that file should be skipped.
fn should_skip_by_file(file_path: &Path, glob_set: &GlobSet) -> bool {
glob_set.is_match(file_path)
}

fn get_link_domain(link: &str) -> Result<String> {
return match Url::parse(link) {
Ok(url) => match url.host_str().map(String::from) {
Expand Down Expand Up @@ -150,9 +155,12 @@ pub fn check_external_links(site: &Site) -> Vec<String> {
let mut invalid_url_links: u32 = 0;
// First we look at all the external links, skip those the user wants to skip and record
// the ones that have invalid URLs
let ignored_files_globset = site.config.link_checker.ignored_files_globset.as_ref().unwrap();
for (file_path, links) in external_links {
for link in links {
if should_skip_by_prefix(link, &site.config.link_checker.skip_prefixes) {
if should_skip_by_prefix(link, &site.config.link_checker.skip_prefixes)
|| should_skip_by_file(file_path, ignored_files_globset)
{
skipped_link_count += 1;
} else {
match get_link_domain(link) {
Expand Down
20 changes: 20 additions & 0 deletions components/utils/src/globs.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
use libs::globset::{Glob, GlobSet, GlobSetBuilder};

use errors::{bail, Result};

/// Converts a list of file glob strings into a compiled [`GlobSet`] matcher.
///
/// We want to do this once, at program initialization, rather than for every
/// page, for example. Callers arrange for the globset matcher to always exist
/// (even though it has to be inside an `Option` at the moment because of the
/// TOML serializer); if the glob set is empty the `is_match` function of the
/// globber always returns false.
///
/// `ignore` is taken as a slice, so both `&Vec<String>` and `&[String]`
/// arguments are accepted. `name` is only used to label the error message as
/// `ignored_{name}`.
///
/// # Errors
///
/// Returns an error if any pattern is not a valid glob, or if the combined
/// set cannot be built.
pub fn build_ignore_glob_set(ignore: &[String], name: &str) -> Result<GlobSet> {
    let mut glob_set_builder = GlobSetBuilder::new();
    for pat in ignore {
        let glob = match Glob::new(pat) {
            Ok(g) => g,
            Err(e) => bail!("Invalid ignored_{} glob pattern: {}, error = {}", name, pat, e),
        };
        glob_set_builder.add(glob);
    }
    Ok(glob_set_builder.build()?)
}
1 change: 1 addition & 0 deletions components/utils/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
pub mod anchors;
pub mod de;
pub mod fs;
pub mod globs;
pub mod net;
pub mod site;
pub mod slugs;
Expand Down

0 comments on commit 0098877

Please sign in to comment.