From 1a3b78312082d0a6054c9a09370937b085764081 Mon Sep 17 00:00:00 2001 From: Vincent Prouillet Date: Wed, 11 May 2022 22:31:54 +0200 Subject: [PATCH] DRY up the external link checking a bit --- CHANGELOG.md | 1 + components/site/src/lib.rs | 8 +- components/site/src/link_checking.rs | 150 +++++++++++++-------------- 3 files changed, 78 insertions(+), 81 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 86f43276a7..9cd86a0cae 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,7 @@ any pages related to that taxonomy - Use Zola Tera instance for markdown filter: this means you have access to the same Tera functions as in shortcodes - Ignore sections with `render=false` when looking for path collisions - Add support for backlinks +- Add a warning mode for internal/external link checking in case you don't want zola to stop the build on invalid links ## 0.15.3 (2022-01-23) diff --git a/components/site/src/lib.rs b/components/site/src/lib.rs index 6615bfceb7..03072e2d5d 100644 --- a/components/site/src/lib.rs +++ b/components/site/src/lib.rs @@ -298,8 +298,8 @@ impl Site { let internal_link_messages = link_checking::check_internal_links_with_anchors(self); // log any broken internal links and error out if needed - if let Err(messages) = internal_link_messages { - let messages: Vec = messages + if !internal_link_messages.is_empty() { + let messages: Vec = internal_link_messages .iter() .enumerate() .map(|(i, msg)| format!(" {}. {}", i + 1, msg)) @@ -318,8 +318,8 @@ impl Site { // check external links, log the results, and error out if needed if self.config.is_in_check_mode() { let external_link_messages = link_checking::check_external_links(self); - if let Err(messages) = external_link_messages { - let messages: Vec = messages + if !external_link_messages.is_empty() { + let messages: Vec = external_link_messages .iter() .enumerate() .map(|(i, msg)| format!(" {}. {}", i + 1, msg)) diff --git a/components/site/src/link_checking.rs b/components/site/src/link_checking.rs index 589927d8ef..1972535b2a 100644 --- a/components/site/src/link_checking.rs +++ b/components/site/src/link_checking.rs @@ -1,4 +1,5 @@ use core::time; +use std::path::Path; use std::{collections::HashMap, path::PathBuf, thread}; use config::LinkCheckerLevel; @@ -15,19 +16,19 @@ use libs::url::Url; /// is always performed (while external ones only conditionally in `zola check`). If broken links /// are encountered, the `internal_level` setting in config.toml will determine whether they are /// treated as warnings or errors. -pub fn check_internal_links_with_anchors(site: &Site) -> Result<(), Vec> { +pub fn check_internal_links_with_anchors(site: &Site) -> Vec { println!("Checking all internal links with anchors."); let library = site.library.write().expect("Get lock for check_internal_links_with_anchors"); // Chain all internal links, from both sections and pages. - let page_links = library.pages.values().flat_map(|p| { - let path = &p.file.path; - p.internal_links.iter().map(move |l| (path.clone(), l)) - }); - let section_links = library.sections.values().flat_map(|p| { - let path = &p.file.path; - p.internal_links.iter().map(move |l| (path.clone(), l)) - }); + let page_links = library + .pages + .values() + .flat_map(|p| p.internal_links.iter().map(move |l| (p.file.path.clone(), l))); + let section_links = library + .sections + .values() + .flat_map(|p| p.internal_links.iter().map(move |l| (p.file.path.clone(), l))); let all_links = page_links.chain(section_links); // Only keep links with anchor fragments, and count them too. @@ -88,19 +89,16 @@ pub fn check_internal_links_with_anchors(site: &Site) -> Result<(), Vec> .collect::>(); // Finally emit a summary, and return overall anchors-checking result. - match messages.len() { - 0 => { - println!("> Successfully checked {} internal link(s) with anchors.", anchors_total); - Ok(()) - } - errors_total => { - println!( - "> Checked {} internal link(s) with anchors: {} target(s) missing.", - anchors_total, errors_total, - ); - Err(messages) - } + if messages.is_empty() { + println!("> Successfully checked {} internal link(s) with anchors.", anchors_total); + } else { + println!( + "> Checked {} internal link(s) with anchors: {} target(s) missing.", + anchors_total, + messages.len(), + ); } + messages } fn should_skip_by_prefix(link: &str, skip_prefixes: &[String]) -> bool { @@ -117,110 +115,112 @@ fn get_link_domain(link: &str) -> Result { }; } -pub fn check_external_links(site: &Site) -> Result<(), Vec> { +/// Checks all external links and returns all the errors that were encountered. +/// Empty vec == all good +pub fn check_external_links(site: &Site) -> Vec { let library = site.library.write().expect("Get lock for check_external_links"); struct LinkDef { file_path: PathBuf, external_link: String, - domain: Result, + domain: String, } impl LinkDef { - pub fn new(file_path: PathBuf, external_link: String, domain: Result) -> Self { - Self { file_path, external_link, domain } + pub fn new(file_path: &Path, external_link: &str, domain: String) -> Self { + Self { + file_path: file_path.to_path_buf(), + external_link: external_link.to_string(), + domain, + } } } let mut messages: Vec = vec![]; - let mut checked_links: Vec = vec![]; - let mut skipped_link_count: u32 = 0; - + let mut external_links = Vec::new(); for p in library.pages.values() { - for external_link in p.clone().external_links.into_iter() { - if should_skip_by_prefix(&external_link, &site.config.link_checker.skip_prefixes) { - skipped_link_count += 1; - } else { - let domain = get_link_domain(&external_link); - checked_links.push(LinkDef::new(p.file.path.clone(), external_link, domain)); - } - } + external_links.push((&p.file.path, &p.external_links)); } - for s in library.sections.values() { - for external_link in s.clone().external_links.into_iter() { - if should_skip_by_prefix(&external_link, &site.config.link_checker.skip_prefixes) { + external_links.push((&s.file.path, &s.external_links)); + } + + let mut checked_links: Vec = vec![]; + let mut skipped_link_count: u32 = 0; + let mut invalid_url_links: u32 = 0; + // First we look at all the external links, skip those the user wants to skip and record + // the ones that have invalid URLs + for (file_path, links) in external_links { + for link in links { + if should_skip_by_prefix(&link, &site.config.link_checker.skip_prefixes) { skipped_link_count += 1; } else { - let domain = get_link_domain(&external_link); - checked_links.push(LinkDef::new(s.file.path.clone(), external_link, domain)); + match get_link_domain(link) { + Ok(domain) => { + checked_links.push(LinkDef::new(file_path, link, domain)); + } + Err(err) => { + // We could use the messages.len() to keep track of them for below + // but it's more explicit this way + invalid_url_links += 1; + messages.push(err.to_string()); + } + } } } } - // separate the links with valid domains from the links with invalid domains - let (checked_links, invalid_url_links): (Vec<&LinkDef>, Vec<&LinkDef>) = - checked_links.iter().partition(|link| link.domain.is_ok()); - println!( "Checking {} external link(s). Skipping {} external link(s).{}", checked_links.len(), skipped_link_count, - if invalid_url_links.is_empty() { + if invalid_url_links == 0 { "".to_string() } else { - format!(" {} link(s) had unparseable URLs.", invalid_url_links.len()) + format!(" {} link(s) had unparseable URLs.", invalid_url_links) } ); - for err in invalid_url_links.into_iter() { - let msg = err.domain.as_ref().unwrap_err().to_string(); - messages.push(msg); + if checked_links.is_empty() { + return Vec::new(); } // error out if we're in error mode and any external URLs couldn't be parsed - match site.config.link_checker.external_level { - LinkCheckerLevel::Error if messages.len() > 0 => return Err(messages), - _ => (), + if site.config.link_checker.external_level == LinkCheckerLevel::Error && !messages.is_empty() { + return messages; } - let mut links_by_domain: HashMap> = HashMap::new(); - + let mut links_by_domain: HashMap<&str, Vec<&LinkDef>> = HashMap::new(); for link in checked_links.iter() { - let domain = link.domain.as_ref().unwrap(); - links_by_domain.entry(domain.to_string()).or_default(); - // Insert content path and link under the domain key - links_by_domain.get_mut(domain).unwrap().push(link); - } - - if checked_links.is_empty() { - return Ok(()); + if links_by_domain.contains_key(link.domain.as_str()) { + links_by_domain.get_mut(link.domain.as_str()).unwrap().push(link); + } else { + links_by_domain.insert(link.domain.as_str(), vec![link]); + } } // create thread pool with lots of threads so we can fetch // (almost) all pages simultaneously, limiting all links for a single // domain to one thread to avoid rate-limiting let threads = std::cmp::min(links_by_domain.len(), 8); - let pool = rayon::ThreadPoolBuilder::new().num_threads(threads).build(); - - match pool { + match rayon::ThreadPoolBuilder::new().num_threads(threads).build() { Ok(pool) => { let errors = pool.install(|| { links_by_domain .par_iter() - .map(|(_domain, links)| { - let mut links_to_process = links.len(); + .map(|(_, links)| { + let mut num_links_left = links.len(); links .iter() .filter_map(move |link_def| { - links_to_process -= 1; + num_links_left -= 1; let res = link_checker::check_url( &link_def.external_link, &site.config.link_checker, ); - if links_to_process > 0 { + if num_links_left > 0 { // Prevent rate-limiting, wait before next crawl unless we're done with this domain thread::sleep(time::Duration::from_millis(500)); } @@ -243,21 +243,17 @@ pub fn check_external_links(site: &Site) -> Result<(), Vec> { errors.len() ); - if errors.is_empty() { - return Ok(()); - } - - for (page_path, link, check_res) in errors.iter() { + for (page_path, link, check_res) in errors { messages.push(format!( "Broken link in {} to {}: {}", page_path.to_string_lossy(), link, - link_checker::message(check_res) + link_checker::message(&check_res) )); } } Err(pool_err) => messages.push(pool_err.to_string()), } - Err(messages) + messages }