From ef6fbd7321a4d3c3aa9ce96b73ddfe0a37e82461 Mon Sep 17 00:00:00 2001 From: Florian Dieminger Date: Fri, 1 Nov 2024 14:01:54 +0100 Subject: [PATCH] feat(html): post process dts (#34) This moves the insert links logic for dts to post processing. --- crates/rari-doc/src/html/modifier.rs | 185 +++++++++++++++++++++++++-- crates/rari-doc/src/html/rewriter.rs | 32 +---- crates/rari-doc/src/pages/build.rs | 3 +- crates/rari-md/src/html.rs | 45 +------ crates/rari-md/src/lib.rs | 4 +- 5 files changed, 178 insertions(+), 91 deletions(-) diff --git a/crates/rari-doc/src/html/modifier.rs b/crates/rari-doc/src/html/modifier.rs index 9caf83b2..198d0b0f 100644 --- a/crates/rari-doc/src/html/modifier.rs +++ b/crates/rari-doc/src/html/modifier.rs @@ -2,12 +2,23 @@ use std::borrow::Cow; use std::collections::HashSet; use ego_tree::NodeId; -use html5ever::{namespace_url, ns, QualName}; +use html5ever::{namespace_url, ns, Attribute, QualName}; use rari_md::anchor::anchorize; +use rari_utils::concat_strs; +use scraper::node::{self}; use scraper::{ElementRef, Html, Node, Selector}; use crate::error::DocError; - +/// Adds an attribute to a specified HTML node. +/// +/// # Parameters +/// - `html`: A mutable reference to the HTML document structure. +/// - `node_id`: The ID of the node to which the attribute will be added. +/// - `key`: The name of the attribute to add. +/// - `value`: The value of the attribute to add. +/// +/// If the node exists and is an element, this function adds or updates +/// the specified attribute in the node's attributes list. pub fn add_attribute(html: &mut Html, node_id: NodeId, key: &str, value: &str) { if let Some(mut details) = html.tree.get_mut(node_id) { if let Node::Element(ref mut el) = details.value() { @@ -23,6 +34,15 @@ pub fn add_attribute(html: &mut Html, node_id: NodeId, key: &str, value: &str) { } } +/// Removes an attribute from a specified HTML node. +/// +/// # Parameters +/// - `html`: A mutable reference to the HTML document structure. 
+/// - `node_id`: The ID of the node from which the attribute will be removed. +/// - `key`: The name of the attribute to remove. +/// +/// If the node exists and is an element, this function removes the specified +/// attribute from the node's attributes list, if it exists. pub fn remove_attribute(html: &mut Html, node_id: NodeId, key: &str) { if let Some(mut details) = html.tree.get_mut(node_id) { if let Node::Element(ref mut el) = details.value() { @@ -35,6 +55,156 @@ pub fn remove_attribute(html: &mut Html, node_id: NodeId, key: &str) { } } +/// Retrieves the `id` attribute of an HTML node if it exists, prefixed with `#`. +/// +/// # Arguments +/// * `html` - A reference to the `Html` structure containing the node tree. +/// * `node_id` - The identifier of the node from which to retrieve the `id`. +/// +/// # Returns +/// * `Option` - Returns `Some(String)` containing the `id` prefixed with `#` if found, or `None` if the node +/// has no `id` attribute. +pub fn get_id(html: &Html, node_id: NodeId) -> Option { + if let Some(node) = html.tree.get(node_id) { + if let Node::Element(node_el) = node.value() { + if let Some(id) = node_el.attr("id") { + return Some(concat_strs!("#", id)); + } + } + } + None +} + +/// Wraps the children of a specified node with a link element pointing to the node's own `id` attribute. +/// +/// # Arguments +/// * `html` - A mutable reference to the `Html` structure to modify. +/// * `node_id` - The identifier of the node whose children will be wrapped with a link. +/// +/// # Details +/// This function calls `get_id` to retrieve the `id` of the specified node and, if successful, wraps its children +/// with an anchor (``) link element using that `id` as the `href` attribute. 
+pub fn wrap_children_with_link_to_id(html: &mut Html, node_id: NodeId) { + if let Some(id) = get_id(html, node_id) { + wrap_children_with_link(html, node_id, id); + } +} + +/// Wraps the children of a specified node with a link element containing a specified `href`. +/// +/// # Arguments +/// * `html` - A mutable reference to the `Html` structure to modify. +/// * `node_id` - The identifier of the node whose children will be wrapped with the link element. +/// * `href` - A `String` representing the `href` attribute for the new link element. +/// +/// # Details +/// This function creates an anchor (``) element with the given `href`, then appends it as a child to the specified +/// node and reparents the node’s children to be inside the new link element. +pub fn wrap_children_with_link(html: &mut Html, node_id: NodeId, href: String) { + let attribute = Attribute { + name: QualName { + prefix: None, + ns: ns!(), + local: "href".into(), + }, + value: href.into(), + }; + + let a_node = Node::Element(node::Element::new( + QualName { + prefix: None, + ns: ns!(), + local: "a".into(), + }, + vec![attribute], + )); + let mut a_node_ref = html.tree.orphan(a_node); + a_node_ref.reparent_from_id_append(node_id); + let a_node_id = a_node_ref.id(); + if let Some(mut node) = html.tree.get_mut(node_id) { + node.append_id(a_node_id); + } +} + +/// Inserts self-links for all `
<dt>` elements in the given HTML that do not already +/// contain a direct child anchor (`<a>`) element. This function selects all `<dt>
` +/// elements that lack an anchor tag and wraps their children with a link pointing +/// to the element’s own `id` attribute. +/// +/// # Arguments +/// +/// * `html` - A mutable reference to the `Html` structure, representing the HTML +/// document to be processed. +/// +/// # Returns +/// +/// * `Result<(), DocError>` - Returns `Ok(())` if all operations succeed, otherwise +/// returns a `DocError` if an error is encountered. +pub fn insert_self_links_for_dts(html: &mut Html) -> Result<(), DocError> { + let selector = Selector::parse("dt:not(:has(> a)").unwrap(); + let subs = html.select(&selector).map(|el| el.id()).collect::>(); + for el_id in subs { + wrap_children_with_link_to_id(html, el_id); + } + Ok(()) +} + +/// Removes all empty `

<p>` elements from the given HTML document. This function +/// selects all `<p>` elements that have no children or content and removes them +/// from the HTML tree structure to clean up any unnecessary empty elements. +/// +/// # Arguments +/// +/// * `html` - A mutable reference to the `Html` structure, representing the HTML +/// document to be modified. +/// +/// # Returns +/// +/// * `Result<(), DocError>` - Returns `Ok(())` if all empty `<p>

` elements are +/// successfully removed, otherwise returns a `DocError` if an error occurs. +pub fn remove_empty_p(html: &mut Html) -> Result<(), DocError> { + let selector = Selector::parse("p:empty").unwrap(); + let dels = html.select(&selector).map(|el| el.id()).collect::>(); + + for id in dels { + html.tree.get_mut(id).unwrap().detach(); + } + + Ok(()) +} + +/// Adds unique `id` attributes to HTML elements that are missing them. +/// +/// This function scans through an HTML document, identifying elements that either: +/// 1. Already contain an `id` attribute, or +/// 2. Lack an `id` attribute but have `data-update-id` attributes or are headers (`

<h2>`, `<h3>`) or `<dt>
` elements. +/// +/// For elements missing `id` attributes, it generates a unique `id` based on the element’s text content, +/// ensuring that the `id` does not conflict with any existing `id`s in the document. If an ID conflict +/// arises, a numeric suffix (e.g., `_2`, `_3`) is appended to the generated `id` until uniqueness is ensured. +/// +/// # Arguments +/// +/// * `html` - A mutable reference to an HTML document represented by the `Html` type. +/// +/// # Returns +/// +/// This function returns `Ok(())` on success or a `DocError` if an error occurs. +/// +/// # Errors +/// +/// If a `DocError` occurs during processing, such as a failure to parse selectors or update attributes, +/// the error is returned. +/// +/// # Example +/// +/// ```rust +/// let mut html = Html::parse_document("

<h2>Some Heading</h2>

"); +/// add_missing_ids(&mut html); +/// ``` +/// +/// After calling this function, the HTML will have generated unique `id` attributes for +/// elements without `id`s, based on the element’s content text. pub fn add_missing_ids(html: &mut Html) -> Result<(), DocError> { let selector = Selector::parse("*[id]").unwrap(); let mut ids = html @@ -85,14 +255,3 @@ pub fn add_missing_ids(html: &mut Html) -> Result<(), DocError> { } Ok(()) } - -pub fn remove_empty_p(html: &mut Html) -> Result<(), DocError> { - let selector = Selector::parse("p:empty").unwrap(); - let dels = html.select(&selector).map(|el| el.id()).collect::>(); - - for id in dels { - html.tree.get_mut(id).unwrap().detach(); - } - - Ok(()) -} diff --git a/crates/rari-doc/src/html/rewriter.rs b/crates/rari-doc/src/html/rewriter.rs index c3bd6e3b..77de2b3b 100644 --- a/crates/rari-doc/src/html/rewriter.rs +++ b/crates/rari-doc/src/html/rewriter.rs @@ -1,7 +1,7 @@ use std::borrow::Cow; use std::collections::HashSet; -use lol_html::html_content::{ContentType, Element}; +use lol_html::html_content::ContentType; use lol_html::{element, rewrite_str, HtmlRewriter, RewriteStrSettings, Settings}; use rari_md::ext::DELIM_START; use rari_md::node_card::NoteCard; @@ -42,7 +42,6 @@ pub fn post_process_html( ) -> Result { let mut output = vec![]; let mut ids = HashSet::new(); - let open_dt_a = std::rc::Rc::new(std::cell::RefCell::new(false)); let options = Url::options(); let url = page.url(); let base = Url::parse(&concat_strs!( @@ -264,35 +263,6 @@ pub fn post_process_html( Ok(()) }), - element!("dt[data-add-link]", |el: &mut Element| { - el.remove_attribute("data-add-link"); - if let Some(id) = el.get_attribute("id") { - el.prepend(&concat_strs!("
"), ContentType::Html); - let mut s = open_dt_a.borrow_mut(); - *s = true; - let open_dt_a = open_dt_a.clone(); - // We need this handler if there's only a text node in the dl. - if let Some(handlers) = el.end_tag_handlers() { - handlers.push(Box::new(move |end| { - let mut s = open_dt_a.borrow_mut(); - if *s { - end.before("", ContentType::Html); - *s = false; - } - Ok(()) - })); - } - } - Ok(()) - }), - element!("dt[data-add-link] *:first-child", |el| { - let mut s = open_dt_a.borrow_mut(); - if *s { - el.after("", ContentType::Html); - *s = false; - } - Ok(()) - }), element!("pre:not(.notranslate)", |el| { let mut class = el.get_attribute("class").unwrap_or_default(); class.push_str(" notranslate"); diff --git a/crates/rari-doc/src/pages/build.rs b/crates/rari-doc/src/pages/build.rs index 2e8c7282..a4455edb 100644 --- a/crates/rari-doc/src/pages/build.rs +++ b/crates/rari-doc/src/pages/build.rs @@ -22,7 +22,7 @@ use crate::error::DocError; use crate::helpers::parents::parents; use crate::helpers::title::{page_title, transform_title}; use crate::html::bubble_up::bubble_up_curriculum_page; -use crate::html::modifier::{add_missing_ids, remove_empty_p}; +use crate::html::modifier::{add_missing_ids, insert_self_links_for_dts, remove_empty_p}; use crate::html::rewriter::{post_process_html, post_process_inline_sidebar}; use crate::html::sections::{split_sections, BuildSection, BuildSectionType, Splitted}; use crate::html::sidebar::{ @@ -165,6 +165,7 @@ fn build_content(page: &T) -> Result { } remove_empty_p(&mut fragment)?; add_missing_ids(&mut fragment)?; + insert_self_links_for_dts(&mut fragment)?; expand_details_and_mark_current_for_inline_sidebar(&mut fragment, page.url())?; let Splitted { sections, diff --git a/crates/rari-md/src/html.rs b/crates/rari-md/src/html.rs index ba2705c1..00fa76dd 100644 --- a/crates/rari-md/src/html.rs +++ b/crates/rari-md/src/html.rs @@ -473,37 +473,6 @@ impl<'o, 'c: 'o> HtmlFormatter<'o, 'c> { } } - fn 
collect_first_child_text<'a>(node: &'a AstNode<'a>, output: &mut Vec) { - if let Some(child) = node.children().next() { - if matches!(child.data.borrow().value, NodeValue::Paragraph) { - if let Some(child) = child.children().next() { - if !matches!(child.data.borrow().value, NodeValue::HtmlInline(_)) { - return Self::collect_text(child, output); - } - } - } - Self::collect_text(child, output) - } else { - Self::collect_text(node, output) - } - } - - fn next_is_link<'a>(node: &'a AstNode<'a>) -> bool { - if let Some(child) = node.children().next() { - if matches!(child.data.borrow().value, NodeValue::Link(_)) { - return true; - } - if matches!(child.data.borrow().value, NodeValue::Paragraph) { - if let Some(child) = child.children().next() { - if matches!(child.data.borrow().value, NodeValue::Link(_)) { - return true; - } - } - } - } - false - } - fn format_node<'a>( &mut self, node: &'a AstNode<'a>, @@ -597,19 +566,7 @@ impl<'o, 'c: 'o> HtmlFormatter<'o, 'c> { NodeValue::DescriptionTerm => { if entering { self.cr()?; - let mut text_content = Vec::with_capacity(20); - Self::collect_first_child_text(node, &mut text_content); - let raw_id = String::from_utf8_lossy(&text_content); - let is_templ = raw_id.contains(DELIM_START); - if is_templ { - write!(self.output, "
")?; } else { diff --git a/crates/rari-md/src/lib.rs b/crates/rari-md/src/lib.rs index 568a7122..455a9ebd 100644 --- a/crates/rari-md/src/lib.rs +++ b/crates/rari-md/src/lib.rs @@ -105,7 +105,7 @@ mod test { let out = m2h("- {{foo}}\n - : bar", Locale::EnUs)?; assert_eq!( out, - "
\n
{{foo}}
\n
\n

bar

\n
\n
\n" + "
\n
{{foo}}
\n
\n

bar

\n
\n
\n" ); Ok(()) } @@ -115,7 +115,7 @@ mod test { let out = m2h("- {{foo}}\n - : bar", Locale::EnUs)?; assert_eq!( out, - "
\n
{{foo}}
\n
\n

bar

\n
\n
\n" + "
\n
{{foo}}
\n
\n

bar

\n
\n
\n" ); Ok(()) }