Skip to content

Commit

Permalink
feat(html): post process dts (#34)
Browse files Browse the repository at this point in the history
This moves the logic that inserts self-links into `<dt>` elements out of the Markdown renderer and into the HTML post-processing step.
  • Loading branch information
fiji-flo authored Nov 1, 2024
1 parent 3ebf64b commit ef6fbd7
Show file tree
Hide file tree
Showing 5 changed files with 178 additions and 91 deletions.
185 changes: 172 additions & 13 deletions crates/rari-doc/src/html/modifier.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,23 @@ use std::borrow::Cow;
use std::collections::HashSet;

use ego_tree::NodeId;
use html5ever::{namespace_url, ns, QualName};
use html5ever::{namespace_url, ns, Attribute, QualName};
use rari_md::anchor::anchorize;
use rari_utils::concat_strs;
use scraper::node::{self};
use scraper::{ElementRef, Html, Node, Selector};

use crate::error::DocError;

/// Adds an attribute to a specified HTML node.
///
/// # Parameters
/// - `html`: A mutable reference to the HTML document structure.
/// - `node_id`: The ID of the node to which the attribute will be added.
/// - `key`: The name of the attribute to add.
/// - `value`: The value of the attribute to add.
///
/// If the node exists and is an element, this function adds or updates
/// the specified attribute in the node's attributes list.
pub fn add_attribute(html: &mut Html, node_id: NodeId, key: &str, value: &str) {
if let Some(mut details) = html.tree.get_mut(node_id) {
if let Node::Element(ref mut el) = details.value() {
Expand All @@ -23,6 +34,15 @@ pub fn add_attribute(html: &mut Html, node_id: NodeId, key: &str, value: &str) {
}
}

/// Removes an attribute from a specified HTML node.
///
/// # Parameters
/// - `html`: A mutable reference to the HTML document structure.
/// - `node_id`: The ID of the node from which the attribute will be removed.
/// - `key`: The name of the attribute to remove.
///
/// If the node exists and is an element, this function removes the specified
/// attribute from the node's attributes list, if it exists.
pub fn remove_attribute(html: &mut Html, node_id: NodeId, key: &str) {
if let Some(mut details) = html.tree.get_mut(node_id) {
if let Node::Element(ref mut el) = details.value() {
Expand All @@ -35,6 +55,156 @@ pub fn remove_attribute(html: &mut Html, node_id: NodeId, key: &str) {
}
}

/// Looks up the `id` attribute of a node and returns it as a fragment
/// reference (the attribute value with a leading `#`).
///
/// # Arguments
/// * `html` - The document whose tree contains the node.
/// * `node_id` - The node to inspect.
///
/// # Returns
/// * `Some(String)` - `#` followed by the attribute value, when the node
///   exists, is an element, and carries an `id` attribute.
/// * `None` - when the node id is not present in the tree, the node is not an
///   element, or the element has no `id` attribute.
pub fn get_id(html: &Html, node_id: NodeId) -> Option<String> {
    let node = html.tree.get(node_id)?;
    match node.value() {
        // Only elements can carry attributes; every other node kind yields `None`.
        Node::Element(element) => element.attr("id").map(|id| concat_strs!("#", id)),
        _ => None,
    }
}

/// Turns the content of a node into a self-link: all of the node's children
/// are wrapped in an `<a>` element whose `href` is the node's own `id`
/// prefixed with `#`.
///
/// # Arguments
/// * `html` - The document to modify.
/// * `node_id` - The node whose children should be wrapped.
///
/// # Details
/// The target is obtained from `get_id`. When that returns `None` the document
/// is left unchanged; otherwise the work is delegated to
/// `wrap_children_with_link`.
pub fn wrap_children_with_link_to_id(html: &mut Html, node_id: NodeId) {
    let href = match get_id(html, node_id) {
        Some(href) => href,
        // Nothing to link to, so leave the node as it is.
        None => return,
    };
    wrap_children_with_link(html, node_id, href);
}

/// Wraps the children of a specified node with a link element containing a specified `href`.
///
/// # Arguments
/// * `html` - A mutable reference to the `Html` structure to modify.
/// * `node_id` - The identifier of the node whose children will be wrapped with the link element.
/// * `href` - A `String` representing the `href` attribute for the new link element.
///
/// # Details
/// This function creates an anchor (`<a>`) element with the given `href`, then appends it as a child to the specified
/// node and reparents the node’s children to be inside the new link element.
pub fn wrap_children_with_link(html: &mut Html, node_id: NodeId, href: String) {
let attribute = Attribute {
name: QualName {
prefix: None,
ns: ns!(),
local: "href".into(),
},
value: href.into(),
};

let a_node = Node::Element(node::Element::new(
QualName {
prefix: None,
ns: ns!(),
local: "a".into(),
},
vec![attribute],
));
let mut a_node_ref = html.tree.orphan(a_node);
a_node_ref.reparent_from_id_append(node_id);
let a_node_id = a_node_ref.id();
if let Some(mut node) = html.tree.get_mut(node_id) {
node.append_id(a_node_id);
}
}

/// Inserts self-links for all `<dt>` elements that do not already contain a
/// direct child anchor (`<a>`) element.
///
/// Every matching `<dt>` has its children wrapped in an `<a>` element whose
/// `href` points at the `<dt>`'s own `id` (prefixed with `#`). A `<dt>` without
/// an `id` attribute is left untouched, because
/// `wrap_children_with_link_to_id` only wraps when an `id` is present.
///
/// # Arguments
///
/// * `html` - A mutable reference to the `Html` structure, representing the HTML
///   document to be processed.
///
/// # Returns
///
/// * `Result<(), DocError>` - Currently always `Ok(())`; the `Result` return
///   type matches the other post-processing passes in this module.
///
/// # Panics
///
/// Panics if the hard-coded selector fails to parse, which would indicate a
/// programming error rather than a problem with the input document.
pub fn insert_self_links_for_dts(html: &mut Html) -> Result<(), DocError> {
    // The parentheses of `:not(` and `:has(` must both be closed; the previous
    // literal left one open and only worked if the parser auto-closed it.
    let selector =
        Selector::parse("dt:not(:has(> a))").expect("hard-coded selector must be valid");

    // Collect the ids up front: the selection borrows `html` immutably, while
    // wrapping the children needs a mutable borrow.
    let dt_ids = html.select(&selector).map(|el| el.id()).collect::<Vec<_>>();
    for dt_id in dt_ids {
        wrap_children_with_link_to_id(html, dt_id);
    }
    Ok(())
}

/// Detaches every `<p>` element matching the CSS selector `p:empty` from the
/// document tree, cleaning up paragraphs that carry no content.
///
/// # Arguments
///
/// * `html` - A mutable reference to the `Html` structure, representing the HTML
///   document to be modified.
///
/// # Returns
///
/// * `Result<(), DocError>` - `Ok(())` once all matching paragraphs have been
///   detached.
pub fn remove_empty_p(html: &mut Html) -> Result<(), DocError> {
    let empty_paragraphs = Selector::parse("p:empty").unwrap();

    // Gather the node ids first; detaching needs mutable access to the tree,
    // which is not possible while the selection is still borrowing it.
    let targets: Vec<_> = html.select(&empty_paragraphs).map(|p| p.id()).collect();

    for target in targets {
        let mut paragraph = html.tree.get_mut(target).unwrap();
        paragraph.detach();
    }

    Ok(())
}

/// Adds unique `id` attributes to HTML elements that are missing them.
///
/// This function scans through an HTML document, identifying elements that either:
/// 1. Already contain an `id` attribute, or
/// 2. Lack an `id` attribute but have `data-update-id` attributes or are headers (`<h2>`, `<h3>`) or `<dt>` elements.
///
/// For elements missing `id` attributes, it generates a unique `id` based on the element’s text content,
/// ensuring that the `id` does not conflict with any existing `id`s in the document. If an ID conflict
/// arises, a numeric suffix (e.g., `_2`, `_3`) is appended to the generated `id` until uniqueness is ensured.
///
/// # Arguments
///
/// * `html` - A mutable reference to an HTML document represented by the `Html` type.
///
/// # Returns
///
/// This function returns `Ok(())` on success or a `DocError` if an error occurs.
///
/// # Errors
///
/// If a `DocError` occurs during processing, such as a failure to parse selectors or update attributes,
/// the error is returned.
///
/// # Example
///
/// ```rust
/// let mut html = Html::parse_document("<h2>Some Heading</h2>");
/// add_missing_ids(&mut html);
/// ```
///
/// After calling this function, the HTML will have generated unique `id` attributes for
/// elements without `id`s, based on the element’s content text.
pub fn add_missing_ids(html: &mut Html) -> Result<(), DocError> {
let selector = Selector::parse("*[id]").unwrap();
let mut ids = html
Expand Down Expand Up @@ -85,14 +255,3 @@ pub fn add_missing_ids(html: &mut Html) -> Result<(), DocError> {
}
Ok(())
}

/// Detaches every `<p>` element matching the CSS selector `p:empty` from the
/// document tree.
///
/// Always returns `Ok(())`.
pub fn remove_empty_p(html: &mut Html) -> Result<(), DocError> {
    let empty_paragraphs = Selector::parse("p:empty").unwrap();

    // Ids are collected before mutating because the selection borrows `html`.
    let targets: Vec<_> = html.select(&empty_paragraphs).map(|p| p.id()).collect();
    targets.into_iter().for_each(|target| {
        html.tree.get_mut(target).unwrap().detach();
    });

    Ok(())
}
32 changes: 1 addition & 31 deletions crates/rari-doc/src/html/rewriter.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
use std::borrow::Cow;
use std::collections::HashSet;

use lol_html::html_content::{ContentType, Element};
use lol_html::html_content::ContentType;
use lol_html::{element, rewrite_str, HtmlRewriter, RewriteStrSettings, Settings};
use rari_md::ext::DELIM_START;
use rari_md::node_card::NoteCard;
Expand Down Expand Up @@ -42,7 +42,6 @@ pub fn post_process_html<T: PageLike>(
) -> Result<String, DocError> {
let mut output = vec![];
let mut ids = HashSet::new();
let open_dt_a = std::rc::Rc::new(std::cell::RefCell::new(false));
let options = Url::options();
let url = page.url();
let base = Url::parse(&concat_strs!(
Expand Down Expand Up @@ -264,35 +263,6 @@ pub fn post_process_html<T: PageLike>(

Ok(())
}),
element!("dt[data-add-link]", |el: &mut Element| {
el.remove_attribute("data-add-link");
if let Some(id) = el.get_attribute("id") {
el.prepend(&concat_strs!("<a href=\"#", &id, "\">"), ContentType::Html);
let mut s = open_dt_a.borrow_mut();
*s = true;
let open_dt_a = open_dt_a.clone();
// We need this handler if there's only a text node in the dl.
if let Some(handlers) = el.end_tag_handlers() {
handlers.push(Box::new(move |end| {
let mut s = open_dt_a.borrow_mut();
if *s {
end.before("</a>", ContentType::Html);
*s = false;
}
Ok(())
}));
}
}
Ok(())
}),
element!("dt[data-add-link] *:first-child", |el| {
let mut s = open_dt_a.borrow_mut();
if *s {
el.after("</a>", ContentType::Html);
*s = false;
}
Ok(())
}),
element!("pre:not(.notranslate)", |el| {
let mut class = el.get_attribute("class").unwrap_or_default();
class.push_str(" notranslate");
Expand Down
3 changes: 2 additions & 1 deletion crates/rari-doc/src/pages/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ use crate::error::DocError;
use crate::helpers::parents::parents;
use crate::helpers::title::{page_title, transform_title};
use crate::html::bubble_up::bubble_up_curriculum_page;
use crate::html::modifier::{add_missing_ids, remove_empty_p};
use crate::html::modifier::{add_missing_ids, insert_self_links_for_dts, remove_empty_p};
use crate::html::rewriter::{post_process_html, post_process_inline_sidebar};
use crate::html::sections::{split_sections, BuildSection, BuildSectionType, Splitted};
use crate::html::sidebar::{
Expand Down Expand Up @@ -165,6 +165,7 @@ fn build_content<T: PageLike>(page: &T) -> Result<PageContent, DocError> {
}
remove_empty_p(&mut fragment)?;
add_missing_ids(&mut fragment)?;
insert_self_links_for_dts(&mut fragment)?;
expand_details_and_mark_current_for_inline_sidebar(&mut fragment, page.url())?;
let Splitted {
sections,
Expand Down
45 changes: 1 addition & 44 deletions crates/rari-md/src/html.rs
Original file line number Diff line number Diff line change
Expand Up @@ -473,37 +473,6 @@ impl<'o, 'c: 'o> HtmlFormatter<'o, 'c> {
}
}

/// Collects text (via `collect_text`) from the leading part of `node` into
/// `output`.
///
/// The node the text is taken from is chosen as follows:
/// - `node` has no children: `node` itself.
/// - the first child is a paragraph whose own first child exists and is not
///   inline HTML: that grandchild.
/// - otherwise: the first child.
fn collect_first_child_text<'a>(node: &'a AstNode<'a>, output: &mut Vec<u8>) {
    let first = match node.children().next() {
        Some(first) => first,
        None => return Self::collect_text(node, output),
    };

    let first_is_paragraph = matches!(first.data.borrow().value, NodeValue::Paragraph);
    let source = if first_is_paragraph {
        // Look inside the paragraph, but fall back to the paragraph itself
        // when it is empty or starts with inline HTML.
        first
            .children()
            .next()
            .filter(|inner| !matches!(inner.data.borrow().value, NodeValue::HtmlInline(_)))
            .unwrap_or(first)
    } else {
        first
    };

    Self::collect_text(source, output)
}

/// Reports whether the content of `node` starts with a link.
///
/// Returns `true` when the first child of `node` is a link, or when the first
/// child is a paragraph whose own first child is a link. Returns `false` in
/// every other case, including when `node` has no children.
fn next_is_link<'a>(node: &'a AstNode<'a>) -> bool {
    let is_link = |n: &'a AstNode<'a>| matches!(n.data.borrow().value, NodeValue::Link(_));

    match node.children().next() {
        None => false,
        Some(first) if is_link(first) => true,
        Some(first) => {
            // Not a link itself: accept a paragraph that starts with one.
            matches!(first.data.borrow().value, NodeValue::Paragraph)
                && first.children().next().map_or(false, is_link)
        }
    }
}

fn format_node<'a>(
&mut self,
node: &'a AstNode<'a>,
Expand Down Expand Up @@ -597,19 +566,7 @@ impl<'o, 'c: 'o> HtmlFormatter<'o, 'c> {
NodeValue::DescriptionTerm => {
if entering {
self.cr()?;
let mut text_content = Vec::with_capacity(20);
Self::collect_first_child_text(node, &mut text_content);
let raw_id = String::from_utf8_lossy(&text_content);
let is_templ = raw_id.contains(DELIM_START);
if is_templ {
write!(self.output, "<dt data-update-id")?;
} else {
let id = self.anchorizer.anchorize(&raw_id);
write!(self.output, "<dt id=\"{}\"", id)?;
};
if !is_templ && !Self::next_is_link(node) {
write!(self.output, " data-add-link")?;
}
self.output.write_all(b"<dt")?;
self.render_sourcepos(node)?;
self.output.write_all(b">")?;
} else {
Expand Down
4 changes: 2 additions & 2 deletions crates/rari-md/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ mod test {
let out = m2h("- {{foo}}\n - : bar", Locale::EnUs)?;
assert_eq!(
out,
"<dl data-sourcepos=\"1:1-2:9\">\n<dt id=\"foo\" data-add-link data-sourcepos=\"1:1-2:9\">{{foo}}</dt>\n<dd data-sourcepos=\"2:3-2:9\">\n<p data-sourcepos=\"2:5-2:9\">bar</p>\n</dd>\n</dl>\n"
"<dl data-sourcepos=\"1:1-2:9\">\n<dt data-sourcepos=\"1:1-2:9\">{{foo}}</dt>\n<dd data-sourcepos=\"2:3-2:9\">\n<p data-sourcepos=\"2:5-2:9\">bar</p>\n</dd>\n</dl>\n"
);
Ok(())
}
Expand All @@ -115,7 +115,7 @@ mod test {
let out = m2h("- {{foo}}\n - : bar", Locale::EnUs)?;
assert_eq!(
out,
"<dl data-sourcepos=\"1:1-2:9\">\n<dt id=\"foo\" data-add-link data-sourcepos=\"1:1-2:9\">{{foo}}</dt>\n<dd data-sourcepos=\"2:3-2:9\">\n<p data-sourcepos=\"2:5-2:9\">bar</p>\n</dd>\n</dl>\n"
"<dl data-sourcepos=\"1:1-2:9\">\n<dt data-sourcepos=\"1:1-2:9\">{{foo}}</dt>\n<dd data-sourcepos=\"2:3-2:9\">\n<p data-sourcepos=\"2:5-2:9\">bar</p>\n</dd>\n</dl>\n"
);
Ok(())
}
Expand Down

0 comments on commit ef6fbd7

Please sign in to comment.