From 369d145b80dfed073532872f757b8ec16eb6b20e Mon Sep 17 00:00:00 2001 From: Carson McManus Date: Wed, 4 Sep 2024 10:16:08 -0400 Subject: [PATCH] feat(format/html): port `JsxChildList` formatting to `HtmlElementList` --- .../src/html/lists/element_list.rs | 714 +++++++++++++++++- crates/biome_html_formatter/src/lib.rs | 1 + .../src/utils/children.rs | 422 +++++++++++ crates/biome_html_formatter/src/utils/mod.rs | 1 + .../tests/specs/long-content.html | 1 + .../tests/specs/long-content.html.snap | 39 + .../tests/specs/many-children.html | 4 + .../tests/specs/many-children.html.snap | 40 + crates/biome_html_parser/src/lexer/tests.rs | 15 + 9 files changed, 1233 insertions(+), 4 deletions(-) create mode 100644 crates/biome_html_formatter/src/utils/children.rs create mode 100644 crates/biome_html_formatter/src/utils/mod.rs create mode 100644 crates/biome_html_formatter/tests/specs/long-content.html create mode 100644 crates/biome_html_formatter/tests/specs/long-content.html.snap create mode 100644 crates/biome_html_formatter/tests/specs/many-children.html create mode 100644 crates/biome_html_formatter/tests/specs/many-children.html.snap diff --git a/crates/biome_html_formatter/src/html/lists/element_list.rs b/crates/biome_html_formatter/src/html/lists/element_list.rs index d687a63417df..4b54f89d0c88 100644 --- a/crates/biome_html_formatter/src/html/lists/element_list.rs +++ b/crates/biome_html_formatter/src/html/lists/element_list.rs @@ -1,10 +1,716 @@ -use crate::prelude::*; -use biome_html_syntax::HtmlElementList; +//! This implementation is very heavily inspired by the JSX formatter implementation for JsxChildList. 
+ +use std::cell::RefCell; + +use crate::{ + prelude::*, + utils::children::{ + html_split_children, is_meaningful_html_text, HtmlChild, HtmlChildrenIterator, + HtmlRawSpace, HtmlSpace, + }, +}; +use biome_formatter::{best_fitting, prelude::*}; +use biome_formatter::{format_args, write, VecBuffer}; +use biome_html_syntax::{AnyHtmlElement, HtmlElementList}; +use tag::GroupMode; #[derive(Debug, Clone, Default)] -pub(crate) struct FormatHtmlElementList; +pub(crate) struct FormatHtmlElementList { + layout: HtmlChildListLayout, +} impl FormatRule for FormatHtmlElementList { type Context = HtmlFormatContext; fn fmt(&self, node: &HtmlElementList, f: &mut HtmlFormatter) -> FormatResult<()> { - f.join().entries(node.iter().formatted()).finish() + let result = self.fmt_children(node, f)?; + match result { + FormatChildrenResult::ForceMultiline(format_multiline) => { + write!(f, [format_multiline]) + } + FormatChildrenResult::BestFitting { + flat_children, + expanded_children, + } => { + write!(f, [best_fitting![flat_children, expanded_children]]) + } + } + } +} + +#[derive(Debug)] +pub(crate) enum FormatChildrenResult { + ForceMultiline(FormatMultilineChildren), + BestFitting { + flat_children: FormatFlatChildren, + expanded_children: FormatMultilineChildren, + }, +} + +impl FormatHtmlElementList { + pub(crate) fn fmt_children( + &self, + list: &HtmlElementList, + f: &mut HtmlFormatter, + ) -> FormatResult { + self.disarm_debug_assertions(list, f); + + let children_meta = self.children_meta(list); + let layout = self.layout(children_meta); + + let multiline_layout = if children_meta.meaningful_text { + MultilineLayout::Fill + } else { + MultilineLayout::NoFill + }; + + let mut flat = FlatBuilder::new(); + let mut multiline = MultilineBuilder::new(multiline_layout); + + let mut force_multiline = layout.is_multiline(); + + let mut children = html_split_children(list.iter())?; + + // Trim trailing new lines + if let Some(HtmlChild::EmptyLine | HtmlChild::Newline) = 
children.last() { + children.pop(); + } + + let mut last: Option<&HtmlChild> = None; + let mut children_iter = HtmlChildrenIterator::new(children.iter()); + + // Trim leading new lines + if let Some(HtmlChild::Newline | HtmlChild::EmptyLine) = children_iter.peek() { + children_iter.next(); + } + + while let Some(child) = children_iter.next() { + let mut child_breaks = false; + + match &child { + // A single word: Both `a` and `b` are a word in `a b` because they're separated by HTML Whitespace. + HtmlChild::Word(word) => { + let separator = match children_iter.peek() { + Some(HtmlChild::Word(_)) => { + // Separate words by a space or line break in extended mode + Some(WordSeparator::BetweenWords) + } + + // Last word or last word before an element without any whitespace in between + Some(HtmlChild::NonText(next_child)) => Some(WordSeparator::EndOfText { + is_soft_line_break: !matches!( + next_child, + AnyHtmlElement::HtmlSelfClosingElement(_) + ) || word.is_single_character(), + }), + + Some(HtmlChild::Newline | HtmlChild::Whitespace | HtmlChild::EmptyLine) => { + None + } + + None => None, + }; + + child_breaks = separator.map_or(false, |separator| separator.will_break()); + + flat.write(&format_args![word, separator], f); + + if let Some(separator) = separator { + multiline.write_with_separator(word, &separator, f); + } else { + // it's safe to write without a separator because None means that next element is a separator or end of the iterator + multiline.write_content(word, f); + } + } + + // * Whitespace after the opening tag and before a meaningful text: `
a` + // * Whitespace before the closing tag: `a
` + // * Whitespace before an opening tag: `a
` + HtmlChild::Whitespace => { + flat.write(&HtmlSpace, f); + + // ```javascript + //
a + // {' '}
+ // ``` + let is_after_line_break = + last.as_ref().map_or(false, |last| last.is_any_line()); + + // `
aaa
` or `
` + let is_trailing_or_only_whitespace = children_iter.peek().is_none(); + + if is_trailing_or_only_whitespace || is_after_line_break { + multiline.write_separator(&HtmlRawSpace, f); + } + // Leading whitespace. Only possible if used together with a expression child + // + // ``` + //
+ // + // {' '} + // + //
+ // ``` + else if last.is_none() { + multiline.write_with_separator(&HtmlRawSpace, &hard_line_break(), f); + } else { + multiline.write_separator(&HtmlSpace, f); + } + } + + // A new line between some JSX text and an element + HtmlChild::Newline => { + let is_soft_break = { + // Here we handle the case when we have a newline between a single-character word and a jsx element + // We need to use the previous and the next element + // [HtmlChild::Word, HtmlChild::Newline, HtmlChild::NonText] + // ``` + //
+ //
First
, + //
Second
+ //
+ // ``` + if let Some(HtmlChild::Word(word)) = last { + let is_next_element_self_closing = matches!( + children_iter.peek(), + Some(HtmlChild::NonText(AnyHtmlElement::HtmlSelfClosingElement( + _ + ))) + ); + !is_next_element_self_closing && word.is_single_character() + } + // Here we handle the case when we have a single-character word between a new line and a jsx element + // Here we need to look ahead two elements + // [HtmlChild::Newline, HtmlChild::Word, HtmlChild::NonText] + // ``` + //
+ //
First
+ // ,
Second
+ //
+ // ``` + else if let Some(HtmlChild::Word(next_word)) = children_iter.peek() { + let next_next_element = children_iter.peek_next(); + let is_next_next_element_new_line = + matches!(next_next_element, Some(HtmlChild::Newline)); + let is_next_next_element_self_closing = matches!( + next_next_element, + Some(HtmlChild::NonText(AnyHtmlElement::HtmlSelfClosingElement( + _ + ))) + ); + let has_new_line_and_self_closing = is_next_next_element_new_line + && matches!( + children_iter.peek_next_next(), + Some(HtmlChild::NonText( + AnyHtmlElement::HtmlSelfClosingElement(_) + )) + ); + + !has_new_line_and_self_closing + && !is_next_next_element_self_closing + && next_word.is_single_character() + } else { + false + } + }; + + if is_soft_break { + multiline.write_separator(&soft_line_break(), f); + } else { + child_breaks = true; + multiline.write_separator(&hard_line_break(), f); + } + } + + // An empty line between some JSX text and an element + HtmlChild::EmptyLine => { + child_breaks = true; + + // Additional empty lines are not preserved when any of + // the children are a meaningful text node. + // + // <> + //
First
+ // + //
Second
+ // + // Third + // + // + // Becomes: + // + // <> + //
First
+ //
Second
+ // Third + // + if children_meta.meaningful_text { + multiline.write_separator(&hard_line_break(), f); + } else { + multiline.write_separator(&empty_line(), f); + } + } + + // Any child that isn't text + HtmlChild::NonText(non_text) => { + let line_mode = match children_iter.peek() { + Some(HtmlChild::Word(word)) => { + // Break if the current or next element is a self closing element + // ```javascript + //
adefg
+                            // ```
+                            // Becomes
+                            // ```javascript
+                            // 
+                            // adefg
+                            // ```
+                            if matches!(non_text, AnyHtmlElement::HtmlSelfClosingElement(_))
+                                && !word.is_single_character()
+                            {
+                                Some(LineMode::Hard)
+                            } else {
+                                Some(LineMode::Soft)
+                            }
+                        }
+
+                        // Add a hard line break if what comes after the element is not a text or is all whitespace
+                        Some(HtmlChild::NonText(_)) => Some(LineMode::Hard),
+
+                        Some(HtmlChild::Newline | HtmlChild::Whitespace | HtmlChild::EmptyLine) => {
+                            None
+                        }
+                        // Don't insert trailing line breaks
+                        None => None,
+                    };
+
+                    child_breaks = line_mode.map_or(false, |mode| mode.is_hard());
+
+                    let format_separator = line_mode.map(|mode| {
+                        format_with(move |f| f.write_element(FormatElement::Line(mode)))
+                    });
+
+                    if force_multiline {
+                        if let Some(format_separator) = format_separator {
+                            multiline.write_with_separator(
+                                &non_text.format(),
+                                &format_separator,
+                                f,
+                            );
+                        } else {
+                            // it's safe to write without a separator because None means that next element is a separator or end of the iterator
+                            multiline.write_content(&non_text.format(), f);
+                        }
+                    } else {
+                        let mut memoized = non_text.format().memoized();
+
+                        force_multiline = memoized.inspect(f)?.will_break();
+                        flat.write(&format_args![memoized, format_separator], f);
+
+                        if let Some(format_separator) = format_separator {
+                            multiline.write_with_separator(&memoized, &format_separator, f);
+                        } else {
+                            // it's safe to write without a separator because None means that next element is a separator or end of the iterator
+                            multiline.write_content(&memoized, f);
+                        }
+                    }
+                }
+            }
+
+            if child_breaks {
+                flat.disable();
+                force_multiline = true;
+            }
+
+            last = Some(child);
+        }
+
+        if force_multiline {
+            Ok(FormatChildrenResult::ForceMultiline(multiline.finish()?))
+        } else {
+            Ok(FormatChildrenResult::BestFitting {
+                flat_children: flat.finish()?,
+                expanded_children: multiline.finish()?,
+            })
+        }
+    }
+
+    /// Debug-build bookkeeping for the [HtmlContent] children of `node`.
+    ///
+    /// For every [HtmlContent] child this tracks its value token in the formatter state and
+    /// marks the node's suppression comments as checked (they get ignored).
+    ///
+    /// This is necessary because the formatting of [HtmlElementList] bypasses the node formatting
+    /// for [HtmlContent] and instead, formats the nodes itself.
+    ///
+    /// A child whose value token is missing has no token to track; it is skipped here instead of
+    /// being unwrapped, so a malformed tree cannot make this debug-only helper panic.
+    #[cfg(debug_assertions)]
+    fn disarm_debug_assertions(&self, node: &HtmlElementList, f: &mut HtmlFormatter) {
+        // Brings `comments()` into scope on the format context.
+        use biome_formatter::CstFormatContext;
+
+        for child in node {
+            // Only text content is formatted by hand; every other child is left alone.
+            if let AnyHtmlElement::HtmlContent(text) = child {
+                if let Ok(token) = text.value_token() {
+                    f.state_mut().track_token(&token);
+                }
+
+                // You can't suppress a text node
+                f.context()
+                    .comments()
+                    .mark_suppression_checked(text.syntax());
+            }
+        }
+    }
+
+    /// Release-build counterpart of the debug-only helper: intentionally does nothing.
+    #[cfg(not(debug_assertions))]
+    fn disarm_debug_assertions(&self, _node: &HtmlElementList, _f: &mut HtmlFormatter) {}
+
+    /// Resolves the layout to use for this list from the configured layout and `meta`.
+    ///
+    /// A configured `Multiline` layout is always kept. A configured `BestFitting` layout is
+    /// kept only when `meta` reports neither a tag nor multiple expressions; otherwise it is
+    /// upgraded to `Multiline`.
+    fn layout(&self, meta: ChildrenMeta) -> HtmlChildListLayout {
+        let already_multiline = matches!(self.layout, HtmlChildListLayout::Multiline);
+        let needs_multiline = meta.any_tag || meta.multiple_expressions;
+
+        if already_multiline || needs_multiline {
+            HtmlChildListLayout::Multiline
+        } else {
+            HtmlChildListLayout::BestFitting
+        }
+    }
+
+    /// Walks the children once and collects the facts that drive the layout decisions.
+    ///
+    /// * `any_tag` is set as soon as an element or a self-closing element is seen.
+    /// * `meaningful_text` is set once a content node's value token holds text that
+    ///   `is_meaningful_html_text` accepts; a missing value token counts as not meaningful.
+    fn children_meta(&self, list: &HtmlElementList) -> ChildrenMeta {
+        let mut any_tag = false;
+        let mut meaningful_text = false;
+
+        for child in list {
+            match child {
+                AnyHtmlElement::HtmlElement(_) | AnyHtmlElement::HtmlSelfClosingElement(_) => {
+                    any_tag = true;
+                }
+                // Once meaningful text has been found, further content nodes cannot change
+                // the answer, so they fall through to the catch-all arm unexamined.
+                AnyHtmlElement::HtmlContent(text) if !meaningful_text => {
+                    meaningful_text = text
+                        .value_token()
+                        .map_or(false, |token| is_meaningful_html_text(token.text()));
+                }
+                _ => {}
+            }
+        }
+
+        ChildrenMeta {
+            any_tag,
+            meaningful_text,
+            ..ChildrenMeta::default()
+        }
+    }
+}
+
+#[derive(Debug, Default, Copy, Clone)]
+pub enum HtmlChildListLayout {
+    /// Prefers to format the children on a single line if possible. This is the default layout.
+    #[default]
+    BestFitting,
+
+    /// Forces the children to be formatted over multiple lines.
+    Multiline,
+}
+
+impl HtmlChildListLayout {
+    /// Returns `true` only for the [`Multiline`](Self::Multiline) layout.
+    const fn is_multiline(&self) -> bool {
+        match self {
+            Self::Multiline => true,
+            Self::BestFitting => false,
+        }
+    }
+}
+
+#[derive(Copy, Clone, Debug, Default)]
+struct ChildrenMeta {
+    /// `true` if the children contain an `HtmlElement` or an `HtmlSelfClosingElement`.
+    any_tag: bool,
+
+    /// `true` if the children contain more than one expression child. NOTE(review): `children_meta` never assigns this, so it keeps its `false` default unless set elsewhere.
+    multiple_expressions: bool,
+
+    /// `true` if any `HtmlContent` child has a value token whose text `is_meaningful_html_text` accepts.
+    meaningful_text: bool,
+}
+
+#[derive(Copy, Clone, Debug)]
+enum WordSeparator {
+    /// Separator between two words. Creates a soft line break or space.
+    ///
+    /// `a b`
+    BetweenWords,
+
+    /// A separator of a word at the end of a [HtmlText] element. Either because it is the last
+    /// child in its parent OR it is right before the start of another child (element, expression, ...).
+    ///
+    /// ```javascript
+    /// 
a
; // last element of parent + ///
a
// last element before another element + ///
a{expression}
// last element before expression + /// ``` + /// + /// Creates a soft line break EXCEPT if the next element is a self closing element + /// or the previous word was an ascii punctuation, which results in a hard line break: + /// + /// ```javascript + /// a =
ab
; + /// + /// // becomes + /// + /// a = ( + ///
+ /// ab + ///
+ ///
+ /// ); + /// ``` + EndOfText { is_soft_line_break: bool }, +} + +impl WordSeparator { + /// Returns if formatting this separator will result in a child that expands + fn will_break(&self) -> bool { + matches!( + self, + WordSeparator::EndOfText { + is_soft_line_break: false, + } + ) + } +} + +impl Format for WordSeparator { + fn fmt(&self, f: &mut Formatter) -> FormatResult<()> { + match self { + WordSeparator::BetweenWords => soft_line_break_or_space().fmt(f), + WordSeparator::EndOfText { is_soft_line_break } => { + if *is_soft_line_break { + soft_line_break().fmt(f) + } + // ```javascript + //
ab
+ // ``` + // Becomes + // + // ```javascript + //
+ // ab + //
+ //
+ // ``` + else { + hard_line_break().fmt(f) + } + } + } + } +} + +#[derive(Copy, Clone, Debug, Default)] +enum MultilineLayout { + Fill, + #[default] + NoFill, +} + +/// Builder that helps to create the output for the multiline layout. +/// +/// The multiline layout may use [FormatElement::Fill] element that requires that its children +/// are an alternating sequence of `[element, separator, element, separator, ...]`. +/// +/// This requires that each element is wrapped inside of a list if it emits more than one element to uphold +/// the constraints of [FormatElement::Fill]. +/// +/// However, the wrapping is only necessary for [MultilineLayout::Fill] for when the [FormatElement::Fill] element is used. +/// +/// This builder takes care of doing the least amount of work necessary for the chosen layout while also guaranteeing +/// that the written element is valid +#[derive(Debug, Clone)] +struct MultilineBuilder { + layout: MultilineLayout, + result: FormatResult>, +} + +impl MultilineBuilder { + fn new(layout: MultilineLayout) -> Self { + Self { + layout, + result: Ok(Vec::new()), + } + } + + /// Formats an element that does not require a separator + /// It is safe to omit the separator because at the call side we must guarantee that we have reached the end of the iterator + /// or the next element is a space/newline that should be written into the separator "slot". 
+ fn write_content(&mut self, content: &dyn Format, f: &mut HtmlFormatter) { + self.write(content, None, f); + } + + /// Formatting a separator does not require any element in the separator slot + fn write_separator( + &mut self, + separator: &dyn Format, + f: &mut HtmlFormatter, + ) { + self.write(separator, None, f); + } + + fn write_with_separator( + &mut self, + content: &dyn Format, + separator: &dyn Format, + f: &mut HtmlFormatter, + ) { + self.write(content, Some(separator), f); + } + + fn write( + &mut self, + content: &dyn Format, + separator: Option<&dyn Format>, + f: &mut HtmlFormatter, + ) { + let result = std::mem::replace(&mut self.result, Ok(Vec::new())); + + self.result = result.and_then(|elements| { + let elements = { + let mut buffer = VecBuffer::new_with_vec(f.state_mut(), elements); + match self.layout { + MultilineLayout::Fill => { + // Make sure that the separator and content only ever write a single element + buffer.write_element(FormatElement::Tag(Tag::StartEntry))?; + write!(buffer, [content])?; + buffer.write_element(FormatElement::Tag(Tag::EndEntry))?; + + if let Some(separator) = separator { + buffer.write_element(FormatElement::Tag(Tag::StartEntry))?; + write!(buffer, [separator])?; + buffer.write_element(FormatElement::Tag(Tag::EndEntry))?; + } + } + MultilineLayout::NoFill => { + write!(buffer, [content, separator])?; + + if let Some(separator) = separator { + write!(buffer, [separator])?; + } + } + }; + buffer.into_vec() + }; + Ok(elements) + }) + } + + fn finish(self) -> FormatResult { + Ok(FormatMultilineChildren { + layout: self.layout, + elements: RefCell::new(self.result?), + }) + } +} + +#[derive(Debug)] +pub(crate) struct FormatMultilineChildren { + layout: MultilineLayout, + elements: RefCell>, +} + +impl Format for FormatMultilineChildren { + fn fmt(&self, f: &mut Formatter) -> FormatResult<()> { + let format_inner = format_once(|f| { + if let Some(elements) = f.intern_vec(self.elements.take()) { + match self.layout { + 
MultilineLayout::Fill => f.write_elements([ + FormatElement::Tag(Tag::StartFill), + elements, + FormatElement::Tag(Tag::EndFill), + ])?, + MultilineLayout::NoFill => f.write_elements([ + FormatElement::Tag(Tag::StartGroup( + tag::Group::new().with_mode(GroupMode::Expand), + )), + elements, + FormatElement::Tag(Tag::EndGroup), + ])?, + }; + } + + Ok(()) + }); + + // This indent is wrapped with a group to ensure that the print mode is + // set to `Expanded` when the group prints and will guarantee that the + // content _does not_ fit when printed as part of a `Fill`. Example: + //
+ // + // + // {" "} + // ({variable}) + //
+ // The `...` is the element that gets wrapped in the group + // by this line. Importantly, it contains a hard line break, and because + // [FitsMeasurer::fits_element] considers all hard lines as `Fits::Yes`, + // it will cause the element and the following separator to be printed + // in flat mode due to the logic of `Fill`. But because the we know the + // item breaks over multiple lines, we want it to _not_ fit and print + // both the content and the separator in Expanded mode, keeping the + // formatting as shown above. + // + // The `group` here allows us to opt-in to telling the `FitsMeasurer` + // that content that breaks shouldn't be considered flat and should be + // expanded. This is in contrast to something like a concise array fill, + // which _does_ allow breaks to fit and preserves density. + write!(f, [group(&block_indent(&format_inner))]) + } +} + +#[derive(Debug)] +struct FlatBuilder { + result: FormatResult>, + disabled: bool, +} + +impl FlatBuilder { + fn new() -> Self { + Self { + result: Ok(Vec::new()), + disabled: false, + } + } + + fn write(&mut self, content: &dyn Format, f: &mut HtmlFormatter) { + if self.disabled { + return; + } + + let result = std::mem::replace(&mut self.result, Ok(Vec::new())); + + self.result = result.and_then(|elements| { + let mut buffer = VecBuffer::new_with_vec(f.state_mut(), elements); + + write!(buffer, [content])?; + + Ok(buffer.into_vec()) + }) + } + + fn disable(&mut self) { + self.disabled = true; + } + + fn finish(self) -> FormatResult { + assert!(!self.disabled, "The flat builder has been disabled and thus, does no longer store any elements. 
Make sure you don't call disable if you later intend to format the flat content."); + + Ok(FormatFlatChildren { + elements: RefCell::new(self.result?), + }) + } +} + +#[derive(Debug)] +pub(crate) struct FormatFlatChildren { + elements: RefCell>, +} + +impl Format for FormatFlatChildren { + fn fmt(&self, f: &mut Formatter) -> FormatResult<()> { + if let Some(elements) = f.intern_vec(self.elements.take()) { + f.write_element(elements)?; + } + Ok(()) } } diff --git a/crates/biome_html_formatter/src/lib.rs b/crates/biome_html_formatter/src/lib.rs index 3b68dca55214..4d3ffa266564 100644 --- a/crates/biome_html_formatter/src/lib.rs +++ b/crates/biome_html_formatter/src/lib.rs @@ -14,6 +14,7 @@ mod cst; mod generated; mod html; pub(crate) mod prelude; +pub mod utils; /// Formats a Html file based on its features. /// diff --git a/crates/biome_html_formatter/src/utils/children.rs b/crates/biome_html_formatter/src/utils/children.rs new file mode 100644 index 000000000000..1ac4a5dee529 --- /dev/null +++ b/crates/biome_html_formatter/src/utils/children.rs @@ -0,0 +1,422 @@ +use std::{ + iter::{FusedIterator, Peekable}, + str::Chars, +}; + +use biome_formatter::{ + format_args, prelude::*, write, Buffer, Format, FormatElement, FormatResult, +}; +use biome_html_syntax::AnyHtmlElement; +use biome_rowan::{SyntaxResult, TextLen, TextRange, TextSize, TokenText}; + +use crate::{context::HtmlFormatContext, HtmlFormatter}; + +pub(crate) static HTML_WHITESPACE_CHARS: [char; 4] = [' ', '\n', '\t', '\r']; + +/// Meaningful HTML text is defined to be text that has either non-whitespace +/// characters, or does not contain a newline. Whitespace is defined as ASCII +/// whitespace. 
+/// +/// ``` +/// use biome_html_formatter::utils::is_meaningful_html_text; +/// +/// assert_eq!(is_meaningful_html_text(" \t\r "), true); +/// assert_eq!(is_meaningful_html_text(" \n\r "), false); +/// assert_eq!(is_meaningful_html_text(" Alien "), true); +/// assert_eq!(is_meaningful_html_text("\n Alien "), true); +/// assert_eq!(is_meaningful_html_text(" Alien \n"), true); +/// assert_eq!(is_meaningful_html_text(""), true); +/// ``` +pub fn is_meaningful_html_text(text: &str) -> bool { + let mut has_newline = false; + for c in text.chars() { + // If there is a non-whitespace character + if !HTML_WHITESPACE_CHARS.contains(&c) { + return true; + } else if c == '\n' { + has_newline = true; + } + } + + !has_newline +} + +/// A word in a Html Text. A word is string sequence that isn't separated by any HTML whitespace. +#[derive(Debug, Clone, Eq, PartialEq)] +pub(crate) struct HtmlWord { + text: TokenText, + source_position: TextSize, +} + +impl HtmlWord { + fn new(text: TokenText, source_position: TextSize) -> Self { + HtmlWord { + text, + source_position, + } + } + + pub(crate) fn is_single_character(&self) -> bool { + self.text.chars().count() == 1 + } +} + +impl Format for HtmlWord { + fn fmt(&self, f: &mut Formatter) -> FormatResult<()> { + f.write_element(FormatElement::LocatedTokenText { + source_position: self.source_position, + slice: self.text.clone(), + }) + } +} + +#[derive(Debug, Clone, Eq, PartialEq)] +pub(crate) enum HtmlChild { + /// A Single word in a HTML text. For example, the words for `a b\nc` are `[a, b, c]` + Word(HtmlWord), + + /// A ` ` whitespace + /// + /// ```html + ///
+ ///
a
+ ///
a
+ ///
a + /// b
+ /// ``` + /// + /// Whitespace between two words is not represented as whitespace + /// ```javascript + ///
a b
+ /// ``` + /// The space between `a` and `b` is not considered a whitespace. + Whitespace, + + /// A new line at the start or end of a [HtmlText] with meaningful content. (that isn't all whitespace + /// and contains a new line). + /// + /// ```html + ///
+ /// a + ///
+ /// ``` + Newline, + + /// A [HtmlText] that only consists of whitespace and has at least two line breaks; + /// + /// ```html + ///
+ /// + /// + ///
+ /// ``` + /// + /// The text between `
` and `` is an empty line text. + EmptyLine, + + /// Any other content that isn't a text. Should be formatted as is. + NonText(AnyHtmlElement), +} + +impl HtmlChild { + pub(crate) const fn is_any_line(&self) -> bool { + matches!(self, HtmlChild::EmptyLine | HtmlChild::Newline) + } +} + +/// Creates either a space using an expression child and a string literal, +/// or a regular space, depending on whether the group breaks or not. +/// +/// ```html +///
Winter Light
; +/// +///
+/// Winter Light +/// Through A Glass Darkly +/// The Silence +/// Seventh Seal +/// Wild Strawberries +///
+/// ``` +#[derive(Default)] +pub(crate) struct HtmlSpace; + +impl Format for HtmlSpace { + fn fmt(&self, formatter: &mut HtmlFormatter) -> FormatResult<()> { + write![ + formatter, + [ + if_group_breaks(&format_args![HtmlRawSpace, soft_line_break()]), + if_group_fits_on_line(&space()) + ] + ] + } +} + +pub(crate) struct HtmlRawSpace; + +impl Format for HtmlRawSpace { + fn fmt(&self, f: &mut Formatter) -> FormatResult<()> { + write!(f, [text(" ")]) + } +} + +pub(crate) fn html_split_children(children: I) -> SyntaxResult> +where + I: IntoIterator, +{ + let mut builder = HtmlSplitChildrenBuilder::new(); + + for child in children { + match child { + AnyHtmlElement::HtmlContent(text) => { + // Split the text into words + // Keep track if there's any leading/trailing empty line, new line or whitespace + + let value_token = text.value_token()?; + let mut chunks = HtmlSplitChunksIterator::new(value_token.text()).peekable(); + + // Text starting with a whitespace + if let Some((_, HtmlTextChunk::Whitespace(_whitespace))) = chunks.peek() { + match chunks.next() { + Some((_, HtmlTextChunk::Whitespace(whitespace))) => { + if whitespace.contains('\n') { + if chunks.peek().is_none() { + // A text only consisting of whitespace that also contains a new line isn't considered meaningful text. + // It can be entirely removed from the content without changing the semantics. + let newlines = + whitespace.chars().filter(|c| *c == '\n').count(); + + // Keep up to one blank line between tags/expressions and text. + // ```javascript + //
+ // + // + //
+ // ``` + if newlines > 1 { + builder.entry(HtmlChild::EmptyLine); + } + + continue; + } + + builder.entry(HtmlChild::Newline) + } else { + builder.entry(HtmlChild::Whitespace) + } + } + _ => unreachable!(), + } + } + + while let Some(chunk) = chunks.next() { + match chunk { + (_, HtmlTextChunk::Whitespace(whitespace)) => { + // Only handle trailing whitespace. Words must always be joined by new lines + if chunks.peek().is_none() { + if whitespace.contains('\n') { + builder.entry(HtmlChild::Newline); + } else { + builder.entry(HtmlChild::Whitespace) + } + } + } + + (relative_start, HtmlTextChunk::Word(word)) => { + let text = value_token + .token_text() + .slice(TextRange::at(relative_start, word.text_len())); + let source_position = value_token.text_range().start() + relative_start; + + builder.entry(HtmlChild::Word(HtmlWord::new(text, source_position))); + } + } + } + } + child => { + builder.entry(HtmlChild::NonText(child)); + } + } + } + + Ok(builder.finish()) +} + +/// The builder is used to: +/// 1. Remove [HtmlChild::EmptyLine], [HtmlChild::Newline], [HtmlChild::Whitespace] if a next element is [HtmlChild::Whitespace] +/// 2. 
Don't push a new element [HtmlChild::EmptyLine], [HtmlChild::Newline], [HtmlChild::Whitespace] if previous one is [HtmlChild::EmptyLine], [HtmlChild::Newline], [HtmlChild::Whitespace] +/// +/// [Prettier applies]: https://github.com/prettier/prettier/blob/b0d9387b95cdd4e9d50f5999d3be53b0b5d03a97/src/language-js/print/jsx.js#L144-L180 +#[derive(Debug)] +struct HtmlSplitChildrenBuilder { + buffer: Vec, +} + +impl HtmlSplitChildrenBuilder { + fn new() -> Self { + HtmlSplitChildrenBuilder { buffer: vec![] } + } + + fn entry(&mut self, child: HtmlChild) { + match self.buffer.last_mut() { + Some(last @ (HtmlChild::EmptyLine | HtmlChild::Newline | HtmlChild::Whitespace)) => { + if matches!(child, HtmlChild::Whitespace) { + *last = child; + } else if matches!(child, HtmlChild::NonText(_) | HtmlChild::Word(_)) { + self.buffer.push(child); + } + } + _ => self.buffer.push(child), + } + } + + fn finish(self) -> Vec { + self.buffer + } +} + +#[derive(Eq, PartialEq, Copy, Clone, Debug)] +enum HtmlTextChunk<'a> { + Whitespace(&'a str), + Word(&'a str), +} + +/// Splits a text into whitespace only and non-whitespace chunks. 
+/// +/// See `jsx_split_chunks_iterator` test for examples +struct HtmlSplitChunksIterator<'a> { + position: TextSize, + text: &'a str, + chars: Peekable>, +} + +impl<'a> HtmlSplitChunksIterator<'a> { + fn new(text: &'a str) -> Self { + Self { + position: TextSize::default(), + text, + chars: text.chars().peekable(), + } + } +} + +impl<'a> Iterator for HtmlSplitChunksIterator<'a> { + type Item = (TextSize, HtmlTextChunk<'a>); + + fn next(&mut self) -> Option { + let char = self.chars.next()?; + + let start = self.position; + self.position += char.text_len(); + + let is_whitespace = matches!(char, ' ' | '\n' | '\t' | '\r'); + + while let Some(next) = self.chars.peek() { + let next_is_whitespace = matches!(next, ' ' | '\n' | '\t' | '\r'); + + if is_whitespace != next_is_whitespace { + break; + } + + self.position += next.text_len(); + self.chars.next(); + } + + let range = TextRange::new(start, self.position); + let slice = &self.text[range]; + + let chunk = if is_whitespace { + HtmlTextChunk::Whitespace(slice) + } else { + HtmlTextChunk::Word(slice) + }; + + Some((start, chunk)) + } +} + +impl FusedIterator for HtmlSplitChunksIterator<'_> {} + +/// An iterator adaptor that allows a lookahead of three tokens +/// +/// # Examples +/// ``` +/// use biome_html_formatter::utils::HtmlChildrenIterator; +/// +/// let buffer = vec![1, 2, 3, 4]; +/// +/// let mut iter = HtmlChildrenIterator::new(buffer.iter()); +/// +/// assert_eq!(iter.peek(), Some(&&1)); +/// assert_eq!(iter.peek_next(), Some(&&2)); +/// assert_eq!(iter.peek_next_next(), Some(&&3)); +/// assert_eq!(iter.next(), Some(&1)); +/// assert_eq!(iter.next(), Some(&2)); +/// assert_eq!(iter.next(), Some(&3)); +/// ``` +#[derive(Clone, Debug)] +pub struct HtmlChildrenIterator { + iter: I, + + peeked: Option>, + peeked_next: Option>, + peeked_next_next: Option>, +} + +impl HtmlChildrenIterator { + pub fn new(iter: I) -> Self { + Self { + iter, + peeked: None, + peeked_next: None, + peeked_next_next: None, + } + } + + 
pub fn peek(&mut self) -> Option<&I::Item> { + let iter = &mut self.iter; + self.peeked.get_or_insert_with(|| iter.next()).as_ref() + } + + pub fn peek_next(&mut self) -> Option<&I::Item> { + let iter = &mut self.iter; + let peeked = &mut self.peeked; + + self.peeked_next + .get_or_insert_with(|| { + peeked.get_or_insert_with(|| iter.next()); + iter.next() + }) + .as_ref() + } + + pub fn peek_next_next(&mut self) -> Option<&I::Item> { + let iter = &mut self.iter; + let peeked = &mut self.peeked; + let peeked_next = &mut self.peeked_next; + + self.peeked_next_next + .get_or_insert_with(|| { + peeked.get_or_insert_with(|| iter.next()); + peeked_next.get_or_insert_with(|| iter.next()); + iter.next() + }) + .as_ref() + } +} + +impl Iterator for HtmlChildrenIterator { + type Item = I::Item; + + fn next(&mut self) -> Option { + match self.peeked.take() { + Some(peeked) => { + self.peeked = self.peeked_next.take(); + self.peeked_next = self.peeked_next_next.take(); + peeked + } + None => self.iter.next(), + } + } +} diff --git a/crates/biome_html_formatter/src/utils/mod.rs b/crates/biome_html_formatter/src/utils/mod.rs new file mode 100644 index 000000000000..397bb6841325 --- /dev/null +++ b/crates/biome_html_formatter/src/utils/mod.rs @@ -0,0 +1 @@ +pub mod children; diff --git a/crates/biome_html_formatter/tests/specs/long-content.html b/crates/biome_html_formatter/tests/specs/long-content.html new file mode 100644 index 000000000000..62c6ecdf825a --- /dev/null +++ b/crates/biome_html_formatter/tests/specs/long-content.html @@ -0,0 +1 @@ +
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed dapibus velit non justo tempus pretium. Praesent orci lorem, pellentesque sit amet ullamcorper sed, consectetur quis odio. In felis nulla, pellentesque a dolor eget, semper vestibulum nisi. Sed imperdiet dui a massa placerat ullamcorper. Praesent eleifend purus eget arcu faucibus, ac fringilla nulla aliquet. Aliquam id accumsan libero. Duis efficitur nisi quis massa mollis ultricies. Sed condimentum sit amet urna in cursus. Ut id tortor vestibulum, mollis enim sed, ornare ipsum. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Ut nec dapibus est. Maecenas orci purus, blandit eu faucibus eu, lacinia id turpis.
diff --git a/crates/biome_html_formatter/tests/specs/long-content.html.snap b/crates/biome_html_formatter/tests/specs/long-content.html.snap new file mode 100644 index 000000000000..6b8dec426b89 --- /dev/null +++ b/crates/biome_html_formatter/tests/specs/long-content.html.snap @@ -0,0 +1,39 @@ +--- +source: crates/biome_formatter_test/src/snapshot_builder.rs +info: long-content.html +--- +# Input + +```html +
Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed dapibus velit non justo tempus pretium. Praesent orci lorem, pellentesque sit amet ullamcorper sed, consectetur quis odio. In felis nulla, pellentesque a dolor eget, semper vestibulum nisi. Sed imperdiet dui a massa placerat ullamcorper. Praesent eleifend purus eget arcu faucibus, ac fringilla nulla aliquet. Aliquam id accumsan libero. Duis efficitur nisi quis massa mollis ultricies. Sed condimentum sit amet urna in cursus. Ut id tortor vestibulum, mollis enim sed, ornare ipsum. Class aptent taciti sociosqu ad litora torquent per conubia nostra, per inceptos himenaeos. Ut nec dapibus est. Maecenas orci purus, blandit eu faucibus eu, lacinia id turpis.
+ +``` + + +============================= + +# Outputs + +## Output 1 + +----- +Indent style: Tab +Indent width: 2 +Line ending: LF +Line width: 80 +Attribute Position: Auto +----- + +```html +
+ Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed dapibus velit non + justo tempus pretium. Praesent orci lorem, pellentesque sit amet ullamcorper + sed, consectetur quis odio. In felis nulla, pellentesque a dolor eget, semper + vestibulum nisi. Sed imperdiet dui a massa placerat ullamcorper. Praesent + eleifend purus eget arcu faucibus, ac fringilla nulla aliquet. Aliquam id + accumsan libero. Duis efficitur nisi quis massa mollis ultricies. Sed + condimentum sit amet urna in cursus. Ut id tortor vestibulum, mollis enim sed, + ornare ipsum. Class aptent taciti sociosqu ad litora torquent per conubia + nostra, per inceptos himenaeos. Ut nec dapibus est. Maecenas orci purus, + blandit eu faucibus eu, lacinia id turpis. +
``` diff --git a/crates/biome_html_formatter/tests/specs/many-children.html b/crates/biome_html_formatter/tests/specs/many-children.html new file mode 100644 index 000000000000..97fa531b7093 --- /dev/null +++ b/crates/biome_html_formatter/tests/specs/many-children.html @@ -0,0 +1,4 @@ +
+
Foo
Bar
Baz
Qux
+
Quux
Quuz
Corge
Grault
+
diff --git a/crates/biome_html_formatter/tests/specs/many-children.html.snap b/crates/biome_html_formatter/tests/specs/many-children.html.snap new file mode 100644 index 000000000000..92693478f3c8 --- /dev/null +++ b/crates/biome_html_formatter/tests/specs/many-children.html.snap @@ -0,0 +1,40 @@ +--- +source: crates/biome_formatter_test/src/snapshot_builder.rs +info: many-children.html +--- +# Input + +```html +
+
Foo
Bar
Baz
Qux
+
Quux
Quuz
Corge
Grault
+
+ +``` + + +============================= + +# Outputs + +## Output 1 + +----- +Indent style: Tab +Indent width: 2 +Line ending: LF +Line width: 80 +Attribute Position: Auto +----- + +```html +
+
Foo
+
Bar
+
Baz
+
Qux
+
Quux
+
Quuz
+
Corge
+
Grault
+
``` diff --git a/crates/biome_html_parser/src/lexer/tests.rs b/crates/biome_html_parser/src/lexer/tests.rs index 2db8b6357a47..046fe50b847e 100644 --- a/crates/biome_html_parser/src/lexer/tests.rs +++ b/crates/biome_html_parser/src/lexer/tests.rs @@ -149,6 +149,21 @@ fn element() { } } +#[test] +fn element_with_text() { + assert_lex! { + "
hello world
", + L_ANGLE: 1, + HTML_LITERAL: 3, + R_ANGLE: 1, + HTML_LITERAL: 11, + L_ANGLE: 1, + SLASH: 1, + HTML_LITERAL: 3, + R_ANGLE: 1, + } +} + #[test] fn doctype_with_quirk() { assert_lex! {