From 38ab9d2c819f19b06386ceb137fe438f59f0b11a Mon Sep 17 00:00:00 2001 From: David Langley Date: Wed, 2 Jul 2025 15:13:21 +0100 Subject: [PATCH 01/17] Add parsing support for html from external sources(pasteboard from google docs and ms word) --- crates/wysiwyg/Cargo.toml | 2 +- crates/wysiwyg/src/dom/parser/parse.rs | 199 ++++++++++++++++++++----- 2 files changed, 162 insertions(+), 39 deletions(-) diff --git a/crates/wysiwyg/Cargo.toml b/crates/wysiwyg/Cargo.toml index ebbd5880b..8d298a5a6 100644 --- a/crates/wysiwyg/Cargo.toml +++ b/crates/wysiwyg/Cargo.toml @@ -26,7 +26,7 @@ strum = "0.27" strum_macros = "0.27" unicode-segmentation = "1.7.1" wasm-bindgen = { version = "0.2.83", default-features = false, optional = true } -web-sys = { version = "0.3.60", default-features = false, features = ["Document", "DomParser", "HtmlElement", "Node", "NodeList", "SupportedType"], optional = true } +web-sys = { version = "0.3.60", default-features = false, features = ["Document", "DomParser", "HtmlElement", "Node", "NodeList", "SupportedType", "CssStyleDeclaration"], optional = true } widestring = "1.0.2" indoc = "2.0" url="2.3.1" diff --git a/crates/wysiwyg/src/dom/parser/parse.rs b/crates/wysiwyg/src/dom/parser/parse.rs index 743f35c9f..15300dbc7 100644 --- a/crates/wysiwyg/src/dom/parser/parse.rs +++ b/crates/wysiwyg/src/dom/parser/parse.rs @@ -1162,7 +1162,9 @@ mod js { use matrix_mentions::Mention; use std::fmt; use wasm_bindgen::JsCast; - use web_sys::{Document, DomParser, Element, NodeList, SupportedType}; + use web_sys::{ + Document, DomParser, Element, HtmlElement, NodeList, SupportedType, + }; pub(super) struct HtmlParser { current_path: Vec, @@ -1178,6 +1180,27 @@ mod js { &mut self, html: &str, ) -> Result, HtmlParseError> + where + S: UnicodeString, + { + self.parse_internal(html, false) + } + + pub(super) fn parse_from_external_html_source( + &mut self, + html: &str, + ) -> Result, HtmlParseError> + where + S: UnicodeString, + { + self.parse_internal(html, 
true) + } + + fn parse_internal( + &mut self, + html: &str, + external_html_source: bool, + ) -> Result, HtmlParseError> where S: UnicodeString, { @@ -1195,7 +1218,7 @@ mod js { ) })?; - self.webdom_to_dom(document) + self.webdom_to_dom(document, external_html_source) .map_err(to_dom_creation_error) .map(post_process_blocks) } @@ -1203,15 +1226,20 @@ mod js { fn webdom_to_dom( &mut self, webdoc: Document, + external_html_source: bool, ) -> Result, Error> where S: UnicodeString, { let body = webdoc.body().ok_or_else(|| Error::NoBody)?; - self.convert(body.child_nodes()) + self.convert(body.child_nodes(), external_html_source) } - fn convert(&mut self, nodes: NodeList) -> Result, Error> + fn convert( + &mut self, + nodes: NodeList, + external_html_source: bool, + ) -> Result, Error> where S: UnicodeString, { @@ -1219,7 +1247,7 @@ mod js { let mut dom = Dom::new(Vec::with_capacity(number_of_nodes)); let dom_document = dom.document_mut(); - self.convert_container(nodes, dom_document)?; + self.convert_container(nodes, dom_document, external_html_source)?; Ok(dom) } @@ -1228,6 +1256,7 @@ mod js { &mut self, nodes: NodeList, dom: &mut ContainerNode, + external_html_source: bool, ) -> Result<(), Error> where S: UnicodeString, @@ -1309,7 +1338,10 @@ mod js { ); } else { let children = self - .convert(node.child_nodes())? + .convert( + node.child_nodes(), + external_html_source, + )? .take_children(); dom.append_child(DomNode::new_link( url.into(), @@ -1329,8 +1361,11 @@ mod js { dom.append_child(DomNode::Container( ContainerNode::new_list( ListType::Ordered, - self.convert(node.child_nodes())? - .take_children(), + self.convert( + node.child_nodes(), + external_html_source, + )? + .take_children(), if let Some(custom_start) = custom_start { Some(vec![( "start".into(), @@ -1349,8 +1384,11 @@ mod js { dom.append_child(DomNode::Container( ContainerNode::new_list( ListType::Unordered, - self.convert(node.child_nodes())? 
- .take_children(), + self.convert( + node.child_nodes(), + external_html_source, + )? + .take_children(), None, ), )); @@ -1361,8 +1399,11 @@ mod js { self.current_path.push(DomNodeKind::ListItem); dom.append_child(DomNode::Container( ContainerNode::new_list_item( - self.convert(node.child_nodes())? - .take_children(), + self.convert( + node.child_nodes(), + external_html_source, + )? + .take_children(), ), )); self.current_path.pop(); @@ -1382,7 +1423,8 @@ mod js { }; dom.append_child(DomNode::Container( ContainerNode::new_code_block( - self.convert(children)?.take_children(), + self.convert(children, external_html_source)? + .take_children(), ), )); self.current_path.pop(); @@ -1392,8 +1434,11 @@ mod js { self.current_path.push(DomNodeKind::Quote); dom.append_child(DomNode::Container( ContainerNode::new_quote( - self.convert(node.child_nodes())? - .take_children(), + self.convert( + node.child_nodes(), + external_html_source, + )? + .take_children(), ), )); self.current_path.pop(); @@ -1403,40 +1448,87 @@ mod js { self.current_path.push(DomNodeKind::Paragraph); dom.append_child(DomNode::Container( ContainerNode::new_paragraph( - self.convert(node.child_nodes())? - .take_children(), + self.convert( + node.child_nodes(), + external_html_source, + )? + .take_children(), ), )); self.current_path.pop(); } - node_name => { - let children_nodes = - self.convert(node.child_nodes())?.take_children(); - + let children_nodes = self + .convert(node.child_nodes(), external_html_source)? 
+ .take_children(); let formatting_kind = match node_name { - "STRONG" | "B" => InlineFormatType::Bold, - "EM" | "I" => InlineFormatType::Italic, - "DEL" => InlineFormatType::StrikeThrough, - "U" => InlineFormatType::Underline, - "CODE" => InlineFormatType::InlineCode, + "STRONG" | "B" => Some(InlineFormatType::Bold), + "EM" | "I" => Some(InlineFormatType::Italic), + "DEL" => Some(InlineFormatType::StrikeThrough), + "U" => Some(InlineFormatType::Underline), + "CODE" => Some(InlineFormatType::InlineCode), + "SPAN" => { + if !external_html_source { + return Err(Error::UnknownNode( + node_name.to_owned(), + )); + } + let style = + node.unchecked_ref::().style(); + if style + .get_property_value("font-weight") + .unwrap_or_default() + == "bold" + { + Some(InlineFormatType::Bold) + } else if style + .get_property_value("font-style") + .unwrap_or_default() + == "italic" + { + Some(InlineFormatType::Italic) + } else if style + .get_property_value("text-decoration") + .unwrap_or_default() + == "underline" + { + Some(InlineFormatType::Underline) + } else if style + .get_property_value("text-decoration") + .unwrap_or_default() + == "line-through" + { + Some(InlineFormatType::StrikeThrough) + } else { + None + } + } _ => { - return Err(Error::UnknownNode( - node_name.to_owned(), - )) + if !external_html_source { + return Err(Error::UnknownNode( + node_name.to_owned(), + )); + } + None } }; - self.current_path.push(DomNodeKind::Formatting( - formatting_kind.clone(), - )); + if formatting_kind.is_none() { + if !children_nodes.is_empty() { + dom.append_children(children_nodes); + } + } else { + self.current_path.push(DomNodeKind::Formatting( + formatting_kind.clone().unwrap(), + )); - dom.append_child(DomNode::Container( - ContainerNode::new_formatting( - formatting_kind, - children_nodes, - ), - )); + dom.append_child(DomNode::Container( + ContainerNode::new_formatting( + formatting_kind.unwrap(), + children_nodes, + ), + )); + } self.current_path.pop(); } } @@ -1512,6 +1604,37 
@@ mod js { roundtrip("foo bar baz"); } + #[wasm_bindgen_test] + fn google_doc_rich_text() { + let html = r#" +
  • Italic

  • Bold

  • Unformatted

  • Strikethrough

  • Underlined

  • Linked

    • nested

+ "#; + let dom = HtmlParser::default() + .parse_from_external_html_source::(html) + .unwrap(); + assert_eq!(dom.to_string(), "
  • Italic

  • Bold

  • Unformatted

  • Strikethrough

  • Underlined

  • Linked

    • nested

"); + } + + #[wasm_bindgen_test] + fn ms_rich_text() { + let html = r#" +
  • Italic 

  • Bold 

  • Unformatted 

  • Strikethrough 

  • Underlined 

  • nested 

+ "#; + let dom = HtmlParser::default() + .parse_from_external_html_source::(html) + .unwrap(); + assert_eq!(dom.to_string(), "
  • Italic

  • Bold

  • Unformatted

  • Strikethrough

  • Underlined

  • nested

"); + } + + #[wasm_bindgen_test] + fn unknown_tag_errors() { + let html = r#" + Bold + "#; + let result = HtmlParser::default().parse::(html); + assert_eq!(result.is_err(), true); + } + #[wasm_bindgen_test] fn br() { let html = "foo
bar"; From 8a4837ad643ed822ed0d6d36ad5962885e6e4647 Mon Sep 17 00:00:00 2001 From: David Langley Date: Wed, 2 Jul 2025 20:12:03 +0100 Subject: [PATCH 02/17] Add contains_style function to check if a pa node contains a style attribute of a particular value --- .../src/dom/parser/panode_container.rs | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/crates/wysiwyg/src/dom/parser/panode_container.rs b/crates/wysiwyg/src/dom/parser/panode_container.rs index 4d83e8f18..ab4aa5de4 100644 --- a/crates/wysiwyg/src/dom/parser/panode_container.rs +++ b/crates/wysiwyg/src/dom/parser/panode_container.rs @@ -5,6 +5,7 @@ // Please see LICENSE in the repository root for full details. use html5ever::QualName; +use regex::Regex; use super::PaDomHandle; @@ -21,4 +22,30 @@ impl PaNodeContainer { .find(|(n, _v)| n == name) .map(|(_n, v)| v.as_str()) } + + pub(crate) fn contains_style(&self, name: &str, value: &str) -> bool { + return self + .get_attr("style") + .map(|v| { + return Regex::new(&format!( + r"(?i){}:\s*{};", + regex::escape(name), + regex::escape(value) + )) + .map(|re| re.is_match(v)) + .unwrap_or(false); + }) + .unwrap_or(false); + } } + +#[test] +fn test_contains_style() { + let node = PaNodeContainer { + name: QualName::new(None, "div".into(), "div".into()), + attrs: vec![("style".into(), "font-weight:bold;".into())], + children: Vec::new(), + }; + assert!(node.contains_style("font-weight", "bold")); + assert!(!node.contains_style("font-weight", "normal")); +} \ No newline at end of file From 90caa7f78f3fae558a5f06b913f93be85245c326 Mon Sep 17 00:00:00 2001 From: David Langley Date: Wed, 2 Jul 2025 20:14:54 +0100 Subject: [PATCH 03/17] Add support for external_html_source/span to sys parser and bring its error handling in line with js parser so that it returns an error if external_html_source is false and it encounters tags it doesn't support.
--- crates/wysiwyg/src/dom/parser/parse.rs | 302 +++++++++++++++++++++---- 1 file changed, 261 insertions(+), 41 deletions(-) diff --git a/crates/wysiwyg/src/dom/parser/parse.rs b/crates/wysiwyg/src/dom/parser/parse.rs index 15300dbc7..3687c44e4 100644 --- a/crates/wysiwyg/src/dom/parser/parse.rs +++ b/crates/wysiwyg/src/dom/parser/parse.rs @@ -27,8 +27,19 @@ where } } +#[cfg(test)] +const GOOGLE_DOC_HTML_PASTEBOARD: &str = r#" +
  • Italic

  • Bold

  • Unformatted

  • Strikethrough

  • Underlined

  • Linked

    • nested

+ "#; +#[cfg(test)] +const MS_DOC_HTML_PASTEBOARD: &str = r#" +
  • Italic 

  • Bold 

  • Unformatted 

  • Strikethrough 

  • Underlined 

  • nested 

+ "#; + #[cfg(feature = "sys")] mod sys { + use std::fmt; + use matrix_mentions::Mention; use super::super::padom_node::PaDomNode; @@ -58,14 +69,37 @@ mod sys { where S: UnicodeString, { - PaDomCreator::parse(html) - .map(|pa_dom| { - let dom = self.padom_to_dom(pa_dom); - post_process_blocks(dom) - }) - .map_err(|err| { - self.padom_creation_error_to_html_parse_error(err) - }) + return self.parse_internal(html, true); + } + + pub(super) fn parse_from_external_html_source( + &mut self, + html: &str, + ) -> Result, HtmlParseError> + where + S: UnicodeString, + { + self.parse_internal(html, true) + } + + pub(super) fn parse_internal( + &mut self, + html: &str, + external_html_source: bool, + ) -> Result, HtmlParseError> + where + S: UnicodeString, + { + let pa_dom = PaDomCreator::parse(html).map_err(|err| { + self.padom_creation_error_to_html_parse_error(err) + })?; + + let dom = self.padom_to_dom(pa_dom, external_html_source).map_err( + |err| HtmlParseError { + parse_errors: vec![err.to_string()], + }, + )?; + Ok(post_process_blocks(dom)) } /// Convert a [PaDom] into a [Dom]. @@ -80,7 +114,11 @@ mod sys { /// /// [Dom] is for general use. Parent nodes own their children, and Dom may be /// cloned, compared, and converted into an HTML string. - fn padom_to_dom(&mut self, padom: PaDom) -> Dom + fn padom_to_dom( + &mut self, + padom: PaDom, + external_html_source: bool, + ) -> Result, Error> where S: UnicodeString, { @@ -88,11 +126,11 @@ mod sys { let doc = ret.document_mut(); if let PaDomNode::Document(padoc) = padom.get_document() { - self.convert(&padom, padoc, doc) + self.convert(&padom, padoc, doc, external_html_source)?; } else { - panic!("Document was not a document!"); + return Err(Error::NoBody); } - ret + Ok(ret) } /// Copy all panode's information into node. 
@@ -101,14 +139,21 @@ mod sys { padom: &PaDom, panode: &PaNodeContainer, node: &mut ContainerNode, - ) where + external_html_source: bool, + ) -> Result<(), Error> + where S: UnicodeString, { for child_handle in &panode.children { let child = padom.get_node(child_handle); match child { PaDomNode::Container(child) => { - self.convert_container(padom, child, node); + self.convert_container( + padom, + child, + node, + external_html_source, + )?; } PaDomNode::Document(_) => { panic!("Found a document inside a document!") @@ -128,6 +173,7 @@ mod sys { } } } + Ok(()) } /// Copy all panode's information into node (now we know it's a container). @@ -136,7 +182,9 @@ mod sys { padom: &PaDom, child: &PaNodeContainer, node: &mut ContainerNode, - ) where + external_html_source: bool, + ) -> Result<(), Error> + where S: UnicodeString, { let cur_path_idx = self.current_path.len(); @@ -145,7 +193,12 @@ mod sys { "b" | "code" | "del" | "em" | "i" | "strong" | "u" => { let formatting_node = Self::new_formatting(tag); if tag == "code" && self.current_path.contains(&CodeBlock) { - self.convert_children(padom, child, Some(node)); + self.convert_children( + padom, + child, + Some(node), + external_html_source, + )?; } else { self.current_path.push(formatting_node.kind()); node.append_child(formatting_node); @@ -153,10 +206,51 @@ mod sys { padom, child, last_container_mut_in(node), - ); + external_html_source, + )?; self.current_path.remove(cur_path_idx); } } + "span" => { + let mut formatting_tag = None; + if child.contains_style("font-weight", "bold") { + formatting_tag = Some("b"); + } else if child.contains_style("font-style", "italic") { + formatting_tag = Some("i"); + } else if child + .contains_style("text-decoration", "underline") + { + formatting_tag = Some("u"); + } else if child + .contains_style("text-decoration", "line-through") + { + formatting_tag = Some("del"); + } + + if let Some(tag) = formatting_tag { + let formatting_node = Self::new_formatting(tag); + 
self.current_path.push(formatting_node.kind()); + node.append_child(formatting_node); + self.convert_children( + padom, + child, + last_container_mut_in(node), + external_html_source, + )?; + self.current_path.remove(cur_path_idx); + } else { + if external_html_source { + self.convert( + padom, + child, + node, + external_html_source, + )?; + } else { + return Err(Error::UnknownNode(tag.to_string())); + } + } + } "br" => { node.append_child(Self::new_line_break()); } @@ -174,7 +268,8 @@ mod sys { padom, child, last_container_mut_in(node), - ); + external_html_source, + )?; self.current_path.remove(cur_path_idx); } "li" => { @@ -184,7 +279,8 @@ mod sys { padom, child, last_container_mut_in(node), - ); + external_html_source, + )?; self.current_path.remove(cur_path_idx); } "a" => { @@ -212,7 +308,8 @@ mod sys { padom, child, last_container_mut_in(node), - ); + external_html_source, + )?; } self.current_path.remove(cur_path_idx); } @@ -223,7 +320,8 @@ mod sys { padom, child, last_container_mut_in(node), - ); + external_html_source, + )?; self.current_path.remove(cur_path_idx); } "blockquote" => { @@ -233,14 +331,15 @@ mod sys { padom, child, last_container_mut_in(node), - ); + external_html_source, + )?; self.current_path.remove(cur_path_idx); } "html" => { // Skip the html tag - add its children to the // current node directly. 
- self.convert(padom, child, node); + self.convert(padom, child, node, external_html_source)?; } "p" => { self.current_path.push(DomNodeKind::Paragraph); @@ -249,14 +348,19 @@ mod sys { padom, child, last_container_mut_in(node), - ); + external_html_source, + )?; self.current_path.remove(cur_path_idx); } _ => { - // Ignore tags we don't recognise - // We should log - see internal task PSU-741 + if external_html_source { + self.convert(padom, child, node, external_html_source)?; + } else { + return Err(Error::UnknownNode(tag.to_string())); + } } }; + Ok(()) } /// Recurse into panode's children and convert them too @@ -265,14 +369,17 @@ mod sys { padom: &PaDom, child: &PaNodeContainer, new_node: Option<&mut ContainerNode>, - ) where + external_html_source: bool, + ) -> Result<(), Error> + where S: UnicodeString, { if let Some(new_node) = new_node { - self.convert(padom, child, new_node); + self.convert(padom, child, new_node, external_html_source)?; } else { panic!("Container became non-container!"); } + Ok(()) } /// Create a formatting node @@ -403,6 +510,27 @@ mod sys { } } + enum Error { + NoBody, + UnknownNode(String), + } + + impl fmt::Display for Error { + fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::NoBody => { + write!( + formatter, + "The `Document` does not have a `` element" + ) + } + Self::UnknownNode(node_name) => { + write!(formatter, "Node `{node_name}` is not supported") + } + } + } + } + #[cfg(test)] mod test { use crate::dom::parser::parse::sys::HtmlParser; @@ -790,10 +918,9 @@ mod sys { fn parse_code_block_post_processes_it() { let mut parser = HtmlParser::default(); let html = "
Test\nCode
"; - let dom: Dom = PaDomCreator::parse(html) - .map(|pa_dom| parser.padom_to_dom(pa_dom)) - .ok() - .unwrap(); + let pa_dom = PaDomCreator::parse(html).unwrap(); + let dom: Dom = + parser.padom_to_dom(pa_dom, false).ok().unwrap(); // First, line breaks are added as placeholders for paragraphs assert_eq!( dom.to_html().to_string(), @@ -936,6 +1063,100 @@ mod sys { "#} ); } + + #[test] + fn parse_google_doc_rich_text() { + let dom: Dom = HtmlParser::default() + .parse(GOOGLE_DOC_HTML_PASTEBOARD) + .unwrap(); + let tree = dom.to_tree().to_string(); + assert_eq!( + tree, + indoc! { + r#" + + └>ul + ├>li + │ └>p + │ └>i + │ └>"Italic" + ├>li + │ └>p + │ └>"Bold" + ├>li + │ └>p + │ └>"Unformatted" + ├>li + │ └>p + │ └>del + │ └>"Strikethrough" + ├>li + │ └>p + │ └>u + │ └>"Underlined" + ├>li + │ └>p + │ └>a "http://matrix.org" + │ └>u + │ └>"Linked" + └>ul + └>li + └>p + └>u + └>"nested" + "# + } + ); + } + + #[test] + fn parse_ms_doc_rich_text() { + let dom: Dom = + HtmlParser::default().parse(MS_DOC_HTML_PASTEBOARD).unwrap(); + let tree = dom.to_tree().to_string(); + assert_eq!( + tree, + indoc! { + r#" + + ├>ul + │ └>li + │ └>p + │ └>i + │ └>"Italic" + ├>ul + │ └>li + │ └>p + │ └>b + │ └>"Bold" + ├>ul + │ └>li + │ └>p + │ └>"Unformatted" + ├>ul + │ └>li + │ └>p + │ └>del + │ └>"Strikethrough" + ├>ul + │ └>li + │ └>p + │ └>u + │ └>"Underlined" + ├>ul + │ └>li + │ └>p + │ └>a "https://matrix.org/" + │ └>u + │ └>"Linked" + └>ul + └>li + └>p + └>"nested" + "# + } + ); + } } } @@ -1517,6 +1738,7 @@ mod js { if !children_nodes.is_empty() { dom.append_children(children_nodes); } + self.current_path.pop(); } else { self.current_path.push(DomNodeKind::Formatting( formatting_kind.clone().unwrap(), @@ -1528,8 +1750,8 @@ mod js { children_nodes, ), )); + self.current_path.pop(); } - self.current_path.pop(); } } } @@ -1606,22 +1828,20 @@ mod js { #[wasm_bindgen_test] fn google_doc_rich_text() { - let html = r#" -
  • Italic

  • Bold

  • Unformatted

  • Strikethrough

  • Underlined

  • Linked

    • nested

- "#; let dom = HtmlParser::default() - .parse_from_external_html_source::(html) + .parse_from_external_html_source::( + GOOGLE_DOC_HTML_PASTEBOARD, + ) .unwrap(); assert_eq!(dom.to_string(), "
  • Italic

  • Bold

  • Unformatted

  • Strikethrough

  • Underlined

  • Linked

    • nested

"); } #[wasm_bindgen_test] fn ms_rich_text() { - let html = r#" -
  • Italic 

  • Bold 

  • Unformatted 

  • Strikethrough 

  • Underlined 

  • nested 

- "#; let dom = HtmlParser::default() - .parse_from_external_html_source::(html) + .parse_from_external_html_source::( + MS_DOC_HTML_PASTEBOARD, + ) .unwrap(); assert_eq!(dom.to_string(), "
  • Italic

  • Bold

  • Unformatted

  • Strikethrough

  • Underlined

  • nested

"); } From d83a3fba303e75f5a1f9f3b8b18802817ff303d3 Mon Sep 17 00:00:00 2001 From: David Langley Date: Thu, 3 Jul 2025 20:30:50 +0100 Subject: [PATCH 04/17] Add top level functions and e2e test --- crates/wysiwyg/src/composer_model.rs | 1 + .../src/composer_model/replace_html.rs | 52 +++++++++++++ crates/wysiwyg/src/dom/parser.rs | 1 + crates/wysiwyg/src/dom/parser/parse.rs | 78 +++++++++++-------- .../web/cypress/e2e/clipboard/paste.spec.ts | 26 +++++++ 5 files changed, 127 insertions(+), 31 deletions(-) create mode 100644 crates/wysiwyg/src/composer_model/replace_html.rs diff --git a/crates/wysiwyg/src/composer_model.rs b/crates/wysiwyg/src/composer_model.rs index e655a9174..835003dbb 100644 --- a/crates/wysiwyg/src/composer_model.rs +++ b/crates/wysiwyg/src/composer_model.rs @@ -17,6 +17,7 @@ pub mod menu_action; pub mod menu_state; pub mod new_lines; pub mod quotes; +pub mod replace_html; pub mod replace_text; pub mod selection; pub mod undo_redo; diff --git a/crates/wysiwyg/src/composer_model/replace_html.rs b/crates/wysiwyg/src/composer_model/replace_html.rs new file mode 100644 index 000000000..946248842 --- /dev/null +++ b/crates/wysiwyg/src/composer_model/replace_html.rs @@ -0,0 +1,52 @@ +// Copyright 2024 New Vector Ltd. +// Copyright 2022 The Matrix.org Foundation C.I.C. +// +// SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial +// Please see LICENSE in the repository root for full details. + +use crate::dom::nodes::{ContainerNodeKind, DomNode}; +use crate::dom::parser::parse_from_external_html_source; +use crate::{parse, ComposerModel, ComposerUpdate, Location, UnicodeString}; + +impl ComposerModel +where + S: UnicodeString, +{ + /// Replaces text in the current selection with new_html. + /// Treats its input as html that is parsed into a DomNode and inserted into + /// the document at the cursor. 
+ pub fn replace_html( + &mut self, + new_html: S, + from_external_source: bool, + ) -> ComposerUpdate { + self.push_state_to_history(); + if self.has_selection() { + self.do_replace_text(S::default()); + } + let result = if from_external_source { + parse_from_external_html_source(&new_html.to_string()) + } else { + parse(&new_html.to_string()) + }; + + let dom = result.unwrap().into_document_node(); + + let (start, end) = self.safe_selection(); + let range = self.state.dom.find_range(start, end); + + let new_cursor_index = start + dom.text_len(); + let handle = self.state.dom.insert_node_at_cursor(&range, dom); + + // manually move the cursor to the end of the html + self.state.start = Location::from(new_cursor_index); + self.state.end = self.state.start; + + // add a trailing space in cases when we do not have a next sibling + if self.state.dom.is_last_in_parent(&handle) { + self.do_replace_text(" ".into()) + } else { + self.create_update_replace_all() + } + } +} diff --git a/crates/wysiwyg/src/dom/parser.rs b/crates/wysiwyg/src/dom/parser.rs index e7c79b458..0c2d24eaf 100644 --- a/crates/wysiwyg/src/dom/parser.rs +++ b/crates/wysiwyg/src/dom/parser.rs @@ -49,3 +49,4 @@ mod sys { use sys::*; pub use parse::parse; +pub use parse::parse_from_external_html_source; diff --git a/crates/wysiwyg/src/dom/parser/parse.rs b/crates/wysiwyg/src/dom/parser/parse.rs index 3687c44e4..a8fd59571 100644 --- a/crates/wysiwyg/src/dom/parser/parse.rs +++ b/crates/wysiwyg/src/dom/parser/parse.rs @@ -27,10 +27,27 @@ where } } +pub fn parse_from_external_html_source( + html: &str, +) -> Result, HtmlParseError> +where + S: UnicodeString, +{ + cfg_if::cfg_if! 
{ + if #[cfg(feature = "sys")] { + sys::HtmlParser::default().parse_from_external_html_source(html) + } else if #[cfg(all(feature = "js", target_arch = "wasm32"))] { + js::HtmlParser::default().parse_from_external_html_source(html) + } else { + unreachable!("The `sys` or `js` are mutually exclusive, and one of them must be enabled.") + } + } +} + #[cfg(test)] const GOOGLE_DOC_HTML_PASTEBOARD: &str = r#" -
  • Italic

  • Bold

  • Unformatted

  • Strikethrough

  • Underlined

  • Linked

    • nested

- "#; +
  • Italic

  • Bold

  • Unformatted

  • Strikethrough

  • Underlined

  • Linked

    • nested

+ "#; #[cfg(test)] const MS_DOC_HTML_PASTEBOARD: &str = r#"
  • Italic 

  • Bold 

  • Unformatted 

  • Strikethrough 

  • Underlined 

  • nested 

@@ -1074,36 +1091,35 @@ mod sys { tree, indoc! { r#" - + + └>ul + ├>li + │ └>p + │ └>i + │ └>"Italic" + ├>li + │ └>p + │ └>"Bold" + ├>li + │ └>p + │ └>"Unformatted" + ├>li + │ └>p + │ └>del + │ └>"Strikethrough" + ├>li + │ └>p + │ └>u + │ └>"Underlined" + ├>li + │ └>p + │ └>a "http://matrix.org" + │ └>u + │ └>"Linked" └>ul - ├>li - │ └>p - │ └>i - │ └>"Italic" - ├>li - │ └>p - │ └>"Bold" - ├>li - │ └>p - │ └>"Unformatted" - ├>li - │ └>p - │ └>del - │ └>"Strikethrough" - ├>li - │ └>p - │ └>u - │ └>"Underlined" - ├>li - │ └>p - │ └>a "http://matrix.org" - │ └>u - │ └>"Linked" - └>ul └>li - └>p - └>u - └>"nested" + └>p + └>"nested" "# } ); @@ -1833,7 +1849,7 @@ mod js { GOOGLE_DOC_HTML_PASTEBOARD, ) .unwrap(); - assert_eq!(dom.to_string(), "
  • Italic

  • Bold

  • Unformatted

  • Strikethrough

  • Underlined

  • Linked

    • nested

"); + assert_eq!(dom.to_string(), "
  • Italic

  • Bold

  • Unformatted

  • Strikethrough

  • Underlined

  • Linked

    • nested

"); } #[wasm_bindgen_test] diff --git a/platforms/web/cypress/e2e/clipboard/paste.spec.ts b/platforms/web/cypress/e2e/clipboard/paste.spec.ts index 698b8ba16..b46fd7603 100644 --- a/platforms/web/cypress/e2e/clipboard/paste.spec.ts +++ b/platforms/web/cypress/e2e/clipboard/paste.spec.ts @@ -35,4 +35,30 @@ describe('Paste', () => { // Note: we used to test it 'should convert pasted newlines into BRs' but // the test was flakey, sometimes correctly showing text containing br tags, // and sometimes mysteriously showing converted into two divs. + + it( + 'should display pasted richtext after we type', + { browser: 'electron' }, + () => { + cy.visit('/'); + cy.get(editor).wait(500); + cy.get(editor).type('BEFORE'); + cy.contains(editor, 'BEFORE'); + + cy.window().its('navigator.clipboard') + .then(async (clip) => { + const blob = new Blob(["link"], {type: 'text/html'}); + const item = new ClipboardItem({'text/html': blob}); + return await clip.write([item]); + }) + + cy.log("item"); + cy.document().invoke('execCommand', 'paste'); + cy.contains(editor, 'BEFORElink'); + + cy.get(editor).type('AFTER'); + cy.contains(editor, /^BEFORElink AFTER/); + }, + ); + }); From 9744b3a8ce500de716755d0ef7cc0ed075a1d967 Mon Sep 17 00:00:00 2001 From: David Langley Date: Fri, 4 Jul 2025 12:10:04 +0100 Subject: [PATCH 05/17] Add HtmlSource and stripping of meta tags and the outer b tag for google docs. 
--- .../src/composer_model/replace_html.rs | 110 +++++++-- crates/wysiwyg/src/dom.rs | 2 + crates/wysiwyg/src/dom/html_source.rs | 6 + crates/wysiwyg/src/dom/parser.rs | 2 +- crates/wysiwyg/src/dom/parser/parse.rs | 211 +++++++++--------- crates/wysiwyg/src/lib.rs | 1 + platforms/web/lib/composer.ts | 22 +- 7 files changed, 226 insertions(+), 128 deletions(-) create mode 100644 crates/wysiwyg/src/dom/html_source.rs diff --git a/crates/wysiwyg/src/composer_model/replace_html.rs b/crates/wysiwyg/src/composer_model/replace_html.rs index 946248842..22cf3f3c2 100644 --- a/crates/wysiwyg/src/composer_model/replace_html.rs +++ b/crates/wysiwyg/src/composer_model/replace_html.rs @@ -4,8 +4,10 @@ // SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial // Please see LICENSE in the repository root for full details. -use crate::dom::nodes::{ContainerNodeKind, DomNode}; -use crate::dom::parser::parse_from_external_html_source; +use regex::Regex; + +use crate::dom::html_source::HtmlSource; +use crate::dom::parser::parse_from_source; use crate::{parse, ComposerModel, ComposerUpdate, Location, UnicodeString}; impl ComposerModel @@ -18,35 +20,105 @@ where pub fn replace_html( &mut self, new_html: S, - from_external_source: bool, + external_source: HtmlSource, ) -> ComposerUpdate { self.push_state_to_history(); if self.has_selection() { self.do_replace_text(S::default()); } - let result = if from_external_source { - parse_from_external_html_source(&new_html.to_string()) + + let meta_regex = Regex::new(r"]*>").unwrap(); + let mut cleaned_html = meta_regex + .replace_all(&new_html.to_string(), "") + .to_string(); + + if external_source == HtmlSource::GoogleDoc { + // Strip first b tag (opening and closing) + let b_regex = Regex::new(r"]*>(.*)<\/b>").unwrap(); + cleaned_html = b_regex.replace(&cleaned_html, "$1").to_string(); + } + + println!("cleaned_html: {}", cleaned_html); + let result = if external_source == HtmlSource::Matrix { + 
parse(&cleaned_html.to_string()) } else { - parse(&new_html.to_string()) + parse_from_source(&cleaned_html.to_string(), external_source) }; - let dom = result.unwrap().into_document_node(); + // We should have only one top level dom node, so add each of the children at the cursor. + let dom_children = result.unwrap().into_container().take_children(); - let (start, end) = self.safe_selection(); - let range = self.state.dom.find_range(start, end); + for node in dom_children.iter() { + let (start, end) = self.safe_selection(); + let range = self.state.dom.find_range(start, end); - let new_cursor_index = start + dom.text_len(); - let handle = self.state.dom.insert_node_at_cursor(&range, dom); + let new_cursor_index = start + node.text_len(); + let _ = self.state.dom.insert_node_at_cursor(&range, node.clone()); - // manually move the cursor to the end of the html - self.state.start = Location::from(new_cursor_index); - self.state.end = self.state.start; + // manually move the cursor to the end of the html + self.state.start = Location::from(new_cursor_index); + self.state.end = self.state.start; + } // add a trailing space in cases when we do not have a next sibling - if self.state.dom.is_last_in_parent(&handle) { - self.do_replace_text(" ".into()) - } else { - self.create_update_replace_all() - } + self.create_update_replace_all() + } +} + +#[cfg(test)] +const GOOGLE_DOC_HTML_PASTEBOARD: &str = r#"test"#; +#[cfg(test)] +const MS_DOC_HTML_PASTEBOARD: &str = r#"test "#; + +// ...existing code... 
+ +#[cfg(test)] +mod test { + use super::*; + use crate::dom::html_source::HtmlSource; + use crate::tests::testutils_composer_model::cm; + + #[test] + fn test_replace_html_strips_meta_tags_google_docs() { + let mut model = cm("|"); + + let _ = model.replace_html( + GOOGLE_DOC_HTML_PASTEBOARD.into(), + HtmlSource::GoogleDoc, + ); + + // Verify the HTML doesn't contain meta or the outer b tag + let html = model.get_content_as_html(); + let html_str = html.to_string(); + assert!(!html_str.contains("test

"); } } diff --git a/crates/wysiwyg/src/dom.rs b/crates/wysiwyg/src/dom.rs index d0ae3cb34..5bad8e8b1 100644 --- a/crates/wysiwyg/src/dom.rs +++ b/crates/wysiwyg/src/dom.rs @@ -15,6 +15,7 @@ pub mod dom_struct; pub mod find_extended_range; pub mod find_range; pub mod find_result; +pub mod html_source; pub mod insert_node_at_cursor; pub mod insert_parent; pub mod iter; @@ -35,6 +36,7 @@ pub use dom_creation_error::MarkdownParseError; pub use dom_handle::DomHandle; pub use dom_struct::Dom; pub use find_result::FindResult; +pub use html_source::HtmlSource; pub use range::DomLocation; pub use range::Range; pub use to_html::ToHtml; diff --git a/crates/wysiwyg/src/dom/html_source.rs b/crates/wysiwyg/src/dom/html_source.rs new file mode 100644 index 000000000..7707b0b83 --- /dev/null +++ b/crates/wysiwyg/src/dom/html_source.rs @@ -0,0 +1,6 @@ +#[derive(Copy, Clone, PartialEq, Eq)] +pub enum HtmlSource { + Matrix, + GoogleDoc, + UnknownExternal, +} diff --git a/crates/wysiwyg/src/dom/parser.rs b/crates/wysiwyg/src/dom/parser.rs index 0c2d24eaf..c845f1951 100644 --- a/crates/wysiwyg/src/dom/parser.rs +++ b/crates/wysiwyg/src/dom/parser.rs @@ -49,4 +49,4 @@ mod sys { use sys::*; pub use parse::parse; -pub use parse::parse_from_external_html_source; +pub use parse::parse_from_source; diff --git a/crates/wysiwyg/src/dom/parser/parse.rs b/crates/wysiwyg/src/dom/parser/parse.rs index a8fd59571..d639ae855 100644 --- a/crates/wysiwyg/src/dom/parser/parse.rs +++ b/crates/wysiwyg/src/dom/parser/parse.rs @@ -7,6 +7,7 @@ use regex::Regex; use crate::dom::dom_creation_error::HtmlParseError; +use crate::dom::html_source::HtmlSource; use crate::dom::nodes::dom_node::DomNodeKind::{self}; use crate::dom::nodes::{ContainerNode, ContainerNodeKind}; use crate::dom::Dom; @@ -27,17 +28,18 @@ where } } -pub fn parse_from_external_html_source( +pub fn parse_from_source( html: &str, + source: HtmlSource, ) -> Result, HtmlParseError> where S: UnicodeString, { cfg_if::cfg_if! 
{ if #[cfg(feature = "sys")] { - sys::HtmlParser::default().parse_from_external_html_source(html) + sys::HtmlParser::default().parse_from_source(html, source) } else if #[cfg(all(feature = "js", target_arch = "wasm32"))] { - js::HtmlParser::default().parse_from_external_html_source(html) + js::HtmlParser::default().parse_from_source(html, source) } else { unreachable!("The `sys` or `js` are mutually exclusive, and one of them must be enabled.") } @@ -86,23 +88,24 @@ mod sys { where S: UnicodeString, { - return self.parse_internal(html, true); + return self.parse_internal(html, HtmlSource::Matrix); } - pub(super) fn parse_from_external_html_source( + pub(super) fn parse_from_source( &mut self, html: &str, + source: HtmlSource, ) -> Result, HtmlParseError> where S: UnicodeString, { - self.parse_internal(html, true) + self.parse_internal(html, source) } pub(super) fn parse_internal( &mut self, html: &str, - external_html_source: bool, + html_source: HtmlSource, ) -> Result, HtmlParseError> where S: UnicodeString, @@ -111,11 +114,12 @@ mod sys { self.padom_creation_error_to_html_parse_error(err) })?; - let dom = self.padom_to_dom(pa_dom, external_html_source).map_err( - |err| HtmlParseError { - parse_errors: vec![err.to_string()], - }, - )?; + let dom = + self.padom_to_dom(pa_dom, html_source).map_err(|err| { + HtmlParseError { + parse_errors: vec![err.to_string()], + } + })?; Ok(post_process_blocks(dom)) } @@ -134,7 +138,7 @@ mod sys { fn padom_to_dom( &mut self, padom: PaDom, - external_html_source: bool, + html_source: HtmlSource, ) -> Result, Error> where S: UnicodeString, @@ -143,7 +147,7 @@ mod sys { let doc = ret.document_mut(); if let PaDomNode::Document(padoc) = padom.get_document() { - self.convert(&padom, padoc, doc, external_html_source)?; + self.convert(&padom, padoc, doc, html_source)?; } else { return Err(Error::NoBody); } @@ -156,7 +160,7 @@ mod sys { padom: &PaDom, panode: &PaNodeContainer, node: &mut ContainerNode, - external_html_source: bool, + 
html_source: HtmlSource, ) -> Result<(), Error> where S: UnicodeString, @@ -169,7 +173,7 @@ mod sys { padom, child, node, - external_html_source, + html_source, )?; } PaDomNode::Document(_) => { @@ -199,7 +203,7 @@ mod sys { padom: &PaDom, child: &PaNodeContainer, node: &mut ContainerNode, - external_html_source: bool, + html_source: HtmlSource, ) -> Result<(), Error> where S: UnicodeString, @@ -214,7 +218,7 @@ mod sys { padom, child, Some(node), - external_html_source, + html_source, )?; } else { self.current_path.push(formatting_node.kind()); @@ -223,7 +227,7 @@ mod sys { padom, child, last_container_mut_in(node), - external_html_source, + html_source, )?; self.current_path.remove(cur_path_idx); } @@ -252,19 +256,14 @@ mod sys { padom, child, last_container_mut_in(node), - external_html_source, + html_source, )?; self.current_path.remove(cur_path_idx); } else { - if external_html_source { - self.convert( - padom, - child, - node, - external_html_source, - )?; - } else { + if html_source == HtmlSource::Matrix { return Err(Error::UnknownNode(tag.to_string())); + } else { + self.convert(padom, child, node, html_source)?; } } } @@ -285,7 +284,7 @@ mod sys { padom, child, last_container_mut_in(node), - external_html_source, + html_source, )?; self.current_path.remove(cur_path_idx); } @@ -296,7 +295,7 @@ mod sys { padom, child, last_container_mut_in(node), - external_html_source, + html_source, )?; self.current_path.remove(cur_path_idx); } @@ -325,7 +324,7 @@ mod sys { padom, child, last_container_mut_in(node), - external_html_source, + html_source, )?; } self.current_path.remove(cur_path_idx); @@ -337,7 +336,7 @@ mod sys { padom, child, last_container_mut_in(node), - external_html_source, + html_source, )?; self.current_path.remove(cur_path_idx); } @@ -348,7 +347,7 @@ mod sys { padom, child, last_container_mut_in(node), - external_html_source, + html_source, )?; self.current_path.remove(cur_path_idx); @@ -356,7 +355,7 @@ mod sys { "html" => { // Skip the html tag - 
add its children to the // current node directly. - self.convert(padom, child, node, external_html_source)?; + self.convert(padom, child, node, html_source)?; } "p" => { self.current_path.push(DomNodeKind::Paragraph); @@ -365,15 +364,15 @@ mod sys { padom, child, last_container_mut_in(node), - external_html_source, + html_source, )?; self.current_path.remove(cur_path_idx); } _ => { - if external_html_source { - self.convert(padom, child, node, external_html_source)?; - } else { + if html_source == HtmlSource::Matrix { return Err(Error::UnknownNode(tag.to_string())); + } else { + self.convert(padom, child, node, html_source)?; } } }; @@ -386,13 +385,13 @@ mod sys { padom: &PaDom, child: &PaNodeContainer, new_node: Option<&mut ContainerNode>, - external_html_source: bool, + html_source: HtmlSource, ) -> Result<(), Error> where S: UnicodeString, { if let Some(new_node) = new_node { - self.convert(padom, child, new_node, external_html_source)?; + self.convert(padom, child, new_node, html_source)?; } else { panic!("Container became non-container!"); } @@ -936,8 +935,10 @@ mod sys { let mut parser = HtmlParser::default(); let html = "
Test\nCode
"; let pa_dom = PaDomCreator::parse(html).unwrap(); - let dom: Dom = - parser.padom_to_dom(pa_dom, false).ok().unwrap(); + let dom: Dom = parser + .padom_to_dom(pa_dom, HtmlSource::Matrix) + .ok() + .unwrap(); // First, line breaks are added as placeholders for paragraphs assert_eq!( dom.to_html().to_string(), @@ -1084,41 +1085,44 @@ mod sys { #[test] fn parse_google_doc_rich_text() { let dom: Dom = HtmlParser::default() - .parse(GOOGLE_DOC_HTML_PASTEBOARD) + .parse_from_source( + GOOGLE_DOC_HTML_PASTEBOARD, + HtmlSource::GoogleDoc, + ) .unwrap(); let tree = dom.to_tree().to_string(); assert_eq!( tree, indoc! { r#" - - └>ul - ├>li - │ └>p - │ └>i - │ └>"Italic" - ├>li - │ └>p - │ └>"Bold" - ├>li - │ └>p - │ └>"Unformatted" - ├>li - │ └>p - │ └>del - │ └>"Strikethrough" - ├>li - │ └>p - │ └>u - │ └>"Underlined" - ├>li - │ └>p - │ └>a "http://matrix.org" - │ └>u - │ └>"Linked" + └>ul + ├>li + │ └>p + │ └>i + │ └>"Italic" + ├>li + │ └>p + │ └>"Bold" + ├>li + │ └>p + │ └>"Unformatted" + ├>li + │ └>p + │ └>del + │ └>"Strikethrough" + ├>li + │ └>p + │ └>u + │ └>"Underlined" + ├>li + │ └>p + │ └>a "http://matrix.org" + │ └>u + │ └>"Linked" + └>ul └>li - └>p + └>p └>"nested" "# } @@ -1127,8 +1131,12 @@ mod sys { #[test] fn parse_ms_doc_rich_text() { - let dom: Dom = - HtmlParser::default().parse(MS_DOC_HTML_PASTEBOARD).unwrap(); + let dom: Dom = HtmlParser::default() + .parse_from_source( + MS_DOC_HTML_PASTEBOARD, + HtmlSource::UnknownExternal, + ) + .unwrap(); let tree = dom.to_tree().to_string(); assert_eq!( tree, @@ -1420,23 +1428,24 @@ mod js { where S: UnicodeString, { - self.parse_internal(html, false) + self.parse_internal(html, HtmlSource::Matrix) } - pub(super) fn parse_from_external_html_source( + pub(super) fn parse_from_source( &mut self, html: &str, + html_source: HtmlSource, ) -> Result, HtmlParseError> where S: UnicodeString, { - self.parse_internal(html, true) + self.parse_internal(html, html_source) } fn parse_internal( &mut self, html: &str, - 
external_html_source: bool, + html_source: HtmlSource, ) -> Result, HtmlParseError> where S: UnicodeString, @@ -1455,7 +1464,7 @@ mod js { ) })?; - self.webdom_to_dom(document, external_html_source) + self.webdom_to_dom(document, html_source) .map_err(to_dom_creation_error) .map(post_process_blocks) } @@ -1463,19 +1472,19 @@ mod js { fn webdom_to_dom( &mut self, webdoc: Document, - external_html_source: bool, + html_source: HtmlSource, ) -> Result, Error> where S: UnicodeString, { let body = webdoc.body().ok_or_else(|| Error::NoBody)?; - self.convert(body.child_nodes(), external_html_source) + self.convert(body.child_nodes(), html_source) } fn convert( &mut self, nodes: NodeList, - external_html_source: bool, + html_source: HtmlSource, ) -> Result, Error> where S: UnicodeString, @@ -1484,7 +1493,7 @@ mod js { let mut dom = Dom::new(Vec::with_capacity(number_of_nodes)); let dom_document = dom.document_mut(); - self.convert_container(nodes, dom_document, external_html_source)?; + self.convert_container(nodes, dom_document, html_source)?; Ok(dom) } @@ -1493,7 +1502,7 @@ mod js { &mut self, nodes: NodeList, dom: &mut ContainerNode, - external_html_source: bool, + html_source: HtmlSource, ) -> Result<(), Error> where S: UnicodeString, @@ -1575,10 +1584,7 @@ mod js { ); } else { let children = self - .convert( - node.child_nodes(), - external_html_source, - )? + .convert(node.child_nodes(), html_source)? .take_children(); dom.append_child(DomNode::new_link( url.into(), @@ -1636,11 +1642,8 @@ mod js { self.current_path.push(DomNodeKind::ListItem); dom.append_child(DomNode::Container( ContainerNode::new_list_item( - self.convert( - node.child_nodes(), - external_html_source, - )? - .take_children(), + self.convert(node.child_nodes(), html_source)? + .take_children(), ), )); self.current_path.pop(); @@ -1660,7 +1663,7 @@ mod js { }; dom.append_child(DomNode::Container( ContainerNode::new_code_block( - self.convert(children, external_html_source)? 
+ self.convert(children, html_source)? .take_children(), ), )); @@ -1671,11 +1674,8 @@ mod js { self.current_path.push(DomNodeKind::Quote); dom.append_child(DomNode::Container( ContainerNode::new_quote( - self.convert( - node.child_nodes(), - external_html_source, - )? - .take_children(), + self.convert(node.child_nodes(), html_source)? + .take_children(), ), )); self.current_path.pop(); @@ -1685,18 +1685,15 @@ mod js { self.current_path.push(DomNodeKind::Paragraph); dom.append_child(DomNode::Container( ContainerNode::new_paragraph( - self.convert( - node.child_nodes(), - external_html_source, - )? - .take_children(), + self.convert(node.child_nodes(), html_source)? + .take_children(), ), )); self.current_path.pop(); } node_name => { let children_nodes = self - .convert(node.child_nodes(), external_html_source)? + .convert(node.child_nodes(), html_source)? .take_children(); let formatting_kind = match node_name { "STRONG" | "B" => Some(InlineFormatType::Bold), @@ -1705,7 +1702,7 @@ mod js { "U" => Some(InlineFormatType::Underline), "CODE" => Some(InlineFormatType::InlineCode), "SPAN" => { - if !external_html_source { + if html_source == HtmlSource::Matrix { return Err(Error::UnknownNode( node_name.to_owned(), )); @@ -1741,7 +1738,7 @@ mod js { } } _ => { - if !external_html_source { + if html_source == HtmlSource::Matrix { return Err(Error::UnknownNode( node_name.to_owned(), )); @@ -1845,8 +1842,9 @@ mod js { #[wasm_bindgen_test] fn google_doc_rich_text() { let dom = HtmlParser::default() - .parse_from_external_html_source::( + .parse_from_source::( GOOGLE_DOC_HTML_PASTEBOARD, + HtmlSource::GoogleDoc, ) .unwrap(); assert_eq!(dom.to_string(), "
  • Italic

  • Bold

  • Unformatted

  • Strikethrough

  • Underlined

  • Linked

    • nested

"); @@ -1855,8 +1853,9 @@ mod js { #[wasm_bindgen_test] fn ms_rich_text() { let dom = HtmlParser::default() - .parse_from_external_html_source::( + .parse_from_source::( MS_DOC_HTML_PASTEBOARD, + HtmlSource::UnknownExternal, ) .unwrap(); assert_eq!(dom.to_string(), "
  • Italic

  • Bold

  • Unformatted

  • Strikethrough

  • Underlined

  • nested

"); diff --git a/crates/wysiwyg/src/lib.rs b/crates/wysiwyg/src/lib.rs index 79a4d33e3..2319b6994 100644 --- a/crates/wysiwyg/src/lib.rs +++ b/crates/wysiwyg/src/lib.rs @@ -33,6 +33,7 @@ pub use crate::dom::parser::parse; pub use crate::dom::DomCreationError; pub use crate::dom::DomHandle; pub use crate::dom::HtmlParseError; +pub use crate::dom::HtmlSource; pub use crate::dom::MarkdownParseError; pub use crate::dom::ToHtml; pub use crate::dom::ToRawText; diff --git a/platforms/web/lib/composer.ts b/platforms/web/lib/composer.ts index 56ad75a16..19f890db5 100644 --- a/platforms/web/lib/composer.ts +++ b/platforms/web/lib/composer.ts @@ -9,6 +9,7 @@ Please see LICENSE in the repository root for full details. import { ComposerModel, ComposerUpdate, + HtmlSource, SuggestionPattern, } from '@vector-im/matrix-wysiwyg-wasm'; @@ -65,8 +66,25 @@ export function processInput( } if (isClipboardEvent(event)) { - const data = event.clipboardData?.getData('text/plain') ?? ''; - return action(composerModel.replace_text(data), 'paste'); + const clipboardData = event.clipboardData; + const htmlData = clipboardData?.getData('text/html'); + const plainData = clipboardData?.getData('text/plain') ?? ''; + + if (htmlData && htmlData !== plainData) { + const htmlSource = clipboardData?.types.includes( + 'application/x-vnd.google-docs-document-slice-clip+wrapped', + ) ? 
HtmlSource.GoogleDoc : HtmlSource.UnknownExternal; + return action( + composerModel.replace_html(htmlData, htmlSource), + 'replace_html_paste', + htmlData, + ); + } + return action( + composerModel.replace_text(plainData), + 'replace_text_paste', + plainData, + ); } switch (event.inputType) { From f8e526a293585b764e9aeb06dca5c767cbc73807 Mon Sep 17 00:00:00 2001 From: David Langley Date: Wed, 16 Jul 2025 12:09:53 +0100 Subject: [PATCH 06/17] Update parse.rs --- crates/wysiwyg/src/dom/parser/parse.rs | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/crates/wysiwyg/src/dom/parser/parse.rs b/crates/wysiwyg/src/dom/parser/parse.rs index d639ae855..1eb38c139 100644 --- a/crates/wysiwyg/src/dom/parser/parse.rs +++ b/crates/wysiwyg/src/dom/parser/parse.rs @@ -1604,11 +1604,8 @@ mod js { dom.append_child(DomNode::Container( ContainerNode::new_list( ListType::Ordered, - self.convert( - node.child_nodes(), - external_html_source, - )? - .take_children(), + self.convert(node.child_nodes(), html_source)? + .take_children(), if let Some(custom_start) = custom_start { Some(vec![( "start".into(), @@ -1627,11 +1624,8 @@ mod js { dom.append_child(DomNode::Container( ContainerNode::new_list( ListType::Unordered, - self.convert( - node.child_nodes(), - external_html_source, - )? - .take_children(), + self.convert(node.child_nodes(), html_source)? 
+ .take_children(), None, ), )); From f8da65adb2d3fdc590fda9f934b08e965f755814 Mon Sep 17 00:00:00 2001 From: David Langley Date: Fri, 4 Jul 2025 12:42:13 +0100 Subject: [PATCH 07/17] Fix e2e test --- platforms/web/cypress/e2e/clipboard/paste.spec.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/platforms/web/cypress/e2e/clipboard/paste.spec.ts b/platforms/web/cypress/e2e/clipboard/paste.spec.ts index b46fd7603..44fb9a04e 100644 --- a/platforms/web/cypress/e2e/clipboard/paste.spec.ts +++ b/platforms/web/cypress/e2e/clipboard/paste.spec.ts @@ -57,7 +57,7 @@ describe('Paste', () => { cy.contains(editor, 'BEFORElink'); cy.get(editor).type('AFTER'); - cy.contains(editor, /^BEFORElink AFTER/); + cy.contains(editor, /^BEFORElinkAFTER/); }, ); From 974b899c0d078629f8c958edc8f4c237ad54129a Mon Sep 17 00:00:00 2001 From: David Langley Date: Tue, 15 Jul 2025 16:41:04 +0100 Subject: [PATCH 08/17] Support nested lists for google docs(adds the ul/ol to the li's children, not the li/ol's). Also adds post processing to cleanup sibling text nodes. 
--- bindings/wysiwyg-wasm/src/lib.rs | 29 ++ .../src/composer_model/replace_html.rs | 46 +-- crates/wysiwyg/src/dom/dom_methods.rs | 7 +- crates/wysiwyg/src/dom/parser/parse.rs | 264 ++++++++++++++---- crates/wysiwyg/src/tests/test_deleting.rs | 10 +- 5 files changed, 274 insertions(+), 82 deletions(-) diff --git a/bindings/wysiwyg-wasm/src/lib.rs b/bindings/wysiwyg-wasm/src/lib.rs index 9117c33db..e4bee0db2 100644 --- a/bindings/wysiwyg-wasm/src/lib.rs +++ b/bindings/wysiwyg-wasm/src/lib.rs @@ -187,6 +187,17 @@ impl ComposerModel { ) } + pub fn replace_html( + &mut self, + new_html: &str, + external_source: HtmlSource, + ) -> ComposerUpdate { + ComposerUpdate::from(self.inner.replace_html( + Utf16String::from_str(new_html), + external_source.into(), + )) + } + pub fn replace_text_suggestion( &mut self, new_text: &str, @@ -914,6 +925,24 @@ impl From> for LinkAction { } } +#[wasm_bindgen] +#[derive(Clone)] +pub enum HtmlSource { + Matrix, + GoogleDoc, + UnknownExternal, +} + +impl From for wysiwyg::HtmlSource { + fn from(source: HtmlSource) -> Self { + match source { + HtmlSource::Matrix => Self::Matrix, + HtmlSource::GoogleDoc => Self::GoogleDoc, + HtmlSource::UnknownExternal => Self::UnknownExternal, + } + } +} + #[cfg(test)] mod test { use super::ComposerModel; diff --git a/crates/wysiwyg/src/composer_model/replace_html.rs b/crates/wysiwyg/src/composer_model/replace_html.rs index 22cf3f3c2..d9ee660f7 100644 --- a/crates/wysiwyg/src/composer_model/replace_html.rs +++ b/crates/wysiwyg/src/composer_model/replace_html.rs @@ -7,8 +7,12 @@ use regex::Regex; use crate::dom::html_source::HtmlSource; +use crate::dom::nodes::ContainerNode; use crate::dom::parser::parse_from_source; -use crate::{parse, ComposerModel, ComposerUpdate, Location, UnicodeString}; + +use crate::{ + parse, ComposerModel, ComposerUpdate, DomNode, Location, UnicodeString, +}; impl ComposerModel where @@ -38,39 +42,39 @@ where cleaned_html = b_regex.replace(&cleaned_html, "$1").to_string(); } - 
println!("cleaned_html: {}", cleaned_html); let result = if external_source == HtmlSource::Matrix { parse(&cleaned_html.to_string()) } else { parse_from_source(&cleaned_html.to_string(), external_source) }; - // We should have only one top level dom node, so add each of the children at the cursor. - let dom_children = result.unwrap().into_container().take_children(); - - for node in dom_children.iter() { - let (start, end) = self.safe_selection(); - let range = self.state.dom.find_range(start, end); + let doc_node = result.unwrap().into_document_node(); + let (start, end) = self.safe_selection(); + let range = self.state.dom.find_range(start, end); - let new_cursor_index = start + node.text_len(); - let _ = self.state.dom.insert_node_at_cursor(&range, node.clone()); + let p = DomNode::Container(ContainerNode::new_paragraph( + doc_node.into_container().unwrap().take_children(), + )); - // manually move the cursor to the end of the html - self.state.start = Location::from(new_cursor_index); - self.state.end = self.state.start; - } + let new_cursor_index = start + p.text_len(); + let handle = self.state.dom.insert_node_at_cursor(&range, p); + self.state.dom.replace_node_with_its_children(&handle); - // add a trailing space in cases when we do not have a next sibling + // manually move the cursor to the end of the html + self.state.start = Location::from(new_cursor_index); + self.state.end = self.state.start; self.create_update_replace_all() } } #[cfg(test)] -const GOOGLE_DOC_HTML_PASTEBOARD: &str = r#"test"#; +const GOOGLE_DOC_HTML_PASTEBOARD: &str = r#" +
  • Italic

  • Bold

  • Unformatted

  • Strikethrough

  • Underlined

  • Linked

    • nested

+ "#; #[cfg(test)] -const MS_DOC_HTML_PASTEBOARD: &str = r#"test "#; - -// ...existing code... +const MS_DOC_HTML_PASTEBOARD: &str = r#" +
  • Italic 

  • Bold 

  • Unformatted 

  • Strikethrough 

  • Underlined 

  • Nested

+ "#; #[cfg(test)] mod test { @@ -92,6 +96,7 @@ mod test { let html_str = html.to_string(); assert!(!html_str.contains("
  • Italic

  • Bold

  • Unformatted

  • Strikethrough

  • Underlined

  • Linked

    • nested

  • "); } #[test] @@ -106,8 +111,7 @@ mod test { let html = model.get_content_as_html(); let html_str = html.to_string(); assert!(!html_str.contains("
  • Italic

  • Bold

  • Unformatted

  • Strikethrough

  • Underlined

  • Linked

  • Nested

  • "); } #[test] diff --git a/crates/wysiwyg/src/dom/dom_methods.rs b/crates/wysiwyg/src/dom/dom_methods.rs index c5eb653e5..f4a1f3293 100644 --- a/crates/wysiwyg/src/dom/dom_methods.rs +++ b/crates/wysiwyg/src/dom/dom_methods.rs @@ -696,9 +696,11 @@ where self.merge_text_nodes_around(&first_location.node_handle); } } + #[cfg(any(test, feature = "assert-invariants"))] + self.assert_invariants(); } - fn merge_text_nodes_around(&mut self, handle: &DomHandle) { + pub fn merge_text_nodes_around(&mut self, handle: &DomHandle) { // TODO: make this method not public because it is used to make // the invariants true, instead of assuming they are true at the // beginning! @@ -710,9 +712,6 @@ where merge_if_adjacent_text_nodes(parent, idx - 1); } merge_if_adjacent_text_nodes(parent, idx); - - #[cfg(any(test, feature = "assert-invariants"))] - self.assert_invariants(); } /// Recursively visit container nodes, looking for block nodes and, if they contain a diff --git a/crates/wysiwyg/src/dom/parser/parse.rs b/crates/wysiwyg/src/dom/parser/parse.rs index 1eb38c139..3fad573c2 100644 --- a/crates/wysiwyg/src/dom/parser/parse.rs +++ b/crates/wysiwyg/src/dom/parser/parse.rs @@ -11,7 +11,7 @@ use crate::dom::html_source::HtmlSource; use crate::dom::nodes::dom_node::DomNodeKind::{self}; use crate::dom::nodes::{ContainerNode, ContainerNodeKind}; use crate::dom::Dom; -use crate::{DomHandle, DomNode, UnicodeString}; +use crate::{DomHandle, DomNode, ToTree, UnicodeString}; pub fn parse(html: &str) -> Result, HtmlParseError> where @@ -52,7 +52,7 @@ const GOOGLE_DOC_HTML_PASTEBOARD: &str = r#" "#; #[cfg(test)] const MS_DOC_HTML_PASTEBOARD: &str = r#" -
    • Italic 

    • Bold 

    • Unformatted 

    • Strikethrough 

    • Underlined 

    • nested 

    +
    • Italic 

    • Bold 

    • Unformatted 

    • Strikethrough 

    • Underlined 

    • Nested

    "#; #[cfg(feature = "sys")] @@ -120,7 +120,10 @@ mod sys { parse_errors: vec![err.to_string()], } })?; - Ok(post_process_blocks(dom)) + let dom_blocks_done = post_process_blocks(dom); + let dom_adjacted_text_done = + post_process_for_adjacent_text(dom_blocks_done); + Ok(dom_adjacted_text_done) } /// Convert a [PaDom] into a [Dom]. @@ -272,18 +275,33 @@ mod sys { } "ol" | "ul" => { self.current_path.push(DomNodeKind::List); + + let target_node = if node.is_list() { + if html_source != HtmlSource::GoogleDoc + || node.last_child_mut().is_none() + { + return Err(Error::InvalidListItemNode); + } + node.last_child_mut() + .unwrap() + .as_container_mut() + .unwrap() + } else { + node + }; if tag == "ol" { let custom_start = child .get_attr("start") .and_then(|start| start.parse::().ok()); - node.append_child(Self::new_ordered_list(custom_start)); + target_node + .append_child(Self::new_ordered_list(custom_start)); } else { - node.append_child(Self::new_unordered_list()); + target_node.append_child(Self::new_unordered_list()); } self.convert_children( padom, child, - last_container_mut_in(node), + last_container_mut_in(target_node), html_source, )?; self.current_path.remove(cur_path_idx); @@ -529,6 +547,7 @@ mod sys { enum Error { NoBody, UnknownNode(String), + InvalidListItemNode, } impl fmt::Display for Error { @@ -543,6 +562,12 @@ mod sys { Self::UnknownNode(node_name) => { write!(formatter, "Node `{node_name}` is not supported") } + Self::InvalidListItemNode => { + write!( + formatter, + "Invalid list item node: a list must only contain list items" + ) + } } } } @@ -557,7 +582,7 @@ mod sys { use super::*; use crate::tests::testutils_composer_model::restore_whitespace; - use crate::{ToHtml, ToTree}; + use crate::{ToHtml, ToMarkdown, ToTree}; trait Roundtrips { fn roundtrips(&self); @@ -1095,7 +1120,7 @@ mod sys { tree, indoc! 
{ r#" - + └>ul ├>li │ └>p @@ -1115,18 +1140,28 @@ mod sys { │ └>p │ └>u │ └>"Underlined" - ├>li - │ └>p - │ └>a "http://matrix.org" - │ └>u - │ └>"Linked" - └>ul - └>li - └>p - └>"nested" + └>li + ├>p + │ └>a "http://matrix.org" + │ └>u + │ └>"Linked" + └>ul + └>li + └>p + └>"nested" "# } ); + assert_eq!( + dom.to_markdown().unwrap().to_string(), + r#"* *Italic* +* Bold +* Unformatted +* ~~Strikethrough~~ +* Underlined +* [Linked]() + * nested"# + ); } #[test] @@ -1176,7 +1211,7 @@ mod sys { └>ul └>li └>p - └>"nested" + └>"Nested" "# } ); @@ -1184,6 +1219,32 @@ mod sys { } } +fn post_process_for_adjacent_text(mut dom: Dom) -> Dom { + println!( + "Post-processing adjacent text nodes: {}", + dom.to_tree().to_string() + ); + let text_handles = find_text_nodes(&dom); + for handle in text_handles.iter().rev() { + dom = post_process_adjacent_text(dom, handle); + } + dom +} + +fn find_text_nodes(dom: &Dom) -> Vec { + dom.iter() + .filter(|n| n.is_text_node()) + .map(|n| n.handle()) + .collect::>() +} + +fn post_process_adjacent_text( + mut dom: Dom, + handle: &DomHandle, +) -> Dom { + dom.merge_text_nodes_around(handle); + dom +} fn post_process_blocks(mut dom: Dom) -> Dom { let block_handles = find_blocks(&dom); for handle in block_handles.iter().rev() { @@ -1359,8 +1420,7 @@ fn convert_text( for (i, str) in text_nodes.into_iter().enumerate() { let is_nbsp = str == "\u{A0}" || str == " "; if !str.is_empty() && !is_nbsp { - let text_node = DomNode::new_text(str.into()); - node.append_child(text_node); + node.append_child(DomNode::new_text(str.into())); } if i + 1 < text_nodes_len { node.append_child(DomNode::new_line_break()); @@ -1406,6 +1466,7 @@ mod js { }; use matrix_mentions::Mention; use std::fmt; + use wasm_bindgen::JsCast; use web_sys::{ Document, DomParser, Element, HtmlElement, NodeList, SupportedType, @@ -1467,6 +1528,7 @@ mod js { self.webdom_to_dom(document, html_source) .map_err(to_dom_creation_error) .map(post_process_blocks) + 
.map(post_process_for_adjacent_text) } fn webdom_to_dom( @@ -1478,12 +1540,13 @@ mod js { S: UnicodeString, { let body = webdoc.body().ok_or_else(|| Error::NoBody)?; - self.convert(body.child_nodes(), html_source) + self.convert(body.child_nodes(), DomNodeKind::Generic, html_source) } fn convert( &mut self, nodes: NodeList, + parent_kind: DomNodeKind, html_source: HtmlSource, ) -> Result, Error> where @@ -1493,7 +1556,12 @@ mod js { let mut dom = Dom::new(Vec::with_capacity(number_of_nodes)); let dom_document = dom.document_mut(); - self.convert_container(nodes, dom_document, html_source)?; + self.convert_container( + nodes, + dom_document, + parent_kind, + html_source, + )?; Ok(dom) } @@ -1502,6 +1570,7 @@ mod js { &mut self, nodes: NodeList, dom: &mut ContainerNode, + parent_kind: DomNodeKind, html_source: HtmlSource, ) -> Result<(), Error> where @@ -1584,7 +1653,11 @@ mod js { ); } else { let children = self - .convert(node.child_nodes(), html_source)? + .convert( + node.child_nodes(), + DomNodeKind::Link, + html_source, + )? .take_children(); dom.append_child(DomNode::new_link( url.into(), @@ -1601,11 +1674,16 @@ mod js { .unchecked_ref::() .get_attribute("start"); self.current_path.push(DomNodeKind::List); + dom.append_child(DomNode::Container( ContainerNode::new_list( ListType::Ordered, - self.convert(node.child_nodes(), html_source)? - .take_children(), + self.convert( + node.child_nodes(), + DomNodeKind::List, + html_source, + )? + .take_children(), if let Some(custom_start) = custom_start { Some(vec![( "start".into(), @@ -1621,14 +1699,43 @@ mod js { "UL" => { self.current_path.push(DomNodeKind::List); - dom.append_child(DomNode::Container( - ContainerNode::new_list( - ListType::Unordered, - self.convert(node.child_nodes(), html_source)? + // TODO We should pass the parent kind in so that we can bail out if a non-list item is being added to it's children. 
+ if parent_kind == DomNodeKind::List { + if html_source != HtmlSource::GoogleDoc { + return Err(Error::InvalidListItemNode); + } + let target = dom + .last_child_mut() + .unwrap() + .as_container_mut() + .unwrap(); + target.append_child(DomNode::Container( + ContainerNode::new_list( + ListType::Unordered, + self.convert( + node.child_nodes(), + DomNodeKind::List, + html_source, + )? .take_children(), - None, - ), - )); + None, + ), + )); + } else { + dom.append_child(DomNode::Container( + ContainerNode::new_list( + ListType::Unordered, + self.convert( + node.child_nodes(), + DomNodeKind::List, + html_source, + )? + .take_children(), + None, + ), + )); + } + self.current_path.pop(); } @@ -1636,8 +1743,12 @@ mod js { self.current_path.push(DomNodeKind::ListItem); dom.append_child(DomNode::Container( ContainerNode::new_list_item( - self.convert(node.child_nodes(), html_source)? - .take_children(), + self.convert( + node.child_nodes(), + DomNodeKind::ListItem, + html_source, + )? + .take_children(), ), )); self.current_path.pop(); @@ -1657,8 +1768,12 @@ mod js { }; dom.append_child(DomNode::Container( ContainerNode::new_code_block( - self.convert(children, html_source)? - .take_children(), + self.convert( + children, + DomNodeKind::CodeBlock, + html_source, + )? + .take_children(), ), )); self.current_path.pop(); @@ -1668,8 +1783,12 @@ mod js { self.current_path.push(DomNodeKind::Quote); dom.append_child(DomNode::Container( ContainerNode::new_quote( - self.convert(node.child_nodes(), html_source)? - .take_children(), + self.convert( + node.child_nodes(), + DomNodeKind::Quote, + html_source, + )? + .take_children(), ), )); self.current_path.pop(); @@ -1679,16 +1798,17 @@ mod js { self.current_path.push(DomNodeKind::Paragraph); dom.append_child(DomNode::Container( ContainerNode::new_paragraph( - self.convert(node.child_nodes(), html_source)? - .take_children(), + self.convert( + node.child_nodes(), + DomNodeKind::Paragraph, + html_source, + )? 
+ .take_children(), ), )); self.current_path.pop(); } node_name => { - let children_nodes = self - .convert(node.child_nodes(), html_source)? - .take_children(); let formatting_kind = match node_name { "STRONG" | "B" => Some(InlineFormatType::Bold), "EM" | "I" => Some(InlineFormatType::Italic), @@ -1741,23 +1861,43 @@ mod js { } }; - if formatting_kind.is_none() { - if !children_nodes.is_empty() { - dom.append_children(children_nodes); - } - self.current_path.pop(); - } else { + if let Some(formatting_kind) = formatting_kind { self.current_path.push(DomNodeKind::Formatting( - formatting_kind.clone().unwrap(), + formatting_kind.clone(), + )); + let children_nodes = self + .convert( + node.child_nodes(), + DomNodeKind::Formatting( + formatting_kind.clone(), + ), + html_source, + )? + .take_children(); + self.current_path.push(DomNodeKind::Formatting( + formatting_kind.clone(), )); dom.append_child(DomNode::Container( ContainerNode::new_formatting( - formatting_kind.unwrap(), + formatting_kind.clone(), children_nodes, ), )); self.current_path.pop(); + } else { + self.current_path.push(parent_kind.clone()); + let children_nodes = self + .convert( + node.child_nodes(), + parent_kind.clone(), + html_source, + )? 
+ .take_children(); + if !children_nodes.is_empty() { + dom.append_children(children_nodes); + } + self.current_path.pop(); } } } @@ -1779,6 +1919,7 @@ mod js { enum Error { NoBody, UnknownNode(String), + InvalidListItemNode, } impl fmt::Display for Error { @@ -1794,6 +1935,12 @@ mod js { Self::UnknownNode(node_name) => { write!(formatter, "Node `{node_name}` is not supported") } + Self::InvalidListItemNode => { + write!( + formatter, + "Invalid list item node: a list must only contain list items" + ) + } } } } @@ -1802,7 +1949,8 @@ mod js { mod tests { use super::*; use crate::{ - tests::testutils_composer_model::restore_whitespace, ToHtml, ToTree, + tests::testutils_composer_model::restore_whitespace, ToHtml, + ToMarkdown, ToTree, }; use indoc::indoc; use wasm_bindgen_test::*; @@ -1841,7 +1989,17 @@ mod js { HtmlSource::GoogleDoc, ) .unwrap(); - assert_eq!(dom.to_string(), "
    • Italic

    • Bold

    • Unformatted

    • Strikethrough

    • Underlined

    • Linked

      • nested

    "); + assert_eq!(dom.to_string(), "
    • Italic

    • Bold

    • Unformatted

    • Strikethrough

    • Underlined

    • Linked

      • nested

    "); + assert_eq!( + dom.to_markdown().unwrap().to_string(), + r#"* *Italic* +* Bold +* Unformatted +* ~~Strikethrough~~ +* Underlined +* [Linked]() + * nested"# + ); } #[wasm_bindgen_test] diff --git a/crates/wysiwyg/src/tests/test_deleting.rs b/crates/wysiwyg/src/tests/test_deleting.rs index 02d4ee390..fc5a01571 100644 --- a/crates/wysiwyg/src/tests/test_deleting.rs +++ b/crates/wysiwyg/src/tests/test_deleting.rs @@ -173,10 +173,12 @@ fn deleting_across_lists_joins_them() { fn deleting_across_lists_joins_them_nested() { let mut model = cm("
      \
    1. 1{1
    2. \ -
    3. 22
    4. \ -
        \ -
      1. 55
      2. \ -
      \ +
    5. +

      22

      +
        \ +
      1. 55
      2. \ +
      \ +
    6. \
    \
      \
    1. 33
    2. \ From 8dd57ebde224943d7b78d77acc06ffc6c02f2d7d Mon Sep 17 00:00:00 2001 From: David Langley Date: Wed, 16 Jul 2025 14:17:21 +0100 Subject: [PATCH 09/17] Finish implementation of ordered lists in the js parser an improve comments --- .../src/composer_model/replace_html.rs | 26 ++-- crates/wysiwyg/src/dom/parser/parse.rs | 145 +++++++++--------- 2 files changed, 83 insertions(+), 88 deletions(-) diff --git a/crates/wysiwyg/src/composer_model/replace_html.rs b/crates/wysiwyg/src/composer_model/replace_html.rs index d9ee660f7..ca40b5c32 100644 --- a/crates/wysiwyg/src/composer_model/replace_html.rs +++ b/crates/wysiwyg/src/composer_model/replace_html.rs @@ -10,9 +10,7 @@ use crate::dom::html_source::HtmlSource; use crate::dom::nodes::ContainerNode; use crate::dom::parser::parse_from_source; -use crate::{ - parse, ComposerModel, ComposerUpdate, DomNode, Location, UnicodeString, -}; +use crate::{ComposerModel, ComposerUpdate, DomNode, Location, UnicodeString}; impl ComposerModel where @@ -30,32 +28,30 @@ where if self.has_selection() { self.do_replace_text(S::default()); } - + // Remove meta tags from the HTML which caused errors in html5ever let meta_regex = Regex::new(r"]*>").unwrap(); let mut cleaned_html = meta_regex .replace_all(&new_html.to_string(), "") .to_string(); if external_source == HtmlSource::GoogleDoc { - // Strip first b tag (opening and closing) + // Strip outer b tag that google docs adds let b_regex = Regex::new(r"]*>(.*)<\/b>").unwrap(); cleaned_html = b_regex.replace(&cleaned_html, "$1").to_string(); } - let result = if external_source == HtmlSource::Matrix { - parse(&cleaned_html.to_string()) - } else { - parse_from_source(&cleaned_html.to_string(), external_source) - }; + let result = + parse_from_source(&cleaned_html.to_string(), external_source); let doc_node = result.unwrap().into_document_node(); let (start, end) = self.safe_selection(); let range = self.state.dom.find_range(start, end); + // We should only have 1 dom node, so add the 
children under a paragraph to take advantage of the exisitng + // insert_node_at_cursor api and then delete the paragraph node promoting it's the children up a level. let p = DomNode::Container(ContainerNode::new_paragraph( doc_node.into_container().unwrap().take_children(), )); - let new_cursor_index = start + p.text_len(); let handle = self.state.dom.insert_node_at_cursor(&range, p); self.state.dom.replace_node_with_its_children(&handle); @@ -69,11 +65,11 @@ where #[cfg(test)] const GOOGLE_DOC_HTML_PASTEBOARD: &str = r#" -
      • Italic

      • Bold

      • Unformatted

      • Strikethrough

      • Underlined

      • Linked

        • nested

      +
      1. Italic

      2. Bold

      3. Unformatted

      4. Strikethrough

      5. Underlined

      6. Linked

        • Nested

      "#; #[cfg(test)] const MS_DOC_HTML_PASTEBOARD: &str = r#" -
      • Italic 

      • Bold 

      • Unformatted 

      • Strikethrough 

      • Underlined 

      • Nested

      +
      1. Italic 

      1. Bold 

      1. Unformatted 

      1. Strikethrough 

      1. Underlined 

      • Nested

      "#; #[cfg(test)] @@ -96,7 +92,7 @@ mod test { let html_str = html.to_string(); assert!(!html_str.contains("
    3. Italic

    4. Bold

    5. Unformatted

    6. Strikethrough

    7. Underlined

    8. Linked

      • nested

    9. "); + assert_eq!(html_str, "
      1. Italic

      2. Bold

      3. Unformatted

      4. Strikethrough

      5. Underlined

      6. Linked

        • Nested

      "); } #[test] @@ -111,7 +107,7 @@ mod test { let html = model.get_content_as_html(); let html_str = html.to_string(); assert!(!html_str.contains("
    10. Italic

    11. Bold

    12. Unformatted

    13. Strikethrough

    14. Underlined

    15. Linked

    16. Nested

    17. "); + assert_eq!(html_str, "
      1. Italic

      2. Bold

      3. Unformatted

      4. Strikethrough

      5. Underlined

      6. Linked

      • Nested

      "); } #[test] diff --git a/crates/wysiwyg/src/dom/parser/parse.rs b/crates/wysiwyg/src/dom/parser/parse.rs index 3fad573c2..849712963 100644 --- a/crates/wysiwyg/src/dom/parser/parse.rs +++ b/crates/wysiwyg/src/dom/parser/parse.rs @@ -11,7 +11,7 @@ use crate::dom::html_source::HtmlSource; use crate::dom::nodes::dom_node::DomNodeKind::{self}; use crate::dom::nodes::{ContainerNode, ContainerNodeKind}; use crate::dom::Dom; -use crate::{DomHandle, DomNode, ToTree, UnicodeString}; +use crate::{DomHandle, DomNode, UnicodeString}; pub fn parse(html: &str) -> Result, HtmlParseError> where @@ -48,11 +48,11 @@ where #[cfg(test)] const GOOGLE_DOC_HTML_PASTEBOARD: &str = r#" -
      • Italic

      • Bold

      • Unformatted

      • Strikethrough

      • Underlined

      • Linked

        • nested

      +
      1. Italic

      2. Bold

      3. Unformatted

      4. Strikethrough

      5. Underlined

      6. Linked

        • Nested

      "#; #[cfg(test)] const MS_DOC_HTML_PASTEBOARD: &str = r#" -
      • Italic 

      • Bold 

      • Unformatted 

      • Strikethrough 

      • Underlined 

      • Nested

      +
      1. Italic 

      1. Bold 

      1. Unformatted 

      1. Strikethrough 

      1. Underlined 

      • Nested

      "#; #[cfg(feature = "sys")] @@ -236,6 +236,12 @@ mod sys { } } "span" => { + if html_source == HtmlSource::Matrix { + return Err(Error::UnknownNode(tag.to_string())); + } + + // For external sources, we check for common formatting styles for spans + // and convert them to appropriate formatting nodes. let mut formatting_tag = None; if child.contains_style("font-weight", "bold") { formatting_tag = Some("b"); @@ -263,23 +269,23 @@ mod sys { )?; self.current_path.remove(cur_path_idx); } else { - if html_source == HtmlSource::Matrix { - return Err(Error::UnknownNode(tag.to_string())); - } else { - self.convert(padom, child, node, html_source)?; - } + // If no formatting tag was found, just skip and convert the children + self.convert(padom, child, node, html_source)?; } } "br" => { node.append_child(Self::new_line_break()); } "ol" | "ul" => { - self.current_path.push(DomNodeKind::List); - let target_node = if node.is_list() { + // Google docs adds nested lists as children of the list node, this breaks our invariants. + // For the google docs case, we can add the nested list to the last list item instead. if html_source != HtmlSource::GoogleDoc || node.last_child_mut().is_none() + || node.last_child_mut().unwrap().is_list_item() + == false { + // If source is not Google Docs or the last child is not a list item, we return an error. return Err(Error::InvalidListItemNode); } node.last_child_mut() @@ -289,6 +295,7 @@ mod sys { } else { node }; + self.current_path.push(DomNodeKind::List); if tag == "ol" { let custom_start = child .get_attr("start") @@ -1121,7 +1128,7 @@ mod sys { indoc! { r#" - └>ul + └>ol ├>li │ └>p │ └>i @@ -1148,19 +1155,21 @@ mod sys { └>ul └>li └>p - └>"nested" + └>"Nested" "# } ); assert_eq!( dom.to_markdown().unwrap().to_string(), - r#"* *Italic* -* Bold -* Unformatted -* ~~Strikethrough~~ -* Underlined -* [Linked]() - * nested"# + indoc! {r#" + 1. *Italic* + 2. Bold + 3. Unformatted + 4. ~~Strikethrough~~ + 5. Underlined + 6. 
[Linked]() + * Nested"# + } ); } @@ -1178,31 +1187,31 @@ mod sys { indoc! { r#" - ├>ul + ├>ol │ └>li │ └>p │ └>i │ └>"Italic" - ├>ul + ├>ol │ └>li │ └>p │ └>b │ └>"Bold" - ├>ul + ├>ol │ └>li │ └>p │ └>"Unformatted" - ├>ul + ├>ol │ └>li │ └>p │ └>del │ └>"Strikethrough" - ├>ul + ├>ol │ └>li │ └>p │ └>u │ └>"Underlined" - ├>ul + ├>ol │ └>li │ └>p │ └>a "https://matrix.org/" @@ -1220,10 +1229,6 @@ mod sys { } fn post_process_for_adjacent_text(mut dom: Dom) -> Dom { - println!( - "Post-processing adjacent text nodes: {}", - dom.to_tree().to_string() - ); let text_handles = find_text_nodes(&dom); for handle in text_handles.iter().rev() { dom = post_process_adjacent_text(dom, handle); @@ -1580,8 +1585,9 @@ mod js { for nth in 0..number_of_nodes { let node = nodes.get(nth as _).unwrap(); - - match node.node_name().as_str() { + let node_name = node.node_name(); + let tag = node_name.as_str(); + match tag { "BR" => { dom.append_child(DomNode::new_line_break()); } @@ -1668,42 +1674,32 @@ mod js { self.current_path.pop(); } - - "OL" => { + "UL" | "OL" => { let custom_start = node .unchecked_ref::() .get_attribute("start"); - self.current_path.push(DomNodeKind::List); - dom.append_child(DomNode::Container( - ContainerNode::new_list( - ListType::Ordered, - self.convert( - node.child_nodes(), - DomNodeKind::List, - html_source, - )? - .take_children(), - if let Some(custom_start) = custom_start { - Some(vec![( - "start".into(), - custom_start.into(), - )]) - } else { - None - }, - ), - )); - self.current_path.pop(); - } + let attributes: Option> = + if tag == "ol" && custom_start.is_some() { + Some(vec![( + "start".into(), + custom_start.unwrap().into(), + )]) + } else { + None + }; + + let list_type = if tag == "OL" { + ListType::Ordered + } else { + ListType::Unordered + }; - "UL" => { - self.current_path.push(DomNodeKind::List); - // TODO We should pass the parent kind in so that we can bail out if a non-list item is being added to it's children. 
if parent_kind == DomNodeKind::List { if html_source != HtmlSource::GoogleDoc { return Err(Error::InvalidListItemNode); } + self.current_path.push(DomNodeKind::List); let target = dom .last_child_mut() .unwrap() @@ -1711,27 +1707,27 @@ mod js { .unwrap(); target.append_child(DomNode::Container( ContainerNode::new_list( - ListType::Unordered, + list_type, self.convert( node.child_nodes(), DomNodeKind::List, html_source, )? .take_children(), - None, + attributes, ), )); } else { dom.append_child(DomNode::Container( ContainerNode::new_list( - ListType::Unordered, + list_type, self.convert( node.child_nodes(), DomNodeKind::List, html_source, )? .take_children(), - None, + attributes, ), )); } @@ -1821,6 +1817,8 @@ mod js { node_name.to_owned(), )); } + // For external sources, we check for common formatting styles for spans + // and convert them to appropriate formatting nodes. let style = node.unchecked_ref::().style(); if style @@ -1886,7 +1884,7 @@ mod js { )); self.current_path.pop(); } else { - self.current_path.push(parent_kind.clone()); + // If it's an external source we skip the node and process it's children. let children_nodes = self .convert( node.child_nodes(), @@ -1897,7 +1895,6 @@ mod js { if !children_nodes.is_empty() { dom.append_children(children_nodes); } - self.current_path.pop(); } } } @@ -1989,16 +1986,18 @@ mod js { HtmlSource::GoogleDoc, ) .unwrap(); - assert_eq!(dom.to_string(), "
      • Italic

      • Bold

      • Unformatted

      • Strikethrough

      • Underlined

      • Linked

        • nested

      "); + assert_eq!(dom.to_string(), "
      1. Italic

      2. Bold

      3. Unformatted

      4. Strikethrough

      5. Underlined

      6. Linked

        • Nested

      "); assert_eq!( dom.to_markdown().unwrap().to_string(), - r#"* *Italic* -* Bold -* Unformatted -* ~~Strikethrough~~ -* Underlined -* [Linked]() - * nested"# + indoc! {r#" + 1. *Italic* + 2. Bold + 3. Unformatted + 4. ~~Strikethrough~~ + 5. Underlined + 6. [Linked]() + * Nested"# + } ); } @@ -2010,7 +2009,7 @@ mod js { HtmlSource::UnknownExternal, ) .unwrap(); - assert_eq!(dom.to_string(), "
      • Italic

      • Bold

      • Unformatted

      • Strikethrough

      • Underlined

      • nested

      "); + assert_eq!(dom.to_string(), "
      1. Italic

      1. Bold

      1. Unformatted

      1. Strikethrough

      1. Underlined

      1. Linked

      • Nested

      "); } #[wasm_bindgen_test] From 781038145f9152e1da668628d347ef5364e7355e Mon Sep 17 00:00:00 2001 From: David Langley Date: Thu, 17 Jul 2025 13:47:10 +0100 Subject: [PATCH 10/17] Put back the enforcing of assertions and improve parsing to fix cases where there are inline + block containers in the same parent. --- crates/wysiwyg/src/dom/dom_methods.rs | 5 +- crates/wysiwyg/src/dom/dom_struct.rs | 7 ++- .../wysiwyg/src/dom/nodes/container_node.rs | 10 ++-- crates/wysiwyg/src/dom/parser/parse.rs | 54 ++++++++++++++++--- 4 files changed, 62 insertions(+), 14 deletions(-) diff --git a/crates/wysiwyg/src/dom/dom_methods.rs b/crates/wysiwyg/src/dom/dom_methods.rs index f4a1f3293..225678573 100644 --- a/crates/wysiwyg/src/dom/dom_methods.rs +++ b/crates/wysiwyg/src/dom/dom_methods.rs @@ -696,8 +696,6 @@ where self.merge_text_nodes_around(&first_location.node_handle); } } - #[cfg(any(test, feature = "assert-invariants"))] - self.assert_invariants(); } pub fn merge_text_nodes_around(&mut self, handle: &DomHandle) { @@ -712,6 +710,9 @@ where merge_if_adjacent_text_nodes(parent, idx - 1); } merge_if_adjacent_text_nodes(parent, idx); + + #[cfg(any(test, feature = "assert-invariants"))] + self.assert_invariants(); } /// Recursively visit container nodes, looking for block nodes and, if they contain a diff --git a/crates/wysiwyg/src/dom/dom_struct.rs b/crates/wysiwyg/src/dom/dom_struct.rs index 0ef76ea31..bff6acc9a 100644 --- a/crates/wysiwyg/src/dom/dom_struct.rs +++ b/crates/wysiwyg/src/dom/dom_struct.rs @@ -924,12 +924,15 @@ mod test { #[test] fn find_parent_list_item_or_self_finds_our_grandparent() { let d = cm("|
      • bc
      d").state.dom; + // The "|" at the start infers that when the dom is created, it be within a paragraph + // (as inline nodes and blocks are not allowed to be siblings). + // So the handle is [1, 0, 1, 0]. let res = d.find_ancestor_list_item_or_self(&DomHandle::from_raw(vec![ - 0, 0, 1, 0, + 1, 0, 1, 0, ])); let res = res.expect("Should have found a list parent!"); - assert_eq!(res.into_raw(), vec![0, 0]); + assert_eq!(res.into_raw(), vec![1, 0]); } #[test] diff --git a/crates/wysiwyg/src/dom/nodes/container_node.rs b/crates/wysiwyg/src/dom/nodes/container_node.rs index fc386986a..97092721d 100644 --- a/crates/wysiwyg/src/dom/nodes/container_node.rs +++ b/crates/wysiwyg/src/dom/nodes/container_node.rs @@ -1578,19 +1578,21 @@ mod test { #[test] fn paragraph_to_message_html() { - let model = cm("

       

       

      Hello!

       

      |"); + let model = + cm("

       

       

      Hello!

       

      |

      "); assert_eq!( &model.state.dom.to_message_html(), - "

      Hello!
      " + "

      Hello!

      " ); } #[test] fn paragraph_to_html() { - let model = cm("

       

       

      Hello!

       

      |"); + let model = + cm("

       

       

      Hello!

       

      |

      "); assert_eq!( &model.state.dom.to_html(), - "

      \u{a0}

      \u{a0}

      Hello!

      \u{a0}

      " + "

      \u{a0}

      \u{a0}

      Hello!

      \u{a0}

      \u{a0}

      " ); } diff --git a/crates/wysiwyg/src/dom/parser/parse.rs b/crates/wysiwyg/src/dom/parser/parse.rs index 849712963..ad581d28a 100644 --- a/crates/wysiwyg/src/dom/parser/parse.rs +++ b/crates/wysiwyg/src/dom/parser/parse.rs @@ -9,7 +9,7 @@ use regex::Regex; use crate::dom::dom_creation_error::HtmlParseError; use crate::dom::html_source::HtmlSource; use crate::dom::nodes::dom_node::DomNodeKind::{self}; -use crate::dom::nodes::{ContainerNode, ContainerNodeKind}; +use crate::dom::nodes::{container_node, ContainerNode, ContainerNodeKind}; use crate::dom::Dom; use crate::{DomHandle, DomNode, UnicodeString}; @@ -121,8 +121,10 @@ mod sys { } })?; let dom_blocks_done = post_process_blocks(dom); + let dom_inline_blocks_done = + post_process_for_block_and_inline_siblings(dom_blocks_done); let dom_adjacted_text_done = - post_process_for_adjacent_text(dom_blocks_done); + post_process_for_adjacent_text(dom_inline_blocks_done); Ok(dom_adjacted_text_done) } @@ -1237,10 +1239,7 @@ fn post_process_for_adjacent_text(mut dom: Dom) -> Dom { } fn find_text_nodes(dom: &Dom) -> Vec { - dom.iter() - .filter(|n| n.is_text_node()) - .map(|n| n.handle()) - .collect::>() + dom.iter_text().map(|n| n.handle()).collect::>() } fn post_process_adjacent_text( @@ -1250,6 +1249,49 @@ fn post_process_adjacent_text( dom.merge_text_nodes_around(handle); dom } + +fn post_process_for_block_and_inline_siblings( + mut dom: Dom, +) -> Dom { + let continer_handles = find_containers_with_inline_and_block_children(&dom); + for handle in continer_handles.iter().rev() { + dom = post_process_container_for_block_and_inline_siblings(dom, handle); + } + dom +} + +fn find_containers_with_inline_and_block_children( + dom: &Dom, +) -> Vec { + dom.iter_containers() + .filter(|n| { + if n.children().is_empty() { + return false; // Skip empty containers + } + let all_nodes_are_inline = + n.children().iter().all(|n| !n.is_block_node()); + let all_nodes_are_block = + n.children().iter().all(|n| n.is_block_node()); + 
!all_nodes_are_inline && !all_nodes_are_block + }) + .map(|n| n.handle()) + .collect::>() +} + +fn post_process_container_for_block_and_inline_siblings( + mut dom: Dom, + handle: &DomHandle, +) -> Dom { + // upate the container node by grouping inline nodes, to avoid + // having inline nodes as siblings of block nodes. + let container_node = + dom.lookup_node_mut(handle).as_container_mut().unwrap(); + let new_children = + group_inline_nodes(container_node.remove_children().to_vec()); + container_node.insert_children(0, new_children.clone()); + dom +} + fn post_process_blocks(mut dom: Dom) -> Dom { let block_handles = find_blocks(&dom); for handle in block_handles.iter().rev() { From bc5d2c7f417cb8dc6640919f91dadac57292d834 Mon Sep 17 00:00:00 2001 From: David Langley Date: Fri, 18 Jul 2025 10:01:19 +0100 Subject: [PATCH 11/17] Add tests and fix some edge cases Namely, fixing the cursor position after insert, and adding more parsing post processing to keep the dom in an expected state with wrap_inline_nodes_into_paragraphs_if_needed and join_nodes_in_container. Also fixing some tests. --- .../src/composer_model/example_format.rs | 3 +- .../src/composer_model/replace_html.rs | 88 +++++++++++++++++-- crates/wysiwyg/src/dom/dom_struct.rs | 19 +++- .../wysiwyg/src/dom/insert_node_at_cursor.rs | 4 + crates/wysiwyg/src/dom/parser/parse.rs | 84 +++++------------- crates/wysiwyg/src/dom/range.rs | 16 ++++ crates/wysiwyg/src/tests/test_deleting.rs | 8 +- 7 files changed, 144 insertions(+), 78 deletions(-) diff --git a/crates/wysiwyg/src/composer_model/example_format.rs b/crates/wysiwyg/src/composer_model/example_format.rs index 62c15f8d6..48760ade7 100644 --- a/crates/wysiwyg/src/composer_model/example_format.rs +++ b/crates/wysiwyg/src/composer_model/example_format.rs @@ -903,7 +903,7 @@ mod test { #[test] fn selection_across_lists_roundtrips() { assert_that!( - "
      1. 1{1
      2. 22
      1. 33
      2. 4}|4
      " + "
      1. 1{1
      2. 22

      a

      1. 33
      2. 4}|4
      " ) .roundtrips(); } @@ -915,6 +915,7 @@ mod test {
    18. 1{1
    19. \
    20. 22
    21. \
    \ +

    a

    \
      \
    1. 33
    2. \
    3. 4}|4
    4. \ diff --git a/crates/wysiwyg/src/composer_model/replace_html.rs b/crates/wysiwyg/src/composer_model/replace_html.rs index ca40b5c32..e21613a8e 100644 --- a/crates/wysiwyg/src/composer_model/replace_html.rs +++ b/crates/wysiwyg/src/composer_model/replace_html.rs @@ -9,8 +9,7 @@ use regex::Regex; use crate::dom::html_source::HtmlSource; use crate::dom::nodes::ContainerNode; use crate::dom::parser::parse_from_source; - -use crate::{ComposerModel, ComposerUpdate, DomNode, Location, UnicodeString}; +use crate::{ComposerModel, ComposerUpdate, DomNode, Location, UnicodeString}; // Import the trait for to_tree impl ComposerModel where @@ -49,16 +48,25 @@ where // We should only have 1 dom node, so add the children under a paragraph to take advantage of the exisitng // insert_node_at_cursor api and then delete the paragraph node promoting it's the children up a level. - let p = DomNode::Container(ContainerNode::new_paragraph( - doc_node.into_container().unwrap().take_children(), - )); - let new_cursor_index = start + p.text_len(); + let new_children = doc_node.into_container().unwrap().take_children(); + let child_count = new_children.len(); + let p = DomNode::Container(ContainerNode::new_paragraph(new_children)); + let handle = self.state.dom.insert_node_at_cursor(&range, p); self.state.dom.replace_node_with_its_children(&handle); + self.state.dom.wrap_inline_nodes_into_paragraphs_if_needed( + &self.state.dom.parent(&handle).handle(), + ); - // manually move the cursor to the end of the html - self.state.start = Location::from(new_cursor_index); + // Track the index of the last inserted node for placing the cursor + let last_index = handle.index_in_parent() + child_count - 1; + let last_handle = handle.parent_handle().child_handle(last_index); + let location = self.state.dom.location_for_node(&last_handle); + + self.state.start = + Location::from(location.position + location.length - 1); self.state.end = self.state.start; + // add a trailing space in cases when we 
do not have a next sibling self.create_update_replace_all() } } @@ -121,4 +129,68 @@ mod test { let html_str = html.to_string(); assert_eq!(html_str, "

      test

      "); } + + #[test] + fn test_replace_html_with_existing_selection() { + let mut model = cm("Hello{world}|test"); + let new_html = "

      replacement

      "; + + let _ = + model.replace_html(new_html.into(), HtmlSource::UnknownExternal); + + let html = model.get_content_as_html(); + let html_str = html.to_string(); + assert_eq!( + html_str, + "

      Hello

      replacement

      test

      " + ); + } + + #[test] + fn test_replace_html_cursor_position_after_insert() { + let mut model = cm("Start|"); + let new_html = "Bold text"; + let _ = model.replace_html(new_html.into(), HtmlSource::Matrix); + // Cursor should be positioned after the inserted content + let (start, end) = model.safe_selection(); + assert_eq!(start, end); // No selection, just cursor + model.bold(); + model.enter(); + // Insert more text to verify cursor position + let _ = model.replace_text("End".into()); + let html = model.get_content_as_html(); + let html_str = html.to_string(); + assert_eq!( + html_str, + "

      Start

      Bold text

      End

      " + ); + } + + #[test] + fn test_replace_html_multiple_meta_tags() { + let mut model = cm("|"); + let html_with_multiple_metas = r#"

      Content after metas

      "#; + + let _ = model.replace_html( + html_with_multiple_metas.into(), + HtmlSource::UnknownExternal, + ); + + let html = model.get_content_as_html(); + let html_str = html.to_string(); + assert!(!html_str.contains("Content after metas

      "); + } + + #[test] + fn test_replace_html_empty_content() { + let mut model = cm("Existing content|"); + let empty_html = ""; + + let _ = model.replace_html(empty_html.into(), HtmlSource::Matrix); + + let html = model.get_content_as_html(); + let html_str = html.to_string(); + assert_eq!(html_str, "

      Existing content

      "); + } } diff --git a/crates/wysiwyg/src/dom/dom_struct.rs b/crates/wysiwyg/src/dom/dom_struct.rs index bff6acc9a..d84d9bdde 100644 --- a/crates/wysiwyg/src/dom/dom_struct.rs +++ b/crates/wysiwyg/src/dom/dom_struct.rs @@ -11,6 +11,7 @@ use crate::dom::nodes::{ContainerNode, DomNode}; use crate::dom::to_html::ToHtmlState; use crate::dom::to_markdown::{MarkdownError, MarkdownOptions, ToMarkdown}; use crate::dom::unicode_string::UnicodeStrExt; +use crate::dom::DomLocation; use crate::dom::{ find_range, to_raw_text::ToRawText, DomHandle, Range, ToTree, UnicodeString, }; @@ -295,14 +296,24 @@ where find_range::find_range(self, start, end) } - pub fn find_range_by_node(&self, node_handle: &DomHandle) -> Range { - let result = find_range::find_pos(self, node_handle, 0, usize::MAX); + pub fn location_for_node(&self, node_handle: &DomHandle) -> DomLocation { + let locations = find_range::find_range(self, 0, usize::MAX); + return locations.find_location(node_handle).unwrap().clone(); + } - let locations = match result { + pub fn locations_for_node( + &self, + node_handle: &DomHandle, + ) -> Vec { + let result = find_range::find_pos(self, &node_handle, 0, usize::MAX); + match result { FindResult::Found(locations) => locations, _ => panic!("Node does not exist"), - }; + } + } + pub fn find_range_by_node(&self, node_handle: &DomHandle) -> Range { + let locations = self.locations_for_node(node_handle); let leaves = locations.iter().filter(|l| l.is_leaf()); let s = leaves.clone().map(|l| l.position).min().unwrap(); diff --git a/crates/wysiwyg/src/dom/insert_node_at_cursor.rs b/crates/wysiwyg/src/dom/insert_node_at_cursor.rs index 2a892c42f..1a724542a 100644 --- a/crates/wysiwyg/src/dom/insert_node_at_cursor.rs +++ b/crates/wysiwyg/src/dom/insert_node_at_cursor.rs @@ -67,6 +67,10 @@ where }; } + self.wrap_inline_nodes_into_paragraphs_if_needed( + &self.parent(&inserted_handle).handle(), + ); + #[cfg(any(test, feature = "assert-invariants"))] self.assert_invariants(); diff 
--git a/crates/wysiwyg/src/dom/parser/parse.rs b/crates/wysiwyg/src/dom/parser/parse.rs index ad581d28a..720cd8123 100644 --- a/crates/wysiwyg/src/dom/parser/parse.rs +++ b/crates/wysiwyg/src/dom/parser/parse.rs @@ -9,7 +9,7 @@ use regex::Regex; use crate::dom::dom_creation_error::HtmlParseError; use crate::dom::html_source::HtmlSource; use crate::dom::nodes::dom_node::DomNodeKind::{self}; -use crate::dom::nodes::{container_node, ContainerNode, ContainerNodeKind}; +use crate::dom::nodes::{ContainerNode, ContainerNodeKind}; use crate::dom::Dom; use crate::{DomHandle, DomNode, UnicodeString}; @@ -1190,30 +1190,25 @@ mod sys { r#" ├>ol - │ └>li - │ └>p - │ └>i - │ └>"Italic" - ├>ol - │ └>li - │ └>p - │ └>b - │ └>"Bold" - ├>ol - │ └>li - │ └>p - │ └>"Unformatted" - ├>ol - │ └>li - │ └>p - │ └>del - │ └>"Strikethrough" - ├>ol - │ └>li - │ └>p - │ └>u - │ └>"Underlined" - ├>ol + │ ├>li + │ │ └>p + │ │ └>i + │ │ └>"Italic" + │ ├>li + │ │ └>p + │ │ └>b + │ │ └>"Bold" + │ ├>li + │ │ └>p + │ │ └>"Unformatted" + │ ├>li + │ │ └>p + │ │ └>del + │ │ └>"Strikethrough" + │ ├>li + │ │ └>p + │ │ └>u + │ │ └>"Underlined" │ └>li │ └>p │ └>a "https://matrix.org/" @@ -1253,42 +1248,7 @@ fn post_process_adjacent_text( fn post_process_for_block_and_inline_siblings( mut dom: Dom, ) -> Dom { - let continer_handles = find_containers_with_inline_and_block_children(&dom); - for handle in continer_handles.iter().rev() { - dom = post_process_container_for_block_and_inline_siblings(dom, handle); - } - dom -} - -fn find_containers_with_inline_and_block_children( - dom: &Dom, -) -> Vec { - dom.iter_containers() - .filter(|n| { - if n.children().is_empty() { - return false; // Skip empty containers - } - let all_nodes_are_inline = - n.children().iter().all(|n| !n.is_block_node()); - let all_nodes_are_block = - n.children().iter().all(|n| n.is_block_node()); - !all_nodes_are_inline && !all_nodes_are_block - }) - .map(|n| n.handle()) - .collect::>() -} - -fn 
post_process_container_for_block_and_inline_siblings( - mut dom: Dom, - handle: &DomHandle, -) -> Dom { - // upate the container node by grouping inline nodes, to avoid - // having inline nodes as siblings of block nodes. - let container_node = - dom.lookup_node_mut(handle).as_container_mut().unwrap(); - let new_children = - group_inline_nodes(container_node.remove_children().to_vec()); - container_node.insert_children(0, new_children.clone()); + dom.wrap_inline_nodes_into_paragraphs_if_needed(&DomHandle::root()); dom } @@ -1296,6 +1256,7 @@ fn post_process_blocks(mut dom: Dom) -> Dom { let block_handles = find_blocks(&dom); for handle in block_handles.iter().rev() { dom = post_process_block_lines(dom, handle); + dom.join_nodes_in_container(&handle); } dom } @@ -1575,6 +1536,7 @@ mod js { self.webdom_to_dom(document, html_source) .map_err(to_dom_creation_error) .map(post_process_blocks) + .map(post_process_for_block_and_inline_siblings) .map(post_process_for_adjacent_text) } diff --git a/crates/wysiwyg/src/dom/range.rs b/crates/wysiwyg/src/dom/range.rs index 4ec6b9e5c..22c1085a4 100644 --- a/crates/wysiwyg/src/dom/range.rs +++ b/crates/wysiwyg/src/dom/range.rs @@ -7,6 +7,7 @@ use crate::dom::dom_handle::DomHandle; use crate::dom::nodes::dom_node::DomNodeKind; use std::cmp::{min, Ordering}; +use std::fmt; /// Represents the relative position of a DomLocation towards /// the range start and end. 
@@ -202,6 +203,21 @@ impl DomLocation { } } +impl fmt::Display for DomLocation { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!( + f, + "DomLocation[node_handle: {:?}, position: {}, start_offset: {}, end_offset: {}, length: {}, kind: {:?}]", + self.node_handle.raw(), + self.position, + self.start_offset, + self.end_offset, + self.length, + self.kind + ) + } +} + impl PartialOrd for DomLocation { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) diff --git a/crates/wysiwyg/src/tests/test_deleting.rs b/crates/wysiwyg/src/tests/test_deleting.rs index fc5a01571..a28033287 100644 --- a/crates/wysiwyg/src/tests/test_deleting.rs +++ b/crates/wysiwyg/src/tests/test_deleting.rs @@ -878,7 +878,7 @@ fn backspace_immutable_link_from_inside_link() { #[test] fn backspace_immutable_link_multiple() { let mut model = cm( - "firstsecond|", + "firstsecond|", ); model.backspace(); assert_eq!( @@ -956,12 +956,12 @@ fn delete_mention_from_start() { #[test] fn delete_first_immutable_link_of_multiple() { let mut model = cm( - "|firstsecond", + "|firstsecond", ); model.delete(); assert_eq!( restore_whitespace(&tx(&model)), - "|second" + "|second" ); model.delete(); assert_eq!(restore_whitespace(&tx(&model)), "|"); @@ -984,7 +984,7 @@ fn delete_first_mention_of_multiple() { #[test] fn delete_second_immutable_link_of_multiple() { let mut model = cm( - "firstsecond|", + "firstsecond|", ); model.backspace(); assert_eq!( From 3b8b034df0438024e2942e3562d32f69e6ce51c3 Mon Sep 17 00:00:00 2001 From: David Langley Date: Wed, 23 Jul 2025 14:42:31 +0100 Subject: [PATCH 12/17] Add stricter checks/tests around lists As we are now parsing from non-matrix sources, adding some rigour around validation of a valid dom. E.g. making sure nodes other than list items are not added to lists or that list items are not added to containers other than lists. 
--- .../src/composer_model/replace_html.rs | 97 ++ crates/wysiwyg/src/dom/parser/parse.rs | 1012 ++++++++++------- 2 files changed, 674 insertions(+), 435 deletions(-) diff --git a/crates/wysiwyg/src/composer_model/replace_html.rs b/crates/wysiwyg/src/composer_model/replace_html.rs index e21613a8e..f7b1dfe24 100644 --- a/crates/wysiwyg/src/composer_model/replace_html.rs +++ b/crates/wysiwyg/src/composer_model/replace_html.rs @@ -193,4 +193,101 @@ mod test { let html_str = html.to_string(); assert_eq!(html_str, "

      Existing content

      "); } + + #[test] + fn test_insert_list_item_without_list_parent() { + let mut model = cm("hello|"); + let html = "
    5. list item
    6. "; + + let _ = model.replace_html(html.into(), HtmlSource::UnknownExternal); + + let html = model.get_content_as_html(); + let html_str = html.to_string(); + assert_eq!(html_str, "

      hello

      list item

      "); + } +} + +#[cfg(all(test, target_arch = "wasm32"))] +mod wasm_tests { + use crate::dom::html_source::HtmlSource; + use crate::tests::testutils_composer_model::cm; + use wasm_bindgen_test::*; + + wasm_bindgen_test_configure!(run_in_browser); + + #[wasm_bindgen_test] + fn test_replace_html_with_existing_selection() { + let mut model = cm("Hello{world}|test"); + let new_html = "

      replacement

      "; + + let _ = + model.replace_html(new_html.into(), HtmlSource::UnknownExternal); + + let html = model.get_content_as_html(); + let html_str = html.to_string(); + assert_eq!( + html_str, + "

      Hello

      replacement

      test

      " + ); + } + + #[wasm_bindgen_test] + fn test_replace_html_cursor_position_after_insert() { + let mut model = cm("Start|"); + let new_html = "Bold text"; + let _ = model.replace_html(new_html.into(), HtmlSource::Matrix); + // Cursor should be positioned after the inserted content + let (start, end) = model.safe_selection(); + assert_eq!(start, end); // No selection, just cursor + model.bold(); + model.enter(); + // Insert more text to verify cursor position + let _ = model.replace_text("End".into()); + let html = model.get_content_as_html(); + let html_str = html.to_string(); + assert_eq!( + html_str, + "

      Start

      Bold text

      End

      " + ); + } + + #[wasm_bindgen_test] + fn test_replace_html_multiple_meta_tags() { + let mut model = cm("|"); + let html_with_multiple_metas = r#"

      Content after metas

      "#; + + let _ = model.replace_html( + html_with_multiple_metas.into(), + HtmlSource::UnknownExternal, + ); + + let html = model.get_content_as_html(); + let html_str = html.to_string(); + assert!(!html_str.contains("Content after metas

      "); + } + + #[wasm_bindgen_test] + fn test_replace_html_empty_content() { + let mut model = cm("Existing content|"); + let empty_html = ""; + + let _ = model.replace_html(empty_html.into(), HtmlSource::Matrix); + + let html = model.get_content_as_html(); + let html_str = html.to_string(); + assert_eq!(html_str, "

      Existing content

      "); + } + + #[wasm_bindgen_test] + fn test_insert_list_item_without_list_parent() { + let mut model = cm("hello|"); + let html = "
    7. list item
    8. "; + + let _ = model.replace_html(html.into(), HtmlSource::UnknownExternal); + + let html = model.get_content_as_html(); + let html_str = html.to_string(); + assert_eq!(html_str, "

      hello

      list item

      "); + } } diff --git a/crates/wysiwyg/src/dom/parser/parse.rs b/crates/wysiwyg/src/dom/parser/parse.rs index 720cd8123..d388f6837 100644 --- a/crates/wysiwyg/src/dom/parser/parse.rs +++ b/crates/wysiwyg/src/dom/parser/parse.rs @@ -207,7 +207,7 @@ mod sys { &mut self, padom: &PaDom, child: &PaNodeContainer, - node: &mut ContainerNode, + node_in: &mut ContainerNode, html_source: HtmlSource, ) -> Result<(), Error> where @@ -215,194 +215,230 @@ mod sys { { let cur_path_idx = self.current_path.len(); let tag = child.name.local.as_ref(); - match tag { - "b" | "code" | "del" | "em" | "i" | "strong" | "u" => { - let formatting_node = Self::new_formatting(tag); - if tag == "code" && self.current_path.contains(&CodeBlock) { + let mut invalid_node_error: Option = None; + let mut skip_children: bool = false; + let mut node = node_in.clone(); + if node.is_list() + && tag != "li" + && html_source != HtmlSource::GoogleDoc + { + // If we are inside a list, we can only have list items. + invalid_node_error = Some(Error::InvalidListItemNode); + skip_children = true; + } + + if invalid_node_error.is_none() { + match tag { + "b" | "code" | "del" | "em" | "i" | "strong" | "u" => { + let formatting_node = Self::new_formatting(tag); + if tag == "code" + && self.current_path.contains(&CodeBlock) + { + self.convert_children( + padom, + child, + Some(&mut node), + html_source, + )?; + } else { + self.current_path.push(formatting_node.kind()); + node.append_child(formatting_node); + self.convert_children( + padom, + child, + last_container_mut_in(&mut node), + html_source, + )?; + self.current_path.remove(cur_path_idx); + } + } + "span" => 'span: { + if html_source == HtmlSource::Matrix { + invalid_node_error = + Some(Error::UnknownNode(tag.to_string())); + break 'span; + } + + // For external sources, we check for common formatting styles for spans + // and convert them to appropriate formatting nodes. 
+ let mut formatting_tag = None; + if child.contains_style("font-weight", "bold") { + formatting_tag = Some("b"); + } else if child.contains_style("font-style", "italic") { + formatting_tag = Some("i"); + } else if child + .contains_style("text-decoration", "underline") + { + formatting_tag = Some("u"); + } else if child + .contains_style("text-decoration", "line-through") + { + formatting_tag = Some("del"); + } + + if let Some(tag) = formatting_tag { + let formatting_node = Self::new_formatting(tag); + self.current_path.push(formatting_node.kind()); + node.append_child(formatting_node); + self.convert_children( + padom, + child, + last_container_mut_in(&mut node), + html_source, + )?; + self.current_path.remove(cur_path_idx); + } else { + // If no formatting tag was found, just skip and convert the children + invalid_node_error = + Some(Error::UnknownNode(tag.to_string())); + } + } + "br" => { + node.append_child(Self::new_line_break()); + } + "ol" | "ul" => 'list: { + let target_node = if node.is_list() { + // Google docs adds nested lists as children of the list node, this breaks our invariants. + // For the google docs case, we can add the nested list to the last list item instead. + if html_source != HtmlSource::GoogleDoc + || node.last_child_mut().is_none() + || node.last_child_mut().unwrap().is_list_item() + == false + { + // If source is not Google Docs or the last child is not a list item, we return an error. 
+ invalid_node_error = + Some(Error::InvalidListItemNode); + break 'list; + } + node.last_child_mut() + .unwrap() + .as_container_mut() + .unwrap() + } else { + &mut node + }; + self.current_path.push(DomNodeKind::List); + if tag == "ol" { + let custom_start = child + .get_attr("start") + .and_then(|start| start.parse::().ok()); + target_node.append_child(Self::new_ordered_list( + custom_start, + )); + } else { + target_node + .append_child(Self::new_unordered_list()); + } self.convert_children( padom, child, - Some(node), + last_container_mut_in(target_node), html_source, )?; - } else { - self.current_path.push(formatting_node.kind()); - node.append_child(formatting_node); + self.current_path.remove(cur_path_idx); + } + "li" => 'li: { + if !node.is_list() { + invalid_node_error = Some(Error::ParentNotAList); + break 'li; + } + self.current_path.push(DomNodeKind::ListItem); + node.append_child(Self::new_list_item()); self.convert_children( padom, child, - last_container_mut_in(node), + last_container_mut_in(&mut node), html_source, )?; self.current_path.remove(cur_path_idx); } - } - "span" => { - if html_source == HtmlSource::Matrix { - return Err(Error::UnknownNode(tag.to_string())); - } + "a" => { + let is_mention = child.attrs.iter().any(|(k, v)| { + k == &String::from("href") + && Mention::is_valid_uri(v) + }); + + let text = + child.children.first().map(|gc| padom.get_node(gc)); + let text = match text { + Some(PaDomNode::Text(text)) => Some(text), + _ => None, + }; - // For external sources, we check for common formatting styles for spans - // and convert them to appropriate formatting nodes. 
- let mut formatting_tag = None; - if child.contains_style("font-weight", "bold") { - formatting_tag = Some("b"); - } else if child.contains_style("font-style", "italic") { - formatting_tag = Some("i"); - } else if child - .contains_style("text-decoration", "underline") - { - formatting_tag = Some("u"); - } else if child - .contains_style("text-decoration", "line-through") - { - formatting_tag = Some("del"); + if is_mention && text.is_some() { + self.current_path.push(DomNodeKind::Mention); + let mention = + Self::new_mention(child, text.unwrap()); + node.append_child(mention); + } else { + self.current_path.push(DomNodeKind::Link); + + let link = Self::new_link(child); + node.append_child(link); + self.convert_children( + padom, + child, + last_container_mut_in(&mut node), + html_source, + )?; + } + self.current_path.remove(cur_path_idx); } - - if let Some(tag) = formatting_tag { - let formatting_node = Self::new_formatting(tag); - self.current_path.push(formatting_node.kind()); - node.append_child(formatting_node); + "pre" => { + self.current_path.push(DomNodeKind::CodeBlock); + node.append_child(Self::new_code_block()); self.convert_children( padom, child, - last_container_mut_in(node), + last_container_mut_in(&mut node), html_source, )?; self.current_path.remove(cur_path_idx); - } else { - // If no formatting tag was found, just skip and convert the children - self.convert(padom, child, node, html_source)?; } - } - "br" => { - node.append_child(Self::new_line_break()); - } - "ol" | "ul" => { - let target_node = if node.is_list() { - // Google docs adds nested lists as children of the list node, this breaks our invariants. - // For the google docs case, we can add the nested list to the last list item instead. - if html_source != HtmlSource::GoogleDoc - || node.last_child_mut().is_none() - || node.last_child_mut().unwrap().is_list_item() - == false - { - // If source is not Google Docs or the last child is not a list item, we return an error. 
- return Err(Error::InvalidListItemNode); - } - node.last_child_mut() - .unwrap() - .as_container_mut() - .unwrap() - } else { - node - }; - self.current_path.push(DomNodeKind::List); - if tag == "ol" { - let custom_start = child - .get_attr("start") - .and_then(|start| start.parse::().ok()); - target_node - .append_child(Self::new_ordered_list(custom_start)); - } else { - target_node.append_child(Self::new_unordered_list()); + "blockquote" => { + self.current_path.push(DomNodeKind::Quote); + node.append_child(Self::new_quote()); + self.convert_children( + padom, + child, + last_container_mut_in(&mut node), + html_source, + )?; + + self.current_path.remove(cur_path_idx); } - self.convert_children( - padom, - child, - last_container_mut_in(target_node), - html_source, - )?; - self.current_path.remove(cur_path_idx); - } - "li" => { - self.current_path.push(DomNodeKind::ListItem); - node.append_child(Self::new_list_item()); - self.convert_children( - padom, - child, - last_container_mut_in(node), - html_source, - )?; - self.current_path.remove(cur_path_idx); - } - "a" => { - let is_mention = child.attrs.iter().any(|(k, v)| { - k == &String::from("href") && Mention::is_valid_uri(v) - }); - - let text = - child.children.first().map(|gc| padom.get_node(gc)); - let text = match text { - Some(PaDomNode::Text(text)) => Some(text), - _ => None, - }; - - if is_mention && text.is_some() { - self.current_path.push(DomNodeKind::Mention); - let mention = Self::new_mention(child, text.unwrap()); - node.append_child(mention); - } else { - self.current_path.push(DomNodeKind::Link); - - let link = Self::new_link(child); - node.append_child(link); + "html" => { + // Skip the html tag - add its children to the + // current node directly. 
+ self.convert(padom, child, &mut node, html_source)?; + } + "p" => { + self.current_path.push(DomNodeKind::Paragraph); + node.append_child(Self::new_paragraph()); self.convert_children( padom, child, - last_container_mut_in(node), + last_container_mut_in(&mut node), html_source, )?; + self.current_path.remove(cur_path_idx); } - self.current_path.remove(cur_path_idx); - } - "pre" => { - self.current_path.push(DomNodeKind::CodeBlock); - node.append_child(Self::new_code_block()); - self.convert_children( - padom, - child, - last_container_mut_in(node), - html_source, - )?; - self.current_path.remove(cur_path_idx); - } - "blockquote" => { - self.current_path.push(DomNodeKind::Quote); - node.append_child(Self::new_quote()); - self.convert_children( - padom, - child, - last_container_mut_in(node), - html_source, - )?; - - self.current_path.remove(cur_path_idx); - } - "html" => { - // Skip the html tag - add its children to the - // current node directly. - self.convert(padom, child, node, html_source)?; - } - "p" => { - self.current_path.push(DomNodeKind::Paragraph); - node.append_child(Self::new_paragraph()); - self.convert_children( - padom, - child, - last_container_mut_in(node), - html_source, - )?; - self.current_path.remove(cur_path_idx); - } - _ => { - if html_source == HtmlSource::Matrix { - return Err(Error::UnknownNode(tag.to_string())); - } else { - self.convert(padom, child, node, html_source)?; + _ => { + invalid_node_error = + Some(Error::UnknownNode(tag.to_string())); } + }; + } + + if let Some(err) = invalid_node_error { + if html_source == HtmlSource::Matrix { + return Err(err); + } else if !skip_children { + // If the source is not Matrix and we haven't explicitly flagged to skip the children continue to parse them. 
+ self.convert(padom, child, &mut node, html_source)?; } - }; + } + *node_in = node; Ok(()) } @@ -557,6 +593,7 @@ mod sys { NoBody, UnknownNode(String), InvalidListItemNode, + ParentNotAList, } impl fmt::Display for Error { @@ -577,6 +614,9 @@ mod sys { "Invalid list item node: a list must only contain list items" ) } + Self::ParentNotAList => { + write!(formatter, "Parent node is not a list") + } } } } @@ -1116,6 +1156,15 @@ mod sys { ); } + #[test] + fn parse_insert_text_directly_into_a_list() { + let html = r#"
      • hello
      • list item
      "#; + let dom: Dom = HtmlParser::default() + .parse_from_source(html, HtmlSource::UnknownExternal) + .unwrap(); + assert_eq!(dom.to_html(), r#"
      • hello
      "#); + } + #[test] fn parse_google_doc_rich_text() { let dom: Dom = HtmlParser::default() @@ -1591,314 +1640,385 @@ mod js { let node = nodes.get(nth as _).unwrap(); let node_name = node.node_name(); let tag = node_name.as_str(); - match tag { - "BR" => { - dom.append_child(DomNode::new_line_break()); - } - "#text" => match node.node_value() { - Some(value) => { - let is_inside_code_block = - self.current_path.contains(&CodeBlock); - let is_only_child_in_parent = number_of_nodes == 1; - convert_text( - value.as_str(), - dom, - is_inside_code_block, - is_only_child_in_parent, - ); + let mut invalid_node_error: Option = None; + let mut skip_children: bool = false; + + // Check if we're inside a list and this node is not a list item + if parent_kind == DomNodeKind::List + && tag != "LI" + && html_source != HtmlSource::GoogleDoc + { + // If we are inside a list, we can only have list items. + invalid_node_error = Some(Error::InvalidListItemNode); + skip_children = true; + } + + if invalid_node_error.is_none() { + match tag { + "BR" => { + dom.append_child(DomNode::new_line_break()); } - _ => {} - }, - "A" => { - self.current_path.push(DomNodeKind::Link); + "#text" => match node.node_value() { + Some(value) => { + let is_inside_code_block = + self.current_path.contains(&CodeBlock); + let is_only_child_in_parent = + number_of_nodes == 1; + convert_text( + value.as_str(), + dom, + is_inside_code_block, + is_only_child_in_parent, + ); + } + _ => {} + }, - let mut attributes = vec![]; - // we only need to pass in a style attribute from web to allow CSS variable insertion - let valid_attributes = ["style"]; + "A" => { + self.current_path.push(DomNodeKind::Link); - for attr in valid_attributes.into_iter() { - if node - .unchecked_ref::() - .has_attribute(attr) - { - attributes.push(( - attr.into(), - node.unchecked_ref::() - .get_attribute(attr) - .unwrap_or_default() - .into(), - )) - } - } + let mut attributes = vec![]; + // we only need to pass in a style attribute 
from web to allow CSS variable insertion + let valid_attributes = ["style"]; - let url = node - .unchecked_ref::() - .get_attribute("href") - .unwrap_or_default(); - - let is_mention = - Mention::is_valid_uri(&url.to_string()); - let text = node.child_nodes().get(0); - let has_text = match text.clone() { - Some(node) => { - node.node_type() == web_sys::Node::TEXT_NODE - } - None => false, - }; - if has_text && is_mention { - dom.append_child( - DomNode::Mention( - DomNode::new_mention( - url.into(), - text.unwrap() - .node_value() + for attr in valid_attributes.into_iter() { + if node + .unchecked_ref::() + .has_attribute(attr) + { + attributes.push(( + attr.into(), + node.unchecked_ref::() + .get_attribute(attr) .unwrap_or_default() .into(), - attributes, - ) - .unwrap(), - ), // we unwrap because we have already confirmed the uri is valid - ); - } else { - let children = self - .convert( - node.child_nodes(), - DomNodeKind::Link, - html_source, - )? - .take_children(); - dom.append_child(DomNode::new_link( - url.into(), - children, - attributes, - )); + )) + } + } + + let url = node + .unchecked_ref::() + .get_attribute("href") + .unwrap_or_default(); + + let is_mention = + Mention::is_valid_uri(&url.to_string()); + let text = node.child_nodes().get(0); + let has_text = match text.clone() { + Some(node) => { + node.node_type() == web_sys::Node::TEXT_NODE + } + None => false, + }; + if has_text && is_mention { + dom.append_child( + DomNode::Mention( + DomNode::new_mention( + url.into(), + text.unwrap() + .node_value() + .unwrap_or_default() + .into(), + attributes, + ) + .unwrap(), + ), // we unwrap because we have already confirmed the uri is valid + ); + } else { + let children = self + .convert( + node.child_nodes(), + DomNodeKind::Link, + html_source, + )? 
+ .take_children(); + dom.append_child(DomNode::new_link( + url.into(), + children, + attributes, + )); + } + + self.current_path.pop(); } + "UL" | "OL" => { + let custom_start = node + .unchecked_ref::() + .get_attribute("start"); + + let attributes: Option> = + if tag == "OL" && custom_start.is_some() { + Some(vec![( + "start".into(), + custom_start.unwrap().into(), + )]) + } else { + None + }; - self.current_path.pop(); - } - "UL" | "OL" => { - let custom_start = node - .unchecked_ref::() - .get_attribute("start"); - - let attributes: Option> = - if tag == "ol" && custom_start.is_some() { - Some(vec![( - "start".into(), - custom_start.unwrap().into(), - )]) + let list_type = if tag == "OL" { + ListType::Ordered } else { - None + ListType::Unordered }; - let list_type = if tag == "OL" { - ListType::Ordered - } else { - ListType::Unordered - }; + if parent_kind == DomNodeKind::List { + // Google docs adds nested lists as children of the list node, this breaks our invariants. + // For the google docs case, we can add the nested list to the last list item instead. + if html_source != HtmlSource::GoogleDoc + || dom.last_child_mut().is_none() + || dom + .last_child_mut() + .unwrap() + .is_list_item() + == false + { + // If source is not Google Docs or the last child is not a list item, we return an error. + invalid_node_error = + Some(Error::InvalidListItemNode); + } else { + self.current_path.push(DomNodeKind::List); + let target = dom + .last_child_mut() + .unwrap() + .as_container_mut() + .unwrap(); + target.append_child(DomNode::Container( + ContainerNode::new_list( + list_type, + self.convert( + node.child_nodes(), + DomNodeKind::List, + html_source, + )? + .take_children(), + attributes, + ), + )); + self.current_path.pop(); + } + } else { + self.current_path.push(DomNodeKind::List); + dom.append_child(DomNode::Container( + ContainerNode::new_list( + list_type, + self.convert( + node.child_nodes(), + DomNodeKind::List, + html_source, + )? 
+ .take_children(), + attributes, + ), + )); + self.current_path.pop(); + } + } - if parent_kind == DomNodeKind::List { - if html_source != HtmlSource::GoogleDoc { - return Err(Error::InvalidListItemNode); + "LI" => { + if parent_kind != DomNodeKind::List { + invalid_node_error = + Some(Error::ParentNotAList); + } else { + self.current_path.push(DomNodeKind::ListItem); + dom.append_child(DomNode::Container( + ContainerNode::new_list_item( + self.convert( + node.child_nodes(), + DomNodeKind::ListItem, + html_source, + )? + .take_children(), + ), + )); + self.current_path.pop(); } - self.current_path.push(DomNodeKind::List); - let target = dom - .last_child_mut() - .unwrap() - .as_container_mut() - .unwrap(); - target.append_child(DomNode::Container( - ContainerNode::new_list( - list_type, + } + + "PRE" => { + self.current_path.push(DomNodeKind::CodeBlock); + let children = node.child_nodes(); + let children = if children.length() == 1 + && children.get(0).unwrap().node_name().as_str() + == "CODE" + { + let code_node = children.get(0).unwrap(); + code_node.child_nodes() + } else { + children + }; + dom.append_child(DomNode::Container( + ContainerNode::new_code_block( self.convert( - node.child_nodes(), - DomNodeKind::List, + children, + DomNodeKind::CodeBlock, html_source, )? .take_children(), - attributes, ), )); - } else { + self.current_path.pop(); + } + + "BLOCKQUOTE" => { + self.current_path.push(DomNodeKind::Quote); dom.append_child(DomNode::Container( - ContainerNode::new_list( - list_type, + ContainerNode::new_quote( self.convert( node.child_nodes(), - DomNodeKind::List, + DomNodeKind::Quote, html_source, )? .take_children(), - attributes, ), )); + self.current_path.pop(); } - self.current_path.pop(); - } - - "LI" => { - self.current_path.push(DomNodeKind::ListItem); - dom.append_child(DomNode::Container( - ContainerNode::new_list_item( - self.convert( - node.child_nodes(), - DomNodeKind::ListItem, - html_source, - )? 
- .take_children(), - ), - )); - self.current_path.pop(); - } - - "PRE" => { - self.current_path.push(DomNodeKind::CodeBlock); - let children = node.child_nodes(); - let children = if children.length() == 1 - && children.get(0).unwrap().node_name().as_str() - == "CODE" - { - let code_node = children.get(0).unwrap(); - code_node.child_nodes() - } else { - children - }; - dom.append_child(DomNode::Container( - ContainerNode::new_code_block( - self.convert( - children, - DomNodeKind::CodeBlock, - html_source, - )? - .take_children(), - ), - )); - self.current_path.pop(); - } - - "BLOCKQUOTE" => { - self.current_path.push(DomNodeKind::Quote); - dom.append_child(DomNode::Container( - ContainerNode::new_quote( - self.convert( - node.child_nodes(), - DomNodeKind::Quote, - html_source, - )? - .take_children(), - ), - )); - self.current_path.pop(); - } - - "P" => { - self.current_path.push(DomNodeKind::Paragraph); - dom.append_child(DomNode::Container( - ContainerNode::new_paragraph( - self.convert( - node.child_nodes(), - DomNodeKind::Paragraph, - html_source, - )? - .take_children(), - ), - )); - self.current_path.pop(); - } - node_name => { - let formatting_kind = match node_name { - "STRONG" | "B" => Some(InlineFormatType::Bold), - "EM" | "I" => Some(InlineFormatType::Italic), - "DEL" => Some(InlineFormatType::StrikeThrough), - "U" => Some(InlineFormatType::Underline), - "CODE" => Some(InlineFormatType::InlineCode), - "SPAN" => { - if html_source == HtmlSource::Matrix { - return Err(Error::UnknownNode( - node_name.to_owned(), - )); + "P" => { + self.current_path.push(DomNodeKind::Paragraph); + dom.append_child(DomNode::Container( + ContainerNode::new_paragraph( + self.convert( + node.child_nodes(), + DomNodeKind::Paragraph, + html_source, + )? 
+ .take_children(), + ), + )); + self.current_path.pop(); + } + node_name => { + let formatting_kind = match node_name { + "STRONG" | "B" => Some(InlineFormatType::Bold), + "EM" | "I" => Some(InlineFormatType::Italic), + "DEL" => Some(InlineFormatType::StrikeThrough), + "U" => Some(InlineFormatType::Underline), + "CODE" => Some(InlineFormatType::InlineCode), + "SPAN" => { + if html_source == HtmlSource::Matrix { + invalid_node_error = + Some(Error::UnknownNode( + node_name.to_owned(), + )); + None + } else { + // For external sources, we check for common formatting styles for spans + // and convert them to appropriate formatting nodes. + let style = node + .unchecked_ref::() + .style(); + if style + .get_property_value("font-weight") + .unwrap_or_default() + == "bold" + { + Some(InlineFormatType::Bold) + } else if style + .get_property_value("font-style") + .unwrap_or_default() + == "italic" + { + Some(InlineFormatType::Italic) + } else if style + .get_property_value( + "text-decoration", + ) + .unwrap_or_default() + == "underline" + { + Some(InlineFormatType::Underline) + } else if style + .get_property_value( + "text-decoration", + ) + .unwrap_or_default() + == "line-through" + { + Some( + InlineFormatType::StrikeThrough, + ) + } else { + invalid_node_error = + Some(Error::UnknownNode( + node_name.to_owned(), + )); + None + } + } } - // For external sources, we check for common formatting styles for spans - // and convert them to appropriate formatting nodes. 
- let style = - node.unchecked_ref::().style(); - if style - .get_property_value("font-weight") - .unwrap_or_default() - == "bold" - { - Some(InlineFormatType::Bold) - } else if style - .get_property_value("font-style") - .unwrap_or_default() - == "italic" - { - Some(InlineFormatType::Italic) - } else if style - .get_property_value("text-decoration") - .unwrap_or_default() - == "underline" - { - Some(InlineFormatType::Underline) - } else if style - .get_property_value("text-decoration") - .unwrap_or_default() - == "line-through" - { - Some(InlineFormatType::StrikeThrough) - } else { + _ => { + invalid_node_error = + Some(Error::UnknownNode( + node_name.to_owned(), + )); None } - } - _ => { - if html_source == HtmlSource::Matrix { - return Err(Error::UnknownNode( - node_name.to_owned(), + }; + + if let Some(formatting_kind) = formatting_kind { + // Special case for code inside code blocks - skip the inline code formatting + if formatting_kind + == InlineFormatType::InlineCode + && self.current_path.contains(&CodeBlock) + { + let children_nodes = self + .convert( + node.child_nodes(), + parent_kind.clone(), + html_source, + )? + .take_children(); + if !children_nodes.is_empty() { + dom.append_children(children_nodes); + } + } else { + self.current_path.push( + DomNodeKind::Formatting( + formatting_kind.clone(), + ), + ); + let children_nodes = self + .convert( + node.child_nodes(), + DomNodeKind::Formatting( + formatting_kind.clone(), + ), + html_source, + )? + .take_children(); + + dom.append_child(DomNode::Container( + ContainerNode::new_formatting( + formatting_kind.clone(), + children_nodes, + ), )); + self.current_path.pop(); } - None } - }; - - if let Some(formatting_kind) = formatting_kind { - self.current_path.push(DomNodeKind::Formatting( - formatting_kind.clone(), - )); - let children_nodes = self - .convert( - node.child_nodes(), - DomNodeKind::Formatting( - formatting_kind.clone(), - ), - html_source, - )? 
- .take_children(); - self.current_path.push(DomNodeKind::Formatting( - formatting_kind.clone(), - )); + } + } + } - dom.append_child(DomNode::Container( - ContainerNode::new_formatting( - formatting_kind.clone(), - children_nodes, - ), - )); - self.current_path.pop(); - } else { - // If it's an external source we skip the node and process it's children. - let children_nodes = self - .convert( - node.child_nodes(), - parent_kind.clone(), - html_source, - )? - .take_children(); - if !children_nodes.is_empty() { - dom.append_children(children_nodes); - } + // Handle invalid node errors + if let Some(err) = invalid_node_error { + if html_source == HtmlSource::Matrix { + return Err(err); + } else if !skip_children { + // If the source is not Matrix and we haven't explicitly flagged to skip the children continue to parse them. + let children_nodes = self + .convert( + node.child_nodes(), + parent_kind.clone(), + html_source, + )? + .take_children(); + if !children_nodes.is_empty() { + dom.append_children(children_nodes); } } } @@ -1921,6 +2041,7 @@ mod js { NoBody, UnknownNode(String), InvalidListItemNode, + ParentNotAList, } impl fmt::Display for Error { @@ -1942,6 +2063,9 @@ mod js { "Invalid list item node: a list must only contain list items" ) } + Self::ParentNotAList => { + write!(formatter, "Parent node is not a list") + } } } } @@ -1982,6 +2106,15 @@ mod js { roundtrip("foo bar baz"); } + #[wasm_bindgen_test] + fn parse_insert_text_directly_into_a_list() { + let html = r#"
      • hello
      • list item
      "#; + let dom: Dom = HtmlParser::default() + .parse_from_source(html, HtmlSource::UnknownExternal) + .unwrap(); + assert_eq!(dom.to_html(), r#"
      • hello
      "#); + } + #[wasm_bindgen_test] fn google_doc_rich_text() { let dom = HtmlParser::default() @@ -2013,7 +2146,7 @@ mod js { HtmlSource::UnknownExternal, ) .unwrap(); - assert_eq!(dom.to_string(), "
      1. Italic

      1. Bold

      1. Unformatted

      1. Strikethrough

      1. Underlined

      1. Linked

      • Nested

      "); + assert_eq!(dom.to_string(), "
      1. Italic

      2. Bold

      3. Unformatted

      4. Strikethrough

      5. Underlined

      6. Linked

      • Nested

      "); } #[wasm_bindgen_test] @@ -2057,17 +2190,23 @@ mod js { #[wasm_bindgen_test] fn ul() { - roundtrip("foo
      • item1
      • item2
      bar"); + roundtrip( + "

      foo

      • item1
      • item2

      bar

      ", + ); } #[wasm_bindgen_test] fn ol() { - roundtrip("foo
      1. item1
      2. item2
      bar"); + roundtrip( + "

      foo

      1. item1
      2. item2

      bar

      ", + ); } #[wasm_bindgen_test] fn pre() { - roundtrip("foo
      ~Some code
      bar"); + roundtrip( + "

      foo

      ~Some code

      bar

      ", + ); } #[wasm_bindgen_test] @@ -2098,7 +2237,9 @@ mod js { #[wasm_bindgen_test] fn blockquote() { - roundtrip("foo
      ~Some code
      bar"); + roundtrip( + "

      foo

      ~Some code

      bar

      ", + ); } #[wasm_bindgen_test] @@ -2129,7 +2270,8 @@ mod js {

       

      \
       \n 
      \

       

      "; - let dom = HtmlParser::default().parse::(html).unwrap(); + let dom: Dom = + HtmlParser::default().parse::(html).unwrap(); let tree = dom.to_tree().to_string(); assert_eq!( tree, From 92bdca22c33b458bb2d04b6739fb31dcbabc73c3 Mon Sep 17 00:00:00 2001 From: David Langley Date: Wed, 23 Jul 2025 15:03:07 +0100 Subject: [PATCH 13/17] Add support for parsing font-weight == 700 in addition to font-weight == "bold". This fixes the pasting bold content from google docs(and possibly other sources). --- .../src/composer_model/replace_html.rs | 2 +- crates/wysiwyg/src/dom/parser/parse.rs | 20 ++++++++++++++----- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/crates/wysiwyg/src/composer_model/replace_html.rs b/crates/wysiwyg/src/composer_model/replace_html.rs index f7b1dfe24..fdb3b0755 100644 --- a/crates/wysiwyg/src/composer_model/replace_html.rs +++ b/crates/wysiwyg/src/composer_model/replace_html.rs @@ -100,7 +100,7 @@ mod test { let html_str = html.to_string(); assert!(!html_str.contains("
    9. Italic

    10. Bold

    11. Unformatted

    12. Strikethrough

    13. Underlined

    14. Linked

      • Nested

    "); + assert_eq!(html_str, "
    1. Italic

    2. Bold

    3. Unformatted

    4. Strikethrough

    5. Underlined

    6. Linked

      • Nested

    "); } #[test] diff --git a/crates/wysiwyg/src/dom/parser/parse.rs b/crates/wysiwyg/src/dom/parser/parse.rs index d388f6837..289e1d79b 100644 --- a/crates/wysiwyg/src/dom/parser/parse.rs +++ b/crates/wysiwyg/src/dom/parser/parse.rs @@ -262,7 +262,9 @@ mod sys { // For external sources, we check for common formatting styles for spans // and convert them to appropriate formatting nodes. let mut formatting_tag = None; - if child.contains_style("font-weight", "bold") { + if child.contains_style("font-weight", "bold") + || child.contains_style("font-weight", "700") + { formatting_tag = Some("b"); } else if child.contains_style("font-style", "italic") { formatting_tag = Some("i"); @@ -1174,6 +1176,7 @@ mod sys { ) .unwrap(); let tree = dom.to_tree().to_string(); + println!("{}", tree); assert_eq!( tree, indoc! { @@ -1186,7 +1189,8 @@ mod sys { │ └>"Italic" ├>li │ └>p - │ └>"Bold" + │ └>b + │ └>"Bold" ├>li │ └>p │ └>"Unformatted" @@ -1214,7 +1218,7 @@ mod sys { dom.to_markdown().unwrap().to_string(), indoc! {r#" 1. *Italic* - 2. Bold + 2. __Bold__ 3. Unformatted 4. ~~Strikethrough~~ 5. Underlined @@ -1915,6 +1919,12 @@ mod js { .get_property_value("font-weight") .unwrap_or_default() == "bold" + || style + .get_property_value( + "font-weight", + ) + .unwrap_or_default() + == "700" { Some(InlineFormatType::Bold) } else if style @@ -2123,12 +2133,12 @@ mod js { HtmlSource::GoogleDoc, ) .unwrap(); - assert_eq!(dom.to_string(), "
    1. Italic

    2. Bold

    3. Unformatted

    4. Strikethrough

    5. Underlined

    6. Linked

      • Nested

    "); + assert_eq!(dom.to_string(), "
    1. Italic

    2. Bold

    3. Unformatted

    4. Strikethrough

    5. Underlined

    6. Linked

      • Nested

    "); assert_eq!( dom.to_markdown().unwrap().to_string(), indoc! {r#" 1. *Italic* - 2. Bold + 2. __Bold__ 3. Unformatted 4. ~~Strikethrough~~ 5. Underlined From 868d0ab2f2652972f06056e86bb93455a5aca39c Mon Sep 17 00:00:00 2001 From: David Langley Date: Wed, 23 Jul 2025 16:03:05 +0100 Subject: [PATCH 14/17] lint rust and ts --- crates/wysiwyg/src/dom/parser/panode_container.rs | 2 +- platforms/web/lib/composer.ts | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/crates/wysiwyg/src/dom/parser/panode_container.rs b/crates/wysiwyg/src/dom/parser/panode_container.rs index ab4aa5de4..1491f8941 100644 --- a/crates/wysiwyg/src/dom/parser/panode_container.rs +++ b/crates/wysiwyg/src/dom/parser/panode_container.rs @@ -48,4 +48,4 @@ fn test_contains_style() { }; assert!(node.contains_style("font-weight", "bold")); assert!(!node.contains_style("font-weight", "normal")); -} \ No newline at end of file +} diff --git a/platforms/web/lib/composer.ts b/platforms/web/lib/composer.ts index 19f890db5..297ccece2 100644 --- a/platforms/web/lib/composer.ts +++ b/platforms/web/lib/composer.ts @@ -69,11 +69,13 @@ export function processInput( const clipboardData = event.clipboardData; const htmlData = clipboardData?.getData('text/html'); const plainData = clipboardData?.getData('text/plain') ?? ''; - + if (htmlData && htmlData !== plainData) { const htmlSource = clipboardData?.types.includes( 'application/x-vnd.google-docs-document-slice-clip+wrapped', - ) ? HtmlSource.GoogleDoc : HtmlSource.UnknownExternal; + ) + ? 
HtmlSource.GoogleDoc + : HtmlSource.UnknownExternal; return action( composerModel.replace_html(htmlData, htmlSource), 'replace_html_paste', From edc0ab83fa8fb8c4a8e89c69b2bb2abb4bc52f3a Mon Sep 17 00:00:00 2001 From: David Langley Date: Wed, 23 Jul 2025 16:11:48 +0100 Subject: [PATCH 15/17] clippy --- crates/wysiwyg/src/dom/dom_struct.rs | 4 ++-- crates/wysiwyg/src/dom/parser/panode_container.rs | 9 ++++----- crates/wysiwyg/src/dom/parser/parse.rs | 10 ++++++---- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/crates/wysiwyg/src/dom/dom_struct.rs b/crates/wysiwyg/src/dom/dom_struct.rs index d84d9bdde..eacf6a7ea 100644 --- a/crates/wysiwyg/src/dom/dom_struct.rs +++ b/crates/wysiwyg/src/dom/dom_struct.rs @@ -298,14 +298,14 @@ where pub fn location_for_node(&self, node_handle: &DomHandle) -> DomLocation { let locations = find_range::find_range(self, 0, usize::MAX); - return locations.find_location(node_handle).unwrap().clone(); + locations.find_location(node_handle).unwrap().clone() } pub fn locations_for_node( &self, node_handle: &DomHandle, ) -> Vec { - let result = find_range::find_pos(self, &node_handle, 0, usize::MAX); + let result = find_range::find_pos(self, node_handle, 0, usize::MAX); match result { FindResult::Found(locations) => locations, _ => panic!("Node does not exist"), diff --git a/crates/wysiwyg/src/dom/parser/panode_container.rs b/crates/wysiwyg/src/dom/parser/panode_container.rs index 1491f8941..fe6644a35 100644 --- a/crates/wysiwyg/src/dom/parser/panode_container.rs +++ b/crates/wysiwyg/src/dom/parser/panode_container.rs @@ -24,18 +24,17 @@ impl PaNodeContainer { } pub(crate) fn contains_style(&self, name: &str, value: &str) -> bool { - return self - .get_attr("style") + self.get_attr("style") .map(|v| { - return Regex::new(&format!( + Regex::new(&format!( r"(?i){}:\s*{};", regex::escape(name), regex::escape(value) )) .map(|re| re.is_match(v)) - .unwrap_or(false); + .unwrap_or(false) }) - .unwrap_or(false); + .unwrap_or(false) } } 
diff --git a/crates/wysiwyg/src/dom/parser/parse.rs b/crates/wysiwyg/src/dom/parser/parse.rs index 289e1d79b..c12e6da63 100644 --- a/crates/wysiwyg/src/dom/parser/parse.rs +++ b/crates/wysiwyg/src/dom/parser/parse.rs @@ -88,7 +88,7 @@ mod sys { where S: UnicodeString, { - return self.parse_internal(html, HtmlSource::Matrix); + self.parse_internal(html, HtmlSource::Matrix) } pub(super) fn parse_from_source( @@ -304,8 +304,10 @@ mod sys { // For the google docs case, we can add the nested list to the last list item instead. if html_source != HtmlSource::GoogleDoc || node.last_child_mut().is_none() - || node.last_child_mut().unwrap().is_list_item() - == false + || !node + .last_child_mut() + .unwrap() + .is_list_item() { // If source is not Google Docs or the last child is not a list item, we return an error. invalid_node_error = @@ -1309,7 +1311,7 @@ fn post_process_blocks(mut dom: Dom) -> Dom { let block_handles = find_blocks(&dom); for handle in block_handles.iter().rev() { dom = post_process_block_lines(dom, handle); - dom.join_nodes_in_container(&handle); + dom.join_nodes_in_container(handle); } dom } From f349ee9b2d6e3533e7ad0250c11aa8643aef9326 Mon Sep 17 00:00:00 2001 From: David Langley Date: Wed, 23 Jul 2025 17:20:52 +0100 Subject: [PATCH 16/17] Fix web test --- crates/wysiwyg/src/dom/parser/parse.rs | 1 - platforms/web/lib/composer.test.ts | 46 ++++++++++++++++++++------ platforms/web/lib/composer.ts | 2 -- 3 files changed, 36 insertions(+), 13 deletions(-) diff --git a/crates/wysiwyg/src/dom/parser/parse.rs b/crates/wysiwyg/src/dom/parser/parse.rs index c12e6da63..766f3f57e 100644 --- a/crates/wysiwyg/src/dom/parser/parse.rs +++ b/crates/wysiwyg/src/dom/parser/parse.rs @@ -1178,7 +1178,6 @@ mod sys { ) .unwrap(); let tree = dom.to_tree().to_string(); - println!("{}", tree); assert_eq!( tree, indoc! 
{ diff --git a/platforms/web/lib/composer.test.ts b/platforms/web/lib/composer.test.ts index 883d4660d..fcaceba9a 100644 --- a/platforms/web/lib/composer.test.ts +++ b/platforms/web/lib/composer.test.ts @@ -6,7 +6,7 @@ SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial Please see LICENSE in the repository root for full details. */ -import { ComposerModel } from '@vector-im/matrix-wysiwyg-wasm'; +import { ComposerModel, HtmlSource } from '@vector-im/matrix-wysiwyg-wasm'; import { processInput } from './composer'; import { FormattingFunctions } from './types'; @@ -14,6 +14,7 @@ import { FormattingFunctions } from './types'; // mocks and spies const mockComposerModel = { replace_text: vi.fn(), + replace_html: vi.fn(), code_block: vi.fn(), backspace_word: vi.fn(), delete_word: vi.fn(), @@ -187,16 +188,29 @@ describe('processInput', () => { }); it('handles truthy and falsy data from clipboard with replace_text', () => { - const sampleContent = ['clipboardData', null]; + const sampleContent = [ + ['clipboardData', 'clipboardData'], + [null, 'clipboardData'], + [null, null], + ]; sampleContent.forEach((clipboardContent) => { const e = new ClipboardEvent('some clipboard event'); - const mockGetter = vi.fn().mockReturnValue(clipboardContent); + const mockGetter = vi.fn().mockImplementation((type) => { + if (type === 'text/html') { + return clipboardContent[0]; + } else { + return clipboardContent[1]; + } + }); // We can't easily generate the correct type here, so disable ts // eslint-disable-next-line @typescript-eslint/ban-ts-comment // @ts-ignore - e.clipboardData = { getData: mockGetter }; + e.clipboardData = { + getData: mockGetter, + types: ['text/html', 'text/plain'], + }; processInput( e, @@ -207,12 +221,24 @@ describe('processInput', () => { mockSuggestion, ); - expect(mockGetter).toHaveBeenCalledTimes(1); - expect(mockComposerModel.replace_text).toHaveBeenCalledWith( - // falsy values are defaulted to empty string - clipboardContent || '', - ); 
- expect(mockAction).toHaveBeenCalledWith(undefined, 'paste'); + expect(mockGetter).toHaveBeenCalledTimes(2); + if (clipboardContent[0]) { + expect(mockComposerModel.replace_html).toHaveBeenCalledWith( + // falsy values are defaulted to empty string + clipboardContent[0] || '', + HtmlSource.UnknownExternal, + ); + } else { + expect(mockComposerModel.replace_text).toHaveBeenCalledWith( + // falsy values are defaulted to empty string + clipboardContent[1] || '', + ); + } + + const action = clipboardContent[0] + ? 'replace_html_paste' + : 'replace_text_paste'; + expect(mockAction).toHaveBeenCalledWith(undefined, action); }); }); diff --git a/platforms/web/lib/composer.ts b/platforms/web/lib/composer.ts index 297ccece2..75e0dfb06 100644 --- a/platforms/web/lib/composer.ts +++ b/platforms/web/lib/composer.ts @@ -79,13 +79,11 @@ export function processInput( return action( composerModel.replace_html(htmlData, htmlSource), 'replace_html_paste', - htmlData, ); } return action( composerModel.replace_text(plainData), 'replace_text_paste', - plainData, ); } From e11808145025dc6725f4ec42182c66fa65255c0a Mon Sep 17 00:00:00 2001 From: David Langley Date: Wed, 23 Jul 2025 19:14:41 +0100 Subject: [PATCH 17/17] Re use the same html chunk in our constants. Add better docs. --- .../src/composer_model/replace_html.rs | 31 +++++++-------- crates/wysiwyg/src/dom/parser.rs | 5 +++ crates/wysiwyg/src/dom/parser/parse.rs | 39 ++++++++++++++++--- 3 files changed, 52 insertions(+), 23 deletions(-) diff --git a/crates/wysiwyg/src/composer_model/replace_html.rs b/crates/wysiwyg/src/composer_model/replace_html.rs index fdb3b0755..22191933c 100644 --- a/crates/wysiwyg/src/composer_model/replace_html.rs +++ b/crates/wysiwyg/src/composer_model/replace_html.rs @@ -71,30 +71,26 @@ where } } -#[cfg(test)] -const GOOGLE_DOC_HTML_PASTEBOARD: &str = r#" -
    1. Italic

    2. Bold

    3. Unformatted

    4. Strikethrough

    5. Underlined

    6. Linked

      • Nested

    - "#; -#[cfg(test)] -const MS_DOC_HTML_PASTEBOARD: &str = r#" -
    1. Italic 

    1. Bold 

    1. Unformatted 

    1. Strikethrough 

    1. Underlined 

    • Nested

    - "#; - #[cfg(test)] mod test { - use super::*; use crate::dom::html_source::HtmlSource; + use crate::dom::parser::{ + GOOGLE_DOC_HTML_PASTEBOARD, MS_DOC_HTML_PASTEBOARD, + }; use crate::tests::testutils_composer_model::cm; #[test] fn test_replace_html_strips_meta_tags_google_docs() { let mut model = cm("|"); - let _ = model.replace_html( - GOOGLE_DOC_HTML_PASTEBOARD.into(), - HtmlSource::GoogleDoc, + // This html was copied directly from google docs and we are including the meta and bold tags that google docs adds. + let html = format!( + r#"{}"#, + GOOGLE_DOC_HTML_PASTEBOARD ); + let _ = model.replace_html(html.into(), HtmlSource::GoogleDoc); + // Verify the HTML doesn't contain meta or the outer b tag let html = model.get_content_as_html(); let html_str = html.to_string(); @@ -107,10 +103,11 @@ mod test { fn test_replace_html_strips_only_meta_tags_ms_docs() { let mut model = cm("|"); - let _ = model.replace_html( - MS_DOC_HTML_PASTEBOARD.into(), - HtmlSource::UnknownExternal, - ); + // This html was copied directly from ms docs and we are including the meta and bold tags that ms docs adds. 
+ let html = + format!(r#"{}"#, MS_DOC_HTML_PASTEBOARD); + + let _ = model.replace_html(html.into(), HtmlSource::UnknownExternal); let html = model.get_content_as_html(); let html_str = html.to_string(); diff --git a/crates/wysiwyg/src/dom/parser.rs b/crates/wysiwyg/src/dom/parser.rs index c845f1951..574d889a9 100644 --- a/crates/wysiwyg/src/dom/parser.rs +++ b/crates/wysiwyg/src/dom/parser.rs @@ -50,3 +50,8 @@ use sys::*; pub use parse::parse; pub use parse::parse_from_source; + +#[cfg(test)] +pub use parse::GOOGLE_DOC_HTML_PASTEBOARD; +#[cfg(test)] +pub use parse::MS_DOC_HTML_PASTEBOARD; diff --git a/crates/wysiwyg/src/dom/parser/parse.rs b/crates/wysiwyg/src/dom/parser/parse.rs index 766f3f57e..8827a092a 100644 --- a/crates/wysiwyg/src/dom/parser/parse.rs +++ b/crates/wysiwyg/src/dom/parser/parse.rs @@ -46,14 +46,41 @@ where } } +/* These html fragments were copied directly from google docs/ms docs(minus the cleanup/stripping we do in "replace_html" function) and represents the following content: +└>ol + ├>li + │ └>p + │ └>i + │ └>"Italic" + ├>li + │ └>p + │ └>b + │ └>"Bold" + ├>li + │ └>p + │ └>"Unformatted" + ├>li + │ └>p + │ └>del + │ └>"Strikethrough" + ├>li + │ └>p + │ └>u + │ └>"Underlined" + └>li + ├>p + │ └>a "http://matrix.org" + │ └>u + │ └>"Linked" + └>ul + └>li + └>p + └>"Nested" +*/ #[cfg(test)] -const GOOGLE_DOC_HTML_PASTEBOARD: &str = r#" -
    1. Italic

    2. Bold

    3. Unformatted

    4. Strikethrough

    5. Underlined

    6. Linked

      • Nested

    - "#; +pub const GOOGLE_DOC_HTML_PASTEBOARD: &str = r#"
    1. Italic

    2. Bold

    3. Unformatted

    4. Strikethrough

    5. Underlined

    6. Linked

      • Nested

    "#; #[cfg(test)] -const MS_DOC_HTML_PASTEBOARD: &str = r#" -
    1. Italic 

    1. Bold 

    1. Unformatted 

    1. Strikethrough 

    1. Underlined 

    • Nested

    - "#; +pub const MS_DOC_HTML_PASTEBOARD: &str = r#"
    1. Italic 

    1. Bold 

    1. Unformatted 

    1. Strikethrough 

    1. Underlined 

    • Nested

    "#; #[cfg(feature = "sys")] mod sys {