From 38ab9d2c819f19b06386ceb137fe438f59f0b11a Mon Sep 17 00:00:00 2001
From: David Langley
Date: Wed, 2 Jul 2025 15:13:21 +0100
Subject: [PATCH 01/17] Add parsing support for html from external
sources (pasteboard from Google Docs and MS Word)
---
crates/wysiwyg/Cargo.toml | 2 +-
crates/wysiwyg/src/dom/parser/parse.rs | 199 ++++++++++++++++++++-----
2 files changed, 162 insertions(+), 39 deletions(-)
diff --git a/crates/wysiwyg/Cargo.toml b/crates/wysiwyg/Cargo.toml
index ebbd5880b..8d298a5a6 100644
--- a/crates/wysiwyg/Cargo.toml
+++ b/crates/wysiwyg/Cargo.toml
@@ -26,7 +26,7 @@ strum = "0.27"
strum_macros = "0.27"
unicode-segmentation = "1.7.1"
wasm-bindgen = { version = "0.2.83", default-features = false, optional = true }
-web-sys = { version = "0.3.60", default-features = false, features = ["Document", "DomParser", "HtmlElement", "Node", "NodeList", "SupportedType"], optional = true }
+web-sys = { version = "0.3.60", default-features = false, features = ["Document", "DomParser", "HtmlElement", "Node", "NodeList", "SupportedType", "CssStyleDeclaration"], optional = true }
widestring = "1.0.2"
indoc = "2.0"
url="2.3.1"
diff --git a/crates/wysiwyg/src/dom/parser/parse.rs b/crates/wysiwyg/src/dom/parser/parse.rs
index 743f35c9f..15300dbc7 100644
--- a/crates/wysiwyg/src/dom/parser/parse.rs
+++ b/crates/wysiwyg/src/dom/parser/parse.rs
@@ -1162,7 +1162,9 @@ mod js {
use matrix_mentions::Mention;
use std::fmt;
use wasm_bindgen::JsCast;
- use web_sys::{Document, DomParser, Element, NodeList, SupportedType};
+ use web_sys::{
+ Document, DomParser, Element, HtmlElement, NodeList, SupportedType,
+ };
pub(super) struct HtmlParser {
current_path: Vec,
@@ -1178,6 +1180,27 @@ mod js {
&mut self,
html: &str,
) -> Result, HtmlParseError>
+ where
+ S: UnicodeString,
+ {
+ self.parse_internal(html, false)
+ }
+
+ pub(super) fn parse_from_external_html_source(
+ &mut self,
+ html: &str,
+ ) -> Result, HtmlParseError>
+ where
+ S: UnicodeString,
+ {
+ self.parse_internal(html, true)
+ }
+
+ fn parse_internal(
+ &mut self,
+ html: &str,
+ external_html_source: bool,
+ ) -> Result, HtmlParseError>
where
S: UnicodeString,
{
@@ -1195,7 +1218,7 @@ mod js {
)
})?;
- self.webdom_to_dom(document)
+ self.webdom_to_dom(document, external_html_source)
.map_err(to_dom_creation_error)
.map(post_process_blocks)
}
@@ -1203,15 +1226,20 @@ mod js {
fn webdom_to_dom(
&mut self,
webdoc: Document,
+ external_html_source: bool,
) -> Result, Error>
where
S: UnicodeString,
{
let body = webdoc.body().ok_or_else(|| Error::NoBody)?;
- self.convert(body.child_nodes())
+ self.convert(body.child_nodes(), external_html_source)
}
- fn convert(&mut self, nodes: NodeList) -> Result, Error>
+ fn convert(
+ &mut self,
+ nodes: NodeList,
+ external_html_source: bool,
+ ) -> Result, Error>
where
S: UnicodeString,
{
@@ -1219,7 +1247,7 @@ mod js {
let mut dom = Dom::new(Vec::with_capacity(number_of_nodes));
let dom_document = dom.document_mut();
- self.convert_container(nodes, dom_document)?;
+ self.convert_container(nodes, dom_document, external_html_source)?;
Ok(dom)
}
@@ -1228,6 +1256,7 @@ mod js {
&mut self,
nodes: NodeList,
dom: &mut ContainerNode,
+ external_html_source: bool,
) -> Result<(), Error>
where
S: UnicodeString,
@@ -1309,7 +1338,10 @@ mod js {
);
} else {
let children = self
- .convert(node.child_nodes())?
+ .convert(
+ node.child_nodes(),
+ external_html_source,
+ )?
.take_children();
dom.append_child(DomNode::new_link(
url.into(),
@@ -1329,8 +1361,11 @@ mod js {
dom.append_child(DomNode::Container(
ContainerNode::new_list(
ListType::Ordered,
- self.convert(node.child_nodes())?
- .take_children(),
+ self.convert(
+ node.child_nodes(),
+ external_html_source,
+ )?
+ .take_children(),
if let Some(custom_start) = custom_start {
Some(vec![(
"start".into(),
@@ -1349,8 +1384,11 @@ mod js {
dom.append_child(DomNode::Container(
ContainerNode::new_list(
ListType::Unordered,
- self.convert(node.child_nodes())?
- .take_children(),
+ self.convert(
+ node.child_nodes(),
+ external_html_source,
+ )?
+ .take_children(),
None,
),
));
@@ -1361,8 +1399,11 @@ mod js {
self.current_path.push(DomNodeKind::ListItem);
dom.append_child(DomNode::Container(
ContainerNode::new_list_item(
- self.convert(node.child_nodes())?
- .take_children(),
+ self.convert(
+ node.child_nodes(),
+ external_html_source,
+ )?
+ .take_children(),
),
));
self.current_path.pop();
@@ -1382,7 +1423,8 @@ mod js {
};
dom.append_child(DomNode::Container(
ContainerNode::new_code_block(
- self.convert(children)?.take_children(),
+ self.convert(children, external_html_source)?
+ .take_children(),
),
));
self.current_path.pop();
@@ -1392,8 +1434,11 @@ mod js {
self.current_path.push(DomNodeKind::Quote);
dom.append_child(DomNode::Container(
ContainerNode::new_quote(
- self.convert(node.child_nodes())?
- .take_children(),
+ self.convert(
+ node.child_nodes(),
+ external_html_source,
+ )?
+ .take_children(),
),
));
self.current_path.pop();
@@ -1403,40 +1448,87 @@ mod js {
self.current_path.push(DomNodeKind::Paragraph);
dom.append_child(DomNode::Container(
ContainerNode::new_paragraph(
- self.convert(node.child_nodes())?
- .take_children(),
+ self.convert(
+ node.child_nodes(),
+ external_html_source,
+ )?
+ .take_children(),
),
));
self.current_path.pop();
}
-
node_name => {
- let children_nodes =
- self.convert(node.child_nodes())?.take_children();
-
+ let children_nodes = self
+ .convert(node.child_nodes(), external_html_source)?
+ .take_children();
let formatting_kind = match node_name {
- "STRONG" | "B" => InlineFormatType::Bold,
- "EM" | "I" => InlineFormatType::Italic,
- "DEL" => InlineFormatType::StrikeThrough,
- "U" => InlineFormatType::Underline,
- "CODE" => InlineFormatType::InlineCode,
+ "STRONG" | "B" => Some(InlineFormatType::Bold),
+ "EM" | "I" => Some(InlineFormatType::Italic),
+ "DEL" => Some(InlineFormatType::StrikeThrough),
+ "U" => Some(InlineFormatType::Underline),
+ "CODE" => Some(InlineFormatType::InlineCode),
+ "SPAN" => {
+ if !external_html_source {
+ return Err(Error::UnknownNode(
+ node_name.to_owned(),
+ ));
+ }
+ let style =
+ node.unchecked_ref::().style();
+ if style
+ .get_property_value("font-weight")
+ .unwrap_or_default()
+ == "bold"
+ {
+ Some(InlineFormatType::Bold)
+ } else if style
+ .get_property_value("font-style")
+ .unwrap_or_default()
+ == "italic"
+ {
+ Some(InlineFormatType::Italic)
+ } else if style
+ .get_property_value("text-decoration")
+ .unwrap_or_default()
+ == "underline"
+ {
+ Some(InlineFormatType::Underline)
+ } else if style
+ .get_property_value("text-decoration")
+ .unwrap_or_default()
+ == "line-through"
+ {
+ Some(InlineFormatType::StrikeThrough)
+ } else {
+ None
+ }
+ }
_ => {
- return Err(Error::UnknownNode(
- node_name.to_owned(),
- ))
+ if !external_html_source {
+ return Err(Error::UnknownNode(
+ node_name.to_owned(),
+ ));
+ }
+ None
}
};
- self.current_path.push(DomNodeKind::Formatting(
- formatting_kind.clone(),
- ));
+ if formatting_kind.is_none() {
+ if !children_nodes.is_empty() {
+ dom.append_children(children_nodes);
+ }
+ } else {
+ self.current_path.push(DomNodeKind::Formatting(
+ formatting_kind.clone().unwrap(),
+ ));
- dom.append_child(DomNode::Container(
- ContainerNode::new_formatting(
- formatting_kind,
- children_nodes,
- ),
- ));
+ dom.append_child(DomNode::Container(
+ ContainerNode::new_formatting(
+ formatting_kind.unwrap(),
+ children_nodes,
+ ),
+ ));
+ }
self.current_path.pop();
}
}
@@ -1512,6 +1604,37 @@ mod js {
roundtrip("foo bar baz");
}
+ #[wasm_bindgen_test]
+ fn google_doc_rich_text() {
+ let html = r#"
+ Italic
Bold
Unformatted
Strikethrough
Underlined
Linked
+ "#;
+ let dom = HtmlParser::default()
+ .parse_from_external_html_source::(html)
+ .unwrap();
+ assert_eq!(dom.to_string(), "Italic
Bold
Unformatted
Strikethrough
Underlined
Linked
");
+ }
+
+ #[wasm_bindgen_test]
+ fn ms_rich_text() {
+ let html = r#"
+
+ "#;
+ let dom = HtmlParser::default()
+ .parse_from_external_html_source::(html)
+ .unwrap();
+ assert_eq!(dom.to_string(), "");
+ }
+
+ #[wasm_bindgen_test]
+ fn unknown_tag_errors() {
+ let html = r#"
+ Bold
+ "#;
+ let result = HtmlParser::default().parse::(html);
+ assert_eq!(result.is_err(), true);
+ }
+
#[wasm_bindgen_test]
fn br() {
let html = "foo
bar";
From 8a4837ad643ed822ed0d6d36ad5962885e6e4647 Mon Sep 17 00:00:00 2001
From: David Langley
Date: Wed, 2 Jul 2025 20:12:03 +0100
Subject: [PATCH 02/17] Add contains_style function to check if a pa node
contains a style attribute of a particular value
---
.../src/dom/parser/panode_container.rs | 27 +++++++++++++++++++
1 file changed, 27 insertions(+)
diff --git a/crates/wysiwyg/src/dom/parser/panode_container.rs b/crates/wysiwyg/src/dom/parser/panode_container.rs
index 4d83e8f18..ab4aa5de4 100644
--- a/crates/wysiwyg/src/dom/parser/panode_container.rs
+++ b/crates/wysiwyg/src/dom/parser/panode_container.rs
@@ -5,6 +5,7 @@
// Please see LICENSE in the repository root for full details.
use html5ever::QualName;
+use regex::Regex;
use super::PaDomHandle;
@@ -21,4 +22,30 @@ impl PaNodeContainer {
.find(|(n, _v)| n == name)
.map(|(_n, v)| v.as_str())
}
+
+ pub(crate) fn contains_style(&self, name: &str, value: &str) -> bool {
+ return self
+ .get_attr("style")
+ .map(|v| {
+ return Regex::new(&format!(
+ r"(?i){}:\s*{};",
+ regex::escape(name),
+ regex::escape(value)
+ ))
+ .map(|re| re.is_match(v))
+ .unwrap_or(false);
+ })
+ .unwrap_or(false);
+ }
}
+
+#[test]
+fn test_contains_style() {
+ let node = PaNodeContainer {
+ name: QualName::new(None, "div".into(), "div".into()),
+ attrs: vec![("style".into(), "font-weight:bold;".into())],
+ children: Vec::new(),
+ };
+ assert!(node.contains_style("font-weight", "bold"));
+ assert!(!node.contains_style("font-weight", "normal"));
+}
\ No newline at end of file
From 90caa7f78f3fae558a5f06b913f93be85245c326 Mon Sep 17 00:00:00 2001
From: David Langley
Date: Wed, 2 Jul 2025 20:14:54 +0100
Subject: [PATCH 03/17] Add support for external_html_source/span to
sys parser
and bring its error handling in line with the js parser so that it returns an error if external_html_source is false and it encounters tags it doesn't support.
---
crates/wysiwyg/src/dom/parser/parse.rs | 302 +++++++++++++++++++++----
1 file changed, 261 insertions(+), 41 deletions(-)
diff --git a/crates/wysiwyg/src/dom/parser/parse.rs b/crates/wysiwyg/src/dom/parser/parse.rs
index 15300dbc7..3687c44e4 100644
--- a/crates/wysiwyg/src/dom/parser/parse.rs
+++ b/crates/wysiwyg/src/dom/parser/parse.rs
@@ -27,8 +27,19 @@ where
}
}
+#[cfg(test)]
+const GOOGLE_DOC_HTML_PASTEBOARD: &str = r#"
+ Italic
Bold
Unformatted
Strikethrough
Underlined
Linked
+ "#;
+#[cfg(test)]
+const MS_DOC_HTML_PASTEBOARD: &str = r#"
+
+ "#;
+
#[cfg(feature = "sys")]
mod sys {
+ use std::fmt;
+
use matrix_mentions::Mention;
use super::super::padom_node::PaDomNode;
@@ -58,14 +69,37 @@ mod sys {
where
S: UnicodeString,
{
- PaDomCreator::parse(html)
- .map(|pa_dom| {
- let dom = self.padom_to_dom(pa_dom);
- post_process_blocks(dom)
- })
- .map_err(|err| {
- self.padom_creation_error_to_html_parse_error(err)
- })
+ return self.parse_internal(html, true);
+ }
+
+ pub(super) fn parse_from_external_html_source(
+ &mut self,
+ html: &str,
+ ) -> Result, HtmlParseError>
+ where
+ S: UnicodeString,
+ {
+ self.parse_internal(html, true)
+ }
+
+ pub(super) fn parse_internal(
+ &mut self,
+ html: &str,
+ external_html_source: bool,
+ ) -> Result, HtmlParseError>
+ where
+ S: UnicodeString,
+ {
+ let pa_dom = PaDomCreator::parse(html).map_err(|err| {
+ self.padom_creation_error_to_html_parse_error(err)
+ })?;
+
+ let dom = self.padom_to_dom(pa_dom, external_html_source).map_err(
+ |err| HtmlParseError {
+ parse_errors: vec![err.to_string()],
+ },
+ )?;
+ Ok(post_process_blocks(dom))
}
/// Convert a [PaDom] into a [Dom].
@@ -80,7 +114,11 @@ mod sys {
///
/// [Dom] is for general use. Parent nodes own their children, and Dom may be
/// cloned, compared, and converted into an HTML string.
- fn padom_to_dom(&mut self, padom: PaDom) -> Dom
+ fn padom_to_dom(
+ &mut self,
+ padom: PaDom,
+ external_html_source: bool,
+ ) -> Result, Error>
where
S: UnicodeString,
{
@@ -88,11 +126,11 @@ mod sys {
let doc = ret.document_mut();
if let PaDomNode::Document(padoc) = padom.get_document() {
- self.convert(&padom, padoc, doc)
+ self.convert(&padom, padoc, doc, external_html_source)?;
} else {
- panic!("Document was not a document!");
+ return Err(Error::NoBody);
}
- ret
+ Ok(ret)
}
/// Copy all panode's information into node.
@@ -101,14 +139,21 @@ mod sys {
padom: &PaDom,
panode: &PaNodeContainer,
node: &mut ContainerNode,
- ) where
+ external_html_source: bool,
+ ) -> Result<(), Error>
+ where
S: UnicodeString,
{
for child_handle in &panode.children {
let child = padom.get_node(child_handle);
match child {
PaDomNode::Container(child) => {
- self.convert_container(padom, child, node);
+ self.convert_container(
+ padom,
+ child,
+ node,
+ external_html_source,
+ )?;
}
PaDomNode::Document(_) => {
panic!("Found a document inside a document!")
@@ -128,6 +173,7 @@ mod sys {
}
}
}
+ Ok(())
}
/// Copy all panode's information into node (now we know it's a container).
@@ -136,7 +182,9 @@ mod sys {
padom: &PaDom,
child: &PaNodeContainer,
node: &mut ContainerNode,
- ) where
+ external_html_source: bool,
+ ) -> Result<(), Error>
+ where
S: UnicodeString,
{
let cur_path_idx = self.current_path.len();
@@ -145,7 +193,12 @@ mod sys {
"b" | "code" | "del" | "em" | "i" | "strong" | "u" => {
let formatting_node = Self::new_formatting(tag);
if tag == "code" && self.current_path.contains(&CodeBlock) {
- self.convert_children(padom, child, Some(node));
+ self.convert_children(
+ padom,
+ child,
+ Some(node),
+ external_html_source,
+ )?;
} else {
self.current_path.push(formatting_node.kind());
node.append_child(formatting_node);
@@ -153,10 +206,51 @@ mod sys {
padom,
child,
last_container_mut_in(node),
- );
+ external_html_source,
+ )?;
self.current_path.remove(cur_path_idx);
}
}
+ "span" => {
+ let mut formatting_tag = None;
+ if child.contains_style("font-weight", "bold") {
+ formatting_tag = Some("b");
+ } else if child.contains_style("font-style", "italic") {
+ formatting_tag = Some("i");
+ } else if child
+ .contains_style("text-decoration", "underline")
+ {
+ formatting_tag = Some("u");
+ } else if child
+ .contains_style("text-decoration", "line-through")
+ {
+ formatting_tag = Some("del");
+ }
+
+ if let Some(tag) = formatting_tag {
+ let formatting_node = Self::new_formatting(tag);
+ self.current_path.push(formatting_node.kind());
+ node.append_child(formatting_node);
+ self.convert_children(
+ padom,
+ child,
+ last_container_mut_in(node),
+ external_html_source,
+ )?;
+ self.current_path.remove(cur_path_idx);
+ } else {
+ if external_html_source {
+ self.convert(
+ padom,
+ child,
+ node,
+ external_html_source,
+ )?;
+ } else {
+ return Err(Error::UnknownNode(tag.to_string()));
+ }
+ }
+ }
"br" => {
node.append_child(Self::new_line_break());
}
@@ -174,7 +268,8 @@ mod sys {
padom,
child,
last_container_mut_in(node),
- );
+ external_html_source,
+ )?;
self.current_path.remove(cur_path_idx);
}
"li" => {
@@ -184,7 +279,8 @@ mod sys {
padom,
child,
last_container_mut_in(node),
- );
+ external_html_source,
+ )?;
self.current_path.remove(cur_path_idx);
}
"a" => {
@@ -212,7 +308,8 @@ mod sys {
padom,
child,
last_container_mut_in(node),
- );
+ external_html_source,
+ )?;
}
self.current_path.remove(cur_path_idx);
}
@@ -223,7 +320,8 @@ mod sys {
padom,
child,
last_container_mut_in(node),
- );
+ external_html_source,
+ )?;
self.current_path.remove(cur_path_idx);
}
"blockquote" => {
@@ -233,14 +331,15 @@ mod sys {
padom,
child,
last_container_mut_in(node),
- );
+ external_html_source,
+ )?;
self.current_path.remove(cur_path_idx);
}
"html" => {
// Skip the html tag - add its children to the
// current node directly.
- self.convert(padom, child, node);
+ self.convert(padom, child, node, external_html_source)?;
}
"p" => {
self.current_path.push(DomNodeKind::Paragraph);
@@ -249,14 +348,19 @@ mod sys {
padom,
child,
last_container_mut_in(node),
- );
+ external_html_source,
+ )?;
self.current_path.remove(cur_path_idx);
}
_ => {
- // Ignore tags we don't recognise
- // We should log - see internal task PSU-741
+ if external_html_source {
+ self.convert(padom, child, node, external_html_source)?;
+ } else {
+ return Err(Error::UnknownNode(tag.to_string()));
+ }
}
};
+ Ok(())
}
/// Recurse into panode's children and convert them too
@@ -265,14 +369,17 @@ mod sys {
padom: &PaDom,
child: &PaNodeContainer,
new_node: Option<&mut ContainerNode>,
- ) where
+ external_html_source: bool,
+ ) -> Result<(), Error>
+ where
S: UnicodeString,
{
if let Some(new_node) = new_node {
- self.convert(padom, child, new_node);
+ self.convert(padom, child, new_node, external_html_source)?;
} else {
panic!("Container became non-container!");
}
+ Ok(())
}
/// Create a formatting node
@@ -403,6 +510,27 @@ mod sys {
}
}
+ enum Error {
+ NoBody,
+ UnknownNode(String),
+ }
+
+ impl fmt::Display for Error {
+ fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> fmt::Result {
+ match self {
+ Self::NoBody => {
+ write!(
+ formatter,
+ "The `Document` does not have a `` element"
+ )
+ }
+ Self::UnknownNode(node_name) => {
+ write!(formatter, "Node `{node_name}` is not supported")
+ }
+ }
+ }
+ }
+
#[cfg(test)]
mod test {
use crate::dom::parser::parse::sys::HtmlParser;
@@ -790,10 +918,9 @@ mod sys {
fn parse_code_block_post_processes_it() {
let mut parser = HtmlParser::default();
let html = "Test\nCode
";
- let dom: Dom = PaDomCreator::parse(html)
- .map(|pa_dom| parser.padom_to_dom(pa_dom))
- .ok()
- .unwrap();
+ let pa_dom = PaDomCreator::parse(html).unwrap();
+ let dom: Dom =
+ parser.padom_to_dom(pa_dom, false).ok().unwrap();
// First, line breaks are added as placeholders for paragraphs
assert_eq!(
dom.to_html().to_string(),
@@ -936,6 +1063,100 @@ mod sys {
"#}
);
}
+
+ #[test]
+ fn parse_google_doc_rich_text() {
+ let dom: Dom = HtmlParser::default()
+ .parse(GOOGLE_DOC_HTML_PASTEBOARD)
+ .unwrap();
+ let tree = dom.to_tree().to_string();
+ assert_eq!(
+ tree,
+ indoc! {
+ r#"
+
+ └>ul
+ ├>li
+ │ └>p
+ │ └>i
+ │ └>"Italic"
+ ├>li
+ │ └>p
+ │ └>"Bold"
+ ├>li
+ │ └>p
+ │ └>"Unformatted"
+ ├>li
+ │ └>p
+ │ └>del
+ │ └>"Strikethrough"
+ ├>li
+ │ └>p
+ │ └>u
+ │ └>"Underlined"
+ ├>li
+ │ └>p
+ │ └>a "http://matrix.org"
+ │ └>u
+ │ └>"Linked"
+ └>ul
+ └>li
+ └>p
+ └>u
+ └>"nested"
+ "#
+ }
+ );
+ }
+
+ #[test]
+ fn parse_ms_doc_rich_text() {
+ let dom: Dom =
+ HtmlParser::default().parse(MS_DOC_HTML_PASTEBOARD).unwrap();
+ let tree = dom.to_tree().to_string();
+ assert_eq!(
+ tree,
+ indoc! {
+ r#"
+
+ ├>ul
+ │ └>li
+ │ └>p
+ │ └>i
+ │ └>"Italic"
+ ├>ul
+ │ └>li
+ │ └>p
+ │ └>b
+ │ └>"Bold"
+ ├>ul
+ │ └>li
+ │ └>p
+ │ └>"Unformatted"
+ ├>ul
+ │ └>li
+ │ └>p
+ │ └>del
+ │ └>"Strikethrough"
+ ├>ul
+ │ └>li
+ │ └>p
+ │ └>u
+ │ └>"Underlined"
+ ├>ul
+ │ └>li
+ │ └>p
+ │ └>a "https://matrix.org/"
+ │ └>u
+ │ └>"Linked"
+ └>ul
+ └>li
+ └>p
+ └>"nested"
+ "#
+ }
+ );
+ }
}
}
@@ -1517,6 +1738,7 @@ mod js {
if !children_nodes.is_empty() {
dom.append_children(children_nodes);
}
+ self.current_path.pop();
} else {
self.current_path.push(DomNodeKind::Formatting(
formatting_kind.clone().unwrap(),
@@ -1528,8 +1750,8 @@ mod js {
children_nodes,
),
));
+ self.current_path.pop();
}
- self.current_path.pop();
}
}
}
@@ -1606,22 +1828,20 @@ mod js {
#[wasm_bindgen_test]
fn google_doc_rich_text() {
- let html = r#"
- Italic
Bold
Unformatted
Strikethrough
Underlined
Linked
- "#;
let dom = HtmlParser::default()
- .parse_from_external_html_source::(html)
+ .parse_from_external_html_source::(
+ GOOGLE_DOC_HTML_PASTEBOARD,
+ )
.unwrap();
assert_eq!(dom.to_string(), "Italic
Bold
Unformatted
Strikethrough
Underlined
Linked
");
}
#[wasm_bindgen_test]
fn ms_rich_text() {
- let html = r#"
-
- "#;
let dom = HtmlParser::default()
- .parse_from_external_html_source::(html)
+ .parse_from_external_html_source::(
+ MS_DOC_HTML_PASTEBOARD,
+ )
.unwrap();
assert_eq!(dom.to_string(), "");
}
From d83a3fba303e75f5a1f9f3b8b18802817ff303d3 Mon Sep 17 00:00:00 2001
From: David Langley
Date: Thu, 3 Jul 2025 20:30:50 +0100
Subject: [PATCH 04/17] Add top level functions and e2e test
---
crates/wysiwyg/src/composer_model.rs | 1 +
.../src/composer_model/replace_html.rs | 52 +++++++++++++
crates/wysiwyg/src/dom/parser.rs | 1 +
crates/wysiwyg/src/dom/parser/parse.rs | 78 +++++++++++--------
.../web/cypress/e2e/clipboard/paste.spec.ts | 26 +++++++
5 files changed, 127 insertions(+), 31 deletions(-)
create mode 100644 crates/wysiwyg/src/composer_model/replace_html.rs
diff --git a/crates/wysiwyg/src/composer_model.rs b/crates/wysiwyg/src/composer_model.rs
index e655a9174..835003dbb 100644
--- a/crates/wysiwyg/src/composer_model.rs
+++ b/crates/wysiwyg/src/composer_model.rs
@@ -17,6 +17,7 @@ pub mod menu_action;
pub mod menu_state;
pub mod new_lines;
pub mod quotes;
+pub mod replace_html;
pub mod replace_text;
pub mod selection;
pub mod undo_redo;
diff --git a/crates/wysiwyg/src/composer_model/replace_html.rs b/crates/wysiwyg/src/composer_model/replace_html.rs
new file mode 100644
index 000000000..946248842
--- /dev/null
+++ b/crates/wysiwyg/src/composer_model/replace_html.rs
@@ -0,0 +1,52 @@
+// Copyright 2024 New Vector Ltd.
+// Copyright 2022 The Matrix.org Foundation C.I.C.
+//
+// SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial
+// Please see LICENSE in the repository root for full details.
+
+use crate::dom::nodes::{ContainerNodeKind, DomNode};
+use crate::dom::parser::parse_from_external_html_source;
+use crate::{parse, ComposerModel, ComposerUpdate, Location, UnicodeString};
+
+impl ComposerModel
+where
+ S: UnicodeString,
+{
+ /// Replaces text in the current selection with new_html.
+ /// Treats its input as html that is parsed into a DomNode and inserted into
+ /// the document at the cursor.
+ pub fn replace_html(
+ &mut self,
+ new_html: S,
+ from_external_source: bool,
+ ) -> ComposerUpdate {
+ self.push_state_to_history();
+ if self.has_selection() {
+ self.do_replace_text(S::default());
+ }
+ let result = if from_external_source {
+ parse_from_external_html_source(&new_html.to_string())
+ } else {
+ parse(&new_html.to_string())
+ };
+
+ let dom = result.unwrap().into_document_node();
+
+ let (start, end) = self.safe_selection();
+ let range = self.state.dom.find_range(start, end);
+
+ let new_cursor_index = start + dom.text_len();
+ let handle = self.state.dom.insert_node_at_cursor(&range, dom);
+
+ // manually move the cursor to the end of the html
+ self.state.start = Location::from(new_cursor_index);
+ self.state.end = self.state.start;
+
+ // add a trailing space in cases when we do not have a next sibling
+ if self.state.dom.is_last_in_parent(&handle) {
+ self.do_replace_text(" ".into())
+ } else {
+ self.create_update_replace_all()
+ }
+ }
+}
diff --git a/crates/wysiwyg/src/dom/parser.rs b/crates/wysiwyg/src/dom/parser.rs
index e7c79b458..0c2d24eaf 100644
--- a/crates/wysiwyg/src/dom/parser.rs
+++ b/crates/wysiwyg/src/dom/parser.rs
@@ -49,3 +49,4 @@ mod sys {
use sys::*;
pub use parse::parse;
+pub use parse::parse_from_external_html_source;
diff --git a/crates/wysiwyg/src/dom/parser/parse.rs b/crates/wysiwyg/src/dom/parser/parse.rs
index 3687c44e4..a8fd59571 100644
--- a/crates/wysiwyg/src/dom/parser/parse.rs
+++ b/crates/wysiwyg/src/dom/parser/parse.rs
@@ -27,10 +27,27 @@ where
}
}
+pub fn parse_from_external_html_source(
+ html: &str,
+) -> Result, HtmlParseError>
+where
+ S: UnicodeString,
+{
+ cfg_if::cfg_if! {
+ if #[cfg(feature = "sys")] {
+ sys::HtmlParser::default().parse_from_external_html_source(html)
+ } else if #[cfg(all(feature = "js", target_arch = "wasm32"))] {
+ js::HtmlParser::default().parse_from_external_html_source(html)
+ } else {
+ unreachable!("The `sys` or `js` are mutually exclusive, and one of them must be enabled.")
+ }
+ }
+}
+
#[cfg(test)]
const GOOGLE_DOC_HTML_PASTEBOARD: &str = r#"
- Italic
Bold
Unformatted
Strikethrough
Underlined
Linked
- "#;
+ Italic
Bold
Unformatted
Strikethrough
Underlined
Linked
+ "#;
#[cfg(test)]
const MS_DOC_HTML_PASTEBOARD: &str = r#"
@@ -1074,36 +1091,35 @@ mod sys {
tree,
indoc! {
r#"
-
+
+ └>ul
+ ├>li
+ │ └>p
+ │ └>i
+ │ └>"Italic"
+ ├>li
+ │ └>p
+ │ └>"Bold"
+ ├>li
+ │ └>p
+ │ └>"Unformatted"
+ ├>li
+ │ └>p
+ │ └>del
+ │ └>"Strikethrough"
+ ├>li
+ │ └>p
+ │ └>u
+ │ └>"Underlined"
+ ├>li
+ │ └>p
+ │ └>a "http://matrix.org"
+ │ └>u
+ │ └>"Linked"
└>ul
- ├>li
- │ └>p
- │ └>i
- │ └>"Italic"
- ├>li
- │ └>p
- │ └>"Bold"
- ├>li
- │ └>p
- │ └>"Unformatted"
- ├>li
- │ └>p
- │ └>del
- │ └>"Strikethrough"
- ├>li
- │ └>p
- │ └>u
- │ └>"Underlined"
- ├>li
- │ └>p
- │ └>a "http://matrix.org"
- │ └>u
- │ └>"Linked"
- └>ul
└>li
- └>p
- └>u
- └>"nested"
+ └>p
+ └>"nested"
"#
}
);
@@ -1833,7 +1849,7 @@ mod js {
GOOGLE_DOC_HTML_PASTEBOARD,
)
.unwrap();
- assert_eq!(dom.to_string(), "Italic
Bold
Unformatted
Strikethrough
Underlined
Linked
");
+ assert_eq!(dom.to_string(), "Italic
Bold
Unformatted
Strikethrough
Underlined
Linked
");
}
#[wasm_bindgen_test]
diff --git a/platforms/web/cypress/e2e/clipboard/paste.spec.ts b/platforms/web/cypress/e2e/clipboard/paste.spec.ts
index 698b8ba16..b46fd7603 100644
--- a/platforms/web/cypress/e2e/clipboard/paste.spec.ts
+++ b/platforms/web/cypress/e2e/clipboard/paste.spec.ts
@@ -35,4 +35,30 @@ describe('Paste', () => {
// Note: we used to test it 'should convert pasted newlines into BRs' but
// the test was flakey, sometimes correctly showing text containing br tags,
// and sometimes mysteriously showing converted into two divs.
+
+ it(
+ 'should display pasted richtext after we type',
+ { browser: 'electron' },
+ () => {
+ cy.visit('/');
+ cy.get(editor).wait(500);
+ cy.get(editor).type('BEFORE');
+ cy.contains(editor, 'BEFORE');
+
+ cy.window().its('navigator.clipboard')
+ .then(async (clip) => {
+ const blob = new Blob(["link"], {type: 'text/html'});
+ const item = new ClipboardItem({'text/html': blob});
+ return await clip.write([item]);
+ })
+
+ cy.log("item");
+ cy.document().invoke('execCommand', 'paste');
+ cy.contains(editor, 'BEFORElink');
+
+ cy.get(editor).type('AFTER');
+ cy.contains(editor, /^BEFORElink AFTER/);
+ },
+ );
+
});
From 9744b3a8ce500de716755d0ef7cc0ed075a1d967 Mon Sep 17 00:00:00 2001
From: David Langley
Date: Fri, 4 Jul 2025 12:10:04 +0100
Subject: [PATCH 05/17] Add HtmlSource and stripping of meta tags and the outer
b tag for google docs.
---
.../src/composer_model/replace_html.rs | 110 +++++++--
crates/wysiwyg/src/dom.rs | 2 +
crates/wysiwyg/src/dom/html_source.rs | 6 +
crates/wysiwyg/src/dom/parser.rs | 2 +-
crates/wysiwyg/src/dom/parser/parse.rs | 211 +++++++++---------
crates/wysiwyg/src/lib.rs | 1 +
platforms/web/lib/composer.ts | 22 +-
7 files changed, 226 insertions(+), 128 deletions(-)
create mode 100644 crates/wysiwyg/src/dom/html_source.rs
diff --git a/crates/wysiwyg/src/composer_model/replace_html.rs b/crates/wysiwyg/src/composer_model/replace_html.rs
index 946248842..22cf3f3c2 100644
--- a/crates/wysiwyg/src/composer_model/replace_html.rs
+++ b/crates/wysiwyg/src/composer_model/replace_html.rs
@@ -4,8 +4,10 @@
// SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial
// Please see LICENSE in the repository root for full details.
-use crate::dom::nodes::{ContainerNodeKind, DomNode};
-use crate::dom::parser::parse_from_external_html_source;
+use regex::Regex;
+
+use crate::dom::html_source::HtmlSource;
+use crate::dom::parser::parse_from_source;
use crate::{parse, ComposerModel, ComposerUpdate, Location, UnicodeString};
impl ComposerModel
@@ -18,35 +20,105 @@ where
pub fn replace_html(
&mut self,
new_html: S,
- from_external_source: bool,
+ external_source: HtmlSource,
) -> ComposerUpdate {
self.push_state_to_history();
if self.has_selection() {
self.do_replace_text(S::default());
}
- let result = if from_external_source {
- parse_from_external_html_source(&new_html.to_string())
+
+ let meta_regex = Regex::new(r"]*>").unwrap();
+ let mut cleaned_html = meta_regex
+ .replace_all(&new_html.to_string(), "")
+ .to_string();
+
+ if external_source == HtmlSource::GoogleDoc {
+ // Strip first b tag (opening and closing)
+ let b_regex = Regex::new(r"]*>(.*)<\/b>").unwrap();
+ cleaned_html = b_regex.replace(&cleaned_html, "$1").to_string();
+ }
+
+ println!("cleaned_html: {}", cleaned_html);
+ let result = if external_source == HtmlSource::Matrix {
+ parse(&cleaned_html.to_string())
} else {
- parse(&new_html.to_string())
+ parse_from_source(&cleaned_html.to_string(), external_source)
};
- let dom = result.unwrap().into_document_node();
+ // We should have only one top level dom node, so add each of the children at the cursor.
+ let dom_children = result.unwrap().into_container().take_children();
- let (start, end) = self.safe_selection();
- let range = self.state.dom.find_range(start, end);
+ for node in dom_children.iter() {
+ let (start, end) = self.safe_selection();
+ let range = self.state.dom.find_range(start, end);
- let new_cursor_index = start + dom.text_len();
- let handle = self.state.dom.insert_node_at_cursor(&range, dom);
+ let new_cursor_index = start + node.text_len();
+ let _ = self.state.dom.insert_node_at_cursor(&range, node.clone());
- // manually move the cursor to the end of the html
- self.state.start = Location::from(new_cursor_index);
- self.state.end = self.state.start;
+ // manually move the cursor to the end of the html
+ self.state.start = Location::from(new_cursor_index);
+ self.state.end = self.state.start;
+ }
// add a trailing space in cases when we do not have a next sibling
- if self.state.dom.is_last_in_parent(&handle) {
- self.do_replace_text(" ".into())
- } else {
- self.create_update_replace_all()
- }
+ self.create_update_replace_all()
+ }
+}
+
+#[cfg(test)]
+const GOOGLE_DOC_HTML_PASTEBOARD: &str = r#"test"#;
+#[cfg(test)]
+const MS_DOC_HTML_PASTEBOARD: &str = r#"test "#;
+
+// ...existing code...
+
+#[cfg(test)]
+mod test {
+ use super::*;
+ use crate::dom::html_source::HtmlSource;
+ use crate::tests::testutils_composer_model::cm;
+
+ #[test]
+ fn test_replace_html_strips_meta_tags_google_docs() {
+ let mut model = cm("|");
+
+ let _ = model.replace_html(
+ GOOGLE_DOC_HTML_PASTEBOARD.into(),
+ HtmlSource::GoogleDoc,
+ );
+
+ // Verify the HTML doesn't contain meta or the outer b tag
+ let html = model.get_content_as_html();
+ let html_str = html.to_string();
+ assert!(!html_str.contains("test
");
}
}
diff --git a/crates/wysiwyg/src/dom.rs b/crates/wysiwyg/src/dom.rs
index d0ae3cb34..5bad8e8b1 100644
--- a/crates/wysiwyg/src/dom.rs
+++ b/crates/wysiwyg/src/dom.rs
@@ -15,6 +15,7 @@ pub mod dom_struct;
pub mod find_extended_range;
pub mod find_range;
pub mod find_result;
+pub mod html_source;
pub mod insert_node_at_cursor;
pub mod insert_parent;
pub mod iter;
@@ -35,6 +36,7 @@ pub use dom_creation_error::MarkdownParseError;
pub use dom_handle::DomHandle;
pub use dom_struct::Dom;
pub use find_result::FindResult;
+pub use html_source::HtmlSource;
pub use range::DomLocation;
pub use range::Range;
pub use to_html::ToHtml;
diff --git a/crates/wysiwyg/src/dom/html_source.rs b/crates/wysiwyg/src/dom/html_source.rs
new file mode 100644
index 000000000..7707b0b83
--- /dev/null
+++ b/crates/wysiwyg/src/dom/html_source.rs
@@ -0,0 +1,6 @@
+#[derive(Copy, Clone, PartialEq, Eq)]
+pub enum HtmlSource {
+ Matrix,
+ GoogleDoc,
+ UnknownExternal,
+}
diff --git a/crates/wysiwyg/src/dom/parser.rs b/crates/wysiwyg/src/dom/parser.rs
index 0c2d24eaf..c845f1951 100644
--- a/crates/wysiwyg/src/dom/parser.rs
+++ b/crates/wysiwyg/src/dom/parser.rs
@@ -49,4 +49,4 @@ mod sys {
use sys::*;
pub use parse::parse;
-pub use parse::parse_from_external_html_source;
+pub use parse::parse_from_source;
diff --git a/crates/wysiwyg/src/dom/parser/parse.rs b/crates/wysiwyg/src/dom/parser/parse.rs
index a8fd59571..d639ae855 100644
--- a/crates/wysiwyg/src/dom/parser/parse.rs
+++ b/crates/wysiwyg/src/dom/parser/parse.rs
@@ -7,6 +7,7 @@
use regex::Regex;
use crate::dom::dom_creation_error::HtmlParseError;
+use crate::dom::html_source::HtmlSource;
use crate::dom::nodes::dom_node::DomNodeKind::{self};
use crate::dom::nodes::{ContainerNode, ContainerNodeKind};
use crate::dom::Dom;
@@ -27,17 +28,18 @@ where
}
}
-pub fn parse_from_external_html_source(
+pub fn parse_from_source(
html: &str,
+ source: HtmlSource,
) -> Result, HtmlParseError>
where
S: UnicodeString,
{
cfg_if::cfg_if! {
if #[cfg(feature = "sys")] {
- sys::HtmlParser::default().parse_from_external_html_source(html)
+ sys::HtmlParser::default().parse_from_source(html, source)
} else if #[cfg(all(feature = "js", target_arch = "wasm32"))] {
- js::HtmlParser::default().parse_from_external_html_source(html)
+ js::HtmlParser::default().parse_from_source(html, source)
} else {
unreachable!("The `sys` or `js` are mutually exclusive, and one of them must be enabled.")
}
@@ -86,23 +88,24 @@ mod sys {
where
S: UnicodeString,
{
- return self.parse_internal(html, true);
+ return self.parse_internal(html, HtmlSource::Matrix);
}
- pub(super) fn parse_from_external_html_source(
+ pub(super) fn parse_from_source(
&mut self,
html: &str,
+ source: HtmlSource,
) -> Result, HtmlParseError>
where
S: UnicodeString,
{
- self.parse_internal(html, true)
+ self.parse_internal(html, source)
}
pub(super) fn parse_internal(
&mut self,
html: &str,
- external_html_source: bool,
+ html_source: HtmlSource,
) -> Result, HtmlParseError>
where
S: UnicodeString,
@@ -111,11 +114,12 @@ mod sys {
self.padom_creation_error_to_html_parse_error(err)
})?;
- let dom = self.padom_to_dom(pa_dom, external_html_source).map_err(
- |err| HtmlParseError {
- parse_errors: vec![err.to_string()],
- },
- )?;
+ let dom =
+ self.padom_to_dom(pa_dom, html_source).map_err(|err| {
+ HtmlParseError {
+ parse_errors: vec![err.to_string()],
+ }
+ })?;
Ok(post_process_blocks(dom))
}
@@ -134,7 +138,7 @@ mod sys {
fn padom_to_dom(
&mut self,
padom: PaDom,
- external_html_source: bool,
+ html_source: HtmlSource,
) -> Result, Error>
where
S: UnicodeString,
@@ -143,7 +147,7 @@ mod sys {
let doc = ret.document_mut();
if let PaDomNode::Document(padoc) = padom.get_document() {
- self.convert(&padom, padoc, doc, external_html_source)?;
+ self.convert(&padom, padoc, doc, html_source)?;
} else {
return Err(Error::NoBody);
}
@@ -156,7 +160,7 @@ mod sys {
padom: &PaDom,
panode: &PaNodeContainer,
node: &mut ContainerNode,
- external_html_source: bool,
+ html_source: HtmlSource,
) -> Result<(), Error>
where
S: UnicodeString,
@@ -169,7 +173,7 @@ mod sys {
padom,
child,
node,
- external_html_source,
+ html_source,
)?;
}
PaDomNode::Document(_) => {
@@ -199,7 +203,7 @@ mod sys {
padom: &PaDom,
child: &PaNodeContainer,
node: &mut ContainerNode,
- external_html_source: bool,
+ html_source: HtmlSource,
) -> Result<(), Error>
where
S: UnicodeString,
@@ -214,7 +218,7 @@ mod sys {
padom,
child,
Some(node),
- external_html_source,
+ html_source,
)?;
} else {
self.current_path.push(formatting_node.kind());
@@ -223,7 +227,7 @@ mod sys {
padom,
child,
last_container_mut_in(node),
- external_html_source,
+ html_source,
)?;
self.current_path.remove(cur_path_idx);
}
@@ -252,19 +256,14 @@ mod sys {
padom,
child,
last_container_mut_in(node),
- external_html_source,
+ html_source,
)?;
self.current_path.remove(cur_path_idx);
} else {
- if external_html_source {
- self.convert(
- padom,
- child,
- node,
- external_html_source,
- )?;
- } else {
+ if html_source == HtmlSource::Matrix {
return Err(Error::UnknownNode(tag.to_string()));
+ } else {
+ self.convert(padom, child, node, html_source)?;
}
}
}
@@ -285,7 +284,7 @@ mod sys {
padom,
child,
last_container_mut_in(node),
- external_html_source,
+ html_source,
)?;
self.current_path.remove(cur_path_idx);
}
@@ -296,7 +295,7 @@ mod sys {
padom,
child,
last_container_mut_in(node),
- external_html_source,
+ html_source,
)?;
self.current_path.remove(cur_path_idx);
}
@@ -325,7 +324,7 @@ mod sys {
padom,
child,
last_container_mut_in(node),
- external_html_source,
+ html_source,
)?;
}
self.current_path.remove(cur_path_idx);
@@ -337,7 +336,7 @@ mod sys {
padom,
child,
last_container_mut_in(node),
- external_html_source,
+ html_source,
)?;
self.current_path.remove(cur_path_idx);
}
@@ -348,7 +347,7 @@ mod sys {
padom,
child,
last_container_mut_in(node),
- external_html_source,
+ html_source,
)?;
self.current_path.remove(cur_path_idx);
@@ -356,7 +355,7 @@ mod sys {
"html" => {
// Skip the html tag - add its children to the
// current node directly.
- self.convert(padom, child, node, external_html_source)?;
+ self.convert(padom, child, node, html_source)?;
}
"p" => {
self.current_path.push(DomNodeKind::Paragraph);
@@ -365,15 +364,15 @@ mod sys {
padom,
child,
last_container_mut_in(node),
- external_html_source,
+ html_source,
)?;
self.current_path.remove(cur_path_idx);
}
_ => {
- if external_html_source {
- self.convert(padom, child, node, external_html_source)?;
- } else {
+ if html_source == HtmlSource::Matrix {
return Err(Error::UnknownNode(tag.to_string()));
+ } else {
+ self.convert(padom, child, node, html_source)?;
}
}
};
@@ -386,13 +385,13 @@ mod sys {
padom: &PaDom,
child: &PaNodeContainer,
new_node: Option<&mut ContainerNode>,
- external_html_source: bool,
+ html_source: HtmlSource,
) -> Result<(), Error>
where
S: UnicodeString,
{
if let Some(new_node) = new_node {
- self.convert(padom, child, new_node, external_html_source)?;
+ self.convert(padom, child, new_node, html_source)?;
} else {
panic!("Container became non-container!");
}
@@ -936,8 +935,10 @@ mod sys {
let mut parser = HtmlParser::default();
let html = "Test\nCode
";
let pa_dom = PaDomCreator::parse(html).unwrap();
- let dom: Dom =
- parser.padom_to_dom(pa_dom, false).ok().unwrap();
+ let dom: Dom = parser
+ .padom_to_dom(pa_dom, HtmlSource::Matrix)
+ .ok()
+ .unwrap();
// First, line breaks are added as placeholders for paragraphs
assert_eq!(
dom.to_html().to_string(),
@@ -1084,41 +1085,44 @@ mod sys {
#[test]
fn parse_google_doc_rich_text() {
let dom: Dom = HtmlParser::default()
- .parse(GOOGLE_DOC_HTML_PASTEBOARD)
+ .parse_from_source(
+ GOOGLE_DOC_HTML_PASTEBOARD,
+ HtmlSource::GoogleDoc,
+ )
.unwrap();
let tree = dom.to_tree().to_string();
assert_eq!(
tree,
indoc! {
r#"
-
- └>ul
- ├>li
- │ └>p
- │ └>i
- │ └>"Italic"
- ├>li
- │ └>p
- │ └>"Bold"
- ├>li
- │ └>p
- │ └>"Unformatted"
- ├>li
- │ └>p
- │ └>del
- │ └>"Strikethrough"
- ├>li
- │ └>p
- │ └>u
- │ └>"Underlined"
- ├>li
- │ └>p
- │ └>a "http://matrix.org"
- │ └>u
- │ └>"Linked"
+
└>ul
+ ├>li
+ │ └>p
+ │ └>i
+ │ └>"Italic"
+ ├>li
+ │ └>p
+ │ └>"Bold"
+ ├>li
+ │ └>p
+ │ └>"Unformatted"
+ ├>li
+ │ └>p
+ │ └>del
+ │ └>"Strikethrough"
+ ├>li
+ │ └>p
+ │ └>u
+ │ └>"Underlined"
+ ├>li
+ │ └>p
+ │ └>a "http://matrix.org"
+ │ └>u
+ │ └>"Linked"
+ └>ul
└>li
- └>p
+ └>p
└>"nested"
"#
}
@@ -1127,8 +1131,12 @@ mod sys {
#[test]
fn parse_ms_doc_rich_text() {
- let dom: Dom =
- HtmlParser::default().parse(MS_DOC_HTML_PASTEBOARD).unwrap();
+ let dom: Dom = HtmlParser::default()
+ .parse_from_source(
+ MS_DOC_HTML_PASTEBOARD,
+ HtmlSource::UnknownExternal,
+ )
+ .unwrap();
let tree = dom.to_tree().to_string();
assert_eq!(
tree,
@@ -1420,23 +1428,24 @@ mod js {
where
S: UnicodeString,
{
- self.parse_internal(html, false)
+ self.parse_internal(html, HtmlSource::Matrix)
}
- pub(super) fn parse_from_external_html_source(
+ pub(super) fn parse_from_source(
&mut self,
html: &str,
+ html_source: HtmlSource,
) -> Result, HtmlParseError>
where
S: UnicodeString,
{
- self.parse_internal(html, true)
+ self.parse_internal(html, html_source)
}
fn parse_internal(
&mut self,
html: &str,
- external_html_source: bool,
+ html_source: HtmlSource,
) -> Result, HtmlParseError>
where
S: UnicodeString,
@@ -1455,7 +1464,7 @@ mod js {
)
})?;
- self.webdom_to_dom(document, external_html_source)
+ self.webdom_to_dom(document, html_source)
.map_err(to_dom_creation_error)
.map(post_process_blocks)
}
@@ -1463,19 +1472,19 @@ mod js {
fn webdom_to_dom(
&mut self,
webdoc: Document,
- external_html_source: bool,
+ html_source: HtmlSource,
) -> Result, Error>
where
S: UnicodeString,
{
let body = webdoc.body().ok_or_else(|| Error::NoBody)?;
- self.convert(body.child_nodes(), external_html_source)
+ self.convert(body.child_nodes(), html_source)
}
fn convert(
&mut self,
nodes: NodeList,
- external_html_source: bool,
+ html_source: HtmlSource,
) -> Result, Error>
where
S: UnicodeString,
@@ -1484,7 +1493,7 @@ mod js {
let mut dom = Dom::new(Vec::with_capacity(number_of_nodes));
let dom_document = dom.document_mut();
- self.convert_container(nodes, dom_document, external_html_source)?;
+ self.convert_container(nodes, dom_document, html_source)?;
Ok(dom)
}
@@ -1493,7 +1502,7 @@ mod js {
&mut self,
nodes: NodeList,
dom: &mut ContainerNode,
- external_html_source: bool,
+ html_source: HtmlSource,
) -> Result<(), Error>
where
S: UnicodeString,
@@ -1575,10 +1584,7 @@ mod js {
);
} else {
let children = self
- .convert(
- node.child_nodes(),
- external_html_source,
- )?
+ .convert(node.child_nodes(), html_source)?
.take_children();
dom.append_child(DomNode::new_link(
url.into(),
@@ -1636,11 +1642,8 @@ mod js {
self.current_path.push(DomNodeKind::ListItem);
dom.append_child(DomNode::Container(
ContainerNode::new_list_item(
- self.convert(
- node.child_nodes(),
- external_html_source,
- )?
- .take_children(),
+ self.convert(node.child_nodes(), html_source)?
+ .take_children(),
),
));
self.current_path.pop();
@@ -1660,7 +1663,7 @@ mod js {
};
dom.append_child(DomNode::Container(
ContainerNode::new_code_block(
- self.convert(children, external_html_source)?
+ self.convert(children, html_source)?
.take_children(),
),
));
@@ -1671,11 +1674,8 @@ mod js {
self.current_path.push(DomNodeKind::Quote);
dom.append_child(DomNode::Container(
ContainerNode::new_quote(
- self.convert(
- node.child_nodes(),
- external_html_source,
- )?
- .take_children(),
+ self.convert(node.child_nodes(), html_source)?
+ .take_children(),
),
));
self.current_path.pop();
@@ -1685,18 +1685,15 @@ mod js {
self.current_path.push(DomNodeKind::Paragraph);
dom.append_child(DomNode::Container(
ContainerNode::new_paragraph(
- self.convert(
- node.child_nodes(),
- external_html_source,
- )?
- .take_children(),
+ self.convert(node.child_nodes(), html_source)?
+ .take_children(),
),
));
self.current_path.pop();
}
node_name => {
let children_nodes = self
- .convert(node.child_nodes(), external_html_source)?
+ .convert(node.child_nodes(), html_source)?
.take_children();
let formatting_kind = match node_name {
"STRONG" | "B" => Some(InlineFormatType::Bold),
@@ -1705,7 +1702,7 @@ mod js {
"U" => Some(InlineFormatType::Underline),
"CODE" => Some(InlineFormatType::InlineCode),
"SPAN" => {
- if !external_html_source {
+ if html_source == HtmlSource::Matrix {
return Err(Error::UnknownNode(
node_name.to_owned(),
));
@@ -1741,7 +1738,7 @@ mod js {
}
}
_ => {
- if !external_html_source {
+ if html_source == HtmlSource::Matrix {
return Err(Error::UnknownNode(
node_name.to_owned(),
));
@@ -1845,8 +1842,9 @@ mod js {
#[wasm_bindgen_test]
fn google_doc_rich_text() {
let dom = HtmlParser::default()
- .parse_from_external_html_source::(
+ .parse_from_source::(
GOOGLE_DOC_HTML_PASTEBOARD,
+ HtmlSource::GoogleDoc,
)
.unwrap();
assert_eq!(dom.to_string(), "Italic
Bold
Unformatted
Strikethrough
Underlined
Linked
");
@@ -1855,8 +1853,9 @@ mod js {
#[wasm_bindgen_test]
fn ms_rich_text() {
let dom = HtmlParser::default()
- .parse_from_external_html_source::(
+ .parse_from_source::(
MS_DOC_HTML_PASTEBOARD,
+ HtmlSource::UnknownExternal,
)
.unwrap();
assert_eq!(dom.to_string(), "");
diff --git a/crates/wysiwyg/src/lib.rs b/crates/wysiwyg/src/lib.rs
index 79a4d33e3..2319b6994 100644
--- a/crates/wysiwyg/src/lib.rs
+++ b/crates/wysiwyg/src/lib.rs
@@ -33,6 +33,7 @@ pub use crate::dom::parser::parse;
pub use crate::dom::DomCreationError;
pub use crate::dom::DomHandle;
pub use crate::dom::HtmlParseError;
+pub use crate::dom::HtmlSource;
pub use crate::dom::MarkdownParseError;
pub use crate::dom::ToHtml;
pub use crate::dom::ToRawText;
diff --git a/platforms/web/lib/composer.ts b/platforms/web/lib/composer.ts
index 56ad75a16..19f890db5 100644
--- a/platforms/web/lib/composer.ts
+++ b/platforms/web/lib/composer.ts
@@ -9,6 +9,7 @@ Please see LICENSE in the repository root for full details.
import {
ComposerModel,
ComposerUpdate,
+ HtmlSource,
SuggestionPattern,
} from '@vector-im/matrix-wysiwyg-wasm';
@@ -65,8 +66,25 @@ export function processInput(
}
if (isClipboardEvent(event)) {
- const data = event.clipboardData?.getData('text/plain') ?? '';
- return action(composerModel.replace_text(data), 'paste');
+ const clipboardData = event.clipboardData;
+ const htmlData = clipboardData?.getData('text/html');
+ const plainData = clipboardData?.getData('text/plain') ?? '';
+
+ if (htmlData && htmlData !== plainData) {
+ const htmlSource = clipboardData?.types.includes(
+ 'application/x-vnd.google-docs-document-slice-clip+wrapped',
+ ) ? HtmlSource.GoogleDoc : HtmlSource.UnknownExternal;
+ return action(
+ composerModel.replace_html(htmlData, htmlSource),
+ 'replace_html_paste',
+ htmlData,
+ );
+ }
+ return action(
+ composerModel.replace_text(plainData),
+ 'replace_text_paste',
+ plainData,
+ );
}
switch (event.inputType) {
From f8e526a293585b764e9aeb06dca5c767cbc73807 Mon Sep 17 00:00:00 2001
From: David Langley
Date: Wed, 16 Jul 2025 12:09:53 +0100
Subject: [PATCH 06/17] Update parse.rs
---
crates/wysiwyg/src/dom/parser/parse.rs | 14 ++++----------
1 file changed, 4 insertions(+), 10 deletions(-)
diff --git a/crates/wysiwyg/src/dom/parser/parse.rs b/crates/wysiwyg/src/dom/parser/parse.rs
index d639ae855..1eb38c139 100644
--- a/crates/wysiwyg/src/dom/parser/parse.rs
+++ b/crates/wysiwyg/src/dom/parser/parse.rs
@@ -1604,11 +1604,8 @@ mod js {
dom.append_child(DomNode::Container(
ContainerNode::new_list(
ListType::Ordered,
- self.convert(
- node.child_nodes(),
- external_html_source,
- )?
- .take_children(),
+ self.convert(node.child_nodes(), html_source)?
+ .take_children(),
if let Some(custom_start) = custom_start {
Some(vec![(
"start".into(),
@@ -1627,11 +1624,8 @@ mod js {
dom.append_child(DomNode::Container(
ContainerNode::new_list(
ListType::Unordered,
- self.convert(
- node.child_nodes(),
- external_html_source,
- )?
- .take_children(),
+ self.convert(node.child_nodes(), html_source)?
+ .take_children(),
None,
),
));
From f8da65adb2d3fdc590fda9f934b08e965f755814 Mon Sep 17 00:00:00 2001
From: David Langley
Date: Fri, 4 Jul 2025 12:42:13 +0100
Subject: [PATCH 07/17] Fix e2e test
---
platforms/web/cypress/e2e/clipboard/paste.spec.ts | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/platforms/web/cypress/e2e/clipboard/paste.spec.ts b/platforms/web/cypress/e2e/clipboard/paste.spec.ts
index b46fd7603..44fb9a04e 100644
--- a/platforms/web/cypress/e2e/clipboard/paste.spec.ts
+++ b/platforms/web/cypress/e2e/clipboard/paste.spec.ts
@@ -57,7 +57,7 @@ describe('Paste', () => {
cy.contains(editor, 'BEFORElink');
cy.get(editor).type('AFTER');
- cy.contains(editor, /^BEFORElink AFTER/);
+ cy.contains(editor, /^BEFORElinkAFTER/);
},
);
From 974b899c0d078629f8c958edc8f4c237ad54129a Mon Sep 17 00:00:00 2001
From: David Langley
Date: Tue, 15 Jul 2025 16:41:04 +0100
Subject: [PATCH 08/17] Support nested lists for google docs (adds the ul/ol to
the li's children, not the li/ol's).
Also adds post-processing to clean up sibling text nodes.
---
bindings/wysiwyg-wasm/src/lib.rs | 29 ++
.../src/composer_model/replace_html.rs | 46 +--
crates/wysiwyg/src/dom/dom_methods.rs | 7 +-
crates/wysiwyg/src/dom/parser/parse.rs | 264 ++++++++++++++----
crates/wysiwyg/src/tests/test_deleting.rs | 10 +-
5 files changed, 274 insertions(+), 82 deletions(-)
diff --git a/bindings/wysiwyg-wasm/src/lib.rs b/bindings/wysiwyg-wasm/src/lib.rs
index 9117c33db..e4bee0db2 100644
--- a/bindings/wysiwyg-wasm/src/lib.rs
+++ b/bindings/wysiwyg-wasm/src/lib.rs
@@ -187,6 +187,17 @@ impl ComposerModel {
)
}
+ pub fn replace_html(
+ &mut self,
+ new_html: &str,
+ external_source: HtmlSource,
+ ) -> ComposerUpdate {
+ ComposerUpdate::from(self.inner.replace_html(
+ Utf16String::from_str(new_html),
+ external_source.into(),
+ ))
+ }
+
pub fn replace_text_suggestion(
&mut self,
new_text: &str,
@@ -914,6 +925,24 @@ impl From> for LinkAction {
}
}
+#[wasm_bindgen]
+#[derive(Clone)]
+pub enum HtmlSource {
+ Matrix,
+ GoogleDoc,
+ UnknownExternal,
+}
+
+impl From for wysiwyg::HtmlSource {
+ fn from(source: HtmlSource) -> Self {
+ match source {
+ HtmlSource::Matrix => Self::Matrix,
+ HtmlSource::GoogleDoc => Self::GoogleDoc,
+ HtmlSource::UnknownExternal => Self::UnknownExternal,
+ }
+ }
+}
+
#[cfg(test)]
mod test {
use super::ComposerModel;
diff --git a/crates/wysiwyg/src/composer_model/replace_html.rs b/crates/wysiwyg/src/composer_model/replace_html.rs
index 22cf3f3c2..d9ee660f7 100644
--- a/crates/wysiwyg/src/composer_model/replace_html.rs
+++ b/crates/wysiwyg/src/composer_model/replace_html.rs
@@ -7,8 +7,12 @@
use regex::Regex;
use crate::dom::html_source::HtmlSource;
+use crate::dom::nodes::ContainerNode;
use crate::dom::parser::parse_from_source;
-use crate::{parse, ComposerModel, ComposerUpdate, Location, UnicodeString};
+
+use crate::{
+ parse, ComposerModel, ComposerUpdate, DomNode, Location, UnicodeString,
+};
impl ComposerModel
where
@@ -38,39 +42,39 @@ where
cleaned_html = b_regex.replace(&cleaned_html, "$1").to_string();
}
- println!("cleaned_html: {}", cleaned_html);
let result = if external_source == HtmlSource::Matrix {
parse(&cleaned_html.to_string())
} else {
parse_from_source(&cleaned_html.to_string(), external_source)
};
- // We should have only one top level dom node, so add each of the children at the cursor.
- let dom_children = result.unwrap().into_container().take_children();
-
- for node in dom_children.iter() {
- let (start, end) = self.safe_selection();
- let range = self.state.dom.find_range(start, end);
+ let doc_node = result.unwrap().into_document_node();
+ let (start, end) = self.safe_selection();
+ let range = self.state.dom.find_range(start, end);
- let new_cursor_index = start + node.text_len();
- let _ = self.state.dom.insert_node_at_cursor(&range, node.clone());
+ let p = DomNode::Container(ContainerNode::new_paragraph(
+ doc_node.into_container().unwrap().take_children(),
+ ));
- // manually move the cursor to the end of the html
- self.state.start = Location::from(new_cursor_index);
- self.state.end = self.state.start;
- }
+ let new_cursor_index = start + p.text_len();
+ let handle = self.state.dom.insert_node_at_cursor(&range, p);
+ self.state.dom.replace_node_with_its_children(&handle);
- // add a trailing space in cases when we do not have a next sibling
+ // manually move the cursor to the end of the html
+ self.state.start = Location::from(new_cursor_index);
+ self.state.end = self.state.start;
self.create_update_replace_all()
}
}
#[cfg(test)]
-const GOOGLE_DOC_HTML_PASTEBOARD: &str = r#"test"#;
+const GOOGLE_DOC_HTML_PASTEBOARD: &str = r#"
+ Italic
Bold
Unformatted
Strikethrough
Underlined
Linked
+ "#;
#[cfg(test)]
-const MS_DOC_HTML_PASTEBOARD: &str = r#"test "#;
-
-// ...existing code...
+const MS_DOC_HTML_PASTEBOARD: &str = r#"
+
+ "#;
#[cfg(test)]
mod test {
@@ -92,6 +96,7 @@ mod test {
let html_str = html.to_string();
assert!(!html_str.contains("Italic
Bold
Unformatted
Strikethrough
Underlined
Linked
");
}
#[test]
@@ -106,8 +111,7 @@ mod test {
let html = model.get_content_as_html();
let html_str = html.to_string();
assert!(!html_str.contains("Italic
Bold
Unformatted
Strikethrough
Underlined
Linked
Nested
");
}
#[test]
diff --git a/crates/wysiwyg/src/dom/dom_methods.rs b/crates/wysiwyg/src/dom/dom_methods.rs
index c5eb653e5..f4a1f3293 100644
--- a/crates/wysiwyg/src/dom/dom_methods.rs
+++ b/crates/wysiwyg/src/dom/dom_methods.rs
@@ -696,9 +696,11 @@ where
self.merge_text_nodes_around(&first_location.node_handle);
}
}
+ #[cfg(any(test, feature = "assert-invariants"))]
+ self.assert_invariants();
}
- fn merge_text_nodes_around(&mut self, handle: &DomHandle) {
+ pub fn merge_text_nodes_around(&mut self, handle: &DomHandle) {
// TODO: make this method not public because it is used to make
// the invariants true, instead of assuming they are true at the
// beginning!
@@ -710,9 +712,6 @@ where
merge_if_adjacent_text_nodes(parent, idx - 1);
}
merge_if_adjacent_text_nodes(parent, idx);
-
- #[cfg(any(test, feature = "assert-invariants"))]
- self.assert_invariants();
}
/// Recursively visit container nodes, looking for block nodes and, if they contain a
diff --git a/crates/wysiwyg/src/dom/parser/parse.rs b/crates/wysiwyg/src/dom/parser/parse.rs
index 1eb38c139..3fad573c2 100644
--- a/crates/wysiwyg/src/dom/parser/parse.rs
+++ b/crates/wysiwyg/src/dom/parser/parse.rs
@@ -11,7 +11,7 @@ use crate::dom::html_source::HtmlSource;
use crate::dom::nodes::dom_node::DomNodeKind::{self};
use crate::dom::nodes::{ContainerNode, ContainerNodeKind};
use crate::dom::Dom;
-use crate::{DomHandle, DomNode, UnicodeString};
+use crate::{DomHandle, DomNode, ToTree, UnicodeString};
pub fn parse(html: &str) -> Result, HtmlParseError>
where
@@ -52,7 +52,7 @@ const GOOGLE_DOC_HTML_PASTEBOARD: &str = r#"
"#;
#[cfg(test)]
const MS_DOC_HTML_PASTEBOARD: &str = r#"
-
+
"#;
#[cfg(feature = "sys")]
@@ -120,7 +120,10 @@ mod sys {
parse_errors: vec![err.to_string()],
}
})?;
- Ok(post_process_blocks(dom))
+ let dom_blocks_done = post_process_blocks(dom);
+ let dom_adjacted_text_done =
+ post_process_for_adjacent_text(dom_blocks_done);
+ Ok(dom_adjacted_text_done)
}
/// Convert a [PaDom] into a [Dom].
@@ -272,18 +275,33 @@ mod sys {
}
"ol" | "ul" => {
self.current_path.push(DomNodeKind::List);
+
+ let target_node = if node.is_list() {
+ if html_source != HtmlSource::GoogleDoc
+ || node.last_child_mut().is_none()
+ {
+ return Err(Error::InvalidListItemNode);
+ }
+ node.last_child_mut()
+ .unwrap()
+ .as_container_mut()
+ .unwrap()
+ } else {
+ node
+ };
if tag == "ol" {
let custom_start = child
.get_attr("start")
.and_then(|start| start.parse::().ok());
- node.append_child(Self::new_ordered_list(custom_start));
+ target_node
+ .append_child(Self::new_ordered_list(custom_start));
} else {
- node.append_child(Self::new_unordered_list());
+ target_node.append_child(Self::new_unordered_list());
}
self.convert_children(
padom,
child,
- last_container_mut_in(node),
+ last_container_mut_in(target_node),
html_source,
)?;
self.current_path.remove(cur_path_idx);
@@ -529,6 +547,7 @@ mod sys {
enum Error {
NoBody,
UnknownNode(String),
+ InvalidListItemNode,
}
impl fmt::Display for Error {
@@ -543,6 +562,12 @@ mod sys {
Self::UnknownNode(node_name) => {
write!(formatter, "Node `{node_name}` is not supported")
}
+ Self::InvalidListItemNode => {
+ write!(
+ formatter,
+ "Invalid list item node: a list must only contain list items"
+ )
+ }
}
}
}
@@ -557,7 +582,7 @@ mod sys {
use super::*;
use crate::tests::testutils_composer_model::restore_whitespace;
- use crate::{ToHtml, ToTree};
+ use crate::{ToHtml, ToMarkdown, ToTree};
trait Roundtrips {
fn roundtrips(&self);
@@ -1095,7 +1120,7 @@ mod sys {
tree,
indoc! {
r#"
-
+
└>ul
├>li
│ └>p
@@ -1115,18 +1140,28 @@ mod sys {
│ └>p
│ └>u
│ └>"Underlined"
- ├>li
- │ └>p
- │ └>a "http://matrix.org"
- │ └>u
- │ └>"Linked"
- └>ul
- └>li
- └>p
- └>"nested"
+ └>li
+ ├>p
+ │ └>a "http://matrix.org"
+ │ └>u
+ │ └>"Linked"
+ └>ul
+ └>li
+ └>p
+ └>"nested"
"#
}
);
+ assert_eq!(
+ dom.to_markdown().unwrap().to_string(),
+ r#"* *Italic*
+* Bold
+* Unformatted
+* ~~Strikethrough~~
+* Underlined
+* [Linked]()
+ * nested"#
+ );
}
#[test]
@@ -1176,7 +1211,7 @@ mod sys {
└>ul
└>li
└>p
- └>"nested"
+ └>"Nested"
"#
}
);
@@ -1184,6 +1219,32 @@ mod sys {
}
}
+fn post_process_for_adjacent_text(mut dom: Dom) -> Dom {
+ println!(
+ "Post-processing adjacent text nodes: {}",
+ dom.to_tree().to_string()
+ );
+ let text_handles = find_text_nodes(&dom);
+ for handle in text_handles.iter().rev() {
+ dom = post_process_adjacent_text(dom, handle);
+ }
+ dom
+}
+
+fn find_text_nodes(dom: &Dom) -> Vec {
+ dom.iter()
+ .filter(|n| n.is_text_node())
+ .map(|n| n.handle())
+ .collect::>()
+}
+
+fn post_process_adjacent_text(
+ mut dom: Dom,
+ handle: &DomHandle,
+) -> Dom {
+ dom.merge_text_nodes_around(handle);
+ dom
+}
fn post_process_blocks(mut dom: Dom) -> Dom {
let block_handles = find_blocks(&dom);
for handle in block_handles.iter().rev() {
@@ -1359,8 +1420,7 @@ fn convert_text(
for (i, str) in text_nodes.into_iter().enumerate() {
let is_nbsp = str == "\u{A0}" || str == " ";
if !str.is_empty() && !is_nbsp {
- let text_node = DomNode::new_text(str.into());
- node.append_child(text_node);
+ node.append_child(DomNode::new_text(str.into()));
}
if i + 1 < text_nodes_len {
node.append_child(DomNode::new_line_break());
@@ -1406,6 +1466,7 @@ mod js {
};
use matrix_mentions::Mention;
use std::fmt;
+
use wasm_bindgen::JsCast;
use web_sys::{
Document, DomParser, Element, HtmlElement, NodeList, SupportedType,
@@ -1467,6 +1528,7 @@ mod js {
self.webdom_to_dom(document, html_source)
.map_err(to_dom_creation_error)
.map(post_process_blocks)
+ .map(post_process_for_adjacent_text)
}
fn webdom_to_dom(
@@ -1478,12 +1540,13 @@ mod js {
S: UnicodeString,
{
let body = webdoc.body().ok_or_else(|| Error::NoBody)?;
- self.convert(body.child_nodes(), html_source)
+ self.convert(body.child_nodes(), DomNodeKind::Generic, html_source)
}
fn convert(
&mut self,
nodes: NodeList,
+ parent_kind: DomNodeKind,
html_source: HtmlSource,
) -> Result, Error>
where
@@ -1493,7 +1556,12 @@ mod js {
let mut dom = Dom::new(Vec::with_capacity(number_of_nodes));
let dom_document = dom.document_mut();
- self.convert_container(nodes, dom_document, html_source)?;
+ self.convert_container(
+ nodes,
+ dom_document,
+ parent_kind,
+ html_source,
+ )?;
Ok(dom)
}
@@ -1502,6 +1570,7 @@ mod js {
&mut self,
nodes: NodeList,
dom: &mut ContainerNode,
+ parent_kind: DomNodeKind,
html_source: HtmlSource,
) -> Result<(), Error>
where
@@ -1584,7 +1653,11 @@ mod js {
);
} else {
let children = self
- .convert(node.child_nodes(), html_source)?
+ .convert(
+ node.child_nodes(),
+ DomNodeKind::Link,
+ html_source,
+ )?
.take_children();
dom.append_child(DomNode::new_link(
url.into(),
@@ -1601,11 +1674,16 @@ mod js {
.unchecked_ref::()
.get_attribute("start");
self.current_path.push(DomNodeKind::List);
+
dom.append_child(DomNode::Container(
ContainerNode::new_list(
ListType::Ordered,
- self.convert(node.child_nodes(), html_source)?
- .take_children(),
+ self.convert(
+ node.child_nodes(),
+ DomNodeKind::List,
+ html_source,
+ )?
+ .take_children(),
if let Some(custom_start) = custom_start {
Some(vec![(
"start".into(),
@@ -1621,14 +1699,43 @@ mod js {
"UL" => {
self.current_path.push(DomNodeKind::List);
- dom.append_child(DomNode::Container(
- ContainerNode::new_list(
- ListType::Unordered,
- self.convert(node.child_nodes(), html_source)?
+ // TODO We should pass the parent kind in so that we can bail out if a non-list item is being added to it's children.
+ if parent_kind == DomNodeKind::List {
+ if html_source != HtmlSource::GoogleDoc {
+ return Err(Error::InvalidListItemNode);
+ }
+ let target = dom
+ .last_child_mut()
+ .unwrap()
+ .as_container_mut()
+ .unwrap();
+ target.append_child(DomNode::Container(
+ ContainerNode::new_list(
+ ListType::Unordered,
+ self.convert(
+ node.child_nodes(),
+ DomNodeKind::List,
+ html_source,
+ )?
.take_children(),
- None,
- ),
- ));
+ None,
+ ),
+ ));
+ } else {
+ dom.append_child(DomNode::Container(
+ ContainerNode::new_list(
+ ListType::Unordered,
+ self.convert(
+ node.child_nodes(),
+ DomNodeKind::List,
+ html_source,
+ )?
+ .take_children(),
+ None,
+ ),
+ ));
+ }
+
self.current_path.pop();
}
@@ -1636,8 +1743,12 @@ mod js {
self.current_path.push(DomNodeKind::ListItem);
dom.append_child(DomNode::Container(
ContainerNode::new_list_item(
- self.convert(node.child_nodes(), html_source)?
- .take_children(),
+ self.convert(
+ node.child_nodes(),
+ DomNodeKind::ListItem,
+ html_source,
+ )?
+ .take_children(),
),
));
self.current_path.pop();
@@ -1657,8 +1768,12 @@ mod js {
};
dom.append_child(DomNode::Container(
ContainerNode::new_code_block(
- self.convert(children, html_source)?
- .take_children(),
+ self.convert(
+ children,
+ DomNodeKind::CodeBlock,
+ html_source,
+ )?
+ .take_children(),
),
));
self.current_path.pop();
@@ -1668,8 +1783,12 @@ mod js {
self.current_path.push(DomNodeKind::Quote);
dom.append_child(DomNode::Container(
ContainerNode::new_quote(
- self.convert(node.child_nodes(), html_source)?
- .take_children(),
+ self.convert(
+ node.child_nodes(),
+ DomNodeKind::Quote,
+ html_source,
+ )?
+ .take_children(),
),
));
self.current_path.pop();
@@ -1679,16 +1798,17 @@ mod js {
self.current_path.push(DomNodeKind::Paragraph);
dom.append_child(DomNode::Container(
ContainerNode::new_paragraph(
- self.convert(node.child_nodes(), html_source)?
- .take_children(),
+ self.convert(
+ node.child_nodes(),
+ DomNodeKind::Paragraph,
+ html_source,
+ )?
+ .take_children(),
),
));
self.current_path.pop();
}
node_name => {
- let children_nodes = self
- .convert(node.child_nodes(), html_source)?
- .take_children();
let formatting_kind = match node_name {
"STRONG" | "B" => Some(InlineFormatType::Bold),
"EM" | "I" => Some(InlineFormatType::Italic),
@@ -1741,23 +1861,43 @@ mod js {
}
};
- if formatting_kind.is_none() {
- if !children_nodes.is_empty() {
- dom.append_children(children_nodes);
- }
- self.current_path.pop();
- } else {
+ if let Some(formatting_kind) = formatting_kind {
self.current_path.push(DomNodeKind::Formatting(
- formatting_kind.clone().unwrap(),
+ formatting_kind.clone(),
+ ));
+ let children_nodes = self
+ .convert(
+ node.child_nodes(),
+ DomNodeKind::Formatting(
+ formatting_kind.clone(),
+ ),
+ html_source,
+ )?
+ .take_children();
+ self.current_path.push(DomNodeKind::Formatting(
+ formatting_kind.clone(),
));
dom.append_child(DomNode::Container(
ContainerNode::new_formatting(
- formatting_kind.unwrap(),
+ formatting_kind.clone(),
children_nodes,
),
));
self.current_path.pop();
+ } else {
+ self.current_path.push(parent_kind.clone());
+ let children_nodes = self
+ .convert(
+ node.child_nodes(),
+ parent_kind.clone(),
+ html_source,
+ )?
+ .take_children();
+ if !children_nodes.is_empty() {
+ dom.append_children(children_nodes);
+ }
+ self.current_path.pop();
}
}
}
@@ -1779,6 +1919,7 @@ mod js {
enum Error {
NoBody,
UnknownNode(String),
+ InvalidListItemNode,
}
impl fmt::Display for Error {
@@ -1794,6 +1935,12 @@ mod js {
Self::UnknownNode(node_name) => {
write!(formatter, "Node `{node_name}` is not supported")
}
+ Self::InvalidListItemNode => {
+ write!(
+ formatter,
+ "Invalid list item node: a list must only contain list items"
+ )
+ }
}
}
}
@@ -1802,7 +1949,8 @@ mod js {
mod tests {
use super::*;
use crate::{
- tests::testutils_composer_model::restore_whitespace, ToHtml, ToTree,
+ tests::testutils_composer_model::restore_whitespace, ToHtml,
+ ToMarkdown, ToTree,
};
use indoc::indoc;
use wasm_bindgen_test::*;
@@ -1841,7 +1989,17 @@ mod js {
HtmlSource::GoogleDoc,
)
.unwrap();
- assert_eq!(dom.to_string(), "Italic
Bold
Unformatted
Strikethrough
Underlined
Linked
");
+ assert_eq!(dom.to_string(), "Italic
Bold
Unformatted
Strikethrough
Underlined
Linked
");
+ assert_eq!(
+ dom.to_markdown().unwrap().to_string(),
+ r#"* *Italic*
+* Bold
+* Unformatted
+* ~~Strikethrough~~
+* Underlined
+* [Linked]()
+ * nested"#
+ );
}
#[wasm_bindgen_test]
diff --git a/crates/wysiwyg/src/tests/test_deleting.rs b/crates/wysiwyg/src/tests/test_deleting.rs
index 02d4ee390..fc5a01571 100644
--- a/crates/wysiwyg/src/tests/test_deleting.rs
+++ b/crates/wysiwyg/src/tests/test_deleting.rs
@@ -173,10 +173,12 @@ fn deleting_across_lists_joins_them() {
fn deleting_across_lists_joins_them_nested() {
let mut model = cm("\
- 1{1
\
- - 22
\
- \
- - 55
\
-
\
+ -
+
22
+ \
+ - 55
\
+
\
+ \
\
\
- 33
\
From 8dd57ebde224943d7b78d77acc06ffc6c02f2d7d Mon Sep 17 00:00:00 2001
From: David Langley
Date: Wed, 16 Jul 2025 14:17:21 +0100
Subject: [PATCH 09/17] Finish implementation of ordered lists in the js parser
and improve comments
---
.../src/composer_model/replace_html.rs | 26 ++--
crates/wysiwyg/src/dom/parser/parse.rs | 145 +++++++++---------
2 files changed, 83 insertions(+), 88 deletions(-)
diff --git a/crates/wysiwyg/src/composer_model/replace_html.rs b/crates/wysiwyg/src/composer_model/replace_html.rs
index d9ee660f7..ca40b5c32 100644
--- a/crates/wysiwyg/src/composer_model/replace_html.rs
+++ b/crates/wysiwyg/src/composer_model/replace_html.rs
@@ -10,9 +10,7 @@ use crate::dom::html_source::HtmlSource;
use crate::dom::nodes::ContainerNode;
use crate::dom::parser::parse_from_source;
-use crate::{
- parse, ComposerModel, ComposerUpdate, DomNode, Location, UnicodeString,
-};
+use crate::{ComposerModel, ComposerUpdate, DomNode, Location, UnicodeString};
impl ComposerModel
where
@@ -30,32 +28,30 @@ where
if self.has_selection() {
self.do_replace_text(S::default());
}
-
+ // Remove meta tags from the HTML which caused errors in html5ever
let meta_regex = Regex::new(r"]*>").unwrap();
let mut cleaned_html = meta_regex
.replace_all(&new_html.to_string(), "")
.to_string();
if external_source == HtmlSource::GoogleDoc {
- // Strip first b tag (opening and closing)
+ // Strip outer b tag that google docs adds
let b_regex = Regex::new(r"]*>(.*)<\/b>").unwrap();
cleaned_html = b_regex.replace(&cleaned_html, "$1").to_string();
}
- let result = if external_source == HtmlSource::Matrix {
- parse(&cleaned_html.to_string())
- } else {
- parse_from_source(&cleaned_html.to_string(), external_source)
- };
+ let result =
+ parse_from_source(&cleaned_html.to_string(), external_source);
let doc_node = result.unwrap().into_document_node();
let (start, end) = self.safe_selection();
let range = self.state.dom.find_range(start, end);
+ // We should only have 1 dom node, so add the children under a paragraph to take advantage of the exisitng
+ // insert_node_at_cursor api and then delete the paragraph node promoting it's the children up a level.
let p = DomNode::Container(ContainerNode::new_paragraph(
doc_node.into_container().unwrap().take_children(),
));
-
let new_cursor_index = start + p.text_len();
let handle = self.state.dom.insert_node_at_cursor(&range, p);
self.state.dom.replace_node_with_its_children(&handle);
@@ -69,11 +65,11 @@ where
#[cfg(test)]
const GOOGLE_DOC_HTML_PASTEBOARD: &str = r#"
- Italic
Bold
Unformatted
Strikethrough
Underlined
Linked
+ Italic
Bold
Unformatted
Strikethrough
Underlined
Linked
"#;
#[cfg(test)]
const MS_DOC_HTML_PASTEBOARD: &str = r#"
-
+
"#;
#[cfg(test)]
@@ -96,7 +92,7 @@ mod test {
let html_str = html.to_string();
assert!(!html_str.contains("Italic
Bold
Unformatted
Strikethrough
Underlined
Linked
");
+ assert_eq!(html_str, "Italic
Bold
Unformatted
Strikethrough
Underlined
Linked
");
}
#[test]
@@ -111,7 +107,7 @@ mod test {
let html = model.get_content_as_html();
let html_str = html.to_string();
assert!(!html_str.contains("Italic
Bold
Unformatted
Strikethrough
Underlined
Linked
Nested
");
+ assert_eq!(html_str, "Italic
Bold
Unformatted
Strikethrough
Underlined
Linked
");
}
#[test]
diff --git a/crates/wysiwyg/src/dom/parser/parse.rs b/crates/wysiwyg/src/dom/parser/parse.rs
index 3fad573c2..849712963 100644
--- a/crates/wysiwyg/src/dom/parser/parse.rs
+++ b/crates/wysiwyg/src/dom/parser/parse.rs
@@ -11,7 +11,7 @@ use crate::dom::html_source::HtmlSource;
use crate::dom::nodes::dom_node::DomNodeKind::{self};
use crate::dom::nodes::{ContainerNode, ContainerNodeKind};
use crate::dom::Dom;
-use crate::{DomHandle, DomNode, ToTree, UnicodeString};
+use crate::{DomHandle, DomNode, UnicodeString};
pub fn parse(html: &str) -> Result, HtmlParseError>
where
@@ -48,11 +48,11 @@ where
#[cfg(test)]
const GOOGLE_DOC_HTML_PASTEBOARD: &str = r#"
- Italic
Bold
Unformatted
Strikethrough
Underlined
Linked
+ Italic
Bold
Unformatted
Strikethrough
Underlined
Linked
"#;
#[cfg(test)]
const MS_DOC_HTML_PASTEBOARD: &str = r#"
-
+
"#;
#[cfg(feature = "sys")]
@@ -236,6 +236,12 @@ mod sys {
}
}
"span" => {
+ if html_source == HtmlSource::Matrix {
+ return Err(Error::UnknownNode(tag.to_string()));
+ }
+
+ // For external sources, we check for common formatting styles for spans
+ // and convert them to appropriate formatting nodes.
let mut formatting_tag = None;
if child.contains_style("font-weight", "bold") {
formatting_tag = Some("b");
@@ -263,23 +269,23 @@ mod sys {
)?;
self.current_path.remove(cur_path_idx);
} else {
- if html_source == HtmlSource::Matrix {
- return Err(Error::UnknownNode(tag.to_string()));
- } else {
- self.convert(padom, child, node, html_source)?;
- }
+ // If no formatting tag was found, just skip and convert the children
+ self.convert(padom, child, node, html_source)?;
}
}
"br" => {
node.append_child(Self::new_line_break());
}
"ol" | "ul" => {
- self.current_path.push(DomNodeKind::List);
-
let target_node = if node.is_list() {
+ // Google docs adds nested lists as children of the list node, this breaks our invariants.
+ // For the google docs case, we can add the nested list to the last list item instead.
if html_source != HtmlSource::GoogleDoc
|| node.last_child_mut().is_none()
+ || node.last_child_mut().unwrap().is_list_item()
+ == false
{
+ // If source is not Google Docs or the last child is not a list item, we return an error.
return Err(Error::InvalidListItemNode);
}
node.last_child_mut()
@@ -289,6 +295,7 @@ mod sys {
} else {
node
};
+ self.current_path.push(DomNodeKind::List);
if tag == "ol" {
let custom_start = child
.get_attr("start")
@@ -1121,7 +1128,7 @@ mod sys {
indoc! {
r#"
- └>ul
+ └>ol
├>li
│ └>p
│ └>i
@@ -1148,19 +1155,21 @@ mod sys {
└>ul
└>li
└>p
- └>"nested"
+ └>"Nested"
"#
}
);
assert_eq!(
dom.to_markdown().unwrap().to_string(),
- r#"* *Italic*
-* Bold
-* Unformatted
-* ~~Strikethrough~~
-* Underlined
-* [Linked]()
- * nested"#
+ indoc! {r#"
+ 1. *Italic*
+ 2. Bold
+ 3. Unformatted
+ 4. ~~Strikethrough~~
+ 5. Underlined
+ 6. [Linked]()
+ * Nested"#
+ }
);
}
@@ -1178,31 +1187,31 @@ mod sys {
indoc! {
r#"
- ├>ul
+ ├>ol
│ └>li
│ └>p
│ └>i
│ └>"Italic"
- ├>ul
+ ├>ol
│ └>li
│ └>p
│ └>b
│ └>"Bold"
- ├>ul
+ ├>ol
│ └>li
│ └>p
│ └>"Unformatted"
- ├>ul
+ ├>ol
│ └>li
│ └>p
│ └>del
│ └>"Strikethrough"
- ├>ul
+ ├>ol
│ └>li
│ └>p
│ └>u
│ └>"Underlined"
- ├>ul
+ ├>ol
│ └>li
│ └>p
│ └>a "https://matrix.org/"
@@ -1220,10 +1229,6 @@ mod sys {
}
fn post_process_for_adjacent_text(mut dom: Dom) -> Dom {
- println!(
- "Post-processing adjacent text nodes: {}",
- dom.to_tree().to_string()
- );
let text_handles = find_text_nodes(&dom);
for handle in text_handles.iter().rev() {
dom = post_process_adjacent_text(dom, handle);
@@ -1580,8 +1585,9 @@ mod js {
for nth in 0..number_of_nodes {
let node = nodes.get(nth as _).unwrap();
-
- match node.node_name().as_str() {
+ let node_name = node.node_name();
+ let tag = node_name.as_str();
+ match tag {
"BR" => {
dom.append_child(DomNode::new_line_break());
}
@@ -1668,42 +1674,32 @@ mod js {
self.current_path.pop();
}
-
- "OL" => {
+ "UL" | "OL" => {
let custom_start = node
.unchecked_ref::()
.get_attribute("start");
- self.current_path.push(DomNodeKind::List);
- dom.append_child(DomNode::Container(
- ContainerNode::new_list(
- ListType::Ordered,
- self.convert(
- node.child_nodes(),
- DomNodeKind::List,
- html_source,
- )?
- .take_children(),
- if let Some(custom_start) = custom_start {
- Some(vec![(
- "start".into(),
- custom_start.into(),
- )])
- } else {
- None
- },
- ),
- ));
- self.current_path.pop();
- }
+ let attributes: Option> =
+ if tag == "ol" && custom_start.is_some() {
+ Some(vec![(
+ "start".into(),
+ custom_start.unwrap().into(),
+ )])
+ } else {
+ None
+ };
+
+ let list_type = if tag == "OL" {
+ ListType::Ordered
+ } else {
+ ListType::Unordered
+ };
- "UL" => {
- self.current_path.push(DomNodeKind::List);
- // TODO We should pass the parent kind in so that we can bail out if a non-list item is being added to it's children.
if parent_kind == DomNodeKind::List {
if html_source != HtmlSource::GoogleDoc {
return Err(Error::InvalidListItemNode);
}
+ self.current_path.push(DomNodeKind::List);
let target = dom
.last_child_mut()
.unwrap()
@@ -1711,27 +1707,27 @@ mod js {
.unwrap();
target.append_child(DomNode::Container(
ContainerNode::new_list(
- ListType::Unordered,
+ list_type,
self.convert(
node.child_nodes(),
DomNodeKind::List,
html_source,
)?
.take_children(),
- None,
+ attributes,
),
));
} else {
dom.append_child(DomNode::Container(
ContainerNode::new_list(
- ListType::Unordered,
+ list_type,
self.convert(
node.child_nodes(),
DomNodeKind::List,
html_source,
)?
.take_children(),
- None,
+ attributes,
),
));
}
@@ -1821,6 +1817,8 @@ mod js {
node_name.to_owned(),
));
}
+ // For external sources, we check for common formatting styles for spans
+ // and convert them to appropriate formatting nodes.
let style =
node.unchecked_ref::().style();
if style
@@ -1886,7 +1884,7 @@ mod js {
));
self.current_path.pop();
} else {
- self.current_path.push(parent_kind.clone());
+ // If it's an external source we skip the node and process it's children.
let children_nodes = self
.convert(
node.child_nodes(),
@@ -1897,7 +1895,6 @@ mod js {
if !children_nodes.is_empty() {
dom.append_children(children_nodes);
}
- self.current_path.pop();
}
}
}
@@ -1989,16 +1986,18 @@ mod js {
HtmlSource::GoogleDoc,
)
.unwrap();
- assert_eq!(dom.to_string(), "Italic
Bold
Unformatted
Strikethrough
Underlined
Linked
");
+ assert_eq!(dom.to_string(), "Italic
Bold
Unformatted
Strikethrough
Underlined
Linked
");
assert_eq!(
dom.to_markdown().unwrap().to_string(),
- r#"* *Italic*
-* Bold
-* Unformatted
-* ~~Strikethrough~~
-* Underlined
-* [Linked]()
- * nested"#
+ indoc! {r#"
+ 1. *Italic*
+ 2. Bold
+ 3. Unformatted
+ 4. ~~Strikethrough~~
+ 5. Underlined
+ 6. [Linked]()
+ * Nested"#
+ }
);
}
@@ -2010,7 +2009,7 @@ mod js {
HtmlSource::UnknownExternal,
)
.unwrap();
- assert_eq!(dom.to_string(), "");
+ assert_eq!(dom.to_string(), "Italic
Bold
Unformatted
Strikethrough
Underlined
Linked
");
}
#[wasm_bindgen_test]
From 781038145f9152e1da668628d347ef5364e7355e Mon Sep 17 00:00:00 2001
From: David Langley
Date: Thu, 17 Jul 2025 13:47:10 +0100
Subject: [PATCH 10/17] Put back the enforcing of assertions and improve
parsing to fix cases where there are inline + block containers in the same
parent.
---
crates/wysiwyg/src/dom/dom_methods.rs | 5 +-
crates/wysiwyg/src/dom/dom_struct.rs | 7 ++-
.../wysiwyg/src/dom/nodes/container_node.rs | 10 ++--
crates/wysiwyg/src/dom/parser/parse.rs | 54 ++++++++++++++++---
4 files changed, 62 insertions(+), 14 deletions(-)
diff --git a/crates/wysiwyg/src/dom/dom_methods.rs b/crates/wysiwyg/src/dom/dom_methods.rs
index f4a1f3293..225678573 100644
--- a/crates/wysiwyg/src/dom/dom_methods.rs
+++ b/crates/wysiwyg/src/dom/dom_methods.rs
@@ -696,8 +696,6 @@ where
self.merge_text_nodes_around(&first_location.node_handle);
}
}
- #[cfg(any(test, feature = "assert-invariants"))]
- self.assert_invariants();
}
pub fn merge_text_nodes_around(&mut self, handle: &DomHandle) {
@@ -712,6 +710,9 @@ where
merge_if_adjacent_text_nodes(parent, idx - 1);
}
merge_if_adjacent_text_nodes(parent, idx);
+
+ #[cfg(any(test, feature = "assert-invariants"))]
+ self.assert_invariants();
}
/// Recursively visit container nodes, looking for block nodes and, if they contain a
diff --git a/crates/wysiwyg/src/dom/dom_struct.rs b/crates/wysiwyg/src/dom/dom_struct.rs
index 0ef76ea31..bff6acc9a 100644
--- a/crates/wysiwyg/src/dom/dom_struct.rs
+++ b/crates/wysiwyg/src/dom/dom_struct.rs
@@ -924,12 +924,15 @@ mod test {
#[test]
fn find_parent_list_item_or_self_finds_our_grandparent() {
let d = cm("|d").state.dom;
+ // The "|" at the start infers that when the dom is created, it be within a paragraph
+ // (as inline nodes and blocks are not allowed to be siblings).
+ // So the handle is [1, 0, 1, 0].
let res =
d.find_ancestor_list_item_or_self(&DomHandle::from_raw(vec![
- 0, 0, 1, 0,
+ 1, 0, 1, 0,
]));
let res = res.expect("Should have found a list parent!");
- assert_eq!(res.into_raw(), vec![0, 0]);
+ assert_eq!(res.into_raw(), vec![1, 0]);
}
#[test]
diff --git a/crates/wysiwyg/src/dom/nodes/container_node.rs b/crates/wysiwyg/src/dom/nodes/container_node.rs
index fc386986a..97092721d 100644
--- a/crates/wysiwyg/src/dom/nodes/container_node.rs
+++ b/crates/wysiwyg/src/dom/nodes/container_node.rs
@@ -1578,19 +1578,21 @@ mod test {
#[test]
fn paragraph_to_message_html() {
- let model = cm("
Hello!
|");
+ let model =
+ cm("
Hello!
|
");
assert_eq!(
&model.state.dom.to_message_html(),
- "
Hello!
"
+ "
Hello!
"
);
}
#[test]
fn paragraph_to_html() {
- let model = cm("
Hello!
|");
+ let model =
+ cm("
Hello!
|
");
assert_eq!(
&model.state.dom.to_html(),
- "\u{a0}
\u{a0}
Hello!
\u{a0}
"
+ "\u{a0}
\u{a0}
Hello!
\u{a0}
\u{a0}
"
);
}
diff --git a/crates/wysiwyg/src/dom/parser/parse.rs b/crates/wysiwyg/src/dom/parser/parse.rs
index 849712963..ad581d28a 100644
--- a/crates/wysiwyg/src/dom/parser/parse.rs
+++ b/crates/wysiwyg/src/dom/parser/parse.rs
@@ -9,7 +9,7 @@ use regex::Regex;
use crate::dom::dom_creation_error::HtmlParseError;
use crate::dom::html_source::HtmlSource;
use crate::dom::nodes::dom_node::DomNodeKind::{self};
-use crate::dom::nodes::{ContainerNode, ContainerNodeKind};
+use crate::dom::nodes::{container_node, ContainerNode, ContainerNodeKind};
use crate::dom::Dom;
use crate::{DomHandle, DomNode, UnicodeString};
@@ -121,8 +121,10 @@ mod sys {
}
})?;
let dom_blocks_done = post_process_blocks(dom);
+ let dom_inline_blocks_done =
+ post_process_for_block_and_inline_siblings(dom_blocks_done);
let dom_adjacted_text_done =
- post_process_for_adjacent_text(dom_blocks_done);
+ post_process_for_adjacent_text(dom_inline_blocks_done);
Ok(dom_adjacted_text_done)
}
@@ -1237,10 +1239,7 @@ fn post_process_for_adjacent_text(mut dom: Dom) -> Dom {
}
fn find_text_nodes(dom: &Dom) -> Vec {
- dom.iter()
- .filter(|n| n.is_text_node())
- .map(|n| n.handle())
- .collect::>()
+ dom.iter_text().map(|n| n.handle()).collect::>()
}
fn post_process_adjacent_text(
@@ -1250,6 +1249,49 @@ fn post_process_adjacent_text(
dom.merge_text_nodes_around(handle);
dom
}
+
+fn post_process_for_block_and_inline_siblings(
+ mut dom: Dom,
+) -> Dom {
+ let continer_handles = find_containers_with_inline_and_block_children(&dom);
+ for handle in continer_handles.iter().rev() {
+ dom = post_process_container_for_block_and_inline_siblings(dom, handle);
+ }
+ dom
+}
+
+fn find_containers_with_inline_and_block_children(
+ dom: &Dom,
+) -> Vec {
+ dom.iter_containers()
+ .filter(|n| {
+ if n.children().is_empty() {
+ return false; // Skip empty containers
+ }
+ let all_nodes_are_inline =
+ n.children().iter().all(|n| !n.is_block_node());
+ let all_nodes_are_block =
+ n.children().iter().all(|n| n.is_block_node());
+ !all_nodes_are_inline && !all_nodes_are_block
+ })
+ .map(|n| n.handle())
+ .collect::>()
+}
+
+fn post_process_container_for_block_and_inline_siblings(
+ mut dom: Dom,
+ handle: &DomHandle,
+) -> Dom {
+ // upate the container node by grouping inline nodes, to avoid
+ // having inline nodes as siblings of block nodes.
+ let container_node =
+ dom.lookup_node_mut(handle).as_container_mut().unwrap();
+ let new_children =
+ group_inline_nodes(container_node.remove_children().to_vec());
+ container_node.insert_children(0, new_children.clone());
+ dom
+}
+
fn post_process_blocks(mut dom: Dom) -> Dom {
let block_handles = find_blocks(&dom);
for handle in block_handles.iter().rev() {
From bc5d2c7f417cb8dc6640919f91dadac57292d834 Mon Sep 17 00:00:00 2001
From: David Langley
Date: Fri, 18 Jul 2025 10:01:19 +0100
Subject: [PATCH 11/17] Add tests and fix some edge cases
Namely, fixing the cursor position after insert, and adding more parsing post processing to keep the dom in an expected state with wrap_inline_nodes_into_paragraphs_if_needed and join_nodes_in_container.
Also fixing some tests.
---
.../src/composer_model/example_format.rs | 3 +-
.../src/composer_model/replace_html.rs | 88 +++++++++++++++++--
crates/wysiwyg/src/dom/dom_struct.rs | 19 +++-
.../wysiwyg/src/dom/insert_node_at_cursor.rs | 4 +
crates/wysiwyg/src/dom/parser/parse.rs | 84 +++++-------------
crates/wysiwyg/src/dom/range.rs | 16 ++++
crates/wysiwyg/src/tests/test_deleting.rs | 8 +-
7 files changed, 144 insertions(+), 78 deletions(-)
diff --git a/crates/wysiwyg/src/composer_model/example_format.rs b/crates/wysiwyg/src/composer_model/example_format.rs
index 62c15f8d6..48760ade7 100644
--- a/crates/wysiwyg/src/composer_model/example_format.rs
+++ b/crates/wysiwyg/src/composer_model/example_format.rs
@@ -903,7 +903,7 @@ mod test {
#[test]
fn selection_across_lists_roundtrips() {
assert_that!(
- "- 1{1
- 22
- 33
- 4}|4
"
+ "- 1{1
- 22
a
- 33
- 4}|4
"
)
.roundtrips();
}
@@ -915,6 +915,7 @@ mod test {
- 1{1
\
- 22
\
\
+ a
\
\
- 33
\
- 4}|4
\
diff --git a/crates/wysiwyg/src/composer_model/replace_html.rs b/crates/wysiwyg/src/composer_model/replace_html.rs
index ca40b5c32..e21613a8e 100644
--- a/crates/wysiwyg/src/composer_model/replace_html.rs
+++ b/crates/wysiwyg/src/composer_model/replace_html.rs
@@ -9,8 +9,7 @@ use regex::Regex;
use crate::dom::html_source::HtmlSource;
use crate::dom::nodes::ContainerNode;
use crate::dom::parser::parse_from_source;
-
-use crate::{ComposerModel, ComposerUpdate, DomNode, Location, UnicodeString};
+use crate::{ComposerModel, ComposerUpdate, DomNode, Location, UnicodeString}; // Import the trait for to_tree
impl ComposerModel
where
@@ -49,16 +48,25 @@ where
// We should only have 1 dom node, so add the children under a paragraph to take advantage of the exisitng
// insert_node_at_cursor api and then delete the paragraph node promoting it's the children up a level.
- let p = DomNode::Container(ContainerNode::new_paragraph(
- doc_node.into_container().unwrap().take_children(),
- ));
- let new_cursor_index = start + p.text_len();
+ let new_children = doc_node.into_container().unwrap().take_children();
+ let child_count = new_children.len();
+ let p = DomNode::Container(ContainerNode::new_paragraph(new_children));
+
let handle = self.state.dom.insert_node_at_cursor(&range, p);
self.state.dom.replace_node_with_its_children(&handle);
+ self.state.dom.wrap_inline_nodes_into_paragraphs_if_needed(
+ &self.state.dom.parent(&handle).handle(),
+ );
- // manually move the cursor to the end of the html
- self.state.start = Location::from(new_cursor_index);
+ // Track the index of the last inserted node for placing the cursor
+ let last_index = handle.index_in_parent() + child_count - 1;
+ let last_handle = handle.parent_handle().child_handle(last_index);
+ let location = self.state.dom.location_for_node(&last_handle);
+
+ self.state.start =
+ Location::from(location.position + location.length - 1);
self.state.end = self.state.start;
+ // add a trailing space in cases when we do not have a next sibling
self.create_update_replace_all()
}
}
@@ -121,4 +129,68 @@ mod test {
let html_str = html.to_string();
assert_eq!(html_str, "test
");
}
+
+ #[test]
+ fn test_replace_html_with_existing_selection() {
+ let mut model = cm("Hello{world}|test");
+ let new_html = "replacement
";
+
+ let _ =
+ model.replace_html(new_html.into(), HtmlSource::UnknownExternal);
+
+ let html = model.get_content_as_html();
+ let html_str = html.to_string();
+ assert_eq!(
+ html_str,
+ "Hello
replacement
test
"
+ );
+ }
+
+ #[test]
+ fn test_replace_html_cursor_position_after_insert() {
+ let mut model = cm("Start|");
+ let new_html = "Bold text";
+ let _ = model.replace_html(new_html.into(), HtmlSource::Matrix);
+ // Cursor should be positioned after the inserted content
+ let (start, end) = model.safe_selection();
+ assert_eq!(start, end); // No selection, just cursor
+ model.bold();
+ model.enter();
+ // Insert more text to verify cursor position
+ let _ = model.replace_text("End".into());
+ let html = model.get_content_as_html();
+ let html_str = html.to_string();
+ assert_eq!(
+ html_str,
+ "Start
Bold text
End
"
+ );
+ }
+
+ #[test]
+ fn test_replace_html_multiple_meta_tags() {
+ let mut model = cm("|");
+ let html_with_multiple_metas = r#"Content after metas
"#;
+
+ let _ = model.replace_html(
+ html_with_multiple_metas.into(),
+ HtmlSource::UnknownExternal,
+ );
+
+ let html = model.get_content_as_html();
+ let html_str = html.to_string();
+ assert!(!html_str.contains("Content after metas");
+ }
+
+ #[test]
+ fn test_replace_html_empty_content() {
+ let mut model = cm("Existing content|");
+ let empty_html = "";
+
+ let _ = model.replace_html(empty_html.into(), HtmlSource::Matrix);
+
+ let html = model.get_content_as_html();
+ let html_str = html.to_string();
+ assert_eq!(html_str, "Existing content
");
+ }
}
diff --git a/crates/wysiwyg/src/dom/dom_struct.rs b/crates/wysiwyg/src/dom/dom_struct.rs
index bff6acc9a..d84d9bdde 100644
--- a/crates/wysiwyg/src/dom/dom_struct.rs
+++ b/crates/wysiwyg/src/dom/dom_struct.rs
@@ -11,6 +11,7 @@ use crate::dom::nodes::{ContainerNode, DomNode};
use crate::dom::to_html::ToHtmlState;
use crate::dom::to_markdown::{MarkdownError, MarkdownOptions, ToMarkdown};
use crate::dom::unicode_string::UnicodeStrExt;
+use crate::dom::DomLocation;
use crate::dom::{
find_range, to_raw_text::ToRawText, DomHandle, Range, ToTree, UnicodeString,
};
@@ -295,14 +296,24 @@ where
find_range::find_range(self, start, end)
}
- pub fn find_range_by_node(&self, node_handle: &DomHandle) -> Range {
- let result = find_range::find_pos(self, node_handle, 0, usize::MAX);
+ pub fn location_for_node(&self, node_handle: &DomHandle) -> DomLocation {
+ let locations = find_range::find_range(self, 0, usize::MAX);
+ return locations.find_location(node_handle).unwrap().clone();
+ }
- let locations = match result {
+ pub fn locations_for_node(
+ &self,
+ node_handle: &DomHandle,
+ ) -> Vec {
+ let result = find_range::find_pos(self, &node_handle, 0, usize::MAX);
+ match result {
FindResult::Found(locations) => locations,
_ => panic!("Node does not exist"),
- };
+ }
+ }
+ pub fn find_range_by_node(&self, node_handle: &DomHandle) -> Range {
+ let locations = self.locations_for_node(node_handle);
let leaves = locations.iter().filter(|l| l.is_leaf());
let s = leaves.clone().map(|l| l.position).min().unwrap();
diff --git a/crates/wysiwyg/src/dom/insert_node_at_cursor.rs b/crates/wysiwyg/src/dom/insert_node_at_cursor.rs
index 2a892c42f..1a724542a 100644
--- a/crates/wysiwyg/src/dom/insert_node_at_cursor.rs
+++ b/crates/wysiwyg/src/dom/insert_node_at_cursor.rs
@@ -67,6 +67,10 @@ where
};
}
+ self.wrap_inline_nodes_into_paragraphs_if_needed(
+ &self.parent(&inserted_handle).handle(),
+ );
+
#[cfg(any(test, feature = "assert-invariants"))]
self.assert_invariants();
diff --git a/crates/wysiwyg/src/dom/parser/parse.rs b/crates/wysiwyg/src/dom/parser/parse.rs
index ad581d28a..720cd8123 100644
--- a/crates/wysiwyg/src/dom/parser/parse.rs
+++ b/crates/wysiwyg/src/dom/parser/parse.rs
@@ -9,7 +9,7 @@ use regex::Regex;
use crate::dom::dom_creation_error::HtmlParseError;
use crate::dom::html_source::HtmlSource;
use crate::dom::nodes::dom_node::DomNodeKind::{self};
-use crate::dom::nodes::{container_node, ContainerNode, ContainerNodeKind};
+use crate::dom::nodes::{ContainerNode, ContainerNodeKind};
use crate::dom::Dom;
use crate::{DomHandle, DomNode, UnicodeString};
@@ -1190,30 +1190,25 @@ mod sys {
r#"
├>ol
- │ └>li
- │ └>p
- │ └>i
- │ └>"Italic"
- ├>ol
- │ └>li
- │ └>p
- │ └>b
- │ └>"Bold"
- ├>ol
- │ └>li
- │ └>p
- │ └>"Unformatted"
- ├>ol
- │ └>li
- │ └>p
- │ └>del
- │ └>"Strikethrough"
- ├>ol
- │ └>li
- │ └>p
- │ └>u
- │ └>"Underlined"
- ├>ol
+ │ ├>li
+ │ │ └>p
+ │ │ └>i
+ │ │ └>"Italic"
+ │ ├>li
+ │ │ └>p
+ │ │ └>b
+ │ │ └>"Bold"
+ │ ├>li
+ │ │ └>p
+ │ │ └>"Unformatted"
+ │ ├>li
+ │ │ └>p
+ │ │ └>del
+ │ │ └>"Strikethrough"
+ │ ├>li
+ │ │ └>p
+ │ │ └>u
+ │ │ └>"Underlined"
│ └>li
│ └>p
│ └>a "https://matrix.org/"
@@ -1253,42 +1248,7 @@ fn post_process_adjacent_text(
fn post_process_for_block_and_inline_siblings(
mut dom: Dom,
) -> Dom {
- let continer_handles = find_containers_with_inline_and_block_children(&dom);
- for handle in continer_handles.iter().rev() {
- dom = post_process_container_for_block_and_inline_siblings(dom, handle);
- }
- dom
-}
-
-fn find_containers_with_inline_and_block_children(
- dom: &Dom,
-) -> Vec {
- dom.iter_containers()
- .filter(|n| {
- if n.children().is_empty() {
- return false; // Skip empty containers
- }
- let all_nodes_are_inline =
- n.children().iter().all(|n| !n.is_block_node());
- let all_nodes_are_block =
- n.children().iter().all(|n| n.is_block_node());
- !all_nodes_are_inline && !all_nodes_are_block
- })
- .map(|n| n.handle())
- .collect::>()
-}
-
-fn post_process_container_for_block_and_inline_siblings(
- mut dom: Dom,
- handle: &DomHandle,
-) -> Dom {
- // upate the container node by grouping inline nodes, to avoid
- // having inline nodes as siblings of block nodes.
- let container_node =
- dom.lookup_node_mut(handle).as_container_mut().unwrap();
- let new_children =
- group_inline_nodes(container_node.remove_children().to_vec());
- container_node.insert_children(0, new_children.clone());
+ dom.wrap_inline_nodes_into_paragraphs_if_needed(&DomHandle::root());
dom
}
@@ -1296,6 +1256,7 @@ fn post_process_blocks(mut dom: Dom) -> Dom {
let block_handles = find_blocks(&dom);
for handle in block_handles.iter().rev() {
dom = post_process_block_lines(dom, handle);
+ dom.join_nodes_in_container(&handle);
}
dom
}
@@ -1575,6 +1536,7 @@ mod js {
self.webdom_to_dom(document, html_source)
.map_err(to_dom_creation_error)
.map(post_process_blocks)
+ .map(post_process_for_block_and_inline_siblings)
.map(post_process_for_adjacent_text)
}
diff --git a/crates/wysiwyg/src/dom/range.rs b/crates/wysiwyg/src/dom/range.rs
index 4ec6b9e5c..22c1085a4 100644
--- a/crates/wysiwyg/src/dom/range.rs
+++ b/crates/wysiwyg/src/dom/range.rs
@@ -7,6 +7,7 @@
use crate::dom::dom_handle::DomHandle;
use crate::dom::nodes::dom_node::DomNodeKind;
use std::cmp::{min, Ordering};
+use std::fmt;
/// Represents the relative position of a DomLocation towards
/// the range start and end.
@@ -202,6 +203,21 @@ impl DomLocation {
}
}
+impl fmt::Display for DomLocation {
+ fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+ write!(
+ f,
+ "DomLocation[node_handle: {:?}, position: {}, start_offset: {}, end_offset: {}, length: {}, kind: {:?}]",
+ self.node_handle.raw(),
+ self.position,
+ self.start_offset,
+ self.end_offset,
+ self.length,
+ self.kind
+ )
+ }
+}
+
impl PartialOrd for DomLocation {
fn partial_cmp(&self, other: &Self) -> Option {
Some(self.cmp(other))
diff --git a/crates/wysiwyg/src/tests/test_deleting.rs b/crates/wysiwyg/src/tests/test_deleting.rs
index fc5a01571..a28033287 100644
--- a/crates/wysiwyg/src/tests/test_deleting.rs
+++ b/crates/wysiwyg/src/tests/test_deleting.rs
@@ -878,7 +878,7 @@ fn backspace_immutable_link_from_inside_link() {
#[test]
fn backspace_immutable_link_multiple() {
let mut model = cm(
- "firstsecond|",
+ "firstsecond|",
);
model.backspace();
assert_eq!(
@@ -956,12 +956,12 @@ fn delete_mention_from_start() {
#[test]
fn delete_first_immutable_link_of_multiple() {
let mut model = cm(
- "|firstsecond",
+ "|firstsecond",
);
model.delete();
assert_eq!(
restore_whitespace(&tx(&model)),
- "|second"
+ "|second"
);
model.delete();
assert_eq!(restore_whitespace(&tx(&model)), "|");
@@ -984,7 +984,7 @@ fn delete_first_mention_of_multiple() {
#[test]
fn delete_second_immutable_link_of_multiple() {
let mut model = cm(
- "firstsecond|",
+ "firstsecond|",
);
model.backspace();
assert_eq!(
From 3b8b034df0438024e2942e3562d32f69e6ce51c3 Mon Sep 17 00:00:00 2001
From: David Langley
Date: Wed, 23 Jul 2025 14:42:31 +0100
Subject: [PATCH 12/17] Add stricter checks/tests around lists
As we are now parsing HTML from non-Matrix sources, add some rigour around validating the resulting DOM, e.g. making sure that nodes other than list items are not added to lists and that list items are not added to containers other than lists.
---
.../src/composer_model/replace_html.rs | 97 ++
crates/wysiwyg/src/dom/parser/parse.rs | 1012 ++++++++++-------
2 files changed, 674 insertions(+), 435 deletions(-)
diff --git a/crates/wysiwyg/src/composer_model/replace_html.rs b/crates/wysiwyg/src/composer_model/replace_html.rs
index e21613a8e..f7b1dfe24 100644
--- a/crates/wysiwyg/src/composer_model/replace_html.rs
+++ b/crates/wysiwyg/src/composer_model/replace_html.rs
@@ -193,4 +193,101 @@ mod test {
let html_str = html.to_string();
assert_eq!(html_str, "Existing content
");
}
+
+ #[test]
+ fn test_insert_list_item_without_list_parent() {
+ let mut model = cm("hello|");
+ let html = "- list item
";
+
+ let _ = model.replace_html(html.into(), HtmlSource::UnknownExternal);
+
+ let html = model.get_content_as_html();
+ let html_str = html.to_string();
+ assert_eq!(html_str, "hello
list item
");
+ }
+}
+
+#[cfg(all(test, target_arch = "wasm32"))]
+mod wasm_tests {
+ use crate::dom::html_source::HtmlSource;
+ use crate::tests::testutils_composer_model::cm;
+ use wasm_bindgen_test::*;
+
+ wasm_bindgen_test_configure!(run_in_browser);
+
+ #[wasm_bindgen_test]
+ fn test_replace_html_with_existing_selection() {
+ let mut model = cm("Hello{world}|test");
+ let new_html = "replacement
";
+
+ let _ =
+ model.replace_html(new_html.into(), HtmlSource::UnknownExternal);
+
+ let html = model.get_content_as_html();
+ let html_str = html.to_string();
+ assert_eq!(
+ html_str,
+ "Hello
replacement
test
"
+ );
+ }
+
+ #[wasm_bindgen_test]
+ fn test_replace_html_cursor_position_after_insert() {
+ let mut model = cm("Start|");
+ let new_html = "Bold text";
+ let _ = model.replace_html(new_html.into(), HtmlSource::Matrix);
+ // Cursor should be positioned after the inserted content
+ let (start, end) = model.safe_selection();
+ assert_eq!(start, end); // No selection, just cursor
+ model.bold();
+ model.enter();
+ // Insert more text to verify cursor position
+ let _ = model.replace_text("End".into());
+ let html = model.get_content_as_html();
+ let html_str = html.to_string();
+ assert_eq!(
+ html_str,
+ "Start
Bold text
End
"
+ );
+ }
+
+ #[wasm_bindgen_test]
+ fn test_replace_html_multiple_meta_tags() {
+ let mut model = cm("|");
+ let html_with_multiple_metas = r#"Content after metas
"#;
+
+ let _ = model.replace_html(
+ html_with_multiple_metas.into(),
+ HtmlSource::UnknownExternal,
+ );
+
+ let html = model.get_content_as_html();
+ let html_str = html.to_string();
+ assert!(!html_str.contains("Content after metas");
+ }
+
+ #[wasm_bindgen_test]
+ fn test_replace_html_empty_content() {
+ let mut model = cm("Existing content|");
+ let empty_html = "";
+
+ let _ = model.replace_html(empty_html.into(), HtmlSource::Matrix);
+
+ let html = model.get_content_as_html();
+ let html_str = html.to_string();
+ assert_eq!(html_str, "Existing content
");
+ }
+
+ #[wasm_bindgen_test]
+ fn test_insert_list_item_without_list_parent() {
+ let mut model = cm("hello|");
+ let html = "- list item
";
+
+ let _ = model.replace_html(html.into(), HtmlSource::UnknownExternal);
+
+ let html = model.get_content_as_html();
+ let html_str = html.to_string();
+ assert_eq!(html_str, "hello
list item
");
+ }
}
diff --git a/crates/wysiwyg/src/dom/parser/parse.rs b/crates/wysiwyg/src/dom/parser/parse.rs
index 720cd8123..d388f6837 100644
--- a/crates/wysiwyg/src/dom/parser/parse.rs
+++ b/crates/wysiwyg/src/dom/parser/parse.rs
@@ -207,7 +207,7 @@ mod sys {
&mut self,
padom: &PaDom,
child: &PaNodeContainer,
- node: &mut ContainerNode,
+ node_in: &mut ContainerNode,
html_source: HtmlSource,
) -> Result<(), Error>
where
@@ -215,194 +215,230 @@ mod sys {
{
let cur_path_idx = self.current_path.len();
let tag = child.name.local.as_ref();
- match tag {
- "b" | "code" | "del" | "em" | "i" | "strong" | "u" => {
- let formatting_node = Self::new_formatting(tag);
- if tag == "code" && self.current_path.contains(&CodeBlock) {
+ let mut invalid_node_error: Option = None;
+ let mut skip_children: bool = false;
+ let mut node = node_in.clone();
+ if node.is_list()
+ && tag != "li"
+ && html_source != HtmlSource::GoogleDoc
+ {
+ // If we are inside a list, we can only have list items.
+ invalid_node_error = Some(Error::InvalidListItemNode);
+ skip_children = true;
+ }
+
+ if invalid_node_error.is_none() {
+ match tag {
+ "b" | "code" | "del" | "em" | "i" | "strong" | "u" => {
+ let formatting_node = Self::new_formatting(tag);
+ if tag == "code"
+ && self.current_path.contains(&CodeBlock)
+ {
+ self.convert_children(
+ padom,
+ child,
+ Some(&mut node),
+ html_source,
+ )?;
+ } else {
+ self.current_path.push(formatting_node.kind());
+ node.append_child(formatting_node);
+ self.convert_children(
+ padom,
+ child,
+ last_container_mut_in(&mut node),
+ html_source,
+ )?;
+ self.current_path.remove(cur_path_idx);
+ }
+ }
+ "span" => 'span: {
+ if html_source == HtmlSource::Matrix {
+ invalid_node_error =
+ Some(Error::UnknownNode(tag.to_string()));
+ break 'span;
+ }
+
+ // For external sources, we check for common formatting styles for spans
+ // and convert them to appropriate formatting nodes.
+ let mut formatting_tag = None;
+ if child.contains_style("font-weight", "bold") {
+ formatting_tag = Some("b");
+ } else if child.contains_style("font-style", "italic") {
+ formatting_tag = Some("i");
+ } else if child
+ .contains_style("text-decoration", "underline")
+ {
+ formatting_tag = Some("u");
+ } else if child
+ .contains_style("text-decoration", "line-through")
+ {
+ formatting_tag = Some("del");
+ }
+
+ if let Some(tag) = formatting_tag {
+ let formatting_node = Self::new_formatting(tag);
+ self.current_path.push(formatting_node.kind());
+ node.append_child(formatting_node);
+ self.convert_children(
+ padom,
+ child,
+ last_container_mut_in(&mut node),
+ html_source,
+ )?;
+ self.current_path.remove(cur_path_idx);
+ } else {
+ // If no formatting tag was found, just skip and convert the children
+ invalid_node_error =
+ Some(Error::UnknownNode(tag.to_string()));
+ }
+ }
+ "br" => {
+ node.append_child(Self::new_line_break());
+ }
+ "ol" | "ul" => 'list: {
+ let target_node = if node.is_list() {
+ // Google docs adds nested lists as children of the list node, this breaks our invariants.
+ // For the google docs case, we can add the nested list to the last list item instead.
+ if html_source != HtmlSource::GoogleDoc
+ || node.last_child_mut().is_none()
+ || node.last_child_mut().unwrap().is_list_item()
+ == false
+ {
+ // If source is not Google Docs or the last child is not a list item, we return an error.
+ invalid_node_error =
+ Some(Error::InvalidListItemNode);
+ break 'list;
+ }
+ node.last_child_mut()
+ .unwrap()
+ .as_container_mut()
+ .unwrap()
+ } else {
+ &mut node
+ };
+ self.current_path.push(DomNodeKind::List);
+ if tag == "ol" {
+ let custom_start = child
+ .get_attr("start")
+ .and_then(|start| start.parse::().ok());
+ target_node.append_child(Self::new_ordered_list(
+ custom_start,
+ ));
+ } else {
+ target_node
+ .append_child(Self::new_unordered_list());
+ }
self.convert_children(
padom,
child,
- Some(node),
+ last_container_mut_in(target_node),
html_source,
)?;
- } else {
- self.current_path.push(formatting_node.kind());
- node.append_child(formatting_node);
+ self.current_path.remove(cur_path_idx);
+ }
+ "li" => 'li: {
+ if !node.is_list() {
+ invalid_node_error = Some(Error::ParentNotAList);
+ break 'li;
+ }
+ self.current_path.push(DomNodeKind::ListItem);
+ node.append_child(Self::new_list_item());
self.convert_children(
padom,
child,
- last_container_mut_in(node),
+ last_container_mut_in(&mut node),
html_source,
)?;
self.current_path.remove(cur_path_idx);
}
- }
- "span" => {
- if html_source == HtmlSource::Matrix {
- return Err(Error::UnknownNode(tag.to_string()));
- }
+ "a" => {
+ let is_mention = child.attrs.iter().any(|(k, v)| {
+ k == &String::from("href")
+ && Mention::is_valid_uri(v)
+ });
+
+ let text =
+ child.children.first().map(|gc| padom.get_node(gc));
+ let text = match text {
+ Some(PaDomNode::Text(text)) => Some(text),
+ _ => None,
+ };
- // For external sources, we check for common formatting styles for spans
- // and convert them to appropriate formatting nodes.
- let mut formatting_tag = None;
- if child.contains_style("font-weight", "bold") {
- formatting_tag = Some("b");
- } else if child.contains_style("font-style", "italic") {
- formatting_tag = Some("i");
- } else if child
- .contains_style("text-decoration", "underline")
- {
- formatting_tag = Some("u");
- } else if child
- .contains_style("text-decoration", "line-through")
- {
- formatting_tag = Some("del");
+ if is_mention && text.is_some() {
+ self.current_path.push(DomNodeKind::Mention);
+ let mention =
+ Self::new_mention(child, text.unwrap());
+ node.append_child(mention);
+ } else {
+ self.current_path.push(DomNodeKind::Link);
+
+ let link = Self::new_link(child);
+ node.append_child(link);
+ self.convert_children(
+ padom,
+ child,
+ last_container_mut_in(&mut node),
+ html_source,
+ )?;
+ }
+ self.current_path.remove(cur_path_idx);
}
-
- if let Some(tag) = formatting_tag {
- let formatting_node = Self::new_formatting(tag);
- self.current_path.push(formatting_node.kind());
- node.append_child(formatting_node);
+ "pre" => {
+ self.current_path.push(DomNodeKind::CodeBlock);
+ node.append_child(Self::new_code_block());
self.convert_children(
padom,
child,
- last_container_mut_in(node),
+ last_container_mut_in(&mut node),
html_source,
)?;
self.current_path.remove(cur_path_idx);
- } else {
- // If no formatting tag was found, just skip and convert the children
- self.convert(padom, child, node, html_source)?;
}
- }
- "br" => {
- node.append_child(Self::new_line_break());
- }
- "ol" | "ul" => {
- let target_node = if node.is_list() {
- // Google docs adds nested lists as children of the list node, this breaks our invariants.
- // For the google docs case, we can add the nested list to the last list item instead.
- if html_source != HtmlSource::GoogleDoc
- || node.last_child_mut().is_none()
- || node.last_child_mut().unwrap().is_list_item()
- == false
- {
- // If source is not Google Docs or the last child is not a list item, we return an error.
- return Err(Error::InvalidListItemNode);
- }
- node.last_child_mut()
- .unwrap()
- .as_container_mut()
- .unwrap()
- } else {
- node
- };
- self.current_path.push(DomNodeKind::List);
- if tag == "ol" {
- let custom_start = child
- .get_attr("start")
- .and_then(|start| start.parse::().ok());
- target_node
- .append_child(Self::new_ordered_list(custom_start));
- } else {
- target_node.append_child(Self::new_unordered_list());
+ "blockquote" => {
+ self.current_path.push(DomNodeKind::Quote);
+ node.append_child(Self::new_quote());
+ self.convert_children(
+ padom,
+ child,
+ last_container_mut_in(&mut node),
+ html_source,
+ )?;
+
+ self.current_path.remove(cur_path_idx);
}
- self.convert_children(
- padom,
- child,
- last_container_mut_in(target_node),
- html_source,
- )?;
- self.current_path.remove(cur_path_idx);
- }
- "li" => {
- self.current_path.push(DomNodeKind::ListItem);
- node.append_child(Self::new_list_item());
- self.convert_children(
- padom,
- child,
- last_container_mut_in(node),
- html_source,
- )?;
- self.current_path.remove(cur_path_idx);
- }
- "a" => {
- let is_mention = child.attrs.iter().any(|(k, v)| {
- k == &String::from("href") && Mention::is_valid_uri(v)
- });
-
- let text =
- child.children.first().map(|gc| padom.get_node(gc));
- let text = match text {
- Some(PaDomNode::Text(text)) => Some(text),
- _ => None,
- };
-
- if is_mention && text.is_some() {
- self.current_path.push(DomNodeKind::Mention);
- let mention = Self::new_mention(child, text.unwrap());
- node.append_child(mention);
- } else {
- self.current_path.push(DomNodeKind::Link);
-
- let link = Self::new_link(child);
- node.append_child(link);
+ "html" => {
+ // Skip the html tag - add its children to the
+ // current node directly.
+ self.convert(padom, child, &mut node, html_source)?;
+ }
+ "p" => {
+ self.current_path.push(DomNodeKind::Paragraph);
+ node.append_child(Self::new_paragraph());
self.convert_children(
padom,
child,
- last_container_mut_in(node),
+ last_container_mut_in(&mut node),
html_source,
)?;
+ self.current_path.remove(cur_path_idx);
}
- self.current_path.remove(cur_path_idx);
- }
- "pre" => {
- self.current_path.push(DomNodeKind::CodeBlock);
- node.append_child(Self::new_code_block());
- self.convert_children(
- padom,
- child,
- last_container_mut_in(node),
- html_source,
- )?;
- self.current_path.remove(cur_path_idx);
- }
- "blockquote" => {
- self.current_path.push(DomNodeKind::Quote);
- node.append_child(Self::new_quote());
- self.convert_children(
- padom,
- child,
- last_container_mut_in(node),
- html_source,
- )?;
-
- self.current_path.remove(cur_path_idx);
- }
- "html" => {
- // Skip the html tag - add its children to the
- // current node directly.
- self.convert(padom, child, node, html_source)?;
- }
- "p" => {
- self.current_path.push(DomNodeKind::Paragraph);
- node.append_child(Self::new_paragraph());
- self.convert_children(
- padom,
- child,
- last_container_mut_in(node),
- html_source,
- )?;
- self.current_path.remove(cur_path_idx);
- }
- _ => {
- if html_source == HtmlSource::Matrix {
- return Err(Error::UnknownNode(tag.to_string()));
- } else {
- self.convert(padom, child, node, html_source)?;
+ _ => {
+ invalid_node_error =
+ Some(Error::UnknownNode(tag.to_string()));
}
+ };
+ }
+
+ if let Some(err) = invalid_node_error {
+ if html_source == HtmlSource::Matrix {
+ return Err(err);
+ } else if !skip_children {
+ // If the source is not Matrix and we haven't explicitly flagged to skip the children continue to parse them.
+ self.convert(padom, child, &mut node, html_source)?;
}
- };
+ }
+ *node_in = node;
Ok(())
}
@@ -557,6 +593,7 @@ mod sys {
NoBody,
UnknownNode(String),
InvalidListItemNode,
+ ParentNotAList,
}
impl fmt::Display for Error {
@@ -577,6 +614,9 @@ mod sys {
"Invalid list item node: a list must only contain list items"
)
}
+ Self::ParentNotAList => {
+ write!(formatter, "Parent node is not a list")
+ }
}
}
}
@@ -1116,6 +1156,15 @@ mod sys {
);
}
+ #[test]
+ fn parse_insert_text_directly_into_a_list() {
+ let html = r#""#;
+ let dom: Dom = HtmlParser::default()
+ .parse_from_source(html, HtmlSource::UnknownExternal)
+ .unwrap();
+ assert_eq!(dom.to_html(), r#""#);
+ }
+
#[test]
fn parse_google_doc_rich_text() {
let dom: Dom = HtmlParser::default()
@@ -1591,314 +1640,385 @@ mod js {
let node = nodes.get(nth as _).unwrap();
let node_name = node.node_name();
let tag = node_name.as_str();
- match tag {
- "BR" => {
- dom.append_child(DomNode::new_line_break());
- }
- "#text" => match node.node_value() {
- Some(value) => {
- let is_inside_code_block =
- self.current_path.contains(&CodeBlock);
- let is_only_child_in_parent = number_of_nodes == 1;
- convert_text(
- value.as_str(),
- dom,
- is_inside_code_block,
- is_only_child_in_parent,
- );
+ let mut invalid_node_error: Option = None;
+ let mut skip_children: bool = false;
+
+ // Check if we're inside a list and this node is not a list item
+ if parent_kind == DomNodeKind::List
+ && tag != "LI"
+ && html_source != HtmlSource::GoogleDoc
+ {
+ // If we are inside a list, we can only have list items.
+ invalid_node_error = Some(Error::InvalidListItemNode);
+ skip_children = true;
+ }
+
+ if invalid_node_error.is_none() {
+ match tag {
+ "BR" => {
+ dom.append_child(DomNode::new_line_break());
}
- _ => {}
- },
- "A" => {
- self.current_path.push(DomNodeKind::Link);
+ "#text" => match node.node_value() {
+ Some(value) => {
+ let is_inside_code_block =
+ self.current_path.contains(&CodeBlock);
+ let is_only_child_in_parent =
+ number_of_nodes == 1;
+ convert_text(
+ value.as_str(),
+ dom,
+ is_inside_code_block,
+ is_only_child_in_parent,
+ );
+ }
+ _ => {}
+ },
- let mut attributes = vec![];
- // we only need to pass in a style attribute from web to allow CSS variable insertion
- let valid_attributes = ["style"];
+ "A" => {
+ self.current_path.push(DomNodeKind::Link);
- for attr in valid_attributes.into_iter() {
- if node
- .unchecked_ref::()
- .has_attribute(attr)
- {
- attributes.push((
- attr.into(),
- node.unchecked_ref::()
- .get_attribute(attr)
- .unwrap_or_default()
- .into(),
- ))
- }
- }
+ let mut attributes = vec![];
+ // we only need to pass in a style attribute from web to allow CSS variable insertion
+ let valid_attributes = ["style"];
- let url = node
- .unchecked_ref::()
- .get_attribute("href")
- .unwrap_or_default();
-
- let is_mention =
- Mention::is_valid_uri(&url.to_string());
- let text = node.child_nodes().get(0);
- let has_text = match text.clone() {
- Some(node) => {
- node.node_type() == web_sys::Node::TEXT_NODE
- }
- None => false,
- };
- if has_text && is_mention {
- dom.append_child(
- DomNode::Mention(
- DomNode::new_mention(
- url.into(),
- text.unwrap()
- .node_value()
+ for attr in valid_attributes.into_iter() {
+ if node
+ .unchecked_ref::()
+ .has_attribute(attr)
+ {
+ attributes.push((
+ attr.into(),
+ node.unchecked_ref::()
+ .get_attribute(attr)
.unwrap_or_default()
.into(),
- attributes,
- )
- .unwrap(),
- ), // we unwrap because we have already confirmed the uri is valid
- );
- } else {
- let children = self
- .convert(
- node.child_nodes(),
- DomNodeKind::Link,
- html_source,
- )?
- .take_children();
- dom.append_child(DomNode::new_link(
- url.into(),
- children,
- attributes,
- ));
+ ))
+ }
+ }
+
+ let url = node
+ .unchecked_ref::()
+ .get_attribute("href")
+ .unwrap_or_default();
+
+ let is_mention =
+ Mention::is_valid_uri(&url.to_string());
+ let text = node.child_nodes().get(0);
+ let has_text = match text.clone() {
+ Some(node) => {
+ node.node_type() == web_sys::Node::TEXT_NODE
+ }
+ None => false,
+ };
+ if has_text && is_mention {
+ dom.append_child(
+ DomNode::Mention(
+ DomNode::new_mention(
+ url.into(),
+ text.unwrap()
+ .node_value()
+ .unwrap_or_default()
+ .into(),
+ attributes,
+ )
+ .unwrap(),
+ ), // we unwrap because we have already confirmed the uri is valid
+ );
+ } else {
+ let children = self
+ .convert(
+ node.child_nodes(),
+ DomNodeKind::Link,
+ html_source,
+ )?
+ .take_children();
+ dom.append_child(DomNode::new_link(
+ url.into(),
+ children,
+ attributes,
+ ));
+ }
+
+ self.current_path.pop();
}
+ "UL" | "OL" => {
+ let custom_start = node
+ .unchecked_ref::()
+ .get_attribute("start");
+
+ let attributes: Option> =
+ if tag == "OL" && custom_start.is_some() {
+ Some(vec![(
+ "start".into(),
+ custom_start.unwrap().into(),
+ )])
+ } else {
+ None
+ };
- self.current_path.pop();
- }
- "UL" | "OL" => {
- let custom_start = node
- .unchecked_ref::()
- .get_attribute("start");
-
- let attributes: Option> =
- if tag == "ol" && custom_start.is_some() {
- Some(vec![(
- "start".into(),
- custom_start.unwrap().into(),
- )])
+ let list_type = if tag == "OL" {
+ ListType::Ordered
} else {
- None
+ ListType::Unordered
};
- let list_type = if tag == "OL" {
- ListType::Ordered
- } else {
- ListType::Unordered
- };
+ if parent_kind == DomNodeKind::List {
+ // Google docs adds nested lists as children of the list node, this breaks our invariants.
+ // For the google docs case, we can add the nested list to the last list item instead.
+ if html_source != HtmlSource::GoogleDoc
+ || dom.last_child_mut().is_none()
+ || dom
+ .last_child_mut()
+ .unwrap()
+ .is_list_item()
+ == false
+ {
+ // If source is not Google Docs or the last child is not a list item, we return an error.
+ invalid_node_error =
+ Some(Error::InvalidListItemNode);
+ } else {
+ self.current_path.push(DomNodeKind::List);
+ let target = dom
+ .last_child_mut()
+ .unwrap()
+ .as_container_mut()
+ .unwrap();
+ target.append_child(DomNode::Container(
+ ContainerNode::new_list(
+ list_type,
+ self.convert(
+ node.child_nodes(),
+ DomNodeKind::List,
+ html_source,
+ )?
+ .take_children(),
+ attributes,
+ ),
+ ));
+ self.current_path.pop();
+ }
+ } else {
+ self.current_path.push(DomNodeKind::List);
+ dom.append_child(DomNode::Container(
+ ContainerNode::new_list(
+ list_type,
+ self.convert(
+ node.child_nodes(),
+ DomNodeKind::List,
+ html_source,
+ )?
+ .take_children(),
+ attributes,
+ ),
+ ));
+ self.current_path.pop();
+ }
+ }
- if parent_kind == DomNodeKind::List {
- if html_source != HtmlSource::GoogleDoc {
- return Err(Error::InvalidListItemNode);
+ "LI" => {
+ if parent_kind != DomNodeKind::List {
+ invalid_node_error =
+ Some(Error::ParentNotAList);
+ } else {
+ self.current_path.push(DomNodeKind::ListItem);
+ dom.append_child(DomNode::Container(
+ ContainerNode::new_list_item(
+ self.convert(
+ node.child_nodes(),
+ DomNodeKind::ListItem,
+ html_source,
+ )?
+ .take_children(),
+ ),
+ ));
+ self.current_path.pop();
}
- self.current_path.push(DomNodeKind::List);
- let target = dom
- .last_child_mut()
- .unwrap()
- .as_container_mut()
- .unwrap();
- target.append_child(DomNode::Container(
- ContainerNode::new_list(
- list_type,
+ }
+
+ "PRE" => {
+ self.current_path.push(DomNodeKind::CodeBlock);
+ let children = node.child_nodes();
+ let children = if children.length() == 1
+ && children.get(0).unwrap().node_name().as_str()
+ == "CODE"
+ {
+ let code_node = children.get(0).unwrap();
+ code_node.child_nodes()
+ } else {
+ children
+ };
+ dom.append_child(DomNode::Container(
+ ContainerNode::new_code_block(
self.convert(
- node.child_nodes(),
- DomNodeKind::List,
+ children,
+ DomNodeKind::CodeBlock,
html_source,
)?
.take_children(),
- attributes,
),
));
- } else {
+ self.current_path.pop();
+ }
+
+ "BLOCKQUOTE" => {
+ self.current_path.push(DomNodeKind::Quote);
dom.append_child(DomNode::Container(
- ContainerNode::new_list(
- list_type,
+ ContainerNode::new_quote(
self.convert(
node.child_nodes(),
- DomNodeKind::List,
+ DomNodeKind::Quote,
html_source,
)?
.take_children(),
- attributes,
),
));
+ self.current_path.pop();
}
- self.current_path.pop();
- }
-
- "LI" => {
- self.current_path.push(DomNodeKind::ListItem);
- dom.append_child(DomNode::Container(
- ContainerNode::new_list_item(
- self.convert(
- node.child_nodes(),
- DomNodeKind::ListItem,
- html_source,
- )?
- .take_children(),
- ),
- ));
- self.current_path.pop();
- }
-
- "PRE" => {
- self.current_path.push(DomNodeKind::CodeBlock);
- let children = node.child_nodes();
- let children = if children.length() == 1
- && children.get(0).unwrap().node_name().as_str()
- == "CODE"
- {
- let code_node = children.get(0).unwrap();
- code_node.child_nodes()
- } else {
- children
- };
- dom.append_child(DomNode::Container(
- ContainerNode::new_code_block(
- self.convert(
- children,
- DomNodeKind::CodeBlock,
- html_source,
- )?
- .take_children(),
- ),
- ));
- self.current_path.pop();
- }
-
- "BLOCKQUOTE" => {
- self.current_path.push(DomNodeKind::Quote);
- dom.append_child(DomNode::Container(
- ContainerNode::new_quote(
- self.convert(
- node.child_nodes(),
- DomNodeKind::Quote,
- html_source,
- )?
- .take_children(),
- ),
- ));
- self.current_path.pop();
- }
-
- "P" => {
- self.current_path.push(DomNodeKind::Paragraph);
- dom.append_child(DomNode::Container(
- ContainerNode::new_paragraph(
- self.convert(
- node.child_nodes(),
- DomNodeKind::Paragraph,
- html_source,
- )?
- .take_children(),
- ),
- ));
- self.current_path.pop();
- }
- node_name => {
- let formatting_kind = match node_name {
- "STRONG" | "B" => Some(InlineFormatType::Bold),
- "EM" | "I" => Some(InlineFormatType::Italic),
- "DEL" => Some(InlineFormatType::StrikeThrough),
- "U" => Some(InlineFormatType::Underline),
- "CODE" => Some(InlineFormatType::InlineCode),
- "SPAN" => {
- if html_source == HtmlSource::Matrix {
- return Err(Error::UnknownNode(
- node_name.to_owned(),
- ));
+ "P" => {
+ self.current_path.push(DomNodeKind::Paragraph);
+ dom.append_child(DomNode::Container(
+ ContainerNode::new_paragraph(
+ self.convert(
+ node.child_nodes(),
+ DomNodeKind::Paragraph,
+ html_source,
+ )?
+ .take_children(),
+ ),
+ ));
+ self.current_path.pop();
+ }
+ node_name => {
+ let formatting_kind = match node_name {
+ "STRONG" | "B" => Some(InlineFormatType::Bold),
+ "EM" | "I" => Some(InlineFormatType::Italic),
+ "DEL" => Some(InlineFormatType::StrikeThrough),
+ "U" => Some(InlineFormatType::Underline),
+ "CODE" => Some(InlineFormatType::InlineCode),
+ "SPAN" => {
+ if html_source == HtmlSource::Matrix {
+ invalid_node_error =
+ Some(Error::UnknownNode(
+ node_name.to_owned(),
+ ));
+ None
+ } else {
+ // For external sources, we check for common formatting styles for spans
+ // and convert them to appropriate formatting nodes.
+ let style = node
+ .unchecked_ref::()
+ .style();
+ if style
+ .get_property_value("font-weight")
+ .unwrap_or_default()
+ == "bold"
+ {
+ Some(InlineFormatType::Bold)
+ } else if style
+ .get_property_value("font-style")
+ .unwrap_or_default()
+ == "italic"
+ {
+ Some(InlineFormatType::Italic)
+ } else if style
+ .get_property_value(
+ "text-decoration",
+ )
+ .unwrap_or_default()
+ == "underline"
+ {
+ Some(InlineFormatType::Underline)
+ } else if style
+ .get_property_value(
+ "text-decoration",
+ )
+ .unwrap_or_default()
+ == "line-through"
+ {
+ Some(
+ InlineFormatType::StrikeThrough,
+ )
+ } else {
+ invalid_node_error =
+ Some(Error::UnknownNode(
+ node_name.to_owned(),
+ ));
+ None
+ }
+ }
}
- // For external sources, we check for common formatting styles for spans
- // and convert them to appropriate formatting nodes.
- let style =
- node.unchecked_ref::().style();
- if style
- .get_property_value("font-weight")
- .unwrap_or_default()
- == "bold"
- {
- Some(InlineFormatType::Bold)
- } else if style
- .get_property_value("font-style")
- .unwrap_or_default()
- == "italic"
- {
- Some(InlineFormatType::Italic)
- } else if style
- .get_property_value("text-decoration")
- .unwrap_or_default()
- == "underline"
- {
- Some(InlineFormatType::Underline)
- } else if style
- .get_property_value("text-decoration")
- .unwrap_or_default()
- == "line-through"
- {
- Some(InlineFormatType::StrikeThrough)
- } else {
+ _ => {
+ invalid_node_error =
+ Some(Error::UnknownNode(
+ node_name.to_owned(),
+ ));
None
}
- }
- _ => {
- if html_source == HtmlSource::Matrix {
- return Err(Error::UnknownNode(
- node_name.to_owned(),
+ };
+
+ if let Some(formatting_kind) = formatting_kind {
+ // Special case for code inside code blocks - skip the inline code formatting
+ if formatting_kind
+ == InlineFormatType::InlineCode
+ && self.current_path.contains(&CodeBlock)
+ {
+ let children_nodes = self
+ .convert(
+ node.child_nodes(),
+ parent_kind.clone(),
+ html_source,
+ )?
+ .take_children();
+ if !children_nodes.is_empty() {
+ dom.append_children(children_nodes);
+ }
+ } else {
+ self.current_path.push(
+ DomNodeKind::Formatting(
+ formatting_kind.clone(),
+ ),
+ );
+ let children_nodes = self
+ .convert(
+ node.child_nodes(),
+ DomNodeKind::Formatting(
+ formatting_kind.clone(),
+ ),
+ html_source,
+ )?
+ .take_children();
+
+ dom.append_child(DomNode::Container(
+ ContainerNode::new_formatting(
+ formatting_kind.clone(),
+ children_nodes,
+ ),
));
+ self.current_path.pop();
}
- None
}
- };
-
- if let Some(formatting_kind) = formatting_kind {
- self.current_path.push(DomNodeKind::Formatting(
- formatting_kind.clone(),
- ));
- let children_nodes = self
- .convert(
- node.child_nodes(),
- DomNodeKind::Formatting(
- formatting_kind.clone(),
- ),
- html_source,
- )?
- .take_children();
- self.current_path.push(DomNodeKind::Formatting(
- formatting_kind.clone(),
- ));
+ }
+ }
+ }
- dom.append_child(DomNode::Container(
- ContainerNode::new_formatting(
- formatting_kind.clone(),
- children_nodes,
- ),
- ));
- self.current_path.pop();
- } else {
- // If it's an external source we skip the node and process it's children.
- let children_nodes = self
- .convert(
- node.child_nodes(),
- parent_kind.clone(),
- html_source,
- )?
- .take_children();
- if !children_nodes.is_empty() {
- dom.append_children(children_nodes);
- }
+ // Handle invalid node errors
+ if let Some(err) = invalid_node_error {
+ if html_source == HtmlSource::Matrix {
+ return Err(err);
+ } else if !skip_children {
+ // If the source is not Matrix and we haven't explicitly flagged to skip the children continue to parse them.
+ let children_nodes = self
+ .convert(
+ node.child_nodes(),
+ parent_kind.clone(),
+ html_source,
+ )?
+ .take_children();
+ if !children_nodes.is_empty() {
+ dom.append_children(children_nodes);
}
}
}
@@ -1921,6 +2041,7 @@ mod js {
NoBody,
UnknownNode(String),
InvalidListItemNode,
+ ParentNotAList,
}
impl fmt::Display for Error {
@@ -1942,6 +2063,9 @@ mod js {
"Invalid list item node: a list must only contain list items"
)
}
+ Self::ParentNotAList => {
+ write!(formatter, "Parent node is not a list")
+ }
}
}
}
@@ -1982,6 +2106,15 @@ mod js {
roundtrip("foo bar baz");
}
+ #[wasm_bindgen_test]
+ fn parse_insert_text_directly_into_a_list() {
+ let html = r#""#;
+ let dom: Dom = HtmlParser::default()
+ .parse_from_source(html, HtmlSource::UnknownExternal)
+ .unwrap();
+ assert_eq!(dom.to_html(), r#""#);
+ }
+
#[wasm_bindgen_test]
fn google_doc_rich_text() {
let dom = HtmlParser::default()
@@ -2013,7 +2146,7 @@ mod js {
HtmlSource::UnknownExternal,
)
.unwrap();
- assert_eq!(dom.to_string(), "Italic
Bold
Unformatted
Strikethrough
Underlined
Linked
");
+ assert_eq!(dom.to_string(), "Italic
Bold
Unformatted
Strikethrough
Underlined
Linked
");
}
#[wasm_bindgen_test]
@@ -2057,17 +2190,23 @@ mod js {
#[wasm_bindgen_test]
fn ul() {
- roundtrip("foo bar");
+ roundtrip(
+ "foo
bar
",
+ );
}
#[wasm_bindgen_test]
fn ol() {
- roundtrip("foo - item1
- item2
bar");
+ roundtrip(
+ "foo
- item1
- item2
bar
",
+ );
}
#[wasm_bindgen_test]
fn pre() {
- roundtrip("foo ~Some code
bar");
+ roundtrip(
+ "foo
~Some code
bar
",
+ );
}
#[wasm_bindgen_test]
@@ -2098,7 +2237,9 @@ mod js {
#[wasm_bindgen_test]
fn blockquote() {
- roundtrip("foo ~Some code
bar");
+ roundtrip(
+ "foo
~Some code
bar
",
+ );
}
#[wasm_bindgen_test]
@@ -2129,7 +2270,8 @@ mod js {
\
\n
\
";
- let dom = HtmlParser::default().parse::(html).unwrap();
+ let dom: Dom =
+ HtmlParser::default().parse::(html).unwrap();
let tree = dom.to_tree().to_string();
assert_eq!(
tree,
From 92bdca22c33b458bb2d04b6739fb31dcbabc73c3 Mon Sep 17 00:00:00 2001
From: David Langley
Date: Wed, 23 Jul 2025 15:03:07 +0100
Subject: [PATCH 13/17] Add support for parsing font-weight == 700 in addition
to font-weight == "bold".
This fixes pasting of bold content from Google Docs (and possibly other sources).
---
.../src/composer_model/replace_html.rs | 2 +-
crates/wysiwyg/src/dom/parser/parse.rs | 20 ++++++++++++++-----
2 files changed, 16 insertions(+), 6 deletions(-)
diff --git a/crates/wysiwyg/src/composer_model/replace_html.rs b/crates/wysiwyg/src/composer_model/replace_html.rs
index f7b1dfe24..fdb3b0755 100644
--- a/crates/wysiwyg/src/composer_model/replace_html.rs
+++ b/crates/wysiwyg/src/composer_model/replace_html.rs
@@ -100,7 +100,7 @@ mod test {
let html_str = html.to_string();
assert!(!html_str.contains("Italic
Bold
Unformatted
Strikethrough
Underlined
Linked
");
+ assert_eq!(html_str, "Italic
Bold
Unformatted
Strikethrough
Underlined
Linked
");
}
#[test]
diff --git a/crates/wysiwyg/src/dom/parser/parse.rs b/crates/wysiwyg/src/dom/parser/parse.rs
index d388f6837..289e1d79b 100644
--- a/crates/wysiwyg/src/dom/parser/parse.rs
+++ b/crates/wysiwyg/src/dom/parser/parse.rs
@@ -262,7 +262,9 @@ mod sys {
// For external sources, we check for common formatting styles for spans
// and convert them to appropriate formatting nodes.
let mut formatting_tag = None;
- if child.contains_style("font-weight", "bold") {
+ if child.contains_style("font-weight", "bold")
+ || child.contains_style("font-weight", "700")
+ {
formatting_tag = Some("b");
} else if child.contains_style("font-style", "italic") {
formatting_tag = Some("i");
@@ -1174,6 +1176,7 @@ mod sys {
)
.unwrap();
let tree = dom.to_tree().to_string();
+ println!("{}", tree);
assert_eq!(
tree,
indoc! {
@@ -1186,7 +1189,8 @@ mod sys {
│ └>"Italic"
├>li
│ └>p
- │ └>"Bold"
+ │ └>b
+ │ └>"Bold"
├>li
│ └>p
│ └>"Unformatted"
@@ -1214,7 +1218,7 @@ mod sys {
dom.to_markdown().unwrap().to_string(),
indoc! {r#"
1. *Italic*
- 2. Bold
+ 2. __Bold__
3. Unformatted
4. ~~Strikethrough~~
5. Underlined
@@ -1915,6 +1919,12 @@ mod js {
.get_property_value("font-weight")
.unwrap_or_default()
== "bold"
+ || style
+ .get_property_value(
+ "font-weight",
+ )
+ .unwrap_or_default()
+ == "700"
{
Some(InlineFormatType::Bold)
} else if style
@@ -2123,12 +2133,12 @@ mod js {
HtmlSource::GoogleDoc,
)
.unwrap();
- assert_eq!(dom.to_string(), "Italic
Bold
Unformatted
Strikethrough
Underlined
Linked
");
+ assert_eq!(dom.to_string(), "Italic
Bold
Unformatted
Strikethrough
Underlined
Linked
");
assert_eq!(
dom.to_markdown().unwrap().to_string(),
indoc! {r#"
1. *Italic*
- 2. Bold
+ 2. __Bold__
3. Unformatted
4. ~~Strikethrough~~
5. Underlined
From 868d0ab2f2652972f06056e86bb93455a5aca39c Mon Sep 17 00:00:00 2001
From: David Langley
Date: Wed, 23 Jul 2025 16:03:05 +0100
Subject: [PATCH 14/17] lint rust and ts
---
crates/wysiwyg/src/dom/parser/panode_container.rs | 2 +-
platforms/web/lib/composer.ts | 6 ++++--
2 files changed, 5 insertions(+), 3 deletions(-)
diff --git a/crates/wysiwyg/src/dom/parser/panode_container.rs b/crates/wysiwyg/src/dom/parser/panode_container.rs
index ab4aa5de4..1491f8941 100644
--- a/crates/wysiwyg/src/dom/parser/panode_container.rs
+++ b/crates/wysiwyg/src/dom/parser/panode_container.rs
@@ -48,4 +48,4 @@ fn test_contains_style() {
};
assert!(node.contains_style("font-weight", "bold"));
assert!(!node.contains_style("font-weight", "normal"));
-}
\ No newline at end of file
+}
diff --git a/platforms/web/lib/composer.ts b/platforms/web/lib/composer.ts
index 19f890db5..297ccece2 100644
--- a/platforms/web/lib/composer.ts
+++ b/platforms/web/lib/composer.ts
@@ -69,11 +69,13 @@ export function processInput(
const clipboardData = event.clipboardData;
const htmlData = clipboardData?.getData('text/html');
const plainData = clipboardData?.getData('text/plain') ?? '';
-
+
if (htmlData && htmlData !== plainData) {
const htmlSource = clipboardData?.types.includes(
'application/x-vnd.google-docs-document-slice-clip+wrapped',
- ) ? HtmlSource.GoogleDoc : HtmlSource.UnknownExternal;
+ )
+ ? HtmlSource.GoogleDoc
+ : HtmlSource.UnknownExternal;
return action(
composerModel.replace_html(htmlData, htmlSource),
'replace_html_paste',
From edc0ab83fa8fb8c4a8e89c69b2bb2abb4bc52f3a Mon Sep 17 00:00:00 2001
From: David Langley
Date: Wed, 23 Jul 2025 16:11:48 +0100
Subject: [PATCH 15/17] clippy
---
crates/wysiwyg/src/dom/dom_struct.rs | 4 ++--
crates/wysiwyg/src/dom/parser/panode_container.rs | 9 ++++-----
crates/wysiwyg/src/dom/parser/parse.rs | 10 ++++++----
3 files changed, 12 insertions(+), 11 deletions(-)
diff --git a/crates/wysiwyg/src/dom/dom_struct.rs b/crates/wysiwyg/src/dom/dom_struct.rs
index d84d9bdde..eacf6a7ea 100644
--- a/crates/wysiwyg/src/dom/dom_struct.rs
+++ b/crates/wysiwyg/src/dom/dom_struct.rs
@@ -298,14 +298,14 @@ where
pub fn location_for_node(&self, node_handle: &DomHandle) -> DomLocation {
let locations = find_range::find_range(self, 0, usize::MAX);
- return locations.find_location(node_handle).unwrap().clone();
+ locations.find_location(node_handle).unwrap().clone()
}
pub fn locations_for_node(
&self,
node_handle: &DomHandle,
) -> Vec {
- let result = find_range::find_pos(self, &node_handle, 0, usize::MAX);
+ let result = find_range::find_pos(self, node_handle, 0, usize::MAX);
match result {
FindResult::Found(locations) => locations,
_ => panic!("Node does not exist"),
diff --git a/crates/wysiwyg/src/dom/parser/panode_container.rs b/crates/wysiwyg/src/dom/parser/panode_container.rs
index 1491f8941..fe6644a35 100644
--- a/crates/wysiwyg/src/dom/parser/panode_container.rs
+++ b/crates/wysiwyg/src/dom/parser/panode_container.rs
@@ -24,18 +24,17 @@ impl PaNodeContainer {
}
pub(crate) fn contains_style(&self, name: &str, value: &str) -> bool {
- return self
- .get_attr("style")
+ self.get_attr("style")
.map(|v| {
- return Regex::new(&format!(
+ Regex::new(&format!(
r"(?i){}:\s*{};",
regex::escape(name),
regex::escape(value)
))
.map(|re| re.is_match(v))
- .unwrap_or(false);
+ .unwrap_or(false)
})
- .unwrap_or(false);
+ .unwrap_or(false)
}
}
diff --git a/crates/wysiwyg/src/dom/parser/parse.rs b/crates/wysiwyg/src/dom/parser/parse.rs
index 289e1d79b..c12e6da63 100644
--- a/crates/wysiwyg/src/dom/parser/parse.rs
+++ b/crates/wysiwyg/src/dom/parser/parse.rs
@@ -88,7 +88,7 @@ mod sys {
where
S: UnicodeString,
{
- return self.parse_internal(html, HtmlSource::Matrix);
+ self.parse_internal(html, HtmlSource::Matrix)
}
pub(super) fn parse_from_source(
@@ -304,8 +304,10 @@ mod sys {
// For the google docs case, we can add the nested list to the last list item instead.
if html_source != HtmlSource::GoogleDoc
|| node.last_child_mut().is_none()
- || node.last_child_mut().unwrap().is_list_item()
- == false
+ || !node
+ .last_child_mut()
+ .unwrap()
+ .is_list_item()
{
// If source is not Google Docs or the last child is not a list item, we return an error.
invalid_node_error =
@@ -1309,7 +1311,7 @@ fn post_process_blocks(mut dom: Dom) -> Dom {
let block_handles = find_blocks(&dom);
for handle in block_handles.iter().rev() {
dom = post_process_block_lines(dom, handle);
- dom.join_nodes_in_container(&handle);
+ dom.join_nodes_in_container(handle);
}
dom
}
From f349ee9b2d6e3533e7ad0250c11aa8643aef9326 Mon Sep 17 00:00:00 2001
From: David Langley
Date: Wed, 23 Jul 2025 17:20:52 +0100
Subject: [PATCH 16/17] Fix web test
---
crates/wysiwyg/src/dom/parser/parse.rs | 1 -
platforms/web/lib/composer.test.ts | 46 ++++++++++++++++++++------
platforms/web/lib/composer.ts | 2 --
3 files changed, 36 insertions(+), 13 deletions(-)
diff --git a/crates/wysiwyg/src/dom/parser/parse.rs b/crates/wysiwyg/src/dom/parser/parse.rs
index c12e6da63..766f3f57e 100644
--- a/crates/wysiwyg/src/dom/parser/parse.rs
+++ b/crates/wysiwyg/src/dom/parser/parse.rs
@@ -1178,7 +1178,6 @@ mod sys {
)
.unwrap();
let tree = dom.to_tree().to_string();
- println!("{}", tree);
assert_eq!(
tree,
indoc! {
diff --git a/platforms/web/lib/composer.test.ts b/platforms/web/lib/composer.test.ts
index 883d4660d..fcaceba9a 100644
--- a/platforms/web/lib/composer.test.ts
+++ b/platforms/web/lib/composer.test.ts
@@ -6,7 +6,7 @@ SPDX-License-Identifier: AGPL-3.0-only OR LicenseRef-Element-Commercial
Please see LICENSE in the repository root for full details.
*/
-import { ComposerModel } from '@vector-im/matrix-wysiwyg-wasm';
+import { ComposerModel, HtmlSource } from '@vector-im/matrix-wysiwyg-wasm';
import { processInput } from './composer';
import { FormattingFunctions } from './types';
@@ -14,6 +14,7 @@ import { FormattingFunctions } from './types';
// mocks and spies
const mockComposerModel = {
replace_text: vi.fn(),
+ replace_html: vi.fn(),
code_block: vi.fn(),
backspace_word: vi.fn(),
delete_word: vi.fn(),
@@ -187,16 +188,29 @@ describe('processInput', () => {
});
it('handles truthy and falsy data from clipboard with replace_text', () => {
- const sampleContent = ['clipboardData', null];
+ const sampleContent = [
+ ['clipboardData', 'clipboardData'],
+ [null, 'clipboardData'],
+ [null, null],
+ ];
sampleContent.forEach((clipboardContent) => {
const e = new ClipboardEvent('some clipboard event');
- const mockGetter = vi.fn().mockReturnValue(clipboardContent);
+ const mockGetter = vi.fn().mockImplementation((type) => {
+ if (type === 'text/html') {
+ return clipboardContent[0];
+ } else {
+ return clipboardContent[1];
+ }
+ });
// We can't easily generate the correct type here, so disable ts
// eslint-disable-next-line @typescript-eslint/ban-ts-comment
// @ts-ignore
- e.clipboardData = { getData: mockGetter };
+ e.clipboardData = {
+ getData: mockGetter,
+ types: ['text/html', 'text/plain'],
+ };
processInput(
e,
@@ -207,12 +221,24 @@ describe('processInput', () => {
mockSuggestion,
);
- expect(mockGetter).toHaveBeenCalledTimes(1);
- expect(mockComposerModel.replace_text).toHaveBeenCalledWith(
- // falsy values are defaulted to empty string
- clipboardContent || '',
- );
- expect(mockAction).toHaveBeenCalledWith(undefined, 'paste');
+ expect(mockGetter).toHaveBeenCalledTimes(2);
+ if (clipboardContent[0]) {
+ expect(mockComposerModel.replace_html).toHaveBeenCalledWith(
+ // html data is truthy in this branch, so the `|| ''` fallback below is never taken
+ clipboardContent[0] || '',
+ HtmlSource.UnknownExternal,
+ );
+ } else {
+ expect(mockComposerModel.replace_text).toHaveBeenCalledWith(
+ // falsy values are defaulted to empty string
+ clipboardContent[1] || '',
+ );
+ }
+
+ const action = clipboardContent[0]
+ ? 'replace_html_paste'
+ : 'replace_text_paste';
+ expect(mockAction).toHaveBeenCalledWith(undefined, action);
});
});
diff --git a/platforms/web/lib/composer.ts b/platforms/web/lib/composer.ts
index 297ccece2..75e0dfb06 100644
--- a/platforms/web/lib/composer.ts
+++ b/platforms/web/lib/composer.ts
@@ -79,13 +79,11 @@ export function processInput(
return action(
composerModel.replace_html(htmlData, htmlSource),
'replace_html_paste',
- htmlData,
);
}
return action(
composerModel.replace_text(plainData),
'replace_text_paste',
- plainData,
);
}
From e11808145025dc6725f4ec42182c66fa65255c0a Mon Sep 17 00:00:00 2001
From: David Langley
Date: Wed, 23 Jul 2025 19:14:41 +0100
Subject: [PATCH 17/17] Reuse the same html chunk in our constants. Add better
docs.
---
.../src/composer_model/replace_html.rs | 31 +++++++--------
crates/wysiwyg/src/dom/parser.rs | 5 +++
crates/wysiwyg/src/dom/parser/parse.rs | 39 ++++++++++++++++---
3 files changed, 52 insertions(+), 23 deletions(-)
diff --git a/crates/wysiwyg/src/composer_model/replace_html.rs b/crates/wysiwyg/src/composer_model/replace_html.rs
index fdb3b0755..22191933c 100644
--- a/crates/wysiwyg/src/composer_model/replace_html.rs
+++ b/crates/wysiwyg/src/composer_model/replace_html.rs
@@ -71,30 +71,26 @@ where
}
}
-#[cfg(test)]
-const GOOGLE_DOC_HTML_PASTEBOARD: &str = r#"
- Italic
Bold
Unformatted
Strikethrough
Underlined
Linked
- "#;
-#[cfg(test)]
-const MS_DOC_HTML_PASTEBOARD: &str = r#"
-
- "#;
-
#[cfg(test)]
mod test {
- use super::*;
use crate::dom::html_source::HtmlSource;
+ use crate::dom::parser::{
+ GOOGLE_DOC_HTML_PASTEBOARD, MS_DOC_HTML_PASTEBOARD,
+ };
use crate::tests::testutils_composer_model::cm;
#[test]
fn test_replace_html_strips_meta_tags_google_docs() {
let mut model = cm("|");
- let _ = model.replace_html(
- GOOGLE_DOC_HTML_PASTEBOARD.into(),
- HtmlSource::GoogleDoc,
+ // This html was copied directly from google docs and we are including the meta and bold tags that google docs adds.
+ let html = format!(
+ r#"{}"#,
+ GOOGLE_DOC_HTML_PASTEBOARD
);
+ let _ = model.replace_html(html.into(), HtmlSource::GoogleDoc);
+
// Verify the HTML doesn't contain meta or the outer b tag
let html = model.get_content_as_html();
let html_str = html.to_string();
@@ -107,10 +103,11 @@ mod test {
fn test_replace_html_strips_only_meta_tags_ms_docs() {
let mut model = cm("|");
- let _ = model.replace_html(
- MS_DOC_HTML_PASTEBOARD.into(),
- HtmlSource::UnknownExternal,
- );
+ // This html was copied directly from ms docs and we are including the meta and bold tags that ms docs adds.
+ let html =
+ format!(r#"{}"#, MS_DOC_HTML_PASTEBOARD);
+
+ let _ = model.replace_html(html.into(), HtmlSource::UnknownExternal);
let html = model.get_content_as_html();
let html_str = html.to_string();
diff --git a/crates/wysiwyg/src/dom/parser.rs b/crates/wysiwyg/src/dom/parser.rs
index c845f1951..574d889a9 100644
--- a/crates/wysiwyg/src/dom/parser.rs
+++ b/crates/wysiwyg/src/dom/parser.rs
@@ -50,3 +50,8 @@ use sys::*;
pub use parse::parse;
pub use parse::parse_from_source;
+
+#[cfg(test)]
+pub use parse::GOOGLE_DOC_HTML_PASTEBOARD;
+#[cfg(test)]
+pub use parse::MS_DOC_HTML_PASTEBOARD;
diff --git a/crates/wysiwyg/src/dom/parser/parse.rs b/crates/wysiwyg/src/dom/parser/parse.rs
index 766f3f57e..8827a092a 100644
--- a/crates/wysiwyg/src/dom/parser/parse.rs
+++ b/crates/wysiwyg/src/dom/parser/parse.rs
@@ -46,14 +46,41 @@ where
}
}
+/* These html fragments were copied directly from google docs/ms docs (minus the cleanup/stripping we do in the "replace_html" function) and represent the following content:
+└>ol
+ ├>li
+ │ └>p
+ │ └>i
+ │ └>"Italic"
+ ├>li
+ │ └>p
+ │ └>b
+ │ └>"Bold"
+ ├>li
+ │ └>p
+ │ └>"Unformatted"
+ ├>li
+ │ └>p
+ │ └>del
+ │ └>"Strikethrough"
+ ├>li
+ │ └>p
+ │ └>u
+ │ └>"Underlined"
+ └>li
+ ├>p
+ │ └>a "http://matrix.org"
+ │ └>u
+ │ └>"Linked"
+ └>ul
+ └>li
+ └>p
+ └>"Nested"
+*/
#[cfg(test)]
-const GOOGLE_DOC_HTML_PASTEBOARD: &str = r#"
- Italic
Bold
Unformatted
Strikethrough
Underlined
Linked
- "#;
+pub const GOOGLE_DOC_HTML_PASTEBOARD: &str = r#"Italic
Bold
Unformatted
Strikethrough
Underlined
Linked
"#;
#[cfg(test)]
-const MS_DOC_HTML_PASTEBOARD: &str = r#"
-
- "#;
+pub const MS_DOC_HTML_PASTEBOARD: &str = r#""#;
#[cfg(feature = "sys")]
mod sys {