Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions Changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,17 @@

### New Features

- [#353]: Add ability to serialize textual content as CDATA sections in `Serializer`.
  Wherever a text node may be created, one or more CDATA sections can be produced instead.
See the new [`Serializer::text_format()`] method.

### Bug Fixes

### Misc Changes

[#353]: https://github.com/tafia/quick-xml/issues/353
[`Serializer::text_format()`]: https://docs.rs/quick-xml/0.38.4/quick_xml/se/struct.Serializer.html#method.text_format


## 0.38.3 -- 2025-08-24

Expand Down
64 changes: 33 additions & 31 deletions src/escape.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

use memchr::{memchr, memchr2_iter, memchr3};
use std::borrow::Cow;
use std::fmt::{self, Write};
use std::num::ParseIntError;
use std::ops::Range;

Expand Down Expand Up @@ -147,54 +148,55 @@ pub fn minimal_escape<'a>(raw: impl Into<Cow<'a, str>>) -> Cow<'a, str> {
_escape(raw, |ch| matches!(ch, b'<' | b'&'))
}

/// Copies `value[from..to]` to `writer` unchanged and then writes the escaped
/// replacement for the single byte located at index `to`.
///
/// The byte at `to` must be one of `<`, `>`, `'`, `&`, `"`, tab, line feed,
/// carriage return or space; any other byte hits the `unreachable!` branch.
///
/// # Errors
///
/// Propagates any error reported by the underlying `writer`.
///
/// # Panics
///
/// Panics if `from..to` is not a valid char-boundary range of `value`, if `to`
/// is out of bounds, or if the byte at `to` is not one of the escapable bytes.
pub(crate) fn escape_char<W>(writer: &mut W, value: &str, from: usize, to: usize) -> fmt::Result
where
    W: fmt::Write,
{
    // The untouched prefix goes out first, exactly as it appears in the input.
    writer.write_str(&value[from..to])?;

    let replacement = match value.as_bytes()[to] {
        b'<' => "&lt;",
        b'>' => "&gt;",
        b'\'' => "&apos;",
        b'&' => "&amp;",
        b'"' => "&quot;",

        // Whitespace characters act as item delimiters inside xs:list values,
        // so they are written as numeric character references.
        b'\t' => "&#9;",
        b'\n' => "&#10;",
        b'\r' => "&#13;",
        b' ' => "&#32;",
        _ => unreachable!("Only '<', '>','\', '&', '\"', '\\t', '\\r', '\\n', and ' ' are escaped"),
    };
    writer.write_str(replacement)
}

/// Escapes an `&str` and replaces a subset of xml special characters (`<`, `>`,
/// `&`, `'`, `"`) with their corresponding xml escaped value.
pub(crate) fn _escape<'a, F: Fn(u8) -> bool>(
raw: impl Into<Cow<'a, str>>,
escape_chars: F,
) -> Cow<'a, str> {
fn _escape<'a, F: Fn(u8) -> bool>(raw: impl Into<Cow<'a, str>>, escape_chars: F) -> Cow<'a, str> {
let raw = raw.into();
let bytes = raw.as_bytes();
let mut escaped = None;
let mut iter = bytes.iter();
let mut pos = 0;
while let Some(i) = iter.position(|&b| escape_chars(b)) {
if escaped.is_none() {
escaped = Some(Vec::with_capacity(raw.len()));
escaped = Some(String::with_capacity(raw.len()));
}
let escaped = escaped.as_mut().expect("initialized");
let new_pos = pos + i;
escaped.extend_from_slice(&bytes[pos..new_pos]);
match bytes[new_pos] {
b'<' => escaped.extend_from_slice(b"&lt;"),
b'>' => escaped.extend_from_slice(b"&gt;"),
b'\'' => escaped.extend_from_slice(b"&apos;"),
b'&' => escaped.extend_from_slice(b"&amp;"),
b'"' => escaped.extend_from_slice(b"&quot;"),

// This set of escapes handles characters that should be escaped
// in elements of xs:lists, because those characters work as
// delimiters of list elements
b'\t' => escaped.extend_from_slice(b"&#9;"),
b'\n' => escaped.extend_from_slice(b"&#10;"),
b'\r' => escaped.extend_from_slice(b"&#13;"),
b' ' => escaped.extend_from_slice(b"&#32;"),
_ => unreachable!(
"Only '<', '>','\', '&', '\"', '\\t', '\\r', '\\n', and ' ' are escaped"
),
}
// SAFETY: It should fail only on OOM
escape_char(escaped, &raw, pos, new_pos).unwrap();
pos = new_pos + 1;
}

if let Some(mut escaped) = escaped {
if let Some(raw) = bytes.get(pos..) {
escaped.extend_from_slice(raw);
if let Some(raw) = raw.get(pos..) {
// SAFETY: It should fail only on OOM
escaped.write_str(raw).unwrap();
}
// SAFETY: we operate on UTF-8 input and search only for one-byte chars,
// so all slices that were put into `escaped` are valid UTF-8 encoded strings
// TODO: Can be replaced with `unsafe { String::from_utf8_unchecked() }`
// if unsafe code will be allowed
Cow::Owned(String::from_utf8(escaped).unwrap())
Cow::Owned(escaped)
} else {
raw
}
Expand Down
38 changes: 7 additions & 31 deletions src/events/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ use crate::escape::{
partial_escape, EscapeError,
};
use crate::name::{LocalName, QName};
use crate::utils::{name_len, trim_xml_end, trim_xml_start, write_cow_string, Bytes};
use crate::utils::{self, name_len, trim_xml_end, trim_xml_start, write_cow_string};
use attributes::{AttrError, Attribute, Attributes};

/// Opening tag data (`Event::Start`), with optional attributes: `<name attr="value">`.
Expand Down Expand Up @@ -783,8 +783,7 @@ impl<'a> BytesCData<'a> {
#[inline]
pub fn escaped(content: &'a str) -> CDataIterator<'a> {
CDataIterator {
unprocessed: content.as_bytes(),
finished: false,
inner: utils::CDataIterator::new(content),
}
}

Expand Down Expand Up @@ -984,41 +983,18 @@ impl<'a> arbitrary::Arbitrary<'a> for BytesCData<'a> {
/// Iterator over `CDATA` sections in a string.
///
/// This iterator is created by the [`BytesCData::escaped`] method.
#[derive(Clone)]
#[derive(Debug, Clone)]
pub struct CDataIterator<'a> {
/// The unprocessed data which should be emitted as `BytesCData` events.
/// At each iteration, the processed data is cut from this slice.
unprocessed: &'a [u8],
finished: bool,
}

impl<'a> Debug for CDataIterator<'a> {
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
f.debug_struct("CDataIterator")
.field("unprocessed", &Bytes(self.unprocessed))
.field("finished", &self.finished)
.finish()
}
inner: utils::CDataIterator<'a>,
}

impl<'a> Iterator for CDataIterator<'a> {
type Item = BytesCData<'a>;

fn next(&mut self) -> Option<BytesCData<'a>> {
if self.finished {
return None;
}

for gt in memchr::memchr_iter(b'>', self.unprocessed) {
if self.unprocessed[..gt].ends_with(b"]]") {
let (slice, rest) = self.unprocessed.split_at(gt);
self.unprocessed = rest;
return Some(BytesCData::wrap(slice, Decoder::utf8()));
}
}

self.finished = true;
Some(BytesCData::wrap(self.unprocessed, Decoder::utf8()))
self.inner
.next()
.map(|slice| BytesCData::wrap(slice.as_bytes(), Decoder::utf8()))
}
}

Expand Down
15 changes: 12 additions & 3 deletions src/se/content.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
use crate::de::TEXT_KEY;
use crate::se::element::{ElementSerializer, Struct, Tuple};
use crate::se::simple_type::{QuoteTarget, SimpleTypeSerializer};
use crate::se::{Indent, QuoteLevel, SeError, WriteResult, XmlName};
use crate::se::{Indent, QuoteLevel, SeError, TextFormat, WriteResult, XmlName};
use serde::ser::{
Impossible, Serialize, SerializeSeq, SerializeTuple, SerializeTupleStruct, Serializer,
};
Expand Down Expand Up @@ -71,6 +71,8 @@ pub struct ContentSerializer<'w, 'i, W: Write> {
/// If `true`, then current indent will be written before writing the content,
/// but only if content is not empty. This flag is reset after writing indent.
pub write_indent: bool,
/// Defines how text content should be serialized (as escaped text or CDATA)
pub text_format: TextFormat,
/// If `true`, then primitive types that serialize to a text content without
/// surrounding tag will be allowed, otherwise the [`SeError::Unsupported`]
/// will be returned.
Expand All @@ -88,10 +90,12 @@ impl<'w, 'i, W: Write> ContentSerializer<'w, 'i, W> {
/// Turns this serializer into serializer of a text content
#[inline]
pub fn into_simple_type_serializer_impl(self) -> SimpleTypeSerializer<&'w mut W> {
//TODO: Customization point: choose between CDATA and Text representation
SimpleTypeSerializer {
writer: self.writer,
target: QuoteTarget::Text,
target: match self.text_format {
TextFormat::Text => QuoteTarget::Text,
TextFormat::CData => QuoteTarget::CData,
},
level: self.level,
}
}
Expand Down Expand Up @@ -119,6 +123,7 @@ impl<'w, 'i, W: Write> ContentSerializer<'w, 'i, W> {
level: self.level,
indent: self.indent.borrow(),
write_indent: self.write_indent,
text_format: self.text_format,
allow_primitive,
expand_empty_elements: self.expand_empty_elements,
}
Expand Down Expand Up @@ -600,6 +605,7 @@ pub(super) mod tests {
level: QuoteLevel::Full,
indent: Indent::None,
write_indent: false,
text_format: TextFormat::Text,
allow_primitive: true,
expand_empty_elements: false,
};
Expand All @@ -623,6 +629,7 @@ pub(super) mod tests {
level: QuoteLevel::Full,
indent: Indent::None,
write_indent: false,
text_format: TextFormat::Text,
allow_primitive: true,
expand_empty_elements: false,
};
Expand Down Expand Up @@ -1070,6 +1077,7 @@ pub(super) mod tests {
level: QuoteLevel::Full,
indent: Indent::Owned(Indentation::new(b' ', 2)),
write_indent: false,
text_format: TextFormat::Text,
allow_primitive: true,
expand_empty_elements: false,
};
Expand All @@ -1093,6 +1101,7 @@ pub(super) mod tests {
level: QuoteLevel::Full,
indent: Indent::Owned(Indentation::new(b' ', 2)),
write_indent: false,
text_format: TextFormat::Text,
allow_primitive: true,
expand_empty_elements: false,
};
Expand Down
8 changes: 7 additions & 1 deletion src/se/element.rs
Original file line number Diff line number Diff line change
Expand Up @@ -443,6 +443,7 @@ impl<'w, 'k, W: Write> Struct<'w, 'k, W> {
indent: self.ser.ser.indent.borrow(),
// If previous field does not require indent, do not write it
write_indent: self.write_indent,
text_format: self.ser.ser.text_format,
allow_primitive: true,
expand_empty_elements: self.ser.ser.expand_empty_elements,
};
Expand Down Expand Up @@ -596,7 +597,7 @@ impl<'w, 'k, W: Write> SerializeMap for Map<'w, 'k, W> {
mod tests {
use super::*;
use crate::se::content::tests::*;
use crate::se::{Indent, QuoteLevel};
use crate::se::{Indent, QuoteLevel, TextFormat};
use crate::utils::Bytes;
use serde::Serialize;
use std::collections::BTreeMap;
Expand Down Expand Up @@ -635,6 +636,7 @@ mod tests {
level: QuoteLevel::Full,
indent: Indent::None,
write_indent: false,
text_format: TextFormat::Text,
allow_primitive: true,
expand_empty_elements: false,
},
Expand All @@ -661,6 +663,7 @@ mod tests {
level: QuoteLevel::Full,
indent: Indent::None,
write_indent: false,
text_format: TextFormat::Text,
allow_primitive: true,
expand_empty_elements: false,
},
Expand Down Expand Up @@ -1356,6 +1359,7 @@ mod tests {
level: QuoteLevel::Full,
indent: Indent::Owned(Indentation::new(b' ', 2)),
write_indent: false,
text_format: TextFormat::Text,
allow_primitive: true,
expand_empty_elements: false,
},
Expand All @@ -1382,6 +1386,7 @@ mod tests {
level: QuoteLevel::Full,
indent: Indent::Owned(Indentation::new(b' ', 2)),
write_indent: false,
text_format: TextFormat::Text,
allow_primitive: true,
expand_empty_elements: false,
},
Expand Down Expand Up @@ -2099,6 +2104,7 @@ mod tests {
level: QuoteLevel::Full,
indent: Indent::None,
write_indent: false,
text_format: TextFormat::Text,
allow_primitive: true,
expand_empty_elements: true,
},
Expand Down
46 changes: 46 additions & 0 deletions src/se/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -320,6 +320,16 @@ where

////////////////////////////////////////////////////////////////////////////////////////////////////

/// Selects the representation used when textual content is serialized.
///
/// The enum is marked `#[non_exhaustive]`, so code outside this crate that
/// matches on it needs a wildcard arm.
#[non_exhaustive]
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum TextFormat {
    /// Write the content as a regular text node, escaping it as needed.
    Text,
    /// Write the content as a CDATA section, without escaping.
    CData,
}

/// Defines which characters would be escaped in [`Text`] events and attribute
/// values.
///
Expand Down Expand Up @@ -557,6 +567,7 @@ impl<'w, 'r, W: Write> Serializer<'w, 'r, W> {
level: QuoteLevel::Partial,
indent: Indent::None,
write_indent: false,
text_format: TextFormat::Text,
allow_primitive: true,
expand_empty_elements: false,
},
Expand Down Expand Up @@ -623,6 +634,7 @@ impl<'w, 'r, W: Write> Serializer<'w, 'r, W> {
level: QuoteLevel::Partial,
indent: Indent::None,
write_indent: false,
text_format: TextFormat::Text,
allow_primitive: true,
expand_empty_elements: false,
},
Expand Down Expand Up @@ -663,6 +675,40 @@ impl<'w, 'r, W: Write> Serializer<'w, 'r, W> {
self
}

/// Chooses how text content is written by this serializer.
///
/// - [`TextFormat::Text`]: regular text with XML escaping (the default)
/// - [`TextFormat::CData`]: text content is written as CDATA sections
///
/// Returns `self` so that configuration calls can be chained.
///
/// # Examples
///
/// ```
/// # use pretty_assertions::assert_eq;
/// # use serde::Serialize;
/// # use quick_xml::se::{Serializer, TextFormat};
///
/// #[derive(Debug, PartialEq, Serialize)]
/// struct Document {
///     #[serde(rename = "$text")]
///     content: String,
/// }
///
/// let mut buffer = String::new();
/// let mut ser = Serializer::with_root(&mut buffer, Some("doc")).unwrap();
/// ser.text_format(TextFormat::CData);
///
/// let data = Document {
///     content: "Content with <markup> & entities".to_string(),
/// };
///
/// data.serialize(ser).unwrap();
/// assert_eq!(buffer, "<doc><![CDATA[Content with <markup> & entities]]></doc>");
/// ```
pub fn text_format(&mut self, format: TextFormat) -> &mut Self {
    // The setting is stored on the inner content serializer.
    let content = &mut self.ser;
    content.text_format = format;
    self
}

/// Configure indent for a serializer
pub fn indent(&mut self, indent_char: char, indent_size: usize) -> &mut Self {
self.ser.indent = Indent::Owned(Indentation::new(indent_char as u8, indent_size));
Expand Down
Loading