From f30a94fa297189fb787a789991e8bcc827187659 Mon Sep 17 00:00:00 2001 From: Daniel Alley Date: Tue, 12 Jul 2022 17:26:28 -0400 Subject: [PATCH 1/2] Parameterize macrobenchmarks, add throughput measurements --- Changelog.md | 2 + benches/macrobenches.rs | 118 ++++++++++++---------------------------- 2 files changed, 36 insertions(+), 84 deletions(-) diff --git a/Changelog.md b/Changelog.md index f8ff4472..9e67d69e 100644 --- a/Changelog.md +++ b/Changelog.md @@ -121,6 +121,7 @@ - [#393]: Added tests for reserved names (started with "xml"i) -- see - [#363]: Add tests for `Reader::read_event_impl` to ensure that proper events generated for corresponding inputs - [#407]: Improved benchmark suite to cover whole-document parsing, escaping and unescaping text +- [#418]: Parameterized macrobenchmarks, added throughput measurements via criterion [#8]: https://github.com/Mingun/fast-xml/pull/8 [#9]: https://github.com/Mingun/fast-xml/pull/9 @@ -137,6 +138,7 @@ [#407]: https://github.com/tafia/quick-xml/pull/407 [#412]: https://github.com/tafia/quick-xml/pull/412 [#416]: https://github.com/tafia/quick-xml/pull/416 +[#418]: https://github.com/tafia/quick-xml/pull/418 ## 0.23.0 -- 2022-05-08 diff --git a/benches/macrobenches.rs b/benches/macrobenches.rs index 1a00abed..1dc74dac 100644 --- a/benches/macrobenches.rs +++ b/benches/macrobenches.rs @@ -1,4 +1,4 @@ -use criterion::{self, criterion_group, Criterion}; +use criterion::{self, criterion_group, Criterion, Throughput}; use quick_xml::events::Event; use quick_xml::Reader; use quick_xml::Result as XmlResult; @@ -44,89 +44,39 @@ fn parse_document(doc: &[u8]) -> XmlResult<()> { pub fn bench_fully_parse_document(c: &mut Criterion) { let mut group = c.benchmark_group("fully_parse_document"); - // long, mix of attributes and text, not much escaping, mix of attribute lengths, some namespaces - group.bench_function("rpm_primary.xml", |b| { - b.iter(|| { - parse_document(RPM_PRIMARY).unwrap(); - }) - }); - - // long, mix of 
attributes and text, not much escaping, mix of attribute lengths, some namespaces - group.bench_function("rpm_primary2.xml", |b| { - b.iter(|| { - parse_document(RPM_PRIMARY2).unwrap(); - }) - }); - - // long, mostly medium-length text elements, not much escaping - group.bench_function("rpm_filelists.xml", |b| { - b.iter(|| { - parse_document(RPM_FILELISTS).unwrap(); - }) - }); - - // long, mix of attributes and text, lots of escaping (both entity and char literal), long attributes - group.bench_function("rpm_other.xml", |b| { - b.iter(|| { - parse_document(RPM_OTHER).unwrap(); - }) - }); - - // long, mix of attributes and text, not much escaping, lots of non-ascii characters, lots of namespaces - group.bench_function("libreoffice_document.fodt", |b| { - b.iter(|| { - parse_document(LIBREOFFICE_DOCUMENT).unwrap(); - }) - }); - - // medium length, mostly empty tags, a few short attributes per element, no escaping - group.bench_function("document.xml", |b| { - b.iter(|| { - parse_document(DOCUMENT).unwrap(); - }) - }); - - // medium length, lots of namespaces, no escaping - group.bench_function("test_writer_ident.xml", |b| { - b.iter(|| { - parse_document(TEST_WRITER_INDENT).unwrap(); - }) - }); - - // short, mix of attributes and text, lots of escapes - group.bench_function("sample_1.xml", |b| { - b.iter(|| { - parse_document(SAMPLE_1).unwrap(); - }) - }); - - // medium length, lots of attributes, short attributes, few escapes - group.bench_function("linescore.xml", |b| { - b.iter(|| { - parse_document(LINESCORE).unwrap(); - }) - }); - - // short, lots of namespaces, no escapes - group.bench_function("sample_ns.xml", |b| { - b.iter(|| { - parse_document(SAMPLE_NS).unwrap(); - }) - }); - - // long, few attributes, mix of attribute lengths, escapes in text content - group.bench_function("sample_rss.xml", |b| { - b.iter(|| { - parse_document(SAMPLE_RSS).unwrap(); - }) - }); - - // long, lots of attributes, short attributes, no text, no escapes - 
group.bench_function("players.xml", |b| { - b.iter(|| { - parse_document(PLAYERS).unwrap(); - }) - }); + let inputs = [ + // long, mix of attributes and text, not much escaping, mix of attribute lengths, some namespaces + ("rpm_primary.xml", RPM_PRIMARY), + // long, mix of attributes and text, not much escaping, mix of attribute lengths, some namespaces + ("rpm_primary2.xml", RPM_PRIMARY2), + // long, mostly medium-length text elements, not much escaping + ("rpm_filelists.xml", RPM_FILELISTS), + // long, mix of attributes and text, lots of escaping (both entity and char literal), long attributes + ("rpm_other.xml", RPM_OTHER), + // long, mix of attributes and text, not much escaping, lots of non-ascii characters, lots of namespaces + ("libreoffice_document.fodt", LIBREOFFICE_DOCUMENT), + // medium length, mostly empty tags, a few short attributes per element, no escaping + ("document.xml", DOCUMENT), + // medium length, lots of namespaces, no escaping + ("test_writer_ident.xml", TEST_WRITER_INDENT), + // short, mix of attributes and text, lots of escapes + ("sample_1.xml", SAMPLE_1), + // medium length, lots of attributes, short attributes, few escapes + ("linescore.xml", LINESCORE), + // short, lots of namespaces, no escapes + ("sample_ns.xml", SAMPLE_NS), + // long, few attributes, mix of attribute lengths, escapes in text content + ("sample_rss.xml", SAMPLE_RSS), + // long, lots of attributes, short attributes, no text, no escapes + ("players.xml", PLAYERS), + ]; + + for (id, data) in inputs.iter() { + group.throughput(Throughput::Bytes(data.len() as u64)); + group.bench_with_input(*id, *data, |b, input| { + b.iter(|| parse_document(input).unwrap()) + }); + } group.finish(); } From de51d9bab80949d4999d82642ab8aa8e8c05175f Mon Sep 17 00:00:00 2001 From: Daniel Alley Date: Tue, 12 Jul 2022 19:55:33 -0400 Subject: [PATCH 2/2] Parameterize comparative benchmarks, add throughput measurements --- Changelog.md | 2 +- compare/benches/bench.rs | 412 
++++++++++++++++++++++----------------- 2 files changed, 234 insertions(+), 180 deletions(-) diff --git a/Changelog.md b/Changelog.md index 9e67d69e..cafe5fc8 100644 --- a/Changelog.md +++ b/Changelog.md @@ -121,7 +121,7 @@ - [#393]: Added tests for reserved names (started with "xml"i) -- see - [#363]: Add tests for `Reader::read_event_impl` to ensure that proper events generated for corresponding inputs - [#407]: Improved benchmark suite to cover whole-document parsing, escaping and unescaping text -- [#418]: Parameterized macrobenchmarks, added throughput measurements via criterion +- [#418]: Parameterized macrobenchmarks and comparative benchmarks, added throughput measurements via criterion [#8]: https://github.com/Mingun/fast-xml/pull/8 [#9]: https://github.com/Mingun/fast-xml/pull/9 diff --git a/compare/benches/bench.rs b/compare/benches/bench.rs index 0a14fb40..7df5f029 100644 --- a/compare/benches/bench.rs +++ b/compare/benches/bench.rs @@ -1,192 +1,234 @@ -use criterion::{self, criterion_group, criterion_main, Criterion}; +use criterion::{self, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput}; use pretty_assertions::assert_eq; use quick_xml::{self, events::Event, Reader}; use serde::Deserialize; use serde_xml_rs; use xml::reader::{EventReader, XmlEvent}; -static SOURCE: &str = include_str!("../../tests/documents/sample_rss.xml"); +static RPM_PRIMARY: &str = include_str!("../../tests/documents/rpm_primary.xml"); +static RPM_PRIMARY2: &str = include_str!("../../tests/documents/rpm_primary2.xml"); +static RPM_FILELISTS: &str = include_str!("../../tests/documents/rpm_filelists.xml"); +static RPM_OTHER: &str = include_str!("../../tests/documents/rpm_other.xml"); +static LIBREOFFICE_DOCUMENT: &str = include_str!("../../tests/documents/libreoffice_document.fodt"); +static DOCUMENT: &str = include_str!("../../tests/documents/document.xml"); +static TEST_WRITER_INDENT: &str = include_str!("../../tests/documents/test_writer_indent.xml"); +static 
SAMPLE_1: &str = include_str!("../../tests/documents/sample_1.xml"); +static LINESCORE: &str = include_str!("../../tests/documents/linescore.xml"); +static SAMPLE_RSS: &str = include_str!("../../tests/documents/sample_rss.xml"); +static SAMPLE_NS: &str = include_str!("../../tests/documents/sample_ns.xml"); +static PLAYERS: &str = include_str!("../../tests/documents/players.xml"); -/// Runs benchmarks for several XML libraries using low-level API +static TEST_FILES: [(&str, &str, usize); 12] = [ + // long, mix of attributes and text, not much escaping, mix of attribute lengths, some namespaces + ("rpm_primary.xml", RPM_PRIMARY, 369), + // long, mix of attributes and text, not much escaping, mix of attribute lengths, some namespaces + ("rpm_primary2.xml", RPM_PRIMARY2, 116), + // long, mostly medium-length text elements, not much escaping + ("rpm_filelists.xml", RPM_FILELISTS, 184), + // long, mix of attributes and text, lots of escaping (both entity and char literal), long attributes + ("rpm_other.xml", RPM_OTHER, 145), + // long, mix of attributes and text, not much escaping, lots of non-ascii characters, lots of namespaces + ("libreoffice_document.fodt", LIBREOFFICE_DOCUMENT, 659), + // medium length, mostly empty tags, a few short attributes per element, no escaping + ("document.xml", DOCUMENT, 342), + // medium length, lots of namespaces, no escaping + ("test_writer_ident.xml", TEST_WRITER_INDENT, 34), + // short, mix of attributes and text, lots of escapes + ("sample_1.xml", SAMPLE_1, 15), + // medium length, lots of attributes, short attributes, few escapes + ("linescore.xml", LINESCORE, 11), + // short, lots of namespaces, no escapes + ("sample_ns.xml", SAMPLE_NS, 11), + // long, few attributes, mix of attribute lengths, escapes in text content + ("sample_rss.xml", SAMPLE_RSS, 1550), + // long, lots of attributes, short attributes, no text, no escapes + ("players.xml", PLAYERS, 76), +]; + +// Comparison of low-level APIs from several XML libraries fn 
low_level_comparison(c: &mut Criterion) { let mut group = c.benchmark_group("low-level API"); + for (filename, data, total_tags) in TEST_FILES.iter() { + let total_tags = *total_tags; - group.bench_function("quick_xml", |b| { - b.iter(|| { - let mut r = Reader::from_reader(SOURCE.as_bytes()); - r.check_end_names(false).check_comments(false); - let mut count = criterion::black_box(0); - let mut buf = Vec::new(); - loop { - match r.read_event_into(&mut buf) { - Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1, - Ok(Event::Eof) => break, - _ => (), - } - buf.clear(); - } - assert_eq!( - count, 1550, - "Overall tag count in ./tests/documents/sample_rss.xml" - ); - }) - }); + group.throughput(Throughput::Bytes(data.len() as u64)); + group.bench_with_input( + BenchmarkId::new("quick_xml", filename), + *data, + |b, input| { + b.iter(|| { + let mut r = Reader::from_reader(input.as_bytes()); + r.check_end_names(false).check_comments(false); + let mut count = criterion::black_box(0); + let mut buf = Vec::new(); + loop { + match r.read_event_into(&mut buf) { + Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1, + Ok(Event::Eof) => break, + _ => (), + } + buf.clear(); + } + assert_eq!(count, total_tags, "Overall tag count in {}", filename); + }) + }, + ); - group.bench_function("maybe_xml", |b| { - use maybe_xml::eval::recv::RecvEvaluator; - use maybe_xml::token::borrowed::Token; + group.bench_with_input( + BenchmarkId::new("maybe_xml", filename), + *data, + |b, input| { + use maybe_xml::eval::recv::RecvEvaluator; + use maybe_xml::token::borrowed::Token; - b.iter(|| { - let mut input = SOURCE.as_bytes(); - let mut eval = RecvEvaluator::new(); - - let mut count = criterion::black_box(0); - loop { - let consumed = eval.recv(input); - match eval.next_token() { - Ok(Some(Token::StartTag(_))) => count += 1, - Ok(Some(Token::EmptyElementTag(_))) => count += 1, - Ok(Some(Token::Eof)) => break, - Ok(Some(Token::EofWithBytesNotEvaluated(_))) => break, - _ => (), - } - input 
= &input[consumed..]; - } - assert_eq!( - count, 1550, - "Overall tag count in ./tests/documents/sample_rss.xml" - ); - }) - }); + b.iter(|| { + let mut input = input.as_bytes(); + let mut eval = RecvEvaluator::new(); - group.bench_function("rapid-xml", |b| { - use rapid_xml::parser::{EventCode, Parser}; + let mut count = criterion::black_box(0); + loop { + let consumed = eval.recv(input); + match eval.next_token() { + Ok(Some(Token::StartTag(_))) => count += 1, + Ok(Some(Token::EmptyElementTag(_))) => count += 1, + Ok(Some(Token::Eof)) => break, + Ok(Some(Token::EofWithBytesNotEvaluated(_))) => break, + _ => (), + } + input = &input[consumed..]; + } + assert_eq!(count, total_tags, "Overall tag count in {}", filename); + }) + }, + ); - b.iter(|| { - let mut r = Parser::new(SOURCE.as_bytes()); - - let mut count = criterion::black_box(0); - loop { - // Makes no progress if error is returned, so need unwrap() - match r.next().unwrap().code() { - EventCode::StartTag => count += 1, - EventCode::Eof => break, - _ => (), - } - } - assert_eq!( - count, 1550, - "Overall tag count in ./tests/documents/sample_rss.xml" - ); - }) - }); + // DISABLED: fails to parse empty attributes + // group.bench_with_input(BenchmarkId::new("rapid_xml", filename), *data, |b, input| { + // use rapid_xml::parser::{EventCode, Parser}; - group.bench_function("xmlparser", |b| { - use xmlparser::{Token, Tokenizer}; + // b.iter(|| { + // let mut r = Parser::new(input.as_bytes()); - b.iter(|| { - let mut count = criterion::black_box(0); - for token in Tokenizer::from(SOURCE) { - match token { - Ok(Token::ElementStart { .. 
}) => count += 1, - _ => (), - } - } - assert_eq!( - count, 1550, - "Overall tag count in ./tests/documents/sample_rss.xml" - ); - }) - }); + // let mut count = criterion::black_box(0); + // loop { + // // Makes no progress if error is returned, so need unwrap() + // match r.next().unwrap().code() { + // EventCode::StartTag => count += 1, + // EventCode::Eof => break, + // _ => (), + // } + // } + // assert_eq!( + // count, total_tags, + // "Overall tag count in {}", filename + // ); + // }) + // }); - group.bench_function("RustyXML", |b| { - use rusty_xml::{Event, Parser}; + group.bench_with_input( + BenchmarkId::new("xmlparser", filename), + *data, + |b, input| { + use xmlparser::{Token, Tokenizer}; - b.iter(|| { - let mut r = Parser::new(); - r.feed_str(SOURCE); - - let mut count = criterion::black_box(0); - for event in r { - match event.unwrap() { - Event::ElementStart(_) => count += 1, - _ => (), - } - } - assert_eq!( - count, 1550, - "Overall tag count in ./tests/documents/sample_rss.xml" - ); - }) - }); + b.iter(|| { + let mut count = criterion::black_box(0); + for token in Tokenizer::from(input) { + match token { + Ok(Token::ElementStart { .. 
}) => count += 1, + _ => (), + } + } + assert_eq!(count, total_tags, "Overall tag count in {}", filename); + }) + }, + ); - group.bench_function("xml_oxide", |b| { - use xml_oxide::sax::parser::Parser; - use xml_oxide::sax::Event; + group.bench_with_input(BenchmarkId::new("RustyXml", filename), *data, |b, input| { + use rusty_xml::{Event, Parser}; - b.iter(|| { - let mut r = Parser::from_reader(SOURCE.as_bytes()); - - let mut count = criterion::black_box(0); - loop { - // Makes no progress if error is returned, so need unwrap() - match r.read_event().unwrap() { - Event::StartElement(_) => count += 1, - Event::EndDocument => break, - _ => (), + b.iter(|| { + let mut r = Parser::new(); + r.feed_str(input); + + let mut count = criterion::black_box(0); + for event in r { + match event.unwrap() { + Event::ElementStart(_) => count += 1, + _ => (), + } } - } - assert_eq!(count, 1550, "Overall tag count in ./tests/sample_rss.xml"); - }) - }); - - group.bench_function("xml5ever", |b| { - use xml5ever::buffer_queue::BufferQueue; - use xml5ever::tokenizer::{TagKind, Token, TokenSink, XmlTokenizer}; - - struct Sink(usize); - impl TokenSink for Sink { - fn process_token(&mut self, token: Token) { - match token { - Token::TagToken(tag) if tag.kind == TagKind::StartTag => self.0 += 1, - Token::TagToken(tag) if tag.kind == TagKind::EmptyTag => self.0 += 1, - _ => (), + assert_eq!(count, total_tags, "Overall tag count in {}", filename); + }) + }); + + group.bench_with_input( + BenchmarkId::new("xml_oxide", filename), + *data, + |b, input| { + use xml_oxide::sax::parser::Parser; + use xml_oxide::sax::Event; + + b.iter(|| { + let mut r = Parser::from_reader(input.as_bytes()); + + let mut count = criterion::black_box(0); + loop { + // No progress is made if an error is returned, so unwrap() is needed + match r.read_event().unwrap() { + Event::StartElement(_) => count += 1, + Event::EndDocument => break, + _ => (), + } + } + assert_eq!(count, total_tags, "Overall tag count in {}", filename); +
}) + }, + ); + + group.bench_with_input(BenchmarkId::new("xml5ever", filename), *data, |b, input| { + use xml5ever::buffer_queue::BufferQueue; + use xml5ever::tokenizer::{TagKind, Token, TokenSink, XmlTokenizer}; + + struct Sink(usize); + impl TokenSink for Sink { + fn process_token(&mut self, token: Token) { + match token { + Token::TagToken(tag) if tag.kind == TagKind::StartTag => self.0 += 1, + Token::TagToken(tag) if tag.kind == TagKind::EmptyTag => self.0 += 1, + _ => (), + } } } - } - // Copied from xml5ever benchmarks - // https://github.com/servo/html5ever/blob/429f23943b24f739b78f4d703620d7b1b526475b/xml5ever/benches/xml5ever.rs - b.iter(|| { - let sink = criterion::black_box(Sink(0)); - let mut tok = XmlTokenizer::new(sink, Default::default()); - let mut buffer = BufferQueue::new(); - buffer.push_back(SOURCE.into()); - let _ = tok.feed(&mut buffer); - tok.end(); - - assert_eq!( - tok.sink.0, 1550, - "Overall tag count in ./tests/documents/sample_rss.xml" - ); - }) - }); - - group.bench_function("xml_rs", |b| { - b.iter(|| { - let r = EventReader::new(SOURCE.as_bytes()); - let mut count = criterion::black_box(0); - for e in r { - if let Ok(XmlEvent::StartElement { .. }) = e { - count += 1; + // Copied from xml5ever benchmarks + // https://github.com/servo/html5ever/blob/429f23943b24f739b78f4d703620d7b1b526475b/xml5ever/benches/xml5ever.rs + b.iter(|| { + let sink = criterion::black_box(Sink(0)); + let mut tok = XmlTokenizer::new(sink, Default::default()); + let mut buffer = BufferQueue::new(); + buffer.push_back(input.into()); + let _ = tok.feed(&mut buffer); + tok.end(); + + assert_eq!(tok.sink.0, total_tags, "Overall tag count in {}", filename); + }) + }); + + group.bench_with_input(BenchmarkId::new("xml_rs", filename), *data, |b, input| { + b.iter(|| { + let r = EventReader::new(input.as_bytes()); + let mut count = criterion::black_box(0); + for e in r { + if let Ok(XmlEvent::StartElement { .. 
}) = e { + count += 1; + } } - } - assert_eq!( - count, 1550, - "Overall tag count in ./tests/documents/sample_rss.xml" - ); - }) - }); + assert_eq!(count, total_tags, "Overall tag count in {}", filename); + }) + }); + } + group.finish(); } @@ -194,6 +236,7 @@ fn low_level_comparison(c: &mut Criterion) { #[allow(dead_code)] // We do not use structs fn serde_comparison(c: &mut Criterion) { let mut group = c.benchmark_group("serde"); + #[derive(Debug, Deserialize)] struct Rss { channel: Channel, @@ -223,32 +266,43 @@ fn serde_comparison(c: &mut Criterion) { typ: String, } - group.bench_function("quick_xml", |b| { - b.iter(|| { - let rss: Rss = quick_xml::de::from_str(SOURCE).unwrap(); - assert_eq!(rss.channel.items.len(), 99); - }) - }); + group.throughput(Throughput::Bytes(SAMPLE_RSS.len() as u64)); + + group.bench_with_input( + BenchmarkId::new("quick_xml", "sample_rss.xml"), + SAMPLE_RSS, + |b, input| { + b.iter(|| { + let rss: Rss = criterion::black_box(quick_xml::de::from_str(input).unwrap()); + assert_eq!(rss.channel.items.len(), 99); + }) + }, + ); /* NOTE: Most parts of deserializer are not implemented yet, so benchmark failed - group.bench_function("rapid-xml", |b| { + group.bench_with_input(BenchmarkId::new("rapid-xml", "sample_rss.xml"), SAMPLE_RSS, |b, input| { use rapid_xml::de::Deserializer; use rapid_xml::parser::Parser; b.iter(|| { - let mut r = Parser::new(SOURCE.as_bytes()); + let mut r = Parser::new(input.as_bytes()); let mut de = Deserializer::new(&mut r).unwrap(); - let rss = Rss::deserialize(&mut de).unwrap(); + let rss = criterion::black_box(Rss::deserialize(&mut de).unwrap()); assert_eq!(rss.channel.items.len(), 99); }); });*/ - group.bench_function("xml_rs", |b| { - b.iter(|| { - let rss: Rss = serde_xml_rs::from_str(SOURCE).unwrap(); - assert_eq!(rss.channel.items.len(), 99); - }); - }); + group.bench_with_input( + BenchmarkId::new("xml_rs", "sample_rss.xml"), + SAMPLE_RSS, + |b, input| { + b.iter(|| { + let rss: Rss = 
criterion::black_box(serde_xml_rs::from_str(input).unwrap()); + assert_eq!(rss.channel.items.len(), 99); + }) + }, + ); + group.finish(); }