From 8defd1062bb8b7b5050fbf5aa4d8a526e271821e Mon Sep 17 00:00:00 2001
From: Kornel
Date: Sun, 2 Jul 2023 16:09:33 +0100
Subject: [PATCH] Allocate attr name HashSet only if necessary

---
 benches/bench.rs       | 12 ++++++++++++
 src/reader/indexset.rs | 24 +++++++++++++++++++++---
 2 files changed, 33 insertions(+), 3 deletions(-)

diff --git a/benches/bench.rs b/benches/bench.rs
index c2065b1d..d7f74279 100644
--- a/benches/bench.rs
+++ b/benches/bench.rs
@@ -15,6 +15,18 @@ fn read(bencher: &mut Bencher) {
     });
 }
 
+/// Parses an element with more attributes than the reader's `HASH_THRESHOLD` (8).
+#[bench]
+fn read_lots_attrs(bencher: &mut Bencher) {
+    let xml = r#"<root a1="1" a2="2" a3="3" a4="4" a5="5" a6="6" a7="7" a8="8" a9="9" a10="10" a11="11" a12="12" a13="13" a14="14" a15="15" a16="16" a17="17" a18="18" a19="19" a20="20"/>"#;
+    bencher.iter(move || {
+        let parser = EventReader::new(xml.as_bytes());
+        for e in parser {
+            e.unwrap();
+        }
+    });
+}
+
 #[bench]
 fn write(bencher: &mut Bencher) {
     let xml = std::fs::read("tests/documents/sample_1.xml").unwrap();
diff --git a/src/reader/indexset.rs b/src/reader/indexset.rs
index 82fb8e65..aaca88ca 100644
--- a/src/reader/indexset.rs
+++ b/src/reader/indexset.rs
@@ -16,6 +16,10 @@ pub(crate) struct AttributesSet {
     hasher: RandomState,
 }
 
+/// Use linear search and don't allocate `HashSet` if there are few attributes,
+/// because allocation costs more than a few comparisons.
+const HASH_THRESHOLD: usize = 8;
+
 impl AttributesSet {
     pub fn new() -> Self {
         Self {
@@ -33,11 +37,25 @@ impl AttributesSet {
 
     pub fn contains(&self, name: &OwnedName) -> bool {
-        // fall back to linear search only on duplicate or hash collision
-        self.may_contain.contains(&self.hash(name)) &&
+        // `push` starts filling `may_contain` only when it finds `HASH_THRESHOLD`
+        // attributes already stored, so at or below the threshold the set is not
+        // populated yet and the lookup must go straight to the linear search.
+        // Above the threshold the linear search runs only on a hash hit
+        // (duplicate or hash collision).
+        (self.vec.len() <= HASH_THRESHOLD || self.may_contain.contains(&self.hash(name))) &&
             self.vec.iter().any(move |a| &a.name == name)
     }
 
     pub fn push(&mut self, attr: OwnedAttribute) {
-        self.may_contain.insert(self.hash(&attr.name));
+        if self.vec.len() >= HASH_THRESHOLD {
+            if self.vec.len() == HASH_THRESHOLD {
+                // First push past the threshold: reserve the set and back-fill
+                // the hashes of the attributes stored so far.
+                self.may_contain.reserve(HASH_THRESHOLD * 2);
+                for attr in &self.vec {
+                    self.may_contain.insert(self.hash(&attr.name));
+                }
+            }
+            self.may_contain.insert(self.hash(&attr.name));
+        }
         self.vec.push(attr);
     }