Skip to content

Commit 463a006

Browse files
committed
Implement load_from_bytes
Closes #155. Also helps in some cases with #142: it handles a BOM at the beginning of the file (the common case), but not the corner case where the BOM is at the start of a document which is not the first one.
1 parent 360a34d commit 463a006

File tree

2 files changed

+113
-0
lines changed

2 files changed

+113
-0
lines changed

Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ readme = "README.md"
1111

1212
[dependencies]
1313
linked-hash-map = ">=0.0.9, <0.6"
14+
encoding = "0.2"
1415

1516
[dev-dependencies]
1617
quickcheck = "0.7"

src/yaml.rs

+112
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,19 @@ impl MarkedEventReceiver for YamlLoader {
157157
}
158158
}
159159

160+
/// Error returned by `YamlLoader::load_from_bytes`.
///
/// Each variant corresponds to one stage of loading: reading the input, decoding it to text,
/// and scanning/parsing the resulting text.
#[derive(Debug)]
161+
pub enum LoadError {
162+
/// Reading the input source failed.
IO(std::io::Error),
163+
/// The decoded text was rejected by `YamlLoader::load_from_str`.
Scan(ScanError),
164+
/// The byte buffer could not be decoded; carries the decoder's error message.
Decode(std::borrow::Cow<'static, str>),
165+
}
166+
167+
/// Wraps an I/O error in `LoadError::IO`, so that `?` can be used on I/O calls inside
/// functions returning `Result<_, LoadError>`.
impl From<std::io::Error> for LoadError {
168+
fn from(error: std::io::Error) -> Self {
169+
LoadError::IO(error)
170+
}
171+
}
172+
160173
impl YamlLoader {
161174
fn insert_new_node(&mut self, node: (Yaml, usize)) {
162175
// valid anchor id starts from 1
@@ -197,6 +210,42 @@ impl YamlLoader {
197210
parser.load(&mut loader, true)?;
198211
Ok(loader.docs)
199212
}
213+
214+
/// Reads `source` to the end, decodes the bytes to text and parses the text as YAML.
///
/// The whole input is buffered in memory before decoding. The decoded text is handed to
/// `YamlLoader::load_from_str`, whose documents are returned unchanged.
///
/// # Errors
///
/// * `LoadError::IO` if reading from `source` fails.
/// * `LoadError::Decode` if the decoder reports an error.
/// * `LoadError::Scan` if `load_from_str` rejects the decoded text.
pub fn load_from_bytes(mut source: impl std::io::Read) -> Result<Vec<Yaml>, LoadError> {
215+
let mut buffer = Vec::new();
216+
source.read_to_end(&mut buffer)?;
217+
218+
// Decode the input buffer as UTF-8, UTF-16LE or UTF-16BE, depending on the BOM codepoint.
219+
// If the buffer doesn't start with a BOM codepoint, the decoder uses the fallback encoding
220+
// returned by `detect_utf16_endianness`.
// NOTE(review): `DecoderTrap::Replace` presumably substitutes malformed input instead of
// failing, which would make `LoadError::Decode` rare in practice; confirm against the
// `encoding` crate documentation.
221+
let (res, _) = encoding::types::decode(
222+
&buffer,
223+
encoding::DecoderTrap::Replace,
224+
detect_utf16_endianness(&buffer),
225+
);
226+
let s = res.map_err(LoadError::Decode)?;
227+
YamlLoader::load_from_str(&s).map_err(LoadError::Scan)
228+
}
229+
}
230+
231+
/// Chooses the encoding that `load_from_bytes` passes to the decoder as its fallback, based on
/// where a NUL byte appears among the first two bytes of `b`.
///
/// Returns UTF-16BE when the first byte is NUL and the second is not, UTF-16LE when the second
/// byte is NUL and the first is not, and UTF-8 in every other case (including inputs shorter
/// than two bytes).
///
/// The `encoding` crate knows how to tell apart UTF-8, UTF-16LE and UTF-16BE when the
232+
/// bytestream starts with a BOM codepoint.
233+
/// However, it doesn't even attempt to guess the UTF-16 endianness of the input bytestream since
234+
/// in the general case the bytestream could start with a codepoint that uses both bytes.
235+
///
236+
/// The YAML-1.2 spec mandates that the first character of a YAML document is an ASCII character.
237+
/// This allows the encoding to be deduced by the pattern of null (#x00) characters.
238+
///
239+
/// See the spec at <https://yaml.org/spec/1.2/spec.html#id2771184>.
240+
fn detect_utf16_endianness(b: &[u8]) -> encoding::types::EncodingRef {
241+
// Two differing leading bytes are required; with fewer bytes, or two equal ones (for
// example two NULs), there is no endianness hint and the function falls through to UTF-8.
if b.len() > 1 && (b[0] != b[1]) {
242+
if b[0] == 0 {
243+
// NUL followed by a non-NUL byte: the high-order byte comes first.
return encoding::all::UTF_16BE;
244+
} else if b[1] == 0 {
245+
// Non-NUL byte followed by NUL: the low-order byte comes first.
return encoding::all::UTF_16LE;
246+
}
247+
}
248+
encoding::all::UTF_8
200249
}
201250

202251
macro_rules! define_as (
@@ -736,4 +785,67 @@ subcommands3:
736785
let s = "[".repeat(10_000) + &"]".repeat(10_000);
737786
assert!(YamlLoader::load_from_str(&s).is_err());
738787
}
788+
789+
// UTF-8 input that starts with a BOM (EF BB BF): the document must load and expose the
// expected scalars, and indexing a key that is absent ("d") must yield a bad value.
#[test]
790+
fn test_read_bom() {
791+
let s = b"\xef\xbb\xbf---
792+
a: 1
793+
b: 2.2
794+
c: [1, 2]
795+
";
796+
let out = YamlLoader::load_from_bytes(s as &[u8]).unwrap();
797+
let doc = &out[0];
798+
assert_eq!(doc["a"].as_i64().unwrap(), 1i64);
799+
assert_eq!(doc["b"].as_f64().unwrap(), 2.2f64);
800+
assert_eq!(doc["c"][1].as_i64().unwrap(), 2i64);
801+
assert!(doc["d"][0].is_badvalue());
802+
}
803+
804+
// UTF-16LE input that starts with a BOM (FF FE): every character is followed by a NUL byte.
// The same document as in `test_read_bom` must be loaded.
#[test]
805+
fn test_read_utf16le() {
806+
let s = b"\xff\xfe-\x00-\x00-\x00
807+
\x00a\x00:\x00 \x001\x00
808+
\x00b\x00:\x00 \x002\x00.\x002\x00
809+
\x00c\x00:\x00 \x00[\x001\x00,\x00 \x002\x00]\x00
810+
\x00";
811+
let out = YamlLoader::load_from_bytes(s as &[u8]).unwrap();
812+
let doc = &out[0];
813+
println!("GOT: {:?}", doc);
814+
assert_eq!(doc["a"].as_i64().unwrap(), 1i64);
815+
assert_eq!(doc["b"].as_f64().unwrap(), 2.2f64);
816+
assert_eq!(doc["c"][1].as_i64().unwrap(), 2i64);
817+
assert!(doc["d"][0].is_badvalue());
818+
}
819+
820+
// UTF-16BE input that starts with a BOM (FE FF): every character is preceded by a NUL byte.
// The same document as in `test_read_bom` must be loaded.
#[test]
821+
fn test_read_utf16be() {
822+
let s = b"\xfe\xff\x00-\x00-\x00-\x00
823+
\x00a\x00:\x00 \x001\x00
824+
\x00b\x00:\x00 \x002\x00.\x002\x00
825+
\x00c\x00:\x00 \x00[\x001\x00,\x00 \x002\x00]\x00
826+
";
827+
let out = YamlLoader::load_from_bytes(s as &[u8]).unwrap();
828+
let doc = &out[0];
829+
println!("GOT: {:?}", doc);
830+
assert_eq!(doc["a"].as_i64().unwrap(), 1i64);
831+
assert_eq!(doc["b"].as_f64().unwrap(), 2.2f64);
832+
assert_eq!(doc["c"][1].as_i64().unwrap(), 2i64);
833+
assert!(doc["d"][0].is_badvalue());
834+
}
835+
836+
// UTF-16LE input without a BOM: the input starts with '-' followed by a NUL byte, so loading
// depends on the fallback encoding chosen by `detect_utf16_endianness`.
#[test]
837+
fn test_read_utf16le_nobom() {
838+
let s = b"-\x00-\x00-\x00
839+
\x00a\x00:\x00 \x001\x00
840+
\x00b\x00:\x00 \x002\x00.\x002\x00
841+
\x00c\x00:\x00 \x00[\x001\x00,\x00 \x002\x00]\x00
842+
\x00";
843+
let out = YamlLoader::load_from_bytes(s as &[u8]).unwrap();
844+
let doc = &out[0];
845+
println!("GOT: {:?}", doc);
846+
assert_eq!(doc["a"].as_i64().unwrap(), 1i64);
847+
assert_eq!(doc["b"].as_f64().unwrap(), 2.2f64);
848+
assert_eq!(doc["c"][1].as_i64().unwrap(), 2i64);
849+
assert!(doc["d"][0].is_badvalue());
850+
}
739851
}

0 commit comments

Comments
 (0)