Skip to content

Commit

Permalink
Implement load_from_bytes
Browse files Browse the repository at this point in the history
Closes #155

Also helps in some cases with #142: it handles the common case where the BOM is at the beginning of the file,
but not the corner case where the BOM is at the start of a document that is not the first one.
  • Loading branch information
mkmik committed May 7, 2020
1 parent 360a34d commit 3f7ec04
Show file tree
Hide file tree
Showing 2 changed files with 113 additions and 0 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ readme = "README.md"

[dependencies]
linked-hash-map = ">=0.0.9, <0.6"
encoding = "0.2"

[dev-dependencies]
quickcheck = "0.7"
112 changes: 112 additions & 0 deletions src/yaml.rs
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,19 @@ impl MarkedEventReceiver for YamlLoader {
}
}

/// Error returned by `YamlLoader::load_from_bytes`.
///
/// Each variant corresponds to one stage of loading: reading the input,
/// decoding the bytes into text, and scanning/parsing the resulting text.
#[derive(Debug)]
pub enum LoadError {
/// Reading the raw bytes from the input source failed.
IO(std::io::Error),
/// The decoded text could not be loaded as YAML.
Scan(ScanError),
/// The bytes could not be decoded into text; carries the decoder's message.
Decode(std::borrow::Cow<'static, str>),
}

impl From<std::io::Error> for LoadError {
fn from(error: std::io::Error) -> Self {
LoadError::IO(error)
}
}

impl YamlLoader {
fn insert_new_node(&mut self, node: (Yaml, usize)) {
// valid anchor id starts from 1
Expand Down Expand Up @@ -197,6 +210,42 @@ impl YamlLoader {
parser.load(&mut loader, true)?;
Ok(loader.docs)
}

/// Reads `source` to the end, decodes the bytes into text and loads every
/// YAML document found in it.
///
/// The encoding (UTF-8, UTF-16LE or UTF-16BE) is selected from the BOM
/// codepoint when the input starts with one. Without a BOM, the fallback
/// encoding returned by `detect_utf16_endianness` is used instead.
///
/// # Errors
///
/// * `LoadError::IO` if reading from `source` fails.
/// * `LoadError::Decode` if the bytes are not valid in the selected encoding
///   (decoding is strict).
/// * `LoadError::Scan` if the decoded text cannot be loaded as YAML.
pub fn load_from_bytes(mut source: impl std::io::Read) -> Result<Vec<Yaml>, LoadError> {
    let mut raw = Vec::new();
    source.read_to_end(&mut raw)?;

    // Only consulted by the decoder when the buffer carries no BOM.
    let fallback = detect_utf16_endianness(&raw);
    let (decoded, _) = encoding::types::decode(&raw, encoding::DecoderTrap::Strict, fallback);

    decoded
        .map_err(LoadError::Decode)
        .and_then(|text| YamlLoader::load_from_str(&text).map_err(LoadError::Scan))
}
}

/// Picks the fallback encoding for a bytestream that does not start with a
/// BOM codepoint.
///
/// The encoding crate can tell UTF-8, UTF-16LE and UTF-16BE apart when a BOM
/// is present, but it makes no attempt to guess the UTF-16 endianness
/// otherwise, because in the general case the first codepoint could use both
/// bytes.
///
/// The YAML-1.2 spec mandates that the first character of a YAML document is
/// an ASCII character, so the position of the null (#x00) byte within the
/// first two bytes reveals the byte order:
///
/// * `00 xx` (with `xx` non-zero) selects UTF-16BE,
/// * `xx 00` (with `xx` non-zero) selects UTF-16LE,
/// * anything else, including inputs shorter than two bytes, selects UTF-8.
///
/// See spec at https://yaml.org/spec/1.2/spec.html#id2771184
fn detect_utf16_endianness(b: &[u8]) -> encoding::types::EncodingRef {
    match (b.first(), b.get(1)) {
        (Some(&0), Some(&low)) if low != 0 => encoding::all::UTF_16BE,
        (Some(&low), Some(&0)) if low != 0 => encoding::all::UTF_16LE,
        _ => encoding::all::UTF_8,
    }
}

macro_rules! define_as (
Expand Down Expand Up @@ -736,4 +785,67 @@ subcommands3:
let s = "[".repeat(10_000) + &"]".repeat(10_000);
assert!(YamlLoader::load_from_str(&s).is_err());
}

// A UTF-8 input that starts with a BOM must load like plain UTF-8.
#[test]
fn test_read_bom() {
    let input: &[u8] = b"\xef\xbb\xbf---
a: 1
b: 2.2
c: [1, 2]
";
    let docs = YamlLoader::load_from_bytes(input).unwrap();
    let first = &docs[0];
    assert_eq!(first["a"].as_i64(), Some(1i64));
    assert_eq!(first["b"].as_f64(), Some(2.2f64));
    assert_eq!(first["c"][1].as_i64(), Some(2i64));
    // Missing keys index to a bad value rather than panicking.
    assert!(first["d"][0].is_badvalue());
}

// UTF-16LE input announced by a little-endian BOM (FF FE).
#[test]
fn test_read_utf16le() {
    let input: &[u8] = b"\xff\xfe-\x00-\x00-\x00
\x00a\x00:\x00 \x001\x00
\x00b\x00:\x00 \x002\x00.\x002\x00
\x00c\x00:\x00 \x00[\x001\x00,\x00 \x002\x00]\x00
\x00";
    let docs = YamlLoader::load_from_bytes(input).unwrap();
    let first = &docs[0];
    println!("GOT: {:?}", first);
    assert_eq!(first["a"].as_i64(), Some(1i64));
    assert_eq!(first["b"].as_f64(), Some(2.2f64));
    assert_eq!(first["c"][1].as_i64(), Some(2i64));
    // Missing keys index to a bad value rather than panicking.
    assert!(first["d"][0].is_badvalue());
}

// UTF-16BE input announced by a big-endian BOM (FE FF).
#[test]
fn test_read_utf16be() {
    let input: &[u8] = b"\xfe\xff\x00-\x00-\x00-\x00
\x00a\x00:\x00 \x001\x00
\x00b\x00:\x00 \x002\x00.\x002\x00
\x00c\x00:\x00 \x00[\x001\x00,\x00 \x002\x00]\x00
";
    let docs = YamlLoader::load_from_bytes(input).unwrap();
    let first = &docs[0];
    println!("GOT: {:?}", first);
    assert_eq!(first["a"].as_i64(), Some(1i64));
    assert_eq!(first["b"].as_f64(), Some(2.2f64));
    assert_eq!(first["c"][1].as_i64(), Some(2i64));
    // Missing keys index to a bad value rather than panicking.
    assert!(first["d"][0].is_badvalue());
}

// UTF-16LE input without a BOM: the byte order has to be inferred from the
// null byte that follows the leading ASCII character.
#[test]
fn test_read_utf16le_nobom() {
    let input: &[u8] = b"-\x00-\x00-\x00
\x00a\x00:\x00 \x001\x00
\x00b\x00:\x00 \x002\x00.\x002\x00
\x00c\x00:\x00 \x00[\x001\x00,\x00 \x002\x00]\x00
\x00";
    let docs = YamlLoader::load_from_bytes(input).unwrap();
    let first = &docs[0];
    println!("GOT: {:?}", first);
    assert_eq!(first["a"].as_i64(), Some(1i64));
    assert_eq!(first["b"].as_f64(), Some(2.2f64));
    assert_eq!(first["c"][1].as_i64(), Some(2i64));
    // Missing keys index to a bad value rather than panicking.
    assert!(first["d"][0].is_badvalue());
}
}

0 comments on commit 3f7ec04

Please sign in to comment.