From 59d6e61cfd6e6b7d5681cd454f4a5abbec8bd06b Mon Sep 17 00:00:00 2001 From: Colin Rofls Date: Mon, 17 Jun 2024 13:33:13 -0400 Subject: [PATCH] [text-format] Fix parsing of string literals This renames `next_byte_value` to `next_str_lit_bytes`, which may now return 1..=4 bytes per call, reflecting the variable-length nature of the UTF-8 encoding. --- protobuf-support/src/lexer/lexer_impl.rs | 74 ++++++++++++++----- protobuf-support/src/lexer/str_lit.rs | 8 +- .../src/common/v2/test_fmt_text_format.rs | 6 ++ 3 files changed, 66 insertions(+), 22 deletions(-) diff --git a/protobuf-support/src/lexer/lexer_impl.rs b/protobuf-support/src/lexer/lexer_impl.rs index f0d6a9609..0add903c1 100644 --- a/protobuf-support/src/lexer/lexer_impl.rs +++ b/protobuf-support/src/lexer/lexer_impl.rs @@ -67,6 +67,15 @@ impl From for LexerError { } } +/// The raw bytes for a single char or escape sequence in a string literal +/// +/// The raw bytes are available via an `into_iter` implementation. +pub struct DecodedBytes { + // a single char can be up to 4 bytes when encoded in UTF-8 + buf: [u8; 4], + len: u8, +} + #[derive(Copy, Clone)] pub struct Lexer<'a> { language: ParserLanguage, @@ -440,24 +449,24 @@ impl<'a> Lexer<'a> { // octEscape = '\' octalDigit octalDigit octalDigit // charEscape = '\' ( "a" | "b" | "f" | "n" | "r" | "t" | "v" | '\' | "'" | '"' ) // quote = "'" | '"' - pub fn next_byte_value(&mut self) -> LexerResult<u8> { + pub fn next_str_lit_bytes(&mut self) -> LexerResult<DecodedBytes> { match self.next_char()? { '\\' => { match self.next_char()?
{ - '\'' => Ok(b'\''), - '"' => Ok(b'"'), - '\\' => Ok(b'\\'), - 'a' => Ok(b'\x07'), - 'b' => Ok(b'\x08'), - 'f' => Ok(b'\x0c'), - 'n' => Ok(b'\n'), - 'r' => Ok(b'\r'), - 't' => Ok(b'\t'), - 'v' => Ok(b'\x0b'), + '\'' => Ok(b'\''.into()), + '"' => Ok(b'"'.into()), + '\\' => Ok(b'\\'.into()), + 'a' => Ok(b'\x07'.into()), + 'b' => Ok(b'\x08'.into()), + 'f' => Ok(b'\x0c'.into()), + 'n' => Ok(b'\n'.into()), + 'r' => Ok(b'\r'.into()), + 't' => Ok(b'\t'.into()), + 'v' => Ok(b'\x0b'.into()), 'x' => { let d1 = self.next_hex_digit()? as u8; let d2 = self.next_hex_digit()? as u8; - Ok(((d1 << 4) | d2) as u8) + Ok((((d1 << 4) | d2) as u8).into()) } d if d >= '0' && d <= '7' => { let mut r = d as u8 - b'0'; @@ -467,16 +476,14 @@ impl<'a> Lexer<'a> { Ok(d) => r = (r << 3) + d as u8, } } - Ok(r) + Ok(r.into()) } // https://github.com/google/protobuf/issues/4562 - // TODO: overflow - c => Ok(c as u8), + c => Ok(c.into()), } } '\n' | '\0' => Err(LexerError::IncorrectInput), - // TODO: check overflow - c => Ok(c as u8), + c => Ok(c.into()), } } @@ -530,7 +537,7 @@ impl<'a> Lexer<'a> { }; first = false; while self.lookahead_char() != Some(q) { - self.next_byte_value()?; + self.next_str_lit_bytes()?; } self.next_char_expect_eq(q)?; @@ -663,6 +670,37 @@ impl<'a> Lexer<'a> { } } +impl From<u8> for DecodedBytes { + fn from(value: u8) -> Self { + DecodedBytes { + buf: [value, 0, 0, 0], + len: 1, + } + } +} + +impl From<char> for DecodedBytes { + fn from(value: char) -> Self { + let mut this = DecodedBytes { + buf: [0; 4], + len: 0, + }; + let len = value.encode_utf8(&mut this.buf).len(); + this.len = len as _; + this + } +} + +// Implementing `IntoIterator` means that `DecodedBytes` works with `Vec::extend`.
+impl IntoIterator for DecodedBytes { + type Item = u8; + type IntoIter = std::iter::Take<std::array::IntoIter<u8, 4>>; + + fn into_iter(self) -> Self::IntoIter { + self.buf.into_iter().take(self.len as _) + } +} + #[cfg(test)] mod test { use super::*; diff --git a/protobuf-support/src/lexer/str_lit.rs b/protobuf-support/src/lexer/str_lit.rs index 0e51a16bf..840c9eb33 100644 --- a/protobuf-support/src/lexer/str_lit.rs +++ b/protobuf-support/src/lexer/str_lit.rs @@ -32,9 +32,9 @@ impl StrLit { let mut lexer = Lexer::new(&self.escaped, ParserLanguage::Json); let mut r = Vec::new(); while !lexer.eof() { - r.push( + r.extend( lexer - .next_byte_value() + .next_str_lit_bytes() .map_err(|_| StrLitDecodeError::OtherError)?, ); } @@ -45,9 +45,9 @@ impl StrLit { let mut lexer = Lexer::new(&self.escaped, ParserLanguage::Json); let mut r = Vec::new(); while !lexer.eof() { - r.push( + r.extend( lexer - .next_byte_value() + .next_str_lit_bytes() .map_err(|_| StrLitDecodeError::OtherError)?, ); } diff --git a/test-crates/protobuf-codegen-protoc-test/src/common/v2/test_fmt_text_format.rs b/test-crates/protobuf-codegen-protoc-test/src/common/v2/test_fmt_text_format.rs index eb5d8cc83..fd3c5eed7 100644 --- a/test-crates/protobuf-codegen-protoc-test/src/common/v2/test_fmt_text_format.rs +++ b/test-crates/protobuf-codegen-protoc-test/src/common/v2/test_fmt_text_format.rs @@ -114,6 +114,12 @@ fn test_string_bytes() { ); } +#[test] +fn non_ascii_strings() { + test_text_format_str_descriptor("string_singular: \"À\"", &TestTypes::descriptor()); + test_text_format_str_descriptor("string_singular: \"😭\"", &TestTypes::descriptor()); +} + #[test] fn test_message() { test_text_format_str_descriptor("test_message_singular {}", &TestTypes::descriptor());