Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Accept underscores in unicode escapes #43716

Merged
merged 1 commit into from
Sep 12, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 47 additions & 40 deletions src/libsyntax/parse/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -963,60 +963,67 @@ impl<'a> StringReader<'a> {
true
}

/// Scan over a \u{...} escape
/// Scan over a `\u{...}` escape
///
/// At this point, we have already seen the \ and the u, the { is the current character. We
/// will read at least one digit, and up to 6, and pass over the }.
/// At this point, we have already seen the `\` and the `u`, the `{` is the current character.
/// We will read a hex number (with `_` separators), with 1 to 6 actual digits,
/// and pass over the `}`.
fn scan_unicode_escape(&mut self, delim: char) -> bool {
self.bump(); // past the {
let start_bpos = self.pos;
let mut count = 0;
let mut accum_int = 0;
let mut valid = true;

while !self.ch_is('}') && count <= 6 {
let c = match self.ch {
Some(c) => c,
None => {
panic!(self.fatal_span_(start_bpos,
self.pos,
"unterminated unicode escape (found EOF)"));
}
};
accum_int *= 16;
accum_int += c.to_digit(16).unwrap_or_else(|| {
if c == delim {
panic!(self.fatal_span_(self.pos,
self.next_pos,
"unterminated unicode escape (needed a `}`)"));
} else {
self.err_span_char(self.pos,
self.next_pos,
"invalid character in unicode escape",
c);
}
valid = false;
0
});
self.bump();
count += 1;
if let Some('_') = self.ch {
// disallow leading `_`
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we need a compile-fail test checking that a leading `_` is disallowed?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is already a parse-fail test that checks that. Do I need to move it to compile-fail?

self.err_span_(self.pos,
self.next_pos,
"invalid start of unicode escape");
valid = false;
}

let count = self.scan_digits(16, 16);

if count > 6 {
self.err_span_(start_bpos,
self.pos,
"overlong unicode escape (can have at most 6 hex digits)");
"overlong unicode escape (must have at most 6 hex digits)");
valid = false;
}

if valid && (char::from_u32(accum_int).is_none() || count == 0) {
self.err_span_(start_bpos,
self.pos,
"invalid unicode character escape");
valid = false;
loop {
match self.ch {
Some('}') => {
if valid && count == 0 {
self.err_span_(start_bpos,
self.pos,
"empty unicode escape (must have at least 1 hex digit)");
valid = false;
}
self.bump(); // past the ending `}`
break;
},
Some(c) => {
if c == delim {
self.err_span_(self.pos,
self.pos,
"unterminated unicode escape (needed a `}`)");
valid = false;
break;
} else if valid {
self.err_span_char(start_bpos,
self.pos,
"invalid character in unicode escape",
c);
valid = false;
}
},
None => {
panic!(self.fatal_span_(start_bpos,
self.pos,
"unterminated unicode escape (found EOF)"));
}
}
self.bump();
}

self.bump(); // past the ending }
valid
}

Expand Down
25 changes: 18 additions & 7 deletions src/libsyntax/parse/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ pub fn stream_to_parser(sess: &ParseSess, stream: TokenStream) -> Parser {
/// Rather than just accepting/rejecting a given literal, unescapes it as
/// well. Can take any slice prefixed by a character escape. Returns the
/// character and the number of characters consumed.
pub fn char_lit(lit: &str) -> (char, isize) {
pub fn char_lit(lit: &str, diag: Option<(Span, &Handler)>) -> (char, isize) {
use std::char;

// Handle non-escaped chars first.
Expand Down Expand Up @@ -258,8 +258,19 @@ pub fn char_lit(lit: &str) -> (char, isize) {
'u' => {
assert_eq!(lit.as_bytes()[2], b'{');
let idx = lit.find('}').unwrap();
let v = u32::from_str_radix(&lit[3..idx], 16).unwrap();
let c = char::from_u32(v).unwrap();
let s = &lit[3..idx].chars().filter(|&c| c != '_').collect::<String>();
let v = u32::from_str_radix(&s, 16).unwrap();
let c = char::from_u32(v).unwrap_or_else(|| {
if let Some((span, diag)) = diag {
let mut diag = diag.struct_span_err(span, "invalid unicode character escape");
if v > 0x10FFFF {
diag.help("unicode escape must be at most 10FFFF").emit();
} else {
diag.help("unicode escape must not be a surrogate").emit();
}
}
'\u{FFFD}'
});
(c, (idx + 1) as isize)
}
_ => panic!("lexer should have rejected a bad character escape {}", lit)
Expand All @@ -272,7 +283,7 @@ pub fn escape_default(s: &str) -> String {

/// Parse a string representing a string literal into its final form. Does
/// unescaping.
pub fn str_lit(lit: &str) -> String {
pub fn str_lit(lit: &str, diag: Option<(Span, &Handler)>) -> String {
debug!("parse_str_lit: given {}", escape_default(lit));
let mut res = String::with_capacity(lit.len());

Expand Down Expand Up @@ -313,7 +324,7 @@ pub fn str_lit(lit: &str) -> String {
eat(&mut chars);
} else {
// otherwise, a normal escape
let (c, n) = char_lit(&lit[i..]);
let (c, n) = char_lit(&lit[i..], diag);
for _ in 0..n - 1 { // we don't need to move past the first \
chars.next();
}
Expand Down Expand Up @@ -385,15 +396,15 @@ pub fn lit_token(lit: token::Lit, suf: Option<Symbol>, diag: Option<(Span, &Hand

match lit {
token::Byte(i) => (true, Some(LitKind::Byte(byte_lit(&i.as_str()).0))),
token::Char(i) => (true, Some(LitKind::Char(char_lit(&i.as_str()).0))),
token::Char(i) => (true, Some(LitKind::Char(char_lit(&i.as_str(), diag).0))),

// There are some valid suffixes for integer and float literals,
// so all the handling is done internally.
token::Integer(s) => (false, integer_lit(&s.as_str(), suf, diag)),
token::Float(s) => (false, float_lit(&s.as_str(), suf, diag)),

token::Str_(s) => {
let s = Symbol::intern(&str_lit(&s.as_str()));
let s = Symbol::intern(&str_lit(&s.as_str(), diag));
(true, Some(LitKind::Str(s, ast::StrStyle::Cooked)))
}
token::StrRaw(s, n) => {
Expand Down
9 changes: 4 additions & 5 deletions src/test/parse-fail/issue-23620-invalid-escapes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,8 @@ fn main() {
//~^^^ ERROR incorrect unicode escape sequence
//~^^^^ ERROR unicode escape sequences cannot be used as a byte or in a byte string

let _ = "\u{ffffff} \xf \u";
//~^ ERROR invalid unicode character escape
//~^^ ERROR invalid character in numeric character escape:
//~^^^ ERROR form of character escape may only be used with characters in the range [\x00-\x7f]
//~^^^^ ERROR incorrect unicode escape sequence
let _ = "\xf \u";
//~^ ERROR invalid character in numeric character escape:
//~^^ ERROR form of character escape may only be used with characters in the range [\x00-\x7f]
//~^^^ ERROR incorrect unicode escape sequence
}
15 changes: 15 additions & 0 deletions src/test/parse-fail/issue-43692.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
// Copyright 2017 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// compile-flags: -Z parse-only

// Parse-fail test: a `\u{...}` escape whose first character after the `{` is
// an `_` separator must be reported with the error named in the annotation below.
fn main() {
'\u{_10FFFF}'; //~ ERROR invalid start of unicode escape
}
2 changes: 1 addition & 1 deletion src/test/parse-fail/new-unicode-escapes-2.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,5 @@
// compile-flags: -Z parse-only

pub fn main() {
let s = "\u{260311111111}"; //~ ERROR overlong unicode escape (can have at most 6 hex digits)
let s = "\u{260311111111}"; //~ ERROR overlong unicode escape (must have at most 6 hex digits)
}
3 changes: 2 additions & 1 deletion src/test/parse-fail/new-unicode-escapes-3.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,6 @@
// compile-flags: -Z parse-only

pub fn main() {
let s = "\u{d805}"; //~ ERROR invalid unicode character escape
let s1 = "\u{d805}"; //~ ERROR invalid unicode character escape
let s2 = "\u{ffffff}"; //~ ERROR invalid unicode character escape
}
2 changes: 0 additions & 2 deletions src/test/parse-fail/new-unicode-escapes-4.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,4 @@
pub fn main() {
let s = "\u{lol}";
//~^ ERROR invalid character in unicode escape: l
//~^^ ERROR invalid character in unicode escape: o
//~^^^ ERROR invalid character in unicode escape: l
}
14 changes: 14 additions & 0 deletions src/test/run-pass/issue-43692.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
// Copyright 2017 The Rust Project Developers. See the COPYRIGHT
// file at the top-level directory of this distribution and at
// http://rust-lang.org/COPYRIGHT.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

// Checks that `_` separators inside `\u{...}` escapes do not change the
// resulting code point, in both char and string literals.
fn main() {
// Consecutive underscores in the middle of a char-literal escape.
assert_eq!('\u{10__FFFF}', '\u{10FFFF}');
// Single, repeated and trailing underscores across two escapes in one string literal.
assert_eq!("\u{10_F0FF__}foo\u{1_0_0_0__}", "\u{10F0FF}foo\u{1000}");
}