Skip to content

Commit 3ad8e2d

Browse files
committed
Auto merge of rust-lang#118897 - nnethercote:more-unescaping-cleanups, r=fee1-dead
More unescaping cleanups. More minor improvements I found while working on rust-lang#118699. r? `@fee1-dead`
2 parents 1c6a061 + b900eb7 commit 3ad8e2d

File tree

3 files changed

+82
-88
lines changed

3 files changed

+82
-88
lines changed

compiler/rustc_ast/src/util/literal.rs

+30-44
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,8 @@ impl LitKind {
7777
// new symbol because the string in the LitKind is different to the
7878
// string in the token.
7979
let s = symbol.as_str();
80+
// Vanilla strings are so common we optimize for the common case where no chars
81+
// requiring special behaviour are present.
8082
let symbol = if s.contains(['\\', '\r']) {
8183
let mut buf = String::with_capacity(s.len());
8284
let mut error = Ok(());
@@ -104,27 +106,20 @@ impl LitKind {
104106
LitKind::Str(symbol, ast::StrStyle::Cooked)
105107
}
106108
token::StrRaw(n) => {
107-
// Ditto.
108-
let s = symbol.as_str();
109-
let symbol =
110-
if s.contains('\r') {
111-
let mut buf = String::with_capacity(s.len());
112-
let mut error = Ok(());
113-
unescape_literal(s, Mode::RawStr, &mut |_, unescaped_char| {
114-
match unescaped_char {
115-
Ok(c) => buf.push(c),
116-
Err(err) => {
117-
if err.is_fatal() {
118-
error = Err(LitError::LexerError);
119-
}
120-
}
109+
// Raw strings have no escapes, so we only need to check for invalid chars, and we
110+
// can reuse the symbol on success.
111+
let mut error = Ok(());
112+
unescape_literal(symbol.as_str(), Mode::RawStr, &mut |_, unescaped_char| {
113+
match unescaped_char {
114+
Ok(_) => {}
115+
Err(err) => {
116+
if err.is_fatal() {
117+
error = Err(LitError::LexerError);
121118
}
122-
});
123-
error?;
124-
Symbol::intern(&buf)
125-
} else {
126-
symbol
127-
};
119+
}
120+
}
121+
});
122+
error?;
128123
LitKind::Str(symbol, ast::StrStyle::Raw(n))
129124
}
130125
token::ByteStr => {
@@ -143,25 +138,19 @@ impl LitKind {
143138
LitKind::ByteStr(buf.into(), StrStyle::Cooked)
144139
}
145140
token::ByteStrRaw(n) => {
141+
// Raw strings have no escapes, so we only need to check for invalid chars, and we
142+
// can convert the symbol directly to a `Lrc<[u8]>` on success.
146143
let s = symbol.as_str();
147-
let bytes = if s.contains('\r') {
148-
let mut buf = Vec::with_capacity(s.len());
149-
let mut error = Ok(());
150-
unescape_literal(s, Mode::RawByteStr, &mut |_, c| match c {
151-
Ok(c) => buf.push(byte_from_char(c)),
152-
Err(err) => {
153-
if err.is_fatal() {
154-
error = Err(LitError::LexerError);
155-
}
144+
let mut error = Ok(());
145+
unescape_literal(s, Mode::RawByteStr, &mut |_, c| match c {
146+
Ok(_) => {}
147+
Err(err) => {
148+
if err.is_fatal() {
149+
error = Err(LitError::LexerError);
156150
}
157-
});
158-
error?;
159-
buf
160-
} else {
161-
symbol.to_string().into_bytes()
162-
};
163-
164-
LitKind::ByteStr(bytes.into(), StrStyle::Raw(n))
151+
}
152+
});
153+
LitKind::ByteStr(s.to_owned().into_bytes().into(), StrStyle::Raw(n))
165154
}
166155
token::CStr => {
167156
let s = symbol.as_str();
@@ -172,7 +161,6 @@ impl LitKind {
172161
error = Err(LitError::NulInCStr(span));
173162
}
174163
Ok(CStrUnit::Byte(b)) => buf.push(b),
175-
Ok(CStrUnit::Char(c)) if c.len_utf8() == 1 => buf.push(c as u8),
176164
Ok(CStrUnit::Char(c)) => {
177165
buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes())
178166
}
@@ -187,25 +175,23 @@ impl LitKind {
187175
LitKind::CStr(buf.into(), StrStyle::Cooked)
188176
}
189177
token::CStrRaw(n) => {
178+
// Raw strings have no escapes, so we only need to check for invalid chars, and we
179+
// can convert the symbol directly to a `Lrc<[u8]>` on success.
190180
let s = symbol.as_str();
191-
let mut buf = Vec::with_capacity(s.len());
192181
let mut error = Ok(());
193182
unescape_c_string(s, Mode::RawCStr, &mut |span, c| match c {
194183
Ok(CStrUnit::Byte(0) | CStrUnit::Char('\0')) => {
195184
error = Err(LitError::NulInCStr(span));
196185
}
197-
Ok(CStrUnit::Byte(b)) => buf.push(b),
198-
Ok(CStrUnit::Char(c)) if c.len_utf8() == 1 => buf.push(c as u8),
199-
Ok(CStrUnit::Char(c)) => {
200-
buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes())
201-
}
186+
Ok(_) => {}
202187
Err(err) => {
203188
if err.is_fatal() {
204189
error = Err(LitError::LexerError);
205190
}
206191
}
207192
});
208193
error?;
194+
let mut buf = s.to_owned().into_bytes();
209195
buf.push(0);
210196
LitKind::CStr(buf.into(), StrStyle::Raw(n))
211197
}

compiler/rustc_lexer/src/unescape.rs

+10-11
Original file line numberDiff line numberDiff line change
@@ -92,8 +92,8 @@ where
9292
let res = unescape_char_or_byte(&mut chars, mode);
9393
callback(0..(src.len() - chars.as_str().len()), res);
9494
}
95-
Str | ByteStr => unescape_str_common(src, mode, callback),
96-
RawStr | RawByteStr => unescape_raw_str_or_raw_byte_str(src, mode, callback),
95+
Str | ByteStr => unescape_non_raw_common(src, mode, callback),
96+
RawStr | RawByteStr => check_raw_common(src, mode, callback),
9797
CStr | RawCStr => unreachable!(),
9898
}
9999
}
@@ -122,12 +122,10 @@ where
122122
{
123123
match mode {
124124
CStr => {
125-
unescape_str_common(src, mode, callback);
125+
unescape_non_raw_common(src, mode, callback);
126126
}
127127
RawCStr => {
128-
unescape_raw_str_or_raw_byte_str(src, mode, &mut |r, result| {
129-
callback(r, result.map(CStrUnit::Char))
130-
});
128+
check_raw_common(src, mode, &mut |r, result| callback(r, result.map(CStrUnit::Char)));
131129
}
132130
Char | Byte | Str | RawStr | ByteStr | RawByteStr => unreachable!(),
133131
}
@@ -191,8 +189,9 @@ impl Mode {
191189
/// Byte literals do not allow unicode escape.
192190
fn is_unicode_escape_disallowed(self) -> bool {
193191
match self {
194-
Byte | ByteStr | RawByteStr => true,
195-
Char | Str | RawStr | CStr | RawCStr => false,
192+
Byte | ByteStr => true,
193+
Char | Str | CStr => false,
194+
RawByteStr | RawStr | RawCStr => unreachable!(),
196195
}
197196
}
198197

@@ -324,7 +323,7 @@ fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, Esca
324323

325324
/// Takes the contents of a string literal (without quotes) and produces a
326325
/// sequence of unescaped characters or errors.
327-
fn unescape_str_common<F, T: From<u8> + From<char>>(src: &str, mode: Mode, callback: &mut F)
326+
fn unescape_non_raw_common<F, T: From<u8> + From<char>>(src: &str, mode: Mode, callback: &mut F)
328327
where
329328
F: FnMut(Range<usize>, Result<T, EscapeError>),
330329
{
@@ -391,15 +390,15 @@ where
391390
/// sequence of characters or errors.
392391
/// NOTE: Raw strings do not perform any explicit character escaping, here we
393392
/// only produce errors on bare CR.
394-
fn unescape_raw_str_or_raw_byte_str<F>(src: &str, mode: Mode, callback: &mut F)
393+
fn check_raw_common<F>(src: &str, mode: Mode, callback: &mut F)
395394
where
396395
F: FnMut(Range<usize>, Result<char, EscapeError>),
397396
{
398397
let mut chars = src.chars();
399398
let chars_should_be_ascii = mode.chars_should_be_ascii(); // get this outside the loop
400399

401400
// The `start` and `end` computation here matches the one in
402-
// `unescape_str_common` for consistency, even though this function
401+
// `unescape_non_raw_common` for consistency, even though this function
403402
// doesn't have to worry about skipping any chars.
404403
while let Some(c) = chars.next() {
405404
let start = src.len() - chars.as_str().len() - c.len_utf8();

compiler/rustc_parse/src/lexer/unescape_error_reporting.rs

+42-33
Original file line numberDiff line numberDiff line change
@@ -11,32 +11,34 @@ use crate::errors::{MoreThanOneCharNote, MoreThanOneCharSugg, NoBraceUnicodeSub,
1111

1212
pub(crate) fn emit_unescape_error(
1313
handler: &Handler,
14-
// interior part of the literal, without quotes
14+
// interior part of the literal, between quotes
1515
lit: &str,
16-
// full span of the literal, including quotes
17-
span_with_quotes: Span,
18-
// interior span of the literal, without quotes
19-
span: Span,
16+
// full span of the literal, including quotes and any prefix
17+
full_lit_span: Span,
18+
// span of the error part of the literal
19+
err_span: Span,
2020
mode: Mode,
2121
// range of the error inside `lit`
2222
range: Range<usize>,
2323
error: EscapeError,
2424
) {
2525
debug!(
2626
"emit_unescape_error: {:?}, {:?}, {:?}, {:?}, {:?}",
27-
lit, span_with_quotes, mode, range, error
27+
lit, full_lit_span, mode, range, error
2828
);
2929
let last_char = || {
3030
let c = lit[range.clone()].chars().next_back().unwrap();
31-
let span = span.with_lo(span.hi() - BytePos(c.len_utf8() as u32));
31+
let span = err_span.with_lo(err_span.hi() - BytePos(c.len_utf8() as u32));
3232
(c, span)
3333
};
3434
match error {
3535
EscapeError::LoneSurrogateUnicodeEscape => {
36-
handler.emit_err(UnescapeError::InvalidUnicodeEscape { span, surrogate: true });
36+
handler
37+
.emit_err(UnescapeError::InvalidUnicodeEscape { span: err_span, surrogate: true });
3738
}
3839
EscapeError::OutOfRangeUnicodeEscape => {
39-
handler.emit_err(UnescapeError::InvalidUnicodeEscape { span, surrogate: false });
40+
handler
41+
.emit_err(UnescapeError::InvalidUnicodeEscape { span: err_span, surrogate: false });
4042
}
4143
EscapeError::MoreThanOneChar => {
4244
use unicode_normalization::{char::is_combining_mark, UnicodeNormalization};
@@ -49,12 +51,16 @@ pub(crate) fn emit_unescape_error(
4951
let normalized = lit.nfc().to_string();
5052
if normalized.chars().count() == 1 {
5153
let ch = normalized.chars().next().unwrap().escape_default().to_string();
52-
sugg = Some(MoreThanOneCharSugg::NormalizedForm { span, ch, normalized });
54+
sugg = Some(MoreThanOneCharSugg::NormalizedForm {
55+
span: err_span,
56+
ch,
57+
normalized,
58+
});
5359
}
5460
let escaped_marks =
5561
rest.iter().map(|c| c.escape_default().to_string()).collect::<Vec<_>>();
5662
note = Some(MoreThanOneCharNote::AllCombining {
57-
span,
63+
span: err_span,
5864
chr: format!("{first}"),
5965
len: escaped_marks.len(),
6066
escaped_marks: escaped_marks.join(""),
@@ -69,10 +75,12 @@ pub(crate) fn emit_unescape_error(
6975
.collect();
7076

7177
if let &[ch] = printable.as_slice() {
72-
sugg =
73-
Some(MoreThanOneCharSugg::RemoveNonPrinting { span, ch: ch.to_string() });
78+
sugg = Some(MoreThanOneCharSugg::RemoveNonPrinting {
79+
span: err_span,
80+
ch: ch.to_string(),
81+
});
7482
note = Some(MoreThanOneCharNote::NonPrinting {
75-
span,
83+
span: err_span,
7684
escaped: lit.escape_default().to_string(),
7785
});
7886
}
@@ -91,21 +99,21 @@ pub(crate) fn emit_unescape_error(
9199
}
92100
let sugg = format!("{prefix}\"{escaped}\"");
93101
MoreThanOneCharSugg::Quotes {
94-
span: span_with_quotes,
102+
span: full_lit_span,
95103
is_byte: mode == Mode::Byte,
96104
sugg,
97105
}
98106
});
99107
handler.emit_err(UnescapeError::MoreThanOneChar {
100-
span: span_with_quotes,
108+
span: full_lit_span,
101109
note,
102110
suggestion: sugg,
103111
});
104112
}
105113
EscapeError::EscapeOnlyChar => {
106114
let (c, char_span) = last_char();
107115
handler.emit_err(UnescapeError::EscapeOnlyChar {
108-
span,
116+
span: err_span,
109117
char_span,
110118
escaped_sugg: c.escape_default().to_string(),
111119
escaped_msg: escaped_char(c),
@@ -114,11 +122,11 @@ pub(crate) fn emit_unescape_error(
114122
}
115123
EscapeError::BareCarriageReturn => {
116124
let double_quotes = mode.in_double_quotes();
117-
handler.emit_err(UnescapeError::BareCr { span, double_quotes });
125+
handler.emit_err(UnescapeError::BareCr { span: err_span, double_quotes });
118126
}
119127
EscapeError::BareCarriageReturnInRawString => {
120128
assert!(mode.in_double_quotes());
121-
handler.emit_err(UnescapeError::BareCrRawString(span));
129+
handler.emit_err(UnescapeError::BareCrRawString(err_span));
122130
}
123131
EscapeError::InvalidEscape => {
124132
let (c, span) = last_char();
@@ -143,7 +151,7 @@ pub(crate) fn emit_unescape_error(
143151
} else {
144152
if mode == Mode::Str || mode == Mode::Char {
145153
diag.span_suggestion(
146-
span_with_quotes,
154+
full_lit_span,
147155
"if you meant to write a literal backslash (perhaps escaping in a regular expression), consider a raw string literal",
148156
format!("r\"{lit}\""),
149157
Applicability::MaybeIncorrect,
@@ -158,7 +166,7 @@ pub(crate) fn emit_unescape_error(
158166
diag.emit();
159167
}
160168
EscapeError::TooShortHexEscape => {
161-
handler.emit_err(UnescapeError::TooShortHexEscape(span));
169+
handler.emit_err(UnescapeError::TooShortHexEscape(err_span));
162170
}
163171
EscapeError::InvalidCharInHexEscape | EscapeError::InvalidCharInUnicodeEscape => {
164172
let (c, span) = last_char();
@@ -210,7 +218,7 @@ pub(crate) fn emit_unescape_error(
210218
err.emit();
211219
}
212220
EscapeError::OutOfRangeHexEscape => {
213-
handler.emit_err(UnescapeError::OutOfRangeHexEscape(span));
221+
handler.emit_err(UnescapeError::OutOfRangeHexEscape(err_span));
214222
}
215223
EscapeError::LeadingUnderscoreUnicodeEscape => {
216224
let (c, span) = last_char();
@@ -220,10 +228,11 @@ pub(crate) fn emit_unescape_error(
220228
});
221229
}
222230
EscapeError::OverlongUnicodeEscape => {
223-
handler.emit_err(UnescapeError::OverlongUnicodeEscape(span));
231+
handler.emit_err(UnescapeError::OverlongUnicodeEscape(err_span));
224232
}
225233
EscapeError::UnclosedUnicodeEscape => {
226-
handler.emit_err(UnescapeError::UnclosedUnicodeEscape(span, span.shrink_to_hi()));
234+
handler
235+
.emit_err(UnescapeError::UnclosedUnicodeEscape(err_span, err_span.shrink_to_hi()));
227236
}
228237
EscapeError::NoBraceInUnicodeEscape => {
229238
let mut suggestion = "\\u{".to_owned();
@@ -238,34 +247,34 @@ pub(crate) fn emit_unescape_error(
238247
let (label, sub) = if suggestion_len > 0 {
239248
suggestion.push('}');
240249
let hi = char_span.lo() + BytePos(suggestion_len as u32);
241-
(None, NoBraceUnicodeSub::Suggestion { span: span.with_hi(hi), suggestion })
250+
(None, NoBraceUnicodeSub::Suggestion { span: err_span.with_hi(hi), suggestion })
242251
} else {
243-
(Some(span), NoBraceUnicodeSub::Help)
252+
(Some(err_span), NoBraceUnicodeSub::Help)
244253
};
245-
handler.emit_err(UnescapeError::NoBraceInUnicodeEscape { span, label, sub });
254+
handler.emit_err(UnescapeError::NoBraceInUnicodeEscape { span: err_span, label, sub });
246255
}
247256
EscapeError::UnicodeEscapeInByte => {
248-
handler.emit_err(UnescapeError::UnicodeEscapeInByte(span));
257+
handler.emit_err(UnescapeError::UnicodeEscapeInByte(err_span));
249258
}
250259
EscapeError::EmptyUnicodeEscape => {
251-
handler.emit_err(UnescapeError::EmptyUnicodeEscape(span));
260+
handler.emit_err(UnescapeError::EmptyUnicodeEscape(err_span));
252261
}
253262
EscapeError::ZeroChars => {
254-
handler.emit_err(UnescapeError::ZeroChars(span));
263+
handler.emit_err(UnescapeError::ZeroChars(err_span));
255264
}
256265
EscapeError::LoneSlash => {
257-
handler.emit_err(UnescapeError::LoneSlash(span));
266+
handler.emit_err(UnescapeError::LoneSlash(err_span));
258267
}
259268
EscapeError::UnskippedWhitespaceWarning => {
260269
let (c, char_span) = last_char();
261270
handler.emit_warning(UnescapeError::UnskippedWhitespace {
262-
span,
271+
span: err_span,
263272
ch: escaped_char(c),
264273
char_span,
265274
});
266275
}
267276
EscapeError::MultipleSkippedLinesWarning => {
268-
handler.emit_warning(UnescapeError::MultipleSkippedLinesWarning(span));
277+
handler.emit_warning(UnescapeError::MultipleSkippedLinesWarning(err_span));
269278
}
270279
}
271280
}

0 commit comments

Comments
 (0)