Normalize newlines when loading files #62948

Merged · 2 commits · Aug 18, 2019
2 changes: 0 additions & 2 deletions src/librustc_lexer/src/lib.rs
@@ -352,7 +352,6 @@ impl Cursor<'_> {
loop {
match self.nth_char(0) {
'\n' => break,
'\r' if self.nth_char(1) == '\n' => break,
EOF_CHAR if self.is_eof() => break,
_ => {
self.bump();
@@ -525,7 +524,6 @@ impl Cursor<'_> {
match self.nth_char(0) {
'/' if !first => break,
'\n' if self.nth_char(1) != '\'' => break,
'\r' if self.nth_char(1) == '\n' => break,
EOF_CHAR if self.is_eof() => break,
'\'' => {
self.bump();
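With newlines normalized to `\n` when the source file is loaded (see the `libsyntax_pos` change below), the lexer's end-of-line checks no longer need a `\r\n` arm. A minimal standalone sketch of the invariant this relies on (illustrative only, not the compiler's code; names are made up):

```rust
// Sketch: once the loader has collapsed "\r\n" into "\n", scanning for the end
// of a line-level construct (line comment, lifetime, ...) only needs to look
// for '\n'. Assumes already-normalized input.
fn first_line(src: &str) -> &str {
    match src.find('\n') {
        Some(idx) => &src[..idx],
        None => src,
    }
}

fn main() {
    assert_eq!(first_line("// comment\nlet x = 1;"), "// comment");
    assert_eq!(first_line("no newline"), "no newline");
}
```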
36 changes: 8 additions & 28 deletions src/librustc_lexer/src/unescape.rs
@@ -128,11 +128,7 @@ fn scan_escape(first_char: char, chars: &mut Chars<'_>, mode: Mode) -> Result<ch
if first_char != '\\' {
return match first_char {
'\t' | '\n' => Err(EscapeError::EscapeOnlyChar),
'\r' => Err(if chars.clone().next() == Some('\n') {
EscapeError::EscapeOnlyChar
} else {
EscapeError::BareCarriageReturn
}),
'\r' => Err(EscapeError::BareCarriageReturn),
'\'' if mode.in_single_quotes() => Err(EscapeError::EscapeOnlyChar),
'"' if mode.in_double_quotes() => Err(EscapeError::EscapeOnlyChar),
_ => {
@@ -244,27 +240,15 @@ where

let unescaped_char = match first_char {
'\\' => {
let (second_char, third_char) = {
let mut chars = chars.clone();
(chars.next(), chars.next())
};
match (second_char, third_char) {
(Some('\n'), _) | (Some('\r'), Some('\n')) => {
let second_char = chars.clone().next();
match second_char {
Some('\n') => {
skip_ascii_whitespace(&mut chars);
continue;
}
_ => scan_escape(first_char, &mut chars, mode),
}
}
'\r' => {
let second_char = chars.clone().next();
if second_char == Some('\n') {
chars.next();
Ok('\n')
} else {
scan_escape(first_char, &mut chars, mode)
}
}
'\n' => Ok('\n'),
'\t' => Ok('\t'),
_ => scan_escape(first_char, &mut chars, mode),
@@ -298,15 +282,11 @@ where
while let Some(curr) = chars.next() {
let start = initial_len - chars.as_str().len() - curr.len_utf8();

let result = match (curr, chars.clone().next()) {
('\r', Some('\n')) => {
chars.next();
Ok('\n')
},
('\r', _) => Err(EscapeError::BareCarriageReturnInRawString),
(c, _) if mode.is_bytes() && !c.is_ascii() =>
let result = match curr {
'\r' => Err(EscapeError::BareCarriageReturnInRawString),
c if mode.is_bytes() && !c.is_ascii() =>
Err(EscapeError::NonAsciiCharInByteString),
(c, _) => Ok(c),
c => Ok(c),
};
let end = initial_len - chars.as_str().len();

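The `\` + `\r\n` and literal `\r\n` cases disappear from the unescaper because normalized input can only contain `\n`; only the `\n` line-continuation path remains. A small illustration of the surviving behaviour, written as ordinary Rust rather than through the unescape API:

```rust
// A backslash immediately before a newline in a string literal elides the
// newline and the following leading whitespace, which is the behaviour the
// simplified skip_ascii_whitespace path implements.
fn main() {
    let s = "hello \
             world";
    assert_eq!(s, "hello world");
}
```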
11 changes: 3 additions & 8 deletions src/librustc_lexer/src/unescape/tests.rs
@@ -11,7 +11,6 @@ fn test_unescape_char_bad() {
check(r"\", EscapeError::LoneSlash);

check("\n", EscapeError::EscapeOnlyChar);
check("\r\n", EscapeError::EscapeOnlyChar);
check("\t", EscapeError::EscapeOnlyChar);
check("'", EscapeError::EscapeOnlyChar);
check("\r", EscapeError::BareCarriageReturn);
@@ -31,6 +30,7 @@ fn test_unescape_char_bad() {
check(r"\v", EscapeError::InvalidEscape);
check(r"\💩", EscapeError::InvalidEscape);
check(r"\●", EscapeError::InvalidEscape);
check("\\\r", EscapeError::InvalidEscape);

check(r"\x", EscapeError::TooShortHexEscape);
check(r"\x0", EscapeError::TooShortHexEscape);
@@ -116,10 +116,9 @@ fn test_unescape_str_good() {

check("foo", "foo");
check("", "");
check(" \t\n\r\n", " \t\n\n");
check(" \t\n", " \t\n");

check("hello \\\n world", "hello world");
check("hello \\\r\n world", "hello world");
check("thread's", "thread's")
}

@@ -134,7 +133,6 @@ fn test_unescape_byte_bad() {
check(r"\", EscapeError::LoneSlash);

check("\n", EscapeError::EscapeOnlyChar);
check("\r\n", EscapeError::EscapeOnlyChar);
check("\t", EscapeError::EscapeOnlyChar);
check("'", EscapeError::EscapeOnlyChar);
check("\r", EscapeError::BareCarriageReturn);
@@ -238,10 +236,9 @@ fn test_unescape_byte_str_good() {

check("foo", b"foo");
check("", b"");
check(" \t\n\r\n", b" \t\n\n");
check(" \t\n", b" \t\n");

check("hello \\\n world", b"hello world");
check("hello \\\r\n world", b"hello world");
check("thread's", b"thread's")
}

@@ -253,7 +250,6 @@ fn test_unescape_raw_str() {
assert_eq!(unescaped, expected);
}

check("\r\n", &[(0..2, Ok('\n'))]);
check("\r", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString))]);
check("\rx", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString)), (1..2, Ok('x'))]);
}
@@ -266,7 +262,6 @@ fn test_unescape_raw_byte_str() {
assert_eq!(unescaped, expected);
}

check("\r\n", &[(0..2, Ok(byte_from_char('\n')))]);
check("\r", &[(0..1, Err(EscapeError::BareCarriageReturnInRawString))]);
check("🦀", &[(0..4, Err(EscapeError::NonAsciiCharInByteString))]);
check(
81 changes: 15 additions & 66 deletions src/libsyntax/parse/lexer/mod.rs
@@ -8,9 +8,7 @@ use syntax_pos::{BytePos, Pos, Span, NO_EXPANSION};
use rustc_lexer::Base;
use rustc_lexer::unescape;

use std::borrow::Cow;
use std::char;
use std::iter;
use std::convert::TryInto;
use rustc_data_structures::sync::Lrc;
use log::debug;
@@ -181,18 +179,7 @@ impl<'a> StringReader<'a> {
let string = self.str_from(start);
// comments with only more "/"s are not doc comments
let tok = if is_doc_comment(string) {
let mut idx = 0;
loop {
idx = match string[idx..].find('\r') {
None => break,
Some(it) => idx + it + 1
};
if string[idx..].chars().next() != Some('\n') {
self.err_span_(start + BytePos(idx as u32 - 1),
start + BytePos(idx as u32),
"bare CR not allowed in doc-comment");
}
}
self.forbid_bare_cr(start, string, "bare CR not allowed in doc-comment");
token::DocComment(Symbol::intern(string))
} else {
token::Comment
@@ -217,15 +204,10 @@ impl<'a> StringReader<'a> {
}

let tok = if is_doc_comment {
let has_cr = string.contains('\r');
let string = if has_cr {
self.translate_crlf(start,
string,
"bare CR not allowed in block doc-comment")
} else {
string.into()
};
token::DocComment(Symbol::intern(&string[..]))
self.forbid_bare_cr(start,
string,
"bare CR not allowed in block doc-comment");
token::DocComment(Symbol::intern(string))
} else {
token::Comment
};
@@ -516,49 +498,16 @@ impl<'a> StringReader<'a> {
&self.src[self.src_index(start)..self.src_index(end)]
}

/// Converts CRLF to LF in the given string, raising an error on bare CR.
fn translate_crlf<'b>(&self, start: BytePos, s: &'b str, errmsg: &'b str) -> Cow<'b, str> {
let mut chars = s.char_indices().peekable();
while let Some((i, ch)) = chars.next() {
if ch == '\r' {
if let Some((lf_idx, '\n')) = chars.peek() {
return translate_crlf_(self, start, s, *lf_idx, chars, errmsg).into();
}
let pos = start + BytePos(i as u32);
let end_pos = start + BytePos((i + ch.len_utf8()) as u32);
self.err_span_(pos, end_pos, errmsg);
}
}
return s.into();

fn translate_crlf_(rdr: &StringReader<'_>,
start: BytePos,
s: &str,
mut j: usize,
mut chars: iter::Peekable<impl Iterator<Item = (usize, char)>>,
errmsg: &str)
-> String {
let mut buf = String::with_capacity(s.len());
// Skip first CR
buf.push_str(&s[.. j - 1]);
while let Some((i, ch)) = chars.next() {
if ch == '\r' {
if j < i {
buf.push_str(&s[j..i]);
}
let next = i + ch.len_utf8();
j = next;
if chars.peek().map(|(_, ch)| *ch) != Some('\n') {
let pos = start + BytePos(i as u32);
let end_pos = start + BytePos(next as u32);
rdr.err_span_(pos, end_pos, errmsg);
}
}
}
if j < s.len() {
buf.push_str(&s[j..]);
}
buf
fn forbid_bare_cr(&self, start: BytePos, s: &str, errmsg: &str) {
let mut idx = 0;
loop {
idx = match s[idx..].find('\r') {
None => break,
Some(it) => idx + it + 1
};
self.err_span_(start + BytePos(idx as u32 - 1),
start + BytePos(idx as u32),
errmsg);
}
}

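`forbid_bare_cr` replaces `translate_crlf`: doc comments no longer need their text rewritten, only a check that no lone `\r` survived load-time normalization. A standalone sketch of that check, returning offsets instead of emitting span errors (function name and return type are made up for illustration):

```rust
// Collect the byte offsets of any bare '\r' in a doc comment. After load-time
// normalization, every remaining '\r' is necessarily a lone CR.
fn bare_cr_offsets(s: &str) -> Vec<usize> {
    s.char_indices()
        .filter(|&(_, c)| c == '\r')
        .map(|(i, _)| i)
        .collect()
}

fn main() {
    assert_eq!(bare_cr_offsets("/// fine\n"), Vec::<usize>::new());
    assert_eq!(bare_cr_offsets("/// bad\rcr"), vec![7]);
}
```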
56 changes: 56 additions & 0 deletions src/libsyntax_pos/lib.rs
@@ -1045,6 +1045,7 @@ impl SourceFile {
mut src: String,
start_pos: BytePos) -> Result<SourceFile, OffsetOverflowError> {
remove_bom(&mut src);
normalize_newlines(&mut src);

let src_hash = {
let mut hasher: StableHasher<u128> = StableHasher::new();
@@ -1212,6 +1213,61 @@ fn remove_bom(src: &mut String) {
}
}


/// Replaces `\r\n` with `\n` in-place in `src`.
///
/// A lone `\r` (one not followed by `\n`) is left in place.
fn normalize_newlines(src: &mut String) {
if !src.as_bytes().contains(&b'\r') {
return;
}

// We replace `\r\n` with `\n` in-place, which doesn't break utf-8 encoding.
// While we *can* call `as_mut_vec` and do surgery on the live string
// directly, let's rather steal the contents of `src`. This makes the code
// safe even if a panic occurs.

let mut buf = std::mem::replace(src, String::new()).into_bytes();
let mut gap_len = 0;
let mut tail = buf.as_mut_slice();
loop {
let idx = match find_crlf(&tail[gap_len..]) {
None => tail.len(),
Some(idx) => idx + gap_len,
};
tail.copy_within(gap_len..idx, 0);
tail = &mut tail[idx - gap_len..];
if tail.len() == gap_len {
break;
}
gap_len += 1;
}

// Account for removed `\r`.
// After `set_len`, `buf` is guaranteed to contain utf-8 again.
let new_len = buf.len() - gap_len;
unsafe {
buf.set_len(new_len);
*src = String::from_utf8_unchecked(buf);
}

fn find_crlf(src: &[u8]) -> Option<usize> {
let mut search_idx = 0;
while let Some(idx) = find_cr(&src[search_idx..]) {
if src[search_idx..].get(idx + 1) != Some(&b'\n') {
search_idx += idx + 1;
continue;
}
return Some(search_idx + idx);
}
None
}

fn find_cr(src: &[u8]) -> Option<usize> {
src.iter().position(|&b| b == b'\r')
}
}

// _____________________________________________________________________________
// Pos, BytePos, CharPos
//
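For reference, a conceptually equivalent but allocating version of `normalize_newlines`: the in-place gap-copy above produces the same result while avoiding a second buffer and skipping all work when the text contains no `\r`. (Sketch for illustration; `normalize_newlines_naive` is not part of the change.)

```rust
// str::replace only rewrites exact "\r\n" pairs, so a lone '\r' is preserved,
// matching the behaviour exercised by test_normalize_newlines below.
fn normalize_newlines_naive(src: &str) -> String {
    src.replace("\r\n", "\n")
}

fn main() {
    assert_eq!(normalize_newlines_naive("hello\r\nworld"), "hello\nworld");
    assert_eq!(normalize_newlines_naive("lone\rcr"), "lone\rcr");
    assert_eq!(normalize_newlines_naive("\r\r\n"), "\r\n");
}
```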
20 changes: 20 additions & 0 deletions src/libsyntax_pos/tests.rs
@@ -16,3 +16,23 @@ fn test_lookup_line() {
assert_eq!(lookup_line(lines, BytePos(28)), 2);
assert_eq!(lookup_line(lines, BytePos(29)), 2);
}

#[test]
fn test_normalize_newlines() {
fn check(before: &str, after: &str) {
let mut actual = before.to_string();
normalize_newlines(&mut actual);
assert_eq!(actual.as_str(), after);
}
check("", "");
check("\n", "\n");
check("\r", "\r");
check("\r\r", "\r\r");
check("\r\n", "\n");
check("hello world", "hello world");
check("hello\nworld", "hello\nworld");
check("hello\r\nworld", "hello\nworld");
check("\r\nhello\r\nworld\r\n", "\nhello\nworld\n");
check("\r\r\n", "\r\n");
check("hello\rworld", "hello\rworld");
}