Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Point at invalid utf-8 span on user's source code #135557

Merged
merged 1 commit into from
Jan 23, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions compiler/rustc_builtin_macros/src/lib.rs
Original file line number Diff line number Diff line change
@@ -16,6 +16,7 @@
#![feature(proc_macro_internals)]
#![feature(proc_macro_quote)]
#![feature(rustdoc_internals)]
#![feature(string_from_utf8_lossy_owned)]
#![feature(try_blocks)]
#![warn(unreachable_pub)]
// tidy-alphabetical-end
11 changes: 6 additions & 5 deletions compiler/rustc_builtin_macros/src/source_util.rs
Original file line number Diff line number Diff line change
@@ -13,7 +13,7 @@ use rustc_expand::base::{
use rustc_expand::module::DirOwnership;
use rustc_lint_defs::BuiltinLintDiag;
use rustc_parse::parser::{ForceCollect, Parser};
use rustc_parse::{new_parser_from_file, unwrap_or_emit_fatal};
use rustc_parse::{new_parser_from_file, unwrap_or_emit_fatal, utf8_error};
use rustc_session::lint::builtin::INCOMPLETE_INCLUDE;
use rustc_span::source_map::SourceMap;
use rustc_span::{Pos, Span, Symbol};
@@ -209,9 +209,10 @@ pub(crate) fn expand_include_str(
let interned_src = Symbol::intern(src);
MacEager::expr(cx.expr_str(cx.with_def_site_ctxt(bsp), interned_src))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe worth looking into unifying the logic here and the rustc_parse logic somehow?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I moved the logic to a separate function and called it from both places.

}
Err(_) => {
let guar = cx.dcx().span_err(sp, format!("`{path}` wasn't a utf-8 file"));
DummyResult::any(sp, guar)
Err(utf8err) => {
let mut err = cx.dcx().struct_span_err(sp, format!("`{path}` wasn't a utf-8 file"));
utf8_error(cx.source_map(), path.as_str(), None, &mut err, utf8err, &bytes[..]);
DummyResult::any(sp, err.emit())
}
},
Err(dummy) => dummy,
@@ -273,7 +274,7 @@ fn load_binary_file(
.and_then(|path| path.into_os_string().into_string().ok());

if let Some(new_path) = new_path {
err.span_suggestion(
err.span_suggestion_verbose(
path_span,
"there is a file with the same name in a different directory",
format!("\"{}\"", new_path.replace('\\', "/").escape_debug()),
67 changes: 63 additions & 4 deletions compiler/rustc_parse/src/lib.rs
Original file line number Diff line number Diff line change
@@ -11,18 +11,21 @@
#![feature(if_let_guard)]
#![feature(iter_intersperse)]
#![feature(let_chains)]
#![feature(string_from_utf8_lossy_owned)]
#![warn(unreachable_pub)]
// tidy-alphabetical-end

use std::path::Path;
use std::path::{Path, PathBuf};
use std::str::Utf8Error;

use rustc_ast as ast;
use rustc_ast::tokenstream::TokenStream;
use rustc_ast::{AttrItem, Attribute, MetaItemInner, token};
use rustc_ast_pretty::pprust;
use rustc_data_structures::sync::Lrc;
use rustc_errors::{Diag, FatalError, PResult};
use rustc_errors::{Diag, EmissionGuarantee, FatalError, PResult, pluralize};
use rustc_session::parse::ParseSess;
use rustc_span::source_map::SourceMap;
use rustc_span::{FileName, SourceFile, Span};
pub use unicode_normalization::UNICODE_VERSION as UNICODE_NORMALIZATION_VERSION;

@@ -73,9 +76,22 @@ pub fn new_parser_from_file<'a>(
path: &Path,
sp: Option<Span>,
) -> Result<Parser<'a>, Vec<Diag<'a>>> {
let source_file = psess.source_map().load_file(path).unwrap_or_else(|e| {
let msg = format!("couldn't read {}: {}", path.display(), e);
let sm = psess.source_map();
let source_file = sm.load_file(path).unwrap_or_else(|e| {
let msg = format!("couldn't read `{}`: {}", path.display(), e);
let mut err = psess.dcx().struct_fatal(msg);
if let Ok(contents) = std::fs::read(path)
&& let Err(utf8err) = String::from_utf8(contents.clone())
{
utf8_error(
sm,
&path.display().to_string(),
sp,
&mut err,
utf8err.utf8_error(),
&contents,
);
}
if let Some(sp) = sp {
err.span(sp);
}
@@ -84,6 +100,49 @@ pub fn new_parser_from_file<'a>(
new_parser_from_source_file(psess, source_file)
}

pub fn utf8_error<E: EmissionGuarantee>(
sm: &SourceMap,
path: &str,
sp: Option<Span>,
err: &mut Diag<'_, E>,
utf8err: Utf8Error,
contents: &[u8],
) {
// The file exists, but it wasn't valid UTF-8.
let start = utf8err.valid_up_to();
let note = format!("invalid utf-8 at byte `{start}`");
let msg = if let Some(len) = utf8err.error_len() {
format!(
"byte{s} `{bytes}` {are} not valid utf-8",
bytes = if len == 1 {
format!("{:?}", contents[start])
} else {
format!("{:?}", &contents[start..start + len])
},
s = pluralize!(len),
are = if len == 1 { "is" } else { "are" },
)
} else {
note.clone()
};
let contents = String::from_utf8_lossy(contents).to_string();
let source = sm.new_source_file(PathBuf::from(path).into(), contents);
let span = Span::with_root_ctxt(
source.normalized_byte_pos(start as u32),
source.normalized_byte_pos(start as u32),
);
if span.is_dummy() {
err.note(note);
} else {
if sp.is_some() {
err.span_note(span, msg);
} else {
err.span(span);
err.span_label(span, msg);
}
}
}

/// Given a session and a `source_file`, return a parser. Returns any buffered errors from lexing
/// the initial token stream.
fn new_parser_from_source_file(
2 changes: 2 additions & 0 deletions src/tools/compiletest/src/errors.rs
Original file line number Diff line number Diff line change
@@ -101,6 +101,8 @@ pub fn load_errors(testfile: &Path, revision: Option<&str>) -> Vec<Error> {

rdr.lines()
.enumerate()
// We want to ignore utf-8 failures in tests during collection of annotations.
.filter(|(_, line)| line.is_ok())
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This...

.filter_map(|(line_num, line)| {
parse_expected(last_nonfollow_error, line_num + 1, &line.unwrap(), revision).map(
|(which, error)| {
Original file line number Diff line number Diff line change
@@ -58,7 +58,7 @@ pub fn check(tests_path: impl AsRef<Path>, bad: &mut bool) {

let mut expected_revisions = BTreeSet::new();

let contents = std::fs::read_to_string(test).unwrap();
let Ok(contents) = std::fs::read_to_string(test) else { continue };
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

...this, and another line in md-doc blow up if a test has non-utf-8 bytes.

I ended up removing a test of ".rs file with invalid few utf-8 bytes" because md-docs is in a separate repo. I feel we should make the entire test suite more resilient to these, but I think I can make that test I added and removed be in run-make...


// Collect directives.
iter_header(&contents, &mut |HeaderLine { revision, directive, .. }| {
2 changes: 1 addition & 1 deletion tests/ui/macros/not-utf8.rs
Original file line number Diff line number Diff line change
@@ -3,5 +3,5 @@
//@ reference: input.encoding.invalid

fn foo() {
include!("not-utf8.bin")
include!("not-utf8.bin");
}
9 changes: 7 additions & 2 deletions tests/ui/macros/not-utf8.stderr
Original file line number Diff line number Diff line change
@@ -1,9 +1,14 @@
error: couldn't read $DIR/not-utf8.bin: stream did not contain valid UTF-8
error: couldn't read `$DIR/not-utf8.bin`: stream did not contain valid UTF-8
--> $DIR/not-utf8.rs:6:5
|
LL | include!("not-utf8.bin")
LL | include!("not-utf8.bin");
| ^^^^^^^^^^^^^^^^^^^^^^^^
|
note: byte `193` is not valid utf-8
--> $DIR/not-utf8.bin:1:1
|
LL | �|�␂!5�cc␕␂�Ӻi��WWj�ȥ�'�}�␒�J�ȉ��W�␞O�@����␜w�V���LO����␔[ ␃_�'���SQ�~ذ��ų&��- ��lN~��!@␌ _#���kQ��h�␝�:�...
| ^
= note: this error originates in the macro `include` (in Nightly builds, run with -Z macro-backtrace for more info)

error: aborting due to 1 previous error
2 changes: 1 addition & 1 deletion tests/ui/modules/path-no-file-name.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
//@ normalize-stderr: "\.:.*\(" -> ".: $$ACCESS_DENIED_MSG ("
//@ normalize-stderr: "\.`:.*\(" -> ".`: $$ACCESS_DENIED_MSG ("
//@ normalize-stderr: "os error \d+" -> "os error $$ACCESS_DENIED_CODE"

#[path = "."]
2 changes: 1 addition & 1 deletion tests/ui/modules/path-no-file-name.stderr
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
error: couldn't read $DIR/.: $ACCESS_DENIED_MSG (os error $ACCESS_DENIED_CODE)
error: couldn't read `$DIR/.`: $ACCESS_DENIED_MSG (os error $ACCESS_DENIED_CODE)
--> $DIR/path-no-file-name.rs:5:1
|
LL | mod m;
2 changes: 1 addition & 1 deletion tests/ui/parser/issues/issue-5806.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
//@ normalize-stderr: "parser:.*\(" -> "parser: $$ACCESS_DENIED_MSG ("
//@ normalize-stderr: "parser`:.*\(" -> "parser`: $$ACCESS_DENIED_MSG ("
//@ normalize-stderr: "os error \d+" -> "os error $$ACCESS_DENIED_CODE"

#[path = "../parser"]
2 changes: 1 addition & 1 deletion tests/ui/parser/issues/issue-5806.stderr
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
error: couldn't read $DIR/../parser: $ACCESS_DENIED_MSG (os error $ACCESS_DENIED_CODE)
error: couldn't read `$DIR/../parser`: $ACCESS_DENIED_MSG (os error $ACCESS_DENIED_CODE)
--> $DIR/issue-5806.rs:5:1
|
LL | mod foo;
2 changes: 1 addition & 1 deletion tests/ui/parser/mod_file_with_path_attr.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
//@ normalize-stderr: "not_a_real_file.rs:.*\(" -> "not_a_real_file.rs: $$FILE_NOT_FOUND_MSG ("
//@ normalize-stderr: "not_a_real_file.rs`:.*\(" -> "not_a_real_file.rs`: $$FILE_NOT_FOUND_MSG ("

#[path = "not_a_real_file.rs"]
mod m; //~ ERROR not_a_real_file.rs
2 changes: 1 addition & 1 deletion tests/ui/parser/mod_file_with_path_attr.stderr
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
error: couldn't read $DIR/not_a_real_file.rs: $FILE_NOT_FOUND_MSG (os error 2)
error: couldn't read `$DIR/not_a_real_file.rs`: $FILE_NOT_FOUND_MSG (os error 2)
--> $DIR/mod_file_with_path_attr.rs:4:1
|
LL | mod m;
2 changes: 1 addition & 1 deletion tests/ui/unpretty/staged-api-invalid-path-108697.stderr
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
error: couldn't read $DIR/lol: No such file or directory (os error 2)
error: couldn't read `$DIR/lol`: No such file or directory (os error 2)
--> $DIR/staged-api-invalid-path-108697.rs:8:1
|
LL | mod foo;