From 57dd42d6134539f5a98f59039bcba6d93daf9d6a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Esteban=20K=C3=BCber?= Date: Wed, 15 Jan 2025 21:24:31 +0000 Subject: [PATCH] Point at invalid utf-8 span on user's source code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ``` error: couldn't read `$DIR/not-utf8-bin-file.rs`: stream did not contain valid UTF-8 --> $DIR/not-utf8-2.rs:6:5 | LL | include!("not-utf8-bin-file.rs"); | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | note: `[193]` is not valid utf-8 --> $DIR/not-utf8-bin-file.rs:2:14 | LL | let _ = "�|�␂!5�cc␕␂��"; | ^ = note: this error originates in the macro `include` (in Nightly builds, run with -Z macro-backtrace for more info) ``` When we attempt to load a Rust source code file, if there is a OS file failure we try reading the file as bytes. If that succeeds we try to turn it into UTF-8. If *that* fails, we provide additional context about *where* the file has the first invalid UTF-8 character. Fix #76869. --- compiler/rustc_builtin_macros/src/lib.rs | 1 + .../rustc_builtin_macros/src/source_util.rs | 11 +-- compiler/rustc_parse/src/lib.rs | 67 +++++++++++++++++-- src/tools/compiletest/src/errors.rs | 2 + .../tests_revision_unpaired_stdout_stderr.rs | 2 +- tests/ui/macros/not-utf8.rs | 2 +- tests/ui/macros/not-utf8.stderr | 9 ++- tests/ui/modules/path-no-file-name.rs | 2 +- tests/ui/modules/path-no-file-name.stderr | 2 +- tests/ui/parser/issues/issue-5806.rs | 2 +- tests/ui/parser/issues/issue-5806.stderr | 2 +- tests/ui/parser/mod_file_with_path_attr.rs | 2 +- .../ui/parser/mod_file_with_path_attr.stderr | 2 +- .../staged-api-invalid-path-108697.stderr | 2 +- 14 files changed, 88 insertions(+), 20 deletions(-) diff --git a/compiler/rustc_builtin_macros/src/lib.rs b/compiler/rustc_builtin_macros/src/lib.rs index 6071d36f8ebbc..0918403b8555a 100644 --- a/compiler/rustc_builtin_macros/src/lib.rs +++ b/compiler/rustc_builtin_macros/src/lib.rs @@ -16,6 +16,7 @@ #![feature(proc_macro_internals)] #![feature(proc_macro_quote)] #![feature(rustdoc_internals)] +#![feature(string_from_utf8_lossy_owned)] #![feature(try_blocks)] #![warn(unreachable_pub)] // tidy-alphabetical-end diff --git a/compiler/rustc_builtin_macros/src/source_util.rs b/compiler/rustc_builtin_macros/src/source_util.rs index 123b96f6bcaaa..d163da3ddea3b 100644 --- a/compiler/rustc_builtin_macros/src/source_util.rs +++ b/compiler/rustc_builtin_macros/src/source_util.rs @@ -13,7 +13,7 @@ use rustc_expand::base::{ use rustc_expand::module::DirOwnership; use rustc_lint_defs::BuiltinLintDiag; use rustc_parse::parser::{ForceCollect, Parser}; -use rustc_parse::{new_parser_from_file, unwrap_or_emit_fatal}; +use rustc_parse::{new_parser_from_file, unwrap_or_emit_fatal, utf8_error}; use rustc_session::lint::builtin::INCOMPLETE_INCLUDE; use rustc_span::source_map::SourceMap; use rustc_span::{Pos, Span, Symbol}; @@ -209,9 +209,10 @@ pub(crate) fn expand_include_str( let interned_src = Symbol::intern(src); MacEager::expr(cx.expr_str(cx.with_def_site_ctxt(bsp), interned_src)) } - Err(_) => { - let guar = cx.dcx().span_err(sp, format!("`{path}` wasn't a utf-8 file")); - DummyResult::any(sp, guar) + Err(utf8err) => { + let mut err = cx.dcx().struct_span_err(sp, format!("`{path}` wasn't a utf-8 file")); + utf8_error(cx.source_map(), path.as_str(), None, &mut err, utf8err, &bytes[..]); + DummyResult::any(sp, err.emit()) } }, Err(dummy) => dummy, @@ -273,7 +274,7 @@ fn load_binary_file( .and_then(|path| path.into_os_string().into_string().ok()); if let Some(new_path) = new_path { - err.span_suggestion( + err.span_suggestion_verbose( path_span, "there is a file with the same name in a different directory", format!("\"{}\"", new_path.replace('\\', "/").escape_debug()), diff --git a/compiler/rustc_parse/src/lib.rs b/compiler/rustc_parse/src/lib.rs index f963a424a7fd6..e681987ff07f7 100644 --- a/compiler/rustc_parse/src/lib.rs +++ b/compiler/rustc_parse/src/lib.rs @@ -11,18 +11,21 @@ #![feature(if_let_guard)] #![feature(iter_intersperse)] #![feature(let_chains)] +#![feature(string_from_utf8_lossy_owned)] #![warn(unreachable_pub)] // tidy-alphabetical-end -use std::path::Path; +use std::path::{Path, PathBuf}; +use std::str::Utf8Error; use rustc_ast as ast; use rustc_ast::tokenstream::TokenStream; use rustc_ast::{AttrItem, Attribute, MetaItemInner, token}; use rustc_ast_pretty::pprust; use rustc_data_structures::sync::Lrc; -use rustc_errors::{Diag, FatalError, PResult}; +use rustc_errors::{Diag, EmissionGuarantee, FatalError, PResult, pluralize}; use rustc_session::parse::ParseSess; +use rustc_span::source_map::SourceMap; use rustc_span::{FileName, SourceFile, Span}; pub use unicode_normalization::UNICODE_VERSION as UNICODE_NORMALIZATION_VERSION; @@ -73,9 +76,22 @@ pub fn new_parser_from_file<'a>( path: &Path, sp: Option, ) -> Result, Vec>> { - let source_file = psess.source_map().load_file(path).unwrap_or_else(|e| { - let msg = format!("couldn't read {}: {}", path.display(), e); + let sm = psess.source_map(); + let source_file = sm.load_file(path).unwrap_or_else(|e| { + let msg = format!("couldn't read `{}`: {}", path.display(), e); let mut err = psess.dcx().struct_fatal(msg); + if let Ok(contents) = std::fs::read(path) + && let Err(utf8err) = String::from_utf8(contents.clone()) + { + utf8_error( + sm, + &path.display().to_string(), + sp, + &mut err, + utf8err.utf8_error(), + &contents, + ); + } if let Some(sp) = sp { err.span(sp); } @@ -84,6 +100,49 @@ pub fn new_parser_from_file<'a>( new_parser_from_source_file(psess, source_file) } +pub fn utf8_error( + sm: &SourceMap, + path: &str, + sp: Option, + err: &mut Diag<'_, E>, + utf8err: Utf8Error, + contents: &[u8], +) { + // The file exists, but it wasn't valid UTF-8. + let start = utf8err.valid_up_to(); + let note = format!("invalid utf-8 at byte `{start}`"); + let msg = if let Some(len) = utf8err.error_len() { + format!( + "byte{s} `{bytes}` {are} not valid utf-8", + bytes = if len == 1 { + format!("{:?}", contents[start]) + } else { + format!("{:?}", &contents[start..start + len]) + }, + s = pluralize!(len), + are = if len == 1 { "is" } else { "are" }, + ) + } else { + note.clone() + }; + let contents = String::from_utf8_lossy(contents).to_string(); + let source = sm.new_source_file(PathBuf::from(path).into(), contents); + let span = Span::with_root_ctxt( + source.normalized_byte_pos(start as u32), + source.normalized_byte_pos(start as u32), + ); + if span.is_dummy() { + err.note(note); + } else { + if sp.is_some() { + err.span_note(span, msg); + } else { + err.span(span); + err.span_label(span, msg); + } + } +} + /// Given a session and a `source_file`, return a parser. Returns any buffered errors from lexing /// the initial token stream. fn new_parser_from_source_file( diff --git a/src/tools/compiletest/src/errors.rs b/src/tools/compiletest/src/errors.rs index efe758e65cf66..37ef63ae42ea9 100644 --- a/src/tools/compiletest/src/errors.rs +++ b/src/tools/compiletest/src/errors.rs @@ -101,6 +101,8 @@ pub fn load_errors(testfile: &Path, revision: Option<&str>) -> Vec { rdr.lines() .enumerate() + // We want to ignore utf-8 failures in tests during collection of annotations. + .filter(|(_, line)| line.is_ok()) .filter_map(|(line_num, line)| { parse_expected(last_nonfollow_error, line_num + 1, &line.unwrap(), revision).map( |(which, error)| { diff --git a/src/tools/tidy/src/tests_revision_unpaired_stdout_stderr.rs b/src/tools/tidy/src/tests_revision_unpaired_stdout_stderr.rs index 00edf99a30dbd..ee92d302db3c9 100644 --- a/src/tools/tidy/src/tests_revision_unpaired_stdout_stderr.rs +++ b/src/tools/tidy/src/tests_revision_unpaired_stdout_stderr.rs @@ -58,7 +58,7 @@ pub fn check(tests_path: impl AsRef, bad: &mut bool) { let mut expected_revisions = BTreeSet::new(); - let contents = std::fs::read_to_string(test).unwrap(); + let Ok(contents) = std::fs::read_to_string(test) else { continue }; // Collect directives. iter_header(&contents, &mut |HeaderLine { revision, directive, .. }| { diff --git a/tests/ui/macros/not-utf8.rs b/tests/ui/macros/not-utf8.rs index 8100d65a9f811..ad8ac39d230e4 100644 --- a/tests/ui/macros/not-utf8.rs +++ b/tests/ui/macros/not-utf8.rs @@ -3,5 +3,5 @@ //@ reference: input.encoding.invalid fn foo() { - include!("not-utf8.bin") + include!("not-utf8.bin"); } diff --git a/tests/ui/macros/not-utf8.stderr b/tests/ui/macros/not-utf8.stderr index 0d587cab5f3ed..17ee8197ac8be 100644 --- a/tests/ui/macros/not-utf8.stderr +++ b/tests/ui/macros/not-utf8.stderr @@ -1,9 +1,14 @@ -error: couldn't read $DIR/not-utf8.bin: stream did not contain valid UTF-8 +error: couldn't read `$DIR/not-utf8.bin`: stream did not contain valid UTF-8 --> $DIR/not-utf8.rs:6:5 | -LL | include!("not-utf8.bin") +LL | include!("not-utf8.bin"); | ^^^^^^^^^^^^^^^^^^^^^^^^ | +note: byte `193` is not valid utf-8 + --> $DIR/not-utf8.bin:1:1 + | +LL | �|�␂!5�cc␕␂�Ӻi��WWj�ȥ�'�}�␒�J�ȉ��W�␞O�@����␜w�V���LO����␔[ ␃_�'���SQ�~ذ��ų&��- ��lN~��!@␌ _#���kQ��h�␝�:�... + | ^ = note: this error originates in the macro `include` (in Nightly builds, run with -Z macro-backtrace for more info) error: aborting due to 1 previous error diff --git a/tests/ui/modules/path-no-file-name.rs b/tests/ui/modules/path-no-file-name.rs index 23127346e02aa..753a09501235a 100644 --- a/tests/ui/modules/path-no-file-name.rs +++ b/tests/ui/modules/path-no-file-name.rs @@ -1,4 +1,4 @@ -//@ normalize-stderr: "\.:.*\(" -> ".: $$ACCESS_DENIED_MSG (" +//@ normalize-stderr: "\.`:.*\(" -> ".`: $$ACCESS_DENIED_MSG (" //@ normalize-stderr: "os error \d+" -> "os error $$ACCESS_DENIED_CODE" #[path = "."] diff --git a/tests/ui/modules/path-no-file-name.stderr b/tests/ui/modules/path-no-file-name.stderr index 834e8ea6b03de..6274ecfed1365 100644 --- a/tests/ui/modules/path-no-file-name.stderr +++ b/tests/ui/modules/path-no-file-name.stderr @@ -1,4 +1,4 @@ -error: couldn't read $DIR/.: $ACCESS_DENIED_MSG (os error $ACCESS_DENIED_CODE) +error: couldn't read `$DIR/.`: $ACCESS_DENIED_MSG (os error $ACCESS_DENIED_CODE) --> $DIR/path-no-file-name.rs:5:1 | LL | mod m; diff --git a/tests/ui/parser/issues/issue-5806.rs b/tests/ui/parser/issues/issue-5806.rs index dbd53a7adc433..1a819e22197fe 100644 --- a/tests/ui/parser/issues/issue-5806.rs +++ b/tests/ui/parser/issues/issue-5806.rs @@ -1,4 +1,4 @@ -//@ normalize-stderr: "parser:.*\(" -> "parser: $$ACCESS_DENIED_MSG (" +//@ normalize-stderr: "parser`:.*\(" -> "parser`: $$ACCESS_DENIED_MSG (" //@ normalize-stderr: "os error \d+" -> "os error $$ACCESS_DENIED_CODE" #[path = "../parser"] diff --git a/tests/ui/parser/issues/issue-5806.stderr b/tests/ui/parser/issues/issue-5806.stderr index 4b025bd19a048..88cc982baf259 100644 --- a/tests/ui/parser/issues/issue-5806.stderr +++ b/tests/ui/parser/issues/issue-5806.stderr @@ -1,4 +1,4 @@ -error: couldn't read $DIR/../parser: $ACCESS_DENIED_MSG (os error $ACCESS_DENIED_CODE) +error: couldn't read `$DIR/../parser`: $ACCESS_DENIED_MSG (os error $ACCESS_DENIED_CODE) --> $DIR/issue-5806.rs:5:1 | LL | mod foo; diff --git a/tests/ui/parser/mod_file_with_path_attr.rs b/tests/ui/parser/mod_file_with_path_attr.rs index ff964f750e296..b7f4a9c6ae02f 100644 --- a/tests/ui/parser/mod_file_with_path_attr.rs +++ b/tests/ui/parser/mod_file_with_path_attr.rs @@ -1,4 +1,4 @@ -//@ normalize-stderr: "not_a_real_file.rs:.*\(" -> "not_a_real_file.rs: $$FILE_NOT_FOUND_MSG (" +//@ normalize-stderr: "not_a_real_file.rs`:.*\(" -> "not_a_real_file.rs`: $$FILE_NOT_FOUND_MSG (" #[path = "not_a_real_file.rs"] mod m; //~ ERROR not_a_real_file.rs diff --git a/tests/ui/parser/mod_file_with_path_attr.stderr b/tests/ui/parser/mod_file_with_path_attr.stderr index 9ccb775daab13..ef8a715712bbd 100644 --- a/tests/ui/parser/mod_file_with_path_attr.stderr +++ b/tests/ui/parser/mod_file_with_path_attr.stderr @@ -1,4 +1,4 @@ -error: couldn't read $DIR/not_a_real_file.rs: $FILE_NOT_FOUND_MSG (os error 2) +error: couldn't read `$DIR/not_a_real_file.rs`: $FILE_NOT_FOUND_MSG (os error 2) --> $DIR/mod_file_with_path_attr.rs:4:1 | LL | mod m; diff --git a/tests/ui/unpretty/staged-api-invalid-path-108697.stderr b/tests/ui/unpretty/staged-api-invalid-path-108697.stderr index 9c6d1a042d755..e68e19c4dc99e 100644 --- a/tests/ui/unpretty/staged-api-invalid-path-108697.stderr +++ b/tests/ui/unpretty/staged-api-invalid-path-108697.stderr @@ -1,4 +1,4 @@ -error: couldn't read $DIR/lol: No such file or directory (os error 2) +error: couldn't read `$DIR/lol`: No such file or directory (os error 2) --> $DIR/staged-api-invalid-path-108697.rs:8:1 | LL | mod foo;